From 88e6fa7fdb1fbaf576237d95744c901c3699030b Mon Sep 17 00:00:00 2001 From: Illia Silin <98187287+illsilin@users.noreply.github.com> Date: Fri, 18 Oct 2024 08:25:54 -0700 Subject: [PATCH 001/153] add the lsr-drop-solution=1 compiler flag (#1582) --- CMakeLists.txt | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index cfcfa24b3..0700fe838 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -202,6 +202,13 @@ if(NOT WIN32 AND ${hip_VERSION_FLAT} GREATER 500723302) add_compile_options(-fno-offload-uniform-block) endif() endif() +if(NOT WIN32 AND ${hip_VERSION_FLAT} GREATER 500500000) + check_cxx_compiler_flag("-mllvm --lsr-drop-solution=1" HAS_LSR_DROP_SOLUTION) + if(HAS_LSR_DROP_SOLUTION) + message("Adding the lsr-drop-solution=1 compiler flag") + add_compile_options("SHELL: -mllvm --lsr-drop-solution=1") + endif() +endif() if(NOT WIN32 AND ${hip_VERSION_FLAT} GREATER 600140090) check_cxx_compiler_flag("-mllvm -enable-post-misched=0" HAS_ENABLE_POST_MISCHED) if(HAS_ENABLE_POST_MISCHED) -- GitLab From a285d6f9b5c8ada9f306fae9724d6788060e7e2a Mon Sep 17 00:00:00 2001 From: Haocong WANG Date: Fri, 18 Oct 2024 23:46:11 +0800 Subject: [PATCH 002/153] disable bad instance detected on MI308CPX (#1584) --- .../device_gemm_xdl_universal_f8_f16_f16_mk_kn_mn.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f16_f16/device_gemm_xdl_universal_f8_f16_f16_mk_kn_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f16_f16/device_gemm_xdl_universal_f8_f16_f16_mk_kn_mn.hpp index 5cebad491..5c525244e 100644 --- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f16_f16/device_gemm_xdl_universal_f8_f16_f16_mk_kn_mn.hpp +++ 
b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f16_f16/device_gemm_xdl_universal_f8_f16_f16_mk_kn_mn.hpp @@ -46,7 +46,7 @@ using device_gemm_xdl_universal_f8_f16_f16_mk_kn_mn_comp_instances = std::tuple< DeviceGemm_Xdl_CShuffleV3< Row, Row, Row, F8, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 192, 256, 64, 16, 8, 32, 32, 3, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, DeviceGemm_Xdl_CShuffleV3< Row, Row, Row, F8, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 128, 16, 8, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, DeviceGemm_Xdl_CShuffleV3< Row, Row, Row, F8, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 16, 4, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, - DeviceGemm_Xdl_CShuffleV3< Row, Row, Row, F8, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 16, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>, + // DeviceGemm_Xdl_CShuffleV3< Row, Row, Row, F8, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 16, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, 
BlockGemmPipelineVersion::v5>, // We prefer following instance, however, existing compiler bug cause it failed to generate sanity code. // DeviceGemm_Xdl_CShuffleV3< Row, Row, Row, F8, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 16, 4, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>, DeviceGemm_Xdl_CShuffleV3< Row, Row, Row, F8, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 16, 4, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1> -- GitLab From 95e722a3b357334fe05b0a7f217b60c591592967 Mon Sep 17 00:00:00 2001 From: Po Yen Chen Date: Mon, 21 Oct 2024 10:52:11 +0800 Subject: [PATCH 003/153] [CK_TILE] Optimize fmha splitkv & splitkv combine kernels (#1577) * Use smaller width for lse_accum dist tensor * Update pipeline comment * Fix wrong distribution for lse_accum * Remove duplicate dim in lse_accum dist encoding * Decide fmha splitkv combine kernel kBlockSize by kM0 * Remove assumption of MPerThread=1 * Add log<4> & log<8> specialization * Enlarge occupancy array * Fix vector size for small tile * Add support for kMaxSplits=8 * Re-format gemm.hpp * Use 16x16x16 warp gemm for fwd_splitkv * Centralize policy code changes * Leave fp8/bf8 tile settings unchanged --- .../01_fmha/codegen/ops/fmha_fwd_splitkv.py | 30 +++++---- ...lock_fmha_fwd_splitkv_combine_pipeline.hpp | 27 +++++--- ...plitkv_combine_pipeline_default_policy.hpp | 67 +++++++++++++------ .../pipeline/block_fmha_pipeline_problem.hpp | 3 +- ...k_fmha_pipeline_qx_ks_vs_custom_policy.hpp | 15 ++++- include/ck_tile/ops/gemm.hpp | 2 +- 6 files changed, 96 insertions(+), 48 deletions(-) diff --git 
a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py index 82cf3a5ab..57360ea99 100644 --- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py +++ b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py @@ -191,7 +191,9 @@ using trait_{F_idx} = fmha_fwd_splitkv_combine_traits_<{F_hdim}, {F_dtype}, {F_m template<> void fmha_fwd_splitkv_combine_oneshot_(const ck_tile::stream_config& s, fmha_fwd_splitkv_args a) {{ - if (a.num_splits <= 16) {{ + if (a.num_splits <= 8) {{ + kernel_runner<3>::run(s, a); + }} else if (a.num_splits <= 16) {{ kernel_runner<4>::run(s, a); }} else if (a.num_splits <= 32) {{ kernel_runner<5>::run(s, a); @@ -239,7 +241,7 @@ float fmha_fwd_splitkv(fmha_fwd_splitkv_traits t, fmha_fwd_splitkv_args a, const FMHA_FWD_SPLITKV_API_INNER_DISPATCH=""" {F_if}((t.is_group_mode == {F_mode}) && (t.is_v_rowmajor == {F_vlayout}) && ({F_mask_check}) && (t.bias_type == {F_bias_check}) && (t.has_lse == {F_lse}) && (t.do_fp8_static_quant == {F_squant}) && ((a.block_table_ptr != nullptr) == {F_pagedkv}) && ({F_scheck}) && ({F_skcheck}) && ({F_dcheck}) && ({F_dvcheck})) {{ using traits_ = fmha_fwd_splitkv_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0blen}, {F_vlayout}, {F_pipeline_enum}, {F_mask}, {F_bias}, {F_lse}, {F_squant}, {F_pagedkv}, {F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}>; - using traits2_ = fmha_fwd_splitkv_combine_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_bm0}/2, {F_bn1}, {F_lse}, {F_squant}, {F_spad}, {F_dvpad}>; + using traits2_ = fmha_fwd_splitkv_combine_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_bm0}/2, {F_bn1}/2, {F_lse}, {F_squant}, {F_spad}, {F_dvpad}>; return fmha_fwd_splitkv_(s, a); }} @@ -551,14 +553,14 @@ class FmhaFwdSplitKVCombineKernel: def get_fmha_fwd_tile_dict_from_dtype(dtype : str) -> Optional[dict]: if dtype == 'fp16' or dtype == 'bf16': return { - '32' : FmhaFwdTileSize(128, 64, 16, 32, 32, 32, 2, 1, 1, 32, 32, 16, -1), 
- '64' : FmhaFwdTileSize(128, 64, 32, 64, 32, 64, 4, 1, 1, 32, 32, 16, -1), - '128' : FmhaFwdTileSize(128, 128, 32, 128, 32, 128, 4, 1, 1, 32, 32, 16, -1), - '256' : FmhaFwdTileSize(128, 128, 32, 256, 32, 256, 4, 1, 1, 32, 32, 16, -1), + '32' : FmhaFwdTileSize(32, 64, 16, 32, 32, 32, 2, 1, 1, 16, 16, 16, -1), + '64' : FmhaFwdTileSize(64, 64, 32, 64, 32, 64, 4, 1, 1, 16, 16, 16, -1), + '128' : FmhaFwdTileSize(64, 128, 32, 128, 32, 128, 4, 1, 1, 16, 16, 16, -1), + '256' : FmhaFwdTileSize(64, 128, 32, 256, 32, 256, 4, 1, 1, 16, 16, 16, -1), } elif dtype == 'fp8' or dtype == 'bf8': return { - '64' : FmhaFwdTileSize(128, 64, 32, 64, 32, 64, 2, 1, 1, 32, 32, 32, -1), + '64' : FmhaFwdTileSize(128, 64, 32, 64, 32, 64, 2, 1, 1, 32, 32, 32, -1), '128' : FmhaFwdTileSize(128, 128, 32, 128, 32, 128, 4, 1, 1, 32, 32, 32, -1), '256' : FmhaFwdTileSize(128, 128, 32, 256, 32, 256, 4, 1, 1, 32, 32, 32, -1) } @@ -568,16 +570,16 @@ def get_fmha_fwd_tile_dict_from_dtype(dtype : str) -> Optional[dict]: def get_fmha_fwd_splitkv_combine_tile_dict_from_dtype(dtype : str) -> Optional[dict]: if dtype == 'fp16' or dtype == 'bf16': return { - '32' : FmhaFwdSplitKVCombineTileSize(64, 32, -1), - '64' : FmhaFwdSplitKVCombineTileSize(64, 64, -1), - '128' : FmhaFwdSplitKVCombineTileSize(64, 128, -1), - '256' : FmhaFwdSplitKVCombineTileSize(64, 256, -1), + '32' : FmhaFwdSplitKVCombineTileSize(16, 16, -1), + '64' : FmhaFwdSplitKVCombineTileSize(32, 32, -1), + '128' : FmhaFwdSplitKVCombineTileSize(32, 64, -1), + '256' : FmhaFwdSplitKVCombineTileSize(32, 128, -1), } elif dtype == 'fp8' or dtype == 'bf8': return { - '64' : FmhaFwdSplitKVCombineTileSize(64, 64, -1), - '128' : FmhaFwdSplitKVCombineTileSize(64, 128, -1), - '256' : FmhaFwdSplitKVCombineTileSize(64, 256, -1), + '64' : FmhaFwdSplitKVCombineTileSize(64, 32, -1), + '128' : FmhaFwdSplitKVCombineTileSize(64, 64, -1), + '256' : FmhaFwdSplitKVCombineTileSize(64, 128, -1), } else: return None diff --git 
a/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_combine_pipeline.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_combine_pipeline.hpp index 1afe0feab..7c49fce99 100644 --- a/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_combine_pipeline.hpp +++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_combine_pipeline.hpp @@ -12,6 +12,16 @@ namespace detail { template struct log2; +template <> +struct log2<4> : std::integral_constant +{ +}; + +template <> +struct log2<8> : std::integral_constant +{ +}; + template <> struct log2<16> : std::integral_constant { @@ -72,18 +82,18 @@ struct BlockFmhaFwdSplitKVCombinePipeline { if constexpr(kHeadDimV <= 32) { - constexpr std::array occupancy{3, 3, 3, 1}; - return occupancy[detail::log2::value - 4]; + constexpr std::array occupancy{3, 3, 3, 3, 3, 1}; + return occupancy[detail::log2::value - 2]; } else if constexpr(kHeadDimV <= 128) { - constexpr std::array occupancy{3, 3, 2, 1}; - return occupancy[detail::log2::value - 4]; + constexpr std::array occupancy{3, 3, 3, 3, 2, 1}; + return occupancy[detail::log2::value - 2]; } else if constexpr(kHeadDimV <= 256) { - constexpr std::array occupancy{2, 2, 2, 1}; - return occupancy[detail::log2::value - 4]; + constexpr std::array occupancy{2, 2, 2, 2, 2, 1}; + return occupancy[detail::log2::value - 2]; } } }(); @@ -138,9 +148,8 @@ struct BlockFmhaFwdSplitKVCombinePipeline auto lse_accum = make_static_distributed_tensor( Policy::template MakeLSEaccRegTileDistribution()); - // copy LDS (shape=[kM0, kMaxSplits]) to lse_accum (shape=[kM0, max(kMaxSplits, warp_size)]) - // this will extend the distributed tensor width so that each thread in wave have data to - // reduce. + // copy LDS (shape=[kM0, kMaxSplits]) to lse_accum (shape=[kM0, kMaxSplits]) + // and fill up -INF values outside the [kM0, num_splits] region. 
{ constexpr auto spans = decltype(lse_accum)::get_distributed_spans(); sweep_tile_span(spans[number<0>{}], [&](auto idx0) { diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_combine_pipeline_default_policy.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_combine_pipeline_default_policy.hpp index 3327d4af8..ebd69c0cf 100644 --- a/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_combine_pipeline_default_policy.hpp +++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_combine_pipeline_default_policy.hpp @@ -10,11 +10,26 @@ namespace ck_tile { struct BlockFmhaFwdSplitKVCombinePipelineDefaultPolicy { + template + CK_TILE_HOST_DEVICE static constexpr auto GetVectorSizeForTile() + { + constexpr index_t PixelsPerThread = (M * N) / BlockSize; + static_assert(0 < PixelsPerThread); + + constexpr index_t MaxNPerThread = 16 / sizeof(DataType); + constexpr index_t NPerThread = min(MaxNPerThread, PixelsPerThread); + + return NPerThread; + } + + // alignment for dram lse tile (shape=[kMaxSplits, kM0]) template CK_TILE_HOST_DEVICE static constexpr auto GetAlignmentLSE() { - using LSEDataType = remove_cvref_t; - return 16 / sizeof(LSEDataType); + return GetVectorSizeForTile(); } template @@ -47,29 +62,31 @@ struct BlockFmhaFwdSplitKVCombinePipelineDefaultPolicy MakeLSEaccLdsBlockDescriptor().get_element_space_size(); } + // shape=[kMaxSplits, kM0] template CK_TILE_HOST_DEVICE static constexpr auto MakeLSEaccDramTileDistribution() { using LSEDataType = remove_cvref_t; constexpr index_t kBlockSize = Problem::kBlockSize; + constexpr index_t kNumWarps = Problem::kNumWarps; constexpr index_t kNPerBlock = Problem::kM0; constexpr index_t kMPerBlock = Problem::kMaxSplits; - constexpr index_t NPerThread = 16 / sizeof(LSEDataType); - constexpr index_t NThreads = kNPerBlock / NPerThread; + constexpr index_t NPerThread = + GetVectorSizeForTile(); + constexpr index_t NThreads = kNPerBlock / NPerThread; constexpr index_t MThreadsPerWarp = 
get_warp_size() / NThreads; - constexpr index_t TotalWarps = kBlockSize / get_warp_size(); - constexpr index_t MPerThread = kMPerBlock / (TotalWarps * MThreadsPerWarp); + constexpr index_t MPerThread = kMPerBlock / (kNumWarps * MThreadsPerWarp); static_assert(NThreads * NPerThread == kNPerBlock); - static_assert(MPerThread * TotalWarps * MThreadsPerWarp == kMPerBlock); + static_assert(MPerThread * kNumWarps * MThreadsPerWarp == kMPerBlock); return make_static_tile_distribution( tile_distribution_encoding, - tuple, + tuple, sequence>, tuple, sequence<1, 2>>, tuple, sequence<2, 0>>, @@ -77,15 +94,18 @@ struct BlockFmhaFwdSplitKVCombinePipelineDefaultPolicy sequence<0, 1>>{}); } - // 3d + padding, [kMaxSplits, kM0] + // 3d + padding, shape=[kMaxSplits, kM0] template CK_TILE_HOST_DEVICE static constexpr auto MakeLSEaccLdsStoreBlockDescriptor() { using LSEDataType = remove_cvref_t; + constexpr index_t kBlockSize = Problem::kBlockSize; + constexpr index_t kMPerBlock = Problem::kMaxSplits; constexpr index_t kNPerBlock = Problem::kM0; - constexpr index_t NPack = 16 / sizeof(LSEDataType); + constexpr index_t NPack = + GetVectorSizeForTile(); constexpr auto lse_acc_lds_block_desc_0 = make_naive_tensor_descriptor( make_tuple(number{}, number{}, number{}), @@ -103,15 +123,18 @@ struct BlockFmhaFwdSplitKVCombinePipelineDefaultPolicy return lse_acc_lds_block_desc; } - // 3d + padding, [kM0, kMaxSplits] + // 3d + padding, shape=[kM0, kMaxSplits] template CK_TILE_HOST_DEVICE static constexpr auto MakeLSEaccLdsBlockDescriptor() { using LSEDataType = remove_cvref_t; + constexpr index_t kBlockSize = Problem::kBlockSize; + constexpr index_t kMPerBlock = Problem::kMaxSplits; constexpr index_t kNPerBlock = Problem::kM0; - constexpr index_t NPack = 16 / sizeof(LSEDataType); + constexpr index_t NPack = + GetVectorSizeForTile(); constexpr auto lse_acc_lds_block_desc_0 = make_naive_tensor_descriptor( make_tuple(number{}, number{}, number{}), @@ -134,26 +157,28 @@ struct 
BlockFmhaFwdSplitKVCombinePipelineDefaultPolicy { constexpr index_t kBlockSize = Problem::kBlockSize; - constexpr index_t kNPerBlock = max(Problem::kMaxSplits, get_warp_size()); + constexpr index_t kNPerBlock = Problem::kMaxSplits; constexpr index_t kMPerBlock = Problem::kM0; - constexpr index_t NThreads = get_warp_size(); + constexpr index_t NThreads = 4; constexpr index_t NPerThread = kNPerBlock / NThreads; - constexpr index_t MThreads = kBlockSize / NThreads; - constexpr index_t MPerThread = kMPerBlock / MThreads; + constexpr index_t MThreads = kBlockSize / NThreads; + constexpr index_t MPerThread = kMPerBlock / MThreads; + constexpr index_t MWarps = kBlockSize / get_warp_size(); + constexpr index_t MThreadPerWarp = get_warp_size() / NThreads; static_assert(NThreads * NPerThread == kNPerBlock); - static_assert(MThreads * MPerThread == kMPerBlock); + static_assert(MWarps * MThreadPerWarp * MPerThread == kMPerBlock); return make_static_tile_distribution( tile_distribution_encoding< sequence<1>, - tuple, sequence>, - tuple, sequence<2>>, - tuple, sequence<0>>, + tuple, sequence>, + tuple, sequence<2, 1>>, + tuple, sequence<0, 1>>, sequence<1, 2>, - sequence<1, 1>>{}); + sequence<2, 1>>{}); } template diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_problem.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_problem.hpp index d254f07e2..1846664e7 100644 --- a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_problem.hpp +++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_problem.hpp @@ -115,7 +115,8 @@ struct BlockFmhaSplitKVCombinePipelineProblem using ODataType = remove_cvref_t; using Traits = remove_cvref_t; - static constexpr index_t kBlockSize = 256; + static constexpr index_t kNumWarps = kM0_ / (get_warp_size() / 4); + static constexpr index_t kBlockSize = kNumWarps * get_warp_size(); static constexpr bool kIsGroupMode = kIsGroupMode_; static constexpr index_t kHeadDimV = HeadDimV_; diff --git 
a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qx_ks_vs_custom_policy.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qx_ks_vs_custom_policy.hpp index 8fa325241..a66d2be78 100644 --- a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qx_ks_vs_custom_policy.hpp +++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qx_ks_vs_custom_policy.hpp @@ -88,22 +88,33 @@ struct BlockFmhaPipelineQXCustomPolicy typename Problem::BlockFmhaShape::Gemm0WarpTile>>; constexpr auto warp_gemm = []() { + constexpr index_t WarpGemmM = Problem::BlockFmhaShape::Gemm0WarpTile::at(number<0>{}); + static_assert(WarpGemmM == 16 || WarpGemmM == 32); + if constexpr(std::is_same_v && std::is_same_v && std::is_same_v) { - return WarpGemmMfmaF16F16F32M32N32K16SwizzleBTransposedCDistribution{}; + if constexpr(WarpGemmM == 32) + return WarpGemmMfmaF16F16F32M32N32K16SwizzleBTransposedCDistribution{}; + else // WarpGemmM == 16 + return WarpGemmMfmaF16F16F32M16N16K16TransposedCDistribution{}; } else if constexpr(std::is_same_v && std::is_same_v && std::is_same_v) { - return WarpGemmMfmaBf16Bf16F32M32N32K16SwizzleBTransposedCDistribution{}; + if constexpr(WarpGemmM == 32) + return WarpGemmMfmaBf16Bf16F32M32N32K16SwizzleBTransposedCDistribution{}; + else // WarpGemmM == 16 + return WarpGemmMfmaBf16Bf16F32M16N16K16TransposedCDistribution{}; } else if constexpr(std::is_same_v && std::is_same_v && std::is_same_v) { + static_assert(WarpGemmM == 32); + // TODO: hard coded here. 
Otherwise, it may incorrect result constexpr index_t swizzle_factor = 4; return WarpGemmMfmaFp8Fp8F32M32N32K16SwizzleBTransposedCDistribution< diff --git a/include/ck_tile/ops/gemm.hpp b/include/ck_tile/ops/gemm.hpp index 436d964c3..e70825570 100644 --- a/include/ck_tile/ops/gemm.hpp +++ b/include/ck_tile/ops/gemm.hpp @@ -23,12 +23,12 @@ #include "ck_tile/ops/gemm/block/block_gemm_problem.hpp" #include "ck_tile/ops/gemm/kernel/gemm_kernel.hpp" #include "ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp" -#include "ck_tile/ops/gemm/pipeline/gemm_universal_pipeline_ag_bg_cr_policy.hpp" #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1.hpp" #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1_default_policy.hpp" #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v2.hpp" #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v2_default_policy.hpp" #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_problem.hpp" +#include "ck_tile/ops/gemm/pipeline/gemm_universal_pipeline_ag_bg_cr_policy.hpp" #include "ck_tile/ops/gemm/pipeline/tile_gemm_shape.hpp" #include "ck_tile/ops/gemm/pipeline/tile_gemm_traits.hpp" #include "ck_tile/ops/gemm/warp/warp_gemm.hpp" -- GitLab From 560917b1610eded84d4383c6927a2a2b8704b2a4 Mon Sep 17 00:00:00 2001 From: Thomas Ning Date: Mon, 21 Oct 2024 22:47:48 +0800 Subject: [PATCH 004/153] Ck profiler instance support (#1575) * The draft on ckProfiler instance add * support the ck profiler instance with same data types * add a small feature on the M and N variable switch. 
* Partially solve the incorrect result problem * fix based on ci cd --- ..._xdl_universal_bf16_bf16_bf16_km_kn_mn.hpp | 9 +++++++- ..._xdl_universal_bf16_bf16_bf16_km_nk_mn.hpp | 17 ++++++++++++-- ..._xdl_universal_bf16_bf16_bf16_mk_kn_mn.hpp | 6 +++++ ..._xdl_universal_bf16_bf16_bf16_mk_nk_mn.hpp | 6 +++++ ...emm_xdl_universal_f16_f16_f16_mk_kn_mn.hpp | 16 +++++++++++-- ...emm_xdl_universal_f16_f16_f16_mk_nk_mn.hpp | 23 ++++++++++++++++++- profiler/src/profile_gemm_universal.cpp | 23 +++++++++++++++---- 7 files changed, 90 insertions(+), 10 deletions(-) diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_km_kn_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_km_kn_mn.hpp index 615711147..3300c4b0f 100644 --- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_km_kn_mn.hpp +++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_km_kn_mn.hpp @@ -44,8 +44,11 @@ using device_gemm_xdl_universal_bf16_bf16_bf16_km_kn_mn_comp_instances = std::tu DeviceGemm_Xdl_CShuffleV3< Col, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 4, 4, 32, 32, 4, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 16, 1, 16>, 4, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>, DeviceGemm_Xdl_CShuffleV3< Col, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 4, 4, 32, 32, 2, 2, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 16, 1, 16>, 4, 
BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>, + DeviceGemm_Xdl_CShuffleV3< Col, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 2, 2, 32, 32, 2, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 16, 1, 16>, 4, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>, DeviceGemm_Xdl_CShuffleV3< Col, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 4, 4, 32, 32, 4, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 16, 1, 16>, 4, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>, + DeviceGemm_Xdl_CShuffleV3< Col, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 2, 2, 32, 32, 4, 4, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 16, 1, 16>, 4, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>, DeviceGemm_Xdl_CShuffleV3< Col, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 4, 4, 32, 32, 4, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 16, 1, 16>, 4, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Xdl_CShuffleV3< Col, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 2, 2, 32, 32, 4, 4, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 16, 1, 16>, 4, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, // Can we support this kind of odd case? 
224(256) = 28*8 + (4*8) //DeviceGemm_Xdl_CShuffleV3< Col, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 224, 256, 64, 8, 8, 16, 16, 7, 8, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, 0, 1, 2, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, DeviceGemm_Xdl_CShuffleV3< Col, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 4, 4, 32, 32, 2, 2, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 16, 1, 16>, 4, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, @@ -64,10 +67,13 @@ using device_gemm_xdl_universal_bf16_bf16_bf16_km_kn_mn_mem_instances = std::tup // Latency friendly DeviceGemm_Xdl_CShuffleV3< Col, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 16, 64, 4, 4, 16, 16, 1, 1, S<16, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, S<16, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + DeviceGemm_Xdl_CShuffleV3< Col, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 16, 64, 2, 2, 16, 16, 1, 1, S<32, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, 0, S<32, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, DeviceGemm_Xdl_CShuffleV3< Col, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 64, 4, 4, 16, 16, 1, 1, S<16, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, S<16, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, DeviceGemm_Xdl_CShuffleV3< Col, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 32, 64, 4, 4, 16, 16, 1, 1, 
S<16, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 0, S<16, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + DeviceGemm_Xdl_CShuffleV3< Col, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 32, 64, 2, 2, 16, 16, 1, 1, S<32, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, 0, S<32, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, // Memory friendly DeviceGemm_Xdl_CShuffleV3< Col, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 16, 64, 8, 2, 16, 16, 4, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, 0, S<32, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, 0, 1, 1, S<1, 32, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffleV3< Col, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 16, 64, 2, 2, 16, 16, 4, 1, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, 0, S<32, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, 0, 1, 1, S<1, 32, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, DeviceGemm_Xdl_CShuffleV3< Col, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 128, 16, 64, 8, 4, 16, 16, 4, 1, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, 0, S<16, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, DeviceGemm_Xdl_CShuffleV3< Col, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 64, 16, 64, 4, 4, 16, 16, 2, 1, S<16, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, S<16, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, DeviceGemm_Xdl_CShuffleV3< Col, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 16, 64, 4, 4, 16, 16, 1, 1, 
S<16, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, S<16, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, @@ -75,7 +81,8 @@ using device_gemm_xdl_universal_bf16_bf16_bf16_km_kn_mn_mem_instances = std::tup DeviceGemm_Xdl_CShuffleV3< Col, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 32, 64, 4, 4, 16, 16, 1, 1, S<16, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 0, S<16, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, DeviceGemm_Xdl_CShuffleV3< Col, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 64, 64, 4, 4, 16, 16, 1, 2, S<16, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 0, S<16, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, DeviceGemm_Xdl_CShuffleV3< Col, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 128, 64, 4, 4, 16, 16, 1, 4, S<16, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 0, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGemm_Xdl_CShuffleV3< Col, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 16, 256, 64, 2, 4, 16, 16, 1, 4, S<32, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 16, 1, 16>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2> + DeviceGemm_Xdl_CShuffleV3< Col, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 16, 256, 64, 2, 4, 16, 16, 1, 4, S<32, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 16, 1, 16>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffleV3< Col, Row, Row, BF16, BF16, BF16, F32, BF16, 
PassThrough, PassThrough, PassThrough, GemmSpec, 256, 16, 256, 64, 2, 2, 16, 16, 1, 4, S<32, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, 0, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, 0, 1, 1, S<1, 16, 1, 16>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2> // clang-format on >; } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn.hpp index 32a7d640d..d7b005118 100644 --- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn.hpp +++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn.hpp @@ -44,13 +44,21 @@ using device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_comp_instances = std::tu // Compute friendly DeviceGemm_Xdl_CShuffleV3< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 4, 8, 32, 32, 4, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 16>, 4, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>, + DeviceGemm_Xdl_CShuffleV3< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 4, 4, 32, 32, 4, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, 1, 1, S<1, 16, 1, 16>, 4, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>, + DeviceGemm_Xdl_CShuffleV3< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 2, 2, 32, 32, 4, 4, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 
1>, 1, 2, 2, 0, S<16,16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, 0, 1, 1, S<1, 16, 1, 16>, 4, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>, DeviceGemm_Xdl_CShuffleV3< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 4, 8, 32, 32, 2, 2, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 16>, 4, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>, DeviceGemm_Xdl_CShuffleV3< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 4, 8, 32, 32, 4, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 16>, 4, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Xdl_CShuffleV3< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 4, 4, 32, 32, 4, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, 1, 1, S<1, 16, 1, 16>, 4, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Xdl_CShuffleV3< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 2, 2, 32, 32, 4, 4, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, 0, S<16,16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, 0, 1, 1, S<1, 16, 1, 16>, 4, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, DeviceGemm_Xdl_CShuffleV3< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 4, 8, 32, 32, 4, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 16>, 4, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>, + DeviceGemm_Xdl_CShuffleV3< Col, Col, Row, BF16, BF16, BF16, F32, 
BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 4, 4, 32, 32, 4, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, 1, 1, S<1, 16, 1, 16>, 4, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>, + DeviceGemm_Xdl_CShuffleV3< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 2, 2, 32, 32, 4, 4, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, 0, S<16,16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, 0, 1, 1, S<1, 16, 1, 16>, 4, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>, DeviceGemm_Xdl_CShuffleV3< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 224, 64, 8, 8, 16, 16, 8, 7, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 2, 1, S<1, 32, 1, 8>, 4, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, DeviceGemm_Xdl_CShuffleV3< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 4, 8, 32, 32, 2, 2, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 16>, 4, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, DeviceGemm_Xdl_CShuffleV3< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 4, 8, 32, 32, 2, 2, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 16>, 4, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>, - DeviceGemm_Xdl_CShuffleV3< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 4, 8, 32, 32, 2, 2, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 16>, 4, 
BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1> + DeviceGemm_Xdl_CShuffleV3< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 4, 8, 32, 32, 2, 2, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 16>, 4, BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>, + DeviceGemm_Xdl_CShuffleV3< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 4, 4, 32, 32, 2, 2, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, S<16,16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, 1, 1, S<1, 16, 1, 16>, 4, BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>, + DeviceGemm_Xdl_CShuffleV3< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 2, 2, 32, 32, 2, 2, S<32, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, 0, S<32, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, 0, 1, 1, S<1, 16, 1, 16>, 4, BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1> // clang-format on >; @@ -64,18 +72,23 @@ using device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_mem_instances = std::tup // Latency friendly DeviceGemm_Xdl_CShuffleV3< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 16, 64, 4, 8, 16, 16, 1, 1, S<16, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + DeviceGemm_Xdl_CShuffleV3< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 16, 64, 4, 4, 16, 16, 1, 1, S<16, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + DeviceGemm_Xdl_CShuffleV3< Col, Col, Row, BF16, BF16, BF16, 
F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 16, 64, 2, 2, 16, 16, 1, 1, S<32, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, 0, S<32, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, DeviceGemm_Xdl_CShuffleV3< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 64, 4, 8, 16, 16, 1, 1, S<16, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, DeviceGemm_Xdl_CShuffleV3< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 32, 64, 4, 8, 16, 16, 1, 1, S<16, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, // Memory friendly DeviceGemm_Xdl_CShuffleV3< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 16, 64, 8, 8, 16, 16, 4, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, DeviceGemm_Xdl_CShuffleV3< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 128, 16, 64, 8, 8, 16, 16, 4, 1, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, DeviceGemm_Xdl_CShuffleV3< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 64, 16, 64, 4, 8, 16, 16, 2, 1, S<16, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffleV3< Col, Col, Row, BF16, BF16, BF16, F32, 
BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 64, 16, 64, 4, 4, 16, 16, 2, 1, S<16, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, DeviceGemm_Xdl_CShuffleV3< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 16, 64, 4, 8, 16, 16, 1, 1, S<16, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, DeviceGemm_Xdl_CShuffleV3< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 64, 4, 8, 16, 16, 1, 1, S<16, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, DeviceGemm_Xdl_CShuffleV3< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 32, 64, 4, 8, 16, 16, 1, 1, S<16, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, DeviceGemm_Xdl_CShuffleV3< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 64, 64, 4, 8, 16, 16, 1, 2, S<16, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffleV3< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 64, 64, 4, 4, 16, 16, 1, 2, S<16, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 0, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, DeviceGemm_Xdl_CShuffleV3< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, 
PassThrough, PassThrough, GemmSpec, 128, 16, 128, 64, 4, 8, 16, 16, 1, 4, S<16, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGemm_Xdl_CShuffleV3< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 16, 256, 64, 2, 8, 16, 16, 1, 4, S<32, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 16>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2> + DeviceGemm_Xdl_CShuffleV3< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 16, 256, 64, 2, 8, 16, 16, 1, 4, S<32, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 16>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffleV3< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 16, 256, 64, 2, 2, 16, 16, 1, 4, S<32, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, 0, S<32, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, 0, 1, 1, S<1, 16, 1, 16>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2> // clang-format on >; } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn.hpp index 2b1e84f0c..9566d5555 100644 --- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn.hpp +++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn.hpp @@ -43,6 +43,8 @@ using 
device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_comp_instances = std::tu //#########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | DeviceGemm_Xdl_CShuffleV3< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 8, 4, 32, 32, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 16, 1, 16>, 4, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>, + DeviceGemm_Xdl_CShuffleV3< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 4, 4, 32, 32, 4, 4, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 16, 1, 16>, 4, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>, + DeviceGemm_Xdl_CShuffleV3< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 2, 2, 32, 32, 4, 4, S<16,16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, 0, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, 0, 1, 1, S<1, 16, 1, 16>, 4, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>, DeviceGemm_Xdl_CShuffleV3< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 8, 4, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 16, 1, 16>, 4, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>, DeviceGemm_Xdl_CShuffleV3< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 8, 4, 32, 32, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 16, 1, 16>, 4, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>, DeviceGemm_Xdl_CShuffleV3< Row, Row, Row, 
BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 8, 4, 32, 32, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 16, 1, 16>, 4, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, @@ -63,14 +65,18 @@ using device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_mem_instances = std::tup // Latency friendly DeviceGemm_Xdl_CShuffleV3< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 16, 64, 8, 4, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + DeviceGemm_Xdl_CShuffleV3< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 16, 64, 4, 4, 16, 16, 1, 1, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, S<16, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + DeviceGemm_Xdl_CShuffleV3< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 16, 64, 2, 2, 16, 16, 1, 1, S<32, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, 0, S<32, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, DeviceGemm_Xdl_CShuffleV3< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 64, 8, 4, 16, 16, 1, 1, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, DeviceGemm_Xdl_CShuffleV3< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 32, 64, 8, 4, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 
4, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, // Memory friendly DeviceGemm_Xdl_CShuffleV3< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 16, 64, 8, 2, 16, 16, 4, 1, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<32, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, 0, 1, 1, S<1, 32, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffleV3< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 16, 64, 2, 2, 16, 16, 4, 1, S<32, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, 0, S<32, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, 0, 1, 1, S<1, 32, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, DeviceGemm_Xdl_CShuffleV3< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 128, 16, 64, 8, 4, 16, 16, 4, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, DeviceGemm_Xdl_CShuffleV3< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 64, 16, 64, 8, 4, 16, 16, 2, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, DeviceGemm_Xdl_CShuffleV3< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 16, 64, 8, 4, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, DeviceGemm_Xdl_CShuffleV3< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 64, 8, 4, 16, 16, 1, 1, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 
0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffleV3< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 64, 4, 4, 16, 16, 1, 1, S<16, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, S<16, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, DeviceGemm_Xdl_CShuffleV3< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 32, 64, 8, 4, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, DeviceGemm_Xdl_CShuffleV3< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 64, 64, 8, 4, 16, 16, 1, 2, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, DeviceGemm_Xdl_CShuffleV3< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 128, 64, 8, 4, 16, 16, 1, 4, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn.hpp index d56771823..72162b65d 100644 --- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn.hpp +++ 
b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn.hpp @@ -44,6 +44,8 @@ using device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_comp_instances = std::tu // Compute friendly DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 8, 8, 32, 32, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 16>, 4, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>, + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 4, 4, 32, 32, 4, 4, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, 1, 1, S<1, 16, 1, 16>, 4, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>, + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 2, 2, 32, 32, 4, 4, S<16,16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, 0, S<16,16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, 0, 1, 1, S<1, 16, 1, 16>, 4, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>, DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 8, 8, 32, 32, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 16>, 4, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 8, 8, 32, 32, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 16>, 4, 
BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>, // AGPR Spill @@ -69,8 +71,12 @@ using device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_instances = std::tup DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 16, 64, 8, 8, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 64, 8, 8, 16, 16, 1, 1, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 32, 64, 8, 8, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 32, 64, 4, 4, 16, 16, 1, 1, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 32, 64, 2, 2, 16, 16, 1, 1, S<32, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, 0, S<32, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, // Memory friendly DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 16, 64, 8, 8, 16, 
16, 4, 1, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 16, 64, 4, 4, 16, 16, 4, 1, S<16,16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, 1, 1, S<1, 32, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 16, 64, 2, 2, 16, 16, 4, 1, S<32, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, 0, S<32, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, 0, 1, 1, S<1, 32, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 128, 16, 64, 8, 8, 16, 16, 4, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 64, 16, 64, 8, 8, 16, 16, 2, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 16, 64, 8, 8, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn.hpp 
b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn.hpp index d34c83a60..af9494f5a 100644 --- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn.hpp +++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn.hpp @@ -41,6 +41,8 @@ using device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_comp_instances = std::tuple //#########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | DeviceGemm_Xdl_CShuffleV3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 8, 4, 32, 32, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>, + DeviceGemm_Xdl_CShuffleV3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 4, 4, 32, 32, 4, 4, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>, + DeviceGemm_Xdl_CShuffleV3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 2, 2, 32, 32, 4, 4, S<16,16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, 0, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>, DeviceGemm_Xdl_CShuffleV3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 8, 4, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 
1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>, DeviceGemm_Xdl_CShuffleV3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 8, 4, 32, 32, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>, DeviceGemm_Xdl_CShuffleV3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 8, 4, 32, 32, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, @@ -49,7 +51,9 @@ using device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_comp_instances = std::tuple DeviceGemm_Xdl_CShuffleV3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 8, 4, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>, DeviceGemm_Xdl_CShuffleV3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 256, 32, 8, 4, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>, DeviceGemm_Xdl_CShuffleV3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 128, 32, 8, 4, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>, - DeviceGemm_Xdl_CShuffleV3< Row, Row, Row, F16, F16, F16, F32, 
F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 8, 4, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1> + DeviceGemm_Xdl_CShuffleV3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 8, 4, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>, + DeviceGemm_Xdl_CShuffleV3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 4, 4, 32, 32, 2, 2, S<16,16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>, + DeviceGemm_Xdl_CShuffleV3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 2, 2, 32, 32, 2, 2, S<32, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, 0, S<32, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1> // clang-format on >; @@ -63,12 +67,19 @@ using device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_mem_instances = std::tuple< // Latency friendly DeviceGemm_Xdl_CShuffleV3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 16, 64, 8, 4, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, - DeviceGemm_Xdl_CShuffleV3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 128, 8, 4, 16, 16, 1, 1, S<16, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 
8, 8, 0, S<16, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + DeviceGemm_Xdl_CShuffleV3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 16, 64, 4, 4, 16, 16, 1, 1, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, S<16, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + DeviceGemm_Xdl_CShuffleV3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 16, 64, 2, 2, 16, 16, 1, 1, S<32, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, 0, S<32, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + DeviceGemm_Xdl_CShuffleV3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 128, 8, 4, 16, 16, 1, 1, S<16, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<32, 2, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + DeviceGemm_Xdl_CShuffleV3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 128, 4, 4, 16, 16, 1, 1, S<32, 2, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, S<32, 2, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, DeviceGemm_Xdl_CShuffleV3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 64, 8, 4, 16, 16, 1, 1, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + DeviceGemm_Xdl_CShuffleV3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 64, 4, 4, 16, 16, 1, 1, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, S<16, 4, 1>, S<0, 2, 1>, S<0, 2, 
1>, 1, 4, 4, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + DeviceGemm_Xdl_CShuffleV3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 64, 2, 2, 16, 16, 1, 1, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, 0, S<16, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, DeviceGemm_Xdl_CShuffleV3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 32, 64, 8, 4, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, // Memory friendly DeviceGemm_Xdl_CShuffleV3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 32, 64, 8, 2, 32, 32, 2, 1, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<32, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffleV3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 32, 64, 2, 2, 32, 32, 2, 1, S<32, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, 0, S<32, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, 0, 1, 1, S<1, 32, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, DeviceGemm_Xdl_CShuffleV3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 16, 64, 8, 2, 16, 16, 4, 1, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<32, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, 0, 1, 1, S<1, 32, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffleV3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 16, 64, 2, 2, 16, 16, 4, 1, S<32, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, 0, S<32, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, 0, 1, 1, 
S<1, 32, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, DeviceGemm_Xdl_CShuffleV3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 128, 32, 64, 8, 4, 32, 32, 2, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, DeviceGemm_Xdl_CShuffleV3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 128, 16, 64, 8, 4, 16, 16, 4, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, DeviceGemm_Xdl_CShuffleV3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 64, 32, 64, 8, 4, 32, 32, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, @@ -82,6 +93,7 @@ using device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_mem_instances = std::tuple< DeviceGemm_Xdl_CShuffleV3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 128, 64, 8, 4, 16, 16, 1, 4, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, DeviceGemm_Xdl_CShuffleV3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 128, 64, 8, 4, 32, 32, 1, 2, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 16, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, DeviceGemm_Xdl_CShuffleV3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 16, 256, 64, 8, 4, 16, 16, 1, 4, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 
S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 16, 1, 16>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffleV3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 16, 256, 64, 4, 4, 16, 16, 1, 4, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 16, 1, 16>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, DeviceGemm_Xdl_CShuffleV3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 32, 256, 64, 8, 4, 32, 32, 1, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 16, 1, 16>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v2> // clang-format on >; diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn.hpp index ca90efa4c..f9d693f45 100644 --- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn.hpp +++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn.hpp @@ -42,11 +42,20 @@ using device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn_comp_instances = std::tuple // Compute friendly DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 8, 8, 32, 32, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>, + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, 
PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 4, 4, 32, 32, 4, 4, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>, DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 8, 8, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>, DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>, + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 32, 4, 4, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>, + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 32, 2, 2, 32, 32, 2, 2, S<16,16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, 0, S<16,16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>, DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 8, 8, 32, 32, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, 
BlockGemmPipelineVersion::v3>, + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 4, 4, 32, 32, 4, 4, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 2, 2, 32, 32, 4, 4, S<16,16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, 0, S<16,16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 8, 8, 32, 32, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>, + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 4, 4, 32, 32, 4, 4, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>, + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 2, 2, 32, 32, 4, 4, S<16,16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, 0, S<16,16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>, DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 8, 8, 16, 16, 8, 8, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, 
S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 2, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 4, 4, 16, 16, 8, 8, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, 1, 2, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 2, 2, 16, 16, 8, 8, S<16,16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, 0, S<16,16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, 0, 1, 2, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, // AGPR Spill // DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 8, 8, 16, 16, 8, 8, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 2, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>, // AGPR Spill when use permuted lds layout. so, use padding for these two. 
@@ -70,13 +79,21 @@ using device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn_mem_instances = std::tuple< // Latency friendly DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 16, 64, 8, 8, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 16, 64, 4, 4, 16, 16, 1, 1, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 16, 64, 2, 2, 16, 16, 1, 1, S<32, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, 0, S<32, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 128, 8, 8, 16, 16, 1, 1, S<16, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 64, 8, 8, 16, 16, 1, 1, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 64, 4, 4, 16, 16, 1, 1, S<16, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, S<16, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, 
1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 64, 2, 2, 16, 16, 1, 1, S<32, 2, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, 0, S<32, 2, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 32, 64, 8, 8, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, // Memory friendly DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 32, 64, 8, 8, 32, 32, 2, 1, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 32, 64, 4, 4, 32, 32, 2, 1, S<16,16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, S<16,16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, 1, 1, S<1, 32, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 32, 64, 2, 2, 32, 32, 2, 1, S<32, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, 0, S<32, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, 0, 1, 1, S<1, 32, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 16, 64, 8, 8, 16, 16, 4, 1, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 
2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 128, 32, 64, 8, 8, 32, 32, 2, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 128, 32, 64, 4, 4, 32, 32, 2, 1, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 128, 32, 64, 2, 2, 32, 32, 2, 1, S<32, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, 0, S<32, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 128, 16, 64, 8, 8, 16, 16, 4, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 64, 32, 64, 8, 8, 32, 32, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 64, 16, 64, 8, 8, 16, 16, 2, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, 
BlockGemmPipelineVersion::v2>, @@ -84,12 +101,16 @@ using device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn_mem_instances = std::tuple< DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 128, 8, 8, 16, 16, 1, 1, S<16, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 64, 8, 8, 16, 16, 1, 1, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 32, 64, 8, 8, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 32, 64, 4, 4, 16, 16, 1, 1, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 32, 64, 2, 2, 16, 16, 1, 1, S<32, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, 0, S<32, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 64, 64, 8, 8, 16, 16, 1, 2, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 
2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 64, 64, 8, 8, 32, 32, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 128, 64, 8, 8, 16, 16, 1, 4, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 128, 64, 8, 8, 32, 32, 1, 2, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 16, 256, 64, 8, 8, 16, 16, 1, 4, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 16>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 32, 256, 64, 8, 8, 32, 32, 1, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 16>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v2> + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 32, 256, 64, 8, 8, 32, 32, 1, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 16>, 8, 
BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 32, 256, 64, 4, 4, 32, 32, 1, 2, S<16,16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, S<16,16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, 1, 1, S<1, 16, 1, 16>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 32, 256, 64, 2, 2, 32, 32, 1, 2, S<32, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, 0, S<32, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, 0, 1, 1, S<1, 16, 1, 16>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v2> // clang-format on >; } // namespace instance diff --git a/profiler/src/profile_gemm_universal.cpp b/profiler/src/profile_gemm_universal.cpp index a2ef11713..f86dddc72 100644 --- a/profiler/src/profile_gemm_universal.cpp +++ b/profiler/src/profile_gemm_universal.cpp @@ -57,6 +57,25 @@ int profile_gemm_universal(int argc, char* argv[]) exit(1); } + int M; + int N; + int StrideA; + int StrideB; + // Analyze the unsupported matrix shapes, switch the M and N number + if(std::stoi(argv[9]) % 8 != 0 && std::stoi(argv[8]) % 8 == 0) + { + M = std::stoi(argv[9]); + StrideA = std::stoi(argv[12]); + N = std::stoi(argv[8]); + StrideB = std::stoi(argv[11]); + } + else + { + M = std::stoi(argv[8]); + StrideA = std::stoi(argv[11]); + N = std::stoi(argv[9]); + StrideB = std::stoi(argv[12]); + } const auto data_type = static_cast(std::stoi(argv[2])); const auto layout = static_cast(std::stoi(argv[3])); const bool do_verification = std::stoi(argv[4]); @@ -64,12 +83,8 @@ int profile_gemm_universal(int argc, char* argv[]) const bool do_log = std::stoi(argv[6]); const bool time_kernel = std::stoi(argv[7]); - const int M = std::stoi(argv[8]); - const int N = std::stoi(argv[9]); const int K = std::stoi(argv[10]); - const int StrideA = std::stoi(argv[11]); - const int StrideB = 
std::stoi(argv[12]); const int StrideC = std::stoi(argv[13]); const int KBatch = std::stoi(argv[14]); -- GitLab From d0565e33d6eb8f0c464080dcbd8f879250ca5067 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 21 Oct 2024 08:34:53 -0700 Subject: [PATCH 005/153] Bump rocm-docs-core from 1.8.2 to 1.8.3 in /docs/sphinx (#1587) Bumps [rocm-docs-core](https://github.com/ROCm/rocm-docs-core) from 1.8.2 to 1.8.3. - [Release notes](https://github.com/ROCm/rocm-docs-core/releases) - [Changelog](https://github.com/ROCm/rocm-docs-core/blob/develop/CHANGELOG.md) - [Commits](https://github.com/ROCm/rocm-docs-core/compare/v1.8.2...v1.8.3) --- updated-dependencies: - dependency-name: rocm-docs-core dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- docs/sphinx/requirements.in | 2 +- docs/sphinx/requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/sphinx/requirements.in b/docs/sphinx/requirements.in index fa1897e23..c2220e15d 100644 --- a/docs/sphinx/requirements.in +++ b/docs/sphinx/requirements.in @@ -1,2 +1,2 @@ -rocm-docs-core==1.8.2 +rocm-docs-core==1.8.3 sphinxcontrib-bibtex==2.6.3 diff --git a/docs/sphinx/requirements.txt b/docs/sphinx/requirements.txt index 7d0c92d04..0dc2e70c5 100644 --- a/docs/sphinx/requirements.txt +++ b/docs/sphinx/requirements.txt @@ -103,7 +103,7 @@ requests==2.32.3 # via # pygithub # sphinx -rocm-docs-core==1.8.2 +rocm-docs-core==1.8.3 # via -r requirements.in six==1.16.0 # via pybtex -- GitLab From 794f2d64a8a03a1408126332451a7e75f589d4ef Mon Sep 17 00:00:00 2001 From: spolifroni-amd Date: Mon, 21 Oct 2024 11:35:57 -0400 Subject: [PATCH 006/153] added link to documentation (#1578) --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index 4366ec032..053406515 100644 --- 
a/README.md +++ b/README.md @@ -1,5 +1,8 @@ # Composable Kernel +> [!NOTE] +> The published documentation is available at [Composable Kernel](https://rocm.docs.amd.com/projects/composable_kernel/en/latest/) in an organized, easy-to-read format, with search and a table of contents. The documentation source files reside in the `docs` folder of this repository. As with all ROCm projects, the documentation is open source. For more information on contributing to the documentation, see [Contribute to ROCm documentation](https://rocm.docs.amd.com/en/latest/contribute/contributing.html). + The Composable Kernel (CK) library provides a programming model for writing performance-critical kernels for machine learning workloads across multiple architectures (GPUs, CPUs, etc.). The CK library uses general purpose kernel languages, such as HIP C++. -- GitLab From 3f710930f6f570e47025a30286ce12a1a3549bb7 Mon Sep 17 00:00:00 2001 From: Rostyslav Geyyer <46627076+geyyer@users.noreply.github.com> Date: Mon, 21 Oct 2024 10:45:22 -0500 Subject: [PATCH 007/153] Update default stride (#1576) * Update default stride value to -1 * Fix format * Revert "Fix format" This reverts commit ae0c3649ec48e330bb162cd6a12fd3d2e3bee64a. 
--------- Co-authored-by: Harisankar Sadasivan <135730918+hsadasiv@users.noreply.github.com> --- example/01_gemm/common.hpp | 24 +++++++++---------- example/01_gemm/run_gemm_example.inc | 12 +++++----- .../01_gemm/run_gemm_example_streamk_v2.inc | 4 ++-- example/01_gemm/run_gemm_example_v2.inc | 12 +++++----- 4 files changed, 26 insertions(+), 26 deletions(-) diff --git a/example/01_gemm/common.hpp b/example/01_gemm/common.hpp index eb1738e76..d08196924 100644 --- a/example/01_gemm/common.hpp +++ b/example/01_gemm/common.hpp @@ -29,9 +29,9 @@ struct ProblemSize final ck::index_t N = 4096; ck::index_t K = 4096; - ck::index_t StrideA = 0; - ck::index_t StrideB = 0; - ck::index_t StrideC = 0; + ck::index_t StrideA = -1; + ck::index_t StrideB = -1; + ck::index_t StrideC = -1; }; struct ProblemSizeStreamK final @@ -40,9 +40,9 @@ struct ProblemSizeStreamK final ck::index_t N = 4096; ck::index_t K = 4096; - ck::index_t StrideA = 0; - ck::index_t StrideB = 0; - ck::index_t StrideC = 0; + ck::index_t StrideA = -1; + ck::index_t StrideB = -1; + ck::index_t StrideC = -1; ck::index_t NumSKBlocks = -1; }; @@ -52,9 +52,9 @@ struct ProblemSizeStreamK_universal final ck::index_t N = 4096; ck::index_t K = 4096; - ck::index_t StrideA = 0; - ck::index_t StrideB = 0; - ck::index_t StrideC = 0; + ck::index_t StrideA = -1; + ck::index_t StrideB = -1; + ck::index_t StrideC = -1; ck::index_t Grid_size = -1; // defaults to max occupancy ck::index_t Streamk_sel = 1; // defaults to 1-tile SK @@ -66,9 +66,9 @@ struct ProblemSizeSplitK final ck::index_t N = 4096; ck::index_t K = 4096; - ck::index_t StrideA = 0; - ck::index_t StrideB = 0; - ck::index_t StrideC = 0; + ck::index_t StrideA = -1; + ck::index_t StrideB = -1; + ck::index_t StrideC = -1; ck::index_t KBatch = 1; }; diff --git a/example/01_gemm/run_gemm_example.inc b/example/01_gemm/run_gemm_example.inc index f66d2adc1..fe12998e3 100644 --- a/example/01_gemm/run_gemm_example.inc +++ b/example/01_gemm/run_gemm_example.inc @@ -116,21 
+116,21 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config) }; auto f_get_default_stride = - [](std::size_t row, std::size_t col, std::size_t stride, auto layout) { - if(stride == 0) + [](std::size_t row, std::size_t col, ck::index_t stride, auto layout) { + if(stride == -1) { - // give a chance if stride is zero, return a default packed stride + // give a chance if stride is -1, return a default packed stride if constexpr(std::is_same_v) { - return col; + return static_cast(col); } else { - return row; + return static_cast(row); } } else - return stride; + return static_cast(stride); }; StrideA = f_get_default_stride(M, K, StrideA, ALayout{}); diff --git a/example/01_gemm/run_gemm_example_streamk_v2.inc b/example/01_gemm/run_gemm_example_streamk_v2.inc index 32bd3a19a..6679f9515 100644 --- a/example/01_gemm/run_gemm_example_streamk_v2.inc +++ b/example/01_gemm/run_gemm_example_streamk_v2.inc @@ -117,9 +117,9 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config) auto f_get_default_stride = [](std::size_t row, std::size_t col, ck::index_t stride, auto layout) { - if(stride == 0) + if(stride == -1) { - // give a chance if stride is 0, return a default packed stride + // give a chance if stride is -1, return a default packed stride if constexpr(std::is_same_v) { return static_cast(col); diff --git a/example/01_gemm/run_gemm_example_v2.inc b/example/01_gemm/run_gemm_example_v2.inc index ad7238f0d..0bcee658b 100644 --- a/example/01_gemm/run_gemm_example_v2.inc +++ b/example/01_gemm/run_gemm_example_v2.inc @@ -115,21 +115,21 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config) }; auto f_get_default_stride = - [](std::size_t row, std::size_t col, std::size_t stride, auto layout) { - if(stride == 0) + [](std::size_t row, std::size_t col, ck::index_t stride, auto layout) { + if(stride == -1) { - // give a chance if stride is zero, return a default packed stride + // give a chance if stride 
is -1, return a default packed stride if constexpr(std::is_same_v) { - return col; + return static_cast(col); } else { - return row; + return static_cast(row); } } else - return stride; + return static_cast(stride); }; StrideA = f_get_default_stride(M, K, StrideA, ALayout{}); -- GitLab From 0394f8a713d40aae40339691e8ab980823d76a54 Mon Sep 17 00:00:00 2001 From: ltqin Date: Tue, 22 Oct 2024 09:26:18 +0800 Subject: [PATCH 008/153] update layernorm (#1570) * port layernorm * change warp_welford.hpp * Update warpshuffle * 1. Add save mean and save std back 2. Move construction of tensor_view and tile_window to operator() * refine welford max count calculation * unify layernorm api * Rename file * Remove save mean and inv std * Revert "refine welford max count calculation" This reverts commit 022365802b43a398deee2bc672785fa31a89297d. * Fix order of parameter * refine welford max count calculation again * Remove fp32 instances * Fix bug of padding * refactor api * Support bf16 * Extract common function * Refine arg of operator() * Add kMThreadPerBlock to template parameter * clang format * Refine variable name * Refine file name * remove redundant line * refactor layernorm2d pipeline and add block-per-block utility * fix name * rename more * add more block-per-tile instance * remove duplicated define * update instance for 2048, 1024 case * support up to 2048 now * opt loading * add n1536 * Add two pass pipeline * format * Fix incorrect type * parallel compilation * Use smaller N * fix 2p pass * Support Repeat_M in distribution * Refine nameing * Add reduce example --------- Co-authored-by: letaoqin Co-authored-by: aska-0096 Co-authored-by: rocking Co-authored-by: carlushuang --- ...n_complex_contraction_bilinear_example.inc | 223 ++++---- example/ck_tile/02_layernorm2d/CMakeLists.txt | 21 +- example/ck_tile/02_layernorm2d/README.md | 5 +- .../instances/layernorm2d_fwd_api.cpp | 155 ++++++ .../layernorm2d_fwd_bf16_n1024_instance.cpp | 22 + 
.../layernorm2d_fwd_bf16_n1536_instance.cpp | 13 + .../layernorm2d_fwd_bf16_n2048_instance.cpp | 14 + .../layernorm2d_fwd_bf16_n256_instance.cpp | 12 + .../layernorm2d_fwd_bf16_n3072_instance.cpp | 14 + .../layernorm2d_fwd_bf16_n4096_instance.cpp | 14 + ...layernorm2d_fwd_bf16_n4096_tp_instance.cpp | 14 + .../layernorm2d_fwd_bf16_n512_instance.cpp | 13 + ...layernorm2d_fwd_bf16_n64_n128_instance.cpp | 12 + .../layernorm2d_fwd_bf16_n768_instance.cpp | 12 + .../layernorm2d_fwd_fp16_n1024_instance.cpp | 22 + .../layernorm2d_fwd_fp16_n1536_instance.cpp | 13 + .../layernorm2d_fwd_fp16_n2048_instance.cpp | 14 + .../layernorm2d_fwd_fp16_n256_instance.cpp | 12 + .../layernorm2d_fwd_fp16_n3072_instance.cpp | 14 + .../layernorm2d_fwd_fp16_n4096_instance.cpp | 14 + ...layernorm2d_fwd_fp16_n4096_tp_instance.cpp | 14 + .../layernorm2d_fwd_fp16_n512_instance.cpp | 13 + ...layernorm2d_fwd_fp16_n64_n128_instance.cpp | 12 + .../layernorm2d_fwd_fp16_n768_instance.cpp | 12 + .../layernorm2d_fwd_instance_common.hpp | 67 +++ .../02_layernorm2d/layernorm2d_fwd.cpp | 236 ++++----- .../02_layernorm2d/layernorm2d_fwd.hpp | 117 +++- .../02_layernorm2d/script/perf_test.sh | 38 ++ .../02_layernorm2d/script/smoke_test.sh | 31 ++ example/ck_tile/05_reduce/CMakeLists.txt | 19 + example/ck_tile/05_reduce/reduce.cpp | 110 ++++ example/ck_tile/05_reduce/reduce.hpp | 118 +++++ example/ck_tile/CMakeLists.txt | 1 + include/ck_tile/core.hpp | 1 + include/ck_tile/core/arch/utility.hpp | 43 ++ include/ck_tile/core/config.hpp | 2 + include/ck_tile/core/container/sequence.hpp | 122 +++++ include/ck_tile/core/container/tuple.hpp | 20 + .../core/tensor/static_distributed_tensor.hpp | 14 + include/ck_tile/core/tensor/sweep_tile.hpp | 278 ++++++++++ .../ck_tile/core/tensor/tile_distribution.hpp | 158 ++---- .../core/utility/functional_with_tuple.hpp | 173 ++++++ include/ck_tile/host.hpp | 2 +- ...rm2d.hpp => reference_layernorm2d_fwd.hpp} | 0 include/ck_tile/ops/layernorm2d.hpp | 7 +- 
.../kernel/layernorm2d_fwd_kernel.hpp | 499 ++++++------------ .../kernel/layernorm2d_fwd_shape.hpp | 78 +++ .../block_layernorm2d_fwd_problem.hpp | 34 -- ...ayernorm2d_fwd_pipeline_default_policy.hpp | 99 ++++ .../layernorm2d_fwd_pipeline_one_pass.hpp | 119 +++++ .../layernorm2d_fwd_pipeline_problem.hpp | 40 ++ .../layernorm2d_fwd_pipeline_two_pass.hpp | 160 ++++++ .../pipeline/tile_layernorm2d_fwd_shape.hpp | 35 -- .../ck_tile/ops/reduce/block/block_reduce.hpp | 2 +- include/ck_tile/ops/welford.hpp | 3 +- .../ops/welford/block/block_welford.hpp | 362 +++++++++++++ .../welford/block/block_welford_problem.hpp | 18 + .../ops/welford/thread/thread_welford.hpp | 113 +--- .../ck_tile/ops/welford/warp/warp_welford.hpp | 154 ------ 59 files changed, 2916 insertions(+), 1041 deletions(-) mode change 100755 => 100644 example/66_complex_contraction_bilinear/run_complex_contraction_bilinear_example.inc create mode 100644 example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_api.cpp create mode 100644 example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n1024_instance.cpp create mode 100644 example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n1536_instance.cpp create mode 100644 example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n2048_instance.cpp create mode 100644 example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n256_instance.cpp create mode 100644 example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n3072_instance.cpp create mode 100644 example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n4096_instance.cpp create mode 100644 example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n4096_tp_instance.cpp create mode 100644 example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n512_instance.cpp create mode 100644 example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n64_n128_instance.cpp create mode 100644 example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n768_instance.cpp create 
mode 100644 example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n1024_instance.cpp create mode 100644 example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n1536_instance.cpp create mode 100644 example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n2048_instance.cpp create mode 100644 example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n256_instance.cpp create mode 100644 example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n3072_instance.cpp create mode 100644 example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n4096_instance.cpp create mode 100644 example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n4096_tp_instance.cpp create mode 100644 example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n512_instance.cpp create mode 100644 example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n64_n128_instance.cpp create mode 100644 example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n768_instance.cpp create mode 100644 example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_instance_common.hpp create mode 100755 example/ck_tile/02_layernorm2d/script/perf_test.sh create mode 100755 example/ck_tile/02_layernorm2d/script/smoke_test.sh create mode 100644 example/ck_tile/05_reduce/CMakeLists.txt create mode 100644 example/ck_tile/05_reduce/reduce.cpp create mode 100644 example/ck_tile/05_reduce/reduce.hpp create mode 100644 include/ck_tile/core/utility/functional_with_tuple.hpp rename include/ck_tile/host/reference/{reference_layernorm2d.hpp => reference_layernorm2d_fwd.hpp} (100%) create mode 100644 include/ck_tile/ops/layernorm2d/kernel/layernorm2d_fwd_shape.hpp delete mode 100644 include/ck_tile/ops/layernorm2d/pipeline/block_layernorm2d_fwd_problem.hpp create mode 100644 include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_default_policy.hpp create mode 100644 include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_one_pass.hpp create mode 100644 
include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_problem.hpp create mode 100644 include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_two_pass.hpp delete mode 100644 include/ck_tile/ops/layernorm2d/pipeline/tile_layernorm2d_fwd_shape.hpp create mode 100644 include/ck_tile/ops/welford/block/block_welford.hpp create mode 100644 include/ck_tile/ops/welford/block/block_welford_problem.hpp delete mode 100644 include/ck_tile/ops/welford/warp/warp_welford.hpp diff --git a/example/66_complex_contraction_bilinear/run_complex_contraction_bilinear_example.inc b/example/66_complex_contraction_bilinear/run_complex_contraction_bilinear_example.inc old mode 100755 new mode 100644 index b54842754..82ac0a15e --- a/example/66_complex_contraction_bilinear/run_complex_contraction_bilinear_example.inc +++ b/example/66_complex_contraction_bilinear/run_complex_contraction_bilinear_example.inc @@ -127,44 +127,47 @@ int run_complex_contraction_bilinear_example(int argc, char* argv[]) switch(init_method) { - case 0: break; - case 1: + case 0: break; + case 1: - a_ms_ks_re.GenerateTensorValue(GeneratorTensor_2{-5, 5}); - b_ns_ks_re.GenerateTensorValue(GeneratorTensor_2{-5, 5}); - d_ms_ns_re.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + a_ms_ks_re.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + b_ns_ks_re.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + d_ms_ns_re.GenerateTensorValue(GeneratorTensor_2{-5, 5}); - a_ms_ks_img.GenerateTensorValue(GeneratorTensor_2{-5, 5}); - b_ns_ks_img.GenerateTensorValue(GeneratorTensor_2{-5, 5}); - d_ms_ns_img.GenerateTensorValue(GeneratorTensor_2{-5, 5}); - break; + a_ms_ks_img.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + b_ns_ks_img.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + d_ms_ns_img.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + break; - default: - a_ms_ks_re.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); - b_ns_ks_re.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); - 
d_ms_ns_re.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + default: + a_ms_ks_re.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + b_ns_ks_re.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + d_ms_ns_re.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); - a_ms_ks_img.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); - b_ns_ks_img.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); - d_ms_ns_img.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + a_ms_ks_img.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + b_ns_ks_img.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + d_ms_ns_img.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); - break; + break; } DeviceMem a_device_buf_re(sizeof(ADataType) * a_ms_ks_re.mDesc.GetElementSpaceSize()); DeviceMem b_device_buf_re(sizeof(BDataType) * b_ns_ks_re.mDesc.GetElementSpaceSize()); DeviceMem d_device_buf_re(sizeof(DDataType) * d_ms_ns_re.mDesc.GetElementSpaceSize()); - DeviceMem e_device_buf_re(sizeof(EDataType) * e_ms_ns_device_result_re.mDesc.GetElementSpaceSize()); + DeviceMem e_device_buf_re(sizeof(EDataType) * + e_ms_ns_device_result_re.mDesc.GetElementSpaceSize()); DeviceMem a_device_buf_img(sizeof(ADataType) * a_ms_ks_img.mDesc.GetElementSpaceSize()); DeviceMem b_device_buf_img(sizeof(BDataType) * b_ns_ks_img.mDesc.GetElementSpaceSize()); DeviceMem d_device_buf_img(sizeof(DDataType) * d_ms_ns_img.mDesc.GetElementSpaceSize()); - DeviceMem e_device_buf_img(sizeof(EDataType) * e_ms_ns_device_result_img.mDesc.GetElementSpaceSize()); + DeviceMem e_device_buf_img(sizeof(EDataType) * + e_ms_ns_device_result_img.mDesc.GetElementSpaceSize()); // Intermediate Value For E Real and Img - DeviceMem e_device_buf_re1(sizeof(EDataType) * e_ms_ns_device_result_re.mDesc.GetElementSpaceSize()); - DeviceMem e_device_buf_img1(sizeof(EDataType) * e_ms_ns_device_result_img.mDesc.GetElementSpaceSize()); - + DeviceMem e_device_buf_re1(sizeof(EDataType) * + e_ms_ns_device_result_re.mDesc.GetElementSpaceSize()); + DeviceMem 
e_device_buf_img1(sizeof(EDataType) * + e_ms_ns_device_result_img.mDesc.GetElementSpaceSize()); a_device_buf_re.ToDevice(a_ms_ks_re.mData.data()); b_device_buf_re.ToDevice(b_ns_ks_re.mData.data()); @@ -181,7 +184,7 @@ int run_complex_contraction_bilinear_example(int argc, char* argv[]) // set zero for intermediate values e_device_buf_re1.SetZero(); e_device_buf_img1.SetZero(); - + auto a_element_op = AElementOp{}; auto b_element_op = BElementOp{}; auto cde_element_op = CDEElementOp{alpha, beta}; @@ -189,23 +192,24 @@ int run_complex_contraction_bilinear_example(int argc, char* argv[]) // device operation // For real Intermediate Value re_1 - auto op = DeviceOpInstance{}; - auto invoker = op.MakeInvoker(); - auto argument_re1 = op.MakeArgument(a_device_buf_re.GetDeviceBuffer(), - b_device_buf_re.GetDeviceBuffer(), - std::array{d_device_buf_re.GetDeviceBuffer()}, - e_device_buf_re1.GetDeviceBuffer(), - a_ms_ks_lengths, - a_ms_ks_strides, - b_ns_ks_lengths, - b_ns_ks_strides, - std::array, 1>{d_ms_ns_lengths}, - std::array, 1>{d_ms_ns_strides}, - e_ms_ns_lengths, - e_ms_ns_strides, - a_element_op, - b_element_op, - cde_element_op); + auto op = DeviceOpInstance{}; + auto invoker = op.MakeInvoker(); + auto argument_re1 = + op.MakeArgument(a_device_buf_re.GetDeviceBuffer(), + b_device_buf_re.GetDeviceBuffer(), + std::array{d_device_buf_re.GetDeviceBuffer()}, + e_device_buf_re1.GetDeviceBuffer(), + a_ms_ks_lengths, + a_ms_ks_strides, + b_ns_ks_lengths, + b_ns_ks_strides, + std::array, 1>{d_ms_ns_lengths}, + std::array, 1>{d_ms_ns_strides}, + e_ms_ns_lengths, + e_ms_ns_strides, + a_element_op, + b_element_op, + cde_element_op); if(!op.IsSupportedArgument(argument_re1)) { @@ -216,7 +220,6 @@ int run_complex_contraction_bilinear_example(int argc, char* argv[]) float ave_time_re1 = invoker.Run(argument_re1, StreamConfig{nullptr, time_kernel}); - alpha = -1.f; beta = 1.f; @@ -228,21 +231,22 @@ int run_complex_contraction_bilinear_example(int argc, char* argv[]) // For real 
Intermediate Value re_2 // auto op = DeviceOpInstance{}; // auto invoker = op.MakeInvoker(); - auto argument_re2 = op.MakeArgument(a_device_buf_img.GetDeviceBuffer(), - b_device_buf_img.GetDeviceBuffer(), - std::array{e_device_buf_re1.GetDeviceBuffer()}, - e_device_buf_re.GetDeviceBuffer(), - a_ms_ks_lengths, - a_ms_ks_strides, - b_ns_ks_lengths, - b_ns_ks_strides, - std::array, 1>{d_ms_ns_lengths}, - std::array, 1>{d_ms_ns_strides}, - e_ms_ns_lengths, - e_ms_ns_strides, - a_element_op, - b_element_op, - cde_element_op); + auto argument_re2 = + op.MakeArgument(a_device_buf_img.GetDeviceBuffer(), + b_device_buf_img.GetDeviceBuffer(), + std::array{e_device_buf_re1.GetDeviceBuffer()}, + e_device_buf_re.GetDeviceBuffer(), + a_ms_ks_lengths, + a_ms_ks_strides, + b_ns_ks_lengths, + b_ns_ks_strides, + std::array, 1>{d_ms_ns_lengths}, + std::array, 1>{d_ms_ns_strides}, + e_ms_ns_lengths, + e_ms_ns_strides, + a_element_op, + b_element_op, + cde_element_op); if(!op.IsSupportedArgument(argument_re2)) { @@ -253,7 +257,6 @@ int run_complex_contraction_bilinear_example(int argc, char* argv[]) float ave_time_re2 = invoker.Run(argument_re2, StreamConfig{nullptr, time_kernel}); - alpha = 1.f; beta = 1.f; @@ -261,22 +264,22 @@ int run_complex_contraction_bilinear_example(int argc, char* argv[]) b_element_op = BElementOp{}; cde_element_op = CDEElementOp{alpha, beta}; - auto argument_img1 = op.MakeArgument(a_device_buf_re.GetDeviceBuffer(), - b_device_buf_img.GetDeviceBuffer(), - std::array{d_device_buf_img.GetDeviceBuffer()}, - e_device_buf_img1.GetDeviceBuffer(), - a_ms_ks_lengths, - a_ms_ks_strides, - b_ns_ks_lengths, - b_ns_ks_strides, - std::array, 1>{d_ms_ns_lengths}, - std::array, 1>{d_ms_ns_strides}, - e_ms_ns_lengths, - e_ms_ns_strides, - a_element_op, - b_element_op, - cde_element_op); - + auto argument_img1 = + op.MakeArgument(a_device_buf_re.GetDeviceBuffer(), + b_device_buf_img.GetDeviceBuffer(), + std::array{d_device_buf_img.GetDeviceBuffer()}, + 
e_device_buf_img1.GetDeviceBuffer(), + a_ms_ks_lengths, + a_ms_ks_strides, + b_ns_ks_lengths, + b_ns_ks_strides, + std::array, 1>{d_ms_ns_lengths}, + std::array, 1>{d_ms_ns_strides}, + e_ms_ns_lengths, + e_ms_ns_strides, + a_element_op, + b_element_op, + cde_element_op); if(!op.IsSupportedArgument(argument_img1)) { @@ -290,23 +293,22 @@ int run_complex_contraction_bilinear_example(int argc, char* argv[]) alpha = 1.f; beta = 1.f; - auto argument_img2 = op.MakeArgument(a_device_buf_img.GetDeviceBuffer(), - b_device_buf_re.GetDeviceBuffer(), - std::array{e_device_buf_img1.GetDeviceBuffer()}, - e_device_buf_img.GetDeviceBuffer(), - a_ms_ks_lengths, - a_ms_ks_strides, - b_ns_ks_lengths, - b_ns_ks_strides, - std::array, 1>{d_ms_ns_lengths}, - std::array, 1>{d_ms_ns_strides}, - e_ms_ns_lengths, - e_ms_ns_strides, - a_element_op, - b_element_op, - cde_element_op); - - + auto argument_img2 = + op.MakeArgument(a_device_buf_img.GetDeviceBuffer(), + b_device_buf_re.GetDeviceBuffer(), + std::array{e_device_buf_img1.GetDeviceBuffer()}, + e_device_buf_img.GetDeviceBuffer(), + a_ms_ks_lengths, + a_ms_ks_strides, + b_ns_ks_lengths, + b_ns_ks_strides, + std::array, 1>{d_ms_ns_lengths}, + std::array, 1>{d_ms_ns_strides}, + e_ms_ns_lengths, + e_ms_ns_strides, + a_element_op, + b_element_op, + cde_element_op); if(!op.IsSupportedArgument(argument_img2)) { @@ -317,7 +319,6 @@ int run_complex_contraction_bilinear_example(int argc, char* argv[]) float ave_time_img2 = invoker.Run(argument_img2, StreamConfig{nullptr, time_kernel}); - ck::index_t M = ck::accumulate_n(e_ms_ns_lengths.begin(), NumDimM, 1, std::multiplies<>{}); @@ -331,9 +332,9 @@ int run_complex_contraction_bilinear_example(int argc, char* argv[]) std::size_t num_btype = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(DDataType) * M * N + sizeof(EDataType) * M * N * 2; - float ave_time = ave_time_img2 + ave_time_img1 + ave_time_re2 + ave_time_re1 ; + float ave_time = ave_time_img2 + ave_time_img1 + ave_time_re2 + 
ave_time_re1; - float tflops = static_cast(flop) / 1.E9 / ave_time; + float tflops = static_cast(flop) / 1.E9 / ave_time; float gb_per_sec = num_btype / 1.E6 / ave_time; std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, " @@ -343,7 +344,7 @@ int run_complex_contraction_bilinear_example(int argc, char* argv[]) e_device_buf_img.FromDevice(e_ms_ns_device_result_img.mData.data()); auto isRealOk = 0; - auto isImgOk = 0; + auto isImgOk = 0; if(do_verification) { @@ -366,17 +367,16 @@ int run_complex_contraction_bilinear_example(int argc, char* argv[]) auto ref_op = ReferenceOpInstance{}; auto ref_invoker = ref_op.MakeInvoker(); - auto ref_argument_re = - ref_op.MakeArgument(a_ms_ks_re, b_ns_ks_re, c_ms_ns_host_result_re, a_element_op, b_element_op); + auto ref_argument_re = ref_op.MakeArgument( + a_ms_ks_re, b_ns_ks_re, c_ms_ns_host_result_re, a_element_op, b_element_op); ref_invoker.Run(ref_argument_re); alpha = 1.f; beta = 1.f; - + cde_element_op = CDEElementOp{alpha, beta}; - for(size_t m0 = 0; m0 < e_ms_ns_host_result_re.mDesc.GetLengths()[0]; ++m0) { for(size_t m1 = 0; m1 < e_ms_ns_host_result_re.mDesc.GetLengths()[1]; ++m1) @@ -395,11 +395,11 @@ int run_complex_contraction_bilinear_example(int argc, char* argv[]) alpha = 1.f; beta = -1.f; - + cde_element_op = CDEElementOp{alpha, beta}; - auto ref_argument_re1 = - ref_op.MakeArgument(a_ms_ks_img, b_ns_ks_img, c_ms_ns_host_result_re1, a_element_op, b_element_op); + auto ref_argument_re1 = ref_op.MakeArgument( + a_ms_ks_img, b_ns_ks_img, c_ms_ns_host_result_re1, a_element_op, b_element_op); ref_invoker.Run(ref_argument_re1); @@ -419,23 +419,20 @@ int run_complex_contraction_bilinear_example(int argc, char* argv[]) } } - isRealOk = ck::utils::check_err(e_ms_ns_device_result_re, e_ms_ns_host_result_re) ? 0 : 1; - - - + isRealOk = ck::utils::check_err(e_ms_ns_device_result_re, e_ms_ns_host_result_re) ? 
0 : 1; // Img Part Verification Tensor c_ms_ns_host_result_img(e_ms_ns_lengths, e_ms_ns_strides); Tensor c_ms_ns_host_result_img1(e_ms_ns_lengths, e_ms_ns_strides); - auto ref_argument_img = - ref_op.MakeArgument(a_ms_ks_re, b_ns_ks_img, c_ms_ns_host_result_img, a_element_op, b_element_op); - + auto ref_argument_img = ref_op.MakeArgument( + a_ms_ks_re, b_ns_ks_img, c_ms_ns_host_result_img, a_element_op, b_element_op); + ref_invoker.Run(ref_argument_img); alpha = 1.f; beta = 1.f; - + cde_element_op = CDEElementOp{alpha, beta}; for(size_t m0 = 0; m0 < e_ms_ns_host_result_img.mDesc.GetLengths()[0]; ++m0) @@ -454,9 +451,9 @@ int run_complex_contraction_bilinear_example(int argc, char* argv[]) } } - auto ref_argument_img1 = - ref_op.MakeArgument(a_ms_ks_img, b_ns_ks_re, c_ms_ns_host_result_img1, a_element_op, b_element_op); - + auto ref_argument_img1 = ref_op.MakeArgument( + a_ms_ks_img, b_ns_ks_re, c_ms_ns_host_result_img1, a_element_op, b_element_op); + ref_invoker.Run(ref_argument_img1); for(size_t m0 = 0; m0 < e_ms_ns_host_result_img.mDesc.GetLengths()[0]; ++m0) @@ -475,7 +472,7 @@ int run_complex_contraction_bilinear_example(int argc, char* argv[]) } } - isImgOk = ck::utils::check_err(e_ms_ns_device_result_re, e_ms_ns_host_result_re) ? 0 : 1; + isImgOk = ck::utils::check_err(e_ms_ns_device_result_re, e_ms_ns_host_result_re) ? 
0 : 1; return (isRealOk && isImgOk); } diff --git a/example/ck_tile/02_layernorm2d/CMakeLists.txt b/example/ck_tile/02_layernorm2d/CMakeLists.txt index bac5f45cd..feae5f791 100644 --- a/example/ck_tile/02_layernorm2d/CMakeLists.txt +++ b/example/ck_tile/02_layernorm2d/CMakeLists.txt @@ -1,4 +1,21 @@ +set(EXAMPLE_LAYERNORM2D_FWD "tile_example_layernorm2d_fwd") # not using add_example_executable() to add this target, since we don't want this to have # to be included in "make all/install/check" -add_executable(tile_example_layernorm2d_fwd EXCLUDE_FROM_ALL layernorm2d_fwd.cpp) -target_compile_options(tile_example_layernorm2d_fwd PRIVATE -DSAVE_MEAN_INV_STD) \ No newline at end of file +message("adding example ${EXAMPLE_LAYERNORM2D_FWD}") +file(GLOB INSTANCE_SRCS instances/*.cpp) +add_executable(${EXAMPLE_LAYERNORM2D_FWD} EXCLUDE_FROM_ALL layernorm2d_fwd.cpp) +target_include_directories(${EXAMPLE_LAYERNORM2D_FWD} PRIVATE ${CMAKE_CURRENT_LIST_DIR}) +target_sources(${EXAMPLE_LAYERNORM2D_FWD} PRIVATE ${INSTANCE_SRCS}) + +set(EXAMPLE_LAYERNORM2D_FWD_COMPILE_OPTIONS) + +# NOTE: we turn off undefined-func-template to let source compile without explicit declare function specializations +list(APPEND EXAMPLE_LAYERNORM2D_FWD_COMPILE_OPTIONS -Wno-undefined-func-template -Wno-float-equal) + +target_compile_options(${EXAMPLE_LAYERNORM2D_FWD} PRIVATE ${EXAMPLE_LAYERNORM2D_FWD_COMPILE_OPTIONS}) + +# TODO: we have to turn off this global prop, otherwise the progress bar generated +# by cmake will print too many files, execvp: /bin/sh: Argument list too long +# however, this property may affect global +# TODO: consider codegen a makefile by us +set_property(GLOBAL PROPERTY RULE_MESSAGES OFF) diff --git a/example/ck_tile/02_layernorm2d/README.md b/example/ck_tile/02_layernorm2d/README.md index 66b16c1b7..405325a2a 100644 --- a/example/ck_tile/02_layernorm2d/README.md +++ b/example/ck_tile/02_layernorm2d/README.md @@ -6,8 +6,7 @@ This folder contains example for Layernorm2D forward using 
ck_tile tile-programm ``` # in the root of ck_tile mkdir build && cd build -# you can replace with the appropriate architecture (for example gfx90a or gfx942) or leave it blank -sh ../script/cmake-ck-dev.sh ../ +sh ../script/cmake-ck-dev.sh ../ # you can replace this to gfx90a, gfx942... make tile_example_layernorm2d_fwd -j ``` This will result in an executable `build/bin/tile_example_layernorm2d_fwd` @@ -20,4 +19,4 @@ args: -e epsilon (default:1e-5) -v cpu validation or not (default:1) -prec precision (default:fp16) -``` \ No newline at end of file +``` diff --git a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_api.cpp b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_api.cpp new file mode 100644 index 000000000..f2f51de5d --- /dev/null +++ b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_api.cpp @@ -0,0 +1,155 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include "layernorm2d_fwd.hpp" + +template +using trait_ = layernorm2d_fwd_traits_; + +template +float layernorm2d_fwd_b16_(layernorm2d_fwd_traits /*t*/, + layernorm2d_fwd_args a, + const ck_tile::stream_config& s) +{ +#if 1 + float r = -1; + // clang-format off + // rm rn tm tn vn pd mv 2p + if(a.n <= 64) { + r = layernorm2d_fwd_>(s, a); + } + else if(a.n <= 128) { + if (a.n % 2 == 0) + r = layernorm2d_fwd_>(s, a); + else + r = layernorm2d_fwd_>(s, a); + } + else if(a.n <= 256) { + if (a.n % 4 == 0) + r = layernorm2d_fwd_>(s, a); + else if (a.n % 2 == 0) + r = layernorm2d_fwd_>(s, a); + else + r = layernorm2d_fwd_>(s, a); + } + else if(a.n <= 512) { + if (a.n % 8 == 0) + r = layernorm2d_fwd_>(s, a); + else if (a.n % 4 == 0) + r = layernorm2d_fwd_>(s, a); + else if (a.n % 2 == 0) + r = layernorm2d_fwd_>(s, a); + else + r = layernorm2d_fwd_>(s, a); + } + else if(a.n <= 768) { + if (a.n % 4 == 0) + r = layernorm2d_fwd_>(s, a); + else if (a.n % 2 == 0) + r = layernorm2d_fwd_>(s, a); + else + r = 
layernorm2d_fwd_>(s, a); + } + else if(a.n <= 1024) { + if (a.n % 8 == 0) + r = layernorm2d_fwd_>(s, a); + else if (a.n % 4 == 0) + r = layernorm2d_fwd_>(s, a); + else if (a.n % 2 == 0) + r = layernorm2d_fwd_>(s, a); + else + r = layernorm2d_fwd_>(s, a); + } + else if(a.n <= 1536) { + if (a.n % 8 == 0) + r = layernorm2d_fwd_>(s, a); + else if (a.n % 4 == 0) + r = layernorm2d_fwd_>(s, a); + else if (a.n % 2 == 0) + r = layernorm2d_fwd_>(s, a); + else + r = layernorm2d_fwd_>(s, a); + } + else if(a.n <= 2048) { + if (a.n % 8 == 0) + r = layernorm2d_fwd_>(s, a); + else if (a.n % 4 == 0) + r = layernorm2d_fwd_>(s, a); + else if (a.n % 2 == 0) + r = layernorm2d_fwd_>(s, a); + else + r = layernorm2d_fwd_>(s, a); + } + else if(a.n <= 3072) { + if (a.n % 8 == 0) + r = layernorm2d_fwd_>(s, a); + else if (a.n % 4 == 0) + r = layernorm2d_fwd_>(s, a); + else if (a.n % 2 == 0) + r = layernorm2d_fwd_>(s, a); + else + r = layernorm2d_fwd_>(s, a); + } + else if(a.n <= 4096) { + if (a.n % 8 == 0) + r = layernorm2d_fwd_>(s, a); + else if (a.n % 4 == 0) + r = layernorm2d_fwd_>(s, a); + else if (a.n % 2 == 0) + r = layernorm2d_fwd_>(s, a); + else + r = layernorm2d_fwd_>(s, a); + } + else if(a.n > 4096) { + if (a.n % 8 == 0) + r = layernorm2d_fwd_>(s, a); + else if (a.n % 4 == 0) + r = layernorm2d_fwd_>(s, a); + else if (a.n % 2 == 0) + r = layernorm2d_fwd_>(s, a); + else + r = layernorm2d_fwd_>(s, a); + } + return r; +#else + return layernorm2d_fwd_>(s, a); +#endif + // clang-format on +} + +float layernorm2d_fwd(layernorm2d_fwd_traits t, + layernorm2d_fwd_args a, + const ck_tile::stream_config& s) +{ + + float r = -1; + if(t.data_type.compare("fp16") == 0) + { + return layernorm2d_fwd_b16_(t, a, s); + } + else if(t.data_type.compare("bf16") == 0) + { + return layernorm2d_fwd_b16_(t, a, s); + } + if(r < 0) + throw std::runtime_error("Without supported instances!"); + + return r; +} diff --git a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n1024_instance.cpp 
b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n1024_instance.cpp new file mode 100644 index 000000000..2a20d1e05 --- /dev/null +++ b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n1024_instance.cpp @@ -0,0 +1,22 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "layernorm2d_fwd_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd mv 2p +#if 0 +template float layernorm2d_fwd_>(const S&, A); +template float layernorm2d_fwd_>(const S&, A); +template float layernorm2d_fwd_>(const S&, A); +template float layernorm2d_fwd_>(const S&, A); + +template float layernorm2d_fwd_>(const S&, A); +#endif + +template float layernorm2d_fwd_>(const S&, A); +template float layernorm2d_fwd_>(const S&, A); +template float layernorm2d_fwd_>(const S&, A); +template float layernorm2d_fwd_>(const S&, A); +// clang-format on diff --git a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n1536_instance.cpp b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n1536_instance.cpp new file mode 100644 index 000000000..d043efc86 --- /dev/null +++ b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n1536_instance.cpp @@ -0,0 +1,13 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "layernorm2d_fwd_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd mv 2p +template float layernorm2d_fwd_>(const S&, A); +template float layernorm2d_fwd_>(const S&, A); +template float layernorm2d_fwd_>(const S&, A); +template float layernorm2d_fwd_>(const S&, A); +// clang-format on diff --git a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n2048_instance.cpp b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n2048_instance.cpp new file mode 100644 index 000000000..a6ffc8cd2 --- /dev/null +++ b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n2048_instance.cpp @@ -0,0 +1,14 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "layernorm2d_fwd_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd mv 2p +template float layernorm2d_fwd_>(const S&, A); +template float layernorm2d_fwd_>(const S&, A); +template float layernorm2d_fwd_>(const S&, A); +template float layernorm2d_fwd_>(const S&, A); + +// clang-format on diff --git a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n256_instance.cpp b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n256_instance.cpp new file mode 100644 index 000000000..80beeca67 --- /dev/null +++ b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n256_instance.cpp @@ -0,0 +1,12 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "layernorm2d_fwd_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd mv 2p +template float layernorm2d_fwd_>(const S&, A); +template float layernorm2d_fwd_>(const S&, A); +template float layernorm2d_fwd_>(const S&, A); +// clang-format on diff --git a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n3072_instance.cpp b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n3072_instance.cpp new file mode 100644 index 000000000..b362a550a --- /dev/null +++ b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n3072_instance.cpp @@ -0,0 +1,14 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "layernorm2d_fwd_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd mv 2p +template float layernorm2d_fwd_>(const S&, A); +template float layernorm2d_fwd_>(const S&, A); +template float layernorm2d_fwd_>(const S&, A); +template float layernorm2d_fwd_>(const S&, A); + +// clang-format on diff --git a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n4096_instance.cpp b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n4096_instance.cpp new file mode 100644 index 000000000..9c2d78999 --- /dev/null +++ b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n4096_instance.cpp @@ -0,0 +1,14 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "layernorm2d_fwd_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd mv 2p +template float layernorm2d_fwd_>(const S&, A); +template float layernorm2d_fwd_>(const S&, A); +template float layernorm2d_fwd_>(const S&, A); +template float layernorm2d_fwd_>(const S&, A); + +// clang-format on diff --git a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n4096_tp_instance.cpp b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n4096_tp_instance.cpp new file mode 100644 index 000000000..c0c75f878 --- /dev/null +++ b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n4096_tp_instance.cpp @@ -0,0 +1,14 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "layernorm2d_fwd_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd mv 2p +template float layernorm2d_fwd_>(const S&, A); +template float layernorm2d_fwd_>(const S&, A); +template float layernorm2d_fwd_>(const S&, A); +template float layernorm2d_fwd_>(const S&, A); + +// clang-format on diff --git a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n512_instance.cpp b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n512_instance.cpp new file mode 100644 index 000000000..1bcd0f8a7 --- /dev/null +++ b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n512_instance.cpp @@ -0,0 +1,13 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "layernorm2d_fwd_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd mv 2p +template float layernorm2d_fwd_>(const S&, A); +template float layernorm2d_fwd_>(const S&, A); +template float layernorm2d_fwd_>(const S&, A); +template float layernorm2d_fwd_>(const S&, A); +// clang-format on diff --git a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n64_n128_instance.cpp b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n64_n128_instance.cpp new file mode 100644 index 000000000..6b25fce8c --- /dev/null +++ b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n64_n128_instance.cpp @@ -0,0 +1,12 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "layernorm2d_fwd_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd mv 2p +template float layernorm2d_fwd_>(const S&, A); +template float layernorm2d_fwd_>(const S&, A); +template float layernorm2d_fwd_>(const S&, A); +// clang-format on diff --git a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n768_instance.cpp b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n768_instance.cpp new file mode 100644 index 000000000..c4400f0f2 --- /dev/null +++ b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n768_instance.cpp @@ -0,0 +1,12 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "layernorm2d_fwd_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd mv 2p +template float layernorm2d_fwd_>(const S&, A); +template float layernorm2d_fwd_>(const S&, A); +template float layernorm2d_fwd_>(const S&, A); +// clang-format on diff --git a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n1024_instance.cpp b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n1024_instance.cpp new file mode 100644 index 000000000..7f0e4898c --- /dev/null +++ b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n1024_instance.cpp @@ -0,0 +1,22 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "layernorm2d_fwd_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd mv 2p +#if 0 +template float layernorm2d_fwd_>(const S&, A); +template float layernorm2d_fwd_>(const S&, A); +template float layernorm2d_fwd_>(const S&, A); +template float layernorm2d_fwd_>(const S&, A); + +template float layernorm2d_fwd_>(const S&, A); +#endif + +template float layernorm2d_fwd_>(const S&, A); +template float layernorm2d_fwd_>(const S&, A); +template float layernorm2d_fwd_>(const S&, A); +template float layernorm2d_fwd_>(const S&, A); +// clang-format on diff --git a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n1536_instance.cpp b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n1536_instance.cpp new file mode 100644 index 000000000..8c3a42cc4 --- /dev/null +++ b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n1536_instance.cpp @@ -0,0 +1,13 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "layernorm2d_fwd_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd mv 2p +template float layernorm2d_fwd_>(const S&, A); +template float layernorm2d_fwd_>(const S&, A); +template float layernorm2d_fwd_>(const S&, A); +template float layernorm2d_fwd_>(const S&, A); +// clang-format on diff --git a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n2048_instance.cpp b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n2048_instance.cpp new file mode 100644 index 000000000..04d8bc153 --- /dev/null +++ b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n2048_instance.cpp @@ -0,0 +1,14 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "layernorm2d_fwd_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd mv 2p +template float layernorm2d_fwd_>(const S&, A); +template float layernorm2d_fwd_>(const S&, A); +template float layernorm2d_fwd_>(const S&, A); +template float layernorm2d_fwd_>(const S&, A); + +// clang-format on diff --git a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n256_instance.cpp b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n256_instance.cpp new file mode 100644 index 000000000..c32574749 --- /dev/null +++ b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n256_instance.cpp @@ -0,0 +1,12 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "layernorm2d_fwd_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd mv 2p +template float layernorm2d_fwd_>(const S&, A); +template float layernorm2d_fwd_>(const S&, A); +template float layernorm2d_fwd_>(const S&, A); +// clang-format on diff --git a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n3072_instance.cpp b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n3072_instance.cpp new file mode 100644 index 000000000..c71db57a6 --- /dev/null +++ b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n3072_instance.cpp @@ -0,0 +1,14 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "layernorm2d_fwd_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd mv 2p +template float layernorm2d_fwd_>(const S&, A); +template float layernorm2d_fwd_>(const S&, A); +template float layernorm2d_fwd_>(const S&, A); +template float layernorm2d_fwd_>(const S&, A); + +// clang-format on diff --git a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n4096_instance.cpp b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n4096_instance.cpp new file mode 100644 index 000000000..f3ca0932e --- /dev/null +++ b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n4096_instance.cpp @@ -0,0 +1,14 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "layernorm2d_fwd_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd mv 2p +template float layernorm2d_fwd_>(const S&, A); +template float layernorm2d_fwd_>(const S&, A); +template float layernorm2d_fwd_>(const S&, A); +template float layernorm2d_fwd_>(const S&, A); + +// clang-format on diff --git a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n4096_tp_instance.cpp b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n4096_tp_instance.cpp new file mode 100644 index 000000000..242f1d2dd --- /dev/null +++ b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n4096_tp_instance.cpp @@ -0,0 +1,14 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "layernorm2d_fwd_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd mv 2p +template float layernorm2d_fwd_>(const S&, A); +template float layernorm2d_fwd_>(const S&, A); +template float layernorm2d_fwd_>(const S&, A); +template float layernorm2d_fwd_>(const S&, A); + +// clang-format on diff --git a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n512_instance.cpp b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n512_instance.cpp new file mode 100644 index 000000000..e3bfa8e3a --- /dev/null +++ b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n512_instance.cpp @@ -0,0 +1,13 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "layernorm2d_fwd_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd mv 2p +template float layernorm2d_fwd_>(const S&, A); +template float layernorm2d_fwd_>(const S&, A); +template float layernorm2d_fwd_>(const S&, A); +template float layernorm2d_fwd_>(const S&, A); +// clang-format on diff --git a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n64_n128_instance.cpp b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n64_n128_instance.cpp new file mode 100644 index 000000000..90d960cf0 --- /dev/null +++ b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n64_n128_instance.cpp @@ -0,0 +1,12 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "layernorm2d_fwd_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd mv 2p +template float layernorm2d_fwd_>(const S&, A); +template float layernorm2d_fwd_>(const S&, A); +template float layernorm2d_fwd_>(const S&, A); +// clang-format on diff --git a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n768_instance.cpp b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n768_instance.cpp new file mode 100644 index 000000000..0960a95c3 --- /dev/null +++ b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n768_instance.cpp @@ -0,0 +1,12 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "layernorm2d_fwd_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd mv 2p +template float layernorm2d_fwd_>(const S&, A); +template float layernorm2d_fwd_>(const S&, A); +template float layernorm2d_fwd_>(const S&, A); +// clang-format on diff --git a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_instance_common.hpp b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_instance_common.hpp new file mode 100644 index 000000000..22895e8ed --- /dev/null +++ b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_instance_common.hpp @@ -0,0 +1,67 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include "layernorm2d_fwd.hpp" +#include + +#pragma once + +using S = ck_tile::stream_config; +using A = layernorm2d_fwd_args; + +template +using trait_ = layernorm2d_fwd_traits_; + +template +float layernorm2d_fwd_(const S& s, A a) +{ + using DataType = typename Traits_::DataType; + + using PipelineProblem = ck_tile::Layernorm2dFwdPipelineProblem< + typename LayerNormTypeConfig::XDataType, + typename LayerNormTypeConfig::GammaDataType, + typename LayerNormTypeConfig::BetaDataType, + typename LayerNormTypeConfig::ComputeDataType, + typename LayerNormTypeConfig::YDataType, + typename LayerNormTypeConfig::MeanDataType, + typename LayerNormTypeConfig::InvStdDataType, + typename Traits_::Shape, + Traits_::kPadN, + Traits_::kSaveMeanInvStd, + Traits_::kTwoPass>; + + using OnePassPipeline = ck_tile::Layernorm2dFwdPipelineOnePass; + using TwoPassPipeline = ck_tile::Layernorm2dFwdPipelineTwoPass; + using Pipeline = std::conditional_t; + + using Kernel = ck_tile::Layernorm2dFwd; + + const dim3 grids = Kernel::GridSize(a); + constexpr dim3 blocks = Kernel::BlockSize(); + constexpr ck_tile::index_t kBlockPerCu = 1; + + auto kargs = Kernel::MakeKargs(a); + if(s.log_level_ > 0) + std::cout << ", " << Kernel::GetName() << std::flush; + + return ck_tile::launch_kernel( + 
s, ck_tile::make_kernel(Kernel{}, grids, blocks, 0, kargs)); +} diff --git a/example/ck_tile/02_layernorm2d/layernorm2d_fwd.cpp b/example/ck_tile/02_layernorm2d/layernorm2d_fwd.cpp index 35f291e06..4f12d9103 100644 --- a/example/ck_tile/02_layernorm2d/layernorm2d_fwd.cpp +++ b/example/ck_tile/02_layernorm2d/layernorm2d_fwd.cpp @@ -2,161 +2,120 @@ #include "layernorm2d_fwd.hpp" #include -// Host API implementation -float layernorm2d_fwd(layernorm2d_fwd_traits t, - layernorm2d_fwd_args a, - const ck_tile::stream_config& s) +// different threshold for different dtype +template +auto get_elimit() { - if(t.data_type.compare("fp16") == 0) - { - using XDataType = ck_tile::half_t; - using YDataType = ck_tile::half_t; - using GammaDataType = ck_tile::half_t; - using BetaDataType = ck_tile::half_t; -#ifdef SAVE_MEAN_INV_STD - using MeanDataType = ck_tile::half_t; - using InvStdDataType = ck_tile::half_t; -#else - using MeanDataType = ck_tile::null_type; - using InvStdDataType = ck_tile::null_type; -#endif - using ComputeDataType = float; - - using thread_tile = ck_tile::sequence<4, 4>; - using warp_tile = ck_tile::sequence<8, 128>; - using block_tile = ck_tile::sequence<32, 128>; - - using Shape = ck_tile::TileLayernorm2dShape; - - using PipelineProblem = ck_tile::BlockLayernorm2dFwdProblem; - - using Kernel = ck_tile::Layernorm2dFwd; - - auto kargs = Kernel::MakeKargs( - a.p_x, a.p_gamma, a.p_beta, a.p_y, a.p_mean, a.p_invStd, a.epsilon, a.M, a.N); - - const dim3 grids = Kernel::GridSize(a.M); - constexpr dim3 blocks = Kernel::BlockSize(); - - constexpr ck_tile::index_t kBlockPerCu = Shape::kMWarpPerBlock * Shape::kNWarpPerBlock; - - float ave_time = ck_tile::launch_kernel( - s, ck_tile::make_kernel(Kernel{}, grids, blocks, 0, kargs)); - - return ave_time; - } + double rtol = 1e-2; + double atol = 1e-2; + return ck_tile::make_tuple(rtol, atol); +} - return 0; +template <> +auto get_elimit() +{ + double rtol = 1e-2; + double atol = 1e-2; + return ck_tile::make_tuple(rtol, 
atol); } auto create_args(int argc, char* argv[]) { ck_tile::ArgParser arg_parser; arg_parser.insert("m", "3328", "m dimension") - .insert("n", "4096", "m dimension") + .insert("n", "4096", "n dimension") + .insert("stride", "-1", "stride per row, if -1 then equal to n") .insert("e", "1e-5", "epsilon") + .insert("save_mv", "0", "save mean/variance(invstd) or not. set to 1 in training case") .insert("v", "1", "cpu validation or not") - .insert("prec", "fp16", "precision"); + .insert("kname", "1", "print kernel name or not") + .insert("prec", "fp16", "precision") + .insert("warmup", "5", "cold iter") + .insert("repeat", "20", "hot iter"); bool result = arg_parser.parse(argc, argv); return std::make_tuple(result, arg_parser); } -int main(int argc, char* argv[]) +template +bool run(const ck_tile::ArgParser& arg_parser) { - - auto [result, arg_parser] = create_args(argc, argv); - if(!result) - return -1; - + ck_tile::index_t m = arg_parser.get_int("m"); + ck_tile::index_t n = arg_parser.get_int("n"); + ck_tile::index_t stride = arg_parser.get_int("stride"); + if(stride < 0) + stride = n; float epsilon = arg_parser.get_float("e"); - ck_tile::index_t M = arg_parser.get_int("m"); - ck_tile::index_t N = arg_parser.get_int("n"); std::string data_type = arg_parser.get_str("prec"); + int kname = arg_parser.get_int("kname"); int do_validation = arg_parser.get_int("v"); + int warmup = arg_parser.get_int("warmup"); + int repeat = arg_parser.get_int("repeat"); - using XDataType = ck_tile::half_t; - using YDataType = ck_tile::half_t; - using GammaDataType = ck_tile::half_t; - using BetaDataType = ck_tile::half_t; -#ifdef SAVE_MEAN_INV_STD - using MeanDataType = ck_tile::half_t; - using InvStdDataType = ck_tile::half_t; -#else - using MeanDataType = ck_tile::null_type; - using InvStdDataType = ck_tile::null_type; -#endif - using ComputeDataType = float; + assert(stride >= n); - // host verify - ck_tile::HostTensor x_host({M, N}); - ck_tile::HostTensor gamma_host({N}); - 
ck_tile::HostTensor beta_host({N}); + using TypeConfig = LayerNormTypeConfig; + + using XDataType = typename TypeConfig::XDataType; + using YDataType = typename TypeConfig::YDataType; + using GammaDataType = typename TypeConfig::GammaDataType; + using BetaDataType = typename TypeConfig::BetaDataType; + + using MeanDataType = + std::conditional_t; + using InvStdDataType = + std::conditional_t; - ck_tile::HostTensor y_host_ref({M, N}); - ck_tile::HostTensor y_host_dev({M, N}); + using ComputeDataType = typename TypeConfig::ComputeDataType; - ck_tile::HostTensor mean_host_ref({M}); - ck_tile::HostTensor invStd_host_ref({M}); + // host verify + ck_tile::HostTensor x_host({m, n}, {stride, 1}); + ck_tile::HostTensor gamma_host({n}); + ck_tile::HostTensor beta_host({n}); + + ck_tile::HostTensor y_host_ref({m, n}, {stride, 1}); + ck_tile::HostTensor y_host_dev({m, n}, {stride, 1}); -#ifdef SAVE_MEAN_INV_STD - ck_tile::HostTensor mean_host_dev({M}); - ck_tile::HostTensor invStd_host_dev({M}); -#endif + ck_tile::HostTensor mean_host_ref({m}); + ck_tile::HostTensor invStd_host_ref({m}); - ck_tile::FillUniformDistribution{-5.f, 5.f}(x_host); - ck_tile::FillUniformDistribution{-5.f, 5.f}(gamma_host); - ck_tile::FillUniformDistribution{-5.f, 5.f}(beta_host); + ck_tile::FillUniformDistribution{-.5f, .5f}(x_host); + ck_tile::FillUniformDistribution{-.5f, .5f}(gamma_host); + ck_tile::FillUniformDistribution{-.5f, .5f}(beta_host); ck_tile::DeviceMem x_buf(x_host.get_element_space_size_in_bytes()); ck_tile::DeviceMem gamma_buf(gamma_host.get_element_space_size_in_bytes()); ck_tile::DeviceMem beta_buf(beta_host.get_element_space_size_in_bytes()); ck_tile::DeviceMem y_buf(y_host_dev.get_element_space_size_in_bytes()); -#ifdef SAVE_MEAN_INV_STD - ck_tile::DeviceMem mean_buf(mean_host_dev.get_element_space_size_in_bytes()); - ck_tile::DeviceMem invStd_buf(invStd_host_dev.get_element_space_size_in_bytes()); -#endif - x_buf.ToDevice(x_host.data()); gamma_buf.ToDevice(gamma_host.data()); 
beta_buf.ToDevice(beta_host.data()); - layernorm2d_fwd_traits traits{data_type}; + std::cout << "[" << data_type << "]" + << " m:" << m << ", n:" << n << ", stride:" << stride << std::flush; + + layernorm2d_fwd_traits traits{data_type, SaveMeanVar}; layernorm2d_fwd_args args{x_buf.GetDeviceBuffer(), gamma_buf.GetDeviceBuffer(), beta_buf.GetDeviceBuffer(), y_buf.GetDeviceBuffer(), -#ifdef SAVE_MEAN_INV_STD - mean_buf.GetDeviceBuffer(), - invStd_buf.GetDeviceBuffer(), -#else nullptr, nullptr, -#endif epsilon, - M, - N}; + m, + n, + stride}; - float ave_time = layernorm2d_fwd(traits, args, ck_tile::stream_config{nullptr, true}); + float ave_time = layernorm2d_fwd( + traits, args, ck_tile::stream_config{nullptr, true, kname ? 1 : 0, warmup, repeat}); - std::size_t num_byte = sizeof(XDataType) * M * N + sizeof(GammaDataType) * N + - sizeof(BetaDataType) * N + sizeof(YDataType) * M * N; + std::size_t num_byte = sizeof(XDataType) * m * n + sizeof(GammaDataType) * n + + sizeof(BetaDataType) * n + sizeof(YDataType) * m * n; float gb_per_sec = num_byte / 1.E6 / ave_time; - std::cout << "[" << data_type << "]" - << " m:" << M << ", n:" << N << ", " << ave_time << " ms, " << gb_per_sec << " GB/s" - << std::flush; + std::cout << ", " << ave_time * 1.E3 << " us, " << gb_per_sec << " GB/s" << std::flush; bool pass = true; @@ -174,20 +133,59 @@ int main(int argc, char* argv[]) y_buf.FromDevice(y_host_dev.data()); - pass = ck_tile::check_err(y_host_dev, y_host_ref); + auto [rtol, atol] = get_elimit(); + if(stride == n) + { + pass = ck_tile::check_err( + y_host_dev, y_host_ref, std::string("OUT Error: Incorrect results!"), rtol, atol); + } + else + { + for(int i_r = 0; i_r < m; i_r++) + { + std::vector y_host_dev_row(y_host_dev.begin() + i_r * stride, + y_host_dev.begin() + i_r * stride + n); + std::vector y_host_ref_row(y_host_ref.begin() + i_r * stride, + y_host_ref.begin() + i_r * stride + n); + pass &= ck_tile::check_err(y_host_dev_row, + y_host_ref_row, + std::string("OUT[") + 
std::to_string(i_r) + + std::string("] Error: Incorrect results!"), + rtol, + atol); + } + } + + std::cout << ", valid:" << (pass ? "y" : "n") << std::flush << std::endl; + } -#ifdef SAVE_MEAN_INV_STD - mean_buf.FromDevice(mean_host_dev.data()); - pass &= ck_tile::check_err(mean_host_dev, mean_host_ref); + return pass; +} - invStd_buf.FromDevice(invStd_host_dev.data()); - pass &= ck_tile::check_err(invStd_host_dev, invStd_host_ref); -#endif +int main(int argc, char* argv[]) +{ + auto [result, arg_parser] = create_args(argc, argv); + if(!result) + return -1; - std::cout << ", valid:" << (pass ? "y" : "n") << std::flush; + const std::string data_type = arg_parser.get_str("prec"); + int save_mv = arg_parser.get_int("save_mv"); + if(data_type == "fp16" && save_mv) + { + return run(arg_parser) ? 0 : -2; + } + else if(data_type == "fp16" && !save_mv) + { + return run(arg_parser) ? 0 : -2; + } + else if(data_type == "bf16" && save_mv) + { + return run(arg_parser) ? 0 : -2; + } + else if(data_type == "bf16" && !save_mv) + { + return run(arg_parser) ? 
0 : -2; } - std::cout << std::endl << std::flush; - - return !pass; + return -3; } diff --git a/example/ck_tile/02_layernorm2d/layernorm2d_fwd.hpp b/example/ck_tile/02_layernorm2d/layernorm2d_fwd.hpp index 4d1aac099..861e4a023 100644 --- a/example/ck_tile/02_layernorm2d/layernorm2d_fwd.hpp +++ b/example/ck_tile/02_layernorm2d/layernorm2d_fwd.hpp @@ -8,23 +8,114 @@ #include "ck_tile/ops/layernorm2d.hpp" #include -struct layernorm2d_fwd_traits +template +struct LayerNormTypeConfig; + +template <> +struct LayerNormTypeConfig { - std::string data_type; + using XDataType = ck_tile::half_t; + using YDataType = ck_tile::half_t; + using GammaDataType = ck_tile::half_t; + using BetaDataType = ck_tile::half_t; + using MeanDataType = ck_tile::half_t; + using InvStdDataType = ck_tile::half_t; + using ComputeDataType = float; +}; + +template <> +struct LayerNormTypeConfig +{ + using XDataType = ck_tile::bf16_t; + using YDataType = ck_tile::bf16_t; + using GammaDataType = ck_tile::bf16_t; + using BetaDataType = ck_tile::bf16_t; + using MeanDataType = ck_tile::bf16_t; + using InvStdDataType = ck_tile::bf16_t; + using ComputeDataType = float; +}; + +// runtime args +struct layernorm2d_fwd_args : public ck_tile::Layernorm2dFwdHostArgs +{ +}; + +// this is used to pattern-match internl kernel implementation, not to instantiate kernel +template +struct layernorm2d_fwd_traits_ +{ + using DataType = ck_tile::remove_cvref_t; + + static constexpr bool is_warp_per_row = ThreadPerBlock_N_ <= warpSize; + static_assert((ThreadPerBlock_M_ * ThreadPerBlock_N_) % warpSize == 0); + static constexpr ck_tile::index_t total_warps = + (ThreadPerBlock_M_ * ThreadPerBlock_N_) / warpSize; + + // num of warps along m + static constexpr ck_tile::index_t BlockWarps_M = []() { + if constexpr(is_warp_per_row) + { + static_assert(warpSize % ThreadPerBlock_N_ == 0); + return total_warps * (warpSize / ThreadPerBlock_N_); + } + else + { + // static_assert(warpSize % ThreadPerBlock_M_ == 0); + return total_warps 
/ (ThreadPerBlock_N_ / warpSize); + } + }(); + + // num of warps along n + static constexpr ck_tile::index_t BlockWarps_N = []() { + if constexpr(is_warp_per_row) + { + static_assert(warpSize % ThreadPerBlock_N_ == 0); + return 1; + } + else + { + static_assert(ThreadPerBlock_N_ % warpSize == 0); + return ThreadPerBlock_N_ / warpSize; + } + }(); + + static constexpr ck_tile::index_t Repeat_M = Repeat_M_; + static constexpr ck_tile::index_t Repeat_N = Repeat_N_; + + static constexpr ck_tile::index_t Block_M = Repeat_M_ * ThreadPerBlock_M_; + static constexpr ck_tile::index_t Block_N = Repeat_N_ * ThreadPerBlock_N_ * Vector_N_; + + static constexpr ck_tile::index_t Warp_M = ThreadPerBlock_M_ / BlockWarps_M; + static constexpr ck_tile::index_t Warp_N = ThreadPerBlock_N_ / BlockWarps_N * Vector_N_; + + using BlockTile = ck_tile::sequence; + using BlockWarps = ck_tile::sequence; + using WarpTile = ck_tile::sequence; + using Vector = ck_tile::sequence<1, Vector_N_>; + + using Shape = ck_tile::Layernorm2dShape; + + static constexpr bool kPadN = kPadN_; + static constexpr bool kSaveMeanInvStd = kSaveMeanInvStd_; + static constexpr bool kTwoPass = kTwoPass_; }; -struct layernorm2d_fwd_args +template +float layernorm2d_fwd_(const ck_tile::stream_config& s, layernorm2d_fwd_args a); + +// This is the public API, will be generated by script +struct layernorm2d_fwd_traits { - const void* p_x; - const void* p_gamma; - const void* p_beta; - void* p_y; - void* p_mean; - void* p_invStd; - float epsilon; - ck_tile::index_t M; - ck_tile::index_t N; + std::string data_type; + bool save_mean_var; }; -// host API float layernorm2d_fwd(layernorm2d_fwd_traits, layernorm2d_fwd_args, const ck_tile::stream_config&); diff --git a/example/ck_tile/02_layernorm2d/script/perf_test.sh b/example/ck_tile/02_layernorm2d/script/perf_test.sh new file mode 100755 index 000000000..bfb7f9ffe --- /dev/null +++ b/example/ck_tile/02_layernorm2d/script/perf_test.sh @@ -0,0 +1,38 @@ + +# run from top of ck 
folder +EXE=build/bin/tile_example_layernorm2d_fwd + +$EXE -m=1 -n=1 -e=1e-12 -v=1 -prec=bf16 -repeat=1000 +$EXE -m=700 -n=80 -e=1e-12 -v=1 -prec=bf16 -repeat=1000 +$EXE -m=700 -n=128 -e=1e-12 -v=1 -prec=bf16 -repeat=1000 +$EXE -m=700 -n=144 -e=1e-12 -v=1 -prec=bf16 -repeat=1000 +$EXE -m=700 -n=168 -e=1e-12 -v=1 -prec=bf16 -repeat=1000 +$EXE -m=700 -n=184 -e=1e-12 -v=1 -prec=bf16 -repeat=1000 +$EXE -m=700 -n=256 -e=1e-12 -v=1 -prec=bf16 -repeat=1000 +$EXE -m=700 -n=288 -e=1e-12 -v=1 -prec=bf16 -repeat=1000 +$EXE -m=700 -n=344 -e=1e-12 -v=1 -prec=bf16 -repeat=1000 +$EXE -m=700 -n=376 -e=1e-12 -v=1 -prec=bf16 -repeat=1000 +$EXE -m=700 -n=448 -e=1e-12 -v=1 -prec=bf16 -repeat=1000 +$EXE -m=700 -n=512 -e=1e-12 -v=1 -prec=bf16 -repeat=1000 +$EXE -m=700 -n=924 -e=1e-12 -v=1 -prec=bf16 -repeat=1000 +$EXE -m=700 -n=1024 -e=1e-12 -v=1 -prec=bf16 -repeat=1000 +$EXE -m=700 -n=1078 -e=1e-12 -v=1 -prec=bf16 -repeat=1000 +$EXE -m=700 -n=1996 -e=1e-12 -v=1 -prec=bf16 -repeat=1000 +$EXE -m=700 -n=4080 -e=1e-12 -v=1 -prec=bf16 -repeat=1000 + +$EXE -m=700 -n=80 -e=1e-12 -v=1 -prec=fp16 -repeat=1000 +$EXE -m=700 -n=128 -e=1e-12 -v=1 -prec=fp16 -repeat=1000 +$EXE -m=700 -n=144 -e=1e-12 -v=1 -prec=fp16 -repeat=1000 +$EXE -m=700 -n=168 -e=1e-12 -v=1 -prec=fp16 -repeat=1000 +$EXE -m=700 -n=184 -e=1e-12 -v=1 -prec=fp16 -repeat=1000 +$EXE -m=700 -n=256 -e=1e-12 -v=1 -prec=fp16 -repeat=1000 +$EXE -m=700 -n=288 -e=1e-12 -v=1 -prec=fp16 -repeat=1000 +$EXE -m=700 -n=344 -e=1e-12 -v=1 -prec=fp16 -repeat=1000 +$EXE -m=700 -n=376 -e=1e-12 -v=1 -prec=fp16 -repeat=1000 +$EXE -m=700 -n=448 -e=1e-12 -v=1 -prec=fp16 -repeat=1000 +$EXE -m=700 -n=512 -e=1e-12 -v=1 -prec=fp16 -repeat=1000 +$EXE -m=700 -n=924 -e=1e-12 -v=1 -prec=fp16 -repeat=1000 +$EXE -m=700 -n=1024 -e=1e-12 -v=1 -prec=fp16 -repeat=1000 +$EXE -m=700 -n=1078 -e=1e-12 -v=1 -prec=fp16 -repeat=1000 +$EXE -m=700 -n=1996 -e=1e-12 -v=1 -prec=fp16 -repeat=1000 +$EXE -m=700 -n=4080 -e=1e-12 -v=1 -prec=fp16 -repeat=1000 \ No newline at end of file 
diff --git a/example/ck_tile/02_layernorm2d/script/smoke_test.sh b/example/ck_tile/02_layernorm2d/script/smoke_test.sh new file mode 100755 index 000000000..dcd40fda4 --- /dev/null +++ b/example/ck_tile/02_layernorm2d/script/smoke_test.sh @@ -0,0 +1,31 @@ +#!/bin/sh +# call from top of CK folder +EXE=./build/bin/tile_example_layernorm2d_fwd + +for pr_i in "fp16" "bf16" ; do +$EXE -prec=$pr_i -m=99 -n=13 +$EXE -prec=$pr_i -m=17 -n=16 +$EXE -prec=$pr_i -m=1 -n=100 +$EXE -prec=$pr_i -m=4 -n=128 +$EXE -prec=$pr_i -m=80 -n=127 +$EXE -prec=$pr_i -m=22 -n=255 -stride=256 +$EXE -prec=$pr_i -m=7 -n=599 +$EXE -prec=$pr_i -m=19 -n=512 +$EXE -prec=$pr_i -m=33 -n=313 -stride=1000 +$EXE -prec=$pr_i -m=11 -n=510 +$EXE -prec=$pr_i -m=171 -n=676 -stride=818 +$EXE -prec=$pr_i -m=91 -n=636 +$EXE -prec=$pr_i -m=12 -n=768 -stride=800 +$EXE -prec=$pr_i -m=100 -n=766 -stride=812 +$EXE -prec=$pr_i -m=31 -n=1024 +$EXE -prec=$pr_i -m=64 -n=1000 -stride=1004 +$EXE -prec=$pr_i -m=8 -n=1501 +$EXE -prec=$pr_i -m=3 -n=1826 +$EXE -prec=$pr_i -m=5 -n=2040 +$EXE -prec=$pr_i -m=7 -n=2734 +$EXE -prec=$pr_i -m=1 -n=3182 +$EXE -prec=$pr_i -m=9 -n=4096 +$EXE -prec=$pr_i -m=3 -n=8192 +$EXE -prec=$pr_i -m=1 -n=10547 +$EXE -prec=$pr_i -m=3 -n=17134 +done diff --git a/example/ck_tile/05_reduce/CMakeLists.txt b/example/ck_tile/05_reduce/CMakeLists.txt new file mode 100644 index 000000000..6caa38d50 --- /dev/null +++ b/example/ck_tile/05_reduce/CMakeLists.txt @@ -0,0 +1,19 @@ +set(EXAMPLE_REDUCE "tile_example_reduce") +# not using add_example_executable() to add this target, since we don't want this to have +# to be included in "make all/install/check" +message("adding example ${EXAMPLE_REDUCE}") + +add_executable(${EXAMPLE_REDUCE} EXCLUDE_FROM_ALL reduce.cpp) +target_include_directories(${EXAMPLE_REDUCE} PRIVATE ${CMAKE_CURRENT_LIST_DIR}) +set(EXAMPLE_REDUCE_COMPILE_OPTIONS) + +# NOTE: we turn off undefined-func-template to let source compile without explicit declare function specializations +list(APPEND 
EXAMPLE_REDUCE_COMPILE_OPTIONS -Wno-undefined-func-template -Wno-float-equal) + +target_compile_options(${EXAMPLE_REDUCE} PRIVATE ${EXAMPLE_REDUCE_COMPILE_OPTIONS}) + +# TODO: we have to turn off this global prop, otherwise the progress bar generated +# by cmake will print too many files, execvp: /bin/sh: Argument list too long +# however, this property may affect global +# TODO: consider codegen a makefile by us +set_property(GLOBAL PROPERTY RULE_MESSAGES OFF) \ No newline at end of file diff --git a/example/ck_tile/05_reduce/reduce.cpp b/example/ck_tile/05_reduce/reduce.cpp new file mode 100644 index 000000000..7973a8dfd --- /dev/null +++ b/example/ck_tile/05_reduce/reduce.cpp @@ -0,0 +1,110 @@ +#include "ck_tile/host.hpp" +#include "reduce.hpp" +#include + +auto create_args(int argc, char* argv[]) +{ + ck_tile::ArgParser arg_parser; + arg_parser.insert("m", "3328", "m dimension") + .insert("n", "4096", "n dimension") + .insert("v", "1", "cpu validation or not") + .insert("prec", "fp16", "precision") + .insert("warmup", "5", "cold iter") + .insert("repeat", "20", "hot iter"); + + bool result = arg_parser.parse(argc, argv); + return std::make_tuple(result, arg_parser); +} + +template +bool run(const ck_tile::ArgParser& arg_parser) +{ + using ADataType = DataType; + using AccDataType = float; + using BDataType = DataType; + + ck_tile::index_t m = arg_parser.get_int("m"); + ck_tile::index_t n = arg_parser.get_int("n"); + int do_validation = arg_parser.get_int("v"); + int warmup = arg_parser.get_int("warmup"); + int repeat = arg_parser.get_int("repeat"); + + ck_tile::HostTensor a_host({m, n}); + ck_tile::HostTensor b_host_ref({m}); + ck_tile::HostTensor b_host_dev({m}); + + ck_tile::FillUniformDistribution{-5.f, 5.f}(a_host); + + ck_tile::DeviceMem a_buf(a_host.get_element_space_size_in_bytes()); + ck_tile::DeviceMem b_buf(b_host_dev.get_element_space_size_in_bytes()); + + a_buf.ToDevice(a_host.data()); + + using BlockWarps = ck_tile::sequence<4, 1>; + using 
BlockTile = ck_tile::sequence<128, 128>; + using WarpTile = ck_tile::sequence<32, 128>; + using ThreadTile = ck_tile::sequence<8, 8>; + + constexpr ck_tile::index_t kBlockSize = 256; + constexpr ck_tile::index_t kBlockPerCu = 1; + ck_tile::index_t kGridSize = (m / BlockTile::at(ck_tile::number<0>{})); + std::cout << "grid size " << kGridSize << std::endl; + + using Kernel = ck_tile::Reduce; + + float ave_time = launch_kernel(ck_tile::stream_config{nullptr, true, 0, warmup, repeat}, + ck_tile::make_kernel( + Kernel{}, + kGridSize, + kBlockSize, + 0, + static_cast(a_buf.GetDeviceBuffer()), + static_cast(b_buf.GetDeviceBuffer()), + m, + n)); + + std::size_t num_btype = sizeof(ADataType) * m * n + sizeof(BDataType) * m; + + float gb_per_sec = num_btype / 1.E6 / ave_time; + + std::cout << "Perf: " << ave_time << " ms, " << gb_per_sec << " GB/s" << std::endl; + + bool pass = true; + + if(do_validation) + { + // reference + ck_tile::reference_reduce(a_host, b_host_ref); + b_buf.FromDevice(b_host_dev.mData.data()); + pass = ck_tile::check_err(b_host_dev, b_host_ref); + + std::cout << "valid:" << (pass ? "y" : "n") << std::flush << std::endl; + } + + return pass; +} + +int main(int argc, char* argv[]) +{ + auto [result, arg_parser] = create_args(argc, argv); + if(!result) + return -1; + + const std::string data_type = arg_parser.get_str("prec"); + + if(data_type == "fp16") + { + return run(arg_parser) ? 0 : -2; + } + if(data_type == "bf16") + { + return run(arg_parser) ? 0 : -2; + } +} diff --git a/example/ck_tile/05_reduce/reduce.hpp b/example/ck_tile/05_reduce/reduce.hpp new file mode 100644 index 000000000..e36b46895 --- /dev/null +++ b/example/ck_tile/05_reduce/reduce.hpp @@ -0,0 +1,118 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck_tile/core.hpp" +#include "ck_tile/ops/common.hpp" + +#include "ck_tile/ops/reduce/block/block_reduce.hpp" + +namespace ck_tile { + +template + typename BlockTile, // block size, seq + typename WarpTile, // warp size, seq + typename ThreadTile> // contiguous pixels(vector size) along seq +struct Reduce +{ + static constexpr index_t Block_M = BlockTile::at(number<0>{}); + static constexpr index_t Block_N = BlockTile::at(number<1>{}); + + static constexpr index_t Warp_M = WarpTile::at(number<0>{}); + static constexpr index_t Warp_N = WarpTile::at(number<1>{}); + + static constexpr index_t Thread_M = ThreadTile::at(number<0>{}); + static constexpr index_t Thread_N = ThreadTile::at(number<1>{}); + + static constexpr index_t WarpPerBlock_M = BlockWarps::at(number<0>{}); + static constexpr index_t WarpPerBlock_N = BlockWarps::at(number<1>{}); + + static constexpr index_t ThreadPerWarp_M = Warp_M / Thread_M; + static constexpr index_t ThreadPerWarp_N = Warp_N / Thread_N; + + static constexpr index_t Repeat_M = Block_M / (WarpPerBlock_M * Warp_M); + static constexpr index_t Repeat_N = Block_N / (WarpPerBlock_N * Warp_N); + + __device__ static constexpr auto MakeABlockTileDistribution() + { + return make_static_tile_distribution( + tile_distribution_encoding< + sequence<>, + tuple, + sequence>, + tuple, sequence<1, 2>>, + tuple, sequence<2, 2>>, + sequence<1, 1, 2, 2>, + sequence<0, 3, 0, 3>>{}); + } + + __device__ void operator()(const ADataType* p_a, BDataType* p_b, index_t M, index_t N) const + { + const auto a_m_n = make_naive_tensor_view( + p_a, make_tuple(M, N), make_tuple(N, 1), number{}, number<1>{}); + + const auto iM = get_block_id() * Block_M; + + // A window + auto a_block_window = make_tile_window(a_m_n, + make_tuple(number{}, number{}), + {iM, 0}, + MakeABlockTileDistribution()); + + const auto f_reduce = [](const auto& v0, const auto& v1) { return v0 + v1; }; + + const ADataType reduce_init_value = 0; + + constexpr auto 
reduce_dims = sequence<1>{}; + + // Acc tile + // TODO: support cross warp reduction + auto acc_block_tensor = decltype(block_tile_reduce( + load_tile(a_block_window), reduce_dims, f_reduce, reduce_init_value)){}; + + // init Acc tile + tile_elementwise_inout( + [&](auto& acc) { acc = type_convert(reduce_init_value); }, + acc_block_tensor); + + // loop + index_t iN = 0; + + do + { + const auto a_block_tensor = load_tile(a_block_window); + + // FIXME: support cross warp reduction + block_tile_reduce(acc_block_tensor, a_block_tensor, reduce_dims, f_reduce); + + move_tile_window(a_block_window, {0, Block_N}); + + iN += Block_N; + + } while(iN < N); + + // FIXME: support cross warp reduction + block_tile_reduce_sync(acc_block_tensor, f_reduce); + + // convert acc_block_tensor to b_block_tensor + const auto b_block_tensor = tile_elementwise_in( + [](const auto& acc) { return type_convert(acc); }, acc_block_tensor); + + // B + const auto b_m = make_naive_tensor_view_packed( + p_b, make_tuple(M), number<32>{}); + + // B window + auto b_block_window = make_tile_window(b_m, make_tuple(number{}), {iM}); + + // store B tile + store_tile(b_block_window, b_block_tensor); + } +}; + +} // namespace ck_tile diff --git a/example/ck_tile/CMakeLists.txt b/example/ck_tile/CMakeLists.txt index fe1e9c9ed..ec4a175d3 100644 --- a/example/ck_tile/CMakeLists.txt +++ b/example/ck_tile/CMakeLists.txt @@ -6,3 +6,4 @@ add_subdirectory(01_fmha) add_subdirectory(02_layernorm2d) add_subdirectory(03_gemm) add_subdirectory(04_img2col) +add_subdirectory(05_reduce) diff --git a/include/ck_tile/core.hpp b/include/ck_tile/core.hpp index 4cddf6faa..d96f14710 100644 --- a/include/ck_tile/core.hpp +++ b/include/ck_tile/core.hpp @@ -52,6 +52,7 @@ #include "ck_tile/core/tensor/update_tile.hpp" #include "ck_tile/core/utility/bit_cast.hpp" #include "ck_tile/core/utility/functional.hpp" +#include "ck_tile/core/utility/functional_with_tuple.hpp" #include "ck_tile/core/utility/ignore.hpp" #include 
"ck_tile/core/utility/magic_div.hpp" #include "ck_tile/core/utility/philox_rand.hpp" diff --git a/include/ck_tile/core/arch/utility.hpp b/include/ck_tile/core/arch/utility.hpp index 42508e66a..a88780459 100644 --- a/include/ck_tile/core/arch/utility.hpp +++ b/include/ck_tile/core/arch/utility.hpp @@ -59,4 +59,47 @@ CK_TILE_DEVICE T warp_shuffle_down(const T& v_local, uint32_t lane_delta) #endif } +template +CK_TILE_DEVICE T warp_shuffle(const T& v_local, uint32_t src_lane) +{ +#if 0 + return __shfl(v_local, src_lane); +#elif 1 + if constexpr(sizeof(int32_t) > sizeof(T)) + { + union packet + { + int32_t x; + T v; + }; + packet p; + p.v = v_local; + packet p_remote; + p_remote.x = __builtin_amdgcn_ds_bpermute(src_lane << 2, bit_cast(p)); + + return p_remote.v; + } + else if constexpr(sizeof(int32_t) == sizeof(T)) + { + const int32_t v_remote_tmp = + __builtin_amdgcn_ds_bpermute(src_lane << 2, bit_cast(v_local)); + + return bit_cast(v_remote_tmp); + } + else + { + static_assert(sizeof(T) % sizeof(int32_t) == 0, "wrong!"); + constexpr index_t elm = sizeof(T) / sizeof(int32_t); + using vector_type = thread_buffer; + auto vs = bit_cast(v_local); + auto vs_remote = vector_type{}; + static_for<0, elm, 1>{}([&](auto i_e) { + int32_t tmp = __builtin_amdgcn_ds_bpermute(src_lane << 2, bit_cast(vs[i_e])); + vs_remote(i_e) = tmp; + }); + return bit_cast(vs_remote); + } +#endif +} + } // namespace ck_tile diff --git a/include/ck_tile/core/config.hpp b/include/ck_tile/core/config.hpp index a8bc27cdf..580faae92 100644 --- a/include/ck_tile/core/config.hpp +++ b/include/ck_tile/core/config.hpp @@ -32,11 +32,13 @@ #define CK_TILE_DEVICE inline __device__ #define CK_TILE_HOST_DEVICE inline __host__ __device__ #define CK_TILE_DEVICE_EXTERN __device__ +#define CK_TILE_HOST_DEVICE_EXTERN __host__ __device__ #else #define CK_TILE_HOST inline #define CK_TILE_DEVICE inline #define CK_TILE_HOST_DEVICE inline #define CK_TILE_DEVICE_EXTERN +#define CK_TILE_HOST_DEVICE_EXTERN #endif #ifndef 
CK_TILE_USE_CUSTOM_DATA_TYPE diff --git a/include/ck_tile/core/container/sequence.hpp b/include/ck_tile/core/container/sequence.hpp index acf187cfc..4fcea9642 100644 --- a/include/ck_tile/core/container/sequence.hpp +++ b/include/ck_tile/core/container/sequence.hpp @@ -1111,4 +1111,126 @@ CK_TILE_HOST_DEVICE constexpr auto generate_array(F&& f, number) typename arithmetic_sequence_gen<0, N, 1>::type{}); } +namespace impl { +template +struct reverse_slice_sequence_impl; + +template +struct reverse_slice_sequence_impl, + sequence, + sequence, + SliceSize> +{ + using old_scan = + reverse_slice_sequence_impl, sequence, sequence, SliceSize>; + + static constexpr auto slice_size = old_scan::remaining_slice_sizes::front().value; + static constexpr auto slice_length = + std::conditional_t, number>::value; + + using dim_lengths = + typename sequence_merge, typename old_scan::dim_lengths>::type; + using dim_slices = + typename sequence_merge, typename old_scan::dim_slices>::type; + using remaining_slice_sizes = typename sequence_merge< + std::conditional_t, sequence>, + typename old_scan::remaining_slice_sizes>::type; + + // the first idx that sliced length not equal to original length + static constexpr index_t _flag = + slice_length != x && remaining_slice_sizes{}.front().value == 1; + static constexpr index_t _split_flag = std::conditional_t, number<0>>::value; + static constexpr index_t _split_idx = + std::conditional_t<_split_flag, number, number<0>>::value; + + static constexpr index_t split_flag = _split_flag || old_scan::split_flag; + static constexpr index_t split_idx = std:: + conditional_t, number<_split_idx>>::value; +}; + +template +struct reverse_slice_sequence_impl, sequence, sequence, SliceSize> +{ + static constexpr auto slice_size = SliceSize; + static constexpr auto slice_length = + std::conditional_t, number>::value; + + using dim_lengths = sequence; + using dim_slices = sequence; + using remaining_slice_sizes = + std::conditional_t, sequence>; + + // the 
first idx that sliced length not equal to original length + static constexpr index_t _flag = + slice_length != x && remaining_slice_sizes{}.front().value == 1; + static constexpr index_t split_flag = std::conditional_t, number<0>>::value; + static constexpr index_t split_idx = + std::conditional_t, number<0>>::value; +}; +} // namespace impl + +// clang-format off +// input a sequence(with optional mask), and the SliceSize : size per slice +// output the sequence each slice, and number of slices +// +// e.g. <2, 1, 4, 2>, 8 -> lengths:<1, 1, 4, 2> , nums: <2, 1, 1, 1> : 2 slices , slice_idx: 0 +// <4, 2, 4, 1, 2>, 4 -> lengths:<1, 1, 2, 1, 2> , nums: <4, 2, 2, 1, 1> : 16 slices , slice_idx: 2 +// <4, 2, 4, 1, 6>, 4 -> lengths:<1, 1, 2, 1, 2> , nums: <4, 2, 2, 1, 3> : 48 slices , slice_idx: 2 +// <4, 2, 5, 1, 2>, 10 -> lengths:<1, 1, 5, 1, 2> , nums: <4, 2, 1, 1, 1> : 8 slices , slice_idx: 1 +// +// <4, 2, 8>, 64 -> lengths:<4, 2, 8> , nums: <1, 1, 1> : 1 slices , slice_idx: 0 +// <4, 2, 8>, 32 -> lengths:<2, 2, 8> , nums: <2, 1, 1> : 2 slices , slice_idx: 0 +// <4, 2, 8>, 16 -> lengths:<1, 2, 8> , nums: <4, 1, 1> : 4 slices , slice_idx: 0 +// <4, 2, 8>, 8 -> lengths:<1, 1, 8> , nums: <4, 2, 1> : 8 slices , slice_idx: 1 +// <4, 2, 8>, 4 -> lengths:<1, 1, 4> , nums: <4, 2, 2> : 16 slices , slice_idx: 2 +// <4, 2, 8>, 2 -> lengths:<1, 1, 2> , nums: <4, 2, 4> : 32 slices , slice_idx: 2 +// <4, 2, 8>, 1 -> lengths:<1, 1, 1> , nums: <4, 2, 8> : 64 slices , slice_idx: 2 +// +// <4, 2, 1, 4, 2> / 4 -> +// mask:<1, 1, 1, 0, 1>, -> lengths:<1, 2, 1, 4, 2> , nums: <4, 1, 1, 1, 1> : 8 slices , slice_idx: 0 +// +// return tuple, slice_index is at which index will start +// have split slices (right -> left) +// or the first index that sliced length is different from the original length +// clang-format on +template ::type> +constexpr auto reverse_slice_sequence(Seq, + number, + Mask = typename uniform_sequence_gen::type{}) +{ + static_assert(Seq::size() == Mask::size()); + using 
sliced_type = + impl::reverse_slice_sequence_impl::type, + SliceSize>; + static_assert(sliced_type::remaining_slice_sizes::front().value == 1, + "can not evenly divide this sequence, please check"); + return make_tuple(typename sliced_type::dim_lengths{}, + typename sliced_type::dim_slices{}, + number{}); +} + +template ::type> +constexpr auto slice_sequence(Seq, + number, + Mask = typename uniform_sequence_gen::type{}) +{ + constexpr auto r = + reverse_slice_sequence(Seq{}.reverse(), number{}, Mask{}.reverse()); + return make_tuple(r[number<0>{}].reverse(), + r[number<1>{}].reverse(), + number{}] - 1>{}); +} + } // namespace ck_tile diff --git a/include/ck_tile/core/container/tuple.hpp b/include/ck_tile/core/container/tuple.hpp index cb8c2c70c..598dfeea3 100644 --- a/include/ck_tile/core/container/tuple.hpp +++ b/include/ck_tile/core/container/tuple.hpp @@ -488,6 +488,26 @@ CK_TILE_HOST_DEVICE constexpr auto transform_tuples(F f, const X& x, const Y& y, f, x, y, z, typename arithmetic_sequence_gen<0, X::size(), 1>::type{}); } +namespace detail { + +template +CK_TILE_HOST_DEVICE constexpr auto embed_tuples_impl(F f, const X& x, sequence) +{ + return concat_tuple(f(x.at(number{}))...); +} + +} // namespace detail + +// make sure F return at least a tuple +// e.g. 
x : tuple, f will return tuple +// this function will return +template +CK_TILE_HOST_DEVICE constexpr auto embed_tuples(F f, const X& x) +{ + return detail::embed_tuples_impl( + f, x, typename arithmetic_sequence_gen<0, X::size(), 1>::type{}); +} + // By default unroll to the flatten template CK_TILE_HOST_DEVICE constexpr auto unroll_nested_tuple(const tuple<>& t) diff --git a/include/ck_tile/core/tensor/static_distributed_tensor.hpp b/include/ck_tile/core/tensor/static_distributed_tensor.hpp index 299a74bc0..29c20bed0 100644 --- a/include/ck_tile/core/tensor/static_distributed_tensor.hpp +++ b/include/ck_tile/core/tensor/static_distributed_tensor.hpp @@ -187,4 +187,18 @@ set_tile_if(static_distributed_tensor& out_ten }); } +// this function used inside span loop over +template +CK_TILE_HOST_DEVICE constexpr auto get_y_unpacks_from_x_unpacks(YLengths, number) +{ + constexpr auto y_size = reduce_on_sequence(YLengths{}, multiplies{}, number<1>{}); + constexpr auto y_packs = number{}; + static_assert(y_size % y_packs == 0); + constexpr auto y_slice_size = y_size / y_packs; + + constexpr auto slice_info = slice_sequence(YLengths{}, number{}); + constexpr auto unpacks = slice_info[number<1>{}]; + return unpacks; +} + } // namespace ck_tile diff --git a/include/ck_tile/core/tensor/sweep_tile.hpp b/include/ck_tile/core/tensor/sweep_tile.hpp index f1511f11d..f82f6b5bc 100644 --- a/include/ck_tile/core/tensor/sweep_tile.hpp +++ b/include/ck_tile/core/tensor/sweep_tile.hpp @@ -8,6 +8,7 @@ #include "ck_tile/core/numeric/integral_constant.hpp" #include "ck_tile/core/tensor/tile_distribution.hpp" #include "ck_tile/core/utility/functional.hpp" +#include "ck_tile/core/utility/functional_with_tuple.hpp" #include "ck_tile/core/utility/type_traits.hpp" namespace ck_tile { @@ -27,4 +28,281 @@ CK_TILE_DEVICE void sweep_tile_span(TileDistributedSpan_, const F& f) }); } +// unpacked span, this version support span with unpack(multi-arg) functor +// +template < + typename 
TileDistributedSpan_, // tile_distributed_span<...> + typename F, // signature: F(tile_distributed_index<...>) + typename Unpacks = typename uniform_sequence_gen::type> +CK_TILE_DEVICE void sweep_tile_uspan(TileDistributedSpan_, const F& f, Unpacks = {}) +{ + using DstrSpan = remove_cvref_t; + + static_uford{}( + [&](auto... dstr_idx_impl) { f(detail::make_tile_distributed_index(dstr_idx_impl)...); }); +} + +namespace impl { + +template +struct sweep_tile_impl; + +template +struct sweep_tile_impl> +{ + CK_TILE_HOST_DEVICE constexpr auto get_y_unpacks() const + { + constexpr auto spans = DistributedTensor::get_distributed_spans(); + constexpr auto y_lengths = typename decltype(spans[number{}])::Impl{}; + constexpr auto x_unpacks = number{})>{}; + constexpr auto y_unpacks = get_y_unpacks_from_x_unpacks(y_lengths, x_unpacks); + return y_unpacks; + } + CK_TILE_HOST_DEVICE constexpr index_t get_num_of_access() const + { + constexpr auto spans = DistributedTensor::get_distributed_spans(); + constexpr auto u = + static_uford{}])::Impl, decltype(get_y_unpacks())>{}; + return u.get_num_of_access() * + sweep_tile_impl>{} + .get_num_of_access(); + } + template + CK_TILE_HOST_DEVICE constexpr void operator()(const F& f, const SpanIdx& span_idx) const + { + constexpr auto spans = DistributedTensor::get_distributed_spans(); + + sweep_tile_uspan( + spans[number{}], + [&](auto... 
i_idx) { + const auto next_span_idx = embed_tuples( + [&](auto si) { return make_tuple(concat_tuple(si, make_tuple(i_idx))...); }, + span_idx); + sweep_tile_impl>{}( + f, next_span_idx); + }, + get_y_unpacks()); + } + template + CK_TILE_HOST_DEVICE constexpr void + operator()(const F& f, const SpanIdx& span_idx, number) const + { + constexpr auto spans = DistributedTensor::get_distributed_spans(); + constexpr auto u = + static_uford{}])::Impl, decltype(get_y_unpacks())>{}; + constexpr auto access_stride = + sweep_tile_impl>{} + .get_num_of_access(); + constexpr auto curr_i_access = number{}; + constexpr auto next_i_access = number{}; + u( + [&](auto... i_idx) { + const auto next_span_idx = embed_tuples( + [&](auto si) { + return make_tuple(concat_tuple( + si, make_tuple(detail::make_tile_distributed_index(i_idx)))...); + }, + span_idx); + sweep_tile_impl>{}( + f, next_span_idx, next_i_access); + }, + curr_i_access); + } +}; + +template +struct sweep_tile_impl> +{ + CK_TILE_HOST_DEVICE constexpr index_t get_num_of_access() const { return 1; } + template + CK_TILE_HOST_DEVICE constexpr void operator()(const F& f, const SpanIdx& span_idx) const + { + unpack(f, span_idx); + } + template + CK_TILE_HOST_DEVICE constexpr void + operator()(const F& f, const SpanIdx& span_idx, number) const + { + unpack(f, span_idx); + } +}; + +template +struct sweep_tile_impl_0; + +// TODO: support empty tuple to remove this "entry-point" like function +template +struct sweep_tile_impl_0> +{ + CK_TILE_HOST_DEVICE constexpr auto get_y_unpacks() const + { + constexpr auto spans = DistributedTensor::get_distributed_spans(); + constexpr auto y_lengths = typename decltype(spans[number{}])::Impl{}; + constexpr auto x_unpacks = number{})>{}; + constexpr auto y_unpacks = get_y_unpacks_from_x_unpacks(y_lengths, x_unpacks); + return y_unpacks; + } + CK_TILE_HOST_DEVICE constexpr index_t get_num_of_access() const + { + constexpr auto spans = DistributedTensor::get_distributed_spans(); + constexpr 
auto u = + static_uford{}])::Impl, decltype(get_y_unpacks())>{}; + return u.get_num_of_access() * + sweep_tile_impl>{} + .get_num_of_access(); + } + template + CK_TILE_HOST_DEVICE constexpr void operator()(const F& f) const + { + constexpr auto spans = DistributedTensor::get_distributed_spans(); + sweep_tile_uspan( + spans[number{}], + [&](auto... i_idx) { + constexpr auto next_span_idx = make_tuple(make_tuple(i_idx)...); + sweep_tile_impl>{}( + f, next_span_idx); + }, + get_y_unpacks()); + } + template + CK_TILE_HOST_DEVICE constexpr void operator()(const F& f, number) const + { + constexpr auto spans = DistributedTensor::get_distributed_spans(); + constexpr auto u = + static_uford{}])::Impl, decltype(get_y_unpacks())>{}; + constexpr auto access_stride = + sweep_tile_impl>{} + .get_num_of_access(); + constexpr auto curr_i_access = number{}; + constexpr auto next_i_access = number{}; + u( + [&](auto... i_idx) { + constexpr auto next_span_idx = + make_tuple(make_tuple(detail::make_tile_distributed_index(i_idx))...); + sweep_tile_impl>{}( + f, next_span_idx, next_i_access); + }, + curr_i_access); + } +}; + +} // namespace impl + +/* + * Enhanced sweep-tile utility, can control unpacks along each X-dim + * the lambda function argument is the distributed-idx, which can directly + * plugged into the distributed tensor as setter/getter + * + * e.g. 
below function, y with the type DistributedTensor, r is row scale + * + * // sweep tile 1 by 1 + * sweep_tile([&](auto idx) { + * constexpr auto row_id = make_tuple(idx[number<0>{}]); + * y(idx) = y(idx) * r(row_id); + * }); + * + * // sweep tile with 2 pixel from last dim each function call + * sweep_tile( + * [&](auto idx_0, auto idx_1) { + * constexpr auto row_id = make_tuple(idx_0[number<0>{}]); + * y(idx_0) = y(idx_0) * r(row_id); + * y(idx_1) = y(idx_1) * r(row_id); + * }, + * sequence<1, 2>{}); + * + * // sweep tile with 2x2 pixel each function call + * sweep_tile( + * [&](auto idx_00, auto idx_01, auto idx_10, auto idx_11) { + * constexpr auto row_id0 = make_tuple(idx_00[number<0>{}]); + * constexpr auto row_id1 = make_tuple(idx_10[number<0>{}]); + * y(idx_00) = y(idx_00) * r(row_id0); + * y(idx_01) = y(idx_01) * r(row_id0); + * y(idx_10) = y(idx_10) * r(row_id1); + * y(idx_11) = y(idx_11) * r(row_id1); + * }, + * sequence<2, 2>{}); + * + * TODO: do we need constexpr? lambda function could be non-constexpr + */ +template ::type> +CK_TILE_HOST_DEVICE constexpr void sweep_tile(const F& f, UnpacksPerXDim = {}) +{ + constexpr auto spans = DistributedTensor::get_distributed_spans(); + + impl::sweep_tile_impl_0::type>{}(f); +} + +template ::type> +CK_TILE_HOST_DEVICE constexpr void +sweep_tile(const DistributedTensor&, const F& f, UnpacksPerXDim = {}) +{ + sweep_tile(f, UnpacksPerXDim{}); +} + +/* + * construct a sweep tile instance, which support issue the lambda one by one + * Note that this struct will hold the lambda functor, but will not hold the distributed tensor + * the functionality is the same as sweep_tile() + */ +template ::type> +struct tile_sweeper +{ + using DistributedTensor = remove_cvref_t; + using F = remove_cvref_t; + using UnpacksPerXDim = remove_cvref_t; + + CK_TILE_HOST_DEVICE tile_sweeper(const F& f_, UnpacksPerXDim = {}) : f(f_) {} + CK_TILE_HOST_DEVICE tile_sweeper(const DistributedTensor&, const F& f_, UnpacksPerXDim = {}) + : f(f_) + { 
+ } + CK_TILE_HOST_DEVICE static constexpr index_t get_num_of_access() + { + constexpr auto spans = DistributedTensor::get_distributed_spans(); + constexpr auto tmp = + impl::sweep_tile_impl_0::type>{}; + return tmp.get_num_of_access(); + } + + CK_TILE_HOST_DEVICE void operator()() const + { + sweep_tile(f, UnpacksPerXDim{}); + } + + template + CK_TILE_HOST_DEVICE void operator()(number) const + { + constexpr auto spans = DistributedTensor::get_distributed_spans(); + + impl::sweep_tile_impl_0::type>{}( + f, number{}); + } + F f; +}; + +// partial deduction is not allowed +// template +// CK_TILE_HOST_DEVICE_EXTERN tile_sweeper(const F&, U = {})->tile_sweeper; + +// deduction guide +template ::type> +CK_TILE_HOST_DEVICE_EXTERN tile_sweeper(const T&, const F&, U = {})->tile_sweeper; + } // namespace ck_tile diff --git a/include/ck_tile/core/tensor/tile_distribution.hpp b/include/ck_tile/core/tensor/tile_distribution.hpp index 24c932f0a..7761be492 100644 --- a/include/ck_tile/core/tensor/tile_distribution.hpp +++ b/include/ck_tile/core/tensor/tile_distribution.hpp @@ -17,6 +17,14 @@ namespace ck_tile { +namespace detail { +template +CK_TILE_HOST_DEVICE auto get_partition_index(Distribution) +{ + return Distribution::_get_partition_index(); +} +} // namespace detail + // distributed span template struct tile_distributed_span @@ -83,6 +91,21 @@ struct tile_distribution CK_TILE_HOST_DEVICE static constexpr index_t get_num_of_dimension_p() { return NDimP; } CK_TILE_HOST_DEVICE static constexpr index_t get_num_of_dimension_r() { return NDimR; } + CK_TILE_HOST_DEVICE static auto _get_partition_index() + { + // only support warp-tile and block-tile + static_assert(NDimP == 1 or NDimP == 2, "wrong!"); + + if constexpr(NDimP == 1) + { + return array{get_lane_id()}; + } + else if constexpr(NDimP == 2) + { + return array{get_warp_id(), get_lane_id()}; + } + } + CK_TILE_HOST_DEVICE static constexpr auto get_lengths() { #if 0 @@ -149,6 +172,16 @@ struct tile_distribution } #endif 
+ template + CK_TILE_HOST_DEVICE auto + calculate_index(const PartitionIndex& ps_idx = _get_partition_index()) const + { + const auto ps_ys_idx = container_concat(ps_idx, array{0}); + const auto window_adaptor_thread_coord_tmp = + make_tensor_adaptor_coordinate(ps_ys_to_xs_, ps_ys_idx); + return window_adaptor_thread_coord_tmp.get_bottom_index(); + } + CK_TILE_HOST_DEVICE static constexpr auto get_distributed_spans() { constexpr auto distributed_spans_impl = DstrEncode::detail::distributed_spans_lengthss_; @@ -421,6 +454,7 @@ struct tile_distribution_detail } // namespace detail +#if 0 // this returns a constexpr tile_distribution template CK_TILE_HOST_DEVICE constexpr auto make_tile_distribution(StaticTileDistributionEncoding_) @@ -457,6 +491,7 @@ CK_TILE_HOST_DEVICE constexpr auto make_tile_distribution(StaticTileDistribution detail::tile_distribution_detail>>{ ps_ys_to_xs_adaptor, ys_to_d_descriptor}; } +#endif // this returns a static tile_distribution template @@ -499,129 +534,6 @@ CK_TILE_HOST_DEVICE constexpr auto make_static_tile_distribution(StaticTileDistr //*********************************************************************************** namespace detail { - -template -CK_TILE_HOST_DEVICE auto get_partition_index(Distribution) -{ - // only support warp-tile and block-tile - static_assert(Distribution::NDimP == 1 or Distribution::NDimP == 2, "wrong!"); - - if constexpr(Distribution::NDimP == 1) - { - return array{get_lane_id()}; - } - else if constexpr(Distribution::NDimP == 2) - { - return array{get_warp_id(), get_lane_id()}; - } -} - -template -struct reverse_slice_sequence_impl; - -template -struct reverse_slice_sequence_impl, - sequence, - sequence, - SliceSize> -{ - using old_scan = - reverse_slice_sequence_impl, sequence, sequence, SliceSize>; - - static constexpr auto slice_size = old_scan::remaining_slice_sizes::front().value; - static constexpr auto slice_length = - std::conditional_t, number>::value; - - using dim_lengths = - typename 
sequence_merge, typename old_scan::dim_lengths>::type; - using dim_slices = - typename sequence_merge, typename old_scan::dim_slices>::type; - using remaining_slice_sizes = typename sequence_merge< - std::conditional_t, sequence>, - typename old_scan::remaining_slice_sizes>::type; - - // the first idx that sliced length not equal to original length - static constexpr index_t _flag = - slice_length != x && remaining_slice_sizes{}.front().value == 1; - static constexpr index_t _split_flag = std::conditional_t, number<0>>::value; - static constexpr index_t _split_idx = - std::conditional_t<_split_flag, number, number<0>>::value; - - static constexpr index_t split_flag = _split_flag || old_scan::split_flag; - static constexpr index_t split_idx = std:: - conditional_t, number<_split_idx>>::value; -}; - -template -struct reverse_slice_sequence_impl, sequence, sequence, SliceSize> -{ - static constexpr auto slice_size = SliceSize; - static constexpr auto slice_length = - std::conditional_t, number>::value; - - using dim_lengths = sequence; - using dim_slices = sequence; - using remaining_slice_sizes = - std::conditional_t, sequence>; - - // the first idx that sliced length not equal to original length - static constexpr index_t _flag = - slice_length != x && remaining_slice_sizes{}.front().value == 1; - static constexpr index_t split_flag = std::conditional_t, number<0>>::value; - static constexpr index_t split_idx = - std::conditional_t, number<0>>::value; -}; - -// clang-format off -// input a sequence(with optional mask), and the SliceSize : size per slice -// output the sequence each slice, and number of slices -// -// e.g. 
<2, 1, 4, 2>, 8 -> lengths:<1, 1, 4, 2> , nums: <2, 1, 1, 1> : 2 slices , slice_idx: 0 -// <4, 2, 4, 1, 2>, 4 -> lengths:<1, 1, 2, 1, 2> , nums: <4, 2, 2, 1, 1> : 16 slices , slice_idx: 2 -// <4, 2, 4, 1, 6>, 4 -> lengths:<1, 1, 2, 1, 2> , nums: <4, 2, 2, 1, 3> : 48 slices , slice_idx: 2 -// <4, 2, 5, 1, 2>, 10 -> lengths:<1, 1, 5, 1, 2> , nums: <4, 2, 1, 1, 1> : 8 slices , slice_idx: 1 -// -// <4, 2, 8>, 64 -> lengths:<4, 2, 8> , nums: <1, 1, 1> : 1 slices , slice_idx: 0 -// <4, 2, 8>, 32 -> lengths:<2, 2, 8> , nums: <2, 1, 1> : 2 slices , slice_idx: 0 -// <4, 2, 8>, 16 -> lengths:<1, 2, 8> , nums: <4, 1, 1> : 4 slices , slice_idx: 0 -// <4, 2, 8>, 8 -> lengths:<1, 1, 8> , nums: <4, 2, 1> : 8 slices , slice_idx: 1 -// <4, 2, 8>, 4 -> lengths:<1, 1, 4> , nums: <4, 2, 2> : 16 slices , slice_idx: 2 -// <4, 2, 8>, 2 -> lengths:<1, 1, 2> , nums: <4, 2, 4> : 32 slices , slice_idx: 2 -// <4, 2, 8>, 1 -> lengths:<1, 1, 1> , nums: <4, 2, 8> : 64 slices , slice_idx: 2 -// -// <4, 2, 1, 4, 2> / 4 -> -// mask:<1, 1, 1, 0, 1>, -> lengths:<1, 2, 1, 4, 2> , nums: <4, 1, 1, 1, 1> : 8 slices , slice_idx: 0 -// -// return tuple, slice_index is at which index will start -// have split slices (right -> left) -// or the first index that sliced length is different from the original length -// clang-format on -template ::type> -constexpr auto reverse_slice_sequence(Seq, - number, - Mask = typename uniform_sequence_gen::type{}) -{ - static_assert(Seq::size() == Mask::size()); - using sliced_type = - reverse_slice_sequence_impl::type, - SliceSize>; - static_assert(sliced_type::remaining_slice_sizes::front().value == 1, - "can not evenly divide this sequence, please check"); - return make_tuple(typename sliced_type::dim_lengths{}, - typename sliced_type::dim_slices{}, - number{}); -} - // // slice tensor from x_dim, result in split in y_dim, not p_dim. 
// We don't support slice cross p_dim (aka, slice different threads) diff --git a/include/ck_tile/core/utility/functional_with_tuple.hpp b/include/ck_tile/core/utility/functional_with_tuple.hpp new file mode 100644 index 000000000..4b4040319 --- /dev/null +++ b/include/ck_tile/core/utility/functional_with_tuple.hpp @@ -0,0 +1,173 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +// This file should not be included inside tuple.hpp! + +#include "ck_tile/core/config.hpp" +#include "ck_tile/core/numeric/integer.hpp" +#include "ck_tile/core/numeric/integral_constant.hpp" +#include "ck_tile/core/numeric/math.hpp" +#include "ck_tile/core/container/sequence.hpp" +#include "ck_tile/core/container/tuple.hpp" +#include "ck_tile/core/utility/type_traits.hpp" +#include +#include + +namespace ck_tile { + +namespace detail { + +// RemainLengths: sequence<...> +// Orders: sequence<...> +template +struct static_uford_impl +{ + CK_TILE_HOST_DEVICE constexpr static_uford_impl() + { + static_assert(RemainLengths::size() > 0, "wrong! should not get here"); + static_assert(RamainUnpacks::size() > 0, "wrong! 
should not get here"); + } + + template + CK_TILE_HOST_DEVICE constexpr void operator()(F f, CurrentUnpackIds) const + { + constexpr index_t pack_len = RamainUnpacks::front(); + static_for<0, RemainLengths::front(), pack_len>{}([=](auto I) { + constexpr auto new_pack = generate_tuple( + [&](auto idx_) { + constexpr auto i_new_pack = number{}; + constexpr auto i_pre_pack = number{}; + return CurrentUnpackIds{}.at(i_pre_pack).push_back(i_new_pack); + }, + number{}); + + static_uford_impl{}(f, new_pack); + }); + } +}; + +template +struct static_uford_impl, sequence<>, Orders> +{ + template + CK_TILE_HOST_DEVICE constexpr void operator()(F f, PackedId) const + { + constexpr auto origin_packs = transform_tuples( + [](auto pack_) { return decltype(pack_)::reorder_old_to_new(Orders{}); }, PackedId{}); + unpack(f, origin_packs); + } +}; + +template +struct static_uford_one_shot_impl +{ + template + CK_TILE_HOST_DEVICE constexpr void operator()(F f, CurrentUnpackIds, number) const + { + constexpr auto r_lens_stride = + reverse_exclusive_scan_sequence(RemainLengths{}, multiplies{}, number<1>{}); + constexpr auto r_upks_stride = + reverse_exclusive_scan_sequence(RamainUnpacks{}, multiplies{}, number<1>{}); + + constexpr index_t current_stride = r_lens_stride.front() / r_upks_stride.front(); + constexpr index_t pack_len = RamainUnpacks::front(); + constexpr index_t current_idx = (current_acc / current_stride) * pack_len; + + constexpr auto new_pack = generate_tuple( + [&](auto idx_) { + constexpr auto i_new_pack = number{}; + constexpr auto i_pre_pack = number{}; + return CurrentUnpackIds{}.at(i_pre_pack).push_back(i_new_pack); + }, + number{}); + + static_uford_one_shot_impl{}(f, new_pack, number{}); + } +}; + +template +struct static_uford_one_shot_impl, sequence<>, Orders> +{ + template + CK_TILE_HOST_DEVICE constexpr void operator()(F f, PackedId, number) const + { + constexpr auto origin_packs = transform_tuples( + [](auto pack_) { return 
decltype(pack_)::reorder_old_to_new(Orders{}); }, PackedId{}); + unpack(f, origin_packs); + } +}; + +} // namespace detail + +// TODO: we may unify static_ford/static_uford in the future +// +// loop over nd space(sequence) with packs +// you must make sure the function passed in has same number of argument +// +// e.g. +// Lengths=seq<2, 3, 4>, Unpacks=<1, 1, 2> +// static_uford{}([&](auto i_0, auto i_1){}); // require 2 args(packs) +// +// loop #0, i_0=seq<0, 0, 0>, i_1=<0, 0, 1> +// loop #1, i_0=seq<0, 0, 2>, i_1=<0, 0, 3> +// loop #2, i_0=seq<0, 1, 0>, i_1=<0, 1, 1> +// loop #3, i_0=seq<0, 1, 2>, i_1=<0, 1, 3> +// loop #4, i_0=seq<0, 2, 0>, i_1=<0, 2, 1> +// loop #5, i_0=seq<0, 2, 2>, i_1=<0, 2, 3> +// loop #6, i_0=seq<1, 0, 0>, i_1=<1, 0, 1> +// ... +template ::type, + class Orders = typename arithmetic_sequence_gen<0, Lengths::size(), 1>::type> +struct static_uford +{ + static constexpr index_t num_packs = reduce_on_sequence(Unpacks{}, multiplies{}, number<1>{}); + + CK_TILE_HOST_DEVICE constexpr static_uford() + { + static_assert(Lengths::size() > 0, "wrong! Lengths is empty"); + static_assert(Lengths::size() == Unpacks::size(), "wrong! inconsistent size"); + static_assert(Lengths::size() == Orders::size(), "wrong! inconsistent size"); + static_for<0, Lengths::size(), 1>{}( + [&](auto i) { static_assert(Lengths{}.at(i) % Unpacks{}.at(i) == 0); }); + } + + CK_TILE_HOST_DEVICE static constexpr index_t get_num_of_access() + { + using L_ = decltype(Lengths{} / Unpacks{}); + + return reduce_on_sequence(L_{}, multiplies{}, number<1>{}); + } + + // F signature: F(sequence<...> multi_id...) 
+ // multi_id is the unordered multi-index + template + CK_TILE_HOST_DEVICE constexpr void operator()(F f) const + { + constexpr auto ordered_lengths = Lengths::reorder_new_to_old(Orders{}); + constexpr auto ordered_unpacks = Unpacks::reorder_new_to_old(Orders{}); + detail::static_uford_impl{}( + f, make_tuple(sequence<>{})); + } + + // this version is friendly for issue function one by one + template + CK_TILE_HOST_DEVICE constexpr void operator()(F f, number) const + { + static_assert(i_access < get_num_of_access()); + constexpr auto ordered_lengths = Lengths::reorder_new_to_old(Orders{}); + constexpr auto ordered_unpacks = Unpacks::reorder_new_to_old(Orders{}); + detail::static_uford_one_shot_impl{}( + f, make_tuple(sequence<>{}), number{}); + } +}; + +} // namespace ck_tile diff --git a/include/ck_tile/host.hpp b/include/ck_tile/host.hpp index b382710b1..dbc1f5d23 100644 --- a/include/ck_tile/host.hpp +++ b/include/ck_tile/host.hpp @@ -21,7 +21,7 @@ #include "ck_tile/host/reference/reference_batched_softmax.hpp" #include "ck_tile/host/reference/reference_gemm.hpp" #include "ck_tile/host/reference/reference_im2col.hpp" -#include "ck_tile/host/reference/reference_layernorm2d.hpp" +#include "ck_tile/host/reference/reference_layernorm2d_fwd.hpp" #include "ck_tile/host/reference/reference_reduce.hpp" #include "ck_tile/host/reference/reference_softmax.hpp" #include "ck_tile/host/stream_config.hpp" diff --git a/include/ck_tile/host/reference/reference_layernorm2d.hpp b/include/ck_tile/host/reference/reference_layernorm2d_fwd.hpp similarity index 100% rename from include/ck_tile/host/reference/reference_layernorm2d.hpp rename to include/ck_tile/host/reference/reference_layernorm2d_fwd.hpp diff --git a/include/ck_tile/ops/layernorm2d.hpp b/include/ck_tile/ops/layernorm2d.hpp index 3b66645ed..2a403b0f4 100644 --- a/include/ck_tile/ops/layernorm2d.hpp +++ b/include/ck_tile/ops/layernorm2d.hpp @@ -4,6 +4,9 @@ #pragma once #include 
"ck_tile/ops/layernorm2d/kernel/layernorm2d_fwd_kernel.hpp" -#include "ck_tile/ops/layernorm2d/pipeline/block_layernorm2d_fwd_problem.hpp" -#include "ck_tile/ops/layernorm2d/pipeline/tile_layernorm2d_fwd_shape.hpp" +#include "ck_tile/ops/layernorm2d/kernel/layernorm2d_fwd_shape.hpp" +#include "ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_default_policy.hpp" +#include "ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_one_pass.hpp" +#include "ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_problem.hpp" +#include "ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_two_pass.hpp" #include "ck_tile/ops/common/tensor_layout.hpp" diff --git a/include/ck_tile/ops/layernorm2d/kernel/layernorm2d_fwd_kernel.hpp b/include/ck_tile/ops/layernorm2d/kernel/layernorm2d_fwd_kernel.hpp index 468df793d..cebe5131a 100644 --- a/include/ck_tile/ops/layernorm2d/kernel/layernorm2d_fwd_kernel.hpp +++ b/include/ck_tile/ops/layernorm2d/kernel/layernorm2d_fwd_kernel.hpp @@ -5,37 +5,57 @@ #include "ck_tile/core.hpp" #include "ck_tile/ops/common.hpp" -#include "ck_tile/ops/welford/thread/thread_welford.hpp" -#include "ck_tile/ops/welford/warp/warp_welford.hpp" namespace ck_tile { -// TODO: Extract some type to wrapper class -template -struct Layernorm2dFwd +// host side args +struct Layernorm2dFwdHostArgs { - using Problem = ck_tile::remove_cvref_t; + const void* p_x; + const void* p_gamma; + const void* p_beta; - using XDataType = ck_tile::remove_cvref_t; - using GammaDataType = ck_tile::remove_cvref_t; - using BetaDataType = ck_tile::remove_cvref_t; - using ComputeDataType = ck_tile::remove_cvref_t; - using YDataType = ck_tile::remove_cvref_t; - using MeanDataType = ck_tile::remove_cvref_t; - using InvStdDataType = ck_tile::remove_cvref_t; + void* p_y; + void* p_mean; + void* p_invStd; - static constexpr bool kHasGamma = !std::is_same_v; - static constexpr bool kHasBeta = !std::is_same_v; - static constexpr bool kSaveMean = !std::is_same_v; - static 
constexpr bool kSaveInvStd = !std::is_same_v; + float epsilon; - static constexpr ck_tile::index_t kMPerBlock = Problem::BlockShape::kMPerBlock; - static constexpr ck_tile::index_t kNPerBlock = Problem::BlockShape::kNPerBlock; - static constexpr bool kPadM = Problem::kPadM; - static constexpr bool kPadN = Problem::kPadN; + index_t m; + index_t n; + index_t stride; // row_stride +}; - static constexpr ck_tile::index_t kNThreadPerWarp = Problem::BlockShape::kNThreadPerWarp; - static constexpr ck_tile::index_t kNPerThread = Problem::BlockShape::kNPerThread; +// TODO: Extract some type to wrapper class +template +struct Layernorm2dFwd +{ + using Pipeline = remove_cvref_t; + using Problem = typename Pipeline::Problem; + + using XDataType = remove_cvref_t; + using GammaDataType = remove_cvref_t; + using BetaDataType = remove_cvref_t; + using ComputeDataType = remove_cvref_t; + using YDataType = remove_cvref_t; + using MeanDataType = remove_cvref_t; + using InvStdDataType = remove_cvref_t; + + static constexpr bool kHasGamma = !std::is_same_v; + static constexpr bool kHasBeta = !std::is_same_v; + static constexpr bool kSaveMeanInvStd = Problem::kSaveMeanInvStd; + static constexpr bool kSaveMean = Problem::kSaveMeanInvStd; + static constexpr bool kSaveInvStd = Problem::kSaveMeanInvStd; + + static constexpr index_t Block_M = Problem::BlockShape::Block_M; + static constexpr index_t Block_N = Problem::BlockShape::Block_N; + static constexpr bool kPadM = false; // always no need to pad along M + static constexpr bool kPadN = Problem::kPadN; + static constexpr bool kTwoPass = Problem::kTwoPass; + + static constexpr index_t ThreadPerWarp_N = Problem::BlockShape::ThreadPerWarp_N; + static constexpr index_t Vector_N = Problem::BlockShape::Vector_N; + static constexpr index_t Repeat_N = Problem::BlockShape::Repeat_N; static constexpr auto I0 = number<0>{}; static constexpr auto I1 = number<1>{}; @@ -52,400 +72,177 @@ struct Layernorm2dFwd float epsilon; - ck_tile::index_t M; - 
ck_tile::index_t N; + index_t m; + index_t n; + index_t stride; // row_stride }; + using Hargs = Layernorm2dFwdHostArgs; - CK_TILE_HOST static constexpr Kargs MakeKargs(const void* p_x, - const void* p_gamma, - const void* p_beta, - void* p_y, - void* p_mean, - void* p_invStd, - float epsilon, - ck_tile::index_t M, - ck_tile::index_t N) + CK_TILE_HOST static constexpr Kargs MakeKargs(const Hargs& hargs) { - return Kargs{p_x, p_gamma, p_beta, p_y, p_mean, p_invStd, epsilon, M, N}; + return Kargs{hargs.p_x, + hargs.p_gamma, + hargs.p_beta, + hargs.p_y, + hargs.p_mean, + hargs.p_invStd, + hargs.epsilon, + hargs.m, + hargs.n, + hargs.stride}; } - CK_TILE_HOST static constexpr auto GridSize(ck_tile::index_t M) { return M / kMPerBlock; } - - CK_TILE_HOST static constexpr auto BlockSize() { return Problem::BlockShape::kBlockSize; } - - CK_TILE_DEVICE static constexpr auto MakeXBlockTileDistribution() + CK_TILE_HOST static constexpr auto GridSize(const Hargs& hargs) { - using S = typename Problem::BlockShape; - - return make_static_tile_distribution( - tile_distribution_encoding< - sequence<>, - tuple, - sequence>, - tuple, sequence<1, 2>>, - tuple, sequence<1, 1>>, - sequence<1, 2>, - sequence<2, 2>>{}); + return (hargs.m + Block_M - 1) / Block_M; } - CK_TILE_DEVICE static constexpr auto MakeGammaBetaBlockTileDistribution() - { - using S = typename Problem::BlockShape; - - return make_static_tile_distribution( - tile_distribution_encoding< - sequence, - tuple>, - tuple, sequence<0, 1>>, - tuple, sequence<1, 1>>, - sequence<1>, - sequence<2>>{}); - } - - CK_TILE_DEVICE static int GetWelfordMaxCount(int N) - { - constexpr ck_tile::index_t kNThreadPerBlock = kNPerBlock / kNPerThread; - - int thread_id_n = get_thread_id() % kNThreadPerBlock; - int max_count = - __builtin_amdgcn_readfirstlane(N < kNPerBlock ? 
0 : kNPerThread * (N / kNPerBlock)); - int n_per_block_tail_loop = - __builtin_amdgcn_readfirstlane(N - max_count * kNThreadPerBlock); - - if(n_per_block_tail_loop > 0) - { - int thread_max_n = (thread_id_n + 1) * kNPerThread; - int delta = thread_max_n - n_per_block_tail_loop; - delta = clamp(thread_max_n - n_per_block_tail_loop, 0, kNPerThread); - max_count += kNPerThread - delta; - } - - return max_count; - } + CK_TILE_HOST static constexpr auto BlockSize() { return Problem::BlockShape::BlockSize; } - template - CK_TILE_DEVICE static auto InvSqrt(const DistributedTensor& in_dstr_tensor, - const ComputeDataType epsilon) - { - // TODO: Investigate fast inverse square root algorithm with epsilon - constexpr auto spans = DistributedTensor::get_distributed_spans(); - - DistributedTensor out_dstr_tensor; + // clang-format off + template struct t2s; + template <> struct t2s { static constexpr const char * name = "fp32"; }; + template <> struct t2s { static constexpr const char * name = "fp16"; }; + template <> struct t2s { static constexpr const char * name = "bf16"; }; + template <> struct t2s { static constexpr const char * name = "fp8"; }; + template <> struct t2s { static constexpr const char * name = "bf8"; }; + // clang-format on - sweep_tile_span(spans[number<0>{}], [&](auto idx0) { - constexpr auto i_idx = make_tuple(idx0); - out_dstr_tensor(i_idx) = type_convert(1.0f) / - ck_tile::sqrt(in_dstr_tensor[i_idx] + epsilon); - }); - - return out_dstr_tensor; - } + // in byte + CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize() { return Pipeline::GetSmemSize(); } - template - CK_TILE_DEVICE std::enable_if_t - TwoPassLayernorm2dFwd(XBlockWindow& x_block_window, - GammaBlockWindow& gamma_block_window, - BetaBlockWindow& beta_block_window, - YBlockWindow& y_block_window, - MeanBlockWindow& mean_block_window, - InvStdBlockWindow& inv_std_block_window, - ComputeDataType epsilon, - ck_tile::index_t N) const + CK_TILE_HOST static std::string GetName() { - // TODO - 
Optimize tail loop to reduce move_tile_window() - index_t num_n_tile_iteration = - __builtin_amdgcn_readfirstlane(integer_divide_ceil(N, kNPerBlock)); - - int welford_max_count = GetWelfordMaxCount(N); - ThreadWelford thread_welford{welford_max_count}; - - using XTensorType = decltype(load_tile(x_block_window)); - auto mean_compute_block_tensor = - thread_welford.template MakeInitialMeanVarDistributedTensor(); - auto var_compute_block_tensor = - thread_welford.template MakeInitialMeanVarDistributedTensor(); - - clear_tile(mean_compute_block_tensor); - clear_tile(var_compute_block_tensor); - - for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN) - { - const auto x_block_tensor = load_tile(x_block_window); - - thread_welford(x_block_tensor, mean_compute_block_tensor, var_compute_block_tensor); - move_tile_window(x_block_window, {0, kNPerBlock}); - } - - // TODO: support cross warp Welford - WarpMergeWelford{}( - mean_compute_block_tensor, var_compute_block_tensor, thread_welford.cur_count_); - - auto inv_std_compute_block_tensor = InvSqrt(var_compute_block_tensor, epsilon); - - if constexpr(kSaveMean) - store_tile(mean_block_window, cast_tile(mean_compute_block_tensor)); - if constexpr(kSaveInvStd) - store_tile(inv_std_block_window, - cast_tile(inv_std_compute_block_tensor)); - - // reverse read x to reuse cache - ck_tile::index_t stride_to_right_most_window = - N % kNPerBlock == 0 ? 
N - kNPerBlock : N - N % kNPerBlock; - - move_tile_window(x_block_window, {0, -kNPerBlock}); - move_tile_window(gamma_block_window, {stride_to_right_most_window}); - move_tile_window(beta_block_window, {stride_to_right_most_window}); - move_tile_window(y_block_window, {0, stride_to_right_most_window}); - - // Normalization - for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN) - { - const auto x_block_tensor = load_tile(x_block_window); - const auto gamma_block_tensor = load_tile(gamma_block_window); - const auto beta_block_tensor = load_tile(beta_block_window); - - constexpr auto x_spans = decltype(x_block_tensor)::get_distributed_spans(); - - auto y_block_tensor = - make_static_distributed_tensor(x_block_tensor.get_tile_distribution()); - - sweep_tile_span(x_spans[I1], [&](auto idx1) { - constexpr auto j_idx = make_tuple(idx1); - const auto gamma = type_convert(gamma_block_tensor[j_idx]); - const auto beta = type_convert(beta_block_tensor[j_idx]); - - sweep_tile_span(x_spans[I0], [&](auto idx0) { - constexpr auto i_idx = make_tuple(idx0); - constexpr auto i_j_idx = make_tuple(idx0, idx1); - - const auto mean = mean_compute_block_tensor[i_idx]; - const auto inv_std = inv_std_compute_block_tensor[i_idx]; - - const auto x = type_convert(x_block_tensor[i_j_idx]); - auto y = (x - mean) * inv_std * gamma + beta; - - y_block_tensor(i_j_idx) = type_convert(y); - }); - }); - - store_tile(y_block_window, y_block_tensor); - - move_tile_window(x_block_window, {0, -kNPerBlock}); - move_tile_window(gamma_block_window, {-kNPerBlock}); - move_tile_window(beta_block_window, {-kNPerBlock}); - move_tile_window(y_block_window, {0, -kNPerBlock}); - } - } - - template - CK_TILE_DEVICE std::enable_if_t - OnePassLayernorm2dFwd(XBlockWindow& x_block_window, - GammaBlockWindow& gamma_block_window, - BetaBlockWindow& beta_block_window, - YBlockWindow& y_block_window, - MeanBlockWindow& mean_block_window, - InvStdBlockWindow& inv_std_block_window, - 
ComputeDataType epsilon, - ck_tile::index_t N) const - { - int welford_max_count = GetWelfordMaxCount(N); - ThreadWelford thread_welford{welford_max_count}; - - using XTensorType = decltype(load_tile(x_block_window)); - auto mean_compute_block_tensor = - thread_welford.template MakeInitialMeanVarDistributedTensor(); - auto var_compute_block_tensor = - thread_welford.template MakeInitialMeanVarDistributedTensor(); - - clear_tile(mean_compute_block_tensor); - clear_tile(var_compute_block_tensor); - - const auto x_block_tensor = load_tile(x_block_window); - thread_welford(x_block_tensor, mean_compute_block_tensor, var_compute_block_tensor); - // TODO: support cross warp Welford - WarpMergeWelford{}( - mean_compute_block_tensor, var_compute_block_tensor, thread_welford.cur_count_); - - auto inv_std_compute_block_tensor = InvSqrt(var_compute_block_tensor, epsilon); - - if constexpr(kSaveMean) - store_tile(mean_block_window, cast_tile(mean_compute_block_tensor)); - if constexpr(kSaveInvStd) - store_tile(inv_std_block_window, - cast_tile(inv_std_compute_block_tensor)); - - // normalize - const auto gamma_block_tensor = load_tile(gamma_block_window); - const auto beta_block_tensor = load_tile(beta_block_window); - - constexpr auto x_spans = decltype(x_block_tensor)::get_distributed_spans(); - - auto y_block_tensor = - make_static_distributed_tensor(x_block_tensor.get_tile_distribution()); - - sweep_tile_span(x_spans[I1], [&](auto idx1) { - constexpr auto j_idx = make_tuple(idx1); - const auto gamma = type_convert(gamma_block_tensor[j_idx]); - const auto beta = type_convert(beta_block_tensor[j_idx]); - - sweep_tile_span(x_spans[I0], [&](auto idx0) { - constexpr auto i_idx = make_tuple(idx0); - constexpr auto i_j_idx = make_tuple(idx0, idx1); - - const auto mean = mean_compute_block_tensor[i_idx]; - const auto inv_std = inv_std_compute_block_tensor[i_idx]; - - const auto x = type_convert(x_block_tensor[i_j_idx]); - auto y = (x - mean) * inv_std * gamma + beta; - - 
y_block_tensor(i_j_idx) = type_convert(y); - }); - }); - - store_tile(y_block_window, y_block_tensor); + // clang-format off + using S_ = typename Problem::BlockShape; + auto surfix = [&] () { + std::string n; + if (kPadN) n += "_pn"; + if (kSaveMeanInvStd) n += "_mv"; + if (kTwoPass) n += "_2p"; + return n; }(); + + #define _SS_ std::string + #define _TS_ std::to_string + return _SS_("layernorm2d_fwd_") + _SS_(t2s::name) + "_" + + _TS_(S_::Block_M) + "x" + _TS_(S_::Block_N) + "_" + _TS_(S_::WarpPerBlock_M) + "x" + _TS_(S_::WarpPerBlock_N) + "_" + + _TS_(S_::Warp_M) + "x" + _TS_(S_::Warp_N) + "_" + _TS_(S_::Vector_M) + "x" + _TS_(S_::Vector_N) + "_" + + _SS_(Pipeline::name) + surfix; + #undef _SS_ + #undef _TS_ + // clang-format on } CK_TILE_DEVICE void operator()(Kargs kargs) const { - const auto x_m_n = [&]() { - const auto x_dram_naive = make_naive_tensor_view( + const auto iM = get_block_id() * Block_M; + + const auto x_window = [&]() { + const auto tmp_ = make_naive_tensor_view( static_cast(kargs.p_x), - make_tuple(kargs.M, kargs.N), - make_tuple(kargs.N, 1), - number{}, + make_tuple(kargs.m, kargs.n), + make_tuple(kargs.stride, 1), + number{}, number<1>{}); - return pad_tensor_view(x_dram_naive, - make_tuple(number{}, number{}), - sequence{}); + // NOTE: we don't do any pad in this kernel for loading, assume that inside kernel will + // check the max count dynamically + const auto tmp2_ = pad_tensor_view( + tmp_, make_tuple(number{}, number{}), sequence{}); + return make_tile_window( + tmp2_, make_tuple(number{}, number{}), {iM, 0}); }(); - const auto gamma_n = [&]() { - const auto gamma_dram_naive = make_naive_tensor_view( + const auto gamma_window = [&]() { + const auto tmp_ = make_naive_tensor_view( static_cast(kargs.p_gamma), - make_tuple(kargs.N), + make_tuple(kargs.n), make_tuple(1), - number{}, + number{}, number<1>{}); - return pad_tensor_view( - gamma_dram_naive, make_tuple(number{}), sequence{}); + const auto tmp2_ = + pad_tensor_view(tmp_, 
make_tuple(number{}), sequence{}); + + return make_tile_window(tmp2_, make_tuple(number{}), {0}); }(); - const auto beta_n = [&]() { - const auto gamma_dram_naive = make_naive_tensor_view( + const auto beta_window = [&]() { + const auto tmp_ = make_naive_tensor_view( static_cast(kargs.p_beta), - make_tuple(kargs.N), + make_tuple(kargs.n), make_tuple(1), - number{}, + number{}, number<1>{}); - return pad_tensor_view( - gamma_dram_naive, make_tuple(number{}), sequence{}); + const auto tmp2_ = + pad_tensor_view(tmp_, make_tuple(number{}), sequence{}); + return make_tile_window(tmp2_, make_tuple(number{}, number{}), {0}); }(); - const auto iM = get_block_id() * kMPerBlock; - - constexpr auto xDstr = MakeXBlockTileDistribution(); - - auto x_block_window = make_tile_window( - x_m_n, make_tuple(number{}, number{}), {iM, 0}, xDstr); - - const auto y_m_n = [&]() { - const auto y_dram_naive = make_naive_tensor_view( + auto y_window = [&]() { + auto tmp_ = make_naive_tensor_view( static_cast(kargs.p_y), - make_tuple(kargs.M, kargs.N), - make_tuple(kargs.N, 1), - number{}, + make_tuple(kargs.m, kargs.n), + make_tuple(kargs.stride, 1), + number{}, number<1>{}); - return pad_tensor_view(y_dram_naive, - make_tuple(number{}, number{}), - sequence{}); + auto tmp2_ = pad_tensor_view( + tmp_, make_tuple(number{}, number{}), sequence{}); + return make_tile_window( + tmp2_, make_tuple(number{}, number{}), {iM, 0}); }(); - auto y_block_window = make_tile_window( - y_m_n, make_tuple(number{}, number{}), {iM, 0}); - - constexpr auto gammaDstr = MakeGammaBetaBlockTileDistribution(); - constexpr auto betaDstr = gammaDstr; - - auto gamma_block_window = - make_tile_window(gamma_n, make_tuple(number{}), {0}, gammaDstr); - - auto beta_block_window = make_tile_window( - beta_n, make_tuple(number{}, number{}), {0}, betaDstr); - - auto mean_block_window = [&]() { + auto mean_window = [&]() { if constexpr(kSaveMean) { const auto mean_m = [&]() { const auto mean_dram_naive = 
make_naive_tensor_view_packed( static_cast(kargs.p_mean), - make_tuple(kargs.M), + make_tuple(kargs.m), number<1>{}); return pad_tensor_view( - mean_dram_naive, make_tuple(number{}), sequence{}); + mean_dram_naive, make_tuple(number{}), sequence{}); }(); - - return make_tile_window(mean_m, make_tuple(number{}), {iM}); + return make_tile_window(mean_m, make_tuple(number{}), {iM}); } else - return make_null_tile_window(make_tuple(number{})); + return make_null_tile_window(make_tuple(number{})); }(); - auto inv_std_block_window = [&]() { + auto inv_std_window = [&]() { if constexpr(kSaveInvStd) { const auto inv_std_m = [&]() { const auto inv_std_dram_naive = make_naive_tensor_view_packed( static_cast(kargs.p_invStd), - make_tuple(kargs.M), + make_tuple(kargs.m), number<1>{}); return pad_tensor_view( - inv_std_dram_naive, make_tuple(number{}), sequence{}); + inv_std_dram_naive, make_tuple(number{}), sequence{}); }(); - - return make_tile_window(inv_std_m, make_tuple(number{}), {iM}); + return make_tile_window(inv_std_m, make_tuple(number{}), {iM}); } else - return make_null_tile_window(make_tuple(number{})); + return make_null_tile_window(make_tuple(number{})); }(); - if(kargs.N <= kNPerBlock) - OnePassLayernorm2dFwd(x_block_window, - gamma_block_window, - beta_block_window, - y_block_window, - mean_block_window, - inv_std_block_window, - static_cast(kargs.epsilon), - kargs.N); - else - TwoPassLayernorm2dFwd(x_block_window, - gamma_block_window, - beta_block_window, - y_block_window, - mean_block_window, - inv_std_block_window, - static_cast(kargs.epsilon), - kargs.N); + __shared__ char smem[GetSmemSize()]; + + Pipeline{}(x_window, + gamma_window, + beta_window, + y_window, + mean_window, + inv_std_window, + static_cast(kargs.epsilon), + kargs.n, + smem); } }; diff --git a/include/ck_tile/ops/layernorm2d/kernel/layernorm2d_fwd_shape.hpp b/include/ck_tile/ops/layernorm2d/kernel/layernorm2d_fwd_shape.hpp new file mode 100644 index 000000000..e4b60331e --- /dev/null +++ 
b/include/ck_tile/ops/layernorm2d/kernel/layernorm2d_fwd_shape.hpp @@ -0,0 +1,78 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck_tile/core.hpp" + +namespace ck_tile { +/* +// clang-format off + +4-level descriptor: BlockTile-> WarpPerBlock-> WarpTile-> Vector + + Block_N (Warp_N * WarpPerBlock_N * Repeat_N ) + +<----------------------< Repeat_N(2)>--------------------->+ + | | + +<-- -->+ + Warp_N + +--------------+--------------+--------------+--------------+----+----------------+ + Warp_M | wrap_0 | wrap_1 | | ^ ^ + +--------------+--------------+ | | + | wrap_2 | wrap_3 | | v + +--------------+--------------+--------------+--------------+----+ Block_M + | | | + + + | + | | | v + +--------------+--------------+--------------+--------------+ + + + each Warp-tile (e.g 16 thrd per row) + + Vector_N (contiguous pixels each thrd holds along N, or vector size) + +-----------+-----------+-----------+-----------+-----------+ + | thrd_0 | thrd_1 | thrd_2 | thrd_3 | ... Vector_M + +-----------+-----------+-----------+-----------+-----------+ + | thrd_16 | thrd_17 | thrd_18 | thrd_19 | ... 
+ +-----------+-----------+-----------+-----------+-----------+ +// clang-format on +*/ +template + typename WarpPerBlock_, // num warps along seq + typename WarpTile_, // warp size, seq + typename Vector_, // contiguous pixels(vector size) along seq + index_t BlockSize_ = + warpSize* reduce_on_sequence(WarpPerBlock_{}, multiplies{}, number<1>{})> +struct Layernorm2dShape +{ + // block size + static constexpr index_t Block_M = BlockTile_::at(number<0>{}); + static constexpr index_t Block_N = BlockTile_::at(number<1>{}); + + // num warps along seq, within each block + static constexpr index_t WarpPerBlock_M = WarpPerBlock_::at(number<0>{}); + static constexpr index_t WarpPerBlock_N = WarpPerBlock_::at(number<1>{}); + + // warp size + static constexpr index_t Warp_M = WarpTile_::at(number<0>{}); + static constexpr index_t Warp_N = WarpTile_::at(number<1>{}); + + static_assert(Block_M % (WarpPerBlock_M * Warp_M) == 0); + static_assert(Block_N % (WarpPerBlock_N * Warp_N) == 0); + // repeat of each thread along seq + static constexpr index_t Repeat_M = Block_M / (WarpPerBlock_M * Warp_M); + static constexpr index_t Repeat_N = Block_N / (WarpPerBlock_N * Warp_N); + + // vector size along seq + static constexpr index_t Vector_M = Vector_::at(number<0>{}); + static constexpr index_t Vector_N = Vector_::at(number<1>{}); + + static_assert(Warp_M % Vector_M == 0); + static_assert(Warp_N % Vector_N == 0); + // num of threads along seq, within each warp + static constexpr index_t ThreadPerWarp_M = Warp_M / Vector_M; + static constexpr index_t ThreadPerWarp_N = Warp_N / Vector_N; + + static constexpr index_t BlockSize = BlockSize_; +}; + +} // namespace ck_tile diff --git a/include/ck_tile/ops/layernorm2d/pipeline/block_layernorm2d_fwd_problem.hpp b/include/ck_tile/ops/layernorm2d/pipeline/block_layernorm2d_fwd_problem.hpp deleted file mode 100644 index 707a38f62..000000000 --- a/include/ck_tile/ops/layernorm2d/pipeline/block_layernorm2d_fwd_problem.hpp +++ /dev/null @@ -1,34 
+0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. - -#pragma once - -#include "ck_tile/core/utility/type_traits.hpp" - -namespace ck_tile { - -template -struct BlockLayernorm2dFwdProblem -{ - using XDataType = remove_cvref_t; - using GammaDataType = remove_cvref_t; - using BetaDataType = remove_cvref_t; - using ComputeDataType = remove_cvref_t; - using YDataType = remove_cvref_t; - using MeanDataType = remove_cvref_t; - using InvStdDataType = remove_cvref_t; - using BlockShape = remove_cvref_t; - static constexpr bool kPadM = kPadM_; - static constexpr bool kPadN = kPadN_; -}; - -} // namespace ck_tile diff --git a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_default_policy.hpp b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_default_policy.hpp new file mode 100644 index 000000000..6661cddf4 --- /dev/null +++ b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_default_policy.hpp @@ -0,0 +1,99 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck_tile/core.hpp" +#include "ck_tile/ops/welford/block/block_welford_problem.hpp" +#include "ck_tile/ops/welford/block/block_welford.hpp" + +namespace ck_tile { + +struct Layernorm2dFwdPipelineDefaultPolicy +{ + template + CK_TILE_DEVICE static constexpr auto MakeXBlockTileDistribution() + { + using S = typename Problem::BlockShape; + + return make_static_tile_distribution( + tile_distribution_encoding< + sequence<>, + tuple, + sequence>, + tuple, sequence<1, 2>>, + tuple, sequence<2, 2>>, + sequence<1, 1, 2, 2>, + sequence<0, 3, 0, 3>>{}); + } + template + CK_TILE_DEVICE static constexpr auto MakeGammaBetaBlockTileDistribution() + { + using S = typename Problem::BlockShape; + + return make_static_tile_distribution( + tile_distribution_encoding< + sequence, + tuple>, + tuple, sequence<0, 1>>, + tuple, sequence<1, 2>>, + sequence<1, 1>, + sequence<0, 3>>{}); + } + + template + CK_TILE_HOST_DEVICE static constexpr auto GetBlockWelford() + { + using P_ = BlockWelfordProblem; + + return BlockWelford{}; + } + + template + CK_TILE_HOST_DEVICE static constexpr auto GetBlockWelfordSync() + { + using P_ = BlockWelfordProblem; + + return BlockWelfordSync{}; + } + + template + CK_TILE_HOST_DEVICE static constexpr auto GetBlockWelfordCrossWarpSync() + { + using P_ = BlockWelfordProblem; + + return BlockWelfordCrossWarpSync{}; + } + + template + CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize() + { + if constexpr(Problem::kNeedCrossWarpSync) + { + using P_ = BlockWelfordProblem; + + using block_welford = BlockWelford; + using x_block_tile = + decltype(make_static_distributed_tensor( + MakeXBlockTileDistribution())); + using mean_var_block_tile = + decltype(block_welford::template MakeMeanVarBlockTile()); + + return GetBlockWelfordCrossWarpSync() + .template GetSmemSize(); + } + else + { + return 1; // zero size arrays are an extension + } + } +}; +} // namespace ck_tile diff --git 
a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_one_pass.hpp b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_one_pass.hpp new file mode 100644 index 000000000..d73bcb29e --- /dev/null +++ b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_one_pass.hpp @@ -0,0 +1,119 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck_tile/core.hpp" +#include "ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_default_policy.hpp" +#include +#include + +namespace ck_tile { + +template +struct Layernorm2dFwdPipelineOnePass +{ + using Problem = ck_tile::remove_cvref_t; + using Policy = ck_tile::remove_cvref_t; + + using XDataType = ck_tile::remove_cvref_t; + using GammaDataType = ck_tile::remove_cvref_t; + using BetaDataType = ck_tile::remove_cvref_t; + using ComputeDataType = ck_tile::remove_cvref_t; + using YDataType = ck_tile::remove_cvref_t; + using MeanDataType = ck_tile::remove_cvref_t; + using InvStdDataType = ck_tile::remove_cvref_t; + + static constexpr bool kHasGamma = !std::is_same_v; + static constexpr bool kHasBeta = !std::is_same_v; + static constexpr bool kSaveMean = Problem::kSaveMeanInvStd; + static constexpr bool kSaveInvStd = Problem::kSaveMeanInvStd; + + static constexpr bool kNeedCrossWarpSync = Problem::kNeedCrossWarpSync; + static constexpr bool kPadM = false; // TODO - Problem::kPadM (not provided by Layernorm2dFwdPipelineProblem yet) + static constexpr bool kPadN = Problem::kPadN; + + static constexpr const char* name = []() { + if constexpr(kNeedCrossWarpSync) + return "bpr"; // block per row + else + return "wpr"; // warp per row + }(); + + CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize() + { + return Policy::template GetSmemSize(); + } + + template + CK_TILE_DEVICE auto operator()(const XWindow& x_window_, + const GammaWindow& gamma_window_, + const BetaWindow& beta_window_, + YWindow& y_window, + MeanWindow& mean_window, + 
InvStdWindow& inv_std_window, + ComputeDataType epsilon, + ck_tile::index_t row_size, + void* smem) const + { + const auto x_window = + make_tile_window(x_window_, Policy::template MakeXBlockTileDistribution()); + const auto gamma_window = make_tile_window( + gamma_window_, Policy::template MakeGammaBetaBlockTileDistribution()); + const auto beta_window = make_tile_window( + beta_window_, Policy::template MakeGammaBetaBlockTileDistribution()); + + const auto x = load_tile(x_window); + int cur_count = 0; + int max_count = + block_tile_welford_calculate_max_count(row_size); + auto block_welford = Policy::template GetBlockWelford(); + auto block_welford_sync = Policy::template GetBlockWelfordSync(); + auto block_welford_cross_warp_sync = + Policy::template GetBlockWelfordCrossWarpSync(); + + // load gamma/beta (TODO: support no gamma/beta?) + const auto gamma = load_tile(gamma_window); + const auto beta = load_tile(beta_window); + + // compute welford each-thread->cross-lane->cross-warp + auto [mean, var] = block_welford(x, cur_count, max_count); + block_welford_sync(mean, var, cur_count); + block_welford_cross_warp_sync(mean, var, cur_count, smem); + block_tile_welford_post_scale_var(var, cur_count); + + // compute inv-std = 1 / sqrt(var + epsilon); epsilon belongs under the sqrt + auto inv_std = tile_elementwise_in( + [&](const auto& v_) { + return type_convert(1.0f) / sqrt(v_ + epsilon); + }, + var); + + if constexpr(kSaveMean) + store_tile(mean_window, cast_tile(mean)); + if constexpr(kSaveInvStd) + store_tile(inv_std_window, cast_tile(inv_std)); + + // layernorm computation: y = (x - mean) * inv_std * gamma + beta + auto y = make_static_distributed_tensor(x.get_tile_distribution()); + sweep_tile(y, [&, mean_ = mean](auto idx) { + constexpr auto i_idx = make_tuple(idx[number<0>{}]); + constexpr auto j_idx = make_tuple(idx[number<1>{}]); + + const auto gamma_ = type_convert(gamma[j_idx]); + const auto beta_ = type_convert(beta[j_idx]); + + const auto x_ = type_convert(x[idx]); + auto y_ = (x_ - mean_[i_idx]) * inv_std[i_idx] * gamma_ + beta_; + + y(idx) = 
type_convert(y_); + }); + store_tile(y_window, y); + } +}; +} // namespace ck_tile diff --git a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_problem.hpp b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_problem.hpp new file mode 100644 index 000000000..8e9f8e81e --- /dev/null +++ b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_problem.hpp @@ -0,0 +1,40 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck_tile/core/utility/type_traits.hpp" + +namespace ck_tile { + +template +struct Layernorm2dFwdPipelineProblem +{ + using XDataType = remove_cvref_t; + using GammaDataType = remove_cvref_t; + using BetaDataType = remove_cvref_t; + using ComputeDataType = remove_cvref_t; + using YDataType = remove_cvref_t; + using MeanDataType = remove_cvref_t; + using InvStdDataType = remove_cvref_t; + using BlockShape = remove_cvref_t; + + static constexpr bool kNeedCrossLaneSync = BlockShape::ThreadPerWarp_N > 1; + static constexpr bool kNeedCrossWarpSync = BlockShape::WarpPerBlock_N > 1; + + static constexpr bool kPadN = kPadN_; + static constexpr bool kSaveMeanInvStd = kSaveMeanInvStd_; + static constexpr bool kTwoPass = kTwoPass_; +}; + +} // namespace ck_tile diff --git a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_two_pass.hpp b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_two_pass.hpp new file mode 100644 index 000000000..dcbfc87da --- /dev/null +++ b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_two_pass.hpp @@ -0,0 +1,160 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck_tile/core.hpp" +#include "ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_default_policy.hpp" +#include +#include + +namespace ck_tile { + +template +struct Layernorm2dFwdPipelineTwoPass +{ + using Problem = ck_tile::remove_cvref_t; + using Policy = ck_tile::remove_cvref_t; + + using XDataType = ck_tile::remove_cvref_t; + using GammaDataType = ck_tile::remove_cvref_t; + using BetaDataType = ck_tile::remove_cvref_t; + using ComputeDataType = ck_tile::remove_cvref_t; + using YDataType = ck_tile::remove_cvref_t; + using MeanDataType = ck_tile::remove_cvref_t; + using InvStdDataType = ck_tile::remove_cvref_t; + + static constexpr bool kHasGamma = !std::is_same_v; + static constexpr bool kHasBeta = !std::is_same_v; + static constexpr bool kSaveMean = Problem::kSaveMeanInvStd; + static constexpr bool kSaveInvStd = Problem::kSaveMeanInvStd; + + static constexpr bool kNeedCrossWarpSync = Problem::kNeedCrossWarpSync; + static constexpr bool kPadM = false; // TODO - Problem::kPadM (not provided by Layernorm2dFwdPipelineProblem yet) + static constexpr bool kPadN = Problem::kPadN; + + static constexpr const char* name = []() { + if constexpr(kNeedCrossWarpSync) + return "bpr"; // block per row + else + return "wpr"; // warp per row + }(); + + CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize() + { + return Policy::template GetSmemSize(); + } + + template + CK_TILE_DEVICE auto operator()(const XWindow& x_window_, + const GammaWindow& gamma_window_, + const BetaWindow& beta_window_, + YWindow& y_window, + MeanWindow& mean_window, + InvStdWindow& inv_std_window, + ComputeDataType epsilon, + ck_tile::index_t row_size, + void* smem) const + { + auto x_window = + make_tile_window(x_window_, Policy::template MakeXBlockTileDistribution()); + auto gamma_window = make_tile_window( + gamma_window_, Policy::template MakeGammaBetaBlockTileDistribution()); + auto beta_window = make_tile_window( + beta_window_, Policy::template MakeGammaBetaBlockTileDistribution()); + + // 
Problem::BlockShape + static constexpr index_t Block_N = Problem::BlockShape::Block_N; + index_t num_n_tile_iteration = + __builtin_amdgcn_readfirstlane(integer_divide_ceil(row_size, Block_N)); + + // welford max count: every iteration but the last is assumed unpadded (only the last tile can be padded) + constexpr index_t count_per_iter = + Problem::BlockShape::Repeat_N * Problem::BlockShape::Vector_N; + const index_t last_iter_n = row_size - (num_n_tile_iteration - 1) * Block_N; + + int cur_count = 0; + int max_count = + (num_n_tile_iteration - 1) * count_per_iter + + block_tile_welford_calculate_max_count(last_iter_n); + auto block_welford = Policy::template GetBlockWelford(); + auto block_welford_sync = Policy::template GetBlockWelfordSync(); + auto block_welford_cross_warp_sync = + Policy::template GetBlockWelfordCrossWarpSync(); + + using XTensorType = decltype(load_tile(x_window)); + auto mean = block_welford.template MakeMeanVarBlockTile(); + auto var = block_welford.template MakeMeanVarBlockTile(); + + for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN) + { + const auto x = load_tile(x_window); + block_welford(x, mean, var, cur_count, max_count); + move_tile_window(x_window, {0, Block_N}); + } + + block_welford_sync(mean, var, cur_count); + block_welford_cross_warp_sync(mean, var, cur_count, smem); + block_tile_welford_post_scale_var(var, cur_count); + + // compute inv-std = 1 / sqrt(var + epsilon); epsilon belongs under the sqrt + auto inv_std = tile_elementwise_in( + [&](const auto& v_) { + return type_convert(1.0f) / sqrt(v_ + epsilon); + }, + var); + + if constexpr(kSaveMean) + store_tile(mean_window, cast_tile(mean)); + if constexpr(kSaveInvStd) + store_tile(inv_std_window, cast_tile(inv_std)); + + // reverse read x to reuse cache + ck_tile::index_t stride_to_right_most_window = + row_size % Block_N == 0 ? 
row_size - Block_N : row_size - row_size % Block_N; + + // pass 1 left x_window one tile past the last tile: step it back by Block_N, and + // jump gamma/beta/y forward to that same right-most tile so pass 2 walks right-to-left + move_tile_window(x_window, {0, -Block_N}); + move_tile_window(gamma_window, {stride_to_right_most_window}); + move_tile_window(beta_window, {stride_to_right_most_window}); + move_tile_window(y_window, {0, stride_to_right_most_window}); + + // layernorm computation: y = (x - mean) * inv_std * gamma + beta + for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN) + { + const auto x = load_tile(x_window); + // load gamma/beta (TODO: support no gamma/beta?) + const auto gamma = load_tile(gamma_window); + const auto beta = load_tile(beta_window); + + auto y = make_static_distributed_tensor(x.get_tile_distribution()); + + sweep_tile(y, [&, mean_ = mean](auto idx) { + constexpr auto i_idx = make_tuple(idx[number<0>{}]); + constexpr auto j_idx = make_tuple(idx[number<1>{}]); + + const auto gamma_ = type_convert(gamma[j_idx]); + const auto beta_ = type_convert(beta[j_idx]); + + const auto x_ = type_convert(x[idx]); + auto y_ = (x_ - mean_[i_idx]) * inv_std[i_idx] * gamma_ + beta_; + + y(idx) = type_convert(y_); + }); + + store_tile(y_window, y); + + move_tile_window(x_window, {0, -Block_N}); + move_tile_window(gamma_window, {-Block_N}); + move_tile_window(beta_window, {-Block_N}); + move_tile_window(y_window, {0, -Block_N}); + } + } +}; +} // namespace ck_tile diff --git a/include/ck_tile/ops/layernorm2d/pipeline/tile_layernorm2d_fwd_shape.hpp b/include/ck_tile/ops/layernorm2d/pipeline/tile_layernorm2d_fwd_shape.hpp deleted file mode 100644 index 1ff541d84..000000000 --- a/include/ck_tile/ops/layernorm2d/pipeline/tile_layernorm2d_fwd_shape.hpp +++ /dev/null @@ -1,35 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. - -#pragma once - -#include "ck_tile/core.hpp" - -namespace ck_tile { -template // Sequence<... 
-struct TileLayernorm2dShape -{ - static constexpr index_t kMPerThread = ThreadTile::at(number<0>{}); - static constexpr index_t kNPerThread = ThreadTile::at(number<1>{}); - - static constexpr index_t kMPerWarp = WarpTile::at(number<0>{}); - static constexpr index_t kNPerWarp = WarpTile::at(number<1>{}); - - static constexpr index_t kMThreadPerWarp = kMPerWarp / kMPerThread; - static constexpr index_t kNThreadPerWarp = kNPerWarp / kNPerThread; - - static constexpr index_t kMPerBlock = BlockTile::at(number<0>{}); - static constexpr index_t kNPerBlock = BlockTile::at(number<1>{}); - - static constexpr index_t kMWarpPerBlock = kMPerBlock / kMPerWarp; - static constexpr index_t kNWarpPerBlock = kNPerBlock / kNPerWarp; - - // TODO - kNNumWarps can only be 1 if we don't support cross warp welford - static_assert(kNWarpPerBlock == 1); - - static constexpr index_t kBlockSize = warpSize * kMWarpPerBlock * kNWarpPerBlock; -}; - -} // namespace ck_tile diff --git a/include/ck_tile/ops/reduce/block/block_reduce.hpp b/include/ck_tile/ops/reduce/block/block_reduce.hpp index 682d60d87..63c364331 100644 --- a/include/ck_tile/ops/reduce/block/block_reduce.hpp +++ b/include/ck_tile/ops/reduce/block/block_reduce.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
#pragma once diff --git a/include/ck_tile/ops/welford.hpp b/include/ck_tile/ops/welford.hpp index dffaad750..ebf940683 100644 --- a/include/ck_tile/ops/welford.hpp +++ b/include/ck_tile/ops/welford.hpp @@ -3,6 +3,7 @@ #pragma once +#include "ck_tile/ops/welford/block/block_welford.hpp" +#include "ck_tile/ops/welford/block/block_welford_problem.hpp" #include "ck_tile/ops/welford/thread/thread_welford.hpp" -#include "ck_tile/ops/welford/warp/warp_welford.hpp" #include "ck_tile/ops/common/tensor_layout.hpp" diff --git a/include/ck_tile/ops/welford/block/block_welford.hpp b/include/ck_tile/ops/welford/block/block_welford.hpp new file mode 100644 index 000000000..55d55402d --- /dev/null +++ b/include/ck_tile/ops/welford/block/block_welford.hpp @@ -0,0 +1,362 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck_tile/core.hpp" +#include "ck_tile/ops/welford/thread/thread_welford.hpp" + +namespace ck_tile { + +template +struct BlockWelford +{ + using Problem = remove_cvref_t; + using XDataType = typename Problem::XDataType; + using ComputeDataType = typename Problem::ComputeDataType; + + CK_TILE_DEVICE constexpr BlockWelford() {} + + // [CAUSION] - max_count_ is to deal with the padding problem + // max_count_ is depend on caller, eg: naive and splitN welford will have different + // calculation of max_count_ + // -> use block_welford_calculate_max_count to compute + template + CK_TILE_DEVICE void operator()(const XDistributedTensor_& x_tensor, + MeanDistributedTensor_& mean_tensor, + VarDistributedTensor_& var_tensor, + int& cur_count_, // -> prefer init as zero + const int& max_count_) + { + constexpr auto I0 = number<0>{}; + constexpr auto I1 = number<1>{}; + + constexpr auto spans = XDistributedTensor_::get_distributed_spans(); + + sweep_tile_span(spans[I1], [&](auto dstr_idx_i1) { + if(cur_count_ < max_count_) + { + ++cur_count_; + + sweep_tile_span(spans[I0], [&](auto 
dstr_idx_i0) { + constexpr auto in_dstr_idx = make_tuple(dstr_idx_i0, dstr_idx_i1); + constexpr auto out_dstr_idx = make_tuple(dstr_idx_i0); + + auto x = ck_tile::type_convert(x_tensor[in_dstr_idx]); + + welford_update( + mean_tensor(out_dstr_idx), var_tensor(out_dstr_idx), x, cur_count_); + }); + } + }); + } + + template + CK_TILE_DEVICE static auto MakeMeanVarBlockTile() + { + static_assert(std::is_same_v, "wrong!"); + + constexpr auto reduce_dims = sequence<1>{}; + + constexpr auto dstr = + make_static_tile_distribution(detail::make_reduce_tile_distribution_encoding( + XDistributedTensor_::get_tile_distribution() + .get_static_tile_distribution_encoding(), + reduce_dims)); + + auto tensor = make_static_distributed_tensor(dstr); + + return tensor; + } + + template + CK_TILE_DEVICE auto + operator()(const XDistributedTensor_& x_tensor, int& cur_count_, const int& max_count_) + { + auto mean_tensor = MakeMeanVarBlockTile(); + auto var_tensor = MakeMeanVarBlockTile(); + clear_tile(mean_tensor); + clear_tile(var_tensor); + + (*this)(x_tensor, mean_tensor, var_tensor, cur_count_, max_count_); + + return ck_tile::make_tuple(mean_tensor, var_tensor); + } +}; + +template +struct BlockWelfordSync +{ + using Problem = remove_cvref_t; + + template + CK_TILE_DEVICE void + operator()(MeanDistributedTensor_& mean_tensor, VarDistributedTensor_& var_tensor, int& count) + { + using Dstr = typename MeanDistributedTensor_::StaticTileDistribution; + using DstrEncode = typename Dstr::DstrEncode; + using DstrEncodeDetail = typename DstrEncode::detail; + + static_assert(std::is_same_v, + "wrong!"); + + constexpr index_t NDimP = Dstr::get_num_of_dimension_p(); + constexpr index_t NDimR = Dstr::get_num_of_dimension_r(); + + constexpr index_t idim_p_lane = NDimP - 1; + + // const auto ps_idx = make_array(get_warp_id(), get_lane_id()); + // const auto rs_idx = + // mean_tensor.get_tile_distribution().calculate_rs_index_from_ps_index(ps_idx); + + constexpr index_t thread_buf_size = 
MeanDistributedTensor_::get_thread_buffer_size(); + static_assert(thread_buf_size == VarDistributedTensor_::get_thread_buffer_size()); + + const int original_count = count; + + // loop over thread data + static_for<0, thread_buf_size, 1>{}([&](auto i) { + auto v_local_mean = mean_tensor.get_thread_buffer()[i]; + auto v_local_var = var_tensor.get_thread_buffer()[i]; + auto v_local_count = original_count; + + // cross-lane reduce for replication + // only reduce on R dimension correspond to lane + // (lane id maps to this R dimension) + static_for<0, NDimR, 1>{}([&](auto idim_r) { + // FIXME: nasty to use does_p_own_r_ + if constexpr(DstrEncodeDetail::does_p_own_r_[idim_p_lane][idim_r]) + { + constexpr index_t r_length = DstrEncode::rs_lengths_[idim_r]; + + constexpr index_t lid_over_rid_derivative = + DstrEncodeDetail::ps_over_rs_derivative_[idim_p_lane][idim_r]; + + static_assert(is_power_of_two_integer(r_length), + "wrong! only support power of 2 reduction"); + + constexpr index_t nstage = integer_log2_floor(r_length); + + // reduction sweep forward + static_for<0, nstage, 1>{}([&](auto istage) { + // xor + index_t src_lane = + (__lane_id()) ^ + (number{}.value); + + // pull data from remote lane + const auto v_remote_mean = warp_shuffle(v_local_mean, src_lane); + const auto v_remote_var = warp_shuffle(v_local_var, src_lane); + const auto v_remote_count = warp_shuffle(v_local_count, src_lane); + + // welford merge + welford_merge(v_local_mean, + v_local_var, + v_local_count, + v_remote_mean, + v_remote_var, + v_remote_count); + }); + } + }); + + mean_tensor.get_thread_buffer()(i) = v_local_mean; + var_tensor.get_thread_buffer()(i) = v_local_var; + + count = v_local_count; + }); + } +}; + +template +struct BlockWelfordCrossWarpSync +{ + using Problem = remove_cvref_t; + using BlockShape = typename Problem::BlockShape; + + template + CK_TILE_DEVICE static constexpr index_t GetReduceWarps() + { + constexpr index_t num_reduce_warps = [&]() { + using Dstr = typename 
MeanDistributedTensor_::StaticTileDistribution; + using DstrEncode = typename Dstr::DstrEncode; + using DstrEncodeDetail = typename DstrEncode::detail; + + constexpr index_t NDimR = Dstr::get_num_of_dimension_r(); + + constexpr index_t idim_p_warp = 0; + + index_t len_ = 1; + static_for<0, NDimR, 1>{}([&](auto idim_r) { + if constexpr(DstrEncodeDetail::does_p_own_r_[idim_p_warp][idim_r]) + { + constexpr index_t r_length = DstrEncode::rs_lengths_[idim_r]; + len_ *= r_length; + } + }); + return len_; + }(); + return num_reduce_warps; + } + + // return in byte + template + CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize() + { + // constexpr auto num_reduce_warps = GetReduceWarps(); + + // data need to exchange is very small, we just pack mean+var+count -> 4dword + constexpr index_t thread_buf_size = MeanDistributedTensor_::get_thread_buffer_size(); + + // we need to store all data from every wave into smem + // e.g. 2x2 reduce along N + // -------------> reduce N + // | w0 | w1 | ___> | w01 | + // | w2 | w3 | | w23 | + // + // -> store data from every wave into LDS + // + // + // -------------> reduce N + // | w0 | w1 | w2 | w3 | -----> | w0123 | + // + // -> also store data from every wave into LDS + constexpr index_t num_warps = BlockShape::BlockSize / warpSize; + return num_warps * 4 * thread_buf_size * sizeof(float); + } + + template + CK_TILE_DEVICE void operator()(MeanDistributedTensor_& mean_tensor, + VarDistributedTensor_& var_tensor, + int& count, + void* smem) + { + using DataType = typename MeanDistributedTensor_::DataType; + using Dstr = typename MeanDistributedTensor_::StaticTileDistribution; + // using DstrEncode = typename Dstr::DstrEncode; + // using DstrEncodeDetail = typename DstrEncode::detail; + + static_assert(std::is_same_v, + "wrong!"); + + constexpr index_t thread_buf_size = MeanDistributedTensor_::get_thread_buffer_size(); + static_assert(thread_buf_size == VarDistributedTensor_::get_thread_buffer_size()); + + // Note: we always pack 
everything into fp32x4 + fp32x4_t* smem_ptr = reinterpret_cast(smem); + const index_t lane_id = get_lane_id(); + const index_t warp_id = get_warp_id(); + constexpr auto num_reduce_warps = GetReduceWarps(); + constexpr index_t num_warps = BlockShape::BlockSize / warpSize; + const index_t smem_offset = warp_id; + + // skip if nonthing to do + if constexpr(num_reduce_warps == 1) + return; + + // store into smem only for lane-0 within one warp + if(lane_id == 0) + { + static_for<0, thread_buf_size, 1>{}([&](auto i) { + fp32x4_t local_scratch_; + local_scratch_[0] = bit_cast(mean_tensor.get_thread_buffer()[i]); + local_scratch_[1] = bit_cast(var_tensor.get_thread_buffer()[i]); + local_scratch_[2] = bit_cast(count); + + smem_ptr[smem_offset + i * num_warps] = local_scratch_; + }); + } + block_sync_lds(); + + // load from smem. here we let everythread to do compute :) + index_t local_warp_id = warp_id / num_reduce_warps; + index_t local_smem_os = local_warp_id * num_reduce_warps; + fp32x4_t all_scratch[thread_buf_size * num_reduce_warps]; + static_for<0, thread_buf_size, 1>{}([&](auto i_0) { + static_for<0, num_reduce_warps, 1>{}([&](auto i_1) { + all_scratch[i_0 * num_warps + i_1] = + smem_ptr[i_0 * num_reduce_warps + local_smem_os + i_1]; + }); + }); + block_sync_lds(); // TODO: we don't need sync here + + // const int original_count = count; + + static_for<0, thread_buf_size, 1>{}([&](auto i_0) { + // TODO: use descriptor for this + auto v_local = all_scratch[i_0 * num_warps]; + auto v_local_mean = bit_cast(v_local[0]); + auto v_local_var = bit_cast(v_local[1]); + auto v_local_count = bit_cast(v_local[2]); + + // further reduce mean/var + static_for<0, num_reduce_warps - 1, 1>{}([&](auto i_1_n1) { + constexpr auto i_1 = number{}; + const fp32x4_t v_remote = all_scratch[i_0 * num_warps + i_1]; + const auto v_remote_mean = bit_cast(v_remote[0]); + const auto v_remote_var = bit_cast(v_remote[1]); + const auto v_remote_count = bit_cast(v_remote[2]); + + 
welford_merge(v_local_mean, + v_local_var, + v_local_count, + v_remote_mean, + v_remote_var, + v_remote_count); + }); + + mean_tensor.get_thread_buffer()(i_0) = v_local_mean; + var_tensor.get_thread_buffer()(i_0) = v_local_var; + + count = v_local_count; + }); + } +}; + +// compute the max count for a last dim reduce +// everything may have vector/repeat, so the max count could be uneven +// TODO: specify which dim to compute and proper set the problem +// TODO: BlockShape we reuse layernorm_fwd_shape :) +template +CK_TILE_DEVICE constexpr index_t block_tile_welford_calculate_max_count(int row_size) +{ +#if 0 + using S = BlockShape; + index_t LastloopN = row_size % S::Block_N == 0 ? S::Block_N : row_size % S::Block_N; + constexpr index_t NThread = S::WarpPerBlock_N * S::ThreadPerWarp_N; + index_t iNLane = get_thread_id() % NThread; + index_t iN0 = LastloopN / (S::Vector_N * S::ThreadPerWarp_N); + index_t iN1 = (LastloopN % (S::Vector_N * S::ThreadPerWarp_N)) / S::Vector_N; + index_t N2 = (LastloopN % (S::Vector_N * S::ThreadPerWarp_N)) % S::Vector_N; + index_t iN3 = iNLane < iN1 ? S::Vector_N : iNLane == iN1 ? N2 : 0; + return iN0 * S::Vector_N + iN3; +#endif + using S_ = BlockShape; + constexpr index_t ThreadsPerBlock_N = S_::WarpPerBlock_N * S_::ThreadPerWarp_N; + + // TODO: we always check vector size, need be evenly devidable by vector-n + const index_t element_per_row = row_size / S_::Vector_N; + index_t lane_id_n = get_thread_id() % ThreadsPerBlock_N; + + index_t cnt = 0; + // TODO: Repeat_N can not be too long, otherwise this is not good + static_for<0, S_::Repeat_N, 1>{}([&](auto) { + index_t _a = lane_id_n < element_per_row ? 
1 : 0; + cnt += _a; + lane_id_n += ThreadsPerBlock_N; + }); + return cnt * S_::Vector_N; +} + +// Note: this function must be called after all the computation +template +CK_TILE_DEVICE constexpr void block_tile_welford_post_scale_var(VarDistributedTensor_& var_tensor, + int count) +{ + using DataType = typename VarDistributedTensor_::DataType; + tile_elementwise_inout([&count](auto& x) { x = x / type_convert(count); }, + var_tensor); +} +} // namespace ck_tile diff --git a/include/ck_tile/ops/welford/block/block_welford_problem.hpp b/include/ck_tile/ops/welford/block/block_welford_problem.hpp new file mode 100644 index 000000000..dcae1ef2e --- /dev/null +++ b/include/ck_tile/ops/welford/block/block_welford_problem.hpp @@ -0,0 +1,18 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck_tile/core.hpp" + +namespace ck_tile { + +template +struct BlockWelfordProblem +{ + using XDataType = remove_cvref_t; + using ComputeDataType = remove_cvref_t; + using BlockShape = remove_cvref_t; +}; + +} // namespace ck_tile diff --git a/include/ck_tile/ops/welford/thread/thread_welford.hpp b/include/ck_tile/ops/welford/thread/thread_welford.hpp index 2ca9a2365..4c61cdcf4 100644 --- a/include/ck_tile/ops/welford/thread/thread_welford.hpp +++ b/include/ck_tile/ops/welford/thread/thread_welford.hpp @@ -7,95 +7,30 @@ namespace ck_tile { -template -struct ThreadWelford +template +CK_TILE_DEVICE void welford_update(T& mean, T& var, T x, int count) { - using XDataType = remove_cvref_t; - using ComputeDataType = remove_cvref_t; - - template - CK_TILE_DEVICE void Update(T& mean, T& var, T x) - { - if(ck_tile::isnan(x)) - { - mean = x; - var = x; - } - else - { - T delta = x - mean; - mean += delta / cur_count_; - T delta2 = x - mean; - var += delta * delta2; - } - } - - // [CAUSION] - max_count_ is to deal with the padding problem - // max_count_ is depend on caller, eg: naive and splitN welford will 
have different - // calculation of max_count_ - CK_TILE_DEVICE constexpr ThreadWelford(int max_count) : cur_count_(0), max_count_(max_count) {} - - template - CK_TILE_DEVICE void operator()(const XDistributedTensor_& x_tensor, - MeanDistributedTensor_& mean_tensor, - VarDistributedTensor_& var_tensor) - { - constexpr auto I0 = number<0>{}; - constexpr auto I1 = number<1>{}; - - constexpr auto spans = XDistributedTensor_::get_distributed_spans(); - - sweep_tile_span(spans[I1], [&](auto dstr_idx_i1) { - if(cur_count_ < max_count_) - { - ++cur_count_; - - sweep_tile_span(spans[I0], [&](auto dstr_idx_i0) { - constexpr auto in_dstr_idx = make_tuple(dstr_idx_i0, dstr_idx_i1); - constexpr auto out_dstr_idx = make_tuple(dstr_idx_i0); - - auto x = ck_tile::type_convert(x_tensor[in_dstr_idx]); - - Update(mean_tensor(out_dstr_idx), var_tensor(out_dstr_idx), x); - }); - } - }); - } - - template - CK_TILE_DEVICE static auto MakeInitialMeanVarDistributedTensor() - { - static_assert(std::is_same_v, "wrong!"); - - constexpr auto reduce_dims = sequence<1>{}; - - constexpr auto dstr = - make_static_tile_distribution(detail::make_reduce_tile_distribution_encoding( - XDistributedTensor_::get_tile_distribution() - .get_static_tile_distribution_encoding(), - reduce_dims)); - - auto tensor = make_static_distributed_tensor(dstr); - clear_tile(tensor); - - return tensor; - } - - template - CK_TILE_DEVICE auto operator()(const XDistributedTensor_& x_tensor) - { - auto mean_tensor = MakeInitialMeanVarDistributedTensor(); - auto var_tensor = MakeInitialMeanVarDistributedTensor(); - - (*this)(x_tensor, mean_tensor, var_tensor); - - return ck_tile::make_tuple(mean_tensor, var_tensor); - } - - int cur_count_; - int max_count_; -}; + // TODO: check nan? 
maybe no + T delta = x - mean; + mean += delta / count; + T delta2 = x - mean; + var += delta * delta2; +} + +template +CK_TILE_DEVICE static void +welford_merge(T& mean_a, T& var_a, int& count_a, T mean_b, T var_b, int count_b) +{ + int count = count_a + count_b; + T count_ = type_convert(count); + T count_a_ = type_convert(count_a); + T count_b_ = type_convert(count_b); + T count_b_over_count = count == 0 ? type_convert(0) : count_b_ / count_; + + T delta = mean_b - mean_a; + mean_a += delta * count_b_over_count; + var_a += var_b + delta * delta * count_a_ * count_b_over_count; + count_a = count; +} } // namespace ck_tile diff --git a/include/ck_tile/ops/welford/warp/warp_welford.hpp b/include/ck_tile/ops/welford/warp/warp_welford.hpp deleted file mode 100644 index 687b61f43..000000000 --- a/include/ck_tile/ops/welford/warp/warp_welford.hpp +++ /dev/null @@ -1,154 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. - -#pragma once - -#include "ck_tile/core.hpp" - -namespace ck_tile { - -template -struct WarpMergeWelford -{ - using ComputeDataType = remove_cvref_t; - - template - CK_TILE_DEVICE static void - Merge(T& mean_a, T& var_a, int& count_a, T mean_b, T var_b, int count_b) - { - int count = count_a + count_b; - T count_ = type_convert(count); - T count_a_ = type_convert(count_a); - T count_b_ = type_convert(count_b); - T count_b_over_count = count == 0 ? 
type_convert(0) : count_b_ / count_; - - T delta = mean_b - mean_a; - mean_a += delta * count_b_over_count; - var_a += var_b + delta * delta * count_a_ * count_b_over_count; - count_a = count; - } - - template - CK_TILE_DEVICE void - operator()(MeanDistributedTensor_& mean_tensor, VarDistributedTensor_& var_tensor, int& count) - { - using Dstr = typename MeanDistributedTensor_::StaticTileDistribution; - using DstrEncode = typename Dstr::DstrEncode; - using DstrEncodeDetail = typename DstrEncode::detail; - - static_assert(std::is_same_v, - "wrong!"); - - constexpr index_t NDimP = Dstr::get_num_of_dimension_p(); - constexpr index_t NDimR = Dstr::get_num_of_dimension_r(); - - constexpr index_t idim_p_lane = NDimP - 1; - - const auto ps_idx = make_array(get_warp_id(), get_lane_id()); - const auto rs_idx = - mean_tensor.get_tile_distribution().calculate_rs_index_from_ps_index(ps_idx); - - constexpr index_t thread_buf_size = MeanDistributedTensor_::get_thread_buffer_size(); - static_assert(thread_buf_size == VarDistributedTensor_::get_thread_buffer_size()); - - const int original_count = count; - - // loop over thread data - static_for<0, thread_buf_size, 1>{}([&](auto i) { - auto v_local_mean = mean_tensor.get_thread_buffer()[i]; - auto v_local_var = var_tensor.get_thread_buffer()[i]; - auto v_local_count = original_count; - - // cross-lane reduce for replication - // only reduce on R dimension correspond to lane - // (lane id maps to this R dimension) - static_for<0, NDimR, 1>{}([&](auto idim_r) { - // FIXME: nasty to use does_p_own_r_ - if constexpr(DstrEncodeDetail::does_p_own_r_[idim_p_lane][idim_r]) - { - constexpr index_t r_length = DstrEncode::rs_lengths_[idim_r]; - - constexpr index_t lid_over_rid_derivative = - DstrEncodeDetail::ps_over_rs_derivative_[idim_p_lane][idim_r]; - - static_assert(is_power_of_two_integer(r_length), - "wrong! 
only support power of 2 reduction"); - - constexpr index_t nstage = integer_log2_floor(r_length); - - // reduction sweep forward - static_for<0, nstage, 1>{}([&](auto istage) { - constexpr index_t lid_delta = - lid_over_rid_derivative * (1 << (nstage - istage - 1)); - - // pull data from remote lane - const auto v_remote_mean = warp_shuffle_down(v_local_mean, lid_delta); - const auto v_remote_var = warp_shuffle_down(v_local_var, lid_delta); - const auto v_remote_count = warp_shuffle_down(v_local_count, lid_delta); - - // welford merge - Merge(v_local_mean, - v_local_var, - v_local_count, - v_remote_mean, - v_remote_var, - v_remote_count); - }); - } - }); - - // cross-lane broadcast for replication - // only broadcast on R dimension correspond to lane - // (lane id maps to this R dimension) - if constexpr(BroadcastLane) - { - static_for<0, NDimR, 1>{}([&](auto idim_r) { - // FIXME: nasty to use does_p_own_r_ - if constexpr(DstrEncodeDetail::does_p_own_r_[idim_p_lane][idim_r]) - { - const index_t r_id = rs_idx[idim_r]; - - constexpr index_t r_length = DstrEncode::rs_lengths_[idim_r]; - - constexpr index_t lid_over_rid_derivative = - DstrEncodeDetail::ps_over_rs_derivative_[NDimP - 1][idim_r]; - - static_assert(is_power_of_two_integer(r_length), - "wrong! only support power of 2 reduction"); - - constexpr index_t nstage = integer_log2_floor(r_length); - - // broadcast sweep backward - static_for<0, nstage, 1>{}([&](auto istage) { - // do I hold reduced data? - const bool do_i_hold_reduced_data = r_id < (1 << istage); - - constexpr index_t lid_delta = lid_over_rid_derivative * (1 << istage); - - // pull data from remote lane - const auto v_remote_mean = warp_shuffle_up(v_local_mean, lid_delta); - const auto v_remote_var = warp_shuffle_up(v_local_var, lid_delta); - const auto v_remote_count = warp_shuffle_up(v_local_count, lid_delta); - - // decide whether to update local data with remote data - v_local_mean = do_i_hold_reduced_data ? 
v_local_mean : v_remote_mean; - v_local_var = do_i_hold_reduced_data ? v_local_var : v_remote_var; - v_local_count = do_i_hold_reduced_data ? v_local_count : v_remote_count; - }); - } - }); - } - - mean_tensor.get_thread_buffer()(i) = v_local_mean; - - if constexpr(GetActualVariance) - var_tensor.get_thread_buffer()(i) = v_local_var / v_local_count; - else - var_tensor.get_thread_buffer()(i) = v_local_var; - - count = v_local_count; - }); - } -}; - -} // namespace ck_tile -- GitLab From 82fc53835aabb044d2ef15f485d0a2c8d52b4702 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bart=C5=82omiej=20Kocot?= Date: Tue, 22 Oct 2024 16:18:28 +0200 Subject: [PATCH 009/153] Enable grouped conv bwd wei bf16 NGCHW (#1589) * Enable grouped conv bwd wei bf16 NGCHW * fixes * fixes * Fixes * fixes * fixes * Fixes --- ...conv_bwd_weight_two_stage_xdl_instance.hpp | 71 +++++++++++++- ...e_grouped_conv_bwd_weight_xdl_instance.hpp | 4 +- .../grouped_convolution_backward_weight.hpp | 48 ++++++++++ ...rouped_convolution_backward_weight_xdl.inc | 94 +++++++++++++++++++ .../grouped_conv1d_bwd_weight/CMakeLists.txt | 6 +- ...gnwc_gkxc_gnwk_bf16_f32_bf16_instance.cpp} | 2 +- ...nwgc_gkxc_nwgk_bf16_f32_bf16_instance.cpp} | 2 +- ...gnwc_gkxc_gnwk_bf16_f32_bf16_instance.cpp} | 28 +++--- .../grouped_conv2d_bwd_weight/CMakeLists.txt | 12 ++- ...wc_gkyxc_gnhwk_bf16_f32_bf16_instance.cpp} | 2 +- ...gc_gkyxc_nhwgk_bf16_f32_bf16_instance.cpp} | 2 +- ...ngchw_gkyxc_ngkhw_bf16_pipev2_instance.cpp | 41 ++++++++ ...ngchw_gkyxc_ngkhw_bf16_pipev5_instance.cpp | 41 ++++++++ ...nhwgc_gkyxc_nhwgk_bf16_pipev2_instance.cpp | 41 ++++++++ ...nhwgc_gkyxc_nhwgk_bf16_pipev5_instance.cpp | 41 ++++++++ ...wc_gkyxc_gnhwk_bf16_f32_bf16_instance.cpp} | 28 +++--- ...gc_gkyxc_nhwgk_bf16_f32_bf16_instance.cpp} | 28 +++--- .../grouped_conv3d_bwd_weight/CMakeLists.txt | 12 ++- ..._gkzyxc_gndhwk_bf16_f32_bf16_instance.cpp} | 2 +- ..._gkzyxc_ndhwgk_bf16_f32_bf16_instance.cpp} | 2 +- ...wgc_gkzyxc_ndhwgk_bf16_pipev2_instance.cpp | 41 
++++++++ ...wgc_gkzyxc_ndhwgk_bf16_pipev5_instance.cpp | 41 ++++++++ ...dhw_gkzyxc_ngkdhw_bf16_pipev2_instance.cpp | 41 ++++++++ ...dhw_gkzyxc_ngkdhw_bf16_pipev5_instance.cpp | 41 ++++++++ ..._gkzyxc_gndhwk_bf16_f32_bf16_instance.cpp} | 28 +++--- ..._gkzyxc_ndhwgk_bf16_f32_bf16_instance.cpp} | 28 +++--- .../src/profile_grouped_conv_bwd_weight.cpp | 26 ++++- script/convert_miopen_driver_to_profiler.py | 5 +- 28 files changed, 667 insertions(+), 91 deletions(-) rename library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/dl/{device_grouped_conv1d_bwd_weight_dl_gnwc_gkxc_gnwk_bf16_instance.cpp => device_grouped_conv1d_bwd_weight_dl_gnwc_gkxc_gnwk_bf16_f32_bf16_instance.cpp} (96%) rename library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/dl/{device_grouped_conv1d_bwd_weight_dl_nwgc_gkxc_nwgk_bf16_instance.cpp => device_grouped_conv1d_bwd_weight_dl_nwgc_gkxc_nwgk_bf16_f32_bf16_instance.cpp} (96%) rename library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/xdl/{device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_bf16_instance.cpp => device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_bf16_f32_bf16_instance.cpp} (60%) rename library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/dl/{device_grouped_conv2d_bwd_weight_dl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp => device_grouped_conv2d_bwd_weight_dl_gnhwc_gkyxc_gnhwk_bf16_f32_bf16_instance.cpp} (97%) rename library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/dl/{device_grouped_conv2d_bwd_weight_dl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp => device_grouped_conv2d_bwd_weight_dl_nhwgc_gkyxc_nhwgk_bf16_f32_bf16_instance.cpp} (97%) create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_bf16_pipev2_instance.cpp create mode 100644 
library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_bf16_pipev5_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev2_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev5_instance.cpp rename library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/{device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp => device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_bf16_f32_bf16_instance.cpp} (62%) rename library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/{device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp => device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_f32_bf16_instance.cpp} (61%) rename library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/dl/{device_grouped_conv3d_bwd_weight_dl_gndhwc_gkzyxc_gndhwk_bf16_instance.cpp => device_grouped_conv3d_bwd_weight_dl_gndhwc_gkzyxc_gndhwk_bf16_f32_bf16_instance.cpp} (96%) rename library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/dl/{device_grouped_conv3d_bwd_weight_dl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp => device_grouped_conv3d_bwd_weight_dl_ndhwgc_gkzyxc_ndhwgk_bf16_f32_bf16_instance.cpp} (97%) create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev2_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev5_instance.cpp create mode 100644 
library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_pipev2_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_pipev5_instance.cpp rename library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/{device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_bf16_instance.cpp => device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_bf16_f32_bf16_instance.cpp} (60%) rename library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/{device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp => device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_f32_bf16_instance.cpp} (61%) diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp index 2ce334d9d..5f6c340e4 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp @@ -15,8 +15,9 @@ namespace instance { using namespace ck::tensor_layout::convolution; -using F16 = ck::half_t; -using F32 = float; +using BF16 = ck::bhalf_t; +using F16 = ck::half_t; +using F32 = float; using Empty_Tuple = ck::Tuple<>; @@ -45,17 +46,42 @@ using device_grouped_conv_bwd_weight_two_stage_xdl_c_shuffle_f16_instances = std //#########################################| Spatial| | | | | | | | Operation| Operation| Operation| Specialization| | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| 
PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| MBlock_MPerBlock| NWaveNPerXdl| Scheduler| Version| | //#########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | NBlock_NPerBlock| | | | | DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 64, 16, 16, 32, 8, 16, 16, 1, 1, S<4, 8, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 1, 4, false, S<4, 8, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 1, 4, false, 1, 1, S<1, 8, 1, 8>, 1, Scheduler, PipelineVersion, 1>, + DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 64, 32, 32, 32, 8, 32, 32, 1, 1, S<4, 8, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 2, 2, false, S<4, 16, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 2, 2, false, 1, 1, S<1, 8, 1, 8>, 1, Scheduler, PipelineVersion, 2>, DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 64, 32, 64, 32, 8, 32, 32, 1, 2, S<4, 8, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 4, 4, false, S<4, 16, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 4, 4, false, 1, 1, S<1, 8, 1, 8>, 1, Scheduler, PipelineVersion, 4>, DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 64, 32, 128, 32, 8, 32, 32, 1, 4, S<4, 4, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 8, 8, false, S<4, 16, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 8, 8, false, 1, 1, S<1, 8, 1, 8>, 1, Scheduler, PipelineVersion, 8>, - DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 64, 16, 16, 32, 8, 16, 16, 1, 1, S<4, 8, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 1, 4, false, S<4, 8, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 1, 
4, false, 1, 1, S<1, 8, 1, 8>, 1, Scheduler, PipelineVersion, 1>, DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 64, 32, 32, 32, 8, 32, 32, 1, 1, S<4, 16, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 2, 2, false, S<4, 8, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 2, 2, false, 1, 1, S<1, 8, 1, 8>, 1, Scheduler, PipelineVersion, 2>, DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 64, 64, 32, 32, 8, 32, 32, 2, 1, S<4, 16, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 4, 4, false, S<4, 8, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 4, 4, false, 1, 1, S<1, 8, 1, 8>, 1, Scheduler, PipelineVersion, 4>, DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 64, 128, 32, 32, 8, 32, 32, 4, 1, S<4, 16, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 8, 8, false, S<4, 4, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 8, 8, false, 1, 1, S<1, 8, 1, 8>, 1, Scheduler, PipelineVersion, 8> // clang-format on >; +template +using device_grouped_conv_bwd_weight_two_stage_xdl_c_shuffle_bf16_instances = std::tuple< + // clang-format off + //#########################################| Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| AccData| In| Wei| Out| ConvBackward| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransfer| CBlockTransfer| BlockGemm| BlockGemm| NumGroups| + //#########################################| Dim| | | | Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Weight| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| 
SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| ClusterLengths| ScalarPerVector| Pipeline| Pipeline| ToMerge| + //#########################################| Spatial| | | | | | | | Operation| Operation| Operation| Specialization| | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| MBlock_MPerBlock| NWaveNPerXdl| Scheduler| Version| | + //#########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | NBlock_NPerBlock| | | | | + DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 64, 16, 16, 32, 8, 16, 16, 1, 1, S<4, 8, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 1, 4, false, S<4, 8, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 1, 4, false, 1, 1, S<1, 8, 1, 8>, 1, Scheduler, PipelineVersion, 1>, + + DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 64, 32, 32, 32, 8, 32, 32, 1, 1, S<4, 8, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 2, 2, false, S<4, 16, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 2, 2, false, 1, 1, S<1, 8, 1, 8>, 1, Scheduler, PipelineVersion, 2>, + DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 64, 32, 64, 32, 8, 32, 32, 1, 2, S<4, 8, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 4, 4, false, S<4, 16, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 4, 4, false, 1, 1, S<1, 8, 1, 8>, 1, Scheduler, PipelineVersion, 4>, + DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 64, 32, 128, 32, 8, 32, 32, 1, 4, S<4, 4, 
1>, S<2, 0, 1>, S<1, 0, 2>, 1, 8, 8, false, S<4, 16, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 8, 8, false, 1, 1, S<1, 8, 1, 8>, 1, Scheduler, PipelineVersion, 8>, + + DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 64, 32, 32, 32, 8, 32, 32, 1, 1, S<4, 16, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 2, 2, false, S<4, 8, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 2, 2, false, 1, 1, S<1, 8, 1, 8>, 1, Scheduler, PipelineVersion, 2>, + DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 64, 64, 32, 32, 8, 32, 32, 2, 1, S<4, 16, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 4, 4, false, S<4, 8, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 4, 4, false, 1, 1, S<1, 8, 1, 8>, 1, Scheduler, PipelineVersion, 4>, + DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 64, 128, 32, 32, 8, 32, 32, 4, 1, S<4, 16, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 8, 8, false, S<4, 4, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 8, 8, false, 1, 1, S<1, 8, 1, 8>, 1, Scheduler, PipelineVersion, 8> + // clang-format on + >; + // NGCHW requires transpose, we use vector loads and stores params for them template ; +template +using device_grouped_conv_bwd_weight_two_stage_ngchw_xdl_c_shuffle_bf16_instances = std::tuple< + // clang-format off + //#########################################| Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| AccData| In| Wei| Out| ConvBackward| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransfer| CBlockTransfer| BlockGemm| BlockGemm| NumGroups| + 
//#########################################| Dim| | | | Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Weight| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| ClusterLengths| ScalarPerVector| Pipeline| Pipeline| ToMerge| + //#########################################| Spatial| | | | | | | | Operation| Operation| Operation| Specialization| | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| MBlock_MPerBlock| NWaveNPerXdl| Scheduler| Version| | + //#########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | NBlock_NPerBlock| | | | | + DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 64, 16, 16, 32, 8, 16, 16, 1, 1, S<4, 8, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 1, 4, false, S<4, 8, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 1, 4, false, 1, 1, S<1, 8, 1, 8>, 1, Scheduler, PipelineVersion, 1, BF16, BF16, 1, 1>, + + DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 64, 32, 32, 32, 8, 32, 32, 1, 1, S<4, 8, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 2, 2, false, S<4, 16, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 2, 2, false, 1, 1, S<1, 8, 1, 8>, 1, Scheduler, PipelineVersion, 2, BF16, BF16, 2, 2>, + DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 64, 32, 64, 32, 8, 32, 32, 1, 2, S<4, 8, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 4, 4, false, S<4, 16, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 4, 4, false, 1, 
1, S<1, 8, 1, 8>, 1, Scheduler, PipelineVersion, 4, BF16, BF16, 4, 4>, + DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 64, 32, 128, 32, 8, 32, 32, 1, 4, S<4, 4, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 8, 8, false, S<4, 16, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 8, 8, false, 1, 1, S<1, 4, 1, 8>, 1, Scheduler, PipelineVersion, 8, BF16, BF16, 8, 8>, + + DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 64, 32, 32, 32, 8, 32, 32, 1, 1, S<4, 16, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 2, 2, false, S<4, 8, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 2, 2, false, 1, 1, S<1, 8, 1, 8>, 1, Scheduler, PipelineVersion, 2, BF16, BF16, 2, 2>, + DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 64, 64, 32, 32, 8, 32, 32, 2, 1, S<4, 16, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 4, 4, false, S<4, 8, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 4, 4, false, 1, 1, S<1, 8, 1, 8>, 1, Scheduler, PipelineVersion, 4, BF16, BF16, 4, 4>, + DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 64, 128, 32, 32, 8, 32, 32, 4, 1, S<4, 16, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 8, 8, false, S<4, 4, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 8, 8, false, 1, 1, S<1, 8, 1, 4>, 1, Scheduler, PipelineVersion, 8, BF16, BF16, 8, 8>, + + DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 64, 32, 32, 32, 8, 32, 32, 1, 1, S<4, 8, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 2, 2, false, S<4, 16, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 2, 2, false, 1, 1, S<1, 8, 1, 8>, 1, Scheduler, PipelineVersion, 2, BF16, BF16, 1, 2>, + 
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 64, 32, 64, 32, 8, 32, 32, 1, 2, S<4, 8, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 4, 4, false, S<4, 16, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 4, 4, false, 1, 1, S<1, 8, 1, 8>, 1, Scheduler, PipelineVersion, 4, BF16, BF16, 1, 4>, + DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 64, 32, 128, 32, 8, 32, 32, 1, 4, S<4, 4, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 8, 8, false, S<4, 16, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 8, 8, false, 1, 1, S<1, 4, 1, 8>, 1, Scheduler, PipelineVersion, 8, BF16, BF16, 1, 8>, + + DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 64, 64, 32, 32, 8, 32, 32, 2, 1, S<4, 16, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 4, 4, false, S<4, 8, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 4, 4, false, 1, 1, S<1, 8, 1, 8>, 1, Scheduler, PipelineVersion, 4, BF16, BF16, 1, 4>, + DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 64, 128, 32, 32, 8, 32, 32, 4, 1, S<4, 16, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 8, 8, false, S<4, 4, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 8, 8, false, 1, 1, S<1, 8, 1, 4>, 1, Scheduler, PipelineVersion, 8, BF16, BF16, 1, 8>, + + DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 64, 32, 32, 32, 8, 32, 32, 1, 1, S<4, 8, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 2, 2, false, S<4, 16, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 2, 2, false, 1, 1, S<1, 8, 1, 8>, 1, Scheduler, PipelineVersion, 2, BF16, BF16, 2, 1>, + DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, F32, 
PassThrough, PassThrough, PassThrough, ConvSpec, 64, 32, 64, 32, 8, 32, 32, 1, 2, S<4, 8, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 4, 4, false, S<4, 16, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 4, 4, false, 1, 1, S<1, 8, 1, 8>, 1, Scheduler, PipelineVersion, 4, BF16, BF16, 4, 1>, + DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 64, 32, 128, 32, 8, 32, 32, 1, 4, S<4, 4, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 8, 8, false, S<4, 16, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 8, 8, false, 1, 1, S<1, 4, 1, 8>, 1, Scheduler, PipelineVersion, 8, BF16, BF16, 8 ,1>, + + DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 64, 64, 32, 32, 8, 32, 32, 2, 1, S<4, 16, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 4, 4, false, S<4, 8, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 4, 4, false, 1, 1, S<1, 8, 1, 8>, 1, Scheduler, PipelineVersion, 4, BF16, BF16, 4, 1>, + DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 64, 128, 32, 32, 8, 32, 32, 4, 1, S<4, 16, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 8, 8, false, S<4, 4, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 8, 8, false, 1, 1, S<1, 8, 1, 4>, 1, Scheduler, PipelineVersion, 8, BF16, BF16, 8, 1> + // clang-format on + >; + } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp index 096e0b177..32f52770b 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp +++ 
b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #include "ck/ck.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" @@ -113,7 +113,7 @@ template -using device_grouped_conv_bwd_weight_xdl_c_shuffle_bf16_instances = std::tuple< +using device_grouped_conv_bwd_weight_xdl_c_shuffle_bf16_f32_bf16_instances = std::tuple< // clang-format off //#########################################| Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| AccData| In| Wei| Out| ConvBackward| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransfer| CBlockTransfer| //#########################################| Dim| | | | Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Weight| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| ClusterLengths| ScalarPerVector| diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight.hpp index 0f11d337f..797233be0 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight.hpp @@ -367,6 +367,17 @@ struct 
DeviceOperationInstanceFactory && + is_same_v && + is_same_v && + is_same_v && + is_same_v) + { + add_device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev2_instances( + op_ptrs); + add_device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev5_instances( + op_ptrs); + } #endif } if constexpr(is_same_v && is_same_v && @@ -382,6 +393,19 @@ struct DeviceOperationInstanceFactory && + is_same_v && + is_same_v && + is_same_v && + is_same_v) + { + add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_bf16_pipev2_instances( + op_ptrs); + add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_bf16_pipev5_instances( + op_ptrs); + } #endif } } @@ -453,6 +477,17 @@ struct DeviceOperationInstanceFactory && + is_same_v && + is_same_v && + is_same_v && + is_same_v) + { + add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev2_instances( + op_ptrs); + add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev5_instances( + op_ptrs); + } #endif #if defined CK_ENABLE_FP16 && defined CK_ENABLE_FP8 && defined CK_ENABLE_BF8 if constexpr(is_same_v && is_same_v && @@ -477,6 +512,19 @@ struct DeviceOperationInstanceFactory && + is_same_v && + is_same_v && + is_same_v && + is_same_v) + { + add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_pipev2_instances( + op_ptrs); + add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_pipev5_instances( + op_ptrs); + } #endif } } diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight_xdl.inc b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight_xdl.inc index f240fa323..5f6f2fc6f 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight_xdl.inc +++ 
b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight_xdl.inc @@ -100,6 +100,53 @@ void add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_f32_bf16_in PassThrough, PassThrough, PassThrough>>>& instances); + +void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev2_instances( + std::vector>>& instances); + +void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev5_instances( + std::vector>>& instances); +void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_bf16_pipev2_instances( + std::vector>>& instances); + +void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_bf16_pipev5_instances( + std::vector>>& instances); #endif #ifdef CK_ENABLE_FP16 void add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_instances( @@ -226,6 +273,53 @@ void add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_f32_bf16 PassThrough, PassThrough, PassThrough>>>& instances); + +void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev2_instances( + std::vector>>& instances); + +void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev5_instances( + std::vector>>& instances); +void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_pipev2_instances( + std::vector>>& instances); + +void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_pipev5_instances( + std::vector>>& instances); #endif #ifdef CK_ENABLE_FP16 void add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instances( diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/CMakeLists.txt index ab4313d89..b057e0c8d 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/CMakeLists.txt +++ 
b/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/CMakeLists.txt @@ -2,16 +2,16 @@ set(GROUPED_CONV1D_BWD_WEIGHT xdl/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f16_instance.cpp xdl/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f32_instance.cpp - xdl/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_bf16_instance.cpp) + xdl/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_bf16_f32_bf16_instance.cpp) if(DL_KERNELS) list(APPEND GROUPED_CONV1D_BWD_WEIGHT dl/device_grouped_conv1d_bwd_weight_dl_gnwc_gkxc_gnwk_f16_instance.cpp dl/device_grouped_conv1d_bwd_weight_dl_gnwc_gkxc_gnwk_f32_instance.cpp - dl/device_grouped_conv1d_bwd_weight_dl_gnwc_gkxc_gnwk_bf16_instance.cpp + dl/device_grouped_conv1d_bwd_weight_dl_gnwc_gkxc_gnwk_bf16_f32_bf16_instance.cpp dl/device_grouped_conv1d_bwd_weight_dl_nwgc_gkxc_nwgk_f16_instance.cpp dl/device_grouped_conv1d_bwd_weight_dl_nwgc_gkxc_nwgk_f32_instance.cpp - dl/device_grouped_conv1d_bwd_weight_dl_nwgc_gkxc_nwgk_bf16_instance.cpp) + dl/device_grouped_conv1d_bwd_weight_dl_nwgc_gkxc_nwgk_bf16_f32_bf16_instance.cpp) endif() add_instance_library(device_grouped_conv1d_bwd_weight_instance ${GROUPED_CONV1D_BWD_WEIGHT}) diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/dl/device_grouped_conv1d_bwd_weight_dl_gnwc_gkxc_gnwk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/dl/device_grouped_conv1d_bwd_weight_dl_gnwc_gkxc_gnwk_bf16_f32_bf16_instance.cpp similarity index 96% rename from library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/dl/device_grouped_conv1d_bwd_weight_dl_gnwc_gkxc_gnwk_bf16_instance.cpp rename to library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/dl/device_grouped_conv1d_bwd_weight_dl_gnwc_gkxc_gnwk_bf16_f32_bf16_instance.cpp index d7a15784a..59981b642 100644 --- 
a/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/dl/device_grouped_conv1d_bwd_weight_dl_gnwc_gkxc_gnwk_bf16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/dl/device_grouped_conv1d_bwd_weight_dl_gnwc_gkxc_gnwk_bf16_f32_bf16_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_dl_instance.hpp" diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/dl/device_grouped_conv1d_bwd_weight_dl_nwgc_gkxc_nwgk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/dl/device_grouped_conv1d_bwd_weight_dl_nwgc_gkxc_nwgk_bf16_f32_bf16_instance.cpp similarity index 96% rename from library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/dl/device_grouped_conv1d_bwd_weight_dl_nwgc_gkxc_nwgk_bf16_instance.cpp rename to library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/dl/device_grouped_conv1d_bwd_weight_dl_nwgc_gkxc_nwgk_bf16_f32_bf16_instance.cpp index a92cb4285..a2ac640d3 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/dl/device_grouped_conv1d_bwd_weight_dl_nwgc_gkxc_nwgk_bf16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/dl/device_grouped_conv1d_bwd_weight_dl_nwgc_gkxc_nwgk_bf16_f32_bf16_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_dl_instance.hpp" diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/xdl/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/xdl/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_bf16_f32_bf16_instance.cpp similarity index 60% rename from library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/xdl/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_bf16_instance.cpp rename to library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/xdl/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_bf16_f32_bf16_instance.cpp index f9368ab57..9c97d80c8 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/xdl/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_bf16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/xdl/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_bf16_f32_bf16_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp" @@ -24,19 +24,21 @@ void add_device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_bf16_f32_bf16_insta // 1. 
Default add_device_operation_instances( instances, - device_grouped_conv_bwd_weight_xdl_c_shuffle_bf16_instances<1, - GNWC, - GKXC, - GNWK, - ConvBwdWeightDefault>{}); + device_grouped_conv_bwd_weight_xdl_c_shuffle_bf16_f32_bf16_instances< + 1, + GNWC, + GKXC, + GNWK, + ConvBwdWeightDefault>{}); // 2. Filter1x1Stride1Pad0 - add_device_operation_instances(instances, - device_grouped_conv_bwd_weight_xdl_c_shuffle_bf16_instances< - 1, - GNWC, - GKXC, - GNWK, - ConvBwdWeightFilter1x1Stride1Pad0>{}); + add_device_operation_instances( + instances, + device_grouped_conv_bwd_weight_xdl_c_shuffle_bf16_f32_bf16_instances< + 1, + GNWC, + GKXC, + GNWK, + ConvBwdWeightFilter1x1Stride1Pad0>{}); } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/CMakeLists.txt index 8d67b46fb..ef99d69ae 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/CMakeLists.txt @@ -2,24 +2,28 @@ set(GROUPED_CONV2D_BWD_WEIGHT xdl/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp xdl/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f32_instance.cpp - xdl/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp + xdl/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_bf16_f32_bf16_instance.cpp xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp - xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp + xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_f32_bf16_instance.cpp xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev2_instance.cpp xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev5_instance.cpp 
xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_f16_pipev2_instance.cpp xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_f16_pipev5_instance.cpp + xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev2_instance.cpp + xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev5_instance.cpp + xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_bf16_pipev2_instance.cpp + xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_bf16_pipev5_instance.cpp ) if(DL_KERNELS) list(APPEND GROUPED_CONV2D_BWD_WEIGHT dl/device_grouped_conv2d_bwd_weight_dl_gnhwc_gkyxc_gnhwk_f16_instance.cpp dl/device_grouped_conv2d_bwd_weight_dl_gnhwc_gkyxc_gnhwk_f32_instance.cpp - dl/device_grouped_conv2d_bwd_weight_dl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp + dl/device_grouped_conv2d_bwd_weight_dl_gnhwc_gkyxc_gnhwk_bf16_f32_bf16_instance.cpp dl/device_grouped_conv2d_bwd_weight_dl_nhwgc_gkyxc_nhwgk_f16_instance.cpp dl/device_grouped_conv2d_bwd_weight_dl_nhwgc_gkyxc_nhwgk_f32_instance.cpp - dl/device_grouped_conv2d_bwd_weight_dl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp) + dl/device_grouped_conv2d_bwd_weight_dl_nhwgc_gkyxc_nhwgk_bf16_f32_bf16_instance.cpp) endif() add_instance_library(device_grouped_conv2d_bwd_weight_instance ${GROUPED_CONV2D_BWD_WEIGHT}) diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/dl/device_grouped_conv2d_bwd_weight_dl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/dl/device_grouped_conv2d_bwd_weight_dl_gnhwc_gkyxc_gnhwk_bf16_f32_bf16_instance.cpp similarity index 97% rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/dl/device_grouped_conv2d_bwd_weight_dl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp rename to library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/dl/device_grouped_conv2d_bwd_weight_dl_gnhwc_gkyxc_gnhwk_bf16_f32_bf16_instance.cpp 
index 37b465e6c..63d20524f 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/dl/device_grouped_conv2d_bwd_weight_dl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/dl/device_grouped_conv2d_bwd_weight_dl_gnhwc_gkyxc_gnhwk_bf16_f32_bf16_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_dl_instance.hpp" diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/dl/device_grouped_conv2d_bwd_weight_dl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/dl/device_grouped_conv2d_bwd_weight_dl_nhwgc_gkyxc_nhwgk_bf16_f32_bf16_instance.cpp similarity index 97% rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/dl/device_grouped_conv2d_bwd_weight_dl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp rename to library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/dl/device_grouped_conv2d_bwd_weight_dl_nhwgc_gkyxc_nhwgk_bf16_f32_bf16_instance.cpp index cf3db8331..a615edfac 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/dl/device_grouped_conv2d_bwd_weight_dl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/dl/device_grouped_conv2d_bwd_weight_dl_nhwgc_gkyxc_nhwgk_bf16_f32_bf16_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_dl_instance.hpp" diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_bf16_pipev2_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_bf16_pipev2_instance.cpp new file mode 100644 index 000000000..9fbdc6c46 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_bf16_pipev2_instance.cpp @@ -0,0 +1,41 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_bf16_pipev2_instances( + std::vector>>& instances) +{ + // 1. 
Default + add_device_operation_instances( + instances, + device_grouped_conv_bwd_weight_two_stage_ngchw_xdl_c_shuffle_bf16_instances< + 2, + NGCHW, + GKYXC, + NGKHW, + ConvBwdWeightDefault, + BlockGemmPipelineScheduler::Intrawave, + BlockGemmPipelineVersion::v2>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_bf16_pipev5_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_bf16_pipev5_instance.cpp new file mode 100644 index 000000000..e1c865a88 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_bf16_pipev5_instance.cpp @@ -0,0 +1,41 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_bf16_pipev5_instances( + std::vector>>& instances) +{ + // 1. 
Default + add_device_operation_instances( + instances, + device_grouped_conv_bwd_weight_two_stage_ngchw_xdl_c_shuffle_bf16_instances< + 2, + NGCHW, + GKYXC, + NGKHW, + ConvBwdWeightDefault, + BlockGemmPipelineScheduler::Intrawave, + BlockGemmPipelineVersion::v5>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev2_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev2_instance.cpp new file mode 100644 index 000000000..0e4d085de --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev2_instance.cpp @@ -0,0 +1,41 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev2_instances( + std::vector>>& instances) +{ + // 1. 
Default + add_device_operation_instances( + instances, + device_grouped_conv_bwd_weight_two_stage_xdl_c_shuffle_bf16_instances< + 2, + NHWGC, + GKYXC, + NHWGK, + ConvBwdWeightDefault, + BlockGemmPipelineScheduler::Intrawave, + BlockGemmPipelineVersion::v2>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev5_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev5_instance.cpp new file mode 100644 index 000000000..680494cfd --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev5_instance.cpp @@ -0,0 +1,41 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev5_instances( + std::vector>>& instances) +{ + // 1. 
Default + add_device_operation_instances( + instances, + device_grouped_conv_bwd_weight_two_stage_xdl_c_shuffle_bf16_instances< + 2, + NHWGC, + GKYXC, + NHWGK, + ConvBwdWeightDefault, + BlockGemmPipelineScheduler::Intrawave, + BlockGemmPipelineVersion::v5>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_bf16_f32_bf16_instance.cpp similarity index 62% rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp rename to library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_bf16_f32_bf16_instance.cpp index 17f5ee4e2..69e22dee4 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_bf16_f32_bf16_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp" @@ -25,19 +25,21 @@ void add_device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_bf16_f32_bf16_in // 1. 
Default add_device_operation_instances( instances, - device_grouped_conv_bwd_weight_xdl_c_shuffle_bf16_instances<2, - GNHWC, - GKYXC, - GNHWK, - ConvBwdWeightDefault>{}); + device_grouped_conv_bwd_weight_xdl_c_shuffle_bf16_f32_bf16_instances< + 2, + GNHWC, + GKYXC, + GNHWK, + ConvBwdWeightDefault>{}); // 2. Filter1x1Stride1Pad0 - add_device_operation_instances(instances, - device_grouped_conv_bwd_weight_xdl_c_shuffle_bf16_instances< - 2, - GNHWC, - GKYXC, - GNHWK, - ConvBwdWeightFilter1x1Stride1Pad0>{}); + add_device_operation_instances( + instances, + device_grouped_conv_bwd_weight_xdl_c_shuffle_bf16_f32_bf16_instances< + 2, + GNHWC, + GKYXC, + GNHWK, + ConvBwdWeightFilter1x1Stride1Pad0>{}); } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_f32_bf16_instance.cpp similarity index 61% rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp rename to library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_f32_bf16_instance.cpp index 614cc0a7e..cac935335 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_f32_bf16_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp" @@ -25,19 +25,21 @@ void add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_f32_bf16_in // 1. Default add_device_operation_instances( instances, - device_grouped_conv_bwd_weight_xdl_c_shuffle_bf16_instances<2, - NHWGC, - GKYXC, - NHWGK, - ConvBwdWeightDefault>{}); + device_grouped_conv_bwd_weight_xdl_c_shuffle_bf16_f32_bf16_instances< + 2, + NHWGC, + GKYXC, + NHWGK, + ConvBwdWeightDefault>{}); // 2. Filter1x1Stride1Pad0 - add_device_operation_instances(instances, - device_grouped_conv_bwd_weight_xdl_c_shuffle_bf16_instances< - 2, - NHWGC, - GKYXC, - NHWGK, - ConvBwdWeightFilter1x1Stride1Pad0>{}); + add_device_operation_instances( + instances, + device_grouped_conv_bwd_weight_xdl_c_shuffle_bf16_f32_bf16_instances< + 2, + NHWGC, + GKYXC, + NHWGK, + ConvBwdWeightFilter1x1Stride1Pad0>{}); } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/CMakeLists.txt index 7857bb029..2ceac45f9 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/CMakeLists.txt @@ -2,24 +2,28 @@ set(GROUPED_CONV3D_BWD_WEIGHT xdl/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f16_instance.cpp xdl/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f32_instance.cpp - xdl/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_bf16_instance.cpp + xdl/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_bf16_f32_bf16_instance.cpp xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp - 
xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp + xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_f32_bf16_instance.cpp xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev2_instance.cpp xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev5_instance.cpp xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_f16_pipev2_instance.cpp xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_f16_pipev5_instance.cpp + xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev2_instance.cpp + xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev5_instance.cpp + xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_pipev2_instance.cpp + xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_pipev5_instance.cpp ) if(DL_KERNELS) list(APPEND GROUPED_CONV3D_BWD_WEIGHT dl/device_grouped_conv3d_bwd_weight_dl_gndhwc_gkzyxc_gndhwk_f16_instance.cpp dl/device_grouped_conv3d_bwd_weight_dl_gndhwc_gkzyxc_gndhwk_f32_instance.cpp - dl/device_grouped_conv3d_bwd_weight_dl_gndhwc_gkzyxc_gndhwk_bf16_instance.cpp + dl/device_grouped_conv3d_bwd_weight_dl_gndhwc_gkzyxc_gndhwk_bf16_f32_bf16_instance.cpp dl/device_grouped_conv3d_bwd_weight_dl_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp dl/device_grouped_conv3d_bwd_weight_dl_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp - dl/device_grouped_conv3d_bwd_weight_dl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp) + dl/device_grouped_conv3d_bwd_weight_dl_ndhwgc_gkzyxc_ndhwgk_bf16_f32_bf16_instance.cpp) endif() list(APPEND GROUPED_CONV3D_BWD_WEIGHT diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/dl/device_grouped_conv3d_bwd_weight_dl_gndhwc_gkzyxc_gndhwk_bf16_instance.cpp 
b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/dl/device_grouped_conv3d_bwd_weight_dl_gndhwc_gkzyxc_gndhwk_bf16_f32_bf16_instance.cpp similarity index 96% rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/dl/device_grouped_conv3d_bwd_weight_dl_gndhwc_gkzyxc_gndhwk_bf16_instance.cpp rename to library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/dl/device_grouped_conv3d_bwd_weight_dl_gndhwc_gkzyxc_gndhwk_bf16_f32_bf16_instance.cpp index c9646d085..eadb7afd6 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/dl/device_grouped_conv3d_bwd_weight_dl_gndhwc_gkzyxc_gndhwk_bf16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/dl/device_grouped_conv3d_bwd_weight_dl_gndhwc_gkzyxc_gndhwk_bf16_f32_bf16_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_dl_instance.hpp" diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/dl/device_grouped_conv3d_bwd_weight_dl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/dl/device_grouped_conv3d_bwd_weight_dl_ndhwgc_gkzyxc_ndhwgk_bf16_f32_bf16_instance.cpp similarity index 97% rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/dl/device_grouped_conv3d_bwd_weight_dl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp rename to library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/dl/device_grouped_conv3d_bwd_weight_dl_ndhwgc_gkzyxc_ndhwgk_bf16_f32_bf16_instance.cpp index a37e6cbf3..b39babf3e 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/dl/device_grouped_conv3d_bwd_weight_dl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/dl/device_grouped_conv3d_bwd_weight_dl_ndhwgc_gkzyxc_ndhwgk_bf16_f32_bf16_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_dl_instance.hpp" diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev2_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev2_instance.cpp new file mode 100644 index 000000000..549716586 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev2_instance.cpp @@ -0,0 +1,41 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev2_instances( + std::vector>>& instances) +{ + // 1. 
Default + add_device_operation_instances( + instances, + device_grouped_conv_bwd_weight_two_stage_xdl_c_shuffle_bf16_instances< + 3, + NDHWGC, + GKZYXC, + NDHWGK, + ConvBwdWeightDefault, + BlockGemmPipelineScheduler::Intrawave, + BlockGemmPipelineVersion::v2>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev5_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev5_instance.cpp new file mode 100644 index 000000000..18a00c6ea --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev5_instance.cpp @@ -0,0 +1,41 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev5_instances( + std::vector>>& instances) +{ + // 1. 
Default + add_device_operation_instances( + instances, + device_grouped_conv_bwd_weight_two_stage_xdl_c_shuffle_bf16_instances< + 3, + NDHWGC, + GKZYXC, + NDHWGK, + ConvBwdWeightDefault, + BlockGemmPipelineScheduler::Intrawave, + BlockGemmPipelineVersion::v5>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_pipev2_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_pipev2_instance.cpp new file mode 100644 index 000000000..ac6cb8268 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_pipev2_instance.cpp @@ -0,0 +1,41 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_pipev2_instances( + std::vector>>& instances) +{ + // 1. 
Default + add_device_operation_instances( + instances, + device_grouped_conv_bwd_weight_two_stage_ngchw_xdl_c_shuffle_bf16_instances< + 3, + NGCDHW, + GKZYXC, + NGKDHW, + ConvBwdWeightDefault, + BlockGemmPipelineScheduler::Intrawave, + BlockGemmPipelineVersion::v2>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_pipev5_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_pipev5_instance.cpp new file mode 100644 index 000000000..705f5e8ce --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_pipev5_instance.cpp @@ -0,0 +1,41 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_pipev5_instances( + std::vector>>& instances) +{ + // 1. 
Default + add_device_operation_instances( + instances, + device_grouped_conv_bwd_weight_two_stage_ngchw_xdl_c_shuffle_bf16_instances< + 3, + NGCDHW, + GKZYXC, + NGKDHW, + ConvBwdWeightDefault, + BlockGemmPipelineScheduler::Intrawave, + BlockGemmPipelineVersion::v5>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_bf16_f32_bf16_instance.cpp similarity index 60% rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_bf16_instance.cpp rename to library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_bf16_f32_bf16_instance.cpp index 91d80e4f7..81d64344f 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_bf16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_bf16_f32_bf16_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp" @@ -24,19 +24,21 @@ void add_device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_bf16_f32_bf16 // 1. 
Default add_device_operation_instances( instances, - device_grouped_conv_bwd_weight_xdl_c_shuffle_bf16_instances<3, - GNDHWC, - GKZYXC, - GNDHWK, - ConvBwdWeightDefault>{}); + device_grouped_conv_bwd_weight_xdl_c_shuffle_bf16_f32_bf16_instances< + 3, + GNDHWC, + GKZYXC, + GNDHWK, + ConvBwdWeightDefault>{}); // 2. Filter1x1Stride1Pad0 - add_device_operation_instances(instances, - device_grouped_conv_bwd_weight_xdl_c_shuffle_bf16_instances< - 3, - GNDHWC, - GKZYXC, - GNDHWK, - ConvBwdWeightFilter1x1Stride1Pad0>{}); + add_device_operation_instances( + instances, + device_grouped_conv_bwd_weight_xdl_c_shuffle_bf16_f32_bf16_instances< + 3, + GNDHWC, + GKZYXC, + GNDHWK, + ConvBwdWeightFilter1x1Stride1Pad0>{}); } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_f32_bf16_instance.cpp similarity index 61% rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp rename to library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_f32_bf16_instance.cpp index a394e0d6f..679f30a3d 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_f32_bf16_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp" @@ -25,19 +25,21 @@ void add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_f32_bf16 // 1. Default add_device_operation_instances( instances, - device_grouped_conv_bwd_weight_xdl_c_shuffle_bf16_instances<3, - NDHWGC, - GKZYXC, - NDHWGK, - ConvBwdWeightDefault>{}); + device_grouped_conv_bwd_weight_xdl_c_shuffle_bf16_f32_bf16_instances< + 3, + NDHWGC, + GKZYXC, + NDHWGK, + ConvBwdWeightDefault>{}); // 2. Filter1x1Stride1Pad0 - add_device_operation_instances(instances, - device_grouped_conv_bwd_weight_xdl_c_shuffle_bf16_instances< - 3, - NDHWGC, - GKZYXC, - NDHWGK, - ConvBwdWeightFilter1x1Stride1Pad0>{}); + add_device_operation_instances( + instances, + device_grouped_conv_bwd_weight_xdl_c_shuffle_bf16_f32_bf16_instances< + 3, + NDHWGC, + GKZYXC, + NDHWGK, + ConvBwdWeightFilter1x1Stride1Pad0>{}); } } // namespace instance diff --git a/profiler/src/profile_grouped_conv_bwd_weight.cpp b/profiler/src/profile_grouped_conv_bwd_weight.cpp index 8533f3e8f..9872ff8ac 100644 --- a/profiler/src/profile_grouped_conv_bwd_weight.cpp +++ b/profiler/src/profile_grouped_conv_bwd_weight.cpp @@ -25,7 +25,8 @@ enum struct ConvDataType F16_F16_F16, // 1 BF16_F32_BF16, // 2 F16_F16_F16_BF8_F8, // 3 - I8_I8_I8 // 4 + I8_I8_I8, // 4 + BF16_BF16_BF16, // 5 }; #define OP_NAME "grouped_conv_bwd_weight" @@ -38,7 +39,8 @@ static void print_helper_msg() << " 1: Input fp16, Weight fp16, Output fp16\n" << " 2: Input bf16, Weight fp32, Output bf16\n" << " 3: Input fp16, Weight fp16, Output fp16, Gemm bf8@fp8\n" - << " 4: Input int8, Weight int8, Output int8)\n" + << " 4: Input int8, Weight int8, Output int8\n" + << " 5: Input bf16, Weight bf16, Output bf16)\n" << "arg3: tensor layout (0: Input[G, N, C, Hi, Wi], Weight[G, K, C, Y, X], Output[G, " "N, K, Ho, Wo]\n" << " 1: Input[G, 
N, Hi, Wi, C], Weight[G, K, Y, X, C], Output[G, " @@ -187,6 +189,11 @@ int profile_grouped_conv_bwd_weight(int argc, char* argv[]) { return profile(I2, NGCHW{}, GKYXC{}, NGKHW{}, F16{}, F16{}, F16{}, F16{}, F16{}); } + if(data_type == ConvDataType::BF16_BF16_BF16) + { + // fp32 atomic add is used for weight tensor in bf16 kernel + return profile(I2, NGCHW{}, GKYXC{}, NGKHW{}, BF16{}, BF16{}, BF16{}, BF16{}, BF16{}); + } } if(num_dim_spatial == 3 && layout == ConvLayout::GNHWC_GKYXC_GNHWK) { @@ -203,6 +210,11 @@ int profile_grouped_conv_bwd_weight(int argc, char* argv[]) // fp32 atomic add is used for weight tensor in bf16 kernel return profile(I3, GNDHWC{}, GKZYXC{}, GNDHWK{}, BF16{}, F32{}, BF16{}, BF16{}, BF16{}); } + if(data_type == ConvDataType::BF16_BF16_BF16) + { + return profile( + I3, GNDHWC{}, GKZYXC{}, GNDHWK{}, BF16{}, BF16{}, BF16{}, BF16{}, BF16{}); + } else if(data_type == ConvDataType::I8_I8_I8) { return profile( @@ -224,6 +236,11 @@ int profile_grouped_conv_bwd_weight(int argc, char* argv[]) // fp32 atomic add is used for weight tensor in bf16 kernel return profile(I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, BF16{}, F32{}, BF16{}, BF16{}, BF16{}); } + if(data_type == ConvDataType::BF16_BF16_BF16) + { + return profile( + I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, BF16{}, BF16{}, BF16{}, BF16{}, BF16{}); + } if(data_type == ConvDataType::F16_F16_F16_BF8_F8) { return profile(I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, F16{}, F16{}, F16{}, BF8{}, F8{}); @@ -240,6 +257,11 @@ int profile_grouped_conv_bwd_weight(int argc, char* argv[]) { return profile(I3, NGCDHW{}, GKZYXC{}, NGKDHW{}, F16{}, F16{}, F16{}, F16{}, F16{}); } + if(data_type == ConvDataType::BF16_BF16_BF16) + { + return profile( + I3, NGCDHW{}, GKZYXC{}, NGKDHW{}, BF16{}, BF16{}, BF16{}, BF16{}, BF16{}); + } } std::cout << "this data_type & layout is not implemented" << std::endl; diff --git a/script/convert_miopen_driver_to_profiler.py b/script/convert_miopen_driver_to_profiler.py index d9f5050d0..5bcaf1448 100644 --- 
a/script/convert_miopen_driver_to_profiler.py +++ b/script/convert_miopen_driver_to_profiler.py @@ -65,8 +65,9 @@ def parse_data_type(args): if args.ck_profier_op == "grouped_conv_fwd": args.data_type = 3 if args.data_type == "bfp16": - if args.ck_profier_op == "grouped_conv_bwd_weight" or \ - args.ck_profier_op == "grouped_conv_bwd_data" or \ + if args.ck_profier_op == "grouped_conv_bwd_weight": + args.data_type = 5 + if args.ck_profier_op == "grouped_conv_bwd_data" or \ args.ck_profier_op == "grouped_conv_fwd": args.data_type = 2 -- GitLab From 4d5248e2d17770234f433f1a83aa0294ff60c7b1 Mon Sep 17 00:00:00 2001 From: Jatin Chaudhary <51944368+cjatin@users.noreply.github.com> Date: Tue, 22 Oct 2024 19:17:32 +0100 Subject: [PATCH 010/153] Explicit cast values to half (#1593) Co-authored-by: Illia Silin <98187287+illsilin@users.noreply.github.com> --- include/ck/utility/math_v2.hpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/include/ck/utility/math_v2.hpp b/include/ck/utility/math_v2.hpp index cbbe15585..b374c4ad5 100644 --- a/include/ck/utility/math_v2.hpp +++ b/include/ck/utility/math_v2.hpp @@ -653,7 +653,7 @@ inline __device__ double sin(double x) template <> inline __device__ half_t sin(half_t x) { - return ::hsin(x); + return hsin(static_cast<__half>(x)); }; template @@ -785,7 +785,7 @@ inline __device__ double ceil(double x) template <> inline __device__ half_t ceil(half_t x) { - return ::hceil(x); + return hceil(static_cast<__half>(x)); }; template @@ -827,7 +827,7 @@ inline __device__ double floor(double x) template <> inline __device__ half_t floor(half_t x) { - return ::hfloor(x); + return hfloor(static_cast<__half>(x)); }; template @@ -849,7 +849,7 @@ inline __device__ T exp(T x) template <> inline __device__ half_t exp(half_t x) { - return hexp(x); + return hexp(static_cast<__half>(x)); }; template <> @@ -873,7 +873,7 @@ inline __device__ T log(T x) template <> inline __device__ half_t log(half_t x) { - return hlog(x); + 
return hlog(static_cast<__half>(x)); }; template <> -- GitLab From cedccd59c94cb0c74e7ec0d0f6c791aed081febc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bart=C5=82omiej=20Kocot?= Date: Wed, 23 Oct 2024 12:02:33 +0200 Subject: [PATCH 011/153] [POST MERGE PR] Enable grouped conv bwd wei bf16 NGCHW (#1594) --- ...e_grouped_conv_bwd_weight_xdl_instance.hpp | 35 ++++++++++++++ .../grouped_convolution_backward_weight.hpp | 4 ++ ...rouped_convolution_backward_weight_xdl.inc | 24 ++++++++++ .../grouped_conv2d_bwd_weight/CMakeLists.txt | 1 + ...ht_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp | 46 +++++++++++++++++++ .../grouped_conv3d_bwd_weight/CMakeLists.txt | 1 + ...xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp | 46 +++++++++++++++++++ .../src/profile_grouped_conv_bwd_weight.cpp | 9 ++-- 8 files changed, 161 insertions(+), 5 deletions(-) create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp index 32f52770b..a08d73546 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp @@ -141,6 +141,41 @@ using device_grouped_conv_bwd_weight_xdl_c_shuffle_bf16_f32_bf16_instances = std // clang-format on >; +template +using device_grouped_conv_bwd_weight_xdl_c_shuffle_bf16_instances = std::tuple< + // clang-format off + 
//#########################################| Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| AccData| In| Wei| Out| ConvBackward| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransfer| CBlockTransfer| + //#########################################| Dim| | | | Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Weight| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| ClusterLengths| ScalarPerVector| + //#########################################| Spatial| | | | | | | | Operation| Operation| Operation| Specialization| | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| MBlock_MPerBlock| NWaveNPerXdl| + //#########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | NBlock_NPerBlock| | + // generic instance + DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 64, 64, 64, 4, 8, 32, 32, 2, 2, S<1, 4, 8, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 2, 4, true, S<1, 4, 8, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 2, 4, true, 1, 1, S<1, 16, 1, 4>, 2>, + // instance for small conv.K + // for bf16 conv.K and conv.C must be divisible by 2 + // since half_t atomic_add require scalar_per_x_vector % 2 == 0 + DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, F32, PassThrough, PassThrough, 
PassThrough, ConvSpec, 128, 128, 32, 4, 8, 32, 32, 2, 1, S<1, 4, 16, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, S<1, 4, 4, 8>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 2, 1, true, 1, 1, S<1, 32, 1, 4>, 2>, + DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 64, 32, 64, 4, 8, 32, 32, 1, 2, S<1, 4, 4, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 2, 2, true, S<1, 4, 8, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, 1, 1, S<1, 16, 1, 4>, 8>, + + DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 256, 256, 128, 4, 8, 32, 32, 4, 2, S<1, 4, 32, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, S<1, 4, 16, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 256, 128, 256, 4, 8, 32, 32, 2, 4, S<1, 4, 16, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, S<1, 4, 32, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 128, 128, 128, 4, 8, 32, 32, 4, 2, S<1, 4, 16, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, S<1, 4, 16, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 256, 128, 128, 4, 8, 32, 32, 2, 2, S<1, 4, 16, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, S<1, 4, 16, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, F32, 
PassThrough, PassThrough, PassThrough, ConvSpec, 128, 128, 64, 4, 8, 32, 32, 2, 2, S<1, 4, 16, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, S<1, 4, 8, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 128, 64, 128, 4, 8, 32, 32, 2, 2, S<1, 4, 8, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, S<1, 4, 16, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 64, 64, 64, 4, 8, 32, 32, 2, 2, S<1, 4, 8, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, S<1, 4, 8, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 256, 128, 64, 4, 8, 32, 32, 2, 1, S<1, 4, 16, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, S<1, 4, 8, 8>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 1, true, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 256, 64, 128, 4, 8, 32, 32, 1, 2, S<1, 4, 8, 8>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 1, true, S<1, 4, 16, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 128, 128, 32, 4, 8, 32, 32, 2, 1, S<1, 4, 16, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, S<1, 4, 4, 8>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 1, true, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, BF16, 
BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 128, 32, 128, 4, 8, 32, 32, 1, 2, S<1, 4, 4, 8>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 1, true, S<1, 4, 16, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 64, 64, 32, 4, 8, 32, 32, 2, 1, S<1, 4, 8, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, S<1, 4, 4, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 64, 32, 64, 4, 8, 32, 32, 1, 2, S<1, 4, 4, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, S<1, 4, 8, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, 1, 1, S<1, 16, 1, 4>, 8> + // clang-format on + >; + template && is_same_v) { + add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_instances( + op_ptrs); add_device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev2_instances( op_ptrs); add_device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev5_instances( @@ -483,6 +485,8 @@ struct DeviceOperationInstanceFactory && is_same_v) { + add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instances( + op_ptrs); add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev2_instances( op_ptrs); add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev5_instances( diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight_xdl.inc b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight_xdl.inc index 5f6f2fc6f..132dde81a 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight_xdl.inc +++ 
b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight_xdl.inc @@ -89,6 +89,18 @@ void add_device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f32_instances( PassThrough>>>& instances); #endif #ifdef CK_ENABLE_BF16 +void add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_instances( + std::vector>>& instances); + void add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_f32_bf16_instances( std::vector>>& instances); #endif #ifdef CK_ENABLE_BF16 +void add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instances( + std::vector>>& instances); + void add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_f32_bf16_instances( std::vector>>& instances) +{ + // 1. Default + add_device_operation_instances( + instances, + device_grouped_conv_bwd_weight_xdl_c_shuffle_bf16_instances<2, + NHWGC, + GKYXC, + NHWGK, + ConvBwdWeightDefault>{}); + // 2. Filter1x1Stride1Pad0 + add_device_operation_instances(instances, + device_grouped_conv_bwd_weight_xdl_c_shuffle_bf16_instances< + 2, + NHWGC, + GKYXC, + NHWGK, + ConvBwdWeightFilter1x1Stride1Pad0>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/CMakeLists.txt index 2ceac45f9..c8c30897c 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/CMakeLists.txt @@ -6,6 +6,7 @@ set(GROUPED_CONV3D_BWD_WEIGHT xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_f32_bf16_instance.cpp + 
xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev2_instance.cpp xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev5_instance.cpp xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_f16_pipev2_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp new file mode 100644 index 000000000..f1ea37181 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp @@ -0,0 +1,46 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +void add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instances( + std::vector>>& instances) +{ + // 1. Default + add_device_operation_instances( + instances, + device_grouped_conv_bwd_weight_xdl_c_shuffle_bf16_instances<3, + NDHWGC, + GKZYXC, + NDHWGK, + ConvBwdWeightDefault>{}); + // 2. 
Filter1x1Stride1Pad0 + add_device_operation_instances(instances, + device_grouped_conv_bwd_weight_xdl_c_shuffle_bf16_instances< + 3, + NDHWGC, + GKZYXC, + NDHWGK, + ConvBwdWeightFilter1x1Stride1Pad0>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/profiler/src/profile_grouped_conv_bwd_weight.cpp b/profiler/src/profile_grouped_conv_bwd_weight.cpp index 9872ff8ac..4170ac65a 100644 --- a/profiler/src/profile_grouped_conv_bwd_weight.cpp +++ b/profiler/src/profile_grouped_conv_bwd_weight.cpp @@ -182,6 +182,10 @@ int profile_grouped_conv_bwd_weight(int argc, char* argv[]) // fp32 atomic add is used for weight tensor in bf16 kernel return profile(I2, NHWGC{}, GKYXC{}, NHWGK{}, BF16{}, F32{}, BF16{}, BF16{}, BF16{}); } + if(data_type == ConvDataType::BF16_BF16_BF16) + { + return profile(I2, NHWGC{}, GKYXC{}, NHWGK{}, BF16{}, BF16{}, BF16{}, BF16{}, BF16{}); + } } else if(num_dim_spatial == 2 && layout == ConvLayout::NGCHW_GKYXC_NGKHW) { @@ -210,11 +214,6 @@ int profile_grouped_conv_bwd_weight(int argc, char* argv[]) // fp32 atomic add is used for weight tensor in bf16 kernel return profile(I3, GNDHWC{}, GKZYXC{}, GNDHWK{}, BF16{}, F32{}, BF16{}, BF16{}, BF16{}); } - if(data_type == ConvDataType::BF16_BF16_BF16) - { - return profile( - I3, GNDHWC{}, GKZYXC{}, GNDHWK{}, BF16{}, BF16{}, BF16{}, BF16{}, BF16{}); - } else if(data_type == ConvDataType::I8_I8_I8) { return profile( -- GitLab From 8e22e1ae31bbf7086f69d8724e027676791d351a Mon Sep 17 00:00:00 2001 From: Illia Silin <98187287+illsilin@users.noreply.github.com> Date: Wed, 23 Oct 2024 15:55:39 -0700 Subject: [PATCH 012/153] fix the logic of enabling XDL and WMMA instances (#1595) --- CMakeLists.txt | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 0700fe838..6a5180363 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -177,18 +177,14 @@ 
rocm_check_target_ids(SUPPORTED_GPU_TARGETS message("Building CK for the following targets: ${SUPPORTED_GPU_TARGETS}") -if (GPU_TARGETS) - if (GPU_TARGETS MATCHES "gfx9") - add_definitions(-DCK_USE_XDL) - set(CK_USE_XDL "ON") - endif() - if (GPU_TARGETS MATCHES "gfx11" OR GPU_TARGETS MATCHES "gfx12") - add_definitions(-DCK_USE_WMMA) - set(CK_USE_WMMA "ON") - endif() -else() - add_definitions(-DCK_USE_WMMA -DCK_USE_XDL) +if (SUPPORTED_GPU_TARGETS MATCHES "gfx9") + message("Enabling XDL instances") + add_definitions(-DCK_USE_XDL) set(CK_USE_XDL "ON") +endif() +if (SUPPORTED_GPU_TARGETS MATCHES "gfx11" OR SUPPORTED_GPU_TARGETS MATCHES "gfx12") + message("Enabling WMMA instances") + add_definitions(-DCK_USE_WMMA) set(CK_USE_WMMA "ON") endif() @@ -578,7 +574,7 @@ rocm_package_setup_component(profiler ) add_subdirectory(profiler) -if(CK_USE_CODEGEN AND (GPU_TARGETS MATCHES "gfx9" OR GPU_ARCHS)) +if(CK_USE_CODEGEN AND (SUPPORTED_GPU_TARGETS MATCHES "gfx9" OR GPU_ARCHS)) add_subdirectory(codegen) endif() -- GitLab From 9183ce69cac01374d0eafbdb4258cf1744b5a548 Mon Sep 17 00:00:00 2001 From: dummycoderfe Date: Fri, 25 Oct 2024 11:17:45 +0800 Subject: [PATCH 013/153] hot_fix epsilon pos (#1597) Co-authored-by: dummycoderfe --- .../layernorm2d/pipeline/layernorm2d_fwd_pipeline_one_pass.hpp | 2 +- .../layernorm2d/pipeline/layernorm2d_fwd_pipeline_two_pass.hpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_one_pass.hpp b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_one_pass.hpp index d73bcb29e..bf002141b 100644 --- a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_one_pass.hpp +++ b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_one_pass.hpp @@ -90,7 +90,7 @@ struct Layernorm2dFwdPipelineOnePass // compute inv-std auto inv_std = tile_elementwise_in( [&](const auto& v_) { - return type_convert(1.0f) / (sqrt(v_) + epsilon); + return 
type_convert(1.0f) / (sqrt(v_ + epsilon)); }, var); diff --git a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_two_pass.hpp b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_two_pass.hpp index dcbfc87da..db094ac2a 100644 --- a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_two_pass.hpp +++ b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_two_pass.hpp @@ -105,7 +105,7 @@ struct Layernorm2dFwdPipelineTwoPass // compute inv-std auto inv_std = tile_elementwise_in( [&](const auto& v_) { - return type_convert(1.0f) / (sqrt(v_) + epsilon); + return type_convert(1.0f) / (sqrt(v_ + epsilon)); }, var); -- GitLab From 9385caa3069b8b366c365765164df0c0b6b32925 Mon Sep 17 00:00:00 2001 From: aledudek Date: Fri, 25 Oct 2024 12:46:24 +0200 Subject: [PATCH 014/153] Generic threshold calculation (#1546) * Calculate generic relative threshold pool3dfwd * Calculate absolute error threshold pool3d fwd * Generic threshold calculation take max input for relative error pool3dfwd * Remove max possible value for error calculation at runtime * Remove debug print in pool3dfwd * Pool3d fwd adjusted types in generic threshold calculation * Generic threshold calculation take into account number of accumulations and accdatatype * Generic threshold fix final error formula * Generic threshold calculation - num of accs fix * Generic threshold calculation - adjust absolute error * Generic threshold calculation - OutDataType in absolute error --- include/ck/utility/data_type.hpp | 9 ++ .../include/ck/library/utility/check_err.hpp | 127 ++++++++++++++++++ .../profiler/profile_pool3d_fwd_impl.hpp | 38 +++++- 3 files changed, 167 insertions(+), 7 deletions(-) diff --git a/include/ck/utility/data_type.hpp b/include/ck/utility/data_type.hpp index debeb472a..39f532e0e 100644 --- a/include/ck/utility/data_type.hpp +++ b/include/ck/utility/data_type.hpp @@ -1803,4 +1803,13 @@ struct NumericUtils static constexpr int bias = 16; // 
negative zero nan mode // static constexpr int bias = 15; // ieee mode }; + +template <> +struct NumericUtils +{ + static constexpr int exp = 8; + static constexpr int mant = 7; + static constexpr int bias = 128; // negative zero nan mode + // static constexpr int bias = 127; // ieee mode +}; } // namespace ck diff --git a/library/include/ck/library/utility/check_err.hpp b/library/include/ck/library/utility/check_err.hpp index 58479f212..73ac2a189 100644 --- a/library/include/ck/library/utility/check_err.hpp +++ b/library/include/ck/library/utility/check_err.hpp @@ -23,6 +23,130 @@ namespace ck { namespace utils { +template +double get_relative_threshold(const int numberOfAccumulations = 1) +{ + using F8 = ck::f8_t; + using F16 = ck::half_t; + using BF16 = ck::bhalf_t; + using F32 = float; + using I8 = int8_t; + using I32 = int32_t; + + static_assert(is_same_v || is_same_v || + is_same_v || is_same_v || + is_same_v || is_same_v || + is_same_v, + "Warning: Unhandled ComputeDataType for setting up the relative threshold!"); + double compute_error = 0; + if constexpr(is_same_v || is_same_v || + is_same_v) + { + return 0; + } + else + { + compute_error = std::pow(2, -NumericUtils::mant) * 0.5; + } + + static_assert(is_same_v || is_same_v || + is_same_v || is_same_v || + is_same_v || is_same_v || + is_same_v, + "Warning: Unhandled OutDataType for setting up the relative threshold!"); + double output_error = 0; + if constexpr(is_same_v || is_same_v || + is_same_v) + { + return 0; + } + else + { + output_error = std::pow(2, -NumericUtils::mant) * 0.5; + } + double midway_error = std::max(compute_error, output_error); + + static_assert(is_same_v || is_same_v || + is_same_v || is_same_v || + is_same_v || is_same_v || + is_same_v, + "Warning: Unhandled AccDataType for setting up the relative threshold!"); + double acc_error = 0; + if constexpr(is_same_v || is_same_v || + is_same_v) + { + return 0; + } + else + { + acc_error = std::pow(2, -NumericUtils::mant) * 0.5 * 
numberOfAccumulations; + } + return std::max(acc_error, midway_error); +} + +template +double get_absolute_threshold(const double max_possible_num, const int numberOfAccumulations = 1) +{ + using F8 = ck::f8_t; + using F16 = ck::half_t; + using BF16 = ck::bhalf_t; + using F32 = float; + using I8 = int8_t; + using I32 = int32_t; + + static_assert(is_same_v || is_same_v || + is_same_v || is_same_v || + is_same_v || is_same_v || + is_same_v, + "Warning: Unhandled ComputeDataType for setting up the absolute threshold!"); + auto expo = std::log2(std::abs(max_possible_num)); + double compute_error = 0; + if constexpr(is_same_v || is_same_v || + is_same_v) + { + return 0; + } + else + { + compute_error = std::pow(2, expo - NumericUtils::mant) * 0.5; + } + + static_assert(is_same_v || is_same_v || + is_same_v || is_same_v || + is_same_v || is_same_v || + is_same_v, + "Warning: Unhandled OutDataType for setting up the absolute threshold!"); + double output_error = 0; + if constexpr(is_same_v || is_same_v || + is_same_v) + { + return 0; + } + else + { + output_error = std::pow(2, expo - NumericUtils::mant) * 0.5; + } + double midway_error = std::max(compute_error, output_error); + + static_assert(is_same_v || is_same_v || + is_same_v || is_same_v || + is_same_v || is_same_v || + is_same_v, + "Warning: Unhandled AccDataType for setting up the absolute threshold!"); + double acc_error = 0; + if constexpr(is_same_v || is_same_v || + is_same_v) + { + return 0; + } + else + { + acc_error = + std::pow(2, expo - NumericUtils::mant) * 0.5 * numberOfAccumulations; + } + return std::max(acc_error, midway_error); +} + template typename std::enable_if< std::is_same_v, ranges::range_value_t> && @@ -253,11 +377,13 @@ check_err(const Range& out, int err_count = 0; double err = 0; double max_err = std::numeric_limits::min(); + for(std::size_t i = 0; i < ref.size(); ++i) { const double o = type_convert(*std::next(std::begin(out), i)); const double r = type_convert(*std::next(std::begin(ref), 
i)); err = std::abs(o - r); + if(err > atol + rtol * std::abs(r) || !std::isfinite(o) || !std::isfinite(r)) { max_err = err > max_err ? err : max_err; @@ -270,6 +396,7 @@ check_err(const Range& out, res = false; } } + if(!res) { std::cerr << std::setw(12) << std::setprecision(7) << "max err: " << max_err diff --git a/profiler/include/profiler/profile_pool3d_fwd_impl.hpp b/profiler/include/profiler/profile_pool3d_fwd_impl.hpp index 3bdaa5c83..a0890028a 100644 --- a/profiler/include/profiler/profile_pool3d_fwd_impl.hpp +++ b/profiler/include/profiler/profile_pool3d_fwd_impl.hpp @@ -102,11 +102,22 @@ bool profile_pool3d_fwd_impl(PoolFwdInputParams& in_params, PoolFwdKernelParams& Tensor out_indices_n_c_do_ho_wo_device( f_host_tensor_descriptor(N, C, Do, Ho, Wo)); + constexpr int inDataRangeTensor1{1}; + constexpr int inDataRangeTensor2{5}; + constexpr double inDataRangeTensor3{0.5}; + switch(in_params.init_method) { - case 0: in_n_c_di_hi_wi.GenerateTensorValue(GeneratorTensor_1{}); break; - case 1: in_n_c_di_hi_wi.GenerateTensorValue(GeneratorTensor_2{-5, 5}); break; - default: in_n_c_di_hi_wi.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + case 0: + in_n_c_di_hi_wi.GenerateTensorValue(GeneratorTensor_1{inDataRangeTensor1}); + break; + case 1: + in_n_c_di_hi_wi.GenerateTensorValue( + GeneratorTensor_2{-inDataRangeTensor2, inDataRangeTensor2}); + break; + default: + in_n_c_di_hi_wi.GenerateTensorValue( + GeneratorTensor_3{-inDataRangeTensor3, inDataRangeTensor3}); } DeviceMem in_device_buf(sizeof(InDataType) * in_n_c_di_hi_wi.mDesc.GetElementSpaceSize()); @@ -229,12 +240,25 @@ bool profile_pool3d_fwd_impl(PoolFwdInputParams& in_params, PoolFwdKernelParams& { out_device_buf.FromDevice(out_n_c_do_ho_wo_device.mData.data()); - auto tolerance = 1e-3; - bool pass = ck::utils::check_err(out_n_c_do_ho_wo_device.mData, + auto absolute_error_threshold = 1.0; + switch(in_params.init_method) + { + case 0: absolute_error_threshold = static_cast(inDataRangeTensor1); break; + 
case 1: absolute_error_threshold = static_cast(inDataRangeTensor2); break; + default: absolute_error_threshold = inDataRangeTensor3; + } + + absolute_error_threshold = + ck::utils::get_absolute_threshold( + absolute_error_threshold); + auto relative_error_threshold = + ck::utils::get_relative_threshold(); + + bool pass = ck::utils::check_err(out_n_c_do_ho_wo_device.mData, out_n_c_do_ho_wo_host.mData, "Error: Incorrect results", - tolerance, - tolerance); + relative_error_threshold, + absolute_error_threshold); if constexpr(OutputIndex) { -- GitLab From 7d576f1748eca6f02f5ab3e0a860ed3cb3a9c6d8 Mon Sep 17 00:00:00 2001 From: Rostyslav Geyyer <46627076+geyyer@users.noreply.github.com> Date: Fri, 25 Oct 2024 10:13:46 -0500 Subject: [PATCH 015/153] Update GPU verification (#1596) * Update inits * Update static_cast to type_convert * Add verification option selection --- example/01_gemm/common.hpp | 15 ++++++----- example/01_gemm/run_gemm_example.inc | 27 ++++++++++--------- .../01_gemm/run_gemm_example_streamk_v2.inc | 2 +- example/01_gemm/run_gemm_example_v2.inc | 2 +- .../gpu/reference_gemm.hpp | 10 +++---- 5 files changed, 30 insertions(+), 26 deletions(-) diff --git a/example/01_gemm/common.hpp b/example/01_gemm/common.hpp index d08196924..6e1c9f2a0 100644 --- a/example/01_gemm/common.hpp +++ b/example/01_gemm/common.hpp @@ -75,9 +75,10 @@ struct ProblemSizeSplitK final struct ExecutionConfig final { - bool do_verification = true; - int init_method = 2; - bool time_kernel = false; + // 0 - no verification, 1 - CPU, 2 - GPU, 3 - CPU + GPU + int do_verification = 3; + int init_method = 2; + bool time_kernel = false; }; template @@ -126,7 +127,7 @@ bool parse_cmd_args(int argc, } else { - std::cerr << "arg1: verification (0=no, 1=CPU and GPU)" << std::endl + std::cerr << "arg1: verification (0=no, 1=CPU, 2=GPU, 3=CPU and GPU)" << std::endl << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)" << std::endl << "arg3: time kernel (0=no, 1=yes)" << 
std::endl @@ -176,7 +177,7 @@ bool parse_cmd_args(int argc, else { std::cerr - << "arg1: verification (0=no, 1=CPU and GPU)" << std::endl + << "arg1: verification (0=no, 1=CPU, 2=GPU, 3=CPU and GPU)" << std::endl << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)" << std::endl << "arg3: time kernel (0=no, 1=yes)" << std::endl << "arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC" << std::endl @@ -225,7 +226,7 @@ bool parse_cmd_args(int argc, } else { - std::cerr << "arg1: verification (0=no, 1=CPU and GPU)" << std::endl + std::cerr << "arg1: verification (0=no, 1=CPU, 2=GPU, 3=CPU and GPU)" << std::endl << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)" << std::endl << "arg3: time kernel (0=no, 1=yes)" << std::endl @@ -275,7 +276,7 @@ bool parse_cmd_args(int argc, } else { - std::cerr << "arg1: verification (0=no, 1=CPU and GPU)" << std::endl + std::cerr << "arg1: verification (0=no, 1=CPU, 2=GPU, 3=CPU and GPU)" << std::endl << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)" << std::endl << "arg3: time kernel (0=no, 1=yes)" << std::endl diff --git a/example/01_gemm/run_gemm_example.inc b/example/01_gemm/run_gemm_example.inc index fe12998e3..bafec3f35 100644 --- a/example/01_gemm/run_gemm_example.inc +++ b/example/01_gemm/run_gemm_example.inc @@ -330,7 +330,7 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config) bool pass = true; - if(config.do_verification) + if((config.do_verification == 1) || (config.do_verification == 3)) { // CPU verification auto ref_gemm = ReferenceGemmInstance{}; @@ -353,13 +353,16 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config) #else c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data()); - pass &= !ck::utils::check_err(c_m_n_device_result, - c_m_n_host_result, - "Error: Incorrect results!", - get_rtol(), - get_atol()); + pass &= ck::utils::check_err(c_m_n_device_result, + c_m_n_host_result, + 
"Error: Incorrect results!", + get_rtol(), + get_atol()); #endif + } + if((config.do_verification == 2) || (config.do_verification == 3)) + { // GPU verification auto ref_gemm_gpu = ReferenceGemmInstanceGPU{}; auto ref_invoker_gpu = ref_gemm_gpu.MakeInvoker(); @@ -381,14 +384,14 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config) c_m_n_device_ref_buf.FromDevice(c_m_n_device_ref_result.mData.data()); c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data()); - pass &= !ck::utils::check_err(c_m_n_device_result, - c_m_n_device_ref_result, - "Error: Incorrect results!", - get_rtol(), - get_atol()); + pass &= ck::utils::check_err(c_m_n_device_result, + c_m_n_device_ref_result, + "Error: Incorrect results!", + get_rtol(), + get_atol()); } - return !pass; + return pass == true; } bool run_gemm_example(int argc, char* argv[]) diff --git a/example/01_gemm/run_gemm_example_streamk_v2.inc b/example/01_gemm/run_gemm_example_streamk_v2.inc index 6679f9515..8ed8b81be 100644 --- a/example/01_gemm/run_gemm_example_streamk_v2.inc +++ b/example/01_gemm/run_gemm_example_streamk_v2.inc @@ -241,7 +241,7 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config) } bool pass = true; - if(config.do_verification) + if((config.do_verification == 1) || (config.do_verification == 3)) { auto ref_gemm = ReferenceGemmInstance{}; auto ref_invoker = ref_gemm.MakeInvoker(); diff --git a/example/01_gemm/run_gemm_example_v2.inc b/example/01_gemm/run_gemm_example_v2.inc index 0bcee658b..71524fdec 100644 --- a/example/01_gemm/run_gemm_example_v2.inc +++ b/example/01_gemm/run_gemm_example_v2.inc @@ -228,7 +228,7 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config) } bool pass = true; - if(config.do_verification) + if((config.do_verification == 1) || (config.do_verification == 3)) { auto ref_gemm = ReferenceGemmInstance{}; auto ref_invoker = ref_gemm.MakeInvoker(); diff --git 
a/library/include/ck/library/reference_tensor_operation/gpu/reference_gemm.hpp b/library/include/ck/library/reference_tensor_operation/gpu/reference_gemm.hpp index 639b5fe80..2c2cac77e 100644 --- a/library/include/ck/library/reference_tensor_operation/gpu/reference_gemm.hpp +++ b/library/include/ck/library/reference_tensor_operation/gpu/reference_gemm.hpp @@ -45,10 +45,10 @@ __global__ void if(row_idx < m && col_idx < n) { - AccDataType v_acc = static_cast(0.0); - ComputeTypeA v_a = static_cast(0.0); - ComputeTypeB v_b = static_cast(0.0); - CDataType v_c = static_cast(0.0); + AccDataType v_acc{0}; + ComputeTypeA v_a{0}; + ComputeTypeB v_b{0}; + CDataType v_c{0}; for(int k_idx = 0; k_idx < k; ++k_idx) { @@ -76,7 +76,7 @@ __global__ void // apply b_element_op b_element_op(v_b, p_b_grid[element_idx_b]); // multiply and accumulate - v_acc += static_cast(v_a) * static_cast(v_b); + v_acc += type_convert(v_a) * type_convert(v_b); } // apply c_element_op c_element_op(v_c, v_acc); -- GitLab From eda593838621984ea008a783ca0093350a7bf60e Mon Sep 17 00:00:00 2001 From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com> Date: Fri, 18 Oct 2024 17:09:12 +0000 Subject: [PATCH 016/153] add parsing grouped conv fwd instances --- .../grouped_conv_fwd/gen_instances.py | 167 ++++++++++++++++++ python/ck4inductor/grouped_conv_fwd/op.py | 93 ++++++++++ .../universal_gemm/gen_instances.py | 5 +- python/ck4inductor/universal_gemm/op.py | 3 + python/ck4inductor/util.py | 5 +- 5 files changed, 271 insertions(+), 2 deletions(-) create mode 100644 python/ck4inductor/grouped_conv_fwd/gen_instances.py create mode 100644 python/ck4inductor/grouped_conv_fwd/op.py diff --git a/python/ck4inductor/grouped_conv_fwd/gen_instances.py b/python/ck4inductor/grouped_conv_fwd/gen_instances.py new file mode 100644 index 000000000..ffbea6bdc --- /dev/null +++ b/python/ck4inductor/grouped_conv_fwd/gen_instances.py @@ -0,0 +1,167 @@ +# SPDX-License-Identifier: MIT +# Copyright (c) 2018-2024, Advanced 
Micro Devices, Inc. All rights reserved. + +import logging +import os +import subprocess +from dataclasses import replace +from functools import lru_cache +from typing import List + +from ..util import library_path + +from .op import CKGroupedConvFwdOp + +log = logging.getLogger(__name__) + + +def _ck_conv_instances_path(): + conv_instances_path = os.path.join( # noqa: F821 + library_path(), + "include", + "ck", + "library", + "tensor_operation_instance", + "gpu", + "grouped_conv_fwd", + ) + if not os.path.exists(conv_instances_path): + log.error( + "CK library conv instances path %s does not exist", conv_instances_path + ) + return None + return conv_instances_path + + +def parse_instances(str_instances: List[str]) -> List[CKGroupedConvFwdOp]: + """ + Parse the lines containing Grouped Convolution Forward template instances + into `CKGroupedConvFwdOp` instances + """ + + def maybe_int(s): + try: + return int(s) + except ValueError: + return s + + op_instances = [] + # TODO: maybe use libclang for parsing C++ code in the future + # to avoid this hacky parsing logic below ? 
:) - copilot + for line in str_instances: + s_template_args = line.split("DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3")[ + -1 + ].strip("<>, ") + template_args = [] + i_current = 0 + while i_current < len(s_template_args): + if s_template_args[i_current] == " ": + # skip whitespace + i_current += 1 + continue + elif s_template_args[i_current : i_current + 2] == "S<": + # parse template S + i_next = s_template_args.find(">", i_current) + template_args.append( + tuple(map(int, s_template_args[i_current + 2 : i_next].split(","))) + ) + i_current = i_next + 2 + else: + # all string attributes must be either type aliases or global constants in C++ + i_next = s_template_args.find(",", i_current) + template_args.append( + maybe_int( + s_template_args[i_current : i_next if i_next != -1 else None] + ) + ) + if i_next != -1: + i_current = i_next + 1 + if i_next == -1: + break + + template_args[0] = -1 # n_dim_spatial + template_args[3] = tuple() # ds_layout + template_args[9] = tuple() # ds_element_dtype + + new_instance = CKGroupedConvFwdOp( + *template_args, # type: ignore[arg-type] + ) + + op_instances.append(new_instance) + return op_instances + + +@lru_cache(None) +def gen_conv_ops_library() -> List[CKGroupedConvFwdOp]: + """ + Parse the Grouped Convolution Forward instances + defined in the Composable Kernel library folder. 
+ """ + ck_library_dir = _ck_conv_instances_path() + if not ck_library_dir: + return [] + + grep_result = subprocess.run( + [ + "grep", + "-inR", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3", + ck_library_dir, + ], + capture_output=True, + text=True, + ) + + op_instances = parse_instances(grep_result.stdout.strip().split("\n")) + + log.debug("ck instances from library: %d", len(op_instances)) + + schedulers = [ + "BlockGemmPipelineScheduler::Intrawave", + "BlockGemmPipelineScheduler::Interwave", + ] + conv_specs = [ + "ConvolutionForwardSpecialization::Default", + "ConvolutionForwardSpecialization::Filter1x1Pad0", + "ConvolutionForwardSpecialization::Filter1x1Stride1Pad0", + "ConvolutionForwardSpecialization::OddC", + ] + + # substitute templated args by looping through their domains + substitute_instances = [] + for instance in op_instances: + sub_scheduler = ( + instance.block_gemm_pipeline_scheduler == "BlkGemmPipeSched" + ) + sub_spec = instance.conv_forward_specialization == "ConvSpec" + schedulers_range = ( + schedulers if sub_scheduler else [instance.block_gemm_pipeline_scheduler] + ) + spec_range = conv_specs if sub_spec else [instance.conv_forward_specialization] + for scheduler in schedulers_range: + for spec in spec_range: + for channels_last in [True, False]: + if channels_last: + a_layout = "NHWGC" + e_layout = "NHWGK" + else: + a_layout = "NGCHW" + e_layout = "NGKHW" + substitute_instances.append( + replace( + instance, + block_gemm_pipeline_scheduler=scheduler, + conv_forward_specialization=spec, + gemm_specialization="GemmSpecialization::MNKPadding", + n_dim_spatial=2, + a_layout=a_layout, + b_layout="GKYXC", + e_layout=e_layout, + ) + ) + + return substitute_instances + + +if __name__ == "__main__": + print(gen_conv_ops_library()) diff --git a/python/ck4inductor/grouped_conv_fwd/op.py b/python/ck4inductor/grouped_conv_fwd/op.py new file mode 100644 index 000000000..25d45e8ff --- /dev/null +++ b/python/ck4inductor/grouped_conv_fwd/op.py @@ 
-0,0 +1,93 @@ +# SPDX-License-Identifier: MIT +# Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +from dataclasses import asdict, dataclass +from typing import Optional, Tuple + + +@dataclass +class CKGroupedConvFwdOp: + n_dim_spatial: int + a_layout: str + b_layout: str + ds_layout: Tuple[str] + e_layout: str + a_element_dtype: str + b_element_dtype: str + acc_dtype: str + c_shuffle_dtype: str + ds_element_dtype: Tuple[str] + e_element_dtype: str + a_elementwise_op: str + b_elementwise_op: str + cde_elementwise_op: str + conv_forward_specialization: str + gemm_specialization: str + + block_size: int + m_per_block: int + n_per_block: int + k_per_block: int + ak1: int + bk1: int + m_per_xdl: int + n_per_xdl: int + m_xdl_per_wave: int + n_xdl_per_wave: int + a_block_transfer_thread_cluster_lengths_ak0_m_ak1: Tuple[int, int, int] + a_block_transfer_thread_cluster_arrange_order: Tuple[int, int, int] + a_block_transfer_src_access_order: Tuple[int, int, int] + a_block_transfer_src_vector_dim: int + a_block_transfer_src_scalar_per_vector: int + a_block_transfer_dst_scalar_per_vector_ak1: int + a_block_lds_extra_m: bool + + b_block_transfer_thread_cluster_lengths_bk0_n_bk1: Tuple[int, int, int] + b_block_transfer_thread_cluster_arrange_order: Tuple[int, int, int] + b_block_transfer_src_access_order: Tuple[int, int, int] + + b_block_transfer_src_vector_dim: int + b_block_transfer_src_scalar_per_vector: int + b_block_transfer_dst_scalar_per_vector_bk1: int + b_block_lds_extra_n: bool + + c_shuffle_m_xdl_per_wave_per_shuffle: int + c_shuffle_n_xdl_per_wave_per_shuffle: int + cde_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block: Tuple[ # noqa + int, + int, + int, + int, + ] + cde_block_transfer_scalar_per_vector_n_per_block: int + block_gemm_pipeline_scheduler: str + block_gemm_pipeline_version: str + + a_compute_dtype: Optional[str] = None + b_compute_dtype: Optional[str] = None + + def name(self): + # cpp alias for template 
instance + return ( + f"ck_device_grouped_convolution_fwd_multiple_abd_xdl_c_shuffle_v3_" + f"{self.key_name()}" + ) + + def key_name(self): + # TBD; must be unique per instance. Intended to use as dict key + return "_".join( + [ + "K" + + field_name.replace("_", "").lower() + + "V" + + ( + "x".join(map(str, iter(field_value))) + if isinstance(field_value, tuple) + else str(field_value).replace(":", "") + ) + for field_name, field_value in self.dict_items() + ] + ) + + def dict_items(self): + return asdict(self).items() diff --git a/python/ck4inductor/universal_gemm/gen_instances.py b/python/ck4inductor/universal_gemm/gen_instances.py index 5594b8681..24bab5477 100644 --- a/python/ck4inductor/universal_gemm/gen_instances.py +++ b/python/ck4inductor/universal_gemm/gen_instances.py @@ -1,7 +1,10 @@ +# SPDX-License-Identifier: MIT +# Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + import logging import os import subprocess -from dataclasses import fields, replace +from dataclasses import replace from functools import lru_cache, partial from typing import List diff --git a/python/ck4inductor/universal_gemm/op.py b/python/ck4inductor/universal_gemm/op.py index a8bb72500..946aaa7af 100644 --- a/python/ck4inductor/universal_gemm/op.py +++ b/python/ck4inductor/universal_gemm/op.py @@ -1,3 +1,6 @@ +# SPDX-License-Identifier: MIT +# Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + from dataclasses import asdict, dataclass from typing import Optional, Tuple diff --git a/python/ck4inductor/util.py b/python/ck4inductor/util.py index 79d6be00f..4d7e8bd87 100644 --- a/python/ck4inductor/util.py +++ b/python/ck4inductor/util.py @@ -1,7 +1,10 @@ +# SPDX-License-Identifier: MIT +# Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ import functools import os @functools.lru_cache(None) def library_path(): - return os.path.join(os.path.dirname(__file__), 'library') + return os.path.join(os.path.dirname(__file__), "library") -- GitLab From 37f7afed1e2a19be8c04b7cd26d07db41c082e88 Mon Sep 17 00:00:00 2001 From: valarLip <103567126+valarLip@users.noreply.github.com> Date: Sat, 26 Oct 2024 16:39:34 +0800 Subject: [PATCH 017/153] add int8 gemm multiply multiply a8w8 (#1591) * add int8 gemm multiply multiply a8w8 * uncomment * clang-format-12 * Add example_gemm_multiply_multiply_xdl_int8 * Remove shell scripts * update preprocess number for mi308; bring back printout in ckprofiler * format --------- Co-authored-by: chenjun Co-authored-by: Haocong WANG Co-authored-by: carlushuang --- .../65_gemm_multiply_multiply/CMakeLists.txt | 1 + .../gemm_multiply_multiply_xdl_int8.cpp | 304 ++++++++++++++++++ include/ck/host_utility/flush_cache.hpp | 55 +++- .../gpu/element/element_wise_operation.hpp | 20 ++ include/ck/utility/amd_xdlops.hpp | 12 +- .../gpu/gemm_multiply_multiply.hpp | 105 ++++++ .../gpu/gemm_multiply_multiply/CMakeLists.txt | 10 + ...tiply_multiply_xdl_i8_i8_bf16_mk_nk_mn.hpp | 99 ++++++ ...i8_bf16_mk_nk_mn_comp_default_instance.cpp | 32 ++ ...8_bf16_mk_nk_mn_comp_kpadding_instance.cpp | 32 ++ ..._bf16_mk_nk_mn_mem_v1_default_instance.cpp | 33 ++ ...bf16_mk_nk_mn_mem_v1_kpadding_instance.cpp | 33 ++ ..._bf16_mk_nk_mn_mem_v2_default_instance.cpp | 33 ++ ...bf16_mk_nk_mn_mem_v2_kpadding_instance.cpp | 33 ++ .../profile_gemm_multiply_multiply_impl.hpp | 10 +- .../src/profile_gemm_multiply_multiply.cpp | 10 +- 16 files changed, 794 insertions(+), 28 deletions(-) create mode 100644 example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_int8.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_i8_i8_bf16/device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn.hpp create mode 100644 
library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_i8_i8_bf16/device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_comp_default_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_i8_i8_bf16/device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_comp_kpadding_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_i8_i8_bf16/device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_mem_v1_default_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_i8_i8_bf16/device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_mem_v1_kpadding_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_i8_i8_bf16/device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_mem_v2_default_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_i8_i8_bf16/device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_mem_v2_kpadding_instance.cpp diff --git a/example/65_gemm_multiply_multiply/CMakeLists.txt b/example/65_gemm_multiply_multiply/CMakeLists.txt index d39114013..55c884246 100644 --- a/example/65_gemm_multiply_multiply/CMakeLists.txt +++ b/example/65_gemm_multiply_multiply/CMakeLists.txt @@ -1,3 +1,4 @@ add_example_executable(example_gemm_multiply_multiply_xdl_fp8 gemm_multiply_multiply_xdl_fp8.cpp) add_example_executable(example_gemm_multiply_multiply_xdl_fp8_ab_scale gemm_multiply_multiply_xdl_fp8_ab_scale.cpp) add_example_executable(example_gemm_add_add_xdl_fp16 gemm_add_add_xdl_fp16.cpp) +add_example_executable(example_gemm_multiply_multiply_xdl_int8 gemm_multiply_multiply_xdl_int8.cpp) \ No newline at end of file diff --git 
a/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_int8.cpp b/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_int8.cpp new file mode 100644 index 000000000..fb1642bba --- /dev/null +++ b/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_int8.cpp @@ -0,0 +1,304 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle_v3.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp" + +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" +#include "ck/library/utility/check_err.hpp" + +#include "ck/utility/blkgemmpipe_scheduler.hpp" + +template +using S = ck::Sequence; + +using I8 = int8_t; +using I32 = int; +using F16 = ck::half_t; +using FP8 = ck::f8_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +using A0DataType = I8; +using B0DataType = I8; +using AccDataType = I32; +using CShuffleDataType = I32; +using D0DataType = F32; +using D1DataType = F32; +using DsDataType = ck::Tuple; +using EDataType = F16; + +using A0Layout = Row; +using B0Layout = Col; +using D0Layout = Row; +using D1Layout = Col; +using DsLayout = ck::Tuple; +using ELayout = Row; + +struct MultiplyMultiply +{ + template + __host__ __device__ constexpr void + operator()(E& e, const C& c, const D0& d0, const D1& d1) const; + + template <> + __host__ __device__ constexpr void operator()( + ck::half_t& e, const float& c, const 
float& d0, const float& d1) const + { + const float x0_f = c * d0 * d1; + + e = ck::type_convert(x0_f); + } + + template <> + __host__ __device__ constexpr void operator()( + ck::half_t& e, const int& c, const float& d0, const float& d1) const + { + const float x0_f = + ck::type_convert(c) * ck::type_convert(d0) * ck::type_convert(d1); + + e = ck::type_convert(x0_f); + } + + template <> + __host__ __device__ constexpr void operator()( + ck::bhalf_t& e, const int& c, const float& d0, const float& d1) const + { + const float x0_f = + ck::type_convert(c) * ck::type_convert(d0) * ck::type_convert(d1); + + e = ck::type_convert(x0_f); + } +}; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +using AElementOp = PassThrough; +using BElementOp = PassThrough; +using CDEElementOp = MultiplyMultiply; + +static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNPadding; + +using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultiD_Xdl_CShuffle_V3 + // clang-format off +///######| ALayout| BLayout| DsLayout| ELayout| AData| BData| DsData| EData| AccData| CShuffle| A| B| CDE| GEMM| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| +///######| | | | | Type| Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| Spacialization| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| +///######| | | | | | | | | | | Operation| Operation| Operation| | | | | | | | | | Wave| Wave| 
Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| +///######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | S| +///###### RRR + ///< Row, Row, DsLayout, ELayout, A0DataType, B0DataType, DsDataType, EDataType, AccDataType, CShuffleDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 256, 256, 128, 64, 16, 4, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 32, 1, 8>, S<8, 8, 1>, ck::BlockGemmPipelineScheduler::Interwave, ck::BlockGemmPipelineVersion::v1, I8>; +///###### RCR + < Row, Col, DsLayout, ELayout, A0DataType, B0DataType, DsDataType, EDataType, AccDataType, CShuffleDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 256, 256, 128, 64, 16, 16, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, S<8, 8, 1>, ck::BlockGemmPipelineScheduler::Interwave, ck::BlockGemmPipelineVersion::v1, I8>; +// clang-format on + +int main(int argc, char* argv[]) +{ + bool do_verification = true; + int init_method = 1; + bool time_kernel = false; + + // GEMM shape + ck::index_t M = 3840; + ck::index_t N = 4096; + ck::index_t K = 4096; + + ck::index_t StrideA = K; + ck::index_t StrideB = K; + ck::index_t StrideD = 0; + ck::index_t StrideE = N; + + ck::index_t KBatch = 1; + + if(argc == 1) + { + // use default case + } + else if(argc == 4) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = std::stoi(argv[3]); + } + else if(argc == 12) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = std::stoi(argv[3]); + + M = std::stoi(argv[4]); + N = std::stoi(argv[5]); + K = std::stoi(argv[6]); + + StrideA = std::stoi(argv[7]); + StrideB = std::stoi(argv[8]); + 
StrideD = std::stoi(argv[9]); + StrideE = std::stoi(argv[10]); + + KBatch = std::stoi(argv[11]); + } + else + { + printf("arg1: verification (0=no, 1=yes)\n"); + printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"); + printf("arg3: time kernel (0=no, 1=yes)\n"); + printf( + "arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideD, StrideE, KBatch\n"); + exit(0); + } + do_verification = false; + auto f_host_tensor_descriptor = + [](std::size_t row, std::size_t col, std::size_t stride, auto layout) { + using namespace ck::literals; + + if(std::is_same::value) + { + return HostTensorDescriptor({row, col}, {stride, 1_uz}); + } + else + { + return HostTensorDescriptor({row, col}, {1_uz, stride}); + } + }; + + Tensor a0_m_k(f_host_tensor_descriptor(M, K, StrideA, A0Layout{})); + Tensor b0_k_n(f_host_tensor_descriptor(K, N, StrideB, B0Layout{})); + Tensor d0_m_n(f_host_tensor_descriptor(M, N, StrideD, D0Layout{})); + Tensor d1_m_n(f_host_tensor_descriptor(M, N, StrideD, D1Layout{})); + Tensor e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{})); + Tensor e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{})); + + std::cout << "a0_m_k: " << a0_m_k.mDesc << std::endl; + std::cout << "b0_k_n: " << b0_k_n.mDesc << std::endl; + std::cout << "d1_m_n: " << d1_m_n.mDesc << std::endl; + std::cout << "d0_m_n: " << d0_m_n.mDesc << std::endl; + std::cout << "e_m_n: " << e_m_n_host_result.mDesc << std::endl; + + switch(init_method) + { + case 0: break; + case 1: + a0_m_k.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + b0_k_n.GenerateTensorValue(GeneratorTensor_2{0, 2}); + d0_m_n.GenerateTensorValue(GeneratorTensor_2{0, 2}); + d1_m_n.GenerateTensorValue(GeneratorTensor_2{0, 2}); + break; + default: + a0_m_k.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + b0_k_n.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + d0_m_n.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + 
d1_m_n.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + } + + DeviceMem a0_device_buf(sizeof(A0DataType) * a0_m_k.mDesc.GetElementSpaceSize()); + DeviceMem b0_device_buf(sizeof(B0DataType) * b0_k_n.mDesc.GetElementSpaceSize()); + DeviceMem d0_device_buf(sizeof(D0DataType) * d0_m_n.mDesc.GetElementSpaceSize()); + DeviceMem d1_device_buf(sizeof(D1DataType) * d1_m_n.mDesc.GetElementSpaceSize()); + DeviceMem e_device_buf(sizeof(EDataType) * e_m_n_device_result.mDesc.GetElementSpaceSize()); + + a0_device_buf.ToDevice(a0_m_k.mData.data()); + b0_device_buf.ToDevice(b0_k_n.mData.data()); + d0_device_buf.ToDevice(d0_m_n.mData.data()); + d1_device_buf.ToDevice(d1_m_n.mData.data()); + e_device_buf.ToDevice(e_m_n_device_result.mData.data()); + + auto a_element_op = AElementOp{}; + auto b_element_op = BElementOp{}; + auto cde_element_op = CDEElementOp{}; + + constexpr ck::index_t NumDTensor = DsDataType::Size(); + + constexpr auto I0 = ck::Number<0>{}; + + // do GEMM + auto device_op = DeviceOpInstance{}; + auto invoker = device_op.MakeInvoker(); + auto argument = + device_op.MakeArgument(a0_device_buf.GetDeviceBuffer(), + b0_device_buf.GetDeviceBuffer(), + std::array{d0_device_buf.GetDeviceBuffer(), + d1_device_buf.GetDeviceBuffer()}, + e_device_buf.GetDeviceBuffer(), + M, + N, + K, + StrideA, + StrideB, + std::array{I0, I0}, + StrideE, + KBatch, + a_element_op, + b_element_op, + cde_element_op); + + if(!device_op.IsSupportedArgument(argument)) + { + throw std::runtime_error( + "wrong! 
device_gemm with the specified compilation parameters does " + "not support this GEMM problem"); + } + + float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel, 20, 50}); + + std::size_t flop = std::size_t(2) * M * N * K; + std::size_t num_btype = + sizeof(A0DataType) * M * K + sizeof(B0DataType) * K * N + sizeof(EDataType) * M * N; + + float tflops = static_cast(flop) / 1.E9 / ave_time; + + float gb_per_sec = num_btype / 1.E6 / ave_time; + + std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s" + << std::endl; + + if(do_verification) + { + invoker.Run(argument, StreamConfig{nullptr, false}); + + e_device_buf.FromDevice(e_m_n_device_result.mData.data()); + + Tensor c_m_n({M, N}); + + using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm; + auto ref_gemm = ReferenceGemmInstance{}; + auto ref_invoker = ref_gemm.MakeInvoker(); + + auto ref_argument = ref_gemm.MakeArgument( + a0_m_k, b0_k_n, c_m_n, PassThrough{}, PassThrough{}, PassThrough{}); + + ref_invoker.Run(ref_argument); + + for(int m = 0; m < M; ++m) + { + for(int n = 0; n < N; ++n) + { + cde_element_op(e_m_n_host_result(m, n), c_m_n(m, n), d0_m_n(m, n), d1_m_n(m, n)); + } + } + + e_device_buf.FromDevice(e_m_n_device_result.mData.data()); + + return ck::utils::check_err(e_m_n_device_result, e_m_n_host_result) ? 0 : 1; + } + + return 0; +} diff --git a/include/ck/host_utility/flush_cache.hpp b/include/ck/host_utility/flush_cache.hpp index 63fa365cc..918fb28ea 100644 --- a/include/ck/host_utility/flush_cache.hpp +++ b/include/ck/host_utility/flush_cache.hpp @@ -237,7 +237,7 @@ float launch_and_time_kernel_with_preprocess(const StreamConfig& stream_config, Args... 
args) { #if CK_TIME_KERNEL -#define MEDIAN 1 +#define MEDIAN 0 if(stream_config.time_kernel_) { if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING))) @@ -275,6 +275,14 @@ float launch_and_time_kernel_with_preprocess(const StreamConfig& stream_config, #else float total_time = 0; #endif + hipEvent_t start, stop; + + hip_check_error(hipEventCreate(&start)); + hip_check_error(hipEventCreate(&stop)); + + hip_check_error(hipDeviceSynchronize()); + hip_check_error(hipEventRecord(start, stream_config.stream_id_)); + for(int i = 0; i < nrepeat; ++i) { if constexpr(!TimePreprocess) @@ -282,13 +290,13 @@ float launch_and_time_kernel_with_preprocess(const StreamConfig& stream_config, preprocess(); } - hipEvent_t start, stop; + // hipEvent_t start, stop; - hip_check_error(hipEventCreate(&start)); - hip_check_error(hipEventCreate(&stop)); + // hip_check_error(hipEventCreate(&start)); + // hip_check_error(hipEventCreate(&stop)); - hip_check_error(hipDeviceSynchronize()); - hip_check_error(hipEventRecord(start, stream_config.stream_id_)); + // hip_check_error(hipDeviceSynchronize()); + // hip_check_error(hipEventRecord(start, stream_config.stream_id_)); // calculate preprocess time if constexpr(TimePreprocess) { @@ -299,25 +307,34 @@ float launch_and_time_kernel_with_preprocess(const StreamConfig& stream_config, hip_check_error(hipGetLastError()); // end real kernel - hip_check_error(hipEventRecord(stop, stream_config.stream_id_)); - hip_check_error(hipEventSynchronize(stop)); - float cur_time = 0; - hip_check_error(hipEventElapsedTime(&cur_time, start, stop)); -#if MEDIAN - times.insert(cur_time); -#else - total_time += cur_time; -#endif + // hip_check_error(hipEventRecord(stop, stream_config.stream_id_)); + // hip_check_error(hipEventSynchronize(stop)); + // float cur_time = 0; + // hip_check_error(hipEventElapsedTime(&cur_time, start, stop)); + // #if MEDIAN + // times.insert(cur_time); + // #else + // total_time += cur_time; + // #endif if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING))) { - 
std::cout << "i: " << i << " cur_time: " << cur_time << std::endl; + // std::cout << "i: " << i << " cur_time: " << cur_time << std::endl; printf("gemm_args.p_a_grid: %p, gemm_args.p_b_grid:%p\n", static_cast(gemm_args.p_a_grid), static_cast(gemm_args.p_b_grid)); } } + hip_check_error(hipEventRecord(stop, stream_config.stream_id_)); + hip_check_error(hipEventSynchronize(stop)); + float cur_time = 0; + hip_check_error(hipEventElapsedTime(&cur_time, start, stop)); +#if MEDIAN + times.insert(cur_time); +#else + total_time += cur_time; +#endif #if MEDIAN auto mid = times.begin(); @@ -333,7 +350,11 @@ float launch_and_time_kernel_with_preprocess(const StreamConfig& stream_config, return (*mid + *mid_next) / 2; } #else - return total_time / nrepeat; + // return total_time / nrepeat; + hipDeviceProp_t deviceProps; + hip_check_error(hipGetDeviceProperties(&deviceProps, 0)); + float preprocess_offset = deviceProps.multiProcessorCount == 80 ? 0.005 : 0.01; + return (total_time - preprocess_offset * nrepeat) / nrepeat; #endif } else diff --git a/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp index 9c60121c8..135eaec93 100644 --- a/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp +++ b/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp @@ -272,6 +272,26 @@ struct MultiplyMultiply e = ck::type_convert(x0_f); } + + template <> + __host__ __device__ constexpr void operator()( + ck::half_t& e, const int& c, const ck::half_t& d0, const ck::half_t& d1) const + { + const float x0_f = + ck::type_convert(c) * ck::type_convert(d0) * ck::type_convert(d1); + + e = ck::type_convert(x0_f); + } + + template <> + __host__ __device__ constexpr void operator()( + ck::bhalf_t& e, const int& c, const float& d0, const float& d1) const + { + const float x0_f = + ck::type_convert(c) * ck::type_convert(d0) * ck::type_convert(d1); + + e = ck::type_convert(x0_f); + } }; struct 
MultiplyAddFastGelu diff --git a/include/ck/utility/amd_xdlops.hpp b/include/ck/utility/amd_xdlops.hpp index d8ccb2ea7..a955279bc 100644 --- a/include/ck/utility/amd_xdlops.hpp +++ b/include/ck/utility/amd_xdlops.hpp @@ -327,12 +327,12 @@ struct intrin_mfma_i32_16x16x32i8<16, 16> __device__ static void Run(const int8x8_t& reg_a, const int8x8_t& reg_b, FloatC& reg_c) { reg_c.template AsType()(Number<0>{}) = - __builtin_amdgcn_mfma_i32_16x16x32i8(bit_cast(reg_a), - bit_cast(reg_b), - reg_c.template AsType()[Number<0>{}], - 0, - 0, - 0); + __builtin_amdgcn_mfma_i32_16x16x32_i8(bit_cast(reg_a), + bit_cast(reg_b), + reg_c.template AsType()[Number<0>{}], + 0, + 0, + 0); } }; diff --git a/library/include/ck/library/tensor_operation_instance/gpu/gemm_multiply_multiply.hpp b/library/include/ck/library/tensor_operation_instance/gpu/gemm_multiply_multiply.hpp index 2077f904d..b6aa61277 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/gemm_multiply_multiply.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm_multiply_multiply.hpp @@ -96,6 +96,87 @@ void add_device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn_mem_v2_kpadding_i MultiplyMultiply>>>& instances); #endif +#if(defined(CK_ENABLE_BF16) || defined(CK_ENABLE_INT8)) +void add_device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_comp_default_instances( + std::vector, + Row, + I8, + I8, + Tuple, + BF16, + PassThrough, + PassThrough, + MultiplyMultiply>>>& instances); + +void add_device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_comp_kpadding_instances( + std::vector, + Row, + I8, + I8, + Tuple, + BF16, + PassThrough, + PassThrough, + MultiplyMultiply>>>& instances); + +void add_device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_mem_v1_default_instances( + std::vector, + Row, + I8, + I8, + Tuple, + BF16, + PassThrough, + PassThrough, + MultiplyMultiply>>>& instances); + +void add_device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_mem_v1_kpadding_instances( + std::vector, 
+ Row, + I8, + I8, + Tuple, + BF16, + PassThrough, + PassThrough, + MultiplyMultiply>>>& instances); + +void add_device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_mem_v2_default_instances( + std::vector, + Row, + I8, + I8, + Tuple, + BF16, + PassThrough, + PassThrough, + MultiplyMultiply>>>& instances); + +void add_device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_mem_v2_kpadding_instances( + std::vector, + Row, + I8, + I8, + Tuple, + BF16, + PassThrough, + PassThrough, + MultiplyMultiply>>>& instances); + +#endif + template && is_same_v && + is_same_v) + { + if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_comp_default_instances( + op_ptrs); + add_device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_comp_kpadding_instances( + op_ptrs); + + add_device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_mem_v1_default_instances( + op_ptrs); + add_device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_mem_v1_kpadding_instances( + op_ptrs); + + add_device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_mem_v2_default_instances( + op_ptrs); + add_device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_mem_v2_kpadding_instances( + op_ptrs); + } + } #endif return op_ptrs; } diff --git a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/CMakeLists.txt index 5e56aebcf..0107c3dec 100644 --- a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/CMakeLists.txt @@ -8,9 +8,19 @@ list(APPEND GEMM_MULTIPLY_MULTIPLY_INSTANCES device_gemm_multiply_multiply_xdl_f8_f8_bf16/device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn_mem_v1_kpadding_instance.cpp device_gemm_multiply_multiply_xdl_f8_f8_bf16/device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn_mem_v2_default_instance.cpp 
device_gemm_multiply_multiply_xdl_f8_f8_bf16/device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn_mem_v2_kpadding_instance.cpp + + device_gemm_multiply_multiply_xdl_i8_i8_bf16/device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_comp_default_instance.cpp + device_gemm_multiply_multiply_xdl_i8_i8_bf16/device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_comp_kpadding_instance.cpp + device_gemm_multiply_multiply_xdl_i8_i8_bf16/device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_mem_v1_default_instance.cpp + device_gemm_multiply_multiply_xdl_i8_i8_bf16/device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_mem_v1_kpadding_instance.cpp + device_gemm_multiply_multiply_xdl_i8_i8_bf16/device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_mem_v2_default_instance.cpp + device_gemm_multiply_multiply_xdl_i8_i8_bf16/device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_mem_v2_kpadding_instance.cpp ) set_source_files_properties(device_gemm_multiply_multiply_xdl_f8_f8_bf16/device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn_comp_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1") set_source_files_properties(device_gemm_multiply_multiply_xdl_f8_f8_bf16/device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn_comp_kpadding_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1") +set_source_files_properties(device_gemm_multiply_multiply_xdl_i8_i8_bf16/device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_comp_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1") +set_source_files_properties(device_gemm_multiply_multiply_xdl_i8_i8_bf16/device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_comp_kpadding_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1") + add_instance_library(device_gemm_multiply_multiply_instance ${GEMM_MULTIPLY_MULTIPLY_INSTANCES}) diff --git 
a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_i8_i8_bf16/device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_i8_i8_bf16/device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn.hpp new file mode 100644 index 000000000..2d4c37199 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_i8_i8_bf16/device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn.hpp @@ -0,0 +1,99 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle_v3.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using I8 = int8_t; +using I32 = int; +using BF16 = bhalf_t; +using F32 = float; + +using Row = tensor_layout::gemm::RowMajor; +using Col = tensor_layout::gemm::ColumnMajor; + +template +using S = Sequence; + +using PassThrough = element_wise::PassThrough; +using MultiplyMultiply = element_wise::MultiplyMultiply; + +static constexpr auto GemmDefault = GemmSpecialization::Default; +static constexpr auto GemmKPadding = GemmSpecialization::KPadding; +static constexpr auto GemmMNPadding = GemmSpecialization::MNPadding; +static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding; + +static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave; +static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave; + +template +using device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_comp_instances = std::tuple< + // clang-format off + 
//################################| ALayout| BLayout| DsLayout| ELayout|AData| BData| DsData| EData| AccData| Cshuffle| A| B| C| GEMM| Block| MPer| NPer| KPer| AK1| BK1|MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| Block-wiseGemm| Block-wiseGemm| + //################################| | | | | Type| Type| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise|Specialization| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| Pipeline| Pipeline| + //################################| | | | | | | | | | | Operation| Operation| Operation| | | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| Scheduler| Verision| + //################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + + // Compute friendly + DeviceGemmMultiD_Xdl_CShuffle_V3< Row, Col, Tuple, Row, I8, I8, Tuple, BF16, I32, I32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 256, 256, 64, 16, 16, 32, 32, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, S<8, 8, 1>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4, I8>, + DeviceGemmMultiD_Xdl_CShuffle_V3< Row, Col, Tuple, Row, I8, I8, Tuple, BF16, I32, I32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 
256, 128, 128, 128, 16, 16, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, S<8, 8, 1>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4, I8>, + DeviceGemmMultiD_Xdl_CShuffle_V3< Row, Col, Tuple, Row, I8, I8, Tuple, BF16, I32, I32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 128, 128, 64, 16, 16, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, S<8, 8, 1>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4, I8>, + DeviceGemmMultiD_Xdl_CShuffle_V3< Row, Col, Tuple, Row, I8, I8, Tuple, BF16, I32, I32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 256, 256, 128, 16, 16, 16, 16, 8, 8, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 2, S<1, 32, 1, 8>, S<8, 8, 1>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, I8>, + DeviceGemmMultiD_Xdl_CShuffle_V3< Row, Col, Tuple, Row, I8, I8, Tuple, BF16, I32, I32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 256, 256, 64, 16, 16, 16, 16, 8, 8, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 2, S<1, 32, 1, 8>, S<8, 8, 1>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, I8>, + DeviceGemmMultiD_Xdl_CShuffle_V3< Row, Col, Tuple, Row, I8, I8, Tuple, BF16, I32, I32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 224, 256, 128, 16, 16, 16, 16, 7, 8, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 2, S<1, 32, 1, 8>, S<8, 8, 1>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, I8>, + DeviceGemmMultiD_Xdl_CShuffle_V3< Row, Col, Tuple, Row, I8, I8, Tuple, BF16, I32, I32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 256, 224, 128, 16, 
16, 16, 16, 8, 7, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 2, 1, S<1, 64, 1, 4>, S<8, 8, 1>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, I8>, + DeviceGemmMultiD_Xdl_CShuffle_V3< Row, Col, Tuple, Row, I8, I8, Tuple, BF16, I32, I32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 128, 128, 128, 16, 16, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, S<8, 8, 1>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, I8>, + DeviceGemmMultiD_Xdl_CShuffle_V3< Row, Col, Tuple, Row, I8, I8, Tuple, BF16, I32, I32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 128, 128, 128, 16, 16, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, S<8, 8, 1>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5, I8>, + DeviceGemmMultiD_Xdl_CShuffle_V3< Row, Col, Tuple, Row, I8, I8, Tuple, BF16, I32, I32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 128, 256, 64, 16, 16, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, S<8, 8, 1>, BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1, I8>, + DeviceGemmMultiD_Xdl_CShuffle_V3< Row, Col, Tuple, Row, I8, I8, Tuple, BF16, I32, I32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 256, 128, 64, 16, 16, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, S<8, 8, 1>, BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1, I8>, + DeviceGemmMultiD_Xdl_CShuffle_V3< Row, Col, Tuple, Row, I8, I8, Tuple, BF16, I32, I32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 128, 128, 128, 16, 16, 32, 32, 2, 2, S<8, 32, 
1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, S<8, 8, 1>, BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1, I8>, + DeviceGemmMultiD_Xdl_CShuffle_V3< Row, Col, Tuple, Row, I8, I8, Tuple, BF16, I32, I32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 128, 64, 128, 16, 16, 32, 32, 2, 1, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, S<8, 8, 1>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, I8>, + DeviceGemmMultiD_Xdl_CShuffle_V3< Row, Col, Tuple, Row, I8, I8, Tuple, BF16, I32, I32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 64, 128, 128, 16, 16, 32, 32, 1, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, S<8, 8, 1>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, I8>, + DeviceGemmMultiD_Xdl_CShuffle_V3< Row, Col, Tuple, Row, I8, I8, Tuple, BF16, I32, I32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 64, 64, 128, 16, 16, 32, 32, 1, 1, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, S<8, 8, 1>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, I8> + // clang-format on + >; + +template +using device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_mem_instances = std::tuple< + // clang-format off + //################################| ALayout| BLayout| DsLayout| ELayout|AData| BData| DsData| EData| AccData| Cshuffle| A| B| C| GEMM| Block| MPer| NPer| KPer| AK1| BK1|MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| 
CBlockTransfer| Block-wiseGemm| Block-wiseGemm| + //################################| | | | | Type| Type| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise|Specialization| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| Pipeline| Pipeline| + //################################| | | | | | | | | | | Operation| Operation| Operation| | | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| Scheduler| Verision| + //################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + + // Latency friendly + DeviceGemmMultiD_Xdl_CShuffle_V3< Row, Col, Tuple, Row, I8, I8, Tuple, BF16, I32, I32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 128, 32, 16, 128, 16, 16, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, S<2, 2, 1>, BlkGemmPipeSched, BlockGemmPipelineVersion::v1, I8>, + DeviceGemmMultiD_Xdl_CShuffle_V3< Row, Col, Tuple, Row, I8, I8, Tuple, BF16, I32, I32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 64, 16, 16, 128, 16, 16, 16, 16, 1, 1, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 4>, S<4, 4, 1>, BlkGemmPipeSched, BlockGemmPipelineVersion::v1, I8>, + DeviceGemmMultiD_Xdl_CShuffle_V3< Row, Col, Tuple, Row, I8, I8, Tuple, BF16, I32, I32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 128, 16, 32, 128, 16, 16, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 16, 1>, S<1, 
0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, S<4, 4, 1>, BlkGemmPipeSched, BlockGemmPipelineVersion::v1, I8>, + // Memory friendly + DeviceGemmMultiD_Xdl_CShuffle_V3< Row, Col, Tuple, Row, I8, I8, Tuple, BF16, I32, I32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 256, 32, 128, 16, 16, 32, 32, 2, 1, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, S<4, 4, 1>, BlkGemmPipeSched, BlockGemmPipelineVersion::v2, I8>, + DeviceGemmMultiD_Xdl_CShuffle_V3< Row, Col, Tuple, Row, I8, I8, Tuple, BF16, I32, I32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 256, 16, 128, 16, 16, 16, 16, 4, 1, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, S<2, 2, 1>, BlkGemmPipeSched, BlockGemmPipelineVersion::v2, I8>, + DeviceGemmMultiD_Xdl_CShuffle_V3< Row, Col, Tuple, Row, I8, I8, Tuple, BF16, I32, I32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 128, 128, 32, 128, 16, 16, 32, 32, 2, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, S<4, 4, 1>, BlkGemmPipeSched, BlockGemmPipelineVersion::v2, I8>, + DeviceGemmMultiD_Xdl_CShuffle_V3< Row, Col, Tuple, Row, I8, I8, Tuple, BF16, I32, I32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 128, 128, 16, 128, 16, 16, 16, 16, 4, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, S<2, 2, 1>, BlkGemmPipeSched, BlockGemmPipelineVersion::v2, I8>, + DeviceGemmMultiD_Xdl_CShuffle_V3< Row, Col, Tuple, Row, I8, I8, Tuple, BF16, I32, I32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 128, 64, 32, 128, 16, 16, 32, 32, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, S<4, 4, 1>, BlkGemmPipeSched, BlockGemmPipelineVersion::v2, 
I8>, + DeviceGemmMultiD_Xdl_CShuffle_V3< Row, Col, Tuple, Row, I8, I8, Tuple, BF16, I32, I32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 128, 64, 16, 128, 16, 16, 16, 16, 2, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, S<2, 2, 1>, BlkGemmPipeSched, BlockGemmPipelineVersion::v2, I8>, + DeviceGemmMultiD_Xdl_CShuffle_V3< Row, Col, Tuple, Row, I8, I8, Tuple, BF16, I32, I32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 128, 32, 16, 128, 16, 16, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, S<2, 2, 1>, BlkGemmPipeSched, BlockGemmPipelineVersion::v2, I8>, + DeviceGemmMultiD_Xdl_CShuffle_V3< Row, Col, Tuple, Row, I8, I8, Tuple, BF16, I32, I32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 64, 16, 16, 64, 16, 16, 16, 16, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 4>, S<4, 4, 1>, BlkGemmPipeSched, BlockGemmPipelineVersion::v2, I8>, + DeviceGemmMultiD_Xdl_CShuffle_V3< Row, Col, Tuple, Row, I8, I8, Tuple, BF16, I32, I32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 64, 16, 16, 128, 16, 16, 16, 16, 1, 1, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 4>, S<4, 4, 1>, BlkGemmPipeSched, BlockGemmPipelineVersion::v2, I8>, + DeviceGemmMultiD_Xdl_CShuffle_V3< Row, Col, Tuple, Row, I8, I8, Tuple, BF16, I32, I32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 128, 16, 32, 128, 16, 16, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, S<4, 4, 1>, BlkGemmPipeSched, BlockGemmPipelineVersion::v2, I8>, + DeviceGemmMultiD_Xdl_CShuffle_V3< Row, Col, Tuple, Row, I8, I8, Tuple, BF16, I32, I32, PassThrough, PassThrough, MultiplyMultiply, 
GemmSpec, 128, 16, 64, 128, 16, 16, 16, 16, 1, 2, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, S<4, 4, 1>, BlkGemmPipeSched, BlockGemmPipelineVersion::v2, I8>, + DeviceGemmMultiD_Xdl_CShuffle_V3< Row, Col, Tuple, Row, I8, I8, Tuple, BF16, I32, I32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 128, 32, 64, 128, 16, 16, 32, 32, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, S<8, 8, 1>, BlkGemmPipeSched, BlockGemmPipelineVersion::v2, I8>, + DeviceGemmMultiD_Xdl_CShuffle_V3< Row, Col, Tuple, Row, I8, I8, Tuple, BF16, I32, I32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 128, 16, 128, 128, 16, 16, 16, 16, 1, 4, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, S<4, 4, 1>, BlkGemmPipeSched, BlockGemmPipelineVersion::v2, I8>, + DeviceGemmMultiD_Xdl_CShuffle_V3< Row, Col, Tuple, Row, I8, I8, Tuple, BF16, I32, I32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 128, 32, 128, 128, 16, 16, 32, 32, 1, 2, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, S<8, 8, 1>, BlkGemmPipeSched, BlockGemmPipelineVersion::v2, I8>, + DeviceGemmMultiD_Xdl_CShuffle_V3< Row, Col, Tuple, Row, I8, I8, Tuple, BF16, I32, I32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 16, 256, 128, 16, 16, 16, 16, 1, 4, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 16>, S<4, 4, 1>, BlkGemmPipeSched, BlockGemmPipelineVersion::v2, I8>, + DeviceGemmMultiD_Xdl_CShuffle_V3< Row, Col, Tuple, Row, I8, I8, Tuple, BF16, I32, I32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 32, 256, 128, 16, 16, 32, 32, 1, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 
2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 16>, S<8, 8, 1>, BlkGemmPipeSched, BlockGemmPipelineVersion::v2, I8> + // clang-format on + >; +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_i8_i8_bf16/device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_comp_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_i8_i8_bf16/device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_comp_default_instance.cpp new file mode 100644 index 000000000..09ee08dd6 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_i8_i8_bf16/device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_comp_default_instance.cpp @@ -0,0 +1,32 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_comp_default_instances( + std::vector, + Row, + I8, + I8, + Tuple, + BF16, + PassThrough, + PassThrough, + MultiplyMultiply>>>& instances) +{ + add_device_operation_instances( + instances, + device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_comp_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_i8_i8_bf16/device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_comp_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_i8_i8_bf16/device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_comp_kpadding_instance.cpp new file mode 100644 index 000000000..e18262108 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_i8_i8_bf16/device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_comp_kpadding_instance.cpp @@ -0,0 +1,32 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_comp_kpadding_instances( + std::vector, + Row, + I8, + I8, + Tuple, + BF16, + PassThrough, + PassThrough, + MultiplyMultiply>>>& instances) +{ + add_device_operation_instances( + instances, + device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_comp_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_i8_i8_bf16/device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_mem_v1_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_i8_i8_bf16/device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_mem_v1_default_instance.cpp new file mode 100644 index 000000000..173bd4dcb --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_i8_i8_bf16/device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_mem_v1_default_instance.cpp @@ -0,0 +1,33 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_mem_v1_default_instances( + std::vector, + Row, + I8, + I8, + Tuple, + BF16, + PassThrough, + PassThrough, + MultiplyMultiply>>>& instances) +{ + add_device_operation_instances( + instances, + device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_mem_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_i8_i8_bf16/device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_mem_v1_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_i8_i8_bf16/device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_mem_v1_kpadding_instance.cpp new file mode 100644 index 000000000..6aa427433 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_i8_i8_bf16/device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_mem_v1_kpadding_instance.cpp @@ -0,0 +1,33 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_mem_v1_kpadding_instances( + std::vector, + Row, + I8, + I8, + Tuple, + BF16, + PassThrough, + PassThrough, + MultiplyMultiply>>>& instances) +{ + add_device_operation_instances( + instances, + device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_mem_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_i8_i8_bf16/device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_mem_v2_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_i8_i8_bf16/device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_mem_v2_default_instance.cpp new file mode 100644 index 000000000..5797f0c8b --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_i8_i8_bf16/device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_mem_v2_default_instance.cpp @@ -0,0 +1,33 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_mem_v2_default_instances( + std::vector, + Row, + I8, + I8, + Tuple, + BF16, + PassThrough, + PassThrough, + MultiplyMultiply>>>& instances) +{ + add_device_operation_instances( + instances, + device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_mem_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_i8_i8_bf16/device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_mem_v2_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_i8_i8_bf16/device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_mem_v2_kpadding_instance.cpp new file mode 100644 index 000000000..7dc8440bf --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_i8_i8_bf16/device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_mem_v2_kpadding_instance.cpp @@ -0,0 +1,33 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_mem_v2_kpadding_instances( + std::vector, + Row, + I8, + I8, + Tuple, + BF16, + PassThrough, + PassThrough, + MultiplyMultiply>>>& instances) +{ + add_device_operation_instances( + instances, + device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_mem_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/profiler/include/profiler/profile_gemm_multiply_multiply_impl.hpp b/profiler/include/profiler/profile_gemm_multiply_multiply_impl.hpp index 7dd7b041e..29a645e9d 100644 --- a/profiler/include/profiler/profile_gemm_multiply_multiply_impl.hpp +++ b/profiler/include/profiler/profile_gemm_multiply_multiply_impl.hpp @@ -271,10 +271,12 @@ bool profile_gemm_multiply_multiply_impl(int do_verification, << " TFlops, " << gb_per_sec << " GB/s, " << op_name << ", KBatch " << kbatch_curr << std::endl; -#if defined CK_ENABLE_FP8 +#if defined CK_ENABLE_FP8 || defined CK_ENABLE_INT8 // set softer tolerances for fp8 - if constexpr(is_same_v || is_same_v || - is_same_v) + if constexpr((is_same_v || is_same_v || + is_same_v) || + (is_same_v || is_same_v || + is_same_v)) { std::string msg = "Error: Incorrect results!"; double rtol = 1e-1; @@ -286,7 +288,7 @@ bool profile_gemm_multiply_multiply_impl(int do_verification, { #endif pass = pass & ck::utils::check_err(e_m_n_device_result, e_m_n_host_result); -#if defined CK_ENABLE_FP8 +#if defined CK_ENABLE_FP8 || defined CK_ENABLE_INT8 } #endif diff --git a/profiler/src/profile_gemm_multiply_multiply.cpp b/profiler/src/profile_gemm_multiply_multiply.cpp index b7e80ed79..df87cc815 100644 --- a/profiler/src/profile_gemm_multiply_multiply.cpp +++ b/profiler/src/profile_gemm_multiply_multiply.cpp @@ -27,6 +27,7 @@ enum struct GemmDataType 
F16_F8_F16, // 5 F16_F16_F16_F8, // 6 F8_F8_BF16, // 7 + INT8_INT8_BF16, // 8 }; #define OP_NAME "gemm_multiply_multiply" @@ -39,7 +40,7 @@ int profile_gemm_multiply_multiply(int argc, char* argv[]) printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n"); printf("arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8; 4: f8@f16; 5: f16@f8; 6: " "f16->f8; 7: f8->bf16, " - "comp f8)\n"); + "comp f8; 8: int8->bf16)\n"); printf("arg3: matrix layout (0: A[m, k] * B[k, n] = C[m, n];\n"); printf(" 1: A[m, k] * B[n, k] = C[m, n];\n"); printf(" 2: A[k, m] * B[k, n] = C[m, n];\n"); @@ -89,6 +90,8 @@ int profile_gemm_multiply_multiply(int argc, char* argv[]) using F32 = float; using BF16 = ck::bhalf_t; using F8 = ck::f8_t; + using I8 = int8_t; + using I32 = int; using Row = ck::tensor_layout::gemm::RowMajor; using Col = ck::tensor_layout::gemm::ColumnMajor; @@ -162,6 +165,11 @@ int profile_gemm_multiply_multiply(int argc, char* argv[]) return profile( F8{}, F8{}, F8{}, F32{}, F32{}, F32{}, BF16{}, Row{}, Col{}, Row{}, Col{}, Row{}); } + else if(data_type == GemmDataType::INT8_INT8_BF16 && layout == GemmMatrixLayout::MK_NK_MN) + { + return profile( + I8{}, I8{}, I8{}, I32{}, F32{}, F32{}, BF16{}, Row{}, Col{}, Row{}, Col{}, Row{}); + } else { std::cout << "this data_type & layout is not implemented" << std::endl; -- GitLab From 54f0e6f4bb37f574b703ee22d069d773c0d95dfd Mon Sep 17 00:00:00 2001 From: Po Yen Chen Date: Sat, 26 Oct 2024 18:35:45 +0800 Subject: [PATCH 018/153] [CK_TILE] More fmha splitkv optimizations (#1588) * Use pre-defined constants for readability * Use vector write for o_acc tensor * Remove no-longer used policy method * Deprecate no-longer used policy/pipeline * Specify gemm0/gemm1 block warps separately in codegen * Fix wrong ps_idx creation logic * Add single-warp block gemm * Support single-warp gemm0 * Make MakeCBlockTile() as static method * Use MakeCBlockTile() to get underlying tile distribution * Use kNumGemm1Warps to compute # threads for gemm1 * 
Put normal case in the if clause * Refine fmha splitkv block mapping * Refine & fix the lse_acc/o_acc layout * Fix wrong LDS size for K tile * Use kK0=64 for hdim=128,256 fmha splitkv kernels * Use kK1=64 for hdim=32,64,128 fmha splitkv kernels * Undo kK0/kK1 changes * Use more reasonable GetAlignmentV() computation * Using store_tile() in fmha splitkv kernel epilogue --- .../ck_tile/01_fmha/codegen/ops/fmha_fwd.py | 41 +-- .../01_fmha/codegen/ops/fmha_fwd_splitkv.py | 32 +-- example/ck_tile/01_fmha/fmha_fwd.cpp | 64 ++--- .../ops/fmha/kernel/fmha_fwd_kernel.hpp | 6 +- .../fmha/kernel/fmha_fwd_splitkv_kernel.hpp | 8 +- .../fmha_fwd_splitkv_tile_partitioner.hpp | 9 +- ...ock_fmha_fwd_splitkv_pipeline_qr_ks_vs.hpp | 13 +- ...litkv_pipeline_qr_ks_vs_default_policy.hpp | 21 +- .../pipeline/block_fmha_pipeline_problem.hpp | 14 +- .../pipeline/block_fmha_pipeline_qr_ks_vs.hpp | 10 +- .../block_fmha_pipeline_qr_ks_vs_async.hpp | 10 +- .../pipeline/block_fmha_pipeline_qs_ks_vs.hpp | 3 +- ...k_fmha_pipeline_qx_ks_vs_custom_policy.hpp | 131 ++++------ .../ops/fmha/pipeline/tile_fmha_shape.hpp | 9 +- include/ck_tile/ops/gemm.hpp | 1 + .../block/block_gemm_areg_breg_creg_v1.hpp | 2 +- ...block_gemm_areg_bsmem_creg_one_warp_v1.hpp | 237 ++++++++++++++++++ .../block/block_gemm_areg_bsmem_creg_v1.hpp | 2 +- .../block/block_gemm_areg_bsmem_creg_v2.hpp | 2 +- .../block/block_gemm_asmem_breg_creg_v1.hpp | 2 +- .../block/block_gemm_asmem_bsmem_creg_v1.hpp | 2 +- .../ck_tile/ops/reduce/block/block_reduce.hpp | 2 +- 22 files changed, 422 insertions(+), 199 deletions(-) create mode 100644 include/ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_one_warp_v1.hpp diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py index 860ee20d3..805803fed 100644 --- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py +++ b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py @@ -36,13 +36,12 @@ FMHA_FWD_KERNEL_BODY=""" using fmha_dtype_{F_idx} = 
{F_dtype}; using fmha_block_tile_{F_idx} = ck_tile::sequence<{F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0blen}>; -using fmha_block_warps_{F_idx} = ck_tile::sequence<{F_rm}, {F_rn}, {F_rk}>; using fmha_warp_tile_{F_idx} = ck_tile::sequence<{F_wm}, {F_wn}, {F_wk}>; using fmha_shape_{F_idx} = ck_tile::TileFmhaShape, fmha_warp_tile_{F_idx}, - fmha_block_warps_{F_idx}, + ck_tile::sequence<{F_rm1}, {F_rn1}, {F_rk1}>, fmha_warp_tile_{F_idx}, {F_vlayout}>; @@ -291,9 +290,12 @@ class FmhaFwdTileSize: F_bn1 : int # tile size along v head_dim F_bk1 : int # tile size along kv gemm unroll F_bk0blen : int # total length of K0, used for pipeline that need load Q at once (or repeately load Q as a whole tile) - F_rm : int # number of warps along q seqlen (block warps) - F_rn : int # number of warps along k seqlen(not used) - F_rk : int # number of warps along gemm-k(not used) + F_rm0 : int # number of warps for gemm0 along q seqlen + F_rn0 : int # number of warps for gemm0 along k seqlen + F_rk0 : int # number of warps for gemm0 along head dim q (not used) + F_rm1 : int # number of warps for gemm1 along q seqlen + F_rn1 : int # number of warps for gemm1 along head dim v + F_rk1 : int # number of warps for gemm1 along k seqlen (not used) F_wm : int # warp size along m (warp size) F_wn : int # warp size along n F_wk : int # warp size along k @@ -301,8 +303,8 @@ class FmhaFwdTileSize: @property def name(self) -> str: return f"b{self.F_bm0}x{self.F_bn0}x{self.F_bk0}x{self.F_bn1}x{self.F_bk1}x{self.F_bk0blen}" +\ - f"_r{self.F_rm}x{self.F_rn}x{self.F_rk}_w{self.F_wm}x{self.F_wn}x{self.F_wk}" +\ - ("" if self.F_occupancy == -1 else f"_o{self.F_occupancy}") + f"_r{self.F_rm0}x{self.F_rn0}x{self.F_rk0}_r{self.F_rm1}x{self.F_rn1}x{self.F_rk1}" +\ + f"_w{self.F_wm}x{self.F_wn}x{self.F_wk}" + ("" if self.F_occupancy == -1 else f"_o{self.F_occupancy}") @dataclass class FmhaFwdKernel: @@ -334,9 +336,12 @@ class FmhaFwdKernel: F_bn1 = self.F_tile.F_bn1, F_bk1 = self.F_tile.F_bk1, 
F_bk0blen = self.F_tile.F_bk0blen, - F_rm = self.F_tile.F_rm, - F_rn = self.F_tile.F_rn, - F_rk = self.F_tile.F_rk, + F_rm0 = self.F_tile.F_rm0, + F_rn0 = self.F_tile.F_rn0, + F_rk0 = self.F_tile.F_rk0, + F_rm1 = self.F_tile.F_rm1, + F_rn1 = self.F_tile.F_rn1, + F_rk1 = self.F_tile.F_rk1, F_wm = self.F_tile.F_wm, F_wn = self.F_tile.F_wn, F_wk = self.F_tile.F_wk, @@ -394,16 +399,16 @@ class FmhaFwdKernel: def get_fmha_fwd_tile_dict_from_dtype(dtype : str) -> Optional[dict]: if dtype == 'fp16' or dtype == 'bf16': return { - '32' : FmhaFwdTileSize(128, 64, 16, 32, 32, 32, 2, 1, 1, 32, 32, 16, -1), - '64' : FmhaFwdTileSize(128, 64, 32, 64, 32, 64, 4, 1, 1, 32, 32, 16, -1), - '128' : FmhaFwdTileSize(128, 128, 32, 128, 32, 128, 4, 1, 1, 32, 32, 16, -1), - '256' : FmhaFwdTileSize(128, 128, 32, 256, 32, 256, 4, 1, 1, 32, 32, 16, -1), + '32' : FmhaFwdTileSize(128, 64, 16, 32, 32, 32, 2, 1, 1, 2, 1, 1, 32, 32, 16, -1), + '64' : FmhaFwdTileSize(128, 64, 32, 64, 32, 64, 4, 1, 1, 4, 1, 1, 32, 32, 16, -1), + '128' : FmhaFwdTileSize(128, 128, 32, 128, 32, 128, 4, 1, 1, 4, 1, 1, 32, 32, 16, -1), + '256' : FmhaFwdTileSize(128, 128, 32, 256, 32, 256, 4, 1, 1, 4, 1, 1, 32, 32, 16, -1), } elif dtype == 'fp8' or dtype == 'bf8': return { - '64' : FmhaFwdTileSize(128, 64, 32, 64, 32, 64, 2, 1, 1, 32, 32, 32, -1), - '128' : FmhaFwdTileSize(128, 128, 32, 128, 32, 128, 4, 1, 1, 32, 32, 32, -1), - '256' : FmhaFwdTileSize(128, 128, 32, 256, 32, 256, 4, 1, 1, 32, 32, 32, -1) + '64' : FmhaFwdTileSize(128, 64, 32, 64, 32, 64, 2, 1, 1, 2, 1, 1, 32, 32, 32, -1), + '128' : FmhaFwdTileSize(128, 128, 32, 128, 32, 128, 4, 1, 1, 4, 1, 1, 32, 32, 32, -1), + '256' : FmhaFwdTileSize(128, 128, 32, 256, 32, 256, 4, 1, 1, 4, 1, 1, 32, 32, 32, -1) } else: return None diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py index 57360ea99..46c26b22c 100644 --- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py +++ 
b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py @@ -42,13 +42,12 @@ namespace {{ template struct kernel_runner {{ using fmha_block_tile = ck_tile::sequence<{F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0blen}>; -using fmha_block_warps = ck_tile::sequence<{F_rm}, {F_rn}, {F_rk}>; using fmha_warp_tile = ck_tile::sequence<{F_wm}, {F_wn}, {F_wk}>; using fmha_shape = ck_tile::TileFmhaShape, fmha_warp_tile, - fmha_block_warps, + ck_tile::sequence<{F_rm1}, {F_rn1}, {F_rk1}>, fmha_warp_tile, {F_vlayout}>; @@ -162,10 +161,12 @@ using fmha_pipeline_problem = ck_tile::BlockFmhaSplitKVCombinePipelineProblem< using fmha_pipeline = ck_tile::BlockFmhaFwdSplitKVCombinePipeline< fmha_pipeline_problem>; +/// FIXME: use {F_spad}/{F_dvpad} as kPadM/kPadN parameters after solving +/// store_tile_raw() data corruption issue using fmha_epilogue = ck_tile::Default2DEpilogue::OaccDataType, typename FmhaFwdTypeConfig<{F_dtype}>::ODataType, - {F_spad}, {F_dvpad}>>; + false, false>>; using fmha_kernel = ck_tile::FmhaFwdSplitKVCombineKernel, @@ -458,9 +459,12 @@ class FmhaFwdSplitKVKernel: F_bn1 = self.F_tile.F_bn1, F_bk1 = self.F_tile.F_bk1, F_bk0blen = self.F_tile.F_bk0blen, - F_rm = self.F_tile.F_rm, - F_rn = self.F_tile.F_rn, - F_rk = self.F_tile.F_rk, + F_rm0 = self.F_tile.F_rm0, + F_rn0 = self.F_tile.F_rn0, + F_rk0 = self.F_tile.F_rk0, + F_rm1 = self.F_tile.F_rm1, + F_rn1 = self.F_tile.F_rn1, + F_rk1 = self.F_tile.F_rk1, F_wm = self.F_tile.F_wm, F_wn = self.F_tile.F_wn, F_wk = self.F_tile.F_wk, @@ -553,16 +557,16 @@ class FmhaFwdSplitKVCombineKernel: def get_fmha_fwd_tile_dict_from_dtype(dtype : str) -> Optional[dict]: if dtype == 'fp16' or dtype == 'bf16': return { - '32' : FmhaFwdTileSize(32, 64, 16, 32, 32, 32, 2, 1, 1, 16, 16, 16, -1), - '64' : FmhaFwdTileSize(64, 64, 32, 64, 32, 64, 4, 1, 1, 16, 16, 16, -1), - '128' : FmhaFwdTileSize(64, 128, 32, 128, 32, 128, 4, 1, 1, 16, 16, 16, -1), - '256' : FmhaFwdTileSize(64, 128, 32, 256, 32, 256, 4, 1, 1, 16, 16, 16, -1), + 
'32' : FmhaFwdTileSize(32, 64, 16, 32, 32, 32, 2, 1, 1, 2, 1, 1, 16, 16, 16, -1), + '64' : FmhaFwdTileSize(64, 64, 32, 64, 32, 64, 4, 1, 1, 4, 1, 1, 16, 16, 16, -1), + '128' : FmhaFwdTileSize(64, 128, 32, 128, 32, 128, 4, 1, 1, 4, 1, 1, 16, 16, 16, -1), + '256' : FmhaFwdTileSize(64, 128, 32, 256, 32, 256, 4, 1, 1, 4, 1, 1, 16, 16, 16, -1), } elif dtype == 'fp8' or dtype == 'bf8': return { - '64' : FmhaFwdTileSize(128, 64, 32, 64, 32, 64, 2, 1, 1, 32, 32, 32, -1), - '128' : FmhaFwdTileSize(128, 128, 32, 128, 32, 128, 4, 1, 1, 32, 32, 32, -1), - '256' : FmhaFwdTileSize(128, 128, 32, 256, 32, 256, 4, 1, 1, 32, 32, 32, -1) + '64' : FmhaFwdTileSize(128, 64, 32, 64, 32, 64, 2, 1, 1, 2, 1, 1, 32, 32, 32, -1), + '128' : FmhaFwdTileSize(128, 128, 32, 128, 32, 128, 4, 1, 1, 4, 1, 1, 32, 32, 32, -1), + '256' : FmhaFwdTileSize(128, 128, 32, 256, 32, 256, 4, 1, 1, 4, 1, 1, 32, 32, 32, -1) } else: return None diff --git a/example/ck_tile/01_fmha/fmha_fwd.cpp b/example/ck_tile/01_fmha/fmha_fwd.cpp index 6d519a7ea..14291715f 100644 --- a/example/ck_tile/01_fmha/fmha_fwd.cpp +++ b/example/ck_tile/01_fmha/fmha_fwd.cpp @@ -557,33 +557,16 @@ bool run(const ck_tile::ArgParser& arg_parser) } #endif - struct - { - auto operator()(bool permute, - ck_tile::index_t b /*batch*/, - ck_tile::index_t h /*nhead*/, - ck_tile::index_t s /*seqlen*/, - ck_tile::index_t d /*hdim*/) - { - if(permute) - return std::array{b, h, s, d}; - else - return std::array{b, s, h, d}; - } - - auto operator()(bool permute, - ck_tile::index_t ns /*num_splits*/, - ck_tile::index_t b /*batch*/, - ck_tile::index_t h /*nhead*/, - ck_tile::index_t s /*seqlen*/, - ck_tile::index_t d /*hdim*/) - { - if(permute) - return std::array{ns, b, h, s, d}; - else - return std::array{ns, b, s, h, d}; - } - } get_lengths; + static const auto get_lengths = [](bool permute, + ck_tile::index_t b /*batch*/, + ck_tile::index_t h /*nhead*/, + ck_tile::index_t s /*seqlen*/, + ck_tile::index_t d /*hdim*/) { + if(permute) + return 
std::array{b, h, s, d}; + else + return std::array{b, s, h, d}; + }; bool is_v_rowmajor = vlayout == std::string("r"); @@ -635,12 +618,15 @@ bool run(const ck_tile::ArgParser& arg_parser) ck_tile::HostTensor lse_acc_host( 1 < num_splits || use_kvcache - ? std::array{num_splits, shape_batch, nhead, shape_seqlen_q} + ? std::array{shape_batch, nhead, num_splits, shape_seqlen_q} : std::array{1, 1, 1, 1}); ck_tile::HostTensor o_acc_host( - 1 < num_splits || use_kvcache - ? get_lengths(o_perm, num_splits, shape_batch, nhead, shape_seqlen_q, hdim_v) - : std::array{1, 1, 1, 1, 1}); + 1 < num_splits || use_kvcache ? std::array{shape_batch, + nhead, + num_splits, + shape_seqlen_q, + hdim_v} + : std::array{1, 1, 1, 1, 1}); // batch mode of lse data layout is [batch, nhead, seqlen_q] // group mode of lse data layout is [nhead, total_seqlen_q] @@ -880,7 +866,7 @@ bool run(const ck_tile::ArgParser& arg_parser) }(); const ck_tile::index_t stride_bias = (i_perm ? shape_seqlen_k : 1 * shape_seqlen_k); const ck_tile::index_t stride_randval = (max_seqlen_k); - const ck_tile::index_t stride_o_acc = (o_perm ? hdim_v : nhead * hdim_v); + const ck_tile::index_t stride_o_acc = (hdim_v); const ck_tile::index_t stride_o = (o_perm ? hdim_v : nhead * hdim_v); // setup nhead_stride_* arguments const ck_tile::index_t nhead_stride_q = (i_perm ? shape_seqlen_q * hdim_q : hdim_q); @@ -906,8 +892,8 @@ bool run(const ck_tile::ArgParser& arg_parser) (i_perm ? 0 * shape_seqlen_q * shape_seqlen_k : 0 * shape_seqlen_k); const ck_tile::index_t nhead_stride_randval = (shape_seqlen_q * max_seqlen_k); const ck_tile::index_t nhead_stride_lse = shape_seqlen_q; - const ck_tile::index_t nhead_stride_lse_acc = shape_seqlen_q; - const ck_tile::index_t nhead_stride_o_acc = (o_perm ? 
shape_seqlen_q * hdim_v : hdim_v); + const ck_tile::index_t nhead_stride_lse_acc = (num_splits * shape_seqlen_q); + const ck_tile::index_t nhead_stride_o_acc = (num_splits * shape_seqlen_q * hdim_v); const ck_tile::index_t nhead_stride_o = (o_perm ? shape_seqlen_q * hdim_v : hdim_v); // setup batch_stride_* arguments const ck_tile::index_t batch_stride_q = (nhead * shape_seqlen_q * hdim_q); @@ -922,13 +908,13 @@ bool run(const ck_tile::ArgParser& arg_parser) const ck_tile::index_t batch_stride_bias = (0 * nhead * shape_seqlen_q * shape_seqlen_k); const ck_tile::index_t batch_stride_randval = (nhead * shape_seqlen_q * max_seqlen_k); const ck_tile::index_t batch_stride_lse = (nhead * shape_seqlen_q); - const ck_tile::index_t batch_stride_lse_acc = (nhead * shape_seqlen_q); - const ck_tile::index_t batch_stride_o_acc = (nhead * shape_seqlen_q * hdim_v); - const ck_tile::index_t batch_stride_o = (nhead * shape_seqlen_q * hdim_v); + const ck_tile::index_t batch_stride_lse_acc = (nhead * num_splits * shape_seqlen_q); + const ck_tile::index_t batch_stride_o_acc = (nhead * num_splits * shape_seqlen_q * hdim_v); + const ck_tile::index_t batch_stride_o = (nhead * shape_seqlen_q * hdim_v); const ck_tile::index_t batch_stride_block_table = (max_num_page_blocks / batch); // setup split_stride_* arguments (only used in split-kv kernel) - const ck_tile::index_t split_stride_lse_acc = (shape_batch * nhead * shape_seqlen_q); - const ck_tile::index_t split_stride_o_acc = (shape_batch * nhead * shape_seqlen_q * hdim_v); + const ck_tile::index_t split_stride_lse_acc = (shape_seqlen_q); + const ck_tile::index_t split_stride_o_acc = (shape_seqlen_q * hdim_v); args.q_ptr = q_buf.GetDeviceBuffer(); args.k_ptr = k_buf.GetDeviceBuffer(); diff --git a/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp index adabda165..8c1f6c805 100644 --- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp +++ 
b/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp @@ -69,7 +69,8 @@ struct FmhaFwdKernel // sync with generate.py // clang-format off using bfs = typename FmhaPipeline::BlockFmhaShape; - using gbr = typename bfs::Gemm0BlockWarps; + using g0br = typename bfs::Gemm0BlockWarps; + using g1br = typename bfs::Gemm1BlockWarps; using gwt = typename bfs::Gemm0WarpTile; #define _SS_ std::string #define _TS_ std::to_string @@ -85,7 +86,8 @@ struct FmhaFwdKernel "_" + (kIsGroupMode ? "group" : "batch") + "_" + _SS_(TilePartitioner::name) + "_" "b" + _TS_(bfs::kM0) + "x" + _TS_(bfs::kN0) + "x" + _TS_(bfs::kK0) + "x" + _TS_(bfs::kN1) + "x" + _TS_(bfs::kK1) + "x" + _TS_(bfs::kK0BlockLength) + "_" + - "r" + _TS_(gbr::at(ck_tile::number<0>{})) + "x" + _TS_(gbr::at(ck_tile::number<1>{})) + "x" + _TS_(gbr::at(ck_tile::number<2>{})) + "_" + + "r" + _TS_(g0br::at(ck_tile::number<0>{})) + "x" + _TS_(g0br::at(ck_tile::number<1>{})) + "x" + _TS_(g0br::at(ck_tile::number<2>{})) + "_" + + "r" + _TS_(g1br::at(ck_tile::number<0>{})) + "x" + _TS_(g1br::at(ck_tile::number<1>{})) + "x" + _TS_(g1br::at(ck_tile::number<2>{})) + "_" + "w" + _TS_(gwt::at(ck_tile::number<0>{})) + "x" + _TS_(gwt::at(ck_tile::number<1>{})) + "x" + _TS_(gwt::at(ck_tile::number<2>{})) + "_" + (kBlockPerCuInput == -1 ? "" : ("o" + _TS_(kBlockPerCu) + "_")) + _SS_(FmhaPipeline::name) + "_" + "v" + (std::is_same_v ? "r" : "c") + (pn.empty() ? 
"" : "_" + pn) + diff --git a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp index 34f75990c..ea30025b5 100644 --- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp +++ b/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp @@ -65,7 +65,8 @@ struct FmhaFwdSplitKVKernel // sync with generate.py // clang-format off using bfs = typename FmhaPipeline::BlockFmhaShape; - using gbr = typename bfs::Gemm0BlockWarps; + using g0br = typename bfs::Gemm0BlockWarps; + using g1br = typename bfs::Gemm1BlockWarps; using gwt = typename bfs::Gemm0WarpTile; #define _SS_ std::string #define _TS_ std::to_string @@ -81,7 +82,8 @@ struct FmhaFwdSplitKVKernel "_" + (kIsGroupMode ? "group" : "batch") + "_" "b" + _TS_(bfs::kM0) + "x" + _TS_(bfs::kN0) + "x" + _TS_(bfs::kK0) + "x" + _TS_(bfs::kN1) + "x" + _TS_(bfs::kK1) + "x" + _TS_(bfs::kK0BlockLength) + "_" + - "r" + _TS_(gbr::at(ck_tile::number<0>{})) + "x" + _TS_(gbr::at(ck_tile::number<1>{})) + "x" + _TS_(gbr::at(ck_tile::number<2>{})) + "_" + + "r" + _TS_(g0br::at(ck_tile::number<0>{})) + "x" + _TS_(g0br::at(ck_tile::number<1>{})) + "x" + _TS_(g0br::at(ck_tile::number<2>{})) + "_" + + "r" + _TS_(g1br::at(ck_tile::number<0>{})) + "x" + _TS_(g1br::at(ck_tile::number<1>{})) + "x" + _TS_(g1br::at(ck_tile::number<2>{})) + "_" + "w" + _TS_(gwt::at(ck_tile::number<0>{})) + "x" + _TS_(gwt::at(ck_tile::number<1>{})) + "x" + _TS_(gwt::at(ck_tile::number<2>{})) + "_" + (kBlockPerCuInput == -1 ? "" : ("o" + _TS_(kBlockPerCu) + "_")) + _SS_(FmhaPipeline::name) + "_" + "v" + (std::is_same_v ? "r" : "c") + (pn.empty() ? 
"" : "_" + pn) + @@ -894,7 +896,7 @@ struct FmhaFwdSplitKVKernel o_acc_ptr, make_tuple(kargs.seqlen_q, kargs.hdim_v), make_tuple(kargs.stride_o_acc, 1), - number<1>{}, + number{}, number<1>{}); return pad_tensor_view( diff --git a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_tile_partitioner.hpp b/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_tile_partitioner.hpp index 2d06ba176..675a31019 100644 --- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_tile_partitioner.hpp +++ b/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_tile_partitioner.hpp @@ -26,8 +26,8 @@ struct FmhaFwdSplitKVTilePartitioner { // TODO: this may need tuning return dim3(ck_tile::integer_divide_ceil(max_seqlen_q, kM0) * - ck_tile::integer_divide_ceil(hdim_v, kN1), - nhead * num_splits, + ck_tile::integer_divide_ceil(hdim_v, kN1) * num_splits, + nhead, batch_size); } @@ -42,8 +42,9 @@ struct FmhaFwdSplitKVTilePartitioner return ck_tile::make_tuple(quotient, modulus); }; - const auto [i_tile_m, i_tile_n] = f(blockIdx.x, num_tile_n1); - const auto [i_nhead, i_split] = f(blockIdx.y, num_splits); + const auto [mn, i_split] = f(blockIdx.x, num_splits); + const auto [i_tile_m, i_tile_n] = f(mn, num_tile_n1); + const index_t i_nhead = blockIdx.y; const index_t i_batch = blockIdx.z; return ck_tile::make_tuple(i_tile_m, i_tile_n, i_split, i_nhead, i_batch); diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_qr_ks_vs.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_qr_ks_vs.hpp index 75af7be82..6e7416ce8 100644 --- a/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_qr_ks_vs.hpp +++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_qr_ks_vs.hpp @@ -64,6 +64,9 @@ struct BlockFmhaFwdSplitKVPipelineQRKSVS return kPadSeqLenK ? 1 : Policy::template GetAlignmentV(); }(); + static constexpr index_t kAlignmentOacc = + kPadHeadDimV ? 
1 : Policy::template GetAlignmentOacc(); + static constexpr index_t kAlignmentBias = kPadSeqLenK ? 1 : Policy::template GetAlignmentBias(); @@ -252,11 +255,11 @@ struct BlockFmhaFwdSplitKVPipelineQRKSVS k_dram_block_window_lengths, {adjusted_seqlen_k_start, 0}); const auto bias_origin = bias_dram_block_window_tmp.get_window_origin(); - auto bias_dram_window = make_tile_window( - bias_dram_block_window_tmp.get_bottom_tensor_view(), - bias_dram_block_window_tmp.get_window_lengths(), - {bias_origin.at(number<0>{}), adjusted_seqlen_k_start}, // M/N - Policy::template MakeBiasDramTileDistribution()); + auto bias_dram_window = + make_tile_window(bias_dram_block_window_tmp.get_bottom_tensor_view(), + bias_dram_block_window_tmp.get_window_lengths(), + {bias_origin.at(number<0>{}), adjusted_seqlen_k_start}, // M/N + Policy::template MakeBiasDramTileDistribution()); auto [i_page_block_v, v_dram_window] = v_page_block_navigator.make_tile_window( v_dram_block_window_lengths, diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_qr_ks_vs_default_policy.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_qr_ks_vs_default_policy.hpp index 338319ab3..b7f1f042e 100644 --- a/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_qr_ks_vs_default_policy.hpp +++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_qr_ks_vs_default_policy.hpp @@ -9,11 +9,20 @@ namespace ck_tile { // This pipeline is qkv all located in LDS -using BlockFmhaFwdSplitKVPipelineQRKSVSDefaultPolicy = - BlockFmhaPipelineQXKSVSCustomPolicy; +struct BlockFmhaFwdSplitKVPipelineQRKSVSDefaultPolicy + : BlockFmhaPipelineQXKSVSCustomPolicy +{ + template + CK_TILE_HOST_DEVICE static constexpr auto GetAlignmentOacc() + { + using OaccDataType = remove_cvref_t; + + return static_cast(16 / sizeof(OaccDataType)); + } +}; } // namespace ck_tile diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_problem.hpp 
b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_problem.hpp index 1846664e7..d9da2f088 100644 --- a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_problem.hpp +++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_problem.hpp @@ -39,8 +39,11 @@ struct BlockFmhaPipelineProblem using FmhaMask = remove_cvref_t; using Traits = remove_cvref_t; - static constexpr index_t kBlockSize = BlockFmhaShape::NumWarps * get_warp_size(); - static constexpr bool kIsGroupMode = kIsGroupMode_; + static constexpr index_t kNumGemm0Warps = BlockFmhaShape::NumGemm0Warps; + static constexpr index_t kNumGemm1Warps = BlockFmhaShape::NumGemm1Warps; + static constexpr index_t kBlockSize = BlockFmhaShape::NumWarps * get_warp_size(); + + static constexpr bool kIsGroupMode = kIsGroupMode_; // attributes from traits static constexpr bool kPadSeqLenQ = Traits::kPadSeqLenQ; @@ -84,8 +87,11 @@ struct BlockFmhaFwdSplitKVPipelineProblem using FmhaMask = remove_cvref_t; using Traits = remove_cvref_t; - static constexpr index_t kBlockSize = BlockFmhaShape::NumWarps * get_warp_size(); - static constexpr bool kIsGroupMode = kIsGroupMode_; + static constexpr index_t kNumGemm0Warps = BlockFmhaShape::NumGemm0Warps; + static constexpr index_t kNumGemm1Warps = BlockFmhaShape::NumGemm1Warps; + static constexpr index_t kBlockSize = BlockFmhaShape::NumWarps * get_warp_size(); + + static constexpr bool kIsGroupMode = kIsGroupMode_; // attributes from traits static constexpr bool kPadSeqLenQ = Traits::kPadSeqLenQ; diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs.hpp index 281ddc07b..6837ffdee 100644 --- a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs.hpp +++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs.hpp @@ -242,11 +242,11 @@ struct BlockFmhaPipelineQRKSVS {seqlen_k_start, 0}); const auto bias_origin = bias_dram_block_window_tmp.get_window_origin(); - auto 
bias_dram_window = make_tile_window( - bias_dram_block_window_tmp.get_bottom_tensor_view(), - bias_dram_block_window_tmp.get_window_lengths(), - {bias_origin.at(number<0>{}), seqlen_k_start}, // M/N - Policy::template MakeBiasDramTileDistribution()); + auto bias_dram_window = + make_tile_window(bias_dram_block_window_tmp.get_bottom_tensor_view(), + bias_dram_block_window_tmp.get_window_lengths(), + {bias_origin.at(number<0>{}), seqlen_k_start}, // M/N + Policy::template MakeBiasDramTileDistribution()); auto randval_dram_window = dropout.template MakeRandvalDramWindow( randval_dram_block_window_tmp, seqlen_k_start); diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_async.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_async.hpp index 19f569c45..c4872def1 100644 --- a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_async.hpp +++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_async.hpp @@ -314,11 +314,11 @@ struct BlockFmhaPipelineQRKSVSAsync }(); const auto bias_origin = bias_dram_block_window_tmp.get_window_origin(); - auto bias_dram_window = make_tile_window( - bias_dram_block_window_tmp.get_bottom_tensor_view(), - bias_dram_block_window_tmp.get_window_lengths(), - {bias_origin.at(number<0>{}), seqlen_k_start}, // M/N - Policy::template MakeBiasDramTileDistribution()); + auto bias_dram_window = + make_tile_window(bias_dram_block_window_tmp.get_bottom_tensor_view(), + bias_dram_block_window_tmp.get_window_lengths(), + {bias_origin.at(number<0>{}), seqlen_k_start}, // M/N + Policy::template MakeBiasDramTileDistribution()); auto randval_dram_window = dropout.template MakeRandvalDramWindow( randval_dram_block_window_tmp, seqlen_k_start); diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qs_ks_vs.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qs_ks_vs.hpp index bc9ca93d0..d08a8d489 100644 --- a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qs_ks_vs.hpp 
+++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qs_ks_vs.hpp @@ -9,9 +9,10 @@ namespace ck_tile { +/// NOTICE: we no-longer use this pipeline. // This pipeline is qkv all located in LDS template -struct BlockFmhaPipelineQSKSVS +struct [[deprecated]] BlockFmhaPipelineQSKSVS { using Problem = remove_cvref_t; using Policy = remove_cvref_t; diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qx_ks_vs_custom_policy.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qx_ks_vs_custom_policy.hpp index a66d2be78..807ad6548 100644 --- a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qx_ks_vs_custom_policy.hpp +++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qx_ks_vs_custom_policy.hpp @@ -15,6 +15,7 @@ #include "ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_v1_custom_policy.hpp" #include "ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_v2_custom_policy.hpp" #include "ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_v2.hpp" +#include "ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_one_warp_v1.hpp" // TODO: remove this #define K_LDS_LOAD_USE_OFFSET_TRANSFORM 0 @@ -64,13 +65,28 @@ struct BlockFmhaPipelineQXCustomPolicy constexpr index_t M1 = MWarp; constexpr index_t M0 = kMPerBlock / (M2 * M1); - return make_static_tile_distribution( - tile_distribution_encoding, - tuple, sequence>, - tuple, sequence<2, 1>>, - tuple, sequence<1, 2>>, - sequence<1, 2, 2>, - sequence<0, 0, 2>>{}); + if constexpr(1 < Problem::kNumGemm0Warps) + { + return make_static_tile_distribution( + tile_distribution_encoding, + tuple, sequence>, + tuple, sequence<2, 1>>, + tuple, sequence<1, 2>>, + sequence<1, 2, 2>, + sequence<0, 0, 2>>{}); + } + else + { + static_assert(MWarp == 1); + + return make_static_tile_distribution( + tile_distribution_encoding, + tuple, sequence>, + tuple>, + tuple>, + sequence<1, 2, 2>, + sequence<0, 0, 2>>{}); + } } template @@ -80,7 +96,7 @@ struct BlockFmhaPipelineQXCustomPolicy BlockGemmProblem, @@ -129,12 +145,16 
@@ struct BlockFmhaPipelineQXCustomPolicy typename Problem::BlockFmhaShape::Gemm0BlockWarps, decltype(warp_gemm)>; - return BlockGemmARegBSmemCRegV2{}; + if constexpr(1 < Problem::kNumGemm0Warps) + return BlockGemmARegBSmemCRegV2{}; + else + return BlockGemmARegBSmemCRegOneWarpV1{}; } }; +/// NOTICE: we no-longer use this policy. template <> -struct BlockFmhaPipelineQXCustomPolicy +struct [[deprecated]] BlockFmhaPipelineQXCustomPolicy { static constexpr bool QLoadOnce = false; @@ -364,12 +384,15 @@ struct BlockFmhaPipelineQXKSVSCustomPolicy : BlockFmhaPipelineQXCustomPolicy(16 / sizeof(VDataType))); + constexpr index_t kMinVecLoad = 4 / sizeof(VDataType); - // TODO: not correct! - if constexpr(total_pixels > 4) - return 4; - else - return 2; + constexpr index_t kVecLoad = ((total_pixels / kMaxVecLoad) >= kMinVecLoad) + ? kMaxVecLoad + : (total_pixels / kMinVecLoad); + + return kVecLoad; } else { @@ -383,10 +406,8 @@ struct BlockFmhaPipelineQXKSVSCustomPolicy : BlockFmhaPipelineQXCustomPolicy())>; constexpr auto config = BlockGemm::Policy::template GetWarpGemmMWarpNWarp(); using WG = remove_cvref_t())>; - using CWarpDstr = typename WG::CWarpDstr; - constexpr auto vec = - CWarpDstr{}.get_ys_to_d_descriptor().get_lengths().at(number{}); - return vec; + + return WG::WarpGemmAttribute::Impl::kCM1PerLane; } template @@ -395,10 +416,8 @@ struct BlockFmhaPipelineQXKSVSCustomPolicy : BlockFmhaPipelineQXCustomPolicy())>; constexpr auto config = BlockGemm::Policy::template GetWarpGemmMWarpNWarp(); using WG = remove_cvref_t())>; - using CWarpDstr = typename WG::CWarpDstr; - constexpr auto vec = - CWarpDstr{}.get_ys_to_d_descriptor().get_lengths().at(number{}); - return vec; + + return WG::WarpGemmAttribute::Impl::kCM1PerLane; } template @@ -449,44 +468,12 @@ struct BlockFmhaPipelineQXKSVSCustomPolicy : BlockFmhaPipelineQXCustomPolicy - CK_TILE_HOST_DEVICE static constexpr auto MakeQRegBlockDescriptor() - { - constexpr index_t kMPerBlock = Problem::BlockFmhaShape::kM0; - 
constexpr index_t kKPerBlock = Problem::BlockFmhaShape::kK0BlockLength; - - constexpr auto config = BlockGemm::Policy::template GetWarpGemmMWarpNWarp(); - - using WG = remove_cvref_t())>; - - constexpr index_t MWarp = config.template at<1>(); - constexpr index_t NWarp = config.template at<2>(); - - constexpr index_t MIterPerWarp = kMPerBlock / (MWarp * WG::kM); - constexpr index_t KIterPerWarp = kKPerBlock / WG::kK; - - constexpr auto q_block_outer_dstr_encoding = - tile_distribution_encoding, - tuple, sequence>, - tuple>, - tuple>, - sequence<1, 2>, - sequence<0, 0>>{}; - - constexpr auto q_block_dstr_encode = detail::make_embed_tile_distribution_encoding( - q_block_outer_dstr_encoding, typename WG::AWarpDstrEncoding{}); - - constexpr auto q_block_dstr = make_static_tile_distribution(q_block_dstr_encode); - - return q_block_dstr; - } - // TODO: this is used for non async copy desc. unify in the future template CK_TILE_HOST_DEVICE static constexpr auto MakeKLdsBlockDescriptor() { constexpr index_t kNPerBlock = Problem::BlockFmhaShape::kN0; - constexpr index_t kKPerBlock = Problem::BlockFmhaShape::kK1; + constexpr index_t kKPerBlock = Problem::BlockFmhaShape::kK0; constexpr index_t kKPack = GetSmemKPackK(); constexpr auto k_lds_block_desc_0 = make_naive_tensor_descriptor( @@ -886,36 +873,10 @@ struct BlockFmhaPipelineQXKSVSCustomPolicy : BlockFmhaPipelineQXCustomPolicy + template CK_TILE_HOST_DEVICE static constexpr auto MakeBiasDramTileDistribution() { - constexpr index_t MPerBlock = Problem::BlockFmhaShape::kM0; - constexpr index_t NPerBlock = Problem::BlockFmhaShape::kN0; - - constexpr auto config = BlockGemm::Policy::template GetWarpGemmMWarpNWarp(); - using WG = remove_cvref_t())>; - - constexpr index_t MWarp = config.template at<1>(); - constexpr index_t NWarp = config.template at<2>(); - - constexpr index_t MIterPerWarp = MPerBlock / (MWarp * WG::kM); - constexpr index_t NIterPerWarp = NPerBlock / (NWarp * WG::kN); - - // Construct C-Block-HostTensor - 
constexpr auto c_block_outer_dstr_encoding = tile_distribution_encoding< - sequence<>, - tuple, sequence>, - tuple>, - tuple>, - sequence<1, 2>, - sequence<0, 0>>{}; - - constexpr auto c_block_dstr_encode = detail::make_embed_tile_distribution_encoding( - c_block_outer_dstr_encoding, typename WG::CWarpDstrEncoding{}); - - constexpr auto c_block_dstr = make_static_tile_distribution(c_block_dstr_encode); - - return c_block_dstr; + return BlockGemm::MakeCBlockTile().get_tile_distribution(); } template @@ -972,7 +933,7 @@ struct BlockFmhaPipelineQXKSVSCustomPolicy : BlockFmhaPipelineQXCustomPolicy, diff --git a/include/ck_tile/ops/fmha/pipeline/tile_fmha_shape.hpp b/include/ck_tile/ops/fmha/pipeline/tile_fmha_shape.hpp index 64a61e94d..f2bb2200f 100644 --- a/include/ck_tile/ops/fmha/pipeline/tile_fmha_shape.hpp +++ b/include/ck_tile/ops/fmha/pipeline/tile_fmha_shape.hpp @@ -21,10 +21,15 @@ struct TileFmhaShape using Gemm1BlockWarps = remove_cvref_t; using Gemm1WarpTile = remove_cvref_t; - static constexpr index_t NumWarps = + static constexpr index_t NumGemm0Warps = reduce_on_sequence(Gemm0BlockWarps{}, multiplies{}, number<1>{}); + static constexpr index_t NumGemm1Warps = + reduce_on_sequence(Gemm1BlockWarps{}, multiplies{}, number<1>{}); + static_assert(NumGemm1Warps % NumGemm0Warps == 0); + + static constexpr index_t NumWarps = max(NumGemm0Warps, NumGemm1Warps); - static_assert(NumWarps == reduce_on_sequence(Gemm1BlockWarps{}, multiplies{}, number<1>{})); + static_assert(std::is_same_v); static constexpr index_t kM0 = BlockTile::at(number<0>{}); // tile size along q seqlen static constexpr index_t kN0 = BlockTile::at(number<1>{}); // tile size along k seqlen diff --git a/include/ck_tile/ops/gemm.hpp b/include/ck_tile/ops/gemm.hpp index e70825570..4ca773479 100644 --- a/include/ck_tile/ops/gemm.hpp +++ b/include/ck_tile/ops/gemm.hpp @@ -8,6 +8,7 @@ #include "ck_tile/ops/gemm/block/block_gemm_areg_breg_creg_v1.hpp" #include 
"ck_tile/ops/gemm/block/block_gemm_areg_breg_creg_v1_custom_policy.hpp" #include "ck_tile/ops/gemm/block/block_gemm_areg_breg_creg_v1_default_policy.hpp" +#include "ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_one_warp_v1.hpp" #include "ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_v1.hpp" #include "ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_v1_custom_policy.hpp" #include "ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_v1_default_policy.hpp" diff --git a/include/ck_tile/ops/gemm/block/block_gemm_areg_breg_creg_v1.hpp b/include/ck_tile/ops/gemm/block/block_gemm_areg_breg_creg_v1.hpp index 9a5c2aae5..728a04d83 100644 --- a/include/ck_tile/ops/gemm/block/block_gemm_areg_breg_creg_v1.hpp +++ b/include/ck_tile/ops/gemm/block/block_gemm_areg_breg_creg_v1.hpp @@ -157,7 +157,7 @@ struct BlockGemmARegBRegCRegV1 }); } - CK_TILE_DEVICE constexpr auto MakeCBlockTile() const + CK_TILE_DEVICE static constexpr auto MakeCBlockTile() { constexpr index_t MPerBlock = BlockGemmShape::kM; constexpr index_t NPerBlock = BlockGemmShape::kN; diff --git a/include/ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_one_warp_v1.hpp b/include/ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_one_warp_v1.hpp new file mode 100644 index 000000000..ff23f6355 --- /dev/null +++ b/include/ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_one_warp_v1.hpp @@ -0,0 +1,237 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck_tile/core.hpp" +#include "ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_v1_default_policy.hpp" + +namespace ck_tile { + +// A is block distributed tensor +// B is block window on shared memory +// C is block distributed tensor +template +struct BlockGemmARegBSmemCRegOneWarpV1 +{ + using Problem = remove_cvref_t; + using Policy = remove_cvref_t; + using ADataType = remove_cvref_t; + using BDataType = remove_cvref_t; + using CDataType = remove_cvref_t; + using BlockGemmShape = remove_cvref_t; + + static constexpr index_t kBlockSize = Problem::kBlockSize; + static_assert(kBlockSize == get_warp_size(), "Check failed!"); + + // C += A * B + template + CK_TILE_DEVICE void operator()(CBlockTensor& c_block_tensor, + const ABlockTensorTmp& a_block_tensor_tmp, + const BBlockWindowTmp& b_block_window_tmp) const + { + static_assert( + std::is_same_v> && + std::is_same_v> && + std::is_same_v>, + "wrong!"); + + // constexpr index_t MPerBlock = ABlockTensorTmp{}.get_lengths()[number<0>{}]; + // constexpr index_t NPerBlock = BBlockWindowTmp{}.get_window_lengths()[number<0>{}]; + // constexpr index_t KPerBlock = ABlockTensorTmp{}.get_lengths()[number<1>{}]; + constexpr index_t MPerBlock = BlockGemmShape::kM; + constexpr index_t NPerBlock = BlockGemmShape::kN; + constexpr index_t KPerBlock = BlockGemmShape::kK; + + // static_assert(MPerBlock == BlockGemmShape::kM && NPerBlock == BlockGemmShape::kN && + // KPerBlock == BlockGemmShape::kK, + // "wrong!"); + + constexpr auto config = Policy::template GetWarpGemmMWarpNWarp(); + + using WG = remove_cvref_t())>; + + constexpr index_t MWarp = config.template at<1>(); + constexpr index_t NWarp = config.template at<2>(); + + static_assert(MWarp == 1 && NWarp == 1, "Check failed!"); + + constexpr index_t MIterPerWarp = MPerBlock / (MWarp * WG::kM); + constexpr index_t NIterPerWarp = NPerBlock / (NWarp * WG::kN); + constexpr index_t KIterPerWarp = KPerBlock / WG::kK; + + constexpr index_t NPerBlockPerIter = 
NPerBlock / NIterPerWarp; + constexpr index_t KPerBlockPerIter = KPerBlock / KIterPerWarp; + + const index_t iNWarp = 0; + + constexpr auto a_block_outer_dstr_encoding = + tile_distribution_encoding, + tuple, sequence>, + tuple>, + tuple>, + sequence<1, 2>, + sequence<0, 0>>{}; + + constexpr auto c_block_outer_dstr_encoding = + tile_distribution_encoding, + tuple, sequence>, + tuple<>, + tuple<>, + sequence<1, 2>, + sequence<0, 0>>{}; + + constexpr auto a_block_dstr_encode = detail::make_embed_tile_distribution_encoding( + a_block_outer_dstr_encoding, typename WG::AWarpDstrEncoding{}); + + constexpr auto c_block_dstr_encode = detail::make_embed_tile_distribution_encoding( + c_block_outer_dstr_encoding, typename WG::CWarpDstrEncoding{}); + + constexpr auto a_block_dstr = make_static_tile_distribution(a_block_dstr_encode); + + // constrcut from A-block-tensor from A-Block-tensor-tmp + // FIXME: need method to check a_block_tensor and a_block_tensor_tmp have equivalent + // distribution + auto a_block_tensor = + make_static_distributed_tensor(a_block_dstr); + + a_block_tensor.get_thread_buffer() = a_block_tensor_tmp.get_thread_buffer(); + + // construct B-warp-window + auto b_warp_window_tmp = make_tile_window( + b_block_window_tmp.get_bottom_tensor_view(), + make_tuple(number{}, number{}), + b_block_window_tmp.get_window_origin() + multi_index<2>{iNWarp * WG::kN, 0}, + make_static_tile_distribution(typename WG::BWarpDstrEncoding{})); + +#if 0 // FIXME: using array will cause register spill + array, NIterPerWarp> b_warp_windows{ + {b_warp_window_tmp}}; + + for(index_t nIter = 0; nIter < NIterPerWarp; nIter++) + { + for(index_t kIter = 0; kIter < KIterPerWarp; kIter++) + { + move_tile_window(b_warp_windows(nIter)(kIter), + {nIter * NPerBlockPerIter, kIter * KPerBlockPerIter}); + } + } +#else + statically_indexed_array< + statically_indexed_array, + NIterPerWarp> + b_warp_windows; + + static_for<0, NIterPerWarp, 1>{}([&](auto nIter) { + static_for<0, KIterPerWarp, 
1>{}([&](auto kIter) { + b_warp_windows(nIter)(kIter) = b_warp_window_tmp; + + move_tile_window(b_warp_windows(nIter)(kIter), + {nIter * NPerBlockPerIter, kIter * KPerBlockPerIter}); + }); + }); +#endif + + // check C-block-distribution + static_assert( + std::is_same_v, + remove_cvref_t>, + "wrong!"); + + using AWarpDstr = typename WG::AWarpDstr; + using CWarpDstr = typename WG::CWarpDstr; + + using AWarpTensor = typename WG::AWarpTensor; + using CWarpTensor = typename WG::CWarpTensor; + + constexpr auto a_warp_y_lengths = + to_sequence(AWarpDstr{}.get_ys_to_d_descriptor().get_lengths()); + constexpr auto c_warp_y_lengths = + to_sequence(CWarpDstr{}.get_ys_to_d_descriptor().get_lengths()); + + constexpr auto a_warp_y_index_zeros = uniform_sequence_gen_t{}; + constexpr auto c_warp_y_index_zeros = uniform_sequence_gen_t{}; + + // hot loop: + static_for<0, KIterPerWarp, 1>{}([&](auto kIter) { + static_for<0, MIterPerWarp, 1>{}([&](auto mIter) { + // read A warp tensor from A block tensor + AWarpTensor a_warp_tensor; + + a_warp_tensor.get_thread_buffer() = a_block_tensor.get_y_sliced_thread_data( + merge_sequences(sequence{}, a_warp_y_index_zeros), + merge_sequences(sequence<1, 1>{}, a_warp_y_lengths)); + + static_for<0, NIterPerWarp, 1>{}([&](auto nIter) { + // read B warp tensor from B Block window + const auto b_warp_tensor = load_tile(b_warp_windows(nIter)(kIter)); + + // read C warp tensor from C block tensor + CWarpTensor c_warp_tensor; + + c_warp_tensor.get_thread_buffer() = c_block_tensor.get_y_sliced_thread_data( + merge_sequences(sequence{}, c_warp_y_index_zeros), + merge_sequences(sequence<1, 1>{}, c_warp_y_lengths)); + + // warp GEMM + WG{}(c_warp_tensor, a_warp_tensor, b_warp_tensor); + + // write C warp tensor into C block tensor + c_block_tensor.set_y_sliced_thread_data( + merge_sequences(sequence{}, c_warp_y_index_zeros), + merge_sequences(sequence<1, 1>{}, c_warp_y_lengths), + c_warp_tensor.get_thread_buffer()); + }); + }); + }); + } + + 
CK_TILE_DEVICE static constexpr auto MakeCBlockTile() + { + constexpr index_t MPerBlock = BlockGemmShape::kM; + constexpr index_t NPerBlock = BlockGemmShape::kN; + + constexpr auto config = Policy::template GetWarpGemmMWarpNWarp(); + + using WG = remove_cvref_t())>; + + constexpr index_t MWarp = config.template at<1>(); + constexpr index_t NWarp = config.template at<2>(); + + static_assert(MWarp == 1 && NWarp == 1, "Check failed!"); + + constexpr index_t MIterPerWarp = MPerBlock / (MWarp * WG::kM); + constexpr index_t NIterPerWarp = NPerBlock / (NWarp * WG::kN); + // constexpr index_t KIterPerWarp = KPerBlock / WG::kK; + + constexpr auto c_block_outer_dstr_encoding = + tile_distribution_encoding, + tuple, sequence>, + tuple<>, + tuple<>, + sequence<1, 2>, + sequence<0, 0>>{}; + + constexpr auto c_block_dstr_encode = detail::make_embed_tile_distribution_encoding( + c_block_outer_dstr_encoding, typename WG::CWarpDstrEncoding{}); + + static_assert(decltype(c_block_dstr_encode)::NDimP == 1, "Check failed!"); + + constexpr auto c_block_dstr = make_static_tile_distribution(c_block_dstr_encode); + auto c_block_tensor = make_static_distributed_tensor(c_block_dstr); + return c_block_tensor; + } + + // C = A * B + template + CK_TILE_DEVICE auto operator()(const ABlockTensorTmp& a_block_tensor_tmp, + const BBlockWindowTmp& b_block_window_tmp) const + { + auto c_block_tensor = MakeCBlockTile(); + operator()(c_block_tensor, a_block_tensor_tmp, b_block_window_tmp); + return c_block_tensor; + } +}; + +} // namespace ck_tile diff --git a/include/ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_v1.hpp b/include/ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_v1.hpp index beab457b9..98e5538c0 100644 --- a/include/ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_v1.hpp +++ b/include/ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_v1.hpp @@ -181,7 +181,7 @@ struct BlockGemmARegBSmemCRegV1 }); } - CK_TILE_DEVICE constexpr auto MakeCBlockTile() const + CK_TILE_DEVICE static 
constexpr auto MakeCBlockTile() { constexpr index_t MPerBlock = BlockGemmShape::kM; constexpr index_t NPerBlock = BlockGemmShape::kN; diff --git a/include/ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_v2.hpp b/include/ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_v2.hpp index 4a82702c1..173ef0a02 100644 --- a/include/ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_v2.hpp +++ b/include/ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_v2.hpp @@ -182,7 +182,7 @@ struct BlockGemmARegBSmemCRegV2 }); } - CK_TILE_DEVICE constexpr auto MakeCBlockTile() const + CK_TILE_DEVICE static constexpr auto MakeCBlockTile() { constexpr index_t MPerBlock = BlockGemmShape::kM; constexpr index_t NPerBlock = BlockGemmShape::kN; diff --git a/include/ck_tile/ops/gemm/block/block_gemm_asmem_breg_creg_v1.hpp b/include/ck_tile/ops/gemm/block/block_gemm_asmem_breg_creg_v1.hpp index 3d142df4d..d28aa9e78 100644 --- a/include/ck_tile/ops/gemm/block/block_gemm_asmem_breg_creg_v1.hpp +++ b/include/ck_tile/ops/gemm/block/block_gemm_asmem_breg_creg_v1.hpp @@ -180,7 +180,7 @@ struct BlockGemmASmemBRegCRegV1 }); } - CK_TILE_DEVICE constexpr auto MakeCBlockTile() const + CK_TILE_DEVICE static constexpr auto MakeCBlockTile() { constexpr index_t MPerBlock = BlockGemmShape::kM; constexpr index_t NPerBlock = BlockGemmShape::kN; diff --git a/include/ck_tile/ops/gemm/block/block_gemm_asmem_bsmem_creg_v1.hpp b/include/ck_tile/ops/gemm/block/block_gemm_asmem_bsmem_creg_v1.hpp index ac4522170..dc0b41135 100644 --- a/include/ck_tile/ops/gemm/block/block_gemm_asmem_bsmem_creg_v1.hpp +++ b/include/ck_tile/ops/gemm/block/block_gemm_asmem_bsmem_creg_v1.hpp @@ -167,7 +167,7 @@ struct BlockGemmASmemBSmemCRegV1 }); } - CK_TILE_DEVICE constexpr auto MakeCBlockTile() const + CK_TILE_DEVICE static constexpr auto MakeCBlockTile() { constexpr index_t MPerBlock = BlockGemmShape::kM; constexpr index_t NPerBlock = BlockGemmShape::kN; diff --git a/include/ck_tile/ops/reduce/block/block_reduce.hpp 
b/include/ck_tile/ops/reduce/block/block_reduce.hpp index 63c364331..a01265ad5 100644 --- a/include/ck_tile/ops/reduce/block/block_reduce.hpp +++ b/include/ck_tile/ops/reduce/block/block_reduce.hpp @@ -22,7 +22,7 @@ CK_TILE_DEVICE void block_tile_reduce_sync(AccDistributedTensor_& acc_tensor, constexpr index_t idim_p_lane = NDimP - 1; - const auto ps_idx = make_array(get_block_id(), get_lane_id()); + const auto ps_idx = detail::get_partition_index(acc_tensor.get_tile_distribution()); const auto rs_idx = acc_tensor.get_tile_distribution().calculate_rs_index_from_ps_index(ps_idx); constexpr index_t thread_buf_size = AccDistributedTensor_::get_thread_buffer_size(); -- GitLab From 31bf253aeb93bb7e26336d4940c6f056d7c5f1b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bart=C5=82omiej=20Kocot?= Date: Sat, 26 Oct 2024 15:22:37 +0200 Subject: [PATCH 019/153] Add dynamic elementwise op (#1426) * Add dynamic elementwise op Co-authored-by: ThruptiRajLakshmanaGowda * CI issues fix * Custom parameter value for dynamic functions - Comments addressed --------- Co-authored-by: ThruptiRajLakshmanaGowda Co-authored-by: ThruptiRajLakshmanaGowda --- ...ed_gemm_add_add_relu_gemm_add_xdl_fp16.cpp | 6 +- example/62_convnd_activ/CMakeLists.txt | 1 + .../dynamic_unary/CMakeLists.txt | 45 + .../convnd_fwd_activ_dynamic_unary_common.hpp | 238 +++++ .../convnd_fwd_xdl_dynamic_abs_fp16.cpp | 13 + ...onvnd_fwd_xdl_dynamic_clippedrelu_fp16.cpp | 13 + .../convnd_fwd_xdl_dynamic_elu_fp16.cpp | 13 + .../convnd_fwd_xdl_dynamic_leakyrelu_fp16.cpp | 13 + .../convnd_fwd_xdl_dynamic_logistic_fp16.cpp | 13 + ...onvnd_fwd_xdl_dynamic_passthrough_fp16.cpp | 13 + .../convnd_fwd_xdl_dynamic_pow_fp16.cpp | 13 + .../convnd_fwd_xdl_dynamic_relu_fp16.cpp | 13 + .../convnd_fwd_xdl_dynamic_sigmoid_fp16.cpp | 13 + .../convnd_fwd_xdl_dynamic_softrelu_fp16.cpp | 13 + .../convnd_fwd_xdl_dynamic_swish_fp16.cpp | 13 + .../convnd_fwd_xdl_dynamic_tanh_fp16.cpp | 13 + .../run_convnd_activ_dynamic_example.inc | 91 ++ 
...ped_conv_fwd_multiple_abd_xdl_cshuffle.hpp | 19 +- .../gpu/element/element_wise_operation.hpp | 8 +- .../element/unary_element_wise_operation.hpp | 877 +++++++++++++++--- ...ouped_conv_fwd_xdl_dynamic_op_instance.hpp | 179 ++++ ...grouped_convolution_forward_dynamic_op.hpp | 278 ++++++ .../CMakeLists.txt | 8 + ...mic_op_nhwgc_gkyxc_nhwgk_bf16_instance.cpp | 55 ++ ...amic_op_nhwgc_gkyxc_nhwgk_f16_instance.cpp | 55 ++ ...amic_op_nhwgc_gkyxc_nhwgk_f32_instance.cpp | 55 ++ ...mic_op_nhwgc_gkyxc_nhwgk_int8_instance.cpp | 54 ++ .../CMakeLists.txt | 8 + ..._op_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp | 55 ++ ...c_op_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp | 55 ++ ...c_op_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp | 55 ++ ..._op_ndhwgc_gkzyxc_ndhwgk_int8_instance.cpp | 54 ++ 32 files changed, 2188 insertions(+), 164 deletions(-) create mode 100644 example/62_convnd_activ/dynamic_unary/CMakeLists.txt create mode 100644 example/62_convnd_activ/dynamic_unary/convnd_fwd_activ_dynamic_unary_common.hpp create mode 100644 example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_abs_fp16.cpp create mode 100644 example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_clippedrelu_fp16.cpp create mode 100644 example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_elu_fp16.cpp create mode 100644 example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_leakyrelu_fp16.cpp create mode 100644 example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_logistic_fp16.cpp create mode 100644 example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_passthrough_fp16.cpp create mode 100644 example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_pow_fp16.cpp create mode 100644 example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_relu_fp16.cpp create mode 100644 example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_sigmoid_fp16.cpp create mode 100644 example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_softrelu_fp16.cpp create mode 100644 
example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_swish_fp16.cpp create mode 100644 example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_tanh_fp16.cpp create mode 100644 example/62_convnd_activ/run_convnd_activ_dynamic_example.inc create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_dynamic_op_instance.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_dynamic_op.hpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_dynamic_op/CMakeLists.txt create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_dynamic_op/xdl/device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_bf16_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_dynamic_op/xdl/device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_f16_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_dynamic_op/xdl/device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_f32_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_dynamic_op/xdl/device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_int8_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_dynamic_op/CMakeLists.txt create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_dynamic_op/xdl/device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_dynamic_op/xdl/device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_dynamic_op/xdl/device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp create mode 100644 
library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_dynamic_op/xdl/device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_int8_instance.cpp diff --git a/example/37_batched_gemm_add_add_relu_gemm_add/batched_gemm_add_add_relu_gemm_add_xdl_fp16.cpp b/example/37_batched_gemm_add_add_relu_gemm_add/batched_gemm_add_add_relu_gemm_add_xdl_fp16.cpp index 36dcf58d7..ff1282f3c 100644 --- a/example/37_batched_gemm_add_add_relu_gemm_add/batched_gemm_add_add_relu_gemm_add_xdl_fp16.cpp +++ b/example/37_batched_gemm_add_add_relu_gemm_add/batched_gemm_add_add_relu_gemm_add_xdl_fp16.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. /* Computes C_m_o = Relu(A0[m, k] * B0[n, k] + D00[m, n] + D01[mn]) * B1[n, o] + D1[m, o] @@ -60,14 +60,14 @@ struct AddAddRelu { const ck::half_t x = c + d0 + d1; - ck::tensor_operation::element_wise::Relu{}.template operator()(e, x); + ck::tensor_operation::element_wise::Relu{}.operator()(e, x); } __host__ __device__ void operator()(float& e, const float& c, const ck::half_t& d0, const ck::half_t& d1) const { const float x = c + (d0 + d1); - ck::tensor_operation::element_wise::Relu{}.template operator()(e, x); + ck::tensor_operation::element_wise::Relu{}.operator()(e, x); } }; diff --git a/example/62_convnd_activ/CMakeLists.txt b/example/62_convnd_activ/CMakeLists.txt index ab136d99b..79fafed4e 100644 --- a/example/62_convnd_activ/CMakeLists.txt +++ b/example/62_convnd_activ/CMakeLists.txt @@ -6,6 +6,7 @@ add_subdirectory(convscale_add) add_subdirectory(convscale_reduce) add_subdirectory(multi_AB) add_subdirectory(unary) +add_subdirectory(dynamic_unary) add_custom_target(example_convnd_activ_xdl) # ScaleAdd ScaleAdd Relu diff --git a/example/62_convnd_activ/dynamic_unary/CMakeLists.txt b/example/62_convnd_activ/dynamic_unary/CMakeLists.txt new file mode 100644 index 
000000000..23f07439a --- /dev/null +++ b/example/62_convnd_activ/dynamic_unary/CMakeLists.txt @@ -0,0 +1,45 @@ +list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942) +set(target 0) +foreach(gpu IN LISTS GPU_TARGETS) + if(gpu IN_LIST gpu_list AND target EQUAL 0) + add_custom_target(example_convnd_activ_dynamic_unary_xdl) + # Sigmoid + add_example_executable(example_convnd_fwd_xdl_dynamic_sigmoid_fp16 convnd_fwd_xdl_dynamic_sigmoid_fp16.cpp) + add_example_dependencies(example_convnd_activ_dynamic_unary_xdl example_convnd_fwd_xdl_dynamic_sigmoid_fp16) + # Tanh + add_example_executable(example_convnd_fwd_xdl_dynamic_tanh_fp16 convnd_fwd_xdl_dynamic_tanh_fp16.cpp) + add_example_dependencies(example_convnd_activ_dynamic_unary_xdl example_convnd_fwd_xdl_dynamic_tanh_fp16) + # Relu + add_example_executable(example_convnd_fwd_xdl_dynamic_relu_fp16 convnd_fwd_xdl_dynamic_relu_fp16.cpp) + add_example_dependencies(example_convnd_activ_dynamic_unary_xdl example_convnd_fwd_xdl_dynamic_relu_fp16) + # SoftRelu + add_example_executable(example_convnd_fwd_xdl_dynamic_softrelu_fp16 convnd_fwd_xdl_dynamic_softrelu_fp16.cpp) + add_example_dependencies(example_convnd_activ_dynamic_unary_xdl example_convnd_fwd_xdl_dynamic_softrelu_fp16) + # Abs + add_example_executable(example_convnd_fwd_xdl_dynamic_abs_fp16 convnd_fwd_xdl_dynamic_abs_fp16.cpp) + add_example_dependencies(example_convnd_activ_dynamic_unary_xdl example_convnd_fwd_xdl_dynamic_abs_fp16) + # Pow + add_example_executable(example_convnd_fwd_xdl_dynamic_pow_fp16 convnd_fwd_xdl_dynamic_pow_fp16.cpp) + add_example_dependencies(example_convnd_activ_dynamic_unary_xdl example_convnd_fwd_xdl_dynamic_pow_fp16) + # Clipped Relu + add_example_executable(example_convnd_fwd_xdl_dynamic_clippedrelu_fp16 convnd_fwd_xdl_dynamic_clippedrelu_fp16.cpp) + add_example_dependencies(example_convnd_activ_dynamic_unary_xdl example_convnd_fwd_xdl_dynamic_clippedrelu_fp16) + # Leaky Relu + 
add_example_executable(example_convnd_fwd_xdl_dynamic_leakyrelu_fp16 convnd_fwd_xdl_dynamic_leakyrelu_fp16.cpp) + add_example_dependencies(example_convnd_activ_dynamic_unary_xdl example_convnd_fwd_xdl_dynamic_leakyrelu_fp16) + # Elu + add_example_executable(example_convnd_fwd_xdl_dynamic_elu_fp16 convnd_fwd_xdl_dynamic_elu_fp16.cpp) + add_example_dependencies(example_convnd_activ_dynamic_unary_xdl example_convnd_fwd_xdl_dynamic_elu_fp16) + # Swish + add_example_executable(example_convnd_fwd_xdl_dynamic_swish_fp16 convnd_fwd_xdl_dynamic_swish_fp16.cpp) + add_example_dependencies(example_convnd_activ_dynamic_unary_xdl example_convnd_fwd_xdl_dynamic_swish_fp16) + # PassThrough + add_example_executable(example_convnd_fwd_xdl_dynamic_passthrough_fp16 convnd_fwd_xdl_dynamic_passthrough_fp16.cpp) + add_example_dependencies(example_convnd_activ_dynamic_unary_xdl example_convnd_fwd_xdl_dynamic_passthrough_fp16) + # Logistic + add_example_executable(example_convnd_fwd_xdl_dynamic_logistic_fp16 convnd_fwd_xdl_dynamic_logistic_fp16.cpp) + add_example_dependencies(example_convnd_activ_dynamic_unary_xdl example_convnd_fwd_xdl_dynamic_logistic_fp16) + + set(target 1) + endif() +endforeach() diff --git a/example/62_convnd_activ/dynamic_unary/convnd_fwd_activ_dynamic_unary_common.hpp b/example/62_convnd_activ/dynamic_unary/convnd_fwd_activ_dynamic_unary_common.hpp new file mode 100644 index 000000000..ed31be19e --- /dev/null +++ b/example/62_convnd_activ/dynamic_unary/convnd_fwd_activ_dynamic_unary_common.hpp @@ -0,0 +1,238 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp" + +#include "ck/library/utility/algorithm.hpp" +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/convolution_parameter.hpp" +#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp" +#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp" + +constexpr ck::index_t NDimSpatial = 3; +using InDataType = ck::half_t; +using WeiDataType = ck::half_t; +using AccDataType = float; +using CShuffleDataType = ck::half_t; +using OutDataType = ck::half_t; + +template +using S = ck::Sequence; + +using InLayout = ck::tensor_layout::convolution::GNDHWC; +using WeiLayout = ck::tensor_layout::convolution::GKZYXC; +using OutLayout = ck::tensor_layout::convolution::GNDHWK; + +using InElementOp = ck::tensor_operation::element_wise::PassThrough; +using WeiElementOp = ck::tensor_operation::element_wise::PassThrough; +using DynamicElementOp = ck::tensor_operation::element_wise::DynamicUnaryOp; + +static constexpr auto ConvSpec = + ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; + +static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding; + +using DeviceGroupedConvNDActivInstance = + ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle< + NDimSpatial, + InLayout, + WeiLayout, + ck::Tuple<>, + OutLayout, + InDataType, + WeiDataType, + AccDataType, + CShuffleDataType, + ck::Tuple<>, + OutDataType, + InElementOp, 
+ WeiElementOp, + DynamicElementOp, + ConvSpec, // ConvForwardSpecialization + GemmSpec, // GemmSpecialization + 1, // + 256, // BlockSize + 128, // MPerBlock + 256, // NPerBlock + 32, // KPerBlock + 8, // AK1 + 8, // BK1 + 32, // MPerXdl + 32, // NPerXdl + 2, // MXdlPerWave + 4, // NXdlPerWave + S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1 + S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder + S<1, 0, 2>, // ABlockTransferSrcAccessOrder + 2, // ABlockTransferSrcVectorDim + 8, // ABlockTransferSrcScalarPerVector + 8, // ABlockTransferDstScalarPerVector_AK1 + 1, // ABlockLdsExtraM + S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1 + S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder + S<1, 0, 2>, // BBlockTransferSrcAccessOrder + 2, // BBlockTransferSrcVectorDim + 8, // BBlockTransferSrcScalarPerVector + 8, // BBlockTransferDstScalarPerVector_BK1 + 1, // BBlockLdsExtraN + 1, + 1, + S<1, 32, 1, 8>, + 8>; + +template +bool run_grouped_conv(bool do_verification, + int init_method, + bool time_kernel, + const ck::utils::conv::ConvParam& conv_param, + const HostTensorDescriptor& in_g_n_c_wis_desc, + const HostTensorDescriptor& wei_g_k_c_xs_desc, + const HostTensorDescriptor& out_g_n_k_wos_desc, + const InElementOp& in_element_op, + const WeiElementOp& wei_element_op, + const OutElementOp& out_element_op) +{ + + Tensor in(in_g_n_c_wis_desc); + Tensor wei(wei_g_k_c_xs_desc); + Tensor out_host(out_g_n_k_wos_desc); + Tensor out_device(out_g_n_k_wos_desc); + + std::cout << "in: " << in.mDesc << std::endl; + std::cout << "wei: " << wei.mDesc << std::endl; + std::cout << "out: " << out_host.mDesc << std::endl; + + switch(init_method) + { + case 0: break; + case 1: + in.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + wei.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + break; + default: + in.GenerateTensorValue(GeneratorTensor_3{-1.0, 1.0}); + wei.GenerateTensorValue(GeneratorTensor_3{-0.05, 0.05}); + } + + DeviceMem 
in_device_buf(sizeof(InDataType) * in.mDesc.GetElementSpaceSize()); + DeviceMem wei_device_buf(sizeof(WeiDataType) * wei.mDesc.GetElementSpaceSize()); + DeviceMem out_device_buf(sizeof(OutDataType) * out_device.mDesc.GetElementSpaceSize()); + + in_device_buf.ToDevice(in.mData.data()); + wei_device_buf.ToDevice(wei.mData.data()); + + std::array a_g_n_c_wis_lengths{}; + std::array a_g_n_c_wis_strides{}; + std::array b_g_k_c_xs_lengths{}; + std::array b_g_k_c_xs_strides{}; + std::array e_g_n_k_wos_lengths{}; + std::array e_g_n_k_wos_strides{}; + std::array conv_filter_strides{}; + std::array conv_filter_dilations{}; + std::array input_left_pads{}; + std::array input_right_pads{}; + + auto copy = [](const auto& x, auto& y) { ck::ranges::copy(x, y.begin()); }; + + copy(in_g_n_c_wis_desc.GetLengths(), a_g_n_c_wis_lengths); + copy(in_g_n_c_wis_desc.GetStrides(), a_g_n_c_wis_strides); + copy(wei_g_k_c_xs_desc.GetLengths(), b_g_k_c_xs_lengths); + copy(wei_g_k_c_xs_desc.GetStrides(), b_g_k_c_xs_strides); + copy(out_g_n_k_wos_desc.GetLengths(), e_g_n_k_wos_lengths); + copy(out_g_n_k_wos_desc.GetStrides(), e_g_n_k_wos_strides); + copy(conv_param.conv_filter_strides_, conv_filter_strides); + copy(conv_param.conv_filter_dilations_, conv_filter_dilations); + copy(conv_param.input_left_pads_, input_left_pads); + copy(conv_param.input_right_pads_, input_right_pads); + + // do Conv + auto conv = DeviceConvNDFwdInstance{}; + auto invoker = conv.MakeInvoker(); + auto argument = conv.MakeArgument(in_device_buf.GetDeviceBuffer(), + wei_device_buf.GetDeviceBuffer(), + std::array{}, + out_device_buf.GetDeviceBuffer(), + a_g_n_c_wis_lengths, + a_g_n_c_wis_strides, + b_g_k_c_xs_lengths, + b_g_k_c_xs_strides, + std::array, 0>{{}}, + std::array, 0>{{}}, + e_g_n_k_wos_lengths, + e_g_n_k_wos_strides, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads, + in_element_op, + wei_element_op, + out_element_op); + + if(!conv.IsSupportedArgument(argument)) + { + throw 
std::runtime_error("The device op with the specified compilation parameters does " + "not support this convolution problem."); + } + + float avg_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel}); + + std::size_t flop = conv_param.GetFlops(); + std::size_t num_btype = conv_param.GetByte(); + + float tflops = static_cast(flop) / 1.E9 / avg_time; + float gb_per_sec = num_btype / 1.E6 / avg_time; + std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, " + << conv.GetTypeString() << std::endl; + + if(do_verification) + { + auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd(); + + auto ref_invoker = ref_conv.MakeInvoker(); + auto ref_argument = ref_conv.MakeArgument(in, + wei, + out_host, + conv_param.conv_filter_strides_, + conv_param.conv_filter_dilations_, + conv_param.input_left_pads_, + conv_param.input_right_pads_, + in_element_op, + wei_element_op, + out_element_op); + + ref_invoker.Run(ref_argument); + + out_device_buf.FromDevice(out_device.mData.data()); + + return ck::utils::check_err(out_device, out_host, "Error: incorrect results!"); + } + + return true; +} diff --git a/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_abs_fp16.cpp b/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_abs_fp16.cpp new file mode 100644 index 000000000..8fa455c62 --- /dev/null +++ b/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_abs_fp16.cpp @@ -0,0 +1,13 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "convnd_fwd_activ_dynamic_unary_common.hpp" + +#include "../run_convnd_activ_dynamic_example.inc" + +int main(int argc, char* argv[]) +{ + + ck::tensor_operation::element_wise::UnaryAbs out_element_op; + return !run_convnd_example(argc, argv, out_element_op); +} diff --git a/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_clippedrelu_fp16.cpp b/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_clippedrelu_fp16.cpp new file mode 100644 index 000000000..239a21525 --- /dev/null +++ b/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_clippedrelu_fp16.cpp @@ -0,0 +1,13 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "convnd_fwd_activ_dynamic_unary_common.hpp" + +#include "../run_convnd_activ_dynamic_example.inc" + +int main(int argc, char* argv[]) +{ + + ck::tensor_operation::element_wise::ClippedRelu out_element_op(0.f, 1.f); + return !run_convnd_example(argc, argv, out_element_op); +} diff --git a/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_elu_fp16.cpp b/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_elu_fp16.cpp new file mode 100644 index 000000000..23a094af7 --- /dev/null +++ b/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_elu_fp16.cpp @@ -0,0 +1,13 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "convnd_fwd_activ_dynamic_unary_common.hpp" + +#include "../run_convnd_activ_dynamic_example.inc" + +int main(int argc, char* argv[]) +{ + + ck::tensor_operation::element_wise::Elu out_element_op(2.f); + return !run_convnd_example(argc, argv, out_element_op); +} diff --git a/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_leakyrelu_fp16.cpp b/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_leakyrelu_fp16.cpp new file mode 100644 index 000000000..fe4b80a68 --- /dev/null +++ b/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_leakyrelu_fp16.cpp @@ -0,0 +1,13 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "convnd_fwd_activ_dynamic_unary_common.hpp" + +#include "../run_convnd_activ_dynamic_example.inc" + +int main(int argc, char* argv[]) +{ + + ck::tensor_operation::element_wise::LeakyRelu out_element_op(0.f); + return !run_convnd_example(argc, argv, out_element_op); +} diff --git a/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_logistic_fp16.cpp b/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_logistic_fp16.cpp new file mode 100644 index 000000000..756c07ed8 --- /dev/null +++ b/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_logistic_fp16.cpp @@ -0,0 +1,13 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "convnd_fwd_activ_dynamic_unary_common.hpp" + +#include "../run_convnd_activ_dynamic_example.inc" + +int main(int argc, char* argv[]) +{ + + ck::tensor_operation::element_wise::Logistic out_element_op(1.0f); + return !run_convnd_example(argc, argv, out_element_op); +} diff --git a/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_passthrough_fp16.cpp b/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_passthrough_fp16.cpp new file mode 100644 index 000000000..6588ec504 --- /dev/null +++ b/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_passthrough_fp16.cpp @@ -0,0 +1,13 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "convnd_fwd_activ_dynamic_unary_common.hpp" + +#include "../run_convnd_activ_dynamic_example.inc" + +int main(int argc, char* argv[]) +{ + + ck::tensor_operation::element_wise::PassThrough out_element_op; + return !run_convnd_example(argc, argv, out_element_op); +} diff --git a/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_pow_fp16.cpp b/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_pow_fp16.cpp new file mode 100644 index 000000000..90f00a166 --- /dev/null +++ b/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_pow_fp16.cpp @@ -0,0 +1,13 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "convnd_fwd_activ_dynamic_unary_common.hpp" + +#include "../run_convnd_activ_dynamic_example.inc" + +int main(int argc, char* argv[]) +{ + + ck::tensor_operation::element_wise::Power out_element_op(4.f, 1.f, 2.f); + return !run_convnd_example(argc, argv, out_element_op); +} diff --git a/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_relu_fp16.cpp b/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_relu_fp16.cpp new file mode 100644 index 000000000..830297cb5 --- /dev/null +++ b/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_relu_fp16.cpp @@ -0,0 +1,13 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "convnd_fwd_activ_dynamic_unary_common.hpp" + +#include "../run_convnd_activ_dynamic_example.inc" + +int main(int argc, char* argv[]) +{ + + ck::tensor_operation::element_wise::Relu out_element_op; + return !run_convnd_example(argc, argv, out_element_op); +} diff --git a/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_sigmoid_fp16.cpp b/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_sigmoid_fp16.cpp new file mode 100644 index 000000000..b143b4a4e --- /dev/null +++ b/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_sigmoid_fp16.cpp @@ -0,0 +1,13 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "convnd_fwd_activ_dynamic_unary_common.hpp" + +#include "../run_convnd_activ_dynamic_example.inc" + +int main(int argc, char* argv[]) +{ + + ck::tensor_operation::element_wise::Sigmoid out_element_op; + return !run_convnd_example(argc, argv, out_element_op); +} diff --git a/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_softrelu_fp16.cpp b/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_softrelu_fp16.cpp new file mode 100644 index 000000000..83ba0f7f8 --- /dev/null +++ b/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_softrelu_fp16.cpp @@ -0,0 +1,13 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "convnd_fwd_activ_dynamic_unary_common.hpp" + +#include "../run_convnd_activ_dynamic_example.inc" + +int main(int argc, char* argv[]) +{ + + ck::tensor_operation::element_wise::SoftRelu out_element_op; + return !run_convnd_example(argc, argv, out_element_op); +} diff --git a/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_swish_fp16.cpp b/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_swish_fp16.cpp new file mode 100644 index 000000000..e862d1120 --- /dev/null +++ b/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_swish_fp16.cpp @@ -0,0 +1,13 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "convnd_fwd_activ_dynamic_unary_common.hpp" + +#include "../run_convnd_activ_dynamic_example.inc" + +int main(int argc, char* argv[]) +{ + + ck::tensor_operation::element_wise::Swish out_element_op(1.0f); + return !run_convnd_example(argc, argv, out_element_op); +} diff --git a/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_tanh_fp16.cpp b/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_tanh_fp16.cpp new file mode 100644 index 000000000..a91fc7ce3 --- /dev/null +++ b/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_tanh_fp16.cpp @@ -0,0 +1,13 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "convnd_fwd_activ_dynamic_unary_common.hpp" + +#include "../run_convnd_activ_dynamic_example.inc" + +int main(int argc, char* argv[]) +{ + + ck::tensor_operation::element_wise::TanH out_element_op; + return !run_convnd_example(argc, argv, out_element_op); +} diff --git a/example/62_convnd_activ/run_convnd_activ_dynamic_example.inc b/example/62_convnd_activ/run_convnd_activ_dynamic_example.inc new file mode 100644 index 000000000..4e90cf936 --- /dev/null +++ b/example/62_convnd_activ/run_convnd_activ_dynamic_example.inc @@ -0,0 +1,91 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +void print_helper_msg() +{ + std::cout << "arg1: verification (0=no, 1=yes)\n" + << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n" + << "arg3: time kernel (0=no, 1=yes)\n" + << ck::utils::conv::get_conv_param_parser_helper_msg() << std::endl; +} + +template +bool run_convnd_example(int argc, char* argv[], const OutElementOp& out_element_op) +{ + print_helper_msg(); + + bool do_verification = true; + // Use floats for SoftRelu by default to avoid overflow after e^x. + int init_method = + std::is_same_v ? 
2 : 1; + bool time_kernel = false; + + // Following shapes are selected to avoid overflow. Expect inf in case of + // size increase for some elementwise ops. + ck::utils::conv::ConvParam conv_param{ + 3, 2, 16, 128, 8, {3, 3, 3}, {17, 17, 17}, {2, 2, 2}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}}; + + if(argc == 1) + { + // use default + } + else if(argc == 4) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = std::stoi(argv[3]); + } + else + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = std::stoi(argv[3]); + const ck::index_t num_dim_spatial = std::stoi(argv[4]); + + conv_param = ck::utils::conv::parse_conv_param(num_dim_spatial, 5, argv); + } + + const auto in_element_op = InElementOp{}; + const auto wei_element_op = WeiElementOp{}; + + const auto run = [&]() { + const auto in_g_n_c_wis_desc = + ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed( + conv_param); + + const auto wei_g_k_c_xs_desc = + ck::utils::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed( + conv_param); + + const auto out_g_n_k_wos_desc = + ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed( + conv_param); + + return run_grouped_conv(do_verification, + init_method, + time_kernel, + conv_param, + in_g_n_c_wis_desc, + wei_g_k_c_xs_desc, + out_g_n_k_wos_desc, + in_element_op, + wei_element_op, + out_element_op); + }; + + if(conv_param.num_dim_spatial_ == 3) + { + return run(); + } + + return false; +} diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp index 939ee1729..f21a45938 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp @@ -85,9 +85,9 @@ __global__ 
void BsPointer p_bs_grid, DsPointer p_ds_grid, EDataType* __restrict__ p_e_grid, - const AElementwiseOperation a_element_op, - const BElementwiseOperation b_element_op, - const CDEElementwiseOperation cde_element_op, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CDEElementwiseOperation cde_element_op, const AGridDesc_AK0_M_AK1 a_grid_desc_k0_m_k1, const BGridDesc_BK0_N_BK1 b_grid_desc_k0_n_k1, const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock @@ -121,6 +121,19 @@ __global__ void static_for<0, NumDTensor, 1>{}( [&](auto i) { p_ds_grid_grp(i) = p_ds_grid[i] + ds_group_offset[i]; }); + if constexpr(is_same_v) + { + a_element_op.InitUnaryOpPtrOnDevice(); + } + if constexpr(is_same_v) + { + b_element_op.InitUnaryOpPtrOnDevice(); + } + if constexpr(is_same_v) + { + cde_element_op.InitUnaryOpPtrOnDevice(); + } + if constexpr(isMultiA || isMultiB) { AsPointer p_as_grid_grp; diff --git a/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp index 135eaec93..b914c0b96 100644 --- a/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp +++ b/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp @@ -405,7 +405,7 @@ struct ScaleAddScaleAddRelu const float& d1) const { const float x = c * alpha1_ + alpha2_ * d0 + d1; - Relu{}.template operator()(e, x); + e = x > 0 ? x : 0; } template <> @@ -416,7 +416,7 @@ struct ScaleAddScaleAddRelu type_convert(d1); float result = 0; - Relu{}.template operator()(result, x); + result = x > 0 ? x : 0; e = type_convert(result); } @@ -429,7 +429,7 @@ struct ScaleAddScaleAddRelu type_convert(d1); float result = 0; - Relu{}.template operator()(result, x); + result = x > 0 ? x : 0; e = type_convert(result); } @@ -441,7 +441,7 @@ struct ScaleAddScaleAddRelu const float x = type_convert(c) * alpha1_ + alpha2_ * d0 + d1; float result = 0; - Relu{}.template operator()(result, x); + result = x > 0 ? 
x : 0; e = type_convert(result); } diff --git a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp index ab6b1691a..712b88618 100644 --- a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp +++ b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp @@ -7,11 +7,36 @@ #include "ck/utility/math.hpp" #include "ck/utility/math_v2.hpp" #include "ck/utility/type_convert.hpp" +#include namespace ck { namespace tensor_operation { namespace element_wise { +struct UnaryOpBase +{ + public: + __host__ __device__ virtual ~UnaryOpBase() = default; + + __host__ __device__ UnaryOpBase() = default; + __host__ __device__ UnaryOpBase(const UnaryOpBase&) = default; + __host__ __device__ UnaryOpBase& operator=(const UnaryOpBase&) = default; + __host__ __device__ UnaryOpBase(UnaryOpBase&&) = default; + __host__ __device__ UnaryOpBase& operator=(UnaryOpBase&&) = default; + + __host__ __device__ virtual inline void operator()(float& y, const float& x) const = 0; + + __host__ __device__ virtual inline void operator()(double& y, const double& x) const = 0; + + __host__ __device__ virtual inline void operator()(int32_t& y, const int32_t& x) const = 0; + + __host__ __device__ virtual inline void operator()(int8_t& y, const int8_t& x) const = 0; + + __host__ __device__ virtual inline void operator()(half_t& y, const half_t& x) const = 0; + + __host__ __device__ virtual inline void operator()(bhalf_t& y, const bhalf_t& x) const = 0; +}; + struct PassThroughPack2 { template @@ -25,17 +50,24 @@ struct PassThroughPack2 constexpr const static bool is_pack2_invocable = true; }; -struct PassThrough +struct PassThrough : public UnaryOpBase { + + __host__ __device__ inline void operator()(float& y, const float& x) const final { y = x; } + + __host__ __device__ inline void operator()(double& y, const double& x) const final { y = x; } + + __host__ __device__ inline 
void operator()(int32_t& y, const int32_t& x) const final { y = x; } + + __host__ __device__ inline void operator()(int8_t& y, const int8_t& x) const final { y = x; } + + __host__ __device__ inline void operator()(half_t& y, const half_t& x) const final { y = x; } + + __host__ __device__ inline void operator()(bhalf_t& y, const bhalf_t& x) const final { y = x; } + template __host__ __device__ void operator()(Y& y, const X& x) const; - template <> - __host__ __device__ void operator()(double& y, const double& x) const - { - y = x; - } - template <> __host__ __device__ void operator()(float& y, const double& x) const { @@ -48,36 +80,12 @@ struct PassThrough y = type_convert(x); } - template <> - __host__ __device__ void operator()(float& y, const float& x) const - { - y = x; - } - - template <> - __host__ __device__ void operator()(half_t& y, const half_t& x) const - { - y = x; - } - template <> __host__ __device__ void operator()(half_t& y, const float& x) const { y = type_convert(x); } - template <> - __host__ __device__ void operator()(bhalf_t& y, const bhalf_t& x) const - { - y = x; - } - - template <> - __host__ __device__ void operator()(int32_t& y, const int32_t& x) const - { - y = x; - } - template <> __host__ __device__ void operator()(bhalf_t& y, const float& x) const { @@ -102,12 +110,6 @@ struct PassThrough y = type_convert(x); } - template <> - __host__ __device__ void operator()(int8_t& y, const int8_t& x) const - { - y = x; - } - template <> __host__ __device__ void operator()(half_t& y, const int8_t& x) const { @@ -407,20 +409,38 @@ struct UnarySquare }; }; -struct UnaryAbs +struct UnaryAbs : public UnaryOpBase { - template - __host__ __device__ void operator()(T& y, const T& x) const + __host__ __device__ inline void operator()(float& y, const float& x) const final { - static_assert(is_same::value || is_same::value || - is_same::value || is_same::value || - is_same::value, - "Data type is not supported by this operation!"); + y = ck::math::abs(x); + 
} + __host__ __device__ inline void operator()(double& y, const double& x) const final + { y = ck::math::abs(x); - }; + } + + __host__ __device__ inline void operator()(int32_t& y, const int32_t& x) const final + { + y = ck::math::abs(x); + } + + __host__ __device__ inline void operator()(int8_t& y, const int8_t& x) const final + { + y = ck::math::abs(x); + } + + __host__ __device__ inline void operator()(half_t& y, const half_t& x) const final + { + y = ck::math::abs(x); + } + + __host__ __device__ inline void operator()(bhalf_t& y, const bhalf_t& x) const final + { + y = ck::math::abs(x); + } - template <> __host__ __device__ void operator()(f8_t& y, const f8_t& x) const { y = ck::type_convert(ck::math::abs(ck::type_convert(x))); @@ -439,20 +459,34 @@ struct UnarySqrt }; }; -struct Relu +struct Relu : public UnaryOpBase { - template - __host__ __device__ void operator()(T& y, const T& x) const + __host__ __device__ inline void operator()(float& y, const float& x) const final { - static_assert(is_same::value || is_same::value || - is_same::value || is_same::value || - is_same::value, - "Data type is not supported by this operation!"); y = x > 0 ? x : 0; } - template <> - __host__ __device__ void operator()(bhalf_t& y, const bhalf_t& x) const + __host__ __device__ inline void operator()(double& y, const double& x) const final + { + y = x > 0 ? x : 0; + } + + __host__ __device__ inline void operator()(int32_t& y, const int32_t& x) const final + { + y = x > 0 ? x : 0; + } + + __host__ __device__ inline void operator()(int8_t& y, const int8_t& x) const final + { + y = x > 0 ? x : 0; + } + + __host__ __device__ inline void operator()(half_t& y, const half_t& x) const final + { + y = x > 0 ? x : 0; + } + + __host__ __device__ inline void operator()(bhalf_t& y, const bhalf_t& x) const final { float x_f32 = ck::type_convert(x); float y_f32 = x_f32 > 0 ? 
x_f32 : 0; @@ -599,18 +633,46 @@ struct Gelu } }; -struct Sigmoid +struct Sigmoid : public UnaryOpBase { - template - __host__ __device__ void operator()(T& y, const T& x) const + + __host__ __device__ inline void operator()(float& y, const float& x) const final { - static_assert(is_same::value || is_same::value || - is_same::value || is_same::value || - is_same::value, - "Data type is not supported by this operation!"); - constexpr T one = type_convert(1); - y = one / (one + ck::math::exp(-x)); - }; + constexpr float one = type_convert(1); + y = one / (one + ck::math::exp(-x)); + } + + __host__ __device__ inline void operator()(double& y, const double& x) const final + { + constexpr double one = type_convert(1); + y = one / (one + ck::math::exp(-x)); + } + + __host__ __device__ inline void operator()(int32_t& y, const int32_t& x) const final + { + constexpr int32_t one = type_convert(1); + y = one / (one + ck::math::exp(-x)); + } + + __host__ __device__ inline void operator()(int8_t& y, const int8_t& x) const final + { + constexpr int8_t one = type_convert(1); + y = one / (one + ck::math::exp(-x)); + } + + __host__ __device__ inline void operator()(half_t& y, const half_t& x) const final + { + constexpr half_t one = type_convert(1); + y = one / (one + ck::math::exp(-x)); + } + + __host__ __device__ inline void operator()(bhalf_t& y, const bhalf_t& x) const final + { + constexpr float one = type_convert(1); + float x_f32 = ck::type_convert(x); + float y_f32 = one / (one + ck::math::exp(x_f32)); + y = ck::type_convert(y_f32); + } }; struct Silu @@ -626,18 +688,37 @@ struct Silu }; }; -struct TanH +struct TanH : public UnaryOpBase { - template - __host__ __device__ void operator()(T& y, const T& x) const + __host__ __device__ inline void operator()(float& y, const float& x) const final { - static_assert(is_same::value || is_same::value || - is_same::value || is_same::value || - is_same::value, - "Data type is not supported by this operation!"); + y = 
ck::math::tanh(x); + } + __host__ __device__ inline void operator()(double& y, const double& x) const final + { y = ck::math::tanh(x); - }; + } + + __host__ __device__ inline void operator()(int32_t& y, const int32_t& x) const final + { + y = ck::math::tanh(x); + } + + __host__ __device__ inline void operator()(int8_t& y, const int8_t& x) const final + { + y = ck::math::tanh(x); + } + + __host__ __device__ inline void operator()(half_t& y, const half_t& x) const final + { + y = ck::math::tanh(x); + } + + __host__ __device__ inline void operator()(bhalf_t& y, const bhalf_t& x) const final + { + y = ck::math::tanh(x); + } }; struct ACos @@ -878,138 +959,393 @@ struct Rcp }; }; -struct Swish +struct Swish : public UnaryOpBase { - Swish(float beta = 1.0f) : beta_(beta) {} + __host__ __device__ Swish(float beta = 1.0f) : beta_(beta) {} + + __host__ __device__ float get_beta() const { return beta_; } + + const float beta_; + + __host__ __device__ inline void operator()(float& y, const float& x) const final + { + float bx = -beta_ * type_convert(x); + y = type_convert(x / (1.f + ck::math::exp(bx))); + } + + __host__ __device__ inline void operator()(double& y, const double& x) const final + { + float bx = -beta_ * type_convert(x); + y = type_convert(x / (1.f + ck::math::exp(bx))); + } + + __host__ __device__ inline void operator()(int32_t& y, const int32_t& x) const final + { + float bx = -beta_ * type_convert(x); + y = type_convert(x / (1.f + ck::math::exp(bx))); + } + + __host__ __device__ inline void operator()(int8_t& y, const int8_t& x) const final + { + float bx = -beta_ * type_convert(x); + y = type_convert(x / (1.f + ck::math::exp(bx))); + } + + __host__ __device__ inline void operator()(half_t& y, const half_t& x) const final + { + float bx = -beta_ * type_convert(x); + y = type_convert(x / (1.f + ck::math::exp(bx))); + } + + __host__ __device__ inline void operator()(bhalf_t& y, const bhalf_t& x) const final + { + float bx = -beta_ * type_convert(x); + y = 
type_convert(x / (1.f + ck::math::exp(bx))); + } template __host__ __device__ void operator()(Y& y, const X& x) const { static_assert(is_same::value || is_same::value || - is_same::value, + is_same::value, "Data type is not supported by this operation!"); static_assert(is_same::value || is_same::value || - is_same::value, + is_same::value, "Data type is not supported by this operation!"); float bx = -beta_ * type_convert(x); y = type_convert(x / (1.f + ck::math::exp(bx))); - }; - - const float beta_; + } }; -struct SoftRelu +struct SoftRelu : public UnaryOpBase { - SoftRelu(float alpha = 1.f) : alpha_(alpha){}; + __host__ __device__ SoftRelu(float alpha = 1.0f) : alpha_(alpha) {} - template - __host__ __device__ void operator()(T& y, const T& x) const + __host__ __device__ float get_alpha() const { return alpha_; } + + const float alpha_; + + __host__ __device__ inline void operator()(float& y, const float& x) const final { - static_assert(is_same::value || is_same::value || - is_same::value || is_same::value || - is_same::value, - "Data type is not supported by this operation!"); - T casted_alpha = type_convert(alpha_); - constexpr T one = type_convert(1); - y = ck::math::log(one + ck::math::exp(x * casted_alpha)) / casted_alpha; + float casted_alpha = type_convert(alpha_); + constexpr float one = type_convert(1); + y = ck::math::log(one + ck::math::exp(x * casted_alpha)) / casted_alpha; + } + + __host__ __device__ inline void operator()(double& y, const double& x) const final + { + double casted_alpha = type_convert(alpha_); + constexpr double one = type_convert(1); + y = ck::math::log(one + ck::math::exp(x * casted_alpha)) / casted_alpha; + } + + __host__ __device__ inline void operator()(int32_t& y, const int32_t& x) const final + { + int32_t casted_alpha = type_convert(alpha_); + constexpr int32_t one = type_convert(1); + y = ck::math::log(one + ck::math::exp(x * casted_alpha)) / casted_alpha; + } + + __host__ __device__ inline void operator()(int8_t& y, const 
int8_t& x) const final + { + int8_t casted_alpha = type_convert(alpha_); + constexpr int8_t one = type_convert(1); + y = ck::math::log(one + ck::math::exp(x * casted_alpha)) / casted_alpha; + } + + __host__ __device__ inline void operator()(half_t& y, const half_t& x) const final + { + half_t casted_alpha = type_convert(alpha_); + constexpr half_t one = type_convert(1); + y = ck::math::log(one + ck::math::exp(x * casted_alpha)) / casted_alpha; + } + + __host__ __device__ inline void operator()(bhalf_t& y, const bhalf_t& x) const final + { + bhalf_t casted_alpha = type_convert(alpha_); + constexpr bhalf_t one = type_convert(1); + y = ck::math::log(one + ck::math::exp(x * casted_alpha)) / casted_alpha; } - const float alpha_; }; -struct Power +struct Power : public UnaryOpBase { - Power(float alpha = 0.f, float beta = 1.f, float gamma = 2.f) - : alpha_(alpha), beta_(beta), gamma_(gamma){}; - - template - __host__ __device__ void operator()(T& y, const T& x) const + __host__ __device__ Power(float alpha = 0.f, float beta = 1.f, float gamma = 2.f) + : alpha_(alpha), beta_(beta), gamma_(gamma) { - static_assert(is_same::value || is_same::value || - is_same::value || is_same::value || - is_same::value, - "Data type is not supported by this operation!"); - T casted_alpha = type_convert(alpha_); - T casted_beta = type_convert(beta_); - T casted_gamma = type_convert(gamma_); - T shifted_scaled_x = casted_alpha + casted_beta * x; - y = ck::math::pow(shifted_scaled_x, casted_gamma); } + + __host__ __device__ float get_alpha() const { return alpha_; } + + __host__ __device__ float get_beta() const { return beta_; } + + __host__ __device__ float get_gamma() const { return gamma_; } + const float alpha_; const float beta_; const float gamma_; + + __host__ __device__ inline void operator()(float& y, const float& x) const final + { + float casted_alpha = type_convert(alpha_); + float casted_beta = type_convert(beta_); + float casted_gamma = type_convert(gamma_); + + float 
shifted_scaled_x = casted_alpha + casted_beta * x; + y = ck::math::pow(shifted_scaled_x, casted_gamma); + } + + __host__ __device__ inline void operator()(double& y, const double& x) const final + { + double casted_alpha = type_convert(alpha_); + double casted_beta = type_convert(beta_); + double casted_gamma = type_convert(gamma_); + + double shifted_scaled_x = casted_alpha + casted_beta * x; + y = ck::math::pow(shifted_scaled_x, casted_gamma); + } + + __host__ __device__ inline void operator()(int32_t& y, const int32_t& x) const final + { + int32_t casted_alpha = type_convert(alpha_); + int32_t casted_beta = type_convert(beta_); + int32_t casted_gamma = type_convert(gamma_); + + int32_t shifted_scaled_x = casted_alpha + casted_beta * x; + y = ck::math::pow(shifted_scaled_x, casted_gamma); + } + + __host__ __device__ inline void operator()(int8_t& y, const int8_t& x) const final + { + int8_t casted_alpha = type_convert(alpha_); + int8_t casted_beta = type_convert(beta_); + int8_t casted_gamma = type_convert(gamma_); + + int8_t shifted_scaled_x = casted_alpha + casted_beta * x; + y = ck::math::pow(shifted_scaled_x, casted_gamma); + } + + __host__ __device__ inline void operator()(half_t& y, const half_t& x) const final + { + half_t casted_alpha = type_convert(alpha_); + half_t casted_beta = type_convert(beta_); + half_t casted_gamma = type_convert(gamma_); + + half_t shifted_scaled_x = casted_alpha + casted_beta * x; + y = ck::math::pow(shifted_scaled_x, casted_gamma); + } + + __host__ __device__ inline void operator()(bhalf_t& y, const bhalf_t& x) const final + { + bhalf_t casted_alpha = type_convert(alpha_); + bhalf_t casted_beta = type_convert(beta_); + bhalf_t casted_gamma = type_convert(gamma_); + + bhalf_t shifted_scaled_x = casted_alpha + casted_beta * x; + y = ck::math::pow(shifted_scaled_x, casted_gamma); + } }; -struct ClippedRelu +struct ClippedRelu : public UnaryOpBase { - ClippedRelu(float alpha = 0.f, float beta = 1.f) : alpha_(alpha), beta_(beta){}; 
- - template - __host__ __device__ void operator()(T& y, const T& x) const + __host__ __device__ ClippedRelu(float alpha = 0.f, float beta = 1.f) + : alpha_(alpha), beta_(beta) { - static_assert(is_same::value || is_same::value || - is_same::value || is_same::value || - is_same::value, - "Data type is not supported by this operation!"); - T casted_alpha = type_convert(alpha_); - T casted_beta = type_convert(beta_); - y = ck::math::min(casted_beta, ck::math::max(casted_alpha, x)); } + + __host__ __device__ float get_alpha() const { return alpha_; } + + __host__ __device__ float get_beta() const { return beta_; } + const float alpha_; const float beta_; + + __host__ __device__ inline void operator()(float& y, const float& x) const final + { + float casted_alpha = type_convert(alpha_); + float casted_beta = type_convert(beta_); + y = ck::math::min(casted_beta, ck::math::max(casted_alpha, x)); + } + + __host__ __device__ inline void operator()(double& y, const double& x) const final + { + double casted_alpha = type_convert(alpha_); + double casted_beta = type_convert(beta_); + y = ck::math::min(casted_beta, ck::math::max(casted_alpha, x)); + } + + __host__ __device__ inline void operator()(int32_t& y, const int32_t& x) const final + { + int32_t casted_alpha = type_convert(alpha_); + int32_t casted_beta = type_convert(beta_); + y = ck::math::min(casted_beta, ck::math::max(casted_alpha, x)); + } + + __host__ __device__ inline void operator()(int8_t& y, const int8_t& x) const final + { + int8_t casted_alpha = type_convert(alpha_); + int8_t casted_beta = type_convert(beta_); + y = ck::math::min(casted_beta, ck::math::max(casted_alpha, x)); + } + + __host__ __device__ inline void operator()(half_t& y, const half_t& x) const final + { + half_t casted_alpha = type_convert(alpha_); + half_t casted_beta = type_convert(beta_); + y = ck::math::min(casted_beta, ck::math::max(casted_alpha, x)); + } + + __host__ __device__ inline void operator()(bhalf_t& y, const bhalf_t& x) const 
final + { + bhalf_t casted_alpha = type_convert(alpha_); + bhalf_t casted_beta = type_convert(beta_); + y = ck::math::min(casted_beta, ck::math::max(casted_alpha, x)); + } }; -struct LeakyRelu +struct LeakyRelu : public UnaryOpBase { - LeakyRelu(float alpha = 0.01f) : alpha_(alpha){}; - template - __host__ __device__ void operator()(T& y, const T& x) const + __host__ __device__ LeakyRelu(float alpha = 0.f) : alpha_(alpha) {} + + __host__ __device__ float get_alpha() const { return alpha_; } + + const float alpha_; + + __host__ __device__ inline void operator()(float& y, const float& x) const final + { + float casted_alpha = type_convert(alpha_); + y = x >= 0 ? x : x * casted_alpha; + } + + __host__ __device__ inline void operator()(double& y, const double& x) const final + { + double casted_alpha = type_convert(alpha_); + y = x >= 0 ? x : x * casted_alpha; + } + + __host__ __device__ inline void operator()(int32_t& y, const int32_t& x) const final + { + int32_t casted_alpha = type_convert(alpha_); + y = x >= 0 ? x : x * casted_alpha; + } + + __host__ __device__ inline void operator()(int8_t& y, const int8_t& x) const final + { + int8_t casted_alpha = type_convert(alpha_); + y = x >= 0 ? x : x * casted_alpha; + } + + __host__ __device__ inline void operator()(half_t& y, const half_t& x) const final + { + half_t casted_alpha = type_convert(alpha_); + y = x >= 0 ? x : x * casted_alpha; + } + + __host__ __device__ inline void operator()([[maybe_unused]] bhalf_t& y, + [[maybe_unused]] const bhalf_t& x) const final { - static_assert(is_same::value || is_same::value || - is_same::value || is_same::value || - is_same::value, - "Data type is not supported by this operation!"); - T casted_alpha = type_convert(alpha_); - y = x >= 0 ? 
x : x * casted_alpha; } - const float alpha_; }; -struct Elu +struct Elu : public UnaryOpBase { - Elu(float alpha = 1.f) : alpha_(alpha){}; - template - __host__ __device__ void operator()(T& y, const T& x) const + __host__ __device__ Elu(float alpha = 1.f) : alpha_(alpha) {} + + __host__ __device__ float get_alpha() const { return alpha_; } + + const float alpha_; + + __host__ __device__ inline void operator()(float& y, const float& x) const final { - static_assert(is_same::value || is_same::value || - is_same::value || is_same::value || - is_same::value, - "Data type is not supported by this operation!"); - T casted_alpha = type_convert(alpha_); - y = x > 0 ? x : casted_alpha * ck::math::expm1(x); + float casted_alpha = type_convert(alpha_); + y = x > 0 ? x : casted_alpha * ck::math::expm1(x); + } + + __host__ __device__ inline void operator()(double& y, const double& x) const final + { + double casted_alpha = type_convert(alpha_); + y = x > 0 ? x : casted_alpha * ck::math::expm1(x); + } + + __host__ __device__ inline void operator()(int32_t& y, const int32_t& x) const final + { + int32_t casted_alpha = type_convert(alpha_); + y = x > 0 ? x : casted_alpha * ck::math::expm1(x); + } + + __host__ __device__ inline void operator()(int8_t& y, const int8_t& x) const final + { + int8_t casted_alpha = type_convert(alpha_); + y = x > 0 ? x : casted_alpha * ck::math::expm1(x); + } + + __host__ __device__ inline void operator()(half_t& y, const half_t& x) const final + { + half_t casted_alpha = type_convert(alpha_); + y = x > 0 ? x : casted_alpha * ck::math::expm1(x); + } + + __host__ __device__ inline void operator()(bhalf_t& y, const bhalf_t& x) const final + { + bhalf_t casted_alpha = type_convert(alpha_); + y = x > 0 ? 
x : casted_alpha * ck::math::expm1(x); } - const float alpha_; }; -struct Logistic +struct Logistic : public UnaryOpBase { - Logistic(float alpha = 1.f) : alpha_(alpha){}; - template - __host__ __device__ void operator()(T& y, const T& x) const + __host__ __device__ Logistic(float alpha = 1.0f) : alpha_(alpha) {} + + __host__ __device__ float get_alpha() const { return alpha_; } + + const float alpha_; + + __host__ __device__ inline void operator()(float& y, const float& x) const final { - static_assert(is_same::value || is_same::value || - is_same::value || is_same::value || - is_same::value, - "Data type is not supported by this operation!"); - T casted_alpha = type_convert(alpha_); - constexpr T one = type_convert(1); - y = casted_alpha / (one + ck::math::exp(-x) * casted_alpha); + float casted_alpha = type_convert(alpha_); + constexpr float one = type_convert(1); + y = casted_alpha / (one + ck::math::exp(-x) * casted_alpha); + } + + __host__ __device__ inline void operator()(double& y, const double& x) const final + { + double casted_alpha = type_convert(alpha_); + constexpr double one = type_convert(1); + y = casted_alpha / (one + ck::math::exp(-x) * casted_alpha); + } + + __host__ __device__ inline void operator()(int32_t& y, const int32_t& x) const final + { + int32_t casted_alpha = type_convert(alpha_); + constexpr int32_t one = type_convert(1); + y = casted_alpha / (one + ck::math::exp(-x) * casted_alpha); + } + + __host__ __device__ inline void operator()(int8_t& y, const int8_t& x) const final + { + int8_t casted_alpha = type_convert(alpha_); + constexpr int8_t one = type_convert(1); + y = casted_alpha / (one + ck::math::exp(-x) * casted_alpha); + } + + __host__ __device__ inline void operator()(half_t& y, const half_t& x) const final + { + half_t casted_alpha = type_convert(alpha_); + constexpr half_t one = type_convert(1); + y = casted_alpha / (one + ck::math::exp(-x) * casted_alpha); + } + + __host__ __device__ inline void operator()(bhalf_t& y, const 
bhalf_t& x) const final + { + bhalf_t casted_alpha = type_convert(alpha_); + constexpr bhalf_t one = type_convert(1); + y = casted_alpha / (one + ck::math::exp(-x) * casted_alpha); } - const float alpha_; }; struct ConvInvscale @@ -1074,7 +1410,7 @@ struct ConvScaleRelu __host__ __device__ void operator()(f8_t& e, const float& c) const { float x; - Relu{}.template operator()(x, c * scale_in_ * scale_wei_); + Relu{}(x, c * scale_in_ * scale_wei_); e = type_convert(x * scale_out_); }; @@ -1153,6 +1489,239 @@ struct FastNumericArrayConverter __device__ OutputArray operator()(InputArray const& Input) { return convert(Input); } }; +struct DynamicUnaryOp +{ + + DynamicUnaryOp& operator=(const DynamicUnaryOp& other) + { + if(this != &other) + { + unary_op_ptr_ = other.unary_op_ptr_; + unary_op_type_ = other.unary_op_type_; + } + return *this; + } + + __host__ __device__ DynamicUnaryOp() = delete; + + __host__ __device__ DynamicUnaryOp(const Swish& swish) + { + unary_op_type_ = UnaryOpType::Swish; + beta = swish.get_beta(); + } + + __host__ __device__ DynamicUnaryOp(const Swish&& swish) + { + unary_op_type_ = UnaryOpType::Swish; + beta = swish.get_beta(); + } + + __host__ __device__ DynamicUnaryOp(const Sigmoid&) { unary_op_type_ = UnaryOpType::Sigmoid; } + + __host__ __device__ DynamicUnaryOp(const Sigmoid&&) { unary_op_type_ = UnaryOpType::Sigmoid; } + + __host__ __device__ DynamicUnaryOp(const PassThrough&) + { + unary_op_type_ = UnaryOpType::PassThrough; + } + + __host__ __device__ DynamicUnaryOp(const PassThrough&&) + { + unary_op_type_ = UnaryOpType::PassThrough; + } + + __host__ __device__ DynamicUnaryOp(const Logistic& logistic) + { + unary_op_type_ = UnaryOpType::Logistic; + alpha = logistic.get_alpha(); + } + + __host__ __device__ DynamicUnaryOp(const Logistic&& logistic) + { + unary_op_type_ = UnaryOpType::Logistic; + alpha = logistic.get_alpha(); + } + + __host__ __device__ DynamicUnaryOp(const TanH&) { unary_op_type_ = UnaryOpType::TanH; } + + __host__ 
__device__ DynamicUnaryOp(const TanH&&) { unary_op_type_ = UnaryOpType::TanH; } + + __host__ __device__ DynamicUnaryOp(const Relu&) { unary_op_type_ = UnaryOpType::Relu; } + + __host__ __device__ DynamicUnaryOp(const Relu&&) { unary_op_type_ = UnaryOpType::Relu; } + + __host__ __device__ DynamicUnaryOp(const SoftRelu& softrelu) + { + unary_op_type_ = UnaryOpType::SoftRelu; + alpha = softrelu.get_alpha(); + } + + __host__ __device__ DynamicUnaryOp(const SoftRelu&& softrelu) + { + unary_op_type_ = UnaryOpType::SoftRelu; + alpha = softrelu.get_alpha(); + } + + __host__ __device__ DynamicUnaryOp(const UnaryAbs&) { unary_op_type_ = UnaryOpType::UnaryAbs; } + + __host__ __device__ DynamicUnaryOp(const UnaryAbs&&) { unary_op_type_ = UnaryOpType::UnaryAbs; } + + __host__ __device__ DynamicUnaryOp(const Power& pow) + { + unary_op_type_ = UnaryOpType::Power; + alpha = pow.get_alpha(); + beta = pow.get_beta(); + gamma = pow.get_gamma(); + } + + __host__ __device__ DynamicUnaryOp(const Power&& pow) + { + unary_op_type_ = UnaryOpType::Power; + alpha = pow.get_alpha(); + beta = pow.get_beta(); + gamma = pow.get_gamma(); + } + + __host__ __device__ DynamicUnaryOp(const ClippedRelu& clippedrelu) + { + unary_op_type_ = UnaryOpType::ClippedRelu; + alpha = clippedrelu.get_alpha(); + beta = clippedrelu.get_beta(); + } + + __host__ __device__ DynamicUnaryOp(const ClippedRelu&& clippedrelu) + { + unary_op_type_ = UnaryOpType::ClippedRelu; + alpha = clippedrelu.get_alpha(); + beta = clippedrelu.get_beta(); + } + + __host__ __device__ DynamicUnaryOp(const LeakyRelu& leakyrelu) + { + unary_op_type_ = UnaryOpType::LeakyRelu; + alpha = leakyrelu.get_alpha(); + } + + __host__ __device__ DynamicUnaryOp(const LeakyRelu&& leakyrelu) + { + unary_op_type_ = UnaryOpType::LeakyRelu; + alpha = leakyrelu.get_alpha(); + } + + __host__ __device__ DynamicUnaryOp(const Elu& elu) + { + unary_op_type_ = UnaryOpType::Elu; + alpha = elu.get_alpha(); + } + + __host__ __device__ DynamicUnaryOp(const Elu&& elu) 
+ { + unary_op_type_ = UnaryOpType::Elu; + alpha = elu.get_alpha(); + } + + __host__ __device__ DynamicUnaryOp(const DynamicUnaryOp& dynamic_op) + : unary_op_type_(dynamic_op.unary_op_type_), + unary_op_ptr_(dynamic_op.unary_op_ptr_), + alpha(dynamic_op.alpha), + beta(dynamic_op.beta), + gamma(dynamic_op.gamma) + { + } + + __host__ __device__ ~DynamicUnaryOp() + { + if(unary_op_ptr_) + delete unary_op_ptr_; + } + + __device__ void InitUnaryOpPtrOnDevice() + { + switch(unary_op_type_) + { + case(UnaryOpType::Swish): unary_op_ptr_ = new Swish(beta); break; + case(UnaryOpType::Sigmoid): unary_op_ptr_ = new Sigmoid; break; + case(UnaryOpType::PassThrough): unary_op_ptr_ = new PassThrough; break; + case(UnaryOpType::Logistic): unary_op_ptr_ = new Logistic(alpha); break; + case(UnaryOpType::TanH): unary_op_ptr_ = new TanH; break; + case(UnaryOpType::Relu): unary_op_ptr_ = new Relu; break; + case(UnaryOpType::SoftRelu): unary_op_ptr_ = new SoftRelu(alpha); break; + case(UnaryOpType::UnaryAbs): unary_op_ptr_ = new UnaryAbs; break; + case(UnaryOpType::Power): unary_op_ptr_ = new Power(alpha, beta, gamma); break; + case(UnaryOpType::ClippedRelu): unary_op_ptr_ = new ClippedRelu(alpha, beta); break; + case(UnaryOpType::LeakyRelu): unary_op_ptr_ = new LeakyRelu(alpha); break; + case(UnaryOpType::Elu): unary_op_ptr_ = new Elu(alpha); break; + + default: unary_op_ptr_ = nullptr; break; + } + } + + template + __device__ void operator()(Y& y, const X& x) const + { + isSupported(); + unary_op_ptr_->operator()(y, x); + } + + template + __host__ void operator()(Y& y, const X& x) const + { + isSupported(); + switch(unary_op_type_) + { + case(UnaryOpType::Swish): Swish{}.operator()(y, x); break; + case(UnaryOpType::Sigmoid): Sigmoid{}.operator()(y, x); break; + case(UnaryOpType::PassThrough): PassThrough{}.operator()(y, x); break; + case(UnaryOpType::Logistic): Logistic{}.operator()(y, x); break; + case(UnaryOpType::TanH): TanH{}.operator()(y, x); break; + case(UnaryOpType::Relu): 
Relu{}.operator()(y, x); break; + case(UnaryOpType::SoftRelu): SoftRelu{}.operator()(y, x); break; + case(UnaryOpType::UnaryAbs): UnaryAbs{}.operator()(y, x); break; + case(UnaryOpType::Power): Power{}.operator()(y, x); break; + case(UnaryOpType::ClippedRelu): ClippedRelu{}.operator()(y, x); break; + case(UnaryOpType::LeakyRelu): LeakyRelu{}.operator()(y, x); break; + case(UnaryOpType::Elu): Elu{}.operator()(y, x); break; + default: break; + } + } + + template + __device__ __host__ constexpr void isSupported() const + { + + static_assert(std::is_same::value, "X and Y must be of the same type"); + + static_assert(is_same::value || is_same::value || + is_same::value || is_same::value || + is_same::value || is_same::value, + "Data type is not supported by this operation!"); + } + + private: + enum class UnaryOpType + { + Swish, + Sigmoid, + PassThrough, + Logistic, + TanH, + Relu, + SoftRelu, + UnaryAbs, + Power, + ClippedRelu, + LeakyRelu, + Elu + }; + + public: + UnaryOpType unary_op_type_; + UnaryOpBase* unary_op_ptr_ = nullptr; + float alpha; + float beta; + float gamma; +}; + } // namespace element_wise } // namespace tensor_operation } // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_dynamic_op_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_dynamic_op_instance.hpp new file mode 100644 index 000000000..9db675a51 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_dynamic_op_instance.hpp @@ -0,0 +1,179 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using BF16 = ck::bhalf_t; +using F16 = ck::half_t; +using F32 = float; + +template +using S = ck::Sequence; + +using namespace ck::tensor_layout::convolution; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using DynamicUnaryOp = ck::tensor_operation::element_wise::DynamicUnaryOp; + +static constexpr auto ConvFwdDefault = + ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; + +static constexpr auto ConvFwd1x1P0 = ConvolutionForwardSpecialization::Filter1x1Pad0; + +static constexpr auto ConvFwd1x1S1P0 = ConvolutionForwardSpecialization::Filter1x1Stride1Pad0; + +static constexpr auto ConvFwdOddC = + ck::tensor_operation::device::ConvolutionForwardSpecialization::OddC; + +static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding; + +template +using device_grouped_conv_fwd_xdl_dynamic_op_bf16_instances = std::tuple< + // clang-format off + //########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //########################################| Spatial| Layout| Layout| Layout| Layout| Type| Type| Type| 
DataType| DataType| Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + // generic instance + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, BF16, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 64, 64, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>, + // instances for small conv.K and conv.C + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, BF16, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, BF16, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, BF16, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 
8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, BF16, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, BF16, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, BF16, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, BF16, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, BF16, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, BF16, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 64, 64, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, BF16, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 
0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, BF16, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, BF16, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, BF16, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, BF16, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, BF16, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8> + // clang-format on + >; + +template +using device_grouped_conv_fwd_xdl_dynamic_op_f16_instances = std::tuple< + // clang-format off + //########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| 
BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //########################################| Spatial| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| DataType| Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + // generic instance + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, F16, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 64, 64, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>, + // instances for small conv.K and conv.C + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, F16, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, F16, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 64, 1>, S<1, 0, 
2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, F16, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, F16, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, F16, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, F16, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, F16, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, F16, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, F16, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 64, 64, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, 
S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, F16, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, F16, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, F16, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, F16, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, F16, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, F16, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8> + // clang-format on + >; + +template +using device_grouped_conv_fwd_xdl_dynamic_op_f32_instances = std::tuple< + // clang-format off + //########################################| NumDim| A| B| Ds| E| AData| BData| AccData| 
CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //########################################| Spatial| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| DataType| Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + // generic instance + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, F32, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 64, 64, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, 1, 1, S<1, 8, 1, 8>, 1>, + // instances for small conv.K and conv.C + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, F32, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 64, 64, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 
8>, 1>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, F32, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, F32, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 256, 256, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, F32, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 256, 128, 256, 16, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, F32, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 128, 128, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, F32, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, F32, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 128, 128, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, F32, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 128, 64, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 
1, S<1, 8, 1, 16>, 4>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, F32, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 64, 64, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, F32, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 256, 128, 64, 16, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, F32, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, F32, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 128, 128, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, F32, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 128, 32, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, F32, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 64, 64, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, F32, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 64, 32, 64, 16, 4, 4, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, 
S<1, 8, 1, 8>, 4> + // clang-format on + >; + +template +using device_grouped_conv_fwd_xdl_dynamic_op_int8_instances = std::tuple< + // clang-format off + //########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //########################################| Spatial| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| DataType| Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + // generic instance + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, int8_t, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 64, 64, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>, + // instances for small conv.K and conv.C + 
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, int8_t, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, int8_t, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, int8_t, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, int8_t, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, int8_t, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, int8_t, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, int8_t, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 
1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, int8_t, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, int8_t, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 64, 64, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, int8_t, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, int8_t, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, int8_t, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, int8_t, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, int8_t, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 
0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, int8_t, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8> + // clang-format on + >; + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_dynamic_op.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_dynamic_op.hpp new file mode 100644 index 000000000..5efee69b2 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_dynamic_op.hpp @@ -0,0 +1,278 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dynamic.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using DynamicUnaryOp = ck::tensor_operation::element_wise::DynamicUnaryOp; + +#ifdef CK_ENABLE_BF16 +// grouped conv2d forward, NHWGC/GKYXC/NHWGK +void add_device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_bf16_instances( + std::vector, + NHWGK, + BF16, + BF16, + ck::Tuple<>, + BF16, + PassThrough, + PassThrough, + DynamicUnaryOp>>>& instances); +#endif + +#ifdef CK_ENABLE_FP16 +void add_device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_f16_instances( + std::vector, + NHWGK, + F16, + 
F16, + ck::Tuple<>, + F16, + PassThrough, + PassThrough, + DynamicUnaryOp>>>& instances); +#endif + +#ifdef CK_ENABLE_FP32 +void add_device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_f32_instances( + std::vector, + NHWGK, + F32, + F32, + ck::Tuple<>, + F32, + PassThrough, + PassThrough, + DynamicUnaryOp>>>& instances); +#endif + +#ifdef CK_ENABLE_INT8 +void add_device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_int8_instances( + std::vector, + NHWGK, + int8_t, + int8_t, + ck::Tuple<>, + int8_t, + PassThrough, + PassThrough, + DynamicUnaryOp>>>& instances); +#endif +#ifdef CK_ENABLE_BF16 +// grouped conv3d forward, NDHWGC/GKZYXC/NDHWGK +void add_device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_bf16_instances( + std::vector, + NDHWGK, + BF16, + BF16, + ck::Tuple<>, + BF16, + PassThrough, + PassThrough, + DynamicUnaryOp>>>& instances); +#endif + +#ifdef CK_ENABLE_FP16 +void add_device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_f16_instances( + std::vector, + NDHWGK, + F16, + F16, + ck::Tuple<>, + F16, + PassThrough, + PassThrough, + DynamicUnaryOp>>>& instances); +#endif + +#ifdef CK_ENABLE_FP32 +void add_device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_f32_instances( + std::vector, + NDHWGK, + F32, + F32, + ck::Tuple<>, + F32, + PassThrough, + PassThrough, + DynamicUnaryOp>>>& instances); +#endif + +#ifdef CK_ENABLE_INT8 +void add_device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_int8_instances( + std::vector, + NDHWGK, + int8_t, + int8_t, + ck::Tuple<>, + int8_t, + PassThrough, + PassThrough, + DynamicUnaryOp>>>& instances); +#endif + +template +struct DeviceOperationInstanceFactory> +{ + using DeviceOp = + DeviceGroupedConvFwdMultipleABD; + + static auto GetInstances() + { + std::vector> op_ptrs; + if constexpr(NumDimSpatial == 3 && is_same_v && + is_same_v && is_same_v && + DLayouts::Size() == 0) + { +#ifdef CK_ENABLE_FP32 + if constexpr(is_same_v && is_same_v && + is_same_v) + { + 
add_device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_f32_instances( + op_ptrs); + } +#endif +#ifdef CK_ENABLE_FP16 + if constexpr(is_same_v && is_same_v && + is_same_v && is_same_v) + { + add_device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_f16_instances( + op_ptrs); + } +#endif +#ifdef CK_ENABLE_BF16 + if constexpr(is_same_v && + is_same_v && is_same_v) + { + add_device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_bf16_instances( + op_ptrs); + } +#endif +#ifdef CK_ENABLE_INT8 + if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_int8_instances( + op_ptrs); + } +#endif + } + else if constexpr(NumDimSpatial == 2 && is_same_v && + is_same_v && is_same_v && + DLayouts::Size() == 0) + { +#ifdef CK_ENABLE_FP32 + if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_f32_instances( + op_ptrs); + } +#endif +#ifdef CK_ENABLE_FP16 + if constexpr(is_same_v && is_same_v && + is_same_v && is_same_v) + { + add_device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_f16_instances( + op_ptrs); + } +#endif +#ifdef CK_ENABLE_BF16 + if constexpr(is_same_v && + is_same_v && is_same_v) + { + add_device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_bf16_instances( + op_ptrs); + } +#endif +#ifdef CK_ENABLE_INT8 + if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_int8_instances( + op_ptrs); + } +#endif + } + + return op_ptrs; + } +}; + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_dynamic_op/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_dynamic_op/CMakeLists.txt new file mode 100644 index 000000000..92735fcae --- /dev/null +++ 
b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_dynamic_op/CMakeLists.txt @@ -0,0 +1,8 @@ +# ONLY XDL_KERNELS +set(GROUPED_CONV2D_FWD_DYNAMIC_OP + xdl/device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_bf16_instance.cpp + xdl/device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_f16_instance.cpp + xdl/device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_f32_instance.cpp + xdl/device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_int8_instance.cpp) + +add_instance_library(device_grouped_conv2d_fwd_dynamic_op_instance ${GROUPED_CONV2D_FWD_DYNAMIC_OP}) diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_dynamic_op/xdl/device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_dynamic_op/xdl/device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_bf16_instance.cpp new file mode 100644 index 000000000..853470e1c --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_dynamic_op/xdl/device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_bf16_instance.cpp @@ -0,0 +1,55 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_dynamic_op_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_bf16_instances( + std::vector, + NHWGK, + BF16, + BF16, + ck::Tuple<>, + BF16, + PassThrough, + PassThrough, + DynamicUnaryOp>>>& instances) +{ + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_dynamic_op_bf16_instances<2, + NHWGC, + GKYXC, + Tuple<>, + NHWGK, + ConvFwdDefault>{}); + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_dynamic_op_bf16_instances<2, + NHWGC, + GKYXC, + Tuple<>, + NHWGK, + ConvFwd1x1P0>{}); + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_dynamic_op_bf16_instances<2, + NHWGC, + GKYXC, + Tuple<>, + NHWGK, + ConvFwd1x1S1P0>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_dynamic_op/xdl/device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_dynamic_op/xdl/device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_f16_instance.cpp new file mode 100644 index 000000000..725b9ca0d --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_dynamic_op/xdl/device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_f16_instance.cpp @@ -0,0 +1,55 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_dynamic_op_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_f16_instances( + std::vector, + NHWGK, + F16, + F16, + ck::Tuple<>, + F16, + PassThrough, + PassThrough, + DynamicUnaryOp>>>& instances) +{ + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_dynamic_op_f16_instances<2, + NHWGC, + GKYXC, + Tuple<>, + NHWGK, + ConvFwdDefault>{}); + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_dynamic_op_f16_instances<2, + NHWGC, + GKYXC, + Tuple<>, + NHWGK, + ConvFwd1x1P0>{}); + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_dynamic_op_f16_instances<2, + NHWGC, + GKYXC, + Tuple<>, + NHWGK, + ConvFwd1x1S1P0>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_dynamic_op/xdl/device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_dynamic_op/xdl/device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_f32_instance.cpp new file mode 100644 index 000000000..fbd5fe370 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_dynamic_op/xdl/device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_f32_instance.cpp @@ -0,0 +1,55 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_dynamic_op_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_f32_instances( + std::vector, + NHWGK, + F32, + F32, + ck::Tuple<>, + F32, + PassThrough, + PassThrough, + DynamicUnaryOp>>>& instances) +{ + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_dynamic_op_f32_instances<2, + NHWGC, + GKYXC, + Tuple<>, + NHWGK, + ConvFwdDefault>{}); + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_dynamic_op_f32_instances<2, + NHWGC, + GKYXC, + Tuple<>, + NHWGK, + ConvFwd1x1P0>{}); + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_dynamic_op_f32_instances<2, + NHWGC, + GKYXC, + Tuple<>, + NHWGK, + ConvFwd1x1S1P0>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_dynamic_op/xdl/device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_dynamic_op/xdl/device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_int8_instance.cpp new file mode 100644 index 000000000..6bfc29537 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_dynamic_op/xdl/device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_int8_instance.cpp @@ -0,0 +1,54 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_dynamic_op_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { +void add_device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_int8_instances( + std::vector, + NHWGK, + int8_t, + int8_t, + ck::Tuple<>, + int8_t, + PassThrough, + PassThrough, + DynamicUnaryOp>>>& instances) +{ + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_dynamic_op_int8_instances<2, + NHWGC, + GKYXC, + Tuple<>, + NHWGK, + ConvFwdDefault>{}); + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_dynamic_op_int8_instances<2, + NHWGC, + GKYXC, + Tuple<>, + NHWGK, + ConvFwd1x1P0>{}); + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_dynamic_op_int8_instances<2, + NHWGC, + GKYXC, + Tuple<>, + NHWGK, + ConvFwd1x1S1P0>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_dynamic_op/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_dynamic_op/CMakeLists.txt new file mode 100644 index 000000000..3b8ebbffd --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_dynamic_op/CMakeLists.txt @@ -0,0 +1,8 @@ +# ONLY XDL_KERNELS +set(GROUPED_CONV3D_FWD_DYNAMIC_OP + xdl/device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp + xdl/device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp + xdl/device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp + xdl/device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_int8_instance.cpp) + +add_instance_library(device_grouped_conv3d_fwd_dynamic_op_instance ${GROUPED_CONV3D_FWD_DYNAMIC_OP}) diff --git 
a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_dynamic_op/xdl/device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_dynamic_op/xdl/device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp new file mode 100644 index 000000000..249dfaa4d --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_dynamic_op/xdl/device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp @@ -0,0 +1,55 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_dynamic_op_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_bf16_instances( + std::vector, + NDHWGK, + BF16, + BF16, + ck::Tuple<>, + BF16, + PassThrough, + PassThrough, + DynamicUnaryOp>>>& instances) +{ + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_dynamic_op_bf16_instances<3, + NDHWGC, + GKZYXC, + Tuple<>, + NDHWGK, + ConvFwdDefault>{}); + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_dynamic_op_bf16_instances<3, + NDHWGC, + GKZYXC, + Tuple<>, + NDHWGK, + ConvFwd1x1P0>{}); + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_dynamic_op_bf16_instances<3, + NDHWGC, + GKZYXC, + Tuple<>, + NDHWGK, + ConvFwd1x1S1P0>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_dynamic_op/xdl/device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp 
b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_dynamic_op/xdl/device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp new file mode 100644 index 000000000..75c4ddc35 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_dynamic_op/xdl/device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp @@ -0,0 +1,55 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_dynamic_op_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_f16_instances( + std::vector, + NDHWGK, + F16, + F16, + ck::Tuple<>, + F16, + PassThrough, + PassThrough, + DynamicUnaryOp>>>& instances) +{ + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_dynamic_op_f16_instances<3, + NDHWGC, + GKZYXC, + Tuple<>, + NDHWGK, + ConvFwdDefault>{}); + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_dynamic_op_f16_instances<3, + NDHWGC, + GKZYXC, + Tuple<>, + NDHWGK, + ConvFwd1x1P0>{}); + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_dynamic_op_f16_instances<3, + NDHWGC, + GKZYXC, + Tuple<>, + NDHWGK, + ConvFwd1x1S1P0>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_dynamic_op/xdl/device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_dynamic_op/xdl/device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp new file mode 100644 index 000000000..2e237e07b --- 
/dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_dynamic_op/xdl/device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp @@ -0,0 +1,55 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_dynamic_op_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_f32_instances( + std::vector, + NDHWGK, + F32, + F32, + ck::Tuple<>, + F32, + PassThrough, + PassThrough, + DynamicUnaryOp>>>& instances) +{ + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_dynamic_op_f32_instances<3, + NDHWGC, + GKZYXC, + Tuple<>, + NDHWGK, + ConvFwdDefault>{}); + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_dynamic_op_f32_instances<3, + NDHWGC, + GKZYXC, + Tuple<>, + NDHWGK, + ConvFwd1x1P0>{}); + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_dynamic_op_f32_instances<3, + NDHWGC, + GKZYXC, + Tuple<>, + NDHWGK, + ConvFwd1x1S1P0>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_dynamic_op/xdl/device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_dynamic_op/xdl/device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_int8_instance.cpp new file mode 100644 index 000000000..e38f1acbd --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_dynamic_op/xdl/device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_int8_instance.cpp @@ -0,0 +1,54 @@ +// 
SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_dynamic_op_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { +void add_device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_int8_instances( + std::vector, + NDHWGK, + int8_t, + int8_t, + ck::Tuple<>, + int8_t, + PassThrough, + PassThrough, + DynamicUnaryOp>>>& instances) +{ + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_dynamic_op_int8_instances<3, + NDHWGC, + GKZYXC, + Tuple<>, + NDHWGK, + ConvFwdDefault>{}); + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_dynamic_op_int8_instances<3, + NDHWGC, + GKZYXC, + Tuple<>, + NDHWGK, + ConvFwd1x1P0>{}); + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_dynamic_op_int8_instances<3, + NDHWGC, + GKZYXC, + Tuple<>, + NDHWGK, + ConvFwd1x1S1P0>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck -- GitLab From b098b71b05e4c06310f2e74056282a796f3cfd13 Mon Sep 17 00:00:00 2001 From: carlushuang Date: Sat, 26 Oct 2024 23:52:49 +0800 Subject: [PATCH 020/153] topk_softmax (#1592) * topk_softmax * remove some file * fix atomix linear_offset * address various comment, and change sfc get_index api to static(tuple) --- .../ck_tile/09_topk_softmax/CMakeLists.txt | 8 + example/ck_tile/09_topk_softmax/README.md | 28 + .../09_topk_softmax/script/smoke_test.sh | 22 + .../ck_tile/09_topk_softmax/topk_softmax.cpp | 299 +++++ .../09_topk_softmax/topk_softmax_api.cpp | 96 ++ .../09_topk_softmax/topk_softmax_api.hpp | 21 + example/ck_tile/CMakeLists.txt | 2 + include/ck_tile/core.hpp | 1 + .../core/algorithm/space_filling_curve.hpp | 12 +- 
.../core/arch/amd_buffer_addressing.hpp | 213 ++- include/ck_tile/core/config.hpp | 18 + include/ck_tile/core/container/tuple.hpp | 34 +- include/ck_tile/core/numeric/math.hpp | 972 +++++++++++++- include/ck_tile/core/tensor/buffer_view.hpp | 178 ++- include/ck_tile/core/tensor/load_tile.hpp | 60 +- include/ck_tile/core/tensor/shuffle_tile.hpp | 2 +- include/ck_tile/core/tensor/store_tile.hpp | 31 +- include/ck_tile/core/tensor/tensor_view.hpp | 217 ++- include/ck_tile/core/tensor/tile_window.hpp | 210 ++- .../core/tensor/tile_window_linear.hpp | 1082 +++++++++++++++ include/ck_tile/core/utility/magic_div.hpp | 27 +- include/ck_tile/host.hpp | 1 + include/ck_tile/host/fill.hpp | 68 + include/ck_tile/host/host_tensor.hpp | 23 + .../host/reference/reference_softmax.hpp | 80 +- .../ck_tile/host/reference/reference_topk.hpp | 124 ++ include/ck_tile/ops/elementwise.hpp | 7 + .../unary_element_wise_operation.hpp | 1163 +++++++++++++++++ .../block_fmha_pipeline_qr_ks_vs_async.hpp | 4 +- .../ck_tile/ops/reduce/block/block_reduce.hpp | 170 +++ include/ck_tile/ops/softmax.hpp | 8 + .../ops/softmax/block/block_softmax_2d.hpp | 81 ++ .../block/block_softmax_2d_problem.hpp | 16 + include/ck_tile/ops/topk.hpp | 8 + .../ops/topk/block/block_topk_stream_2d.hpp | 113 ++ .../block/block_topk_stream_2d_problem.hpp | 22 + include/ck_tile/ops/topk_softmax.hpp | 10 + .../kernel/topk_softmax_kernel.hpp | 166 +++ .../topk_softmax_warp_per_row_pipeline.hpp | 123 ++ .../topk_softmax_warp_per_row_policy.hpp | 63 + .../topk_softmax_warp_per_row_problem.hpp | 46 + 41 files changed, 5603 insertions(+), 226 deletions(-) create mode 100644 example/ck_tile/09_topk_softmax/CMakeLists.txt create mode 100644 example/ck_tile/09_topk_softmax/README.md create mode 100644 example/ck_tile/09_topk_softmax/script/smoke_test.sh create mode 100644 example/ck_tile/09_topk_softmax/topk_softmax.cpp create mode 100644 example/ck_tile/09_topk_softmax/topk_softmax_api.cpp create mode 100644 
example/ck_tile/09_topk_softmax/topk_softmax_api.hpp create mode 100644 include/ck_tile/core/tensor/tile_window_linear.hpp create mode 100644 include/ck_tile/host/reference/reference_topk.hpp create mode 100644 include/ck_tile/ops/elementwise.hpp create mode 100644 include/ck_tile/ops/elementwise/unary_element_wise_operation.hpp create mode 100644 include/ck_tile/ops/softmax.hpp create mode 100644 include/ck_tile/ops/softmax/block/block_softmax_2d.hpp create mode 100644 include/ck_tile/ops/softmax/block/block_softmax_2d_problem.hpp create mode 100644 include/ck_tile/ops/topk.hpp create mode 100644 include/ck_tile/ops/topk/block/block_topk_stream_2d.hpp create mode 100644 include/ck_tile/ops/topk/block/block_topk_stream_2d_problem.hpp create mode 100644 include/ck_tile/ops/topk_softmax.hpp create mode 100644 include/ck_tile/ops/topk_softmax/kernel/topk_softmax_kernel.hpp create mode 100644 include/ck_tile/ops/topk_softmax/pipeline/topk_softmax_warp_per_row_pipeline.hpp create mode 100644 include/ck_tile/ops/topk_softmax/pipeline/topk_softmax_warp_per_row_policy.hpp create mode 100644 include/ck_tile/ops/topk_softmax/pipeline/topk_softmax_warp_per_row_problem.hpp diff --git a/example/ck_tile/09_topk_softmax/CMakeLists.txt b/example/ck_tile/09_topk_softmax/CMakeLists.txt new file mode 100644 index 000000000..b43b98979 --- /dev/null +++ b/example/ck_tile/09_topk_softmax/CMakeLists.txt @@ -0,0 +1,8 @@ +add_executable(tile_example_topk_softmax EXCLUDE_FROM_ALL topk_softmax.cpp topk_softmax_api.cpp) +target_include_directories(tile_example_topk_softmax PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/) + +set(EXAMPLE_TOPK_SOFTMAX_COMPILE_OPTIONS) +# NOTE: we turn off undefined-func-template to let source compile without explicit declare function specializations +list(APPEND EXAMPLE_TOPK_SOFTMAX_COMPILE_OPTIONS -Wno-undefined-func-template -Wno-float-equal) +# list(APPEND EXAMPLE_TOPK_SOFTMAX_COMPILE_OPTIONS -v --save-temps -Wno-gnu-line-marker) 
+target_compile_options(tile_example_topk_softmax PRIVATE ${EXAMPLE_TOPK_SOFTMAX_COMPILE_OPTIONS}) diff --git a/example/ck_tile/09_topk_softmax/README.md b/example/ck_tile/09_topk_softmax/README.md new file mode 100644 index 000000000..104301290 --- /dev/null +++ b/example/ck_tile/09_topk_softmax/README.md @@ -0,0 +1,28 @@ +# topk-softmax + +This folder contains an example of the topk-softmax kernel implemented with ck_tile tile programming. This kernel is often used in MoE (mixture-of-experts) models, before launching the fused-moe-gemm block. The input is a `token*expert` 2d matrix. The op performs a softmax over each row (the `expert` dimension), then finds the `topk` largest values in each row. The output is a `token*topk` 2d weight tensor (usually fp32) and a `token*topk` 2d index tensor (int32). + +## build +``` +# in the root of ck_tile +mkdir build && cd build +sh ../script/cmake-ck-dev.sh ../ <arch> # replace <arch> with your target, e.g. gfx90a, gfx942... +make tile_example_topk_softmax -j +``` +This will result in an executable `build/bin/tile_example_topk_softmax` + +## example +``` +args: + -v weather do CPU validation or not (default:1) + -pr_i input data type. 
fp16/fp32 (representing 8/16/32 bit data) (default:fp16) + -pr_w output weight data type(currently only fp32 supported now) (default:fp32) + -t number of input tokens (default:32) + -e number of experts (default:8) + -k topk (default:2) + -st_i row stride of input, -1 means same as experts (default:-1) + -st_o row stride of output/indices, -1 means same as topk (default:-1) + -seed seed to be used, -1 means random every time (default:-1) + -kname when set to 1 it will print kernel name (default:0) + +``` diff --git a/example/ck_tile/09_topk_softmax/script/smoke_test.sh b/example/ck_tile/09_topk_softmax/script/smoke_test.sh new file mode 100644 index 000000000..646f5889f --- /dev/null +++ b/example/ck_tile/09_topk_softmax/script/smoke_test.sh @@ -0,0 +1,22 @@ +#!/bin/sh + +EXE=./build/bin/tile_example_topk_softmax + +for pr_i in "fp16" "bf16" ; do +$EXE -pr_i=$pr_i -t=80 -e=17 +$EXE -pr_i=$pr_i -t=111 -e=117 +$EXE -pr_i=$pr_i -t=1000 -e=55 +$EXE -pr_i=$pr_i -t=99 -e=180 +$EXE -pr_i=$pr_i -t=175 -e=64 -k=8 +$EXE -pr_i=$pr_i -t=65 -e=8 -k=2 +$EXE -pr_i=$pr_i -t=1 -e=25 +$EXE -pr_i=$pr_i -t=31 -e=19 -k=15 +$EXE -pr_i=$pr_i -t=81 -e=37 -k=7 +$EXE -pr_i=$pr_i -t=199 -e=128 -k=13 +$EXE -pr_i=$pr_i -t=23 -e=1 -k=1 +$EXE -pr_i=$pr_i -t=127 -e=99 -k=19 -st_i=233 -st_o=31 +$EXE -pr_i=$pr_i -t=71 -e=11 -k=11 -st_i=30 -st_o=12 +$EXE -pr_i=$pr_i -t=1 -e=1 -k=1 +$EXE -pr_i=$pr_i -t=99 -e=2 -k=1 -st_i=11 -st_o=5 +$EXE -pr_i=$pr_i -t=333 -e=99 -k=13 -st_i=191 -st_o=17 +done diff --git a/example/ck_tile/09_topk_softmax/topk_softmax.cpp b/example/ck_tile/09_topk_softmax/topk_softmax.cpp new file mode 100644 index 000000000..6fc25631f --- /dev/null +++ b/example/ck_tile/09_topk_softmax/topk_softmax.cpp @@ -0,0 +1,299 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ck_tile/core.hpp" +#include "ck_tile/ops/reduce.hpp" +#include "topk_softmax_api.hpp" + +#if 0 +template +void dump_host_tensor_2d(const ck_tile::HostTensor& x) +{ + auto len = x.get_lengths(); + assert(len.size() == 2); + std::cout << "["; + for(size_t i = 0; i < len[0]; i++) + { + std::cout << i << ": ["; + for(size_t j = 0; j < len[1]; j++) + { + if constexpr(std::is_same_v) + { + auto v = ck_tile::type_convert(x(i, j)); + + std::cout << v; + if(j != len[1] - 1) + std::cout << ","; + } + else + { + std::cout << x(i, j) << " "; + } + } + std::cout << "]"; + if(i != len[0] - 1) + std::cout << ","; + else + std::cout << "]"; + std::cout << std::endl; + } + std::cout << "--------------------" << std::endl; +} +#endif + +// CPU reference +template +auto reference_topk_softmax(const ck_tile::HostTensor& x, + ck_tile::index_t k, + ck_tile::index_t dim = -1, + bool largest = true, + bool sorted = true) +{ + using namespace ck_tile; + + auto y = reference_softmax(x, dim); + + auto [y_values, y_indices] = reference_topk(y, k, dim, largest, sorted); + + return ck_tile::make_tuple(y_values, y_indices); +} + +template +auto reference_topk_softmax(const ck_tile::HostTensor& x, + ck_tile::HostTensor& y_values, + ck_tile::HostTensor& y_indices, + ck_tile::index_t k, + ck_tile::index_t dim = -1, + bool largest = true, + bool sorted = true) +{ + using namespace ck_tile; + + auto y = reference_softmax(x, dim); + reference_topk(y, y_values, y_indices, k, dim, largest, sorted); +} + +// different threshold for different dtype +template +auto get_elimit(std::string /*init_method*/) +{ + double rtol = 1e-3; + double atol = 1e-3; + return ck_tile::make_tuple(rtol, atol); +} + +template <> +auto get_elimit(std::string /*init_method*/) +{ + double rtol = 1e-2; + double atol = 1e-2; + return ck_tile::make_tuple(rtol, atol); +} + +template <> +auto get_elimit(std::string init_method) +{ + 
if(init_method == "ui" || init_method == "ni") + { + unsigned max_rounding_point_distance = 0; + double atol = 2e-3; + return ck_tile::make_tuple(max_rounding_point_distance, atol); + } + else + { + unsigned max_rounding_point_distance = 1; + double atol = 0.0625; + return ck_tile::make_tuple(max_rounding_point_distance, atol); + } +} + +auto create_args(int argc, char* argv[]) +{ + ck_tile::ArgParser arg_parser; + arg_parser.insert("v", "1", "weather do CPU validation or not") + .insert("pr_i", "fp16", "input data type. fp16/fp32 (representing 8/16/32 bit data)") + .insert("pr_w", "fp32", "output weight data type(currently only fp32 supported now)") + .insert("t", "32", "number of input tokens") + .insert("e", "8", "number of experts") + .insert("k", "2", "topk") + .insert("st_i", "-1", "row stride of input, -1 means same as experts") + .insert("st_o", "-1", "row stride of output/indices, -1 means same as topk") + .insert("seed", "-1", "seed to be used, -1 means random every time") + .insert("kname", "0", "when set to 1 it will print kernel name") + .insert("warmup", "5", "number of iterations before benchmark the kernel") + .insert("repeat", "20", "number of iterations to benchmark the kernel"); + + bool result = arg_parser.parse(argc, argv); + return std::make_tuple(result, arg_parser); +} + +template +bool test_topk_softmax(ck_tile::ArgParser args) +{ + int validate = args.get_int("v"); + std::string input_prec = args.get_str("pr_i"); + std::string weight_prec = args.get_str("pr_w"); + int tokens = args.get_int("t"); + int experts = args.get_int("e"); + int topk = args.get_int("k"); + int seed = args.get_int("seed"); + int stride_input = args.get_int("st_i"); + int stride_output = args.get_int("st_o"); + int kname = args.get_int("kname"); + int warmup = args.get_int("warmup"); + int repeat = args.get_int("repeat"); + + if(stride_input < 0) + { + stride_input = experts; + } + if(stride_output < 0) + { + stride_output = topk; + } + assert(stride_input >= 
experts); + assert(stride_output >= topk); + + if(seed < 0) + { + seed = std::time(nullptr); + } + + if(topk > experts) + { + printf("topk:%d value should be smaller than, or equal to number of experts:%d\n", + topk, + experts); + return false; + } + + // tokens already considered batch size + ck_tile::HostTensor x_host({tokens, experts}, {stride_input, 1}); + ck_tile::HostTensor value_host({tokens, topk}, {stride_output, 1}); + ck_tile::HostTensor index_host({tokens, topk}, {stride_output, 1}); + + { + // random require per-row unique + auto rand_gen = ck_tile::FillUniformDistribution_Unique{ + -5.f, 5.f, static_cast(seed)}; + + for(int i_t = 0; i_t < tokens; i_t++) + { + ck_tile::HostTensor x_row({experts}); + rand_gen(x_row); + std::copy(x_row.begin(), x_row.end(), x_host.begin() + i_t * stride_input); + rand_gen.clear(); + } + } + + ck_tile::DeviceMem x_dev(x_host.get_element_space_size_in_bytes()); + ck_tile::DeviceMem value_dev(value_host.get_element_space_size_in_bytes()); + ck_tile::DeviceMem index_dev(index_host.get_element_space_size_in_bytes()); + + x_dev.ToDevice(x_host.data()); + + topk_softmax_trait trait{input_prec, weight_prec, experts}; + + topk_softmax_kargs karg{x_dev.GetDeviceBuffer(), + value_dev.GetDeviceBuffer(), + index_dev.GetDeviceBuffer(), + tokens, + experts, + topk, + stride_input, + stride_output}; + + ck_tile::stream_config sc{nullptr, + true, + /* log_level = */ (kname ? 
1 : 0), + warmup, + repeat}; + auto ms = topk_softmax(trait, karg, sc); + printf("[%s|%s]tokens:%d, experts:%d, topk:%d, st_i:%d, st_o:%d, ms:%f, ", + input_prec.c_str(), + weight_prec.c_str(), + tokens, + experts, + topk, + stride_input, + stride_output, + ms); + if(ms < 0) + printf("not supported\n"); + fflush(stdout); + if(ms < 0) + { + return false; + } + + value_dev.FromDevice(value_host.data()); + index_dev.FromDevice(index_host.data()); + + bool rtn = true; + if(validate) + { + ck_tile::HostTensor value_ref({tokens, topk}, {stride_output, 1}); + ck_tile::HostTensor index_ref({tokens, topk}, {stride_output, 1}); + + reference_topk_softmax( + x_host, value_ref, index_ref, topk); + + auto [rtol, atol] = get_elimit(""); + for(int i_t = 0; i_t < tokens; i_t++) + { + auto s_begin = std::vector{static_cast(i_t), static_cast(0)}; + auto s_end = + std::vector{static_cast(i_t + 1), static_cast(topk)}; + auto s_value_host = value_host.slice(s_begin, s_end); + auto s_value_ref = value_ref.slice(s_begin, s_end); + rtn &= ck_tile::check_err(s_value_host, + s_value_ref, + std::string("[") + std::to_string(i_t) + + std::string("] Value Error:"), + rtol, + atol); + auto s_index_host = index_host.slice(s_begin, s_end); + auto s_index_ref = index_ref.slice(s_begin, s_end); + rtn &= ck_tile::check_err(s_index_host, + s_index_ref, + std::string("[") + std::to_string(i_t) + + std::string("] Index Error:"), + rtol, + atol); + } + } + + printf("valid:%s\n", rtn ? "y" : "n"); + fflush(stdout); + return rtn; +} + +int main(int argc, char** argv) +{ + auto [result, args] = create_args(argc, argv); + if(!result) + return -1; + std::string input_prec = args.get_str("pr_i"); + std::string weight_prec = args.get_str("pr_w"); + + bool r = true; + if(input_prec.compare("fp16") == 0 && weight_prec.compare("fp32") == 0) + { + r &= test_topk_softmax(args); + } + else if(input_prec.compare("bf16") == 0 && weight_prec.compare("fp32") == 0) + { + r &= test_topk_softmax(args); + } + + return r ? 
0 : -1; +} diff --git a/example/ck_tile/09_topk_softmax/topk_softmax_api.cpp b/example/ck_tile/09_topk_softmax/topk_softmax_api.cpp new file mode 100644 index 000000000..249a307b8 --- /dev/null +++ b/example/ck_tile/09_topk_softmax/topk_softmax_api.cpp @@ -0,0 +1,96 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "topk_softmax_api.hpp" + +#define TOPK_SOFTMAX_DISPATCH(experts_) \ + constexpr ck_tile::index_t ts_experts = experts_; \ + using ts_problem = ck_tile:: \ + TopkSoftmaxWarpPerRowProblem; \ + using ts_pipeline = ck_tile::TopkSoftmaxWarpPerRowPipeline; \ + \ + using kernel = ck_tile::TopkSoftmaxKernel; \ + \ + auto kargs = kernel::MakeKargs(a); \ + \ + const dim3 grids = kernel::GridSize(a); \ + constexpr dim3 blocks = kernel::BlockSize(); \ + \ + float ave_time = ck_tile::launch_kernel( \ + s, ck_tile::make_kernel(kernel{}, grids, blocks, 0, kargs)); \ + \ + return ave_time; + +float topk_softmax(topk_softmax_trait t, topk_softmax_kargs a, ck_tile::stream_config s) +{ + if(t.input_type == "fp16" && t.weight_type == "fp32") + { + using ts_input_type = ck_tile::fp16_t; + using ts_weight_type = float; + using ts_index_type = ck_tile::index_t; +#if 1 + if(t.experts <= 8) + { + TOPK_SOFTMAX_DISPATCH(8) + } + else if(t.experts <= 16) + { + TOPK_SOFTMAX_DISPATCH(16) + } + else if(t.experts <= 32) + { + TOPK_SOFTMAX_DISPATCH(32) + } + else if(t.experts <= 64) + { + TOPK_SOFTMAX_DISPATCH(64) + } + else if(t.experts <= 128) + { + TOPK_SOFTMAX_DISPATCH(128) + } + else if(t.experts <= 192) + { + TOPK_SOFTMAX_DISPATCH(192) + } +#else + if(t.experts <= 128) + { + TOPK_SOFTMAX_DISPATCH(128) + } +#endif + } + else if(t.input_type == "bf16" && t.weight_type == "fp32") + { +#if 1 + using ts_input_type = ck_tile::bf16_t; + using ts_weight_type = float; + using ts_index_type = ck_tile::index_t; + if(t.experts <= 8) + { + TOPK_SOFTMAX_DISPATCH(8) + } + else if(t.experts <= 16) + { + 
TOPK_SOFTMAX_DISPATCH(16) + } + else if(t.experts <= 32) + { + TOPK_SOFTMAX_DISPATCH(32) + } + else if(t.experts <= 64) + { + TOPK_SOFTMAX_DISPATCH(64) + } + else if(t.experts <= 128) + { + TOPK_SOFTMAX_DISPATCH(128) + } + else if(t.experts <= 192) + { + TOPK_SOFTMAX_DISPATCH(192) + } +#endif + } + return -1; +} diff --git a/example/ck_tile/09_topk_softmax/topk_softmax_api.hpp b/example/ck_tile/09_topk_softmax/topk_softmax_api.hpp new file mode 100644 index 000000000..65651efa4 --- /dev/null +++ b/example/ck_tile/09_topk_softmax/topk_softmax_api.hpp @@ -0,0 +1,21 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once +#include "ck_tile/core.hpp" +#include "ck_tile/host.hpp" +#include "ck_tile/ops/topk_softmax.hpp" +#include + +struct topk_softmax_trait +{ + std::string input_type; + std::string weight_type; // currently always float + int experts; +}; + +struct topk_softmax_kargs : public ck_tile::TopkSoftmaxHostArgs +{ +}; + +float topk_softmax(topk_softmax_trait t, topk_softmax_kargs a, ck_tile::stream_config s); diff --git a/example/ck_tile/CMakeLists.txt b/example/ck_tile/CMakeLists.txt index ec4a175d3..366fb18a0 100644 --- a/example/ck_tile/CMakeLists.txt +++ b/example/ck_tile/CMakeLists.txt @@ -7,3 +7,5 @@ add_subdirectory(02_layernorm2d) add_subdirectory(03_gemm) add_subdirectory(04_img2col) add_subdirectory(05_reduce) +add_subdirectory(09_topk_softmax) + diff --git a/include/ck_tile/core.hpp b/include/ck_tile/core.hpp index d96f14710..56dfbd636 100644 --- a/include/ck_tile/core.hpp +++ b/include/ck_tile/core.hpp @@ -49,6 +49,7 @@ #include "ck_tile/core/tensor/tile_distribution_encoding.hpp" #include "ck_tile/core/tensor/tile_elementwise.hpp" #include "ck_tile/core/tensor/tile_window.hpp" +#include "ck_tile/core/tensor/tile_window_linear.hpp" #include "ck_tile/core/tensor/update_tile.hpp" #include "ck_tile/core/utility/bit_cast.hpp" #include "ck_tile/core/utility/functional.hpp" diff 
--git a/include/ck_tile/core/algorithm/space_filling_curve.hpp b/include/ck_tile/core/algorithm/space_filling_curve.hpp index 77a635611..6591acddb 100644 --- a/include/ck_tile/core/algorithm/space_filling_curve.hpp +++ b/include/ck_tile/core/algorithm/space_filling_curve.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -81,8 +81,10 @@ struct space_filling_curve return get_step_between(number{}, number{}); } + // Do not use this function directly! + // TODO: can refactor into generic lambda in the future template - static CK_TILE_HOST_DEVICE constexpr Index get_index(number) + static CK_TILE_HOST_DEVICE constexpr Index _get_index(number) { #if 0 /* @@ -153,11 +155,11 @@ struct space_filling_curve return idx_md; } - // FIXME: rename this function + // FIXME: return tuple of number<>, which is compile time only variable template - static CK_TILE_HOST_DEVICE constexpr auto get_index_tuple_of_number(number) + static CK_TILE_HOST_DEVICE constexpr auto get_index(number) { - constexpr auto idx = get_index(number{}); + constexpr auto idx = _get_index(number{}); return generate_tuple([&](auto i) { return number{}; }, number{}); } diff --git a/include/ck_tile/core/arch/amd_buffer_addressing.hpp b/include/ck_tile/core/arch/amd_buffer_addressing.hpp index 7f488d1b7..3feede4d2 100644 --- a/include/ck_tile/core/arch/amd_buffer_addressing.hpp +++ b/include/ck_tile/core/arch/amd_buffer_addressing.hpp @@ -621,6 +621,99 @@ CK_TILE_DEVICE void buffer_load_fence(index_t cnt = 0) asm volatile("s_waitcnt vmcnt(%0)" : : "n"(cnt) : "memory"); } +namespace impl { +// below type indicate the data type used for buffer load inline asm +// clang-format off +template struct smem_load_trait; + +template struct smem_load_trait<16, T> { using payload_t = fp32x4_t; }; +template struct smem_load_trait<8 , T> { using payload_t = 
fp32x2_t; }; +template struct smem_load_trait<4 , T> { using payload_t = float; }; +template struct smem_load_trait<2 , T> { using payload_t = float; }; +template struct smem_load_trait<1 , T> { using payload_t = float; }; + +// clang-format on +} // namespace impl + +// NOTE: smem load/store no need pre_nop to make sure dependency by sw, happy :) +template +struct smem_load; + +template <> +struct smem_load<16> +{ + template + CK_TILE_DEVICE void operator()(T& value, index_t v_offset, index_t i_offset) + { + static_assert(sizeof(T) == 16); + using mbuf_t = typename impl::smem_load_trait<16, T>::payload_t; + asm volatile("ds_read_b128 %0, %1 offset:%2" + : "=v"(reinterpret_cast(value)) // ! direct write + : "v"(v_offset), "n"(i_offset) + : "memory"); + } +}; + +template <> +struct smem_load<8> +{ + template + CK_TILE_DEVICE void operator()(T& value, index_t v_offset, index_t i_offset) + { + static_assert(sizeof(T) == 8); + using mbuf_t = typename impl::smem_load_trait<8, T>::payload_t; + asm volatile("ds_read_b64 %0, %1 offset:%2" + : "=v"(reinterpret_cast(value)) // ! direct write + : "v"(v_offset), "n"(i_offset) + : "memory"); + } +}; + +template <> +struct smem_load<4> +{ + template + CK_TILE_DEVICE void operator()(T& value, index_t v_offset, index_t i_offset) + { + static_assert(sizeof(T) == 4); + using mbuf_t = typename impl::smem_load_trait<4, T>::payload_t; + asm volatile("ds_read_b32 %0, %1 offset:%2" + : "=v"(reinterpret_cast(value)) // ! direct write + : "v"(v_offset), "n"(i_offset) + : "memory"); + } +}; + +template <> +struct smem_load<2> +{ + template + CK_TILE_DEVICE void operator()(T& value, index_t v_offset, index_t i_offset) + { + static_assert(sizeof(T) == 4); // subdword is buggy, use dword buf and convert manually + using mbuf_t = typename impl::smem_load_trait<1, T>::payload_t; + asm volatile("ds_read_u16 %0, %1 offset:%2" + : "=v"(reinterpret_cast(value)) // ! 
direct write + : "v"(v_offset), "n"(i_offset) + : "memory"); + } +}; + +template <> +struct smem_load<1> +{ + template + CK_TILE_DEVICE void operator()(T& value, index_t v_offset, index_t i_offset) + { + static_assert(sizeof(T) == 4); + using mbuf_t = typename impl::smem_load_trait<1, T>::payload_t; + asm volatile("ds_read_u8 %0, %1 offset:%2" + : "=v"(reinterpret_cast(value)) // ! direct write + : "v"(v_offset), "n"(i_offset) + : "memory"); + } +}; + // clang-format off namespace impl{ @@ -976,6 +1069,16 @@ llvm_amdgcn_raw_buffer_atomic_max_fp64(double vdata, int soffset, // dst_wave_addr_offset int glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fmax.f64"); +// Direct loads from global to LDS. +CK_TILE_DEVICE_EXTERN void +llvm_amdgcn_raw_buffer_load_lds(int32x4_t rsrc, + __attribute__((address_space(3))) uint32_t* lds_ptr, + index_t size, + index_t voffset, + index_t soffset, + index_t offset, + index_t aux) __asm("llvm.amdgcn.raw.buffer.load.lds"); + template CK_TILE_DEVICE void async_buffer_load_dword_v(void* smem, int32x4_t rsrc, @@ -1313,6 +1416,7 @@ CK_TILE_DEVICE void amd_buffer_load_raw_impl(thread_buffer& dst, int32x4_t src_wave_buffer_resource, index_t src_thread_addr_offset, index_t src_wave_addr_offset, + index_t src_linear_addr_offset, index_t flag = 0, bool_constant = {}) { @@ -1327,7 +1431,7 @@ CK_TILE_DEVICE void amd_buffer_load_raw_impl(thread_buffer& dst, src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, - 0, + src_linear_addr_offset, flag, bool_constant{}); } @@ -1337,7 +1441,7 @@ CK_TILE_DEVICE void amd_buffer_load_raw_impl(thread_buffer& dst, src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, - 0, + src_linear_addr_offset, flag, bool_constant{}); } @@ -1365,6 +1469,43 @@ CK_TILE_DEVICE void amd_async_buffer_load_impl(T* smem, bool_constant{}); } +template +CK_TILE_DEVICE void amd_async_buffer_load(CK_TILE_LDS_ADDR T* smem, + int32x4_t src_wave_buffer_resource, + index_t src_thread_addr_offset, + index_t 
src_wave_addr_offset, + index_t src_immediate_addr_offset = 0, + index_t flag = 0, + bool_constant = {}) +{ + static_assert(sizeof(T) * N == 4, "wrong! not implemented vector size"); + + if constexpr(oob_conditional_check) + { + index_t v_offset = flag ? src_thread_addr_offset : src_wave_buffer_resource[2]; + llvm_amdgcn_raw_buffer_load_lds(src_wave_buffer_resource, + smem, + sizeof(uint32_t), + v_offset, + src_wave_addr_offset, + src_immediate_addr_offset, + static_cast(coherence)); + } + else + { + llvm_amdgcn_raw_buffer_load_lds(src_wave_buffer_resource, + smem, + sizeof(uint32_t), + src_thread_addr_offset, + src_wave_addr_offset, + src_immediate_addr_offset, + static_cast(coherence)); + } +} + template CK_TILE_DEVICE void amd_buffer_store_impl_with_bytes(const thread_buffer src_thread_data, @@ -1685,6 +1826,7 @@ CK_TILE_DEVICE void amd_buffer_store_raw_impl(const thread_buffer& dst_thr int32x4_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset, + index_t dst_linear_addr_offset, index_t is_valid_element = 1) { constexpr index_t bytes = sizeof(T) * N; @@ -1698,7 +1840,7 @@ CK_TILE_DEVICE void amd_buffer_store_raw_impl(const thread_buffer& dst_thr dst_wave_buffer_resource, dst_thread_addr_offset, dst_wave_addr_offset, - 0, + dst_linear_addr_offset, is_valid_element); } else @@ -1707,7 +1849,7 @@ CK_TILE_DEVICE void amd_buffer_store_raw_impl(const thread_buffer& dst_thr dst_wave_buffer_resource, dst_thread_addr_offset, dst_wave_addr_offset, - 0); + dst_linear_addr_offset); } } @@ -2014,6 +2156,7 @@ template & dst, const T* p_src_wave, index_t src_thread_element_offset, + index_t src_linear_element_offset, index_t src_element_space_size, index_t is_valid_element = 0, bool_constant = {}) @@ -2022,12 +2165,14 @@ CK_TILE_DEVICE void amd_buffer_load_raw(thread_buffer& dst, make_wave_buffer_resource(p_src_wave, src_element_space_size * sizeof(T)); index_t src_thread_addr_offset = src_thread_element_offset * sizeof(T); + index_t
src_linear_addr_offset = src_linear_element_offset * sizeof(T); amd_buffer_load_raw_impl( dst, src_wave_buffer_resource, src_thread_addr_offset, 0, + src_linear_addr_offset, is_valid_element, bool_constant{}); } @@ -2041,16 +2186,19 @@ template & dst, const int32x4_t src_wave_buffer_resource, index_t src_thread_element_offset, + index_t src_linear_element_offset, index_t is_valid_element = 0, bool_constant = {}) { index_t src_thread_addr_offset = src_thread_element_offset * sizeof(T); + index_t src_linear_addr_offset = src_linear_element_offset * sizeof(T); amd_buffer_load_raw_impl( dst, src_wave_buffer_resource, src_thread_addr_offset, 0, + src_linear_addr_offset, is_valid_element, bool_constant{}); } @@ -2066,6 +2214,7 @@ template = {}) { @@ -2073,9 +2222,14 @@ CK_TILE_DEVICE void amd_async_buffer_load_with_oob_raw(T* smem, make_wave_buffer_resource(p_src_wave, src_element_space_size * sizeof(T)); index_t src_thread_addr_offset = src_thread_element_offset * sizeof(T); + index_t src_linear_addr_offset = src_linear_element_offset * sizeof(T); - amd_async_buffer_load_impl( - smem, src_wave_buffer_resource, src_thread_addr_offset, 0, 0, bool_constant{}); + amd_async_buffer_load_impl(smem, + src_wave_buffer_resource, + src_thread_addr_offset, + 0, + src_linear_addr_offset, + bool_constant{}); } // This version support buffer resource as input arg @@ -2086,12 +2240,42 @@ template = {}) { index_t src_thread_addr_offset = src_thread_element_offset * sizeof(T); + index_t src_linear_addr_offset = src_linear_element_offset * sizeof(T); - amd_async_buffer_load_impl( - smem, src_wave_buffer_resource, src_thread_addr_offset, 0, 0, bool_constant{}); + amd_async_buffer_load_impl(smem, + src_wave_buffer_resource, + src_thread_addr_offset, + 0, + src_linear_addr_offset, + bool_constant{}); +} + +// This version support buffer resource as input arg +template +CK_TILE_DEVICE void amd_async_buffer_load_with_oob(CK_TILE_LDS_ADDR T* smem, + const int32x4_t src_wave_buffer_resource, + 
index_t src_thread_element_offset, + index_t src_linear_element_offset, + bool is_valid_element, + bool_constant = {}) +{ + index_t src_thread_addr_offset = src_thread_element_offset * sizeof(T); + index_t src_linear_addr_offset = src_linear_element_offset * sizeof(T); + + amd_async_buffer_load(smem, + src_wave_buffer_resource, + src_thread_addr_offset, + 0, + src_linear_addr_offset, + is_valid_element, + bool_constant{}); } // buffer_store requires: @@ -2146,6 +2330,7 @@ template & src_thread_data, T* p_dst_wave, const index_t dst_thread_element_offset, + const index_t dst_linear_element_offset, const bool dst_thread_element_valid, const index_t dst_element_space_size) { @@ -2153,11 +2338,13 @@ CK_TILE_DEVICE void amd_buffer_store_raw(const thread_buffer& src_thread_d make_wave_buffer_resource(p_dst_wave, dst_element_space_size * sizeof(T)); index_t dst_thread_addr_offset = dst_thread_element_offset * sizeof(T); + index_t dst_linear_addr_offset = dst_linear_element_offset * sizeof(T); amd_buffer_store_raw_impl(src_thread_data, dst_wave_buffer_resource, dst_thread_addr_offset, 0, + dst_linear_addr_offset, dst_thread_element_valid); } @@ -2221,16 +2408,6 @@ CK_TILE_DEVICE void amd_buffer_atomic_max(const thread_buffer& src_thread_ #endif } -// Direct loads from global to LDS. 
-CK_TILE_DEVICE_EXTERN void -llvm_amdgcn_raw_buffer_load_lds(int32x4_t rsrc, - __attribute__((address_space(3))) uint32_t* lds_ptr, - index_t size, - index_t voffset, - index_t soffset, - index_t offset, - index_t aux) __asm("llvm.amdgcn.raw.buffer.load.lds"); - template CK_TILE_DEVICE void amd_direct_load_global_to_lds(const T* global_base_ptr, const index_t global_offset, diff --git a/include/ck_tile/core/config.hpp b/include/ck_tile/core/config.hpp index 580faae92..4be50b865 100644 --- a/include/ck_tile/core/config.hpp +++ b/include/ck_tile/core/config.hpp @@ -41,6 +41,19 @@ #define CK_TILE_HOST_DEVICE_EXTERN #endif +// implementing the "memory address space" attribute; NOTE(review): the guard below tests __HIPCC_ (single trailing underscore), presumably meant __HIPCC__ - confirm before changing, callers may rely on the empty expansion +// https://llvm.org/docs/AMDGPUUsage.html#amdgpu-address-spaces-table +#ifdef __HIPCC_ +#define CK_TILE_GENERIC_ADDR __attribute__((address_space(0))) +#define CK_TILE_GLOBAL_ADDR __attribute__((address_space(1))) +#define CK_TILE_LDS_ADDR __attribute__((address_space(3))) +#define CK_TILE_BUF_RES_ADDR __attribute__((address_space(8))) +#else +#define CK_TILE_GENERIC_ADDR +#define CK_TILE_GLOBAL_ADDR +#define CK_TILE_LDS_ADDR +#define CK_TILE_BUF_RES_ADDR +#endif #ifndef CK_TILE_USE_CUSTOM_DATA_TYPE #define CK_TILE_USE_CUSTOM_DATA_TYPE 0 // custom data type will generate extra move/bfi code #endif @@ -205,3 +218,8 @@ #ifndef CK_TILE_BUFFER_LOAD_RAW_BF16_WA #define CK_TILE_BUFFER_LOAD_RAW_BF16_WA 1 #endif + +// workaround: compiler not emitting reciprocal instruction from __frcp_rn() +#ifndef CK_TILE_WORKAROUND_SWDEV_383542 +#define CK_TILE_WORKAROUND_SWDEV_383542 1 +#endif diff --git a/include/ck_tile/core/container/tuple.hpp b/include/ck_tile/core/container/tuple.hpp index 598dfeea3..19d853ad5 100644 --- a/include/ck_tile/core/container/tuple.hpp +++ b/include/ck_tile/core/container/tuple.hpp @@ -623,7 +623,7 @@ template CK_TILE_HOST_DEVICE constexpr auto operator+=(tuple& y, const X& x) { - static_assert(X::Size() == sizeof...(Ys), "wrong!
size not the same"); + static_assert(X::size() == sizeof...(Ys), "wrong! size not the same"); constexpr index_t NSize = sizeof...(Ys); static_for<0, NSize, 1>{}([&](auto i) { y[i] += x[i]; }); return y; @@ -635,7 +635,7 @@ template CK_TILE_HOST_DEVICE constexpr auto operator-=(tuple& y, const X& x) { - static_assert(X::Size() == sizeof...(Ys), "wrong! size not the same"); + static_assert(X::size() == sizeof...(Ys), "wrong! size not the same"); constexpr index_t NSize = sizeof...(Ys); static_for<0, NSize, 1>{}([&](auto i) { y[i] -= x[i]; }); return y; @@ -647,7 +647,7 @@ template CK_TILE_HOST_DEVICE constexpr auto operator+(const tuple& x, const Y& y) { - static_assert(Y::Size() == sizeof...(Xs), "wrong! size not the same"); + static_assert(Y::size() == sizeof...(Xs), "wrong! size not the same"); constexpr index_t NSize = sizeof...(Xs); tuple r; @@ -655,13 +655,21 @@ CK_TILE_HOST_DEVICE constexpr auto operator+(const tuple& x, const Y& y) return r; } +template +CK_TILE_HOST_DEVICE constexpr auto operator+(const tuple& x, const tuple& y) +{ + static_assert(sizeof...(Xs) == sizeof...(Ys), "wrong!"); + constexpr index_t NSize = sizeof...(Xs); + return generate_tuple([&](auto i) { return x[i] + y[i]; }, number{}); +} + template ::value && !std::is_floating_point::value, bool> = false> CK_TILE_HOST_DEVICE constexpr auto operator-(const tuple& x, const Y& y) { - static_assert(Y::Size() == sizeof...(Xs), "wrong! size not the same"); + static_assert(Y::size() == sizeof...(Xs), "wrong! 
size not the same"); constexpr index_t NSize = sizeof...(Xs); tuple r; @@ -669,13 +677,21 @@ CK_TILE_HOST_DEVICE constexpr auto operator-(const tuple& x, const Y& y) return r; } +template +CK_TILE_HOST_DEVICE constexpr auto operator-(const tuple& x, const tuple& y) +{ + static_assert(sizeof...(Xs) == sizeof...(Ys), "wrong!"); + constexpr index_t NSize = sizeof...(Xs); + return generate_tuple([&](auto i) { return x[i] - y[i]; }, number{}); +} + template ::value && !std::is_floating_point::value, bool> = false> CK_TILE_HOST_DEVICE constexpr auto operator*(const tuple& x, const Y& y) { - static_assert(Y::Size() == sizeof...(Xs), "wrong! size not the same"); + static_assert(Y::size() == sizeof...(Xs), "wrong! size not the same"); constexpr index_t NSize = sizeof...(Xs); tuple r; @@ -706,6 +722,14 @@ CK_TILE_HOST_DEVICE constexpr auto operator*(const tuple& x, Y a) return a * x; } +template +CK_TILE_HOST_DEVICE constexpr auto operator*(const tuple& x, const tuple& y) +{ + static_assert(sizeof...(Xs) == sizeof...(Ys), "wrong!"); + constexpr index_t NSize = sizeof...(Xs); + return generate_tuple([&](auto i) { return x[i] * y[i]; }, number{}); +} + template CK_TILE_HOST_DEVICE constexpr auto operator/(const tuple& x, const tuple& y) { diff --git a/include/ck_tile/core/numeric/math.hpp b/include/ck_tile/core/numeric/math.hpp index f512e50e0..785691b66 100644 --- a/include/ck_tile/core/numeric/math.hpp +++ b/include/ck_tile/core/numeric/math.hpp @@ -487,55 +487,12 @@ struct log2e template constexpr T log2e_v = log2e::value; -// math -CK_TILE_HOST_DEVICE -float abs(const float& x) -{ - union - { - float f32; - uint32_t u32; - } y; - y.f32 = x; - y.u32 = y.u32 & 0x7fffffff; - return y.f32; -} - -CK_TILE_HOST_DEVICE -bool isnan(const float& x) -{ - uint32_t xx = bit_cast(x); - return (xx & 0x7fffffff) > 0x7F800000; -} - -CK_TILE_HOST float sqrt(float x) { return std::sqrt(x); }; - -CK_TILE_HOST double sqrt(double x) { return std::sqrt(x); }; - -CK_TILE_DEVICE -float sqrt(float 
x) { return __builtin_amdgcn_sqrtf(x); }; - -CK_TILE_DEVICE -double sqrt(double x) { return __builtin_amdgcn_sqrt(x); }; - -CK_TILE_DEVICE -float exp(float x) { return __ocml_exp_f32(x); }; - -CK_TILE_HOST -float exp(float x) { return std::expf(x); } - CK_TILE_DEVICE float exp2(float x) { return exp2f(x); }; CK_TILE_HOST float exp2(float x) { return std::exp2f(x); }; -CK_TILE_DEVICE -float log(float x) { return __logf(x); }; - -CK_TILE_HOST -float log(float x) { return std::logf(x); }; - CK_TILE_DEVICE uint16_t sad_u16(uint16_t x, uint16_t y, uint16_t acc) { return __builtin_amdgcn_sad_u16(x, y, acc); @@ -554,4 +511,933 @@ CK_TILE_HOST uint32_t sad_u32(uint32_t x, uint32_t y, uint32_t acc) return (x > y ? (x - y) : (y - x)) + acc; } +/////////////////////////////////////////////////////////////// + +} // namespace ck_tile +// the functions below need the following data types to be defined first +#include "ck_tile/core/numeric/half.hpp" +#include "ck_tile/core/numeric/bfloat16.hpp" +#include "ck_tile/core/numeric/float8.hpp" +#include "ck_tile/core/numeric/type_convert.hpp" +#ifndef __HIP_DEVICE_COMPILE__ +#include +#endif + +namespace ck_tile { +#if CK_TILE_WORKAROUND_SWDEV_383542 +extern "C" CK_TILE_DEVICE float __ocml_native_recip_f32(float); +#endif + +// math functions for the host, some are implemented by calling C++ std functions + +CK_TILE_HOST float abs(float x) { return std::abs(x); }; + +CK_TILE_HOST double abs(double x) { return std::abs(x); }; + +CK_TILE_HOST int8_t abs(int8_t x) +{ + int8_t sgn = x >> (8 - 1); + + return (x ^ sgn) - sgn; +}; + +CK_TILE_HOST int32_t abs(int32_t x) +{ + int32_t sgn = x >> (32 - 1); + + return (x ^ sgn) - sgn; +}; + +CK_TILE_HOST fp16_t abs(fp16_t x) +{ + uint16_t xx = bit_cast(x); + + uint16_t abs_xx = xx & 0x7fff; + + fp16_t abs_x = bit_cast(abs_xx); + + return abs_x; +}; + +#ifdef CK_TILE_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 +CK_TILE_HOST int4_t abs(int4_t x) +{ + int4_t sgn = x >> (4 - 1); + return (x ^ sgn) - sgn; +} +#endif + +CK_TILE_HOST bool
isnan(float x) { return std::isnan(x); }; + +CK_TILE_HOST bool isnan(double x) { return std::isnan(x); }; + +CK_TILE_HOST bool isnan(int8_t x) +{ + (void)x; + return false; +}; + +CK_TILE_HOST bool isnan(int32_t x) +{ + (void)x; + return false; +}; + +CK_TILE_HOST bool isnan(fp16_t x) +{ + uint16_t xx = bit_cast(x); + + return (xx & 0x7FFF) > 0x7C00; +}; + +#ifdef CK_TILE_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 +CK_TILE_HOST bool isnan(int4_t x) +{ + (void)x; + return false; +}; +#endif + +CK_TILE_HOST fp16_t sqrt(fp16_t x) +{ + return static_cast(std::sqrt(static_cast(x))); +}; + +CK_TILE_HOST float sqrt(float x) { return std::sqrt(x); }; + +CK_TILE_HOST double sqrt(double x) { return std::sqrt(x); }; + +template +CK_TILE_HOST T tanh(T x) +{ + return type_convert(std::tanhf(type_convert(x))); +}; + +template <> +CK_TILE_HOST float tanh(float x) +{ + return std::tanhf(x); +}; + +template <> +CK_TILE_HOST double tanh(double x) +{ + return std::tanh(x); +}; + +template +CK_TILE_HOST T acos(T x) +{ + return type_convert(std::acosf(type_convert(x))); +}; + +template <> +CK_TILE_HOST float acos(float x) +{ + return std::acosf(x); +}; + +template <> +CK_TILE_HOST double acos(double x) +{ + return std::acos(x); +}; + +template +CK_TILE_HOST T neg(T x) +{ + return type_convert(-(type_convert(x))); +}; + +template <> +CK_TILE_HOST float neg(float x) +{ + return -x; +}; + +template <> +CK_TILE_HOST double neg(double x) +{ + return -x; +}; + +template <> +CK_TILE_HOST int32_t neg(int32_t x) +{ + return -x; +}; + +template <> +CK_TILE_HOST int8_t neg(int8_t x) +{ + return -x; +}; + +template +CK_TILE_HOST T atan(T x) +{ + return type_convert(std::atanf(type_convert(x))); +}; + +template <> +CK_TILE_HOST float atan(float x) +{ + return std::atanf(x); +}; + +template <> +CK_TILE_HOST double atan(double x) +{ + return std::atan(x); +}; + +template +CK_TILE_HOST T sin(T x) +{ + return type_convert(std::sinf(type_convert(x))); +}; + +template <> +CK_TILE_HOST float sin(float x) +{ + 
return std::sinf(x); +}; + +template <> +CK_TILE_HOST double sin(double x) +{ + return std::sin(x); +}; + +template +CK_TILE_HOST T asin(T x) +{ + return type_convert(std::asinf(type_convert(x))); +}; + +template <> +CK_TILE_HOST float asin(float x) +{ + return std::asinf(x); +}; + +template <> +CK_TILE_HOST double asin(double x) +{ + return std::asin(x); +}; + +template +CK_TILE_HOST T asinh(T x) +{ + return type_convert(std::asinhf(type_convert(x))); +}; + +template <> +CK_TILE_HOST float asinh(float x) +{ + return std::asinhf(x); +}; + +template <> +CK_TILE_HOST double asinh(double x) +{ + return std::asinh(x); +}; + +template +CK_TILE_HOST T cos(T x) +{ + return type_convert(std::cosf(type_convert(x))); +}; + +template <> +CK_TILE_HOST float cos(float x) +{ + return std::cosf(x); +}; + +template <> +CK_TILE_HOST double cos(double x) +{ + return std::cos(x); +}; + +template +CK_TILE_HOST T acosh(T x) +{ + return type_convert(std::acoshf(type_convert(x))); +}; + +template <> +CK_TILE_HOST float acosh(float x) +{ + return std::acoshf(x); +}; + +template <> +CK_TILE_HOST double acosh(double x) +{ + return std::acosh(x); +}; + +template +CK_TILE_HOST T tan(T x) +{ + return type_convert(std::tanf(type_convert(x))); +}; + +template <> +CK_TILE_HOST float tan(float x) +{ + return std::tanf(x); +}; + +template <> +CK_TILE_HOST double tan(double x) +{ + return std::tan(x); +}; + +template +CK_TILE_HOST T atanh(T x) +{ + return type_convert(std::atanhf(type_convert(x))); +}; + +template <> +CK_TILE_HOST float atanh(float x) +{ + return std::atanhf(x); +}; + +template <> +CK_TILE_HOST double atanh(double x) +{ + return std::atanh(x); +}; + +template +CK_TILE_HOST T sinh(T x) +{ + return type_convert(std::sinhf(type_convert(x))); +}; + +template <> +CK_TILE_HOST float sinh(float x) +{ + return std::sinhf(x); +}; + +template <> +CK_TILE_HOST double sinh(double x) +{ + return std::sinh(x); +}; + +template +CK_TILE_HOST T ceil(T x) +{ + return 
type_convert(std::ceilf(type_convert(x))); +}; + +template <> +CK_TILE_HOST float ceil(float x) +{ + return std::ceilf(x); +}; + +template <> +CK_TILE_HOST double ceil(double x) +{ + return std::ceil(x); +}; + +template +CK_TILE_HOST T cosh(T x) +{ + return type_convert(std::coshf(type_convert(x))); +}; + +template <> +CK_TILE_HOST float cosh(float x) +{ + return std::coshf(x); +}; + +template <> +CK_TILE_HOST double cosh(double x) +{ + return std::cosh(x); +}; + +template +CK_TILE_HOST T floor(T x) +{ + return type_convert(std::floorf(type_convert(x))); +}; + +template <> +CK_TILE_HOST float floor(float x) +{ + return std::floorf(x); +}; + +template <> +CK_TILE_HOST double floor(double x) +{ + return std::floor(x); +}; + +template +CK_TILE_HOST T rcp(T x) +{ + return type_convert(1.f / type_convert(x)); +}; + +template +CK_TILE_HOST T exp(T x) +{ + return type_convert(std::expf(type_convert(x))); +} + +template <> +CK_TILE_HOST float exp(float x) +{ + return std::expf(x); +} + +template <> +CK_TILE_HOST double exp(double x) +{ + return std::exp(x); +} + +template +CK_TILE_HOST T log(T x) +{ + return type_convert(std::logf(type_convert(x))); +} + +template <> +CK_TILE_HOST float log(float x) +{ + return std::logf(x); +} + +template <> +CK_TILE_HOST double log(double x) +{ + return std::log(x); +} + +template +CK_TILE_HOST T pow(T x, T gamma) +{ + return type_convert(std::powf(type_convert(x), type_convert(gamma))); +} + +template <> +CK_TILE_HOST float pow(float x, float gamma) +{ + return std::powf(x, gamma); +} + +template <> +CK_TILE_HOST double pow(double x, double gamma) +{ + return std::pow(x, gamma); +} + +template +CK_TILE_HOST T expm1(T x) +{ + return type_convert(std::expm1f(type_convert(x))); +} + +template <> +CK_TILE_HOST float expm1(float x) +{ + return std::expm1f(x); +} + +template <> +CK_TILE_HOST double expm1(double x) +{ + return std::expm1(x); +} + +// math functions for the HIP kernel, some are implemented by calling hip builtin functions + 
+CK_TILE_DEVICE float abs(float x) +{ + union + { + float f32; + uint32_t u32; + } y; + y.f32 = x; + y.u32 = y.u32 & 0x7fffffff; + return y.f32; +}; + +CK_TILE_DEVICE double abs(double x) { return ::abs(x); }; + +CK_TILE_DEVICE int8_t abs(int8_t x) +{ + int8_t sgn = x >> (8 - 1); + + return (x ^ sgn) - sgn; +}; + +CK_TILE_DEVICE int32_t abs(int32_t x) +{ + int32_t sgn = x >> (32 - 1); + + return (x ^ sgn) - sgn; +}; + +#ifdef CK_TILE_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 +CK_TILE_DEVICE int4_t abs(int4_t x) +{ + int4_t sgn = x >> (4 - 1); + + return (x ^ sgn) - sgn; +}; +#endif + +CK_TILE_DEVICE fp16_t abs(fp16_t x) +{ + uint16_t xx = bit_cast(x); + + uint16_t abs_xx = xx & 0x7fff; + + fp16_t abs_x = bit_cast(abs_xx); + + return abs_x; +}; + +CK_TILE_DEVICE bool isnan(float x) { return ::isnan(x); }; + +CK_TILE_DEVICE bool isnan(double x) { return ::isnan(x); }; + +CK_TILE_DEVICE bool isnan(int8_t x) +{ + (void)x; + return false; +}; + +CK_TILE_DEVICE bool isnan(int32_t x) +{ + (void)x; + return false; +}; + +#ifdef CK_TILE_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 +CK_TILE_DEVICE bool isnan(int4_t x) +{ + (void)x; + return false; +}; +#endif + +CK_TILE_DEVICE bool isnan(fp16_t x) +{ + uint16_t xx = bit_cast(x); + + return (xx & 0x7FFF) > 0x7C00; +}; + +CK_TILE_DEVICE fp16_t sqrt(fp16_t x) +{ + return static_cast(__builtin_amdgcn_sqrtf(static_cast(x))); +}; + +CK_TILE_DEVICE float sqrt(float x) { return __builtin_amdgcn_sqrtf(x); }; + +CK_TILE_DEVICE double sqrt(double x) { return __builtin_amdgcn_sqrt(x); }; + +template +CK_TILE_DEVICE T tanh(T x) +{ + return type_convert(::tanhf(type_convert(x))); +}; + +template <> +CK_TILE_DEVICE float tanh(float x) +{ + return ::tanhf(x); +}; + +template <> +CK_TILE_DEVICE double tanh(double x) +{ + return ::tanh(x); +}; + +template +CK_TILE_DEVICE T acos(T x) +{ + return type_convert(::acosf(type_convert(x))); +}; + +template <> +CK_TILE_DEVICE float acos(float x) +{ + return ::acosf(x); +}; + +template <> +CK_TILE_DEVICE double 
acos(double x) +{ + return ::acos(x); +}; + +template +CK_TILE_DEVICE T neg(T x) +{ + return type_convert(-(type_convert(x))); +}; + +template <> +CK_TILE_DEVICE float neg(float x) +{ + return -x; +}; + +template <> +CK_TILE_DEVICE double neg(double x) +{ + return -x; +}; + +template <> +CK_TILE_DEVICE int32_t neg(int32_t x) +{ + return -x; +}; + +template <> +CK_TILE_DEVICE int8_t neg(int8_t x) +{ + return -x; +}; + +template <> +CK_TILE_DEVICE fp16_t neg(fp16_t x) +{ + return __hneg(x); +}; + +template +CK_TILE_DEVICE T atan(T x) +{ + return type_convert(::atanf(type_convert(x))); +}; + +template <> +CK_TILE_DEVICE float atan(float x) +{ + return ::atanf(x); +}; + +template <> +CK_TILE_DEVICE double atan(double x) +{ + return ::atan(x); +}; + +template +CK_TILE_DEVICE T sin(T x) +{ + return type_convert(::sinf(type_convert(x))); +}; + +template <> +CK_TILE_DEVICE float sin(float x) +{ + return ::sinf(x); +}; + +template <> +CK_TILE_DEVICE double sin(double x) +{ + return ::sin(x); +}; + +template <> +CK_TILE_DEVICE fp16_t sin(fp16_t x) +{ + return ::hsin(x); +}; + +template +CK_TILE_DEVICE T asin(T x) +{ + return type_convert(::asinf(type_convert(x))); +}; + +template <> +CK_TILE_DEVICE float asin(float x) +{ + return ::asinf(x); +}; + +template <> +CK_TILE_DEVICE double asin(double x) +{ + return ::asin(x); +}; + +template +CK_TILE_DEVICE T asinh(T x) +{ + return type_convert(::asinhf(type_convert(x))); +}; + +template <> +CK_TILE_DEVICE float asinh(float x) +{ + return ::asinhf(x); +}; + +template <> +CK_TILE_DEVICE double asinh(double x) +{ + return ::asinh(x); +}; + +template +CK_TILE_DEVICE T acosh(T x) +{ + return type_convert(::acoshf(type_convert(x))); +}; + +template <> +CK_TILE_DEVICE float acosh(float x) +{ + return ::acoshf(x); +}; + +template <> +CK_TILE_DEVICE double acosh(double x) +{ + return ::acosh(x); +}; + +template +CK_TILE_DEVICE T tan(T x) +{ + return type_convert(::tanf(type_convert(x))); +}; + +template <> +CK_TILE_DEVICE float tan(float 
x) +{ + return ::tanf(x); +}; + +template <> +CK_TILE_DEVICE double tan(double x) +{ + return ::tan(x); +}; + +template +CK_TILE_DEVICE T atanh(T x) +{ + return type_convert(::atanhf(type_convert(x))); +}; + +template <> +CK_TILE_DEVICE float atanh(float x) +{ + return ::atanhf(x); +}; + +template <> +CK_TILE_DEVICE double atanh(double x) +{ + return ::atanh(x); +}; + +template +CK_TILE_DEVICE T sinh(T x) +{ + return type_convert(::sinhf(type_convert(x))); +}; + +template <> +CK_TILE_DEVICE float sinh(float x) +{ + return ::sinhf(x); +}; + +template <> +CK_TILE_DEVICE double sinh(double x) +{ + return ::sinh(x); +}; + +template +CK_TILE_DEVICE T ceil(T x) +{ + return type_convert(::ceilf(type_convert(x))); +}; + +template <> +CK_TILE_DEVICE float ceil(float x) +{ + return ::ceilf(x); +}; + +template <> +CK_TILE_DEVICE double ceil(double x) +{ + return ::ceil(x); +}; + +template <> +CK_TILE_DEVICE fp16_t ceil(fp16_t x) +{ + return ::hceil(x); +}; + +template +CK_TILE_DEVICE T cosh(T x) +{ + return type_convert(::coshf(type_convert(x))); +}; + +template <> +CK_TILE_DEVICE float cosh(float x) +{ + return ::coshf(x); +}; + +template <> +CK_TILE_DEVICE double cosh(double x) +{ + return ::cosh(x); +}; + +template +CK_TILE_DEVICE T floor(T x) +{ + return type_convert(::floorf(type_convert(x))); +}; + +template <> +CK_TILE_DEVICE float floor(float x) +{ + return ::floorf(x); +}; + +template <> +CK_TILE_DEVICE double floor(double x) +{ + return ::floor(x); +}; + +template <> +CK_TILE_DEVICE fp16_t floor(fp16_t x) +{ + return ::hfloor(x); +}; + +template +CK_TILE_DEVICE T rcp(T x) +{ +#if !CK_TILE_WORKAROUND_SWDEV_383542 + return __frcp_rn(x); +#else + // return __ocml_native_recip_f32(x); + return __builtin_amdgcn_rcpf(x); +#endif +}; + +template +CK_TILE_DEVICE T exp(T x) +{ + return type_convert(__ocml_exp_f32(type_convert(x))); +}; + +template <> +CK_TILE_DEVICE fp16_t exp(fp16_t x) +{ + return hexp(x); +}; + +template <> +CK_TILE_DEVICE float exp(float x) +{ + return 
__ocml_exp_f32(x); +}; + +template <> +CK_TILE_DEVICE double exp(double x) +{ + return exp(x); +}; + +template +CK_TILE_DEVICE T log(T x) +{ + return type_convert(__logf(type_convert(x))); +}; + +template <> +CK_TILE_DEVICE fp16_t log(fp16_t x) +{ + return hlog(x); +}; + +template <> +CK_TILE_DEVICE float log(float x) +{ + return __logf(x); +}; + +template <> +CK_TILE_DEVICE double log(double x) +{ + return log(x); +}; + +template +CK_TILE_DEVICE T pow(T x, T gamma) +{ + return type_convert(powf(type_convert(x), type_convert(gamma))); +}; + +template <> +CK_TILE_DEVICE float pow(float x, float gamma) +{ + return powf(x, gamma); +}; + +template <> +CK_TILE_DEVICE double pow(double x, double gamma) +{ + return pow(x, gamma); +}; + +template +CK_TILE_DEVICE T expm1(T x) +{ + return type_convert(expm1f(type_convert(x))); +}; + +template <> +CK_TILE_DEVICE float expm1(float x) +{ + return expm1f(x); +}; + +template <> +CK_TILE_DEVICE double expm1(double x) +{ + return expm1(x); +}; + } // namespace ck_tile diff --git a/include/ck_tile/core/tensor/buffer_view.hpp b/include/ck_tile/core/tensor/buffer_view.hpp index ed705c91e..2cc788d42 100644 --- a/include/ck_tile/core/tensor/buffer_view.hpp +++ b/include/ck_tile/core/tensor/buffer_view.hpp @@ -91,8 +91,10 @@ struct buffer_view>::scalar_type, typename vector_traits>::scalar_type>::value, bool>::type = false> - CK_TILE_DEVICE constexpr auto - get(index_t i, bool is_valid_element, bool_constant = {}) const + CK_TILE_DEVICE constexpr auto get(index_t i, + index_t linear_offset, + bool is_valid_element, + bool_constant = {}) const { // X contains multiple T constexpr index_t scalar_per_t_vector = vector_traits>::vector_size; @@ -107,11 +109,11 @@ struct buffer_view(&p_data_[i]); + return *c_style_pointer_cast(&p_data_[i + linear_offset]); #endif } else @@ -134,17 +136,17 @@ struct buffer_view>::scalar_type, typename vector_traits>::scalar_type>::value, bool>::type = false> - CK_TILE_DEVICE void update(index_t i, bool 
is_valid_element, const X& x) + CK_TILE_DEVICE void update(index_t i, index_t linear_offset, bool is_valid_element, const X& x) { if constexpr(Op == memory_operation_enum::set) { - this->template set(i, is_valid_element, x); + this->template set(i, linear_offset, is_valid_element, x); } // FIXME: remove memory_operation_enum::add else if constexpr(Op == memory_operation_enum::add) { - auto tmp = this->template get(i, is_valid_element); - this->template set(i, is_valid_element, x + tmp); + auto tmp = this->template get(i, linear_offset, is_valid_element); + this->template set(i, linear_offset, is_valid_element, x + tmp); } } @@ -154,7 +156,7 @@ struct buffer_view>::scalar_type, typename vector_traits>::scalar_type>::value, bool>::type = false> - CK_TILE_DEVICE void set(index_t i, bool is_valid_element, const X& x) + CK_TILE_DEVICE void set(index_t i, index_t linear_offset, bool is_valid_element, const X& x) { // X contains multiple T constexpr index_t scalar_per_t_vector = vector_traits>::vector_size; @@ -169,9 +171,9 @@ struct buffer_view(&p_data_[i]) = x; + *c_style_pointer_cast(&p_data_[i + linear_offset]) = x; #endif } } @@ -276,8 +278,10 @@ struct buffer_view>::scalar_type, typename vector_traits>::scalar_type>::value, bool>::type = false> - CK_TILE_DEVICE constexpr auto - get(index_t i, bool is_valid_element, bool_constant = {}) const + CK_TILE_DEVICE constexpr auto get(index_t i, + index_t linear_offset, + bool is_valid_element, + bool_constant = {}) const { // X contains multiple T constexpr index_t scalar_per_t_vector = vector_traits>::vector_size; @@ -303,7 +307,7 @@ struct buffer_view( - p_data_, i, is_valid_element, buffer_size_); + p_data_, i + linear_offset, is_valid_element, buffer_size_); } else { @@ -311,8 +315,11 @@ struct buffer_view, t_per_x, Coherence, - oob_conditional_check>( - p_data_, i, is_valid_element, buffer_size_, invalid_element_value_); + oob_conditional_check>(p_data_, + i + linear_offset, + is_valid_element, + buffer_size_, + 
invalid_element_value_); } } else @@ -322,11 +329,11 @@ struct buffer_view(&p_data_[i]); + return *c_style_pointer_cast(&p_data_[i + linear_offset]); #endif } else @@ -352,7 +359,8 @@ struct buffer_view>::scalar_type>::value, bool>::type = false> CK_TILE_DEVICE constexpr auto get_raw(remove_cvref_t& dst, - index_t i, + index_t v_offset, + index_t i_offset, bool is_valid_element, bool_constant = {}) const { @@ -366,7 +374,38 @@ struct buffer_view, t_per_x, Coherence, oob_conditional_check, pre_nop>( - dst, cached_buf_res_, i, is_valid_element, bool_constant{}); + dst, cached_buf_res_, v_offset, i_offset, is_valid_element, bool_constant{}); + } + + // i is offset of T, not X. i should be aligned to X + template >::scalar_type, + typename vector_traits>::scalar_type>::value, + bool>::type = false> + CK_TILE_DEVICE constexpr auto async_get(CK_TILE_LDS_ADDR remove_cvref_t* smem, + index_t i, + index_t linear_offset, + bool is_valid_element, + bool_constant = {}) const + { + // X is vector of T + constexpr index_t scalar_per_t_vector = vector_traits>::vector_size; + constexpr index_t scalar_per_x_vector = vector_traits>::vector_size; + + static_assert(scalar_per_x_vector % scalar_per_t_vector == 0, + "wrong! X should contain multiple T"); + + constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector; + + amd_async_buffer_load_with_oob, t_per_x, Coherence>( + smem, + cached_buf_res_, + i, + linear_offset, + is_valid_element, + bool_constant{}); } // i is offset of T, not X. i should be aligned to X @@ -378,6 +417,7 @@ struct buffer_view::type = false> CK_TILE_DEVICE constexpr auto async_get_raw(remove_cvref_t* smem, index_t i, + index_t linear_offset, bool /*is_valid_element*/, bool_constant = {}) const { @@ -391,7 +431,7 @@ struct buffer_view, t_per_x, Coherence>( - smem, cached_buf_res_, i, bool_constant{}); + smem, cached_buf_res_, i, linear_offset, bool_constant{}); } // i is offset of T, not X. 
i should be aligned to X @@ -401,25 +441,25 @@ struct buffer_view>::scalar_type, typename vector_traits>::scalar_type>::value, bool>::type = false> - CK_TILE_DEVICE void update(index_t i, bool is_valid_element, const X& x) + CK_TILE_DEVICE void update(index_t i, index_t linear_offset, bool is_valid_element, const X& x) { if constexpr(Op == memory_operation_enum::set) { - this->template set(i, is_valid_element, x); + this->template set(i, linear_offset, is_valid_element, x); } else if constexpr(Op == memory_operation_enum::atomic_add) { - this->template atomic_add(i, is_valid_element, x); + this->template atomic_add(i, linear_offset, is_valid_element, x); } else if constexpr(Op == memory_operation_enum::atomic_max) { - this->template atomic_max(i, is_valid_element, x); + this->template atomic_max(i, linear_offset, is_valid_element, x); } // FIXME: remove memory_operation_enum::add else if constexpr(Op == memory_operation_enum::add) { - auto tmp = this->template get(i, is_valid_element); - this->template set(i, is_valid_element, x + tmp); + auto tmp = this->template get(i, linear_offset, is_valid_element); + this->template set(i, linear_offset, is_valid_element, x + tmp); // tmp += x; // this->template set(i, is_valid_element, tmp); } @@ -432,7 +472,7 @@ struct buffer_view>::scalar_type, typename vector_traits>::scalar_type>::value, bool>::type = false> - CK_TILE_DEVICE void set(index_t i, bool is_valid_element, const X& x) + CK_TILE_DEVICE void set(index_t i, index_t linear_offset, bool is_valid_element, const X& x) { // X contains multiple T constexpr index_t scalar_per_t_vector = vector_traits>::vector_size; @@ -453,7 +493,7 @@ struct buffer_view, t_per_x, Coherence>( - x, p_data_, i, is_valid_element, buffer_size_); + x, p_data_, i + linear_offset, is_valid_element, buffer_size_); } else { @@ -462,9 +502,9 @@ struct buffer_view(&p_data_[i]) = x; + *c_style_pointer_cast(&p_data_[i + linear_offset]) = x; #endif } } @@ -477,7 +517,7 @@ struct 
buffer_view>::scalar_type, typename vector_traits>::scalar_type>::value, bool>::type = false> - CK_TILE_DEVICE void set_raw(index_t i, bool is_valid_element, const X& x) + CK_TILE_DEVICE void set_raw(index_t i, index_t linear_offset, bool is_valid_element, const X& x) { // X contains multiple T constexpr index_t scalar_per_t_vector = vector_traits>::vector_size; @@ -489,7 +529,7 @@ struct buffer_view, t_per_x, Coherence, oob_conditional_check>( - x, p_data_, i, is_valid_element, buffer_size_); + x, p_data_, i, linear_offset, is_valid_element, buffer_size_); } template >::scalar_type, typename vector_traits>::scalar_type>::value, bool>::type = false> - CK_TILE_DEVICE void atomic_add(index_t i, bool is_valid_element, const X& x) + CK_TILE_DEVICE void + atomic_add(index_t i, index_t linear_offset, bool is_valid_element, const X& x) { using scalar_t = typename vector_traits>::scalar_type; @@ -532,13 +573,13 @@ struct buffer_view, t_per_x>( - x, p_data_, i, is_valid_element, buffer_size_); + x, p_data_, i + linear_offset, is_valid_element, buffer_size_); } else { if(is_valid_element) { - atomic_add_g, t_per_x>(&p_data_[i], x); + atomic_add_g, t_per_x>(&p_data_[i + linear_offset], x); } } } @@ -548,7 +589,8 @@ struct buffer_view>::scalar_type, typename vector_traits>::scalar_type>::value, bool>::type = false> - CK_TILE_DEVICE void atomic_max(index_t i, bool is_valid_element, const X& x) + CK_TILE_DEVICE void + atomic_max(index_t i, index_t linear_offset, bool is_valid_element, const X& x) { // X contains multiple T constexpr index_t scalar_per_t_vector = vector_traits>::vector_size; @@ -572,11 +614,11 @@ struct buffer_view, t_per_x>( - x, p_data_, i, is_valid_element, buffer_size_); + x, p_data_, i + linear_offset, is_valid_element, buffer_size_); } else if(is_valid_element) { - atomic_max_g, t_per_x>(&p_data_[i], x); + atomic_max_g, t_per_x>(&p_data_[i + linear_offset], x); } } @@ -668,8 +710,10 @@ struct buffer_view>::scalar_type, typename 
vector_traits>::scalar_type>::value, bool>::type = false> - CK_TILE_DEVICE constexpr auto - get(index_t i, bool is_valid_element, bool_constant = {}) const + CK_TILE_DEVICE constexpr auto get(index_t i, + index_t linear_offset, + bool is_valid_element, + bool_constant = {}) const { // X contains multiple T constexpr index_t scalar_per_t_vector = vector_traits>::vector_size; @@ -684,14 +728,14 @@ struct buffer_view>::scalar_type, scalar_per_t_vector * scalar_per_x_vector>; // using buf_t = ushort __attribute__((ext_vector_type(8))); - auto rtn = *c_style_pointer_cast(&p_data_[i]); + auto rtn = *c_style_pointer_cast(&p_data_[i + linear_offset]); return bit_cast(rtn); #endif } @@ -708,6 +752,23 @@ struct buffer_view>::scalar_type, + typename vector_traits>::scalar_type>::value, + bool>::type = false> + CK_TILE_DEVICE constexpr auto get_raw(remove_cvref_t& dst, + index_t v_offset, + index_t i_offset, + bool /*is_valid_element*/, + bool_constant = {}) const + { + smem_load{}(dst, v_offset * sizeof(T), i_offset * sizeof(T)); + } + // i is offset of T, not X. 
i should be aligned to X template >::scalar_type, typename vector_traits>::scalar_type>::value, bool>::type = false> - CK_TILE_DEVICE void update(index_t i, bool is_valid_element, const X& x) + CK_TILE_DEVICE void update(index_t i, index_t linear_offset, bool is_valid_element, const X& x) { if constexpr(Op == memory_operation_enum::set) { - this->template set(i, is_valid_element, x); + this->template set(i, linear_offset, is_valid_element, x); } // FIXME: remove memory_operation_enum::add else if constexpr(Op == memory_operation_enum::add) { - auto tmp = this->template get(i, is_valid_element); - this->template set(i, is_valid_element, x + tmp); + auto tmp = this->template get(i, linear_offset, is_valid_element); + this->template set(i, linear_offset, is_valid_element, x + tmp); } } @@ -735,7 +796,7 @@ struct buffer_view>::scalar_type, typename vector_traits>::scalar_type>::value, bool>::type = false> - CK_TILE_DEVICE void set(index_t i, bool is_valid_element, const X& x) + CK_TILE_DEVICE void set(index_t i, index_t linear_offset, bool is_valid_element, const X& x) { // X contains multiple T constexpr index_t scalar_per_t_vector = vector_traits>::vector_size; @@ -751,6 +812,7 @@ struct buffer_view>::scalar_type, int8_t>::value && workaround_int8_ds_write_issue) @@ -952,8 +1014,10 @@ struct buffer_view>::scalar_type, typename vector_traits>::scalar_type>::value, bool>::type = false> - CK_TILE_DEVICE constexpr auto - get(index_t i, bool is_valid_element, bool_constant = {}) const + CK_TILE_DEVICE constexpr auto get(index_t i, + index_t /*linear_offset*/, + bool is_valid_element, + bool_constant = {}) const { // X contains multiple T constexpr index_t scalar_per_t_vector = vector_traits>::vector_size; @@ -995,17 +1059,17 @@ struct buffer_view>::scalar_type, typename vector_traits>::scalar_type>::value, bool>::type = false> - CK_TILE_DEVICE void update(index_t i, bool is_valid_element, const X& x) + CK_TILE_DEVICE void update(index_t i, index_t linear_offset, bool 
is_valid_element, const X& x) { if constexpr(Op == memory_operation_enum::set) { - this->template set(i, is_valid_element, x); + this->template set(i, linear_offset, is_valid_element, x); } // FIXME: remove memory_operation_enum::add else if constexpr(Op == memory_operation_enum::add) { - auto tmp = this->template get(i, is_valid_element); - this->template set(i, is_valid_element, x + tmp); + auto tmp = this->template get(i, linear_offset, is_valid_element); + this->template set(i, linear_offset, is_valid_element, x + tmp); } } @@ -1015,7 +1079,7 @@ struct buffer_view>::scalar_type, typename vector_traits>::scalar_type>::value, bool>::type = false> - CK_TILE_DEVICE void set(index_t i, bool is_valid_element, const X& x) + CK_TILE_DEVICE void set(index_t i, index_t linear_offset, bool is_valid_element, const X& x) { // X contains multiple T constexpr index_t scalar_per_t_vector = vector_traits>::vector_size; @@ -1030,9 +1094,9 @@ struct buffer_view(&p_data_[i]) = x; + *c_style_pointer_cast(&p_data_[i + linear_offset]) = x; #endif } } diff --git a/include/ck_tile/core/tensor/load_tile.hpp b/include/ck_tile/core/tensor/load_tile.hpp index aeda5e9c0..06b5a8da0 100644 --- a/include/ck_tile/core/tensor/load_tile.hpp +++ b/include/ck_tile/core/tensor/load_tile.hpp @@ -12,6 +12,7 @@ #include "ck_tile/core/tensor/tile_window.hpp" #include "ck_tile/core/utility/type_traits.hpp" #include "ck_tile/core/tensor/tile_window.hpp" +#include "ck_tile/core/tensor/tile_window_linear.hpp" #include "ck_tile/core/tensor/null_tile_window.hpp" #include "ck_tile/core/tensor/null_tensor.hpp" @@ -28,7 +29,21 @@ CK_TILE_DEVICE auto load_tile(const tile_window_with_static_distribution& tile_window, bool_constant = {}) { - return tile_window.load(bool_constant{}); + return tile_window.load(number<-1>{}, bool_constant{}); +} + +template +CK_TILE_DEVICE auto load_tile(const tile_window_linear& tile_window, + bool_constant = {}) +{ + return tile_window.load(number<-1>{}, bool_constant{}); } template 
= {}, bool_constant = {}) { - tile_window.load_raw(tile, bool_constant{}, bool_constant{}); + tile_window.load_raw( + tile, number<-1>{}, bool_constant{}, bool_constant{}); +} + +template +CK_TILE_DEVICE auto load_tile_raw(T& tile, + const tile_window_linear& tile_window, + bool_constant = {}, + bool_constant = {}) +{ + tile_window.load_raw( + tile, number<-1>{}, bool_constant{}, bool_constant{}); } template = {}) { return tile_window.async_load_raw( - lds_tile, bool_constant{}, bool_constant{}); + lds_tile, number<-1>{}, bool_constant{}, bool_constant{}); +} + +template +CK_TILE_DEVICE auto async_load_tile_raw(LdsTileWindow_&& lds_tile, + const tile_window_linear& tile_window, + bool_constant = {}, + bool_constant = {}) +{ + return tile_window.async_load_raw( + lds_tile, number<-1>{}, bool_constant{}, bool_constant{}); } CK_TILE_DEVICE auto async_load_fence(index_t cnt = 0) diff --git a/include/ck_tile/core/tensor/shuffle_tile.hpp b/include/ck_tile/core/tensor/shuffle_tile.hpp index baf009add..da3c7117e 100644 --- a/include/ck_tile/core/tensor/shuffle_tile.hpp +++ b/include/ck_tile/core/tensor/shuffle_tile.hpp @@ -109,7 +109,7 @@ CK_TILE_DEVICE void shuffle_tile_impl_in_thread(OutTensor& out_tensor, const InT // get input vectors static_for<0, num_vec_in, 1>{}([&](auto i) { - constexpr auto idx_y_in = generate_array( + constexpr auto idx_y_in = generate_tuple( [&](auto ii) { return ii == y_dim_vec_out ? 
idx_y_start[ii] + i : idx_y_start[ii]; }, diff --git a/include/ck_tile/core/tensor/store_tile.hpp b/include/ck_tile/core/tensor/store_tile.hpp index 2efc65701..d5a716664 100644 --- a/include/ck_tile/core/tensor/store_tile.hpp +++ b/include/ck_tile/core/tensor/store_tile.hpp @@ -10,6 +10,7 @@ #include "ck_tile/core/container/container_helper.hpp" #include "ck_tile/core/numeric/math.hpp" #include "ck_tile/core/tensor/tile_window.hpp" +#include "ck_tile/core/tensor/tile_window_linear.hpp" #include "ck_tile/core/utility/type_traits.hpp" namespace ck_tile { @@ -72,7 +73,7 @@ store_tile(tile_window_with_static_distribution& tile_window, const static_distributed_tensor& dstr_tensor) { - tile_window.store(dstr_tensor); + tile_window.store(dstr_tensor, number<-1>{}); } template & tile_window, const static_distributed_tensor& dstr_tensor) { - tile_window.store_raw(dstr_tensor); + tile_window.store_raw(dstr_tensor, number<-1>{}); +} + +template +CK_TILE_DEVICE void store_tile( + tile_window_linear& + tile_window, + const static_distributed_tensor& dstr_tensor) +{ + tile_window.store(dstr_tensor, number<-1>{}); +} + +template +CK_TILE_DEVICE void store_tile_raw( + tile_window_linear& + tile_window, + const static_distributed_tensor& dstr_tensor) +{ + tile_window.store_raw(dstr_tensor, number<-1>{}); } } // namespace ck_tile diff --git a/include/ck_tile/core/tensor/tensor_view.hpp b/include/ck_tile/core/tensor/tensor_view.hpp index 4655eec24..698ce5378 100644 --- a/include/ck_tile/core/tensor/tensor_view.hpp +++ b/include/ck_tile/core/tensor/tensor_view.hpp @@ -16,6 +16,24 @@ namespace ck_tile { +/* + * tensor_view + * abstract the underneath memory buffer(global, LDS, etc...) 
+ * and provide a unified get/set function for access + * + * For addressing into the buffer we use 2 variable to control: + * coord : ND tensor coordinate, will calculate the actual offset inside + * linear_offset : 1D offset, will be used in the immediate field of + * the buffer instruction to help reduce register usage + * + * User can use either of the field, or both to indexing into the tensor + * + * We usually provide 2 set of API for buffer get/set, e.g. + * get_vectorized_elements()/get_vectorized_elements_raw() + * the former usually will call intrinsic or normal C function, the later + * usually will call inline-asm function + * + */ template @@ -49,22 +67,6 @@ struct tensor_view CK_TILE_HOST_DEVICE constexpr auto& get_buffer_view() { return buf_; } -#if 0 - CK_TILE_HOST_DEVICE constexpr DataType get_element(const TensorCoord& coord) const - { - return buf_.template get( - coord.get_offset(), - coordinate_has_valid_offset_assuming_top_index_is_valid(desc_, coord)); - } - - CK_TILE_HOST_DEVICE constexpr void set_element(const TensorCoord& coord, const DataType& x) - { - buf_.template set( - coord.get_offset(), - coordinate_has_valid_offset_assuming_top_index_is_valid(desc_, coord), - x); - } -#endif // X is vector of DataType. // "coord" is coordinate of DataType, not X. 
"coord" should be aligned to X template ::type = false> CK_TILE_HOST_DEVICE constexpr remove_cvref_t get_vectorized_elements(const TensorCoord& coord, + index_t linear_offset, bool_constant = {}) const { return buf_.template get( coord.get_offset(), + linear_offset, coordinate_has_valid_offset_assuming_top_index_is_valid(desc_, coord), bool_constant{}); } + template >::scalar_type, + typename vector_traits>::scalar_type>, + bool>::type = false> + CK_TILE_HOST_DEVICE constexpr remove_cvref_t + get_vectorized_elements(const TensorCoord& coord, + index_t linear_offset, + bool is_valid_element, // flag + bool_constant = {}) const + { + return buf_.template get(coord.get_offset(), + linear_offset, + is_valid_element, + bool_constant{}); + } + // X is vector of DataType. // "coord" is coordinate of DataType, not X. "coord" should be aligned to X template ::type = false> CK_TILE_HOST_DEVICE void get_vectorized_elements_raw(remove_cvref_t& dst, const TensorCoord& coord, + index_t linear_offset, bool_constant = {}, bool_constant = {}) const { return buf_.template get_raw( dst, coord.get_offset(), + linear_offset, + coordinate_has_valid_offset_assuming_top_index_is_valid(desc_, coord), + bool_constant{}); + } + + template >::scalar_type, + typename vector_traits>::scalar_type>, + bool>::type = false> + CK_TILE_HOST_DEVICE void get_vectorized_elements_raw(remove_cvref_t& dst, + const TensorCoord& coord, + index_t linear_offset, + bool is_valid_element, + bool_constant = {}, + bool_constant = {}) const + { + return buf_.template get_raw( + dst, coord.get_offset(), linear_offset, is_valid_element, bool_constant{}); + } + + template >::scalar_type, + typename vector_traits>::scalar_type>, + bool>::type = false> + CK_TILE_HOST_DEVICE constexpr void + async_get_vectorized_elements(CK_TILE_LDS_ADDR remove_cvref_t* smem, + const TensorCoord& coord, + index_t linear_offset) const + { + return buf_.template async_get( + smem, + coord.get_offset(), + linear_offset, + 
coordinate_has_valid_offset_assuming_top_index_is_valid(desc_, coord), + bool_constant{}); + } + + template >::scalar_type, + typename vector_traits>::scalar_type>, + bool>::type = false> + CK_TILE_HOST_DEVICE constexpr void + async_get_vectorized_elements(CK_TILE_LDS_ADDR remove_cvref_t* smem, + const TensorCoord& coord, + index_t linear_offset, + bool is_valid_element) const + { + return buf_.template async_get(smem, + coord.get_offset(), + linear_offset, + is_valid_element, + bool_constant{}); + } + + template >::scalar_type, + typename vector_traits>::scalar_type>, + bool>::type = false> + CK_TILE_HOST_DEVICE constexpr void + async_get_vectorized_elements_raw(remove_cvref_t* smem, + const TensorCoord& coord, + index_t linear_offset, + bool_constant = {}) const + { + return buf_.template async_get_raw( + smem, + coord.get_offset(), + linear_offset, coordinate_has_valid_offset_assuming_top_index_is_valid(desc_, coord), bool_constant{}); } @@ -110,11 +210,15 @@ struct tensor_view std::is_same_v>::scalar_type, typename vector_traits>::scalar_type>, bool>::type = false> - CK_TILE_HOST_DEVICE constexpr void async_get_vectorized_elements_raw( - remove_cvref_t* smem, const TensorCoord& coord, bool_constant = {}) const + CK_TILE_HOST_DEVICE constexpr void + async_get_vectorized_elements_raw(remove_cvref_t* smem, + const TensorCoord& coord, + index_t linear_offset, + bool is_valid_element, + bool_constant = {}) const { return buf_.template async_get_raw( - smem, coord.get_offset(), true /*not used*/, bool_constant{}); + smem, coord.get_offset(), linear_offset, is_valid_element, bool_constant{}); } // X is vector of DataType. 
@@ -125,11 +229,15 @@ struct tensor_view std::is_same_v>::scalar_type, typename vector_traits>::scalar_type>, bool>::type = false> - CK_TILE_HOST_DEVICE constexpr void set_vectorized_elements( - const TensorCoord& coord, const X& x, bool_constant = {}) + CK_TILE_HOST_DEVICE constexpr void + set_vectorized_elements(const TensorCoord& coord, + index_t linear_offset, + const X& x, + bool_constant = {}) { buf_.template set( coord.get_offset(), + linear_offset, coordinate_has_valid_offset_assuming_top_index_is_valid(desc_, coord), x); } @@ -140,15 +248,53 @@ struct tensor_view std::is_same_v>::scalar_type, typename vector_traits>::scalar_type>, bool>::type = false> - CK_TILE_HOST_DEVICE constexpr void set_vectorized_elements_raw( - const TensorCoord& coord, const X& x, bool_constant = {}) + CK_TILE_HOST_DEVICE constexpr void + set_vectorized_elements(const TensorCoord& coord, + index_t linear_offset, + bool is_valid_element, + const X& x, + bool_constant = {}) + { + buf_.template set( + coord.get_offset(), linear_offset, is_valid_element, x); + } + + template >::scalar_type, + typename vector_traits>::scalar_type>, + bool>::type = false> + CK_TILE_HOST_DEVICE constexpr void + set_vectorized_elements_raw(const TensorCoord& coord, + index_t linear_offset, + const X& x, + bool_constant = {}) { buf_.template set_raw( coord.get_offset(), + linear_offset, coordinate_has_valid_offset_assuming_top_index_is_valid(desc_, coord), x); } + template >::scalar_type, + typename vector_traits>::scalar_type>, + bool>::type = false> + CK_TILE_HOST_DEVICE constexpr void + set_vectorized_elements_raw(const TensorCoord& coord, + index_t linear_offset, + bool is_valid_element, + const X& x, + bool_constant = {}) + { + buf_.template set_raw( + coord.get_offset(), linear_offset, is_valid_element, x); + } + // X is vector of DataType. // "coord" is coordinate of DataType, not X. 
"coord" should be aligned to X template >::scalar_type, typename vector_traits>::scalar_type>, bool>::type = false> - CK_TILE_HOST_DEVICE constexpr void update_vectorized_elements( - const TensorCoord& coord, const X& x, bool_constant = {}) + CK_TILE_HOST_DEVICE constexpr void + update_vectorized_elements(const TensorCoord& coord, + index_t linear_offset, + const X& x, + bool_constant = {}) { buf_.template update( coord.get_offset(), + linear_offset, coordinate_has_valid_offset_assuming_top_index_is_valid(desc_, coord), x); } + template >::scalar_type, + typename vector_traits>::scalar_type>, + bool>::type = false> + CK_TILE_HOST_DEVICE constexpr void + update_vectorized_elements(const TensorCoord& coord, + index_t linear_offset, + bool is_valid_element, + const X& x, + bool_constant = {}) + { + buf_.template update( + coord.get_offset(), linear_offset, is_valid_element, x); + } + CK_TILE_HOST_DEVICE void print() const { printf("tensor_view{"); diff --git a/include/ck_tile/core/tensor/tile_window.hpp b/include/ck_tile/core/tensor/tile_window.hpp index 266d623c7..ca3507827 100644 --- a/include/ck_tile/core/tensor/tile_window.hpp +++ b/include/ck_tile/core/tensor/tile_window.hpp @@ -18,6 +18,8 @@ namespace ck_tile { +// Note: this tile window do not support single issue +// you need to use tile_window_linear structure for this purpose template {}; static constexpr auto I1 = number<1>{}; + static_assert(NumCoord == 1); // TODO: check WindowLengths and StaticTileDistribution are consistent @@ -189,7 +192,8 @@ struct tile_window_with_static_distribution constexpr auto idx_diff_ys = SFC_Ys::get_step_between(number<0>{}, number{}); - constexpr auto idx_diff_ps_ys = container_concat(array{0}, idx_diff_ys); + constexpr auto idx_diff_ps_ys = container_concat( + generate_tuple([&](auto) { return number<0>{}; }, number{}), idx_diff_ys); move_window_adaptor_and_bottom_tensor_thread_coordinate( window_adaptor_thread_coord, bottom_tensor_thread_coord, idx_diff_ps_ys); @@ -222,10 
+226,11 @@ struct tile_window_with_static_distribution // move thread's window adaptor coordinate and bottom tensor coordinate // [p0, p1, ..., y0, y1, ...] ==> [x0, x1, ...] ==> [x0', x1', ...] ==> [offset] + template CK_TILE_DEVICE void move_window_adaptor_and_bottom_tensor_thread_coordinate( WindowAdaptorCoord& window_adaptor_thread_coord, BottomTensorCoord& bottom_tensor_thread_coord, - const AdaptorTopIndex& idx_diff_adaptor_top) const + const ATopIndex& idx_diff_adaptor_top) const { array idx_diff_adaptor_bottom; @@ -279,10 +284,11 @@ struct tile_window_with_static_distribution get_container_subset(window_adaptor_ps_ys_vector_strides, y_dims)); } - CK_TILE_DEVICE constexpr auto get_num_access() const { return load_store_traits::NumAccess; } + CK_TILE_DEVICE constexpr auto get_num_of_access() const { return load_store_traits::NumAccess; } - template - CK_TILE_DEVICE auto load(bool_constant = {}) const + template + CK_TILE_DEVICE auto load(number = {}, + bool_constant = {}) const { using Traits = load_store_traits; @@ -308,11 +314,11 @@ struct tile_window_with_static_distribution // read from bottom tensor const vector_t vec_value = get_bottom_tensor_view().template get_vectorized_elements( - bottom_tensor_thread_coord, bool_constant{}); + bottom_tensor_thread_coord, 0, bool_constant{}); #if 1 // write into distributed tensor static_for<0, Traits::ScalarPerVector, 1>{}([&](auto j) { - constexpr auto idx_ys = generate_array( + constexpr auto idx_ys = generate_tuple( [&](auto jj) { return jj == Traits::VectorDimY ? 
(idx_ys_start[jj] + j) : idx_ys_start[jj]; @@ -338,8 +344,9 @@ struct tile_window_with_static_distribution { constexpr auto idx_diff_ys = SFC_Ys::get_forward_step(iAccess); - constexpr auto idx_diff_ps_ys = - container_concat(array{0}, idx_diff_ys); + constexpr auto idx_diff_ps_ys = container_concat( + generate_tuple([&](auto) { return number<0>{}; }, number{}), + idx_diff_ys); move_window_adaptor_and_bottom_tensor_thread_coordinate( window_adaptor_thread_coord, bottom_tensor_thread_coord, idx_diff_ps_ys); @@ -350,8 +357,12 @@ struct tile_window_with_static_distribution return dst_tensor; } - template + template CK_TILE_DEVICE void load_raw(DstTile& dst_tensor, + number = {}, bool_constant = {}, bool_constant = {}) const { @@ -397,6 +408,7 @@ struct tile_window_with_static_distribution get_bottom_tensor_view().template get_vectorized_elements_raw( dst_vec_tbuf.template at(), bottom_tensor_thread_coord, + 0 /**/, bool_constant{}, pre_nop_); #if CK_TILE_WORKAROUND_ROCM_6_1_SCRATCH_MEMORY_ISSUE || \ @@ -409,23 +421,24 @@ struct tile_window_with_static_distribution { constexpr auto idx_diff_ys = SFC_Ys::get_forward_step(iAccess); - constexpr auto idx_diff_ps_ys = - container_concat(array{0}, idx_diff_ys); + constexpr auto idx_diff_ps_ys = container_concat( + generate_tuple([&](auto) { return number<0>{}; }, number{}), + idx_diff_ys); move_window_adaptor_and_bottom_tensor_thread_coordinate( window_adaptor_thread_coord, bottom_tensor_thread_coord, idx_diff_ps_ys); } }); }); -#if CK_TILE_WORKAROUND_ROCM_6_1_SCRATCH_MEMORY_ISSUE - asm volatile("; this inline asm is workaround to prevent compiler from using too much " - "scratch memory" ::); -#endif } // TODO: currently async load only implemented in inline asm - template + template CK_TILE_DEVICE auto async_load_raw(LdsTileWindow_&& lds_tile, + number = {}, bool_constant = {}, bool_constant = {}) const { @@ -467,7 +480,7 @@ struct tile_window_with_static_distribution // loop over thread tensor space [y0, y1, ...] 
static_for<0, NumCoord, 1>{}([&](auto iCoord) { - // TODO: use structure binding (to be captured later) if compiled in C++20 + /// TODO: use structure binding (to be captured later) if compiled in C++20 auto window_adaptor_thread_coord = pre_computed_coords_[iCoord][I0]; auto bottom_tensor_thread_coord = pre_computed_coords_[iCoord][I1]; @@ -482,15 +495,16 @@ struct tile_window_with_static_distribution // read from bottom tensor get_bottom_tensor_view().template async_get_vectorized_elements_raw( - smem, bottom_tensor_thread_coord, pre_nop_); + smem, bottom_tensor_thread_coord, 0, pre_nop_); // move thread coordinate if constexpr(iCoordAccess != (NumAccessPerCoord - 1)) { constexpr auto idx_diff_ys = SFC_Ys::get_forward_step(iAccess); - constexpr auto idx_diff_ps_ys = - container_concat(array{0}, idx_diff_ys); + constexpr auto idx_diff_ps_ys = container_concat( + generate_tuple([&](auto) { return number<0>{}; }, number{}), + idx_diff_ys); move_window_adaptor_and_bottom_tensor_thread_coordinate( window_adaptor_thread_coord, bottom_tensor_thread_coord, idx_diff_ps_ys); @@ -501,8 +515,81 @@ struct tile_window_with_static_distribution }); } - template + template + CK_TILE_DEVICE auto async_load(LdsTileWindow_&& lds_tile, + number = {}, + bool_constant = {}) const + { + using LdsTileWindow = remove_cvref_t; + using LdsDataType = typename LdsTileWindow::DataType; + + // issues * warps * lanes + static_assert(LdsTileWindow::get_num_of_dimension() == 3); // TODO: hard coded + + // TODO: LDS offset is not good for intrinsic based implementation(compiler can't figure out + // dependency) hence avoid use offset based solution. size_per_buf should be zero (how to + // check?) 
+ constexpr index_t size_per_buf = + lds_tile.get_bottom_tensor_view().get_tensor_descriptor().calculate_offset( + make_tuple(number<0>{}, number<0>{}, number<0>{})); + + constexpr index_t size_per_wave = + lds_tile.get_bottom_tensor_view().get_tensor_descriptor().calculate_offset( + make_tuple(number<0>{}, number<1>{}, number<0>{})) - + size_per_buf; + + constexpr index_t size_per_issue = + lds_tile.get_bottom_tensor_view().get_tensor_descriptor().calculate_offset( + make_tuple(number<1>{}, number<0>{}, number<0>{})) - + size_per_buf; + + const index_t m0_init_value = size_per_buf + size_per_wave * get_warp_id(); + + using Traits = load_store_traits; + + using vector_t = typename Traits::vector_t; + using SFC_Ys = typename Traits::SFC_Ys; + + // TODO: we force CK_TILE_LDS_ADDR + CK_TILE_LDS_ADDR LdsDataType* smem = + lds_tile.get_bottom_tensor_view().get_buffer_view().p_data_ + m0_init_value; + + // loop over thread tensor space [y0, y1, ...] + static_for<0, NumCoord, 1>{}([&](auto iCoord) { + /// TODO: use structure binding (to be captured later) if compiled in C++20 + auto window_adaptor_thread_coord = pre_computed_coords_[iCoord][I0]; + auto bottom_tensor_thread_coord = pre_computed_coords_[iCoord][I1]; + + static_for<0, NumAccessPerCoord, 1>{}([&](auto iCoordAccess) { + constexpr auto iAccess = number{}; + + // read from bottom tensor + get_bottom_tensor_view().template async_get_vectorized_elements( + smem, bottom_tensor_thread_coord, 0, bool_constant{}); + + // move thread coordinate + if constexpr(iCoordAccess != (NumAccessPerCoord - 1)) + { + constexpr auto idx_diff_ys = SFC_Ys::get_forward_step(iAccess); + + constexpr auto idx_diff_ps_ys = container_concat( + generate_tuple([&](auto) { return number<0>{}; }, number{}), + idx_diff_ys); + + move_window_adaptor_and_bottom_tensor_thread_coordinate( + window_adaptor_thread_coord, bottom_tensor_thread_coord, idx_diff_ps_ys); + + smem += size_per_issue; // Note we manually increase the per-issue offset + } + }); 
+ }); + } + + template CK_TILE_DEVICE void store(const static_distributed_tensor& dstr_tensor, + number = {}, bool_constant = {}) const { using Traits = load_store_traits; @@ -515,7 +602,6 @@ struct tile_window_with_static_distribution // loop over thread tensor space [y0, y1, ...] static_for<0, NumCoord, 1>{}([&](auto iCoord) { - /// TODO: use structure binding (to be captured later) if compiled in C++20 auto window_adaptor_thread_coord = pre_computed_coords_[iCoord][I0]; auto bottom_tensor_thread_coord = pre_computed_coords_[iCoord][I1]; @@ -530,7 +616,7 @@ struct tile_window_with_static_distribution vector_t vec_value; static_for<0, Traits::ScalarPerVector, 1>{}([&](auto j) { - constexpr auto idx_ys = generate_array( + constexpr auto idx_ys = generate_tuple( [&](auto jj) { return jj == Traits::VectorDimY ? (idx_ys_start[jj] + j) : idx_ys_start[jj]; @@ -548,15 +634,19 @@ struct tile_window_with_static_distribution // write into bottom tensor get_bottom_tensor_view().template set_vectorized_elements( - bottom_tensor_thread_coord, vec_value, bool_constant{}); + bottom_tensor_thread_coord, + 0, + vec_value, + bool_constant{}); // move thread coordinate if constexpr(iCoordAccess != (NumAccessPerCoord - 1)) { constexpr auto idx_diff_ys = SFC_Ys::get_forward_step(iAccess); - constexpr auto idx_diff_ps_ys = - container_concat(array{0}, idx_diff_ys); + constexpr auto idx_diff_ps_ys = container_concat( + generate_tuple([&](auto) { return number<0>{}; }, number{}), + idx_diff_ys); move_window_adaptor_and_bottom_tensor_thread_coordinate( window_adaptor_thread_coord, bottom_tensor_thread_coord, idx_diff_ps_ys); @@ -565,8 +655,9 @@ struct tile_window_with_static_distribution }); } - CK_TILE_DEVICE void - store_raw(const static_distributed_tensor& dstr_tensor) const + template + CK_TILE_DEVICE void store_raw(const static_distributed_tensor& dstr_tensor, + number = {}) const { using Traits = load_store_traits; @@ -591,7 +682,7 @@ struct tile_window_with_static_distribution // 
read from distributed tensor vector_t vec_value; static_for<0, Traits::ScalarPerVector, 1>{}([&](auto j) { - constexpr auto idx_ys = generate_array( + constexpr auto idx_ys = generate_tuple( [&](auto jj) { return jj == Traits::VectorDimY ? (idx_ys_start[jj] + j) : idx_ys_start[jj]; @@ -606,15 +697,16 @@ struct tile_window_with_static_distribution // write into bottom tensor get_bottom_tensor_view() .template set_vectorized_elements_raw( - bottom_tensor_thread_coord, vec_value); + bottom_tensor_thread_coord, 0, vec_value); // move thread coordinate if constexpr(iCoordAccess != (NumAccessPerCoord - 1)) { constexpr auto idx_diff_ys = SFC_Ys::get_forward_step(iAccess); - constexpr auto idx_diff_ps_ys = - container_concat(array{0}, idx_diff_ys); + constexpr auto idx_diff_ps_ys = container_concat( + generate_tuple([&](auto) { return number<0>{}; }, number{}), + idx_diff_ys); move_window_adaptor_and_bottom_tensor_thread_coordinate( window_adaptor_thread_coord, bottom_tensor_thread_coord, idx_diff_ps_ys); @@ -623,8 +715,9 @@ struct tile_window_with_static_distribution }); } - template + template CK_TILE_DEVICE void update(const static_distributed_tensor& dstr_tensor, + number = {}, bool_constant = {}) const { using Traits = load_store_traits; @@ -650,7 +743,7 @@ struct tile_window_with_static_distribution vector_t vec_value; static_for<0, Traits::ScalarPerVector, 1>{}([&](auto j) { - constexpr auto idx_ys = generate_array( + constexpr auto idx_ys = generate_tuple( [&](auto jj) { return jj == Traits::VectorDimY ? 
(idx_ys_start[jj] + j) : idx_ys_start[jj]; @@ -666,15 +759,19 @@ struct tile_window_with_static_distribution // write into bottom tensor get_bottom_tensor_view().template update_vectorized_elements( - bottom_tensor_thread_coord, vec_value, bool_constant{}); + bottom_tensor_thread_coord, + 0, + vec_value, + bool_constant{}); // move thread coordinate if constexpr(iCoordAccess != (NumAccessPerCoord - 1)) { constexpr auto idx_diff_ys = SFC_Ys::get_forward_step(iAccess); - constexpr auto idx_diff_ps_ys = - container_concat(array{0}, idx_diff_ys); + constexpr auto idx_diff_ps_ys = container_concat( + generate_tuple([&](auto) { return number<0>{}; }, number{}), + idx_diff_ys); move_window_adaptor_and_bottom_tensor_thread_coordinate( window_adaptor_thread_coord, bottom_tensor_thread_coord, idx_diff_ps_ys); @@ -746,7 +843,8 @@ struct tile_window_with_static_distribution constexpr auto idx_diff_ys = SFC_Ys::get_step_between(number<0>{}, number{}); - constexpr auto idx_diff_ps_ys = container_concat(array{0}, idx_diff_ys); + constexpr auto idx_diff_ps_ys = container_concat( + generate_tuple([&](auto) { return number<0>{}; }, number{}), idx_diff_ys); move_window_adaptor_and_bottom_tensor_thread_coordinate( window_adaptor_thread_coord, bottom_tensor_thread_coord, idx_diff_ps_ys); @@ -798,6 +896,27 @@ make_tile_window(const TensorView_& tensor_view, tensor_view, window_lengths, origin, tile_distribution}; } +// this version can't be called in a constexpr context +template +CK_TILE_DEVICE auto +make_tile_window_raw(const TensorView_& tensor_view, + const WindowLengths_& window_lengths, + const multi_index& origin, + const StaticTileDistribution_& tile_distribution, + number = {}) +{ + auto w = tile_window_with_static_distribution, + remove_cvref_t, + remove_cvref_t, + NumCoord>{ + tensor_view, window_lengths, origin, tile_distribution}; + w.init_raw(); + return w; +} + template +CK_TILE_DEVICE constexpr auto +make_tile_window_raw(const tile_window_with_static_lengths& 
tile_window, + const StaticTileDistribution& tile_distribution) +{ + auto w = make_tile_window(tile_window.get_bottom_tensor_view(), + tile_window.get_window_lengths(), + tile_window.get_window_origin(), + tile_distribution); + w.init_raw(); + return w; +} + template CK_TILE_DEVICE void move_tile_window( tile_window_with_static_lengths& window, diff --git a/include/ck_tile/core/tensor/tile_window_linear.hpp b/include/ck_tile/core/tensor/tile_window_linear.hpp new file mode 100644 index 000000000..4b921ec5b --- /dev/null +++ b/include/ck_tile/core/tensor/tile_window_linear.hpp @@ -0,0 +1,1082 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once +#include "ck_tile/core/arch/arch.hpp" +#include "ck_tile/core/arch/utility.hpp" +#include "ck_tile/core/algorithm/space_filling_curve.hpp" +#include "ck_tile/core/config.hpp" +#include "ck_tile/core/container/array.hpp" +#include "ck_tile/core/container/sequence.hpp" +#include "ck_tile/core/container/tuple.hpp" +#include "ck_tile/core/container/container_helper.hpp" +#include "ck_tile/core/tensor/static_distributed_tensor.hpp" +#include "ck_tile/core/tensor/tensor_adaptor.hpp" +#include "ck_tile/core/tensor/tile_distribution.hpp" +#include "ck_tile/core/utility/functional.hpp" +#include "ck_tile/core/utility/type_traits.hpp" + +namespace ck_tile { + +#define WINDOW_DISPATCH_ISSUE() \ + if constexpr(i_access < 0) \ + { \ + static_for<0, NumAccess, 1>{}([&](auto ia) { issue(ia); }); \ + } \ + else \ + { \ + static_assert(i_access < NumAccess); \ + issue(number{}); \ + } + +// +// This version of tile window will pre-cache offset/flags based on need +// +// LinearBottomDims_, e.g seq<0, 1> for 2d tensor, the last one is linear dim +// so last dim can use immediate offset to indexing, can save register +// TODO: if using this struct, better use load_raw()/store_raw(), can control +// the immediate offset on the fly +// space-filling-curve is 
non-snaked here! +// +template +struct tile_window_linear +{ + using BottomTensorView = remove_reference_t; + using WindowLengths = remove_cvref_t; + using TileDstr = remove_cvref_t; + + using WindowAdaptor = typename TileDstr::PsYs2XsAdaptor; + using BottomTensorDesc = typename BottomTensorView::TensorDesc; + + using DataType = remove_cvref_t; + using LinearBottomDims = remove_cvref_t; + + static_assert(LinearBottomDims::size() == BottomTensorView::get_num_of_dimension()); + + static constexpr index_t NDimWindowAdaptorTop = WindowAdaptor::get_num_of_top_dimension(); + static constexpr index_t NDimBottomTensor = BottomTensorDesc::get_num_of_dimension(); + + static constexpr index_t NDimP = TileDstr::get_num_of_dimension_p(); + static constexpr index_t NDimY = TileDstr::get_num_of_dimension_y(); + + static constexpr auto I0 = number<0>{}; + static constexpr auto I1 = number<1>{}; + + // TODO: check WindowLengths and StaticTileDistribution are consistent + + static_assert(ck_tile::is_known_at_compile_time::value, + "wrong! lengths should be static"); + static_assert(TileDstr::is_static(), "wrong!"); + + static_assert(NDimBottomTensor == WindowAdaptor::get_num_of_bottom_dimension(), + "wrong! inconsistent # of diemsnions"); + + using AdaptorTopIndex = array; + using BottomTensorIndex = array; + + using WindowAdaptorCoord = + decltype(make_tensor_adaptor_coordinate(WindowAdaptor{}, AdaptorTopIndex{})); + + using BottomTensorCoord = + decltype(make_tensor_coordinate(BottomTensorDesc{}, BottomTensorIndex{})); + + struct traits + { + private: + // return vector dimension among [y0, y1, ...] 
+ CK_TILE_DEVICE static constexpr auto get_window_adaptor_ys_safe_vector_length_strides() + { + // bottom tensor top dimension vector lengths and strides + const auto [bottom_tensor_top_dim_vector_lengths, + bottom_tensor_top_dim_vector_strides] = + BottomTensorDesc::get_top_dimension_safe_vector_length_strides(); + + // window vector lengths/strides + const auto window_adaptor_bottom_dim_vector_lengths = + bottom_tensor_top_dim_vector_lengths; + const auto window_adaptor_bottom_dim_vector_strides = + bottom_tensor_top_dim_vector_strides; + + // window adaptor [p0, p1, ..., y0, y1, ...] + array + window_adaptor_vector_lengths{-1}; + array + window_adaptor_vector_strides{-1}; + + constexpr auto window_adaptor_bottom_dims = + WindowAdaptor::get_bottom_dimension_hidden_ids(); + + set_container_subset(window_adaptor_vector_lengths, + window_adaptor_bottom_dims, + window_adaptor_bottom_dim_vector_lengths); + set_container_subset(window_adaptor_vector_strides, + window_adaptor_bottom_dims, + window_adaptor_bottom_dim_vector_strides); + + const auto [window_adaptor_ps_ys_vector_lengths, window_adaptor_ps_ys_vector_strides] = + WindowAdaptor{}.get_top_dimension_safe_vector_length_strides( + window_adaptor_vector_lengths, window_adaptor_vector_strides); + + // [y0, y1, ...] 
+ constexpr auto y_dims = + typename arithmetic_sequence_gen::type{}; + + return make_tuple(get_container_subset(window_adaptor_ps_ys_vector_lengths, y_dims), + get_container_subset(window_adaptor_ps_ys_vector_strides, y_dims)); + } + + static constexpr auto get_vector_dim_y_scalar_per_vector() + { + const auto [ys_vector_lengths, ys_vector_strides] = + get_window_adaptor_ys_safe_vector_length_strides(); + + index_t VectorDimY_ = 0; + index_t ScalarPerVector_ = 1; + + for(index_t i = 0; i < NDimY; ++i) + { + if(ys_vector_strides[i] == 1 && ys_vector_lengths[i] > ScalarPerVector_) + { + ScalarPerVector_ = ys_vector_lengths[i]; + VectorDimY_ = i; + } + } + + return make_tuple(VectorDimY_, ScalarPerVector_); + } + + public: + static constexpr index_t VectorDimY = get_vector_dim_y_scalar_per_vector().template at<0>(); + static constexpr index_t ScalarPerVector = + get_vector_dim_y_scalar_per_vector().template at<1>(); + + using vector_t = thread_buffer; + + private: + static constexpr auto scalars_per_access_ = [] { + constexpr auto scalars_per_access_arr = generate_array( + [&](auto i) { return (i == VectorDimY) ? ScalarPerVector : 1; }, number{}); + + /// TODO: add non-automatic storage argument support to macro TO_SEQUENCE() + constexpr auto NDimY_ = NDimY; + + return TO_SEQUENCE(scalars_per_access_arr, NDimY_); + }(); + + static constexpr auto get_space_filling_curve() + { + constexpr auto thread_tensor_lengths_ys = + to_sequence(TileDstr{}.get_ys_to_d_descriptor().get_lengths()); + + // FIXME: need logic to judge dim access order + using DimAccessOrder = typename arithmetic_sequence_gen<0, NDimY, 1>::type; + + return space_filling_curve{}; + } + + public: + using SFC_Ys = decltype(get_space_filling_curve()); + + static constexpr index_t NumAccess = SFC_Ys::get_num_of_access(); + + static_assert(0 < NumAccess, "Wrong! 
NumAccess should be larger than 0"); + + private: + static constexpr auto get_num_non_linear_access() + { + constexpr auto sfc_access_lens = SFC_Ys::access_lengths; + using ys_to_rhs_major = + typename decltype(TileDstr{}.get_static_tile_distribution_encoding())::Ys2RHsMajor; + + constexpr auto non_linear = [&]() { + index_t cnt = 1; + static_for<0, NDimY, 1>{}([&](auto i_dim_y) { + constexpr auto rhs_major = ys_to_rhs_major{}[i_dim_y]; + constexpr auto target_h_dim = number{}; // no r dim here! + if constexpr(LinearBottomDims{}[target_h_dim] == 0) + { + cnt *= sfc_access_lens[i_dim_y]; + } + }); + return cnt; + }(); + + return non_linear; + } + + // example: + // non_linear_access_map: sequence<0, 0, 0, 0, 1, 1, 1, 1> for 8 access, totally 2 register + // used + // -> histogram : sequence<4, 4> + // -> prefixsum : sequence<0, 4, 8> + // non_linear_access_map: sequence<0, 1, 2, 3, 4, 5, 6, 7> for 8 access, totally 8 register + // used, will pre-cache 8 + // -> histogram : sequence<1, 1, 1, 1, 1, 1, 1, 1> + // -> prefixsum : sequence<0, 1, 2, 3, 4, 5, 6, 7, 8> + // non_linear_access_map: sequence<0, 0, 1, 1, 2, 2, 3, 3> for 8 access, totally 4 register + // used, will pre-cache 4 + // -> histogram : sequence<2, 2, 2, 2> + // -> prefixsum : sequence<0, 2, 4, 6, 8> + static constexpr auto get_non_linear_access_map() + { + constexpr auto sfc_access_lens = SFC_Ys::access_lengths; + using ys_to_rhs_major = + typename decltype(TileDstr{}.get_static_tile_distribution_encoding())::Ys2RHsMajor; + constexpr auto non_linear_map = [&]() { + array m_{0}; + index_t cumulative_len_ = 1; + index_t cumulative_non_linear_len_ = 1; + static_for<0, NDimY, 1>{}([&](auto i_y) { + constexpr auto i_dim_y = number{}; // from right to left + constexpr auto rhs_major = ys_to_rhs_major{}[i_dim_y]; + constexpr auto target_h_dim = number{}; // no r dim here! 
+ constexpr auto is_linear_dim = LinearBottomDims{}[target_h_dim]; + + array current_m_{0}; + constexpr auto current_len_ = sfc_access_lens[i_dim_y]; + + // copy cumulative length as current pattern + for(auto i_ = 0; i_ < cumulative_len_; i_++) + { + current_m_(i_) = m_[i_]; + } + for(auto j_ = 0; j_ < current_len_; j_++) + { + auto j_offset_ = is_linear_dim ? 0 : j_ * cumulative_non_linear_len_; + for(auto i_ = 0; i_ < cumulative_len_; i_++) + { + m_(j_ * cumulative_len_ + i_) = current_m_[i_] + j_offset_; + } + } + cumulative_len_ *= current_len_; + if(!is_linear_dim) + cumulative_non_linear_len_ *= current_len_; + }); + return m_; + }(); + + return TO_SEQUENCE(non_linear_map, NumAccess); + } + + static constexpr auto get_non_linear_access_histogram() + { + constexpr auto m_ = get_non_linear_access_map(); + // m_.foo(); + + constexpr auto r_ = + typename arithmetic_sequence_gen<0, get_num_non_linear_access() + 1, 1>::type{}; + + constexpr auto h_ = histogram_sorted_sequence(m_, r_); + + return h_; + } + + static constexpr auto get_non_linear_access_histogram_prefix_sum() + { + constexpr auto h_ = get_non_linear_access_histogram(); + constexpr auto h_prefix_sum_ = prefix_sum_sequence(h_); + return h_prefix_sum_; + } + + public: + static constexpr index_t NumAccess_NonLinear = get_num_non_linear_access(); + using AccessMap_NonLinear = decltype(get_non_linear_access_map()); // sequence + using AccessHistogram_NonLinear = decltype(get_non_linear_access_histogram()); + using AccessPrefixSum_NonLinear = decltype(get_non_linear_access_histogram_prefix_sum()); + }; + + static constexpr index_t NumAccess = traits::NumAccess; + static constexpr index_t NumAccess_NonLinear = traits::NumAccess_NonLinear; + using AccessMap_NonLinear = typename traits::AccessMap_NonLinear; + using AccessHistogram_NonLinear = typename traits::AccessHistogram_NonLinear; + using AccessPrefixSum_NonLinear = typename traits::AccessPrefixSum_NonLinear; + + CK_TILE_DEVICE constexpr 
tile_window_linear() = default; + + CK_TILE_DEVICE constexpr tile_window_linear(const BottomTensorView& bottom_tensor_view, + const WindowLengths& window_lengths, + const BottomTensorIndex& window_origin, + const TileDstr& tile_distribution) + : bottom_tensor_view_{bottom_tensor_view}, + window_lengths_{window_lengths}, + window_origin_{window_origin}, + tile_dstr_{tile_distribution}, + cached_coords_{}, + cached_flags_{} + { + auto window_adaptor_thread_coord_tmp = make_tensor_adaptor_coordinate( + tile_distribution.get_ps_ys_to_xs_adaptor(), + container_concat(make_tuple(get_warp_id(), get_lane_id()), + generate_tuple([&](auto) { return number<0>{}; }, number{}))); + + BottomTensorIndex bottom_tensor_thread_origin_idx_tmp = + window_origin + window_adaptor_thread_coord_tmp.get_bottom_index(); + + auto bottom_tensor_thread_coord_tmp = make_tensor_coordinate( + bottom_tensor_view_.get_tensor_descriptor(), bottom_tensor_thread_origin_idx_tmp); + + // future load/store() calls (might allocate more registers) + using SFC_Ys = typename traits::SFC_Ys; + + static_for<0, NumAccess, 1>{}([&](auto i_access) { + constexpr auto non_linear_id = number{}; + constexpr auto need_save_non_linear_coord = + bool_constant{}; + + if constexpr(need_save_non_linear_coord) + { + cached_coords_(non_linear_id) = bottom_tensor_thread_coord_tmp; + } + + // TODO: need pad_tensor_view to check which dim need use flag to check + // cached flag is independent from non-linear-coord + // but need be updated in move_tile, with proper dims + cached_flags_(i_access) = coordinate_has_valid_offset_assuming_top_index_is_valid( + bottom_tensor_view_.get_tensor_descriptor(), bottom_tensor_thread_coord_tmp); + + if constexpr(i_access != (NumAccess - 1)) + { + constexpr auto idx_diff_ys = SFC_Ys::get_forward_step(i_access); // tuple of number + constexpr auto idx_diff_ps_ys = container_concat( + generate_tuple([&](auto) { return number<0>{}; }, number{}), + idx_diff_ys); + + 
move_window_adaptor_and_bottom_tensor_thread_coordinate( + window_adaptor_thread_coord_tmp, + bottom_tensor_thread_coord_tmp, + idx_diff_ps_ys); + } + }); + } + + CK_TILE_DEVICE static constexpr index_t get_num_of_dimension() { return NDimBottomTensor; } + + CK_TILE_DEVICE static constexpr bool has_static_tile_distribution() + { + return TileDstr::is_static(); + } + + CK_TILE_DEVICE constexpr auto get_window_lengths() const { return window_lengths_; } + + CK_TILE_DEVICE constexpr auto get_tile_distribution() const { return tile_dstr_; } + + CK_TILE_DEVICE constexpr auto get_bottom_tensor_view() const { return bottom_tensor_view_; } + + CK_TILE_DEVICE constexpr auto get_window_origin() const { return window_origin_; } + + CK_TILE_DEVICE constexpr void + set_bottom_tensor_view_data_ptr(typename BottomTensorView::DataType* data) + { + bottom_tensor_view_.buf_.p_data_ = data; + } + + // move thread's window adaptor coordinate and bottom tensor coordinate + // [p0, p1, ..., y0, y1, ...] ==> [x0, x1, ...] ==> [x0', x1', ...] 
==> [offset] + template + CK_TILE_DEVICE void move_window_adaptor_and_bottom_tensor_thread_coordinate( + WindowAdaptorCoord& window_adaptor_thread_coord, + BottomTensorCoord& bottom_tensor_thread_coord, + const ATopIndex& idx_diff_adaptor_top) const + { + array idx_diff_adaptor_bottom; + + move_tensor_adaptor_coordinate(tile_dstr_.get_ps_ys_to_xs_adaptor(), + window_adaptor_thread_coord, + idx_diff_adaptor_top, + idx_diff_adaptor_bottom); + + move_tensor_coordinate(bottom_tensor_view_.get_tensor_descriptor(), + bottom_tensor_thread_coord, + idx_diff_adaptor_bottom); + } + + template + CK_TILE_DEVICE static constexpr auto get_bottom_linear_coordinate(number) + { + using SFC_Ys = typename traits::SFC_Ys; + constexpr auto idx_ys = SFC_Ys::get_index(number{}); + using ys_to_rhs_major = + typename decltype(TileDstr{}.get_static_tile_distribution_encoding())::Ys2RHsMajor; + + constexpr auto modified_idx_ys = generate_tuple( + [&](auto i_dim_y) { + constexpr auto rhs_major = ys_to_rhs_major{}[i_dim_y]; + constexpr auto target_h_dim = number{}; // no r dim here! 
+ if constexpr(LinearBottomDims{}[target_h_dim] == 0) + { + return number<0>{}; + } + else + { + return number{}; + } + }, + number{}); + + constexpr auto adaptor_ = TileDstr{}.get_ps_ys_to_xs_adaptor(); + constexpr auto idx_ = + container_concat(make_tuple(number<0>{}, number<0>{}), modified_idx_ys); + + return adaptor_.calculate_bottom_index(idx_); + } + + template + CK_TILE_DEVICE static constexpr index_t get_bottom_linear_offset(number) + { + constexpr auto linear_coord = get_bottom_linear_coordinate(number{}); + // since this is a linear offset, we assume the bottom X tensor is always linear + constexpr index_t linear_offset = [&]() { + constexpr auto x_idx_ = linear_coord; + constexpr auto x_len_ = TileDstr{}.get_lengths(); + static_assert(x_idx_.size() == x_len_.size()); + constexpr index_t x_dims_ = x_idx_.size(); + index_t cu_stride_ = 1; + index_t cu_offset_ = 0; + static_for<0, x_dims_, 1>{}([&](auto i_) { + auto r_i_ = number{}; + cu_offset_ += x_idx_[r_i_] * cu_stride_; + cu_stride_ *= x_len_[r_i_]; + }); + return cu_offset_; + }(); + + return linear_offset; + } + + CK_TILE_DEVICE constexpr auto get_num_of_access() const { return traits::NumAccess; } + + template + CK_TILE_DEVICE auto load(number = {}, bool_constant = {}) const + { + using vector_t = typename traits::vector_t; + using SFC_Ys = typename traits::SFC_Ys; + + constexpr auto tile_dstr = TileDstr{}; + + auto dst_tensor = make_static_distributed_tensor(tile_dstr); + + auto issue = [&](auto i_access_) { + constexpr auto IAccess = number{}; + + constexpr auto non_linear_id = number{}; + auto bottom_tensor_thread_coord = cached_coords_[non_linear_id]; + auto bottom_tensor_flag = cached_flags_[IAccess]; + + constexpr auto linear_offset = get_bottom_linear_offset(IAccess); + + // read from bottom tensor + const vector_t vec_value = + get_bottom_tensor_view().template get_vectorized_elements( + bottom_tensor_thread_coord, + linear_offset, + bottom_tensor_flag, + bool_constant{}); +#if 1 + // data index 
[y0, y1, ...] + constexpr auto idx_diff_ys = SFC_Ys::get_index(IAccess); + // write into distributed tensor + static_for<0, traits::ScalarPerVector, 1>{}([&](auto j) { + constexpr auto idx_ys = generate_tuple( + [&](auto jj) { + return jj == traits::VectorDimY ? (idx_diff_ys[jj] + j) : idx_diff_ys[jj]; + }, + number{}); + + constexpr index_t d = tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys); + + dst_tensor.get_thread_buffer().template at() = + vec_value.template get_as()[j]; + }); +#else + constexpr index_t d = tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys_start); + static_assert(d % traits::ScalarPerVector == 0); + + dst_tensor.get_thread_buffer().template get_as()( + number{}) = bit_cast(vec_value); +#endif + }; + + WINDOW_DISPATCH_ISSUE(); + + return dst_tensor; + } + + template + CK_TILE_DEVICE void load_raw(DstTile& dst_tensor, + number = {}, // negative means loop over all num_access + bool_constant = {}, + bool_constant = {}) const + { + using vector_t = typename traits::vector_t; + using SFC_Ys = typename traits::SFC_Ys; + static constexpr index_t YElementSize = + TileDstr{}.get_ys_to_d_descriptor().get_element_space_size(); + static_assert(YElementSize % traits::ScalarPerVector == 0); + using vectorized_tbuf = array; + + constexpr auto tile_dstr = TileDstr{}; + + auto& dst_vec_tbuf = reinterpret_cast(dst_tensor.get_thread_buffer()); + + auto issue = [&](auto i_access_) { + constexpr auto IAccess = number{}; + constexpr auto pre_nop_ = [&]() { + if constexpr(pre_nop && i_access_ == 0 && + BottomTensorView::buffer_view::get_address_space() == + address_space_enum::global) + return bool_constant{}; + else + return bool_constant{}; + }(); + + constexpr auto non_linear_id = number{}; + auto bottom_tensor_thread_coord = cached_coords_[non_linear_id]; + constexpr auto linear_offset = get_bottom_linear_offset(IAccess); + auto bottom_tensor_flag = cached_flags_[IAccess]; + + // data index [y0, y1, ...] 
+ constexpr auto idx_ys_start = SFC_Ys::get_index(IAccess); + constexpr index_t d = tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys_start); + static_assert(d % traits::ScalarPerVector == 0); + + get_bottom_tensor_view().template get_vectorized_elements_raw( + dst_vec_tbuf.template at(), + bottom_tensor_thread_coord, + linear_offset /**/, + bottom_tensor_flag, + bool_constant{}, + pre_nop_); +#if CK_TILE_WORKAROUND_ROCM_6_1_SCRATCH_MEMORY_ISSUE || \ + CK_TILE_WORKAROUND_ROCM_6_2_SCRATCH_MEMORY_ISSUE + asm volatile(""); // this is starting from rocm-6.2, but same symptom, reuse this flag +#endif + }; + + WINDOW_DISPATCH_ISSUE(); + } + + // TODO: currently async load only implemented in inline asm + template + CK_TILE_DEVICE auto async_load_raw(LdsTileWindow_&& lds_tile, + number = {}, + bool_constant = {}, + bool_constant = {}) const + { + using LdsTileWindow = remove_cvref_t; + using LdsDataType = typename LdsTileWindow::DataType; + + // currently we only support everything is non linear dim + // actually it's not performant if we have linear dim(e.g. 
fast changing) + static_assert(NumAccess_NonLinear == NumAccess); + static_assert(BottomTensorView::buffer_view::get_address_space() == + address_space_enum::global); + + // issues * warps * lanes + static_assert(LdsTileWindow::get_num_of_dimension() == 3); // TODO: hard coded + + const index_t size_per_buf = + lds_tile.get_bottom_tensor_view().get_tensor_descriptor().calculate_offset( + make_tuple(number<0>{}, number<0>{}, number<0>{})) * + sizeof(LdsDataType); + + const index_t size_per_wave = + lds_tile.get_bottom_tensor_view().get_tensor_descriptor().calculate_offset( + make_tuple(number<0>{}, number<1>{}, number<0>{})) * + sizeof(LdsDataType) - + size_per_buf; + + const index_t size_per_issue = + lds_tile.get_bottom_tensor_view().get_tensor_descriptor().calculate_offset( + make_tuple(number<1>{}, number<0>{}, number<0>{})) * + sizeof(LdsDataType) - + size_per_buf; + + const index_t m0_init_value = size_per_buf + size_per_wave * get_warp_id(); + m0_set_with_memory(m0_init_value); // This should be wave independent + + using vector_t = typename traits::vector_t; + + LdsDataType* smem = lds_tile.get_bottom_tensor_view().get_buffer_view().p_data_; + + // loop over thread tensor space [y0, y1, ...] 
+ auto issue = [&](auto i_access_) { + constexpr auto IAccess = number{}; + constexpr auto pre_nop_ = [&]() { + if constexpr(pre_nop && i_access_ == 0) + return bool_constant{}; + else + return bool_constant{}; + }(); + + constexpr auto non_linear_id = number{}; + auto bottom_tensor_thread_coord = cached_coords_[non_linear_id]; + auto bottom_tensor_flag = cached_flags_[IAccess]; // get this flag anyway + + // read from bottom tensor + get_bottom_tensor_view().template async_get_vectorized_elements_raw( + smem, bottom_tensor_thread_coord, 0, bottom_tensor_flag, pre_nop_); + + // move thread coordinate + if constexpr(i_access_ != (NumAccess - 1)) + { + m0_inc_with_memory(size_per_issue); + } + }; + + WINDOW_DISPATCH_ISSUE(); + } + + template + CK_TILE_DEVICE auto async_load(LdsTileWindow_&& lds_tile, + number = {}, + bool_constant = {}) const + { + using LdsTileWindow = remove_cvref_t; + using LdsDataType = typename LdsTileWindow::DataType; + + // currently we only support everything is non linear dim + // actually it's not performant if we have linear dim(e.g. fast changing) + static_assert(NumAccess_NonLinear == NumAccess); + static_assert(BottomTensorView::buffer_view::get_address_space() == + address_space_enum::global); + + // issues * warps * lanes + static_assert(LdsTileWindow::get_num_of_dimension() == 3); // TODO: hard coded + + // TODO: LDS offset is not good for intrinsic based implementation(compiler can't figure out + // dependency) hence avoid use offset based solution. size_per_buf should be zero (how to + // check?) 
+ constexpr index_t size_per_buf = + lds_tile.get_bottom_tensor_view().get_tensor_descriptor().calculate_offset( + make_tuple(number<0>{}, number<0>{}, number<0>{})); + + constexpr index_t size_per_wave = + lds_tile.get_bottom_tensor_view().get_tensor_descriptor().calculate_offset( + make_tuple(number<0>{}, number<1>{}, number<0>{})) - + size_per_buf; + + constexpr index_t size_per_issue = + lds_tile.get_bottom_tensor_view().get_tensor_descriptor().calculate_offset( + make_tuple(number<1>{}, number<0>{}, number<0>{})) - + size_per_buf; + + const index_t m0_init_value = size_per_buf + size_per_wave * get_warp_id(); + + using vector_t = typename traits::vector_t; + + // TODO: we force CK_TILE_LDS_ADDR + CK_TILE_LDS_ADDR LdsDataType* smem = + lds_tile.get_bottom_tensor_view().get_buffer_view().p_data_ + m0_init_value; + + // loop over thread tensor space [y0, y1, ...] + auto issue = [&](auto i_access_) { + constexpr auto IAccess = number{}; + constexpr auto non_linear_id = number{}; + auto bottom_tensor_thread_coord = cached_coords_[non_linear_id]; + auto bottom_tensor_flag = cached_flags_[IAccess]; + + // read from bottom tensor + get_bottom_tensor_view().template async_get_vectorized_elements( + smem, + bottom_tensor_thread_coord, + 0, + bottom_tensor_flag, + bool_constant{}); + + // move thread coordinate + if constexpr(i_access_ != (NumAccess - 1)) + { + smem += size_per_issue; // Note we manually increase the per-issue offset + } + }; + + WINDOW_DISPATCH_ISSUE(); + } + + template + CK_TILE_DEVICE void store(const static_distributed_tensor& dstr_tensor, + number = {}, + bool_constant = {}) const + { + + using vector_t = typename traits::vector_t; + using SFC_Ys = typename traits::SFC_Ys; + + constexpr auto tile_dstr = TileDstr{}; + + // loop over thread tensor space [y0, y1, ...] 
+ auto issue = [&](auto i_access_) { + constexpr auto IAccess = number{}; + constexpr auto non_linear_id = number{}; + auto bottom_tensor_thread_coord = cached_coords_[non_linear_id]; + constexpr auto linear_offset = get_bottom_linear_offset(IAccess); + auto bottom_tensor_flag = cached_flags_[IAccess]; + // data index [y0, y1, ...] + constexpr auto idx_ys_start = SFC_Ys::get_index(IAccess); + + // read from distributed tensor + vector_t vec_value; + + static_for<0, traits::ScalarPerVector, 1>{}([&](auto j) { + constexpr auto idx_ys = generate_tuple( + [&](auto jj) { + return jj == traits::VectorDimY ? (idx_ys_start[jj] + j) : idx_ys_start[jj]; + }, + number{}); + + constexpr index_t d = tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys); + + vec_value.template get_as()(j) = + dstr_tensor.get_thread_buffer().template at(); + }); + + // write into bottom tensor + get_bottom_tensor_view().template set_vectorized_elements( + bottom_tensor_thread_coord, + linear_offset, + bottom_tensor_flag, + vec_value, + bool_constant{}); + }; + + WINDOW_DISPATCH_ISSUE(); + } + + template + CK_TILE_DEVICE void store_raw(const static_distributed_tensor& dstr_tensor, + number = {}) const + { + using vector_t = typename traits::vector_t; + using SFC_Ys = typename traits::SFC_Ys; + + constexpr auto tile_dstr = TileDstr{}; + static constexpr bool oob_conditional_check = true; + + // loop over thread tensor space [y0, y1, ...] + auto issue = [&](auto i_access_) { + constexpr auto IAccess = number{}; + constexpr auto non_linear_id = number{}; + auto bottom_tensor_thread_coord = cached_coords_[non_linear_id]; + constexpr auto linear_offset = get_bottom_linear_offset(IAccess); + auto bottom_tensor_flag = cached_flags_[IAccess]; + + // data index [y0, y1, ...] 
+ constexpr auto idx_ys_start = SFC_Ys::get_index(IAccess); + + // read from distributed tensor + vector_t vec_value; + static_for<0, traits::ScalarPerVector, 1>{}([&](auto j) { + constexpr auto idx_ys = generate_tuple( + [&](auto jj) { + return jj == traits::VectorDimY ? (idx_ys_start[jj] + j) : idx_ys_start[jj]; + }, + number{}); + constexpr index_t d = tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys); + vec_value.template get_as()(j) = + dstr_tensor.get_thread_buffer().template at(); + }); + + // write into bottom tensor + get_bottom_tensor_view() + .template set_vectorized_elements_raw( + bottom_tensor_thread_coord, linear_offset, bottom_tensor_flag, vec_value); + }; + + WINDOW_DISPATCH_ISSUE(); + } + + template + CK_TILE_DEVICE void update(const static_distributed_tensor& dstr_tensor, + number = {}, + bool_constant = {}) const + { + + using vector_t = typename traits::vector_t; + using SFC_Ys = typename traits::SFC_Ys; + + constexpr auto tile_dstr = TileDstr{}; + + // loop over thread tensor space [y0, y1, ...] + auto issue = [&](auto i_access_) { + constexpr auto IAccess = number{}; + constexpr auto non_linear_id = number{}; + auto bottom_tensor_thread_coord = cached_coords_[non_linear_id]; + constexpr auto linear_offset = get_bottom_linear_offset(IAccess); + auto bottom_tensor_flag = cached_flags_[IAccess]; + + // data index [y0, y1, ...] + constexpr auto idx_ys_start = SFC_Ys::get_index(IAccess); + + // read from distributed tensor + vector_t vec_value; + + static_for<0, traits::ScalarPerVector, 1>{}([&](auto j) { + constexpr auto idx_ys = generate_tuple( + [&](auto jj) { + return jj == traits::VectorDimY ? 
(idx_ys_start[jj] + j) : idx_ys_start[jj]; + }, + number{}); + + constexpr index_t d = tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys); + + vec_value.template get_as()(j) = + dstr_tensor.get_thread_buffer().template at(); + }); + + // write into bottom tensor + get_bottom_tensor_view().template update_vectorized_elements( + bottom_tensor_thread_coord, + linear_offset, + bottom_tensor_flag, + vec_value, + bool_constant{}); + }; + + WINDOW_DISPATCH_ISSUE(); + } + + // move thread's bottom tensor coordinate + // [x0', x1', ... ] ==> [offset] + // also move window-origin + CK_TILE_DEVICE void move(const BottomTensorIndex& step) + { + window_origin_ += step; + + static_for<0, NumAccess, 1>{}([&](auto i_access) { + constexpr auto IAccess = number{}; + constexpr auto non_linear_id = number{}; + constexpr auto need_update_non_linear_coord = + bool_constant{}; + + if constexpr(need_update_non_linear_coord) + { + move_tensor_coordinate(bottom_tensor_view_.get_tensor_descriptor(), + cached_coords_(non_linear_id), + step); + } + + // move the current coord with linear_coords + auto tmp_coords = cached_coords_[non_linear_id]; + constexpr auto linear_coord = get_bottom_linear_coordinate(IAccess); + move_tensor_coordinate( + bottom_tensor_view_.get_tensor_descriptor(), tmp_coords, linear_coord); + + cached_flags_(IAccess) = coordinate_has_valid_offset_assuming_top_index_is_valid( + bottom_tensor_view_.get_tensor_descriptor(), tmp_coords); + }); + } + + CK_TILE_DEVICE void set_window_origin(const BottomTensorIndex& new_window_origin) + { + window_origin_ = new_window_origin; + + auto window_adaptor_thread_coord_tmp = make_tensor_adaptor_coordinate( + TileDstr{}.get_ps_ys_to_xs_adaptor(), + container_concat(make_tuple(get_warp_id(), get_lane_id()), + generate_tuple([&](auto) { return number<0>{}; }, number{}))); + + BottomTensorIndex bottom_tensor_thread_origin_idx_tmp = + window_origin_ + window_adaptor_thread_coord_tmp.get_bottom_index(); + + auto
bottom_tensor_thread_coord_tmp = make_tensor_coordinate( + bottom_tensor_view_.get_tensor_descriptor(), bottom_tensor_thread_origin_idx_tmp); + + // future load/store() calls (might allocate more registers) + using SFC_Ys = typename traits::SFC_Ys; + + static_for<0, NumAccess, 1>{}([&](auto i_access) { + constexpr auto non_linear_id = number{}; + constexpr auto need_save_non_linear_coord = + bool_constant{}; + + if constexpr(need_save_non_linear_coord) + { + cached_coords_(non_linear_id) = bottom_tensor_thread_coord_tmp; + } + + if constexpr(i_access != (NumAccess - 1)) + { + constexpr auto idx_diff_ys = SFC_Ys::get_forward_step(i_access); // tuple of number + constexpr auto idx_diff_ps_ys = container_concat( + generate_tuple([&](auto) { return number<0>{}; }, number{}), + idx_diff_ys); + + move_window_adaptor_and_bottom_tensor_thread_coordinate( + window_adaptor_thread_coord_tmp, + bottom_tensor_thread_coord_tmp, + idx_diff_ps_ys); + } + }); + } + + CK_TILE_HOST_DEVICE void init_raw() { bottom_tensor_view_.init_raw(); } + + // this is the bottom tensor view + // [x0', x1', ...] ==> [offset] + BottomTensorView bottom_tensor_view_; + + // + WindowLengths window_lengths_; + + // origin ([x0', x1', ...]) of window on bottom tensor + BottomTensorIndex window_origin_; + + // Tile tensor distribution, which contains: + // 1. adaptor for window: [p0, p1, ..., y0, y1, ...] ==> [x0, x1, ...] + // 2. thread descriptor for thread tensor in register: [y0, y1, ...] 
==> [d] + TileDstr tile_dstr_; + + // this contains: + array cached_coords_; + array cached_flags_; +}; + +#undef WINDOW_DISPATCH_ISSUE + +namespace impl { +template +struct default_linear_bottom_dims_impl +{ + using type = typename uniform_sequence_gen::type; +}; + +template +struct default_linear_bottom_dims_impl +{ + // global default to seq<0,0,....1> + using type = typename sequence_merge::type, + sequence<1>>::type; +}; + +template +struct default_linear_bottom_dims_impl +{ + // lds default to seq<1,1.....1> + using type = typename uniform_sequence_gen::type; +}; +} // namespace impl + +template +using default_linear_bottom_dims = + typename impl::default_linear_bottom_dims_impl::type; + +// if using this API, will create a tile_window_linear +// this structure can have the chance to use immediate value, save register +// need pass in LinearBottomDims_ properly to control which dim is linear +// so to generate a constexpr offset as linear_offset for this dim +// (and finally pass to the immediate offset of buffer/lds instruction) +// +// Note: there is no internal check for which dim is OK to use linear offset +// user must make sure by themselves +// +// e.g. +// 2d global matrix, set LinearBottomDims_=seq<0, 1>, the last dim will generate +// immediate offset if each thread has multiple issue along last dim +// +// 2d LDS buffer, set LinearBottomDims_=seq<1, 1>, then only one vgpr used as offset +// everything else is just using immediate offset. 
+// +template > +CK_TILE_DEVICE constexpr auto +make_tile_window_linear(const TensorView_& tensor_view, + const WindowLengths_& window_lengths, + const multi_index& origin, + const StaticTileDistribution_& tile_distribution, + LinearBottomDims_ = {}) +{ + static_assert(LinearBottomDims_::size() == TensorView_::get_num_of_dimension()); + return tile_window_linear, + remove_cvref_t, + remove_cvref_t, + remove_cvref_t>{ + tensor_view, window_lengths, origin, tile_distribution}; +} + +template < + typename TileWindow_, + typename StaticTileDistribution_, + typename LinearBottomDims_ = default_linear_bottom_dims> +CK_TILE_DEVICE constexpr auto +make_tile_window_linear(const TileWindow_& tile_window, + const StaticTileDistribution_& tile_distribution, + LinearBottomDims_ = {}) +{ + return make_tile_window_linear(tile_window.get_bottom_tensor_view(), + tile_window.get_window_lengths(), + tile_window.get_window_origin(), + tile_distribution, + LinearBottomDims_{}); +} + +// this version must not be called under a constexpr context +template > +CK_TILE_DEVICE auto +make_tile_window_linear_raw(const TensorView_& tensor_view, + const WindowLengths_& window_lengths, + const multi_index& origin, + const StaticTileDistribution_& tile_distribution, + LinearBottomDims_ = {}) +{ + static_assert(LinearBottomDims_::size() == TensorView_::get_num_of_dimension()); + auto w = tile_window_linear, + remove_cvref_t, + remove_cvref_t, + remove_cvref_t>{ + tensor_view, window_lengths, origin, tile_distribution}; + w.init_raw(); + return w; +} + +template < + typename TileWindow_, + typename StaticTileDistribution_, + typename LinearBottomDims_ = default_linear_bottom_dims> +CK_TILE_DEVICE constexpr auto +make_tile_window_linear_raw(const TileWindow_& tile_window, + const StaticTileDistribution_& tile_distribution, + LinearBottomDims_ = {}) +{ + return make_tile_window_linear_raw(tile_window.get_bottom_tensor_view(), + tile_window.get_window_lengths(), + tile_window.get_window_origin(), + 
tile_distribution, + LinearBottomDims_{}); +} + +template +CK_TILE_DEVICE void move_tile_window( + tile_window_linear& + window, + const typename tile_window_linear::BottomTensorIndex& step) +{ + window.move(step); +} + +} // namespace ck_tile diff --git a/include/ck_tile/core/utility/magic_div.hpp b/include/ck_tile/core/utility/magic_div.hpp index 09038ba29..fd9c733c5 100644 --- a/include/ck_tile/core/utility/magic_div.hpp +++ b/include/ck_tile/core/utility/magic_div.hpp @@ -59,8 +59,16 @@ struct magic_division32_bit_range CK_TILE_DEVICE static constexpr uint32_t do_magic_division(uint32_t dividend, uint32_t multiplier, uint32_t shift) { - uint32_t tmp = __umulhi(dividend, multiplier); - return (tmp + dividend) >> shift; + if(__builtin_is_constant_evaluated()) + { + uint32_t tmp = (static_cast(dividend) * multiplier) >> 32; + return (tmp + dividend) >> shift; + } + else + { + uint32_t tmp = __umulhi(dividend, multiplier); + return (tmp + dividend) >> shift; + } } CK_TILE_HOST static constexpr uint32_t @@ -77,9 +85,18 @@ struct magic_division32_bit_range CK_TILE_DEVICE static constexpr int32_t do_magic_division(int32_t dividend_i32, uint32_t multiplier, uint32_t shift) { - uint32_t dividend_u32 = bit_cast(dividend_i32); - uint32_t tmp = __umulhi(dividend_u32, multiplier); - return (tmp + dividend_u32) >> shift; + if(__builtin_is_constant_evaluated()) + { + uint32_t dividend_u32 = bit_cast(dividend_i32); + uint32_t tmp = (static_cast(dividend_u32) * multiplier) >> 32; + return (tmp + dividend_u32) >> shift; + } + else + { + uint32_t dividend_u32 = bit_cast(dividend_i32); + uint32_t tmp = __umulhi(dividend_u32, multiplier); + return (tmp + dividend_u32) >> shift; + } } CK_TILE_HOST static constexpr int32_t diff --git a/include/ck_tile/host.hpp b/include/ck_tile/host.hpp index dbc1f5d23..e17d7c22a 100644 --- a/include/ck_tile/host.hpp +++ b/include/ck_tile/host.hpp @@ -24,5 +24,6 @@ #include "ck_tile/host/reference/reference_layernorm2d_fwd.hpp" #include 
"ck_tile/host/reference/reference_reduce.hpp" #include "ck_tile/host/reference/reference_softmax.hpp" +#include "ck_tile/host/reference/reference_topk.hpp" #include "ck_tile/host/stream_config.hpp" #include "ck_tile/host/timer.hpp" diff --git a/include/ck_tile/host/fill.hpp b/include/ck_tile/host/fill.hpp index f490bbdeb..335911860 100644 --- a/include/ck_tile/host/fill.hpp +++ b/include/ck_tile/host/fill.hpp @@ -10,6 +10,7 @@ #include #include #include +#include #include "ck_tile/core.hpp" @@ -41,6 +42,73 @@ struct FillUniformDistribution } }; +namespace impl { + +// clang-format off +template struct RawIntegerType_ {}; +template<> struct RawIntegerType_<1> { using type = uint8_t;}; +template<> struct RawIntegerType_<2> { using type = uint16_t;}; +template<> struct RawIntegerType_<4> { using type = uint32_t;}; +template<> struct RawIntegerType_<8> { using type = uint64_t;}; +// clang-format on + +template +using RawIntegerType = typename RawIntegerType_::type; +} // namespace impl + +// Note: this struct will have no const-ness will generate random +template +struct FillUniformDistribution_Unique +{ + float a_{-5.f}; + float b_{5.f}; + std::optional seed_{11939}; + + std::mt19937 gen_{}; + std::unordered_set> set_{}; + + FillUniformDistribution_Unique(float a = -5.f, + float b = 5.f, + std::optional seed = {11939}) + : a_(a), + b_(b), + seed_(seed), + gen_{seed_.has_value() ? 
*seed_ : std::random_device{}()}, + set_{} + { + } + + template + void operator()(ForwardIter first, ForwardIter last) + { + std::mt19937& gen = gen_; + std::uniform_real_distribution dis(a_, b_); + auto& set = set_; + std::generate(first, last, [&dis, &gen, &set]() { + T v = static_cast(0); + do + { + v = ck_tile::type_convert(dis(gen)); + } while(set.count(bit_cast>(v)) == 1); + set.insert(bit_cast>(v)); + + return v; + }); + } + + template + auto operator()(ForwardRange&& range) + -> std::void_t()( + std::begin(std::forward(range)), + std::end(std::forward(range))))> + { + (*this)(std::begin(std::forward(range)), + std::end(std::forward(range))); + } + + void clear() { set_.clear(); } +}; + template struct FillNormalDistribution { diff --git a/include/ck_tile/host/host_tensor.hpp b/include/ck_tile/host/host_tensor.hpp index f533d5c18..5610ba324 100644 --- a/include/ck_tile/host/host_tensor.hpp +++ b/include/ck_tile/host/host_tensor.hpp @@ -11,6 +11,7 @@ #include #include #include +#include #include "ck_tile/core.hpp" #include "ck_tile/host/ranges.hpp" @@ -545,6 +546,28 @@ struct HostTensor typename Data::size_type size() const { return mData.size(); } + // return a slice of this tensor + // for simplicity we just copy the data and return a new tensor + auto slice(std::vector s_begin, std::vector s_end) const + { + assert(s_begin.size() == s_end.size()); + assert(s_begin.size() == get_num_of_dimension()); + + std::vector s_len(s_begin.size()); + std::transform( + s_end.begin(), s_end.end(), s_begin.begin(), s_len.begin(), std::minus{}); + HostTensor sliced_tensor(s_len); + + sliced_tensor.ForEach([&](auto& self, auto idx) { + std::vector src_idx(idx.size()); + std::transform( + idx.begin(), idx.end(), s_begin.begin(), src_idx.begin(), std::plus{}); + self(idx) = operator()(src_idx); + }); + + return sliced_tensor; + } + template auto AsSpan() const { diff --git a/include/ck_tile/host/reference/reference_softmax.hpp 
b/include/ck_tile/host/reference/reference_softmax.hpp index f1404f85a..d86e87994 100644 --- a/include/ck_tile/host/reference/reference_softmax.hpp +++ b/include/ck_tile/host/reference/reference_softmax.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -9,43 +9,81 @@ namespace ck_tile { -template -CK_TILE_HOST void reference_softmax(const HostTensor& a_m_n, - HostTensor& b_m_n) +template +CK_TILE_HOST void +reference_softmax(const HostTensor& x, HostTensor& y, index_t dim = -1) { - auto f = [&](auto m) { - const int N = a_m_n.mDesc.get_lengths()[1]; + index_t rank = x.get_num_of_dimension(); + assert(rank == y.get_num_of_dimension()); + assert(dim == -1 || dim < rank); - AccDataType v_max = ck_tile::numeric::Lowest(); + index_t target_dim = dim == -1 ? (rank - 1) : dim; + index_t softmax_len = x.get_length(target_dim); + index_t n_parallel = x.get_element_size() / softmax_len; + auto x_len = x.get_lengths(); - // max - for(int n = 0; n < N; ++n) - { - const ADataType v_a = a_m_n(m, n); + auto f = [&](auto i_element) { + std::vector coord = [&]() { + std::vector t_(rank, 0); + size_t r = i_element; + for(index_t i = rank - 1; i >= 0; i--) + { + if(i == target_dim) + continue; + t_[i] = r % x_len[i]; + r = r / x_len[i]; + } + return t_; + }(); + + ComputeType v_max = -ck_tile::numeric::infinity(); - v_max = v_max < v_a ? v_a : v_max; + // compute max + for(auto idx = 0; idx < softmax_len; idx++) + { + auto c_ = coord; + c_[target_dim] = idx; + const ComputeType v_x = ck_tile::type_convert(x(c_)); + v_max = v_max < v_x ? 
v_x : v_max; } - AccDataType v_exp_sum = 0; + ComputeType v_exp_sum = static_cast(0); // sum - for(int n = 0; n < N; ++n) + for(auto idx = 0; idx < softmax_len; idx++) { - const ADataType v_a = a_m_n(m, n); + auto c_ = coord; + c_[target_dim] = idx; - v_exp_sum += ck_tile::exp(v_a - v_max); + const ComputeType v_x = ck_tile::type_convert(x(c_)); + + v_exp_sum += ck_tile::exp(v_x - v_max); } // elementwise - for(int n = 0; n < N; ++n) + for(auto idx = 0; idx < softmax_len; idx++) { - const ADataType v_a = a_m_n(m, n); + auto c_ = coord; + c_[target_dim] = idx; + + const ComputeType v_x = ck_tile::type_convert(x(c_)); + + auto out = ck_tile::exp(v_x - v_max) / v_exp_sum; - b_m_n(m, n) = ck_tile::exp(v_a - v_max) / v_exp_sum; + y(c_) = ck_tile::type_convert(out); } }; - make_ParallelTensorFunctor(f, - b_m_n.mDesc.get_lengths()[0])(std::thread::hardware_concurrency()); + make_ParallelTensorFunctor(f, n_parallel)(std::thread::hardware_concurrency()); +} + +template +CK_TILE_HOST auto reference_softmax(const HostTensor& x, index_t dim = -1) +{ + HostTensor y(x.get_lengths(), x.get_strides()); + + reference_softmax(x, y, dim); + + return y; } } // namespace ck_tile diff --git a/include/ck_tile/host/reference/reference_topk.hpp b/include/ck_tile/host/reference/reference_topk.hpp new file mode 100644 index 000000000..3d0404a2e --- /dev/null +++ b/include/ck_tile/host/reference/reference_topk.hpp @@ -0,0 +1,124 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck_tile/core.hpp" +#include "ck_tile/host/host_tensor.hpp" +#include +#include +#include +#include +#include + +namespace ck_tile { + +/* + similar to torch.topk() + x (Tensor) – the input tensor.
+ k (int) – the k in “top-k” + dim (int, optional) – the dimension to sort along + largest (bool, optional) – largest or smallest elements + sorted (bool, optional) – elements in sorted order or not + + output: + y_values + y_indices + + https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/native/TopKImpl.h +*/ +template +CK_TILE_HOST void reference_topk(const HostTensor& x, + HostTensor& y_values, + HostTensor& y_indices, + index_t k, + index_t dim = -1, + bool largest = true, + bool sorted = true) +{ + // rank must be the same + index_t rank = x.get_num_of_dimension(); + assert(rank == y_values.get_num_of_dimension()); + assert(rank == y_indices.get_num_of_dimension()); + assert(dim == -1 || dim < rank); + + index_t topk_dim = dim == -1 ? (rank - 1) : dim; + index_t topk_src_len = x.get_length(topk_dim); + auto x_len = x.get_lengths(); + + assert(k <= topk_src_len); + assert(k == y_values.get_length(topk_dim) && k == y_indices.get_length(topk_dim)); + + index_t n_parallel = x.get_element_size() / topk_src_len; + + // clang-format off + auto f = [&](auto i_element) { + std::vector topk_coord = [&](){ + std::vector t_(rank, 0); + size_t r = i_element; + for(index_t i = rank - 1; i >= 0; i--) { + if(i == topk_dim) continue; // topk dim should be zero + t_[i] = r % x_len[i]; r = r / x_len[i]; + } + return t_; + }(); + + using elem_t = std::pair; + std::vector q = [&](){ + std::vector t_(topk_src_len); + for(index_t i = 0; i < topk_src_len; i++) { + auto c_ = topk_coord; c_[topk_dim] = i; + t_[i].first = x(c_); t_[i].second = i; + } + return t_; + }(); + + // run topk + if(largest) { + std::nth_element(q.begin(), q.begin() + k - 1, q.end(), + [](const elem_t& lhs, const elem_t& rhs) -> bool { return lhs.first > rhs.first; }); + if(sorted) { + std::sort(q.begin(), q.begin() + k - 1, + [](const elem_t& lhs, const elem_t& rhs) -> bool { return lhs.first > rhs.first; }); + } + } else { + std::nth_element(q.begin(), q.begin() + k - 1, q.end(), + [](const elem_t& lhs, 
const elem_t& rhs) -> bool { return lhs.first < rhs.first; }); + if(sorted) { + std::sort(q.begin(), q.begin() + k - 1, + [](const elem_t& lhs, const elem_t& rhs) -> bool { return lhs.first < rhs.first; }); + } + } + + // write out + for(index_t i = 0; i < k; i++) { + auto c_ = topk_coord; c_[topk_dim] = i; + y_values(c_) = q[i].first; y_indices(c_) = q[i].second; + } + }; + // clang-format on + + make_ParallelTensorFunctor(f, n_parallel)(std::thread::hardware_concurrency()); +} + +// TODO: if using this method, the return tensor would be dense(no stride) +template +CK_TILE_HOST auto reference_topk(const HostTensor& x, + index_t k, + index_t dim = -1, + bool largest = true, + bool sorted = true) +{ + auto lens = x.get_lengths(); + index_t target_dim = (dim == -1) ? (lens.size() - 1) : dim; + assert(target_dim < lens.size()); + assert(k <= lens[target_dim]); + lens[target_dim] = k; + HostTensor y_values(lens); + HostTensor y_indices(lens); + + reference_topk(x, y_values, y_indices, k, dim, largest, sorted); + + return ck_tile::make_tuple(y_values, y_indices); +} +} // namespace ck_tile diff --git a/include/ck_tile/ops/elementwise.hpp b/include/ck_tile/ops/elementwise.hpp new file mode 100644 index 000000000..62ba9dc0b --- /dev/null +++ b/include/ck_tile/ops/elementwise.hpp @@ -0,0 +1,7 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck_tile/ops/elementwise/unary_element_wise_operation.hpp" +#include "ck_tile/ops/common/tensor_layout.hpp" diff --git a/include/ck_tile/ops/elementwise/unary_element_wise_operation.hpp b/include/ck_tile/ops/elementwise/unary_element_wise_operation.hpp new file mode 100644 index 000000000..01217e16c --- /dev/null +++ b/include/ck_tile/ops/elementwise/unary_element_wise_operation.hpp @@ -0,0 +1,1163 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck_tile/core.hpp" +#include + +namespace ck_tile { +namespace element_wise { + +#if 0 +struct PassThroughPack2 +{ + template + CK_TILE_HOST_DEVICE void operator()(Y& y, const X& x) const; + + CK_TILE_HOST_DEVICE constexpr void operator()(ck_tile::half2_t& y, const ck_tile::f8x2_t& x) const + { + auto t = type_convert(x); + y = type_convert(t); + } + constexpr const static bool is_pack2_invocable = true; +}; +#endif + +struct PassThrough +{ + template + CK_TILE_HOST_DEVICE void operator()(Y& y, const X& x) const; + + template <> + CK_TILE_HOST_DEVICE void operator()(double& y, const double& x) const + { + y = x; + } + + template <> + CK_TILE_HOST_DEVICE void operator()(float& y, const double& x) const + { + y = type_convert(x); + } + + template <> + CK_TILE_HOST_DEVICE void operator()(double& y, const float& x) const + { + y = type_convert(x); + } + + template <> + CK_TILE_HOST_DEVICE void operator()(float& y, const float& x) const + { + y = x; + } + + template <> + CK_TILE_HOST_DEVICE void + operator()(ck_tile::fp16_t& y, const ck_tile::fp16_t& x) const + { + y = x; + } + + template <> + CK_TILE_HOST_DEVICE void operator()(ck_tile::fp16_t& y, + const float& x) const + { + y = type_convert(x); + } + + template <> + CK_TILE_HOST_DEVICE void + operator()(ck_tile::bf16_t& y, const ck_tile::bf16_t& x) const + { + y = x; + } + + template <> + CK_TILE_HOST_DEVICE void operator()(int32_t& y, const int32_t& x) const + { + y = x; + } + + template <> + CK_TILE_HOST_DEVICE void operator()(ck_tile::bf16_t& y, + const float& x) const + { + y = type_convert(x); + } + + template <> + CK_TILE_HOST_DEVICE void operator()(float& y, + const ck_tile::bf16_t& x) const + { + y = type_convert(x); + } + + template <> + CK_TILE_HOST_DEVICE void + operator()(ck_tile::bf16_t& y, const ck_tile::fp16_t& x) const + { + y = type_convert(x); + } + + template <> + CK_TILE_HOST_DEVICE void operator()(float& y, + const ck_tile::fp16_t& x) const + { + y = type_convert(x); 
+ } + + template <> + CK_TILE_HOST_DEVICE void operator()(int8_t& y, const int8_t& x) const + { + y = x; + } + + template <> + CK_TILE_HOST_DEVICE void operator()(ck_tile::fp16_t& y, + const int8_t& x) const + { + y = type_convert(x); + } + + template <> + CK_TILE_HOST_DEVICE void operator()(ck_tile::bf16_t& y, + const int8_t& x) const + { + y = type_convert(x); + } + + template <> + CK_TILE_HOST_DEVICE void operator()(uint8_t& y, const uint8_t& x) const + { + y = x; + } + + template <> + CK_TILE_HOST_DEVICE void operator()(int8_t& y, const int32_t& x) const + { + y = type_convert(x); + } + + template <> + CK_TILE_HOST_DEVICE void operator()(int32_t& y, const int8_t& x) const + { + y = type_convert(x); + } + + template <> + CK_TILE_HOST_DEVICE void operator()(int8_t& y, const float& x) const + { + y = type_convert(x); + } + + template <> + CK_TILE_HOST_DEVICE void operator()(float& y, const int8_t& x) const + { + y = type_convert(x); + } + +#ifdef CK_TILE_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 + template <> + CK_TILE_HOST_DEVICE void operator()(int4_t& y, const int4_t& x) const + { + y = x; + } + template <> + CK_TILE_HOST_DEVICE void operator()(int4_t& y, const int& x) const + { + y = type_convert(x); + } +#endif + + template <> + CK_TILE_HOST_DEVICE void + operator()(ck_tile::fp8_t& y, const ck_tile::fp8_t& x) const + { + y = x; + } + + template <> + CK_TILE_HOST_DEVICE void operator()(float& y, + const ck_tile::fp8_t& x) const + { + y = type_convert(x); + } + + template <> + CK_TILE_HOST_DEVICE void operator()(ck_tile::fp8_t& y, + const float& x) const + { + y = type_convert(x); + } + + template <> + CK_TILE_HOST_DEVICE void + operator()(ck_tile::fp16_t& y, const ck_tile::fp8_t& x) const + { + y = type_convert(x); + } + + template <> + CK_TILE_HOST_DEVICE void + operator()(ck_tile::fp8_t& y, const ck_tile::fp16_t& x) const + { + y = type_convert(x); + } + + template <> + CK_TILE_HOST_DEVICE void + operator()(ck_tile::bf8_t& y, const ck_tile::bf8_t& x) const + { + y 
= x; + } + + template <> + CK_TILE_HOST_DEVICE void operator()(float& y, + const ck_tile::bf8_t& x) const + { + y = type_convert(x); + } + + template <> + CK_TILE_HOST_DEVICE void operator()(ck_tile::bf8_t& y, + const float& x) const + { + y = type_convert(x); + } + + template <> + CK_TILE_HOST_DEVICE void + operator()(ck_tile::fp16_t& y, const ck_tile::bf8_t& x) const + { + y = type_convert(x); + } + + template <> + CK_TILE_HOST_DEVICE void + operator()(ck_tile::bf8_t& y, const ck_tile::fp16_t& x) const + { + y = ck_tile::type_convert(x); + } +}; + +#if 0 +struct UnaryConvert +{ + template + CK_TILE_HOST_DEVICE void operator()(Y& y, const X& x) const + { + y = type_convert(x); + } +}; + +struct ConvertBF16RTN +{ + // convert to bf16 using round to nearest (rtn) + template + CK_TILE_HOST_DEVICE void operator()(Y& y, const X& x) const + { + // check Y datatype + static_assert(std::is_same_v, "Data type is not supported by this operation!"); + + // check X datatype + static_assert(std::is_same_v || std::is_same_v, + "Data type is not supported by this operation!"); + + y = bf16_convert_rtn(x); + } +}; + +struct ConvertF8SR +{ + // convert to fp8 using stochastic rounding (SR) + template + CK_TILE_HOST_DEVICE void operator()(Y& y, const X& x) const + { + // check Y datatype + static_assert(std::is_same_v || std::is_same_v, + "Data type is not supported by this operation!"); + + // check X datatype + static_assert(std::is_same_v || std::is_same_v, + "Data type is not supported by this operation!"); + + y = f8_convert_sr(x); + } +}; + +struct ConvertF8RNE +{ + // convert to fp8 using rounding to nearest even + template + CK_TILE_HOST_DEVICE void operator()(Y& y, const X& x) const + { + // check Y datatype + static_assert(std::is_same_v || std::is_same_v, + "Data type is not supported by this operation!"); + + // check X datatype + static_assert(std::is_same_v || std::is_same_v, + "Data type is not supported by this operation!"); + + y = f8_convert_rne(x); + } +}; 
+#endif + +struct Scale +{ + CK_TILE_HOST_DEVICE Scale(float scale = 1.f) : scale_(scale) {} + + template + CK_TILE_HOST_DEVICE void operator()(Y& y, const X& x) const + { + y = ck_tile::type_convert(ck_tile::type_convert(x) * scale_); + } + + template <> + CK_TILE_HOST_DEVICE void + operator()(ck_tile::fp16_t& y, const ck_tile::fp16_t& x) const + { + y = ck_tile::type_convert(scale_) * x; + }; + + template <> + CK_TILE_HOST_DEVICE void + operator()(ck_tile::bf16_t& y, const ck_tile::bf16_t& x) const + { + const float x_tmp = ck_tile::type_convert(x); + const float y_tmp = scale_ * x_tmp; + y = ck_tile::type_convert(y_tmp); + }; + + template <> + CK_TILE_HOST_DEVICE void operator()(float& y, const float& x) const + { + y = scale_ * x; + }; + + template <> + CK_TILE_HOST_DEVICE void operator()(double& y, const double& x) const + { + y = scale_ * x; + }; + + template <> + CK_TILE_HOST_DEVICE void operator()(int8_t& y, const int8_t& x) const + { + y = ck_tile::type_convert(scale_ * ck_tile::type_convert(x)); + }; + + float scale_; +}; + +struct ScaleAndResetNaNToMinusInfinity +{ + CK_TILE_HOST_DEVICE ScaleAndResetNaNToMinusInfinity(float scale) : scale_(scale) {} + + template + CK_TILE_HOST_DEVICE void operator()(Y& y, const X& x) const; + + template <> + CK_TILE_HOST_DEVICE void operator()(float& y, const float& x) const + { + y = ck_tile::isnan(x) ? 
-numeric::infinity() : scale_ * x; + }; + + float scale_; +}; + +struct UnaryDivide +{ + CK_TILE_HOST_DEVICE UnaryDivide(const int32_t divider = 1) : divider_(divider) {} + + template + CK_TILE_HOST_DEVICE void operator()(T& y, const T& x) const + { + static_assert(std::is_same_v || std::is_same_v || + std::is_same_v, + "Data type is not supported by this operation!"); + + y = x / type_convert(divider_); + }; + + int32_t divider_ = 1; +}; + +struct UnarySquare +{ + template + CK_TILE_HOST_DEVICE void operator()(T& y, const T& x) const + { + static_assert(std::is_same_v || std::is_same_v || + std::is_same_v || std::is_same_v || + std::is_same_v +#ifdef CK_TILE_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 + || std::is_same_v +#endif + , + "Data type is not supported by this operation!"); + y = x * x; + }; +}; + +struct UnaryAbs +{ + template + CK_TILE_HOST_DEVICE void operator()(T& y, const T& x) const + { + static_assert(std::is_same_v || std::is_same_v || + std::is_same_v || std::is_same_v || + std::is_same_v, + "Data type is not supported by this operation!"); + + y = ck_tile::abs(x); + }; +}; + +struct UnarySqrt +{ + template + CK_TILE_HOST_DEVICE void operator()(T& y, const T& x) const + { + static_assert(std::is_same_v || std::is_same_v, + "Data type is not supported by this operation!"); + + y = ck_tile::sqrt(x); + }; +}; + +struct Relu +{ + template + CK_TILE_HOST_DEVICE void operator()(T& y, const T& x) const + { + static_assert(std::is_same_v || std::is_same_v || + std::is_same_v || std::is_same_v || + std::is_same_v, + "Data type is not supported by this operation!"); + y = x > 0 ? x : 0; + } + + template <> + CK_TILE_HOST_DEVICE void operator()(ck_tile::bf16_t& y, const ck_tile::bf16_t& x) const + { + float x_f32 = ck_tile::type_convert(x); + float y_f32 = x_f32 > 0 ? 
x_f32 : 0; + y = ck_tile::type_convert(y_f32); + } +}; + +// Fast GeLU +// https://paperswithcode.com/method/gelu +// y = 0.5*x*(1+tanh(sqrt(2/pi)*(x+0.044715*x^3))) +// host code use higher accuracy "exp" and "div" +// gpu code use lower accuracy "_ocml_exp_f32" and "rcp" function +struct FastGelu +{ + template + CK_TILE_HOST void operator()(Y& y, const X& x) const; + + template + CK_TILE_DEVICE void operator()(Y& y, const X& x) const; + + template <> + CK_TILE_HOST void operator()(float& y, const float& x) const + { + // const float u = -2.f * x * (0.035677f * x * x + 0.797885f); + const float c1 = -2.0 * 0.035677f; + const float c2 = -2.0 * 0.797885f; + const float u = x * (c1 * x * x + c2); + const float emu = exp(u); + y = x / (1.f + emu); + } + + // device code, use lower precision "__ocml_exp_f32" and "rcp" + template <> + CK_TILE_DEVICE void operator()(float& y, const float& x) const + { + // const float u = 2.f * x * (0.035677f * x * x + 0.797885f); + const float c1 = -2.0 * 0.035677f; + const float c2 = -2.0 * 0.797885f; + const float u = x * (c1 * x * x + c2); + const float emu = __ocml_exp_f32(u); + + y = x * ck_tile::rcp(1.f + emu); + } + + template <> + CK_TILE_HOST void operator()(ck_tile::fp16_t& y, + const ck_tile::fp16_t& x) const + { + float y_f; + + this->operator()(y_f, type_convert(x)); + + y = type_convert(y_f); + } + + template <> + CK_TILE_DEVICE void operator()(ck_tile::fp16_t& y, + const ck_tile::fp16_t& x) const + { + float y_f; + + this->operator()(y_f, type_convert(x)); + + y = type_convert(y_f); + } + + template <> + CK_TILE_HOST void operator()(ck_tile::fp16_t& y, const float& x) const + { + float y_f; + + this->operator()(y_f, x); + + y = type_convert(y_f); + } + + template <> + CK_TILE_DEVICE void operator()(ck_tile::fp16_t& y, const float& x) const + { + float y_f; + + this->operator()(y_f, x); + + y = type_convert(y_f); + } + + template <> + CK_TILE_HOST void operator()(ck_tile::bf16_t& y, const float& x) const + { + float y_f; + 
+ this->operator()(y_f, x); + + y = type_convert(y_f); + } + + template <> + CK_TILE_DEVICE void operator()(ck_tile::bf16_t& y, const float& x) const + { + float y_f; + + this->operator()(y_f, x); + + y = type_convert(y_f); + } + + template <> + CK_TILE_DEVICE void operator()(ck_tile::bf16_t& y, + const ck_tile::bf16_t& x) const + { + float y_f; + + this->operator()(y_f, type_convert(x)); + + y = type_convert(y_f); + } + + template <> + CK_TILE_HOST void operator()(ck_tile::bf16_t& y, + const ck_tile::bf16_t& x) const + { + float y_f; + + this->operator()(y_f, type_convert(x)); + + y = type_convert(y_f); + } +}; + +// https://paperswithcode.com/method/gelu +// y = 0.5*x*(1+erf(x/sqrt(2))) +struct Gelu +{ + template + CK_TILE_HOST_DEVICE void operator()(Y& y, const X& x) const; + + template <> + CK_TILE_HOST_DEVICE void operator()(float& y, const float& x) const + { + y = 0.5f * x * (1.f + erf(float(0.70710678118f * x))); + } + + template <> + CK_TILE_HOST_DEVICE void + operator()(ck_tile::fp16_t& y, const ck_tile::fp16_t& x) const + { + y = ck_tile::fp16_t(0.5) * x * + (ck_tile::fp16_t(1) + ck_tile::fp16_t(erf(float(0.70710678118f * x)))); + } +}; + +struct Sigmoid +{ + template + CK_TILE_HOST_DEVICE void operator()(T& y, const T& x) const + { + static_assert(std::is_same_v || std::is_same_v || + std::is_same_v || std::is_same_v || + std::is_same_v, + "Data type is not supported by this operation!"); + constexpr T one = type_convert(1); + y = one / (one + ck_tile::exp(-x)); + }; +}; + +struct Silu +{ + template + CK_TILE_HOST_DEVICE void operator()(T& y, const T& x) const + { + static_assert(std::is_same_v || std::is_same_v || + std::is_same_v || std::is_same_v || + std::is_same_v, + "Data type is not supported by this operation!"); + constexpr T one = type_convert(1); + y = x * (one / (one + ck_tile::exp(-x))); + }; +}; + +struct TanH +{ + template + CK_TILE_HOST_DEVICE void operator()(T& y, const T& x) const + { + static_assert(std::is_same_v || std::is_same_v || 
+ std::is_same_v || std::is_same_v || + std::is_same_v, + "Data type is not supported by this operation!"); + + y = ck_tile::tanh(x); + }; +}; + +struct ACos +{ + template + CK_TILE_HOST_DEVICE void operator()(T& y, const T& x) const + { + static_assert(std::is_same_v || std::is_same_v || + std::is_same_v || std::is_same_v || + std::is_same_v, + "Data type is not supported by this operation!"); + + y = ck_tile::acos(x); + }; +}; + +struct Neg +{ + template + CK_TILE_HOST_DEVICE void operator()(T& y, const T& x) const + { + static_assert(std::is_same_v || std::is_same_v || + std::is_same_v || std::is_same_v || + std::is_same_v, + "Data type is not supported by this operation!"); + + y = ck_tile::neg(x); + }; +}; + +struct ATan +{ + template + CK_TILE_HOST_DEVICE void operator()(T& y, const T& x) const + { + static_assert(std::is_same_v || std::is_same_v || + std::is_same_v || std::is_same_v || + std::is_same_v, + "Data type is not supported by this operation!"); + + y = ck_tile::atan(x); + }; +}; + +struct Sin +{ + template + CK_TILE_HOST_DEVICE void operator()(T& y, const T& x) const + { + static_assert(std::is_same_v || std::is_same_v || + std::is_same_v || std::is_same_v || + std::is_same_v, + "Data type is not supported by this operation!"); + + y = ck_tile::sin(x); + }; +}; + +struct ASinH +{ + template + CK_TILE_HOST_DEVICE void operator()(T& y, const T& x) const + { + static_assert(std::is_same_v || std::is_same_v || + std::is_same_v || std::is_same_v || + std::is_same_v, + "Data type is not supported by this operation!"); + + y = ck_tile::asinh(x); + }; +}; + +struct Cos +{ + template + CK_TILE_HOST_DEVICE void operator()(T& y, const T& x) const + { + static_assert(std::is_same_v || std::is_same_v || + std::is_same_v || std::is_same_v || + std::is_same_v, + "Data type is not supported by this operation!"); + + y = ck_tile::cos(x); + }; +}; + +struct ACosH +{ + template + CK_TILE_HOST_DEVICE void operator()(T& y, const T& x) const + { + 
static_assert(std::is_same_v || std::is_same_v || + std::is_same_v || std::is_same_v || + std::is_same_v, + "Data type is not supported by this operation!"); + + y = ck_tile::acosh(x); + }; +}; + +struct Tan +{ + template + CK_TILE_HOST_DEVICE void operator()(T& y, const T& x) const + { + static_assert(std::is_same_v || std::is_same_v || + std::is_same_v || std::is_same_v || + std::is_same_v, + "Data type is not supported by this operation!"); + + y = ck_tile::tan(x); + }; +}; + +struct ATanH +{ + template + CK_TILE_HOST_DEVICE void operator()(T& y, const T& x) const + { + static_assert(std::is_same_v || std::is_same_v || + std::is_same_v || std::is_same_v || + std::is_same_v, + "Data type is not supported by this operation!"); + + y = ck_tile::atanh(x); + }; +}; + +struct SinH +{ + template + CK_TILE_HOST_DEVICE void operator()(T& y, const T& x) const + { + static_assert(std::is_same_v || std::is_same_v || + std::is_same_v || std::is_same_v || + std::is_same_v, + "Data type is not supported by this operation!"); + + y = ck_tile::sinh(x); + }; +}; + +struct Ceil +{ + template + CK_TILE_HOST_DEVICE void operator()(T& y, const T& x) const + { + static_assert(std::is_same_v || std::is_same_v || + std::is_same_v || std::is_same_v || + std::is_same_v, + "Data type is not supported by this operation!"); + + y = ck_tile::ceil(x); + }; +}; + +struct Exp +{ + template + CK_TILE_HOST_DEVICE void operator()(T& y, const T& x) const + { + static_assert(std::is_same_v || std::is_same_v || + std::is_same_v || std::is_same_v || + std::is_same_v, + "Data type is not supported by this operation!"); + + y = ck_tile::exp(x); + }; +}; + +struct CosH +{ + template + CK_TILE_HOST_DEVICE void operator()(T& y, const T& x) const + { + static_assert(std::is_same_v || std::is_same_v || + std::is_same_v || std::is_same_v || + std::is_same_v, + "Data type is not supported by this operation!"); + + y = ck_tile::cosh(x); + }; +}; + +struct Floor +{ + template + CK_TILE_HOST_DEVICE void 
operator()(T& y, const T& x) const + { + static_assert(std::is_same_v || std::is_same_v || + std::is_same_v || std::is_same_v || + std::is_same_v, + "Data type is not supported by this operation!"); + + y = ck_tile::floor(x); + }; +}; + +struct Log +{ + template + CK_TILE_HOST_DEVICE void operator()(T& y, const T& x) const + { + static_assert(std::is_same_v || std::is_same_v || + std::is_same_v || std::is_same_v || + std::is_same_v, + "Data type is not supported by this operation!"); + + y = ck_tile::log(x); + }; +}; + +struct ASin +{ + template + CK_TILE_HOST_DEVICE void operator()(T& y, const T& x) const + { + static_assert(std::is_same_v || std::is_same_v || + std::is_same_v || std::is_same_v || + std::is_same_v, + "Data type is not supported by this operation!"); + + y = ck_tile::asin(x); + }; +}; + +struct Rcp +{ + template + CK_TILE_HOST_DEVICE void operator()(T& y, const T& x) const + { + static_assert(std::is_same_v || std::is_same_v || + std::is_same_v || std::is_same_v || + std::is_same_v, + "Data type is not supported by this operation!"); + + y = ck_tile::rcp(x); + }; +}; + +struct Swish +{ + Swish(float beta = 1.0f) : beta_(beta) {} + + template + CK_TILE_HOST_DEVICE void operator()(Y& y, const X& x) const + { + static_assert(std::is_same_v || std::is_same_v || + std::is_same_v, + "Data type is not supported by this operation!"); + + static_assert(std::is_same_v || std::is_same_v || + std::is_same_v, + "Data type is not supported by this operation!"); + + float bx = -beta_ * type_convert(x); + y = type_convert(x / (1.f + ck_tile::exp(bx))); + }; + + const float beta_; +}; + +struct SoftRelu +{ + SoftRelu(float alpha = 1.f) : alpha_(alpha){}; + + template + CK_TILE_HOST_DEVICE void operator()(T& y, const T& x) const + { + static_assert(std::is_same_v || std::is_same_v || + std::is_same_v || std::is_same_v || + std::is_same_v, + "Data type is not supported by this operation!"); + T casted_alpha = type_convert(alpha_); + constexpr T one = type_convert(1); 
+ y = ck_tile::log(one + ck_tile::exp(x * casted_alpha)) / casted_alpha; + } + const float alpha_; +}; + +struct Power +{ + Power(float alpha = 0.f, float beta = 1.f, float gamma = 2.f) + : alpha_(alpha), beta_(beta), gamma_(gamma){}; + + template + CK_TILE_HOST_DEVICE void operator()(T& y, const T& x) const + { + static_assert(std::is_same_v || std::is_same_v || + std::is_same_v || std::is_same_v || + std::is_same_v, + "Data type is not supported by this operation!"); + T casted_alpha = type_convert(alpha_); + T casted_beta = type_convert(beta_); + T casted_gamma = type_convert(gamma_); + T shifted_scaled_x = casted_alpha + casted_beta * x; + y = ck_tile::pow(shifted_scaled_x, casted_gamma); + } + const float alpha_; + const float beta_; + const float gamma_; +}; + +struct ClippedRelu +{ + ClippedRelu(float alpha = 0.f, float beta = 1.f) : alpha_(alpha), beta_(beta){}; + + template + CK_TILE_HOST_DEVICE void operator()(T& y, const T& x) const + { + static_assert(std::is_same_v || std::is_same_v || + std::is_same_v || std::is_same_v || + std::is_same_v, + "Data type is not supported by this operation!"); + T casted_alpha = type_convert(alpha_); + T casted_beta = type_convert(beta_); + y = ck_tile::min(casted_beta, ck_tile::max(casted_alpha, x)); + } + const float alpha_; + const float beta_; +}; + +struct LeakyRelu +{ + LeakyRelu(float alpha = 0.01f) : alpha_(alpha){}; + + template + CK_TILE_HOST_DEVICE void operator()(T& y, const T& x) const + { + static_assert(std::is_same_v || std::is_same_v || + std::is_same_v || std::is_same_v || + std::is_same_v, + "Data type is not supported by this operation!"); + T casted_alpha = type_convert(alpha_); + y = x >= 0 ? 
x : x * casted_alpha; + } + const float alpha_; +}; + +struct Elu +{ + Elu(float alpha = 1.f) : alpha_(alpha){}; + + template + CK_TILE_HOST_DEVICE void operator()(T& y, const T& x) const + { + static_assert(std::is_same_v || std::is_same_v || + std::is_same_v || std::is_same_v || + std::is_same_v, + "Data type is not supported by this operation!"); + T casted_alpha = type_convert(alpha_); + y = x > 0 ? x : casted_alpha * ck_tile::expm1(x); + } + const float alpha_; +}; + +struct Logistic +{ + Logistic(float alpha = 1.f) : alpha_(alpha){}; + + template + CK_TILE_HOST_DEVICE void operator()(T& y, const T& x) const + { + static_assert(std::is_same_v || std::is_same_v || + std::is_same_v || std::is_same_v || + std::is_same_v, + "Data type is not supported by this operation!"); + T casted_alpha = type_convert(alpha_); + constexpr T one = type_convert(1); + y = casted_alpha / (one + ck_tile::exp(-x) * casted_alpha); + } + const float alpha_; +}; + +struct ConvInvscale +{ + CK_TILE_HOST_DEVICE + ConvInvscale(float scale_in = 1.f, float scale_wei = 1.f, float scale_out = 1.f) + : scale_in_(scale_in), scale_wei_(scale_wei), scale_out_(scale_out) + { + } + + template + CK_TILE_HOST_DEVICE void operator()(E& e, const C& c) const; + + template <> + CK_TILE_HOST_DEVICE void operator()(ck_tile::fp8_t& e, + const float& c) const + { + e = type_convert(c / scale_in_ / scale_wei_ / scale_out_); + }; + + float scale_in_; + float scale_wei_; + float scale_out_; +}; + +struct ConvScale +{ + CK_TILE_HOST_DEVICE + ConvScale(float scale_in = 1.f, float scale_wei = 1.f, float scale_out = 1.f) + : scale_in_(scale_in), scale_wei_(scale_wei), scale_out_(scale_out) + { + } + + template + CK_TILE_HOST_DEVICE void operator()(E& e, const C& c) const; + + template <> + CK_TILE_HOST_DEVICE void operator()(ck_tile::fp8_t& e, + const float& c) const + { + e = type_convert(c * scale_in_ * scale_wei_ * scale_out_); + }; + + float scale_in_; + float scale_wei_; + float scale_out_; +}; + +struct 
ConvScaleRelu +{ + CK_TILE_HOST_DEVICE + ConvScaleRelu(float scale_in = 1.f, float scale_wei = 1.f, float scale_out = 1.f) + : scale_in_(scale_in), scale_wei_(scale_wei), scale_out_(scale_out) + { + } + + template + CK_TILE_HOST_DEVICE void operator()(E& e, const C& c) const; + + template <> + CK_TILE_HOST_DEVICE void operator()(ck_tile::fp8_t& e, + const float& c) const + { + float x; + Relu{}.template operator()(x, c * scale_in_ * scale_wei_); + e = type_convert(x * scale_out_); + }; + + float scale_in_; + float scale_wei_; + float scale_out_; +}; + +template +struct Cast +{ + template + CK_TILE_HOST_DEVICE void operator()(DstType& y, const SrcType& x) const + { + y = ck_tile::type_convert(x); + }; +}; + +// support fastconvert of int8 to fp16 +#if 0 +template +struct FastNumericArrayConverter +{ +}; + +template <> +struct FastNumericArrayConverter +{ + using InputArray = vector_type; + using OutputArray = vector_type; + + CK_TILE_DEVICE static OutputArray convert(InputArray const& Input) + { + OutputArray Output; + + uint32_t* half_2 = reinterpret_cast(&Output); + uint32_t const uint8_4 = reinterpret_cast(Input); + + static constexpr uint32_t byte_selector_01 = 0x05010500; + static constexpr uint32_t byte_selector_23 = 0x05030502; + static constexpr uint32_t fp16_adder = 0x64646464; + half_2[0] = __builtin_amdgcn_perm(fp16_adder, uint8_4, byte_selector_01); + half_2[1] = __builtin_amdgcn_perm(fp16_adder, uint8_4, byte_selector_23); + + static constexpr uint32_t I8s_TO_F16s_MAGIC_NUM = 0x64806480; + asm volatile("v_pk_add_f16 %0, %1, %2 neg_lo:[0,1] neg_hi:[0,1]" + : "=v"(half_2[0]) + : "v"(half_2[0]), "s"(I8s_TO_F16s_MAGIC_NUM)); + asm volatile("v_pk_add_f16 %0, %1, %2 neg_lo:[0,1] neg_hi:[0,1]" + : "=v"(half_2[1]) + : "v"(half_2[1]), "s"(I8s_TO_F16s_MAGIC_NUM)); + + return Output; + } + + CK_TILE_DEVICE OutputArray operator()(InputArray const& Input) { return convert(Input); } +}; + +template +struct FastNumericArrayConverter +{ + static constexpr int VEC_WIDTH 
= 4; + static_assert(!(N % VEC_WIDTH), "N must be multiple of 4."); + + using InputArray = vector_type; + using OutputArray = vector_type; + + CK_TILE_DEVICE static OutputArray convert(InputArray const& Input) + { + FastNumericArrayConverter converter; + + OutputArray Output; + + using Vec_InputArray = vector_type; + using Vec_OutputArray = vector_type; + + Vec_OutputArray* half_4_ptr = reinterpret_cast(&Output); + Vec_InputArray const* uint8_4_ptr = reinterpret_cast(&Input); + + static_for<0, N / VEC_WIDTH, 1>{}( + [&](auto i) { half_4_ptr[i] = converter(uint8_4_ptr[i]); }); + + return Output; + } + + CK_TILE_DEVICE OutputArray operator()(InputArray const& Input) { return convert(Input); } +}; +#endif +} // namespace element_wise +} // namespace ck_tile diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_async.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_async.hpp index c4872def1..05d3dae1c 100644 --- a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_async.hpp +++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_async.hpp @@ -334,7 +334,7 @@ struct BlockFmhaPipelineQRKSVSAsync move_tile_window(k_dram_window, {0, kK0}); __builtin_amdgcn_sched_barrier(0); - buffer_load_fence(k_dram_window.get_num_access(), q.get_thread_buffer()); + buffer_load_fence(k_dram_window.get_num_of_access(), q.get_thread_buffer()); (void)q_element_func; // ??? 
rocm-6.x if use q element func will have scratch on hdim=64/32 // auto q_tile = q; // tile_elementwise_in(q_element_func, q); @@ -359,7 +359,7 @@ struct BlockFmhaPipelineQRKSVSAsync if constexpr(i_k0 < k0_loops - 1) move_tile_window(k_dram_window, {0, kK0}); - async_load_fence(k_dram_window.get_num_access()); + async_load_fence(k_dram_window.get_num_of_access()); __builtin_amdgcn_s_barrier(); __builtin_amdgcn_sched_barrier(0); gemm_0(s_acc, diff --git a/include/ck_tile/ops/reduce/block/block_reduce.hpp b/include/ck_tile/ops/reduce/block/block_reduce.hpp index a01265ad5..51d55235e 100644 --- a/include/ck_tile/ops/reduce/block/block_reduce.hpp +++ b/include/ck_tile/ops/reduce/block/block_reduce.hpp @@ -4,9 +4,14 @@ #pragma once #include "ck_tile/core.hpp" +#include namespace ck_tile { +/* + * TODO: block_tile_reduce_sync() currently has a limitation + * Y dim must have at least one dim that is not reduced + */ // synchronize reduce result (cross lane reduction and broadcast on replicated dimension) template CK_TILE_DEVICE void block_tile_reduce_sync(AccDistributedTensor_& acc_tensor, @@ -104,6 +109,65 @@ CK_TILE_DEVICE void block_tile_reduce_sync(AccDistributedTensor_& acc_tensor, }); } +/* + * this version is faster, using xor to do reduce, no need to broadcast anymore + * TODO: the limitation is to-be-reduced P dim can only map to one R dim? 
+ */ +template +CK_TILE_DEVICE void block_tile_reduce_xor_sync(AccDistributedTensor_& acc_tensor, + const ReduceFunc& reduce_func) +{ + using Dstr = typename AccDistributedTensor_::StaticTileDistribution; + using DstrEncode = typename Dstr::DstrEncode; + using DstrEncodeDetail = typename DstrEncode::detail; + + constexpr index_t NDimP = Dstr::get_num_of_dimension_p(); + constexpr index_t NDimR = Dstr::get_num_of_dimension_r(); + + constexpr index_t idim_p_lane = NDimP - 1; + + constexpr index_t thread_buf_size = AccDistributedTensor_::get_thread_buffer_size(); + + // loop over thread data + static_for<0, thread_buf_size, 1>{}([&](auto i) { + auto v_local = acc_tensor.get_thread_buffer()[i]; + + // cross-lane reduce for replication + // only reduce on R dimension correspond to lane + // (lane id maps to this R dimension) + static_for<0, NDimR, 1>{}([&](auto idim_r) { + // FIXME: nasty to use does_p_own_r_ + if constexpr(DstrEncodeDetail::does_p_own_r_[idim_p_lane][idim_r]) + { + constexpr index_t r_length = DstrEncode::rs_lengths_[idim_r]; + + constexpr index_t lid_over_rid_derivative = + DstrEncodeDetail::ps_over_rs_derivative_[idim_p_lane][idim_r]; + + static_assert(is_power_of_two_integer(r_length), + "wrong! 
only support power of 2 reduction"); + + constexpr index_t nstage = integer_log2_floor(r_length); + + // reduction sweep forward + static_for<0, nstage, 1>{}([&](auto istage) { + // xor + index_t src_lane = + __lane_id() ^ (number{}.value); + + // pull data from remote lane + const auto v_remote = warp_shuffle(v_local, src_lane); + + // reduce + v_local = reduce_func(v_local, v_remote); + }); + } + }); + + acc_tensor.get_thread_buffer()(i) = v_local; + }); +} + // FIXME: this is for 2D to 1D reduce only, need to support n-D template 1D reduce (reduce-dim=seq<0, 1>) +// this version only support in/acc/out datatypes are the same +// this version will call thread/warp+sync in one function call +// +template +struct BlockReduce2D +{ + using InDistributedTensor = remove_cvref_t; + using InDataType = typename InDistributedTensor::DataType; + + CK_TILE_HOST_DEVICE BlockReduce2D(const InDistributedTensor& t_, const InDataType& reduce_init_) + : t(t_), reduce_init(reduce_init_) + { + } + + CK_TILE_HOST_DEVICE constexpr auto MakeDstBlockTile() const + { + using ReduceDim = sequence<1>; // hard coded + constexpr auto acc_dstr = + make_static_tile_distribution(ck_tile::detail::make_reduce_tile_distribution_encoding( + InDistributedTensor::get_tile_distribution() + .get_static_tile_distribution_encoding(), + ReduceDim{})); + + return make_static_distributed_tensor(acc_dstr); + } + + // return number of pixels each lane need to reduce + CK_TILE_HOST_DEVICE constexpr auto get_reduce_length_y() const + { + constexpr auto spans = InDistributedTensor::get_distributed_spans(); + } + + // Here ReducePacksPerXDim is not the same meaning as that in static_uford/sweep_tile_uspan + // this is number of packs along the X-dim. 
We need to compute the Unpacks along the Y dim + // internally + // For simplicity, we just support along the row dimension, ReducePacksPerXDim is always 2 + // elements, and the first element is always ignored. For simplicity, will always try from + // right-to-left to find along which Y dim to split + template > + CK_TILE_HOST_DEVICE auto operator()(const ReduceFunc& reduce_func, + const ReduceSyncFunc& reduce_sync_func, + ReducePacksPerXDim = {}) const + { + constexpr auto spans = InDistributedTensor::get_distributed_spans(); + + constexpr auto row_y_unpacks = [&]() { + constexpr auto row_y_lengths = typename decltype(spans[number<1>{}])::Impl{}; + constexpr auto row_y_size = + reduce_on_sequence(row_y_lengths, multiplies{}, number<1>{}); + constexpr auto row_y_packs = ReducePacksPerXDim{}.at(number<1>{}); + + static_assert(row_y_size % row_y_packs == 0); + + constexpr auto row_y_slice_size = row_y_size / row_y_packs; + + constexpr auto slice_info = slice_sequence(row_y_lengths, number{}); + constexpr auto unpacks = slice_info[number<1>{}]; + return unpacks; + }(); + + auto acc_tensor = MakeDstBlockTile(); + + // in-thread reduction + // FIXME: hard coded to be 2D to 1D reduction + sweep_tile_span(spans[number<0>{}], [&](auto dstr_idx_i0) { + constexpr auto acc_dstr_idx = make_tuple(dstr_idx_i0); + + auto acc = acc_tensor[acc_dstr_idx]; + + sweep_tile_uspan( + spans[number<1>{}], + [&](auto... 
dstr_idx_i1) { + acc = reduce_func(acc, t[make_tuple(dstr_idx_i0, dstr_idx_i1)]...); + }, + row_y_unpacks); + + acc_tensor(acc_dstr_idx) = acc; + }); + + // TODO: always use xor to do cross-lane reduce + block_tile_reduce_xor_sync(acc_tensor, reduce_sync_func); + + return acc_tensor; + } + + template + CK_TILE_HOST_DEVICE auto operator()(const ReduceFunc& reduce_func) const + { + return operator()(reduce_func, reduce_func); + } + + InDistributedTensor t; + InDataType reduce_init; +}; + +// deduction guide +template +CK_TILE_HOST_DEVICE_EXTERN BlockReduce2D(const T&, const typename T::DataType&)->BlockReduce2D; + } // namespace ck_tile diff --git a/include/ck_tile/ops/softmax.hpp b/include/ck_tile/ops/softmax.hpp new file mode 100644 index 000000000..584ca7068 --- /dev/null +++ b/include/ck_tile/ops/softmax.hpp @@ -0,0 +1,8 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck_tile/ops/softmax/block/block_softmax_2d.hpp" +#include "ck_tile/ops/softmax/block/block_softmax_2d_problem.hpp" +#include "ck_tile/ops/common/tensor_layout.hpp" diff --git a/include/ck_tile/ops/softmax/block/block_softmax_2d.hpp b/include/ck_tile/ops/softmax/block/block_softmax_2d.hpp new file mode 100644 index 000000000..607ec7eb5 --- /dev/null +++ b/include/ck_tile/ops/softmax/block/block_softmax_2d.hpp @@ -0,0 +1,81 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck_tile/core.hpp" +#include "ck_tile/ops/reduce.hpp" + +#define _BLOCK_SOFTMAX_USE_UNPACK2 0 + +namespace ck_tile { + +/* +simple 2d softmax implementation, along row (dim=1) +requirement: + 1). each row is within a warp + 2). 
data type must be a dword +*/ +template +struct BlockSoftmax2D +{ + using Problem = remove_cvref_t; + using Policy = remove_cvref_t; + + using DataType = typename Problem::DataType; + + template + CK_TILE_DEVICE void + operator()(const DistributedTensor& x, DistributedTensor& y, number = {}) + { + const auto f_max = [](auto e0, auto e1) { return max(e0, e1); }; + const auto f_sum = [](auto e0, auto e1) { return e0 + e1; }; +#if _BLOCK_SOFTMAX_USE_UNPACK2 + const auto f_max3 = [](auto e0, auto e1, auto e2) { + float rtn; + asm volatile("v_max3_f32 %0, %1, %2, %3" : "=v"(rtn) : "v"(e0), "v"(e1), "v"(e2)); + return rtn; + }; + const auto f_sum3 = [](auto e0, auto e1, auto e2) { return e0 + e1 + e2; }; +#endif + + // compute row max + auto reduce_row_max = BlockReduce2D{x, -numeric::infinity()}; +#if _BLOCK_SOFTMAX_USE_UNPACK2 + auto row_max = reduce_row_max(f_max3, f_max, sequence<1, 2>{}); +#else + auto row_max = reduce_row_max(f_max); +#endif + sweep_tile([&](auto idx) { + constexpr auto row_id = make_tuple(idx[number<0>{}]); + y(idx) = exp(x[idx] - row_max[row_id]); + }); + + // compute row sum + auto reduce_row_sum = BlockReduce2D{y, DataType{0}}; +#if _BLOCK_SOFTMAX_USE_UNPACK2 + auto row_sum = reduce_row_sum(f_sum3, f_sum, sequence<1, 2>{}); +#else + auto row_sum = reduce_row_sum(f_sum); +#endif + // reciprocal + auto r = make_static_distributed_tensor(row_sum.get_tile_distribution()); + sweep_tile(row_sum, [&](auto idx) { r(idx) = DataType{1} / row_sum(idx); }); + + // scale + sweep_tile([&](auto idx) { + constexpr auto row_id = make_tuple(idx[number<0>{}]); + y(idx) = y(idx) * r(row_id); + }); + } + + template + CK_TILE_DEVICE decltype(auto) operator()(const DistributedTensor& x, number = {}) + { + auto y = DistributedTensor{}; // distributed tensor + operator()(x, y, number{}); + return y; + } +}; + +} // namespace ck_tile diff --git a/include/ck_tile/ops/softmax/block/block_softmax_2d_problem.hpp 
b/include/ck_tile/ops/softmax/block/block_softmax_2d_problem.hpp new file mode 100644 index 000000000..82b9a5a48 --- /dev/null +++ b/include/ck_tile/ops/softmax/block/block_softmax_2d_problem.hpp @@ -0,0 +1,16 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck_tile/core.hpp" + +namespace ck_tile { + +template +struct BlockSoftmax2DProblem +{ + using DataType = remove_cvref_t; +}; + +} // namespace ck_tile diff --git a/include/ck_tile/ops/topk.hpp b/include/ck_tile/ops/topk.hpp new file mode 100644 index 000000000..b1143e4a0 --- /dev/null +++ b/include/ck_tile/ops/topk.hpp @@ -0,0 +1,8 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck_tile/ops/topk/block/block_topk_stream_2d.hpp" +#include "ck_tile/ops/topk/block/block_topk_stream_2d_problem.hpp" +#include "ck_tile/ops/common/tensor_layout.hpp" diff --git a/include/ck_tile/ops/topk/block/block_topk_stream_2d.hpp b/include/ck_tile/ops/topk/block/block_topk_stream_2d.hpp new file mode 100644 index 000000000..164685f98 --- /dev/null +++ b/include/ck_tile/ops/topk/block/block_topk_stream_2d.hpp @@ -0,0 +1,113 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck_tile/core.hpp" + +namespace ck_tile { + +/* +simple 2d topk implementation, along row (dim=1) +requirement: + 1). 
each row is within a warp +*/ +template +struct BlockTopkStream2D +{ + using Problem = remove_cvref_t; + using Policy = remove_cvref_t; + + using DataType = typename Problem::DataType; + using IndexType = typename Problem::IndexType; + + // TODO: if DataType is subdword, need pack into single dword to use argmax + struct ArgmaxPacket + { + DataType arg; + index_t value; + }; + + template + CK_TILE_DEVICE void operator()(const DistributedTensor& x, + const OutWindow& out_window, + const IdxWindow& idx_window, + index_t k, + number = {}) + { + OutWindow out_window_tmp = out_window; + IdxWindow idx_window_tmp = idx_window; + static_assert( + std::is_same_v && + std::is_same_v); + static_assert(std::is_same_v); + + DistributedTensor x_tmp = x; + constexpr auto dst_dist = typename IdxWindow::TileDstr{}; + + // argmax for topk + const auto f_argmax = [](ArgmaxPacket e0, ArgmaxPacket e1) { + return e0.arg > e1.arg ? e0 : e1; + }; + + for(index_t i_k = 0; i_k < k; i_k++) + { + constexpr auto span_2d = DistributedTensor::get_distributed_spans(); + auto packet = [&]() { + auto tmp = make_static_distributed_tensor(x.get_tile_distribution()); + + sweep_tile_span(span_2d[number<0>{}], [&](auto idx0) { + sweep_tile_span(span_2d[number<1>{}], [&](auto idx1) { + const auto tile_idx = get_x_indices_from_distributed_indices( + tmp.get_tile_distribution(), make_tuple(idx0, idx1)); + constexpr auto i_j_idx = make_tuple(idx0, idx1); + ArgmaxPacket t; + t.arg = x_tmp(i_j_idx); // !!! 
we reference x here + t.value = tile_idx.at(number<1>{}); + tmp(i_j_idx) = t; + }); + }); + return tmp; + }(); + + auto argmax_init = ArgmaxPacket{-numeric::infinity(), 0}; + auto r = block_tile_reduce(packet, sequence<1>{}, f_argmax, argmax_init); + block_tile_reduce_xor_sync(r, f_argmax); + + auto o = make_static_distributed_tensor(dst_dist); + auto i = make_static_distributed_tensor(dst_dist); + sweep_tile_span(span_2d[number<0>{}], [&](auto idx0) { + sweep_tile_span(span_2d[number<1>{}], [&](auto idx1) { + constexpr auto i_j_idx = make_tuple(idx0, idx1); + ArgmaxPacket tmp = r(i_j_idx); + o(i_j_idx) = tmp.arg; + i(i_j_idx) = tmp.value; + }); + }); + + // update value + sweep_tile_span(span_2d[number<0>{}], [&](auto idx0) { + sweep_tile_span(span_2d[number<1>{}], [&](auto idx1) { + const auto tile_idx = get_x_indices_from_distributed_indices( + x.get_tile_distribution(), make_tuple(idx0, idx1)); + auto col_id = tile_idx.at(number<1>{}); + + constexpr auto i_j_idx = make_tuple(idx0, idx1); + + x_tmp(i_j_idx) = (col_id == r(i_j_idx).value) ? -numeric::infinity() + : x_tmp(i_j_idx); + }); + }); + + if(threadIdx.x % Problem::ColLanes == 0) + { + store_tile(out_window_tmp, o); + store_tile(idx_window_tmp, i); + } + move_tile_window(out_window_tmp, {number<0>{}, number<1>{}}); + move_tile_window(idx_window_tmp, {number<0>{}, number<1>{}}); + } + } +}; + +} // namespace ck_tile diff --git a/include/ck_tile/ops/topk/block/block_topk_stream_2d_problem.hpp b/include/ck_tile/ops/topk/block/block_topk_stream_2d_problem.hpp new file mode 100644 index 000000000..d47188d86 --- /dev/null +++ b/include/ck_tile/ops/topk/block/block_topk_stream_2d_problem.hpp @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck_tile/core.hpp" + +namespace ck_tile { + +/* +simple 2d topk implementation, along row (dim=1) +requirement: + 1). 
each row is within a warp +*/ +template +struct BlockTopkStream2DProblem +{ + using DataType = remove_cvref_t; + using IndexType = remove_cvref_t; + static constexpr index_t ColLanes = ColLanes_; +}; +} // namespace ck_tile diff --git a/include/ck_tile/ops/topk_softmax.hpp b/include/ck_tile/ops/topk_softmax.hpp new file mode 100644 index 000000000..809473d53 --- /dev/null +++ b/include/ck_tile/ops/topk_softmax.hpp @@ -0,0 +1,10 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck_tile/ops/topk_softmax/kernel/topk_softmax_kernel.hpp" +#include "ck_tile/ops/topk_softmax/pipeline/topk_softmax_warp_per_row_pipeline.hpp" +#include "ck_tile/ops/topk_softmax/pipeline/topk_softmax_warp_per_row_policy.hpp" +#include "ck_tile/ops/topk_softmax/pipeline/topk_softmax_warp_per_row_problem.hpp" +#include "ck_tile/ops/common/tensor_layout.hpp" diff --git a/include/ck_tile/ops/topk_softmax/kernel/topk_softmax_kernel.hpp b/include/ck_tile/ops/topk_softmax/kernel/topk_softmax_kernel.hpp new file mode 100644 index 000000000..b8520ae61 --- /dev/null +++ b/include/ck_tile/ops/topk_softmax/kernel/topk_softmax_kernel.hpp @@ -0,0 +1,166 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck_tile/core.hpp" +#include "ck_tile/ops/common.hpp" +#include "ck_tile/ops/elementwise.hpp" +#include "ck_tile/host/hip_check_error.hpp" +#include +#include + +namespace ck_tile { + +struct TopkSoftmaxHostArgs +{ + const void* p_input; + void* p_output; + void* p_indices; + index_t num_rows; + index_t num_experts; + index_t topk; + index_t stride_input; // row stride for input, at least experts + index_t stride_output; // row stride for output/indices, at least topk +}; + +template +struct TopkSoftmaxKernel // builds the per-workgroup input/output/index tile windows and runs Pipeline on them +{ + using Pipeline = remove_cvref_t; + using Problem = remove_cvref_t; + + using InputType = typename Problem::InputType; + using WeightType = typename Problem::WeightType; + using IndexType = typename Problem::IndexType; + + struct TopkSoftmaxKargs + { + const void* p_input; + void* p_output; + void* p_indices; + index_t num_rows; + index_t num_experts; + index_t topk; + index_t stride_input; // row stride for input, at least experts + index_t stride_output; // row stride for output/indices, at least topk + }; + + using Kargs = TopkSoftmaxKargs; + using Hargs = TopkSoftmaxHostArgs; + + CK_TILE_HOST static constexpr auto GridSize(const Hargs& h) + { + if constexpr(Problem::LaunchType > 0) // persistent launch: LaunchType workgroups per compute unit + { + int num_cu = [&]() { + hipDeviceProp_t dev_prop; + hipDevice_t dev; + HIP_CHECK_ERROR(hipGetDevice(&dev)); + HIP_CHECK_ERROR(hipGetDeviceProperties(&dev_prop, dev)); + return dev_prop.multiProcessorCount; + }(); + return dim3(num_cu * Problem::LaunchType); + } + else // streaming launch: just enough workgroups to cover num_rows + { + const int num_warps = (h.num_rows + Problem::RowsPerWarp - 1) / Problem::RowsPerWarp; + const int num_blocks = + (num_warps + Problem::WarpsPerBlock - 1) / Problem::WarpsPerBlock; + return dim3(num_blocks); + } + } + + CK_TILE_HOST static constexpr auto MakeKargs(const Hargs& h) // field-by-field copy of the host arguments + { + Kargs k; + k.p_input = h.p_input; + k.p_output = h.p_output; + k.p_indices = h.p_indices; + k.num_rows = h.num_rows; + k.num_experts = h.num_experts; + k.topk = h.topk; + k.stride_input = 
h.stride_input; + k.stride_output = h.stride_output; + return k; + } + + CK_TILE_HOST_DEVICE static constexpr auto BlockSize() { return Problem::BlockSize; } + + CK_TILE_DEVICE void operator()(Kargs kargs) const + { + index_t block_row_id = static_cast(blockIdx.x * Problem::RowsPerBlock); // first row owned by this workgroup + + if(block_row_id >= kargs.num_rows) // valid rows are [0, num_rows): nothing left for this workgroup + return; + + index_t block_os_inp = __builtin_amdgcn_readfirstlane(block_row_id * kargs.stride_input); // element offset of that row in the input + index_t block_os_out = __builtin_amdgcn_readfirstlane(block_row_id * kargs.stride_output); // element offset of that row in output/indices + index_t num_rows_rem = __builtin_amdgcn_readfirstlane(kargs.num_rows - block_row_id); // rows from block_row_id to the end; used as the row extent of every view below + + const auto input_window = [&]() { + const InputType* p_input = + reinterpret_cast(kargs.p_input) + block_os_inp; + + auto tmp = make_naive_tensor_view( + p_input, + make_tuple(num_rows_rem, kargs.num_experts), + make_tuple(kargs.stride_input, 1), + number{}, + number<1>{}); + + auto view = pad_tensor_view( + tmp, + make_tuple(number{}, number{}), + sequence<0, 1>{}); // out-most dim no need pad(leverage oob) + + return make_tile_window( + view, + make_tuple(number{}, number{}), + {0, 0}); + }(); + + auto output_window = [&]() { + WeightType* p_output = reinterpret_cast(kargs.p_output) + block_os_out; + auto tmp = make_naive_tensor_view( + p_output, + make_tuple(num_rows_rem, kargs.topk), + make_tuple(kargs.stride_output, 1), + number{}, + number<1>{}); + auto view = + pad_tensor_view(tmp, + make_tuple(number{}, number<1>{}), + sequence<0, 0>{}); // 1. out-most dim no need pad(leverage oob) + // 2. 
we loop over topk 1-1, no need padding + return make_tile_window( + view, make_tuple(number{}, number<1>{}), {0, 0}); + }(); + + auto indices_window = [&]() { + IndexType* p_indices = reinterpret_cast(kargs.p_indices) + block_os_out; + auto tmp = make_naive_tensor_view( + p_indices, + make_tuple(num_rows_rem, kargs.topk), + make_tuple(kargs.stride_output, 1), + number{}, + number<1>{}); + auto view = + pad_tensor_view(tmp, + make_tuple(number{}, number<1>{}), + sequence<0, 0>{}); // 1. out-most dim no need pad(leverage oob) + // 2. we loop over topk 1-1, no need padding + return make_tile_window( + view, make_tuple(number{}, number<1>{}), {0, 0}); + }(); + + Pipeline{}(input_window, + output_window, + indices_window, + kargs.num_rows, + kargs.num_experts, + kargs.topk, + block_row_id); // block_row_id lets a persistent pipeline track which rows it has reached + } +}; +} // namespace ck_tile diff --git a/include/ck_tile/ops/topk_softmax/pipeline/topk_softmax_warp_per_row_pipeline.hpp b/include/ck_tile/ops/topk_softmax/pipeline/topk_softmax_warp_per_row_pipeline.hpp new file mode 100644 index 000000000..d620d9bec --- /dev/null +++ b/include/ck_tile/ops/topk_softmax/pipeline/topk_softmax_warp_per_row_pipeline.hpp @@ -0,0 +1,123 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck_tile/core.hpp" +#include "ck_tile/ops/topk_softmax/pipeline/topk_softmax_warp_per_row_policy.hpp" +#include +#include + +#ifndef TOPK_SOFTMAX_USE_RAW_TILE_WINDOW +#define TOPK_SOFTMAX_USE_RAW_TILE_WINDOW 0 +#endif + +namespace ck_tile { + +template +struct TopkSoftmaxWarpPerRowPipeline +{ + // TODO: this kernel only support warp per row + using Problem = remove_cvref_t; + using Policy = remove_cvref_t; + using WeightType = typename Problem::WeightType; + + template + CK_TILE_DEVICE auto operator()(const InputWindow& input_window, + OutputWindow& out_window, + IndexWindow& idx_window, + index_t rows, + index_t experts, + index_t k, + index_t block_row_id) + { +#if TOPK_SOFTMAX_USE_RAW_TILE_WINDOW + auto inp_win = make_tile_window_linear_raw( + input_window, Policy::template MakeInputDistribution(), sequence<0, 1>{}); +#else + auto inp_win = make_tile_window_linear( + input_window, Policy::template MakeInputDistribution(), sequence<0, 1>{}); +#endif + auto out_win = make_tile_window_linear(out_window.get_bottom_tensor_view(), + out_window.get_window_lengths(), + out_window.get_window_origin(), + Policy::template MakeOutputDistribution()); + auto idx_win = make_tile_window_linear(idx_window.get_bottom_tensor_view(), + idx_window.get_window_lengths(), + idx_window.get_window_origin(), + Policy::template MakeOutputDistribution()); + + auto softmax = Policy::template GetSoftmax(); + auto topk = Policy::template GetTopk(); + + const index_t grid_rows_per_loop = gridDim.x * Problem::RowsPerBlock; // rows covered by the whole grid per loop iteration + + while(1) // left only through the break statements in the exit check at the end of the body + { +#if TOPK_SOFTMAX_USE_RAW_TILE_WINDOW + __builtin_amdgcn_sched_barrier(0); + auto x = + load_tile_raw(inp_win, number<-1>{}, bool_constant{}, bool_constant{}); + buffer_load_fence(number<0>{}); + __builtin_amdgcn_sched_barrier(0); +#else + auto x = load_tile(inp_win); +#endif + // cast and pad input data + auto w = [&]() { +#if 0 + auto w_ = cast_tile(x); + + constexpr auto span_2d = decltype(w_)::get_distributed_spans(); + 
sweep_tile_span(span_2d[number<0>{}], [&](auto idx0) { + sweep_tile_span(span_2d[number<1>{}], [&](auto idx1) { + constexpr auto i_j_idx = make_tuple(idx0, idx1); + const auto x_indices = get_x_indices_from_distributed_indices( + w_.get_tile_distribution(), i_j_idx); + const auto current_expert = x_indices.at(number<1>{}); + // set to -INF if OOB so that later softmax can work properly + w_(i_j_idx) = current_expert >= experts ? -numeric::infinity() + : w_(i_j_idx); + }); + }); + return w_; +#else + auto w_ = make_static_distributed_tensor(x.get_tile_distribution()); + auto w_f = [&](auto idx) { + w_(idx) = type_convert(x(idx)); + const auto x_indices = + get_x_indices_from_distributed_indices(w_.get_tile_distribution(), idx); + const auto current_expert = x_indices.at(number<1>{}); // column index of this element in the tile + w_(idx) = + current_expert >= experts ? -numeric::infinity() : w_(idx); // columns at or beyond `experts` become -infinity + }; + tile_sweeper ts{w_, w_f}; + ts(); + return w_; +#endif + }(); + + // softmax + auto y = softmax(w); + + topk(y, out_win, idx_win, k); + + // check exit + if constexpr(Problem::LaunchType == 0) + { + break; // streaming launch: each workgroup processes exactly one tile + } + else + { + block_row_id += grid_rows_per_loop; + if(block_row_id >= rows) // persistent launch: stop once the next tile would start past the last row + break; + } + + move_tile_window(inp_win, {grid_rows_per_loop, number<0>{}}); // advance all three windows by the rows the grid covers per iteration + move_tile_window(out_win, {grid_rows_per_loop, number<0>{}}); + move_tile_window(idx_win, {grid_rows_per_loop, number<0>{}}); + } + } +}; +} // namespace ck_tile diff --git a/include/ck_tile/ops/topk_softmax/pipeline/topk_softmax_warp_per_row_policy.hpp b/include/ck_tile/ops/topk_softmax/pipeline/topk_softmax_warp_per_row_policy.hpp new file mode 100644 index 000000000..a6e886bd3 --- /dev/null +++ b/include/ck_tile/ops/topk_softmax/pipeline/topk_softmax_warp_per_row_policy.hpp @@ -0,0 +1,63 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck_tile/core.hpp" +#include "ck_tile/ops/softmax.hpp" +#include "ck_tile/ops/topk.hpp" + +namespace ck_tile { + +struct TopkSoftmaxWarpPerRowPolicy /* supplies the tile distributions and the softmax/topk block operators requested by the pipeline */ +{ + template + CK_TILE_HOST_DEVICE static constexpr auto MakeInputDistribution() /* distribution the pipeline attaches to the input window */ + { + // TODO: Y dim must have one dim that is not reduced + return make_static_tile_distribution( + tile_distribution_encoding< + sequence<1>, + tuple, + sequence>, + tuple, sequence<1, 2>>, + tuple, sequence<2, 1>>, + sequence<1, 2, 2>, + sequence<0, 0, 2>>{}); + } + + template + CK_TILE_HOST_DEVICE static constexpr auto MakeOutputDistribution() // distribution the pipeline attaches to both the output and the index window + { + return make_static_tile_distribution( + tile_distribution_encoding, // repeat this one + tuple, + sequence<1>>, // each row writes out a single element + tuple, sequence<1, 0>>, + tuple, sequence<2, 0>>, + sequence<1, 2>, + sequence<0, 0>>{}); + } + + template + CK_TILE_HOST_DEVICE static constexpr auto GetSoftmax() // returns a default-constructed BlockSoftmax2D + { + using softmax_problem = BlockSoftmax2DProblem; + return BlockSoftmax2D{}; + } + + template + CK_TILE_HOST_DEVICE static constexpr auto GetTopk() // returns a default-constructed BlockTopkStream2D + { + using topk_problem = BlockTopkStream2DProblem; + // Note: replicate is LanesPerRow + return BlockTopkStream2D{}; + } +}; +} // namespace ck_tile diff --git a/include/ck_tile/ops/topk_softmax/pipeline/topk_softmax_warp_per_row_problem.hpp b/include/ck_tile/ops/topk_softmax/pipeline/topk_softmax_warp_per_row_problem.hpp new file mode 100644 index 000000000..917096ad5 --- /dev/null +++ b/include/ck_tile/ops/topk_softmax/pipeline/topk_softmax_warp_per_row_problem.hpp @@ -0,0 +1,46 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck_tile/core.hpp" +#include +#include + +namespace ck_tile { + +template 0, persistent #occupancy + index_t BlockSize_ = 256> +struct TopkSoftmaxWarpPerRowProblem +{ + // TODO: this kernel only support warp per row + using InputType = remove_cvref_t; + using WeightType = remove_cvref_t; + using IndexType = remove_cvref_t; + + static constexpr index_t LaunchType = LaunchType_; // 0: streaming launch; >0: persistent launch (the kernel's GridSize multiplies the CU count by it) + static constexpr index_t Experts = Experts_; + static constexpr index_t BytesPerIssue = BytesPerIssue_; + static constexpr index_t IssuesPerCol = IssuesPerCol_; + static constexpr index_t BlockSize = BlockSize_; + static constexpr index_t WarpSize = get_warp_size(); + + static_assert(BytesPerIssue % sizeof(InputType) == 0); + static constexpr index_t VectorSize = BytesPerIssue / sizeof(InputType); // InputType elements per issue + static_assert(Experts % VectorSize == 0); + static constexpr index_t LanesPerRow = min(Experts / VectorSize, WarpSize); // VectorSize-wide chunks in one row, capped at a full warp + static_assert(WarpSize % LanesPerRow == 0); + static constexpr index_t RowsPerWarpPerColIssue = WarpSize / LanesPerRow; // rows covered when a warp's lanes are split LanesPerRow per row + static constexpr index_t RowsPerWarp = IssuesPerCol * RowsPerWarpPerColIssue; + static constexpr index_t IssuesPerRow = Experts / (LanesPerRow * VectorSize); // issues each lane needs to span all Experts columns + + static constexpr index_t WarpsPerBlock = BlockSize / WarpSize; + static constexpr index_t RowsPerBlock = RowsPerWarp * WarpsPerBlock; // row tile height used by the kernel and pipeline +}; +} // namespace ck_tile -- GitLab From 922e42a039a42770446c42fabc62fe1e7b050625 Mon Sep 17 00:00:00 2001 From: Illia Silin <98187287+illsilin@users.noreply.github.com> Date: Mon, 28 Oct 2024 19:02:48 -0700 Subject: [PATCH 021/153] fix compilation errors for gfx12 with clang20 (#1606) --- include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp index 3ea19da74..fa389c340 100644 --- 
a/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp +++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp @@ -352,7 +352,7 @@ struct BlockwiseGemmWMMA constexpr index_t c_offset = c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0)); - wmma_gemm.template Run( + wmma_gemm.template Run<>( a_thread_vec.template AsType(), b_thread_vec.template AsType(), c_thread_buf.GetVectorTypeReference(Number{})); @@ -406,7 +406,7 @@ struct BlockwiseGemmWMMA constexpr index_t c_offset = c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0)); - wmma_gemm.template Run( + wmma_gemm.template Run<>( a_thread_vec.template AsType(), b_thread_vec.template AsType(), c_thread_buf.GetVectorTypeReference(Number{})); -- GitLab From 9fbd72e97e34f530ae370527755b655bf390d9ee Mon Sep 17 00:00:00 2001 From: valarLip <103567126+valarLip@users.noreply.github.com> Date: Tue, 29 Oct 2024 18:05:53 +0800 Subject: [PATCH 022/153] [CK_TILE] add generic_permute (#1607) --- example/ck_tile/06_permute/CMakeLists.txt | 13 + example/ck_tile/06_permute/README.md | 46 ++ .../alternative_impl/matrix_core_swizzle.cpp | 98 +++++ .../alternative_impl/matrix_core_swizzle.hpp | 20 + .../matrix_core_swizzle_kernel.hpp | 413 ++++++++++++++++++ example/ck_tile/06_permute/permute.cpp | 411 +++++++++++++++++ example/ck_tile/06_permute/permute.hpp | 19 + .../ck_tile/06_permute/script/smoke_test.sh | 34 ++ example/ck_tile/CMakeLists.txt | 1 + include/ck_tile/host.hpp | 1 + .../host/reference/reference_permute.hpp | 57 +++ include/ck_tile/ops/permute.hpp | 8 + .../permute/kernel/generic_permute_kernel.hpp | 169 +++++++ .../pipeline/generic_petmute_problem.hpp | 28 ++ 14 files changed, 1318 insertions(+) create mode 100644 example/ck_tile/06_permute/CMakeLists.txt create mode 100644 example/ck_tile/06_permute/README.md create mode 100644 example/ck_tile/06_permute/alternative_impl/matrix_core_swizzle.cpp create mode 100644 example/ck_tile/06_permute/alternative_impl/matrix_core_swizzle.hpp create mode 
100644 example/ck_tile/06_permute/alternative_impl/matrix_core_swizzle_kernel.hpp create mode 100644 example/ck_tile/06_permute/permute.cpp create mode 100644 example/ck_tile/06_permute/permute.hpp create mode 100644 example/ck_tile/06_permute/script/smoke_test.sh create mode 100644 include/ck_tile/host/reference/reference_permute.hpp create mode 100644 include/ck_tile/ops/permute.hpp create mode 100644 include/ck_tile/ops/permute/kernel/generic_permute_kernel.hpp create mode 100644 include/ck_tile/ops/permute/pipeline/generic_petmute_problem.hpp diff --git a/example/ck_tile/06_permute/CMakeLists.txt b/example/ck_tile/06_permute/CMakeLists.txt new file mode 100644 index 000000000..327fceb68 --- /dev/null +++ b/example/ck_tile/06_permute/CMakeLists.txt @@ -0,0 +1,13 @@ +# not using add_example_executable() to add this target, since we don't want this to have +# to be included in "make all/install/check" +add_executable(tile_example_permute EXCLUDE_FROM_ALL permute.cpp) + +if(NOT DEFINED PERMUTE_USE_ALTERNATIVE_IMPL) +# set(PERMUTE_USE_ALTERNATIVE_IMPL false) +set(PERMUTE_USE_ALTERNATIVE_IMPL true) +endif() +if(PERMUTE_USE_ALTERNATIVE_IMPL) +target_compile_options(tile_example_permute PRIVATE -DPERMUTE_USE_ALTERNATIVE_IMPL) +target_sources(tile_example_permute PRIVATE alternative_impl/matrix_core_swizzle.cpp) +endif() +# target_compile_options(tile_example_permute PRIVATE -v --save-temps -Wno-gnu-line-marker) diff --git a/example/ck_tile/06_permute/README.md b/example/ck_tile/06_permute/README.md new file mode 100644 index 000000000..03bd810ff --- /dev/null +++ b/example/ck_tile/06_permute/README.md @@ -0,0 +1,46 @@ +# permute + +This folder contains an example of a permute kernel, which is similar to [torch.permute](https://pytorch.org/docs/stable/generated/torch.permute.html) (combined with [torch.contiguous](https://pytorch.org/docs/stable/generated/torch.Tensor.contiguous.html)). 
Currently we implement a generic permute kernel that supports arbitrary permutations of tensors up to rank 8 with a single kernel instance. Performance is not the primary consideration; we prefer a simple and general kernel implementation using `ck_tile` in this example. + + +``` +args: + -v whether to do CPU validation or not (default:1) + -prec data type. fp16/bf16/fp32 (default:fp16) + -shape the shape of the input tensor (default:2,3,4) + -perm permute perm (default:2,1,0) +``` + +## build +``` +# in the root of ck_tile +mkdir build && cd build +sh ../script/cmake-ck-dev.sh ../ <arch> # you can replace <arch> with gfx90a, gfx942... +make tile_example_permute -j +``` +This will result in an executable `build/bin/tile_example_permute` + + +## some examples +``` +# torch +x=torch.randn(2,3,4,6) +y=x.permute(0,3,2,1).contiguous() + +# ck_tile +./build/bin/tile_example_permute -shape=2,3,4,6 -perm=0,3,2,1 +``` + +or you can try the smoke_test +``` +# in the root of ck_tile, after you build this example +sh example/ck_tile/06_permute/script/smoke_test.sh +``` + +### alternative implementation +We have an alternative implementation under the `alternative_impl/` folder that can swizzle the tensor to make data loading friendlier for the matrix core layout. This can be enabled when dealing with a `rank-7` tensor, with a fixed pattern of either `0,1,4,2,5,3,6` or `0,1,2,4,5,3,6`. This implementation has other shape limitations; check the source code of `permute.cpp` for details. 
+``` +# example +./build/bin/tile_example_permute -shape=3,6,4,32,16,2,8 -perm=0,1,4,2,5,3,6 # b_n0_k0_n1_k1_n2_k2 +./build/bin/tile_example_permute -shape=3,8,4,16,16,4,8 -perm=0,1,2,4,5,3,6 # b_n0_n1_k0_k1_n2_k2 +``` diff --git a/example/ck_tile/06_permute/alternative_impl/matrix_core_swizzle.cpp b/example/ck_tile/06_permute/alternative_impl/matrix_core_swizzle.cpp new file mode 100644 index 000000000..93c662a28 --- /dev/null +++ b/example/ck_tile/06_permute/alternative_impl/matrix_core_swizzle.cpp @@ -0,0 +1,98 @@ +#include "matrix_core_swizzle.hpp" +#include "matrix_core_swizzle_kernel.hpp" + +float matrix_core_swizzle(matrix_core_swizzle_traits t, + matrix_core_swizzle_args a, + const ck_tile::stream_config& s) /* picks one kernel instantiation from the data_type / inst / permute strings and returns what launch_kernel returns */ +{ + if(t.data_type.compare("fp16") == 0) + { + if(t.inst.compare("32x32x8") == 0) + { + constexpr int BLOCK_SIZE = 256; + constexpr int NPerBlock = 256; + constexpr int KPerBlock = 128; + constexpr matrix_core_inst_enum Inst = matrix_core_inst_enum::MFMA_32x32x8_F16; + if(t.permute.compare("0,1,4,2,5,3,6") == 0) + { + constexpr matrix_core_permute_style pstyle = + matrix_core_permute_style::permute_b_n0_k0_n1_k1_n2_k2; + using Kernel = + matrix_core_swizzle_kernel; + + auto k = Kernel(a); + float ave_time = ck_tile::launch_kernel(s, k); + + return ave_time; + } + else if(t.permute.compare("0,1,2,4,5,3,6") == 0) + { + constexpr matrix_core_permute_style pstyle = + matrix_core_permute_style::permute_b_n0_n1_k0_k1_n2_k2; + using Kernel = + matrix_core_swizzle_kernel; + + auto k = Kernel(a); + float ave_time = ck_tile::launch_kernel(s, k); + + return ave_time; + } + else if(t.permute.compare("0,1,3,4,2,5") == 0) + { + constexpr matrix_core_permute_style pstyle = + matrix_core_permute_style::permute_b_nr_kr_kw_nw_kv; + using Kernel = + matrix_core_swizzle_kernel; + + auto k = Kernel(a); + float ave_time = ck_tile::launch_kernel(s, k); + + return ave_time; + } + } + else if(t.inst.compare("16x16x16") == 0) + { + constexpr int BLOCK_SIZE = 256; + constexpr 
int NPerBlock = 256; + constexpr int KPerBlock = 128; + constexpr matrix_core_inst_enum Inst = matrix_core_inst_enum::MFMA_16x16x16_F16; + if(t.permute.compare("0,1,4,2,5,3,6") == 0) + { + constexpr matrix_core_permute_style pstyle = + matrix_core_permute_style::permute_b_n0_k0_n1_k1_n2_k2; + using Kernel = + matrix_core_swizzle_kernel; + + auto k = Kernel(a); + float ave_time = ck_tile::launch_kernel(s, k); + + return ave_time; + } + else if(t.permute.compare("0,1,2,4,5,3,6") == 0) + { + constexpr matrix_core_permute_style pstyle = + matrix_core_permute_style::permute_b_n0_n1_k0_k1_n2_k2; + using Kernel = + matrix_core_swizzle_kernel; + + auto k = Kernel(a); + float ave_time = ck_tile::launch_kernel(s, k); + + return ave_time; + } + else if(t.permute.compare("0,1,3,4,2,5") == 0) + { + constexpr matrix_core_permute_style pstyle = + matrix_core_permute_style::permute_b_nr_kr_kw_nw_kv; + using Kernel = + matrix_core_swizzle_kernel; + + auto k = Kernel(a); + float ave_time = ck_tile::launch_kernel(s, k); + + return ave_time; + } + } + } + return -1; /* no instantiation matches the requested data_type / inst / permute combination; nothing was launched */ +} diff --git a/example/ck_tile/06_permute/alternative_impl/matrix_core_swizzle.hpp b/example/ck_tile/06_permute/alternative_impl/matrix_core_swizzle.hpp new file mode 100644 index 000000000..e1ecdbbe6 --- /dev/null +++ b/example/ck_tile/06_permute/alternative_impl/matrix_core_swizzle.hpp @@ -0,0 +1,20 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once +#include "matrix_core_swizzle_kernel.hpp" +#include + +struct matrix_core_swizzle_traits +{ + std::string data_type; // fp16 only + std::string inst; // 32x32x8, 16x16x16 + std::string permute; // "0,1,4,2,5,3,6", "0,1,2,4,5,3,6" or "0,1,3,4,2,5" (the strings matrix_core_swizzle() compares against) +}; + +using matrix_core_swizzle_args = matrix_core_swizzle_host_args; + +// host API +float matrix_core_swizzle(matrix_core_swizzle_traits, + matrix_core_swizzle_args, + const ck_tile::stream_config&); diff --git a/example/ck_tile/06_permute/alternative_impl/matrix_core_swizzle_kernel.hpp b/example/ck_tile/06_permute/alternative_impl/matrix_core_swizzle_kernel.hpp new file mode 100644 index 000000000..60ac103ec --- /dev/null +++ b/example/ck_tile/06_permute/alternative_impl/matrix_core_swizzle_kernel.hpp @@ -0,0 +1,413 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck_tile/core.hpp" +#include "ck_tile/host.hpp" +#include "ck_tile/ops/gemm.hpp" + +// if set to 1, slightly more instructions generated to calculate address +#ifndef MERGE_2D_013425 +#define MERGE_2D_013425 0 +#endif + +enum class matrix_core_inst_enum +{ + MFMA_32x32x8_F16 = 0, + MFMA_16x16x16_F16 = 1, +}; + +namespace detail { +template +struct to_warp_gemm; + +template <> +struct to_warp_gemm +{ + using type = ck_tile::WarpGemmMfmaF16F16F32M32N32K8; +}; + +template <> +struct to_warp_gemm +{ + using type = ck_tile::WarpGemmMfmaF16F16F32M16N16K16; +}; +} // namespace detail +template +using to_warp_gemm_t = typename detail::to_warp_gemm::type; + +// TODO: in below permute pattern, the last 3 dim is within wave +enum class matrix_core_permute_style +{ + permute_b_n0_k0_n1_k1_n2_k2 = 0, // 0,1,4,2,5,3,6 + permute_b_n0_n1_k0_k1_n2_k2 = 1, // 0,1,2,4,5,3,6 + permute_b_nr_kr_kw_nw_kv = 2, // 0,1,3,4,2,5 + permute_b_nr_kr_waveflatten = permute_b_nr_kr_kw_nw_kv, // alias of the entry above +}; + +// assume this is B matrix, originally we have batch*n*k +// now batch* n0*n1*n2*k0*k1*k2 -> batch* n0*k0*n1*k1*n2*k2 +// assume using 
32x32x8-f16, 4 waves and extend the KPerLane to 8xfp16(dwordx4) +// +// 4(waves) 32(mfma_m lane) +// | | +// batch* n0*n1*n2*k0*k1*k2 -> batch* n0*k0*n1*k1*n2*k2 -> 8(thread loading) +// nr kr | +// nr 4 32 kr 2 8 2(klane) +// +// permute: 0,1,4,2,5,3,6 +// or +// batch* n0*n1*n2*k0*k1*k2 -> batch* n0*n1*k0*k1*n2*k2 -> 8(thread loading) +// permute: 0,1,2,4,5,3,6 +// +// this kernel only deal with fp16/bf16 data(16bit), and use 2d block size to do the swizzling +// for simplicity, only consider n/k is multiple of block-size + +// independent host arg with no template +struct matrix_core_swizzle_host_args +{ + const void* p_src; + void* p_dst; + int32_t batch; + int32_t n; + int32_t k; +}; + +// NOTE: this kernel could follow the style of generic permute kernel +// but here we pass in fixed layout as template arg and generate different kernel instance +// purposely +template +struct matrix_core_swizzle_kernel +{ + using karg = matrix_core_swizzle_host_args; + using harg = matrix_core_swizzle_host_args; + + static constexpr int BLOCK_SIZE = BLOCK_SIZE_; + static constexpr int WavesPerBlock_N = 4; + static constexpr int WavesPerBlock_K = 1; + static_assert(WavesPerBlock_N * WavesPerBlock_K * 64 == BLOCK_SIZE); + static constexpr int NPerBlock = NPerBlock_; + static constexpr int KPerBlock = KPerBlock_; + static constexpr matrix_core_permute_style pstyle = pstyle_; + static constexpr matrix_core_inst_enum Inst = Inst_; + + static constexpr ck_tile::index_t Alignment = 8; // used as K2 / Kv in the distributions below + karg a; + dim3 grids; + + using WarpGemm = to_warp_gemm_t; + + __host__ matrix_core_swizzle_kernel(harg h) + { + a = h; + ck_tile::index_t ns = (h.n + NPerBlock - 1) / NPerBlock; + ck_tile::index_t ks = (h.k + KPerBlock - 1) / KPerBlock; + grids = dim3(ks, ns, h.batch); // grid: x = K tiles, y = N tiles, z = batch + } + + __host__ bool is_applicable(harg h) { return h.n % NPerBlock == 0 && h.k % KPerBlock == 0; } // true only when n and k are exact multiples of the block tile + + __host__ void operator()(const ck_tile::stream_config& s) const + { + ck_tile::kentry<<>>(a); + } + + struct kernel + { + 
__device__ static constexpr auto get_src_dist() + { + using namespace ck_tile; + constexpr index_t K2 = Alignment; + constexpr index_t N2 = WarpGemm::WarpGemmAttribute::Impl::kAMLane; + constexpr index_t K1 = WarpGemm::WarpGemmAttribute::Impl::kABKLane; + constexpr index_t N1 = BLOCK_SIZE / get_warp_size(); + + static_assert(NPerBlock % (N1 * N2) == 0); + static_assert(KPerBlock % (K1 * K2) == 0); + + constexpr index_t K0 = KPerBlock / (K1 * K2); + constexpr index_t N0 = NPerBlock / (N1 * N2); + + // clang-format off + return make_static_tile_distribution( + tile_distribution_encoding< + sequence<1>,// 0 + // 1 2 3 4 5 6 + tuple, sequence, sequence, sequence, sequence, sequence>, + + // N1 K1 N2 + tuple, sequence<5, 3>>, + tuple, sequence<0, 0>>, + + // N0 K0 K2 + sequence<1, 4, 6>, + sequence<0, 0, 0>>{}); + // clang-format on + } + __device__ static constexpr auto get_dst_dist() // destination-side distribution; the encoding is chosen by pstyle + { + using namespace ck_tile; + constexpr index_t K2 = Alignment; + constexpr index_t N2 = WarpGemm::WarpGemmAttribute::Impl::kAMLane; + constexpr index_t K1 = WarpGemm::WarpGemmAttribute::Impl::kABKLane; + constexpr index_t N1 = BLOCK_SIZE / get_warp_size(); + + static_assert(NPerBlock % (N1 * N2) == 0); + static_assert(KPerBlock % (K1 * K2) == 0); + + constexpr index_t K0 = KPerBlock / (K1 * K2); + constexpr index_t N0 = NPerBlock / (N1 * N2); + + if constexpr(pstyle == matrix_core_permute_style::permute_b_n0_k0_n1_k1_n2_k2) + { + // clang-format off + return make_static_tile_distribution( + tile_distribution_encoding< + sequence<1>,// 0 + // 1 2 3 4 5 6 + tuple, sequence, sequence, sequence, sequence, sequence>, + + // N1 K1 N2 + tuple, sequence<4, 5>>, + tuple, sequence<0, 0>>, + + // N0 K0 K2 + sequence<1, 2, 6>, + sequence<0, 0, 0>>{}); + // clang-format on + } + else if constexpr(pstyle == matrix_core_permute_style::permute_b_n0_n1_k0_k1_n2_k2) + { + // clang-format off + return make_static_tile_distribution( + tile_distribution_encoding< + sequence<1>,// 0 + // 1 2 3 4 5 6 + 
tuple, sequence, sequence, sequence, sequence, sequence>, + + // N1 K1 N2 + tuple, sequence<4, 5>>, + tuple, sequence<0, 0>>, + + // N0 K0 K2 + sequence<1, 3, 6>, + sequence<0, 0, 0>>{}); + // clang-format on + } + else + { + // clang-format off + // permute_b_nr_kr_kw_nw_kv or permute_b_nr_kr_waveflatten + constexpr index_t Kv = Alignment; + constexpr index_t Nw = WarpGemm::WarpGemmAttribute::Impl::kAMLane; + constexpr index_t Kw = WarpGemm::WarpGemmAttribute::Impl::kABKLane; + + static_assert(KPerBlock % (K1 * K2) == 0); + constexpr index_t Nr = NPerBlock / Nw; + constexpr index_t Kr = KPerBlock / (Kv * Kw); + + constexpr index_t Nr_p = WavesPerBlock_N; + constexpr index_t Kr_p = WavesPerBlock_K; + constexpr index_t Nr_y = Nr / Nr_p; + constexpr index_t Kr_y = Kr / Kr_p; + + return make_static_tile_distribution( +#if MERGE_2D_013425 + tile_distribution_encoding< + sequence<1>,// 0 R + // major 1 2 + // minor 0 1 2 0 1 2 3 + tuple, sequence>, // H + + // Nr_p, Kr_p Kw Nw + tuple, sequence<2, 1>>, // p major + tuple, sequence<2, 2>>, // p minor + + // Nr_y Kr_y Kv + sequence<1, 2, 2>, // Y major + sequence<0, 0, 3>>{}); // y minor +#else + tile_distribution_encoding< + sequence<1>,// 0 R + // major 1 2 3 + // minor 0 1 0 1 0 1 2 + tuple, sequence, sequence>, // H + + // Nr_p, Kr_p Kw Nw + tuple, sequence<3, 3>>, // p major + tuple, sequence<0, 1>>, // p minor + + // Nr_y Kr_y Kv + sequence<1, 2, 3>, // Y major + sequence<0, 0, 2>>{}); // y minor +#endif + // clang-format on + } + } + + __device__ void operator()(karg a_) + { + using namespace ck_tile; + index_t i_k = blockIdx.x; // K-tile index (grid x) + index_t i_n = blockIdx.y; // N-tile index (grid y) + index_t i_b = blockIdx.z; // batch index (grid z) + + constexpr index_t k2 = Alignment; + constexpr index_t n2 = WarpGemm::WarpGemmAttribute::Impl::kAMLane; + constexpr index_t k1 = WarpGemm::WarpGemmAttribute::Impl::kABKLane; + constexpr index_t n1 = BLOCK_SIZE / get_warp_size(); + const index_t k0 = a_.k / (k1 * k2); + const index_t n0 = a_.n / (n1 * n2); + + constexpr index_t 
k2_tile = Alignment; + constexpr index_t n2_tile = WarpGemm::WarpGemmAttribute::Impl::kAMLane; + constexpr index_t k1_tile = WarpGemm::WarpGemmAttribute::Impl::kABKLane; + constexpr index_t n1_tile = BLOCK_SIZE / get_warp_size(); + constexpr index_t k0_tile = KPerBlock / (k1_tile * k2_tile); + constexpr index_t n0_tile = NPerBlock / (n1_tile * n2_tile); + + const fp16_t* p_src = reinterpret_cast(a_.p_src) + i_b * a_.k * a_.n; + fp16_t* p_dst = reinterpret_cast(a_.p_dst) + i_b * a_.k * a_.n; + + const auto src_view = [&]() { + const auto tmp = make_naive_tensor_view_packed( + p_src, + make_tuple(n0, n1, n2, k0, k1, k2), + number{}); // control vector load + return tmp; + }(); + + const auto src_window = make_tile_window(src_view, + make_tuple(number{}, + number{}, + number{}, + number{}, + number{}, + number{}), + {i_n * n0_tile, 0, 0, i_k * k0_tile, 0, 0}, + get_src_dist()); + + auto dst_view = [&]() { + if constexpr(pstyle == matrix_core_permute_style::permute_b_n0_k0_n1_k1_n2_k2) + { + auto tmp = make_naive_tensor_view_packed( + p_dst, + make_tuple(n0, k0, n1, k1, n2, k2), + number{}); // control vector load + return tmp; + } + else if constexpr(pstyle == matrix_core_permute_style::permute_b_n0_n1_k0_k1_n2_k2) + { + auto tmp = make_naive_tensor_view_packed( + p_dst, + make_tuple(n0, n1, k0, k1, n2, k2), + number{}); // control vector load + return tmp; + } + else + { +#if MERGE_2D_013425 + constexpr index_t kv = Alignment; + constexpr index_t nw = WarpGemm::WarpGemmAttribute::Impl::kAMLane; + constexpr index_t kw = WarpGemm::WarpGemmAttribute::Impl::kABKLane; + // constexpr index_t waveflatten = kw*nw*kv; + const index_t kr = a_.k / (k1 * k2); + const index_t nr = a_.n / nw; + auto tmp = make_naive_tensor_view_packed( + p_dst, + make_tuple(nr, kr, number{}, number{}, number{}), + number{}); // control vector load + auto tmp_1 = transform_tensor_view( + tmp, + make_tuple( + make_merge_transform(make_tuple(nr, number{})), + make_merge_transform(make_tuple(kr, 
number{}, number{}))), + make_tuple(sequence<0, 3>{}, sequence<1, 2, 4>{}), + make_tuple(sequence<0>{}, sequence<1>{})); + return tmp_1; +#else + // permute_b_nr_kr_waveflatten = permute_b_nr_kr_kw_nw_kv, + constexpr index_t kv = Alignment; + constexpr index_t nw = WarpGemm::WarpGemmAttribute::Impl::kAMLane; + constexpr index_t kw = WarpGemm::WarpGemmAttribute::Impl::kABKLane; + constexpr index_t waveflatten = kw * nw * kv; + const index_t kr = a_.k / (k1 * k2); + const index_t nr = a_.n / nw; + auto tmp = make_naive_tensor_view_packed( + p_dst, + make_tuple(nr, kr, waveflatten), + number{}); // control vector load + return tmp; +#endif + } + }(); + + auto dst_window = [&]() { + if constexpr(pstyle == matrix_core_permute_style::permute_b_n0_k0_n1_k1_n2_k2) + { + return make_tile_window(dst_view, + make_tuple(number{}, + number{}, + number{}, + number{}, + number{}, + number{}), + {i_n * n0_tile, i_k * k0_tile, 0, 0, 0, 0}, + get_dst_dist()); + } + else if constexpr(pstyle == matrix_core_permute_style::permute_b_n0_n1_k0_k1_n2_k2) + { + return make_tile_window(dst_view, + make_tuple(number{}, + number{}, + number{}, + number{}, + number{}, + number{}), + {i_n * n0_tile, 0, i_k * k0_tile, 0, 0, 0}, + get_dst_dist()); + } + else + { +#if MERGE_2D_013425 + // permute_b_nr_kr_waveflatten = permute_b_nr_kr_kw_nw_kv + return make_tile_window(dst_view, + make_tuple(number{}, number{}), + {i_n * NPerBlock, i_k * KPerBlock}, + get_dst_dist()); +#else + // permute_b_nr_kr_waveflatten = permute_b_nr_kr_kw_nw_kv + constexpr index_t kv = Alignment; + constexpr index_t nw = WarpGemm::WarpGemmAttribute::Impl::kAMLane; + constexpr index_t kw = WarpGemm::WarpGemmAttribute::Impl::kABKLane; + constexpr index_t waveflatten_tile = kw * nw * kv; + constexpr index_t nr_tile = NPerBlock / nw; + constexpr index_t kr_tile = KPerBlock / (kw * kv); + return make_tile_window(dst_view, + make_tuple(number{}, + number{}, + number{}), + {i_n * nr_tile, i_k * kr_tile, 0}, + get_dst_dist()); +#endif 
+ } + }(); + + // actual load store + auto src_tile = load_tile(src_window); + + // now we only swap the distribution from src to dst, no extra movement occurs + auto dst_tile = make_static_distributed_tensor(get_dst_dist()); + dst_tile.get_thread_buffer() = src_tile.get_thread_buffer(); + + // final store + store_tile(dst_window, dst_tile); + } + }; +}; diff --git a/example/ck_tile/06_permute/permute.cpp b/example/ck_tile/06_permute/permute.cpp new file mode 100644 index 000000000..af95b64e6 --- /dev/null +++ b/example/ck_tile/06_permute/permute.cpp @@ -0,0 +1,411 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "permute.hpp" +#include "ck_tile/host.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef PERMUTE_USE_ALTERNATIVE_IMPL +#include "alternative_impl/matrix_core_swizzle.hpp" +#endif + +namespace detail { +template +struct to_integer_type; + +template <> +struct to_integer_type<4> +{ + using type = int32_t; +}; +template <> +struct to_integer_type<2> +{ + using type = int16_t; +}; +template <> +struct to_integer_type<1> +{ + using type = int8_t; +}; +} // namespace detail + +template +using to_integer_type = typename detail::to_integer_type::type; + +// host API (should come from codegen) +float permute(permute_traits t, permute_args a, const ck_tile::stream_config& s) +{ + if(t.data_type.compare("fp8") == 0) + { + using DataType = ck_tile::fp8_t; + using PipelineProblem = ck_tile::GenericPermuteProblem; + using Kernel = ck_tile::GenericPermute; + + auto kargs = Kernel::MakeKargs(a); + + const dim3 grids = Kernel::GridSize(a); + constexpr dim3 blocks = Kernel::BlockSize(); + + float ave_time = ck_tile::launch_kernel( + s, ck_tile::make_kernel(Kernel{}, grids, blocks, 0, kargs)); + + return ave_time; + } + else if(t.data_type.compare("fp16") == 0) + { + using DataType = ck_tile::half_t; + using PipelineProblem = 
ck_tile::GenericPermuteProblem; + using Kernel = ck_tile::GenericPermute; + + auto kargs = Kernel::MakeKargs(a); + + const dim3 grids = Kernel::GridSize(a); + constexpr dim3 blocks = Kernel::BlockSize(); + + float ave_time = ck_tile::launch_kernel( + s, ck_tile::make_kernel(Kernel{}, grids, blocks, 0, kargs)); + + return ave_time; + } + else if(t.data_type.compare("fp32") == 0) + { + using DataType = float; + using PipelineProblem = ck_tile::GenericPermuteProblem; + using Kernel = ck_tile::GenericPermute; + + auto kargs = Kernel::MakeKargs(a); + + const dim3 grids = Kernel::GridSize(a); + constexpr dim3 blocks = Kernel::BlockSize(); + + float ave_time = ck_tile::launch_kernel( + s, ck_tile::make_kernel(Kernel{}, grids, blocks, 0, kargs)); + + return ave_time; + } + + return 0; +} + +template +std::ostream& operator<<(std::ostream& os, const std::vector& v) +{ + using size_type = typename std::vector::size_type; + + os << "["; + for(size_type idx = 0; idx < v.size(); ++idx) + { + if(0 < idx) + { + os << ", "; + } + os << v[idx]; + } + return os << "]"; +} + +auto create_args(int argc, char* argv[]) +{ + ck_tile::ArgParser arg_parser; + arg_parser.insert("v", "1", "weather do CPU validation or not") + .insert("prec", "fp16", "data type. fp8/fp16/fp32 (representing 8/16/32 bit data)") + .insert("shape", "2,3,4", "the shape of the input tensor") + .insert("perm", "2,1,0", "permute perm") + .insert("kname", "0", "t to 1 will print kernel name") + .insert("seed", + "11939", + "random seed used for initializing input tensors. 
0 for " + "non-deterministic seed") + .insert("warmup", "5", "number of iterations before benchmark the kernel") + .insert("repeat", "20", "number of iterations to benchmark the kernel"); + + bool result = arg_parser.parse(argc, argv); + return std::make_tuple(result, arg_parser); +} + +// different threshold for different dtype +template +auto get_elimit(std::string /*init_method*/) +{ + double rtol = 1e-3; + double atol = 1e-3; + return ck_tile::make_tuple(rtol, atol); +} + +template <> +auto get_elimit(std::string /*init_method*/) +{ + double rtol = 1e-2; + double atol = 1e-2; + return ck_tile::make_tuple(rtol, atol); +} + +template <> +auto get_elimit(std::string init_method) +{ + if(init_method == "ui" || init_method == "ni") + { + unsigned max_rounding_point_distance = 0; + double atol = 2e-3; + return ck_tile::make_tuple(max_rounding_point_distance, atol); + } + else + { + unsigned max_rounding_point_distance = 1; + double atol = 0.0625; + return ck_tile::make_tuple(max_rounding_point_distance, atol); + } +} + +// "1,2,3,4" -> vector{1,2,3,4} +std::vector decode_vec(std::string q_val) +{ +#define _S2I_(str_) static_cast(std::atoi((str_).c_str())) + std::string::size_type pos = 0; + std::vector v; + while(true) + { + auto found = q_val.find(',', pos); + ck_tile::index_t n = + _S2I_(q_val.substr(pos, found == std::string::npos ? 
found : found - pos)); + v.push_back(n); + if(found == std::string::npos) + { + break; + } + pos = found + 1; + } + return v; +#undef _S2I_ +} + +template +bool run(const ck_tile::ArgParser& arg_parser) +{ + std::string data_type = arg_parser.get_str("prec"); + int do_validation = arg_parser.get_int("v"); + + auto shape = decode_vec(arg_parser.get_str("shape")); + auto perm = decode_vec(arg_parser.get_str("perm")); + int stream_warmup = arg_parser.get_int("warmup"); + int stream_repeat = arg_parser.get_int("repeat"); + bool kname = arg_parser.get_bool("kname"); + int seed = arg_parser.get_int("seed"); + + assert(shape.size() == perm.size()); + ck_tile::index_t rank = perm.size(); + if(rank > ck_tile::GenericPermuteHostArgs::kMaxRanks) + { + printf("rank %d permute is not support yet\n", rank); + return false; + } + + ck_tile::HostTensor x(shape); + ck_tile::FillUniformDistributionIntegerValue{-15, 15, seed}(x); + + std::vector y_shape = [&]() { + std::vector tmp(rank, 0); + // std::cout << "@@@@" << tmp << std::endl; + for(int i = 0; i < static_cast(rank); i++) + { + // std::cout << " i:" << i << ", perm:" << perm[i] << ", rak:" << + // static_cast(rank) + // << std::endl; + tmp[i] = shape[perm[i]]; + } + // std::cout << "@@@" << tmp << std::endl; + return tmp; + }(); + + ck_tile::HostTensor y(y_shape); + + ck_tile::DeviceMem x_buf(x.get_element_space_size_in_bytes()); + ck_tile::DeviceMem y_buf(y.get_element_space_size_in_bytes()); + + x_buf.ToDevice(x.data()); + + std::cout << "[" << data_type << "] shape:" << shape << "->" << y_shape << ", permute:" << perm + << std::flush; + + ck_tile::stream_config stream_config{nullptr, + true, + /* log_level = */ (kname ? 
1 : 0), + stream_warmup, + stream_repeat}; + float ave_time = 0.f; + auto run_permute = [&]() { + permute_traits t; + t.data_type = data_type; + + permute_args a; + a.p_src = x_buf.GetDeviceBuffer(); + a.p_dst = y_buf.GetDeviceBuffer(); + a.rank = rank; + std::copy(shape.begin(), shape.end(), a.shape); + std::copy(perm.begin(), perm.end(), a.perm); + + return permute(t, a, stream_config); + }; +#ifdef PERMUTE_USE_ALTERNATIVE_IMPL + // batch* n0*n1*n2*k0*k1*k2 -> batch* n0*k0*n1*k1*n2*k2 + if((arg_parser.get_str("perm") == std::string("0,1,4,2,5,3,6") || + arg_parser.get_str("perm") == std::string("0,1,2,4,5,3,6") || + arg_parser.get_str("perm") == std::string("0,1,3,4,2,5"))) + { + if(arg_parser.get_str("perm") == std::string("0,1,3,4,2,5")) + { + // permute_b_nr_kr_kw_nw_kv = 2, // 0,1,3,4,2,5 + matrix_core_swizzle_traits t; + t.data_type = data_type; + t.permute = arg_parser.get_str("perm"); + + matrix_core_swizzle_args a; + a.p_src = x_buf.GetDeviceBuffer(); + a.p_dst = y_buf.GetDeviceBuffer(); + a.batch = shape[0]; + + auto nr = shape[1]; + auto nw = shape[2]; + auto kr = shape[3]; + auto kw = shape[4]; + auto kv = shape[5]; + a.n = nr * nw; + a.k = kr * kw * kv; + if(kv == 8 && kw == 4 && nw == 16 && nr % 4 == 0 && kr % 8 == 0) + { + t.inst = "16x16x16"; + std::cout << ", matrix_core_swizzle_waveflatten_" << t.inst << std::flush; + + ave_time = matrix_core_swizzle(t, a, stream_config); + } + else if(kv == 8 && kw == 2 && nw == 32 && nr % 4 == 0 && kr % 8 == 0) + { + t.inst = "32x32x8"; + std::cout << ", matrix_core_swizzle_waveflatten_" << t.inst << std::flush; + + ave_time = matrix_core_swizzle(t, a, stream_config); + } + else + { + ave_time = run_permute(); + } + } + else + { + matrix_core_swizzle_traits t; + t.data_type = data_type; + t.permute = arg_parser.get_str("perm"); + + matrix_core_swizzle_args a; + a.p_src = x_buf.GetDeviceBuffer(); + a.p_dst = y_buf.GetDeviceBuffer(); + a.batch = shape[0]; + a.n = shape[1] * shape[2] * shape[3]; + a.k = shape[4] * 
shape[5] * shape[6]; + if(shape[6] == 8 && shape[3] == 32 && shape[5] == 2 && shape[2] == 4 && + shape[4] % 8 == 0 && shape[1] % 2 == 0) + { + // 32x32x8 inst + // perm=0,1,4,2,5,3,6 + // y_shape=*,2x,8x,4,2,32,8 (3,6,16,4,2,32,8) + // shape = *,2x,4,32,8x,2,8 (3,6,4,32,16,2,8) + + t.inst = "32x32x8"; + std::cout << ", matrix_core_swizzle_" << t.inst << std::flush; + + ave_time = matrix_core_swizzle(t, a, stream_config); + } + else if(shape[6] == 8 && shape[3] == 16 && shape[5] == 4 && shape[2] == 4 && + shape[4] % 4 == 0 && shape[1] % 4 == 0) + { + // 16x16x16 inst + // perm=0,1,4,2,5,3,6 + // y_shape=*,4x,4x,4,4,16,8 + // shape = *,4x,4,16,4x,4,8 (3,8,4,16,16,4,8) + t.inst = "16x16x16"; + std::cout << ", matrix_core_swizzle_" << t.inst << std::flush; + + ave_time = matrix_core_swizzle(t, a, stream_config); + } + else + { + ave_time = run_permute(); + } + } + } + else +#endif + { + ave_time = run_permute(); + } + std::cout << ", time:" << ave_time << "ms" << std::flush; + + bool pass = true; + if(do_validation) + { + reference_permute(x, y, perm); +#if 0 + if constexpr (std::is_same_v){ + // using itype = to_integer_type; + fflush(stdout); + for(int zz = 0; zz < static_cast(x.get_element_size()); zz++ ) { + printf("%3.0f ", x.mData[zz]); + } + printf("->\n"); + for(int zz = 0; zz < static_cast(x.get_element_size()); zz++ ) { + printf("%3.0f ", y.mData[zz]); + } + fflush(stdout); + } +#endif + ck_tile::HostTensor y_dev(y.get_lengths()); + + y_buf.FromDevice(y_dev.data()); + + pass = std::equal( + y_dev.begin(), y_dev.end(), y.begin(), [&](const DataType& d, const DataType& h) { + using itype = to_integer_type; + itype i_d = ck_tile::bit_cast(d); + itype i_h = ck_tile::bit_cast(h); + return i_d == i_h; + }); + std::cout << ", valid:" << (pass ? 
"y" : "n") << std::flush; + } + + std::cout << std::endl; + + return pass; +} + +int main(int argc, char* argv[]) +{ + auto [result, arg_parser] = create_args(argc, argv); + if(!result) + return -1; + + const std::string data_type = arg_parser.get_str("prec"); + if(data_type == "fp8") + { + return run(arg_parser) ? 0 : -2; + } + else if(data_type == "fp16") + { + return run(arg_parser) ? 0 : -2; + } + else if(data_type == "fp32") + { + return run(arg_parser) ? 0 : -2; + } + + return -3; +} diff --git a/example/ck_tile/06_permute/permute.hpp b/example/ck_tile/06_permute/permute.hpp new file mode 100644 index 000000000..304da4dc9 --- /dev/null +++ b/example/ck_tile/06_permute/permute.hpp @@ -0,0 +1,19 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck_tile/core.hpp" +#include "ck_tile/host/kernel_launch.hpp" +#include "ck_tile/ops/permute.hpp" +#include + +struct permute_traits +{ + std::string data_type; +}; + +using permute_args = ck_tile::GenericPermuteHostArgs; + +// host API +float permute(permute_traits, permute_args, const ck_tile::stream_config&); diff --git a/example/ck_tile/06_permute/script/smoke_test.sh b/example/ck_tile/06_permute/script/smoke_test.sh new file mode 100644 index 000000000..793e52d2b --- /dev/null +++ b/example/ck_tile/06_permute/script/smoke_test.sh @@ -0,0 +1,34 @@ +#!/bin/sh +# TODO: run this script from CK root +BUILD=build +EXE=$BUILD/bin/tile_example_permute +COMMON_ARGS='-v=1 -warmup=0 -repeat=1' +# mode=0 +# export HIP_VISIBLE_DEVICES=4 +if [ $# -ge 1 ] ; then + set -x +fi + +$EXE -prec=fp16 -shape=3,6,4,32,16,2,8 -perm=0,1,4,2,5,3,6 $COMMON_ARGS +$EXE -prec=fp16 -shape=5,10,4,32,8,2,8 -perm=0,1,4,2,5,3,6 $COMMON_ARGS +$EXE -prec=fp16 -shape=3,8,4,16,16,4,8 -perm=0,1,4,2,5,3,6 $COMMON_ARGS +$EXE -prec=fp16 -shape=3,6,4,32,16,2,8 -perm=0,1,2,4,5,3,6 $COMMON_ARGS +$EXE -prec=fp16 -shape=5,10,4,32,8,2,8 -perm=0,1,2,4,5,3,6 $COMMON_ARGS 
+$EXE -prec=fp16 -shape=3,8,4,16,16,4,8 -perm=0,1,2,4,5,3,6 $COMMON_ARGS +$EXE -prec=fp16 -shape=2,8,16,8,4,8 -perm=0,1,3,4,2,5 $COMMON_ARGS +$EXE -prec=fp16 -shape=1,24,32,16,2,8 -perm=0,1,3,4,2,5 $COMMON_ARGS + +echo "------------------------------------------------------------------" + +for prec in "fp8" "fp16" "fp32" ; do + +$EXE -prec=$prec -shape=3,8 -perm=1,0 $COMMON_ARGS +$EXE -prec=$prec -shape=48,6,8 -perm=2,1,0 $COMMON_ARGS +$EXE -prec=$prec -shape=24,128,3 -perm=0,2,1 $COMMON_ARGS +$EXE -prec=$prec -shape=4,10,7,6 -perm=0,2,3,1 $COMMON_ARGS +$EXE -prec=$prec -shape=8,24,36,10 -perm=3,1,2,0 $COMMON_ARGS +$EXE -prec=$prec -shape=8,1,36,4 -perm=2,1,0,3 $COMMON_ARGS +$EXE -prec=$prec -shape=5,10,16,2,36,4 -perm=4,5,2,1,0,3 $COMMON_ARGS +$EXE -prec=$prec -shape=2,32,8,3,6,2,5,4 -perm=5,2,4,7,1,6,3,0 $COMMON_ARGS +echo "------------------------------------------------------------------" +done diff --git a/example/ck_tile/CMakeLists.txt b/example/ck_tile/CMakeLists.txt index 366fb18a0..c85e31341 100644 --- a/example/ck_tile/CMakeLists.txt +++ b/example/ck_tile/CMakeLists.txt @@ -7,5 +7,6 @@ add_subdirectory(02_layernorm2d) add_subdirectory(03_gemm) add_subdirectory(04_img2col) add_subdirectory(05_reduce) +add_subdirectory(06_permute) add_subdirectory(09_topk_softmax) diff --git a/include/ck_tile/host.hpp b/include/ck_tile/host.hpp index e17d7c22a..a17ce751c 100644 --- a/include/ck_tile/host.hpp +++ b/include/ck_tile/host.hpp @@ -22,6 +22,7 @@ #include "ck_tile/host/reference/reference_gemm.hpp" #include "ck_tile/host/reference/reference_im2col.hpp" #include "ck_tile/host/reference/reference_layernorm2d_fwd.hpp" +#include "ck_tile/host/reference/reference_permute.hpp" #include "ck_tile/host/reference/reference_reduce.hpp" #include "ck_tile/host/reference/reference_softmax.hpp" #include "ck_tile/host/reference/reference_topk.hpp" diff --git a/include/ck_tile/host/reference/reference_permute.hpp b/include/ck_tile/host/reference/reference_permute.hpp new file mode 
100644 index 000000000..1c8248340 --- /dev/null +++ b/include/ck_tile/host/reference/reference_permute.hpp @@ -0,0 +1,57 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck_tile/core.hpp" +#include "ck_tile/host/host_tensor.hpp" +#include +#include +#include + +namespace ck_tile { + +/* + this will do permute + contiguous like functionality in pytorch +*/ +template +CK_TILE_HOST void +reference_permute(const HostTensor& x, HostTensor& y, std::vector dims) +{ + const auto x_len = x.mDesc.get_lengths(); + const auto y_len = y.mDesc.get_lengths(); + assert(x_len.size() == y_len.size()); + index_t rank = x_len.size(); + const auto x_elm = std::accumulate(x_len.begin(), x_len.end(), 1, std::multiplies()); + const auto y_elm = std::accumulate(y_len.begin(), y_len.end(), 1, std::multiplies()); + assert(x_elm == y_elm); + (void)y_elm; + + auto f = [&](auto i_element) { + std::vector y_coord = [&]() { + std::vector tmp(rank, 0); + size_t r = i_element; + for(index_t i = rank - 1; i >= 0; i--) + { + tmp[i] = r % y_len[i]; + r = r / y_len[i]; + } + return tmp; + }(); + + std::vector x_coord = [&]() { + std::vector tmp(rank, 0); + for(index_t i = 0; i < rank; i++) + { + tmp[dims[i]] = y_coord[i]; + } + return tmp; + }(); + + // do permute + y(y_coord) = x(x_coord); + }; + + make_ParallelTensorFunctor(f, x_elm)(std::thread::hardware_concurrency()); +} +} // namespace ck_tile diff --git a/include/ck_tile/ops/permute.hpp b/include/ck_tile/ops/permute.hpp new file mode 100644 index 000000000..ee8c69372 --- /dev/null +++ b/include/ck_tile/ops/permute.hpp @@ -0,0 +1,8 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck_tile/ops/permute/kernel/generic_permute_kernel.hpp" +#include "ck_tile/ops/permute/pipeline/generic_petmute_problem.hpp" +#include "ck_tile/ops/common/tensor_layout.hpp" diff --git a/include/ck_tile/ops/permute/kernel/generic_permute_kernel.hpp b/include/ck_tile/ops/permute/kernel/generic_permute_kernel.hpp new file mode 100644 index 000000000..1c5cc4a11 --- /dev/null +++ b/include/ck_tile/ops/permute/kernel/generic_permute_kernel.hpp @@ -0,0 +1,169 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck_tile/core.hpp" +#include "ck_tile/ops/common.hpp" +// #include "ck_tile/ops/permute/pipeline/generic_petmute_problem.hpp" + +namespace ck_tile { + +/* independent host side argument, no template + */ +struct GenericPermuteHostArgs +{ + static constexpr index_t kMaxRanks = 8; // TODO: hardcoded + + const void* p_src; + void* p_dst; + index_t rank; + index_t shape[kMaxRanks]; // input shape + index_t perm[kMaxRanks]; // permute index +}; + +/* +simulate torch.permute: +x_ = x_.view(x.shape[0], + x.shape[1]//16, 16, + x.shape[2]//32, 4, 8) +x_ = x_.permute(0,1,3,4,2,5) +x_ = x_.contiguous() +x_ = x_.view(x.shape[0], x.shape[1], x.shape[2]);// + +this kernel is supposed not to be performant(just OK), with functional support up to kMaxRanks +dim of permutation, with a single kernel + +*/ +template +struct GenericPermute +{ + using Problem = ck_tile::remove_cvref_t; + + using DataType = remove_cvref_t; + static constexpr index_t kBlockSize = Problem::kBlockSize; + static constexpr index_t kMaxRanks = Problem::kMaxRanks; + static constexpr bool KeepLastDim = Problem::KeepLastDim; + + struct __attribute__((packed)) Kargs + { + const void* p_src; + void* p_dst; + // index_t rank; + index_t num_elements; + index_t perm_length[kMaxRanks]; // tensor length after permutation + index_t perm_stride[kMaxRanks]; // tensor stride after permutation + }; + + 
CK_TILE_HOST static constexpr index_t TotalElements(const GenericPermuteHostArgs& h) + { + index_t n = 1; + for(auto i = 0; i < h.rank; i++) + { + n *= h.shape[i]; + } + return n; + } + + CK_TILE_HOST static constexpr Kargs MakeKargs(const GenericPermuteHostArgs& h) + { + Kargs a; + a.p_src = h.p_src; + a.p_dst = h.p_dst; + + // assert rank <= kMaxRanks + index_t i = 0; + + index_t perm[kMaxRanks]; + index_t x_shape[kMaxRanks]; + index_t x_stride[kMaxRanks]; + // index_t perm_length[kMaxRanks]; + + for(; i < h.rank; i++) + { + x_shape[i] = h.shape[i]; + perm[i] = h.perm[i]; + } + for(; i < kMaxRanks; i++) + { + x_shape[i] = 1; + perm[i] = i; // will index to len = 1 + } + + index_t stride = 1; + for(index_t j = kMaxRanks - 1; j >= 0; j--) + { + x_stride[j] = stride; + stride *= x_shape[j]; + } + + for(index_t j = 0; j < kMaxRanks; j++) + { + a.perm_length[j] = x_shape[perm[j]]; + a.perm_stride[j] = x_stride[perm[j]]; + } + + a.num_elements = TotalElements(h); + return a; + } + + CK_TILE_HOST static constexpr auto GridSize(GenericPermuteHostArgs h) + { + auto total = TotalElements(h); + auto grids = dim3((total + BlockSize() - 1) / BlockSize()); + // printf("### total:%d, grids:%dx%dx%d\n", total, ); + return grids; + } + + CK_TILE_HOST_DEVICE static constexpr auto BlockSize() { return Problem::kBlockSize; } + + CK_TILE_DEVICE void operator()(Kargs kargs) const + { + index_t id = blockIdx.x * BlockSize() + threadIdx.x; + + if(id >= kargs.num_elements) + return; + + const auto perm_length = + generate_tuple([&](auto I) { return kargs.perm_length[I]; }, number{}); + const auto perm_stride = + generate_tuple([&](auto I) { return kargs.perm_stride[I]; }, number{}); + + const DataType* p_src = reinterpret_cast(kargs.p_src); + DataType* p_dst = reinterpret_cast(kargs.p_dst); + + const auto src_view_0 = make_naive_tensor_view( + p_src, perm_length, perm_stride, number<1>{}, number<1>{}); + + const auto src_view = transform_tensor_view( + src_view_0, + 
make_tuple(make_merge_transform(perm_length)), + make_tuple(typename arithmetic_sequence_gen<0, kMaxRanks, 1>::type{}), + make_tuple(sequence<0>{})); + + auto dst_view_0 = make_naive_tensor_view_packed( + p_dst, perm_length, number<1>{}); + + auto dst_view = transform_tensor_view( + dst_view_0, + make_tuple(make_merge_transform(perm_length)), + make_tuple(typename arithmetic_sequence_gen<0, kMaxRanks, 1>::type{}), + make_tuple(sequence<0>{})); + + // TODO: hard code to vector 1 + using vector_t = thread_buffer; + + const auto src_coord = + make_tensor_coordinate(src_view.get_tensor_descriptor(), array{id}); + const auto dst_coord = + make_tensor_coordinate(dst_view.get_tensor_descriptor(), array{id}); + + // printf("src id:%d, os:%d\n", id, src_coord.get_offset()); + // printf("dst id:%d, os:%d\n", id, dst_coord.get_offset()); + + const vector_t x = src_view.template get_vectorized_elements(src_coord, 0); + dst_view.template set_vectorized_elements(dst_coord, 0, x); + } +}; + +} // namespace ck_tile diff --git a/include/ck_tile/ops/permute/pipeline/generic_petmute_problem.hpp b/include/ck_tile/ops/permute/pipeline/generic_petmute_problem.hpp new file mode 100644 index 000000000..e504ed747 --- /dev/null +++ b/include/ck_tile/ops/permute/pipeline/generic_petmute_problem.hpp @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck_tile/core/utility/type_traits.hpp" + +namespace ck_tile { + +template +struct GenericPermuteProblem +{ + using DataType = remove_cvref_t; + static constexpr index_t kBlockSize = kBlockSize_; + static constexpr index_t kMaxRanks = kMaxRanks_; + /* KeepLastDim: + * if last dim keep the same? this can help enable vector load + * permute(0, 2, 4, 1, 3, 5) -> true + * permute(0, 3, 2, 1) -> false + */ + static constexpr bool KeepLastDim = KeepLastDim_; + // TODO: not used(?) 
+}; + +} // namespace ck_tile -- GitLab From 4d7e063a0a2dfb183bc3876b1ff021829aabd38b Mon Sep 17 00:00:00 2001 From: valarLip <103567126+valarLip@users.noreply.github.com> Date: Tue, 29 Oct 2024 18:19:29 +0800 Subject: [PATCH 023/153] [CK_TILE] add scatter_gather (#1609) --- include/ck_tile/core.hpp | 1 + .../core/algorithm/coordinate_transform.hpp | 104 +++++++ .../core/algorithm/indexing_adaptor.hpp | 60 ++++ test/CMakeLists.txt | 1 + test/scatter_gather/CMakeLists.txt | 2 + test/scatter_gather/scatter_gather.cpp | 276 ++++++++++++++++++ 6 files changed, 444 insertions(+) create mode 100644 include/ck_tile/core/algorithm/indexing_adaptor.hpp create mode 100644 test/scatter_gather/CMakeLists.txt create mode 100644 test/scatter_gather/scatter_gather.cpp diff --git a/include/ck_tile/core.hpp b/include/ck_tile/core.hpp index 56dfbd636..14991d375 100644 --- a/include/ck_tile/core.hpp +++ b/include/ck_tile/core.hpp @@ -5,6 +5,7 @@ #include "ck_tile/core/algorithm/cluster_descriptor.hpp" #include "ck_tile/core/algorithm/coordinate_transform.hpp" +#include "ck_tile/core/algorithm/indexing_adaptor.hpp" #include "ck_tile/core/algorithm/space_filling_curve.hpp" #include "ck_tile/core/arch/amd_buffer_addressing.hpp" #include "ck_tile/core/arch/arch.hpp" diff --git a/include/ck_tile/core/algorithm/coordinate_transform.hpp b/include/ck_tile/core/algorithm/coordinate_transform.hpp index 5c7e48980..aaa7db257 100644 --- a/include/ck_tile/core/algorithm/coordinate_transform.hpp +++ b/include/ck_tile/core/algorithm/coordinate_transform.hpp @@ -23,6 +23,7 @@ enum struct coord_transform_enum replicate, xor_t, offset, + indexing, }; template @@ -1526,6 +1527,88 @@ struct offset : public base_transform<1, 1> } }; +template +struct indexing : public base_transform<1, 1> +{ + static constexpr index_t NDimUp = 1; + + using LowerIndex = multi_index<1>; + using UpperIndex = multi_index<1>; + + using UpLengths = decltype(make_tuple(UpLength{})); + UpLengths up_lengths_; + IndexingAdaptor 
iadaptor_; + + CK_TILE_HOST_DEVICE constexpr indexing() = default; + + CK_TILE_HOST_DEVICE constexpr indexing(const UpLength& up_length, + const IndexingAdaptor& iadaptor) + : up_lengths_{make_tuple(up_length)}, iadaptor_{iadaptor} + { + } + + CK_TILE_HOST_DEVICE static constexpr auto get_type_enum() + { + return coord_transform_enum::indexing; + } + + CK_TILE_HOST_DEVICE constexpr const auto& get_upper_lengths() const { return up_lengths_; } + + template + CK_TILE_HOST_DEVICE constexpr void calculate_lower_index(LowIdx& idx_low, + const UpIdx& idx_up) const + { + static_assert(LowIdx::size() == 1 && UpIdx::size() == NDimUp, + "wrong! inconsistent # of dimension"); + iadaptor_.calculate_lower_index(idx_low, idx_up); + } + + template + CK_TILE_HOST_DEVICE void update_lower_index(LowIdxDiff& idx_diff_low, + const UpIdxDiff& idx_diff_up, + LowIdx& idx_low, + const UpIdx& idx_up) const + { + // TODO: nonthing changed here + static_assert(LowIdxDiff::size() == 1 && UpIdxDiff::size() == NDimUp && + LowIdx::size() == 1 && UpIdx::size() == NDimUp, + "wrong! 
inconsistent # of dimension"); + + iadaptor_.update_lower_index(idx_diff_low, idx_diff_up, idx_low, idx_up); + } + + CK_TILE_HOST_DEVICE static constexpr bool + is_valid_upper_index_always_mapped_to_valid_lower_index() + { + return true; + } + + template + CK_TILE_HOST_DEVICE static constexpr bool + is_valid_upper_index_mapped_to_valid_lower_index(const UpIdx& /* idx_up */) + { + return true; + } + + CK_TILE_HOST_DEVICE static constexpr bool is_known_at_compile_time() + { + return ck_tile::is_known_at_compile_time::value && + IndexingAdaptor::is_known_at_compile_time(); + } + + CK_TILE_HOST_DEVICE void print() const + { + printf("embed{"); + + // + printf("up_lengths_: "); + print(up_lengths_); + printf(", "); + + printf("}"); + } +}; + //******************************************************************************************************* template @@ -1646,3 +1729,24 @@ CK_TILE_HOST_DEVICE constexpr auto make_offset_transform(const LowLength& low_le } } // namespace ck_tile + +#include "ck_tile/core/algorithm/indexing_adaptor.hpp" +namespace ck_tile { + +template +CK_TILE_HOST_DEVICE constexpr auto make_indexing_transform(const UpLength& up_lengths, + const Indices& indices) +{ + // by default we use the simplest one + return indexing>>{ + up_lengths, indexing_adaptor_onshot_cached>{indices}}; +} + +template +CK_TILE_HOST_DEVICE constexpr auto +make_indexing_transform_with_adaptor(const UpLength& up_lengths, const IndexingAdaptor& iadaptor) +{ + return indexing{up_lengths, iadaptor}; +} + +} // namespace ck_tile diff --git a/include/ck_tile/core/algorithm/indexing_adaptor.hpp b/include/ck_tile/core/algorithm/indexing_adaptor.hpp new file mode 100644 index 000000000..ef59abdc9 --- /dev/null +++ b/include/ck_tile/core/algorithm/indexing_adaptor.hpp @@ -0,0 +1,60 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck_tile/core/config.hpp" +#include "ck_tile/core/container/multi_index.hpp" +#include "ck_tile/core/container/container_helper.hpp" +#include "ck_tile/core/utility/functional.hpp" +#include "ck_tile/core/utility/type_traits.hpp" + +namespace ck_tile { +// pre-defined indexing adaptor used for indexing(scatter/gather) + +// this version cache the index inside thread register(which is also prefered in real senario) +// however it's user's responsibility that each thread only provide one indexing, which means +// move coordinate will not change on this dim +template +struct indexing_adaptor_onshot_cached +{ + + CK_TILE_HOST_DEVICE constexpr indexing_adaptor_onshot_cached() = default; + CK_TILE_HOST_DEVICE constexpr indexing_adaptor_onshot_cached(const IndexingType& idx) + : cached_idx_(idx) + { + } + IndexingType cached_idx_; + + template + CK_TILE_HOST_DEVICE constexpr void calculate_lower_index(LowIdx& idx_low, + const UpIdx& /*idx_up*/) const + { + static_assert(LowIdx::size() == 1 && UpIdx::size() == 1, + "wrong! inconsistent # of dimension"); + + idx_low(number<0>{}) = cached_idx_; + } + + template + CK_TILE_HOST_DEVICE void update_lower_index(LowIdxDiff& idx_diff_low, + const UpIdxDiff& idx_diff_up, + LowIdx& /*idx_low*/, + const UpIdx& /*idx_up*/) const + { + // TODO: nonthing changed here + static_assert(LowIdxDiff::size() == 1 && UpIdxDiff::size() == 1 && LowIdx::size() == 1 && + UpIdx::size() == 1, + "wrong! 
inconsistent # of dimension"); + + idx_diff_low(number<0>{}) = idx_diff_up[number<0>{}]; + + // pass the diff to lower, but not changing the actually index + } + + CK_TILE_HOST_DEVICE static constexpr bool is_known_at_compile_time() + { + return ck_tile::is_known_at_compile_time::value; + } +}; +} // namespace ck_tile diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index b836dd687..b12ced524 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -210,3 +210,4 @@ if(SUPPORTED_GPU_TARGETS MATCHES "gfx942" AND CK_HIP_VERSION_MAJOR GREATER_EQUAL add_subdirectory(smfmac_op) endif() add_subdirectory(position_embedding) +add_subdirectory(scatter_gather) diff --git a/test/scatter_gather/CMakeLists.txt b/test/scatter_gather/CMakeLists.txt new file mode 100644 index 000000000..cc327d42d --- /dev/null +++ b/test/scatter_gather/CMakeLists.txt @@ -0,0 +1,2 @@ +add_test_executable(test_scatter_gather scatter_gather.cpp) +# target_compile_options(test_scatter_gather PRIVATE -v --save-temps -Wno-gnu-line-marker) diff --git a/test/scatter_gather/scatter_gather.cpp b/test/scatter_gather/scatter_gather.cpp new file mode 100644 index 000000000..439e792dd --- /dev/null +++ b/test/scatter_gather/scatter_gather.cpp @@ -0,0 +1,276 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ck_tile/core.hpp" + +#ifndef TEST_SCATTER_GATHER_VERBOSE +#define TEST_SCATTER_GATHER_VERBOSE 1 +#endif + +#define HIP_CALL(call) \ + do \ + { \ + hipError_t err = call; \ + if(err != hipSuccess) \ + { \ + printf("[hiperror](%d) fail to call %s", static_cast(err), #call); \ + exit(0); \ + } \ + } while(0) + +/* +TODO: +This is a simple design of scatter/gather through indexing transform, with limitations +We may design a scatter/gather adaptor layer directly inside tile window +*/ +template +__global__ void row_scatter_gather(const INDEX_BUF_TYPE* src_row_idx_ptr, + const INDEX_BUF_TYPE* dst_row_idx_ptr, + const DATA_TYPE* src_ptr, + DATA_TYPE* dst_ptr, + ck_tile::index_t n_row_total, + ck_tile::index_t /*n_row_select*/, + ck_tile::index_t n_cols) +{ + using namespace ck_tile; + + // some constexpr vars + constexpr index_t vec = ALIGNMENT; + static_assert(COL_TILE_SIZE % vec == 0); + constexpr index_t col_lanes = COL_TILE_SIZE / vec; + constexpr index_t warp_size = ck_tile::get_warp_size(); + static_assert(warp_size % col_lanes == 0); + constexpr index_t row_lanes = warp_size / col_lanes; + constexpr index_t num_warps = BLOCK_SIZE / warp_size; + static_assert(ROW_TILE_SIZE % (num_warps * row_lanes) == 0); + constexpr index_t row_repeat = ROW_TILE_SIZE / (num_warps * row_lanes); + static_assert( + row_repeat == 1, + "currently indexing not support(and would be not performant) if row_repeat has more"); + + // tile partitioner + index_t tile_col_idx = 0; + index_t tile_row_idx = blockIdx.x * ROW_TILE_SIZE; + + // create our tild distribution, which tell us the location of different threads + constexpr auto src_dist = make_static_tile_distribution( + tile_distribution_encoding< + sequence<1>, + tuple, sequence>, + tuple, sequence<1, 2>>, + tuple, sequence<2, 0>>, + sequence<1, 2>, + sequence<0, 1>>{}); + const auto coord = src_dist.calculate_index(); + const auto row_coord = 
coord[number<0>{}] + tile_row_idx; + + // load the current row index from the indexing buffer. we do not use ck_tile utility here + INDEX_BUF_TYPE src_row_id = src_row_idx_ptr[row_coord]; + INDEX_BUF_TYPE dst_row_id = dst_row_idx_ptr[row_coord]; + + // printf("-- tid:%d, src_row_id:%d, dst_row_id:%d\n", static_cast(threadIdx.x), + // static_cast(src_row_id), static_cast(dst_row_id)); + + const auto src_view = + make_naive_tensor_view(src_ptr, + make_tuple(n_row_total, n_cols), + make_tuple(n_cols, 1), + number{}, // alignement + number<1>{}); + + const auto src_gather_view = transform_tensor_view( + src_view, + make_tuple(make_indexing_transform( + n_row_total, + src_row_id), // here we replace row_idx which is loaded from another buffer + make_pass_through_transform(n_cols)), + make_tuple(sequence<0>{}, sequence<1>{}), + make_tuple(sequence<0>{}, sequence<1>{})); + + auto src_tile = make_tile_window(src_gather_view, + make_tuple(number{}, number{}), + {tile_row_idx, tile_col_idx}, + src_dist); + + const auto dst_view = + make_naive_tensor_view(dst_ptr, + make_tuple(n_row_total, n_cols), + make_tuple(n_cols, 1), + number{}, + number<1>{}); + + const auto dst_scatter_view = transform_tensor_view( + dst_view, + make_tuple(make_indexing_transform( + n_row_total, + dst_row_id), // here we replace row_idx which is loaded from another buffer + make_pass_through_transform(n_cols)), + make_tuple(sequence<0>{}, sequence<1>{}), + make_tuple(sequence<0>{}, sequence<1>{})); + + auto dst_tile = make_tile_window(dst_scatter_view, + make_tuple(number{}, number{}), + {tile_row_idx, tile_col_idx}, + src_dist /*reuse distribution*/); + + // we finished descriptor construction and index calculation, now start load/store + for(auto i = 0; i < n_cols; i += COL_TILE_SIZE) + { + // note that scatter/gather are just the same API when doing load store as normal memory + // operation + auto data = load_tile(src_tile); + store_tile(dst_tile, data); + + move_tile_window(src_tile, 
{number<0>{}, number{}}); + move_tile_window(dst_tile, {number<0>{}, number{}}); + } +} + +union pixel +{ + struct __attribute__((packed)) + { + unsigned int r : 6; + unsigned int c : 10; + }; + ushort data; +}; + +struct unique_linear_rand +{ + unique_linear_rand(int capacity_) : capacity(capacity_) {} + std::unordered_set set; + int gen() + { + if(static_cast(set.size()) >= capacity) + { + printf("overflow, but will give you an number as well\n"); + return std::rand() % capacity; + } + while(1) + { + int r = std::rand() % capacity; + if(set.count(r) == 1) + { + continue; + } + set.insert(r); + return r; + } + } + + int capacity; +}; + +int main() +{ + int row_total = 64; + int row_select = 8 * 2; + int col = 256 * 2; + using fp16_t = ck_tile::fp16_t; + + constexpr int row_tile = 8; + constexpr int col_tile = 256; + + fp16_t* src = reinterpret_cast(malloc(row_total * col * sizeof(fp16_t))); + for(int i_r = 0; i_r < row_total; i_r++) + { + for(int i_c = 0; i_c < col; i_c++) + { + int i = i_r * col + i_c; + pixel p; + p.r = i_r; + p.c = i_c; + ushort d = p.data; + src[i] = ck_tile::bit_cast(d); // for simplicity, just cast + } + } + + fp16_t* dst = reinterpret_cast(malloc(row_total * col * sizeof(fp16_t))); + int* src_idx = reinterpret_cast(malloc(row_select * sizeof(int))); + int* dst_idx = reinterpret_cast(malloc(row_select * sizeof(int))); + // std::srand(std::time(std::nullptr)); + // std::srand(11935); + std::srand(std::time(nullptr)); + auto src_gen = unique_linear_rand(row_total); + auto dst_gen = unique_linear_rand(row_total); // dst index must be unique. 
src is fine + for(int i_r = 0; i_r < row_select; i_r++) + { + src_idx[i_r] = src_gen.gen(); + dst_idx[i_r] = dst_gen.gen(); + } + + void* dev_src; + void* dev_dst; + void* dev_src_idx; + void* dev_dst_idx; + HIP_CALL(hipMalloc(&dev_src, row_total * col * sizeof(fp16_t))); + HIP_CALL(hipMalloc(&dev_dst, row_total * col * sizeof(fp16_t))); + HIP_CALL(hipMalloc(&dev_src_idx, row_select * sizeof(int))); + HIP_CALL(hipMalloc(&dev_dst_idx, row_select * sizeof(int))); + + HIP_CALL(hipMemcpy(dev_src, src, row_total * col * sizeof(fp16_t), hipMemcpyHostToDevice)); + HIP_CALL(hipMemcpy(dev_src_idx, src_idx, row_select * sizeof(int), hipMemcpyHostToDevice)); + HIP_CALL(hipMemcpy(dev_dst_idx, dst_idx, row_select * sizeof(int), hipMemcpyHostToDevice)); + + constexpr int bdim = 256; + int gdim = (row_select + row_tile - 1) / row_tile; + row_scatter_gather<<>>(reinterpret_cast(dev_src_idx), + reinterpret_cast(dev_dst_idx), + reinterpret_cast(dev_src), + reinterpret_cast(dev_dst), + row_total, + row_select, + col); + + HIP_CALL(hipMemcpy(dst, dev_dst, row_total * col * sizeof(fp16_t), hipMemcpyDeviceToHost)); + +#if TEST_SCATTER_GATHER_VERBOSE + printf("select row:"); + for(int i_r = 0; i_r < row_select; i_r++) + { + printf("%d->%d->%d ", i_r, src_idx[i_r], dst_idx[i_r]); + } + printf("\n"); +#endif + + int err_cnt = 0; + for(int i_r = 0; i_r < row_select; i_r++) + { + for(int i_c = 0; i_c < col; i_c++) + { + int i = dst_idx[i_r] * col + i_c; + pixel p = ck_tile::bit_cast(dst[i]); + bool is_ok = p.r == src_idx[i_r] && p.c == i_c; + if(!is_ok) + { + if(i_c == 0) + printf("(%d)pixel: %dx%d -> %d\n", i_r, p.r, p.c, dst_idx[i_r]); + err_cnt++; + } + } + } +#if TEST_SCATTER_GATHER_VERBOSE + printf("err:%d\n", err_cnt); +#endif + + free(src); + free(dst); + free(src_idx); + free(dst_idx); + return err_cnt == 0 ? 
0 : -1; +} -- GitLab From 863222181477ff42e809d034428f9160490a63ba Mon Sep 17 00:00:00 2001 From: Qianfeng Date: Wed, 30 Oct 2024 14:03:16 +0800 Subject: [PATCH 024/153] [CK_TILE] Add fmha fwd headdim96 support (#1608) * Add ceil_to_qualified_tile_length() * Rename kK0BlockLength to kQKHeaddim * Add kSubQKHeaddim concept to support headdim96 * Fix in math.hpp to avoid using __half interfaces * Add LdsBufferSequence instance for headdim96 * Update in fmha_fwd/fmha_fwd_splitkv codegen to support hd96 testing * Disable hd96 instance generation in codegen fmha_fwd and fmha_fwd_splitkv to save compiling time * Reformat one file * Fix text alignment in fmha_fwd_splitkv.py --------- Co-authored-by: Po Yen Chen --- .../ck_tile/01_fmha/codegen/ops/fmha_fwd.py | 41 +++++++++++------- .../01_fmha/codegen/ops/fmha_fwd_splitkv.py | 42 ++++++++++++------- include/ck_tile/core/numeric/math.hpp | 12 +++--- .../ops/fmha/kernel/fmha_fwd_kernel.hpp | 8 ++-- .../fmha/kernel/fmha_fwd_splitkv_kernel.hpp | 8 ++-- ...ock_fmha_fwd_splitkv_pipeline_qr_ks_vs.hpp | 23 +++++----- .../pipeline/block_fmha_pipeline_qr_ks_vs.hpp | 23 +++++----- .../block_fmha_pipeline_qr_ks_vs_async.hpp | 23 +++++----- .../block_fmha_pipeline_qr_ks_vs_fp8.hpp | 22 +++++----- .../pipeline/block_fmha_pipeline_qs_ks_vs.hpp | 23 +++++----- ...k_fmha_pipeline_qx_ks_vs_custom_policy.hpp | 15 ++++--- .../ops/fmha/pipeline/tile_fmha_shape.hpp | 20 ++++++++- 12 files changed, 153 insertions(+), 107 deletions(-) diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py index 805803fed..e5ee1d22e 100644 --- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py +++ b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py @@ -21,6 +21,14 @@ DTYPE_BITS = { "bf8" : 8 } +K0_MAX_SUBMAX_MAP = { + 32 : 32, + 64 : 64, + 96 : 128, + 128: 128, + 256: 256 +} + TILE_PARTITIONER_MAP = { "shb" : "ck_tile::FmhaFwdTilePartitioner_SHB", "hbs" : "ck_tile::FmhaFwdTilePartitioner_HBS", @@ -35,7 +43,7 @@ 
FMHA_FWD_KERNEL_HEADER = """// SPDX-License-Identifier: MIT FMHA_FWD_KERNEL_BODY=""" using fmha_dtype_{F_idx} = {F_dtype}; -using fmha_block_tile_{F_idx} = ck_tile::sequence<{F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0blen}>; +using fmha_block_tile_{F_idx} = ck_tile::sequence<{F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0max}>; using fmha_warp_tile_{F_idx} = ck_tile::sequence<{F_wm}, {F_wn}, {F_wk}>; using fmha_shape_{F_idx} = ck_tile::TileFmhaShape; -using trait_{F_idx} = fmha_fwd_traits_<{F_hdim}, {F_dtype}, {F_mode},{F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0blen}, {F_vlayout}, +using trait_{F_idx} = fmha_fwd_traits_<{F_hdim}, {F_dtype}, {F_mode},{F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0max}, {F_vlayout}, {F_pipeline_enum}, fmha_mask_{F_idx}, {F_bias}, {F_lse}, {F_dropout}, {F_squant}, {F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}>; #include @@ -125,7 +133,7 @@ FMHA_FWD_API_PER_HDIM_CASE=""" {F_if} (t.hdim_q <= {F_hdim} && t.hdim_v < FMHA_FWD_API_INNER_DISPATCH=""" {F_if}((t.is_group_mode == {F_mode}) && (t.is_v_rowmajor == {F_vlayout}) && ({F_mask_check}) && (t.bias_type == {F_bias_check}) && (t.has_lse == {F_lse}) && (t.has_dropout == {F_dropout}) && (t.do_fp8_static_quant == {F_squant}) && ({F_scheck}) && ({F_skcheck}) && ({F_dcheck}) && ({F_dvcheck})) {{ - using trait_ = fmha_fwd_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0blen}, {F_vlayout}, {F_pipeline_enum}, {F_mask}, {F_bias}, {F_lse}, {F_dropout}, {F_squant}, {F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}>; + using trait_ = fmha_fwd_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0max}, {F_vlayout}, {F_pipeline_enum}, {F_mask}, {F_bias}, {F_lse}, {F_dropout}, {F_squant}, {F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}>; return fmha_fwd_(s, a); }} """ @@ -142,7 +150,7 @@ class FmhaFwdApiTrait: bk0 : int # tile size along qk gemm unroll bn1 : int # tile size along v head_dim bk1 : int # tile size along kv 
gemm unroll - bk0blen : int + bk0max : int vlayout : str mask : str bias : str # @@ -156,7 +164,7 @@ class FmhaFwdApiTrait: @property def name(self) -> str: - return f'{self.hdim}-{self.dtype}-{self.mode}-{self.bm0}-{self.bn0}-{self.bk0}-{self.bn0}-{self.bk1}-{self.bk0blen}-'+\ + return f'{self.hdim}-{self.dtype}-{self.mode}-{self.bm0}-{self.bn0}-{self.bk0}-{self.bn0}-{self.bk1}-{self.bk0max}-'+\ f'{self.vlayout}-{self.mask}-{self.bias}-{self.lse}-{self.dropout}-{self.squant}-{self.spad}-{self.skpad}-{self.dpad}-{self.dvpad}' @property @@ -188,8 +196,9 @@ class FmhaFwdApiTrait: if self.dpad == 't': return f'a.hdim_q % {vec} == 0' else : assert False elif self.pipeline_tag in ['qr']: - if self.dpad == 't': return f'true /*a.hdim_q % {self.bk0blen} != 0*/' # TODO: order of get_pipelines() matters! (ugly) - else : return f'a.hdim_q % {self.bk0blen} == 0' + bk0submax = K0_MAX_SUBMAX_MAP[self.bk0max] + if self.dpad == 't': return f'true /*a.hdim_q % {bk0submax} != 0*/' # TODO: order of get_pipelines() matters! (ugly) + else : return f'a.hdim_q % {bk0submax} == 0' else: assert False @property @@ -199,8 +208,9 @@ class FmhaFwdApiTrait: if self.dvpad == 't': return f'a.hdim_v % {vec} == 0' else : assert False elif self.pipeline_tag in ['qr']: - if self.dvpad == 't': return f'true /*a.hdim_v % {self.bk0blen} != 0*/' # TODO: order of get_pipelines() matters! (ugly) - else : return f'a.hdim_v % {self.bk0blen} == 0' + bk0submax = K0_MAX_SUBMAX_MAP[self.bk0max] + if self.dvpad == 't': return f'true /*a.hdim_v % {bk0submax} != 0*/' # TODO: order of get_pipelines() matters! 
(ugly) + else : return f'a.hdim_v % {bk0submax} == 0' else: assert False @dataclass @@ -271,7 +281,7 @@ class FmhaFwdApiPool: F_lse=BOOL_MAP[trait.lse], F_dropout=BOOL_MAP[trait.dropout] , F_squant=BOOL_MAP[trait.squant], F_scheck=trait.scheck, F_skcheck=trait.skcheck, F_dcheck=trait.dcheck, F_dvcheck=trait.dvcheck, F_spad=BOOL_MAP[trait.spad], F_skpad=BOOL_MAP[trait.skpad], F_dpad=BOOL_MAP[trait.dpad], F_dvpad=BOOL_MAP[trait.dvpad], - F_bm0=trait.bm0, F_bn0=trait.bn0, F_bk0=trait.bk0, F_bn1=trait.bn1, F_bk1=trait.bk1, F_bk0blen=trait.bk0blen, + F_bm0=trait.bm0, F_bn0=trait.bn0, F_bk0=trait.bk0, F_bn1=trait.bn1, F_bk1=trait.bk1, F_bk0max=trait.bk0max, F_hdim=hdim, F_dtype=DTYPE_MAP[dtype]) if_j = 'if' if j == 0 else 'else if' per_hdim_case = per_hdim_case + FMHA_FWD_API_PER_HDIM_CASE.format(F_if=if_j, F_hdim=hdim, F_inner_dispatch=inners) @@ -289,7 +299,7 @@ class FmhaFwdTileSize: F_bk0 : int # tile size along qk gemm unroll F_bn1 : int # tile size along v head_dim F_bk1 : int # tile size along kv gemm unroll - F_bk0blen : int # total length of K0, used for pipeline that need load Q at once (or repeately load Q as a whole tile) + F_bk0max : int # total length of K0, used for pipeline that need load Q at once (or repeately load Q as a whole tile) F_rm0 : int # number of warps for gemm0 along q seqlen F_rn0 : int # number of warps for gemm0 along k seqlen F_rk0 : int # number of warps for gemm0 along head dim q (not used) @@ -302,7 +312,7 @@ class FmhaFwdTileSize: F_occupancy : int # occupancy, -1 will let pipeline decide the occupancy, other value will overwrite occupancy @property def name(self) -> str: - return f"b{self.F_bm0}x{self.F_bn0}x{self.F_bk0}x{self.F_bn1}x{self.F_bk1}x{self.F_bk0blen}" +\ + return f"b{self.F_bm0}x{self.F_bn0}x{self.F_bk0}x{self.F_bn1}x{self.F_bk1}x{self.F_bk0max}" +\ f"_r{self.F_rm0}x{self.F_rn0}x{self.F_rk0}_r{self.F_rm1}x{self.F_rn1}x{self.F_rk1}" +\ f"_w{self.F_wm}x{self.F_wn}x{self.F_wk}" + ("" if self.F_occupancy == -1 else 
f"_o{self.F_occupancy}") @@ -335,7 +345,7 @@ class FmhaFwdKernel: F_bk0 = self.F_tile.F_bk0, F_bn1 = self.F_tile.F_bn1, F_bk1 = self.F_tile.F_bk1, - F_bk0blen = self.F_tile.F_bk0blen, + F_bk0max = self.F_tile.F_bk0max, F_rm0 = self.F_tile.F_rm0, F_rn0 = self.F_tile.F_rn0, F_rk0 = self.F_tile.F_rk0, @@ -382,7 +392,7 @@ class FmhaFwdKernel: bk0=self.F_tile.F_bk0, bn1=self.F_tile.F_bn1, bk1=self.F_tile.F_bk1, - bk0blen=self.F_tile.F_bk0blen, + bk0max=self.F_tile.F_bk0max, vlayout=self.F_pipeline.F_vlayout, mask=self.F_pipeline.F_mask, bias=self.F_pipeline.F_bias, @@ -401,6 +411,7 @@ def get_fmha_fwd_tile_dict_from_dtype(dtype : str) -> Optional[dict]: return { '32' : FmhaFwdTileSize(128, 64, 16, 32, 32, 32, 2, 1, 1, 2, 1, 1, 32, 32, 16, -1), '64' : FmhaFwdTileSize(128, 64, 32, 64, 32, 64, 4, 1, 1, 4, 1, 1, 32, 32, 16, -1), + ## '96' : FmhaFwdTileSize(128, 128, 32, 128, 32, 96, 4, 1, 1, 4, 1, 1, 32, 32, 16, -1), '128' : FmhaFwdTileSize(128, 128, 32, 128, 32, 128, 4, 1, 1, 4, 1, 1, 32, 32, 16, -1), '256' : FmhaFwdTileSize(128, 128, 32, 256, 32, 256, 4, 1, 1, 4, 1, 1, 32, 32, 16, -1), } @@ -510,4 +521,4 @@ def list_blobs(file_path : Path, kernel_filter : Optional[str], receipt, mask_im _, kernels = get_fwd_blobs(kernel_filter, receipt, mask_impl) for kernel in kernels: f.write(str(file_path.parent / GEN_DIR / kernel.filename) + "\n") - f.write(str(file_path.parent / GEN_DIR / FMHA_FWD_API_FILENAME) + "\n") \ No newline at end of file + f.write(str(file_path.parent / GEN_DIR / FMHA_FWD_API_FILENAME) + "\n") diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py index 46c26b22c..b084e9d0f 100644 --- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py +++ b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py @@ -29,6 +29,14 @@ DTYPE_BITS = { "bf8" : 8 } +K0_MAX_SUBMAX_MAP = { + 32 : 32, + 64 : 64, + 96 : 128, + 128: 128, + 256: 256 +} + FMHA_FWD_SPLITKV_PIPELINE_MAP = { "qr" : 
"ck_tile::BlockFmhaFwdSplitKVPipelineQRKSVS", "qr_async" : "ck_tile::BlockFmhaFwdSplitKVPipelineQRKSVSAsync", @@ -41,7 +49,7 @@ using fmha_mask_{F_idx} = {F_mask}; namespace {{ template struct kernel_runner {{ -using fmha_block_tile = ck_tile::sequence<{F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0blen}>; +using fmha_block_tile = ck_tile::sequence<{F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0max}>; using fmha_warp_tile = ck_tile::sequence<{F_wm}, {F_wn}, {F_wk}>; using fmha_shape = ck_tile::TileFmhaShape; @@ -241,7 +249,7 @@ float fmha_fwd_splitkv(fmha_fwd_splitkv_traits t, fmha_fwd_splitkv_args a, const FMHA_FWD_SPLITKV_API_INNER_DISPATCH=""" {F_if}((t.is_group_mode == {F_mode}) && (t.is_v_rowmajor == {F_vlayout}) && ({F_mask_check}) && (t.bias_type == {F_bias_check}) && (t.has_lse == {F_lse}) && (t.do_fp8_static_quant == {F_squant}) && ((a.block_table_ptr != nullptr) == {F_pagedkv}) && ({F_scheck}) && ({F_skcheck}) && ({F_dcheck}) && ({F_dvcheck})) {{ - using traits_ = fmha_fwd_splitkv_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0blen}, {F_vlayout}, {F_pipeline_enum}, {F_mask}, {F_bias}, {F_lse}, {F_squant}, {F_pagedkv}, {F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}>; + using traits_ = fmha_fwd_splitkv_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0max}, {F_vlayout}, {F_pipeline_enum}, {F_mask}, {F_bias}, {F_lse}, {F_squant}, {F_pagedkv}, {F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}>; using traits2_ = fmha_fwd_splitkv_combine_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_bm0}/2, {F_bn1}/2, {F_lse}, {F_squant}, {F_spad}, {F_dvpad}>; return fmha_fwd_splitkv_(s, a); @@ -260,7 +268,7 @@ class FmhaFwdSplitKVApiTrait: bk0 : int # tile size along qk gemm unroll bn1 : int # tile size along v head_dim bk1 : int # tile size along kv gemm unroll - bk0blen : int + bk0max : int vlayout : str mask : str bias : str # @@ -270,11 +278,11 @@ class FmhaFwdSplitKVApiTrait: skpad : str dpad : str 
dvpad : str - pagedkv : str + pagedkv : str @property def name(self) -> str: - return f'{self.hdim}-{self.dtype}-{self.mode}-{self.bm0}-{self.bn0}-{self.bk0}-{self.bn0}-{self.bk1}-{self.bk0blen}-'+\ + return f'{self.hdim}-{self.dtype}-{self.mode}-{self.bm0}-{self.bn0}-{self.bk0}-{self.bn0}-{self.bk1}-{self.bk0max}-'+\ f'{self.vlayout}-{self.mask}-{self.bias}-{self.lse}-{self.squant}-{self.spad}-{self.skpad}-{self.dpad}-'+\ f'{self.dvpad}-{self.pagedkv}' @@ -307,8 +315,9 @@ class FmhaFwdSplitKVApiTrait: if self.dpad == 't': return f'a.hdim_q % {vec} == 0' else : assert False elif self.pipeline_tag in ['qr']: - if self.dpad == 't': return f'true /*a.hdim_q % {self.bk0blen} != 0*/' # TODO: order of get_pipelines() matters! (ugly) - else : return f'a.hdim_q % {self.bk0blen} == 0' + bk0submax = K0_MAX_SUBMAX_MAP[self.bk0max] + if self.dpad == 't': return f'true /*a.hdim_q % {bk0submax} != 0*/' # TODO: order of get_pipelines() matters! (ugly) + else : return f'a.hdim_q % {bk0submax} == 0' else: assert False @property @@ -318,8 +327,9 @@ class FmhaFwdSplitKVApiTrait: if self.dvpad == 't': return f'a.hdim_v % {vec} == 0' else : assert False elif self.pipeline_tag in ['qr']: - if self.dvpad == 't': return f'true /*a.hdim_v % {self.bk0blen} != 0*/' # TODO: order of get_pipelines() matters! (ugly) - else : return f'a.hdim_v % {self.bk0blen} == 0' + bk0submax = K0_MAX_SUBMAX_MAP[self.bk0max] + if self.dvpad == 't': return f'true /*a.hdim_v % {bk0submax} != 0*/' # TODO: order of get_pipelines() matters! 
(ugly) + else : return f'a.hdim_v % {bk0submax} == 0' else: assert False @dataclass @@ -414,7 +424,7 @@ class FmhaFwdSplitKVApiPool: F_lse=BOOL_MAP[trait.lse], F_squant=BOOL_MAP[trait.squant], F_pagedkv=BOOL_MAP[trait.pagedkv], F_scheck=trait.scheck, F_skcheck=trait.skcheck, F_dcheck=trait.dcheck, F_dvcheck=trait.dvcheck, F_spad=BOOL_MAP[trait.spad], F_skpad=BOOL_MAP[trait.skpad], F_dpad=BOOL_MAP[trait.dpad], F_dvpad=BOOL_MAP[trait.dvpad], - F_bm0=trait.bm0, F_bn0=trait.bn0, F_bk0=trait.bk0, F_bn1=trait.bn1, F_bk1=trait.bk1, F_bk0blen=trait.bk0blen, + F_bm0=trait.bm0, F_bn0=trait.bn0, F_bk0=trait.bk0, F_bn1=trait.bn1, F_bk1=trait.bk1, F_bk0max=trait.bk0max, F_hdim=hdim, F_dtype=DTYPE_MAP[dtype]) if_j = 'if' if j == 0 else 'else if' per_hdim_case = per_hdim_case + FMHA_FWD_API_PER_HDIM_CASE.format(F_if=if_j, F_hdim=hdim, F_inner_dispatch=inners) @@ -458,7 +468,7 @@ class FmhaFwdSplitKVKernel: F_bk0 = self.F_tile.F_bk0, F_bn1 = self.F_tile.F_bn1, F_bk1 = self.F_tile.F_bk1, - F_bk0blen = self.F_tile.F_bk0blen, + F_bk0max = self.F_tile.F_bk0max, F_rm0 = self.F_tile.F_rm0, F_rn0 = self.F_tile.F_rn0, F_rk0 = self.F_tile.F_rk0, @@ -504,7 +514,7 @@ class FmhaFwdSplitKVKernel: bk0=self.F_tile.F_bk0, bn1=self.F_tile.F_bn1, bk1=self.F_tile.F_bk1, - bk0blen=self.F_tile.F_bk0blen, + bk0max=self.F_tile.F_bk0max, vlayout=self.F_pipeline.F_vlayout, mask=self.F_pipeline.F_mask, bias=self.F_pipeline.F_bias, @@ -559,6 +569,7 @@ def get_fmha_fwd_tile_dict_from_dtype(dtype : str) -> Optional[dict]: return { '32' : FmhaFwdTileSize(32, 64, 16, 32, 32, 32, 2, 1, 1, 2, 1, 1, 16, 16, 16, -1), '64' : FmhaFwdTileSize(64, 64, 32, 64, 32, 64, 4, 1, 1, 4, 1, 1, 16, 16, 16, -1), + ## '96' : FmhaFwdTileSize(64, 128, 32, 128, 32, 96, 4, 1, 1, 4, 1, 1, 16, 16, 16, -1), '128' : FmhaFwdTileSize(64, 128, 32, 128, 32, 128, 4, 1, 1, 4, 1, 1, 16, 16, 16, -1), '256' : FmhaFwdTileSize(64, 128, 32, 256, 32, 256, 4, 1, 1, 4, 1, 1, 16, 16, 16, -1), } @@ -576,6 +587,7 @@ def 
get_fmha_fwd_splitkv_combine_tile_dict_from_dtype(dtype : str) -> Optional[d return { '32' : FmhaFwdSplitKVCombineTileSize(16, 16, -1), '64' : FmhaFwdSplitKVCombineTileSize(32, 32, -1), + ## '96' : FmhaFwdSplitKVCombineTileSize(32, 64, -1), '128' : FmhaFwdSplitKVCombineTileSize(32, 64, -1), '256' : FmhaFwdSplitKVCombineTileSize(32, 128, -1), } @@ -604,7 +616,7 @@ def get_fwd_splitkv_blobs(kernel_filter : Optional[str], receipt, mask_impl) -> if dtype in ['fp16', 'bf16']: for mask, bias, lse, pagedkv in itertools.product(get_mask_map(mask_impl).keys(), BIAS_MAP.keys(), ["t", "f"], ["t", "f"]): # TODO: use async pipeline when compiler is more stable - if hdim == 256 or hdim in [32, 64, 128]: + if hdim == 256 or hdim in [32, 64, 128]: ### [32, 64, 96, 128]: # if True: pipelines.append(Pipeline('qr', 'row', 'f', 't', 'f', 'f', bias, lse, squant, pagedkv, mask)) pipelines.append(Pipeline('qr', 'col', 'f', 't', 'f', 'f', bias, lse, squant, pagedkv, mask)) @@ -743,4 +755,4 @@ def list_blobs(file_path : Path, kernel_filter : Optional[str], receipt, mask_im _, kernels = get_fwd_splitkv_blobs(kernel_filter, receipt, mask_impl) for kernel in kernels: f.write(str(file_path.parent / GEN_DIR / kernel.filename) + "\n") - f.write(str(file_path.parent / GEN_DIR / FMHA_FWD_SPLITKV_API_FILENAME) + "\n") \ No newline at end of file + f.write(str(file_path.parent / GEN_DIR / FMHA_FWD_SPLITKV_API_FILENAME) + "\n") diff --git a/include/ck_tile/core/numeric/math.hpp b/include/ck_tile/core/numeric/math.hpp index 785691b66..0faf1aa04 100644 --- a/include/ck_tile/core/numeric/math.hpp +++ b/include/ck_tile/core/numeric/math.hpp @@ -1126,7 +1126,7 @@ CK_TILE_DEVICE int8_t neg(int8_t x) template <> CK_TILE_DEVICE fp16_t neg(fp16_t x) { - return __hneg(x); + return -x; }; template @@ -1168,7 +1168,7 @@ CK_TILE_DEVICE double sin(double x) template <> CK_TILE_DEVICE fp16_t sin(fp16_t x) { - return ::hsin(x); + return __ocml_sin_f16(x); }; template @@ -1300,7 +1300,7 @@ CK_TILE_DEVICE double 
ceil(double x) template <> CK_TILE_DEVICE fp16_t ceil(fp16_t x) { - return ::hceil(x); + return __ocml_ceil_f16(x); }; template @@ -1342,7 +1342,7 @@ CK_TILE_DEVICE double floor(double x) template <> CK_TILE_DEVICE fp16_t floor(fp16_t x) { - return ::hfloor(x); + return __ocml_floor_f16(x); }; template @@ -1365,7 +1365,7 @@ CK_TILE_DEVICE T exp(T x) template <> CK_TILE_DEVICE fp16_t exp(fp16_t x) { - return hexp(x); + return __ocml_exp_f16(x); }; template <> @@ -1389,7 +1389,7 @@ CK_TILE_DEVICE T log(T x) template <> CK_TILE_DEVICE fp16_t log(fp16_t x) { - return hlog(x); + return __ocml_log_f16(x); }; template <> diff --git a/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp index 8c1f6c805..e0c145fde 100644 --- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp +++ b/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp @@ -82,10 +82,10 @@ struct FmhaFwdKernel if (kPadHeadDimV) n += "dv"; return n.empty() ? n : std::string("p") + n; }(); return - _SS_("fmha_fwd_d") + _TS_(bfs::kK0BlockLength) + "_" + _SS_(t2s::name) + + _SS_("fmha_fwd_d") + _TS_(bfs::kQKHeaddim) + "_" + _SS_(t2s::name) + "_" + (kIsGroupMode ? 
"group" : "batch") + "_" + _SS_(TilePartitioner::name) + "_" "b" + _TS_(bfs::kM0) + "x" + _TS_(bfs::kN0) + "x" + _TS_(bfs::kK0) + "x" + - _TS_(bfs::kN1) + "x" + _TS_(bfs::kK1) + "x" + _TS_(bfs::kK0BlockLength) + "_" + + _TS_(bfs::kN1) + "x" + _TS_(bfs::kK1) + "x" + _TS_(bfs::kQKHeaddim) + "_" + "r" + _TS_(g0br::at(ck_tile::number<0>{})) + "x" + _TS_(g0br::at(ck_tile::number<1>{})) + "x" + _TS_(g0br::at(ck_tile::number<2>{})) + "_" + "r" + _TS_(g1br::at(ck_tile::number<0>{})) + "x" + _TS_(g1br::at(ck_tile::number<1>{})) + "x" + _TS_(g1br::at(ck_tile::number<2>{})) + "_" + "w" + _TS_(gwt::at(ck_tile::number<0>{})) + "x" + _TS_(gwt::at(ck_tile::number<1>{})) + "x" + _TS_(gwt::at(ck_tile::number<2>{})) + "_" + @@ -657,7 +657,7 @@ struct FmhaFwdKernel { return pad_tensor_view( q_dram_naive, - make_tuple(number{}, number{}), + make_tuple(number{}, number{}), sequence{}); } else @@ -724,7 +724,7 @@ struct FmhaFwdKernel [&]() { if constexpr(FmhaPipeline::kQLoadOnce) return make_tuple(number{}, - number{}); + number{}); else return make_tuple(number{}, number{}); }(), diff --git a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp index ea30025b5..4ffebc3c9 100644 --- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp +++ b/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp @@ -78,10 +78,10 @@ struct FmhaFwdSplitKVKernel if (kPadHeadDimV) n += "dv"; return n.empty() ? n : std::string("p") + n; }(); return - _SS_("fmha_fwd_splitkv_d") + _TS_(bfs::kK0BlockLength) + "_" + _SS_(t2s::name) + + _SS_("fmha_fwd_splitkv_d") + _TS_(bfs::kQKHeaddim) + "_" + _SS_(t2s::name) + "_" + (kIsGroupMode ? 
"group" : "batch") + "_" "b" + _TS_(bfs::kM0) + "x" + _TS_(bfs::kN0) + "x" + _TS_(bfs::kK0) + "x" + - _TS_(bfs::kN1) + "x" + _TS_(bfs::kK1) + "x" + _TS_(bfs::kK0BlockLength) + "_" + + _TS_(bfs::kN1) + "x" + _TS_(bfs::kK1) + "x" + _TS_(bfs::kQKHeaddim) + "_" + "r" + _TS_(g0br::at(ck_tile::number<0>{})) + "x" + _TS_(g0br::at(ck_tile::number<1>{})) + "x" + _TS_(g0br::at(ck_tile::number<2>{})) + "_" + "r" + _TS_(g1br::at(ck_tile::number<0>{})) + "x" + _TS_(g1br::at(ck_tile::number<1>{})) + "x" + _TS_(g1br::at(ck_tile::number<2>{})) + "_" + "w" + _TS_(gwt::at(ck_tile::number<0>{})) + "x" + _TS_(gwt::at(ck_tile::number<1>{})) + "x" + _TS_(gwt::at(ck_tile::number<2>{})) + "_" + @@ -586,7 +586,7 @@ struct FmhaFwdSplitKVKernel { return pad_tensor_view( q_dram_naive, - make_tuple(number{}, number{}), + make_tuple(number{}, number{}), sequence{}); } else @@ -735,7 +735,7 @@ struct FmhaFwdSplitKVKernel [&]() { if constexpr(FmhaPipeline::kQLoadOnce) return make_tuple(number{}, - number{}); + number{}); else return make_tuple(number{}, number{}); }(), diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_qr_ks_vs.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_qr_ks_vs.hpp index 6e7416ce8..71c3bd171 100644 --- a/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_qr_ks_vs.hpp +++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_qr_ks_vs.hpp @@ -34,12 +34,13 @@ struct BlockFmhaFwdSplitKVPipelineQRKSVS static constexpr index_t kBlockSize = Problem::kBlockSize; - static constexpr index_t kM0 = BlockFmhaShape::kM0; - static constexpr index_t kN0 = BlockFmhaShape::kN0; - static constexpr index_t kK0 = BlockFmhaShape::kK0; - static constexpr index_t kN1 = BlockFmhaShape::kN1; - static constexpr index_t kK1 = BlockFmhaShape::kK1; - static constexpr index_t kK0BlockLength = BlockFmhaShape::kK0BlockLength; + static constexpr index_t kM0 = BlockFmhaShape::kM0; + static constexpr index_t kN0 = 
BlockFmhaShape::kN0; + static constexpr index_t kK0 = BlockFmhaShape::kK0; + static constexpr index_t kN1 = BlockFmhaShape::kN1; + static constexpr index_t kK1 = BlockFmhaShape::kK1; + static constexpr index_t kQKHeaddim = BlockFmhaShape::kQKHeaddim; + static constexpr index_t kSubQKHeaddim = BlockFmhaShape::kSubQKHeaddim; static constexpr bool kIsGroupMode = Problem::kIsGroupMode; static constexpr bool kPadSeqLenQ = Problem::kPadSeqLenQ; @@ -75,22 +76,22 @@ struct BlockFmhaFwdSplitKVPipelineQRKSVS return Problem::kBlockPerCu; else { - if constexpr(kK0BlockLength <= 32) + if constexpr(kQKHeaddim <= 32) { return 2; } - else if constexpr(kK0BlockLength <= 64) + else if constexpr(kQKHeaddim <= 64) { return 3; } - else if constexpr(kK0BlockLength <= 128) + else if constexpr(kQKHeaddim <= 128) { if constexpr(BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS) return 1; else return 2; } - else if constexpr(kK0BlockLength <= 256) + else if constexpr(kQKHeaddim <= 256) { return 1; } @@ -270,7 +271,7 @@ struct BlockFmhaFwdSplitKVPipelineQRKSVS // prefetch K tile index_t i_total_loops = 0; - constexpr index_t k0_loops = kK0BlockLength / kK0; + constexpr index_t k0_loops = kQKHeaddim / kK0; constexpr index_t k1_loops = kN0 / kK1; static_assert(2 <= k0_loops); diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs.hpp index 6837ffdee..a7e928714 100644 --- a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs.hpp +++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs.hpp @@ -37,12 +37,13 @@ struct BlockFmhaPipelineQRKSVS static constexpr index_t kBlockSize = Problem::kBlockSize; - static constexpr index_t kM0 = BlockFmhaShape::kM0; - static constexpr index_t kN0 = BlockFmhaShape::kN0; - static constexpr index_t kK0 = BlockFmhaShape::kK0; - static constexpr index_t kN1 = BlockFmhaShape::kN1; - static constexpr index_t kK1 = BlockFmhaShape::kK1; - static constexpr 
index_t kK0BlockLength = BlockFmhaShape::kK0BlockLength; + static constexpr index_t kM0 = BlockFmhaShape::kM0; + static constexpr index_t kN0 = BlockFmhaShape::kN0; + static constexpr index_t kK0 = BlockFmhaShape::kK0; + static constexpr index_t kN1 = BlockFmhaShape::kN1; + static constexpr index_t kK1 = BlockFmhaShape::kK1; + static constexpr index_t kQKHeaddim = BlockFmhaShape::kQKHeaddim; + static constexpr index_t kSubQKHeaddim = BlockFmhaShape::kSubQKHeaddim; static constexpr bool kIsGroupMode = Problem::kIsGroupMode; static constexpr bool kPadSeqLenQ = Problem::kPadSeqLenQ; @@ -76,22 +77,22 @@ struct BlockFmhaPipelineQRKSVS return Problem::kBlockPerCu; else { - if constexpr(kK0BlockLength <= 32) + if constexpr(kQKHeaddim <= 32) { return 2; } - else if constexpr(kK0BlockLength <= 64) + else if constexpr(kQKHeaddim <= 64) { return 3; } - else if constexpr(kK0BlockLength <= 128) + else if constexpr(kQKHeaddim <= 128) { if constexpr(BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS) return 1; else return 2; } - else if constexpr(kK0BlockLength <= 256) + else if constexpr(kQKHeaddim <= 256) { return 1; } @@ -261,7 +262,7 @@ struct BlockFmhaPipelineQRKSVS // prefetch K tile index_t i_total_loops = 0; - constexpr index_t k0_loops = kK0BlockLength / kK0; + constexpr index_t k0_loops = kQKHeaddim / kK0; constexpr index_t k1_loops = kN0 / kK1; static_assert(2 <= k0_loops); diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_async.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_async.hpp index 05d3dae1c..10bb01168 100644 --- a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_async.hpp +++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_async.hpp @@ -38,12 +38,13 @@ struct BlockFmhaPipelineQRKSVSAsync static constexpr index_t kBlockSize = Problem::kBlockSize; - static constexpr index_t kM0 = BlockFmhaShape::kM0; - static constexpr index_t kN0 = BlockFmhaShape::kN0; - static constexpr index_t 
kK0 = BlockFmhaShape::kK0; - static constexpr index_t kN1 = BlockFmhaShape::kN1; - static constexpr index_t kK1 = BlockFmhaShape::kK1; - static constexpr index_t kK0BlockLength = BlockFmhaShape::kK0BlockLength; + static constexpr index_t kM0 = BlockFmhaShape::kM0; + static constexpr index_t kN0 = BlockFmhaShape::kN0; + static constexpr index_t kK0 = BlockFmhaShape::kK0; + static constexpr index_t kN1 = BlockFmhaShape::kN1; + static constexpr index_t kK1 = BlockFmhaShape::kK1; + static constexpr index_t kQKHeaddim = BlockFmhaShape::kQKHeaddim; + static constexpr index_t kSubQKHeaddim = BlockFmhaShape::kSubQKHeaddim; static constexpr bool kIsGroupMode = Problem::kIsGroupMode; // TODO: seq_q always support padding, hdim_q/v support multiple of vector(like 8x) @@ -87,7 +88,7 @@ struct BlockFmhaPipelineQRKSVSAsync return 1; } - if constexpr(kK0BlockLength <= 32) + if constexpr(kQKHeaddim <= 32) { if constexpr(kPadSeqLenK && BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS && FmhaMask::IsMasking) @@ -95,21 +96,21 @@ struct BlockFmhaPipelineQRKSVSAsync else return 2; } - else if constexpr(kK0BlockLength <= 64) + else if constexpr(kQKHeaddim <= 64) { if constexpr(kPadSeqLenK && BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS) return 2; else return 3; } - else if constexpr(kK0BlockLength <= 128) + else if constexpr(kQKHeaddim <= 128) { if constexpr(kPadSeqLenK && BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS) return 1; else return 2; } - else if constexpr(kK0BlockLength <= 256) + else if constexpr(kQKHeaddim <= 256) { return 1; } @@ -339,7 +340,7 @@ struct BlockFmhaPipelineQRKSVSAsync // auto q_tile = q; // tile_elementwise_in(q_element_func, q); index_t i_total_loops = 0; - constexpr index_t k0_loops = kK0BlockLength / kK0; + constexpr index_t k0_loops = kQKHeaddim / kK0; constexpr index_t k1_loops = kN0 / kK1; static_assert(1 <= k0_loops); diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_fp8.hpp 
b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_fp8.hpp index f4767de0e..a1b1e0e15 100644 --- a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_fp8.hpp +++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_fp8.hpp @@ -36,12 +36,12 @@ struct [[deprecated]] BlockFmhaPipelineQRKSVSFp8 static constexpr index_t kBlockSize = Problem::kBlockSize; - static constexpr index_t kM0 = BlockFmhaShape::kM0; - static constexpr index_t kN0 = BlockFmhaShape::kN0; - static constexpr index_t kK0 = BlockFmhaShape::kK0; - static constexpr index_t kN1 = BlockFmhaShape::kN1; - static constexpr index_t kK1 = BlockFmhaShape::kK1; - static constexpr index_t kK0BlockLength = BlockFmhaShape::kK0BlockLength; + static constexpr index_t kM0 = BlockFmhaShape::kM0; + static constexpr index_t kN0 = BlockFmhaShape::kN0; + static constexpr index_t kK0 = BlockFmhaShape::kK0; + static constexpr index_t kN1 = BlockFmhaShape::kN1; + static constexpr index_t kK1 = BlockFmhaShape::kK1; + static constexpr index_t kQKHeaddim = BlockFmhaShape::kQKHeaddim; static constexpr bool kIsGroupMode = Problem::kIsGroupMode; static constexpr bool kPadSeqLenQ = Problem::kPadSeqLenQ; @@ -75,22 +75,22 @@ struct [[deprecated]] BlockFmhaPipelineQRKSVSFp8 return Problem::kBlockPerCu; else { - if constexpr(kK0BlockLength <= 32) + if constexpr(kQKHeaddim <= 32) { return 2; } - else if constexpr(kK0BlockLength <= 64) + else if constexpr(kQKHeaddim <= 64) { return 3; } - else if constexpr(kK0BlockLength <= 128) + else if constexpr(kQKHeaddim <= 128) { if constexpr(BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS) return 1; else return 2; } - else if constexpr(kK0BlockLength <= 256) + else if constexpr(kQKHeaddim <= 256) { return 1; } @@ -232,7 +232,7 @@ struct [[deprecated]] BlockFmhaPipelineQRKSVSFp8 // prefetch K tile index_t i_total_loops = 0; - constexpr index_t k0_loops = kK0BlockLength / kK0; + constexpr index_t k0_loops = kQKHeaddim / kK0; constexpr index_t k1_loops = kN0 / 
kK1; static_assert(2 <= k0_loops); diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qs_ks_vs.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qs_ks_vs.hpp index d08a8d489..b98247df9 100644 --- a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qs_ks_vs.hpp +++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qs_ks_vs.hpp @@ -36,12 +36,13 @@ struct [[deprecated]] BlockFmhaPipelineQSKSVS static constexpr index_t kBlockSize = Problem::kBlockSize; - static constexpr index_t kM0 = BlockFmhaShape::kM0; - static constexpr index_t kN0 = BlockFmhaShape::kN0; - static constexpr index_t kK0 = BlockFmhaShape::kK0; - static constexpr index_t kN1 = BlockFmhaShape::kN1; - static constexpr index_t kK1 = BlockFmhaShape::kK1; - static constexpr index_t kK0BlockLength = BlockFmhaShape::kK0BlockLength; + static constexpr index_t kM0 = BlockFmhaShape::kM0; + static constexpr index_t kN0 = BlockFmhaShape::kN0; + static constexpr index_t kK0 = BlockFmhaShape::kK0; + static constexpr index_t kN1 = BlockFmhaShape::kN1; + static constexpr index_t kK1 = BlockFmhaShape::kK1; + static constexpr index_t kQKHeaddim = BlockFmhaShape::kQKHeaddim; + static constexpr index_t kSubQKHeaddim = BlockFmhaShape::kSubQKHeaddim; static constexpr bool kIsGroupMode = Problem::kIsGroupMode; static constexpr bool kPadSeqLenQ = Problem::kPadSeqLenQ; @@ -56,22 +57,22 @@ struct [[deprecated]] BlockFmhaPipelineQSKSVS return Problem::kBlockPerCu; else { - if constexpr(kK0BlockLength <= 32) + if constexpr(kQKHeaddim <= 32) { return 2; } - else if constexpr(kK0BlockLength <= 64) + else if constexpr(kQKHeaddim <= 64) { return 3; } - else if constexpr(kK0BlockLength <= 128) + else if constexpr(kQKHeaddim <= 128) { if constexpr(BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS) return 1; else return 2; } - else if constexpr(kK0BlockLength <= 256) + else if constexpr(kQKHeaddim <= 256) { return 1; } @@ -235,7 +236,7 @@ struct [[deprecated]] BlockFmhaPipelineQSKSVS // prefetch K 
tile index_t i_total_loops = 0; - constexpr index_t k0_loops = kK0BlockLength / kK0; + constexpr index_t k0_loops = kQKHeaddim / kK0; constexpr index_t k1_loops = kN0 / kK1; static_assert(2 <= k0_loops); diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qx_ks_vs_custom_policy.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qx_ks_vs_custom_policy.hpp index 807ad6548..fbb05e164 100644 --- a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qx_ks_vs_custom_policy.hpp +++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qx_ks_vs_custom_policy.hpp @@ -55,7 +55,7 @@ struct BlockFmhaPipelineQXCustomPolicy constexpr index_t MWarp = config.template at<1>(); constexpr index_t kMPerBlock = Problem::BlockFmhaShape::kM0; - constexpr index_t kKPerBlock = Problem::BlockFmhaShape::kK0BlockLength; + constexpr index_t kKPerBlock = Problem::BlockFmhaShape::kSubQKHeaddim; constexpr index_t K2 = WG::kK / WG::WarpGemmAttribute::Impl::kABKLane; constexpr index_t K1 = WG::WarpGemmAttribute::Impl::kABKLane; @@ -323,6 +323,9 @@ struct BlockFmhaPipelineQXKSVSCustomPolicy : BlockFmhaPipelineQXCustomPolicy struct LdsBufferSequence<3, 3, 3, 3> { using type = sequence<1, 2, 0, 1, 2, 0>; }; + template<> struct + LdsBufferSequence<3, 3, 3, 4> { using type = sequence<1, 2, 0, 0, 1, 2, 0>; }; + template<> struct LdsBufferSequence<3, 3, 2, 2> { using type = sequence<1, 2, 1, 0>;}; // clang-format on @@ -332,12 +335,12 @@ struct BlockFmhaPipelineQXKSVSCustomPolicy : BlockFmhaPipelineQXCustomPolicy; - constexpr index_t kN0 = BlockFmhaShape::kN0; - constexpr index_t kK0 = BlockFmhaShape::kK0; - constexpr index_t kK1 = BlockFmhaShape::kK1; - constexpr index_t kK0BlockLength = BlockFmhaShape::kK0BlockLength; + constexpr index_t kN0 = BlockFmhaShape::kN0; + constexpr index_t kK0 = BlockFmhaShape::kK0; + constexpr index_t kK1 = BlockFmhaShape::kK1; + constexpr index_t kQKHeaddim = BlockFmhaShape::kQKHeaddim; - constexpr index_t k0_loops = kK0BlockLength / kK0; + 
constexpr index_t k0_loops = kQKHeaddim / kK0; constexpr index_t k1_loops = kN0 / kK1; return typename LdsBufferSequence::type{}; diff --git a/include/ck_tile/ops/fmha/pipeline/tile_fmha_shape.hpp b/include/ck_tile/ops/fmha/pipeline/tile_fmha_shape.hpp index f2bb2200f..570754b22 100644 --- a/include/ck_tile/ops/fmha/pipeline/tile_fmha_shape.hpp +++ b/include/ck_tile/ops/fmha/pipeline/tile_fmha_shape.hpp @@ -7,6 +7,20 @@ namespace ck_tile { +static CK_TILE_HOST_DEVICE constexpr index_t ceil_to_qualified_tile_length(index_t len) +{ + if(len == 96) + return 128; + if(len == 160) + return 256; + + // only length of 96, 160 and power-of-two is supported + if(!(len & (len - 1))) + return len; + + return 0; +}; + template {}); // tile size along qk gemm unroll static constexpr index_t kN1 = BlockTile::at(number<3>{}); // tile size along v head_dim static constexpr index_t kK1 = BlockTile::at(number<4>{}); // tile size along kv gemm unroll - static constexpr index_t kK0BlockLength = + static constexpr index_t kQKHeaddim = BlockTile::at(number<5>{}); // total length of K0, used for pipeline that need load Q at // once (or repeately load Q as a whole tile) - static_assert(kK0BlockLength % kK0 == 0, "kK0BlockLength should be divisible by kK0"); + static_assert(kQKHeaddim % kK0 == 0, "kQKHeaddim should be divisible by kK0"); + + static constexpr index_t kSubQKHeaddim = ceil_to_qualified_tile_length(kQKHeaddim); // v, rowmajor : seqlen*hdim, colmajor : hdim*seqlen static constexpr bool IsVLayoutRowMajor = IsVLayoutRowMajor_; -- GitLab From 3d60953477bd575e320c84240a9f8ef49eb7bedd Mon Sep 17 00:00:00 2001 From: rocking Date: Wed, 30 Oct 2024 15:22:56 +0800 Subject: [PATCH 025/153] [Ck tile] support rmsnorm and related fusion (#1605) * Add reduce2d new api * Prevent user use cross warp reduction * Fix bug of std caculation * Add rmsnorm2d * Add rmsnorm small example * Remove static assert to prevent compile fail * Add script to test performance and correctness * Add missing cmake 
change * refine naming * refine example of rmsnorm * Fix bug of rmsnorm * Refine naming * Fix cmake * clang format * Refine pipeline name * Add add_rmsnorm2d_rdquant kernel * Add reduce op * host verification * Fix bug of one pass pipeline * Refine tile size * Add two pass pipeline * Rename two pass to three pass * Fix bug of kSaveX == false * Add instance library * Add test script * Fix bug of x verification * Add save_x to trait * Add README * Move reduce2d into reduce folder * Fix bug of welford when number of m warp > 1 * remove reduncant comment * 1. move 06_rmsnorm2d to 10_rmsnorm2d 2. move 07_add_rmsnorm2d_rdquant to 11_add_rmsnorm2d_rdquant * clang format and add missing header * Add host validation of add + layernorm2d + rsquant * Revert "Add host validation of add + layernorm2d + rsquant" This reverts commit 936cb457978b928b90eff89a08fcdb7dc8bbed67. * Remove deprecated flag --- example/ck_tile/05_reduce/reduce.cpp | 65 ++-- example/ck_tile/05_reduce/reduce.hpp | 172 +++++++---- example/ck_tile/10_rmsnorm2d/CMakeLists.txt | 25 ++ example/ck_tile/10_rmsnorm2d/README.md | 22 ++ .../10_rmsnorm2d/example_rmsnorm2d_fwd.cpp | 165 +++++++++++ .../instances/rmsnorm2d_fwd_api.cpp | 153 ++++++++++ .../rmsnorm2d_fwd_bf16_n1024_instance.cpp | 22 ++ .../rmsnorm2d_fwd_bf16_n1536_instance.cpp | 13 + .../rmsnorm2d_fwd_bf16_n2048_instance.cpp | 14 + .../rmsnorm2d_fwd_bf16_n256_instance.cpp | 12 + .../rmsnorm2d_fwd_bf16_n3072_instance.cpp | 14 + .../rmsnorm2d_fwd_bf16_n4096_instance.cpp | 14 + .../rmsnorm2d_fwd_bf16_n4096_tp_instance.cpp | 14 + .../rmsnorm2d_fwd_bf16_n512_instance.cpp | 13 + .../rmsnorm2d_fwd_bf16_n64_n128_instance.cpp | 12 + .../rmsnorm2d_fwd_bf16_n768_instance.cpp | 12 + .../rmsnorm2d_fwd_fp16_n1024_instance.cpp | 22 ++ .../rmsnorm2d_fwd_fp16_n1536_instance.cpp | 13 + .../rmsnorm2d_fwd_fp16_n2048_instance.cpp | 14 + .../rmsnorm2d_fwd_fp16_n256_instance.cpp | 12 + .../rmsnorm2d_fwd_fp16_n3072_instance.cpp | 14 + .../rmsnorm2d_fwd_fp16_n4096_instance.cpp | 
14 + .../rmsnorm2d_fwd_fp16_n4096_tp_instance.cpp | 14 + .../rmsnorm2d_fwd_fp16_n512_instance.cpp | 13 + .../rmsnorm2d_fwd_fp16_n64_n128_instance.cpp | 12 + .../rmsnorm2d_fwd_fp16_n768_instance.cpp | 12 + .../rmsnorm2d_fwd_instance_common.hpp | 65 ++++ .../ck_tile/10_rmsnorm2d/rmsnorm2d_fwd.cpp | 179 +++++++++++ .../ck_tile/10_rmsnorm2d/rmsnorm2d_fwd.hpp | 117 ++++++++ .../ck_tile/10_rmsnorm2d/script/perf_test.sh | 38 +++ .../ck_tile/10_rmsnorm2d/script/smoke_test.sh | 31 ++ .../11_add_rmsnorm2d_rdquant/CMakeLists.txt | 25 ++ .../11_add_rmsnorm2d_rdquant/README.md | 22 ++ .../add_rmsnorm2d_rdquant_fwd.cpp | 279 +++++++++++++++++ .../add_rmsnorm2d_rdquant_fwd.hpp | 123 ++++++++ .../example_add_rmsnorm2d_rdquant_fwd.cpp | 280 ++++++++++++++++++ .../add_rmsnorm2d_rdquant_fwd_api.cpp | 157 ++++++++++ ...norm2d_rdquant_fwd_bf16_n1024_instance.cpp | 22 ++ ...norm2d_rdquant_fwd_bf16_n1536_instance.cpp | 13 + ...norm2d_rdquant_fwd_bf16_n2048_instance.cpp | 14 + ...snorm2d_rdquant_fwd_bf16_n256_instance.cpp | 12 + ...norm2d_rdquant_fwd_bf16_n3072_instance.cpp | 14 + ...norm2d_rdquant_fwd_bf16_n4096_instance.cpp | 14 + ...m2d_rdquant_fwd_bf16_n4096_tp_instance.cpp | 14 + ...snorm2d_rdquant_fwd_bf16_n512_instance.cpp | 13 + ...m2d_rdquant_fwd_bf16_n64_n128_instance.cpp | 12 + ...snorm2d_rdquant_fwd_bf16_n768_instance.cpp | 12 + ...norm2d_rdquant_fwd_fp16_n1024_instance.cpp | 22 ++ ...norm2d_rdquant_fwd_fp16_n1536_instance.cpp | 13 + ...norm2d_rdquant_fwd_fp16_n2048_instance.cpp | 14 + ...snorm2d_rdquant_fwd_fp16_n256_instance.cpp | 12 + ...norm2d_rdquant_fwd_fp16_n3072_instance.cpp | 14 + ...norm2d_rdquant_fwd_fp16_n4096_instance.cpp | 14 + ...m2d_rdquant_fwd_fp16_n4096_tp_instance.cpp | 14 + ...snorm2d_rdquant_fwd_fp16_n512_instance.cpp | 13 + ...m2d_rdquant_fwd_fp16_n64_n128_instance.cpp | 12 + ...snorm2d_rdquant_fwd_fp16_n768_instance.cpp | 12 + ..._rmsnorm2d_rdquant_fwd_instance_common.hpp | 67 +++++ .../script/perf_test.sh | 38 +++ .../script/smoke_test.sh | 31 ++ 
example/ck_tile/CMakeLists.txt | 3 +- include/ck_tile/core.hpp | 1 + .../ck_tile/core/utility/reduce_operator.hpp | 95 ++++++ include/ck_tile/host.hpp | 3 + .../host/reference/reference_elementwise.hpp | 47 +++ .../host/reference/reference_reduce.hpp | 17 +- .../reference/reference_rmsnorm2d_fwd.hpp | 52 ++++ .../reference_rowwise_quantization2d.hpp | 33 +++ include/ck_tile/ops/add_rmsnorm2d_rdquant.hpp | 12 + .../add_rmsnorm2d_rdquant_fwd_kernel.hpp | 239 +++++++++++++++ .../add_rmsnorm2d_rdquant_fwd_shape.hpp | 78 +++++ ...2d_rdquant_fwd_pipeline_default_policy.hpp | 94 ++++++ ...msnorm2d_rdquant_fwd_pipeline_one_pass.hpp | 142 +++++++++ ...rmsnorm2d_rdquant_fwd_pipeline_problem.hpp | 41 +++ ...norm2d_rdquant_fwd_pipeline_three_pass.hpp | 266 +++++++++++++++++ .../layernorm2d_fwd_pipeline_one_pass.hpp | 4 +- .../layernorm2d_fwd_pipeline_two_pass.hpp | 6 +- include/ck_tile/ops/reduce.hpp | 3 + .../ck_tile/ops/reduce/block/block_reduce.hpp | 19 +- .../ops/reduce/block/block_reduce2d.hpp | 260 ++++++++++++++++ .../block/block_reduce2d_default_policy.hpp | 79 +++++ .../reduce/block/block_reduce2d_problem.hpp | 18 ++ include/ck_tile/ops/rmsnorm2d.hpp | 12 + .../rmsnorm2d/kernel/rmsnorm2d_fwd_kernel.hpp | 202 +++++++++++++ .../rmsnorm2d/kernel/rmsnorm2d_fwd_shape.hpp | 78 +++++ .../rmsnorm2d_fwd_pipeline_default_policy.hpp | 94 ++++++ .../rmsnorm2d_fwd_pipeline_one_pass.hpp | 101 +++++++ .../rmsnorm2d_fwd_pipeline_problem.hpp | 36 +++ .../rmsnorm2d_fwd_pipeline_two_pass.hpp | 131 ++++++++ .../ops/welford/block/block_welford.hpp | 8 +- 90 files changed, 4667 insertions(+), 121 deletions(-) create mode 100644 example/ck_tile/10_rmsnorm2d/CMakeLists.txt create mode 100644 example/ck_tile/10_rmsnorm2d/README.md create mode 100644 example/ck_tile/10_rmsnorm2d/example_rmsnorm2d_fwd.cpp create mode 100644 example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_api.cpp create mode 100644 example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_bf16_n1024_instance.cpp create mode 
100644 example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_bf16_n1536_instance.cpp create mode 100644 example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_bf16_n2048_instance.cpp create mode 100644 example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_bf16_n256_instance.cpp create mode 100644 example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_bf16_n3072_instance.cpp create mode 100644 example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_bf16_n4096_instance.cpp create mode 100644 example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_bf16_n4096_tp_instance.cpp create mode 100644 example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_bf16_n512_instance.cpp create mode 100644 example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_bf16_n64_n128_instance.cpp create mode 100644 example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_bf16_n768_instance.cpp create mode 100644 example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_fp16_n1024_instance.cpp create mode 100644 example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_fp16_n1536_instance.cpp create mode 100644 example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_fp16_n2048_instance.cpp create mode 100644 example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_fp16_n256_instance.cpp create mode 100644 example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_fp16_n3072_instance.cpp create mode 100644 example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_fp16_n4096_instance.cpp create mode 100644 example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_fp16_n4096_tp_instance.cpp create mode 100644 example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_fp16_n512_instance.cpp create mode 100644 example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_fp16_n64_n128_instance.cpp create mode 100644 example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_fp16_n768_instance.cpp create mode 100644 example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_instance_common.hpp create mode 100644 example/ck_tile/10_rmsnorm2d/rmsnorm2d_fwd.cpp create mode 100644 
example/ck_tile/10_rmsnorm2d/rmsnorm2d_fwd.hpp create mode 100755 example/ck_tile/10_rmsnorm2d/script/perf_test.sh create mode 100755 example/ck_tile/10_rmsnorm2d/script/smoke_test.sh create mode 100644 example/ck_tile/11_add_rmsnorm2d_rdquant/CMakeLists.txt create mode 100644 example/ck_tile/11_add_rmsnorm2d_rdquant/README.md create mode 100644 example/ck_tile/11_add_rmsnorm2d_rdquant/add_rmsnorm2d_rdquant_fwd.cpp create mode 100644 example/ck_tile/11_add_rmsnorm2d_rdquant/add_rmsnorm2d_rdquant_fwd.hpp create mode 100644 example/ck_tile/11_add_rmsnorm2d_rdquant/example_add_rmsnorm2d_rdquant_fwd.cpp create mode 100644 example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_api.cpp create mode 100644 example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_bf16_n1024_instance.cpp create mode 100644 example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_bf16_n1536_instance.cpp create mode 100644 example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_bf16_n2048_instance.cpp create mode 100644 example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_bf16_n256_instance.cpp create mode 100644 example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_bf16_n3072_instance.cpp create mode 100644 example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_bf16_n4096_instance.cpp create mode 100644 example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_bf16_n4096_tp_instance.cpp create mode 100644 example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_bf16_n512_instance.cpp create mode 100644 example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_bf16_n64_n128_instance.cpp create mode 100644 example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_bf16_n768_instance.cpp create mode 100644 
example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_fp16_n1024_instance.cpp create mode 100644 example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_fp16_n1536_instance.cpp create mode 100644 example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_fp16_n2048_instance.cpp create mode 100644 example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_fp16_n256_instance.cpp create mode 100644 example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_fp16_n3072_instance.cpp create mode 100644 example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_fp16_n4096_instance.cpp create mode 100644 example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_fp16_n4096_tp_instance.cpp create mode 100644 example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_fp16_n512_instance.cpp create mode 100644 example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_fp16_n64_n128_instance.cpp create mode 100644 example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_fp16_n768_instance.cpp create mode 100644 example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_instance_common.hpp create mode 100755 example/ck_tile/11_add_rmsnorm2d_rdquant/script/perf_test.sh create mode 100755 example/ck_tile/11_add_rmsnorm2d_rdquant/script/smoke_test.sh create mode 100644 include/ck_tile/core/utility/reduce_operator.hpp create mode 100644 include/ck_tile/host/reference/reference_elementwise.hpp create mode 100644 include/ck_tile/host/reference/reference_rmsnorm2d_fwd.hpp create mode 100644 include/ck_tile/host/reference/reference_rowwise_quantization2d.hpp create mode 100644 include/ck_tile/ops/add_rmsnorm2d_rdquant.hpp create mode 100644 include/ck_tile/ops/add_rmsnorm2d_rdquant/kernel/add_rmsnorm2d_rdquant_fwd_kernel.hpp create mode 100644 
include/ck_tile/ops/add_rmsnorm2d_rdquant/kernel/add_rmsnorm2d_rdquant_fwd_shape.hpp create mode 100644 include/ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_default_policy.hpp create mode 100644 include/ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_one_pass.hpp create mode 100644 include/ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_problem.hpp create mode 100644 include/ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_three_pass.hpp create mode 100644 include/ck_tile/ops/reduce/block/block_reduce2d.hpp create mode 100644 include/ck_tile/ops/reduce/block/block_reduce2d_default_policy.hpp create mode 100644 include/ck_tile/ops/reduce/block/block_reduce2d_problem.hpp create mode 100644 include/ck_tile/ops/rmsnorm2d.hpp create mode 100644 include/ck_tile/ops/rmsnorm2d/kernel/rmsnorm2d_fwd_kernel.hpp create mode 100644 include/ck_tile/ops/rmsnorm2d/kernel/rmsnorm2d_fwd_shape.hpp create mode 100644 include/ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_default_policy.hpp create mode 100644 include/ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_one_pass.hpp create mode 100644 include/ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_problem.hpp create mode 100644 include/ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_two_pass.hpp diff --git a/example/ck_tile/05_reduce/reduce.cpp b/example/ck_tile/05_reduce/reduce.cpp index 7973a8dfd..005541dc6 100644 --- a/example/ck_tile/05_reduce/reduce.cpp +++ b/example/ck_tile/05_reduce/reduce.cpp @@ -19,9 +19,9 @@ auto create_args(int argc, char* argv[]) template bool run(const ck_tile::ArgParser& arg_parser) { - using ADataType = DataType; - using AccDataType = float; - using BDataType = DataType; + using XDataType = DataType; + using ComputeDataType = float; + using YDataType = DataType; ck_tile::index_t m = arg_parser.get_int("m"); ck_tile::index_t n = arg_parser.get_int("n"); @@ 
-29,35 +29,39 @@ bool run(const ck_tile::ArgParser& arg_parser) int warmup = arg_parser.get_int("warmup"); int repeat = arg_parser.get_int("repeat"); - ck_tile::HostTensor a_host({m, n}); - ck_tile::HostTensor b_host_ref({m}); - ck_tile::HostTensor b_host_dev({m}); + ck_tile::HostTensor x_host({m, n}); + ck_tile::HostTensor y_host_ref({m}); + ck_tile::HostTensor y_host_dev({m}); - ck_tile::FillUniformDistribution{-5.f, 5.f}(a_host); + ck_tile::FillUniformDistribution{-5.f, 5.f}(x_host); - ck_tile::DeviceMem a_buf(a_host.get_element_space_size_in_bytes()); - ck_tile::DeviceMem b_buf(b_host_dev.get_element_space_size_in_bytes()); + ck_tile::DeviceMem x_buf(x_host.get_element_space_size_in_bytes()); + ck_tile::DeviceMem y_buf(y_host_dev.get_element_space_size_in_bytes()); - a_buf.ToDevice(a_host.data()); + x_buf.ToDevice(x_host.data()); + using ReduceOp = ck_tile::ReduceOp::Add; using BlockWarps = ck_tile::sequence<4, 1>; using BlockTile = ck_tile::sequence<128, 128>; using WarpTile = ck_tile::sequence<32, 128>; - using ThreadTile = ck_tile::sequence<8, 8>; + using Vector = ck_tile::sequence<8, 8>; - constexpr ck_tile::index_t kBlockSize = 256; + // cross warp-reduce + // using BlockWarps = ck_tile::sequence<2, 2>; + // using BlockTile = ck_tile::sequence<2, 1024>; + // using WarpTile = ck_tile::sequence<1, 512>; + // using Vector = ck_tile::sequence<1, 8>; + + constexpr ck_tile::index_t kBlockSize = 512; constexpr ck_tile::index_t kBlockPerCu = 1; ck_tile::index_t kGridSize = (m / BlockTile::at(ck_tile::number<0>{})); std::cout << "grid size " << kGridSize << std::endl; - using Kernel = ck_tile::Reduce; + using Shape = ck_tile::Reduce2dShape; + using Porblem = + ck_tile::Reduce2dProblem; + + using Kernel = ck_tile::Reduce; float ave_time = launch_kernel(ck_tile::stream_config{nullptr, true, 0, warmup, repeat}, ck_tile::make_kernel( @@ -65,12 +69,12 @@ bool run(const ck_tile::ArgParser& arg_parser) kGridSize, kBlockSize, 0, - static_cast(a_buf.GetDeviceBuffer()), - 
static_cast(b_buf.GetDeviceBuffer()), + static_cast(x_buf.GetDeviceBuffer()), + static_cast(y_buf.GetDeviceBuffer()), m, n)); - std::size_t num_btype = sizeof(ADataType) * m * n + sizeof(BDataType) * m; + std::size_t num_btype = sizeof(XDataType) * m * n + sizeof(YDataType) * m; float gb_per_sec = num_btype / 1.E6 / ave_time; @@ -81,9 +85,10 @@ bool run(const ck_tile::ArgParser& arg_parser) if(do_validation) { // reference - ck_tile::reference_reduce(a_host, b_host_ref); - b_buf.FromDevice(b_host_dev.mData.data()); - pass = ck_tile::check_err(b_host_dev, b_host_ref); + ck_tile::reference_reduce( + x_host, y_host_ref, ReduceOp{}); + y_buf.FromDevice(y_host_dev.mData.data()); + pass = ck_tile::check_err(y_host_dev, y_host_ref); std::cout << "valid:" << (pass ? "y" : "n") << std::flush << std::endl; } @@ -103,8 +108,8 @@ int main(int argc, char* argv[]) { return run(arg_parser) ? 0 : -2; } - if(data_type == "bf16") - { - return run(arg_parser) ? 0 : -2; - } + // else if(data_type == "bf16") + // { + // return run(arg_parser) ? 
0 : -2; + // } } diff --git a/example/ck_tile/05_reduce/reduce.hpp b/example/ck_tile/05_reduce/reduce.hpp index e36b46895..55e479591 100644 --- a/example/ck_tile/05_reduce/reduce.hpp +++ b/example/ck_tile/05_reduce/reduce.hpp @@ -5,20 +5,16 @@ #include "ck_tile/core.hpp" #include "ck_tile/ops/common.hpp" - #include "ck_tile/ops/reduce/block/block_reduce.hpp" +#include "ck_tile/ops/reduce/block/block_reduce2d_default_policy.hpp" namespace ck_tile { -template +template typename BlockTile, // block size, seq typename WarpTile, // warp size, seq - typename ThreadTile> // contiguous pixels(vector size) along seq -struct Reduce + typename Vector> // contiguous pixels(vector size) along seq +struct Reduce2dShape { static constexpr index_t Block_M = BlockTile::at(number<0>{}); static constexpr index_t Block_N = BlockTile::at(number<1>{}); @@ -26,93 +22,143 @@ struct Reduce static constexpr index_t Warp_M = WarpTile::at(number<0>{}); static constexpr index_t Warp_N = WarpTile::at(number<1>{}); - static constexpr index_t Thread_M = ThreadTile::at(number<0>{}); - static constexpr index_t Thread_N = ThreadTile::at(number<1>{}); + static constexpr index_t Vector_M = Vector::at(number<0>{}); + static constexpr index_t Vector_N = Vector::at(number<1>{}); static constexpr index_t WarpPerBlock_M = BlockWarps::at(number<0>{}); static constexpr index_t WarpPerBlock_N = BlockWarps::at(number<1>{}); - static constexpr index_t ThreadPerWarp_M = Warp_M / Thread_M; - static constexpr index_t ThreadPerWarp_N = Warp_N / Thread_N; + static constexpr index_t ThreadPerWarp_M = Warp_M / Vector_M; + static constexpr index_t ThreadPerWarp_N = Warp_N / Vector_N; static constexpr index_t Repeat_M = Block_M / (WarpPerBlock_M * Warp_M); static constexpr index_t Repeat_N = Block_N / (WarpPerBlock_N * Warp_N); - __device__ static constexpr auto MakeABlockTileDistribution() - { - return make_static_tile_distribution( - tile_distribution_encoding< - sequence<>, - tuple, - sequence>, - tuple, sequence<1, 
2>>, - tuple, sequence<2, 2>>, - sequence<1, 1, 2, 2>, - sequence<0, 3, 0, 3>>{}); - } + static constexpr index_t BlockSize = + warpSize * reduce_on_sequence(BlockWarps{}, multiplies{}, number<1>{}); +}; + +template +struct Reduce2dProblem +{ + using XDataType = remove_cvref_t; + using ComputeDataType = remove_cvref_t; + using YDataType = remove_cvref_t; + using BlockShape = remove_cvref_t; + using ReduceOp = ReduceOp_; + + static constexpr bool kNeedCrossLaneSync = BlockShape::ThreadPerWarp_N > 1; + static constexpr bool kNeedCrossWarpSync = BlockShape::WarpPerBlock_N > 1; +}; + +template +struct Reduce +{ + using Problem = ck_tile::remove_cvref_t; + using Policy = ck_tile::remove_cvref_t; + + using XDataType = ck_tile::remove_cvref_t; + using ComputeDataType = ck_tile::remove_cvref_t; + using YDataType = ck_tile::remove_cvref_t; - __device__ void operator()(const ADataType* p_a, BDataType* p_b, index_t M, index_t N) const +#if 0 + CK_TILE_DEVICE void operator()(const XDataType* p_x, YDataType* p_y, index_t M, index_t N) + const { - const auto a_m_n = make_naive_tensor_view( - p_a, make_tuple(M, N), make_tuple(N, 1), number{}, number<1>{}); + using S = typename Problem::BlockShape; - const auto iM = get_block_id() * Block_M; + const auto x_m_n = make_naive_tensor_view( + p_x, make_tuple(M, N), make_tuple(N, 1), number{}, number<1>{}); - // A window - auto a_block_window = make_tile_window(a_m_n, - make_tuple(number{}, number{}), - {iM, 0}, - MakeABlockTileDistribution()); + const auto y_m = make_naive_tensor_view_packed( + p_y, make_tuple(M), number<1>{}); + + const auto iM = get_block_id() * S::Block_M; + + auto x_window = make_tile_window(x_m_n, + make_tuple(number{}, number{}), + {iM, 0}, + Policy::template MakeXBlockTileDistribution()); + + auto y_window = make_tile_window(y_m, make_tuple(number{}), {iM}); const auto f_reduce = [](const auto& v0, const auto& v1) { return v0 + v1; }; - const ADataType reduce_init_value = 0; + const XDataType reduce_init_value = 
0; constexpr auto reduce_dims = sequence<1>{}; - // Acc tile - // TODO: support cross warp reduction - auto acc_block_tensor = decltype(block_tile_reduce( - load_tile(a_block_window), reduce_dims, f_reduce, reduce_init_value)){}; + auto y_compute = decltype(block_tile_reduce( + load_tile(x_window), reduce_dims, f_reduce, reduce_init_value)){}; - // init Acc tile - tile_elementwise_inout( - [&](auto& acc) { acc = type_convert(reduce_init_value); }, - acc_block_tensor); + set_tile(y_compute, reduce_init_value); - // loop - index_t iN = 0; + index_t num_n_tile_iteration = + __builtin_amdgcn_readfirstlane(integer_divide_ceil(N, S::Block_N)); - do + for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN) { - const auto a_block_tensor = load_tile(a_block_window); + const auto x = load_tile(x_window); + block_tile_reduce(y_compute, x, reduce_dims, f_reduce); + move_tile_window(x_window, {0, S::Block_N}); + } - // FIXME: support cross warp reduction - block_tile_reduce(acc_block_tensor, a_block_tensor, reduce_dims, f_reduce); + block_tile_reduce_sync(y_compute, f_reduce); + + store_tile(y_window, cast_tile(y_compute)); + } +#else + CK_TILE_DEVICE void operator()(const XDataType* p_x, YDataType* p_y, index_t M, index_t N) const + { + using S = typename Problem::BlockShape; - move_tile_window(a_block_window, {0, Block_N}); + const auto x_m_n = make_naive_tensor_view( + p_x, make_tuple(M, N), make_tuple(N, 1), number{}, number<1>{}); - iN += Block_N; + const auto y_m = make_naive_tensor_view_packed( + p_y, make_tuple(M), number<1>{}); - } while(iN < N); + const auto iM = get_block_id() * S::Block_M; - // FIXME: support cross warp reduction - block_tile_reduce_sync(acc_block_tensor, f_reduce); + auto x_window = make_tile_window(x_m_n, + make_tuple(number{}, number{}), + {iM, 0}, + Policy::template MakeXBlockTileDistribution()); - // convert acc_block_tensor to b_block_tensor - const auto b_block_tensor = tile_elementwise_in( - [](const auto& acc) { 
return type_convert(acc); }, acc_block_tensor); + auto y_window = make_tile_window(y_m, make_tuple(number{}), {iM}); - // B - const auto b_m = make_naive_tensor_view_packed( - p_b, make_tuple(M), number<32>{}); + __shared__ char smem[Policy::template GetSmemSize()]; + + index_t num_n_tile_iteration = + __builtin_amdgcn_readfirstlane(integer_divide_ceil(N, S::Block_N)); + + auto reduce_func = typename Problem::ReduceOp{}; + auto block_reduce2d = Policy::template GetBlockReduce2d(); + auto block_reduce2d_sync = Policy::template GetBlockReduce2dSync(); + auto block_reduce2d_cross_warp_sync = + Policy::template GetBlockReduce2dCrossWarpSync(); + + using XTensorType = decltype(load_tile(x_window)); + auto y_compute = block_reduce2d.template MakeYBlockTile(); + set_tile(y_compute, reduce_func.template GetIdentityValue()); + + for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN) + { + const auto x = load_tile(x_window); + block_reduce2d(x, y_compute, reduce_func); + move_tile_window(x_window, {0, S::Block_N}); + } - // B window - auto b_block_window = make_tile_window(b_m, make_tuple(number{}), {iM}); + block_reduce2d_sync(y_compute, reduce_func); + block_reduce2d_cross_warp_sync(y_compute, smem, reduce_func); - // store B tile - store_tile(b_block_window, b_block_tensor); + store_tile(y_window, cast_tile(y_compute)); } +#endif }; } // namespace ck_tile diff --git a/example/ck_tile/10_rmsnorm2d/CMakeLists.txt b/example/ck_tile/10_rmsnorm2d/CMakeLists.txt new file mode 100644 index 000000000..a3ff8fdf4 --- /dev/null +++ b/example/ck_tile/10_rmsnorm2d/CMakeLists.txt @@ -0,0 +1,25 @@ +set(TILE_RMSNORM2D_FWD "tile_rmsnorm2d_fwd") +# not using add_example_executable() to add this target, since we don't want this to have +# to be included in "make all/install/check" +message("adding ${TILE_RMSNORM2D_FWD}") +file(GLOB INSTANCE_SRCS instances/*.cpp) +add_executable(${TILE_RMSNORM2D_FWD} EXCLUDE_FROM_ALL rmsnorm2d_fwd.cpp) 
+target_include_directories(${TILE_RMSNORM2D_FWD} PRIVATE ${CMAKE_CURRENT_LIST_DIR}) +target_sources(${TILE_RMSNORM2D_FWD} PRIVATE ${INSTANCE_SRCS}) + +set(TILE_RMSNORM2D_FWD_COMPILE_OPTIONS) + +# NOTE: we turn off undefined-func-template to let source compile without explicit declare function specializations +list(APPEND TILE_RMSNORM2D_FWD_COMPILE_OPTIONS -Wno-undefined-func-template -Wno-float-equal) + +target_compile_options(${TILE_RMSNORM2D_FWD} PRIVATE ${TILE_RMSNORM2D_FWD_COMPILE_OPTIONS}) + +set(EXAMPLE_RMSNORM2D_FWD "tile_example_rmsnorm2d_fwd") +add_executable(${EXAMPLE_RMSNORM2D_FWD} EXCLUDE_FROM_ALL example_rmsnorm2d_fwd.cpp) +target_compile_options(${EXAMPLE_RMSNORM2D_FWD} PRIVATE ${TILE_RMSNORM2D_FWD_COMPILE_OPTIONS}) + +# TODO: we have to turn off this global prop, otherwise the progress bar generated +# by cmake will print too many files, execvp: /bin/sh: Argument list too long +# however, this property may affect global +# TODO: consider codegen a makefile by us +set_property(GLOBAL PROPERTY RULE_MESSAGES OFF) diff --git a/example/ck_tile/10_rmsnorm2d/README.md b/example/ck_tile/10_rmsnorm2d/README.md new file mode 100644 index 000000000..c06749647 --- /dev/null +++ b/example/ck_tile/10_rmsnorm2d/README.md @@ -0,0 +1,22 @@ +# Rmsnorm2D forward + +This folder contains example for Rmsnorm2D forward using ck_tile tile-programming implementation. + +## build +``` +# in the root of ck_tile +mkdir build && cd build +sh ../script/cmake-ck-dev.sh ../ # you can replace this to gfx90a, gfx942... 
+make tile_rmsnorm2d_fwd -j +``` +This will result in an executable `build/bin/tile_rmsnorm2d_fwd` + +## cmdline +``` +args: + -m m dimension (default:3328) + -n n dimension (default:4096) + -e epsilon (default:1e-5) + -v cpu validation or not (default:1) + -prec precision (default:fp16) +``` diff --git a/example/ck_tile/10_rmsnorm2d/example_rmsnorm2d_fwd.cpp b/example/ck_tile/10_rmsnorm2d/example_rmsnorm2d_fwd.cpp new file mode 100644 index 000000000..bb2c94901 --- /dev/null +++ b/example/ck_tile/10_rmsnorm2d/example_rmsnorm2d_fwd.cpp @@ -0,0 +1,165 @@ +#include "ck_tile/host.hpp" +#include "ck_tile/core.hpp" +#include "ck_tile/host/kernel_launch.hpp" +#include "ck_tile/ops/rmsnorm2d.hpp" +#include + +auto create_args(int argc, char* argv[]) +{ + ck_tile::ArgParser arg_parser; + arg_parser.insert("m", "3328", "m dimension") + .insert("n", "4096", "n dimension") + .insert("stride", "-1", "stride per row, if -1 then equal to n") + .insert("e", "1e-5", "epsilon") + .insert("v", "1", "cpu validation or not") + .insert("prec", "fp16", "precision") + .insert("warmup", "0", "cold iter") + .insert("repeat", "1", "hot iter"); + + bool result = arg_parser.parse(argc, argv); + return std::make_tuple(result, arg_parser); +} + +template +bool run(const ck_tile::ArgParser& arg_parser) +{ + ck_tile::index_t m = arg_parser.get_int("m"); + ck_tile::index_t n = arg_parser.get_int("n"); + ck_tile::index_t stride = arg_parser.get_int("stride"); + if(stride < 0) + stride = n; + float epsilon = arg_parser.get_float("e"); + std::string data_type = arg_parser.get_str("prec"); + int do_validation = arg_parser.get_int("v"); + int warmup = arg_parser.get_int("warmup"); + int repeat = arg_parser.get_int("repeat"); + + assert(stride >= n); + + using XDataType = DataType; + using YDataType = DataType; + using GammaDataType = DataType; + using InvRmsDataType = ck_tile::null_type; + + using ComputeDataType = float; + + // host verify + ck_tile::HostTensor x_host({m, n}, {stride, 1});
ck_tile::HostTensor gamma_host({n}); + + ck_tile::HostTensor y_host_ref({m, n}, {stride, 1}); + ck_tile::HostTensor y_host_dev({m, n}, {stride, 1}); + + ck_tile::HostTensor invRms_host_ref({m}); + + ck_tile::FillUniformDistribution{-.5f, .5f}(x_host); + ck_tile::FillUniformDistribution{-.5f, .5f}(gamma_host); + + ck_tile::DeviceMem x_buf(x_host.get_element_space_size_in_bytes()); + ck_tile::DeviceMem gamma_buf(gamma_host.get_element_space_size_in_bytes()); + ck_tile::DeviceMem y_buf(y_host_dev.get_element_space_size_in_bytes()); + + x_buf.ToDevice(x_host.data()); + gamma_buf.ToDevice(gamma_host.data()); + + constexpr bool kTwoPass = true; + + using BlockWarps = ck_tile::sequence<2, 2>; + using BlockTile = ck_tile::sequence<2, 128>; + using WarpTile = ck_tile::sequence<1, 64>; + using Vector = ck_tile::sequence<1, 1>; + + using Shape = ck_tile::Rmsnorm2dShape; + using Problem = ck_tile::Rmsnorm2dFwdPipelineProblem; + + using OnePassPipeline = ck_tile::Rmsnorm2dFwdPipelineOnePass; + using TwoPassPipeline = ck_tile::Rmsnorm2dFwdPipelineTwoPass; + using Pipeline = std::conditional_t; + using Kernel = ck_tile::Rmsnorm2dFwd; + + ck_tile::Rmsnorm2dFwdHostArgs args{x_buf.GetDeviceBuffer(), + gamma_buf.GetDeviceBuffer(), + y_buf.GetDeviceBuffer(), + nullptr, + epsilon, + m, + n, + stride}; + + auto kargs = Kernel::MakeKargs(args); + + const dim3 grids = Kernel::GridSize(args); + constexpr dim3 blocks = Kernel::BlockSize(); + constexpr ck_tile::index_t kBlockPerCu = 1; + auto s = ck_tile::stream_config{nullptr, true, 0, warmup, repeat}; + + ck_tile::launch_kernel( + s, ck_tile::make_kernel(Kernel{}, grids, blocks, 0, kargs)); + + bool pass = true; + + if(do_validation) + { + // reference + ck_tile::reference_rmsnorm2d_fwd( + x_host, gamma_host, y_host_ref, invRms_host_ref, epsilon); + + y_buf.FromDevice(y_host_dev.data()); + + auto [rtol, atol] = ck_tile::make_tuple(1e-3, 1e-3); + if(stride == n) + { + pass = ck_tile::check_err( + y_host_dev, y_host_ref, std::string("OUT 
Error: Incorrect results!"), rtol, atol); + } + else + { + for(int i_r = 0; i_r < m; i_r++) + { + std::vector y_host_dev_row(y_host_dev.begin() + i_r * stride, + y_host_dev.begin() + i_r * stride + n); + std::vector y_host_ref_row(y_host_ref.begin() + i_r * stride, + y_host_ref.begin() + i_r * stride + n); + pass &= ck_tile::check_err(y_host_dev_row, + y_host_ref_row, + std::string("OUT[") + std::to_string(i_r) + + std::string("] Error: Incorrect results!"), + rtol, + atol); + } + } + + std::cout << "[" << data_type << "]" + << " m:" << m << ", n:" << n << ", stride:" << stride + << ", valid:" << (pass ? "y" : "n") << std::flush << std::endl; + } + + return pass; +} + +int main(int argc, char* argv[]) +{ + auto [result, arg_parser] = create_args(argc, argv); + if(!result) + return -1; + + const std::string data_type = arg_parser.get_str("prec"); + if(data_type == "fp16") + { + return run(arg_parser) ? 0 : -2; + } + + return -3; +} diff --git a/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_api.cpp b/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_api.cpp new file mode 100644 index 000000000..f9cfe72de --- /dev/null +++ b/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_api.cpp @@ -0,0 +1,153 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include +#include "rmsnorm2d_fwd.hpp" + +template +using trait_ = rmsnorm2d_fwd_traits_; + +template +float rmsnorm2d_fwd_b16_(rmsnorm2d_fwd_traits /*t*/, + rmsnorm2d_fwd_args a, + const ck_tile::stream_config& s) +{ +#if 1 + float r = -1; + // clang-format off + // rm rn tm tn vn pd rms 2p + if(a.n <= 64) { + r = rmsnorm2d_fwd_>(s, a); + } + else if(a.n <= 128) { + if (a.n % 2 == 0) + r = rmsnorm2d_fwd_>(s, a); + else + r = rmsnorm2d_fwd_>(s, a); + } + else if(a.n <= 256) { + if (a.n % 4 == 0) + r = rmsnorm2d_fwd_>(s, a); + else if (a.n % 2 == 0) + r = rmsnorm2d_fwd_>(s, a); + else + r = rmsnorm2d_fwd_>(s, a); + } + else if(a.n <= 512) { + if (a.n % 8 == 0) + r = rmsnorm2d_fwd_>(s, a); + else if (a.n % 4 == 0) + r = rmsnorm2d_fwd_>(s, a); + else if (a.n % 2 == 0) + r = rmsnorm2d_fwd_>(s, a); + else + r = rmsnorm2d_fwd_>(s, a); + } + else if(a.n <= 768) { + if (a.n % 4 == 0) + r = rmsnorm2d_fwd_>(s, a); + else if (a.n % 2 == 0) + r = rmsnorm2d_fwd_>(s, a); + else + r = rmsnorm2d_fwd_>(s, a); + } + else if(a.n <= 1024) { + if (a.n % 8 == 0) + r = rmsnorm2d_fwd_>(s, a); + else if (a.n % 4 == 0) + r = rmsnorm2d_fwd_>(s, a); + else if (a.n % 2 == 0) + r = rmsnorm2d_fwd_>(s, a); + else + r = rmsnorm2d_fwd_>(s, a); + } + else if(a.n <= 1536) { + if (a.n % 8 == 0) + r = rmsnorm2d_fwd_>(s, a); + else if (a.n % 4 == 0) + r = rmsnorm2d_fwd_>(s, a); + else if (a.n % 2 == 0) + r = rmsnorm2d_fwd_>(s, a); + else + r = rmsnorm2d_fwd_>(s, a); + } + else if(a.n <= 2048) { + if (a.n % 8 == 0) + r = rmsnorm2d_fwd_>(s, a); + else if (a.n % 4 == 0) + r = rmsnorm2d_fwd_>(s, a); + else if (a.n % 2 == 0) + r = rmsnorm2d_fwd_>(s, a); + else + r = rmsnorm2d_fwd_>(s, a); + } + else if(a.n <= 3072) { + if (a.n % 8 == 0) + r = rmsnorm2d_fwd_>(s, a); + else if (a.n % 4 == 0) + r = rmsnorm2d_fwd_>(s, a); + else if (a.n % 2 == 0) + r = rmsnorm2d_fwd_>(s, a); + else + r = rmsnorm2d_fwd_>(s, a); + } + else if(a.n <= 4096) { + if (a.n % 8 == 0) + r = rmsnorm2d_fwd_>(s, a); + else if (a.n % 4 == 
0) + r = rmsnorm2d_fwd_>(s, a); + else if (a.n % 2 == 0) + r = rmsnorm2d_fwd_>(s, a); + else + r = rmsnorm2d_fwd_>(s, a); + } + else if(a.n > 4096) { + if (a.n % 8 == 0) + r = rmsnorm2d_fwd_>(s, a); + else if (a.n % 4 == 0) + r = rmsnorm2d_fwd_>(s, a); + else if (a.n % 2 == 0) + r = rmsnorm2d_fwd_>(s, a); + else + r = rmsnorm2d_fwd_>(s, a); + } + return r; +#else + return rmsnorm2d_fwd_>(s, a); +#endif + // clang-format on +} + +float rmsnorm2d_fwd(rmsnorm2d_fwd_traits t, rmsnorm2d_fwd_args a, const ck_tile::stream_config& s) +{ + + float r = -1; + if(t.data_type.compare("fp16") == 0) + { + return rmsnorm2d_fwd_b16_(t, a, s); + } + else if(t.data_type.compare("bf16") == 0) + { + return rmsnorm2d_fwd_b16_(t, a, s); + } + if(r < 0) + throw std::runtime_error("Without supported instances!"); + + return r; +} diff --git a/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_bf16_n1024_instance.cpp b/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_bf16_n1024_instance.cpp new file mode 100644 index 000000000..5e2a35f9e --- /dev/null +++ b/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_bf16_n1024_instance.cpp @@ -0,0 +1,22 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "rmsnorm2d_fwd_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd rms 2p +#if 0 +template float rmsnorm2d_fwd_>(const S&, A); +template float rmsnorm2d_fwd_>(const S&, A); +template float rmsnorm2d_fwd_>(const S&, A); +template float rmsnorm2d_fwd_>(const S&, A); + +template float rmsnorm2d_fwd_>(const S&, A); +#endif + +template float rmsnorm2d_fwd_>(const S&, A); +template float rmsnorm2d_fwd_>(const S&, A); +template float rmsnorm2d_fwd_>(const S&, A); +template float rmsnorm2d_fwd_>(const S&, A); +// clang-format on diff --git a/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_bf16_n1536_instance.cpp b/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_bf16_n1536_instance.cpp new file mode 100644 index 000000000..8c734806e --- /dev/null +++ b/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_bf16_n1536_instance.cpp @@ -0,0 +1,13 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "rmsnorm2d_fwd_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd rms 2p +template float rmsnorm2d_fwd_>(const S&, A); +template float rmsnorm2d_fwd_>(const S&, A); +template float rmsnorm2d_fwd_>(const S&, A); +template float rmsnorm2d_fwd_>(const S&, A); +// clang-format on diff --git a/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_bf16_n2048_instance.cpp b/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_bf16_n2048_instance.cpp new file mode 100644 index 000000000..922200143 --- /dev/null +++ b/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_bf16_n2048_instance.cpp @@ -0,0 +1,14 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "rmsnorm2d_fwd_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd rms 2p +template float rmsnorm2d_fwd_>(const S&, A); +template float rmsnorm2d_fwd_>(const S&, A); +template float rmsnorm2d_fwd_>(const S&, A); +template float rmsnorm2d_fwd_>(const S&, A); + +// clang-format on diff --git a/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_bf16_n256_instance.cpp b/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_bf16_n256_instance.cpp new file mode 100644 index 000000000..ed33c8492 --- /dev/null +++ b/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_bf16_n256_instance.cpp @@ -0,0 +1,12 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "rmsnorm2d_fwd_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd rms 2p +template float rmsnorm2d_fwd_>(const S&, A); +template float rmsnorm2d_fwd_>(const S&, A); +template float rmsnorm2d_fwd_>(const S&, A); +// clang-format on diff --git a/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_bf16_n3072_instance.cpp b/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_bf16_n3072_instance.cpp new file mode 100644 index 000000000..b753bbc34 --- /dev/null +++ b/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_bf16_n3072_instance.cpp @@ -0,0 +1,14 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "rmsnorm2d_fwd_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd rms 2p +template float rmsnorm2d_fwd_>(const S&, A); +template float rmsnorm2d_fwd_>(const S&, A); +template float rmsnorm2d_fwd_>(const S&, A); +template float rmsnorm2d_fwd_>(const S&, A); + +// clang-format on diff --git a/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_bf16_n4096_instance.cpp b/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_bf16_n4096_instance.cpp new file mode 100644 index 000000000..27cb9bdf3 --- /dev/null +++ b/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_bf16_n4096_instance.cpp @@ -0,0 +1,14 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "rmsnorm2d_fwd_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd rms 2p +template float rmsnorm2d_fwd_>(const S&, A); +template float rmsnorm2d_fwd_>(const S&, A); +template float rmsnorm2d_fwd_>(const S&, A); +template float rmsnorm2d_fwd_>(const S&, A); + +// clang-format on diff --git a/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_bf16_n4096_tp_instance.cpp b/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_bf16_n4096_tp_instance.cpp new file mode 100644 index 000000000..23afb5672 --- /dev/null +++ b/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_bf16_n4096_tp_instance.cpp @@ -0,0 +1,14 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "rmsnorm2d_fwd_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd rms 2p +template float rmsnorm2d_fwd_>(const S&, A); +template float rmsnorm2d_fwd_>(const S&, A); +template float rmsnorm2d_fwd_>(const S&, A); +template float rmsnorm2d_fwd_>(const S&, A); + +// clang-format on diff --git a/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_bf16_n512_instance.cpp b/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_bf16_n512_instance.cpp new file mode 100644 index 000000000..b428f5805 --- /dev/null +++ b/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_bf16_n512_instance.cpp @@ -0,0 +1,13 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "rmsnorm2d_fwd_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd rms 2p +template float rmsnorm2d_fwd_>(const S&, A); +template float rmsnorm2d_fwd_>(const S&, A); +template float rmsnorm2d_fwd_>(const S&, A); +template float rmsnorm2d_fwd_>(const S&, A); +// clang-format on diff --git a/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_bf16_n64_n128_instance.cpp b/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_bf16_n64_n128_instance.cpp new file mode 100644 index 000000000..300110669 --- /dev/null +++ b/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_bf16_n64_n128_instance.cpp @@ -0,0 +1,12 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "rmsnorm2d_fwd_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd rms 2p +template float rmsnorm2d_fwd_>(const S&, A); +template float rmsnorm2d_fwd_>(const S&, A); +template float rmsnorm2d_fwd_>(const S&, A); +// clang-format on diff --git a/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_bf16_n768_instance.cpp b/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_bf16_n768_instance.cpp new file mode 100644 index 000000000..e9c8d6a1d --- /dev/null +++ b/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_bf16_n768_instance.cpp @@ -0,0 +1,12 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "rmsnorm2d_fwd_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd rms 2p +template float rmsnorm2d_fwd_>(const S&, A); +template float rmsnorm2d_fwd_>(const S&, A); +template float rmsnorm2d_fwd_>(const S&, A); +// clang-format on diff --git a/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_fp16_n1024_instance.cpp b/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_fp16_n1024_instance.cpp new file mode 100644 index 000000000..15198eebe --- /dev/null +++ b/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_fp16_n1024_instance.cpp @@ -0,0 +1,22 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "rmsnorm2d_fwd_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd rms 2p +#if 0 +template float rmsnorm2d_fwd_>(const S&, A); +template float rmsnorm2d_fwd_>(const S&, A); +template float rmsnorm2d_fwd_>(const S&, A); +template float rmsnorm2d_fwd_>(const S&, A); + +template float rmsnorm2d_fwd_>(const S&, A); +#endif + +template float rmsnorm2d_fwd_>(const S&, A); +template float rmsnorm2d_fwd_>(const S&, A); +template float rmsnorm2d_fwd_>(const S&, A); +template float rmsnorm2d_fwd_>(const S&, A); +// clang-format on diff --git a/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_fp16_n1536_instance.cpp b/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_fp16_n1536_instance.cpp new file mode 100644 index 000000000..8ac85fa9b --- /dev/null +++ b/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_fp16_n1536_instance.cpp @@ -0,0 +1,13 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "rmsnorm2d_fwd_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd rms 2p +template float rmsnorm2d_fwd_>(const S&, A); +template float rmsnorm2d_fwd_>(const S&, A); +template float rmsnorm2d_fwd_>(const S&, A); +template float rmsnorm2d_fwd_>(const S&, A); +// clang-format on diff --git a/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_fp16_n2048_instance.cpp b/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_fp16_n2048_instance.cpp new file mode 100644 index 000000000..10e8fafc2 --- /dev/null +++ b/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_fp16_n2048_instance.cpp @@ -0,0 +1,14 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "rmsnorm2d_fwd_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd rms 2p +template float rmsnorm2d_fwd_>(const S&, A); +template float rmsnorm2d_fwd_>(const S&, A); +template float rmsnorm2d_fwd_>(const S&, A); +template float rmsnorm2d_fwd_>(const S&, A); + +// clang-format on diff --git a/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_fp16_n256_instance.cpp b/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_fp16_n256_instance.cpp new file mode 100644 index 000000000..4e1a80bf6 --- /dev/null +++ b/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_fp16_n256_instance.cpp @@ -0,0 +1,12 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "rmsnorm2d_fwd_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd rms 2p +template float rmsnorm2d_fwd_>(const S&, A); +template float rmsnorm2d_fwd_>(const S&, A); +template float rmsnorm2d_fwd_>(const S&, A); +// clang-format on diff --git a/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_fp16_n3072_instance.cpp b/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_fp16_n3072_instance.cpp new file mode 100644 index 000000000..45e56a92b --- /dev/null +++ b/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_fp16_n3072_instance.cpp @@ -0,0 +1,14 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "rmsnorm2d_fwd_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd rms 2p +template float rmsnorm2d_fwd_>(const S&, A); +template float rmsnorm2d_fwd_>(const S&, A); +template float rmsnorm2d_fwd_>(const S&, A); +template float rmsnorm2d_fwd_>(const S&, A); + +// clang-format on diff --git a/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_fp16_n4096_instance.cpp b/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_fp16_n4096_instance.cpp new file mode 100644 index 000000000..35401f6f8 --- /dev/null +++ b/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_fp16_n4096_instance.cpp @@ -0,0 +1,14 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "rmsnorm2d_fwd_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd rms 2p +template float rmsnorm2d_fwd_>(const S&, A); +template float rmsnorm2d_fwd_>(const S&, A); +template float rmsnorm2d_fwd_>(const S&, A); +template float rmsnorm2d_fwd_>(const S&, A); + +// clang-format on diff --git a/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_fp16_n4096_tp_instance.cpp b/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_fp16_n4096_tp_instance.cpp new file mode 100644 index 000000000..1e3700fad --- /dev/null +++ b/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_fp16_n4096_tp_instance.cpp @@ -0,0 +1,14 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "rmsnorm2d_fwd_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd rms 2p +template float rmsnorm2d_fwd_>(const S&, A); +template float rmsnorm2d_fwd_>(const S&, A); +template float rmsnorm2d_fwd_>(const S&, A); +template float rmsnorm2d_fwd_>(const S&, A); + +// clang-format on diff --git a/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_fp16_n512_instance.cpp b/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_fp16_n512_instance.cpp new file mode 100644 index 000000000..cdc4d00bd --- /dev/null +++ b/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_fp16_n512_instance.cpp @@ -0,0 +1,13 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "rmsnorm2d_fwd_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd rms 2p +template float rmsnorm2d_fwd_>(const S&, A); +template float rmsnorm2d_fwd_>(const S&, A); +template float rmsnorm2d_fwd_>(const S&, A); +template float rmsnorm2d_fwd_>(const S&, A); +// clang-format on diff --git a/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_fp16_n64_n128_instance.cpp b/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_fp16_n64_n128_instance.cpp new file mode 100644 index 000000000..ec80c2ee4 --- /dev/null +++ b/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_fp16_n64_n128_instance.cpp @@ -0,0 +1,12 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "rmsnorm2d_fwd_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd rms 2p +template float rmsnorm2d_fwd_>(const S&, A); +template float rmsnorm2d_fwd_>(const S&, A); +template float rmsnorm2d_fwd_>(const S&, A); +// clang-format on diff --git a/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_fp16_n768_instance.cpp b/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_fp16_n768_instance.cpp new file mode 100644 index 000000000..ddfc5a54e --- /dev/null +++ b/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_fp16_n768_instance.cpp @@ -0,0 +1,12 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "rmsnorm2d_fwd_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd rms 2p +template float rmsnorm2d_fwd_>(const S&, A); +template float rmsnorm2d_fwd_>(const S&, A); +template float rmsnorm2d_fwd_>(const S&, A); +// clang-format on diff --git a/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_instance_common.hpp b/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_instance_common.hpp new file mode 100644 index 000000000..8f6ff84b6 --- /dev/null +++ b/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_instance_common.hpp @@ -0,0 +1,65 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include +#include "rmsnorm2d_fwd.hpp" +#include + +#pragma once + +using S = ck_tile::stream_config; +using A = rmsnorm2d_fwd_args; + +template +using trait_ = rmsnorm2d_fwd_traits_; + +template +float rmsnorm2d_fwd_(const S& s, A a) +{ + using DataType = typename Traits_::DataType; + + using PipelineProblem = + ck_tile::Rmsnorm2dFwdPipelineProblem::XDataType, + typename RmsnormTypeConfig::GammaDataType, + typename RmsnormTypeConfig::ComputeDataType, + typename RmsnormTypeConfig::YDataType, + typename RmsnormTypeConfig::InvRmsDataType, + typename Traits_::Shape, + Traits_::kPadN, + Traits_::kSaveInvRms, + Traits_::kTwoPass>; + + using OnePassPipeline = ck_tile::Rmsnorm2dFwdPipelineOnePass; + using TwoPassPipeline = ck_tile::Rmsnorm2dFwdPipelineTwoPass; + using Pipeline = std::conditional_t; + + using Kernel = ck_tile::Rmsnorm2dFwd; + + const dim3 grids = Kernel::GridSize(a); + constexpr dim3 blocks = Kernel::BlockSize(); + constexpr ck_tile::index_t kBlockPerCu = 1; + + auto kargs = Kernel::MakeKargs(a); + if(s.log_level_ > 0) + std::cout << ", " << Kernel::GetName() << std::flush; + + return ck_tile::launch_kernel( + s, ck_tile::make_kernel(Kernel{}, grids, blocks, 0, kargs)); +} diff --git a/example/ck_tile/10_rmsnorm2d/rmsnorm2d_fwd.cpp b/example/ck_tile/10_rmsnorm2d/rmsnorm2d_fwd.cpp new file mode 100644 index 000000000..698a8b43e --- /dev/null +++ b/example/ck_tile/10_rmsnorm2d/rmsnorm2d_fwd.cpp @@ -0,0 +1,179 @@ +#include "ck_tile/host.hpp" +#include "rmsnorm2d_fwd.hpp" +#include + +// different threshold for different dtype +template +auto get_elimit() +{ + double rtol = 1e-2; + double atol = 1e-2; + return ck_tile::make_tuple(rtol, atol); +} + +template <> +auto get_elimit() +{ + double rtol = 1e-2; + double atol = 1e-2; + return ck_tile::make_tuple(rtol, atol); +} + +auto create_args(int argc, char* argv[]) +{ + ck_tile::ArgParser arg_parser; + arg_parser.insert("m", "3328", "m dimension") + .insert("n", "4096", "n dimension") + .insert("stride", "-1", 
"stride per row, if -1 then equal to n") + .insert("e", "1e-5", "epsilon") + .insert("save_rms", "0", "save rms(invrms) or not. set to 1 in training case") + .insert("v", "1", "cpu validation or not") + .insert("kname", "1", "print kernel name or not") + .insert("prec", "fp16", "precision") + .insert("warmup", "5", "cold iter") + .insert("repeat", "20", "hot iter"); + + bool result = arg_parser.parse(argc, argv); + return std::make_tuple(result, arg_parser); +} + +template +bool run(const ck_tile::ArgParser& arg_parser) +{ + ck_tile::index_t m = arg_parser.get_int("m"); + ck_tile::index_t n = arg_parser.get_int("n"); + ck_tile::index_t stride = arg_parser.get_int("stride"); + if(stride < 0) + stride = n; + float epsilon = arg_parser.get_float("e"); + std::string data_type = arg_parser.get_str("prec"); + int kname = arg_parser.get_int("kname"); + int do_validation = arg_parser.get_int("v"); + int warmup = arg_parser.get_int("warmup"); + int repeat = arg_parser.get_int("repeat"); + + assert(stride >= n); + + using TypeConfig = RmsnormTypeConfig; + + using XDataType = typename TypeConfig::XDataType; + using YDataType = typename TypeConfig::YDataType; + using GammaDataType = typename TypeConfig::GammaDataType; + + using InvRmsDataType = + std::conditional_t; + + using ComputeDataType = typename TypeConfig::ComputeDataType; + + // host verify + ck_tile::HostTensor x_host({m, n}, {stride, 1}); + ck_tile::HostTensor gamma_host({n}); + + ck_tile::HostTensor y_host_ref({m, n}, {stride, 1}); + ck_tile::HostTensor y_host_dev({m, n}, {stride, 1}); + + ck_tile::HostTensor invRms_host_ref({m}); + + ck_tile::FillUniformDistribution{-.5f, .5f}(x_host); + ck_tile::FillUniformDistribution{-.5f, .5f}(gamma_host); + + ck_tile::DeviceMem x_buf(x_host.get_element_space_size_in_bytes()); + ck_tile::DeviceMem gamma_buf(gamma_host.get_element_space_size_in_bytes()); + ck_tile::DeviceMem y_buf(y_host_dev.get_element_space_size_in_bytes()); + + x_buf.ToDevice(x_host.data()); + 
gamma_buf.ToDevice(gamma_host.data()); + + std::cout << "[" << data_type << "]" + << " m:" << m << ", n:" << n << ", stride:" << stride << std::flush; + + rmsnorm2d_fwd_traits traits{data_type, SaveRms}; + + rmsnorm2d_fwd_args args{x_buf.GetDeviceBuffer(), + gamma_buf.GetDeviceBuffer(), + y_buf.GetDeviceBuffer(), + nullptr, + epsilon, + m, + n, + stride}; + + float ave_time = rmsnorm2d_fwd( + traits, args, ck_tile::stream_config{nullptr, true, kname ? 1 : 0, warmup, repeat}); + + std::size_t num_byte = + sizeof(XDataType) * m * n + sizeof(GammaDataType) * n + sizeof(YDataType) * m * n; + + float gb_per_sec = num_byte / 1.E6 / ave_time; + std::cout << ", " << ave_time * 1.E3 << " us, " << gb_per_sec << " GB/s" << std::flush; + + bool pass = true; + + if(do_validation) + { + // reference + ck_tile::reference_rmsnorm2d_fwd( + x_host, gamma_host, y_host_ref, invRms_host_ref, epsilon); + + y_buf.FromDevice(y_host_dev.data()); + + auto [rtol, atol] = get_elimit(); + if(stride == n) + { + pass = ck_tile::check_err( + y_host_dev, y_host_ref, std::string("OUT Error: Incorrect results!"), rtol, atol); + } + else + { + for(int i_r = 0; i_r < m; i_r++) + { + std::vector y_host_dev_row(y_host_dev.begin() + i_r * stride, + y_host_dev.begin() + i_r * stride + n); + std::vector y_host_ref_row(y_host_ref.begin() + i_r * stride, + y_host_ref.begin() + i_r * stride + n); + pass &= ck_tile::check_err(y_host_dev_row, + y_host_ref_row, + std::string("OUT[") + std::to_string(i_r) + + std::string("] Error: Incorrect results!"), + rtol, + atol); + } + } + + std::cout << ", valid:" << (pass ? "y" : "n") << std::flush << std::endl; + } + + return pass; +} + +int main(int argc, char* argv[]) +{ + auto [result, arg_parser] = create_args(argc, argv); + if(!result) + return -1; + + const std::string data_type = arg_parser.get_str("prec"); + int save_rms = arg_parser.get_int("save_rms"); + if(data_type == "fp16" && save_rms) + { + return run(arg_parser) ? 
0 : -2; + } + else if(data_type == "fp16" && !save_rms) + { + return run(arg_parser) ? 0 : -2; + } + else if(data_type == "bf16" && save_rms) + { + return run(arg_parser) ? 0 : -2; + } + else if(data_type == "bf16" && !save_rms) + { + return run(arg_parser) ? 0 : -2; + } + + return -3; +} diff --git a/example/ck_tile/10_rmsnorm2d/rmsnorm2d_fwd.hpp b/example/ck_tile/10_rmsnorm2d/rmsnorm2d_fwd.hpp new file mode 100644 index 000000000..756ecb2c4 --- /dev/null +++ b/example/ck_tile/10_rmsnorm2d/rmsnorm2d_fwd.hpp @@ -0,0 +1,117 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck_tile/core.hpp" +#include "ck_tile/host/kernel_launch.hpp" +#include "ck_tile/ops/rmsnorm2d.hpp" +#include + +template +struct RmsnormTypeConfig; + +template <> +struct RmsnormTypeConfig +{ + using XDataType = ck_tile::half_t; + using YDataType = ck_tile::half_t; + using GammaDataType = ck_tile::half_t; + using InvRmsDataType = ck_tile::half_t; + using ComputeDataType = float; +}; + +template <> +struct RmsnormTypeConfig +{ + using XDataType = ck_tile::bf16_t; + using YDataType = ck_tile::bf16_t; + using GammaDataType = ck_tile::bf16_t; + using InvRmsDataType = ck_tile::bf16_t; + using ComputeDataType = float; +}; + +// runtime args +struct rmsnorm2d_fwd_args : public ck_tile::Rmsnorm2dFwdHostArgs +{ +}; + +// this is used to pattern-match internal kernel implementation, not to instantiate kernel +template +struct rmsnorm2d_fwd_traits_ +{ + using DataType = ck_tile::remove_cvref_t; + + static constexpr bool is_warp_per_row = ThreadPerBlock_N_ <= warpSize; + static_assert((ThreadPerBlock_M_ * ThreadPerBlock_N_) % warpSize == 0); + static constexpr ck_tile::index_t total_warps = + (ThreadPerBlock_M_ * ThreadPerBlock_N_) / warpSize; + + // num of warps along m + static constexpr ck_tile::index_t BlockWarps_M = []() { + if constexpr(is_warp_per_row) + { + static_assert(warpSize % ThreadPerBlock_N_ == 0); +
return total_warps * (warpSize / ThreadPerBlock_N_); + } + else + { + // static_assert(warpSize % ThreadPerBlock_M_ == 0); + return total_warps / (ThreadPerBlock_N_ / warpSize); + } + }(); + + // num of warps along n + static constexpr ck_tile::index_t BlockWarps_N = []() { + if constexpr(is_warp_per_row) + { + static_assert(warpSize % ThreadPerBlock_N_ == 0); + return 1; + } + else + { + static_assert(ThreadPerBlock_N_ % warpSize == 0); + return ThreadPerBlock_N_ / warpSize; + } + }(); + + static constexpr ck_tile::index_t Repeat_M = Repeat_M_; + static constexpr ck_tile::index_t Repeat_N = Repeat_N_; + + static constexpr ck_tile::index_t Block_M = Repeat_M_ * ThreadPerBlock_M_; + static constexpr ck_tile::index_t Block_N = Repeat_N_ * ThreadPerBlock_N_ * Vector_N_; + + static constexpr ck_tile::index_t Warp_M = ThreadPerBlock_M_ / BlockWarps_M; + static constexpr ck_tile::index_t Warp_N = ThreadPerBlock_N_ / BlockWarps_N * Vector_N_; + + using BlockTile = ck_tile::sequence; + using BlockWarps = ck_tile::sequence; + using WarpTile = ck_tile::sequence; + using Vector = ck_tile::sequence<1, Vector_N_>; + + using Shape = ck_tile::Rmsnorm2dShape; + + static constexpr bool kPadN = kPadN_; + static constexpr bool kSaveInvRms = kSaveInvRms_; + static constexpr bool kTwoPass = kTwoPass_; +}; + +template +float rmsnorm2d_fwd_(const ck_tile::stream_config& s, rmsnorm2d_fwd_args a); + +// This is the public API, will be generated by script +struct rmsnorm2d_fwd_traits +{ + std::string data_type; + bool save_rms; +}; + +float rmsnorm2d_fwd(rmsnorm2d_fwd_traits, rmsnorm2d_fwd_args, const ck_tile::stream_config&); diff --git a/example/ck_tile/10_rmsnorm2d/script/perf_test.sh b/example/ck_tile/10_rmsnorm2d/script/perf_test.sh new file mode 100755 index 000000000..f3cfcc4b8 --- /dev/null +++ b/example/ck_tile/10_rmsnorm2d/script/perf_test.sh @@ -0,0 +1,38 @@ + +# run from top of ck folder +EXE=build/bin/tile_rmsnorm2d_fwd + +$EXE -m=1 -n=1 -e=1e-12 -v=1 -prec=bf16 -repeat=1000 
+$EXE -m=700 -n=80 -e=1e-12 -v=1 -prec=bf16 -repeat=1000 +$EXE -m=700 -n=128 -e=1e-12 -v=1 -prec=bf16 -repeat=1000 +$EXE -m=700 -n=144 -e=1e-12 -v=1 -prec=bf16 -repeat=1000 +$EXE -m=700 -n=168 -e=1e-12 -v=1 -prec=bf16 -repeat=1000 +$EXE -m=700 -n=184 -e=1e-12 -v=1 -prec=bf16 -repeat=1000 +$EXE -m=700 -n=256 -e=1e-12 -v=1 -prec=bf16 -repeat=1000 +$EXE -m=700 -n=288 -e=1e-12 -v=1 -prec=bf16 -repeat=1000 +$EXE -m=700 -n=344 -e=1e-12 -v=1 -prec=bf16 -repeat=1000 +$EXE -m=700 -n=376 -e=1e-12 -v=1 -prec=bf16 -repeat=1000 +$EXE -m=700 -n=448 -e=1e-12 -v=1 -prec=bf16 -repeat=1000 +$EXE -m=700 -n=512 -e=1e-12 -v=1 -prec=bf16 -repeat=1000 +$EXE -m=700 -n=924 -e=1e-12 -v=1 -prec=bf16 -repeat=1000 +$EXE -m=700 -n=1024 -e=1e-12 -v=1 -prec=bf16 -repeat=1000 +$EXE -m=700 -n=1078 -e=1e-12 -v=1 -prec=bf16 -repeat=1000 +$EXE -m=700 -n=1996 -e=1e-12 -v=1 -prec=bf16 -repeat=1000 +$EXE -m=700 -n=4080 -e=1e-12 -v=1 -prec=bf16 -repeat=1000 + +$EXE -m=700 -n=80 -e=1e-12 -v=1 -prec=fp16 -repeat=1000 +$EXE -m=700 -n=128 -e=1e-12 -v=1 -prec=fp16 -repeat=1000 +$EXE -m=700 -n=144 -e=1e-12 -v=1 -prec=fp16 -repeat=1000 +$EXE -m=700 -n=168 -e=1e-12 -v=1 -prec=fp16 -repeat=1000 +$EXE -m=700 -n=184 -e=1e-12 -v=1 -prec=fp16 -repeat=1000 +$EXE -m=700 -n=256 -e=1e-12 -v=1 -prec=fp16 -repeat=1000 +$EXE -m=700 -n=288 -e=1e-12 -v=1 -prec=fp16 -repeat=1000 +$EXE -m=700 -n=344 -e=1e-12 -v=1 -prec=fp16 -repeat=1000 +$EXE -m=700 -n=376 -e=1e-12 -v=1 -prec=fp16 -repeat=1000 +$EXE -m=700 -n=448 -e=1e-12 -v=1 -prec=fp16 -repeat=1000 +$EXE -m=700 -n=512 -e=1e-12 -v=1 -prec=fp16 -repeat=1000 +$EXE -m=700 -n=924 -e=1e-12 -v=1 -prec=fp16 -repeat=1000 +$EXE -m=700 -n=1024 -e=1e-12 -v=1 -prec=fp16 -repeat=1000 +$EXE -m=700 -n=1078 -e=1e-12 -v=1 -prec=fp16 -repeat=1000 +$EXE -m=700 -n=1996 -e=1e-12 -v=1 -prec=fp16 -repeat=1000 +$EXE -m=700 -n=4080 -e=1e-12 -v=1 -prec=fp16 -repeat=1000 \ No newline at end of file diff --git a/example/ck_tile/10_rmsnorm2d/script/smoke_test.sh 
b/example/ck_tile/10_rmsnorm2d/script/smoke_test.sh new file mode 100755 index 000000000..6ec5e846c --- /dev/null +++ b/example/ck_tile/10_rmsnorm2d/script/smoke_test.sh @@ -0,0 +1,31 @@ +#!/bin/sh +# call from top of CK folder +EXE=./build/bin/tile_rmsnorm2d_fwd + +for pr_i in "fp16" "bf16" ; do +$EXE -prec=$pr_i -m=99 -n=13 +$EXE -prec=$pr_i -m=17 -n=16 +$EXE -prec=$pr_i -m=1 -n=100 +$EXE -prec=$pr_i -m=4 -n=128 +$EXE -prec=$pr_i -m=80 -n=127 +$EXE -prec=$pr_i -m=22 -n=255 -stride=256 +$EXE -prec=$pr_i -m=7 -n=599 +$EXE -prec=$pr_i -m=19 -n=512 +$EXE -prec=$pr_i -m=33 -n=313 -stride=1000 +$EXE -prec=$pr_i -m=11 -n=510 +$EXE -prec=$pr_i -m=171 -n=676 -stride=818 +$EXE -prec=$pr_i -m=91 -n=636 +$EXE -prec=$pr_i -m=12 -n=768 -stride=800 +$EXE -prec=$pr_i -m=100 -n=766 -stride=812 +$EXE -prec=$pr_i -m=31 -n=1024 +$EXE -prec=$pr_i -m=64 -n=1000 -stride=1004 +$EXE -prec=$pr_i -m=8 -n=1501 +$EXE -prec=$pr_i -m=3 -n=1826 +$EXE -prec=$pr_i -m=5 -n=2040 +$EXE -prec=$pr_i -m=7 -n=2734 +$EXE -prec=$pr_i -m=1 -n=3182 +$EXE -prec=$pr_i -m=9 -n=4096 +$EXE -prec=$pr_i -m=3 -n=8192 +$EXE -prec=$pr_i -m=1 -n=10547 +$EXE -prec=$pr_i -m=3 -n=17134 +done diff --git a/example/ck_tile/11_add_rmsnorm2d_rdquant/CMakeLists.txt b/example/ck_tile/11_add_rmsnorm2d_rdquant/CMakeLists.txt new file mode 100644 index 000000000..6b0c3cef7 --- /dev/null +++ b/example/ck_tile/11_add_rmsnorm2d_rdquant/CMakeLists.txt @@ -0,0 +1,25 @@ +set(TILE_ADD_RMSNORM2D_RDQUANT_FWD "tile_add_rmsnorm2d_rdquant_fwd") +# not using add_example_executable() to add this target, since we don't want this to have +# to be included in "make all/install/check" +message("adding ${TILE_ADD_RMSNORM2D_RDQUANT_FWD}") +file(GLOB INSTANCE_SRCS instances/*.cpp) +add_executable(${TILE_ADD_RMSNORM2D_RDQUANT_FWD} EXCLUDE_FROM_ALL add_rmsnorm2d_rdquant_fwd.cpp) +target_include_directories(${TILE_ADD_RMSNORM2D_RDQUANT_FWD} PRIVATE ${CMAKE_CURRENT_LIST_DIR}) +target_sources(${TILE_ADD_RMSNORM2D_RDQUANT_FWD} PRIVATE ${INSTANCE_SRCS}) + 
+set(TILE_ADD_RMSNORM2D_RDQUANT_FWD_COMPILE_OPTIONS) + +# NOTE: we turn off undefined-func-template to let source compile without explicit declare function specializations +list(APPEND TILE_ADD_RMSNORM2D_RDQUANT_FWD_COMPILE_OPTIONS -Wno-undefined-func-template -Wno-float-equal) + +target_compile_options(${TILE_ADD_RMSNORM2D_RDQUANT_FWD} PRIVATE ${TILE_ADD_RMSNORM2D_RDQUANT_FWD_COMPILE_OPTIONS}) + +set(EXAMPLE_ADD_RMSNORM2D_RDQUANT_FWD "tile_example_add_rmsnorm2d_rdquant_fwd") +add_executable(${EXAMPLE_ADD_RMSNORM2D_RDQUANT_FWD} EXCLUDE_FROM_ALL example_add_rmsnorm2d_rdquant_fwd.cpp) +target_compile_options(${EXAMPLE_ADD_RMSNORM2D_RDQUANT_FWD} PRIVATE ${TILE_ADD_RMSNORM2D_RDQUANT_FWD_COMPILE_OPTIONS}) + +# TODO: we have to turn off this global prop, otherwise the progress bar generated +# by cmake will print too many files, execvp: /bin/sh: Argument list too long +# however, this property may affect global +# TODO: consider codegen a makefile by us +set_property(GLOBAL PROPERTY RULE_MESSAGES OFF) diff --git a/example/ck_tile/11_add_rmsnorm2d_rdquant/README.md b/example/ck_tile/11_add_rmsnorm2d_rdquant/README.md new file mode 100644 index 000000000..960369b78 --- /dev/null +++ b/example/ck_tile/11_add_rmsnorm2d_rdquant/README.md @@ -0,0 +1,22 @@ +# Add + Rmsnorm2D + rowwise dynamic quantization forward + +This folder contains example for add + Rmsnorm2D + rowwise dynamic quantization forward using ck_tile tile-programming implementation. Rdquant is short for rowwise dynamic quantization here. + +## build +``` +# in the root of ck_tile +mkdir build && cd build +sh ../script/cmake-ck-dev.sh ../ # you can replace this to gfx90a, gfx942... 
+make tile_add_rmsnorm2d_rdquant_fwd -j +``` +This will result in an executable `build/bin/tile_add_rmsnorm2d_rdquant_fwd` + +## cmdline +``` +args: + -m m dimension (default:3328) + -n n dimension (default:4096) + -e epsilon (default:1e-5) + -v cpu validation or not (default:1) + -prec precision (default:fp16) +``` diff --git a/example/ck_tile/11_add_rmsnorm2d_rdquant/add_rmsnorm2d_rdquant_fwd.cpp b/example/ck_tile/11_add_rmsnorm2d_rdquant/add_rmsnorm2d_rdquant_fwd.cpp new file mode 100644 index 000000000..43bc9a6cf --- /dev/null +++ b/example/ck_tile/11_add_rmsnorm2d_rdquant/add_rmsnorm2d_rdquant_fwd.cpp @@ -0,0 +1,279 @@ +#include "ck_tile/host.hpp" +#include "add_rmsnorm2d_rdquant_fwd.hpp" +#include + +// different threshold for different dtype +template +auto get_elimit() +{ + double rtol = 1e-2; + double atol = 1e-2; + return ck_tile::make_tuple(rtol, atol); +} + +template <> +auto get_elimit() +{ + double rtol = 1e-2; + double atol = 1e-2; + return ck_tile::make_tuple(rtol, atol); +} + +template <> +auto get_elimit() +{ + // due to rounding, int8 quantization might have 1 abs error + double rtol = 1; + double atol = 1; + return ck_tile::make_tuple(rtol, atol); +} + +auto create_args(int argc, char* argv[]) +{ + ck_tile::ArgParser arg_parser; + arg_parser.insert("m", "3328", "m dimension") + .insert("n", "4096", "n dimension") + .insert("stride", "-1", "stride per row, if -1 then equal to n") + .insert("e", "1e-5", "epsilon") + .insert("save_x", "1", "save rms(invrms) or not. 
set to 1 in training case") + .insert("v", "1", "cpu validation or not") + .insert("kname", "1", "print kernel name or not") + .insert("prec", "fp16", "precision") + .insert("warmup", "5", "cold iter") + .insert("repeat", "20", "hot iter"); + + bool result = arg_parser.parse(argc, argv); + return std::make_tuple(result, arg_parser); +} + +template +bool run(const ck_tile::ArgParser& arg_parser) +{ + ck_tile::index_t m = arg_parser.get_int("m"); + ck_tile::index_t n = arg_parser.get_int("n"); + ck_tile::index_t stride = arg_parser.get_int("stride"); + if(stride < 0) + stride = n; + float epsilon = arg_parser.get_float("e"); + std::string data_type = arg_parser.get_str("prec"); + int kname = arg_parser.get_int("kname"); + int do_validation = arg_parser.get_int("v"); + int warmup = arg_parser.get_int("warmup"); + int repeat = arg_parser.get_int("repeat"); + + assert(stride >= n); + + using TypeConfig = AddRmsnormRdquantTypeConfig; + + using ADataType = typename TypeConfig::ADataType; + using BDataType = typename TypeConfig::BDataType; + using GammaDataType = typename TypeConfig::GammaDataType; + using XDataType = typename TypeConfig::XDataType; + using YScaleDataType = typename TypeConfig::YScaleDataType; + using QYDataType = typename TypeConfig::QYDataType; + using ComputeDataType = float; + + // host verify + ck_tile::HostTensor a_host({m, n}, {stride, 1}); + ck_tile::HostTensor b_host({m, n}, {stride, 1}); + ck_tile::HostTensor gamma_host({n}); + + ck_tile::HostTensor x_host_ref({m, n}, {stride, 1}); + ck_tile::HostTensor x_host_dev({m, n}, {stride, 1}); + + ck_tile::HostTensor yscale_host_ref({m}, {1}); + ck_tile::HostTensor yscale_host_dev({m}, {1}); + + ck_tile::HostTensor qy_host_ref({m, n}, {stride, 1}); + ck_tile::HostTensor qy_host_dev({m, n}, {stride, 1}); + + ck_tile::FillUniformDistribution{-.5f, .5f}(a_host); + ck_tile::FillUniformDistribution{-.5f, .5f}(b_host); + ck_tile::FillUniformDistribution{-.5f, .5f}(gamma_host); + + ck_tile::DeviceMem 
a_buf(a_host.get_element_space_size_in_bytes()); + ck_tile::DeviceMem b_buf(b_host.get_element_space_size_in_bytes()); + ck_tile::DeviceMem gamma_buf(gamma_host.get_element_space_size_in_bytes()); + ck_tile::DeviceMem x_buf(x_host_dev.get_element_space_size_in_bytes()); + ck_tile::DeviceMem yscale_buf(yscale_host_dev.get_element_space_size_in_bytes()); + ck_tile::DeviceMem qy_buf(qy_host_dev.get_element_space_size_in_bytes()); + + a_buf.ToDevice(a_host.data()); + b_buf.ToDevice(b_host.data()); + gamma_buf.ToDevice(gamma_host.data()); + + std::cout << "[" << data_type << "]" + << " m:" << m << ", n:" << n << ", stride:" << stride << std::flush; + + add_rmsnorm2d_rdquant_fwd_traits traits{data_type, SaveX}; + + add_rmsnorm2d_rdquant_fwd_args args{a_buf.GetDeviceBuffer(), + b_buf.GetDeviceBuffer(), + gamma_buf.GetDeviceBuffer(), + x_buf.GetDeviceBuffer(), + yscale_buf.GetDeviceBuffer(), + qy_buf.GetDeviceBuffer(), + epsilon, + m, + n, + stride}; + + float ave_time = add_rmsnorm2d_rdquant_fwd( + traits, args, ck_tile::stream_config{nullptr, true, kname ? 
1 : 0, warmup, repeat}); + + std::size_t num_byte = sizeof(ADataType) * m * n + sizeof(BDataType) * m * n + + sizeof(GammaDataType) * n + sizeof(YScaleDataType) * m + + sizeof(QYDataType) * m * n; + + if constexpr(SaveX) + num_byte += sizeof(XDataType) * m * n; + + float gb_per_sec = num_byte / 1.E6 / ave_time; + std::cout << ", " << ave_time * 1.E3 << " us, " << gb_per_sec << " GB/s" << std::flush; + + bool pass = true; + + if(do_validation) + { + using YDataType = ComputeDataType; + using InvRmsDataType = DataType; + + // Add + { + auto op = [](const auto& v0, const auto& v1) { return v0 + v1; }; + ck_tile::reference_binary_elementwise( + a_host, b_host, x_host_ref, op); + + x_buf.FromDevice(x_host_dev.data()); + + auto [rtol, atol] = get_elimit(); + if(stride == n) + { + pass = ck_tile::check_err( + x_host_dev, x_host_ref, std::string("x Error: Incorrect results!"), rtol, atol); + } + else + { + for(int i_r = 0; i_r < m; i_r++) + { + std::vector x_host_dev_row(x_host_dev.begin() + i_r * stride, + x_host_dev.begin() + i_r * stride + n); + std::vector x_host_ref_row(x_host_ref.begin() + i_r * stride, + x_host_ref.begin() + i_r * stride + n); + pass &= ck_tile::check_err(x_host_dev_row, + x_host_ref_row, + std::string("x[") + std::to_string(i_r) + + std::string("] Error: Incorrect results!"), + rtol, + atol); + } + } + } + + ck_tile::HostTensor y_host({m, n}); + // Rmsnorm2d + { + ck_tile::HostTensor invRms_host_ref({m}); + + // CAUSION: kernel use ComputeDataType version of x, but we use XDataType here for + // simplicity + ck_tile::reference_rmsnorm2d_fwd( + x_host_ref, gamma_host, y_host, invRms_host_ref, epsilon); + } + + // yscale + { + ck_tile::HostTensor y_rowwise_amax_host({m}); + + using ReduceAmax = ck_tile::ReduceOp::AbsMax; + ck_tile::reference_reduce( + y_host, y_rowwise_amax_host, ReduceAmax{}); + + auto op = [](const auto& v0) { + return v0 / + ck_tile::type_convert(ck_tile::numeric::max()); + }; + ck_tile::reference_unary_elementwise( + 
y_rowwise_amax_host, yscale_host_ref, op); + + yscale_buf.FromDevice(yscale_host_dev.mData.data()); + + auto [rtol, atol] = get_elimit(); + pass &= ck_tile::check_err(yscale_host_dev, + yscale_host_ref, + std::string("yscale Error: Incorrect results!"), + rtol, + atol); + } + + // rowwise quantization: AND the result into pass so earlier x / yscale failures are not overwritten + { + ck_tile::reference_rowwise_quantization2d( + y_host, yscale_host_ref, qy_host_ref); + + qy_buf.FromDevice(qy_host_dev.data()); + auto [rtol, atol] = get_elimit(); + + if(stride == n) + { + pass &= ck_tile::check_err(qy_host_dev, + qy_host_ref, + std::string("qy Error: Incorrect results!"), + rtol, + atol); + } + else + { + for(int i_r = 0; i_r < m; i_r++) + { + std::vector qy_host_dev_row(qy_host_dev.begin() + i_r * stride, + qy_host_dev.begin() + i_r * stride + n); + std::vector qy_host_ref_row(qy_host_ref.begin() + i_r * stride, + qy_host_ref.begin() + i_r * stride + n); + pass &= ck_tile::check_err(qy_host_dev_row, + qy_host_ref_row, + std::string("qy[") + std::to_string(i_r) + + std::string("] Error: Incorrect results!"), + rtol, + atol); + } + } + } + + std::cout << ", valid:" << (pass ? "y" : "n") << std::flush << std::endl; + } + + return pass; +} + +int main(int argc, char* argv[]) +{ + auto [result, arg_parser] = create_args(argc, argv); + if(!result) + return -1; + + const std::string data_type = arg_parser.get_str("prec"); + int save_x = arg_parser.get_int("save_x"); + if(data_type == "fp16" && save_x) + { + return run(arg_parser) ? 0 : -2; + } + else if(data_type == "fp16" && !save_x) + { + return run(arg_parser) ? 0 : -2; + } + else if(data_type == "bf16" && save_x) + { + return run(arg_parser) ? 0 : -2; + } + else if(data_type == "bf16" && !save_x) + { + return run(arg_parser) ? 
0 : -2; + } + + return -3; +} diff --git a/example/ck_tile/11_add_rmsnorm2d_rdquant/add_rmsnorm2d_rdquant_fwd.hpp b/example/ck_tile/11_add_rmsnorm2d_rdquant/add_rmsnorm2d_rdquant_fwd.hpp new file mode 100644 index 000000000..bf70d9d23 --- /dev/null +++ b/example/ck_tile/11_add_rmsnorm2d_rdquant/add_rmsnorm2d_rdquant_fwd.hpp @@ -0,0 +1,123 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck_tile/core.hpp" +#include "ck_tile/host/kernel_launch.hpp" +#include "ck_tile/ops/add_rmsnorm2d_rdquant.hpp" +#include + +template +struct AddRmsnormRdquantTypeConfig; + +template <> +struct AddRmsnormRdquantTypeConfig +{ + using ADataType = ck_tile::half_t; + using BDataType = ck_tile::half_t; + using GammaDataType = ck_tile::half_t; + using XDataType = ck_tile::half_t; + using YScaleDataType = ck_tile::half_t; + using QYDataType = ck_tile::int8_t; + using ComputeDataType = float; +}; + +template <> +struct AddRmsnormRdquantTypeConfig +{ + using ADataType = ck_tile::bf16_t; + using BDataType = ck_tile::bf16_t; + using GammaDataType = ck_tile::bf16_t; + using XDataType = ck_tile::bf16_t; + using YScaleDataType = ck_tile::bf16_t; + using QYDataType = ck_tile::int8_t; + using ComputeDataType = float; +}; + +// runtime args +struct add_rmsnorm2d_rdquant_fwd_args : public ck_tile::AddRmsnorm2dRdquantFwdHostArgs +{ +}; + +// this is used to pattern-match internl kernel implementation, not to instantiate kernel +template +struct add_rmsnorm2d_rdquant_fwd_traits_ +{ + using DataType = ck_tile::remove_cvref_t; + + static constexpr bool is_warp_per_row = ThreadPerBlock_N_ <= warpSize; + static_assert((ThreadPerBlock_M_ * ThreadPerBlock_N_) % warpSize == 0); + static constexpr ck_tile::index_t total_warps = + (ThreadPerBlock_M_ * ThreadPerBlock_N_) / warpSize; + + // num of warps along m + static constexpr ck_tile::index_t BlockWarps_M = []() { + if constexpr(is_warp_per_row) + { + 
static_assert(warpSize % ThreadPerBlock_N_ == 0); + return total_warps * (warpSize / ThreadPerBlock_N_); + } + else + { + // static_assert(warpSize % ThreadPerBlock_M_ == 0); + return total_warps / (ThreadPerBlock_N_ / warpSize); + } + }(); + + // num of warps along n + static constexpr ck_tile::index_t BlockWarps_N = []() { + if constexpr(is_warp_per_row) + { + static_assert(warpSize % ThreadPerBlock_N_ == 0); + return 1; + } + else + { + static_assert(ThreadPerBlock_N_ % warpSize == 0); + return ThreadPerBlock_N_ / warpSize; + } + }(); + + static constexpr ck_tile::index_t Repeat_M = Repeat_M_; + static constexpr ck_tile::index_t Repeat_N = Repeat_N_; + + static constexpr ck_tile::index_t Block_M = Repeat_M_ * ThreadPerBlock_M_; + static constexpr ck_tile::index_t Block_N = Repeat_N_ * ThreadPerBlock_N_ * Vector_N_; + + static constexpr ck_tile::index_t Warp_M = ThreadPerBlock_M_ / BlockWarps_M; + static constexpr ck_tile::index_t Warp_N = ThreadPerBlock_N_ / BlockWarps_N * Vector_N_; + + using BlockTile = ck_tile::sequence; + using BlockWarps = ck_tile::sequence; + using WarpTile = ck_tile::sequence; + using Vector = ck_tile::sequence<1, Vector_N_>; + + using Shape = ck_tile::AddRmsnorm2dRdquantShape; + + static constexpr bool kPadN = kPadN_; + static constexpr bool kSaveX = kSaveX_; + static constexpr bool kThreePass = kThreePass_; +}; + +template +float add_rmsnorm2d_rdquant_fwd_(const ck_tile::stream_config& s, add_rmsnorm2d_rdquant_fwd_args a); + +// This is the public API, will be generated by script +struct add_rmsnorm2d_rdquant_fwd_traits +{ + std::string data_type; + bool save_x; +}; + +float add_rmsnorm2d_rdquant_fwd(add_rmsnorm2d_rdquant_fwd_traits, + add_rmsnorm2d_rdquant_fwd_args, + const ck_tile::stream_config&); diff --git a/example/ck_tile/11_add_rmsnorm2d_rdquant/example_add_rmsnorm2d_rdquant_fwd.cpp b/example/ck_tile/11_add_rmsnorm2d_rdquant/example_add_rmsnorm2d_rdquant_fwd.cpp new file mode 100644 index 000000000..40fabf7f5 --- /dev/null +++ 
b/example/ck_tile/11_add_rmsnorm2d_rdquant/example_add_rmsnorm2d_rdquant_fwd.cpp @@ -0,0 +1,280 @@ +#include "ck_tile/host.hpp" +#include "ck_tile/core.hpp" +#include "ck_tile/host/kernel_launch.hpp" +#include "ck_tile/ops/add_rmsnorm2d_rdquant.hpp" +#include + +// different threshold for different dtype +template +auto get_elimit() +{ + double rtol = 1e-2; + double atol = 1e-2; + return ck_tile::make_tuple(rtol, atol); +} + +template <> +auto get_elimit() +{ + double rtol = 1e-2; + double atol = 1e-2; + return ck_tile::make_tuple(rtol, atol); +} + +template <> +auto get_elimit() +{ + // due to rounding, int8 quantization might have 1 abs error + double rtol = 1; + double atol = 1; + return ck_tile::make_tuple(rtol, atol); +} + +auto create_args(int argc, char* argv[]) +{ + ck_tile::ArgParser arg_parser; + arg_parser.insert("m", "3328", "m dimension") + .insert("n", "4096", "n dimension") + .insert("stride", "-1", "stride per row, if -1 then equal to n") + .insert("e", "1e-5", "epsilon") + .insert("v", "1", "cpu validation or not") + .insert("prec", "fp16", "precision") + .insert("warmup", "0", "cold iter") + .insert("repeat", "1", "hot iter"); + + bool result = arg_parser.parse(argc, argv); + return std::make_tuple(result, arg_parser); +} + +template +bool run(const ck_tile::ArgParser& arg_parser) +{ + ck_tile::index_t m = arg_parser.get_int("m"); + ck_tile::index_t n = arg_parser.get_int("n"); + ck_tile::index_t stride = arg_parser.get_int("stride"); + if(stride < 0) + stride = n; + float epsilon = arg_parser.get_float("e"); + std::string data_type = arg_parser.get_str("prec"); + int do_validation = arg_parser.get_int("v"); + int warmup = arg_parser.get_int("warmup"); + int repeat = arg_parser.get_int("repeat"); + + assert(stride >= n); + + using ADataType = DataType; + using BDataType = DataType; + using GammaDataType = DataType; + using XDataType = DataType; + using YScaleDataType = DataType; + using QYDataType = ck_tile::int8_t; + using ComputeDataType = 
float; + + // host verify + ck_tile::HostTensor a_host({m, n}, {stride, 1}); + ck_tile::HostTensor b_host({m, n}, {stride, 1}); + ck_tile::HostTensor gamma_host({n}); + + ck_tile::HostTensor x_host_ref({m, n}, {stride, 1}); + ck_tile::HostTensor x_host_dev({m, n}, {stride, 1}); + ck_tile::HostTensor yscale_host_ref({m}, {1}); + ck_tile::HostTensor yscale_host_dev({m}, {1}); + ck_tile::HostTensor qy_host_ref({m, n}, {stride, 1}); + ck_tile::HostTensor qy_host_dev({m, n}, {stride, 1}); + + ck_tile::FillUniformDistribution{-.5f, .5f}(a_host); + ck_tile::FillUniformDistribution{-.5f, .5f}(b_host); + ck_tile::FillUniformDistribution{-.5f, .5f}(gamma_host); + + ck_tile::DeviceMem a_buf(a_host.get_element_space_size_in_bytes()); + ck_tile::DeviceMem b_buf(b_host.get_element_space_size_in_bytes()); + ck_tile::DeviceMem gamma_buf(gamma_host.get_element_space_size_in_bytes()); + ck_tile::DeviceMem x_buf(x_host_dev.get_element_space_size_in_bytes()); + ck_tile::DeviceMem yscale_buf(yscale_host_dev.get_element_space_size_in_bytes()); + ck_tile::DeviceMem qy_buf(qy_host_dev.get_element_space_size_in_bytes()); + + a_buf.ToDevice(a_host.data()); + b_buf.ToDevice(b_host.data()); + gamma_buf.ToDevice(gamma_host.data()); + + constexpr bool kThreePass = true; + + using BlockWarps = ck_tile::sequence<2, 2>; + using BlockTile = ck_tile::sequence<2, 128>; + using WarpTile = ck_tile::sequence<1, 64>; + using Vector = ck_tile::sequence<1, 1>; + + using Shape = ck_tile::AddRmsnorm2dRdquantShape; + using Problem = ck_tile::AddRmsnorm2dRdquantFwdPipelineProblem; + + using OnePassPipeline = ck_tile::AddRmsnorm2dRdquantFwdPipelineOnePass; + using ThreePassPipeline = ck_tile::AddRmsnorm2dRdquantFwdPipelineThreePass; + using Pipeline = std::conditional_t; + using Kernel = ck_tile::AddRmsnorm2dRdquantFwd; + + ck_tile::AddRmsnorm2dRdquantFwdHostArgs args{a_buf.GetDeviceBuffer(), + b_buf.GetDeviceBuffer(), + gamma_buf.GetDeviceBuffer(), + x_buf.GetDeviceBuffer(), + yscale_buf.GetDeviceBuffer(), + 
qy_buf.GetDeviceBuffer(), + epsilon, + m, + n, + stride}; + + auto kargs = Kernel::MakeKargs(args); + + const dim3 grids = Kernel::GridSize(args); + constexpr dim3 blocks = Kernel::BlockSize(); + constexpr ck_tile::index_t kBlockPerCu = 1; + auto s = ck_tile::stream_config{nullptr, true, 0, warmup, repeat}; + + ck_tile::launch_kernel( + s, ck_tile::make_kernel(Kernel{}, grids, blocks, 0, kargs)); + + bool pass = true; + + if(do_validation) + { + using YDataType = ComputeDataType; + using InvRmsDataType = DataType; + + // Add + { + auto op = [](const auto& v0, const auto& v1) { return v0 + v1; }; + ck_tile::reference_binary_elementwise( + a_host, b_host, x_host_ref, op); + + x_buf.FromDevice(x_host_dev.data()); + + auto [rtol, atol] = get_elimit(); + if(stride == n) + { + pass = ck_tile::check_err( + x_host_dev, x_host_ref, std::string("x Error: Incorrect results!"), rtol, atol); + } + else + { + for(int i_r = 0; i_r < m; i_r++) + { + std::vector x_host_dev_row(x_host_dev.begin() + i_r * stride, + x_host_dev.begin() + i_r * stride + n); + std::vector x_host_ref_row(x_host_ref.begin() + i_r * stride, + x_host_ref.begin() + i_r * stride + n); + pass &= ck_tile::check_err(x_host_dev_row, + x_host_ref_row, + std::string("x[") + std::to_string(i_r) + + std::string("] Error: Incorrect results!"), + rtol, + atol); + } + } + } + + ck_tile::HostTensor y_host({m, n}); + // Rmsnorm2d + { + ck_tile::HostTensor invRms_host_ref({m}); + + // CAUSION: kernel use ComputeDataType version of x, but we use XDataType here for + // simplicity + ck_tile::reference_rmsnorm2d_fwd( + x_host_ref, gamma_host, y_host, invRms_host_ref, epsilon); + } + + // yscale + { + ck_tile::HostTensor y_rowwise_amax_host({m}); + + using ReduceAmax = ck_tile::ReduceOp::AbsMax; + ck_tile::reference_reduce( + y_host, y_rowwise_amax_host, ReduceAmax{}); + + auto op = [](const auto& v0) { + return v0 / + ck_tile::type_convert(ck_tile::numeric::max()); + }; + ck_tile::reference_unary_elementwise( + 
y_rowwise_amax_host, yscale_host_ref, op); + + yscale_buf.FromDevice(yscale_host_dev.mData.data()); + + auto [rtol, atol] = get_elimit(); + pass &= ck_tile::check_err(yscale_host_dev, + yscale_host_ref, + std::string("yscale Error: Incorrect results!"), + rtol, + atol); + } + + // rowwise quantization: AND the result into pass so earlier x / yscale failures are not overwritten + { + ck_tile::reference_rowwise_quantization2d( + y_host, yscale_host_ref, qy_host_ref); + + qy_buf.FromDevice(qy_host_dev.data()); + auto [rtol, atol] = get_elimit(); + + if(stride == n) + { + pass &= ck_tile::check_err(qy_host_dev, + qy_host_ref, + std::string("qy Error: Incorrect results!"), + rtol, + atol); + } + else + { + for(int i_r = 0; i_r < m; i_r++) + { + std::vector qy_host_dev_row(qy_host_dev.begin() + i_r * stride, + qy_host_dev.begin() + i_r * stride + n); + std::vector qy_host_ref_row(qy_host_ref.begin() + i_r * stride, + qy_host_ref.begin() + i_r * stride + n); + pass &= ck_tile::check_err(qy_host_dev_row, + qy_host_ref_row, + std::string("qy[") + std::to_string(i_r) + + std::string("] Error: Incorrect results!"), + rtol, + atol); + } + } + } + + std::cout << "[" << data_type << "]" + << " m:" << m << ", n:" << n << ", stride:" << stride + << ", valid:" << (pass ? "y" : "n") << std::flush << std::endl; + } + + return pass; +} + +int main(int argc, char* argv[]) +{ + auto [result, arg_parser] = create_args(argc, argv); + if(!result) + return -1; + + const std::string data_type = arg_parser.get_str("prec"); + if(data_type == "fp16") + { + return run(arg_parser) ? 
0 : -2; + } + + return -3; +} diff --git a/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_api.cpp b/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_api.cpp new file mode 100644 index 000000000..57a0f254d --- /dev/null +++ b/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_api.cpp @@ -0,0 +1,157 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include +#include "add_rmsnorm2d_rdquant_fwd.hpp" + +template +using trait_ = add_rmsnorm2d_rdquant_fwd_traits_; + +template +float add_rmsnorm2d_rdquant_fwd_b16_(add_rmsnorm2d_rdquant_fwd_traits /*t*/, + add_rmsnorm2d_rdquant_fwd_args a, + const ck_tile::stream_config& s) +{ +#if 1 + float r = -1; + // clang-format off + // rm rn tm tn vn pd x 3p + if(a.n <= 64) { + r = add_rmsnorm2d_rdquant_fwd_>(s, a); + } + else if(a.n <= 128) { + if (a.n % 2 == 0) + r = add_rmsnorm2d_rdquant_fwd_>(s, a); + else + r = add_rmsnorm2d_rdquant_fwd_>(s, a); + } + else if(a.n <= 256) { + if (a.n % 4 == 0) + r = add_rmsnorm2d_rdquant_fwd_>(s, a); + else if (a.n % 2 == 0) + r = add_rmsnorm2d_rdquant_fwd_>(s, a); + else + r = add_rmsnorm2d_rdquant_fwd_>(s, a); + } + else if(a.n <= 512) { + if (a.n % 8 == 0) + r = add_rmsnorm2d_rdquant_fwd_>(s, a); + else if (a.n % 4 == 0) + r = add_rmsnorm2d_rdquant_fwd_>(s, a); + else if (a.n % 2 == 0) + r = add_rmsnorm2d_rdquant_fwd_>(s, a); + else + r = add_rmsnorm2d_rdquant_fwd_>(s, a); + } + else if(a.n <= 768) { + if (a.n % 4 == 0) + r = add_rmsnorm2d_rdquant_fwd_>(s, a); + else if (a.n % 2 == 0) + r = add_rmsnorm2d_rdquant_fwd_>(s, a); + else + r = add_rmsnorm2d_rdquant_fwd_>(s, a); + } + else if(a.n <= 1024) { + if (a.n % 8 == 0) + r = add_rmsnorm2d_rdquant_fwd_>(s, a); + else if (a.n % 4 == 0) + r = add_rmsnorm2d_rdquant_fwd_>(s, a); + else if (a.n % 2 == 0) + r = add_rmsnorm2d_rdquant_fwd_>(s, a); + else + r = add_rmsnorm2d_rdquant_fwd_>(s, a); + } + else if(a.n <= 1536) { + if (a.n % 8 == 0) + r = add_rmsnorm2d_rdquant_fwd_>(s, a); + else if (a.n % 4 == 0) + r = add_rmsnorm2d_rdquant_fwd_>(s, a); + else if (a.n % 2 == 0) + r = add_rmsnorm2d_rdquant_fwd_>(s, a); + else + r = add_rmsnorm2d_rdquant_fwd_>(s, a); + } + else if(a.n <= 2048) { + if (a.n % 8 == 0) + r = add_rmsnorm2d_rdquant_fwd_>(s, a); + else if (a.n % 4 == 0) + r = add_rmsnorm2d_rdquant_fwd_>(s, a); + else if (a.n % 2 == 0) + r = add_rmsnorm2d_rdquant_fwd_>(s, a); + else + r = 
add_rmsnorm2d_rdquant_fwd_>(s, a); + } + else if(a.n <= 3072) { + if (a.n % 8 == 0) + r = add_rmsnorm2d_rdquant_fwd_>(s, a); + else if (a.n % 4 == 0) + r = add_rmsnorm2d_rdquant_fwd_>(s, a); + else if (a.n % 2 == 0) + r = add_rmsnorm2d_rdquant_fwd_>(s, a); + else + r = add_rmsnorm2d_rdquant_fwd_>(s, a); + } + else if(a.n <= 4096) { + if (a.n % 8 == 0) + r = add_rmsnorm2d_rdquant_fwd_>(s, a); + else if (a.n % 4 == 0) + r = add_rmsnorm2d_rdquant_fwd_>(s, a); + else if (a.n % 2 == 0) + r = add_rmsnorm2d_rdquant_fwd_>(s, a); + else + r = add_rmsnorm2d_rdquant_fwd_>(s, a); + } + else if(a.n > 4096) { + if (a.n % 8 == 0) + r = add_rmsnorm2d_rdquant_fwd_>(s, a); + else if (a.n % 4 == 0) + r = add_rmsnorm2d_rdquant_fwd_>(s, a); + else if (a.n % 2 == 0) + r = add_rmsnorm2d_rdquant_fwd_>(s, a); + else + r = add_rmsnorm2d_rdquant_fwd_>(s, a); + } + return r; +#else + return add_rmsnorm2d_rdquant_fwd_>(s, a); +#endif + // clang-format on +} + +float add_rmsnorm2d_rdquant_fwd(add_rmsnorm2d_rdquant_fwd_traits t, + add_rmsnorm2d_rdquant_fwd_args a, + const ck_tile::stream_config& s) +{ + + float r = -1; + // Only support instance of save_x == true for now + assert(t.save_x); + if(t.data_type.compare("fp16") == 0) + { + return add_rmsnorm2d_rdquant_fwd_b16_(t, a, s); + } + else if(t.data_type.compare("bf16") == 0) + { + return add_rmsnorm2d_rdquant_fwd_b16_(t, a, s); + } + if(r < 0) + throw std::runtime_error("Without supported instances!"); + + return r; +} diff --git a/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_bf16_n1024_instance.cpp b/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_bf16_n1024_instance.cpp new file mode 100644 index 000000000..5495e3c9a --- /dev/null +++ b/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_bf16_n1024_instance.cpp @@ -0,0 +1,22 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "add_rmsnorm2d_rdquant_fwd_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd x 3p +#if 0 +template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); +template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); +template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); +template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); + +template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); +#endif + +template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); +template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); +template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); +template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); +// clang-format on diff --git a/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_bf16_n1536_instance.cpp b/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_bf16_n1536_instance.cpp new file mode 100644 index 000000000..8bbfdc858 --- /dev/null +++ b/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_bf16_n1536_instance.cpp @@ -0,0 +1,13 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "add_rmsnorm2d_rdquant_fwd_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd x 3p +template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); +template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); +template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); +template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); +// clang-format on diff --git a/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_bf16_n2048_instance.cpp b/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_bf16_n2048_instance.cpp new file mode 100644 index 000000000..381a11fc8 --- /dev/null +++ b/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_bf16_n2048_instance.cpp @@ -0,0 +1,14 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "add_rmsnorm2d_rdquant_fwd_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd x 3p +template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); +template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); +template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); +template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); + +// clang-format on diff --git a/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_bf16_n256_instance.cpp b/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_bf16_n256_instance.cpp new file mode 100644 index 000000000..2fefac693 --- /dev/null +++ b/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_bf16_n256_instance.cpp @@ -0,0 +1,12 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "add_rmsnorm2d_rdquant_fwd_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd x 3p +template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); +template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); +template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); +// clang-format on diff --git a/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_bf16_n3072_instance.cpp b/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_bf16_n3072_instance.cpp new file mode 100644 index 000000000..263713bbc --- /dev/null +++ b/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_bf16_n3072_instance.cpp @@ -0,0 +1,14 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "add_rmsnorm2d_rdquant_fwd_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd x 3p +template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); +template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); +template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); +template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); + +// clang-format on diff --git a/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_bf16_n4096_instance.cpp b/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_bf16_n4096_instance.cpp new file mode 100644 index 000000000..c62c596fa --- /dev/null +++ b/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_bf16_n4096_instance.cpp @@ -0,0 +1,14 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "add_rmsnorm2d_rdquant_fwd_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd x 3p +template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); +template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); +template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); +template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); + +// clang-format on diff --git a/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_bf16_n4096_tp_instance.cpp b/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_bf16_n4096_tp_instance.cpp new file mode 100644 index 000000000..e4951f6ab --- /dev/null +++ b/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_bf16_n4096_tp_instance.cpp @@ -0,0 +1,14 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "add_rmsnorm2d_rdquant_fwd_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd x 3p +template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); +template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); +template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); +template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); + +// clang-format on diff --git a/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_bf16_n512_instance.cpp b/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_bf16_n512_instance.cpp new file mode 100644 index 000000000..4c7ee48e8 --- /dev/null +++ b/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_bf16_n512_instance.cpp @@ -0,0 +1,13 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "add_rmsnorm2d_rdquant_fwd_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd x 3p +template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); +template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); +template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); +template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); +// clang-format on diff --git a/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_bf16_n64_n128_instance.cpp b/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_bf16_n64_n128_instance.cpp new file mode 100644 index 000000000..8659dc82b --- /dev/null +++ b/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_bf16_n64_n128_instance.cpp @@ -0,0 +1,12 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "add_rmsnorm2d_rdquant_fwd_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd x 3p +template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); +template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); +template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); +// clang-format on diff --git a/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_bf16_n768_instance.cpp b/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_bf16_n768_instance.cpp new file mode 100644 index 000000000..5f15f11b4 --- /dev/null +++ b/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_bf16_n768_instance.cpp @@ -0,0 +1,12 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "add_rmsnorm2d_rdquant_fwd_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd x 3p +template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); +template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); +template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); +// clang-format on diff --git a/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_fp16_n1024_instance.cpp b/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_fp16_n1024_instance.cpp new file mode 100644 index 000000000..8ffdacbdc --- /dev/null +++ b/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_fp16_n1024_instance.cpp @@ -0,0 +1,22 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "add_rmsnorm2d_rdquant_fwd_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd x 3p +#if 0 +template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); +template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); +template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); +template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); + +template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); +#endif + +template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); +template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); +template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); +template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); +// clang-format on diff --git a/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_fp16_n1536_instance.cpp b/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_fp16_n1536_instance.cpp new file mode 100644 index 000000000..355109965 --- /dev/null +++ b/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_fp16_n1536_instance.cpp @@ -0,0 +1,13 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. 
All rights reserved. + +#include "add_rmsnorm2d_rdquant_fwd_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd x 3p +template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); +template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); +template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); +template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); +// clang-format on diff --git a/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_fp16_n2048_instance.cpp b/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_fp16_n2048_instance.cpp new file mode 100644 index 000000000..d4d0474c2 --- /dev/null +++ b/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_fp16_n2048_instance.cpp @@ -0,0 +1,14 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "add_rmsnorm2d_rdquant_fwd_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd x 3p +template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); +template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); +template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); +template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); + +// clang-format on diff --git a/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_fp16_n256_instance.cpp b/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_fp16_n256_instance.cpp new file mode 100644 index 000000000..2cb300eda --- /dev/null +++ b/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_fp16_n256_instance.cpp @@ -0,0 +1,12 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "add_rmsnorm2d_rdquant_fwd_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd x 3p +template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); +template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); +template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); +// clang-format on diff --git a/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_fp16_n3072_instance.cpp b/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_fp16_n3072_instance.cpp new file mode 100644 index 000000000..fb0ceb4c5 --- /dev/null +++ b/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_fp16_n3072_instance.cpp @@ -0,0 +1,14 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "add_rmsnorm2d_rdquant_fwd_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd x 3p +template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); +template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); +template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); +template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); + +// clang-format on diff --git a/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_fp16_n4096_instance.cpp b/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_fp16_n4096_instance.cpp new file mode 100644 index 000000000..3a241a3c9 --- /dev/null +++ b/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_fp16_n4096_instance.cpp @@ -0,0 +1,14 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "add_rmsnorm2d_rdquant_fwd_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd x 3p +template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); +template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); +template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); +template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); + +// clang-format on diff --git a/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_fp16_n4096_tp_instance.cpp b/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_fp16_n4096_tp_instance.cpp new file mode 100644 index 000000000..d3094679f --- /dev/null +++ b/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_fp16_n4096_tp_instance.cpp @@ -0,0 +1,14 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "add_rmsnorm2d_rdquant_fwd_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd x 3p +template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); +template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); +template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); +template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); + +// clang-format on diff --git a/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_fp16_n512_instance.cpp b/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_fp16_n512_instance.cpp new file mode 100644 index 000000000..919bc177e --- /dev/null +++ b/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_fp16_n512_instance.cpp @@ -0,0 +1,13 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "add_rmsnorm2d_rdquant_fwd_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd x 3p +template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); +template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); +template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); +template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); +// clang-format on diff --git a/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_fp16_n64_n128_instance.cpp b/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_fp16_n64_n128_instance.cpp new file mode 100644 index 000000000..8a44f5e00 --- /dev/null +++ b/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_fp16_n64_n128_instance.cpp @@ -0,0 +1,12 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "add_rmsnorm2d_rdquant_fwd_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd x 3p +template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); +template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); +template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); +// clang-format on diff --git a/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_fp16_n768_instance.cpp b/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_fp16_n768_instance.cpp new file mode 100644 index 000000000..5c4f05ec3 --- /dev/null +++ b/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_fp16_n768_instance.cpp @@ -0,0 +1,12 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "add_rmsnorm2d_rdquant_fwd_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd x 3p +template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); +template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); +template float add_rmsnorm2d_rdquant_fwd_>(const S&, A); +// clang-format on diff --git a/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_instance_common.hpp b/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_instance_common.hpp new file mode 100644 index 000000000..6baaad471 --- /dev/null +++ b/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_instance_common.hpp @@ -0,0 +1,67 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include "add_rmsnorm2d_rdquant_fwd.hpp" +#include + +#pragma once + +using S = ck_tile::stream_config; +using A = add_rmsnorm2d_rdquant_fwd_args; + +template +using trait_ = add_rmsnorm2d_rdquant_fwd_traits_; + +template +float add_rmsnorm2d_rdquant_fwd_(const S& s, A a) +{ + using DataType = typename Traits_::DataType; + + using PipelineProblem = ck_tile::AddRmsnorm2dRdquantFwdPipelineProblem< + typename AddRmsnormRdquantTypeConfig::ADataType, + typename AddRmsnormRdquantTypeConfig::BDataType, + typename AddRmsnormRdquantTypeConfig::GammaDataType, + typename AddRmsnormRdquantTypeConfig::ComputeDataType, + typename AddRmsnormRdquantTypeConfig::XDataType, + typename AddRmsnormRdquantTypeConfig::YScaleDataType, + typename AddRmsnormRdquantTypeConfig::QYDataType, + typename Traits_::Shape, + Traits_::kPadN, + Traits_::kSaveX, + Traits_::kThreePass>; + + using OnePassPipeline = ck_tile::AddRmsnorm2dRdquantFwdPipelineOnePass; + using ThreePassPipeline = ck_tile::AddRmsnorm2dRdquantFwdPipelineThreePass; + using Pipeline = std::conditional_t; + + using Kernel = ck_tile::AddRmsnorm2dRdquantFwd; + + const dim3 grids = Kernel::GridSize(a); + constexpr dim3 blocks = 
Kernel::BlockSize(); + constexpr ck_tile::index_t kBlockPerCu = 1; + + auto kargs = Kernel::MakeKargs(a); + if(s.log_level_ > 0) + std::cout << ", " << Kernel::GetName() << std::flush; + + return ck_tile::launch_kernel( + s, ck_tile::make_kernel(Kernel{}, grids, blocks, 0, kargs)); +} diff --git a/example/ck_tile/11_add_rmsnorm2d_rdquant/script/perf_test.sh b/example/ck_tile/11_add_rmsnorm2d_rdquant/script/perf_test.sh new file mode 100755 index 000000000..11fd36488 --- /dev/null +++ b/example/ck_tile/11_add_rmsnorm2d_rdquant/script/perf_test.sh @@ -0,0 +1,38 @@ + +# run from top of ck folder +EXE=build/bin/tile_add_rmsnorm2d_rdquant_fwd + +$EXE -m=1 -n=1 -e=1e-12 -v=1 -prec=bf16 -repeat=1000 +$EXE -m=700 -n=80 -e=1e-12 -v=1 -prec=bf16 -repeat=1000 +$EXE -m=700 -n=128 -e=1e-12 -v=1 -prec=bf16 -repeat=1000 +$EXE -m=700 -n=144 -e=1e-12 -v=1 -prec=bf16 -repeat=1000 +$EXE -m=700 -n=168 -e=1e-12 -v=1 -prec=bf16 -repeat=1000 +$EXE -m=700 -n=184 -e=1e-12 -v=1 -prec=bf16 -repeat=1000 +$EXE -m=700 -n=256 -e=1e-12 -v=1 -prec=bf16 -repeat=1000 +$EXE -m=700 -n=288 -e=1e-12 -v=1 -prec=bf16 -repeat=1000 +$EXE -m=700 -n=344 -e=1e-12 -v=1 -prec=bf16 -repeat=1000 +$EXE -m=700 -n=376 -e=1e-12 -v=1 -prec=bf16 -repeat=1000 +$EXE -m=700 -n=448 -e=1e-12 -v=1 -prec=bf16 -repeat=1000 +$EXE -m=700 -n=512 -e=1e-12 -v=1 -prec=bf16 -repeat=1000 +$EXE -m=700 -n=924 -e=1e-12 -v=1 -prec=bf16 -repeat=1000 +$EXE -m=700 -n=1024 -e=1e-12 -v=1 -prec=bf16 -repeat=1000 +$EXE -m=700 -n=1078 -e=1e-12 -v=1 -prec=bf16 -repeat=1000 +$EXE -m=700 -n=1996 -e=1e-12 -v=1 -prec=bf16 -repeat=1000 +$EXE -m=700 -n=4080 -e=1e-12 -v=1 -prec=bf16 -repeat=1000 + +$EXE -m=700 -n=80 -e=1e-12 -v=1 -prec=fp16 -repeat=1000 +$EXE -m=700 -n=128 -e=1e-12 -v=1 -prec=fp16 -repeat=1000 +$EXE -m=700 -n=144 -e=1e-12 -v=1 -prec=fp16 -repeat=1000 +$EXE -m=700 -n=168 -e=1e-12 -v=1 -prec=fp16 -repeat=1000 +$EXE -m=700 -n=184 -e=1e-12 -v=1 -prec=fp16 -repeat=1000 +$EXE -m=700 -n=256 -e=1e-12 -v=1 -prec=fp16 -repeat=1000 +$EXE -m=700 
-n=288 -e=1e-12 -v=1 -prec=fp16 -repeat=1000 +$EXE -m=700 -n=344 -e=1e-12 -v=1 -prec=fp16 -repeat=1000 +$EXE -m=700 -n=376 -e=1e-12 -v=1 -prec=fp16 -repeat=1000 +$EXE -m=700 -n=448 -e=1e-12 -v=1 -prec=fp16 -repeat=1000 +$EXE -m=700 -n=512 -e=1e-12 -v=1 -prec=fp16 -repeat=1000 +$EXE -m=700 -n=924 -e=1e-12 -v=1 -prec=fp16 -repeat=1000 +$EXE -m=700 -n=1024 -e=1e-12 -v=1 -prec=fp16 -repeat=1000 +$EXE -m=700 -n=1078 -e=1e-12 -v=1 -prec=fp16 -repeat=1000 +$EXE -m=700 -n=1996 -e=1e-12 -v=1 -prec=fp16 -repeat=1000 +$EXE -m=700 -n=4080 -e=1e-12 -v=1 -prec=fp16 -repeat=1000 \ No newline at end of file diff --git a/example/ck_tile/11_add_rmsnorm2d_rdquant/script/smoke_test.sh b/example/ck_tile/11_add_rmsnorm2d_rdquant/script/smoke_test.sh new file mode 100755 index 000000000..4a02cdcb6 --- /dev/null +++ b/example/ck_tile/11_add_rmsnorm2d_rdquant/script/smoke_test.sh @@ -0,0 +1,31 @@ +#!/bin/sh +# call from top of CK folder +EXE=./build/bin/tile_add_rmsnorm2d_rdquant_fwd + +for pr_i in "fp16" "bf16" ; do +$EXE -prec=$pr_i -m=99 -n=13 +$EXE -prec=$pr_i -m=17 -n=16 +$EXE -prec=$pr_i -m=1 -n=100 +$EXE -prec=$pr_i -m=4 -n=128 +$EXE -prec=$pr_i -m=80 -n=127 +$EXE -prec=$pr_i -m=22 -n=255 -stride=256 +$EXE -prec=$pr_i -m=7 -n=599 +$EXE -prec=$pr_i -m=19 -n=512 +$EXE -prec=$pr_i -m=33 -n=313 -stride=1000 +$EXE -prec=$pr_i -m=11 -n=510 +$EXE -prec=$pr_i -m=171 -n=676 -stride=818 +$EXE -prec=$pr_i -m=91 -n=636 +$EXE -prec=$pr_i -m=12 -n=768 -stride=800 +$EXE -prec=$pr_i -m=100 -n=766 -stride=812 +$EXE -prec=$pr_i -m=31 -n=1024 +$EXE -prec=$pr_i -m=64 -n=1000 -stride=1004 +$EXE -prec=$pr_i -m=8 -n=1501 +$EXE -prec=$pr_i -m=3 -n=1826 +$EXE -prec=$pr_i -m=5 -n=2040 +$EXE -prec=$pr_i -m=7 -n=2734 +$EXE -prec=$pr_i -m=1 -n=3182 +$EXE -prec=$pr_i -m=9 -n=4096 +$EXE -prec=$pr_i -m=3 -n=8192 +$EXE -prec=$pr_i -m=1 -n=10547 +$EXE -prec=$pr_i -m=3 -n=17134 +done diff --git a/example/ck_tile/CMakeLists.txt b/example/ck_tile/CMakeLists.txt index c85e31341..e404e5019 100644 --- 
a/example/ck_tile/CMakeLists.txt +++ b/example/ck_tile/CMakeLists.txt @@ -9,4 +9,5 @@ add_subdirectory(04_img2col) add_subdirectory(05_reduce) add_subdirectory(06_permute) add_subdirectory(09_topk_softmax) - +add_subdirectory(10_rmsnorm2d) +add_subdirectory(11_add_rmsnorm2d_rdquant) diff --git a/include/ck_tile/core.hpp b/include/ck_tile/core.hpp index 14991d375..fa4b8d3cc 100644 --- a/include/ck_tile/core.hpp +++ b/include/ck_tile/core.hpp @@ -59,6 +59,7 @@ #include "ck_tile/core/utility/magic_div.hpp" #include "ck_tile/core/utility/philox_rand.hpp" #include "ck_tile/core/utility/random.hpp" +#include "ck_tile/core/utility/reduce_operator.hpp" #include "ck_tile/core/utility/to_sequence.hpp" #include "ck_tile/core/utility/transpose_vectors.hpp" #include "ck_tile/core/utility/type_traits.hpp" diff --git a/include/ck_tile/core/utility/reduce_operator.hpp b/include/ck_tile/core/utility/reduce_operator.hpp new file mode 100644 index 000000000..8b15d187f --- /dev/null +++ b/include/ck_tile/core/utility/reduce_operator.hpp @@ -0,0 +1,95 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck_tile/core/config.hpp" + +namespace ck_tile { + +namespace ReduceOp { +// y = ReduceOp(y, x); +struct Add +{ + template + CK_TILE_HOST_DEVICE static constexpr T GetIdentityValue() + { + return type_convert(0.0f); + }; + + template || std::is_same_v || + std::is_same_v || std::is_same_v>> + CK_TILE_HOST_DEVICE constexpr T operator()(const T& y, const T x) const + { + return y + x; + } + + template || std::is_same_v>> + CK_TILE_HOST_DEVICE constexpr T operator()(T& y, T x) const + { + float y_ = type_convert(y); + float x_ = type_convert(x); + + return type_convert(y_ + x_); + } +}; + +struct SquareAdd +{ + template + CK_TILE_HOST_DEVICE static constexpr T GetIdentityValue() + { + return type_convert(0.0f); + }; + + template || std::is_same_v || + std::is_same_v || std::is_same_v>> + CK_TILE_HOST_DEVICE constexpr T operator()(const T& y, const T x) const + { + return y + (x * x); + } +}; + +struct Max +{ + template || std::is_same_v || + std::is_same_v || std::is_same_v>> + CK_TILE_HOST_DEVICE static constexpr T GetIdentityValue() + { + return numeric::min(); + }; + + template || std::is_same_v || + std::is_same_v || std::is_same_v>> + CK_TILE_HOST_DEVICE constexpr T operator()(const T& y, const T x) const + { + return max(y, x); + } +}; + +struct AbsMax +{ + template || std::is_same_v || + std::is_same_v || std::is_same_v>> + CK_TILE_HOST_DEVICE static constexpr T GetIdentityValue() + { + return numeric::min(); + }; + + template || std::is_same_v || + std::is_same_v || std::is_same_v>> + CK_TILE_HOST_DEVICE constexpr T operator()(const T& y, const T x) const + { + return max(y, abs(x)); + } +}; + +} // namespace ReduceOp +} // namespace ck_tile diff --git a/include/ck_tile/host.hpp b/include/ck_tile/host.hpp index a17ce751c..c0ab13ce3 100644 --- a/include/ck_tile/host.hpp +++ b/include/ck_tile/host.hpp @@ -19,11 +19,14 @@ #include "ck_tile/host/reference/reference_batched_masking.hpp" #include 
"ck_tile/host/reference/reference_batched_rotary_position_embedding.hpp" #include "ck_tile/host/reference/reference_batched_softmax.hpp" +#include "ck_tile/host/reference/reference_elementwise.hpp" #include "ck_tile/host/reference/reference_gemm.hpp" #include "ck_tile/host/reference/reference_im2col.hpp" #include "ck_tile/host/reference/reference_layernorm2d_fwd.hpp" #include "ck_tile/host/reference/reference_permute.hpp" #include "ck_tile/host/reference/reference_reduce.hpp" +#include "ck_tile/host/reference/reference_rmsnorm2d_fwd.hpp" +#include "ck_tile/host/reference/reference_rowwise_quantization2d.hpp" #include "ck_tile/host/reference/reference_softmax.hpp" #include "ck_tile/host/reference/reference_topk.hpp" #include "ck_tile/host/stream_config.hpp" diff --git a/include/ck_tile/host/reference/reference_elementwise.hpp b/include/ck_tile/host/reference/reference_elementwise.hpp new file mode 100644 index 000000000..809049fa6 --- /dev/null +++ b/include/ck_tile/host/reference/reference_elementwise.hpp @@ -0,0 +1,47 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck_tile/core.hpp" +#include "ck_tile/host/host_tensor.hpp" +#include + +namespace ck_tile { +template +CK_TILE_HOST void reference_unary_elementwise(const HostTensor& a, + HostTensor& b, + ElementOp element_op) +{ + // TODO: imeplement gpu version reference function + auto f = [&](auto i) { + auto v_a = type_convert(a.mData[i]); + auto v_b = element_op(v_a); + b.mData[i] = ck_tile::type_convert(v_b); + }; + + make_ParallelTensorFunctor(f, b.get_element_space_size())(std::thread::hardware_concurrency()); +} + +template +CK_TILE_HOST void reference_binary_elementwise(const HostTensor& a, + const HostTensor& b, + HostTensor& c, + ElementOp element_op) +{ + // TODO: imeplement gpu version reference function + auto f = [&](auto i) { + auto v_a = type_convert(a.mData[i]); + auto v_b = type_convert(b.mData[i]); + auto v_c = element_op(v_a, v_b); + c.mData[i] = ck_tile::type_convert(v_c); + }; + + make_ParallelTensorFunctor(f, c.get_element_space_size())(std::thread::hardware_concurrency()); +} + +} // namespace ck_tile diff --git a/include/ck_tile/host/reference/reference_reduce.hpp b/include/ck_tile/host/reference/reference_reduce.hpp index b16cee3f9..8f8aa2367 100644 --- a/include/ck_tile/host/reference/reference_reduce.hpp +++ b/include/ck_tile/host/reference/reference_reduce.hpp @@ -9,24 +9,25 @@ namespace ck_tile { -template -CK_TILE_HOST void reference_reduce(const HostTensor& a_m_n, HostTensor& b_m) +template +CK_TILE_HOST void +reference_reduce(const HostTensor& x_m_n, HostTensor& y_m, ReduceOp reduce_op) { auto f = [&](auto m) { - const int N = a_m_n.mDesc.get_lengths()[1]; + const int N = x_m_n.mDesc.get_lengths()[1]; - AccDataType v_acc = 0; + ComputeDataType v_acc = reduce_op.template GetIdentityValue(); for(int n = 0; n < N; ++n) { - const ADataType v_a = a_m_n(m, n); + const ComputeDataType v_a = type_convert(x_m_n(m, n)); - v_acc += v_a; + v_acc = reduce_op(v_acc, v_a); } - b_m(m) = ck_tile::type_convert(v_acc); + y_m(m) = 
ck_tile::type_convert(v_acc); }; - make_ParallelTensorFunctor(f, b_m.mDesc.get_lengths()[0])(std::thread::hardware_concurrency()); + make_ParallelTensorFunctor(f, y_m.mDesc.get_lengths()[0])(std::thread::hardware_concurrency()); } } // namespace ck_tile diff --git a/include/ck_tile/host/reference/reference_rmsnorm2d_fwd.hpp b/include/ck_tile/host/reference/reference_rmsnorm2d_fwd.hpp new file mode 100644 index 000000000..db6e92f4c --- /dev/null +++ b/include/ck_tile/host/reference/reference_rmsnorm2d_fwd.hpp @@ -0,0 +1,52 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck_tile/core.hpp" +#include "ck_tile/host/host_tensor.hpp" + +namespace ck_tile { + +template +void reference_rmsnorm2d_fwd(const HostTensor& x_m_n, + const HostTensor& gamma_n, + HostTensor& y_m_n, + HostTensor& invRms_m, + ComputeDataType epsilon) +{ + auto rmsnorm2d_fwd_func = [&](auto m) { + const int N = x_m_n.mDesc.get_lengths()[1]; + + ComputeDataType mean_square = 0; + ComputeDataType divisor = 0; + + for(int n = 0; n < N; ++n) + { + ComputeDataType x = ck_tile::type_convert(x_m_n(m, n)); + mean_square += x * x; + } + + mean_square = mean_square / N; + divisor = ck_tile::type_convert(1) / ck_tile::sqrt(mean_square + epsilon); + + if constexpr(!std::is_same_v) + invRms_m(m) = ck_tile::type_convert(divisor); + + for(int n = 0; n < N; ++n) + { + ComputeDataType x = ck_tile::type_convert(x_m_n(m, n)); + ComputeDataType gamma = ck_tile::type_convert(gamma_n(n)); + auto y = x * divisor * gamma; + y_m_n(m, n) = ck_tile::type_convert(y); + } + }; + + make_ParallelTensorFunctor(rmsnorm2d_fwd_func, invRms_m.mDesc.get_lengths()[0])( + std::thread::hardware_concurrency()); +} +} // namespace ck_tile diff --git a/include/ck_tile/host/reference/reference_rowwise_quantization2d.hpp b/include/ck_tile/host/reference/reference_rowwise_quantization2d.hpp new file mode 100644 index 000000000..e9a398876 --- /dev/null 
+++ b/include/ck_tile/host/reference/reference_rowwise_quantization2d.hpp @@ -0,0 +1,33 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck_tile/core.hpp" +#include "ck_tile/host/host_tensor.hpp" +#include + +namespace ck_tile { +template +CK_TILE_HOST void reference_rowwise_quantization2d(const HostTensor& x_m_n, + const HostTensor& scale_m, + HostTensor& qx_m_n) +{ + auto f = [&](auto m) { + const int N = x_m_n.mDesc.get_lengths()[1]; + + for(int n = 0; n < N; ++n) + { + auto v_x = x_m_n(m, n); + // scale = amax / 127 for int8 + auto v_scale = type_convert(scale_m(m)); + auto v_qx = v_x / v_scale; + qx_m_n(m, n) = saturates{}(v_qx); + } + }; + + make_ParallelTensorFunctor(f, + scale_m.mDesc.get_lengths()[0])(std::thread::hardware_concurrency()); +} + +} // namespace ck_tile diff --git a/include/ck_tile/ops/add_rmsnorm2d_rdquant.hpp b/include/ck_tile/ops/add_rmsnorm2d_rdquant.hpp new file mode 100644 index 000000000..eb06fea2d --- /dev/null +++ b/include/ck_tile/ops/add_rmsnorm2d_rdquant.hpp @@ -0,0 +1,12 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck_tile/ops/add_rmsnorm2d_rdquant/kernel/add_rmsnorm2d_rdquant_fwd_kernel.hpp" +#include "ck_tile/ops/add_rmsnorm2d_rdquant/kernel/add_rmsnorm2d_rdquant_fwd_shape.hpp" +#include "ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_default_policy.hpp" +#include "ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_one_pass.hpp" +#include "ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_problem.hpp" +#include "ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_three_pass.hpp" +#include "ck_tile/ops/common/tensor_layout.hpp" diff --git a/include/ck_tile/ops/add_rmsnorm2d_rdquant/kernel/add_rmsnorm2d_rdquant_fwd_kernel.hpp b/include/ck_tile/ops/add_rmsnorm2d_rdquant/kernel/add_rmsnorm2d_rdquant_fwd_kernel.hpp new file mode 100644 index 000000000..4a0e29035 --- /dev/null +++ b/include/ck_tile/ops/add_rmsnorm2d_rdquant/kernel/add_rmsnorm2d_rdquant_fwd_kernel.hpp @@ -0,0 +1,239 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck_tile/core.hpp" +#include "ck_tile/ops/common.hpp" + +namespace ck_tile { + +// host side args +struct AddRmsnorm2dRdquantFwdHostArgs +{ + const void* p_a; + const void* p_b; + const void* p_gamma; + + void* p_x; + void* p_yscale; + void* p_qy; + + float epsilon; + + index_t m; + index_t n; + index_t stride; // row_stride +}; + +// TODO: Extract some type to wrapper class +template +struct AddRmsnorm2dRdquantFwd +{ + using Pipeline = remove_cvref_t; + using Problem = typename Pipeline::Problem; + + using ADataType = remove_cvref_t; + using BDataType = remove_cvref_t; + using GammaDataType = remove_cvref_t; + using ComputeDataType = remove_cvref_t; + using XDataType = remove_cvref_t; + using YScaleDataType = remove_cvref_t; + using QYDataType = remove_cvref_t; + + static constexpr bool kSaveX = Problem::kSaveX; + + static constexpr index_t Block_M = Problem::BlockShape::Block_M; + static constexpr index_t Block_N = Problem::BlockShape::Block_N; + static constexpr bool kPadM = false; // always no need to pad along M + static constexpr bool kPadN = Problem::kPadN; + static constexpr bool kThreePass = Problem::kThreePass; + + static constexpr index_t ThreadPerWarp_N = Problem::BlockShape::ThreadPerWarp_N; + static constexpr index_t Vector_N = Problem::BlockShape::Vector_N; + static constexpr index_t Repeat_N = Problem::BlockShape::Repeat_N; + + static constexpr auto I0 = number<0>{}; + static constexpr auto I1 = number<1>{}; + + struct Kargs + { + const void* p_a; + const void* p_b; + const void* p_gamma; + + void* p_x; + void* p_yscale; + void* p_qy; + + float epsilon; + + index_t m; + index_t n; + index_t stride; // row_stride + }; + using Hargs = AddRmsnorm2dRdquantFwdHostArgs; + + CK_TILE_HOST static constexpr Kargs MakeKargs(const Hargs& hargs) + { + return Kargs{hargs.p_a, + hargs.p_b, + hargs.p_gamma, + hargs.p_x, + hargs.p_yscale, + hargs.p_qy, + hargs.epsilon, + hargs.m, + hargs.n, + hargs.stride}; + } + + CK_TILE_HOST static 
constexpr auto GridSize(const Hargs& hargs) + { + return integer_divide_ceil(hargs.m, Block_M); + } + + CK_TILE_HOST static constexpr auto BlockSize() { return Problem::BlockShape::BlockSize; } + + // clang-format off + template struct t2s; + template <> struct t2s { static constexpr const char * name = "fp32"; }; + template <> struct t2s { static constexpr const char * name = "fp16"; }; + template <> struct t2s { static constexpr const char * name = "bf16"; }; + template <> struct t2s { static constexpr const char * name = "fp8"; }; + template <> struct t2s { static constexpr const char * name = "bf8"; }; + // clang-format on + + // in byte + CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize() { return Pipeline::GetSmemSize(); } + + CK_TILE_HOST static std::string GetName() + { + // clang-format off + using S_ = typename Problem::BlockShape; + auto surfix = [&] () { + std::string n; + if (kPadN) n += "_pn"; + if (kSaveX) n += "_x"; + if (kThreePass) n += "_2p"; + return n; }(); + + #define _SS_ std::string + #define _TS_ std::to_string + return _SS_("add_rmsnorm2d_rdquant_fwd_") + _SS_(t2s::name) + "_" + + _TS_(S_::Block_M) + "x" + _TS_(S_::Block_N) + "_" + _TS_(S_::WarpPerBlock_M) + "x" + _TS_(S_::WarpPerBlock_N) + "_" + + _TS_(S_::Warp_M) + "x" + _TS_(S_::Warp_N) + "_" + _TS_(S_::Vector_M) + "x" + _TS_(S_::Vector_N) + "_" + + _SS_(Pipeline::name) + surfix; + #undef _SS_ + #undef _TS_ + // clang-format on + } + + CK_TILE_DEVICE void operator()(Kargs kargs) const + { + const auto iM = get_block_id() * Block_M; + + const auto a_window = [&]() { + const auto tmp_ = make_naive_tensor_view( + static_cast(kargs.p_a), + make_tuple(kargs.m, kargs.n), + make_tuple(kargs.stride, 1), + number{}, + number<1>{}); + + const auto tmp2_ = pad_tensor_view( + tmp_, make_tuple(number{}, number{}), sequence{}); + return make_tile_window( + tmp2_, make_tuple(number{}, number{}), {iM, 0}); + }(); + + const auto b_window = [&]() { + const auto tmp_ = make_naive_tensor_view( + 
static_cast(kargs.p_b), + make_tuple(kargs.m, kargs.n), + make_tuple(kargs.stride, 1), + number{}, + number<1>{}); + + const auto tmp2_ = pad_tensor_view( + tmp_, make_tuple(number{}, number{}), sequence{}); + return make_tile_window( + tmp2_, make_tuple(number{}, number{}), {iM, 0}); + }(); + + const auto gamma_window = [&]() { + const auto tmp_ = make_naive_tensor_view( + static_cast(kargs.p_gamma), + make_tuple(kargs.n), + make_tuple(1), + number{}, + number<1>{}); + + const auto tmp2_ = + pad_tensor_view(tmp_, make_tuple(number{}), sequence{}); + + return make_tile_window(tmp2_, make_tuple(number{}), {0}); + }(); + + auto x_window = [&]() { + if constexpr(kSaveX) + { + const auto tmp2_ = [&]() { + const auto tmp_ = make_naive_tensor_view( + static_cast(kargs.p_x), + make_tuple(kargs.m, kargs.n), + make_tuple(kargs.stride, 1), + number{}, + number<1>{}); + + return pad_tensor_view(tmp_, + make_tuple(number{}, number{}), + sequence{}); + }(); + return make_tile_window( + tmp2_, make_tuple(number{}, number{}), {iM, 0}); + } + else + return make_null_tile_window(make_tuple(number{}, number{})); + }(); + + auto yscale_window = [&]() { + auto tmp_ = make_naive_tensor_view( + static_cast(kargs.p_yscale), + make_tuple(kargs.m), + make_tuple(1), + number<1>{}); + + auto tmp2_ = pad_tensor_view(tmp_, make_tuple(number{}), sequence{}); + return make_tile_window(tmp2_, make_tuple(number{}), {iM}); + }(); + + auto qy_window = [&]() { + auto tmp_ = make_naive_tensor_view( + static_cast(kargs.p_qy), + make_tuple(kargs.m, kargs.n), + make_tuple(kargs.stride, 1), + number{}, + number<1>{}); + + auto tmp2_ = pad_tensor_view( + tmp_, make_tuple(number{}, number{}), sequence{}); + return make_tile_window( + tmp2_, make_tuple(number{}, number{}), {iM, 0}); + }(); + + __shared__ char smem[GetSmemSize()]; + + Pipeline{}(a_window, + b_window, + gamma_window, + x_window, + yscale_window, + qy_window, + static_cast(kargs.epsilon), + kargs.n, + smem); + } +}; + +} // namespace ck_tile 
diff --git a/include/ck_tile/ops/add_rmsnorm2d_rdquant/kernel/add_rmsnorm2d_rdquant_fwd_shape.hpp b/include/ck_tile/ops/add_rmsnorm2d_rdquant/kernel/add_rmsnorm2d_rdquant_fwd_shape.hpp new file mode 100644 index 000000000..a17c53c73 --- /dev/null +++ b/include/ck_tile/ops/add_rmsnorm2d_rdquant/kernel/add_rmsnorm2d_rdquant_fwd_shape.hpp @@ -0,0 +1,78 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck_tile/core.hpp" + +namespace ck_tile { +/* +// clang-format off + +4-level descriptor: BlockTile-> WarpPerBlock-> WarpTile-> Vector + + Block_N (Warp_N * WarpPerBlock_N * Repeat_N ) + +<----------------------< Repeat_N(2)>--------------------->+ + | | + +<-- -->+ + Warp_N + +--------------+--------------+--------------+--------------+----+----------------+ + Warp_M | warp_0 | warp_1 | | ^ ^ + +--------------+--------------+ | | + | warp_2 | warp_3 | | v + +--------------+--------------+--------------+--------------+----+ Block_M + | | | + + + | + | | | v + +--------------+--------------+--------------+--------------+ + + + each Warp-tile (e.g. 16 thrd per row) + + Vector_N (contiguous pixels each thrd holds along N, or vector size) + +-----------+-----------+-----------+-----------+-----------+ + | thrd_0 | thrd_1 | thrd_2 | thrd_3 | ... Vector_M + +-----------+-----------+-----------+-----------+-----------+ + | thrd_16 | thrd_17 | thrd_18 | thrd_19 | ... 
+ +-----------+-----------+-----------+-----------+-----------+ +// clang-format on +*/ +template + typename WarpPerBlock_, // num warps along seq + typename WarpTile_, // warp size, seq + typename Vector_, // contiguous pixels(vector size) along seq + index_t BlockSize_ = + warpSize* reduce_on_sequence(WarpPerBlock_{}, multiplies{}, number<1>{})> +struct AddRmsnorm2dRdquantShape +{ + // block size + static constexpr index_t Block_M = BlockTile_::at(number<0>{}); + static constexpr index_t Block_N = BlockTile_::at(number<1>{}); + + // num warps along seq, within each block + static constexpr index_t WarpPerBlock_M = WarpPerBlock_::at(number<0>{}); + static constexpr index_t WarpPerBlock_N = WarpPerBlock_::at(number<1>{}); + + // warp size + static constexpr index_t Warp_M = WarpTile_::at(number<0>{}); + static constexpr index_t Warp_N = WarpTile_::at(number<1>{}); + + static_assert(Block_M % (WarpPerBlock_M * Warp_M) == 0); + static_assert(Block_N % (WarpPerBlock_N * Warp_N) == 0); + // repeat of each thread along seq + static constexpr index_t Repeat_M = Block_M / (WarpPerBlock_M * Warp_M); + static constexpr index_t Repeat_N = Block_N / (WarpPerBlock_N * Warp_N); + + // vector size along seq + static constexpr index_t Vector_M = Vector_::at(number<0>{}); + static constexpr index_t Vector_N = Vector_::at(number<1>{}); + + static_assert(Warp_M % Vector_M == 0); + static_assert(Warp_N % Vector_N == 0); + // num of threads along seq, within each warp + static constexpr index_t ThreadPerWarp_M = Warp_M / Vector_M; + static constexpr index_t ThreadPerWarp_N = Warp_N / Vector_N; + + static constexpr index_t BlockSize = BlockSize_; +}; + +} // namespace ck_tile diff --git a/include/ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_default_policy.hpp b/include/ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_default_policy.hpp new file mode 100644 index 000000000..73ba633b1 --- /dev/null +++ 
b/include/ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_default_policy.hpp @@ -0,0 +1,94 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck_tile/core.hpp" +#include "ck_tile/ops/reduce/block/block_reduce2d_problem.hpp" +#include "ck_tile/ops/reduce/block/block_reduce2d.hpp" + +namespace ck_tile { + +struct AddRmsnorm2dRdquantFwdPipelineDefaultPolicy +{ + template + CK_TILE_DEVICE static constexpr auto MakeABXBlockTileDistribution() + { + using S = typename Problem::BlockShape; + + return make_static_tile_distribution( + tile_distribution_encoding< + sequence<>, + tuple, + sequence>, + tuple, sequence<1, 2>>, + tuple, sequence<2, 2>>, + sequence<1, 1, 2, 2>, + sequence<0, 3, 0, 3>>{}); + } + template + CK_TILE_DEVICE static constexpr auto MakeGammaBlockTileDistribution() + { + using S = typename Problem::BlockShape; + + return make_static_tile_distribution( + tile_distribution_encoding< + sequence, + tuple>, + tuple, sequence<0, 1>>, + tuple, sequence<1, 2>>, + sequence<1, 1>, + sequence<0, 3>>{}); + } + + template + CK_TILE_HOST_DEVICE static constexpr auto GetBlockReduce2d() + { + using P_ = BlockReduce2dProblem; + return BlockReduce2d{}; + } + + template + CK_TILE_HOST_DEVICE static constexpr auto GetBlockReduce2dSync() + { + using P_ = BlockReduce2dProblem; + return BlockReduce2dSync{}; + } + + template + CK_TILE_HOST_DEVICE static constexpr auto GetBlockReduce2dCrossWarpSync() + { + using P_ = BlockReduce2dProblem; + return BlockReduce2dCrossWarpSync{}; + } + + template + CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize() + { + if constexpr(Problem::kNeedCrossWarpSync) + { + using P_ = BlockReduce2dProblem; + + using block_reduce2d = BlockReduce2d; + using x_block_tile = + decltype(make_static_distributed_tensor( + MakeABXBlockTileDistribution())); + using y_block_tile = decltype(block_reduce2d::template MakeYBlockTile()); + + 
return GetBlockReduce2dCrossWarpSync().template GetSmemSize(); + } + else + { + return 1; // zero size arrays are an extension + } + } +}; +} // namespace ck_tile diff --git a/include/ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_one_pass.hpp b/include/ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_one_pass.hpp new file mode 100644 index 000000000..12a15938a --- /dev/null +++ b/include/ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_one_pass.hpp @@ -0,0 +1,142 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck_tile/core.hpp" +#include "ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_default_policy.hpp" +#include +#include + +namespace ck_tile { + +template +struct AddRmsnorm2dRdquantFwdPipelineOnePass +{ + using Problem = ck_tile::remove_cvref_t; + using Policy = ck_tile::remove_cvref_t; + + using ADataType = ck_tile::remove_cvref_t; + using BDataType = ck_tile::remove_cvref_t; + using GammaDataType = ck_tile::remove_cvref_t; + using ComputeDataType = ck_tile::remove_cvref_t; + using XDataType = ck_tile::remove_cvref_t; + using YScaleDataType = ck_tile::remove_cvref_t; + using QYDataType = ck_tile::remove_cvref_t; + + static constexpr bool kHasGamma = !std::is_same_v; + static constexpr bool kSaveX = Problem::kSaveX; + + static constexpr bool kNeedCrossWarpSync = Problem::kNeedCrossWarpSync; + static constexpr bool kPadM = false; // TODO - BlockAddRmsnorm2dRdquantFwdProblem::kPadM + static constexpr bool kPadN = Problem::kPadN; + + static constexpr const char* name = []() { + if constexpr(kNeedCrossWarpSync) + return "bpr_op"; // block per row + else + return "wpr_op"; // warp per row + }(); + + CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize() + { + return Policy::template GetSmemSize(); + } + + template + CK_TILE_DEVICE auto operator()(const AWindow& a_window_, + 
const BWindow& b_window_, + const GammaWindow& gamma_window_, + XWindow& x_window, + YScaleWindow& yscale_window, + QYWindow& qy_window, + ComputeDataType epsilon, + ck_tile::index_t row_size, + void* smem) const + { + const auto a_window = + make_tile_window(a_window_, Policy::template MakeABXBlockTileDistribution()); + const auto b_window = + make_tile_window(b_window_, Policy::template MakeABXBlockTileDistribution()); + const auto gamma_window = make_tile_window( + gamma_window_, Policy::template MakeGammaBlockTileDistribution()); + + auto reduce_square_sum_func = ReduceOp::SquareAdd{}; + auto reduce_sum_func = ReduceOp::Add{}; + auto reduce_absmax_func = ReduceOp::AbsMax{}; + auto reduce_max_func = ReduceOp::Max{}; + auto block_reduce2d = Policy::template GetBlockReduce2d(); + auto block_reduce2d_sync = Policy::template GetBlockReduce2dSync(); + auto block_reduce2d_cross_warp_sync = + Policy::template GetBlockReduce2dCrossWarpSync(); + + const auto a = load_tile(a_window); + const auto b = load_tile(b_window); + const auto gamma = load_tile(gamma_window); + + auto x = tile_elementwise_in( + [&](const auto& a_, const auto& b_) { + return type_convert(a_) + type_convert(b_); + }, + a, + b); + + if constexpr(kSaveX) + store_tile(x_window, cast_tile(x)); + + // compute mean square, each-thread->cross-lane->cross-warp + auto square_sum = block_reduce2d( + x, reduce_square_sum_func.GetIdentityValue(), reduce_square_sum_func); + block_reduce2d_sync(square_sum, reduce_sum_func); + block_reduce2d_cross_warp_sync(square_sum, smem, reduce_sum_func); + + auto inv_rms = tile_elementwise_in( + [&](const auto& v_) { + return type_convert(1.0f) / (sqrt(v_ / row_size + epsilon)); + }, + square_sum); + + // rmsnorm computation + auto y = make_static_distributed_tensor(x.get_tile_distribution()); + sweep_tile(y, [&, inv_rms_ = inv_rms](auto idx) { + constexpr auto i_idx = make_tuple(idx[number<0>{}]); + constexpr auto j_idx = make_tuple(idx[number<1>{}]); + + const auto gamma_ = 
type_convert(gamma[j_idx]); + + const auto x_ = type_convert(x[idx]); + auto y_ = x_ * inv_rms_[i_idx] * gamma_; + + y(idx) = type_convert(y_); + }); + + // compute absmax, each-thread->cross-lane->cross-warp + auto absmax = block_reduce2d( + y, reduce_absmax_func.GetIdentityValue(), reduce_absmax_func); + block_reduce2d_sync(absmax, reduce_max_func); + block_reduce2d_cross_warp_sync(absmax, smem, reduce_max_func); + + // ex: yscale = absmax / 127 if int8 + auto yscale = tile_elementwise_in( + [&](const auto& v_) { + return v_ / type_convert(numeric::max()); + }, + absmax); + store_tile(yscale_window, cast_tile(yscale)); + + // quantize y to qy + auto qy = make_static_distributed_tensor(y.get_tile_distribution()); + sweep_tile(qy, [&, yscale_ = yscale](auto idx) { + constexpr auto i_idx = make_tuple(idx[number<0>{}]); + auto qy_ = y[idx] / yscale_[i_idx]; + qy(idx) = saturates{}(qy_); + }); + store_tile(qy_window, qy); + } +}; +} // namespace ck_tile diff --git a/include/ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_problem.hpp b/include/ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_problem.hpp new file mode 100644 index 000000000..106e5086b --- /dev/null +++ b/include/ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_problem.hpp @@ -0,0 +1,41 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck_tile/core/utility/type_traits.hpp" + +namespace ck_tile { + +// X = A + B, Y = Rmsnorm2d(X), QY = RowwiseDynamicQuant(Y) = SaturateCast(Y / YScale) +template +struct AddRmsnorm2dRdquantFwdPipelineProblem +{ + using ADataType = remove_cvref_t; + using BDataType = remove_cvref_t; + using GammaDataType = remove_cvref_t; + using ComputeDataType = remove_cvref_t; + using XDataType = remove_cvref_t; + using YScaleDataType = remove_cvref_t; + using QYDataType = remove_cvref_t; + using BlockShape = remove_cvref_t; + + static constexpr bool kNeedCrossLaneSync = BlockShape::ThreadPerWarp_N > 1; + static constexpr bool kNeedCrossWarpSync = BlockShape::WarpPerBlock_N > 1; + + static constexpr bool kPadN = kPadN_; + static constexpr bool kSaveX = kSaveX_; + static constexpr bool kThreePass = kThreePass_; +}; + +} // namespace ck_tile diff --git a/include/ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_three_pass.hpp b/include/ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_three_pass.hpp new file mode 100644 index 000000000..0dbb20645 --- /dev/null +++ b/include/ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_three_pass.hpp @@ -0,0 +1,266 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck_tile/core.hpp" +#include "ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_default_policy.hpp" +#include +#include + +namespace ck_tile { + +template +struct AddRmsnorm2dRdquantFwdPipelineThreePass +{ + using Problem = ck_tile::remove_cvref_t; + using Policy = ck_tile::remove_cvref_t; + + using ADataType = ck_tile::remove_cvref_t; + using BDataType = ck_tile::remove_cvref_t; + using GammaDataType = ck_tile::remove_cvref_t; + using ComputeDataType = ck_tile::remove_cvref_t; + using XDataType = ck_tile::remove_cvref_t; + using YScaleDataType = ck_tile::remove_cvref_t; + using QYDataType = ck_tile::remove_cvref_t; + + static constexpr bool kHasGamma = !std::is_same_v; + static constexpr bool kSaveX = Problem::kSaveX; + + static constexpr bool kNeedCrossWarpSync = Problem::kNeedCrossWarpSync; + static constexpr bool kPadM = false; // TODO - BlockAddRmsnorm2dRdquantFwdProblem::kPadM + static constexpr bool kPadN = Problem::kPadN; + + static constexpr const char* name = []() { + if constexpr(kNeedCrossWarpSync) + return "bpr_tp"; // block per row + else + return "wpr_tp"; // warp per row + }(); + + CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize() + { + return Policy::template GetSmemSize(); + } + + template + CK_TILE_DEVICE auto operator()(const AWindow& a_window_, + const BWindow& b_window_, + const GammaWindow& gamma_window_, + XWindow& x_window_, + YScaleWindow& yscale_window, + QYWindow& qy_window, + ComputeDataType epsilon, + ck_tile::index_t row_size, + void* smem) const + { + auto a_window = + make_tile_window(a_window_, Policy::template MakeABXBlockTileDistribution()); + auto b_window = + make_tile_window(b_window_, Policy::template MakeABXBlockTileDistribution()); + auto x_window = [&]() { + if constexpr(kSaveX) + return make_tile_window(x_window_, + Policy::template MakeABXBlockTileDistribution()); + else + return x_window_; + }(); + auto gamma_window = make_tile_window( + gamma_window_, Policy::template 
MakeGammaBlockTileDistribution()); + + auto reduce_square_sum_func = ReduceOp::SquareAdd{}; + auto reduce_sum_func = ReduceOp::Add{}; + auto reduce_absmax_func = ReduceOp::AbsMax{}; + auto reduce_max_func = ReduceOp::Max{}; + auto block_reduce2d = Policy::template GetBlockReduce2d(); + auto block_reduce2d_sync = Policy::template GetBlockReduce2dSync(); + auto block_reduce2d_cross_warp_sync = + Policy::template GetBlockReduce2dCrossWarpSync(); + + static constexpr index_t Block_N = Problem::BlockShape::Block_N; + index_t num_n_tile_iteration = + __builtin_amdgcn_readfirstlane(integer_divide_ceil(row_size, Block_N)); + + using XTensorType = decltype(cast_tile(load_tile(a_window))); + auto square_sum = block_reduce2d.template MakeYBlockTile(); + set_tile(square_sum, reduce_square_sum_func.GetIdentityValue()); + + for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN) + { + const auto a = load_tile(a_window); + const auto b = load_tile(b_window); + + auto x = tile_elementwise_in( + [&](const auto& a_, const auto& b_) { + return type_convert(a_) + type_convert(b_); + }, + a, + b); + + if constexpr(kSaveX) + store_tile(x_window, cast_tile(x)); + + block_reduce2d(x, square_sum, reduce_square_sum_func); + move_tile_window(x_window, {0, Block_N}); + move_tile_window(a_window, {0, Block_N}); + move_tile_window(b_window, {0, Block_N}); + } + + block_reduce2d_sync(square_sum, reduce_sum_func); + block_reduce2d_cross_warp_sync(square_sum, smem, reduce_sum_func); + + auto inv_rms = tile_elementwise_in( + [&](const auto& v_) { + return type_convert(1.0f) / (sqrt(v_ / row_size + epsilon)); + }, + square_sum); + + // reverse read x to reuse cache + ck_tile::index_t stride_to_right_most_window = + row_size % Block_N == 0 ? 
row_size - Block_N : row_size - row_size % Block_N; + + if constexpr(kSaveX) + move_tile_window(x_window, {0, -Block_N}); + else + { + move_tile_window(a_window, {0, -Block_N}); + move_tile_window(b_window, {0, -Block_N}); + } + move_tile_window(gamma_window, {stride_to_right_most_window}); + + using YTensorType = XTensorType; + auto absmax = block_reduce2d.template MakeYBlockTile(); + set_tile(absmax, reduce_absmax_func.GetIdentityValue()); + + // rmsnorm computation + absmax(threadwise reduce) + if constexpr(kSaveX) + __syncthreads(); + + for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN) + { + auto x = [&]() { + if constexpr(kSaveX) + { + return load_tile(x_window); + } + else + { + const auto a = load_tile(a_window); + const auto b = load_tile(b_window); + return tile_elementwise_in( + [&](const auto& a_, const auto& b_) { + return type_convert(a_) + + type_convert(b_); + }, + a, + b); + } + }(); + + auto gamma = load_tile(gamma_window); + auto y = make_static_distributed_tensor(x.get_tile_distribution()); + + sweep_tile(y, [&](auto idx) { + constexpr auto i_idx = make_tuple(idx[number<0>{}]); + constexpr auto j_idx = make_tuple(idx[number<1>{}]); + + const auto gamma_ = type_convert(gamma[j_idx]); + + const auto x_ = type_convert(x[idx]); + auto y_ = x_ * inv_rms[i_idx] * gamma_; + + y(idx) = type_convert(y_); + }); + + block_reduce2d(y, absmax, reduce_absmax_func); + + if constexpr(kSaveX) + move_tile_window(x_window, {0, -Block_N}); + else + { + move_tile_window(a_window, {0, -Block_N}); + move_tile_window(b_window, {0, -Block_N}); + } + move_tile_window(gamma_window, {-Block_N}); + } + + // compute absmax, cross-lane->cross-warp + block_reduce2d_sync(absmax, reduce_max_func); + block_reduce2d_cross_warp_sync(absmax, smem, reduce_max_func); + + // ex: yscale = absmax / 127 if int8 + auto yscale = tile_elementwise_in( + [&](const auto& v_) { + return v_ / type_convert(numeric::max()); + }, + absmax); + store_tile(yscale_window, 
cast_tile(yscale)); + + // quantize y to qy + // recompute rmsnorm, try to save y in the future + if constexpr(kSaveX) + move_tile_window(x_window, {0, Block_N}); + else + { + move_tile_window(a_window, {0, Block_N}); + move_tile_window(b_window, {0, Block_N}); + } + move_tile_window(gamma_window, {Block_N}); + + for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN) + { + auto x = [&]() { + if constexpr(kSaveX) + { + return load_tile(x_window); + } + else + { + const auto a = load_tile(a_window); + const auto b = load_tile(b_window); + return tile_elementwise_in( + [&](const auto& a_, const auto& b_) { + return type_convert(a_) + + type_convert(b_); + }, + a, + b); + } + }(); + + auto gamma = load_tile(gamma_window); + auto y = make_static_distributed_tensor(x.get_tile_distribution()); + auto qy = make_static_distributed_tensor(y.get_tile_distribution()); + + sweep_tile(y, [&](auto idx) { + constexpr auto i_idx = make_tuple(idx[number<0>{}]); + constexpr auto j_idx = make_tuple(idx[number<1>{}]); + + const auto gamma_ = type_convert(gamma[j_idx]); + + const auto x_ = type_convert(x[idx]); + auto y_ = x_ * inv_rms[i_idx] * gamma_; + auto qy_ = y_ / yscale[i_idx]; + qy(idx) = saturates{}(qy_); + }); + + store_tile(qy_window, qy); + + if constexpr(kSaveX) + move_tile_window(x_window, {0, Block_N}); + else + { + move_tile_window(a_window, {0, Block_N}); + move_tile_window(b_window, {0, Block_N}); + } + move_tile_window(gamma_window, {Block_N}); + move_tile_window(qy_window, {0, Block_N}); + } + } +}; +} // namespace ck_tile diff --git a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_one_pass.hpp b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_one_pass.hpp index bf002141b..c767a472a 100644 --- a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_one_pass.hpp +++ b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_one_pass.hpp @@ -35,9 +35,9 @@ struct 
Layernorm2dFwdPipelineOnePass static constexpr const char* name = []() { if constexpr(kNeedCrossWarpSync) - return "bpr"; // block per row + return "bpr_op"; // block per row else - return "wpr"; // warp per row + return "wpr_op"; // warp per row }(); CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize() diff --git a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_two_pass.hpp b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_two_pass.hpp index db094ac2a..e35d02e70 100644 --- a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_two_pass.hpp +++ b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_two_pass.hpp @@ -35,9 +35,9 @@ struct Layernorm2dFwdPipelineTwoPass static constexpr const char* name = []() { if constexpr(kNeedCrossWarpSync) - return "bpr"; // block per row + return "bpr_tp"; // block per row else - return "wpr"; // warp per row + return "wpr_tp"; // warp per row }(); CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize() @@ -118,8 +118,6 @@ struct Layernorm2dFwdPipelineTwoPass ck_tile::index_t stride_to_right_most_window = row_size % Block_N == 0 ? 
row_size - Block_N : row_size - row_size % Block_N; - // x_window.foo(); - // gamma_window.foo(); move_tile_window(x_window, {0, -Block_N}); move_tile_window(gamma_window, {stride_to_right_most_window}); move_tile_window(beta_window, {stride_to_right_most_window}); diff --git a/include/ck_tile/ops/reduce.hpp b/include/ck_tile/ops/reduce.hpp index a5ba745d2..fe2d24044 100644 --- a/include/ck_tile/ops/reduce.hpp +++ b/include/ck_tile/ops/reduce.hpp @@ -4,4 +4,7 @@ #pragma once #include "ck_tile/ops/reduce/block/block_reduce.hpp" +#include "ck_tile/ops/reduce/block/block_reduce2d.hpp" +#include "ck_tile/ops/reduce/block/block_reduce2d_default_policy.hpp" +#include "ck_tile/ops/reduce/block/block_reduce2d_problem.hpp" #include "ck_tile/ops/common/tensor_layout.hpp" diff --git a/include/ck_tile/ops/reduce/block/block_reduce.hpp b/include/ck_tile/ops/reduce/block/block_reduce.hpp index 51d55235e..d9df949cf 100644 --- a/include/ck_tile/ops/reduce/block/block_reduce.hpp +++ b/include/ck_tile/ops/reduce/block/block_reduce.hpp @@ -6,6 +6,7 @@ #include "ck_tile/core.hpp" #include +// This file does not support cross-warp reduce namespace ck_tile { /* @@ -15,8 +16,8 @@ namespace ck_tile { // synchronize reduce result (cross lane reduction and broadcast on replicated dimension) template CK_TILE_DEVICE void block_tile_reduce_sync(AccDistributedTensor_& acc_tensor, - const ReduceFunc& reduce_func, - bool_constant = {}) + const ReduceFunc& reduce_func, + bool_constant = {}) { using Dstr = typename AccDistributedTensor_::StaticTileDistribution; using DstrEncode = typename Dstr::DstrEncode; @@ -115,7 +116,7 @@ CK_TILE_DEVICE void block_tile_reduce_sync(AccDistributedTensor_& acc_tensor, */ template CK_TILE_DEVICE void block_tile_reduce_xor_sync(AccDistributedTensor_& acc_tensor, - const ReduceFunc& reduce_func) + const ReduceFunc& reduce_func) { using Dstr = typename AccDistributedTensor_::StaticTileDistribution; using DstrEncode = typename Dstr::DstrEncode; @@ -174,9 +175,9 @@ 
template CK_TILE_DEVICE void block_tile_reduce(AccDistributedTensor_& acc_tensor, - const InDistributedTensor_& in_tensor, - sequence, - const ReduceFunc& reduce_func) + const InDistributedTensor_& in_tensor, + sequence, + const ReduceFunc& reduce_func) { constexpr auto I0 = number<0>{}; constexpr auto I1 = number<1>{}; @@ -249,9 +250,9 @@ template CK_TILE_DEVICE auto block_tile_reduce(const InDistributedTensor_& in_tensor, - sequence in_reduce_dims, - const ReduceFunc& reduce_func, - const InDataType_& reduce_init) + sequence in_reduce_dims, + const ReduceFunc& reduce_func, + const InDataType_& reduce_init) { using InDataType = typename InDistributedTensor_::DataType; using AccDataType = remove_cvref_t; diff --git a/include/ck_tile/ops/reduce/block/block_reduce2d.hpp b/include/ck_tile/ops/reduce/block/block_reduce2d.hpp new file mode 100644 index 000000000..beb8c718e --- /dev/null +++ b/include/ck_tile/ops/reduce/block/block_reduce2d.hpp @@ -0,0 +1,260 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck_tile/core.hpp" + +namespace ck_tile { + +template +struct BlockReduce2d +{ + // in-thread reduction + using Problem = remove_cvref_t; + using XDataType = typename Problem::XDataType; + using ComputeDataType = typename Problem::ComputeDataType; + + CK_TILE_DEVICE constexpr BlockReduce2d() {} + + template + CK_TILE_DEVICE void operator()(const XDistributedTensor_& x_tensor, + YDistributedTensor_& y_tensor, + const ReduceFunc& reduce_func) + { + constexpr auto I0 = number<0>{}; + constexpr auto I1 = number<1>{}; + + constexpr auto spans = XDistributedTensor_::get_distributed_spans(); + + // FIXME: hard coded to reduce 2nd axis + sweep_tile_span(spans[I0], [&](auto dstr_idx_i0) { + constexpr auto y_dstr_idx = make_tuple(dstr_idx_i0); + + auto y = y_tensor[y_dstr_idx]; + + sweep_tile_span(spans[I1], [&](auto dstr_idx_i1) { + constexpr auto in_dstr_idx = make_tuple(dstr_idx_i0, dstr_idx_i1); + const auto x = ck_tile::type_convert(x_tensor[in_dstr_idx]); + + y = reduce_func(y, x); + }); + + y_tensor(y_dstr_idx) = y; + }); + } + + template + CK_TILE_DEVICE static auto MakeYBlockTile() + { + static_assert(std::is_same_v, "wrong!"); + + // FIXME: hard coded to reduce 2nd axis + constexpr auto reduce_dims = sequence<1>{}; + + constexpr auto dstr = + make_static_tile_distribution(detail::make_reduce_tile_distribution_encoding( + XDistributedTensor_::get_tile_distribution() + .get_static_tile_distribution_encoding(), + reduce_dims)); + + auto tensor = make_static_distributed_tensor(dstr); + + return tensor; + } + + template + CK_TILE_DEVICE auto operator()(const XDistributedTensor_& x_tensor, + const ComputeDataType& reduce_init, + const ReduceFunc& reduce_func) + { + auto y_tensor = MakeYBlockTile(); + set_tile(y_tensor, reduce_init); + (*this)(x_tensor, y_tensor, reduce_func); + + return y_tensor; + } +}; + +template +struct BlockReduce2dSync +{ + using Problem = remove_cvref_t; + + template + CK_TILE_DEVICE void operator()(YDistributedTensor_& 
y_tensor, const ReduceFunc& reduce_func) + { + using Dstr = typename YDistributedTensor_::StaticTileDistribution; + using DstrEncode = typename Dstr::DstrEncode; + using DstrEncodeDetail = typename DstrEncode::detail; + + constexpr index_t NDimP = Dstr::get_num_of_dimension_p(); + constexpr index_t NDimR = Dstr::get_num_of_dimension_r(); + + constexpr index_t idim_p_lane = NDimP - 1; + + // const auto ps_idx = make_array(get_warp_id(), get_lane_id()); + // const auto rs_idx = + // y_tensor.get_tile_distribution().calculate_rs_index_from_ps_index(ps_idx); + + constexpr index_t thread_buf_size = YDistributedTensor_::get_thread_buffer_size(); + + // loop over thread data + static_for<0, thread_buf_size, 1>{}([&](auto i) { + auto v_local = y_tensor.get_thread_buffer()[i]; + + // cross-lane reduce for replication + // only reduce on R dimension correspond to lane + // (lane id maps to this R dimension) + static_for<0, NDimR, 1>{}([&](auto idim_r) { + // FIXME: nasty to use does_p_own_r_ + if constexpr(DstrEncodeDetail::does_p_own_r_[idim_p_lane][idim_r]) + { + constexpr index_t r_length = DstrEncode::rs_lengths_[idim_r]; + + constexpr index_t lid_over_rid_derivative = + DstrEncodeDetail::ps_over_rs_derivative_[idim_p_lane][idim_r]; + + static_assert(is_power_of_two_integer(r_length), + "wrong! only support power of 2 reduction"); + + constexpr index_t nstage = integer_log2_floor(r_length); + + // reduction sweep forward + static_for<0, nstage, 1>{}([&](auto istage) { + // xor + index_t src_lane = + (__lane_id()) ^ + (number{}.value); + + // pull data from remote lane + const auto v_remote = warp_shuffle(v_local, src_lane); + + // reduce + v_local = reduce_func(v_local, v_remote); + }); + } + }); + + // TODO - Do we need to broadcast to other lane? 
+ y_tensor.get_thread_buffer()(i) = v_local; + }); + } +}; + +template +struct BlockReduce2dCrossWarpSync +{ + using Problem = remove_cvref_t; + using BlockShape = typename Problem::BlockShape; + + template + CK_TILE_DEVICE static constexpr index_t GetReduceWarps() + { + constexpr index_t num_reduce_warps = [&]() { + using Dstr = typename YDistributedTensor_::StaticTileDistribution; + using DstrEncode = typename Dstr::DstrEncode; + using DstrEncodeDetail = typename DstrEncode::detail; + + constexpr index_t NDimR = Dstr::get_num_of_dimension_r(); + + constexpr index_t idim_p_warp = 0; + + index_t len_ = 1; + static_for<0, NDimR, 1>{}([&](auto idim_r) { + if constexpr(DstrEncodeDetail::does_p_own_r_[idim_p_warp][idim_r]) + { + constexpr index_t r_length = DstrEncode::rs_lengths_[idim_r]; + len_ *= r_length; + } + }); + return len_; + }(); + return num_reduce_warps; + } + + // return in byte + template + CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize() + { + using DataType = typename YDistributedTensor_::DataType; + // constexpr auto num_reduce_warps = GetReduceWarps(); + + constexpr index_t thread_buf_size = YDistributedTensor_::get_thread_buffer_size(); + + // we need to store all data from every wave into smem + // e.g. 
2x2 reduce along N + // -------------> reduce N + // | w0 | w1 | ___> | w01 | + // | w2 | w3 | | w23 | + // + // -> store data from every wave into LDS + // + // + // -------------> reduce N + // | w0 | w1 | w2 | w3 | -----> | w0123 | + // + // -> also store data from every wave into LDS + constexpr index_t num_warps = BlockShape::BlockSize / warpSize; + return num_warps * thread_buf_size * sizeof(DataType); + } + + template + CK_TILE_DEVICE void + operator()(YDistributedTensor_& y_tensor, void* smem, const ReduceFunc& reduce_func) + { + using DataType = typename YDistributedTensor_::DataType; + + constexpr index_t thread_buf_size = YDistributedTensor_::get_thread_buffer_size(); + + DataType* smem_ptr = reinterpret_cast(smem); + const index_t lane_id = get_lane_id(); + const index_t warp_id = get_warp_id(); + constexpr auto num_reduce_warps = GetReduceWarps(); + constexpr index_t num_warps = BlockShape::BlockSize / warpSize; + const index_t smem_offset = warp_id; + + // skip if nonthing to do + if constexpr(num_reduce_warps == 1) + return; + + // store into smem only for lane-0 within one warp + if(lane_id == 0) + { + static_for<0, thread_buf_size, 1>{}([&](auto i) { + smem_ptr[smem_offset + i * num_warps] = y_tensor.get_thread_buffer()[i]; + }); + } + block_sync_lds(); + + // load from smem. 
here we let everythread to do compute :) + index_t local_warp_id = warp_id / num_reduce_warps; + index_t local_smem_os = local_warp_id * num_reduce_warps; + DataType all_scratch[thread_buf_size * num_reduce_warps]; + static_for<0, thread_buf_size, 1>{}([&](auto i_0) { + static_for<0, num_reduce_warps, 1>{}([&](auto i_1) { + all_scratch[i_0 * num_reduce_warps + i_1] = + smem_ptr[i_0 * num_warps + local_smem_os + i_1]; + }); + }); + block_sync_lds(); // TODO: we don't need sync here + + static_for<0, thread_buf_size, 1>{}([&](auto i_0) { + // TODO: use descriptor for this + auto v_local = all_scratch[i_0 * num_reduce_warps]; + + // further reduce mean/var + static_for<0, num_reduce_warps - 1, 1>{}([&](auto i_1_n1) { + constexpr auto i_1 = number{}; + const DataType v_remote = all_scratch[i_0 * num_reduce_warps + i_1]; + + // reduce + v_local = reduce_func(v_local, v_remote); + }); + + y_tensor.get_thread_buffer()(i_0) = v_local; + }); + } +}; + +} // namespace ck_tile diff --git a/include/ck_tile/ops/reduce/block/block_reduce2d_default_policy.hpp b/include/ck_tile/ops/reduce/block/block_reduce2d_default_policy.hpp new file mode 100644 index 000000000..3c547242d --- /dev/null +++ b/include/ck_tile/ops/reduce/block/block_reduce2d_default_policy.hpp @@ -0,0 +1,79 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck_tile/core.hpp" +#include "ck_tile/ops/reduce/block/block_reduce2d_problem.hpp" +#include "ck_tile/ops/reduce/block/block_reduce2d.hpp" + +namespace ck_tile { + +struct BlockReduce2dDefaultPolicy +{ + template + CK_TILE_DEVICE static constexpr auto MakeXBlockTileDistribution() + { + using S = typename Problem::BlockShape; + return make_static_tile_distribution( + tile_distribution_encoding< + sequence<>, + tuple, + sequence>, + tuple, sequence<1, 2>>, + tuple, sequence<2, 2>>, + sequence<1, 1, 2, 2>, + sequence<0, 3, 0, 3>>{}); + } + + template + CK_TILE_HOST_DEVICE static constexpr auto GetBlockReduce2d() + { + using P_ = BlockReduce2dProblem; + return BlockReduce2d{}; + } + + template + CK_TILE_HOST_DEVICE static constexpr auto GetBlockReduce2dSync() + { + using P_ = BlockReduce2dProblem; + return BlockReduce2dSync{}; + } + + template + CK_TILE_HOST_DEVICE static constexpr auto GetBlockReduce2dCrossWarpSync() + { + using P_ = BlockReduce2dProblem; + return BlockReduce2dCrossWarpSync{}; + } + + template + CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize() + { + if constexpr(Problem::kNeedCrossWarpSync) + { + using P_ = BlockReduce2dProblem; + + using block_reduce2d = BlockReduce2d; + using x_block_tile = + decltype(make_static_distributed_tensor( + MakeXBlockTileDistribution())); + using y_block_tile = decltype(block_reduce2d::template MakeYBlockTile()); + + return GetBlockReduce2dCrossWarpSync().template GetSmemSize(); + } + else + { + return 1; // zero size arrays are an extension + } + } +}; +} // namespace ck_tile diff --git a/include/ck_tile/ops/reduce/block/block_reduce2d_problem.hpp b/include/ck_tile/ops/reduce/block/block_reduce2d_problem.hpp new file mode 100644 index 000000000..b75f4f076 --- /dev/null +++ b/include/ck_tile/ops/reduce/block/block_reduce2d_problem.hpp @@ -0,0 +1,18 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck_tile/core.hpp" + +namespace ck_tile { + +template +struct BlockReduce2dProblem +{ + using XDataType = remove_cvref_t; + using ComputeDataType = remove_cvref_t; + using BlockShape = remove_cvref_t; +}; + +} // namespace ck_tile diff --git a/include/ck_tile/ops/rmsnorm2d.hpp b/include/ck_tile/ops/rmsnorm2d.hpp new file mode 100644 index 000000000..98c60f1b5 --- /dev/null +++ b/include/ck_tile/ops/rmsnorm2d.hpp @@ -0,0 +1,12 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck_tile/ops/rmsnorm2d/kernel/rmsnorm2d_fwd_kernel.hpp" +#include "ck_tile/ops/rmsnorm2d/kernel/rmsnorm2d_fwd_shape.hpp" +#include "ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_default_policy.hpp" +#include "ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_one_pass.hpp" +#include "ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_problem.hpp" +#include "ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_two_pass.hpp" +#include "ck_tile/ops/common/tensor_layout.hpp" diff --git a/include/ck_tile/ops/rmsnorm2d/kernel/rmsnorm2d_fwd_kernel.hpp b/include/ck_tile/ops/rmsnorm2d/kernel/rmsnorm2d_fwd_kernel.hpp new file mode 100644 index 000000000..99084a25e --- /dev/null +++ b/include/ck_tile/ops/rmsnorm2d/kernel/rmsnorm2d_fwd_kernel.hpp @@ -0,0 +1,202 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck_tile/core.hpp" +#include "ck_tile/ops/common.hpp" + +namespace ck_tile { + +// host side args +struct Rmsnorm2dFwdHostArgs +{ + const void* p_x; + const void* p_gamma; + + void* p_y; + void* p_invRms; + + float epsilon; + + index_t m; + index_t n; + index_t stride; // row_stride +}; + +// TODO: Extract some type to wrapper class +template +struct Rmsnorm2dFwd +{ + using Pipeline = remove_cvref_t; + using Problem = typename Pipeline::Problem; + + using XDataType = remove_cvref_t; + using GammaDataType = remove_cvref_t; + using ComputeDataType = remove_cvref_t; + using YDataType = remove_cvref_t; + using InvRmsDataType = remove_cvref_t; + + static constexpr bool kHasGamma = !std::is_same_v; + static constexpr bool kSaveInvRms = Problem::kSaveInvRms; + + static constexpr index_t Block_M = Problem::BlockShape::Block_M; + static constexpr index_t Block_N = Problem::BlockShape::Block_N; + static constexpr bool kPadM = false; // always no need to pad along M + static constexpr bool kPadN = Problem::kPadN; + static constexpr bool kTwoPass = Problem::kTwoPass; + + static constexpr index_t ThreadPerWarp_N = Problem::BlockShape::ThreadPerWarp_N; + static constexpr index_t Vector_N = Problem::BlockShape::Vector_N; + static constexpr index_t Repeat_N = Problem::BlockShape::Repeat_N; + + static constexpr auto I0 = number<0>{}; + static constexpr auto I1 = number<1>{}; + + struct Kargs + { + const void* p_x; + const void* p_gamma; + + void* p_y; + void* p_invRms; + + float epsilon; + + index_t m; + index_t n; + index_t stride; // row_stride + }; + using Hargs = Rmsnorm2dFwdHostArgs; + + CK_TILE_HOST static constexpr Kargs MakeKargs(const Hargs& hargs) + { + return Kargs{hargs.p_x, + hargs.p_gamma, + hargs.p_y, + hargs.p_invRms, + hargs.epsilon, + hargs.m, + hargs.n, + hargs.stride}; + } + + CK_TILE_HOST static constexpr auto GridSize(const Hargs& hargs) + { + return (hargs.m + Block_M - 1) / Block_M; + } + + CK_TILE_HOST static constexpr auto 
BlockSize() { return Problem::BlockShape::BlockSize; } + + // clang-format off + template struct t2s; + template <> struct t2s { static constexpr const char * name = "fp32"; }; + template <> struct t2s { static constexpr const char * name = "fp16"; }; + template <> struct t2s { static constexpr const char * name = "bf16"; }; + template <> struct t2s { static constexpr const char * name = "fp8"; }; + template <> struct t2s { static constexpr const char * name = "bf8"; }; + // clang-format on + + // in byte + CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize() { return Pipeline::GetSmemSize(); } + + CK_TILE_HOST static std::string GetName() + { + // clang-format off + using S_ = typename Problem::BlockShape; + auto surfix = [&] () { + std::string n; + if (kPadN) n += "_pn"; + if (kSaveInvRms) n += "_rms"; + if (kTwoPass) n += "_2p"; + return n; }(); + + #define _SS_ std::string + #define _TS_ std::to_string + return _SS_("rmsnorm2d_fwd_") + _SS_(t2s::name) + "_" + + _TS_(S_::Block_M) + "x" + _TS_(S_::Block_N) + "_" + _TS_(S_::WarpPerBlock_M) + "x" + _TS_(S_::WarpPerBlock_N) + "_" + + _TS_(S_::Warp_M) + "x" + _TS_(S_::Warp_N) + "_" + _TS_(S_::Vector_M) + "x" + _TS_(S_::Vector_N) + "_" + + _SS_(Pipeline::name) + surfix; + #undef _SS_ + #undef _TS_ + // clang-format on + } + + CK_TILE_DEVICE void operator()(Kargs kargs) const + { + const auto iM = get_block_id() * Block_M; + + const auto x_window = [&]() { + const auto tmp_ = make_naive_tensor_view( + static_cast(kargs.p_x), + make_tuple(kargs.m, kargs.n), + make_tuple(kargs.stride, 1), + number{}, + number<1>{}); + + const auto tmp2_ = pad_tensor_view( + tmp_, make_tuple(number{}, number{}), sequence{}); + return make_tile_window( + tmp2_, make_tuple(number{}, number{}), {iM, 0}); + }(); + + const auto gamma_window = [&]() { + const auto tmp_ = make_naive_tensor_view( + static_cast(kargs.p_gamma), + make_tuple(kargs.n), + make_tuple(1), + number{}, + number<1>{}); + + const auto tmp2_ = + pad_tensor_view(tmp_, 
make_tuple(number{}), sequence{}); + + return make_tile_window(tmp2_, make_tuple(number{}), {0}); + }(); + + auto y_window = [&]() { + auto tmp_ = make_naive_tensor_view( + static_cast(kargs.p_y), + make_tuple(kargs.m, kargs.n), + make_tuple(kargs.stride, 1), + number{}, + number<1>{}); + + auto tmp2_ = pad_tensor_view( + tmp_, make_tuple(number{}, number{}), sequence{}); + return make_tile_window( + tmp2_, make_tuple(number{}, number{}), {iM, 0}); + }(); + + auto inv_rms_window = [&]() { + if constexpr(kSaveInvRms) + { + const auto inv_rms_m = [&]() { + const auto inv_rms_dram_naive = + make_naive_tensor_view_packed( + static_cast(kargs.p_invRms), + make_tuple(kargs.m), + number<1>{}); + + return pad_tensor_view( + inv_rms_dram_naive, make_tuple(number{}), sequence{}); + }(); + return make_tile_window(inv_rms_m, make_tuple(number{}), {iM}); + } + else + return make_null_tile_window(make_tuple(number{})); + }(); + + __shared__ char smem[GetSmemSize()]; + + Pipeline{}(x_window, + gamma_window, + y_window, + inv_rms_window, + static_cast(kargs.epsilon), + kargs.n, + smem); + } +}; + +} // namespace ck_tile diff --git a/include/ck_tile/ops/rmsnorm2d/kernel/rmsnorm2d_fwd_shape.hpp b/include/ck_tile/ops/rmsnorm2d/kernel/rmsnorm2d_fwd_shape.hpp new file mode 100644 index 000000000..fb484a106 --- /dev/null +++ b/include/ck_tile/ops/rmsnorm2d/kernel/rmsnorm2d_fwd_shape.hpp @@ -0,0 +1,78 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck_tile/core.hpp" + +namespace ck_tile { +/* +// clang-format off + +4-level descriptor: BlockTile-> WarpPerBlock-> WarpTile-> Vector + + Block_N (Warp_N * WarpPerBlock_N * Repeat_N ) + +<----------------------< Repeat_N(2)>--------------------->+ + | | + +<-- -->+ + Warp_N + +--------------+--------------+--------------+--------------+----+----------------+ + Warp_M | wrap_0 | wrap_1 | | ^ ^ + +--------------+--------------+ | | + | wrap_2 | wrap_3 | | v + +--------------+--------------+--------------+--------------+----+ Block_M + | | | + + + | + | | | v + +--------------+--------------+--------------+--------------+ + + + each Warp-tile (e.g 16 thrd per row) + + Vector_N (contiguous pixels each thrd holds along N, or vector size) + +-----------+-----------+-----------+-----------+-----------+ + | thrd_0 | thrd_1 | thrd_2 | thrd_3 | ... Vector_M + +-----------+-----------+-----------+-----------+-----------+ + | thrd_16 | thrd_17 | thrd_18 | thrd_19 | ... + +-----------+-----------+-----------+-----------+-----------+ +// clang-format on +*/ +template + typename WarpPerBlock_, // num warps along seq + typename WarpTile_, // warp size, seq + typename Vector_, // contiguous pixels(vector size) along seq + index_t BlockSize_ = + warpSize* reduce_on_sequence(WarpPerBlock_{}, multiplies{}, number<1>{})> +struct Rmsnorm2dShape +{ + // block size + static constexpr index_t Block_M = BlockTile_::at(number<0>{}); + static constexpr index_t Block_N = BlockTile_::at(number<1>{}); + + // num warps along seq, within each block + static constexpr index_t WarpPerBlock_M = WarpPerBlock_::at(number<0>{}); + static constexpr index_t WarpPerBlock_N = WarpPerBlock_::at(number<1>{}); + + // warp size + static constexpr index_t Warp_M = WarpTile_::at(number<0>{}); + static constexpr index_t Warp_N = WarpTile_::at(number<1>{}); + + static_assert(Block_M % (WarpPerBlock_M * Warp_M) == 0); + static_assert(Block_N % (WarpPerBlock_N * Warp_N) == 0); + // repeat 
of each thread along seq + static constexpr index_t Repeat_M = Block_M / (WarpPerBlock_M * Warp_M); + static constexpr index_t Repeat_N = Block_N / (WarpPerBlock_N * Warp_N); + + // vector size along seq + static constexpr index_t Vector_M = Vector_::at(number<0>{}); + static constexpr index_t Vector_N = Vector_::at(number<1>{}); + + static_assert(Warp_M % Vector_M == 0); + static_assert(Warp_N % Vector_N == 0); + // num of threads along seq, within each warp + static constexpr index_t ThreadPerWarp_M = Warp_M / Vector_M; + static constexpr index_t ThreadPerWarp_N = Warp_N / Vector_N; + + static constexpr index_t BlockSize = BlockSize_; +}; + +} // namespace ck_tile diff --git a/include/ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_default_policy.hpp b/include/ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_default_policy.hpp new file mode 100644 index 000000000..e4814cf45 --- /dev/null +++ b/include/ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_default_policy.hpp @@ -0,0 +1,94 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck_tile/core.hpp" +#include "ck_tile/ops/reduce/block/block_reduce2d_problem.hpp" +#include "ck_tile/ops/reduce/block/block_reduce2d.hpp" + +namespace ck_tile { + +struct Rmsnorm2dFwdPipelineDefaultPolicy +{ + template + CK_TILE_DEVICE static constexpr auto MakeXBlockTileDistribution() + { + using S = typename Problem::BlockShape; + + return make_static_tile_distribution( + tile_distribution_encoding< + sequence<>, + tuple, + sequence>, + tuple, sequence<1, 2>>, + tuple, sequence<2, 2>>, + sequence<1, 1, 2, 2>, + sequence<0, 3, 0, 3>>{}); + } + template + CK_TILE_DEVICE static constexpr auto MakeGammaBlockTileDistribution() + { + using S = typename Problem::BlockShape; + + return make_static_tile_distribution( + tile_distribution_encoding< + sequence, + tuple>, + tuple, sequence<0, 1>>, + tuple, sequence<1, 2>>, + sequence<1, 1>, + sequence<0, 3>>{}); + } + + template + CK_TILE_HOST_DEVICE static constexpr auto GetBlockReduce2d() + { + using P_ = BlockReduce2dProblem; + return BlockReduce2d{}; + } + + template + CK_TILE_HOST_DEVICE static constexpr auto GetBlockReduce2dSync() + { + using P_ = BlockReduce2dProblem; + return BlockReduce2dSync{}; + } + + template + CK_TILE_HOST_DEVICE static constexpr auto GetBlockReduce2dCrossWarpSync() + { + using P_ = BlockReduce2dProblem; + return BlockReduce2dCrossWarpSync{}; + } + + template + CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize() + { + if constexpr(Problem::kNeedCrossWarpSync) + { + using P_ = BlockReduce2dProblem; + + using block_reduce2d = BlockReduce2d; + using x_block_tile = + decltype(make_static_distributed_tensor( + MakeXBlockTileDistribution())); + using y_block_tile = decltype(block_reduce2d::template MakeYBlockTile()); + + return GetBlockReduce2dCrossWarpSync().template GetSmemSize(); + } + else + { + return 1; // zero size arrays are an extension + } + } +}; +} // namespace ck_tile diff --git a/include/ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_one_pass.hpp 
b/include/ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_one_pass.hpp new file mode 100644 index 000000000..68cfe4282 --- /dev/null +++ b/include/ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_one_pass.hpp @@ -0,0 +1,101 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck_tile/core.hpp" +#include "ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_default_policy.hpp" +#include +#include + +namespace ck_tile { + +template +struct Rmsnorm2dFwdPipelineOnePass +{ + using Problem = ck_tile::remove_cvref_t; + using Policy = ck_tile::remove_cvref_t; + + using XDataType = ck_tile::remove_cvref_t; + using GammaDataType = ck_tile::remove_cvref_t; + using ComputeDataType = ck_tile::remove_cvref_t; + using YDataType = ck_tile::remove_cvref_t; + using InvRmsDataType = ck_tile::remove_cvref_t; + + static constexpr bool kHasGamma = !std::is_same_v; + static constexpr bool kSaveInvRms = Problem::kSaveInvRms; + + static constexpr bool kNeedCrossWarpSync = Problem::kNeedCrossWarpSync; + static constexpr bool kPadM = false; // TODO - BlockRmsnorm2dFwdProblem::kPadM + static constexpr bool kPadN = Problem::kPadN; + + static constexpr const char* name = []() { + if constexpr(kNeedCrossWarpSync) + return "bpr_op"; // block per row + else + return "wpr_op"; // warp per row + }(); + + CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize() + { + return Policy::template GetSmemSize(); + } + + template + CK_TILE_DEVICE auto operator()(const XWindow& x_window_, + const GammaWindow& gamma_window_, + YWindow& y_window, + InvRmsWindow& inv_rms_window, + ComputeDataType epsilon, + ck_tile::index_t row_size, + void* smem) const + { + const auto x_window = + make_tile_window(x_window_, Policy::template MakeXBlockTileDistribution()); + const auto gamma_window = make_tile_window( + gamma_window_, Policy::template MakeGammaBlockTileDistribution()); + + auto reduce_square_sum_func = 
ReduceOp::SquareAdd{}; + auto reduce_sum_func = ReduceOp::Add{}; + auto block_reduce2d = Policy::template GetBlockReduce2d(); + auto block_reduce2d_sync = Policy::template GetBlockReduce2dSync(); + auto block_reduce2d_cross_warp_sync = + Policy::template GetBlockReduce2dCrossWarpSync(); + + const auto x = load_tile(x_window); + // load gamma (TODO: support no gamma?) + const auto gamma = load_tile(gamma_window); + + // compute mean square each-thread->cross-lane->cross-warp + auto square_sum = block_reduce2d( + x, reduce_square_sum_func.GetIdentityValue(), reduce_square_sum_func); + block_reduce2d_sync(square_sum, reduce_sum_func); + block_reduce2d_cross_warp_sync(square_sum, smem, reduce_sum_func); + + // compute inv-rms + auto inv_rms = tile_elementwise_in( + [&](const auto& v_) { + return type_convert(1.0f) / (sqrt(v_ / row_size + epsilon)); + }, + square_sum); + + if constexpr(kSaveInvRms) + store_tile(inv_rms_window, cast_tile(inv_rms)); + + // rmsnorm computation + auto y = make_static_distributed_tensor(x.get_tile_distribution()); + sweep_tile(y, [&, inv_rms_ = inv_rms](auto idx) { + constexpr auto i_idx = make_tuple(idx[number<0>{}]); + constexpr auto j_idx = make_tuple(idx[number<1>{}]); + + const auto gamma_ = type_convert(gamma[j_idx]); + + const auto x_ = type_convert(x[idx]); + auto y_ = x_ * inv_rms_[i_idx] * gamma_; + + y(idx) = type_convert(y_); + }); + store_tile(y_window, y); + } +}; +} // namespace ck_tile diff --git a/include/ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_problem.hpp b/include/ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_problem.hpp new file mode 100644 index 000000000..87cab3463 --- /dev/null +++ b/include/ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_problem.hpp @@ -0,0 +1,36 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck_tile/core/utility/type_traits.hpp" + +namespace ck_tile { + +template +struct Rmsnorm2dFwdPipelineProblem +{ + using XDataType = remove_cvref_t; + using GammaDataType = remove_cvref_t; + using ComputeDataType = remove_cvref_t; + using YDataType = remove_cvref_t; + using InvRmsDataType = remove_cvref_t; + using BlockShape = remove_cvref_t; + + static constexpr bool kNeedCrossLaneSync = BlockShape::ThreadPerWarp_N > 1; + static constexpr bool kNeedCrossWarpSync = BlockShape::WarpPerBlock_N > 1; + + static constexpr bool kPadN = kPadN_; + static constexpr bool kSaveInvRms = kSaveInvRms_; + static constexpr bool kTwoPass = kTwoPass_; +}; + +} // namespace ck_tile diff --git a/include/ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_two_pass.hpp b/include/ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_two_pass.hpp new file mode 100644 index 000000000..a892df6bd --- /dev/null +++ b/include/ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_two_pass.hpp @@ -0,0 +1,131 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck_tile/core.hpp" +#include "ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_default_policy.hpp" +#include +#include + +namespace ck_tile { + +template +struct Rmsnorm2dFwdPipelineTwoPass +{ + using Problem = ck_tile::remove_cvref_t; + using Policy = ck_tile::remove_cvref_t; + + using XDataType = ck_tile::remove_cvref_t; + using GammaDataType = ck_tile::remove_cvref_t; + using ComputeDataType = ck_tile::remove_cvref_t; + using YDataType = ck_tile::remove_cvref_t; + using InvRmsDataType = ck_tile::remove_cvref_t; + + static constexpr bool kHasGamma = !std::is_same_v; + static constexpr bool kSaveInvRms = Problem::kSaveInvRms; + + static constexpr bool kNeedCrossWarpSync = Problem::kNeedCrossWarpSync; + static constexpr bool kPadM = false; // TODO - BlockRmsnorm2dFwdProblem::kPadM + static constexpr bool kPadN = Problem::kPadN; + + static constexpr const char* name = []() { + if constexpr(kNeedCrossWarpSync) + return "bpr_tp"; // block per row + else + return "wpr_tp"; // warp per row + }(); + + CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize() + { + return Policy::template GetSmemSize(); + } + + template + CK_TILE_DEVICE auto operator()(const XWindow& x_window_, + const GammaWindow& gamma_window_, + YWindow& y_window, + InvRmsWindow& inv_rms_window, + ComputeDataType epsilon, + ck_tile::index_t row_size, + void* smem) const + { + auto x_window = + make_tile_window(x_window_, Policy::template MakeXBlockTileDistribution()); + auto gamma_window = make_tile_window( + gamma_window_, Policy::template MakeGammaBlockTileDistribution()); + + // Problem::BlockShape + static constexpr index_t Block_N = Problem::BlockShape::Block_N; + index_t num_n_tile_iteration = + __builtin_amdgcn_readfirstlane(integer_divide_ceil(row_size, Block_N)); + + auto reduce_square_sum_func = ReduceOp::SquareAdd{}; + auto reduce_sum_func = ReduceOp::Add{}; + auto block_reduce2d = Policy::template GetBlockReduce2d(); + auto block_reduce2d_sync = 
Policy::template GetBlockReduce2dSync(); + auto block_reduce2d_cross_warp_sync = + Policy::template GetBlockReduce2dCrossWarpSync(); + + using XTensorType = decltype(load_tile(x_window)); + auto square_sum = block_reduce2d.template MakeYBlockTile(); + set_tile(square_sum, reduce_square_sum_func.GetIdentityValue()); + + for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN) + { + const auto x = load_tile(x_window); + block_reduce2d(x, square_sum, reduce_square_sum_func); + move_tile_window(x_window, {0, Block_N}); + } + + block_reduce2d_sync(square_sum, reduce_sum_func); + block_reduce2d_cross_warp_sync(square_sum, smem, reduce_sum_func); + + // compute inv-rms + auto inv_rms = tile_elementwise_in( + [&](const auto& v_) { + return type_convert(1.0f) / (sqrt(v_ / row_size + epsilon)); + }, + square_sum); + + if constexpr(kSaveInvRms) + store_tile(inv_rms_window, cast_tile(inv_rms)); + + // reverse read x to reuse cache + ck_tile::index_t stride_to_right_most_window = + row_size % Block_N == 0 ? row_size - Block_N : row_size - row_size % Block_N; + + move_tile_window(x_window, {0, -Block_N}); + move_tile_window(gamma_window, {stride_to_right_most_window}); + move_tile_window(y_window, {0, stride_to_right_most_window}); + + // rmsnorm computation + for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN) + { + const auto x = load_tile(x_window); + // load gamma/beta (TODO: support no gamma/beta?) 
+ const auto gamma = load_tile(gamma_window); + + auto y = make_static_distributed_tensor(x.get_tile_distribution()); + + sweep_tile(y, [&, inv_rms_ = inv_rms](auto idx) { + constexpr auto i_idx = make_tuple(idx[number<0>{}]); + constexpr auto j_idx = make_tuple(idx[number<1>{}]); + + const auto gamma_ = type_convert(gamma[j_idx]); + + const auto x_ = type_convert(x[idx]); + auto y_ = x_ * inv_rms_[i_idx] * gamma_; + + y(idx) = type_convert(y_); + }); + + store_tile(y_window, y); + + move_tile_window(x_window, {0, -Block_N}); + move_tile_window(gamma_window, {-Block_N}); + move_tile_window(y_window, {0, -Block_N}); + } + } +}; +} // namespace ck_tile diff --git a/include/ck_tile/ops/welford/block/block_welford.hpp b/include/ck_tile/ops/welford/block/block_welford.hpp index 55d55402d..623e1e16d 100644 --- a/include/ck_tile/ops/welford/block/block_welford.hpp +++ b/include/ck_tile/ops/welford/block/block_welford.hpp @@ -276,8 +276,8 @@ struct BlockWelfordCrossWarpSync fp32x4_t all_scratch[thread_buf_size * num_reduce_warps]; static_for<0, thread_buf_size, 1>{}([&](auto i_0) { static_for<0, num_reduce_warps, 1>{}([&](auto i_1) { - all_scratch[i_0 * num_warps + i_1] = - smem_ptr[i_0 * num_reduce_warps + local_smem_os + i_1]; + all_scratch[i_0 * num_reduce_warps + i_1] = + smem_ptr[i_0 * num_warps + local_smem_os + i_1]; }); }); block_sync_lds(); // TODO: we don't need sync here @@ -286,7 +286,7 @@ struct BlockWelfordCrossWarpSync static_for<0, thread_buf_size, 1>{}([&](auto i_0) { // TODO: use descriptor for this - auto v_local = all_scratch[i_0 * num_warps]; + auto v_local = all_scratch[i_0 * num_reduce_warps]; auto v_local_mean = bit_cast(v_local[0]); auto v_local_var = bit_cast(v_local[1]); auto v_local_count = bit_cast(v_local[2]); @@ -294,7 +294,7 @@ struct BlockWelfordCrossWarpSync // further reduce mean/var static_for<0, num_reduce_warps - 1, 1>{}([&](auto i_1_n1) { constexpr auto i_1 = number{}; - const fp32x4_t v_remote = all_scratch[i_0 * num_warps + i_1]; + 
const fp32x4_t v_remote = all_scratch[i_0 * num_reduce_warps + i_1]; const auto v_remote_mean = bit_cast(v_remote[0]); const auto v_remote_var = bit_cast(v_remote[1]); const auto v_remote_count = bit_cast(v_remote[2]); -- GitLab From 24d996aae11c45430571ebc1ee428dc67fd2d91b Mon Sep 17 00:00:00 2001 From: Adam Osewski <19374865+aosewski@users.noreply.github.com> Date: Wed, 30 Oct 2024 10:05:15 +0100 Subject: [PATCH 026/153] [CK-Tile] Universal gemm memory bound pipeline (#1558) * CK-Tile GEMM with memory bound pipeline. * Memory bound gemm pipeline. * Fix not closed namespace. * Block gemm mem pipeline draft. * Do not use ck_tile:: within ck_tile namespace. * Refactoring & Move Layout info to pipeline problem. * Get hot loop and TailNum information before launching kernel. * Fixes in pipeline. * Add comment to load_tile_raw and change variable naming style. * Few small changes & formatting. * Do not use macro. * Add gtests. * Use AccDataType for Output of MFMA instruction. * Formatting. * Refactor gemm examples. * Switch over to current block gemm. * Use currently available pipeline policy. * Refactoring and review comments. * Fixes after merge. * Add missing include. * Add load tile overload which accepts output tensor as parameter. * This gives an 8% perf boost at the cost of using more registers. * Rename example. * Small changes. * Fix compilation err and lower K. * Support different layouts for A/B * Fix vector size for different layouts. * Rename Alignment into VectorSize * Unblock tests. 
--- example/ck_tile/03_gemm/CMakeLists.txt | 4 +- example/ck_tile/03_gemm/gemm_basic.cpp | 366 ++-------------- example/ck_tile/03_gemm/gemm_basic.hpp | 32 +- example/ck_tile/03_gemm/gemm_mem_pipeline.cpp | 188 ++++++++ example/ck_tile/03_gemm/run_gemm_example.inc | 217 +++++++++ include/ck_tile/core.hpp | 1 + include/ck_tile/core/tensor/load_tile.hpp | 27 +- include/ck_tile/core/tensor/tile_window.hpp | 17 +- include/ck_tile/core/utility/literals.hpp | 22 + .../ck_tile/host/reference/reference_gemm.hpp | 60 +-- include/ck_tile/ops/gemm.hpp | 2 + .../block/block_gemm_areg_bgmem_creg_v1.hpp | 2 +- .../block/block_gemm_asmem_bsmem_creg_v1.hpp | 30 +- .../ck_tile/ops/gemm/kernel/gemm_kernel.hpp | 123 +++--- .../ops/gemm/kernel/gemm_tile_partitioner.hpp | 24 +- .../pipeline/gemm_pipeline_ag_bg_cr_mem.hpp | 413 ++++++++++++++++++ .../gemm_pipeline_ag_bg_cr_scheduler.hpp | 71 +++ .../gemm_pipeline_agmem_bgmem_creg_v1.hpp | 24 +- ...ine_agmem_bgmem_creg_v1_default_policy.hpp | 10 +- .../gemm_pipeline_agmem_bgmem_creg_v2.hpp | 6 +- .../gemm/pipeline/gemm_pipeline_problem.hpp | 53 ++- .../ops/gemm/pipeline/tile_gemm_traits.hpp | 16 +- .../warp/warp_gemm_attribute_mfma_impl.hpp | 52 +-- .../ops/gemm/warp/warp_gemm_dispatcher.hpp | 58 +-- test/ck_tile/CMakeLists.txt | 1 + test/ck_tile/gemm/CMakeLists.txt | 4 + test/ck_tile/gemm/test_gemm_mem_pipeline.cpp | 29 ++ .../gemm/test_gemm_mem_pipeline_ut_cases.inc | 41 ++ .../gemm/test_gemm_mem_pipeline_util.hpp | 318 ++++++++++++++ 29 files changed, 1655 insertions(+), 556 deletions(-) create mode 100644 example/ck_tile/03_gemm/gemm_mem_pipeline.cpp create mode 100644 example/ck_tile/03_gemm/run_gemm_example.inc create mode 100644 include/ck_tile/core/utility/literals.hpp create mode 100644 include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp create mode 100644 include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_scheduler.hpp create mode 100644 test/ck_tile/gemm/CMakeLists.txt create mode 100644 
test/ck_tile/gemm/test_gemm_mem_pipeline.cpp create mode 100644 test/ck_tile/gemm/test_gemm_mem_pipeline_ut_cases.inc create mode 100644 test/ck_tile/gemm/test_gemm_mem_pipeline_util.hpp diff --git a/example/ck_tile/03_gemm/CMakeLists.txt b/example/ck_tile/03_gemm/CMakeLists.txt index 03fc9c7eb..8ae46cadc 100644 --- a/example/ck_tile/03_gemm/CMakeLists.txt +++ b/example/ck_tile/03_gemm/CMakeLists.txt @@ -1,2 +1,2 @@ -set(CMAKE_BUILD_TYPE Debug) -add_executable(tile_example_gemm_basic EXCLUDE_FROM_ALL gemm_basic.cpp) \ No newline at end of file +add_executable(tile_example_gemm_basic EXCLUDE_FROM_ALL gemm_basic.cpp) +add_executable(tile_example_gemm_mem_pipeline EXCLUDE_FROM_ALL gemm_mem_pipeline.cpp) diff --git a/example/ck_tile/03_gemm/gemm_basic.cpp b/example/ck_tile/03_gemm/gemm_basic.cpp index 569afed25..09427217c 100644 --- a/example/ck_tile/03_gemm/gemm_basic.cpp +++ b/example/ck_tile/03_gemm/gemm_basic.cpp @@ -1,7 +1,6 @@ // SPDX-License-Identifier: MIT // Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. -#include "gemm_basic.hpp" #include #include @@ -10,51 +9,48 @@ #include #include -auto create_args(int argc, char* argv[]) -{ - ck_tile::ArgParser arg_parser; - arg_parser.insert("b", "1", "batch size") - .insert("m", "1024", "m dimension") - .insert("n", "2048", "n dimension") - .insert("k", "64", "k dimension") - .insert("stride_a", "0", "Tensor A stride") - .insert("stride_b", "0", "Tensor B stride") - .insert("stride_c", "0", "Tensor C stride") - .insert("v", "2", "0. No validation, 1. Validation on CPU, 2. Validation on GPU") - .insert("e", "1e-5", "Absolute error tolerance") - .insert("prec", "fp16", "data type. 
fp16/bf16/fp8/bf8") - .insert("warmup", "10", "number of iterations before benchmark the kernel") - .insert("repeat", "100", "number of iterations to benchmark the kernel") - .insert("timer", "gpu", "gpu:gpu timer, cpu:cpu timer"); - - bool result = arg_parser.parse(argc, argv); - return std::make_tuple(result, arg_parser); -} +#include "ck_tile/ops/epilogue.hpp" +#include "ck_tile/ops/gemm.hpp" +#include "ck_tile/host.hpp" +#include "gemm_basic.hpp" -template +template float gemm_calc(const gemm_basic_args& args, const ck_tile::stream_config& s) { // The kPadA, kPadB, kPadC & kBlockPerCu should also come from the Codegen part. constexpr bool kPadA = true; constexpr bool kPadB = true; + constexpr bool kPadC = true; constexpr bool kTilePermute = false; + // The rank and permutation will also be generate out by the CodeGen part. + constexpr ck_tile::index_t kOutputRank = 2; constexpr int kBlockPerCu = 1; - using TilePartitioner = ck_tile::GemmTilePartitioner; + // This part comes from the Codegen + constexpr ck_tile::index_t M_Tile = 128; + constexpr ck_tile::index_t N_Tile = 128; + constexpr ck_tile::index_t K_Tile = 32; - // The rank and permutation will also be generate out by the CodeGen part. - constexpr ck_tile::index_t kOutputRank = 2; + constexpr ck_tile::index_t M_Warp = 2; + constexpr ck_tile::index_t N_Warp = 2; + constexpr ck_tile::index_t K_Warp = 1; + + constexpr ck_tile::index_t M_Warp_Tile = 32; + constexpr ck_tile::index_t N_Warp_Tile = 32; + constexpr ck_tile::index_t K_Warp_Tile = 8; // Whether doing the CShuffle (transpose before the global memory), depending on the output // layout. 
constexpr bool CShuffleEpilogue = - std::is_same_v; + std::is_same_v; + + using CodegenGemmShape = + ck_tile::TileGemmShape, + ck_tile::sequence, + ck_tile::sequence>; + + using TilePartitioner = ck_tile::GemmTilePartitioner; using GemmEpilogue = std::conditional_t< CShuffleEpilogue, @@ -70,14 +66,21 @@ float gemm_calc(const gemm_basic_args& args, const ck_tile::stream_config& s) TilePartitioner::kN>>, ck_tile::Default2DEpilogue< ck_tile::Default2DEpilogueProblem>>; + + using CodegenGemmTraits = + ck_tile::TileGemmTraits; + using CodegenPipelineProblem = ck_tile:: + GemmPipelineProblem; + using CodegenGemmPolicy = ck_tile::UniversalGemmPipelineAgBgCrPolicy; + using CodegenGemmPipeline = + ck_tile::GemmPipelineAGmemBGmemCRegV1; // ToDo: Will add the codegen part to test different pipeline policies in GEMM. // Now we only use the BlockGemmASmemBSmemCRegV1DefaultPolicy. - using Kernel = ck_tile::GemmKernel; + using Kernel = ck_tile::GemmKernel; auto kargs = Kernel::MakeKargs(args.p_a, args.p_b, args.p_c, - args.epsilon, args.M, args.N, args.K, @@ -88,299 +91,20 @@ float gemm_calc(const gemm_basic_args& args, const ck_tile::stream_config& s) const dim3 grids = Kernel::GridSize(args.M, args.N, args.kbatch); constexpr dim3 blocks = Kernel::BlockSize(); - float ave_time = ck_tile::launch_kernel( - s, ck_tile::make_kernel(Kernel{}, grids, blocks, 0, kargs)); - - return ave_time; -} - -template -float invoke_gemm(ck_tile::DeviceMem& a_buf, - ck_tile::DeviceMem& b_buf, - ck_tile::DeviceMem& c_buf, - const ck_tile::ArgParser& arg_parser) -{ - - std::string data_type = arg_parser.get_str("prec"); - - if(data_type != DataTypeTraits::name) - { - std::cerr << "Data type mismatch: expected " << DataTypeTraits::name << ", got " - << data_type << std::endl; - return -1; // Or handle the error appropriately - } - - float epsilon = arg_parser.get_float("e"); - ck_tile::index_t batch_size = arg_parser.get_int("b"); - ck_tile::index_t M = arg_parser.get_int("m"); - ck_tile::index_t N = 
arg_parser.get_int("n"); - ck_tile::index_t K = arg_parser.get_int("k"); - - ck_tile::index_t stride_a = arg_parser.get_int("stride_a"); - ck_tile::index_t stride_b = arg_parser.get_int("stride_b"); - ck_tile::index_t stride_c = arg_parser.get_int("stride_c"); - - gemm_basic_args args; - args.p_a = a_buf.GetDeviceBuffer(); - args.p_b = b_buf.GetDeviceBuffer(); - args.p_c = c_buf.GetDeviceBuffer(); - args.epsilon = epsilon; - args.kbatch = batch_size; - args.M = M; - args.N = N; - args.K = K; - - // Only set stride_M and stride_N if they are non-zero and not equal to K. - if(stride_a != 0) - { - args.stride_A = stride_a; - } - else - { - args.stride_A = [&]() { - if constexpr(std::is_same_v) - { - return M; - } - else - { - return K; - } - }(); - } - - if(stride_b != 0) - { - args.stride_B = stride_b; - } - else + if(s.log_level_ > 0) { - args.stride_B = [&]() { - if constexpr(std::is_same_v) - { - return N; - } - else - { - return K; - } - }(); + std::cout << "Launching kernel with args:" + << " grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}" + << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z << "}" + << std::endl; } - if(stride_c != 0) - { - args.stride_C = stride_c; - } - else - { - args.stride_C = [&]() { - if constexpr(std::is_same_v) - { - return M; - } - else - { - return N; - } - }(); - } - - float ave_time = gemm_calc( - args, ck_tile::stream_config{nullptr, true}); - std::size_t num_byte = - sizeof(ADataType) * M * K + sizeof(BDataType) * N * K + sizeof(CDataType) * M * N; - float gb_per_sec = num_byte / 1.E6 / ave_time; - - std::cout << "The overall perfomance of the GEMM with " - << "[" << data_type << "]" - << "batch size: " << batch_size << ". 
m:" << M << ", n:" << N << ", k:" << K - << " is: \n"; - std::cout << "Running time: " << ave_time << "ms, Throughput " << gb_per_sec << "GB/s \n" - << std::flush; + float ave_time = ck_tile::launch_kernel( + s, ck_tile::make_kernel(Kernel{}, grids, blocks, 0, kargs)); return ave_time; } -int main(int argc, char* argv[]) -{ - auto [result, arg_parser] = create_args(argc, argv); - if(!result) - return -1; - - ck_tile::index_t M = arg_parser.get_int("m"); - ck_tile::index_t N = arg_parser.get_int("n"); - ck_tile::index_t K = arg_parser.get_int("k"); - - // The Matrix Multiplication goes with Matrix A (M, K), Matrix B (N, K) = Matrix C (M, N). - using matrix_a_layout = ck_tile::tensor_layout::gemm::RowMajor; - using matrix_b_layout = ck_tile::tensor_layout::gemm::ColumnMajor; - using matrix_c_layout = ck_tile::tensor_layout::gemm::RowMajor; - - // host verify - std::vector a_dimensions = - (std::is_same_v) - ? std::vector{M, K} - : std::vector{K, M}; - std::vector b_dimensions = - (std::is_same_v) - ? std::vector{N, K} - : std::vector{K, N}; - std::vector c_dimensions = - (std::is_same_v) - ? std::vector{M, N} - : std::vector{N, M}; - - ck_tile::HostTensor a_host(a_dimensions); - ck_tile::HostTensor b_host(b_dimensions); - - ck_tile::HostTensor c_host_ref(c_dimensions); - ck_tile::HostTensor c_host_dev(c_dimensions); - - ck_tile::FillUniformDistribution{-5.f, 5.f}(a_host); - ck_tile::FillUniformDistribution{-5.f, 5.f}(b_host); - - ck_tile::DeviceMem a_buf(a_host.get_element_space_size_in_bytes()); - ck_tile::DeviceMem b_buf(b_host.get_element_space_size_in_bytes()); - ck_tile::DeviceMem c_buf(c_host_dev.get_element_space_size_in_bytes()); - - a_buf.ToDevice(a_host.data()); - b_buf.ToDevice(b_host.data()); - - // The kPadA, kPadB, kPadC & kBlockPerCu should also come from the Codegen part. 
- constexpr bool kPadA = true; - constexpr bool kPadB = true; - constexpr bool kPadC = true; - - // This part comes from the Codegen - constexpr ck_tile::index_t M_Tile = 128; - constexpr ck_tile::index_t N_Tile = 128; - constexpr ck_tile::index_t K_Tile = 32; - - constexpr ck_tile::index_t M_Warp = 2; - constexpr ck_tile::index_t N_Warp = 2; - constexpr ck_tile::index_t K_Warp = 1; - - constexpr ck_tile::index_t M_Warp_Tile = 32; - constexpr ck_tile::index_t N_Warp_Tile = 32; - constexpr ck_tile::index_t K_Warp_Tile = 8; - - using CodegenGemmShape = - ck_tile::TileGemmShape, - ck_tile::sequence, - ck_tile::sequence>; - - using CodegenGemmTraits = ck_tile:: - TileGemmTraits; - - using CodegenPipelineProblem = ck_tile:: - GemmPipelineProblem; - - using CodegenGemmPolicy = ck_tile:: - UniversalGemmPipelineAgBgCrPolicy; - - using CodegenGemmPipeline = - ck_tile::GemmPipelineAGmemBGmemCRegV1; - - invoke_gemm(a_buf, b_buf, c_buf, arg_parser); - - c_buf.FromDevice(c_host_dev.data()); - - bool pass_cpu = true; - - if(arg_parser.get_int("v") == 1) - { - // ToDo: Will Add the Element Op (bias) verification in the future. - ck_tile::reference_gemm(a_host, b_host, c_host_ref); - - pass_cpu = ck_tile::check_err(c_host_dev, c_host_ref); - - std::cout << "The CPU veification result is:" << (pass_cpu ? 
"correct" : "fail") - << std::flush; - } - - bool pass_gpu = true; - - if(arg_parser.get_int("v") == 2) - { - ck_tile::index_t stride_a = arg_parser.get_int("stride_a"); - ck_tile::index_t stride_b = arg_parser.get_int("stride_b"); - ck_tile::index_t stride_c = arg_parser.get_int("stride_c"); - - if(stride_a == 0) - { - if constexpr(std::is_same_v) - { - stride_a = M; - } - else - { - stride_a = K; - } - } - - if(stride_b == 0) - { - if constexpr(std::is_same_v) - { - stride_b = N; - } - else - { - stride_b = K; - } - } - - if(stride_c == 0) - { - if constexpr(std::is_same_v) - { - stride_c = M; - } - else - { - stride_c = N; - } - } - - ck_tile::HostTensor c_host_gpu_ref(c_dimensions); - ck_tile::DeviceMem c_gpu_buf(c_host_gpu_ref.get_element_space_size_in_bytes()); +#include "run_gemm_example.inc" - ck_tile::reference_gemm_gpu( - a_buf, b_buf, c_gpu_buf, M, N, K, stride_a, stride_b, stride_c); - - c_buf.FromDevice(c_host_gpu_ref.data()); - - pass_gpu = ck_tile::check_err(c_host_dev, c_host_gpu_ref); - - std::cout << "The GPU veification result is: " << (pass_gpu ? 
"correct" : "fail") - << std::flush; - } - - std::cout << std::endl << std::flush; - - return !pass_gpu; -} +int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); } diff --git a/example/ck_tile/03_gemm/gemm_basic.hpp b/example/ck_tile/03_gemm/gemm_basic.hpp index ce2e0f706..23e99bc2a 100644 --- a/example/ck_tile/03_gemm/gemm_basic.hpp +++ b/example/ck_tile/03_gemm/gemm_basic.hpp @@ -4,12 +4,10 @@ #pragma once +#include + #include "ck_tile/core.hpp" #include "ck_tile/host/kernel_launch.hpp" -#include "ck_tile/ops/epilogue.hpp" -#include "ck_tile/ops/gemm.hpp" -#include "ck_tile/host.hpp" -#include template struct GemmBasicTypeConfig; @@ -20,7 +18,7 @@ struct GemmBasicTypeConfig using ADataType = ck_tile::half_t; using BDataType = ck_tile::half_t; using AccDataType = float; - using CDataType = ck_tile::half_t; // type convert + using CDataType = ck_tile::half_t; // ToDo: Add more bias config to support different categories of GEMM. }; @@ -58,7 +56,6 @@ struct gemm_basic_args const void* p_a; const void* p_b; void* p_c; - float epsilon; ck_tile::index_t kbatch; ck_tile::index_t M; ck_tile::index_t N; @@ -68,5 +65,28 @@ struct gemm_basic_args ck_tile::index_t stride_C; }; +auto create_args(int argc, char* argv[]) +{ + ck_tile::ArgParser arg_parser; + arg_parser.insert("b", "1", "batch size") + .insert("m", "3840", "m dimension") + .insert("n", "4096", "n dimension") + .insert("k", "2048", "k dimension") + .insert("a_layout", "R", "A tensor data layout - Row by default") + .insert("b_layout", "R", "B tensor data layout - Row by default") + .insert("c_layout", "R", "C tensor data layout - Row by default") + .insert("stride_a", "0", "Tensor A stride") + .insert("stride_b", "0", "Tensor B stride") + .insert("stride_c", "0", "Tensor C stride") + .insert("v", "2", "0. No validation, 1. Validation on CPU, 2. Validation on GPU") + .insert("prec", "fp16", "data type. 
fp16/bf16/fp8/bf8") + .insert("warmup", "50", "number of iterations before benchmark the kernel") + .insert("repeat", "100", "number of iterations to benchmark the kernel") + .insert("timer", "gpu", "gpu:gpu timer, cpu:cpu timer"); + + bool result = arg_parser.parse(argc, argv); + return std::make_tuple(result, arg_parser); +} + // host API float gemm_calc(gemm_basic_args args, const ck_tile::stream_config& s); diff --git a/example/ck_tile/03_gemm/gemm_mem_pipeline.cpp b/example/ck_tile/03_gemm/gemm_mem_pipeline.cpp new file mode 100644 index 000000000..2ee0395e4 --- /dev/null +++ b/example/ck_tile/03_gemm/gemm_mem_pipeline.cpp @@ -0,0 +1,188 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include +#include +#include +#include +#include + +#include "ck_tile/ops/epilogue.hpp" +#include "ck_tile/ops/gemm.hpp" +#include "ck_tile/host.hpp" +#include "gemm_basic.hpp" + +template +float gemm_calc(const gemm_basic_args& args, const ck_tile::stream_config& s) +{ + // ToDo: This will be modified by the codegen code later. + constexpr ck_tile::index_t M_Tile = 128; + constexpr ck_tile::index_t N_Tile = 128; + constexpr ck_tile::index_t K_Tile = 32; + + constexpr ck_tile::index_t M_Warp = 2; + constexpr ck_tile::index_t N_Warp = 2; + constexpr ck_tile::index_t K_Warp = 1; + + constexpr ck_tile::index_t M_Warp_Tile = 32; + constexpr ck_tile::index_t N_Warp_Tile = 32; + constexpr ck_tile::index_t K_Warp_Tile = 8; + + // The kPadA, kPadB, kPadC & kBlockPerCu should also come from the Codegen part. 
+ constexpr bool kPadA = true; + constexpr bool kPadB = true; + constexpr bool kPadC = true; + + constexpr int kBlockPerCu = 1; + + // =============================================== + + using GemmShape = + ck_tile::TileGemmShape, + ck_tile::sequence, + ck_tile::sequence>; + using TilePartitioner = ck_tile::GemmTilePartitioner; + + using GemmEpilogue = ck_tile::Default2DEpilogue< + ck_tile::Default2DEpilogueProblem>; + + using Traits = ck_tile::TileGemmTraits; + + using BaseGemmPipeline = ck_tile::BaseGemmPipelineAgBgCrMem< + ck_tile::GemmPipelineProblem>; + + const ck_tile::index_t num_loop = TilePartitioner::GetLoopNum(args.K); + const bool has_hot_loop = BaseGemmPipeline::BlockHasHotloop(num_loop); + const ck_tile::TailNumber tail_num = BaseGemmPipeline::GetBlockLoopTailNum(num_loop); + + float ave_time{0}; + + const auto Run = [&](const auto has_hot_loop_, const auto tail_number_) { + constexpr bool has_hot_loop_v = has_hot_loop_.value; + constexpr auto tail_number_v = tail_number_.value; + + using GemmPipeline = ck_tile::GemmPipelineAgBgCrMem< + ck_tile::UniversalGemmPipelineProblem>; + using Kernel = ck_tile::GemmKernel; + auto kargs = Kernel::MakeKargs(args.p_a, + args.p_b, + args.p_c, + args.M, + args.N, + args.K, + args.stride_A, + args.stride_B, + args.stride_C); + + const dim3 grids = Kernel::GridSize(args.M, args.N, args.kbatch); + constexpr dim3 blocks = Kernel::BlockSize(); + + if(s.log_level_ > 0) + { + std::cout << "Launching kernel with args:" + << " grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}" + << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z << "}" + << std::endl; + } + + ave_time = ck_tile::launch_kernel( + s, ck_tile::make_kernel(Kernel{}, grids, blocks, 0, kargs)); + return ave_time; + }; + + if(has_hot_loop) + { + // Tail pipeline One to Seven + if(tail_num == ck_tile::TailNumber::One) + { + Run(ck_tile::bool_constant{}, + ck_tile::integral_constant{}); + } + else if(tail_num == 
ck_tile::TailNumber::Full) + { + Run(ck_tile::bool_constant{}, + ck_tile::integral_constant{}); + } + + if constexpr(BaseGemmPipeline::PrefetchStages > 2) + { + if(tail_num == ck_tile::TailNumber::Two) + { + Run(ck_tile::bool_constant{}, + ck_tile::integral_constant{}); + } + } + if constexpr(BaseGemmPipeline::PrefetchStages > 3) + { + if(tail_num == ck_tile::TailNumber::Three) + { + Run(ck_tile::bool_constant{}, + ck_tile::integral_constant{}); + } + } + if constexpr(BaseGemmPipeline::PrefetchStages > 4) + { + if(tail_num == ck_tile::TailNumber::Four) + { + Run(ck_tile::bool_constant{}, + ck_tile::integral_constant{}); + } + } + if constexpr(BaseGemmPipeline::PrefetchStages > 5) + { + if(tail_num == ck_tile::TailNumber::Five) + { + Run(ck_tile::bool_constant{}, + ck_tile::integral_constant{}); + } + } + if constexpr(BaseGemmPipeline::PrefetchStages > 6) + { + if(tail_num == ck_tile::TailNumber::Six) + { + Run(ck_tile::bool_constant{}, + ck_tile::integral_constant{}); + } + } + if constexpr(BaseGemmPipeline::PrefetchStages > 7) + { + if(tail_num == ck_tile::TailNumber::Seven) + { + Run(ck_tile::bool_constant{}, + ck_tile::integral_constant{}); + } + } + } + else + { + // Tail number always Full - #PrefetchStages + if(tail_num == ck_tile::TailNumber::Full) + { + Run(ck_tile::bool_constant{}, + ck_tile::integral_constant{}); + } + else + { + std::ostringstream err; + err << "When there's no hot loop, this tail number \"" << tail_num + << "\" is not supported! 
" << __FILE__ << ":" << __LINE__ + << ", in function: " << __func__; + throw std::runtime_error(err.str()); + } + } + + return ave_time; +} + +#include "run_gemm_example.inc" + +int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); } diff --git a/example/ck_tile/03_gemm/run_gemm_example.inc b/example/ck_tile/03_gemm/run_gemm_example.inc new file mode 100644 index 000000000..8db131738 --- /dev/null +++ b/example/ck_tile/03_gemm/run_gemm_example.inc @@ -0,0 +1,217 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. +#pragma once + +template +float invoke_gemm(ck_tile::DeviceMem& a_m_k_dev_buf, + ck_tile::DeviceMem& b_k_n_dev_buf, + ck_tile::DeviceMem& c_m_n_dev_buf, + ck_tile::index_t M, + ck_tile::index_t N, + ck_tile::index_t K, + ck_tile::index_t stride_A, + ck_tile::index_t stride_B, + ck_tile::index_t stride_C, + ck_tile::index_t kbatch, + int n_warmup, + int n_repeat) +{ + gemm_basic_args args; + args.p_a = a_m_k_dev_buf.GetDeviceBuffer(); + args.p_b = b_k_n_dev_buf.GetDeviceBuffer(); + args.p_c = c_m_n_dev_buf.GetDeviceBuffer(); + args.kbatch = kbatch; + args.M = M; + args.N = N; + args.K = K; + args.stride_A = stride_A; + args.stride_B = stride_B; + args.stride_C = stride_C; + + float ave_time = gemm_calc( + args, ck_tile::stream_config{nullptr, true, 1, n_warmup, n_repeat}); + + std::string op_name{"Gemm{MemBoundPipeline}"}; + + std::size_t flop = std::size_t(2) * M * N * K; + std::size_t num_byte = + sizeof(ADataType) * M * K + sizeof(BDataType) * N * K + sizeof(CDataType) * M * N; + float tflops = static_cast(flop) / 1.E9 / ave_time; + float gb_per_sec = num_byte / 1.E6 / ave_time; + + std::cout << "Run " << op_name << "kernel with M =" << M << " N =" << N << " K =" << K + << " StrideA =" << stride_A << " StrideB =" << stride_B << " StrideC =" << stride_C + << " : " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, " + << std::endl; + + return ave_time; +} + 
+template +int run_gemm_example_with_layouts(int argc, + char* argv[], + const ALayout a_layout = ALayout{}, + const BLayout b_layout = BLayout{}, + [[maybe_unused]] const CLayout c_layout = CLayout{}) +{ + auto [result, arg_parser] = create_args(argc, argv); + if(!result) + return -1; + + ck_tile::index_t M = arg_parser.get_int("m"); + ck_tile::index_t N = arg_parser.get_int("n"); + ck_tile::index_t K = arg_parser.get_int("k"); + + ck_tile::index_t stride_A = arg_parser.get_int("stride_a"); + ck_tile::index_t stride_B = arg_parser.get_int("stride_b"); + ck_tile::index_t stride_C = arg_parser.get_int("stride_c"); + + ck_tile::index_t batch_size = arg_parser.get_int("b"); + int n_warmup = arg_parser.get_int("warmup"); + int n_repeat = arg_parser.get_int("repeat"); + + using namespace ck_tile::literals; + + auto f_host_tensor_descriptor = + [](std::size_t row, std::size_t col, std::size_t stride, auto layout) { + if constexpr(std::is_same_v) + { + return ck_tile::HostTensorDescriptor({row, col}, {stride, 1_uz}); + } + else + { + return ck_tile::HostTensorDescriptor({row, col}, {1_uz, stride}); + } + }; + + auto f_get_default_stride = [](std::size_t row, + std::size_t col, + std::size_t stride, + auto layout) { + if(stride == 0) + { + // give a chance if stride is zero, return a default packed stride + if constexpr(std::is_same_v) + { + return col; + } + else + { + return row; + } + } + else + return stride; + }; + + stride_A = f_get_default_stride(M, K, stride_A, a_layout); + stride_B = f_get_default_stride(K, N, stride_B, b_layout); + stride_C = f_get_default_stride(M, N, stride_C, CLayout{}); + + ck_tile::HostTensor a_m_k(f_host_tensor_descriptor(M, K, stride_A, a_layout)); + ck_tile::HostTensor b_k_n(f_host_tensor_descriptor(K, N, stride_B, b_layout)); + ck_tile::HostTensor c_m_n_dev_result( + f_host_tensor_descriptor(M, N, stride_C, CLayout{})); + + // TODO: add different init types + + ck_tile::FillUniformDistribution{-5.f, 5.f}(a_m_k); + 
ck_tile::FillUniformDistribution{-5.f, 5.f}(b_k_n); + + ck_tile::DeviceMem a_m_k_dev_buf(a_m_k.get_element_space_size_in_bytes()); + ck_tile::DeviceMem b_k_n_dev_buf(b_k_n.get_element_space_size_in_bytes()); + ck_tile::DeviceMem c_m_n_dev_buf(c_m_n_dev_result.get_element_space_size_in_bytes()); + + a_m_k_dev_buf.ToDevice(a_m_k.data()); + b_k_n_dev_buf.ToDevice(b_k_n.data()); + c_m_n_dev_buf.SetZero(); + c_m_n_dev_result.SetZero(); + + invoke_gemm(a_m_k_dev_buf, + b_k_n_dev_buf, + c_m_n_dev_buf, + M, + N, + K, + stride_A, + stride_B, + stride_C, + batch_size, + n_warmup, + n_repeat); + + c_m_n_dev_buf.FromDevice(c_m_n_dev_result.data()); + bool pass = true; + + if(arg_parser.get_int("v") == 1) + { + ck_tile::HostTensor c_m_n_host_ref( + f_host_tensor_descriptor(M, N, stride_C, CLayout{})); + c_m_n_host_ref.SetZero(); + + ck_tile::reference_gemm( + a_m_k, b_k_n, c_m_n_host_ref); + + pass = ck_tile::check_err(c_m_n_dev_result, c_m_n_host_ref); + + std::cout << "The CPU veification result is:" << (pass ? "correct" : "fail") << std::endl; + } + else if(arg_parser.get_int("v") == 2) + { + ck_tile::HostTensor c_m_n_gpu_ref( + f_host_tensor_descriptor(M, N, stride_C, CLayout{})); + ck_tile::DeviceMem c_m_n_gpu_buf_ref(c_m_n_gpu_ref.get_element_space_size_in_bytes()); + c_m_n_gpu_ref.SetZero(); + c_m_n_gpu_buf_ref.SetZero(); + + ck_tile::reference_gemm_gpu( + a_m_k_dev_buf, b_k_n_dev_buf, c_m_n_gpu_buf_ref, M, N, K, stride_A, stride_B, stride_C); + + c_m_n_gpu_buf_ref.FromDevice(c_m_n_gpu_ref.data()); + pass = ck_tile::check_err(c_m_n_dev_result, c_m_n_gpu_ref); + + std::cout << "The GPU veification result is: " << (pass ? 
"correct" : "fail") << std::endl; + } + + return pass; +} + +int run_gemm_example(int argc, char* argv[]) +{ + auto [result, arg_parser] = create_args(argc, argv); + if(!result) + return -1; + + using Row = ck_tile::tensor_layout::gemm::RowMajor; + using Col = ck_tile::tensor_layout::gemm::ColumnMajor; + + std::string a_layout = arg_parser.get_str("a_layout"); + std::string b_layout = arg_parser.get_str("b_layout"); + + if(a_layout == "R" && b_layout == "R") + { + return run_gemm_example_with_layouts(argc, argv, Row{}, Row{}, Row{}); + } + else if(a_layout == "R" && b_layout == "C") + { + return run_gemm_example_with_layouts(argc, argv, Row{}, Col{}, Row{}); + } + else if(a_layout == "C" && b_layout == "C") + { + return run_gemm_example_with_layouts(argc, argv, Col{}, Col{}, Row{}); + } + else if(a_layout == "C" && b_layout == "R") + { + return run_gemm_example_with_layouts(argc, argv, Col{}, Row{}, Row{}); + } + else + { + throw std::runtime_error("Unsupported data layout configuration for A,B and C tensors!"); + } +} diff --git a/include/ck_tile/core.hpp b/include/ck_tile/core.hpp index fa4b8d3cc..2c423831e 100644 --- a/include/ck_tile/core.hpp +++ b/include/ck_tile/core.hpp @@ -56,6 +56,7 @@ #include "ck_tile/core/utility/functional.hpp" #include "ck_tile/core/utility/functional_with_tuple.hpp" #include "ck_tile/core/utility/ignore.hpp" +#include "ck_tile/core/utility/literals.hpp" #include "ck_tile/core/utility/magic_div.hpp" #include "ck_tile/core/utility/philox_rand.hpp" #include "ck_tile/core/utility/random.hpp" diff --git a/include/ck_tile/core/tensor/load_tile.hpp b/include/ck_tile/core/tensor/load_tile.hpp index 06b5a8da0..f150fc54c 100644 --- a/include/ck_tile/core/tensor/load_tile.hpp +++ b/include/ck_tile/core/tensor/load_tile.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
#pragma once @@ -46,6 +46,31 @@ CK_TILE_DEVICE auto load_tile(const tile_window_linear{}, bool_constant{}); } +template +CK_TILE_DEVICE auto load_tile(DistributedTensor_& dst_tile, + const tile_window_with_static_distribution& tile_window, + bool_constant = {}) +{ + return tile_window.load(dst_tile, bool_constant{}); +} + +/** + * @brief Loads a tile of data using inline assembly. + * + * @note Bear in mind that loading data this way, you have to manually initialize your + * thread buffer and synchronize load afterwards in order to make sure it's done before + * using loaded data from registers + * @see `tile_window_with_static_distribution::init_raw()` and `buffer_view.hpp` + * @see `buffer_load_fence()` + */ template = {}, bool_constant = {}) const { - using Traits = load_store_traits; + constexpr auto tile_dstr = TileDstr{}; + auto dst_tensor = make_static_distributed_tensor(tile_dstr); + load(dst_tensor, bool_constant{}); + return dst_tensor; + } + template + CK_TILE_DEVICE auto load(DistributedTensor& dst_tensor, + bool_constant = {}) const + { + using Traits = load_store_traits; using vector_t = typename Traits::vector_t; using SFC_Ys = typename Traits::SFC_Ys; constexpr auto tile_dstr = TileDstr{}; - auto dst_tensor = make_static_distributed_tensor(tile_dstr); - // loop over thread tensor space [y0, y1, ...] 
static_for<0, NumCoord, 1>{}([&](auto iCoord) { /// TODO: use structure binding (to be captured later) if compiled in C++20 @@ -353,8 +360,6 @@ struct tile_window_with_static_distribution } }); }); - - return dst_tensor; } template + +namespace ck_tile { +namespace literals { +// [P0330] Literal Suffix for (signed) size_t (C++23) +// ref: https://wg21.link/p0330r8 +inline constexpr std::size_t operator""_uz(unsigned long long size) +{ + return static_cast(size); +} + +inline constexpr std::size_t operator""_zu(unsigned long long size) +{ + return static_cast(size); +} +} // namespace literals +} // namespace ck_tile diff --git a/include/ck_tile/host/reference/reference_gemm.hpp b/include/ck_tile/host/reference/reference_gemm.hpp index a496c91e0..dbdef0e9c 100644 --- a/include/ck_tile/host/reference/reference_gemm.hpp +++ b/include/ck_tile/host/reference/reference_gemm.hpp @@ -1,12 +1,13 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #pragma once +#include +#include + #include "ck_tile/core.hpp" #include "ck_tile/host/host_tensor.hpp" -#include "ck_tile/ops/common/tensor_layout.hpp" -#include namespace ck_tile { @@ -14,55 +15,36 @@ template CK_TILE_HOST void reference_gemm(const HostTensor& a_m_k, - const HostTensor& b_n_k, + const HostTensor& b_k_n, HostTensor& c_m_n, const AElementOp& a_element_op = {}, const BElementOp& b_element_op = {}, const ACCElementOp& acc_element_op = {}) { - const int N = (std::is_same_v) - ? b_n_k.mDesc.get_lengths()[0] - : b_n_k.mDesc.get_lengths()[1]; - const int K = (std::is_same_v) - ? a_m_k.mDesc.get_lengths()[1] - : a_m_k.mDesc.get_lengths()[0]; - const int M = (std::is_same_v) - ? 
a_m_k.mDesc.get_lengths()[0] - : a_m_k.mDesc.get_lengths()[1]; - - auto f = [&](auto m) { - for(int n = 0; n < N; ++n) + const std::size_t M = a_m_k.get_length(0); + const std::size_t N = b_k_n.get_length(1); + const std::size_t K = a_m_k.get_length(1); + + auto f_mn = [&](auto m, auto n) { + AccDataType v_acc = 0; + + for(std::size_t k = 0; k < K; ++k) { - AccDataType v_acc = 0; - - for(int k = 0; k < K; ++k) - { - ADataType v_a = (std::is_same_v) - ? a_element_op(a_m_k(m, k)) - : a_element_op(a_m_k(k, m)); - BDataType v_b = (std::is_same_v) - ? b_element_op(b_n_k(n, k)) - : b_element_op(b_n_k(k, n)); - - v_acc += ck_tile::type_convert(v_a) * - ck_tile::type_convert(v_b); - } - - CDataType& c_ref = (std::is_same_v) - ? c_m_n(m, n) - : c_m_n(n, m); - c_ref = ck_tile::type_convert(acc_element_op(v_acc)); + ADataType v_a = a_element_op(a_m_k(m, k)); + BDataType v_b = b_element_op(b_k_n(k, n)); + + v_acc += + ck_tile::type_convert(v_a) * ck_tile::type_convert(v_b); } + + c_m_n(m, n) = ck_tile::type_convert(acc_element_op(v_acc)); }; - make_ParallelTensorFunctor(f, M)(std::thread::hardware_concurrency()); + make_ParallelTensorFunctor(f_mn, M, N)(std::thread::hardware_concurrency()); } template , BlockGemmARegBGmemCRegV1DefaultPolicy>; - CK_TILE_HOST_DEVICE static constexpr ck_tile::index_t GetStaticLdsSize() + CK_TILE_HOST_DEVICE static constexpr index_t GetStaticLdsSize() { return sizeof(BDataType) * Policy::template MakeBSmemBlockDescriptor().get_element_space_size(); diff --git a/include/ck_tile/ops/gemm/block/block_gemm_asmem_bsmem_creg_v1.hpp b/include/ck_tile/ops/gemm/block/block_gemm_asmem_bsmem_creg_v1.hpp index dc0b41135..d6fee879b 100644 --- a/include/ck_tile/ops/gemm/block/block_gemm_asmem_bsmem_creg_v1.hpp +++ b/include/ck_tile/ops/gemm/block/block_gemm_asmem_bsmem_creg_v1.hpp @@ -24,19 +24,19 @@ struct BlockGemmASmemBSmemCRegV1 static constexpr index_t kBlockSize = Problem::kBlockSize; // C += A * B - template + template CK_TILE_DEVICE void 
operator()(CBlockTensor& c_block_tensor, - const ABlockWindowTmp& a_block_window_tmp, - const BBlockWindowTmp& b_block_window_tmp) const + const ABlockWindow& a_block_window, + const BBlockWindow& b_block_window) const { - static_assert(std::is_same_v && - std::is_same_v && + static_assert(std::is_same_v && + std::is_same_v && std::is_same_v, "wrong!"); - constexpr index_t MPerBlock = ABlockWindowTmp{}.get_window_lengths()[number<0>{}]; - constexpr index_t NPerBlock = BBlockWindowTmp{}.get_window_lengths()[number<0>{}]; - constexpr index_t KPerBlock = ABlockWindowTmp{}.get_window_lengths()[number<1>{}]; + constexpr index_t MPerBlock = ABlockWindow{}.get_window_lengths()[number<0>{}]; + constexpr index_t NPerBlock = BBlockWindow{}.get_window_lengths()[number<0>{}]; + constexpr index_t KPerBlock = ABlockWindow{}.get_window_lengths()[number<1>{}]; static_assert(MPerBlock == BlockGemmShape::kM && NPerBlock == BlockGemmShape::kN && KPerBlock == BlockGemmShape::kK, @@ -62,9 +62,9 @@ struct BlockGemmASmemBSmemCRegV1 // construct A-warp-window auto a_warp_window_tmp = make_tile_window( - a_block_window_tmp.get_bottom_tensor_view(), + a_block_window.get_bottom_tensor_view(), make_tuple(number{}, number{}), - a_block_window_tmp.get_window_origin() + multi_index<2>{iMWarp * WG::kM, 0}, + a_block_window.get_window_origin() + multi_index<2>{iMWarp * WG::kM, 0}, make_static_tile_distribution(typename WG::AWarpDstrEncoding{})); #if 0 // FIXME: using array will cause register spill @@ -97,9 +97,9 @@ struct BlockGemmASmemBSmemCRegV1 // construct B-warp-window auto b_warp_window_tmp = make_tile_window( - b_block_window_tmp.get_bottom_tensor_view(), + b_block_window.get_bottom_tensor_view(), make_tuple(number{}, number{}), - b_block_window_tmp.get_window_origin() + multi_index<2>{iNWarp * WG::kN, 0}, + b_block_window.get_window_origin() + multi_index<2>{iNWarp * WG::kN, 0}, make_static_tile_distribution(typename WG::BWarpDstrEncoding{})); #if 0 // FIXME: using array will cause 
register spill @@ -200,12 +200,12 @@ struct BlockGemmASmemBSmemCRegV1 } // C = A * B - template + template CK_TILE_DEVICE auto operator()(const ABlockTensorTmp& a_block_tensor_tmp, - const BBlockWindowTmp& b_block_window_tmp) const + const BBlockWindow& b_block_window) const { auto c_block_tensor = MakeCBlockTile(); - operator()(c_block_tensor, a_block_tensor_tmp, b_block_window_tmp); + operator()(c_block_tensor, a_block_tensor_tmp, b_block_window); return c_block_tensor; } }; diff --git a/include/ck_tile/ops/gemm/kernel/gemm_kernel.hpp b/include/ck_tile/ops/gemm/kernel/gemm_kernel.hpp index 48329c8ba..1671ddad3 100644 --- a/include/ck_tile/ops/gemm/kernel/gemm_kernel.hpp +++ b/include/ck_tile/ops/gemm/kernel/gemm_kernel.hpp @@ -3,12 +3,13 @@ #pragma once -#include "ck_tile/core.hpp" -#include "ck_tile/ops/common.hpp" #include - #include +#include "ck_tile/core.hpp" +#include "ck_tile/ops/common.hpp" +#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_scheduler.hpp" + namespace ck_tile { template @@ -17,20 +18,19 @@ struct GemmKernel using TilePartitioner = remove_cvref_t; using GemmPipeline = remove_cvref_t; using EpiloguePipeline = remove_cvref_t; - static constexpr index_t KernelBlockSize = GemmPipeline::kBlockSize; - - using ADataType = remove_cvref_t; - using BDataType = remove_cvref_t; - using CAccDataType = remove_cvref_t; - using CODataType = remove_cvref_t; + using ALayout = remove_cvref_t; + using BLayout = remove_cvref_t; + using CLayout = remove_cvref_t; + static constexpr index_t KernelBlockSize = GemmPipeline::BlockSize; - using LayoutA = remove_cvref_t; - using LayoutB = remove_cvref_t; - using LayoutC = remove_cvref_t; + using ADataType = remove_cvref_t; + using BDataType = remove_cvref_t; + // using CAccDataType = remove_cvref_t; + using CDataType = remove_cvref_t; - __host__ static constexpr auto GridSize(index_t M_size, index_t N_size, index_t Batch_size) + __host__ static constexpr auto GridSize(index_t M, index_t N, index_t KBatch) { - 
return TilePartitioner::GridSize(M_size, N_size, Batch_size); + return TilePartitioner::GridSize(M, N, KBatch); } __host__ static constexpr auto BlockSize() { return dim3(KernelBlockSize); } @@ -40,34 +40,30 @@ struct GemmKernel const void* a_ptr; const void* b_ptr; void* c_ptr; - - float epsilon; - - ck_tile::index_t M; - ck_tile::index_t N; - ck_tile::index_t K; - ck_tile::index_t stride_A; - ck_tile::index_t stride_B; - ck_tile::index_t stride_C; + index_t M; + index_t N; + index_t K; + index_t stride_A; + index_t stride_B; + index_t stride_C; }; CK_TILE_HOST static constexpr GemmCommonKargs MakeKargs(const void* a_ptr, const void* b_ptr, void* c_ptr, - float epsilon, - ck_tile::index_t M, - ck_tile::index_t N, - ck_tile::index_t K, - ck_tile::index_t stride_A, - ck_tile::index_t stride_B, - ck_tile::index_t stride_C) + index_t M, + index_t N, + index_t K, + index_t stride_A, + index_t stride_B, + index_t stride_C) { - return GemmCommonKargs{a_ptr, b_ptr, c_ptr, epsilon, M, N, K, stride_A, stride_B, stride_C}; + return GemmCommonKargs{a_ptr, b_ptr, c_ptr, M, N, K, stride_A, stride_B, stride_C}; } - CK_TILE_HOST_DEVICE static constexpr ck_tile::index_t GetSmemSize() + CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize() { - return ck_tile::max(GemmPipeline::GetSmemSize(), EpiloguePipeline::GetSmemSize()); + return max(GemmPipeline::GetSmemSize(), EpiloguePipeline::GetSmemSize()); } CK_TILE_DEVICE void operator()(GemmCommonKargs kargs) const @@ -78,13 +74,13 @@ struct GemmKernel const BDataType* b_start = static_cast(kargs.b_ptr); // Convert pointers to tensor views auto a_tensor_view = [&]() { - if constexpr(std::is_same_v) + if constexpr(std::is_same_v) { return make_naive_tensor_view( a_start, make_tuple(kargs.M, kargs.K), - make_tuple(1, kargs.stride_A), - number{}, + make_tuple(kargs.stride_A, 1), + number{}, number<1>{}); } else @@ -92,29 +88,29 @@ struct GemmKernel return make_naive_tensor_view( a_start, make_tuple(kargs.M, kargs.K), - 
make_tuple(kargs.stride_A, 1), - number{}, + make_tuple(1, kargs.stride_A), + number<1>{}, number<1>{}); } }(); auto b_tensor_view = [&]() { - if constexpr(std::is_same_v) + if constexpr(std::is_same_v) { return make_naive_tensor_view( b_start, make_tuple(kargs.N, kargs.K), make_tuple(1, kargs.stride_B), - number{}, + number<1>{}, number<1>{}); } else - { // Default NK layout + { return make_naive_tensor_view( b_start, make_tuple(kargs.N, kargs.K), make_tuple(kargs.stride_B, 1), - number{}, + number{}, number<1>{}); } }(); @@ -122,10 +118,12 @@ struct GemmKernel auto a_pad_view = pad_tensor_view( a_tensor_view, make_tuple(number{}, number{}), - sequence < 0, - GemmPipeline::kPadA ? 1 : 0 > {}); + // somehow clang-format is splitting below line into multiple. + // clang-format off + sequence{}); + // clang-format on - auto ABlockWindow = make_tile_window( + auto a_block_window = make_tile_window( a_pad_view, make_tuple(number{}, number{}), {i_m, 0}); @@ -133,10 +131,11 @@ struct GemmKernel auto b_pad_view = pad_tensor_view( b_tensor_view, make_tuple(number{}, number{}), - sequence < 0, - GemmPipeline::kPadB ? 1 : 0 > {}); + // clang-format off + sequence{}); + // clang-format on - auto BBlockWindow = make_tile_window( + auto b_block_window = make_tile_window( b_pad_view, make_tuple(number{}, number{}), {i_n, 0}); @@ -144,20 +143,21 @@ struct GemmKernel // allocate LDS __shared__ char smem_ptr[GetSmemSize()]; - const index_t num_loop = (kargs.K + TilePartitioner::kK - 1) / TilePartitioner::kK; - - auto acc = GemmPipeline{}(ABlockWindow, BBlockWindow, num_loop, smem_ptr); + const index_t num_loop = TilePartitioner::GetLoopNum(kargs.K); - CODataType* c_start = static_cast(kargs.c_ptr); + // Run GEMM cooperatively by whole workgroup. 
+ auto c_block_tile = + GemmPipeline{}.template operator()(a_block_window, b_block_window, num_loop, smem_ptr); + CDataType* c_start = static_cast(kargs.c_ptr); auto c_tensor_view = [&]() { - if constexpr(std::is_same_v) + if constexpr(std::is_same_v) { return make_naive_tensor_view( c_start, make_tuple(kargs.M, kargs.N), - make_tuple(1, kargs.stride_C), - number{}, + make_tuple(kargs.stride_C, 1), + number{}, number<1>{}); } else @@ -165,8 +165,8 @@ struct GemmKernel return make_naive_tensor_view( c_start, make_tuple(kargs.M, kargs.N), - make_tuple(kargs.stride_C, 1), - number{}, + make_tuple(1, kargs.stride_C), + number<1>{}, number<1>{}); } }(); @@ -174,14 +174,15 @@ struct GemmKernel auto c_pad_view = pad_tensor_view( c_tensor_view, make_tuple(number{}, number{}), - sequence < 0, - GemmPipeline::kPadC ? 1 : 0 > {}); - auto CBlockWindow_pad = make_tile_window( + // clang-format off + sequence{}); + // clang-format on + auto c_block_window = make_tile_window( c_pad_view, make_tuple(number{}, number{}), {i_m, i_n}); - EpiloguePipeline{}(CBlockWindow_pad, acc); + EpiloguePipeline{}(c_block_window, c_block_tile); } }; diff --git a/include/ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp b/include/ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp index a49ffc291..6387233c0 100644 --- a/include/ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp +++ b/include/ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp @@ -9,26 +9,30 @@ namespace ck_tile { template struct GemmTilePartitioner { - using BlockGemmShape = ck_tile::remove_cvref_t; + using BlockGemmShape = remove_cvref_t; - static constexpr ck_tile::index_t kM = BlockGemmShape::kM; - static constexpr ck_tile::index_t kN = BlockGemmShape::kN; - static constexpr ck_tile::index_t kK = BlockGemmShape::kK; + static constexpr index_t kM = BlockGemmShape::kM; + static constexpr index_t kN = BlockGemmShape::kN; + static constexpr index_t kK = BlockGemmShape::kK; - CK_TILE_HOST static constexpr auto - GridSize(ck_tile::index_t 
M, ck_tile::index_t N, ck_tile::index_t batch_size) + CK_TILE_HOST static constexpr auto GridSize(index_t M, index_t N, index_t batch_size) { - ck_tile::index_t GridDimX = (M + kM - 1) / kM; - ck_tile::index_t GridDimY = (N + kN - 1) / kN; - ck_tile::index_t GridDimZ = batch_size; + index_t GridDimX = (M + kM - 1) / kM; + index_t GridDimY = (N + kN - 1) / kN; + index_t GridDimZ = batch_size; return dim3(GridDimX, GridDimY, GridDimZ); } + CK_TILE_HOST_DEVICE static constexpr auto GetLoopNum(index_t K) + { + return integer_divide_ceil(K, kK); + } + CK_TILE_DEVICE auto operator()() { const index_t iM = __builtin_amdgcn_readfirstlane(blockIdx.x * kM); const index_t iN = __builtin_amdgcn_readfirstlane(blockIdx.y * kN); - return ck_tile::make_tuple(iM, iN); + return make_tuple(iM, iN); } }; } // namespace ck_tile diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp new file mode 100644 index 000000000..b9b45d3f4 --- /dev/null +++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp @@ -0,0 +1,413 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck_tile/core.hpp" +#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1_default_policy.hpp" +#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_scheduler.hpp" + +namespace ck_tile { + +// A Tile Window: global memory +// B Tile Window: global memory +// C Distributed tensor: register +template +struct BaseGemmPipelineAgBgCrMem +{ + using ADataType = remove_cvref_t; + using BDataType = remove_cvref_t; + using BlockGemmShape = remove_cvref_t; + + static constexpr index_t BlockSize = Problem::kBlockSize; + static constexpr index_t MPerBlock = BlockGemmShape::kM; + static constexpr index_t NPerBlock = BlockGemmShape::kN; + static constexpr index_t KPerBlock = BlockGemmShape::kK; + + // TODO: Is this 32K value gfx9 arch specific? + static constexpr index_t MinMemInFlyBytes = 32768; + + static constexpr index_t WgpPerCU = + (4 * get_warp_size() / BlockSize) >= 1 ? 4 * get_warp_size() / BlockSize : 1; + static constexpr index_t FullMemBandPrefetchStages = integer_divide_ceil( + MinMemInFlyBytes / WgpPerCU, + (MPerBlock * sizeof(ADataType) + NPerBlock * sizeof(BDataType)) * KPerBlock); + static constexpr index_t PrefetchStages = + FullMemBandPrefetchStages >= 2 + ? FullMemBandPrefetchStages <= 8 ? 
FullMemBandPrefetchStages : 8 + : 2; + + static constexpr index_t LocalPrefillStages = 1; + static constexpr index_t GlobalBufferNum = PrefetchStages; + + CK_TILE_HOST static constexpr bool BlockHasHotloop(index_t num_loop) + { + return num_loop > PrefetchStages; + } + + CK_TILE_HOST static constexpr TailNumber GetBlockLoopTailNum(index_t num_loop) + { + if(num_loop % PrefetchStages == 1) + { + return TailNumber::One; + } + else if(num_loop % PrefetchStages == 2) + { + return TailNumber::Two; + } + else if(num_loop % PrefetchStages == 3) + { + return TailNumber::Three; + } + else if(num_loop % PrefetchStages == 4) + { + return TailNumber::Four; + } + else if(num_loop % PrefetchStages == 5) + { + return TailNumber::Five; + } + else if(num_loop % PrefetchStages == 6) + { + return TailNumber::Six; + } + else if(num_loop % PrefetchStages == 7) + { + return TailNumber::Seven; + } + else + { + return TailNumber::Full; + } + } +}; + +// Maximum Global Memory throughput pipeline with >=32KB data in fly +// GlobalPrefetchStages: >=2 +// LocalPreFillStages: 1 +// LocalPreFetchStages: 0 +// LocalSharedMemoryBuffer: 1 +template +struct GemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem +{ + using Base = BaseGemmPipelineAgBgCrMem; + + using ADataType = remove_cvref_t; + using BDataType = remove_cvref_t; + using CDataType = remove_cvref_t; + using BlockGemmShape = remove_cvref_t; + + using ALayout = remove_cvref_t; + using BLayout = remove_cvref_t; + using CLayout = remove_cvref_t; + + using BlockGemm = remove_cvref_t())>; + using I0 = number<0>; + + static constexpr index_t BlockSize = Problem::kBlockSize; + static constexpr index_t MPerBlock = BlockGemmShape::kM; + static constexpr index_t NPerBlock = BlockGemmShape::kN; + static constexpr index_t KPerBlock = BlockGemmShape::kK; + + static constexpr index_t VectorSizeA = Problem::VectorSizeA; + static constexpr index_t VectorSizeB = Problem::VectorSizeB; + static constexpr index_t VectorSizeC = Problem::VectorSizeC; + + 
static constexpr bool kPadA = Problem::kPadA; + static constexpr bool kPadB = Problem::kPadB; + static constexpr bool kPadC = Problem::kPadC; + + // Where is the right place for HasHotLoop and TailNum ??? + static constexpr bool HasHotLoop = Problem::HasHotLoop; + static constexpr auto TailNum = Problem::TailNum; + static constexpr auto Scheduler = Problem::Scheduler; + + using Base::PrefetchStages; + + CK_TILE_HOST_DEVICE constexpr index_t GetStaticLdsSize() + { + return integer_divide_ceil( + sizeof(ADataType) * + Policy::template MakeALdsBlockDescriptor().get_element_space_size(), + 16) * + 16 + + sizeof(BDataType) * + Policy::template MakeBLdsBlockDescriptor().get_element_space_size(); + } + + CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize() + { + return Policy::template GetSmemSize(); + } + + template + struct PipelineImpl + { + }; + + template <> + struct PipelineImpl + { + template + CK_TILE_DEVICE void GlobalPrefetch(DstBlockTile& dst_block_tile, + SrcTileWindow& dram_tile_window) const + { + load_tile(dst_block_tile, dram_tile_window); + move_tile_window(dram_tile_window, {0, KPerBlock}); + } + + template + CK_TILE_DEVICE void LocalPrefill(DstTileWindow& lds_tile_window, + const SrcBlockTile& src_block_tile, + const ElementFunction& element_func) const + { + const auto block_tile_tmp = tile_elementwise_in(element_func, src_block_tile); + store_tile(lds_tile_window, block_tile_tmp); + } + + template + CK_TILE_DEVICE auto operator()(const ADramBlockWindowTmp& a_dram_block_window_tmp, + const AElementFunction& a_element_func, + const BDramBlockWindowTmp& b_dram_block_window_tmp, + const BElementFunction& b_element_func, + index_t num_loop, + void* p_smem) const + { + static_assert( + std::is_same_v> && + std::is_same_v>, + "A/B Dram block window should have the same data type as appropriate " + "([A|B]DataType) defined in Problem definition!"); + + static_assert(MPerBlock == ADramBlockWindowTmp{}.get_window_lengths()[number<0>{}] && + NPerBlock == + 
BDramBlockWindowTmp{}.get_window_lengths()[number<0>{}] && + KPerBlock == ADramBlockWindowTmp{}.get_window_lengths()[number<1>{}], + "A/B block window appropriate sizes must be equal to MPerBlock/NPerblock" + " or KPerBlock!"); + + // ------------------------------------------------------------------------------------ + // Definitions of all needed tiles + + // A tile in LDS + ADataType* p_a_lds = static_cast(p_smem); + constexpr auto a_lds_block_desc = Policy::template MakeALdsBlockDescriptor(); + auto a_lds_block = make_tensor_view(p_a_lds, a_lds_block_desc); + + // TODO: LDS alignment should come from Policy! + constexpr index_t a_lds_block_space_size_aligned = + integer_divide_ceil(sizeof(ADataType) * a_lds_block_desc.get_element_space_size(), + 16) * + 16; + + // B tile in LDS + BDataType* p_b_lds = static_cast( + static_cast(static_cast(p_smem) + a_lds_block_space_size_aligned)); + constexpr auto b_lds_block_desc = Policy::template MakeBLdsBlockDescriptor(); + auto b_lds_block = make_tensor_view(p_b_lds, b_lds_block_desc); + + // A DRAM tile window for load + auto a_copy_dram_window = + make_tile_window(a_dram_block_window_tmp.get_bottom_tensor_view(), + make_tuple(number{}, number{}), + a_dram_block_window_tmp.get_window_origin(), + Policy::template MakeADramTileDistribution()); + + // A LDS tile window for store + auto a_copy_lds_window = + make_tile_window(a_lds_block, + make_tuple(number{}, number{}), + {0, 0}, + a_copy_dram_window.get_tile_distribution()); + // B DRAM tile window for load + auto b_copy_dram_window = + make_tile_window(b_dram_block_window_tmp.get_bottom_tensor_view(), + make_tuple(number{}, number{}), + b_dram_block_window_tmp.get_window_origin(), + Policy::template MakeBDramTileDistribution()); + + // B LDS tile window for store + auto b_copy_lds_window = + make_tile_window(b_lds_block, + make_tuple(number{}, number{}), + {0, 0}, + b_copy_dram_window.get_tile_distribution()); + + // A LDS tile for block GEMM + auto a_lds_gemm_window = 
make_tile_window( + a_lds_block, make_tuple(number{}, number{}), {0, 0}); + // B LDS tile for block GEMM + auto b_lds_gemm_window = make_tile_window( + b_lds_block, make_tuple(number{}, number{}), {0, 0}); + + // Block GEMM + constexpr auto block_gemm = BlockGemm(); + auto c_block_tile = block_gemm.MakeCBlockTile(); + + using ABlockTileDistr = decltype(a_copy_dram_window.get_tile_distribution()); + using BBlockTileDistr = decltype(b_copy_dram_window.get_tile_distribution()); + + using ABlockTile = + decltype(make_static_distributed_tensor(ABlockTileDistr{})); + using BBlockTile = + decltype(make_static_distributed_tensor(BBlockTileDistr{})); + + tuple_array a_block_tiles; + tuple_array b_block_tiles; + + // ----------------------------------------------------------------------------------------- + // Gemm pipeline start + + // prefetch + // global read 0 + GlobalPrefetch(a_block_tiles.get(I0{}), a_copy_dram_window); + GlobalPrefetch(b_block_tiles.get(I0{}), b_copy_dram_window); + + // initialize C + tile_elementwise_inout([](auto& c) { c = 0; }, c_block_tile); + + // LDS write 0 + LocalPrefill(a_copy_lds_window, a_block_tiles.get(I0{}), a_element_func); + LocalPrefill(b_copy_lds_window, b_block_tiles.get(I0{}), b_element_func); + + // Global prefetch [1, PrefetchStages] + static_for<1, PrefetchStages, 1>{}([&](auto prefetch_idx) { + GlobalPrefetch(a_block_tiles.get(number{}), a_copy_dram_window); + GlobalPrefetch(b_block_tiles.get(number{}), b_copy_dram_window); + }); + + // main body + if constexpr(HasHotLoop) + { + index_t i = 0; + do + { + static_for<0, PrefetchStages, 1>{}([&](auto prefetch_idx) { + block_sync_lds(); + // block_gemm.LocalPrefetch(); + block_gemm(c_block_tile, a_lds_gemm_window, b_lds_gemm_window); + + block_sync_lds(); + + LocalPrefill( + a_copy_lds_window, + a_block_tiles.get(number<(prefetch_idx + 1) % PrefetchStages>{}), + a_element_func); + LocalPrefill( + b_copy_lds_window, + b_block_tiles.get(number<(prefetch_idx + 1) % 
PrefetchStages>{}), + b_element_func); + + GlobalPrefetch(a_block_tiles.get(number{}), + a_copy_dram_window); + GlobalPrefetch(b_block_tiles.get(number{}), + b_copy_dram_window); + }); + + i += PrefetchStages; + } while(i < (num_loop - PrefetchStages)); + } + + auto HotLoopTail = [&](auto tail_num) { + static_for<1, tail_num, 1>{}([&](auto prefetch_idx) { + block_sync_lds(); + + // block_gemm.LocalPrefetch(); + block_gemm(c_block_tile, a_lds_gemm_window, b_lds_gemm_window); + + block_sync_lds(); + LocalPrefill(a_copy_lds_window, + a_block_tiles.get(number{}), + a_element_func); + LocalPrefill(b_copy_lds_window, + b_block_tiles.get(number{}), + b_element_func); + }); + + block_sync_lds(); + // block_gemm.LocalPrefetch(); + block_gemm(c_block_tile, a_lds_gemm_window, b_lds_gemm_window); + }; + + if constexpr(TailNum == TailNumber::One) + { + block_sync_lds(); + // block_gemm.LocalPrefetch(); + block_gemm(c_block_tile, a_lds_gemm_window, b_lds_gemm_window); + } + else if constexpr(TailNum == TailNumber::Two) + { + HotLoopTail(number<2>{}); + } + else if constexpr(TailNum == TailNumber::Three) + { + HotLoopTail(number<3>{}); + } + else if constexpr(TailNum == TailNumber::Four) + { + HotLoopTail(number<4>{}); + } + else if constexpr(TailNum == TailNumber::Five) + { + HotLoopTail(number<5>{}); + } + else if constexpr(TailNum == TailNumber::Six) + { + HotLoopTail(number<6>{}); + } + else if constexpr(TailNum == TailNumber::Seven) + { + HotLoopTail(number<7>{}); + } + else if constexpr(TailNum == TailNumber::Full) + { + HotLoopTail(number{}); + } + + return c_block_tile; + } + }; + + template + CK_TILE_DEVICE auto operator()(const ADramBlockWindowTmp& a_dram_block_window_tmp, + const AElementFunction& a_element_func, + const BDramBlockWindowTmp& b_dram_block_window_tmp, + const BElementFunction& b_element_func, + index_t num_loop, + void* p_smem) const + { + return PipelineImpl{}.template operator()( + a_dram_block_window_tmp, + a_element_func, + b_dram_block_window_tmp, + 
b_element_func, + num_loop, + p_smem); + } + + template + CK_TILE_DEVICE auto operator()(const ADramBlockWindowTmp& a_dram_block_window_tmp, + const BDramBlockWindowTmp& b_dram_block_window_tmp, + index_t num_loop, + void* p_smem) const + { + return PipelineImpl{}.template operator()( + a_dram_block_window_tmp, + [](const ADataType& a) { return a; }, + b_dram_block_window_tmp, + [](const BDataType& b) { return b; }, + num_loop, + p_smem); + } +}; + +} // namespace ck_tile diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_scheduler.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_scheduler.hpp new file mode 100644 index 000000000..5e93ca21c --- /dev/null +++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_scheduler.hpp @@ -0,0 +1,71 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include + +#include "ck_tile/core.hpp" + +namespace ck_tile { + +enum struct GemmPipelineScheduler +{ + Intrawave, + Interwave, +}; + +enum struct TailNumber +{ + // Single / Double buffer pipeline + Odd, + Even, + + // Long prefetch pipeline, up to 8 + One, + Two, + Three, + Four, + Five, + Six, + Seven, + + // Unroll stages > Prefetch stages, number of loop is multiple of unroll stages + Empty, + // Unroll stages <= Prefetch stages, number of loop is multiple of unroll stages add + // prefetchstages + Full, +}; + +} // namespace ck_tile + +inline std::ostream& operator<<(std::ostream& os, const ck_tile::GemmPipelineScheduler& s) +{ + switch(s) + { + case ck_tile::GemmPipelineScheduler::Intrawave: os << "Intrawave"; break; + case ck_tile::GemmPipelineScheduler::Interwave: os << "Interwave"; break; + default: os << ""; + } + return os; +} + +inline std::ostream& operator<<(std::ostream& os, const ck_tile::TailNumber& s) +{ + switch(s) + { + case ck_tile::TailNumber::Odd: os << "Odd"; break; + case ck_tile::TailNumber::Even: os << "Even"; break; + case 
ck_tile::TailNumber::One: os << "One"; break; + case ck_tile::TailNumber::Two: os << "Two"; break; + case ck_tile::TailNumber::Three: os << "Three"; break; + case ck_tile::TailNumber::Four: os << "Four"; break; + case ck_tile::TailNumber::Five: os << "Five"; break; + case ck_tile::TailNumber::Six: os << "Six"; break; + case ck_tile::TailNumber::Seven: os << "Seven"; break; + case ck_tile::TailNumber::Empty: os << "Empty"; break; + case ck_tile::TailNumber::Full: os << "Full"; break; + default: os << ""; + } + return os; +} diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1.hpp index 5ed7d036e..a2424290e 100644 --- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1.hpp +++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
#pragma once @@ -19,27 +19,27 @@ struct GemmPipelineAGmemBGmemCRegV1 using CDataType = remove_cvref_t; using BlockGemmShape = remove_cvref_t; - static constexpr index_t kBlockSize = Problem::kBlockSize; + using ALayout = remove_cvref_t; + using BLayout = remove_cvref_t; + using CLayout = remove_cvref_t; + + static constexpr index_t BlockSize = Problem::kBlockSize; static constexpr index_t kMPerBlock = BlockGemmShape::kM; static constexpr index_t kNPerBlock = BlockGemmShape::kN; static constexpr index_t kKPerBlock = BlockGemmShape::kK; - static constexpr index_t AlignmentA = Problem::AlignmentA; - static constexpr index_t AlignmentB = Problem::AlignmentB; - static constexpr index_t AlignmentC = Problem::AlignmentC; + static constexpr index_t VectorSizeA = Problem::VectorSizeA; + static constexpr index_t VectorSizeB = Problem::VectorSizeB; + static constexpr index_t VectorSizeC = Problem::VectorSizeC; static constexpr bool kPadA = Problem::kPadA; static constexpr bool kPadB = Problem::kPadB; static constexpr bool kPadC = Problem::kPadC; - using LayoutA = remove_cvref_t; - using LayoutB = remove_cvref_t; - using LayoutC = remove_cvref_t; - - CK_TILE_HOST_DEVICE static constexpr ck_tile::index_t GetStaticLdsSize() + CK_TILE_HOST_DEVICE static constexpr index_t GetStaticLdsSize() { - return ck_tile::integer_divide_ceil( + return integer_divide_ceil( sizeof(ADataType) * Policy::template MakeALdsBlockDescriptor().get_element_space_size(), 16) * @@ -48,7 +48,7 @@ struct GemmPipelineAGmemBGmemCRegV1 Policy::template MakeBLdsBlockDescriptor().get_element_space_size(); } - CK_TILE_HOST_DEVICE static constexpr ck_tile::index_t GetSmemSize() + CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize() { return Policy::template GetSmemSize(); } diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1_default_policy.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1_default_policy.hpp index 8639f00fb..199ba56aa 100644 --- 
a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1_default_policy.hpp +++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1_default_policy.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -71,8 +71,6 @@ struct GemmPipelineAGmemBGmemCRegV1DefaultPolicy template CK_TILE_HOST_DEVICE static constexpr auto MakeBLdsBlockDescriptor() { - using namespace ck_tile; - constexpr index_t kNPerBlock = Problem::BlockGemmShape::kN; constexpr index_t kKPerBlock = Problem::BlockGemmShape::kK; @@ -93,7 +91,7 @@ struct GemmPipelineAGmemBGmemCRegV1DefaultPolicy } template - CK_TILE_HOST_DEVICE static constexpr ck_tile::index_t GetSmemSizeA() + CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSizeA() { constexpr index_t smem_size_a = sizeof(typename Problem::ADataType) * MakeALdsBlockDescriptor().get_element_space_size(); @@ -101,7 +99,7 @@ struct GemmPipelineAGmemBGmemCRegV1DefaultPolicy } template - CK_TILE_HOST_DEVICE static constexpr ck_tile::index_t GetSmemSizeB() + CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSizeB() { constexpr index_t smem_size_b = sizeof(typename Problem::BDataType) * MakeBLdsBlockDescriptor().get_element_space_size(); @@ -109,7 +107,7 @@ struct GemmPipelineAGmemBGmemCRegV1DefaultPolicy } template - CK_TILE_HOST_DEVICE static constexpr ck_tile::index_t GetSmemSize() + CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize() { constexpr index_t smem_size_a = GetSmemSizeA(); constexpr index_t smem_size_b = GetSmemSizeB(); diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v2.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v2.hpp index bff7fc0a0..96a5a61c8 100644 --- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v2.hpp +++ 
b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v2.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -25,9 +25,9 @@ struct GemmPipelineAGmemBGmemCRegV2 static constexpr index_t kNPerBlock = BlockGemmShape::kN; static constexpr index_t kKPerBlock = BlockGemmShape::kK; - CK_TILE_HOST_DEVICE static constexpr ck_tile::index_t GetStaticLdsSize() + CK_TILE_HOST_DEVICE static constexpr index_t GetStaticLdsSize() { - return ck_tile::integer_divide_ceil( + return integer_divide_ceil( sizeof(ADataType) * Policy::template MakeALdsBlockDescriptor().get_element_space_size(), 16) * diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_problem.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_problem.hpp index d7b3b24a4..1156f549b 100644 --- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_problem.hpp +++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_problem.hpp @@ -1,14 +1,15 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
#pragma once #include "ck_tile/core.hpp" - -#define VectorLoadSize 16 +#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_scheduler.hpp" namespace ck_tile { +static constexpr int _VectorSize = 16; + template ; using GemmTraits = remove_cvref_t; + using ALayout = remove_cvref_t; + using BLayout = remove_cvref_t; + using CLayout = remove_cvref_t; + static constexpr index_t kBlockSize = BlockGemmShape::NumWarps * get_warp_size(); static constexpr bool kPadA = GemmTraits::kPadA; static constexpr bool kPadB = GemmTraits::kPadB; static constexpr bool kPadC = GemmTraits::kPadC; - using LayoutA = remove_cvref_t; - using LayoutB = remove_cvref_t; - using LayoutC = remove_cvref_t; + static constexpr index_t VectorSizeA = kPadA ? 1 : _VectorSize / sizeof(ADataType); + static constexpr index_t VectorSizeB = kPadB ? 1 : _VectorSize / sizeof(BDataType); + static constexpr index_t VectorSizeC = kPadC ? 1 : _VectorSize / sizeof(CDataType); +}; + +template +struct UniversalGemmPipelineProblem +{ + using ADataType = remove_cvref_t; + using BDataType = remove_cvref_t; + using CDataType = remove_cvref_t; + using BlockGemmShape = remove_cvref_t; + using GemmTraits = remove_cvref_t; + + using ALayout = remove_cvref_t; + using BLayout = remove_cvref_t; + using CLayout = remove_cvref_t; + + static constexpr auto Scheduler = Scheduler_; + static constexpr auto HasHotLoop = HasHotLoop_; + static constexpr auto TailNum = TailNum_; + static constexpr index_t kBlockSize = BlockGemmShape::NumWarps * get_warp_size(); + + static constexpr bool kPadA = GemmTraits::kPadA; + static constexpr bool kPadB = GemmTraits::kPadB; + static constexpr bool kPadC = GemmTraits::kPadC; - static constexpr index_t AlignmentA = kPadA ? 1 : VectorLoadSize / sizeof(ADataType); - static constexpr index_t AlignmentB = kPadB ? 1 : VectorLoadSize / sizeof(BDataType); - static constexpr index_t AlignmentC = kPadC ? 1 : VectorLoadSize / sizeof(CDataType); + static constexpr index_t VectorSizeA = kPadA ? 
_VectorSize / sizeof(ADataType) : 1; + static constexpr index_t VectorSizeB = kPadB ? _VectorSize / sizeof(BDataType) : 1; + static constexpr index_t VectorSizeC = kPadC ? _VectorSize / sizeof(CDataType) : 1; }; } // namespace ck_tile diff --git a/include/ck_tile/ops/gemm/pipeline/tile_gemm_traits.hpp b/include/ck_tile/ops/gemm/pipeline/tile_gemm_traits.hpp index 98da1510c..9d050be2f 100644 --- a/include/ck_tile/ops/gemm/pipeline/tile_gemm_traits.hpp +++ b/include/ck_tile/ops/gemm/pipeline/tile_gemm_traits.hpp @@ -1,27 +1,25 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. #pragma once -#include "ck_tile/core.hpp" - namespace ck_tile { template + typename ALayout_, + typename BLayout_, + typename CLayout_> struct TileGemmTraits { static constexpr bool kPadA = kPadA_; static constexpr bool kPadB = kPadB_; static constexpr bool kPadC = kPadC_; - using LayoutA = LayoutA_; - using LayoutB = LayoutB_; - using LayoutC = LayoutC_; + using ALayout = ALayout_; + using BLayout = BLayout_; + using CLayout = CLayout_; }; } // namespace ck_tile diff --git a/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma_impl.hpp b/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma_impl.hpp index dd164e72e..bb59a7298 100644 --- a/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma_impl.hpp +++ b/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma_impl.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
#pragma once @@ -39,9 +39,9 @@ struct WarpGemmAttributeMfmaImplF16F16F32M32N32K8 #if defined(__gfx9__) c_vec = __builtin_amdgcn_mfma_f32_32x32x8f16(a_vec, b_vec, c_vec, 0, 0, 0); #else - ck_tile::ignore = c_vec; - ck_tile::ignore = a_vec; - ck_tile::ignore = b_vec; + ignore = c_vec; + ignore = a_vec; + ignore = b_vec; #endif } @@ -52,8 +52,8 @@ struct WarpGemmAttributeMfmaImplF16F16F32M32N32K8 return bit_cast( __builtin_amdgcn_mfma_f32_32x32x8f16(a_vec, b_vec, fp32x16_t{0.f}, 0, 0, 0)); #else - ck_tile::ignore = a_vec; - ck_tile::ignore = b_vec; + ignore = a_vec; + ignore = b_vec; return CVecType{0.f}; #endif } @@ -90,9 +90,9 @@ struct WarpGemmAttributeMfmaImplF16F16F32M16N16K16 #if defined(__gfx9__) c_vec = __builtin_amdgcn_mfma_f32_16x16x16f16(a_vec, b_vec, c_vec, 0, 0, 0); #else - ck_tile::ignore = c_vec; - ck_tile::ignore = a_vec; - ck_tile::ignore = b_vec; + ignore = c_vec; + ignore = a_vec; + ignore = b_vec; #endif } @@ -103,8 +103,8 @@ struct WarpGemmAttributeMfmaImplF16F16F32M16N16K16 return bit_cast( __builtin_amdgcn_mfma_f32_16x16x16f16(a_vec, b_vec, fp32x4_t{0.f}, 0, 0, 0)); #else - ck_tile::ignore = a_vec; - ck_tile::ignore = b_vec; + ignore = a_vec; + ignore = b_vec; return CVecType{0.f}; #endif } @@ -154,9 +154,9 @@ struct WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8 0); }); #else - ck_tile::ignore = c_vec; - ck_tile::ignore = a_vec; - ck_tile::ignore = b_vec; + ignore = c_vec; + ignore = a_vec; + ignore = b_vec; #endif } @@ -181,8 +181,8 @@ struct WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8 }); return c_vec; #else - ck_tile::ignore = a_vec; - ck_tile::ignore = b_vec; + ignore = a_vec; + ignore = b_vec; return CVecType{0.f}; #endif } @@ -231,9 +231,9 @@ struct WarpGemmAttributeMfmaImplBf16Bf16F32M16N16K16 0); }); #else - ck_tile::ignore = c_vec; - ck_tile::ignore = a_vec; - ck_tile::ignore = b_vec; + ignore = c_vec; + ignore = a_vec; + ignore = b_vec; #endif } @@ -258,8 +258,8 @@ struct WarpGemmAttributeMfmaImplBf16Bf16F32M16N16K16 }); return c_vec; 
#else - ck_tile::ignore = a_vec; - ck_tile::ignore = b_vec; + ignore = a_vec; + ignore = b_vec; return CVecType{0.f}; #endif } @@ -320,9 +320,9 @@ struct WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base c_vec = __builtin_amdgcn_mfma_f32_32x32x2f32(a_f32, b_f32, c_vec, 0, 0, 0); }); #else - ck_tile::ignore = c_vec; - ck_tile::ignore = a_vec; - ck_tile::ignore = b_vec; + ignore = c_vec; + ignore = a_vec; + ignore = b_vec; #endif } @@ -356,8 +356,8 @@ struct WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base }); return c_vec; #else - ck_tile::ignore = a_vec; - ck_tile::ignore = b_vec; + ignore = a_vec; + ignore = b_vec; return CVecType{0.f}; #endif } diff --git a/include/ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp b/include/ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp index 99cd5d787..4183d9cb9 100644 --- a/include/ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp +++ b/include/ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
#pragma once @@ -21,40 +21,40 @@ struct WarpGemmMfmaDispatcher; // clang-format off // fp16 -template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaF16F16F32M32N32K8; }; -template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaF16F16F32M32N32K8TransposedCDistribution; }; -template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaF16F16F32M32N32K16; }; -template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaF16F16F32M32N32K16TransposedCDistribution; }; -template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaF16F16F32M16N16K16; }; -template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaF16F16F32M16N16K16TransposedCDistribution; }; -template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaF16F16F32M16N16K32; }; -template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaF16F16F32M16N16K32TransposedCDistribution; }; +template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaF16F16F32M32N32K8; }; +template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaF16F16F32M32N32K8TransposedCDistribution; }; +template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaF16F16F32M32N32K16; }; +template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaF16F16F32M32N32K16TransposedCDistribution; }; +template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaF16F16F32M16N16K16; }; +template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaF16F16F32M16N16K16TransposedCDistribution; }; +template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaF16F16F32M16N16K32; }; +template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaF16F16F32M16N16K32TransposedCDistribution; }; -template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaF16F16F32M32N32K8SwizzleA; }; -template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaF16F16F32M32N32K16SwizzleA; }; +template<> struct WarpGemmMfmaDispatcher { using Type = 
WarpGemmMfmaF16F16F32M32N32K8SwizzleA; }; +template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaF16F16F32M32N32K16SwizzleA; }; // bf16 -template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaBf16Bf16F32M32N32K8; }; -template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaBf16Bf16F32M32N32K8TransposedCDistribution; }; -template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaBf16Bf16F32M32N32K16; }; -template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaBf16Bf16F32M32N32K16TransposedCDistribution; }; -template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaBf16Bf16F32M16N16K16; }; -template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaBf16Bf16F32M16N16K16TransposedCDistribution; }; -template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaBf16Bf16F32M16N16K32; }; -template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaBf16Bf16F32M16N16K32TransposedCDistribution; }; +template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaBf16Bf16F32M32N32K8; }; +template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaBf16Bf16F32M32N32K8TransposedCDistribution; }; +template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaBf16Bf16F32M32N32K16; }; +template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaBf16Bf16F32M32N32K16TransposedCDistribution; }; +template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaBf16Bf16F32M16N16K16; }; +template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaBf16Bf16F32M16N16K16TransposedCDistribution; }; +template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaBf16Bf16F32M16N16K32; }; +template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaBf16Bf16F32M16N16K32TransposedCDistribution; }; -template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaBf16Bf16F32M32N32K8SwizzleA; }; -template<> struct WarpGemmMfmaDispatcher { using Type = 
WarpGemmMfmaBf16Bf16F32M32N32K16SwizzleA; }; +template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaBf16Bf16F32M32N32K8SwizzleA; }; +template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaBf16Bf16F32M32N32K16SwizzleA; }; // fp8 -template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfma_f32_32x32x16_fp8_fp8; }; -template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfma_f32_32x32x16_fp8_fp8_CTransposed; }; -template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfma_f32_32x32x16_fp8_bf8; }; -template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfma_f32_32x32x16_fp8_bf8_CTransposed; }; -template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfma_f32_32x32x16_bf8_fp8; }; -template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfma_f32_32x32x16_bf8_fp8_CTransposed; }; -template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfma_f32_32x32x16_bf8_bf8; }; -template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfma_f32_32x32x16_bf8_bf8_CTransposed; }; +template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfma_f32_32x32x16_fp8_fp8; }; +template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfma_f32_32x32x16_fp8_fp8_CTransposed; }; +template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfma_f32_32x32x16_fp8_bf8; }; +template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfma_f32_32x32x16_fp8_bf8_CTransposed; }; +template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfma_f32_32x32x16_bf8_fp8; }; +template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfma_f32_32x32x16_bf8_fp8_CTransposed; }; +template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfma_f32_32x32x16_bf8_bf8; }; +template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfma_f32_32x32x16_bf8_bf8_CTransposed; }; // clang-format on } // namespace impl diff --git a/test/ck_tile/CMakeLists.txt b/test/ck_tile/CMakeLists.txt index 
9075ca2ed..ac9c4311d 100644 --- a/test/ck_tile/CMakeLists.txt +++ b/test/ck_tile/CMakeLists.txt @@ -1 +1,2 @@ add_subdirectory(image_to_column) +add_subdirectory(gemm) diff --git a/test/ck_tile/gemm/CMakeLists.txt b/test/ck_tile/gemm/CMakeLists.txt new file mode 100644 index 000000000..f96ad9c6e --- /dev/null +++ b/test/ck_tile/gemm/CMakeLists.txt @@ -0,0 +1,4 @@ +# Currently ck_tile is only built on gfx9 +if(GPU_TARGETS MATCHES "gfx9") + add_gtest_executable(test_ck_tile_gemm_mem_pipeline test_gemm_mem_pipeline.cpp) +endif() diff --git a/test/ck_tile/gemm/test_gemm_mem_pipeline.cpp b/test/ck_tile/gemm/test_gemm_mem_pipeline.cpp new file mode 100644 index 000000000..f72a80b5a --- /dev/null +++ b/test/ck_tile/gemm/test_gemm_mem_pipeline.cpp @@ -0,0 +1,29 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "gtest/gtest.h" + +#include "ck_tile/host.hpp" +#include "test_gemm_mem_pipeline_util.hpp" + +using F16 = ck_tile::half_t; +using F32 = float; + +using Row = ck_tile::tensor_layout::gemm::RowMajor; +using Col = ck_tile::tensor_layout::gemm::ColumnMajor; + +// clang-format off +using KernelTypes = ::testing::Types< + // ALayout, BLayout, CLayout, ADataType, BDataType, AccDataType, CDataType + std::tuple< Row, Col, Row, F16, F16, F32, F16>, + std::tuple< Col, Row, Row, F16, F16, F32, F16>, + std::tuple< Row, Row, Row, F16, F16, F32, F16>, + std::tuple< Col, Col, Row, F16, F16, F32, F16> + >; +// clang-format on + +TYPED_TEST_SUITE(TestCkTileGemmMemPipeline, KernelTypes); + +#include "test_gemm_mem_pipeline_ut_cases.inc" diff --git a/test/ck_tile/gemm/test_gemm_mem_pipeline_ut_cases.inc b/test/ck_tile/gemm/test_gemm_mem_pipeline_ut_cases.inc new file mode 100644 index 000000000..b26114f39 --- /dev/null +++ b/test/ck_tile/gemm/test_gemm_mem_pipeline_ut_cases.inc @@ -0,0 +1,41 @@ +#pragma once + +TYPED_TEST(TestCkTileGemmMemPipeline, SmallM) +{ + std::vector Ms{1, 2, 3, 4, 5, 6}; + 
constexpr int N = 1024; + constexpr int K = 320; + + for(int M : Ms) + this->Run(M, N, K); +} + +TYPED_TEST(TestCkTileGemmMemPipeline, MidLargeM) +{ + std::vector Ms{127, 255, 312, 799, 1573}; + constexpr int N = 1024; + constexpr int K = 320; + + for(int M : Ms) + this->Run(M, N, K); +} + +TYPED_TEST(TestCkTileGemmMemPipeline, PaddK) +{ + std::vector Ms{127}; + constexpr int N = 1024; + constexpr int K = 432; + + for(int M : Ms) + this->Run(M, N, K); +} + +TYPED_TEST(TestCkTileGemmMemPipeline, Regular) +{ + std::vector Ms{512}; + constexpr int N = 1024; + constexpr int K = 512; + + for(int M : Ms) + this->Run(M, N, K); +} diff --git a/test/ck_tile/gemm/test_gemm_mem_pipeline_util.hpp b/test/ck_tile/gemm/test_gemm_mem_pipeline_util.hpp new file mode 100644 index 000000000..1b243ab43 --- /dev/null +++ b/test/ck_tile/gemm/test_gemm_mem_pipeline_util.hpp @@ -0,0 +1,318 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. +#pragma once + +#include +#include + +#include "ck_tile/core.hpp" +#include "ck_tile/host.hpp" +#include "ck_tile/host/kernel_launch.hpp" +#include "ck_tile/ops/epilogue.hpp" +#include "ck_tile/ops/gemm.hpp" + +template +class TestCkTileGemmMemPipeline : public ::testing::Test +{ + protected: + using ALayout = std::tuple_element_t<0, Tuple>; + using BLayout = std::tuple_element_t<1, Tuple>; + using CLayout = std::tuple_element_t<2, Tuple>; + using ADataType = std::tuple_element_t<3, Tuple>; + using BDataType = std::tuple_element_t<4, Tuple>; + using AccDataType = std::tuple_element_t<5, Tuple>; + using CDataType = std::tuple_element_t<6, Tuple>; + // TODO: expose tile size through test t-param ? 
+ + struct gemm_basic_args + { + const void* p_a; + const void* p_b; + void* p_c; + ck_tile::index_t kbatch; + ck_tile::index_t M; + ck_tile::index_t N; + ck_tile::index_t K; + ck_tile::index_t stride_A; + ck_tile::index_t stride_B; + ck_tile::index_t stride_C; + }; + + void invoke_gemm(const gemm_basic_args& args, const ck_tile::stream_config& s) + { + // TODO: This should be parameterized in tests + constexpr ck_tile::index_t M_Tile = 128; + constexpr ck_tile::index_t N_Tile = 128; + constexpr ck_tile::index_t K_Tile = 32; + + constexpr ck_tile::index_t M_Warp = 2; + constexpr ck_tile::index_t N_Warp = 2; + constexpr ck_tile::index_t K_Warp = 1; + + constexpr ck_tile::index_t M_Warp_Tile = 32; + constexpr ck_tile::index_t N_Warp_Tile = 32; + constexpr ck_tile::index_t K_Warp_Tile = 8; + + constexpr bool kPadA = true; + constexpr bool kPadB = true; + constexpr bool kPadC = true; + + constexpr int kBlockPerCu = 1; + + // =============================================== + + using GemmShape = + ck_tile::TileGemmShape, + ck_tile::sequence, + ck_tile::sequence>; + using TilePartitioner = ck_tile::GemmTilePartitioner; + + using GemmEpilogue = ck_tile::Default2DEpilogue< + ck_tile::Default2DEpilogueProblem>; + + using Traits = ck_tile::TileGemmTraits; + + using BaseGemmPipeline = ck_tile::BaseGemmPipelineAgBgCrMem< + ck_tile::GemmPipelineProblem>; + + const ck_tile::index_t num_loop = TilePartitioner::GetLoopNum(args.K); + const bool has_hot_loop = BaseGemmPipeline::BlockHasHotloop(num_loop); + const ck_tile::TailNumber tail_num = BaseGemmPipeline::GetBlockLoopTailNum(num_loop); + + const auto Run = [&](const auto has_hot_loop_, const auto tail_number_) { + constexpr bool has_hot_loop_v = has_hot_loop_.value; + constexpr auto tail_number_v = tail_number_.value; + + using GemmPipeline = ck_tile::GemmPipelineAgBgCrMem< + ck_tile::UniversalGemmPipelineProblem>; + using Kernel = ck_tile::GemmKernel; + auto kargs = Kernel::MakeKargs(args.p_a, + args.p_b, + args.p_c, + args.M, 
+ args.N, + args.K, + args.stride_A, + args.stride_B, + args.stride_C); + + const dim3 grids = Kernel::GridSize(args.M, args.N, args.kbatch); + constexpr dim3 blocks = Kernel::BlockSize(); + + if(s.log_level_ > 0) + { + std::cout << "Lunching kernel with args:" + << " grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}" + << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z + << "}" << std::endl; + } + + ck_tile::launch_kernel( + s, ck_tile::make_kernel(Kernel{}, grids, blocks, 0, kargs)); + }; + + if(has_hot_loop) + { + // Tail pipeline One to Seven + if(tail_num == ck_tile::TailNumber::One) + { + Run(ck_tile::bool_constant{}, + ck_tile::integral_constant{}); + } + else if(tail_num == ck_tile::TailNumber::Full) + { + Run(ck_tile::bool_constant{}, + ck_tile::integral_constant{}); + } + + if constexpr(BaseGemmPipeline::PrefetchStages > 2) + { + if(tail_num == ck_tile::TailNumber::Two) + { + Run(ck_tile::bool_constant{}, + ck_tile::integral_constant{}); + } + } + if constexpr(BaseGemmPipeline::PrefetchStages > 3) + { + if(tail_num == ck_tile::TailNumber::Three) + { + Run(ck_tile::bool_constant{}, + ck_tile::integral_constant{}); + } + } + if constexpr(BaseGemmPipeline::PrefetchStages > 4) + { + if(tail_num == ck_tile::TailNumber::Four) + { + Run(ck_tile::bool_constant{}, + ck_tile::integral_constant{}); + } + } + if constexpr(BaseGemmPipeline::PrefetchStages > 5) + { + if(tail_num == ck_tile::TailNumber::Five) + { + Run(ck_tile::bool_constant{}, + ck_tile::integral_constant{}); + } + } + if constexpr(BaseGemmPipeline::PrefetchStages > 6) + { + if(tail_num == ck_tile::TailNumber::Six) + { + Run(ck_tile::bool_constant{}, + ck_tile::integral_constant{}); + } + } + if constexpr(BaseGemmPipeline::PrefetchStages > 7) + { + if(tail_num == ck_tile::TailNumber::Seven) + { + Run(ck_tile::bool_constant{}, + ck_tile::integral_constant{}); + } + } + } + else + { + // Tail number always Full - #PrefetchStages + if(tail_num == ck_tile::TailNumber::Full) 
+ { + Run(ck_tile::bool_constant{}, + ck_tile::integral_constant{}); + } + else + { + std::ostringstream err; + err << "When there's no hot loop, this tail number \"" << tail_num + << "\" is not supported! " << __FILE__ << ":" << __LINE__ + << ", in function: " << __func__; + throw std::runtime_error(err.str()); + } + } + } + + public: + std::vector k_batches_; + + void SetUp() override { k_batches_ = {1}; } + + void Run(const int M, + const int N, + const int K, + const int StrideA = 0, + const int StrideB = 0, + const int StrideC = 0) + { + for(auto kb : k_batches_) + { + RunSingle(M, N, K, StrideA, StrideB, StrideC, kb); + } + } + + void RunSingle(const int M, + const int N, + const int K, + const int StrideA, + const int StrideB, + const int StrideC, + int kbatch = 1) + { + using namespace ck_tile::literals; + + auto f_host_tensor_descriptor = [](std::size_t row, + std::size_t col, + std::size_t stride, + auto layout) { + if constexpr(std::is_same_v) + { + return ck_tile::HostTensorDescriptor({row, col}, {stride, 1_uz}); + } + else + { + return ck_tile::HostTensorDescriptor({row, col}, {1_uz, stride}); + } + }; + + auto f_get_default_stride = + [](std::size_t row, std::size_t col, std::size_t stride, auto layout) { + if(stride == 0) + { + // give a chance if stride is zero, return a default packed stride + if constexpr(std::is_same_v) + { + return col; + } + else + { + return row; + } + } + else + return stride; + }; + + std::size_t stride_A = f_get_default_stride(M, K, StrideA, ALayout{}); + std::size_t stride_B = f_get_default_stride(K, N, StrideB, BLayout{}); + std::size_t stride_C = f_get_default_stride(M, N, StrideC, CLayout{}); + + ck_tile::HostTensor a_m_k(f_host_tensor_descriptor(M, K, stride_A, ALayout{})); + ck_tile::HostTensor b_k_n(f_host_tensor_descriptor(K, N, stride_B, BLayout{})); + ck_tile::HostTensor c_m_n_dev_result( + f_host_tensor_descriptor(M, N, stride_C, CLayout{})); + + ck_tile::FillUniformDistributionIntegerValue{-5, 5}(a_m_k); + 
ck_tile::FillUniformDistributionIntegerValue{-5, 5}(b_k_n); + + ck_tile::DeviceMem a_m_k_dev_buf(a_m_k.get_element_space_size_in_bytes()); + ck_tile::DeviceMem b_k_n_dev_buf(b_k_n.get_element_space_size_in_bytes()); + ck_tile::DeviceMem c_m_n_dev_buf(c_m_n_dev_result.get_element_space_size_in_bytes()); + + a_m_k_dev_buf.ToDevice(a_m_k.data()); + b_k_n_dev_buf.ToDevice(b_k_n.data()); + c_m_n_dev_buf.SetZero(); + c_m_n_dev_result.SetZero(); + + gemm_basic_args args; + args.p_a = a_m_k_dev_buf.GetDeviceBuffer(); + args.p_b = b_k_n_dev_buf.GetDeviceBuffer(); + args.p_c = c_m_n_dev_buf.GetDeviceBuffer(); + args.kbatch = kbatch; + args.M = M; + args.N = N; + args.K = K; + args.stride_A = stride_A; + args.stride_B = stride_B; + args.stride_C = stride_C; + + invoke_gemm(args, ck_tile::stream_config{nullptr, false}); + + c_m_n_dev_buf.FromDevice(c_m_n_dev_result.data()); + bool pass = true; + + ck_tile::HostTensor c_m_n_host_ref( + f_host_tensor_descriptor(M, N, stride_C, CLayout{})); + c_m_n_host_ref.SetZero(); + + ck_tile::reference_gemm( + a_m_k, b_k_n, c_m_n_host_ref); + + pass = ck_tile::check_err(c_m_n_dev_result, c_m_n_host_ref); + EXPECT_TRUE(pass); + } +}; -- GitLab From 7d9111545f7541b16aca7c52c871314402983596 Mon Sep 17 00:00:00 2001 From: rocking Date: Wed, 30 Oct 2024 23:13:30 +0800 Subject: [PATCH 027/153] clang-format (#1612) --- .../ck_tile/ops/reduce/block/block_reduce.hpp | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/include/ck_tile/ops/reduce/block/block_reduce.hpp b/include/ck_tile/ops/reduce/block/block_reduce.hpp index d9df949cf..fa3007d1e 100644 --- a/include/ck_tile/ops/reduce/block/block_reduce.hpp +++ b/include/ck_tile/ops/reduce/block/block_reduce.hpp @@ -16,8 +16,8 @@ namespace ck_tile { // synchronize reduce result (cross lane reduction and broadcast on replicated dimension) template CK_TILE_DEVICE void block_tile_reduce_sync(AccDistributedTensor_& acc_tensor, - const ReduceFunc& reduce_func, - 
bool_constant = {}) + const ReduceFunc& reduce_func, + bool_constant = {}) { using Dstr = typename AccDistributedTensor_::StaticTileDistribution; using DstrEncode = typename Dstr::DstrEncode; @@ -116,7 +116,7 @@ CK_TILE_DEVICE void block_tile_reduce_sync(AccDistributedTensor_& acc_tensor, */ template CK_TILE_DEVICE void block_tile_reduce_xor_sync(AccDistributedTensor_& acc_tensor, - const ReduceFunc& reduce_func) + const ReduceFunc& reduce_func) { using Dstr = typename AccDistributedTensor_::StaticTileDistribution; using DstrEncode = typename Dstr::DstrEncode; @@ -175,9 +175,9 @@ template CK_TILE_DEVICE void block_tile_reduce(AccDistributedTensor_& acc_tensor, - const InDistributedTensor_& in_tensor, - sequence, - const ReduceFunc& reduce_func) + const InDistributedTensor_& in_tensor, + sequence, + const ReduceFunc& reduce_func) { constexpr auto I0 = number<0>{}; constexpr auto I1 = number<1>{}; @@ -250,9 +250,9 @@ template CK_TILE_DEVICE auto block_tile_reduce(const InDistributedTensor_& in_tensor, - sequence in_reduce_dims, - const ReduceFunc& reduce_func, - const InDataType_& reduce_init) + sequence in_reduce_dims, + const ReduceFunc& reduce_func, + const InDataType_& reduce_init) { using InDataType = typename InDistributedTensor_::DataType; using AccDataType = remove_cvref_t; -- GitLab From 9a8a52130d780ca449ae261bb03ae4783f18f296 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bart=C5=82omiej=20Kocot?= Date: Wed, 30 Oct 2024 17:42:50 +0100 Subject: [PATCH 028/153] Remove virtual destructors from unary ops (#1610) * Remove virtual destructors from unary ops * Fixes * Fixes * clang format fixes --- .../element/unary_element_wise_operation.hpp | 112 +++++++++++++++--- include/ck_tile/core/numeric/math.hpp | 2 +- .../host/reference/reference_elementwise.hpp | 2 +- .../host/reference/reference_permute.hpp | 2 +- .../reference/reference_rmsnorm2d_fwd.hpp | 2 +- .../add_rmsnorm2d_rdquant_fwd_shape.hpp | 2 +- ...rmsnorm2d_rdquant_fwd_pipeline_problem.hpp | 2 +- 
.../ops/fmha/pipeline/tile_fmha_shape.hpp | 2 +- .../pipeline/generic_petmute_problem.hpp | 2 +- .../rmsnorm2d/kernel/rmsnorm2d_fwd_shape.hpp | 2 +- .../rmsnorm2d_fwd_pipeline_problem.hpp | 2 +- .../ops/welford/block/block_welford.hpp | 2 +- 12 files changed, 105 insertions(+), 29 deletions(-) diff --git a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp index 712b88618..39b81ca57 100644 --- a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp +++ b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp @@ -13,15 +13,17 @@ namespace ck { namespace tensor_operation { namespace element_wise { +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wnon-virtual-dtor" struct UnaryOpBase { public: - __host__ __device__ virtual ~UnaryOpBase() = default; + __host__ __device__ ~UnaryOpBase() = default; - __host__ __device__ UnaryOpBase() = default; - __host__ __device__ UnaryOpBase(const UnaryOpBase&) = default; + __host__ __device__ constexpr UnaryOpBase() = default; + __host__ __device__ constexpr UnaryOpBase(const UnaryOpBase&) = default; + __host__ __device__ constexpr UnaryOpBase(UnaryOpBase&&) = default; __host__ __device__ UnaryOpBase& operator=(const UnaryOpBase&) = default; - __host__ __device__ UnaryOpBase(UnaryOpBase&&) = default; __host__ __device__ UnaryOpBase& operator=(UnaryOpBase&&) = default; __host__ __device__ virtual inline void operator()(float& y, const float& x) const = 0; @@ -50,8 +52,14 @@ struct PassThroughPack2 constexpr const static bool is_pack2_invocable = true; }; -struct PassThrough : public UnaryOpBase +struct PassThrough final : public UnaryOpBase { + __host__ __device__ constexpr PassThrough() = default; + __host__ __device__ constexpr PassThrough(const PassThrough&) = default; + __host__ __device__ constexpr PassThrough(PassThrough&&) = default; + __host__ __device__ PassThrough& 
operator=(const PassThrough&) = default; + __host__ __device__ PassThrough& operator=(PassThrough&&) = default; + __host__ __device__ ~PassThrough() = default; __host__ __device__ inline void operator()(float& y, const float& x) const final { y = x; } @@ -409,8 +417,15 @@ struct UnarySquare }; }; -struct UnaryAbs : public UnaryOpBase +struct UnaryAbs final : public UnaryOpBase { + __host__ __device__ constexpr UnaryAbs() = default; + __host__ __device__ constexpr UnaryAbs(const UnaryAbs&) = default; + __host__ __device__ constexpr UnaryAbs(UnaryAbs&&) = default; + __host__ __device__ UnaryAbs& operator=(const UnaryAbs&) = default; + __host__ __device__ UnaryAbs& operator=(UnaryAbs&&) = default; + __host__ __device__ ~UnaryAbs() = default; + __host__ __device__ inline void operator()(float& y, const float& x) const final { y = ck::math::abs(x); @@ -459,8 +474,15 @@ struct UnarySqrt }; }; -struct Relu : public UnaryOpBase +struct Relu final : public UnaryOpBase { + __host__ __device__ constexpr Relu() = default; + __host__ __device__ constexpr Relu(const Relu&) = default; + __host__ __device__ constexpr Relu(Relu&&) = default; + __host__ __device__ Relu& operator=(const Relu&) = default; + __host__ __device__ Relu& operator=(Relu&&) = default; + __host__ __device__ ~Relu() = default; + __host__ __device__ inline void operator()(float& y, const float& x) const final { y = x > 0 ? 
x : 0; @@ -633,8 +655,14 @@ struct Gelu } }; -struct Sigmoid : public UnaryOpBase +struct Sigmoid final : public UnaryOpBase { + __host__ __device__ constexpr Sigmoid() = default; + __host__ __device__ constexpr Sigmoid(const Sigmoid&) = default; + __host__ __device__ constexpr Sigmoid(Sigmoid&&) = default; + __host__ __device__ Sigmoid& operator=(const Sigmoid&) = default; + __host__ __device__ Sigmoid& operator=(Sigmoid&&) = default; + __host__ __device__ ~Sigmoid() = default; __host__ __device__ inline void operator()(float& y, const float& x) const final { @@ -688,8 +716,15 @@ struct Silu }; }; -struct TanH : public UnaryOpBase +struct TanH final : public UnaryOpBase { + __host__ __device__ constexpr TanH() = default; + __host__ __device__ constexpr TanH(const TanH&) = default; + __host__ __device__ constexpr TanH(TanH&&) = default; + __host__ __device__ TanH& operator=(const TanH&) = default; + __host__ __device__ TanH& operator=(TanH&&) = default; + __host__ __device__ ~TanH() = default; + __host__ __device__ inline void operator()(float& y, const float& x) const final { y = ck::math::tanh(x); @@ -959,8 +994,12 @@ struct Rcp }; }; -struct Swish : public UnaryOpBase +struct Swish final : public UnaryOpBase { + __host__ __device__ constexpr Swish(const Swish&) = default; + __host__ __device__ constexpr Swish(Swish&&) = default; + __host__ __device__ ~Swish() = default; + __host__ __device__ Swish(float beta = 1.0f) : beta_(beta) {} __host__ __device__ float get_beta() const { return beta_; } @@ -1019,8 +1058,12 @@ struct Swish : public UnaryOpBase } }; -struct SoftRelu : public UnaryOpBase +struct SoftRelu final : public UnaryOpBase { + __host__ __device__ constexpr SoftRelu(const SoftRelu&) = default; + __host__ __device__ constexpr SoftRelu(SoftRelu&&) = default; + __host__ __device__ ~SoftRelu() = default; + __host__ __device__ SoftRelu(float alpha = 1.0f) : alpha_(alpha) {} __host__ __device__ float get_alpha() const { return alpha_; } @@ -1070,8 +1113,12 
@@ struct SoftRelu : public UnaryOpBase } }; -struct Power : public UnaryOpBase +struct Power final : public UnaryOpBase { + __host__ __device__ constexpr Power(const Power&) = default; + __host__ __device__ constexpr Power(Power&&) = default; + __host__ __device__ ~Power() = default; + __host__ __device__ Power(float alpha = 0.f, float beta = 1.f, float gamma = 2.f) : alpha_(alpha), beta_(beta), gamma_(gamma) { @@ -1148,8 +1195,12 @@ struct Power : public UnaryOpBase } }; -struct ClippedRelu : public UnaryOpBase +struct ClippedRelu final : public UnaryOpBase { + __host__ __device__ constexpr ClippedRelu(const ClippedRelu&) = default; + __host__ __device__ constexpr ClippedRelu(ClippedRelu&&) = default; + __host__ __device__ ~ClippedRelu() = default; + __host__ __device__ ClippedRelu(float alpha = 0.f, float beta = 1.f) : alpha_(alpha), beta_(beta) { @@ -1205,8 +1256,11 @@ struct ClippedRelu : public UnaryOpBase } }; -struct LeakyRelu : public UnaryOpBase +struct LeakyRelu final : public UnaryOpBase { + __host__ __device__ constexpr LeakyRelu(const LeakyRelu&) = default; + __host__ __device__ constexpr LeakyRelu(LeakyRelu&&) = default; + __host__ __device__ ~LeakyRelu() = default; __host__ __device__ LeakyRelu(float alpha = 0.f) : alpha_(alpha) {} @@ -1250,8 +1304,11 @@ struct LeakyRelu : public UnaryOpBase } }; -struct Elu : public UnaryOpBase +struct Elu final : public UnaryOpBase { + __host__ __device__ constexpr Elu(const Elu&) = default; + __host__ __device__ constexpr Elu(Elu&&) = default; + __host__ __device__ ~Elu() = default; __host__ __device__ Elu(float alpha = 1.f) : alpha_(alpha) {} @@ -1296,8 +1353,11 @@ struct Elu : public UnaryOpBase } }; -struct Logistic : public UnaryOpBase +struct Logistic final : public UnaryOpBase { + __host__ __device__ constexpr Logistic(const Logistic&) = default; + __host__ __device__ constexpr Logistic(Logistic&&) = default; + __host__ __device__ ~Logistic() = default; __host__ __device__ Logistic(float alpha = 1.0f) : 
alpha_(alpha) {} @@ -1631,8 +1691,23 @@ struct DynamicUnaryOp __host__ __device__ ~DynamicUnaryOp() { - if(unary_op_ptr_) - delete unary_op_ptr_; + switch(unary_op_type_) + { + case(UnaryOpType::Swish): delete static_cast(unary_op_ptr_); break; + case(UnaryOpType::Sigmoid): delete static_cast(unary_op_ptr_); break; + case(UnaryOpType::PassThrough): delete static_cast(unary_op_ptr_); break; + case(UnaryOpType::Logistic): delete static_cast(unary_op_ptr_); break; + case(UnaryOpType::TanH): delete static_cast(unary_op_ptr_); break; + case(UnaryOpType::Relu): delete static_cast(unary_op_ptr_); break; + case(UnaryOpType::SoftRelu): delete static_cast(unary_op_ptr_); break; + case(UnaryOpType::UnaryAbs): delete static_cast(unary_op_ptr_); break; + case(UnaryOpType::Power): delete static_cast(unary_op_ptr_); break; + case(UnaryOpType::ClippedRelu): delete static_cast(unary_op_ptr_); break; + case(UnaryOpType::LeakyRelu): delete static_cast(unary_op_ptr_); break; + case(UnaryOpType::Elu): delete static_cast(unary_op_ptr_); break; + + default: break; + } } __device__ void InitUnaryOpPtrOnDevice() @@ -1721,6 +1796,7 @@ struct DynamicUnaryOp float beta; float gamma; }; +#pragma clang diagnostic pop } // namespace element_wise } // namespace tensor_operation diff --git a/include/ck_tile/core/numeric/math.hpp b/include/ck_tile/core/numeric/math.hpp index 0faf1aa04..6bdcb509b 100644 --- a/include/ck_tile/core/numeric/math.hpp +++ b/include/ck_tile/core/numeric/math.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
#pragma once diff --git a/include/ck_tile/host/reference/reference_elementwise.hpp b/include/ck_tile/host/reference/reference_elementwise.hpp index 809049fa6..65303279b 100644 --- a/include/ck_tile/host/reference/reference_elementwise.hpp +++ b/include/ck_tile/host/reference/reference_elementwise.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #pragma once diff --git a/include/ck_tile/host/reference/reference_permute.hpp b/include/ck_tile/host/reference/reference_permute.hpp index 1c8248340..14ed4f815 100644 --- a/include/ck_tile/host/reference/reference_permute.hpp +++ b/include/ck_tile/host/reference/reference_permute.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #pragma once diff --git a/include/ck_tile/host/reference/reference_rmsnorm2d_fwd.hpp b/include/ck_tile/host/reference/reference_rmsnorm2d_fwd.hpp index db6e92f4c..b14e25a85 100644 --- a/include/ck_tile/host/reference/reference_rmsnorm2d_fwd.hpp +++ b/include/ck_tile/host/reference/reference_rmsnorm2d_fwd.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
#pragma once diff --git a/include/ck_tile/ops/add_rmsnorm2d_rdquant/kernel/add_rmsnorm2d_rdquant_fwd_shape.hpp b/include/ck_tile/ops/add_rmsnorm2d_rdquant/kernel/add_rmsnorm2d_rdquant_fwd_shape.hpp index a17c53c73..4bc7db434 100644 --- a/include/ck_tile/ops/add_rmsnorm2d_rdquant/kernel/add_rmsnorm2d_rdquant_fwd_shape.hpp +++ b/include/ck_tile/ops/add_rmsnorm2d_rdquant/kernel/add_rmsnorm2d_rdquant_fwd_shape.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #pragma once diff --git a/include/ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_problem.hpp b/include/ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_problem.hpp index 106e5086b..2e6406003 100644 --- a/include/ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_problem.hpp +++ b/include/ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_problem.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #pragma once diff --git a/include/ck_tile/ops/fmha/pipeline/tile_fmha_shape.hpp b/include/ck_tile/ops/fmha/pipeline/tile_fmha_shape.hpp index 570754b22..bb33b5f02 100644 --- a/include/ck_tile/ops/fmha/pipeline/tile_fmha_shape.hpp +++ b/include/ck_tile/ops/fmha/pipeline/tile_fmha_shape.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
#pragma once diff --git a/include/ck_tile/ops/permute/pipeline/generic_petmute_problem.hpp b/include/ck_tile/ops/permute/pipeline/generic_petmute_problem.hpp index e504ed747..17f18acb5 100644 --- a/include/ck_tile/ops/permute/pipeline/generic_petmute_problem.hpp +++ b/include/ck_tile/ops/permute/pipeline/generic_petmute_problem.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #pragma once diff --git a/include/ck_tile/ops/rmsnorm2d/kernel/rmsnorm2d_fwd_shape.hpp b/include/ck_tile/ops/rmsnorm2d/kernel/rmsnorm2d_fwd_shape.hpp index fb484a106..fc4b9f470 100644 --- a/include/ck_tile/ops/rmsnorm2d/kernel/rmsnorm2d_fwd_shape.hpp +++ b/include/ck_tile/ops/rmsnorm2d/kernel/rmsnorm2d_fwd_shape.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #pragma once diff --git a/include/ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_problem.hpp b/include/ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_problem.hpp index 87cab3463..2820e1813 100644 --- a/include/ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_problem.hpp +++ b/include/ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_problem.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
#pragma once diff --git a/include/ck_tile/ops/welford/block/block_welford.hpp b/include/ck_tile/ops/welford/block/block_welford.hpp index 623e1e16d..ce73c183e 100644 --- a/include/ck_tile/ops/welford/block/block_welford.hpp +++ b/include/ck_tile/ops/welford/block/block_welford.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #pragma once -- GitLab From c3a4800c5fe1f7cbdd00f36b7bc4851e0299ddc9 Mon Sep 17 00:00:00 2001 From: carlushuang Date: Thu, 31 Oct 2024 14:54:53 +0800 Subject: [PATCH 029/153] [CK_TILE] layernorm support fused-quant/fused-add (#1604) * add prenorm/postnorm support, refactor using generate.py * update README * update README * fix format * update some description and fix format * update format * format * use non-raw for loading * format and update n4096 * dynamic-quant ready * update readme * support fused dynamic-quant * update fused-quant, with smooth * update README * update args * update some based on comment --- example/ck_tile/02_layernorm2d/CMakeLists.txt | 31 +- example/ck_tile/02_layernorm2d/README.md | 69 +- example/ck_tile/02_layernorm2d/generate.py | 670 ++++++++++++++++++ .../instances/layernorm2d_fwd_api.cpp | 155 ---- .../layernorm2d_fwd_bf16_n1024_instance.cpp | 22 - .../layernorm2d_fwd_bf16_n1536_instance.cpp | 13 - .../layernorm2d_fwd_bf16_n2048_instance.cpp | 14 - .../layernorm2d_fwd_bf16_n256_instance.cpp | 12 - .../layernorm2d_fwd_bf16_n3072_instance.cpp | 14 - .../layernorm2d_fwd_bf16_n4096_instance.cpp | 14 - ...layernorm2d_fwd_bf16_n4096_tp_instance.cpp | 14 - .../layernorm2d_fwd_bf16_n512_instance.cpp | 13 - ...layernorm2d_fwd_bf16_n64_n128_instance.cpp | 12 - .../layernorm2d_fwd_bf16_n768_instance.cpp | 12 - .../layernorm2d_fwd_fp16_n1024_instance.cpp | 22 - .../layernorm2d_fwd_fp16_n1536_instance.cpp | 13 - .../layernorm2d_fwd_fp16_n2048_instance.cpp | 14 - 
.../layernorm2d_fwd_fp16_n256_instance.cpp | 12 - .../layernorm2d_fwd_fp16_n3072_instance.cpp | 14 - .../layernorm2d_fwd_fp16_n4096_instance.cpp | 14 - ...layernorm2d_fwd_fp16_n4096_tp_instance.cpp | 14 - .../layernorm2d_fwd_fp16_n512_instance.cpp | 13 - ...layernorm2d_fwd_fp16_n64_n128_instance.cpp | 12 - .../layernorm2d_fwd_fp16_n768_instance.cpp | 12 - .../layernorm2d_fwd_instance_common.hpp | 67 -- .../02_layernorm2d/layernorm2d_fwd.cpp | 270 ++++++- .../02_layernorm2d/layernorm2d_fwd.hpp | 103 +-- .../ck_tile/02_layernorm2d/misc/dquant.png | Bin 0 -> 36863 bytes example/ck_tile/02_layernorm2d/misc/pnorm.png | Bin 0 -> 32113 bytes .../02_layernorm2d/script/perf_test.sh | 66 +- .../02_layernorm2d/script/smoke_test.sh | 54 +- include/ck_tile/core.hpp | 1 + include/ck_tile/core/numeric/int8.hpp | 104 +++ include/ck_tile/core/numeric/type_convert.hpp | 4 + .../ck_tile/core/tensor/null_tile_window.hpp | 7 + .../reference/reference_layernorm2d_fwd.hpp | 37 +- include/ck_tile/ops/add_rmsnorm2d_rdquant.hpp | 1 + include/ck_tile/ops/common.hpp | 1 + .../generic_2d_block_shape.hpp} | 7 +- include/ck_tile/ops/elementwise.hpp | 1 + include/ck_tile/ops/epilogue.hpp | 2 + .../ops/epilogue/default_2d_epilogue.hpp | 28 +- .../ops/epilogue/dynamic_quant_epilogue.hpp | 140 ++++ include/ck_tile/ops/fmha.hpp | 1 + include/ck_tile/ops/gemm.hpp | 1 + include/ck_tile/ops/image_to_column.hpp | 1 + include/ck_tile/ops/layernorm2d.hpp | 3 +- .../kernel/layernorm2d_fwd_kernel.hpp | 191 ++++- .../layernorm2d_fwd_pipeline_one_pass.hpp | 82 ++- .../layernorm2d_fwd_pipeline_problem.hpp | 12 +- .../layernorm2d_fwd_pipeline_two_pass.hpp | 79 ++- .../pipeline/layernorm2d_fwd_traits.hpp | 54 ++ include/ck_tile/ops/permute.hpp | 1 + include/ck_tile/ops/reduce.hpp | 1 + .../ck_tile/ops/reduce/block/block_reduce.hpp | 5 +- .../ops/reduce/block/block_reduce2d.hpp | 26 +- include/ck_tile/ops/rmsnorm2d.hpp | 1 + include/ck_tile/ops/softmax.hpp | 1 + include/ck_tile/ops/topk.hpp | 1 + 
include/ck_tile/ops/topk_softmax.hpp | 1 + include/ck_tile/ops/welford.hpp | 1 + 61 files changed, 1792 insertions(+), 768 deletions(-) create mode 100644 example/ck_tile/02_layernorm2d/generate.py delete mode 100644 example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_api.cpp delete mode 100644 example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n1024_instance.cpp delete mode 100644 example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n1536_instance.cpp delete mode 100644 example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n2048_instance.cpp delete mode 100644 example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n256_instance.cpp delete mode 100644 example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n3072_instance.cpp delete mode 100644 example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n4096_instance.cpp delete mode 100644 example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n4096_tp_instance.cpp delete mode 100644 example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n512_instance.cpp delete mode 100644 example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n64_n128_instance.cpp delete mode 100644 example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n768_instance.cpp delete mode 100644 example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n1024_instance.cpp delete mode 100644 example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n1536_instance.cpp delete mode 100644 example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n2048_instance.cpp delete mode 100644 example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n256_instance.cpp delete mode 100644 example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n3072_instance.cpp delete mode 100644 example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n4096_instance.cpp delete mode 100644 example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n4096_tp_instance.cpp delete mode 
100644 example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n512_instance.cpp delete mode 100644 example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n64_n128_instance.cpp delete mode 100644 example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n768_instance.cpp delete mode 100644 example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_instance_common.hpp create mode 100644 example/ck_tile/02_layernorm2d/misc/dquant.png create mode 100644 example/ck_tile/02_layernorm2d/misc/pnorm.png create mode 100644 include/ck_tile/core/numeric/int8.hpp rename include/ck_tile/ops/{layernorm2d/kernel/layernorm2d_fwd_shape.hpp => common/generic_2d_block_shape.hpp} (96%) create mode 100644 include/ck_tile/ops/epilogue/dynamic_quant_epilogue.hpp create mode 100644 include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_traits.hpp diff --git a/example/ck_tile/02_layernorm2d/CMakeLists.txt b/example/ck_tile/02_layernorm2d/CMakeLists.txt index feae5f791..1bf74bc05 100644 --- a/example/ck_tile/02_layernorm2d/CMakeLists.txt +++ b/example/ck_tile/02_layernorm2d/CMakeLists.txt @@ -1,11 +1,34 @@ +set(LAYERNORM2D_FWD_KNOWN_APIS "fwd;bwd") +set(LAYERNORM2D_FWD_ENABLE_APIS "fwd" CACHE STRING + "semicolon-separated list of APIs to generate (${LAYERNORM2D_FWD_KNOWN_APIS}) & link, or \"all\".") +if(LAYERNORM2D_FWD_ENABLE_APIS STREQUAL "all") + set(LAYERNORM2D_FWD_ENABLE_APIS ${LAYERNORM2D_FWD_KNOWN_APIS}) +endif() + +# generate a list of kernels, but not actually emit files at config sta +execute_process( + COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_LIST_DIR}/generate.py + --api ${LAYERNORM2D_FWD_ENABLE_APIS} --working_path ${CMAKE_CURRENT_BINARY_DIR} --list_blobs + RESULT_VARIABLE ret +) +if(ret AND NOT ret EQUAL 0) + message( FATAL_ERROR "Fail to generate kernels via Python. 
${ret}") +endif() + +file(STRINGS ${CMAKE_CURRENT_BINARY_DIR}/layernorm2d_fwd_blobs.txt LAYERNORM2D_FWD_GEN_BLOBS) + +add_custom_command( + OUTPUT ${LAYERNORM2D_FWD_GEN_BLOBS} + COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_LIST_DIR}/generate.py + --api ${LAYERNORM2D_FWD_ENABLE_APIS} --working_path ${CMAKE_CURRENT_BINARY_DIR} --gen_blobs +) + set(EXAMPLE_LAYERNORM2D_FWD "tile_example_layernorm2d_fwd") -# not using add_example_executable() to add this target, since we don't want this to have -# to be included in "make all/install/check" + message("adding example ${EXAMPLE_LAYERNORM2D_FWD}") -file(GLOB INSTANCE_SRCS instances/*.cpp) add_executable(${EXAMPLE_LAYERNORM2D_FWD} EXCLUDE_FROM_ALL layernorm2d_fwd.cpp) target_include_directories(${EXAMPLE_LAYERNORM2D_FWD} PRIVATE ${CMAKE_CURRENT_LIST_DIR}) -target_sources(${EXAMPLE_LAYERNORM2D_FWD} PRIVATE ${INSTANCE_SRCS}) +target_sources(${EXAMPLE_LAYERNORM2D_FWD} PRIVATE ${LAYERNORM2D_FWD_GEN_BLOBS}) set(EXAMPLE_LAYERNORM2D_FWD_COMPILE_OPTIONS) diff --git a/example/ck_tile/02_layernorm2d/README.md b/example/ck_tile/02_layernorm2d/README.md index 405325a2a..14c6fc0d6 100644 --- a/example/ck_tile/02_layernorm2d/README.md +++ b/example/ck_tile/02_layernorm2d/README.md @@ -1,6 +1,42 @@ # Layernorm2D forward -This folder contains example for Layernorm2D forward using ck_tile tile-programming implementation. +This folder contains example for Layernorm2D forward using `ck_tile` tile-programming implementation. + +# Implementation and feature support + +## welford online algorithm +We use welfold algorithm to update `mean`/`variance` block by block. For `N <=4096` case we can compute `mean`/`var`/`normalization` within one loop, we call it `one-pass`. For large N case, it is hard to keep `mean`/`var` inside register/LDS and then computation `normalization`, so we need to load input twice, first time to compute `mean`/`var` block-by-block, then load input another time to compute the `normalization`. We call it `two-pass`. 
+ +## mean/variance save +In the training case the mean/variance need to be stored out (TBD, not supported yet) + +## prenorm/postnorm + +![](misc/pnorm.png) + +Since [prenorm/postnorm](https://arxiv.org/pdf/1906.01787) is quite common in LLM blocks, this example accelerates this feature by kernel fusion. Note that `prenorm`/`postnorm` always need to elementwise-add a `shortcut` before the actual layernorm computation, and optionally store out the result to global. You can use `-fadd=1` to test `pre-add+store`, or `-fadd=2` to test `pre-add` without store out (not codegen by default). + +## smooth-quant/dynamic-quant +We support smooth/dynamic quantization for `int8` output, by setting `-fquant=1` and `-prec_o=int8`. In this case the output goes through a rowwise dynamic quantization as shown below. Note that smooth-quant requires a `(1*N)` per-channel scale as input (in fp32 in our example, though this is customizable), which is element-wise multiplied with each row of the tensor before the rowwise dynamic quant is computed. Setting `-fquant=2` skips the input per-channel scale stage and performs only the dynamic quant.
This case is supported in our kernel but is not generated by default (TBD: add a filter in generate.py to support on-demand codegen) +![](misc/dquant.png) + +``` +# assume output int8, hidden_states is [m, n] shape and in fp16/bf16 +# [m, 1] +per_token_amax, _ = torch.max( + input=torch.abs(hidden_states), + dim=-1, + keepdim=True +) +per_token_scale = per_token_amax.to(dtype=torch.float32) / 127.0 + +# quant hidden_states +hidden_states = (hidden_states / per_token_scale).to(dtype=torch.int8) + +return hidden_states, per_token_scale +# hidden_states is now int8 and will be fed to the next layer as input +# per_token_scale will be used as the dequant factor in a later layer +``` ## build ``` @@ -15,8 +51,35 @@ This will result in an executable `build/bin/tile_example_layernorm2d_fwd` ``` args: -m m dimension (default:3328) - -n m dimension (default:4096) + -n n dimension (default:4096) + -stride stride per row, if -1 then equal to n (default:-1) -e epsilon (default:1e-5) + -save_mv save mean/variance(invstd) or not. set to 1 in training case (default:0) -v cpu validation or not (default:1) - -prec precision (default:fp16) + -kname print kernel name or not (default:1) + -prec_i input precision (default:fp16) + -prec_o output precision, set auto will be the same as input (default:auto) + -prec_sx output quant scale type, set auto will be the same as input. used when fquant=1 (default:auto) + -prec_sy output quant scale type, set auto will be the same as input. used when fquant=1 or 2 (default:auto) + -fadd fused-add, 0:no fused add, 1:preadd+store, 2:preadd only (default:0) + -fquant fused-quant, 0:no, 1:smooth-dynamic-quant, 2:dynamic-quant (default:0) + -warmup cold iter (default:5) + -repeat hot iter (default:20) + ``` + +## limitations +Note that `fquant=2`, `fadd=2`, and `prec_sx/prec_sy` other than `fp32` are not generated by default, though our kernel template supports them (TBD: add a flag in generate.py to generate those instances on demand).
Beside, N>8192 case will by default using two-pass pipeline, and `-fquant=1/2` are not supported yet. + +``` +# some case +# standard fp16 layernorm 2d, m=10. n=1024 +./build/bin/tile_example_layernorm2d_fwd -m=10 -n=1024 + +# standard fp16 layernorm 2d, m=10. n=1024, fused-smooth-quant, output in int8 +./build/bin/tile_example_layernorm2d_fwd -m=10 -n=1024 -prec_o=int8 -fquant=1 + +# standard fp16 layernorm 2d, m=10. n=1024, fused-smooth-quant+fused-add-store, output in int8 +./build/bin/tile_example_layernorm2d_fwd -m=10 -n=1024 -prec_o=int8 -fquant=1 -fadd=1 + +``` \ No newline at end of file diff --git a/example/ck_tile/02_layernorm2d/generate.py b/example/ck_tile/02_layernorm2d/generate.py new file mode 100644 index 000000000..300f6c05e --- /dev/null +++ b/example/ck_tile/02_layernorm2d/generate.py @@ -0,0 +1,670 @@ +# SPDX-License-Identifier: MIT +# Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. +# generate kernel instances to speed up compilation + +import argparse +from enum import IntEnum +from pathlib import Path +import sys +from typing import List, Optional, Any +import functools +import itertools +import copy +from dataclasses import dataclass + +def get_if_str(idx, total, lase_else = True): + if idx == 0: + return 'if' + elif idx < total - 1: + return 'else if' + else: + if lase_else: + return 'else' + else: + return 'else if' + +FUSED_ADD_ENUM_STR_MAP = [ + 'no', + 'pras', # pre-norm + 'pra' ] # post-norm + +FUSED_FUSED_SWEEP_STR_MAP = [ + 'no', + 'dquant' ] + +DATA_TYPE_MAP = {'fp32' : 'float', + 'fp16' : 'ck_tile::fp16_t', + 'bf16' : 'ck_tile::bf16_t', + 'int8' : 'ck_tile::int8_t'} + +def BOOL_MAP(b_) -> str: + if b_: + return 'true' + else: + return 'false' + +class layernorm_fwd_codegen: + API_TRAITS_DEFINE = """ +// this is used to pattern-match internl kernel implementation, not to instantiate kernel +template +struct layernorm2d_fwd_traits_ +{ + using XDataType = ck_tile::remove_cvref_t; + using YDataType = 
ck_tile::remove_cvref_t; + using XScaleDataType = ck_tile::remove_cvref_t; + using YScaleDataType = ck_tile::remove_cvref_t; + + static constexpr bool is_warp_per_row = ThreadPerBlock_N_ <= warpSize; + static_assert((ThreadPerBlock_M_ * ThreadPerBlock_N_) % warpSize == 0); + static constexpr ck_tile::index_t total_warps = + (ThreadPerBlock_M_ * ThreadPerBlock_N_) / warpSize; + + // num of warps along m + static constexpr ck_tile::index_t BlockWarps_M = []() { + if constexpr(is_warp_per_row) + { + static_assert(warpSize % ThreadPerBlock_N_ == 0); + return total_warps * (warpSize / ThreadPerBlock_N_); + } + else + { + // static_assert(warpSize % ThreadPerBlock_M_ == 0); + return total_warps / (ThreadPerBlock_N_ / warpSize); + } + }(); + + // num of warps along n + static constexpr ck_tile::index_t BlockWarps_N = []() { + if constexpr(is_warp_per_row) + { + static_assert(warpSize % ThreadPerBlock_N_ == 0); + return 1; + } + else + { + static_assert(ThreadPerBlock_N_ % warpSize == 0); + return ThreadPerBlock_N_ / warpSize; + } + }(); + + static constexpr ck_tile::index_t Repeat_M = Repeat_M_; + static constexpr ck_tile::index_t Repeat_N = Repeat_N_; + + static constexpr ck_tile::index_t Block_M = Repeat_M_ * ThreadPerBlock_M_; + static constexpr ck_tile::index_t Block_N = Repeat_N_ * ThreadPerBlock_N_ * Vector_N_; + + static constexpr ck_tile::index_t Warp_M = ThreadPerBlock_M_ / BlockWarps_M; + static constexpr ck_tile::index_t Warp_N = ThreadPerBlock_N_ / BlockWarps_N * Vector_N_; + + using BlockTile = ck_tile::sequence; + using BlockWarps = ck_tile::sequence; + using WarpTile = ck_tile::sequence; + using Vector = ck_tile::sequence<1, Vector_N_>; + + using Shape = ck_tile::Generic2dBlockShape; + + static constexpr bool kPadN = kPadN_; + static constexpr bool kSaveMeanInvStd = kSaveMeanInvStd_; + static constexpr bool kTwoPass = kTwoPass_; + static constexpr ck_tile::index_t kFusedAdd = kFusedAdd_; + static constexpr ck_tile::index_t kFusedQuant = kFusedQuant_; +}; + 
+template +using traits_ = layernorm2d_fwd_traits_; +""" + API_COMMON_HEADER = """ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include "layernorm2d_fwd.hpp" +#include +#include + +#pragma once + +using S = ck_tile::stream_config; +using A = layernorm2d_fwd_args; + +{F_traits_define} + +template +float layernorm2d_fwd_(const S& s, A a) +{{ + using XDataType = typename Traits_::XDataType; + using YDataType = typename Traits_::YDataType; + using XScaleDataType = typename Traits_::XScaleDataType; + using YScaleDataType = typename Traits_::YScaleDataType; + using ComputeDataType = typename LayerNormTypeConfig::ComputeDataType; + + using PipelineTraits = ck_tile::Layernorm2dFwdTraits(Traits_::kFusedAdd), + static_cast(Traits_::kFusedQuant)>; + using PipelineProblem = ck_tile::Layernorm2dFwdPipelineProblem< + typename LayerNormTypeConfig::XDataType, + typename LayerNormTypeConfig::GammaDataType, + typename LayerNormTypeConfig::BetaDataType, + typename LayerNormTypeConfig::ComputeDataType, + typename LayerNormTypeConfig::YDataType, + typename LayerNormTypeConfig::MeanDataType, + typename LayerNormTypeConfig::InvStdDataType, + typename LayerNormTypeConfig::XScaleDataType, + typename LayerNormTypeConfig::YScaleDataType, + typename Traits_::Shape, + PipelineTraits>; + + using OnePassPipeline = ck_tile::Layernorm2dFwdPipelineOnePass; + using TwoPassPipeline = ck_tile::Layernorm2dFwdPipelineTwoPass; + using Pipeline = std::conditional_t; + + using Default2DEpilogueProblem = ck_tile::Default2DEpilogueProblem; + using Default2DEpilogue = ck_tile::Default2DEpilogue; + + using DynamicQuantEpilogueProblem = ck_tile::DynamicQuantEpilogueProblem>; + + using DynamicQuantEpilogue = ck_tile::DynamicQuantEpilogue; + + using Epilogue = std::conditional_t; + + using Kernel = ck_tile::Layernorm2dFwd; + + const dim3 grids = Kernel::GridSize(a); + constexpr dim3 blocks = Kernel::BlockSize(); + constexpr 
ck_tile::index_t kBlockPerCu = 1; + + auto kargs = Kernel::MakeKargs(a); + if(s.log_level_ > 0) + std::cout << ", " << Kernel::GetName() << std::flush; + + return ck_tile::launch_kernel( + s, ck_tile::make_kernel(Kernel{{}}, grids, blocks, 0, kargs)); +}} + +""" + + API_BASE = """ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include "layernorm2d_fwd.hpp" + +{F_traits_define} + +// Note: this internal API only declare, not define here, otherwise will block `make -j` +template +float layernorm2d_fwd_(const ck_tile::stream_config& s, layernorm2d_fwd_args a); + +float layernorm2d_fwd(layernorm2d_fwd_traits t, + layernorm2d_fwd_args a, + const ck_tile::stream_config& s) +{{ + float r = -1; +{F_dispatch} + return r; +}} + +""" + + API_PER_DTYPE=""" {F_if}(t.prec_i == \"{F_i_type}\" && t.prec_o == \"{F_o_type}\"){{ +{F_per_n_case} + }} +""" + API_PER_N_CASE=""" {F_if} {F_N_COND} {{ +{F_inner_dispatch} + }} +""" + API_INNER_CASE=""" {F_if} {F_VEC_COND} + r={F_instance_func}(s, a); +""" + + INSTANCE_BASE = """ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "layernorm2d_fwd_api_common.hpp" + +// clang-format off +// prec_i prec_o prec_sy rm rn tm tn vn pd mv 2p add sweep +{F_instance_def} +// clang-format on + +""" + + def __init__(self, working_path, kernel_filter): + self.working_path = working_path + self.kernel_filter = kernel_filter + + class k_fuesd_add_enum(IntEnum): + F_NO_ADD = 0 + F_PRE_ADD = 1 + F_PRE_ADD_STORE_RESIDUAL = 2 + + class k_fused_sweep_enum(IntEnum): + F_NO_SWEEP = 0 + F_RENORM = 1 + F_DYNAMIC_QUANT = 2 + + @dataclass + class k_traits: + F_kPadN : bool + F_kSaveMeanInvStd : bool + F_kTwoPass : bool + F_kFusedAdd : Any #: layernorm_fwd_codegen.k_fuesd_add_enum + F_kFusedQuant : Any #: layernorm_fwd_codegen.k_fused_sweep_enum + + @dataclass + class k_shape: + F_BlockTile : List[int] + F_WarpPerBlock : List[int] + F_WarpTile : List[int] + F_Vector_ : List[int] + @property + def F_BlockSize(self) -> int: + return functools.reduce(lambda a, b: a*b, self.F_WarpTile) + + @dataclass + class k_problem: + F_XDataType : str + F_GammaDataType : str + F_BetaDataType : str + F_ComputeDataType : str + F_YDataType : str + F_MeanDataType : str + F_InvStdDataType : str + F_BlockShape : str + F_Traits : Any #k_traits + + @dataclass + class k_pipeline_one_pass: + F_Problem : Any #k_problem + + @dataclass + class k_pipeline_two_pass: + F_Problem : Any #k_problem + + @dataclass + class default_2d_epilogue_problem: + F_AccDataType : str + F_ODataType : str + F_kPadM : bool + F_kPadN : bool + + @dataclass + class default_2d_epilogue: + F_problem : Any + + @dataclass + class k_kernel: + F_pipeline : Any + F_epilogue : Any + + @dataclass + class h_traits: + F_XDataType : str + F_YDataType : str + F_XScaleDataType : str + F_YScaleDataType : str + F_Repeat_M : int + F_Repeat_N : int + F_ThreadPerBlock_M : int + F_ThreadPerBlock_N : int + F_Vector_N : int + F_kPadN : bool + F_kSaveMeanInvStd_ : bool + F_kTwoPass_ : bool + F_kFusedAdd : int + F_kFusedQuant : int + + @property + def trait_name(self) ->str: + t_ = 
f'{DATA_TYPE_MAP[self.F_XDataType]}, {DATA_TYPE_MAP[self.F_YDataType]}, {DATA_TYPE_MAP[self.F_XScaleDataType]}, {DATA_TYPE_MAP[self.F_YScaleDataType]}, {self.F_Repeat_M:2}, {self.F_Repeat_N:2}, {self.F_ThreadPerBlock_M:2}, {self.F_ThreadPerBlock_N:4}' + t_ += f', {self.F_Vector_N:2}, {BOOL_MAP(self.F_kPadN):5}, {BOOL_MAP(self.F_kSaveMeanInvStd_):5}' + t_ += f', {BOOL_MAP(self.F_kTwoPass_):5}, {self.F_kFusedAdd:4}, {self.F_kFusedQuant:4}' + return t_ + + # string when calling this kernel + @property + def call_name(self) -> str: + return f'layernorm2d_fwd_>' + + # string when define this kernel + @property + def def_name(self) -> str: + return f'template float layernorm2d_fwd_>(const S&, A);' + + # this class hold kernel under same source file + @dataclass + class h_instance: + F_DataTypePair : str + F_N : str + F_add : int + F_sweep : int + instance_list : List[Any] # List[h_traits] + + @property + def name(self) -> str: + prec_i, prec_o = self.F_DataTypePair.split(',') + dtype_str = f'{prec_i}' if prec_i == prec_o else f'{prec_i}_{prec_o}' + nnn = f'layernorm2d_fwd_{dtype_str}_n{self.F_N}' + if self.F_add != 0: + nnn = nnn + '_' + FUSED_ADD_ENUM_STR_MAP[self.F_add] + if self.F_sweep != 0: + nnn = nnn + '_' + FUSED_FUSED_SWEEP_STR_MAP[self.F_sweep] + return nnn + + @property + def instance_name(self) ->str: + return self.name + + @property + def content(self) ->str: + instance_defs = '' + for ins in self.instance_list: + instance_defs += ins.def_name + '\n' + return layernorm_fwd_codegen.INSTANCE_BASE.format(F_instance_def=instance_defs) + + @property + def name_api(self) -> str: + return 'layernorm2d_fwd_api' + + @property + def name_common_header(self) -> str: + return 'layernorm2d_fwd_api_common' + + @property + def content_api(self) -> str: + # 1 sort based on dtype + t_dtype_dict = dict() + blobs = self.get_blobs() + for blob in blobs: + if blob.F_DataTypePair not in t_dtype_dict: + t_dtype_dict[blob.F_DataTypePair] = {} + if blob.F_N not in 
t_dtype_dict[blob.F_DataTypePair]: + t_dtype_dict[blob.F_DataTypePair][blob.F_N] = [] + t_dtype_dict[blob.F_DataTypePair][blob.F_N].append(blob) + + d_str = '' + for i_d, dtype_ in enumerate(t_dtype_dict): + blob_per_t = t_dtype_dict[dtype_] + n_str = '' + for i_n, n_ in enumerate(blob_per_t): + blob_per_n = blob_per_t[n_] + inner_str = "" + for i_b, b_ in enumerate(blob_per_n): + # generate single kernel instance file + #vec_str = "" + for i_ins, ins in enumerate(b_.instance_list): + idx_in_n = i_b * len(b_.instance_list) + i_ins + len_in_n = len(blob_per_n) * len(b_.instance_list) + # _if = 'if' if i_ins == 0 else 'else if' + if ins.F_kFusedQuant == 0: + _sweep_cond = 't.fused_quant == {f_fused_sweep}'.format(f_fused_sweep = ins.F_kFusedQuant) + elif ins.F_kFusedQuant == 1: + _sweep_cond = 't.fused_quant == {f_fused_sweep} && (t.prec_sx == \"{f_sx_type}\" && t.prec_sy == \"{f_sy_type}\")'.format( + f_fused_sweep = ins.F_kFusedQuant, f_sx_type=ins.F_XScaleDataType, f_sy_type=ins.F_YScaleDataType) + elif ins.F_kFusedQuant == 2: + _sweep_cond = 't.fused_quant == {f_fused_sweep} && (t.prec_sy == \"{f_sy_type}\")'.format( + f_fused_sweep = ins.F_kFusedQuant, f_sy_type=ins.F_YScaleDataType) + _cond = '((a.n % {f_vec_n} == 0) && (t.fused_add == {f_fused_add}) && ({f_sweep_cond}))'.format( + f_vec_n = ins.F_Vector_N, f_fused_add = ins.F_kFusedAdd, + f_sweep_cond = _sweep_cond) + inner_str += self.API_INNER_CASE.format(F_if = get_if_str(idx_in_n, len_in_n, False), + F_VEC_COND = _cond, F_instance_func=ins.call_name) + #inner_str = inner_str + vec_str + n_cnd = f'(a.n <= {n_})' if (i_n < len(blob_per_t) - 1) else '' + n_str += self.API_PER_N_CASE.format(F_if = get_if_str(i_n, len(blob_per_t)), F_N_COND=n_cnd, F_inner_dispatch=inner_str) + prec_i, prec_o = dtype_.split(',') + d_str += self.API_PER_DTYPE.format(F_if = get_if_str(i_d, len(t_dtype_dict), False), F_i_type=prec_i, F_o_type=prec_o, F_per_n_case=n_str) + + api_base = 
self.API_BASE.format(F_traits_define=self.API_TRAITS_DEFINE, F_dispatch=d_str) + return api_base + + @property + def content_common_header(self) -> str: + return self.API_COMMON_HEADER.format(F_traits_define=self.API_TRAITS_DEFINE) + + def get_blobs(self): + h_traits = layernorm_fwd_codegen.h_traits + h_instance = layernorm_fwd_codegen.h_instance + + dynamic_quant_out_dtype = ['int8'] + # some predefined support range + # (prec_i,prec_o) for simplicity this string will be used as key for dict + scale_list = [('fp32,fp32')] + dtype_list = [('fp16,fp16'), ('bf16,bf16'), + ('fp16,int8'), ('bf16,int8')] # NOTE: only fused-dynamic-quant use int8 out + #fused_add_list = [0, 1, 2] + #fused_sweep_list = [0, 1, 2] # NOTE: only single pass can use fused dynamic quant + fused_add_list = [0, 1] + fused_sweep_list = [0, 1] # NOTE: only single pass can use fused dynamic quant + + # rm rn tm tn vn pd mv 2p add sweep + h_trait_dict = {'64' : [ h_traits('x', 'y', 'xs', 'ys', 1, 1, 4, 64, 1, True, False, False, 0, 0)], + '128' : [ h_traits('x', 'y', 'xs', 'ys', 1, 1, 4, 64, 2, True, False, False, 0, 0), + h_traits('x', 'y', 'xs', 'ys', 1, 2, 4, 64, 1, True, False, False, 0, 0)], + '256' : [ h_traits('x', 'y', 'xs', 'ys', 1, 1, 4, 64, 4, True, False, False, 0, 0), + h_traits('x', 'y', 'xs', 'ys', 1, 2, 4, 64, 2, True, False, False, 0, 0), + h_traits('x', 'y', 'xs', 'ys', 1, 4, 4, 64, 1, True, False, False, 0, 0)], + '512' : [ h_traits('x', 'y', 'xs', 'ys', 1, 1, 4, 64, 8, True, False, False, 0, 0), + h_traits('x', 'y', 'xs', 'ys', 1, 2, 4, 64, 4, True, False, False, 0, 0), + h_traits('x', 'y', 'xs', 'ys', 1, 4, 4, 64, 2, True, False, False, 0, 0), + h_traits('x', 'y', 'xs', 'ys', 1, 8, 4, 64, 1, True, False, False, 0, 0)], + '768' : [ h_traits('x', 'y', 'xs', 'ys', 1, 3, 4, 64, 4, True, False, False, 0, 0), + h_traits('x', 'y', 'xs', 'ys', 1, 6, 4, 64, 2, True, False, False, 0, 0), + h_traits('x', 'y', 'xs', 'ys', 1, 12, 4, 64, 1, True, False, False, 0, 0)], + '1024' :[ h_traits('x', 
'y', 'xs', 'ys', 1, 1, 2, 128, 8, True, False, False, 0, 0), + h_traits('x', 'y', 'xs', 'ys', 1, 2, 2, 128, 4, True, False, False, 0, 0), + h_traits('x', 'y', 'xs', 'ys', 1, 4, 2, 128, 2, True, False, False, 0, 0), + h_traits('x', 'y', 'xs', 'ys', 1, 4, 1, 256, 1, True, False, False, 0, 0)], + '1536' :[ h_traits('x', 'y', 'xs', 'ys', 1, 3, 4, 64, 8, True, False, False, 0, 0), + h_traits('x', 'y', 'xs', 'ys', 1, 3, 2, 128, 4, True, False, False, 0, 0), + h_traits('x', 'y', 'xs', 'ys', 1, 3, 1, 256, 2, True, False, False, 0, 0), + h_traits('x', 'y', 'xs', 'ys', 1, 6, 1, 256, 1, True, False, False, 0, 0)], + '2048' :[ h_traits('x', 'y', 'xs', 'ys', 1, 1, 1, 256, 8, True, False, False, 0, 0), + h_traits('x', 'y', 'xs', 'ys', 1, 2, 1, 256, 4, True, False, False, 0, 0), + h_traits('x', 'y', 'xs', 'ys', 1, 4, 1, 256, 2, True, False, False, 0, 0), + h_traits('x', 'y', 'xs', 'ys', 1, 8, 1, 256, 1, True, False, False, 0, 0)], + '3072' :[ h_traits('x', 'y', 'xs', 'ys', 1, 3, 1, 128, 8, True, False, False, 0, 0), + h_traits('x', 'y', 'xs', 'ys', 1, 3, 1, 256, 4, True, False, False, 0, 0), + h_traits('x', 'y', 'xs', 'ys', 1, 6, 1, 256, 2, True, False, False, 0, 0), + h_traits('x', 'y', 'xs', 'ys', 1, 3, 1,1024, 1, True, False, False, 0, 0)], + '4096' :[ h_traits('x', 'y', 'xs', 'ys', 1, 2, 1, 256, 8, True, False, False, 0, 0), + h_traits('x', 'y', 'xs', 'ys', 1, 4, 1, 256, 4, True, False, False, 0, 0), + h_traits('x', 'y', 'xs', 'ys', 1, 2, 1,1024, 2, True, False, False, 0, 0), + h_traits('x', 'y', 'xs', 'ys', 1, 4, 1,1024, 1, True, False, False, 0, 0)], + '6144' :[ h_traits('x', 'y', 'xs', 'ys', 1, 3, 1, 256, 8, True, False, False, 0, 0), + h_traits('x', 'y', 'xs', 'ys', 1, 3, 1, 512, 4, True, False, False, 0, 0), + h_traits('x', 'y', 'xs', 'ys', 1, 3, 1,1024, 2, True, False, False, 0, 0), + h_traits('x', 'y', 'xs', 'ys', 1, 6, 1,1024, 1, True, False, False, 0, 0)], + '8192' :[ h_traits('x', 'y', 'xs', 'ys', 1, 4, 1, 256, 8, True, False, False, 0, 0), + h_traits('x', 'y', 
'xs', 'ys', 1, 4, 1, 512, 4, True, False, False, 0, 0), + h_traits('x', 'y', 'xs', 'ys', 1, 4, 1,1024, 2, True, False, False, 0, 0), + h_traits('x', 'y', 'xs', 'ys', 1, 8, 1,1024, 1, True, False, False, 0, 0)], + 'big' :[ h_traits('x', 'y', 'xs', 'ys', 1, 2, 1, 256, 8, True, False, True, 0, 0), + h_traits('x', 'y', 'xs', 'ys', 1, 4, 1, 256, 4, True, False, True, 0, 0), + h_traits('x', 'y', 'xs', 'ys', 1, 2, 1,1024, 2, True, False, True, 0, 0), + h_traits('x', 'y', 'xs', 'ys', 1, 4, 1,1024, 1, True, False, True, 0, 0)]} + total_blob = list() + for hs_key in h_trait_dict: + hs = h_trait_dict[hs_key] + current_n = hs[0].F_Repeat_N * hs[0].F_ThreadPerBlock_N * hs[0].F_Vector_N + for dtype, scale_type, fused_add, fused_quant in itertools.product(dtype_list, scale_list, fused_add_list, fused_sweep_list): + prec_i, prec_o = dtype.split(',') + scale_x, scale_y = scale_type.split(',') + if prec_o in dynamic_quant_out_dtype and fused_quant != 1: + continue # skip non dynamic quant case + if fused_quant == 1 and hs_key == 'big': + continue + current_hs = list() + for chs_ in hs: + h_ = copy.copy(chs_) # copy the base instance out + h_.F_XDataType = prec_i + h_.F_YDataType = prec_o + h_.F_XScaleDataType = scale_y + h_.F_YScaleDataType = scale_x + h_.F_kFusedAdd = fused_add + h_.F_kFusedQuant = fused_quant + current_hs.append(h_) # + "\n" + #f.write(str(f.parent / GEN_DIR / (blobs.api_common_header_ + current_n_str = 'big' if hs_key == 'big' else current_n + total_blob.append(h_instance(dtype, current_n_str, fused_add, fused_quant, current_hs)) + return total_blob + + def list_blobs(self) -> None: + w_p = Path(self.working_path) + list_p = w_p / 'layernorm2d_fwd_blobs.txt' + blobs = self.get_blobs() + with list_p.open('a') as list_f: + # api related file + list_f.write(str(w_p / (self.name_api + ".cpp")) + "\n") + list_f.write(str(w_p / (self.name_common_header + ".hpp")) + "\n") + # kernel instance file + for b in blobs: + list_f.write(str(w_p / (b.name + ".cpp")) + "\n") + + 
def gen_blobs(self) -> None: + w_p = Path(self.working_path) + (w_p / (self.name_api + ".cpp")).write_text(self.content_api) + (w_p / (self.name_common_header + ".hpp")).write_text(self.content_common_header) + blobs = self.get_blobs() + for b in blobs: + (w_p / (b.name + ".cpp")).write_text(b.content) + +def list_blobs(args): + api_list = args.api.split(',') + for api in api_list: + if api == 'fwd': + layernorm_fwd_codegen(args.working_path, args.filter).list_blobs() + + +def gen_blobs(args): + api_list = args.api.split(',') + for api in api_list: + if api == 'fwd': + layernorm_fwd_codegen(args.working_path, args.filter).gen_blobs() + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + prog="generate", + description="gen API for CK layernorm kernel", + ) + parser.add_argument( + "-a", + "--api", + default='fwd[all]', + required=False, + help="supply API(s) to generate (default: fwd). separated by comma." + ) + + # the directory for list_blobs/gen_blobs to write files into + parser.add_argument( + "-w", + "--working_path", + default="./", + required=False, + help="the path where all the blobs are going to be generated" + ) + + # this script have 2 modes + # 1) list_blobs mode, will generate a txt file with all the files going to be generated. + # this is useful in build system like cmake to construct source code dependency, by + # reading the content out of this file + # 2) gen_blobs mode, will generate the actuall kernel instance and api. 
If in framework + # like FA, only need to use this mode + parser.add_argument( + "-l", + "--list_blobs", + action='store_true', + help="list all the kernels to a file, " + ) + + parser.add_argument( + "-g", + "--gen_blobs", + action='store_true', + help="generate all kernels into different tile" + ) + + # TODO: if using filter, must apply same value to output_dir and list_blobs + parser.add_argument( + "-f", + "--filter", + required=False, + help="filter out kernels that need to generate, using fnmatch module" + ) + + parser.add_argument( + "-t", + "--traits", + default="all", + required=False, + help="enable/disable some feature. default generate all" + ) + + parser.add_argument( + "-r", + "--receipt", + default=0, + required=False, + help="codegen receipt." + ) + + args = parser.parse_args() + + # print(f'{args.list_blobs}-{args.gen_blobs}') + if (args.gen_blobs and args.list_blobs) or ((not args.gen_blobs) and (not args.list_blobs)): + print('gen_blobs/list_blobs must specify only one option') + sys.exit() + + p = Path(args.working_path) + if not p.exists(): + p.mkdir() + + if args.list_blobs: + list_blobs(args) + else: + gen_blobs(args) diff --git a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_api.cpp b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_api.cpp deleted file mode 100644 index f2f51de5d..000000000 --- a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_api.cpp +++ /dev/null @@ -1,155 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -#include -#include "layernorm2d_fwd.hpp" - -template -using trait_ = layernorm2d_fwd_traits_; - -template -float layernorm2d_fwd_b16_(layernorm2d_fwd_traits /*t*/, - layernorm2d_fwd_args a, - const ck_tile::stream_config& s) -{ -#if 1 - float r = -1; - // clang-format off - // rm rn tm tn vn pd mv 2p - if(a.n <= 64) { - r = layernorm2d_fwd_>(s, a); - } - else if(a.n <= 128) { - if (a.n % 2 == 0) - r = layernorm2d_fwd_>(s, a); - else - r = layernorm2d_fwd_>(s, a); - } - else if(a.n <= 256) { - if (a.n % 4 == 0) - r = layernorm2d_fwd_>(s, a); - else if (a.n % 2 == 0) - r = layernorm2d_fwd_>(s, a); - else - r = layernorm2d_fwd_>(s, a); - } - else if(a.n <= 512) { - if (a.n % 8 == 0) - r = layernorm2d_fwd_>(s, a); - else if (a.n % 4 == 0) - r = layernorm2d_fwd_>(s, a); - else if (a.n % 2 == 0) - r = layernorm2d_fwd_>(s, a); - else - r = layernorm2d_fwd_>(s, a); - } - else if(a.n <= 768) { - if (a.n % 4 == 0) - r = layernorm2d_fwd_>(s, a); - else if (a.n % 2 == 0) - r = layernorm2d_fwd_>(s, a); - else - r = layernorm2d_fwd_>(s, a); - } - else if(a.n <= 1024) { - if (a.n % 8 == 0) - r = layernorm2d_fwd_>(s, a); - else if (a.n % 4 == 0) - r = layernorm2d_fwd_>(s, a); - else if (a.n % 2 == 0) - r = layernorm2d_fwd_>(s, a); - else - r = layernorm2d_fwd_>(s, a); - } - else if(a.n <= 1536) { - if (a.n % 8 == 0) - r = layernorm2d_fwd_>(s, a); - else if (a.n % 4 == 0) - r = layernorm2d_fwd_>(s, a); - else if (a.n % 2 == 0) - r = layernorm2d_fwd_>(s, a); - else - r = layernorm2d_fwd_>(s, a); - } - else if(a.n <= 2048) { - if (a.n % 8 == 0) - r = layernorm2d_fwd_>(s, a); - else if (a.n % 4 == 0) - r = layernorm2d_fwd_>(s, a); - else if (a.n % 2 == 0) - r = layernorm2d_fwd_>(s, a); - else - r = layernorm2d_fwd_>(s, a); - } - else if(a.n <= 3072) { - if (a.n % 8 == 0) - r = layernorm2d_fwd_>(s, a); - else if (a.n % 4 == 0) - r = layernorm2d_fwd_>(s, a); - else if (a.n % 2 == 0) - r = layernorm2d_fwd_>(s, a); - else - r = layernorm2d_fwd_>(s, a); - } - else if(a.n <= 4096) { - if 
(a.n % 8 == 0) - r = layernorm2d_fwd_>(s, a); - else if (a.n % 4 == 0) - r = layernorm2d_fwd_>(s, a); - else if (a.n % 2 == 0) - r = layernorm2d_fwd_>(s, a); - else - r = layernorm2d_fwd_>(s, a); - } - else if(a.n > 4096) { - if (a.n % 8 == 0) - r = layernorm2d_fwd_>(s, a); - else if (a.n % 4 == 0) - r = layernorm2d_fwd_>(s, a); - else if (a.n % 2 == 0) - r = layernorm2d_fwd_>(s, a); - else - r = layernorm2d_fwd_>(s, a); - } - return r; -#else - return layernorm2d_fwd_>(s, a); -#endif - // clang-format on -} - -float layernorm2d_fwd(layernorm2d_fwd_traits t, - layernorm2d_fwd_args a, - const ck_tile::stream_config& s) -{ - - float r = -1; - if(t.data_type.compare("fp16") == 0) - { - return layernorm2d_fwd_b16_(t, a, s); - } - else if(t.data_type.compare("bf16") == 0) - { - return layernorm2d_fwd_b16_(t, a, s); - } - if(r < 0) - throw std::runtime_error("Without supported instances!"); - - return r; -} diff --git a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n1024_instance.cpp b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n1024_instance.cpp deleted file mode 100644 index 2a20d1e05..000000000 --- a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n1024_instance.cpp +++ /dev/null @@ -1,22 +0,0 @@ - -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -#include "layernorm2d_fwd_instance_common.hpp" - -// clang-format off -// rm rn tm tn vn pd mv 2p -#if 0 -template float layernorm2d_fwd_>(const S&, A); -template float layernorm2d_fwd_>(const S&, A); -template float layernorm2d_fwd_>(const S&, A); -template float layernorm2d_fwd_>(const S&, A); - -template float layernorm2d_fwd_>(const S&, A); -#endif - -template float layernorm2d_fwd_>(const S&, A); -template float layernorm2d_fwd_>(const S&, A); -template float layernorm2d_fwd_>(const S&, A); -template float layernorm2d_fwd_>(const S&, A); -// clang-format on diff --git a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n1536_instance.cpp b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n1536_instance.cpp deleted file mode 100644 index d043efc86..000000000 --- a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n1536_instance.cpp +++ /dev/null @@ -1,13 +0,0 @@ - -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -#include "layernorm2d_fwd_instance_common.hpp" - -// clang-format off -// rm rn tm tn vn pd mv 2p -template float layernorm2d_fwd_>(const S&, A); -template float layernorm2d_fwd_>(const S&, A); -template float layernorm2d_fwd_>(const S&, A); -template float layernorm2d_fwd_>(const S&, A); -// clang-format on diff --git a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n2048_instance.cpp b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n2048_instance.cpp deleted file mode 100644 index a6ffc8cd2..000000000 --- a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n2048_instance.cpp +++ /dev/null @@ -1,14 +0,0 @@ - -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -#include "layernorm2d_fwd_instance_common.hpp" - -// clang-format off -// rm rn tm tn vn pd mv 2p -template float layernorm2d_fwd_>(const S&, A); -template float layernorm2d_fwd_>(const S&, A); -template float layernorm2d_fwd_>(const S&, A); -template float layernorm2d_fwd_>(const S&, A); - -// clang-format on diff --git a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n256_instance.cpp b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n256_instance.cpp deleted file mode 100644 index 80beeca67..000000000 --- a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n256_instance.cpp +++ /dev/null @@ -1,12 +0,0 @@ - -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -#include "layernorm2d_fwd_instance_common.hpp" - -// clang-format off -// rm rn tm tn vn pd mv 2p -template float layernorm2d_fwd_>(const S&, A); -template float layernorm2d_fwd_>(const S&, A); -template float layernorm2d_fwd_>(const S&, A); -// clang-format on diff --git a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n3072_instance.cpp b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n3072_instance.cpp deleted file mode 100644 index b362a550a..000000000 --- a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n3072_instance.cpp +++ /dev/null @@ -1,14 +0,0 @@ - -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -#include "layernorm2d_fwd_instance_common.hpp" - -// clang-format off -// rm rn tm tn vn pd mv 2p -template float layernorm2d_fwd_>(const S&, A); -template float layernorm2d_fwd_>(const S&, A); -template float layernorm2d_fwd_>(const S&, A); -template float layernorm2d_fwd_>(const S&, A); - -// clang-format on diff --git a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n4096_instance.cpp b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n4096_instance.cpp deleted file mode 100644 index 9c2d78999..000000000 --- a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n4096_instance.cpp +++ /dev/null @@ -1,14 +0,0 @@ - -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -#include "layernorm2d_fwd_instance_common.hpp" - -// clang-format off -// rm rn tm tn vn pd mv 2p -template float layernorm2d_fwd_>(const S&, A); -template float layernorm2d_fwd_>(const S&, A); -template float layernorm2d_fwd_>(const S&, A); -template float layernorm2d_fwd_>(const S&, A); - -// clang-format on diff --git a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n4096_tp_instance.cpp b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n4096_tp_instance.cpp deleted file mode 100644 index c0c75f878..000000000 --- a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n4096_tp_instance.cpp +++ /dev/null @@ -1,14 +0,0 @@ - -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -#include "layernorm2d_fwd_instance_common.hpp" - -// clang-format off -// rm rn tm tn vn pd mv 2p -template float layernorm2d_fwd_>(const S&, A); -template float layernorm2d_fwd_>(const S&, A); -template float layernorm2d_fwd_>(const S&, A); -template float layernorm2d_fwd_>(const S&, A); - -// clang-format on diff --git a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n512_instance.cpp b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n512_instance.cpp deleted file mode 100644 index 1bcd0f8a7..000000000 --- a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n512_instance.cpp +++ /dev/null @@ -1,13 +0,0 @@ - -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -#include "layernorm2d_fwd_instance_common.hpp" - -// clang-format off -// rm rn tm tn vn pd mv 2p -template float layernorm2d_fwd_>(const S&, A); -template float layernorm2d_fwd_>(const S&, A); -template float layernorm2d_fwd_>(const S&, A); -template float layernorm2d_fwd_>(const S&, A); -// clang-format on diff --git a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n64_n128_instance.cpp b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n64_n128_instance.cpp deleted file mode 100644 index 6b25fce8c..000000000 --- a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n64_n128_instance.cpp +++ /dev/null @@ -1,12 +0,0 @@ - -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -#include "layernorm2d_fwd_instance_common.hpp" - -// clang-format off -// rm rn tm tn vn pd mv 2p -template float layernorm2d_fwd_>(const S&, A); -template float layernorm2d_fwd_>(const S&, A); -template float layernorm2d_fwd_>(const S&, A); -// clang-format on diff --git a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n768_instance.cpp b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n768_instance.cpp deleted file mode 100644 index c4400f0f2..000000000 --- a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n768_instance.cpp +++ /dev/null @@ -1,12 +0,0 @@ - -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -#include "layernorm2d_fwd_instance_common.hpp" - -// clang-format off -// rm rn tm tn vn pd mv 2p -template float layernorm2d_fwd_>(const S&, A); -template float layernorm2d_fwd_>(const S&, A); -template float layernorm2d_fwd_>(const S&, A); -// clang-format on diff --git a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n1024_instance.cpp b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n1024_instance.cpp deleted file mode 100644 index 7f0e4898c..000000000 --- a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n1024_instance.cpp +++ /dev/null @@ -1,22 +0,0 @@ - -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -#include "layernorm2d_fwd_instance_common.hpp" - -// clang-format off -// rm rn tm tn vn pd mv 2p -#if 0 -template float layernorm2d_fwd_>(const S&, A); -template float layernorm2d_fwd_>(const S&, A); -template float layernorm2d_fwd_>(const S&, A); -template float layernorm2d_fwd_>(const S&, A); - -template float layernorm2d_fwd_>(const S&, A); -#endif - -template float layernorm2d_fwd_>(const S&, A); -template float layernorm2d_fwd_>(const S&, A); -template float layernorm2d_fwd_>(const S&, A); -template float layernorm2d_fwd_>(const S&, A); -// clang-format on diff --git a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n1536_instance.cpp b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n1536_instance.cpp deleted file mode 100644 index 8c3a42cc4..000000000 --- a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n1536_instance.cpp +++ /dev/null @@ -1,13 +0,0 @@ - -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -#include "layernorm2d_fwd_instance_common.hpp" - -// clang-format off -// rm rn tm tn vn pd mv 2p -template float layernorm2d_fwd_>(const S&, A); -template float layernorm2d_fwd_>(const S&, A); -template float layernorm2d_fwd_>(const S&, A); -template float layernorm2d_fwd_>(const S&, A); -// clang-format on diff --git a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n2048_instance.cpp b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n2048_instance.cpp deleted file mode 100644 index 04d8bc153..000000000 --- a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n2048_instance.cpp +++ /dev/null @@ -1,14 +0,0 @@ - -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -#include "layernorm2d_fwd_instance_common.hpp" - -// clang-format off -// rm rn tm tn vn pd mv 2p -template float layernorm2d_fwd_>(const S&, A); -template float layernorm2d_fwd_>(const S&, A); -template float layernorm2d_fwd_>(const S&, A); -template float layernorm2d_fwd_>(const S&, A); - -// clang-format on diff --git a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n256_instance.cpp b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n256_instance.cpp deleted file mode 100644 index c32574749..000000000 --- a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n256_instance.cpp +++ /dev/null @@ -1,12 +0,0 @@ - -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -#include "layernorm2d_fwd_instance_common.hpp" - -// clang-format off -// rm rn tm tn vn pd mv 2p -template float layernorm2d_fwd_>(const S&, A); -template float layernorm2d_fwd_>(const S&, A); -template float layernorm2d_fwd_>(const S&, A); -// clang-format on diff --git a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n3072_instance.cpp b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n3072_instance.cpp deleted file mode 100644 index c71db57a6..000000000 --- a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n3072_instance.cpp +++ /dev/null @@ -1,14 +0,0 @@ - -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -#include "layernorm2d_fwd_instance_common.hpp" - -// clang-format off -// rm rn tm tn vn pd mv 2p -template float layernorm2d_fwd_>(const S&, A); -template float layernorm2d_fwd_>(const S&, A); -template float layernorm2d_fwd_>(const S&, A); -template float layernorm2d_fwd_>(const S&, A); - -// clang-format on diff --git a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n4096_instance.cpp b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n4096_instance.cpp deleted file mode 100644 index f3ca0932e..000000000 --- a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n4096_instance.cpp +++ /dev/null @@ -1,14 +0,0 @@ - -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -#include "layernorm2d_fwd_instance_common.hpp" - -// clang-format off -// rm rn tm tn vn pd mv 2p -template float layernorm2d_fwd_>(const S&, A); -template float layernorm2d_fwd_>(const S&, A); -template float layernorm2d_fwd_>(const S&, A); -template float layernorm2d_fwd_>(const S&, A); - -// clang-format on diff --git a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n4096_tp_instance.cpp b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n4096_tp_instance.cpp deleted file mode 100644 index 242f1d2dd..000000000 --- a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n4096_tp_instance.cpp +++ /dev/null @@ -1,14 +0,0 @@ - -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -#include "layernorm2d_fwd_instance_common.hpp" - -// clang-format off -// rm rn tm tn vn pd mv 2p -template float layernorm2d_fwd_>(const S&, A); -template float layernorm2d_fwd_>(const S&, A); -template float layernorm2d_fwd_>(const S&, A); -template float layernorm2d_fwd_>(const S&, A); - -// clang-format on diff --git a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n512_instance.cpp b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n512_instance.cpp deleted file mode 100644 index e3bfa8e3a..000000000 --- a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n512_instance.cpp +++ /dev/null @@ -1,13 +0,0 @@ - -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -#include "layernorm2d_fwd_instance_common.hpp" - -// clang-format off -// rm rn tm tn vn pd mv 2p -template float layernorm2d_fwd_>(const S&, A); -template float layernorm2d_fwd_>(const S&, A); -template float layernorm2d_fwd_>(const S&, A); -template float layernorm2d_fwd_>(const S&, A); -// clang-format on diff --git a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n64_n128_instance.cpp b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n64_n128_instance.cpp deleted file mode 100644 index 90d960cf0..000000000 --- a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n64_n128_instance.cpp +++ /dev/null @@ -1,12 +0,0 @@ - -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -#include "layernorm2d_fwd_instance_common.hpp" - -// clang-format off -// rm rn tm tn vn pd mv 2p -template float layernorm2d_fwd_>(const S&, A); -template float layernorm2d_fwd_>(const S&, A); -template float layernorm2d_fwd_>(const S&, A); -// clang-format on diff --git a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n768_instance.cpp b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n768_instance.cpp deleted file mode 100644 index 0960a95c3..000000000 --- a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n768_instance.cpp +++ /dev/null @@ -1,12 +0,0 @@ - -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -#include "layernorm2d_fwd_instance_common.hpp" - -// clang-format off -// rm rn tm tn vn pd mv 2p -template float layernorm2d_fwd_>(const S&, A); -template float layernorm2d_fwd_>(const S&, A); -template float layernorm2d_fwd_>(const S&, A); -// clang-format on diff --git a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_instance_common.hpp b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_instance_common.hpp deleted file mode 100644 index 22895e8ed..000000000 --- a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_instance_common.hpp +++ /dev/null @@ -1,67 +0,0 @@ - -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -#include -#include "layernorm2d_fwd.hpp" -#include - -#pragma once - -using S = ck_tile::stream_config; -using A = layernorm2d_fwd_args; - -template -using trait_ = layernorm2d_fwd_traits_; - -template -float layernorm2d_fwd_(const S& s, A a) -{ - using DataType = typename Traits_::DataType; - - using PipelineProblem = ck_tile::Layernorm2dFwdPipelineProblem< - typename LayerNormTypeConfig::XDataType, - typename LayerNormTypeConfig::GammaDataType, - typename LayerNormTypeConfig::BetaDataType, - typename LayerNormTypeConfig::ComputeDataType, - typename LayerNormTypeConfig::YDataType, - typename LayerNormTypeConfig::MeanDataType, - typename LayerNormTypeConfig::InvStdDataType, - typename Traits_::Shape, - Traits_::kPadN, - Traits_::kSaveMeanInvStd, - Traits_::kTwoPass>; - - using OnePassPipeline = ck_tile::Layernorm2dFwdPipelineOnePass; - using TwoPassPipeline = ck_tile::Layernorm2dFwdPipelineTwoPass; - using Pipeline = std::conditional_t; - - using Kernel = ck_tile::Layernorm2dFwd; - - const dim3 grids = Kernel::GridSize(a); - constexpr dim3 blocks = Kernel::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = 1; - - auto kargs = Kernel::MakeKargs(a); - if(s.log_level_ > 0) - std::cout << ", " << Kernel::GetName() << std::flush; - - return ck_tile::launch_kernel( - s, ck_tile::make_kernel(Kernel{}, grids, blocks, 0, kargs)); -} diff --git a/example/ck_tile/02_layernorm2d/layernorm2d_fwd.cpp b/example/ck_tile/02_layernorm2d/layernorm2d_fwd.cpp index 4f12d9103..43f4e8c72 100644 --- a/example/ck_tile/02_layernorm2d/layernorm2d_fwd.cpp +++ b/example/ck_tile/02_layernorm2d/layernorm2d_fwd.cpp @@ -1,5 +1,6 @@ #include "ck_tile/host.hpp" #include "layernorm2d_fwd.hpp" +#include #include // different threshold for different dtype @@ -29,7 +30,16 @@ auto create_args(int argc, char* argv[]) .insert("save_mv", "0", "save mean/variance(invstd) or not. 
set to 1 in training case") .insert("v", "1", "cpu validation or not") .insert("kname", "1", "print kernel name or not") - .insert("prec", "fp16", "precision") + .insert("prec_i", "fp16", "input precision") + .insert("prec_o", "auto", "output precision, set auto will be the same as input") + .insert("prec_sx", + "auto", + "output quant scale type, set auto will use fp32. used when fquant=1") + .insert("prec_sy", + "auto", + "output quant scale type, set auto will use fp32. used when fquant=1 or 2") + .insert("fadd", "0", "fused-add, 0:no fused add, 1:preadd+store, 2:preadd only") + .insert("fquant", "0", "fused-quant, 0:no, 1:smooth-dynamic-quant, 2:dynamic-quant") .insert("warmup", "5", "cold iter") .insert("repeat", "20", "hot iter"); @@ -37,7 +47,11 @@ auto create_args(int argc, char* argv[]) return std::make_tuple(result, arg_parser); } -template +template bool run(const ck_tile::ArgParser& arg_parser) { ck_tile::index_t m = arg_parser.get_int("m"); @@ -45,21 +59,46 @@ bool run(const ck_tile::ArgParser& arg_parser) ck_tile::index_t stride = arg_parser.get_int("stride"); if(stride < 0) stride = n; - float epsilon = arg_parser.get_float("e"); - std::string data_type = arg_parser.get_str("prec"); - int kname = arg_parser.get_int("kname"); - int do_validation = arg_parser.get_int("v"); - int warmup = arg_parser.get_int("warmup"); - int repeat = arg_parser.get_int("repeat"); + float epsilon = arg_parser.get_float("e"); + std::string prec_i = arg_parser.get_str("prec_i"); + std::string prec_o = arg_parser.get_str("prec_o"); + std::string prec_sx = arg_parser.get_str("prec_sx"); + std::string prec_sy = arg_parser.get_str("prec_sy"); + if(prec_o == "auto") + { + prec_o = prec_i; + } + if(prec_sx == "auto") + { + prec_sx = "fp32"; + } + if(prec_sy == "auto") + { + prec_sy = "fp32"; + } + + int kname = arg_parser.get_int("kname"); + int do_validation = arg_parser.get_int("v"); + int warmup = arg_parser.get_int("warmup"); + int repeat = arg_parser.get_int("repeat"); + int 
fused_add = arg_parser.get_int("fadd"); + int fused_quant = arg_parser.get_int("fquant"); + if(fused_quant == 1 && prec_o != "int8") + { + std::cout << "if fused_quant is 1, only support \"-prec_o=int8\" case" << std::endl; + return false; + } assert(stride >= n); - using TypeConfig = LayerNormTypeConfig; + using TypeConfig = LayerNormTypeConfig; - using XDataType = typename TypeConfig::XDataType; - using YDataType = typename TypeConfig::YDataType; - using GammaDataType = typename TypeConfig::GammaDataType; - using BetaDataType = typename TypeConfig::BetaDataType; + using XDataType = typename TypeConfig::XDataType; + using YDataType = typename TypeConfig::YDataType; + using GammaDataType = typename TypeConfig::GammaDataType; + using BetaDataType = typename TypeConfig::BetaDataType; + using XResidualDataType = XDataType; + using YResidualDataType = XDataType; using MeanDataType = std::conditional_t; @@ -73,36 +112,72 @@ bool run(const ck_tile::ArgParser& arg_parser) ck_tile::HostTensor gamma_host({n}); ck_tile::HostTensor beta_host({n}); + ck_tile::HostTensor x_residual_host({m, n}, {stride, 1}); + ck_tile::HostTensor y_residual_host({m, n}, {stride, 1}); + ck_tile::HostTensor y_host_ref({m, n}, {stride, 1}); ck_tile::HostTensor y_host_dev({m, n}, {stride, 1}); ck_tile::HostTensor mean_host_ref({m}); ck_tile::HostTensor invStd_host_ref({m}); + ck_tile::HostTensor y_scale_host_ref({m}); + ck_tile::HostTensor y_scale_host_dev({m}); + + ck_tile::HostTensor x_scale_host({n}); + ck_tile::HostTensor x_scale_host_dev({n}); ck_tile::FillUniformDistribution{-.5f, .5f}(x_host); ck_tile::FillUniformDistribution{-.5f, .5f}(gamma_host); ck_tile::FillUniformDistribution{-.5f, .5f}(beta_host); + ck_tile::FillUniformDistribution{-1.f, 1.f}(x_scale_host); ck_tile::DeviceMem x_buf(x_host.get_element_space_size_in_bytes()); ck_tile::DeviceMem gamma_buf(gamma_host.get_element_space_size_in_bytes()); ck_tile::DeviceMem beta_buf(beta_host.get_element_space_size_in_bytes()); 
ck_tile::DeviceMem y_buf(y_host_dev.get_element_space_size_in_bytes()); + ck_tile::DeviceMem y_scale_buf(y_scale_host_dev.get_element_space_size_in_bytes()); + ck_tile::DeviceMem x_scale_buf(x_scale_host_dev.get_element_space_size_in_bytes()); + + ck_tile::DeviceMem x_residual_buf(x_residual_host.get_element_space_size_in_bytes()); + ck_tile::DeviceMem y_residual_buf(y_residual_host.get_element_space_size_in_bytes()); x_buf.ToDevice(x_host.data()); gamma_buf.ToDevice(gamma_host.data()); beta_buf.ToDevice(beta_host.data()); + x_residual_buf.ToDevice(x_residual_host.data()); + x_scale_buf.ToDevice(x_scale_host.data()); - std::cout << "[" << data_type << "]" + auto prec_str = [&]() { + auto base_str = prec_i; + if(prec_i != prec_o) + { + base_str += "|" + prec_o; + } + if(fused_quant == 1) + { + base_str += std::string("(") + prec_sy + ")"; + } + return base_str; + }(); + + std::cout << "[" << prec_str << "]" << " m:" << m << ", n:" << n << ", stride:" << stride << std::flush; - layernorm2d_fwd_traits traits{data_type, SaveMeanVar}; + layernorm2d_fwd_traits traits{ + prec_i, prec_o, prec_sx, prec_sy, SaveMeanVar, fused_add, fused_quant}; layernorm2d_fwd_args args{x_buf.GetDeviceBuffer(), + fused_add != 0 ? x_residual_buf.GetDeviceBuffer() : nullptr, + fused_quant == 1 ? x_scale_buf.GetDeviceBuffer() : nullptr, gamma_buf.GetDeviceBuffer(), beta_buf.GetDeviceBuffer(), + y_buf.GetDeviceBuffer(), - nullptr, - nullptr, + fused_add == 1 ? y_residual_buf.GetDeviceBuffer() : nullptr, + fused_quant != 0 ? y_scale_buf.GetDeviceBuffer() : nullptr, + nullptr, // p_mean, unsupported yet + nullptr, // p_invStd, unsupported yet + epsilon, m, n, @@ -111,6 +186,12 @@ bool run(const ck_tile::ArgParser& arg_parser) float ave_time = layernorm2d_fwd( traits, args, ck_tile::stream_config{nullptr, true, kname ? 1 : 0, warmup, repeat}); + if(ave_time < 0) + { + std::cout << " not supported!" 
<< std::endl << std::flush; + return false; + } + std::size_t num_byte = sizeof(XDataType) * m * n + sizeof(GammaDataType) * n + sizeof(BetaDataType) * n + sizeof(YDataType) * m * n; @@ -122,6 +203,17 @@ bool run(const ck_tile::ArgParser& arg_parser) if(do_validation) { // reference + if(fused_add != 0) + { + // fused pre_add/pre_add_store + // TODO we accumulate directly to x_host for simplcity here... + + std::transform(x_host.mData.cbegin(), + x_host.mData.cend(), + x_residual_host.mData.cbegin(), + x_host.mData.begin(), + std::plus{}); + } ck_tile::reference_layernorm2d_fwd( x_host, gamma_host, beta_host, y_host_ref, mean_host_ref, invStd_host_ref, epsilon); + if(fused_quant != 0) + { + auto dquant_functor = [&](int m_, auto& o_, auto& acc_) { + int N_ = acc_.mDesc.get_lengths()[1]; + if(fused_quant == 1) + { + for(int n_ = 0; n_ < N_; n_++) + { + // input smooth outlier + acc_(m_, n_) = + acc_(m_, n_) * ck_tile::type_convert(x_scale_host(n_)); + } + } + ComputeDataType absmax = static_cast(0); + for(int n_ = 0; n_ < N_; n_++) + { + const auto a = ck_tile::abs(acc_(m_, n_)); + absmax = a > absmax ? 
a : absmax; + } + // printf("cpu:absmax:%f\n", absmax); + ComputeDataType y_scale = absmax / static_cast(127.0); + y_scale_host_ref(m_) = ck_tile::type_convert(y_scale); + for(int n_ = 0; n_ < N_; n_++) + { + o_(m_, n_) = ck_tile::type_convert(acc_(m_, n_) / y_scale); + } + }; + + ck_tile::reference_layernorm2d_fwd(x_host, + gamma_host, + beta_host, + y_host_ref, + mean_host_ref, + invStd_host_ref, + epsilon, + dquant_functor); + } + else + { + ck_tile::reference_layernorm2d_fwd( + x_host, gamma_host, beta_host, y_host_ref, mean_host_ref, invStd_host_ref, epsilon); + } + y_buf.FromDevice(y_host_dev.data()); - auto [rtol, atol] = get_elimit(); + ck_tile::HostTensor sy_host_dev({m, n}, {stride, 1}); + if(fused_add == 1) + { + y_residual_buf.FromDevice(sy_host_dev.data()); + } + + auto [rtol, atol] = get_elimit(); + if(stride == n) { pass = ck_tile::check_err( y_host_dev, y_host_ref, std::string("OUT Error: Incorrect results!"), rtol, atol); + if(fused_add == 1) + { + pass &= ck_tile::check_err( + sy_host_dev, x_host, std::string("ADD Error: Incorrect results!"), rtol, atol); + } } else { @@ -153,8 +312,30 @@ bool run(const ck_tile::ArgParser& arg_parser) std::string("] Error: Incorrect results!"), rtol, atol); + if(fused_add == 1) + { + std::vector sy_host_dev_row( + sy_host_dev.begin() + i_r * stride, sy_host_dev.begin() + i_r * stride + n); + std::vector sy_host_ref_row( + x_host.begin() + i_r * stride, x_host.begin() + i_r * stride + n); + pass &= ck_tile::check_err(sy_host_dev_row, + sy_host_ref_row, + std::string("ADD[") + std::to_string(i_r) + + std::string("] Error: Incorrect results!"), + rtol, + atol); + } } } + if(fused_quant == 1) + { + y_scale_buf.FromDevice(y_scale_host_dev.data()); + pass &= ck_tile::check_err(y_scale_host_dev, + y_scale_host_ref, + std::string("SCALE Error: Incorrect results!"), + rtol, + atol); + } std::cout << ", valid:" << (pass ? 
"y" : "n") << std::flush << std::endl; } @@ -168,23 +349,56 @@ int main(int argc, char* argv[]) if(!result) return -1; - const std::string data_type = arg_parser.get_str("prec"); - int save_mv = arg_parser.get_int("save_mv"); - if(data_type == "fp16" && save_mv) + std::string prec_i = arg_parser.get_str("prec_i"); + std::string prec_o = arg_parser.get_str("prec_o"); + std::string prec_sx = arg_parser.get_str("prec_sx"); + std::string prec_sy = arg_parser.get_str("prec_sy"); + + if(prec_o == "auto") + { + prec_o = prec_i; + } + if(prec_sx == "auto") { - return run(arg_parser) ? 0 : -2; + prec_sx = "fp32"; } - else if(data_type == "fp16" && !save_mv) + if(prec_sy == "auto") { - return run(arg_parser) ? 0 : -2; + prec_sy = "fp32"; } - else if(data_type == "bf16" && save_mv) + int save_mv = arg_parser.get_int("save_mv"); + + // no dynamic quant case + if(prec_i == "fp16" && prec_o == "fp16" && prec_sx == "fp32" && prec_sy == "fp32" && save_mv) + { + return run(arg_parser) ? 0 : -2; + } + else if(prec_i == "fp16" && prec_o == "fp16" && prec_sx == "fp32" && prec_sy == "fp32" && + !save_mv) + { + return run(arg_parser) ? 0 : -2; + } + else if(prec_i == "bf16" && prec_o == "bf16" && prec_sx == "fp32" && prec_sy == "fp32" && + save_mv) + { + return run(arg_parser) ? 0 : -2; + } + else if(prec_i == "bf16" && prec_o == "bf16" && prec_sx == "fp32" && prec_sy == "fp32" && + !save_mv) + { + return run(arg_parser) ? 0 : -2; + } + + // dynamic quant case, only in inference + else if(prec_i == "fp16" && prec_o == "int8" && prec_sx == "fp32" && prec_sy == "fp32" && + !save_mv) { - return run(arg_parser) ? 0 : -2; + return run(arg_parser) ? 0 : -2; } - else if(data_type == "bf16" && !save_mv) + else if(prec_i == "bf16" && prec_o == "int8" && prec_sx == "fp32" && prec_sy == "fp32" && + !save_mv) { - return run(arg_parser) ? 0 : -2; + return run(arg_parser) ? 
0 : -2; } return -3; diff --git a/example/ck_tile/02_layernorm2d/layernorm2d_fwd.hpp b/example/ck_tile/02_layernorm2d/layernorm2d_fwd.hpp index 861e4a023..a0f2db0e8 100644 --- a/example/ck_tile/02_layernorm2d/layernorm2d_fwd.hpp +++ b/example/ck_tile/02_layernorm2d/layernorm2d_fwd.hpp @@ -8,31 +8,35 @@ #include "ck_tile/ops/layernorm2d.hpp" #include -template +template struct LayerNormTypeConfig; -template <> -struct LayerNormTypeConfig +template +struct LayerNormTypeConfig { using XDataType = ck_tile::half_t; - using YDataType = ck_tile::half_t; + using YDataType = OutType; using GammaDataType = ck_tile::half_t; using BetaDataType = ck_tile::half_t; using MeanDataType = ck_tile::half_t; using InvStdDataType = ck_tile::half_t; using ComputeDataType = float; + using XScaleDataType = XScaleDataType_; + using YScaleDataType = YScaleDataType_; }; -template <> -struct LayerNormTypeConfig +template +struct LayerNormTypeConfig { using XDataType = ck_tile::bf16_t; - using YDataType = ck_tile::bf16_t; + using YDataType = OutType; using GammaDataType = ck_tile::bf16_t; using BetaDataType = ck_tile::bf16_t; using MeanDataType = ck_tile::bf16_t; using InvStdDataType = ck_tile::bf16_t; using ComputeDataType = float; + using XScaleDataType = XScaleDataType_; + using YScaleDataType = YScaleDataType_; }; // runtime args @@ -40,82 +44,21 @@ struct layernorm2d_fwd_args : public ck_tile::Layernorm2dFwdHostArgs { }; -// this is used to pattern-match internl kernel implementation, not to instantiate kernel -template -struct layernorm2d_fwd_traits_ -{ - using DataType = ck_tile::remove_cvref_t; - - static constexpr bool is_warp_per_row = ThreadPerBlock_N_ <= warpSize; - static_assert((ThreadPerBlock_M_ * ThreadPerBlock_N_) % warpSize == 0); - static constexpr ck_tile::index_t total_warps = - (ThreadPerBlock_M_ * ThreadPerBlock_N_) / warpSize; - - // num of warps along m - static constexpr ck_tile::index_t BlockWarps_M = []() { - if constexpr(is_warp_per_row) - { - static_assert(warpSize 
% ThreadPerBlock_N_ == 0); - return total_warps * (warpSize / ThreadPerBlock_N_); - } - else - { - // static_assert(warpSize % ThreadPerBlock_M_ == 0); - return total_warps / (ThreadPerBlock_N_ / warpSize); - } - }(); - - // num of warps along n - static constexpr ck_tile::index_t BlockWarps_N = []() { - if constexpr(is_warp_per_row) - { - static_assert(warpSize % ThreadPerBlock_N_ == 0); - return 1; - } - else - { - static_assert(ThreadPerBlock_N_ % warpSize == 0); - return ThreadPerBlock_N_ / warpSize; - } - }(); - - static constexpr ck_tile::index_t Repeat_M = Repeat_M_; - static constexpr ck_tile::index_t Repeat_N = Repeat_N_; - - static constexpr ck_tile::index_t Block_M = Repeat_M_ * ThreadPerBlock_M_; - static constexpr ck_tile::index_t Block_N = Repeat_N_ * ThreadPerBlock_N_ * Vector_N_; - - static constexpr ck_tile::index_t Warp_M = ThreadPerBlock_M_ / BlockWarps_M; - static constexpr ck_tile::index_t Warp_N = ThreadPerBlock_N_ / BlockWarps_N * Vector_N_; - - using BlockTile = ck_tile::sequence; - using BlockWarps = ck_tile::sequence; - using WarpTile = ck_tile::sequence; - using Vector = ck_tile::sequence<1, Vector_N_>; - - using Shape = ck_tile::Layernorm2dShape; - - static constexpr bool kPadN = kPadN_; - static constexpr bool kSaveMeanInvStd = kSaveMeanInvStd_; - static constexpr bool kTwoPass = kTwoPass_; -}; - -template -float layernorm2d_fwd_(const ck_tile::stream_config& s, layernorm2d_fwd_args a); - // This is the public API, will be generated by script struct layernorm2d_fwd_traits { - std::string data_type; - bool save_mean_var; + std::string prec_i; // input precision + std::string prec_o; // output precision + + // if fused_quant == 1, need set prec_sx/prec_sy to proper string, otherwise can set + // arbitrary(will skip check) if fused_quant == 2, need set prec_sy to proper string, otherwise + // can set arbitrary(will skip check) + std::string prec_sx; // x-scale, used for [1*N] input smooth quant + std::string prec_sy; // y-scale, used for 
[M*1] output for next layer + + bool save_mean_var; // + int fused_add; // 0:no-add, 1:pre-add-store, 2:pre-add + int fused_quant; // 0:no-sweep, 1:smooth-dynamic-quant, 2:dynamic-quant }; float layernorm2d_fwd(layernorm2d_fwd_traits, layernorm2d_fwd_args, const ck_tile::stream_config&); diff --git a/example/ck_tile/02_layernorm2d/misc/dquant.png b/example/ck_tile/02_layernorm2d/misc/dquant.png new file mode 100644 index 0000000000000000000000000000000000000000..28b1a61a14ea6774191fc2ac54f195cb86477f9b GIT binary patch literal 36863 zcmce-RahNS(v;O0vH&CAQ;#uSXgk-ou>o{J}|JTe(~?$ zlw6iiGoV~B1~zDgql+;P$B0}ZlBi*+B!QoWZODV>}fhZ zq>X(d?lJ`zPu+MC%xp<$T53z#Xsu~r4P!<0{{$Ky^oI#U0EztXDqHUpILyB*xh^nY zh<{f_zkE9XtDzE;;O4(uU^F;iZ2s#C6A=~g@5%=r8kXSSl^_(v{~XTm|L5UoNZEwZ z0lvD2r2`|}jD1XU@d{#F0ER7F*j$f2q`%e;qkOSRh-BR=u9@J6y{mj_^E|WLw_c2) z6!&!2M73)=2&ymMH&L5k`nzNwJ9t>l{G9ez#YJT`is2AZ+b!0- z13UIsQD;AUhDyq6{&{N;1GP~?%r0|#a;6uI=VqTD*)SuClj{-CzEW@_#DB1#967fI z|IbgYFrcOb5sl#|aBBNP>)Fp5d&jO^R3XKXDt;sO!Pj`BsP~p*F)hTY~EtU?SA;JtfuaWk>330??!KJ7biyEp^-I? 
z>m(xX(Ce%Y#;@!bg-%^@sZjlU!!6c)S&8zMxz;-|IiFyNDIHcE&e?tSm&^D(pT&iM z&FzXBDhJWM-8WI3I_^Zr2M&$~?gm+go(zAtRJH|}Z>dA8@Mc9I3_xw2XhppHV0e~$ z!_}ST24e-L{uKT6sr~yoh2esRU4T(?0sp@8v3uRc_7b+Z~EUO zlSG{a&2N;JOAzi0jj_*h7IU2|7htASJfF8p%qMEDF_V3_Bwgpu$_r}YPc`T0$#u3k zoH-4DF@#$z%)SurklhFLA~pS zL4gBbdwCf|mU2!36(fV9ttA>+`2cos+=O@28BE}LeTZl(UL#B{QSEWYH>W(4DzZNu< zObss#3H5&1PSlxyE>>KIBK2MQpnCUCn+SVMU)kYSu50))s@OQOG}aX|)1%ILSFL!0 zi#Twtj0df6K}NzL9)H&a`8R^9KS5SZ%hP~2aMGPmgvpjJr&&JNe|B?cV5u8xR{Q}x zk33Q8TVHj4U+KancGfwDSxA(uE@5}z*>JslQ{`^m_%o&V^%;q!@^`pj zJ-!U30hW~M2>fb!K-u#Mvc1jqXeEbq8=;cAXPp3YfBr#)30~v1_no{|6tDEF?f~Ye zs8+Vzwx|eGwJgcMAwz{PxJg4eBfFkv|Jq(!ZrWX&Sp}WW%cktBK3g|pv*4gESv%ZvMd zlRjLp^;8;)?1_yYCGRkBO^AZ6)QN^9|7=;2D$r;hS>fx0sHRc7gz@ANWdmI5MvhaC z`T>`dkM1-rMKHm^Q_6r(9 zf&9vRA+@;5s@q;qp^M&}3(n;B5y8&co#xJyiMI@0KBjCUoJbrp6m$04zQoDS%O5>n zbr3SbDBvBRsF6LT78a00j3W&@H<=6p#|Q?$;1KdXZ0e@)_;fcNBsGxU!)mM+L62KV-9G6d-t@lrXs zrFKuM6BCC{95W#4@2@hXa+lg{MYslcNVbcl=FiRZ5L*q&l}+Z97uct(oL% zeT40}{>%%SJ3#X%q1uR-9o9q_nciW9?2x>Nu~Db0oFL*v+QVvg?#16x|J#==(2&S3 zjHMNP8xE|o1JwK4ZlTC__3M+YwPp7Il6?X;Dqyqi@U}LQqt&()mxhrn2;qsbBJp3= zzr6eO{|p2+H?k*><+frjn~GiyXxR$D8L~#0Cu^n3cVhp9Y!%KE?fHgeo6)}+U^7f9nLzKRjI=1(8hK|jAN zYYVU!2kDMB3Oh51=I5fR%&l9DG(@0Z@5M!Jth{-$x`b2z9WWjew4$1)o!6ZdW1?Q^ zlVZ6Od?0`O-AX^b2jR0t%b_WG##;S+-s~#JiOdGU%$B&@du1DEMbyzRQcU)rP7XhQ zUqqvRoTp9PAzz>5EP0fp4hfnFx#}SuO-b>c{28-p)jve@6L1JyOM< z8Tn>3j=#W0s)274q-q3Z2n7aw23$t2IyQ`J; z=ZOkC>P_m-^k2=lV^&t4IV!eH0;{65^SLbnJJ!a{wdSyHY1FE7I(E0+zw50MDGWWS z{thZ9584R)QSspUH}RG&UKUF=1;#2l&F|HD2SE*v%-Fy-qIb*a!u+2;^`?!lCLhiG zpaf$?%hvYZF4s5jq4x0HzgDwb;wBlXO|Y~+C!{bR{LiPDnEzQmfzc}~(DN)SO+^@7 z)%a*DjC8q0L!y+Ccd{<^Bz&H|r^c{+A$|QxYo-L*sC*&Q?9i&A(%zDilTEnsqq$NKvy9`*kjc+(P2nkp65;rM#@)4vJ!1?S?zWl6?fYN~>U*(?PI zSLNQtpgES-E}y~wor^o=BO%l-39<$VZfm-_IR#IjU=^_4ums;$k z{7b1#f5$yott}H{;XvePQe&zdYo%EGPW@kCX_efPq%z)2U4J zkFEGrgVKis2y`j6GT0m#A^OP|foAMfh5ny+(ExQS?lwXj_jRxA+;1H+NOOV&`n?`q zV#c4pYp~#vi^@031)!&Y=oBgIvPIZF;e=ENF+&sPDev%p?ceL_4FC=IAr3O@FP`tC 
z_1*al5Z^=`i~mDp`=fiTe|vcKD>%_Zj|N_K>^0UMNw3j9x>KwW`i{`qOlkF5oN1lV zvTa$Th|mMbak3w0%ReveLii5{oNsd z{i)pnbAG&kVYcV z`jhwp_6U!)bd3|R(^%Uk76WN})Q?GCH2?wklL_hN(K-4VPK`bY7!_DGj ztdJNJaA&z|px;EqUhc*9dtIUFh>KOjj=(bzsY97ST?*SsvPIU7+$Nrv(^(W>y|6mf zE4J3wR|SKLE3jpsO13=40hjTNC3DL32J2yNV6DehNb^gMGD{H4?_)>yqYp|N3s=b( z(@ZTE;(8gJ*@7~AB(M*)|GajQ6xj6B3+iLq57;y><7H!HH%W8lphl(&9X@sXiInAd zdCYbOnm8n|3hHEF zWc3~ydq}T}crC{)b!0bxe${x4;6F~hZF$=6_te%gq#7K}n7c>v6~?f5QnU927A-v# zh0vd%Ln~RxUCpct$X6OsEwC=SeER8S5OO^#+OG5aX%OkKoS&V>*~_>;yCKstQvbwg zaAGiRCT8Be%qVmFCwHRb{in?iIFKp7FdBIybZzwqKKUlnK>Q~*7dQ`*l4^ck!NA})Fb~(@N)hiF zZC2c!MW|t)INtPuNUcfUX1PiuSW&xjqm2EZTs3bI0v&TA>&$07VoI$YpTGo7KY_L@ z4wQ`8lpeznDHEQ|u{R#UKy(hRgG0NJ@^$c%cWh@dg8hLLlL6UJnbn?~@`vY=2l&-3 z@)G?Km##CqpbCb+4so%=zMB?Xi}7NWUVRN-NWzvDE*jT)ZsR}yw7@#JeXs`jmc0k1 zL1|KPhROEPRkko%JP zLKcNh=xS32BJNaT+`~=lNo)JDCphJcalv!9j$bq3n3tyZyfH%2;-niPjlPmf?og*l zE_a3wJ?FAU*KUrdm3e-o zH0n`7Fmv{a@evfyo*Q0mIxAXboAH$lsN?x1>K=E1X@QsL>6M|sXTb3bZh`jkPRoF& zuyo>M47kweF*#j5&i~Kh5I!vXiU+wX{dKQ_4|SotLo z^n(kG_QouvS>xxz;|LINpkd?VSD)4#uXInd14P@F#61!04b6j&cZg%>D+)FkH(eIG z>u7>X z-+zVo-GV&u^54BT12rC<5Z_RQ{lYX1$0MFI^ujSy_G@wG|8;aK8~)ZNyL^VXexh8H(M8LB_PQY@zQB@3bom{VcHGstdaYdbzr~@M+*@^ z!(g6v2L8xl>Bx}=s!7NdaOsHNJo&2rO7hS4t(keR!7O6L|lPjwO= zFE*dil)$JO2Z{3}ILado9RgL~gOAPf379>KE}% z4kZliBU%!ls>c42(Ard=Sa;ahB}*E~&!$sBmXn`7xf?5eNl)V{D!Wl2y28IRHc&@C zo|bb^GlqJ+{vvP8B-*({R+eI6NW$Riw}fcF=inmRT?rAybA8ufv>Qd6&yv5&--({C?C^6kp4*oms!b6 z!KXTS@VkisKDG6OZE(F7S~PzS;W6D<(yKZ;beLOA2Ng7H7nMyD{95QYUtx_FTK1so z@z0k~TXw$>*$mr;lH%3Rj_jyyD2cv9HEFEAw#5jUcV>rv1a~pRW#M0s@{ zcebIE*~rFKy9WR)i4`Strji_;d>DWK>{RpOy+0_IEth%%@iIzXj7Ezlr;*>AUebKb zh>+(UNHNJ;I84W_5;`(a=Z|ki%i35A-sX|IN@`iOo02_n8E)q?4%#cJUu-U*;Z}Eg z%i1F}>%$Sdn6%W4VIFgB<})Jk+P{wWOM!vIc%T6xt!65--^adfzTcqQrUcIQYf?>s ztB%ZU_rDr}#8WUCf9it^&<1S7&g;B0J*;+A@I9O{vtPl$A3lRb8uTwVg0_elj*sIvaVR8tEOxBd@HGetYsacfY^?*O zNT4RTpe9eBAkRrbc#J}D(`P%~1$D}@%L$uy=9es`yFFY~Px0B&A|Ki19Q6NfbmUbp zj$eE0=pFYcSB{#VFWbJq@T|MxcG;Lc{xo!sl=7pq{96|o?hKb}}{ 
zlOG&b(-6evsgPj_5h*LZUVfgHDZq z9RdI6Izb)#Kmqz!z9$ChY%HJ||8@L3lNXrp3oK~0aG)9gcC-b}bPWN!Dh5LRQvd1r zcc%Y$6N9}OZQ%1ucsao!WqynJ#TG*RpZ2~=`nuopmRx-V$wP-0998oNI18c%!+QTl}jXJ{5MymPh&I=t;n3Vzms{L{h1ynfYl`5(SctL#D7R@IOH?xM#ZN^2&K_M z9@w+X6zj8IX=mK;5|>piL@R*)&&2kZ^EJMupoOW%M|Ntt5+xPNS-07Icv&3#U1-?D z$)tTDF0TC3f$Q6;%<`V!snz_HZ90=l4YxpeW-QcrCE1OGERZ`9 ze$`|cCGtxG*y~7bhC}V^U=K!}BST$p$bNhy5An&`PqA8EM#<+{T8(jj>*-%&No*!i1t(pP^Yzy z%N$bfY?oW(kfWVj?{%@rr&yum#wAAcMJ!ld0@8qbuPOF#fJ#Vij*AU3ey8RuYlJva zYb8{Y_iV=k&K@T3^P{D^h>4wQ8f^8MBowkc3$8hzW&&BtI#LSf72EtrJuLz@Z9YI} zFc3j{#(ci)XfnsY^u&GVhm(_2xq97r2G7BnR_;s0JE7rMn_VAowEFZhGmVYc-d_(5 zW<2c|@?KH&aDYf1+&$dmub!p#aRR5mKTb5AUY@ZApj+D7-0j-u4VUe2kC$C8pwDB0 z#B_)ld9(6>NO2dkZyv8DahbVu+^Jp6M}?Ljfz);Gn@&3Jv)+5!w|pEW+g{fTpYmgy z7)fo?qw}{oiLsHbEL$oHZ#1BSe!6k#uU4@62K;{QTj&$IoQFDL(iavgP5-;G}Ge9h%)SHUG9o>h zIE#RQ4iyF-XY`PP#ZNY7*Ns9l3W}hB06MkmKl%faCX72m@!-oNqobpbSa9jV!2@mX z4<4SLQ!A~lytg5%x$^E%P*4~mYU=8E4c~$C_mk?ks(DjCfBuYrWlIR@j95&oeb#e& z;5K^kvT=V2Ysn}&y;CL03L_@i+&6x4a*iOVr_?>mUnO!JeR$oxuJ6ppJX}2p+QIR1 zjW^a(y;8*nZ1>*yS%1i2*bUwEtd%Z@@h5)TUaWc5UWx71+}?&o@otH-`slmSG4sd4 zDZBrZfTE{3F;1{prcSNt)zhBbS*JSg=$Lzs1I0;`2}iGTc!Nh~lW z`lbqTGfE+W{XM)MBQ}cS3E<_qLnnHKuzI~WQd3t#u_mdbnJwDLfI6ISJ=QuK1 z?v7JKdLvU6b#Et(L$_n^3d_5RpPeFJ<5pt+I0hF>M`S#_Qq2tz=1tkvr++gc?6oGR z;<1uqn!NRWuaIt&9vK@0r4L3f`~A_`V#OHMv*kKkKEAh>i*{j5eEN%(2FB^)r5fYB zg{G#aZ{NO|Nd8fId%hDM50*MVU8)(p$D&tvxY-{&TWKI2Nae6xmKAury%+oPbh6rX zlP{Un+SZ28ZdPl%)4#jBE1xIY*VmWRI#;2EACggAY$_uYnUsW$2DruE(_N{zq?Ab) zOyux*ybMMmCNaIay81RkNI=lt-QDbb!Edq1REwk_9RDin@L?Sa)lyF4BRBbBtY+^Z9Z7$v;Pc2CY z$n^QGo@&1M3&TdX*v1jsKvpIXhW>+AJ3@*&LB>tgM6~O@1|NJXyabKgwNRH3wTs~eFfB;Xhow*3chrnzQS9c z-2rfi7nywSlU)`|)ll@`zu~lrN(8^>o>$2a?%HvM_MFPoCQv=*{fxkLT#jIjZp&Fr zi;3CV-Mw{#@0b1dlG}EAdb*y{W=Cs-^xC^1`jY)6f4+^*g>gs)q@>ocQ_rtbu|cOJ zCGP?$r}qg*JcKvppqS+lLx*q!n@bRK?ZV&)Xov#T#j37qTs+)u>%u$pJ&uv4}6B9t=iA-z;&BZo%XMboU zXe7LVSR&&6gfdsZ!STUk5m=h_K88+Z*ja2X0u&3gOwi zM4J<0-*ZmZ#DuyYuw?!*i&+l0vPg(7UGty^idp7$ex~|83dnFhtfJHS2DJ#+&sj%M 
z$=WBVo1_g(a8D)8ej&A=fRisJsS(CS@6lJIzQU!ET@b(M!fHI)H7n@e{A$Rm+A#jw(AP%LQq>tdulV;DT76Q4}6|mAC>fq)6l6&uy?u*xbqgL*jkDjWIw2_s_ zFeCZYdnq?EM1RV^>a!-JG|ruH_eF$$Moxu9210f!cN{sgz%OWo)QZw9G$7Q{@l?8L zg5v2CuzLi%b~Ap_+}L;ufkx zGy<eZI>|guG>EuM%`%#VOq>+y6h z>yJ)#5Y7E=6bE)xi8|6`%o=y?Xe2Zq{fyO7TPtEN&t=-cK-4r<-(Epjk}c7-Dy}3W zkm4mE;x8nZ^@|mQE-7Qe>QEm8fOsd3l|HfVB)}+abXkpsX#I=pMhQCf1_+6qFL3v> zzYoW-V%TR}L*II6Hj!g)af7yUW-D;5fub?xCeg`JkoiV86#7XnNpQwci{Nn;B>+It z)XDVr{W<@qp;6AaZc@Jtks0-sfJRBTkBrKLn9)T>QK%J}?zWe>q^4wdPI0asgMNPu zY2W@|G@FoIaD@%k#TN{3N58JCOSyPs1)+VL`+1SyEvkWSaE#IgX`b9SyY{a2(>`W~ z5VBW{W0qzuIKa*TRdOtfj~wyo?GWGKPVHfs zol!CZcV-`$kJl(-UzBWm!lTiVxpH}om|E{*h>hENI-Bp0BX~Xj!Kjo$luwlkj2xk# zT}ih0w1kwMVK4HhTA1WBOFnz@jTrZLF0^Qj^}niAAI}v+%8)k^)W{Cn-Tpyl6T4`k zTr+ni&b5rn7?OnfA)opzpIZ`?_SkLXZ2#H&u5R?Ry8^k8-^e|<_^zla3&b&m)yctf z!@ax4myAr)O5SJfvg*V_vPt<%IxTk|p?w3!@QnS2N4@b4=NoD{3ArxlFv`n1>W@Y*P;nH7P3>m^ckUvw?<0oB9Xa0;Yng>%~h0 z>FC0uk_;viNl!|ZJ*$xp{^61J@P;Rq@lL`al21!q(Qjw(o%$*Dot4+_3!jnbC12-$ zCF?_S(N{`YRdu0P1RwC4d(p+2;{-sa?b#jGDGZaEB&#F53A-{UY%DPr! z1N4yHPE2TciOJ2IC=!{O#^-reS>N-Nocg^3gNRmo-NaSzu@~pfcP4nLzqlCuZlU7Z zkjrK-szp4;!GI(9(5?peY%-&t>y|zKHQbU=^jkuO6KLdS)d({@)Jsb3oj$~tWJd*E zgBE8;7^hVEC?OFc6JwH^rnzd*&G`DU1m{ZlA6Uc^7`5&VHC`)ad91pPn4_A_y_=r ze)y#pHRk1|#c=S>5liF%3r`N=%yg(Dsl+X{sO*TLjeR^!ES6QaV2nG~%Bt&3bMAXZ z{)0fYuKM#K&IIzO0&=IuX>7nOzL-2=F(&+b{|2|*#q)3iEmcwOVYA26_0de>$Nh@+ zkM4S_wdWIH2Yg`n&A|i@4^JL66qNC-yV!kD^W=u-1L3h!9ywJk>4Eujt?5R)=M+Pp zXap!=aGy_hc5dDtHW)SP%Rq41dN(H>e)WjX96eHEBHOQUiZ#WKuYa>JxTWEm-95Ri zsDVzsun+~&;R4SjF}~2Ul9z*AEDYWZ6C=Bt-3=a7h-OwxiOiG10670hmn_Ib$A zTcZ<_fP`5#7&rMzi;#k|!P7fLZy6XRa3!~YLT9nbAeym4D~m8j*Cb2vWYwC|^5RR! 
zARG3)>uZ7e#o6nCWPybpTO{B3`1tX+d_Azg^t{>?6+}teo}9XxPq=I?GMT;ceVdYb z7dH|*L*`F*tiIEfG{c&6hoD_MsOIfD8MW5*td?^BkP+K$P(z>C`$brqeBJaIasaOu zX(WLi?0QBhafb|2%aRCqoKBN;yqdf}_)8jo)cE=NCDN%)gK!d=P0{w-`^&}8sWG#n z5>hT!cmjpmr<;SkJ+UYv5zH451k+^T*xSIgPiu^e9p1_mxIE@nq&G&ONOyYG+U*23T2M<0K67cL1lKz2l;r>7r?z<=26 zg3yQe;3p;|L~Hy^Dw+*Y0G(bz;FOr3Puu|a@Ejd6olX}9fzOr#Uo5M+2xAt!w7`S? zb%?gq7Nofythbv-xW?x_l$1Rz1Dw$?*BZ$3caPj4zxvus4$i2&@hKD+6x7ydEOccW zCC9udEND((+J#8w@t!pPpd^&mcA=@M>VPwA7=<(_k=n46)-(t;nt5P=I93C;#c)~3 z;vFYSn;YIpN~T!^<{Xtg9y;8i+|f1M+Inq#G8npRzsZB&4XUCq^RhMu&dw-C84uRY zrgPk%tGxQl!Z%w>2y(kP){03}-nXpZFj*gmgo9~x<@lZhWHh7VIMs1`zFxKnxWaJ% zz89c*+?a=Cj%wv!u4&Fg4eZ#MFn+aTqMsCTa4Tvl4^Yg`iAG~J1pL55KwR5gmrQii+?^ZA^jG`{xruOYRx?D{PaNd*H5YXiuLJ3@Rav!S^kJtDGFj~H5-nw9&(ijo5!k5 z5rcw?^J*L=_JaDFjFs^#zK8A2DlR;r+7?da$5Nk{2kXfn-Zda4s|}*g(waEU7gen> zq9&fc?f`t|z5(@akriTh2K;u){M5Y5OE6XBp%z?51T{SW>vTW_VZJS;xctL|NoMr# zu?zEcWiX_@!K@ttklz~^R3Jw=wekJrO>+QH_Y=fj;SL*)!8F>eRw z8dL_}EL%pkxuz1+EOo3sxy=$)U}WFDUxYfXH&Z_3naTAD3({3X9!5%JEWbHKee)QRWh&qr!c6w<6<0Z_Dl0Pvh4WGLy2MizPF6U z4}qAJubfg=R$6QFz4Im<6zKAbo?#5R`l{RU!cLStX||xVIqvfDAr?NZJXGc{uxOs!#QyQ6%`thKhZcrQ)EMt_bp*KgM`U)i6>h4fnOy^fzH?2*OU~;~> zS*h0tUv@|YAhY?=8V(+OoFjh#F7J5)_agWeHK#7g>)S+?BvUzXUmGYL~kddhkXzB@(O1MB!* z$czf|Z>c{m&(7M-0| zMssG`Q?W;_y-c_-wN#nqMia~2cV3V8LA>7o2|Pyb{{}beZ*aFnCGtIyBVSN8JGXn9 zO%tsHq`C#=?QZU@TBV_hcx;aYfHbF3u-}0l+fIB`f|5T77FZEs1~Ap(6=%-;q@z&o zn+X&b-LzU&gs+|qeCRtOON+#iX7>3hfL5opxa63P8x4e+?Y0d*6Xsf+lLJfId2<_X zM4qH{-79mtC$n2$=R5>ByB3~u1#+CqP2GNdd4Z5?);!_o5h`U`P2N@97;~5@m&)rD zc{?_z+;ba+i<+2@v@)wHOS6|J0z{~O*A8KN4>+#AGTM*gi6z%#(QWR?Pu5`j{cQ+Y zk7=i0=B_T*X3a=IHU>9f&=NzUW*Vw^1eQDnIdTJM{kX7E9+%!nwB@G~q z>6vcjA(br!dxhybs@e%|4C^S@RbMDqg7!&j?^G^V zzq>t}iHwYNzE~IWeC!EEAr)xxcw&LyKkS>%myD^Zv)*Wz$>2VjFBfK%!+r*797VuL zZG1%N+6$06834{7dQ5el*Xeb4JjZT6|HWFE8P~B@-re^P68$&xBu1S`4nf_I2;O)_ zU!Ihsoi8rjL&1{O57tj(QNN`WWz8&K+=CY&t+F)^O7=b9WgZV?v>~jA*qwSI@a-uG 
z#kE|xOLKsba0H=_Ci)q^{DaGxu2WHI%f|?LH_Ac!{%gxH$q%ckm6UYEm@5jZ_q@2;lUA0zQIcfimo;&@+rfA1iIq1_+KK{4fuA#pf7_7W7gV&T2gU@{ z#e=Y{#XkKpp~ZPWGX6jm-^fKH|F=O(G_SiFS65=Q?*;q4H{Hg#%d1|+ELSN zV=8_7Lx{ZmN9H5IJ-SmYljWSyp~5xEw|{32_2**&K99#qDw`sWpb`d_k`z|s^Tn#2 z?Y=Pea;&D=0_ijoQc`(mq~e5*j*hEBWu=2csT99^5l4PX$^j6acgb2^$?`+{MiV4u z+X?1`7f2zU+SuIOZi|x69C&$YlOYqFZUK_f^2DM*3gUgc=ZoB?ps&$(ZwOU;c9bA^ zu#l9xda4}N_pV&4wdSIW-_361T+)UvD4Gyi*moU*hOn=sEL@5WO-M$(MeF=v7_CTW#k z%}5ot>%`itq}=BaJhoz=Ke!Z~ksHes>22Tx@C2?Gl-8V*Dw}xpQEhnnhgu6B1HmA% z+0%V3BK_d-w^dxkzLw42-7$nK}zbdGvOjU~@OTALzu(5I^bxM}yJ+I4Nt2UuaE zAV_C=Z}yZQJn=lCk(Qa}ZU45Kx^DHsZLg!U={Jguc=>B3%_Cx=l`3P}#y(cI4J4kw zi71cSW}(LJhj0k!+tB=+D2Q#P(3euCQe=IDvdnqOMJMY`0 zqg$NeizJ*Rol?Q*)|&1_T|Yq((4OL*$-Zp4=s?B zL1SSTXsCY5n*mk97ZhiwN0wTMutbea<0asGuGFn^K?t|6HJ8?=<)?30#<6i1H+c;s zop=Ho~kQT9M$WJ*+3qIicSd z=@!g0pN30Xc=Tw|0oK6j!@&m*J{NW9>NwP$u5_R|ZH!s(Gx7;^-#{Edzil`Hf1RAQ zRw_9X;m0v!OTKq~v-&0zn}yIm;}CKBPN}i^KtRAbQ!LZ;B}X^ zX<9JrajHBa>uIjBJJDPZiB07;5wlU(U145(?9LhE7MX=LH85E$8q&&K=)O!hvORv# z;;wpx%a|1Ns)<`dET>{tR=Nz^k-9h=8wOOpER@L*s&G1*CRb{iHDgyOZL&Y8w^(8! 
zC50%)EGb%FUk4ScG^!2Ys;QK!-0zM96LG`}bIC!Plkf{36axUT2vE4oAg;CBlc=># ziKk^|{=7#|MP;&7ZD?d<1e|~f?yUl)6z{h?RZwTpdyMLam+0#6hYs}j z@d5i24w9^deIZ2PG4Ryu%-BsPzrc{>WqB%SYI&r zF2?n-68jtSylaBrtG*~w9x1VZ^|yitvDuZX^=vxoRg+nk>t{_gW5y{iiTj{Ej~a3~ zxYfKY&sK{??jdcx)>w@NMGy1zEN1>p*a`R4g7W=N>l14?TJ4nlX_#gB)w0JZ#kshs z{45Wehw2?M*}Io;#;rDwSNPQeduKf8#uvcPj;h!Bn#1QeN|eKSyO~{QL>}q!;bK;s zQ2)BVwTK%5&*m1*rfU@gg%@>3E2gvNUn^HHF^1$eF^Ws>DWwd*T@rJTK3miuz(nP) z71MNx@yp6bSw=#RKW*l~yF0%>O zEy&0LEV$J+G7(EM8$C2jYkJ+n&!$Zr&#Q{Ag?I(QFi#ksUv4Ja$}n&P|J39JV|dn% z3qo;)Smdwrh!4xZ%$)brF1;*?JtaQ!JkQeFEOet+BVcNssIa#iW+~qh`O?zS#s47d zSDT5b`JOZX5W)P1CIP5qe38xg%&hDR-Gn<$~R{()MtQZAu9ju^5rL2+tYTA17B z{r!DE#-v?5!0lyxT}Hg3(~{e}&{A3eCJftN)zi|Kwz-JIz zY$ds9%hO?k>aXe&RB#__lAoxy)}#9q;=ZEJUHlqkD>JHCSOutfpVay+U7NQIaem$rzcd6!JT?`MuDJCV_Y&O!?onmG%`LkX#_^6zy?m<<|^tg~(6 z43kicS8e4`vNLlx6UJ}$EuCF{FBa9ZQ@wooBCZ&X!B@pqN3I};e;t}mXT9FJ>C91o zHiPF>T8hTBT?~O(Jerp~Kx|`Q@BLMFR@5I$W_l{w*3S5hv14LCD?ekx1}vdfk1T1n zE#ZjAE4cv{q`K=9HF9S5%E`(3@DJi(Oh!_f{RDjj5hsExY}*C+O-PGZ97z&pE7;Rf zz>Bb&|2gw+m5jE+AaXrvcq?`%^(V)5^l76`<%900KBoR}h*VUN4B??ev&zm`GM1Kc ziYCd<{Ips|oA%-UIDcM<_U;x5qK4<=6YF!%a|h|h9!Za$dEZifD}?bE-&<&HOar;2 zmSWWT?u@Yq_`3>XmYRt)&}(WOgGJr4&yUUmy4@u8f1ru+RVJ>-ifV6Tm2Ovv(5fr5 zt4((C=gV>K7#RHw740Ce@z*WlA^HMafoYwM}k%M%#o1G;3uEDyaILlL*q~ zC`N8~Q1UU2%nBYHQs??jFV6#zuxpN=Qp%3DHh&FgxRC7t@tfFJzoU5H)34FmBpo1{ zVr;(OV~rV=dB?^xXRDR#mY*~&y?z)35K=OvY=Fw5c6QgGqA9`W&sCtp zK?pi!F1+L8@)`rFF`))ui|3C?%6$sSf=j&i%68z1Y=)4=!sVOMRHSf2&xph(Nx>s` z^7R0rgqj;i93$C5ZQV`X=d#vJPY$7wk}6wN;d-G1eoD>a0(@6-in>a##GDLIZbkj< zr5EePZ-{<|Nh|~B;#*;ZP#LG-7EsP;${j=!J+re*IvyI2$#1m2J)|si@2ovB?C)~} zO%YkQpwG5m_Feo_z4&`JHe_55#l05ht~{8Hop@G$iOJjR%ru9Qp7&DA`)W@2edSl> zDF;>2|6EZjb?vZ;+%?4jqLSD9>oZFmBRU*(etwI(yCV3+RhM81fiiXDFt+Gl{PRTO+%BW}Zxhf2ynwzEG8T@g73ltZJ|kF~Emq zb)gA7E^@j{gK^tv|7g_y@nH8x79{glqX$%5QTIM?S012KF}V}gn8TgTjZ6q>9PhJ2^M@z-X>A3wj`aNFPK>|^)Ad-`l%XOxUPaX28-sbfno zR?bw(YRZf_PT5gz+MH8ZlRTm2M6Q2&k5K`K%RjU*qu$A+R4&4dLV}?sG>U>%w|Bu4 
zG$c0j(U>fKVv;V&M1`>>)dDu--EPl;X?KS)XTff6ksP$2YNKfUtP`$syOQ!pei{en zAR3_1jPTh`>HCHBC9+fEA!1Mg?T%G%+akYHNmNaf&3VdgAa?bh9iKZCa)JTFI!C9& z!2?5q(6w4~^G$#6A{~-DfNbS^YRHguTY3vrYNBX}%U2r=(FhX_fyq7@W0xuDJfTdW zLLA`zp)VM2$^8cXit)lAK!YtpF3{f=k9*kFEwCS?W(mtnR9-#_9NCj z$9tWRmx?s&%FlS5fvVysXB-~QVBSHHs^gnFSwgGK-RT1Bfw=t}e#pr9geb$R$q=TJ zbE?9@6%B(!;TCa5(hVDJNn?t}o?!4=QL_qT^`uCY{fBoAcsICfiP4Qb!<o8{hGY#BH7_Q1 zpij9e=%lYFyX;wO$QKt%caQJAMV_t1C$4!5=UAZ=GLdKux}7_Wy>5)CrD81**rS`t;uj!X0oPa<43qs%m|#kVI{^){nb>>Ntm zu%_G&U&^Rj6HAvh_<${;9DEO*WO;D#6M0SQXVpRcEg!DGtq$M8#ByqNTiXGziO17Y z1(n0#-NPBo>u^=U+R~gUYz$0-E(KE2p6DUcF7yfHWw!uqJIQR7NL_Y>wM*MJu_fN) z0}UdW9 zb00UmTM6Jv=!|(Ep3lpLK<;%G^J}DoYs4izzL;oo`Ma4~g1e-D&C5?QG?!8N2qJcd z_sRFi8M%bc2y=-)H=I=Yh%KJusQn=V(e4{9wyE_5^e2_vADe{wp8ZC8QXZ0*WaNuB z8{c$=$?5B1YyU~MrBf<$!Ci52d3ZJ1_Q0hX3THZ5H@o5o&4dmCaj9% zT5d>r<_9QhR~JU=@h8JLh%_<&@p5|FuHSw=>DinTZZXvR1c@Zgp7GeaYq#sz4cxmb zwU0@ryOhf_ee=`8iP05%&9rNfZ1_B}NUq`^`TFM(;mCpk>T!Vzh3?jT$Z`WL%M0p_ znfuu3@ii~otjZ(`cclj#EcKt#oI1XvzpnG+e2EJcH;eN^UU8lA>RGOMVNh1i_{*rv z=PXy2GrQbBtSS%a@F|DrZo`u224<0Aw>BQ^k;5(-T*971Pnyty@uYXb%N^UJtWyQ# zu@!;{R|+Bb4!K$unHCK}$u7a*W_(I3cwJ=)5d##mh(`ZlQi=*ax0?*GTkQAO zag=zs>8`{x24XI*_>ejHx~1qgwZh_bL)AXAsSc?G-4_j2Y?ArnDz8kGo9h1|$a_w- zbw|f1GN;7)rRGCiP&~#o5APcks8(Y#xf97eF&d9QA~t|=;UGbJm}kgJT$F&-7Apl! 
zZ==$uwTkn1&3k>d#I1WZu|vq7`*0&MvR8qPeAd*3U}V>c^x~oR^Phev3s$|CDzm?M zvmdS6fEIyNB!KPP+$rFGD-^BSJOZRjn(EA26?4jX%^dZe1a8U@P6|IwDgr-(&DBFg zw?ZVL79NL2_-VZ0A1wu~A=Q#af(Yz%(rUwyv1%%JZLNF`hPLFAnU7s~iR4M&BeS_~ z>*c#C#n^tc(YhrTX-Sj5k&eMF&D)H)VhHDB^hnHhJ<=$=o#wimR)^t<>|LJF6Yuar z(15b4G|AEliU%5G`rkj3eiR)LBa(Y|0v5BK1Ns?N}_JYsoza=!Q5LN zG=v;91gmKch*_Q`ip3j$s(D3O^&$!R;{6({Um>ha^ph{CRX6t)jxjIEUioAg$gxLjIUF)q45!{e%f$!8@s)aM>lHrkkS-Muf zbWl97h5C4=Uy8;xQhwPo=oURtbyseV3V&M3OnYY{5#(=6w2h@^U%eU73kqF<{5=*% z_@8dcdm3GTGSb4L#c%~LMX&j!>9^6-ij(@4F^kE;qG#t*vS-mMA5&2I22G%bs9LwF{g7JP6`P?Fb{cQ}DX07mOkr|MidR#9%;J3KpI zCmN~F4xU{mgG7nh;`)_>)%bN~!)`2#0iFgjnOi3gZsruWvOE8k339zkat}cY@LQ3Rm#b} zq*=VMGncoU(LCvrH`c*j&g_PIZeUYoA0yPJdEpsYlI^Kh><4jNl{EfRQtvA;asSOo zW#jdD)(7;)Pft(Qe;cp%Sr+NNMJlikk^cNiiTK-L`@7?%#(Bn8fy(pM3RV1ZG!qso zOeRD2G8go8W5?IEINLuW07pWubjKPvouszRtLC7V%Fmvz5yEcBlCX_48yHgc16 zkB2hMk9?J>3bUIA*|&ZQ(FxoRoaXDj8xF1-I0(KyzAd!uaV>5UvOg`#BIKk%Ay+93>f?fEhY@2kZ#y1(<$ zs%yOP&-yrj>p3#Pnt>(D+EW{vurEi_>2tOJ?aXn;GlmOcw06hBRiO-9WDcbDlb~R> zO;kjD6+r{{!lpC!FQdgxw^oT&KR+I1*v*GF^+~MU(?65-XZR6aY7@iVdW~%dBs(Kj zEW>&sG&UzY>7}WiKYqVC`27;rUh_ISRKuiOX@XPK zVIx&4bvJjRv?9Nj?)e3IX?R80oz|bu28bt2a0Tgl8lH0~%*_NJz^~)`Ter{O*6%nFHf;@XKl@4F;~m z@4r+-@qG&uZELj(<>+DSA_=I4SYhp`8ydwVaf^ojiGrKdn#Rnl{ajwa-ulLwo5i+g zKy&5B5r5s7Uc|Dt;uXyC-}X-q?HLaz-362Vh61|nTNh~lKb-PTY704jBxT`H@p zsti%P@+^nm2)Lq(s4&tOly>%>UO67OC5h#hjcqciiR51SJ$5S;b`E~wh?B`5YCXR9 z(!+}im*9ScsK?)D~m36*Cp=FTM9M$Y3 z9!z=i02!Jjyqzw4;<+V2*^7&dR|DyAJ?P3LdIx4AQUibBVF&iCv&^MV;BBupF?Sw| z>S*%x^h}(lHPu;rTLgOCiMA4_i&04Pg#~q($dXU^T{GGoMQ&&G*QM`_X1tsaF{`So zf*F+i`}?e{EI=0u+O9);uo(@;u6%z$VnSjy8JU`#92y+_(&_<*92yc5)FO!Z6%`eQ zR%gmIISqP$1_cEHolsY=N!BuLLFL zezcv0A_}?_5~l6$n&RW9F!v)F=I7>$Dk#{FWkSXXges6r(S-HL1q7BSCeD@6HR^3h zGe%c{r2K*GM}RPfJy5>`_KpKFA~Y0CgG~Kxaz5C}F4+O0gm%RZlWro}tTpd<>JbCd zCuuYOFgbTr!j+JO3E?k{!wfdKxVU47_6x?1VkW=_Ay`!>zkUSx2*%WF&EbU{E?YM>pr%Z_J<1eK%=p- zj$~l$9UUFi>deen-=_gf88D(DD?4(!)Yj4i68XX$yAJ_V%_7$!VvbpC1t8Kiu9*RHcK#GZq`ugELARNH74F2Znc#b%@mx1NE6c8FhHl9f3 
z7`I3sZ*q?J=${_vLeX+j!U?`bQ|)ONv2#m39iPbj8Yn1>G5+%BdA$sObYW2c`h_;b zUAWag*E~k3^Fgkk%2!g*BcSdXqw0=0%O}1=*HpsS$H#ai^&Jwq!{02N7V8xO9GnCo zzwRt}ettF?N!4g^V|Q4{T5L8RP6qeyXr*msVBlS2$NBmBSb0D|fbc`J{f_M1**`b_ z=KC^{*F9|$+fvC@#=#3+R9 zksoedyGCoWvpc)HH^G3Eo^pD6I+ze#wHd=lc=ryM)%a&=DZ^tk4ud)>0Rf^d0U=@5 zJFrw#pe(Qn&oxl^&V){@W|7}W;%JcEY#lr(n5-T>mZv#S-%Wb#WUtCOQ zBOom;t*g6YzcbPyJ_h55IGVyKcdmaMNh%T~sy~{}PpV<6B%99nj?4wH&y9^dnwpv_ zDk}Z2Fo{`E8hrX~Ole5CvJF4ziR4PJ22p1Z5q=ut9rUA~T9PeG+RQf2yBZmAS-nL4 z)~dU2oo8C~-LKYa4rH`tlo&XLGE!ue)1lMF5a*$)BB_Z@^Qfq|5_>^!oTOKF@>$kLO@4 z?N9=9pfR!A2q5-+<6o+!z?u?RJsd7tbxWSy$zW8+%`D404JQs2qCtg&v!?37Ob;(| zOI9iX8BOok(0dm$cWA>z5oe=2oM2%h|#}5w+YzBD#OJxIZ=finQn@ZtRTFSdQ+fCOTjF zPE4$L>g!jHgnYcv9VyEiwv_fKoZqzGWlKgU zr$%<3LieuJgecskdH1(Z72(~_}y}_eC-}mD@Ed3ViiyDUA^_Y-*R(04=Q7g-(P!+w+^!X z6lA7GQ^KJWU!JEh8X|UT%PZ>R{Ga+55Y9OkQ4|WG`-RL4imh;-jRd=HlGk+|(52;Sq>p&}YBi-UbP0 z6_%8UM7(-tq@^AB^XF$o1Z>*i=x73i_u<|i_t#G7-O3JyOzs~JgF{240&MW$8{^~S zM>7Ni4N0jecx_hOeSDy}L=6oo`pGEZqMbn!>c>jd0D{kmJ_G~=@*v`1AFAiOos8&p ziMQ<{N>DKKb=Dav_>YP2^ui)rK<4SXj-i$dKqgF1PNo}B;h&(ek!IeBd6#SCS%aNM z59h8}*{^1h9kUSH%;V^K9;&7Ko!vPoO;PPlqdZBfJ%^lll-<42bcN?{*tx<^IetC0 z* zbDxh6q#rW8p=Pxr#g0c07Zq$>)V9p}?3t;QUydsX*NeaN6<2A(HiVu{*L+)p#!+E) z#ATA>y0&pb!MIGebCuw=jfp;3JXp}TB~F8Ws_yju-A*)ToS1S9cT zsLo-i?Cf9V5RR@k8}VrBtt1{x+QRGn6)jp8p6Z~eUpEeddT|j{ADr}9@Jmf3t^GzL zd`S;_@!Ux=d;}T-zeA-M#ch~s7i4^$7=yE@#P64tax#-SPaK{%4i`@N*sxtJe)1%4 zSTVHC6^Q7lgt?o3xztC1!J9WQFrPE^9pRa9#z2NSMiP8H6;V}<&z+@2t4G4;0HQls zhWD zo-Y^2sx$tQA2+j5DvNBZk$DnV}j!zui>?p6!0X76ZA zO|OHJx9n|*MGr<}k=WL3QGX;SGTPtx6gRR(nXqdSX6>RBf`67?COfDtU#pM>9ly42AEF&W{^3T|y=fgFa7RP)49vT{2$Z+7pIkjB61d)Z$ zk*mAAS4ewYT-@HlfiK$qL+JNxpx$u@K~7W@TC}{ZjQ{&17j!2Ygl3HyZYh>liNx?^ zxdr;{Vg?M)f;rp{r|;ie0|Q}ADR(o&!iITatZt6~f=Q7U#{<{`+r^u4Rtu@{Ul+DMyFeygF%rT14W;kB!Cm%Q zAur|)FFp43xooepza0BsVa|s=2{9AG27iHtp6PF+jAl2+DQ2|&n5=W*7K&usn*EXQ zj`7!M(w6z)$Vd5-K2xDlKym7OxfxphH5JFIkF{#hZKRTS&h>S(`)Dt(uDXPx%Z`2_ z_!cZOn!;P$mwnbD$Ts 
z1lqc+gPpsf)CToZ$|d%#gkhhg=!|uLmV`fyOG(D>qa`=1Pi@KjVhlKIx{X-d97n~3 z;4pLvFwpOrG;!kCDV`h^dGbm@htC-~^7%{Q=@(F7!p$zxw(v}MPphx# zxDBUH-YFpP*3Jw*Vnf{=LA>3mV`P7OHUkOvnbYFDc;sONxt=L3apxw7PHe=Qk@}zK-k!-6Tn3n_ygFW@e_Nqa#xT zG=ZaogLqsvt3O`9K}14AA>@I^2l2`;77UpRjl{;rLPJ5oaAZDE8hhUJ=5v4~JU%gD z=j%FBsO|Us7OvXS6BXU^4O?cw>Y(=Z`kmTj?hjN$!#bR%PmatwBi|lZ&nSe z3oK43C8(nX$Jz$61UM)VnbJ4vjPRy)ts~T;U{qAoV@@Fu`1Uv=f1VeZY9G?V&A$8g zIeE2|X9hA2ilrh+vG^}`{wjx=Ex2;qq1~Ya+m4zUZZ6+oKYt3D-M;*tH)CBjCb0_T zTmXVz>QNEW?6{Gd-3I(z0TsSUY*lkzo(a>z+(gaQYx&h=7=0;Q?TBHsQ|%!|+R-Xp z@*9%^Mwk^c?y^upwi)@y)Yv6waVa^3dz98}@l>tWOr6ybi@BMSXagJ#DXZo^CQnz} z;-Vs+X=6vJUPXF5|5SBl&TILR3m6-+vq5v$crU7a=1N({t)FulX^q|6!x6th;r2(Z z+Zw<4#H0-BUARDEIhFV_%M=Y+j``Bpxt`jZ=WP9{(!N{d2dH!YkV<*0POwr!Q1_NjS@?D zbq8vHTm5Xa11qxDSD5vjr*|Im89=Ccdw18-21K5j;sCh8 zIxIMn80Hoh&PNL{y-xd6w5Xz|j3zs?2;KnxxC5XRHa0d$v;emO<1`-#a2!w-rKO~x z>fCqO$Uak@!&Js#dXuE@S^)v0=l!KwG;9%Cy%ZHnc|sr6d#9N!PR#mB*h%-qi2~Q4 z@~G^Cq?+}A;{R6>Z(pw`=Uk*;3d!??OpZRs_)6RE7V^P zmrK5`@CUdOc2ENrLtWV3ncx??K}CG{)UISdBwq&#ZJ4gD%EiFjmL)IJ){d8D6Yo|s)x?lBdQa{`%;7?6hD@nj9fY>_-xku;N5)QQ}J?iOCKh1 zCE4+W;@sB@h+?r+A$j~0?0cP`XfZy6xNE~+c|JSjFSo5(2OwhTGF`vCTwAHSeF>L-`SieU$XHVk z%C-x%%|gc2ZgQ2P7q-M}T(nDtKH_b2r5q-|Tf(G;ifi;_QSpG{<|fCfT~@&)<^I zcO9uvH3VS~yz;KriDR3SZD^u>TCIkEW3=yGDON=3NR%=W`n-Edg|JE6#{yY(Lc-A| zzbKjWbT)QLrro7ea28-#zBPISEc;Sl3;dSs!)xGFJ*t_Y9 zW}|bxwNaix=3Bf`Z_OoJppH{^024BR%Y1CMLoT%rTPu>H%-GB zo+Y+2kR^?$e-th3ubH;)lbSEo><RB!e&C?-8sr4xyfP2bHerVyt!u|wf2 zNj=CrGY;ZCQ}n>n3VCQv+B>@9v*G)0Ee*qL|LH?>6zs$k0~S@&_lZF>Mre2~dkCH* z$>ivluhG>!bC1Eh#SJ>iW6({3R1DYSQVhly&_1?*r+!Z9jB@qkz|E{MrR`?t-x(9t zTLi1vw9*blf~OmkgU0!nIb)}YH2pb5esDECq9h=^J54#ilE*P56fa41uuLx_ybN8i z#>?|B&uQtBmQwHETX0opFn@B!RZs=GWe0Q|CVh4rNzJBTrKN9AaKN)u3CvsUA__z> zCx0xxf{;Z`sl*ZME|uimE&JmxX0^BU8;tNIIeXRO1`91+)jMc$s!)l$@Z{V1#NdTV z$_ZA&55QDa1`OtN~eBws;=_{A=Q6=y$`=GE8Q%r%x%E`D*k*?v6=|5WTU*r{Oi5?OGS&HA-`?=sRUZ zTb9w6rxlem(+Y;bn`S`W6lXG3r!KZtpW|f2nCPk|nNJ+rPzG=y!!J>jYD0;e{<<{m 
zLKQx^{aBZt>H$*Z_SO+b^pv^Iu-Owe1-#Y+di^#RwdRY)QxCYhFz$=n>DV&rJc-rF z3TPN)Gycr>8cuH!fmiSZ6N#*HI-lo#paQ3Sp#pys$*;<-V~PZdDBug<|9AM{H4HKgauF;zAL4(9uYN!U z{?Ccn-u`#kO~_lsTVx-c2oIe9-^1YQvf4?n{yYCahyQO~7M#}UjFR(nqPa!x!wB3J zl(UO$odQ3D{AwI}UN6ZZa%+I=t#ve(80(zJ!7i>Mtg{L(`t+){VC zSX0|UI284fe5Vy9jJ3NrRABS=W=_He2hoeI&FJ^PJZ3b5lSV}>%2m;+TAt9s?I3&z z0V_2Gc4W1&2@SDlk$wvOcfKu8Qf`if2j(T1^(6|lVtALho}an2NbghFL@4^@B^S%{ zO%kTAkM+Z$Ze)eP`)Z_{5(|GGFK2~^-Q?!s;>BHls613O%tB8KfeH+g=%-DtIx_T- z{RpgC?$J+S+{~pu;%^Vi(?-KlPS+5clx>~EXthR$UC$uD3d|W&q_Z%uU_v;aoUY2N zo(#%tyVebr%u=AK`9}2Inny$yC6th5#EO^fT=`yOo@AW7L-d^?=?RLX124V$cK%^qDqY6iY=gdc*{e;<F3 zR8cyxAW@+q3lv~|yMwg~L4`Jp0W0zd7n2Jbf|~moY?$9B@~hx)B01iH<(>Y&`47kV zXt=nNWKoTPESbP{MM@X~W$n%Bsl1Ac}lkXC50$M`|=`_l|& zG0DjE!O&iJue3rqKBugtq#0=3Kd--Bj0zI+IyuMW7Ye1`X5$*&*PT#>U1L7awvY&s>^;mlh%(TYrDQ zDWGF)p1WEZwFUmy%kcB}F9T4#fIzDx*A(lSc1b8N4-YaS&t*ZzcXM@Dcdo)VtA7ne>McbzIBv?QO#*#~2qJ0{V41BKYmb045vc78%gWL|d=&<$ z9qA+J16f#DUcY|*OIum_H|P&1CME!j2e>Bd_8+eSq#tT0>-FJ0u+{@|w9CW!3r`Iw1zv!&f%Aj5 zay8Q?2=t%7)EOeVQAG&}2_>|or4dH_5OA0hQ&L7#xmo%731u*~s3|DAr>1^O`T*X$ z(dCS+e+4uqLPDg_=e2;|1T&Yyim0ZRmW;u!SNkeljmgOiZ7D$VE(VP4h}w@1}2k*n*D14;^yb)XVz|n&O2Ud6I4cs#S!4K+mw)$8~{Mg zs-|y<+xg}@WrlhK=?qmkpJrb{_)}d-sh&-?FwFohK^23K>;ln^63){!)wIe`T2RUl)AdenV67eOox-%SG->M zyAQ{60#j1(3q=eKHv!xYp=v*BSMCibM4*DjgO0`|Am9V?DZ5|GXFb31^lqcTua7i4CV$n(gHNMMK-0}IW_uNW9H zKx44KZ{CksXyR#}sY|a~3bC~y7N(7SKT0f~ z7ScXyD;yU=u2{h&V8kJP9no1KU+(`1qvrk-4zCe}l@JDlDHI9i#x}{xyh-FJE4h}U zh@2iwI4>|zpzq`$-WJB;guHKOpeAxEDn|7_0MKJ3L-1sMoyG0a6qW0jH?Xz4O^J;a zr9cM=O?x)5ZGj4BH|RwigZqH)bE~Q~Tds{lz=it$J>32)shgb)FJTuKms`Jg80`i^ zi4KmAYq~&a0e3QyhY<7^uyD&GD57uKqkgLW@B$G9kB|@v{mlZbyLWQCH=r!wxf{Me z@jxDBbtEJt0Mb`m>ttp|8-%kf`&3p1*j)>Y1Aqqk3Ziver$3!`p@0@&UeP88m`_IF zvgWUi;{e~wgBiwCQh0x;k*OwU>qriXu)Ym%2I1Eoc%-Oy^}x?8C?ho$u>mTNm4hPy z%ICyq4su&%G$cd-mVv#!{n6oJrrVkg>M^A;0M3cA>p{RdyYu)U;iJAyKtPb0o+S*% zr{99M{^l6)fPkGY#2(WKdh>k05h3^tWG%mt_7UUXiCgcnB0wVZf$2Us(?V>bnsEuE zV5gM-eEV7uG+C0w9IjHl?l9i;Uyj3vwf|p)^6lgM{`kJu!rWX)90bPA<6|9W8{k6} 
zosh877Ok-$Of3&gn}~pMZ?K{*vvYJ;nJvr@&SClc*2iv-fuFFXUg#5N^;{#pr(2O>#yey zc!}tbl#HInq;M0laya!sgY+?Ny6bHUntJn#mk_i+xdVcGBj+rh4d_AbIG>@kaB9JB<2-2EP|gedN&m8o}rOZQAvsKcZf_OlCG|< zfq}u??XPz4I>_ra8?dSHKk7p#mKff`6k>&mDJd&oUSA*2SMOA`J>QQoZz(||(-E+# zK_IW!g)B?88WdT4C@3iWBq*}mNhRnvtD&u2#A`&nsT>);dvZkyWk$pOMW*VD1BHnt z3TD0Wb`DgX)6tg?Z*71j(I4g@(Vl7J~kb*gbQMZ+#7?}-|0J-XZcbr@@ zetR^dA}i}P*6zl}#{NE?>)0blBplodurbi%Wnvl`A6HN$rLlpV$OpC50xONp@J~ZL zv|rHm`@R(3e8A?W7%&1?zVD@^rGc3Nq&YP;wOtKFH592jy$&dwAfS2@COJgM$9D(H zp*h=s_${!NPIh)+i=9tbA#=|E5E@2eVq&Ltz)Z(RKC%Fq=0M0!{!D;E0n$lAd)fo? zCkThFE@#GAVEsK@9l&yMtN--&e%)^cT&_62%?Y8l z-MpRPF#hxYt?wCN`#@qVl!Ma+6Vw!NWC*Q}K;t&XBF||f)sNLpEbvJ9`2!dI6yp@B z3u!Fjx$Dibww&DPJp}RW>nyWGlFcLF zCt}EgdnwI=@&VjZq1xo`X1gtT0tpse_)X&VKR_X*qm$mZA-U}CTZPfhxs&-ro~A!5 zH-zFxp|=@#XoJUHm0=auE00Sc_L9$#$0w9xUV3^Y>g;nAvB^~FaQ`?k3J3^9$9DW$W9~NlpGf~-&A<1$d4GF2<;wn| zlT*52Qbbs|{mT~~iwpmhf=c@a1gFu=c1aYVZq(4wFg!mPpj%7H zlcxbfDkKxz7^`|MD+dR*h`tgWt~m~N_PA!+Ad;Y1&^Z1d?I-^4b}Z07kv&+yTs>HV z3rR%K5{f=U(J6Lq`$Y6l(kQbOfhLjzEW7{ur*>wXj2BPd-Xh=Lu#J_aTu;5ygX%tv|E!Fk>9*6P{fZK2dT?^y1JLagh-> z818rVwqrAw-g!X?W@d0+w#ka`ndi*kfi?+^ACX;r^ zT#{z|Nzd>yl2hIMhd13ZKTet0BdY2NqEG9)Wm$e;k+)+}bn>n69kKVkR`BR&#xk<# zAY*g*mNLYt-Kyx8eVF6D=GRrq=4d%q)}my-T?ZC_vjUTlMP1@m&OhY*stV6x5Wn~^ z9Yq00fuly`mYy%0khC^_=4Pkkpu_DidRlip_DT0Chy8`D?((ac0fbQVDdrm-JOceYwt0L|MHOVto-MwQzCB+c$kEB zyLEVa4G-`1DguHYiO*S-vcFZ~|G(SB$#&RVBpxg~^A?kZmW-SDkB<i_Fj%+`#IF-~EK}5#+4tMab`Q~^P=3QAf39U8gTvq0GtGpU zlGrF9_`-=!M0%BHgi6%bj3Hz!i>oC)?_BM!bn4n1)JJ}uOQVxc+c9e58GGl%hY`c= zvS?()?k_O&mYPiI-M1i-arqow;NxaqdgXn;)hdR>Chc*2Q{W8= zdN5zB-gQzCe?)XW0|ys9O+c#8VQ0G2P>(Z4h9MTp-Hx%2Mhy+?D_zsvA`kQIh@7nL zMJTlnLpd(Hlg=R-A0Z(U^sEs_Wvp7ni}(5_OA|ZV762am7gg|J^~QI;9mz5&q97X| zmGUZXax^!awpPnYlo_e|9^Q82D8p2~-*}CvOQTM_Ia8Mdu~Ip@#6VfEmYzkrNnDxa zU@7?Bw~erB5K=WIgx|$acQcX7rm#YueWYI zrC@A}@!pneIV4$F;WRL?4{5?P6aRxxB9bKmJ+iER(BqQ{vy*NK7g2SX`Q7mgF@|Fv zzNBkQoz8?}{0hgPE{HBUM;pUtch|c4c9LG3cdZzXt`W+deV8!W9^KIJ)$g@a7z?X5 
zh6;Vz^WED~n$a5-e<6|&x_8!Mwp*fg;ceG@cra!}O}ahn0*U{7C0Gsm(G=QEd62RnBlxG50ZAze#eA2VS7pP z%4^sQ606v0URBrTu_-mr0YWB7Fbx|Ddu>{7#+kxiwR)N1*&fmvZe;+V`WL`C{|s|% zYe)U&s{amBr5xpswBY{q%^{#9Nw#~bo$+Sfn6Rm$AnQ^)Lbi%Tlwy6L@Q--W9Y4d> z$?!38oYH>l9aL!7iohR(M)AG~?7{s}uNVZn_#5Xh3kCF$ORlPZ`-@Mc5EnN)>t9T8 z4QbJ{?&9@7nCm%mteNROgWI_iO;HZ?&nJ6K?&;und^`z}MCBMo!UgB-1xLQ7FMkQ_9ea3v+nK8gdo;zImoC zHxqSE_JWksCpnGQU&#%7I!g_SGG3ZF^hysD*eZWE|rDdb*vU9plpEwt*%7t%C>qp1>TL+G2%=KB()q1|x0b=S9^L@i2!*n4M zRCwC#;NwERR-5Tp1*n_{@q9;Lkp9bhn~U&TRe}D;zT6csRx?FGjCHyT?@fE3xlHG` z`URQz>o~`uv~!{cghA%rn5}YMC0LL;XSS2#=g`%&zcul3hn;`#`CERQ{`oU~@okN6 z-O3~3>PpEwZck^eJca&XK@?U6oR*UeMXhPeu7>?DPIK=of|=r@d2Pa}7T$bA^lCa6 zPGfTCn5%0FW`dmW>liNRiC4$OxK(m_kOjYL(6@E5W1(PosB}{2_@{tEWzzZPFU}G3kJIu&fEA>EXMF5Do@v{cR`w>=*(f z$Ih7g$#}RiL2LVA#70A0O_s0rj?2Q7t;?!=90NxLwQ^KK*3j?iz2jt@|7C}pg#=_D zj@!1jlwR7=<1TAf02~?R!1%j1q!hNoF+s_nybFmJGzU!}~%7 z%oz2l=g9+o#BhE-+Fks^Y?vBa=}!x+lCE2AC^kAX1qZq0&NuuLzu4s4P!COt@)(Mf z2-V&(jQDFjvG1}Fr7#Uf4uB+B`dfuQv9z6^lPGr+b`ICK9g9_+qeM3TgF4s>pfYLq zhH~=P4IbGwGH~Fi$4L`|=|Y%uMB4wlV#+*_V+dHGf0I$O_^RL5Pabyvm$sc{E#LQ> zriOF1lM16H<-?3+Q85L}x8)g4mDquODEB+})4L2XpEUZA_YpamS(^MvP_2lk4I171 zcIGLqQkYJ;npsJ|zp)Gx{8E_7KRePUp(Y=FP?&daz}(c~Lnab+jq~j(5$!|dOBLEXVdoXnIk|g|Jpm2pCjEXRx7iR=0Gpg)j%vmJgEn4IfI$X z(#xsKxV?8haWFk9s)TIEBl`tmzVEw1LjlTMOLXuyo(ns5?nb$;Q=PV>>t7tnYbW_R zWNzDYmgD&e!ykh>y0kbmbJk`8ZEFau#2KeE+!s-PxjxBix-uS&;(x^1us<&`H#4lN zi9cs^KCqR|#)8SVlYVm6GCG41?{&QaMwdDMQgr31j{COXAR62;)wYob4iVuV-ZrBf zxh-j$4@Dn!d15VoBd4CLjD-CV+n8n+dJ$e7dnG5Ie#+$%6e)iGnuBYW^ zr_VkQH~fIA+)>kUnwSH7oCjW#QscE#sp&R1mAEFPR*t$W!{g{uMb!EvEo4BQh$v1+ zCx?!nb$K9lcq4Gx9m~+b!!DW`!Z*+#fs_7eCk{yL`O|Q6Da4zSV%x5dhjy1(Z1j;} zaOIT#X1LF5n$g!?y3Qglxg-fwpn4v)nhyd8)_?Oo8G1c8zL-ptzUdbhs3?hC z1qoyXj-D+{e2bFL*S3j!Cu=tk>4mB5Ycs>FLEb%E_IDoKc>coXjoUmaLmTjZ&{?Pg zlx=s)B&Z*_w6Y>45=fOKvfW+c90`bYUujs@SNtH!RT?<2NKizbsk-yX5{efNh^0Wg z^>`iAM-KnaP8t60WvHdi;G;WLVAob?e4tuXOw^LN?;Bxc5>hu`m=jkh0Rn$f3Ywrm z-5_VM$e;j+pA$`&^vx%7YL_E*_lPsAxS2tcpnv_G3yL&YBDSEjBW1bVz^9 
z>h00Y8_K{X;Ugj1%wqF>E=;;8*(@n!3RL){Z{?9f%6=ZLlGX7@d8G2eq758P69fwj#T|J^dhVMC2k9kqc%C6&kx1kqWR`d#18KPn`s;*6@e z-r^0es$e-Tb;Q%@GT0a(FAJbYZGJ0yFti$*HR0X7^~K?)RJJ33**Jw(S}HYmI+>b? zqa&j$aM&(_Yc&-;dKpIi2AG_T^d)I&g=6}OXY9>bWuiu6N=&$5(e>GanZqBCE8)^D z7jz;tsQrWT4|AHb2S@A6<8Z|{)f4|O+eSd-HMA9|BID!<-VixJG11C9Dphh&?~$G# zy`D&Py_4Qk5T}e(qxhuno#ONenX9H?W*t9&OF-J&O!_p&iQV% zKKw?G#xR)TwyYH49sKW?6?}^tKB+iV>I*NpRA1@AAO{K5_O>yob0 z=kLE3Z`pROe(!?k*ZZAZCp>%lesg*LUZ1~}E4DuuUpX5%D+o;LE?pI`w*e>PFWu5u zo3`pw>d7bTtAO*ypw^iOa8Ruy1UPjGW(VmPnC9rs^-`SJkt4T%*3rPplPmA9ovSr1 zqe>{%GU~66x{Tibw!>2w2R%7Dxg_NJ#;H$IqINl)PzR1xfDO2$TX2*o?ZZZP_v4kD zEB+`Zn!gBn)o3{H6<7ZGFN!fmnxBH4%%-|rTW~r@@>SldbtU$D6CUnNXKHNF7g)zx z^l|Hz+11ZBPM>*B!TRwuJ+V)P;Q3U6Ot)REd57QLSi|M^B4PVq|H;?y)Rh`8*}id> zlWx2J@<6SbGCNMJ?s&h#)=Pg@*{6dyHTRyqGUb@;&cz;!bETi%d;0$JV%f9xAGY59 zsF8Sj?~WVt?f1NOf4p`(bM<8{hsCXJSAk0kU>>+@9vfkQVJdJ%m3@u=@;MhTJLTUt z*skgRE;@Vl5wU6FAy;`fnMOT(f8Z(ij+%Woy{8^?FPm49u|{kEnfr!c{Bk;Gi^jHz z#Yf!BzrIzxn{DN(i+7y#b0i-DfDfJv-g8vytUv?#{!HtE5(H9olyP+iAzLW5%{F-6e%To0?YLieJBP!t%gvd|xN1 zZ2BMhsVn>9^rpW$TSGY>rK%tJ5XN7S_41ISJfz1uF>Oi217Hj9+>h@UAMa#-l6mE_ zzVY=tRk{Vh#)ypc`^%24e7E;*i;qo<@JE3IIPO#CpuTu%-8aLZPZ@WwyZd`zd`{xeHNpMzXJ4*g?-XD1FEgm{>QRCH zyyfw4D`phr>?sdtE@kW1R(Dnu_Ei3M_sgaWfBkBX^JSieRL7TkM7U)d72a(8d1;q} zdb;x4&l@9W%N@BpLy%$NyxDrC*8)%Ktk+wU*js24|04OcFvqmGzCBYWXge#dxcty@Zo87JH!17b@gSBhC2`gMEb zO;8IqW@im!_wnE<3mq4@zxb*AxHjZzCdmBYp$!9YMjz*dl#)<8gbA%}o~sD$^) zDI2H7ZBcQcr6H>=d*pGveochWd4zwobN`e-b6CF3f4z9_^T+K^z&mA4O*|x^9dSbT zSTOLrgHk{O{M;czP!JG;1I3|;2natZ{(B1b|He~e;?Py+hW`HkhK7clo16If_}*UG z3sq)qLG$g~w^LJ7vvYF{k26RJ@{)y+(D(hDLn$yAEa>6pbpKDTR9G~HSoXJXg_r0m zhZhekfj%z^OOe&F=;^o9dF+NqM(CAO#dm)pAoxl|LA3Ss8Z8Fmbt;T&pLC?9EhdXp zNJvQDym=ED8JWna)!w%uE{6Eyf=>o)W79u6%Erd_3`ZDSVjv~W2)MucSc&@r!B@ru z5-L02;&Z;XzCOM{EgM-`U(d(Q9b$_wj@#JS$Y$K)?CgAVvMRMN^5e&kwzjr>*{D4C zaRdaT{Bn?;y?rK+-SqwS@%X|{R9_6m`N@fTlR5%BFjV%-EsYHgM6E2ewD<%BysjJa zii8*lV=wmN&D6zu5e*g0(VyUrHzOP-*8 zf#5DH1F^EVKiJv|T}H#g0>-?+-34Js(#R;QrpDda_#Ht+!1YnqmoG1XN)UE}^)+C? 
z@x=xgZ((B@na!C!IfRne^P=#nsWG-c@w7@UEiE66$PmO)r{ck?s;UNc_7YCQz85xg zzbq43Um=K$Pfx$X!m_Zi7#D-FcLn2@FAX1&u21{xX=VCqfG|E+L#b~f#N2*u!KhsB1|-Y8NcEEI{?z=8kNHb0gtMM^^Q9*zK;tupTef{|E1up33J4<*}!sV(8;cPig4kxri}W%@18E@FAAt09Z0@*W1X%#n`)knegPr{)UqONVQNh2S|TI`+BJQ&!$yqp|}Z1NKGA@0Iw``wQ5uTi35kd$J}G(<@G z%DYV|{u?5NErUQ`xw!ZCP-4U&^I`V4n7Vi!(evE!pszuHJYbZGj(1 zfzOJlSQ%9nL+-JY%B2d8kmwi!l8#Tgug#1{}0 zaT6?KDEuaUaA61)H4W2j{D85EH)taHry%cT>F#HPUeh2k*n|TlHvHO)0Irl~DukR) z@t(%e*!C!0U*qb~z$a?#h;LkOGxg381HtCs2nKN2KteI+^8S{bbh=^Vq=UL2hW6r< zSNT=5GIp1XS`H<#2`%V;{k^j;tGR4$z|+*9j8MPjXnwyu-u)L$l2T>ewjXbrSk8lh z@(!r3_5AAUPk;Z|j)I{s_t2w{*IaJoH zcTVq@_AyBqo7bqX4S&m00-8%$(J+5pWaJW2?0VVU5wfv|C|#$oqeuFm#wSwm$_+c_Jnaf6 zb`3NCo*yMrZIt3%41vtt7lX;diU1U1#w8J*ZVvafH|8$zoywX{~aR;sFx~ z$epDKLN-?8mhVpy%i20Ra$%0BIPV^kz`!D8H&ZU-6iLDZ9!h3GeMKgIUh#}6G0YER zHh;`!cZILF_$kdTh=F#?=`9W}vn=Zzv`c!t?ZZ;!DZb;?V5zloyCM8QM!5V8gf0k} zp<7#9wY9at+G%EHc6D{Nla-yF9U2;n{6fOOfHa_UXFMO(X5V4GN>;Vi&nH`!oIrpV z+3~HOcRm`E+eyH|si%@;Y&~3^?TAQ) zVo7AM3Tij|oYSFvC2mle5#}axx+qCkxHQvFEfl|J;FK(@&76RMEHdZL5A62H>ER(4 zi+iRT*bhp;fV@}p_U0F4?pk$jKN^R_W#sH5BasB1WQO?*nH?VA;m)2>O`59SV>aQQ zvyG(H2)!WhZrt=E@yG;alvx=dB!K_s0AY+U2P|h7mzM_ZLX3=z5)u+S-yMSM736?^oR@b6cd(GJl;_?+}7#Jvr57L)!J2{@Aj&*faIo6T?r~6DEI~q8Mr|Y z4-dtZZ!gB$Tie>Q6-6W3`-X>ydwZ|0kEKSYD@=m+r&%s~Q`k&^7XpfDM3SS@HYSBYIOd981U>eFrUq1N#tWaMrj0y_Bb_Go>L!<@^@{h6g^Paj|3HBwRrqvR(V zCTh2fSq^e?az#RmpCAy(ex=RGz`$>FFzIW}fa3GznUW)+f^KV+s6g%JYZ?s#pWfLW zZ~2c%xI%%NpOPnAt`fk@bdq;LJP;t1yf*rAXPUj*1MX${$Y0{&O-?~Hao!1X2R-=% z>mnAKJdYx@yga-7XE5y>oYXY*lXJ}`{>LK3u;%0K?Y;H$*-5wEi9%rMe=G_2;9POv z7~o@Auho}S$uXBQMTlK3=vkKJ^?bGg>jWj!kVPFT0iU!X7o>5ig+gx09V`*iAbT~9)Sx?%UkzJ?U% zU8J(A>S+H@cZnTLjjZb@2SZQvPI?}xkDdqD`}%nXUxO;y8savaqwBj0(&KA9{8#AV z?}WW8=eucoM%6&_-zS3Hp1H(4hxw?e*u;W@XKoNMf=i9|aMT&}w8cobUb0nZ2Dt~)b*S@dMD zCFVNXk_MT}kx^d{=ZocZ{ zYaXl(&H7=g0pxi5Ck6Imroa-v{E_7B*zJu@Y^q;w5d9makNeI1iRQWt4Zw9VVrD zP+F(%tgXK*-*R;jODl;GsaEg0!|pQ+6OX}O%aS#fkIPTqsneM}Jd>=$@dHE2?TOY( 
z6qfe_<$1*O-@`wkYedcC{xA_DpLH2#c``8HzeesF7Lc`wzoFqcVjInAVY4IQRFG23 zjLnKmOVR{go(n`8#^2pLjOzU%T@gd7ze^D+I(~1A)>b9}_1MC?KAl6KyxfkznGtF* z;(oiohx*rpV>^Xa;8CwbJy-CXY1V=YH6o`ufd=;22Vkyz0p(lWs3FSD*z-T1*}3VO z>UwVyF!SkG4a!@SPzbV9NgE6>=G+E5lZrg8wY`nPm$cRR{nhX2n&L)|@X;m^aUJa& zOx4jkCH$gKK@N;p;k5KAn||>*I3tr&Dcm;=I~C4Rk*P-V(+Hcw{gm1x9(bdPhvp#5Z($ZAl%u(TGwEUjj zNdKUKf7!)GxNy*n%Y-8o-kS+eb9y`@twQf&HFe!w)tVeQ$jY8@fY#dd`_v45TCcqU zhae6?DxYotx@bA-3t5WpLS*|fg&%2lg(fcZBOjp6k#FTLFShYGk+PiV;dOpU!>=i$ zjD{?m4VESQ0|Mun8Y5UZ&)5vDr4@~y{UEZ;N+&Jr@8WgUd$?s)?cjn)FSC=)CmO!Q zF4VU!7o>2-BxPAw_KL^F>^kwu*nUk1KdZOKTYb{ReioQL427vV>skp!Kj>7WSNNXo z#A;K#GJ2HG2dZ^8$Ti#3lmGN-YleSswarOehZ>fx3Dff!#@GDXDtgP0BKr49U!07d+ISLX3;uNI$EoJ&JuRos)m&s^Fh-D#Zq__Hn!CRU-il%%w8X=&fkBP8S~De|SN#{&$lBz&|QSKG6r zZQIy&@v7RI_?fu4o`nz{@Mc%%kDgMnp4VoHRfc1rJq!TJq8Jn!?Ji z?Gv{iG6FD3lzGgi@uPN=O|?{*^GI8%$O$8lu8!#$;Obgz*HD&*Sl!XzMa~y08uqp!2{(Jq^py^@hE#?PE(pgTCIUQTvX z5(Tk~(@oTF-GDW>z+-czps!9I!SZh?R z=5jOt5w8~uLH0qqZbd9^;^1hp0@pJ?(5x=kNz#@mbap~Zl?C$$6 zMlk(a;HQQyA=+C98m9|RG$4@=c@hE=DAnt0N;n#%?hIu{Bb_VU_D7@F6z@mZ;x}l- zI&Uw?;$JJ{P0Vwc`oN=Gn!sa+q^tRLwrR!`j&TpplQgdB%qj)*+oL(?$Y1`hWa_8K zy4x4ojuh5!C(YM>H2WifA0FU@UdBi`(dCK3GeatR-qDet-6MH-k3*#RaA!(Qxq7DP zHZv9f%V}lMF2u7vT$GXTeS*~GOLT!UjfqE|b8a+p*zEmU}Bhgj-PSX_S41I=(q3O}Y+Ja)uuVra7zH{v>3JM^f>vu^KDq5)%uWt$Zamd87m_ zoKrE{n&O-=OK)1D2k%R}UPtL`j1V_lT%N=(bR@xy@D`?2_&S-3Js+!96Ruc{P5A~H zM|)Ds6fo1}dALV`4YI47TVq4NAj_~*$ObjzLg*sK%E~Gf+e1wogSp-0&j&Aly1No( zwMT~BGjmUgmi+JzI5+D$VxuZW`3W3m-i*^owN*vV7J5&JA_^~QH%29zcR4vZ@xQ-1 zTyFI%)rNte$KDs~rxmAFUJmJFg)QHzkIG~QJ%@_qLqQ3#vGQta?cS$rf3k#{15u7H zRc(0|n5p(J@LZf;$zM%SBqV^by8ZpIZ6hl6nV*}A)A$J0bubC=@nx>-fuUw*W}aVB z>j0TA9ohEz-EgSJ~_QoG;UJjv9;YzOibjm z{&TQUUlHRZQ3fkBUESJq)S@*?_VT*l*f_RezQ7N`#t43i0y&ACVd%d&@Y`wri zCDzWGhhZyEK26Ij>7kHiZx@H9e#8nnU}9wOIXTW&eoji_p`(*kRNNb-|7)BcG1{CS zQ)wq?CA=`TA1I~=>HV#KRL%xIEBNx+tJW(z;+0lUdUf@AkxEABD{>N2(g>^PihG81 z@KeQzH(ohRvp%8}Eb@8);=O#?`$r*&jJtRr}R^C6u^RS<#0j|5HF)GF(tn-LBo=eM?AFm 
zs7V4F5&iZpRFtOr2clG%v?s^Kp<`f3hB?Bnalp{k{Q6N7a}#+B&0hmemfUD|$`D4l zx>7Rp>FFsh0l_PDblg4v5q8PWRmfxvJ?`vOh1E;P-B!Zd^H)Jy&e`*V(d02Yi`_8JtorHQaZpZQC6=IXHp3Cf}?|!Rj7Clnyp| zM;hNYPe|AhC#vCqu%DzK@NnY-OlFhzK$-h!LN?=?E9loW@XZa^MCp;m8$EbgPFN~d zFAjYP3w&^B2p=Ec&D9l1u*L!+UQzQFO!>6^-uj6`cT4SZ>>$tIqUHz%5Dl zE^BMCzNIDYypx5MRbF%F=g*%X+0E?Gbj3_hVQji-$@>Bq(-RpLlYAY_-yQH362zWe zq1YC!8dVPu{@B=2)5e##=i56C4zD$%Ui&HN3^s}<(qk}ljV;j8(f#@J2S5c(ZBFzBtmYmS zLvgvLQ%A_9B$=cXjh9-;^?qO_8MjG=CH`sNUC3DA>ujfzB3NR$G+Ju;=>){SO|p8E zXj3_{rsgDo&nc^5_h`a=$R77n_q1ao>yoI~RSp-ON}eBRS62#2(qEgj`RA$!Bp^z> zzQ~sl!@anIYLx;&ji~$1m}sl!{^yEot zzbE8k{Zj)E90+Hr5pp?cY3cLtu%qmwez+Pb%4+j(&c3$y-lPGWiBG*Cz_2B!r77fM95q)~vd7EX%a2bQ zp%^$7SMATfY)VDT!&e*@E~PTRB7TRVpaV>4cHiXtw|Q~ag1%G5}%qYF6OrckJr zsF;c7=H@P8gpZN00!wmeD9XjvT(z~7lqAX~8V%)EUisPOLGMvc2e&`;62Fgn^3JU> z$PUV)Tlur7$SU|*ob&Ig_XTxHX{nUc@k;wMfFg`oW0nS0M)vhrCnFAZ4-bE*+zd$n z9uP`^%Wh((tu_bZgehe^n0WgZn4iSi9^w=wh}@n# z70mygVZv8ZRGgTYU}R;@=66{GU^nEKFHvocu_>C};=&yeJ4GhX#*-mg(T3avJ*X*> z<1;hy>qpz`@gV++UIPe~S|n6TdO}j=4ftexx3}In(y~Z>>>iBeVT+URLeB zY7BpoSCRS6LZP!cSR;q-Bt-|hsT8Lytmnkh! 
zC~>Xqysw)h)P?B!Lo{ofUQG%ZNw-4VVmuOv0NBvQ&N$ZBT9jW;QBmD@ zs1(FEy!>L#<(%G+47G*&z@!GA|H84>Lttd8r{u>gb!OAqyoDV98LZ>Sk$%%mG9H8C(1b^)EG|Z3YHG@9tUP?qKDlrm`0xe^zwPc~Y@1=-tAW3a zpl@q&rLFS}-SleSQZp34{T#C*+tTf^rcHjErc+K|HQW4pdjscebKZZPMZ#l)7B$rJ zEFMeDamLg2R6?P4Ff92zDqD;FheP0Y zt@+^lY{y8b2DU`3&-vE3u(15*CY$52CY^EB*7~a8wgtZ78DmSctBX&(;KAX<#fb&5_2%>k%ITB<1{m!FE_u{PxvkVB1DS=+^WBhcyHH=@eF@ z`C26NvD;%dY&3{r$svyTeqylYcRP4^rdOAfD7 z1GuCbtN9ocRC@ULO&?qScMuMTYRuL`5{wjozx?^ek{`et-kCT1MR@$Hdm31$+^`CO z@pv@cYRUe zi>QhxJ?cA7Cf*ol1-hS_s;i}iub=7|wDZ-!wN}^vYY>msM!e8KHl4r7b(}?^{-&m; ztw_o!TE2n<>3jOBm`6%EtuP|+of+(FCrF{t0-TU#MctO zv_ou1xvYt!lk%zBYAJj}uwXOjA{`9Sz=ef{x3{+k2S@>*qr_^+j4Bn9Z3Hw?D5nhJ zGPFC(zl>SQ5g^m0qHrcHXh{hXrv)0R!FG&quqHNxzYU!Se_p86W_r#!N?V7qy_6P2 zTT}Dvk+HMu6}KS_dzl#)z(Qe~UGpdtxpeSO=nba^HlcOC@VU1kQmE5r7l<$5O>!O< z6nN+4KR>1KKNjLZLe==0TF*SK&I`-@)Oglr2uWvjm(7<>mhJi~N|kC2y{3@NH{v~Q zd`Furx+N;H!ZX2GmPz zHA`LeS~vM`(8)PY+GGA&#A|2VyAe!Mi1uJ^I&yZ zW%?7E!qs$QNst5NYw!NHe%VqRO80x)t6;a3a5dl&;EizmM7dE8*Ry^emF`Gl-3<$R zNu;r3#po-HQR+{#z4c?+vn%#Ty|uNIt3t=P1N5V|(eQ1dl4_7PJE%m}RCn8RQvT=6 zSFD0!{mcAIt{7>6odE@umeAKN_YI4>b_WfoJ`)&Wi{)SOg>94w!0$j6+=cBLFvinA zcL6#rn0s@@M?P-Wj~iQU$PAdXw6*(s#KrR-PJJE)naI%gabsw%-;04_xRriBakp?s zH}zVbas8#NK9q8^Be~k+K$DaxwEqAnz)vxJFR*x8 zcK^A{ug8ZtUpzz;9nI55c6w1!-lEE)TYg%rhVXzLqsuyd$Vfnwza-dEKh?L0Vo5HD2gUcDl7qSB3RyMi!s0bd#`dpm*Nktc$P`KC< zpA$sQ&Z(1`+o`!asp_ueKng{czFLl1B=C53LP8+zl){DsJ{HZn6u4SO-m!^x)eaG& zxXpnd+*!xd6p(1#UQKC`XQYfc@vH;tf({6GqU}ZjiD*yHP0h^x@S>U#^VJc$)vR!e zIx3!{f7#GGrHX>Y#R_+qVfH^@2tfn~Nn#HMZPr3SGwc?B#hF$6NzyCnjZL z!OxXXB&E2nO9W-YxQ26AdS;bnhMB`+XzSYKP2L`-$cii#xg#JuW8O3Op+)?e$slgi z&>tkTB285vw?-N2mNl?bEB$;vX8LBcHEUifO7^)Rl41^VTm=lx zv+yX01tTFc_?rckgOd&g6@Kt~0)X~jpHJ;P7F#KoYWcBPfE-b(0J60$%Zfvtq8}M8 zVdMlDntLK}v`=X7JS6!Qdx)j|s^)Rvl1tW$=;SIEEUy*cS9REYNF9N$LPUVLG!%Hv zrx53Sh07?#1(~Lg1-oNuM8)3)VAE8!wUm$urHA>U;b$5}hH7HVFk|q4`sBOoK%5(n z*u^dB*t1N1KRud4_T+}{s{;w0+6Zkl=J(yZ%3$-p(9&hSRhglfW5p{Pz(WJbdXkqA zS%t4YEC(HO3y^87lx?@};ZD{I%^_MayKWtaCii9XpBx?;@EOe_`C;Ss7>KKZ)Q&p- 
zaaGC{)IUj&_>scyJIko$mgcg5$8BO9@(AnvHxLpx2GHC0a2s)tUww_##TB=O+BGH^ z<7a+%Xd1^Q{twP8|z?G{~x5CMxJMWQav*{HdzOBaXB=@X1S6P8ns5+#b<@p{^291Jn-GScL04C>(CX7O))uXve>4w`vvR! zv{CuR#f3rw-N*lzg-S@6Y!mSlMMjQXU(a{fyUWQfw46?1J4HuNWJTM-G%;BzR7!3M zdhi~sU<#eYO=$I7DK4~paOrdrM+N8>NLyQA9dvJIG>1i58Ts~>!6-9uGPrB$`Wh$V z2Zr~R{n270zzyzTB1?>cFmW&!D(%yV^in<*IeH5(Gx6W3_&;cxn5d|z0E&(aF?k-0 zQYcg~@E|oTOb$r1m;4C;ObI}4!d_8mYik#|>xr-M@bVhB`QI)tHCwKiMQm@%&|e z6^%&y)@G8D$@YOEN7~m3T2v4U!_`Xj;@Zc;1IHAJko+dEyRo_k#pWzUR% zBz`+EX*o|wOIZNp(kVWZ2kXB+KO(Vnyo4sto$t(@&SkcWv{FuJSVbEVnEus(sU&SH za(ulnRC%ky!^woI{S&n(K?He*M%S^EpXLniYIo$DkbBKush3b5+k|I@L zOVHJ~;brXj*bbyurk~!!B;LPFndii^%e!;i=! zK}!Fn_enHhOh8RI1tc)aEh7VNhXtX{{asNzpJ%F=*b1*?d3%4cZO=t>^FZhbPXVWU z%3j{kZs+|njby!q%6mA`Pah2ic^(o?JR7>!6RS(JnFD&JW<77&vcZT5mBB$KXs<$i@|+!m_1M%|QYAqU1-%QMqFXC#sfSJBL(?A5pq_`s**hVwbG zUBs^8Lfr;E&Sk9^RgJlun3&yO7!O%4mRoP-gjVoFl$2jKKPHk!7V*=8mM`sjgKfJ0 zWUsr(Z`!Jc?jMPS1I&BXJ*g)q`Hwk0I;Rlk47TyQf<)c4cUMLFzHL-E$%v4VpmSmF zMyw$wF0aqOevb-D9tRSh30}I5yv|KK=P8tno5jCsm`heGrs5M2r4(q!;!>3KSyuj7 zE@o}eT(P^dC)yVE!1%Mgf2GI5!Kh;)wm-2GOefmdsQ)8%-)+8-1i-YfW}2qkkh=sW zZCT6QVPW*O&tx-{->%N-^jk=p zPGr+Z_1hf7?7H_dzZc_Bi25Fo$Dc{{wf^Pb>er93J60qyAbDlg?;Z76fBY_eXz~af z?OVt8^+8q85U2T*a9SKbI_0R8*$;uY+ImL()mlEFdwK%lxkj_?h`7)#*-6={BC+PJ zzb4=Nk9-JQtY1)b>?iN0mpXB3o-?6gN%_e1c!zPavnx1Vx0o}Ibs)d{(#g@`%jA^IM1yLKTZxn^zc$9lF{;LYET{NR!`0k1*F zlbmUzKvw9N*^o`6$CKKZZj~|Z?a$Sf-5MQ6ap>BStbTRxT%%a*AT-153S8Hp!g}(R zjOUKH#dg!?TRdO4W~8aDFLLFO{}NG&x~kEuuuth3X{LsU*Izlczh3^$#_MTgsko7` zND{5!%ga}x_u(;{`w&u;LBXoz^VI*?K|LB=|8miajkEmE?pP_crBlVxqAzESZF+L( z7jaF;hM+ZPm2k2?RE8)A$aE~}){*b!fk4hdC>>qZ67ImZ_0KFnkIwZ4(cC~U$986W zd$ai2Yr)mw3zGoR@TjD25OpcS22VUIQ*q1)9@Yza9{n$g%i~!Me=3YEeB3mzfjm!Xtjgf#+UUVDhEF8`AkY{{{{QeEpc!ZvS8XiEF5hfTalE`o2{L+$nQZ zI2!BP`=l-LyQ$_>*0soC*n}&KUin}FyVL0J8pWEgo9RBB!WQlJc5?FlP38rSb6)*> zeWt1r^PRuwIvg9Xj(%%spL$$>$x5jsKc%snhcF$mtX$FEZPZN9XouNrj`_4R((T1s z?Ca)s*lng-UNcq2-zx}re@*Ou*`|t7wd8TDg5N1@epp25V!OtqvZY3Bbb9vVGVQnI 
zi~N)1--)GeL7p)*_=F7TjsD|eloRWLAcnZl9&IDxxvL-7i`ZKlcC)mF+}pCA>o{nM zvr(@_2Pr@G*%C87+;AFpeR@*%G3_m|TNue&x2xet-9PvE}XOsf#Ibyn(PkE?jIb0ldt*2kf7P`h0tboJccL~Moi zkF=zUPMfFjdRF#h=cVnQSI9?ayFQTApcITdWnOD<5xdp=!|Yk0{2bH%w}?NoGwz0S zCcD497xh;JdrfjDroma;nD(#0pX_tM1$^4B(cgGZ_MN5U}2)qgY+wD7O&c3L|A9;@Jl@m2JO}!&8NV}F|2}FeaQVfsMsA2 zxyX)PsQB-OeCOFk&B8VohXjyi+hndfaCnvKE?GRnw+TmZ)U86|0BgX=b0`r|larbTPRdhl`IcE%$m zNc8SZ*WmVk)*;bohQyj6q|24zpyfC~)=zmKH$T4Fcg-$tqGA_Cnd>Eg|gN zf5CjJPXYyXi#Uxf&fIl^m%I5NwBcI%c~V5qY$H9}Mk8{%G`v1x^P}4iLLoZW_N}RZ zU{qAeN}JOCT%`;SsI###I$Pn#XilH(MvAhpj{N4CITj_2_dg#jfpEG^1cGDc>4o+U z1@BB@SIS=Fn>4VZ@w&6IPm4d~v}U8p`84*bRZY|EzGnNK1$23Cmf>(`)SUQIRs}$L zkjwBU8jT_ki5Pdn8h!+&`Msm*f zZcuwe#4#V`>9Bq6R~PD;#UQr;1%v0wzqZ}=i}~r69V2GXwT4+H<5v#Wi2@;N!qbz< z=;2|-jAUw5mHUk;-sLv+mXZ@6WxnORc!yHDv09^K#8jN~;G687U5a&~<_ITyWR`Ww z=qrv?4bicO>xLA~Ln?XM=CU~-r5P8>@K#4*8-Z6>Jvgd6O~IrW-4NbNq}VYSN#yr~rPDkMcHkt%e%COQprl(0yLb2hb)$ z!w;kF4-&zhg7xiBxQxra4|k5_Qr|9m6|JCg`Un%G3fD5yNr`Dt4auCuz- zV>HtH{*vdCne4#9gw5Y2a(i55KXaWl70$Zt&Vd_-IVb#aU*8n(SbuHCK*S*})`m-E zRhR0sd79iQ_17yAC4&iGGdU7526s5d%KKkCR=C;|dn#)X+pMeq zc%zKJ*r!BR-nZ>oQFivzwywn4IlaY`!y|?Uqs2Q>}$;REZVqdtRL5Y{+%jhuQRDJw zF^g9Y7^~rT6du9y2dZD-m76uO(ZbD?#^D6mGR*@no_tk)(cik)WL=fC*FRddfALl5 z_|$ktSJ}O41Cr2w+FJkTZ<9h(FXx|4jXX)aV-P)OOoF5YTZBR_f31!uO~9ThVO@8( zQf3+v3-zEA>-HxU-1UC(NmQ(RB`rc6;+W_D2Hte=Y4 zoC~##4jdp`yQ+5^Q#^_CujQa|v$~XUgf!scqY2i$fj3=2Y~53;Em*!6I%VOIS4NOo zd9ST47beUa;Ov1s__X~Rjm)wGj3-Z$x?StD>lUS;xNEOe(_A>ye%E!^Zp00L+trNj z<>QRz@ym8oX?2q`IBxtBbo-SNW%`z{4W-O3>o2AMSv|*+J7o*nO4FVF8FbQ%)hYe# z{6;fn37{HP8309qWEh{5sM@Cy^R7CUxG7U$V#R71f|Y8g^$Oy1#Wd@!Tc_MpfN0+e z&OF-%b&mCTDFs9d+2>JcMAh_SKv4Pt1)@~oA4eC~pp(tjGqLs+kMLK$N?y0-SjTdU z6HbpRDr^vf2Gh>Y3>mD>t&l|~5vRs`klo?UYt!>sZlQ6~J;^OoZ>)X8`v*b;p=bS{ z9eoy=%dJjWEp`FtPAoS|an$%#UNJ`k2M>E(uN)$0SF1-uULMBT?)oPY5n@41Z+z@Bi}A;s#hzC{JRIQ1w+cudZ+G#`;TUaL0wn% z5Ls7q00*!D6mhgk(~Wq&?PBa&o63p_XSb0l&EFQ<3Pg(av%*AI z2I*^us~ucc2}B$k!a#Q3O}-BrGp?*sF)v~^%bdKFN{xqUKFPuS6ZeZ|c3t_Bya-M! 
z&Yg(r{>qjgH;wBSY&cjz%AlZy1+C&`v0nodPYw*YT!TBNbi(`nW-J>7PaV9~E(^)-coLZZQsAGX$uWraXQA2XWhPOw<@-V=J z5=Y@{V8;To$>i5uqugvGHt99j9&2-|cyAq=;oZ3rjzOnJ=jRO?kJMi|LSvw5@B>## zX2D|pp)bd+PU(O*V6XvL21A?GcpmNpesPUjp2DtGnmKaFA1mOAde)kx9R9u8w&hBK zWli03y7I%V@nZgsem~^SNn9kqM72fibqyUBg+RON)n*#f!_!=ub1&)bY=XG8S72mJ zVB&07Te<;yhxfHzRK*QbDd6@%=g+IDK6@VhfxEBz)JZiecAOYz&)4D^4PtnrVg#mq zby7-Y-r%Td^2g2g|Asi`ocNcH9gojf41&BK-ZnH|q7F4|lhQFu_C0+uFt(`4*ez{p zPF8YC29sDc5^&hKDqd3KldzOP5 zU*C5A+Pv5b36a7;S^3i|1bNP`v`u4dnykEWEKhMcxB4$5JtEt1aReybU3dUtds(*M zTOBrQrxFh%VQ`d_RPn3*ZC{mSpXi(~`a))$KO)*h4@2?}a%W|=4@pgOVB1GED%?dD zR#~Gr)NtrA^P1j%jCryU8EL@94CmdJOJ3q$4|9lhlv+oYj7X9Gbp|aBxlXz%-g!QH z7ggoW#!~4P^noSETqSf=;G8^LQpeb6(snLrC%1^HH8O}7RTI%DMFZ>Ew|$XRjF1}F z?T}}aMY5fb&Ci~9cl%MsF?4d;JCK%BP2g2S z#N-_xuP~nWm%cNnlhMX$PAbP$mEN^`&9BW z!C=)i-~QG^^va2A%w~eN$W!PiBDsE2M?>LUN+wPkyU4v%K2_0|{p@-Pk3(|Fzeo+) zbFSP{;$M5A9kNc)>$Qf92*9=L@HVW?d zu?FNVH(Ztq`>2`twe0;{UoqBa)`shD*p=6IX1UZxiiVvP(n@3hQ`o39U6m0V-ZTH2&h4kh z{=q(c6)E7Ap6x*N*v!mA?m+FCm@8M*cV?0cdbH^Y!N(cQ2n7pnig4}k3d2Pl>sa?v zo36&av_=vlQz`SMX{2RM+X=TU0?2uv_aefq-jo$gb0cva=cRXN%ab83w3HG{7z9Qo zolqPQj@Du^QgX6KW!DWZ+E+c4gtnm78<yF5IKf zA%-}D<{%sDkn-956B)cooI>9BU;pY7g~p%X#Bzuz^1ZSXUQM@!)2}aof#Xo8O77mUmHP1${}c z_kMUBRrxI+jJ7vW4cgZCgx_=YG)!~3amJ*Z)m%?~JYQkD_Ji1(nZ~wuB~%9YA=*e} zJO{_QMx>0+=-F9v7JBqScsH}*p8E^9Is0+Rmk9?{f~c7|zutWmO=K#pvS?gY%H$T_VIM60+-gmh-khN{`-@OMi$KKtXDseIHo)1Qk)cjfF{Wchm6xp*h8w@;iY`U?7k~gV+>id!abAK3An@NYnZP0MEs#f!(LJ zLtg8tV33ax&c0sPfUes&P@70=r!e?WQgd^XLZDH-H6LJN-g^O&9YV`_h|%T89X`sM zpyK!m?TRUH9}UTD0QYZI5{MHq@k5@ zTz1CS99xz<`Iy>uY)au+skv(?yr@z5Jv1A|Xs)rRtOy9~{O{r7F|iXwIlpnT{xp8y zj2$z~`;8>Ui+}(JY;6B~f&TxKq{9DCTVGK$4$rqo)pA6mfWMd+HU+&k8K(-}s(-5Hva+&7*0%`3XPlgzfRyL_>Cx|k7w{0<*pvVwtU}C}CAc1i^MIN+ zGb@W&pX#4z`k#NC6&E2`|EGDah~L%h{Cr-$TqIFkVq*8`s2bqOhDymm2yk(8q0k@8 zv5AS15fNWMk|6w5v#B&{a7Jv+EiAlzxOD^gEp>86MIOMlCnhGwOh8IbE@C$$vHV}R8(Ky_ji879JkL0eA@_9`@l#6nhU_A z38-;^Ap_X8pR9ntx~{HH$m>Yj2^j^YQw{jCfyhEzKyz!Wi0}G8Q#}_ULQP&Wh>Mf+ 
z?BwK=sVM|dsXS%@44FGU6O$4kxoWMl93ntPMg7-LT(AO9&F8!k_q{0y2L}h>8$$?2 z`TqU;#?}^~(P9x4OrNks1cm5pL`6iDHZ{$UjlCwX0vJUE1d>X#o*#fSwer`m>8UAp zE-t_;7_1MNB&({b9&b*&y1O@ymYV-tA>hd-6Y|7rJ-yu5F*Y{-*Dxknp%$?H^Kx_l z`x=70`_?cq@V$TwS>S;O2O|Jpz>=d(0o;33`IpwFfe_rx0|VHSGhJZPi?u!Qb_fg5mvYuw)5 z{5o{ug1y>_ikjtl7SDY6_WdV$yfS%qNRS=uXP{u($Lk|{Hgi@*d?0c9Zz(*DwOZ9B z(_ek(M)Ay8H}XmY0-i5Bg-nVJL5@KLLjE;LEoW&sDi5R&Zsm5^xgb+YxOmDxF4DpI z_-bT!z;in(3X8<7pHPk^&Y`1W+|D242oOmd$$vg1t1_w}c{W%}Kd$|eVO9v_p8vVc z8t5rcQhf@}D^9}hyg4m*ho=HSyJo+qGl^6m+4U5t{~vJW_?Q`*!rA4C_7msz8`cVL zFBKMs`zu(Btj%@CPQI&h0Ncv}U@84iS6G}W4c?OB8+(_pN+kS94bn53;3VsVxN$8l zB-R_1tbQ4mCNO-COsWec^fFUa^2cQqvm)#kN+&W@S?9bO_A=tMXCK6bpfn)^wpyES z70{awj?8KwEozSS17^Kauc)q+W@`~{vgIR%kOnDDt4w*>UPr8cmuZVfkD#oW@XBn1 ztRlDaXOwr$a|=gF5fMt#(jn#L<S{M-<>8L6sPt;t%DZ>SiT^&G zu@iZ-+KtuN#nC<9n>|;q#6K_c#lI$lsqUOM+ikb?fu1R-BB;B4nW4}sgDr@2_Ym>) zv+f}qJ$*`Q>i1=KcJ>C(LlQzlonnww*KmLT2YPzc*6oMe^JcH(BKQBNx3_GotJ~H! zAq2M&G-wFHg1bXP(BSUw6L*IMcXxN2xVvj`nYa^NgZn(=U29eC(tYZ~x#|ZD@4b)K zp4RTCGralzPvzK2ci}tuM}u= zG&MC9`}l6kZ3S3}A8(FRQxJ@7Fy+aCA^TPQzu~GkEdPtECOnXAXl~|q`kIuONJ&p` zWp3_vG)H?aD=t1{z0ydIe#A09w5MEoO@lbimrtq2YbgpVGwU8)`E z>G?I|j7WnCOqD5EmYsy!`WsIX{n2pgV4m_*vXL%!vzap_r+52F=N5AYiq^Gh);--a zWzLs>cYt#KF2H{4-4PHFtaka5g_CTGz}Lh4>qO)uVg}1KZxP(noA1O6md9nu3tgS4 zE3d!?3U#9>OY1(06+8%t4vMF8WP)Duii(e~FMJ+aGBOOz%>P<=_Fz> zh`DSsA|lWVM}=Wk0A~mYMEO$VSpazFf`Wp}%l5CpQmmozVa@BUw6rt;G626zZ+ClO zvH)V~a%h`wp|<=Z39gC(UqVAw!B(m^7ti{jSe|mAc4sJ|Hj{AbejUuDYNuXRGdu zgF(;9s*hF4dJb(8y+_v9Bc%+%A7g8W*#^aizK0QpEdEV@Q z^tKo1#Rl5!@EJ7O-L&A9F@ZJ|0C(8k{Uc`YK)Tc@6O)}iGjPa-&YjVgy|LTos&l@k z)d0dR44;i(Ue>grE!{df*>^d|!^SIevVW8@GRl{e7=8`MuK@UyC}Q%b8;%;;z4R#l z2P%_xRjHCgM=Nu*Z$_Eb8%5#MA`>QO@X=^Iz5=*p7Ci1zYnw8F!v!1A8^r7Vc)cr? 
zTfn3)59X#&fy7Y6^FuE5yz$MFH+CxV281Xzl!v9hpO(9n50 zJ+S_Ri2oQJbx%lG@A9nz=rNVq&XEQrPmv?|!85P*F#&T%VV9gE?}PJm1BrzZx3Ew7SleJo24U}@)lO}~9`UbWBuPbil-~JwJjbW2^`fHD= z=GfAh4L>)KrBPz=5@b~KJ!c5vuPh=}IA9Ob)RAVBj9=@$sHAkN-}g>@{8eu)TDj1J zZ<%M~E?k0c;AYOqASGB%Im@=$XNa?pHh)v%pbHQffyOBs48;on$N!eFCc500Ix?JK z(s6jK^UD=>M(N{iN*8!eXAaZ1`@H>0IYOz(-KvH1f7&>QJ%% zIS+qh4!Al2i?PCLP61(VEi(hq-tk8K^UccOf1%V(CwZ8q^Ld@@bPNnt#C*aM+L}%{ zfk{4d{koi*wbYwzY32l+t7A; zXBU+7+m}=7c&{q-d?+{#7Ad7G`P)IfEmurutuhUG=seLbAFSH(^dgz{d2nxq(D`M5 zq!$|X*S?+PJ0lN;SCKaQQ?FXVCckOHqZ~np)Z)EzEaSm*rM~2jV2OtFVE)?b^dE3L z=vw(Z2*v2V()Nyj)tYCP_SF?dpPhtd6>B;*HHU(5Zj;VY+GJ@B*)~}V_gX~N$NJ8+ zSl0&)%sWvdg0>aI%iurv%=@05_K{!R(@H$qut1j#*!XbsrhF|DWOU*J(-4ZE7JfM{pi>+Sz*ZS3lbE0f(1B zuU#o0Os-22Vs?JToQ4&L8Z@ot$Z^wTl)ZKL{YJm+#w*_`#Zxk{+Dwe^+c@=2&=>bh zZ|_mxMKj_@J5PvPEeIufSHE+9TjfK|uCIDjDYcI6Su(-$3$QkkIgY%L@20REXJoBs z$PC^PKu?t^Oj|T6L;uWZbu0;&zUXRp+-31Rb|xalU(Ll_xGBkkPd`*kYR*rg{W;a@5?)|$t%##&w$D(aN4R`A0 zLiq{*xFLUdSOQ=j0P^+!g{F?4f6rP;bHMYNJcX*@rf%r6jKtj;k^=y5;?}dQ9fGT$ z9=3`k|2WkHxCM}%nJYo-9LfG~gtK?f^m!&Cro0vDbuX zz3psLIkwnHnp6HhG<1#$F3Aje`n8pIp75tBwmqK+2L~U2VR5mfq-61~>EmaTbPG+# z!&CP{7hI7sp5jk6=M2A8c7;6_WpnqKIQ}o7b_eUjfG|*%XFA!%Q_W5&BEn<-+yN{p%5t(RnAG)9yv6}B(11wWNLbLa8OcK2E=}c zPY-XkDg^rtw?fUVB@j-*lH{PB&Ma0*jei1VKw7A-wi}6~?6dM4`*$V zBs;!i4ZM}lhxd3zS00t?@-|{(R}>uJP)Sm4cVwA%Wz=UUOo67L14E`3?;gs`HL_2o`!=0>=N0^O z$2E5D)a>Y$iMQrmR-Z(;u2NsT)CFX%zNtlh>eCrQ9{CSODLjLdWRj)Dj^}!k5wof>>F%{vwPgl=GI^;*;gUk$_OroSH-tvgNp5d# z)c|gOv`R(Y&h_V`ozxJ}G!y-vaZ4HDo&FC{419ULDxznKnmuL=_?GB`9uM?>?R z-?1&U9i-W5IosZ{lpcIV9^#{M}c({2*_v41z#4#rcDuT?fA)3a7>k*p9iOP0LY0`tI z3=9mJnVG4nsrmW&Y89F{OfE~;U&CkF+Y?pKOfB+u+oyXYm|AIQ-T331fzcZ0a@DQ+ z^xo?0v$E3I4oQMRgM9>D=|3ql1x0*z_68SNwiB`m_4V}}b9zejBkMrtbP04UZK{8g z;V#Aj0}2vc=ax614_+V6Zj}QpQAmnB7;qmkuXLdsm=4hn^|mvA6K5i( zaD3bv@Q3rQ<2nKN`OPeZwe4y4C$`*rVv=6(y1OA47g=>ZRoo(BGg|Ah$NCFFhg69c zns4{Jps)V6E>w>&!H7|uUTw=nj=)kSOP}OAks;ZndFSo$`$MV#tY|$zvX`_Ay@^w}^I?LLfZ_52XD$5YCS|65? 
z@pGN);*E<}$Kd#)8`Nmgv`Emm$0RFZQD!G`zA81s+n_^wBG+%RCmX5qHGaX%sY%u8o0f2z6;GmoMT~+@$w>L>z(G$)he#8D`AAg?_pMD7kzRgYVUKzP^-_*0z?pxCOFY7zUP!!n3u%#pA@5REQR{N zllTJC1Z{tz3g~m$ZYwHx1^cw;Kh=nOdOrRB{d**8=%721Q&6z8ql2VV#?{FwK#qoG zqqmcvp2o3)AUG$@3`=c!jUmX#*(othYU!>~cEoaE1kv{nGB>Dc-ON2p*&|zSD|eQH z==&;E&iNOZx|?(wfVp918lbCBPfr7i|7X_9+w{h<_@WiYKlz?VZz0;d;Y^j5^aptv z_6^t9PJprmc;CXp!rnVo*VM?2H52lVC;4jWQ~FuAX`SdChH8byyd8p|prFiGXsPMw zj1LUJn;i{=<72;0VHtJGuhk(mxarM!iW?TuM7e9DCtK4|`A`uJD^u9+yRv~ zEbKR+5Z*sLY=rR7`v7%2CBmBH*(P>8cie}!wzCkiivTqJ|Xs# zHxW;1#%9%B-89^GPd|q|miuIyoq~v;dm1>{>s>J%#LbpNAzGc8za5px*g>HMd1i(t z1l#p-*^oh&O5;I*b_aYgwp39wRQ&z@i)D^G^{9EkJ-!uaGn}=NWx=gr__%MNu7&uD zkSomu{f4{qjVkSy+H)OvL%+SRoC5qgh6F1T19itLi$1Z)+C;%~<+p(01yF(LLw;P) z8949SLYcr8HZy9}^emb75`c=~Y{br;jK2TFlHOWUf+Xk)SOEhD zzCesSHYW0R5C0w$7H?TxZ*rrAX*`HgI&3t(YvZYrmaJ@S12p?fKI24H2kWt_DlblJ zesR5{q|PlTw7L{AX@n1h^!Zn3bD7<_&x|0a<3|g%tkEAr6G9h`mvzf-1Bqki78Y3V z-oxCcGfVePT%J154{L>I>`v{tdz>8i(OH`a5&VDEeFq?6 z0CmpK%kwz@@($aO!wcf?e6*jHcZe%1D!9b&`ssWeh?CyE?dkEoJxK+qVnf62oCk~p zyB&Dm&W)HWS3$;>eC9`-ff~)o>T9^HZ?kQzs{s~sr_^Nhc&MZFKwU;v6|WOBzZmMj z{-vaWo`JTb0Jev!R`#3Qyj=25`VZYglLSurI)?ZJ+%1AH9>A0b!^^c3fi4iEE1`-D zK|}>n6Ha{Y;RxwmvR}T}Lk>)9C$j|AwT#Uh?8E6a41gy%JOo^m-#7c+R-|DYad`Cw zzZtWFb?H5iGUXtUuH46Hp*xHO7I)MZ1zfUbxwr-hV9D(BcYA=xaj)CZ$i(F3=HXro zJ=s~wvQUa8OGSWtuWY`V>17c!u6B7moeLNrP(X$@F*5^9Z_pn-&&|JU^Won0B(z(C z#XHuA%sI4QKZN4Cq;jWmsG-JVGJX#B`~%3Q*#e#}Kz+`x@MWj)BQ)V`wq|M@S-{`| zv3Ey7IUoKMYNUO7ejnc)M+&TI7uK|dk)-^N7N5!yARbyr@KVA32iS@aa$wPpYJ%69 zAkcSfG%iuESB`Oi7l=h|<1$c(1OrAD-@iB1*FX9Qo~c?&EmgM8?$Ch_U7)VP0aW+% z5e4N4T+=}PZ-0Bc7O+?)=lc026X)ShKvaognJD}CGnb|zaXO^>1NDHa-Y^h+WRH8e z`(@K#B2iPD0RpEvzaA1ip=1mn9Uvf_zptdgez_%30R=Eb#FIZ1I1?}vJHvE5`oeAv1e@?zPT$Q z2Oyge0&HU5n5hsFrgdgV*Fkd1g7MxIXE(7*w28G?o24Jb8pPG6H8#2c!Go;qM8t=_ zdiC5iPeQMie61KZE|1-yl!*70EE*Mu(|F&k&nMqyj@_&lz!}$8hX9f=5Eif2lU!6r zhvpPpSf-C|)F~n`jwlXlJ&wSwDUim&sS+g$h0#g+^=$z;B9bM|^%iD%(0D7o`5%qd zmoLZD6+u;@8wM_$Jqf2(eYa4LUviV4;&3rR>+r>@QC`76t59Y+_^}ig1RMiaD*y{- 
zZAd9X!ws|$QtfP;Rp_l%P*!_jhS9s1Imgm0zuGW7%=T2@-#@uUNed?TU<}an|NVU? zsai^Y>4rk2&k`3pui(;uHyPOjN|Dx*EUil=k>Jb^e(2;iO0{Q6y%wyL<)9@|mTw4u zf482H7)AEJ;(*LrL6%vx>uVm7vI*t%C!wVb$JcDYt{gZF4X*kZ4;9&MdOQ6_Vg-_*ByL2GH- zV%+PePySm)&H;|Z=zMMrD^c-sTu|=${+Fq;{KHm5u80ZV}oXW<+k^>4M{oHiY|x1w;bOfoW5V5G%uPoeuzeR z$WC*-q-^OrjQignkwvSbWB9Op{)HC3qr;Cz%tuG&0oRJ{z(+7(Ik2y>iPNX@i{~BV zjX zM}09Dz0SfXWQ3>VA~F)4pHX@Y+LzL(pAd8_;UEwHfa&*Kj-b}O8t$M-t&ccO&x-7J zRw{6^24k=4Pxu~VxQuVt^n_mWl1ZPeV$^e$NqTk1&q8+CE-f$!AHQR$im; zP*146gqq1&zLY%$llzL@fDzW&?sGt`5?W0-cYW!X)OuQXrc+TX_k*7nC4)2Tx?Qj0 z%#fpc8Oq{21;@klOcUuoOG(`)P}KV9Yr>Jzn8%zh<$BijdQBoc#cWrpFvDkD1H4A# z37b!HU}rfZ_j=FMqP9Sx)|lDm8xsFjS0r>&L!xm|O59gEZ`A0M^LKGUmhozF(OCEg zp2T`NWmDK{XaH%Q6<~HUYRC9Wnn+5lEKQq?FPjo zS1%W^#h&+OxeYIZnD38Lb{@5C+RKNW$z$f|N4ZF9_*!L5Xs;&4m4j*t=QLlhUpbyo zvn=$gFPh{d=v>y(9j0_71}(W^22rY)jo*}W9>RcQIa4sPT=1TV;{|Wp^AJMwjPNz8 zDhg}GMD#x}4NAG6;x2+71wS-~OKo&b%E)=)5XS2+g`F=Y1<5!pa5SL?J>ziL5;F7t z=l0;r#SrTMAZXXL9S|mNEV`B{PhMqz-(~s?iJ&}gB5e>bpqdbVv)27H6hYmZ6;?If z=_h0uzNCfwLBU+aSlVhPJ({68*O=4$%)xyZ9hAmPw7_7JTORxq#s~&DKtZ$12bqq} zL<`v2zyXMb<8_h@9(=2pfxNuGzn`j&9vT%@1+M{5RtC20obxJ$u8U6o9-IJ@gPsv; z`fgYcB3R%?!?G$F8%t?>2TW>FSsD8a3rfxwr>7Q&`$bx-UKTq%U0+w4CSRVZ+CJ_G z+bG1w1Oa^zuw-N##(-UY%EzA&s(2tfh4X$f!z7gm3z2rwcx5>t6T%rABT(1K?jMqK z(GdGYODn1HjeKNCTvphycJwm(*T-SNQt3G?%&)h11nBEvwub>miQPI23nvB!#(GJ~ zu;g|4iexqh8WlfF4YpraVS$~aqnwMqm_c}|2H;J+6@=c7iZ2UX?8D;3gt(+6{O)dz zy*uu2W>|IC4 z@zD{@9*(?{Qg}$1CK*3aCud$VfrX*M(flAtK&u4M)x`q?2KLB+ z`RM?o`p-rhFuO42Koc1Z%x_rre;{v|1i0#dcGQ5OLHu8M0v!0WQKjNdGO?txI{C#% zzq#gNajfa=!ZQS|E5J#%aNexRW^P!^R&$VfX#i20W@_Pp!sE#=EdbM9sQP^_k%^^~ zvzXyrzBpF&-=OFY8TG*m<&@>;xqG1hWr!)IJgd!ImNIw|Mgm2jBMepz*R1IZ6uE9q zP4Oo`f2i_)aBt4ph1^_q(X_J>JBM^DeG!u!v2pdJzIoqxSiqzf6OP${-P1;Y1#fm* zw8ITLdxv3Z9=uk4Gay6v#D_KcBh6k1Y1!GeEbAnL17r6WVH}B7MlW`}u8LPMu&6jc zTl%9@Nz#Z6A$;|K0V^PooUcVX+psK(w;eMaSSM{sF-kJjNuGc11u@HeZ?m8u%i$Sx z7KN9F-V4CM|AcNDwfo4ahfi(gEY{Vsi7mBK-29AbH$t{~E*Hz;=?@X#=&H0-tl(_? 
z_XDE9lwqt<*(zAY#W)#0oMRBxci|oH*|BpdsD5{{oYD}T$eiSR2>QeZs>MWRHu#G4 zR-u@vteJes@q))?wIXPgm$LOz8^Z=TqoUPC;%o9wV3d^}${ADwTc4vAQSGGm+wWG> zp)0Qafu~!tyq5+uYYa>3Wza>|Vg>WO+Ku9HnmK4yah0Cne zola@!ml3+1g(a30!@S%c7f_z)gf`yvV1*Trs$!LiB1)aY%aq zT!!3+1y8^(F6AD2JpGY3d$Lt%YRj?x?P!Up%p`R{8OdoE9FUMPd0=s+ zN>hn#M#k@qzLA2TEd{0-{T^B2j!p7ZnJS~H7T7h}E@frpe%4mLI^*$~uB*Tghl>Jo z0DmAgl*}cELXd!aZ;mA|W$$41!vK4nzvkwL;YpwLuO6>)_pCG>-IcJ%=u9nXQx39G z`S>?tj9i_i7Z#e~Ibr=)IB!5tDR{{VTG43~^{OfkJ(KUc$+~`c#X?)0za%I8_V}jy z$#JVI1blW>u3~yR`H;7?f+O z)u}4^#$Bzm3SuXdSjchOu>{8x7-%_tat1fZshqHohs*=d<4m>Cs{vQ`rt{&ushI3Y z@DbkU**Ccy4%=6UPqtsnA+IGcM&QpcMsOVM?N&L?UrO=h8jQvD4+I6jyp6ND{UnRW zoE%UOqe@|xqiM^ado@hP8dVsodWFDV>7#Y3@-t_{1^e&+ zo-A#s^X!a%Nc({Q3^WwcP`!ZL)jIfwL}faF^y!{)j(E0O(Kgb?z0r7s&(xWr#oyoT ztiGsyuKrRLLDyOfbHN)dPPT1X<%acv5DS;&hPwi%VGhh*Ci(QoFaMyiI|+bdOQ+v(PoI?EIz%Wa1tyFR`aJ|Gbh zK(INPJ>i}3FV)~wUJt&@Xw6pGz=r$$qKMiRutHb|QX(#Am<;~oDo1(}#lO-@SA+Ew zD4tgNTD2VrMrv3$CJ~zLo{^6ztYDtMadE=^>~*iwyCu5DL#URGkN*ds0O$Qxt>4I} z&S8yCTg~7jJzlMIp6ofwqIy}cZ))xiPMlXBudK=4?GxFm$27i2t{V@Ri6JvljG(OV zla5uDes!Kws?}y9iV4a4b!8Kbpx1n+TE>d!#B07EezIpZf5Es*ol=g6*S5AVwyQp2 zxJ{jR7wrz&J_v>toJnsYD&o*S5P6MsFqlqQ3_Oq#*@sv-HoNahfB#qlUm@JyNDuQu z_!+*M@B3oSPX&&LaA5>iR(+|rAV%^{EgrNLlPM)4MT zH0tI{K0)CGH8m-AQK?M3y8tg8N4HQ1Ms!M^p{I?e!y`YINV~Y(w-xrpHnP0NageV` zRXJAoDY2)xt%0lQDUAyxejQsSSYOr74a;nR%V7nfY~1Vl6ZQ1g=6Jnnh$&<~Q;7}r zK0;bq#FRs@&3Suf=L<2UcP;uVW8w8jbn|C4@t{SXel?`@aW69o+jml3PcNj14vlmxEmVC0{} zOV4hY6*XINULXDqT$}$^$Q^XkV)!q9O8YeF0Ra}Kg@u>@9iQk=_reEt|%UU|39Bto6jeNXN`gw+w8MC~1m;NNO|PcFe*mnZ7=Rc2I3 z9F&H}J8aC9(#Gqu6ZgljUcm=ggsw|M&XXkiMg z%Rf3Kh|u8}lJlpBJ(Mgw0yk#{x#%=&4Pv;?`*nUf`^Y1QB0%e^pj%s0;Xs z-b}2B^EH=<(Q*-U{iiDRjsZGx@tHc2CxEg2cF~&u=&tN9jo6`MXbtD501VEO%cM6O zuYWLFb-5y)JfU}gdz1ANxA-1yUOJ9S7^J(RE-H)G&7RE6lAvRQ)|fZ@)+ivco6%Ky zoq(3+vlk5OE^S|b!2uDW#?3XmCX|gy5DCkk_EDCe!b{;BYUns|t50M}r>Z6D)D|RJ`5fjwm0z=ETXmT26(2!#Le-Jn0Ho%eYO-G7~k z{iC1@AUYp z+cbkzm)a%tie=KNzVAaKGEgJX`dm445FM2NQm-D_H$At#@2E?<|HLC4Fj|-Vu4c4f 
zX3g&0axRIKr%}viYxc78!4}EFND&ihm@jF&1pi#>w`)Qct7401cV~?Tx%;6&zcIn{ zfZXxIDXg-$l)+Y~F})15_an0$Jsx=dCE|}@N}Q|pjf4@4FBy~eJZ)^^{^j9(hE!CY z#_KFt4~{A5f^3aWVy#Ogq%)N$*;7n-j4^!abNiT9hwWOoZ#(0VBTa*24g{~va~>`3 zM>9)sl&x>+W-Zgm@653bi@j_YSBjn_67nD`4_Jo!GE%+#Cf3)ouSRu*HSixhBxiV)@Iq#Er1_v+CQUw$ zQS)4N5#)&e5)!6hH%l8V<+;lpG)3dp9iF)MXkw#17`h8)`a`j6lY~xCM&f|+N4?<9 z`J1({B9jzZ8lKzZm3L2>dp761=8rGSZ?&vbS~9=)Bfyr6kpe@MiKJ!)uWSwc4I>^P zDhRJf>`&w1CaY-%WxjE|F28t($gFiKznXsV33~0@`?|K}>+J^AeMH>GA@8Hs{th(liVt7i|0Wn}W9t0&A}BJ$*rsnQ!mYmjetwcV)FNrA zzy!UP@ET$wu)i*X=H~-6J{gG3LgzrIcO2@{!PT0RJ$S-(P5fmStIRV2v0YJWcRkAR z)`?u5dO@ecA4>_~ucLI@n)U3zLC?`9R^=P&wm*T3)p@PET74KUd<`42g|ss9Ha)ob z5w~O0GYvc`T5#3TFPMPPS8r1lw<+$pxKiUqjM9gt7cidPlp%inH7=n#dm#tc73yv} zP_=oT8@@!-+@ofBuSeDcY95RK1fJO6KHyxhLSNzc4xnG-`=ZDfsUK_5^RHqzP0!8K zJ02ZG5IPkxmOV0RvRRX2|Emr)L_55hRc+vx?$=y% zl_ha~bM=Novwhan^JDri2n_T+Fxq$bhv(5cx|uL$e%2S8jfa36p@ggL=sA!Wn|GO|Lw8l-Rc6dbkn4H<`hcCAh(CgL9sj zAd>yU*GI1S4w+e>qnjJ1Tx^hiZKJuHJ}*&hz4Vg-(+acW$KYDc_+rALMss>IsD@rg zJ>o(2hrq_^VcrC)^K_x-=xBWo^|Y&oGcIPrx5{r1oP{{~t&(0J5JBtyleLwS7{9OO zRfY#>Ci>b$#XTny7Xt#l-6iDrJXkOWKsd^0V1dmwJ{esQ@!mAab=lc&>|ULGff+~M zj;PKPYlE!1*iSoM&C8cl7g9S9jbQOwLP7eQ*~3g67G=zW_})D45rVfnV&}EhKIiEV zQeXo;GW^8Ogau0XhJ&Ttz;q-QDhll-y>li}Wm|s~X zblhUyq^q+H`7Uh_Ps+K8eZQN|nZ$?4sxK~ZezIiTcm1>!@g3#N)>!14ZcOmCp-GVS z_+c0Rddvc-ePNL>X(v@gHI4P9dn0UU##N|RIx>odWsDrT8>6#aTg9Ht8gB2`@?o*z zWY%uq+P&CRyr2~2jTR5JAz{)RZ8tJ0`fJ^0wC#jHMqG0|ncG=~iYWbkDN79y`{5>)Oue8fvo1VnJS`0f4nflqOFE`{=qv*n^cLRc5j6>XZ$s`I|E$kgf z_-K%bW9Z6Tu#A#3l-PRW`3X}7@Ezvpm^#*;$*0G|S`SmQlNG*K7Td=$xl3ELmVR-G zcXYY=veE=dkM*eR6HnY)zwNG{L6!>rxT7!R+D(eu5VOcmr|&D?=&E(YQLxCz`&QDL z@X2y=NTCE}$efAH*G~c2jQbai1{>QO7Htd$O)<4v@z)J=_i!DfHPP~CHu^Hi6)l?V z!ypD-P+gOlk$F<~gqsnR@6wJ>;p?6UGW!aBUayq32>8Xh-A<#p|E5dF$5s6A-J<$r zPKH`=kequ83)_~ltwzqFPVk>aJe4!q6x#qMSIsU%y_q}4FPsf4a{df%$cZp|r<@^w<5H?uU`vh`x*v9q zl9uwUr8n)aaj2|3xjo`_QM?m8~etreo3{Pwu7rb!{~UVpyum3=g- zYmAra#|7$Kf`_Wou@z_s&0=M^IzCPyTzU&k!H~%Lppfbz&SRY+E-KPu;EQ;~z=>jW 
zl2&x0iq+;lf8Kljd07?*{Ozm#&vho+hPT$5^|aNwmdlTPWQGiT5iv+mL0th`NBf+{ zsL-yD-_=FHJEtQMMXrk{vSOX2GZm88K4+fSzEXm>nSPGb@oPdWKNjy~2`w@CO9v=Q z%tEaj>!;Lvmj73oF)?GcV@Wl;nTrs0q*s5;&C= zffr%2a*(z)U2}TTN-_*V=uKj$o7^uV*(WjKkk3Q?Dp2%;RoW`YLr+I{wtBGguy;>Q zmUg=RCOp&SPmcw%$J?JyfD+-joUIl9h12+mB-8rO{E3;4F=XmuYwi-)}Egkr<@)e0=5#_#3FGDGgYJOgZWDEdJ( zvz=GGN^iNYwdq$N6vy!}Vze(L;5q0F{(Wry$#zFeB|M~d1Lx?QK(N^>at?w?7L%R4)lvXp|@wkNYYwh3ci8=hPh z&{ls8wqax~GYJL1Ys{-29XdVh$8mND`8`6Pshj>#W|*`b8(u7v#Fi_@#MSv_t5Hfn zCHwrU>@4+a&awZR(|T^jT^*OmYo3ys{o|2$lT!>kaa@oXZ~E;CRzv=oj7Q^4S@p?^+g9NQ^ut8e8y6$85dNdeXBkdp^4U@q~;} z?BbPNIU_%#O+Od1?y+4$k1t1B*3u##XOYX7q>JjA7hbO)&~K!(Gdk5@npZ|>UE+#= z!_QmKu`#pG#cY05Zi@19bzJlRHBceIXNbqn?YLm^go79gVO-=m(j+_r5#fmJP3OP6 zkM4Q8c4;x?+-!@XX>DQMGb5QGV4C3lTy~-!@J+s^oPX0%?_(GTmUX~y2@%*Nxe+_c#yH}IGc;0i&I7JwWr%FiT#xuSK5% z1rTtaeT}SXQ?rRkxH#{EQO-u6YA_ivQPCr0cB*T>Pre!_UB|HwZ3|R>(_Go%VKuw~j5@!^Cz!EP-z>=AGFH_$W+ny3 z`wl2Jmil%LAshd3GtknqLt{I5w^$Sn?>>;da)^(_xWAtwn6&V5Q%kVjQ%%`Fi#J$K z#Aypnd8pV=4vl|R`uhpohNe!8j=Yfbv_o#7w1D+(+2wh8I6?=o^WilMAGpNeNLaz1 zRG8xzVYuS#XskB)^FzweeO!G^e@w2Jb zTy@mKGHg8#f{XXGued(dh$`(d)z05f#@hP|Y}Vaf0K6kCeW^2R#k$M<{i1 zQT1-i_@fUsa2zv`QNRwODBp2z|9yfDXU!wHU$Di=Mo+VXfzfw>{r|DP0o=_0iVDjs ZqAn#(9&-{s@O5BdBt>OKDuwm^{s$pBT$2C* literal 0 HcmV?d00001 diff --git a/example/ck_tile/02_layernorm2d/script/perf_test.sh b/example/ck_tile/02_layernorm2d/script/perf_test.sh index bfb7f9ffe..a34624536 100755 --- a/example/ck_tile/02_layernorm2d/script/perf_test.sh +++ b/example/ck_tile/02_layernorm2d/script/perf_test.sh @@ -2,37 +2,37 @@ # run from top of ck folder EXE=build/bin/tile_example_layernorm2d_fwd -$EXE -m=1 -n=1 -e=1e-12 -v=1 -prec=bf16 -repeat=1000 -$EXE -m=700 -n=80 -e=1e-12 -v=1 -prec=bf16 -repeat=1000 -$EXE -m=700 -n=128 -e=1e-12 -v=1 -prec=bf16 -repeat=1000 -$EXE -m=700 -n=144 -e=1e-12 -v=1 -prec=bf16 -repeat=1000 -$EXE -m=700 -n=168 -e=1e-12 -v=1 -prec=bf16 -repeat=1000 -$EXE -m=700 -n=184 -e=1e-12 -v=1 -prec=bf16 -repeat=1000 -$EXE -m=700 -n=256 -e=1e-12 -v=1 
-prec=bf16 -repeat=1000 -$EXE -m=700 -n=288 -e=1e-12 -v=1 -prec=bf16 -repeat=1000 -$EXE -m=700 -n=344 -e=1e-12 -v=1 -prec=bf16 -repeat=1000 -$EXE -m=700 -n=376 -e=1e-12 -v=1 -prec=bf16 -repeat=1000 -$EXE -m=700 -n=448 -e=1e-12 -v=1 -prec=bf16 -repeat=1000 -$EXE -m=700 -n=512 -e=1e-12 -v=1 -prec=bf16 -repeat=1000 -$EXE -m=700 -n=924 -e=1e-12 -v=1 -prec=bf16 -repeat=1000 -$EXE -m=700 -n=1024 -e=1e-12 -v=1 -prec=bf16 -repeat=1000 -$EXE -m=700 -n=1078 -e=1e-12 -v=1 -prec=bf16 -repeat=1000 -$EXE -m=700 -n=1996 -e=1e-12 -v=1 -prec=bf16 -repeat=1000 -$EXE -m=700 -n=4080 -e=1e-12 -v=1 -prec=bf16 -repeat=1000 +$EXE -m=1 -n=1 -e=1e-12 -v=1 -prec_i=bf16 -repeat=1000 +$EXE -m=700 -n=80 -e=1e-12 -v=1 -prec_i=bf16 -repeat=1000 +$EXE -m=700 -n=128 -e=1e-12 -v=1 -prec_i=bf16 -repeat=1000 +$EXE -m=700 -n=144 -e=1e-12 -v=1 -prec_i=bf16 -repeat=1000 +$EXE -m=700 -n=168 -e=1e-12 -v=1 -prec_i=bf16 -repeat=1000 +$EXE -m=700 -n=184 -e=1e-12 -v=1 -prec_i=bf16 -repeat=1000 +$EXE -m=700 -n=256 -e=1e-12 -v=1 -prec_i=bf16 -repeat=1000 +$EXE -m=700 -n=288 -e=1e-12 -v=1 -prec_i=bf16 -repeat=1000 +$EXE -m=700 -n=344 -e=1e-12 -v=1 -prec_i=bf16 -repeat=1000 +$EXE -m=700 -n=376 -e=1e-12 -v=1 -prec_i=bf16 -repeat=1000 +$EXE -m=700 -n=448 -e=1e-12 -v=1 -prec_i=bf16 -repeat=1000 +$EXE -m=700 -n=512 -e=1e-12 -v=1 -prec_i=bf16 -repeat=1000 +$EXE -m=700 -n=924 -e=1e-12 -v=1 -prec_i=bf16 -repeat=1000 +$EXE -m=700 -n=1024 -e=1e-12 -v=1 -prec_i=bf16 -repeat=1000 +$EXE -m=700 -n=1078 -e=1e-12 -v=1 -prec_i=bf16 -repeat=1000 +$EXE -m=700 -n=1996 -e=1e-12 -v=1 -prec_i=bf16 -repeat=1000 +$EXE -m=700 -n=4080 -e=1e-12 -v=1 -prec_i=bf16 -repeat=1000 -$EXE -m=700 -n=80 -e=1e-12 -v=1 -prec=fp16 -repeat=1000 -$EXE -m=700 -n=128 -e=1e-12 -v=1 -prec=fp16 -repeat=1000 -$EXE -m=700 -n=144 -e=1e-12 -v=1 -prec=fp16 -repeat=1000 -$EXE -m=700 -n=168 -e=1e-12 -v=1 -prec=fp16 -repeat=1000 -$EXE -m=700 -n=184 -e=1e-12 -v=1 -prec=fp16 -repeat=1000 -$EXE -m=700 -n=256 -e=1e-12 -v=1 -prec=fp16 -repeat=1000 -$EXE -m=700 -n=288 
-e=1e-12 -v=1 -prec=fp16 -repeat=1000 -$EXE -m=700 -n=344 -e=1e-12 -v=1 -prec=fp16 -repeat=1000 -$EXE -m=700 -n=376 -e=1e-12 -v=1 -prec=fp16 -repeat=1000 -$EXE -m=700 -n=448 -e=1e-12 -v=1 -prec=fp16 -repeat=1000 -$EXE -m=700 -n=512 -e=1e-12 -v=1 -prec=fp16 -repeat=1000 -$EXE -m=700 -n=924 -e=1e-12 -v=1 -prec=fp16 -repeat=1000 -$EXE -m=700 -n=1024 -e=1e-12 -v=1 -prec=fp16 -repeat=1000 -$EXE -m=700 -n=1078 -e=1e-12 -v=1 -prec=fp16 -repeat=1000 -$EXE -m=700 -n=1996 -e=1e-12 -v=1 -prec=fp16 -repeat=1000 -$EXE -m=700 -n=4080 -e=1e-12 -v=1 -prec=fp16 -repeat=1000 \ No newline at end of file +$EXE -m=700 -n=80 -e=1e-12 -v=1 -prec_i=fp16 -repeat=1000 +$EXE -m=700 -n=128 -e=1e-12 -v=1 -prec_i=fp16 -repeat=1000 +$EXE -m=700 -n=144 -e=1e-12 -v=1 -prec_i=fp16 -repeat=1000 +$EXE -m=700 -n=168 -e=1e-12 -v=1 -prec_i=fp16 -repeat=1000 +$EXE -m=700 -n=184 -e=1e-12 -v=1 -prec_i=fp16 -repeat=1000 +$EXE -m=700 -n=256 -e=1e-12 -v=1 -prec_i=fp16 -repeat=1000 +$EXE -m=700 -n=288 -e=1e-12 -v=1 -prec_i=fp16 -repeat=1000 +$EXE -m=700 -n=344 -e=1e-12 -v=1 -prec_i=fp16 -repeat=1000 +$EXE -m=700 -n=376 -e=1e-12 -v=1 -prec_i=fp16 -repeat=1000 +$EXE -m=700 -n=448 -e=1e-12 -v=1 -prec_i=fp16 -repeat=1000 +$EXE -m=700 -n=512 -e=1e-12 -v=1 -prec_i=fp16 -repeat=1000 +$EXE -m=700 -n=924 -e=1e-12 -v=1 -prec_i=fp16 -repeat=1000 +$EXE -m=700 -n=1024 -e=1e-12 -v=1 -prec_i=fp16 -repeat=1000 +$EXE -m=700 -n=1078 -e=1e-12 -v=1 -prec_i=fp16 -repeat=1000 +$EXE -m=700 -n=1996 -e=1e-12 -v=1 -prec_i=fp16 -repeat=1000 +$EXE -m=700 -n=4080 -e=1e-12 -v=1 -prec_i=fp16 -repeat=1000 \ No newline at end of file diff --git a/example/ck_tile/02_layernorm2d/script/smoke_test.sh b/example/ck_tile/02_layernorm2d/script/smoke_test.sh index dcd40fda4..d56406b6f 100755 --- a/example/ck_tile/02_layernorm2d/script/smoke_test.sh +++ b/example/ck_tile/02_layernorm2d/script/smoke_test.sh @@ -2,30 +2,34 @@ # call from top of CK folder EXE=./build/bin/tile_example_layernorm2d_fwd +for fquant in "" "-fquant=1 -prec_o=int8"; do for pr_i 
in "fp16" "bf16" ; do -$EXE -prec=$pr_i -m=99 -n=13 -$EXE -prec=$pr_i -m=17 -n=16 -$EXE -prec=$pr_i -m=1 -n=100 -$EXE -prec=$pr_i -m=4 -n=128 -$EXE -prec=$pr_i -m=80 -n=127 -$EXE -prec=$pr_i -m=22 -n=255 -stride=256 -$EXE -prec=$pr_i -m=7 -n=599 -$EXE -prec=$pr_i -m=19 -n=512 -$EXE -prec=$pr_i -m=33 -n=313 -stride=1000 -$EXE -prec=$pr_i -m=11 -n=510 -$EXE -prec=$pr_i -m=171 -n=676 -stride=818 -$EXE -prec=$pr_i -m=91 -n=636 -$EXE -prec=$pr_i -m=12 -n=768 -stride=800 -$EXE -prec=$pr_i -m=100 -n=766 -stride=812 -$EXE -prec=$pr_i -m=31 -n=1024 -$EXE -prec=$pr_i -m=64 -n=1000 -stride=1004 -$EXE -prec=$pr_i -m=8 -n=1501 -$EXE -prec=$pr_i -m=3 -n=1826 -$EXE -prec=$pr_i -m=5 -n=2040 -$EXE -prec=$pr_i -m=7 -n=2734 -$EXE -prec=$pr_i -m=1 -n=3182 -$EXE -prec=$pr_i -m=9 -n=4096 -$EXE -prec=$pr_i -m=3 -n=8192 -$EXE -prec=$pr_i -m=1 -n=10547 -$EXE -prec=$pr_i -m=3 -n=17134 +for fadd in "0" "1"; do +$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=99 -n=13 +$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=17 -n=16 +$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=1 -n=100 +$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=4 -n=128 +$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=80 -n=127 +$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=22 -n=255 -stride=256 +$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=7 -n=599 +$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=19 -n=512 +$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=33 -n=313 -stride=1000 +$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=11 -n=510 +$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=171 -n=676 -stride=818 +$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=91 -n=636 +$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=12 -n=768 -stride=800 +$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=100 -n=766 -stride=812 +$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=31 -n=1024 +$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=64 -n=1000 -stride=1004 +$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=8 -n=1501 +$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=3 -n=1826 +$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=5 -n=2040 +$EXE -prec_i=$pr_i 
-fadd=$fadd $fquant -m=7 -n=2734 +$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=1 -n=3182 +$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=9 -n=4096 +$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=3 -n=8192 +#$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=1 -n=10547 +#$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=3 -n=17134 +done +done done diff --git a/include/ck_tile/core.hpp b/include/ck_tile/core.hpp index 2c423831e..3b198502d 100644 --- a/include/ck_tile/core.hpp +++ b/include/ck_tile/core.hpp @@ -25,6 +25,7 @@ #include "ck_tile/core/numeric/bfloat16.hpp" #include "ck_tile/core/numeric/float8.hpp" #include "ck_tile/core/numeric/half.hpp" +#include "ck_tile/core/numeric/int8.hpp" #include "ck_tile/core/numeric/integer.hpp" #include "ck_tile/core/numeric/integral_constant.hpp" #include "ck_tile/core/numeric/math.hpp" diff --git a/include/ck_tile/core/numeric/int8.hpp b/include/ck_tile/core/numeric/int8.hpp new file mode 100644 index 000000000..9ca3333c3 --- /dev/null +++ b/include/ck_tile/core/numeric/int8.hpp @@ -0,0 +1,104 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck_tile/core/config.hpp" +#include "ck_tile/core/numeric/half.hpp" +#include "ck_tile/core/numeric/integral_constant.hpp" +#include "ck_tile/core/numeric/math.hpp" +#include "ck_tile/core/numeric/numeric.hpp" +#include "ck_tile/core/utility/bit_cast.hpp" +#include "ck_tile/core/utility/random.hpp" +#include +#include + +#pragma once + +namespace ck_tile { + +// use int8_t directly for int8 arithemetic +// here one can use ck_tile::int8_t to access original int8_t +using int8_t = int8_t; + +// limits +template +struct numeric; + +template <> +struct numeric +{ + // minimum finite value, or minimum positive normalized value for float + CK_TILE_HOST_DEVICE static constexpr int8_t min() { return int8_t(-128); } + + // minumum finite value + CK_TILE_HOST_DEVICE static constexpr int8_t lowest() { return int8_t(-128); } + + // maximum finite value + CK_TILE_HOST_DEVICE static constexpr int8_t max() { return int8_t(127); } + + // difference between 1.0 and next value representable by float + CK_TILE_HOST_DEVICE static constexpr int8_t epsilon() + { + return 1; // not used + } + + CK_TILE_HOST_DEVICE static constexpr int8_t round_error() + { + return 1; // not used + } + + // positive infinity value + CK_TILE_HOST_DEVICE static constexpr int8_t infinity() + { + return 1; // not used + } + + // quiet NaN + CK_TILE_HOST_DEVICE static constexpr int8_t quiet_NaN() + { + return 1; // not used + } + + // signaling NaN + CK_TILE_HOST_DEVICE static constexpr int8_t signaling_NaN() + { + return 1; // not used + } + + // smallest positive subnormal value + CK_TILE_HOST_DEVICE static constexpr int8_t denorm_min() + { + return 1; // not used + } + + CK_TILE_HOST_DEVICE static constexpr int8_t zero() { return 0; } +}; + +#if 0 +template +struct numeric_traits; + +template <> +struct numeric_traits +{ + static constexpr int exp = 5; + static constexpr int mant = 10; + static constexpr int bias = 15; + static constexpr uint16_t nan_mask = 0x7C00; + static constexpr uint16_t 
head_mask = 0xFC00; + static constexpr uint16_t mant_mask = 0x3FF; + static constexpr uint16_t exp_mask = 0x1F; + static constexpr uint32_t Inf = 0x7C00; + static constexpr uint32_t NegInf = 0xFC00; + static constexpr uint32_t NaN = 0x7C01; + static constexpr uint32_t Neg0 = 0x8000; + using bitwise_type = uint16_t; +}; +#endif + +CK_TILE_HOST_DEVICE +constexpr float int8_to_float(const int8_t& x) { return static_cast(x); } + +CK_TILE_HOST_DEVICE +constexpr int8_t float_to_int8(const float& x) { return static_cast(x); } + +} // namespace ck_tile diff --git a/include/ck_tile/core/numeric/type_convert.hpp b/include/ck_tile/core/numeric/type_convert.hpp index cb18cde70..4011e08ce 100644 --- a/include/ck_tile/core/numeric/type_convert.hpp +++ b/include/ck_tile/core/numeric/type_convert.hpp @@ -10,6 +10,7 @@ #include "ck_tile/core/numeric/half.hpp" #include "ck_tile/core/numeric/bfloat16.hpp" #include "ck_tile/core/numeric/float8.hpp" +#include "ck_tile/core/numeric/int8.hpp" namespace ck_tile { @@ -60,6 +61,9 @@ CK_TILE_TYPE_CONVERT(bf16_t, bf16, float, float) CK_TILE_TYPE_CONVERT(fp8_t, fp8, float, float) CK_TILE_TYPE_CONVERT(bf8_t, bf8, float, float) +CK_TILE_TYPE_CONVERT(float, float, int8_t, int8) +CK_TILE_TYPE_CONVERT(int8_t, int8, float, float) + #undef CK_TILE_TYPE_CONVERT #endif diff --git a/include/ck_tile/core/tensor/null_tile_window.hpp b/include/ck_tile/core/tensor/null_tile_window.hpp index 9707f2990..de99be196 100644 --- a/include/ck_tile/core/tensor/null_tile_window.hpp +++ b/include/ck_tile/core/tensor/null_tile_window.hpp @@ -80,6 +80,13 @@ CK_TILE_DEVICE constexpr auto make_tile_window(null_tensor_view, return null_tile_window>{window_lengths}; } +template +CK_TILE_DEVICE constexpr auto make_tile_window(const null_tile_window& t, + const StaticTileDistribution&) +{ + return t; +} + template CK_TILE_DEVICE void move_tile_window(null_tile_window&, diff --git a/include/ck_tile/host/reference/reference_layernorm2d_fwd.hpp 
b/include/ck_tile/host/reference/reference_layernorm2d_fwd.hpp index 837f52c39..62cd26b6a 100644 --- a/include/ck_tile/host/reference/reference_layernorm2d_fwd.hpp +++ b/include/ck_tile/host/reference/reference_layernorm2d_fwd.hpp @@ -8,20 +8,44 @@ namespace ck_tile { +// Note: for simplicity, each functor only care about single M +struct reference_layernorm2d_default_epilogue +{ + template + void operator()(int m, HostTensor& o, const HostTensor& acc) + { + const int N = acc.mDesc.get_lengths()[1]; + for(int n = 0; n < N; ++n) + { + o(m, n) = ck_tile::type_convert(acc(m, n)); + } + } + + template + auto operator()(int m, const HostTensor& acc) + { + HostTensor o(acc.get_lengths(), acc.get_strides()); + operator()(m, o, acc); + return o; + } +}; + template + typename InvStdDataType, + typename Epilogue = reference_layernorm2d_default_epilogue> void reference_layernorm2d_fwd(const HostTensor& x_m_n, const HostTensor& gamma_n, const HostTensor& beta_n, HostTensor& y_m_n, HostTensor& mean_m, HostTensor& invStd_m, - ComputeDataType epsilon) + ComputeDataType epsilon, + Epilogue epilogue_functor = {}) { auto layernorm2d_fwd_func = [&](auto m) { const int N = x_m_n.mDesc.get_lengths()[1]; @@ -51,16 +75,19 @@ void reference_layernorm2d_fwd(const HostTensor& x_m_n, if constexpr(!std::is_same_v) invStd_m(m) = ck_tile::type_convert(divisor); + HostTensor acc(x_m_n.get_lengths(), x_m_n.get_strides()); for(int n = 0; n < N; ++n) { ComputeDataType x = ck_tile::type_convert(x_m_n(m, n)); ComputeDataType gamma = ck_tile::type_convert(gamma_n(n)); ComputeDataType beta = ck_tile::type_convert(beta_n(n)); - auto y = (x - mean) * divisor; - y = y * gamma + beta; + auto a_ = (x - mean) * divisor; + a_ = a_ * gamma + beta; - y_m_n(m, n) = ck_tile::type_convert(y); + acc(m, n) = a_; } + + epilogue_functor(m, y_m_n, acc); }; make_ParallelTensorFunctor(layernorm2d_fwd_func, diff --git a/include/ck_tile/ops/add_rmsnorm2d_rdquant.hpp b/include/ck_tile/ops/add_rmsnorm2d_rdquant.hpp index 
eb06fea2d..fb8d7221b 100644 --- a/include/ck_tile/ops/add_rmsnorm2d_rdquant.hpp +++ b/include/ck_tile/ops/add_rmsnorm2d_rdquant.hpp @@ -9,4 +9,5 @@ #include "ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_one_pass.hpp" #include "ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_problem.hpp" #include "ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_three_pass.hpp" +#include "ck_tile/ops/common/generic_2d_block_shape.hpp" #include "ck_tile/ops/common/tensor_layout.hpp" diff --git a/include/ck_tile/ops/common.hpp b/include/ck_tile/ops/common.hpp index 4363ea1f5..1510f18a3 100644 --- a/include/ck_tile/ops/common.hpp +++ b/include/ck_tile/ops/common.hpp @@ -3,4 +3,5 @@ #pragma once +#include "ck_tile/ops/common/generic_2d_block_shape.hpp" #include "ck_tile/ops/common/tensor_layout.hpp" diff --git a/include/ck_tile/ops/layernorm2d/kernel/layernorm2d_fwd_shape.hpp b/include/ck_tile/ops/common/generic_2d_block_shape.hpp similarity index 96% rename from include/ck_tile/ops/layernorm2d/kernel/layernorm2d_fwd_shape.hpp rename to include/ck_tile/ops/common/generic_2d_block_shape.hpp index e4b60331e..64ad20c3b 100644 --- a/include/ck_tile/ops/layernorm2d/kernel/layernorm2d_fwd_shape.hpp +++ b/include/ck_tile/ops/common/generic_2d_block_shape.hpp @@ -1,11 +1,10 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
#pragma once -#include "ck_tile/core.hpp" - namespace ck_tile { + /* // clang-format off @@ -42,7 +41,7 @@ template typename Vector_, // contiguous pixels(vector size) along seq index_t BlockSize_ = warpSize* reduce_on_sequence(WarpPerBlock_{}, multiplies{}, number<1>{})> -struct Layernorm2dShape +struct Generic2dBlockShape { // block size static constexpr index_t Block_M = BlockTile_::at(number<0>{}); diff --git a/include/ck_tile/ops/elementwise.hpp b/include/ck_tile/ops/elementwise.hpp index 62ba9dc0b..cd1e43fb8 100644 --- a/include/ck_tile/ops/elementwise.hpp +++ b/include/ck_tile/ops/elementwise.hpp @@ -4,4 +4,5 @@ #pragma once #include "ck_tile/ops/elementwise/unary_element_wise_operation.hpp" +#include "ck_tile/ops/common/generic_2d_block_shape.hpp" #include "ck_tile/ops/common/tensor_layout.hpp" diff --git a/include/ck_tile/ops/epilogue.hpp b/include/ck_tile/ops/epilogue.hpp index a98f60b36..c24744bdb 100644 --- a/include/ck_tile/ops/epilogue.hpp +++ b/include/ck_tile/ops/epilogue.hpp @@ -5,4 +5,6 @@ #include "ck_tile/ops/epilogue/cshuffle_epilogue.hpp" #include "ck_tile/ops/epilogue/default_2d_epilogue.hpp" +#include "ck_tile/ops/epilogue/dynamic_quant_epilogue.hpp" +#include "ck_tile/ops/common/generic_2d_block_shape.hpp" #include "ck_tile/ops/common/tensor_layout.hpp" diff --git a/include/ck_tile/ops/epilogue/default_2d_epilogue.hpp b/include/ck_tile/ops/epilogue/default_2d_epilogue.hpp index 5dc49c3b0..7c5d5a6f3 100644 --- a/include/ck_tile/ops/epilogue/default_2d_epilogue.hpp +++ b/include/ck_tile/ops/epilogue/default_2d_epilogue.hpp @@ -9,23 +9,29 @@ namespace ck_tile { // this epilogue just store out a M*N matrix, row major -template +template struct Default2DEpilogueProblem { - using AccDataType = remove_cvref_t; - using ODataType = remove_cvref_t; - static constexpr bool kPadM = kPadM_; - static constexpr bool kPadN = kPadN_; + using AccDataType = remove_cvref_t; + using ODataType = remove_cvref_t; + static constexpr bool kPadM = kPadM_; + static 
constexpr bool kPadN = kPadN_; + static constexpr bool UseRawStore = UseRawStore_; }; template struct Default2DEpilogue { - using Problem = remove_cvref_t; - using AccDataType = remove_cvref_t; - using ODataType = remove_cvref_t; - static constexpr bool kPadM = Problem::kPadM; - static constexpr bool kPadN = Problem::kPadN; + using Problem = remove_cvref_t; + using AccDataType = remove_cvref_t; + using ODataType = remove_cvref_t; + static constexpr bool kPadM = Problem::kPadM; + static constexpr bool kPadN = Problem::kPadN; + static constexpr bool UseRawStore = Problem::UseRawStore; CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize() { return 0; } @@ -36,7 +42,7 @@ struct Default2DEpilogue { // TODO: this is ugly - if constexpr(kPadM || kPadN) + if constexpr(UseRawStore && (kPadM || kPadN)) { store_tile_raw(o_dram_window_tmp, cast_tile(o_acc_tile)); buffer_store_fence(); diff --git a/include/ck_tile/ops/epilogue/dynamic_quant_epilogue.hpp b/include/ck_tile/ops/epilogue/dynamic_quant_epilogue.hpp new file mode 100644 index 000000000..2e2960411 --- /dev/null +++ b/include/ck_tile/ops/epilogue/dynamic_quant_epilogue.hpp @@ -0,0 +1,140 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck_tile/core.hpp" +#include "ck_tile/ops/reduce.hpp" + +namespace ck_tile { + +template +struct DynamicQuantEpilogueTraits +{ + static constexpr bool kPadM = kPadM_; + static constexpr bool kPadN = kPadN_; + static constexpr bool UseRawStore = UseRawStore_; + static constexpr bool UseMax3 = UseMax3_; +}; + +// this epilogue just store out a M*N matrix, row major +template +struct DynamicQuantEpilogueProblem +{ + using AccDataType = remove_cvref_t; + using YScaleDataType = remove_cvref_t; + using ODataType = remove_cvref_t; + using BlockShape = remove_cvref_t; // can consum generic 2d shape + using Traits = remove_cvref_t; +}; + +template +struct DynamicQuantEpilogue +{ + using Problem = remove_cvref_t; + using AccDataType = remove_cvref_t; + using YScaleDataType = remove_cvref_t; + using ODataType = remove_cvref_t; + using BlockShape = remove_cvref_t; + static constexpr bool kPadM = Problem::Traits::kPadM; + static constexpr bool kPadN = Problem::Traits::kPadN; + static constexpr bool UseRawStore = Problem::Traits::UseRawStore; + static constexpr bool UseMax3 = Problem::Traits::UseMax3; + + CK_TILE_HOST_DEVICE static constexpr auto GetBlockReduce2d() + { + using P_ = BlockReduce2dProblem; + return BlockReduce2d{}; + } + + CK_TILE_HOST_DEVICE static constexpr auto GetBlockReduce2dSync() + { + using P_ = BlockReduce2dProblem; + return BlockReduce2dSync{}; + } + + CK_TILE_HOST_DEVICE static constexpr auto GetBlockReduce2dCrossWarpSync() + { + using P_ = BlockReduce2dProblem; + return BlockReduce2dCrossWarpSync{}; + } + + CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize() + { + auto reduce_crosswarp_sync = GetBlockReduce2dCrossWarpSync(); + return reduce_crosswarp_sync.GetSmemSize(); + } + + // TODO: this function assume store out vector size is the same as OAccTile last dimension size + // how do we fix this ? 
+ template + CK_TILE_DEVICE auto operator()(ODramWindowTmp& o_dram_window_tmp, + YScaleWindow& y_scale_window, + const OAccTile& o_acc_tile, + void* smem) + { + auto reduce = GetBlockReduce2d(); + auto reduce_sync = GetBlockReduce2dSync(); + auto reduce_crosswarp_sync = GetBlockReduce2dCrossWarpSync(); + + const auto f_absmax = [](auto acc_, auto v_0_) { return max(acc_, abs(v_0_)); }; + + auto row_absmax = [&]() { + constexpr auto y_size_per_row = + OAccTile{}.get_tile_distribution().get_ys_to_d_descriptor().get_lengths().at( + number<1>{}); + // constexpr auto y_size_per_row = OAccTile::get_lengths()[number<1>{}]; + if constexpr(UseMax3 && std::is_same_v && y_size_per_row % 2 == 0) + { + // fast max3 implementation + const auto f_max3 = [](auto acc_, auto v_0_, auto v_1_) { + float rtn; + asm volatile("v_max3_f32 %0, %1, abs(%2), abs(%3)" + : "=v"(rtn) + : "v"(acc_), "v"(v_0_), "v"(v_1_)); + return rtn; + }; + return reduce(o_acc_tile, type_convert(0), f_max3, sequence<1, 2>{}); + } + else + { + return reduce(o_acc_tile, type_convert(0), f_absmax); + } + }(); + reduce_sync(row_absmax, f_absmax); + reduce_crosswarp_sync(row_absmax, smem, f_absmax); + + // here y_scale is Acc TYpe, need convert to YScale type later + auto y_scale = tile_elementwise_in( + [&](const auto& v_) { + return v_ / type_convert(numeric::max()); + }, + row_absmax); + + store_tile(y_scale_window, cast_tile(y_scale)); + + auto o_acc_scaled_tile = + make_static_distributed_tensor(o_acc_tile.get_tile_distribution()); + + sweep_tile(o_acc_tile, [&](auto idx) { + constexpr auto row_id = make_tuple(idx[number<0>{}]); + o_acc_scaled_tile(idx) = o_acc_tile[idx] / y_scale(row_id); + }); + + // TODO: this is ugly + if constexpr(UseRawStore && (kPadM || kPadN)) + { + store_tile_raw(o_dram_window_tmp, cast_tile(o_acc_scaled_tile)); + buffer_store_fence(); + } + else + { + store_tile(o_dram_window_tmp, cast_tile(o_acc_scaled_tile)); + } + } +}; +} // namespace ck_tile diff --git 
a/include/ck_tile/ops/fmha.hpp b/include/ck_tile/ops/fmha.hpp index 9389a5397..e106264ce 100644 --- a/include/ck_tile/ops/fmha.hpp +++ b/include/ck_tile/ops/fmha.hpp @@ -43,4 +43,5 @@ #include "ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qx_ks_vs_custom_policy.hpp" #include "ck_tile/ops/fmha/pipeline/tile_fmha_shape.hpp" #include "ck_tile/ops/fmha/pipeline/tile_fmha_traits.hpp" +#include "ck_tile/ops/common/generic_2d_block_shape.hpp" #include "ck_tile/ops/common/tensor_layout.hpp" diff --git a/include/ck_tile/ops/gemm.hpp b/include/ck_tile/ops/gemm.hpp index c3e028528..ac74782a3 100644 --- a/include/ck_tile/ops/gemm.hpp +++ b/include/ck_tile/ops/gemm.hpp @@ -39,4 +39,5 @@ #include "ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma_impl.hpp" #include "ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp" #include "ck_tile/ops/gemm/warp/warp_gemm_impl.hpp" +#include "ck_tile/ops/common/generic_2d_block_shape.hpp" #include "ck_tile/ops/common/tensor_layout.hpp" diff --git a/include/ck_tile/ops/image_to_column.hpp b/include/ck_tile/ops/image_to_column.hpp index 57e83a7a5..2b02bcc5d 100644 --- a/include/ck_tile/ops/image_to_column.hpp +++ b/include/ck_tile/ops/image_to_column.hpp @@ -6,4 +6,5 @@ #include "ck_tile/ops/image_to_column/kernel/image_to_column_kernel.hpp" #include "ck_tile/ops/image_to_column/pipeline/block_image_to_column_problem.hpp" #include "ck_tile/ops/image_to_column/pipeline/tile_image_to_column_shape.hpp" +#include "ck_tile/ops/common/generic_2d_block_shape.hpp" #include "ck_tile/ops/common/tensor_layout.hpp" diff --git a/include/ck_tile/ops/layernorm2d.hpp b/include/ck_tile/ops/layernorm2d.hpp index 2a403b0f4..711c5d859 100644 --- a/include/ck_tile/ops/layernorm2d.hpp +++ b/include/ck_tile/ops/layernorm2d.hpp @@ -4,9 +4,10 @@ #pragma once #include "ck_tile/ops/layernorm2d/kernel/layernorm2d_fwd_kernel.hpp" -#include "ck_tile/ops/layernorm2d/kernel/layernorm2d_fwd_shape.hpp" #include 
"ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_default_policy.hpp" #include "ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_one_pass.hpp" #include "ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_problem.hpp" #include "ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_two_pass.hpp" +#include "ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_traits.hpp" +#include "ck_tile/ops/common/generic_2d_block_shape.hpp" #include "ck_tile/ops/common/tensor_layout.hpp" diff --git a/include/ck_tile/ops/layernorm2d/kernel/layernorm2d_fwd_kernel.hpp b/include/ck_tile/ops/layernorm2d/kernel/layernorm2d_fwd_kernel.hpp index cebe5131a..9a2e06d05 100644 --- a/include/ck_tile/ops/layernorm2d/kernel/layernorm2d_fwd_kernel.hpp +++ b/include/ck_tile/ops/layernorm2d/kernel/layernorm2d_fwd_kernel.hpp @@ -5,19 +5,24 @@ #include "ck_tile/core.hpp" #include "ck_tile/ops/common.hpp" +#include "ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_traits.hpp" namespace ck_tile { // host side args struct Layernorm2dFwdHostArgs { - const void* p_x; - const void* p_gamma; - const void* p_beta; - - void* p_y; - void* p_mean; - void* p_invStd; + const void* p_x; // [m ,n], input, fp16/bf16 + const void* p_x_residual; // [m ,n], shortcut input, prec same as input, nullptr if not used + const void* p_x_scale; // [1 ,n], smooth scale input, fp32, nullptr if not used + const void* p_gamma; // [1, n], gamma, prec same as input + const void* p_beta; // [1, n], beta, prec same as input + + void* p_y; // [m, n], output, fp16/bf16 + void* p_y_residual; // [m, n], shortcut output, prec same as input, nullptr if not used + void* p_y_scale; // [m, 1], output a dynamic quant per row, nullptr if not used + void* p_mean; // [m, 1], output mean, prec same as input, nullptr if not used + void* p_invStd; // [m, 1], output inv-stdvariance, prec same as input, nullptr if not used float epsilon; @@ -27,10 +32,11 @@ struct Layernorm2dFwdHostArgs }; // TODO: Extract some type to wrapper 
class -template +template struct Layernorm2dFwd { using Pipeline = remove_cvref_t; + using Epilogue = remove_cvref_t; using Problem = typename Pipeline::Problem; using XDataType = remove_cvref_t; @@ -40,18 +46,26 @@ struct Layernorm2dFwd using YDataType = remove_cvref_t; using MeanDataType = remove_cvref_t; using InvStdDataType = remove_cvref_t; + using XScaleDataType = remove_cvref_t; + using YScaleDataType = remove_cvref_t; + + // for simplicity, shortcut input/output type is same as X + using XResidualDataType = XDataType; + using YResidualDataType = XDataType; static constexpr bool kHasGamma = !std::is_same_v; static constexpr bool kHasBeta = !std::is_same_v; - static constexpr bool kSaveMeanInvStd = Problem::kSaveMeanInvStd; - static constexpr bool kSaveMean = Problem::kSaveMeanInvStd; - static constexpr bool kSaveInvStd = Problem::kSaveMeanInvStd; - - static constexpr index_t Block_M = Problem::BlockShape::Block_M; - static constexpr index_t Block_N = Problem::BlockShape::Block_N; - static constexpr bool kPadM = false; // always no need to pad along M - static constexpr bool kPadN = Problem::kPadN; - static constexpr bool kTwoPass = Problem::kTwoPass; + static constexpr bool kSaveMeanInvStd = Problem::Traits::kSaveMeanInvStd; + static constexpr bool kSaveMean = Problem::Traits::kSaveMeanInvStd; + static constexpr bool kSaveInvStd = Problem::Traits::kSaveMeanInvStd; + + static constexpr index_t Block_M = Problem::BlockShape::Block_M; + static constexpr index_t Block_N = Problem::BlockShape::Block_N; + static constexpr bool kPadM = false; // always no need to pad along M + static constexpr bool kPadN = Problem::Traits::kPadN; + static constexpr bool kTwoPass = Problem::Traits::kTwoPass; + static constexpr auto kFusedAdd = Problem::Traits::kFusedAdd; + static constexpr auto kFusedQuant = Problem::Traits::kFusedQuant; static constexpr index_t ThreadPerWarp_N = Problem::BlockShape::ThreadPerWarp_N; static constexpr index_t Vector_N = Problem::BlockShape::Vector_N; 
@@ -62,13 +76,18 @@ struct Layernorm2dFwd struct Kargs { - const void* p_x; - const void* p_gamma; - const void* p_beta; + const void* p_x; // [m ,n], input, fp16/bf16 + const void* p_x_residual; // [m ,n], shortcut input, prec same as input, nullptr if not used + const void* p_x_scale; // [1 ,n], smooth scale input, fp32, nullptr if not used + const void* p_gamma; // [1, n], gamma, prec same as input + const void* p_beta; // [1, n], beta, prec same as input - void* p_y; - void* p_mean; - void* p_invStd; + void* p_y; // [m, n], output, fp16/bf16 + void* p_y_residual; // [m, n], shortcut output, prec same as input, nullptr if not used + void* p_y_scale; // [m, 1], output a dynamic quant per row, nullptr if not used + + void* p_mean; // [m, 1], output mean, prec same as input, nullptr if not used + void* p_invStd; // [m, 1], output inv-stdvariance, prec same as input, nullptr if not used float epsilon; @@ -81,9 +100,13 @@ struct Layernorm2dFwd CK_TILE_HOST static constexpr Kargs MakeKargs(const Hargs& hargs) { return Kargs{hargs.p_x, + hargs.p_x_residual, + hargs.p_x_scale, hargs.p_gamma, hargs.p_beta, hargs.p_y, + hargs.p_y_residual, + hargs.p_y_scale, hargs.p_mean, hargs.p_invStd, hargs.epsilon, @@ -106,6 +129,7 @@ struct Layernorm2dFwd template <> struct t2s { static constexpr const char * name = "bf16"; }; template <> struct t2s { static constexpr const char * name = "fp8"; }; template <> struct t2s { static constexpr const char * name = "bf8"; }; + template <> struct t2s { static constexpr const char * name = "int8"; }; // clang-format on // in byte @@ -113,24 +137,41 @@ struct Layernorm2dFwd CK_TILE_HOST static std::string GetName() { +#define _SS_ std::string +#define _TS_ std::to_string // clang-format off using S_ = typename Problem::BlockShape; auto surfix = [&] () { std::string n; + if (kFusedAdd != Layernorm2dFusedAddEnum::NO_ADD) n += _SS_("_") + Layernorm2dFusedAddEnumName::name; + if (kFusedQuant != Layernorm2dFusedQuantEnum::NO_SWEEP) n += _SS_("_") + 
Layernorm2dFusedQuantEnumName::name; if (kPadN) n += "_pn"; if (kSaveMeanInvStd) n += "_mv"; - if (kTwoPass) n += "_2p"; + // if (kTwoPass) n += "_2p"; return n; }(); - #define _SS_ std::string - #define _TS_ std::to_string - return _SS_("layernorm2d_fwd_") + _SS_(t2s::name) + "_" + + auto prec_str = [&] () { + std::string base_str = _SS_(t2s::name); + if (!std::is_same_v) { + base_str += _SS_("_") + _SS_(t2s::name); + } + if (kFusedQuant == Layernorm2dFusedQuantEnum::SMOOTH_DYNAMIC_QUANT) { + base_str += _SS_("_sx") + _SS_(t2s::name); + base_str += _SS_("_sy") + _SS_(t2s::name); + } + if (kFusedQuant == Layernorm2dFusedQuantEnum::DYNAMIC_QUANT) { + base_str += _SS_("_sy") + _SS_(t2s::name); + } + return base_str; + }(); + + return _SS_("layernorm2d_fwd_") + _SS_(prec_str) + "_" + _TS_(S_::Block_M) + "x" + _TS_(S_::Block_N) + "_" + _TS_(S_::WarpPerBlock_M) + "x" + _TS_(S_::WarpPerBlock_N) + "_" + _TS_(S_::Warp_M) + "x" + _TS_(S_::Warp_N) + "_" + _TS_(S_::Vector_M) + "x" + _TS_(S_::Vector_N) + "_" + _SS_(Pipeline::name) + surfix; - #undef _SS_ - #undef _TS_ // clang-format on +#undef _SS_ +#undef _TS_ } CK_TILE_DEVICE void operator()(Kargs kargs) const @@ -153,6 +194,31 @@ struct Layernorm2dFwd tmp2_, make_tuple(number{}, number{}), {iM, 0}); }(); + const auto x_residual_window = [&]() { + if constexpr(kFusedAdd == Layernorm2dFusedAddEnum::PRE_ADD_STORE || + kFusedAdd == Layernorm2dFusedAddEnum::PRE_ADD) + { + const auto tmp_ = make_naive_tensor_view( + static_cast(kargs.p_x_residual), + make_tuple(kargs.m, kargs.n), + make_tuple(kargs.stride, 1), + number{}, + number<1>{}); + + // NOTE: we don't do any pad in this kernel for loading, assume that inside kernel + // will check the max count dynamically + const auto tmp2_ = pad_tensor_view(tmp_, + make_tuple(number{}, number{}), + sequence{}); + return make_tile_window( + tmp2_, make_tuple(number{}, number{}), {iM, 0}); + } + else + { + return make_null_tile_window(make_tuple(number{}, number{})); + } + }(); + const 
auto gamma_window = [&]() { const auto tmp_ = make_naive_tensor_view( static_cast(kargs.p_gamma), @@ -194,6 +260,28 @@ struct Layernorm2dFwd tmp2_, make_tuple(number{}, number{}), {iM, 0}); }(); + auto y_residual_window = [&]() { + if constexpr(kFusedAdd == Layernorm2dFusedAddEnum::PRE_ADD_STORE) + { + auto tmp_ = make_naive_tensor_view( + static_cast(kargs.p_y_residual), + make_tuple(kargs.m, kargs.n), + make_tuple(kargs.stride, 1), + number{}, + number<1>{}); + + auto tmp2_ = pad_tensor_view(tmp_, + make_tuple(number{}, number{}), + sequence{}); + return make_tile_window( + tmp2_, make_tuple(number{}, number{}), {iM, 0}); + } + else + { + return make_null_tile_window(make_tuple(number{}, number{})); + } + }(); + auto mean_window = [&]() { if constexpr(kSaveMean) { @@ -232,17 +320,60 @@ struct Layernorm2dFwd return make_null_tile_window(make_tuple(number{})); }(); + auto x_scale_window = [&]() { + if constexpr(kFusedQuant == Layernorm2dFusedQuantEnum::SMOOTH_DYNAMIC_QUANT) + { + const auto win_ = [&]() { + const auto tmp_0_ = make_naive_tensor_view_packed( + static_cast(kargs.p_x_scale), + make_tuple(kargs.n), + number{}); + + return pad_tensor_view(tmp_0_, + make_tuple(number{}), + sequence{}); // x_scale no need pad + }(); + return make_tile_window(win_, make_tuple(number{}), {0}); + } + else + return make_null_tile_window(make_tuple(number{})); + }(); + + auto y_scale_window = [&]() { + if constexpr(kFusedQuant == Layernorm2dFusedQuantEnum::SMOOTH_DYNAMIC_QUANT || + kFusedQuant == Layernorm2dFusedQuantEnum::DYNAMIC_QUANT) + { + const auto win_ = [&]() { + const auto tmp_0_ = make_naive_tensor_view_packed( + static_cast(kargs.p_y_scale), + make_tuple(kargs.m), + number<1>{}); + + return pad_tensor_view( + tmp_0_, make_tuple(number{}), sequence{}); + }(); + return make_tile_window(win_, make_tuple(number{}), {iM}); + } + else + return make_null_tile_window(make_tuple(number{})); + }(); + __shared__ char smem[GetSmemSize()]; Pipeline{}(x_window, + 
x_residual_window, gamma_window, beta_window, y_window, + y_residual_window, mean_window, inv_std_window, + x_scale_window, + y_scale_window, static_cast(kargs.epsilon), kargs.n, - smem); + smem, + Epilogue{}); } }; diff --git a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_one_pass.hpp b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_one_pass.hpp index c767a472a..16a7c3b86 100644 --- a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_one_pass.hpp +++ b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_one_pass.hpp @@ -5,6 +5,7 @@ #include "ck_tile/core.hpp" #include "ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_default_policy.hpp" +#include "ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_traits.hpp" #include #include @@ -24,20 +25,25 @@ struct Layernorm2dFwdPipelineOnePass using MeanDataType = ck_tile::remove_cvref_t; using InvStdDataType = ck_tile::remove_cvref_t; + using XResidualDataType = XDataType; + using YResidualDataType = XDataType; + static constexpr bool kHasGamma = !std::is_same_v; static constexpr bool kHasBeta = !std::is_same_v; - static constexpr bool kSaveMean = Problem::kSaveMeanInvStd; - static constexpr bool kSaveInvStd = Problem::kSaveMeanInvStd; + static constexpr bool kSaveMean = Problem::Traits::kSaveMeanInvStd; + static constexpr bool kSaveInvStd = Problem::Traits::kSaveMeanInvStd; static constexpr bool kNeedCrossWarpSync = Problem::kNeedCrossWarpSync; static constexpr bool kPadM = false; // TODO - BlockLayernorm2dFwdProblem::kPadM - static constexpr bool kPadN = Problem::kPadN; + static constexpr bool kPadN = Problem::Traits::kPadN; + static constexpr auto kFusedAdd = Problem::Traits::kFusedAdd; + static constexpr auto kFusedQuant = Problem::Traits::kFusedQuant; static constexpr const char* name = []() { if constexpr(kNeedCrossWarpSync) - return "bpr_op"; // block per row + return "bpr"; // block per row else - return "wpr_op"; // warp per row + return 
"wpr"; // warp per row }(); CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize() @@ -46,20 +52,30 @@ struct Layernorm2dFwdPipelineOnePass } template + typename InvStdWindow, + typename XScaleWindow, + typename YScaleWindow, + typename Epilogue> CK_TILE_DEVICE auto operator()(const XWindow& x_window_, + const XResidualWindow& x_residual_window_, const GammaWindow& gamma_window_, const BetaWindow& beta_window_, - YWindow& y_window, + YWindow& y_window_, + const YResidualWindow& y_residual_window_, MeanWindow& mean_window, InvStdWindow& inv_std_window, + const XScaleWindow& x_scale_window_, + YScaleWindow& y_scale_window, ComputeDataType epsilon, ck_tile::index_t row_size, - void* smem) const + void* smem, + Epilogue) const { const auto x_window = make_tile_window(x_window_, Policy::template MakeXBlockTileDistribution()); @@ -67,8 +83,17 @@ struct Layernorm2dFwdPipelineOnePass gamma_window_, Policy::template MakeGammaBetaBlockTileDistribution()); const auto beta_window = make_tile_window( beta_window_, Policy::template MakeGammaBetaBlockTileDistribution()); + const auto x_residual_window = make_tile_window( + x_residual_window_, Policy::template MakeXBlockTileDistribution()); + auto y_residual_window = make_tile_window( + y_residual_window_, Policy::template MakeXBlockTileDistribution()); + const auto x_scale_window = make_tile_window( + x_scale_window_, Policy::template MakeGammaBetaBlockTileDistribution()); + + auto x = load_tile(x_window); + auto x_resi = load_tile(x_residual_window); + auto x_scale = load_tile(x_scale_window); - const auto x = load_tile(x_window); int cur_count = 0; int max_count = block_tile_welford_calculate_max_count(row_size); @@ -81,6 +106,18 @@ struct Layernorm2dFwdPipelineOnePass const auto gamma = load_tile(gamma_window); const auto beta = load_tile(beta_window); + if constexpr(kFusedAdd == Layernorm2dFusedAddEnum::PRE_ADD_STORE || + kFusedAdd == Layernorm2dFusedAddEnum::PRE_ADD) + { + sweep_tile(x_resi, [&](auto idx) { + // compute 
x = x_resi + x + x(idx) = type_convert(x_resi(idx)) + + type_convert(x(idx)); + }); + if constexpr(kFusedAdd == Layernorm2dFusedAddEnum::PRE_ADD_STORE) + store_tile(y_residual_window, x); + } + // compute welford each-thread->cross-lane->cross-warp auto [mean, var] = block_welford(x, cur_count, max_count); block_welford_sync(mean, var, cur_count); @@ -100,8 +137,8 @@ struct Layernorm2dFwdPipelineOnePass store_tile(inv_std_window, cast_tile(inv_std)); // layernorm computation - auto y = make_static_distributed_tensor(x.get_tile_distribution()); - sweep_tile(y, [&, mean_ = mean](auto idx) { + auto ln = make_static_distributed_tensor(x.get_tile_distribution()); + sweep_tile(ln, [&, mean_ = mean](auto idx) { constexpr auto i_idx = make_tuple(idx[number<0>{}]); constexpr auto j_idx = make_tuple(idx[number<1>{}]); @@ -109,11 +146,28 @@ struct Layernorm2dFwdPipelineOnePass const auto beta_ = type_convert(beta[j_idx]); const auto x_ = type_convert(x[idx]); - auto y_ = (x_ - mean_[i_idx]) * inv_std[i_idx] * gamma_ + beta_; + auto ln_ = (x_ - mean_[i_idx]) * inv_std[i_idx] * gamma_ + beta_; - y(idx) = type_convert(y_); + ln(idx) = ln_; }); - store_tile(y_window, y); + + if constexpr(kFusedQuant == Layernorm2dFusedQuantEnum::SMOOTH_DYNAMIC_QUANT) + { + // smooth-quant pre-scale, then run rowwise-quant + sweep_tile(ln, [&](auto idx) { + constexpr auto j_idx = make_tuple(idx[number<1>{}]); + const auto xs_ = type_convert(x_scale[j_idx]); + ln(idx) = ln(idx) * xs_; + }); + } + + if constexpr(kFusedQuant == Layernorm2dFusedQuantEnum::DYNAMIC_QUANT || + kFusedQuant == Layernorm2dFusedQuantEnum::SMOOTH_DYNAMIC_QUANT) + { + Epilogue{}(y_window_, y_scale_window, ln, smem); + } + else + Epilogue{}(y_window_, ln); } }; } // namespace ck_tile diff --git a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_problem.hpp b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_problem.hpp index 8e9f8e81e..7ec830add 100644 --- 
a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_problem.hpp +++ b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_problem.hpp @@ -14,10 +14,10 @@ template + typename Traits_> struct Layernorm2dFwdPipelineProblem { using XDataType = remove_cvref_t; @@ -27,14 +27,14 @@ struct Layernorm2dFwdPipelineProblem using YDataType = remove_cvref_t; using MeanDataType = remove_cvref_t; using InvStdDataType = remove_cvref_t; + using XScaleDataType = remove_cvref_t; + using YScaleDataType = remove_cvref_t; using BlockShape = remove_cvref_t; static constexpr bool kNeedCrossLaneSync = BlockShape::ThreadPerWarp_N > 1; static constexpr bool kNeedCrossWarpSync = BlockShape::WarpPerBlock_N > 1; - static constexpr bool kPadN = kPadN_; - static constexpr bool kSaveMeanInvStd = kSaveMeanInvStd_; - static constexpr bool kTwoPass = kTwoPass_; + using Traits = remove_cvref_t; }; } // namespace ck_tile diff --git a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_two_pass.hpp b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_two_pass.hpp index e35d02e70..ec10efbc6 100644 --- a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_two_pass.hpp +++ b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_two_pass.hpp @@ -24,20 +24,25 @@ struct Layernorm2dFwdPipelineTwoPass using MeanDataType = ck_tile::remove_cvref_t; using InvStdDataType = ck_tile::remove_cvref_t; + using XResidualDataType = XDataType; + using YResidualDataType = XDataType; + static constexpr bool kHasGamma = !std::is_same_v; static constexpr bool kHasBeta = !std::is_same_v; - static constexpr bool kSaveMean = Problem::kSaveMeanInvStd; - static constexpr bool kSaveInvStd = Problem::kSaveMeanInvStd; + static constexpr bool kSaveMean = Problem::Traits::kSaveMeanInvStd; + static constexpr bool kSaveInvStd = Problem::Traits::kSaveMeanInvStd; static constexpr bool kNeedCrossWarpSync = Problem::kNeedCrossWarpSync; static constexpr bool 
kPadM = false; // TODO - BlockLayernorm2dFwdProblem::kPadM - static constexpr bool kPadN = Problem::kPadN; + static constexpr bool kPadN = Problem::Traits::kPadN; + static constexpr auto kFusedAdd = Problem::Traits::kFusedAdd; + static constexpr auto kFusedQuant = Problem::Traits::kFusedQuant; static constexpr const char* name = []() { if constexpr(kNeedCrossWarpSync) - return "bpr_tp"; // block per row + return "bpr_2p"; // block per row else - return "wpr_tp"; // warp per row + return "wpr_2p"; // warp per row }(); CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize() @@ -46,20 +51,30 @@ struct Layernorm2dFwdPipelineTwoPass } template + typename InvStdWindow, + typename XScaleWindow, + typename YScaleWindow, + typename Epilogue> CK_TILE_DEVICE auto operator()(const XWindow& x_window_, + const XResidualWindow& x_residual_window_, const GammaWindow& gamma_window_, const BetaWindow& beta_window_, YWindow& y_window, + const YResidualWindow& y_residual_window_, MeanWindow& mean_window, InvStdWindow& inv_std_window, + const XScaleWindow& /*x_scale_window*/, + YScaleWindow& /*y_scale_window*/, ComputeDataType epsilon, ck_tile::index_t row_size, - void* smem) const + void* smem, + Epilogue) const { auto x_window = make_tile_window(x_window_, Policy::template MakeXBlockTileDistribution()); @@ -67,6 +82,10 @@ struct Layernorm2dFwdPipelineTwoPass gamma_window_, Policy::template MakeGammaBetaBlockTileDistribution()); auto beta_window = make_tile_window( beta_window_, Policy::template MakeGammaBetaBlockTileDistribution()); + auto x_residual_window = make_tile_window( + x_residual_window_, Policy::template MakeXBlockTileDistribution()); + auto y_residual_window = make_tile_window( + y_residual_window_, Policy::template MakeXBlockTileDistribution()); // Problem::BlockShape static constexpr index_t Block_N = Problem::BlockShape::Block_N; @@ -93,9 +112,26 @@ struct Layernorm2dFwdPipelineTwoPass for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN) 
{ - const auto x = load_tile(x_window); - block_welford(x, mean, var, cur_count, max_count); + auto x = load_tile(x_window); + auto x_resi = load_tile(x_residual_window); + move_tile_window(x_window, {0, Block_N}); + move_tile_window(x_residual_window, {0, Block_N}); + if constexpr(kFusedAdd == Layernorm2dFusedAddEnum::PRE_ADD_STORE || + kFusedAdd == Layernorm2dFusedAddEnum::PRE_ADD) + { + sweep_tile(x_resi, [&](auto idx) { + // compute x = x_resi + x + x(idx) = type_convert(x_resi(idx)) + + type_convert(x(idx)); + }); + if constexpr(kFusedAdd == Layernorm2dFusedAddEnum::PRE_ADD_STORE) + { + store_tile(y_residual_window, x); + move_tile_window(y_residual_window, {0, Block_N}); + } + } + block_welford(x, mean, var, cur_count, max_count); } block_welford_sync(mean, var, cur_count); @@ -119,6 +155,7 @@ struct Layernorm2dFwdPipelineTwoPass row_size % Block_N == 0 ? row_size - Block_N : row_size - row_size % Block_N; move_tile_window(x_window, {0, -Block_N}); + move_tile_window(x_residual_window, {0, -Block_N}); move_tile_window(gamma_window, {stride_to_right_most_window}); move_tile_window(beta_window, {stride_to_right_most_window}); move_tile_window(y_window, {0, stride_to_right_most_window}); @@ -126,14 +163,24 @@ struct Layernorm2dFwdPipelineTwoPass // layernorm computation for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN) { - const auto x = load_tile(x_window); + auto x = load_tile(x_window); + auto x_resi = load_tile(x_residual_window); + if constexpr(kFusedAdd == Layernorm2dFusedAddEnum::PRE_ADD_STORE || + kFusedAdd == Layernorm2dFusedAddEnum::PRE_ADD) + { + sweep_tile(x_resi, [&](auto idx) { + // compute x = x_resi + x + x(idx) = type_convert(x_resi(idx)) + + type_convert(x(idx)); + }); + } // load gamma/beta (TODO: support no gamma/beta?) 
const auto gamma = load_tile(gamma_window); const auto beta = load_tile(beta_window); - auto y = make_static_distributed_tensor(x.get_tile_distribution()); + auto ln = make_static_distributed_tensor(x.get_tile_distribution()); - sweep_tile(y, [&, mean_ = mean](auto idx) { + sweep_tile(ln, [&, mean_ = mean](auto idx) { constexpr auto i_idx = make_tuple(idx[number<0>{}]); constexpr auto j_idx = make_tuple(idx[number<1>{}]); @@ -141,14 +188,16 @@ struct Layernorm2dFwdPipelineTwoPass const auto beta_ = type_convert(beta[j_idx]); const auto x_ = type_convert(x[idx]); - auto y_ = (x_ - mean_[i_idx]) * inv_std[i_idx] * gamma_ + beta_; + auto ln_ = (x_ - mean_[i_idx]) * inv_std[i_idx] * gamma_ + beta_; - y(idx) = type_convert(y_); + ln(idx) = ln_; }); - store_tile(y_window, y); + static_assert(kFusedQuant != Layernorm2dFusedQuantEnum::DYNAMIC_QUANT); + Epilogue{}(y_window, ln); move_tile_window(x_window, {0, -Block_N}); + move_tile_window(x_residual_window, {0, -Block_N}); move_tile_window(gamma_window, {-Block_N}); move_tile_window(beta_window, {-Block_N}); move_tile_window(y_window, {0, -Block_N}); diff --git a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_traits.hpp b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_traits.hpp new file mode 100644 index 000000000..fb327f74a --- /dev/null +++ b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_traits.hpp @@ -0,0 +1,54 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck_tile/core/utility/type_traits.hpp" + +namespace ck_tile { + +enum class Layernorm2dFusedAddEnum +{ + NO_ADD = 0, + // fused add before layernorm and store result to global + PRE_ADD_STORE = 1, + // fused add before layernorm, but not store result + PRE_ADD = 2, +}; + +// clang-format off +template struct Layernorm2dFusedAddEnumName; +template<> struct Layernorm2dFusedAddEnumName { static constexpr const char * name = "no"; }; +template<> struct Layernorm2dFusedAddEnumName { static constexpr const char * name = "pras"; }; +template<> struct Layernorm2dFusedAddEnumName { static constexpr const char * name = "pra"; }; +// clang-format on + +enum class Layernorm2dFusedQuantEnum +{ + NO_SWEEP = 0, + SMOOTH_DYNAMIC_QUANT = 1, // smooth oulier + rowwise quant, need input x-scale and store y_scale + DYNAMIC_QUANT = 2, // rowwise quant, store out a y-scale +}; + +// clang-format off +template struct Layernorm2dFusedQuantEnumName; +template<> struct Layernorm2dFusedQuantEnumName { static constexpr const char * name = "no"; }; +template<> struct Layernorm2dFusedQuantEnumName { static constexpr const char * name = "dqt"; }; +template<> struct Layernorm2dFusedQuantEnumName { static constexpr const char * name = "smdqt"; }; +// clang-format on + +template +struct Layernorm2dFwdTraits +{ + static constexpr bool kPadN = kPadN_; + static constexpr bool kSaveMeanInvStd = kSaveMeanInvStd_; + static constexpr bool kTwoPass = kTwoPass_; + static constexpr Layernorm2dFusedAddEnum kFusedAdd = kFusedAdd_; + static constexpr Layernorm2dFusedQuantEnum kFusedQuant = kFusedQuant_; +}; + +} // namespace ck_tile diff --git a/include/ck_tile/ops/permute.hpp b/include/ck_tile/ops/permute.hpp index ee8c69372..990e9ecc0 100644 --- a/include/ck_tile/ops/permute.hpp +++ b/include/ck_tile/ops/permute.hpp @@ -5,4 +5,5 @@ #include "ck_tile/ops/permute/kernel/generic_permute_kernel.hpp" #include "ck_tile/ops/permute/pipeline/generic_petmute_problem.hpp" +#include 
"ck_tile/ops/common/generic_2d_block_shape.hpp" #include "ck_tile/ops/common/tensor_layout.hpp" diff --git a/include/ck_tile/ops/reduce.hpp b/include/ck_tile/ops/reduce.hpp index fe2d24044..aa617ee2b 100644 --- a/include/ck_tile/ops/reduce.hpp +++ b/include/ck_tile/ops/reduce.hpp @@ -7,4 +7,5 @@ #include "ck_tile/ops/reduce/block/block_reduce2d.hpp" #include "ck_tile/ops/reduce/block/block_reduce2d_default_policy.hpp" #include "ck_tile/ops/reduce/block/block_reduce2d_problem.hpp" +#include "ck_tile/ops/common/generic_2d_block_shape.hpp" #include "ck_tile/ops/common/tensor_layout.hpp" diff --git a/include/ck_tile/ops/reduce/block/block_reduce.hpp b/include/ck_tile/ops/reduce/block/block_reduce.hpp index fa3007d1e..c93329bfb 100644 --- a/include/ck_tile/ops/reduce/block/block_reduce.hpp +++ b/include/ck_tile/ops/reduce/block/block_reduce.hpp @@ -301,7 +301,10 @@ struct BlockReduce2D .get_static_tile_distribution_encoding(), ReduceDim{})); - return make_static_distributed_tensor(acc_dstr); + auto dst_ = make_static_distributed_tensor(acc_dstr); + // init acc_tensor + tile_elementwise_inout([&](auto& x_) { x_ = type_convert(reduce_init); }, dst_); + return dst_; } // return number of pixels each lane need to reduce diff --git a/include/ck_tile/ops/reduce/block/block_reduce2d.hpp b/include/ck_tile/ops/reduce/block/block_reduce2d.hpp index beb8c718e..3c6814711 100644 --- a/include/ck_tile/ops/reduce/block/block_reduce2d.hpp +++ b/include/ck_tile/ops/reduce/block/block_reduce2d.hpp @@ -17,14 +17,24 @@ struct BlockReduce2d CK_TILE_DEVICE constexpr BlockReduce2d() {} - template + template > CK_TILE_DEVICE void operator()(const XDistributedTensor_& x_tensor, YDistributedTensor_& y_tensor, - const ReduceFunc& reduce_func) + const ReduceFunc& reduce_func, + ReducePacksPerXDim = {}) { + sweep_tile( + [&](auto... 
idx_) { + constexpr auto idx_0 = make_tuple(make_tuple(idx_[number<0>{}]...)[number<0>{}]); + y_tensor(idx_0) = reduce_func(y_tensor(idx_0), x_tensor[idx_]...); + }, + ReducePacksPerXDim{}); +#if 0 constexpr auto I0 = number<0>{}; constexpr auto I1 = number<1>{}; - constexpr auto spans = XDistributedTensor_::get_distributed_spans(); // FIXME: hard coded to reduce 2nd axis @@ -42,6 +52,7 @@ struct BlockReduce2d y_tensor(y_dstr_idx) = y; }); +#endif } template @@ -63,14 +74,17 @@ struct BlockReduce2d return tensor; } - template + template > CK_TILE_DEVICE auto operator()(const XDistributedTensor_& x_tensor, const ComputeDataType& reduce_init, - const ReduceFunc& reduce_func) + const ReduceFunc& reduce_func, + ReducePacksPerXDim = {}) { auto y_tensor = MakeYBlockTile(); set_tile(y_tensor, reduce_init); - (*this)(x_tensor, y_tensor, reduce_func); + (*this)(x_tensor, y_tensor, reduce_func, ReducePacksPerXDim{}); return y_tensor; } diff --git a/include/ck_tile/ops/rmsnorm2d.hpp b/include/ck_tile/ops/rmsnorm2d.hpp index 98c60f1b5..f0a6cf960 100644 --- a/include/ck_tile/ops/rmsnorm2d.hpp +++ b/include/ck_tile/ops/rmsnorm2d.hpp @@ -9,4 +9,5 @@ #include "ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_one_pass.hpp" #include "ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_problem.hpp" #include "ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_two_pass.hpp" +#include "ck_tile/ops/common/generic_2d_block_shape.hpp" #include "ck_tile/ops/common/tensor_layout.hpp" diff --git a/include/ck_tile/ops/softmax.hpp b/include/ck_tile/ops/softmax.hpp index 584ca7068..4df34e1e0 100644 --- a/include/ck_tile/ops/softmax.hpp +++ b/include/ck_tile/ops/softmax.hpp @@ -5,4 +5,5 @@ #include "ck_tile/ops/softmax/block/block_softmax_2d.hpp" #include "ck_tile/ops/softmax/block/block_softmax_2d_problem.hpp" +#include "ck_tile/ops/common/generic_2d_block_shape.hpp" #include "ck_tile/ops/common/tensor_layout.hpp" diff --git a/include/ck_tile/ops/topk.hpp b/include/ck_tile/ops/topk.hpp 
index b1143e4a0..fcae3e02d 100644 --- a/include/ck_tile/ops/topk.hpp +++ b/include/ck_tile/ops/topk.hpp @@ -5,4 +5,5 @@ #include "ck_tile/ops/topk/block/block_topk_stream_2d.hpp" #include "ck_tile/ops/topk/block/block_topk_stream_2d_problem.hpp" +#include "ck_tile/ops/common/generic_2d_block_shape.hpp" #include "ck_tile/ops/common/tensor_layout.hpp" diff --git a/include/ck_tile/ops/topk_softmax.hpp b/include/ck_tile/ops/topk_softmax.hpp index 809473d53..cc7dbffee 100644 --- a/include/ck_tile/ops/topk_softmax.hpp +++ b/include/ck_tile/ops/topk_softmax.hpp @@ -7,4 +7,5 @@ #include "ck_tile/ops/topk_softmax/pipeline/topk_softmax_warp_per_row_pipeline.hpp" #include "ck_tile/ops/topk_softmax/pipeline/topk_softmax_warp_per_row_policy.hpp" #include "ck_tile/ops/topk_softmax/pipeline/topk_softmax_warp_per_row_problem.hpp" +#include "ck_tile/ops/common/generic_2d_block_shape.hpp" #include "ck_tile/ops/common/tensor_layout.hpp" diff --git a/include/ck_tile/ops/welford.hpp b/include/ck_tile/ops/welford.hpp index ebf940683..a4c479dd9 100644 --- a/include/ck_tile/ops/welford.hpp +++ b/include/ck_tile/ops/welford.hpp @@ -6,4 +6,5 @@ #include "ck_tile/ops/welford/block/block_welford.hpp" #include "ck_tile/ops/welford/block/block_welford_problem.hpp" #include "ck_tile/ops/welford/thread/thread_welford.hpp" +#include "ck_tile/ops/common/generic_2d_block_shape.hpp" #include "ck_tile/ops/common/tensor_layout.hpp" -- GitLab From 550248deecf974959df8175010221de88b79246f Mon Sep 17 00:00:00 2001 From: carlushuang Date: Fri, 1 Nov 2024 11:52:50 +0800 Subject: [PATCH 030/153] [layernorm] hot fix (#1620) * hot fix ln * some rename --- .../02_layernorm2d/layernorm2d_fwd.cpp | 31 ++++++++++++------- .../layernorm2d_fwd_pipeline_one_pass.hpp | 5 +-- .../layernorm2d_fwd_pipeline_two_pass.hpp | 10 +++--- 3 files changed, 29 insertions(+), 17 deletions(-) diff --git a/example/ck_tile/02_layernorm2d/layernorm2d_fwd.cpp b/example/ck_tile/02_layernorm2d/layernorm2d_fwd.cpp index 
43f4e8c72..8f029c212 100644 --- a/example/ck_tile/02_layernorm2d/layernorm2d_fwd.cpp +++ b/example/ck_tile/02_layernorm2d/layernorm2d_fwd.cpp @@ -127,9 +127,10 @@ bool run(const ck_tile::ArgParser& arg_parser) ck_tile::HostTensor x_scale_host_dev({n}); ck_tile::FillUniformDistribution{-.5f, .5f}(x_host); + ck_tile::FillUniformDistribution{-.5f, .5f}(x_residual_host); + ck_tile::FillUniformDistribution{-1.f, 1.f}(x_scale_host); ck_tile::FillUniformDistribution{-.5f, .5f}(gamma_host); ck_tile::FillUniformDistribution{-.5f, .5f}(beta_host); - ck_tile::FillUniformDistribution{-1.f, 1.f}(x_scale_host); ck_tile::DeviceMem x_buf(x_host.get_element_space_size_in_bytes()); ck_tile::DeviceMem gamma_buf(gamma_host.get_element_space_size_in_bytes()); @@ -212,7 +213,11 @@ bool run(const ck_tile::ArgParser& arg_parser) x_host.mData.cend(), x_residual_host.mData.cbegin(), x_host.mData.begin(), - std::plus{}); + [](auto x_, auto r_) { + auto o_ = ck_tile::type_convert(x_) + + ck_tile::type_convert(r_); + return ck_tile::type_convert(o_); + }); } ck_tile::reference_layernorm2d_fwd sy_host_dev({m, n}, {stride, 1}); + ck_tile::HostTensor y_residual_host_dev({m, n}, {stride, 1}); if(fused_add == 1) { - y_residual_buf.FromDevice(sy_host_dev.data()); + y_residual_buf.FromDevice(y_residual_host_dev.data()); } auto [rtol, atol] = get_elimit(); @@ -294,8 +299,11 @@ bool run(const ck_tile::ArgParser& arg_parser) y_host_dev, y_host_ref, std::string("OUT Error: Incorrect results!"), rtol, atol); if(fused_add == 1) { - pass &= ck_tile::check_err( - sy_host_dev, x_host, std::string("ADD Error: Incorrect results!"), rtol, atol); + pass &= ck_tile::check_err(y_residual_host_dev, + x_host, + std::string("ADD Error: Incorrect results!"), + rtol, + atol); } } else @@ -314,12 +322,13 @@ bool run(const ck_tile::ArgParser& arg_parser) atol); if(fused_add == 1) { - std::vector sy_host_dev_row( - sy_host_dev.begin() + i_r * stride, sy_host_dev.begin() + i_r * stride + n); - std::vector sy_host_ref_row( + 
std::vector y_residual_host_dev_row( + y_residual_host_dev.begin() + i_r * stride, + y_residual_host_dev.begin() + i_r * stride + n); + std::vector y_residual_host_ref_row( x_host.begin() + i_r * stride, x_host.begin() + i_r * stride + n); - pass &= ck_tile::check_err(sy_host_dev_row, - sy_host_ref_row, + pass &= ck_tile::check_err(y_residual_host_dev_row, + y_residual_host_ref_row, std::string("ADD[") + std::to_string(i_r) + std::string("] Error: Incorrect results!"), rtol, diff --git a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_one_pass.hpp b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_one_pass.hpp index 16a7c3b86..5601f3a68 100644 --- a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_one_pass.hpp +++ b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_one_pass.hpp @@ -111,8 +111,9 @@ struct Layernorm2dFwdPipelineOnePass { sweep_tile(x_resi, [&](auto idx) { // compute x = x_resi + x - x(idx) = type_convert(x_resi(idx)) + - type_convert(x(idx)); + auto re_ = type_convert(x_resi(idx)) + + type_convert(x(idx)); + x(idx) = type_convert(re_); }); if constexpr(kFusedAdd == Layernorm2dFusedAddEnum::PRE_ADD_STORE) store_tile(y_residual_window, x); diff --git a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_two_pass.hpp b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_two_pass.hpp index ec10efbc6..48f66739d 100644 --- a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_two_pass.hpp +++ b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_two_pass.hpp @@ -122,8 +122,9 @@ struct Layernorm2dFwdPipelineTwoPass { sweep_tile(x_resi, [&](auto idx) { // compute x = x_resi + x - x(idx) = type_convert(x_resi(idx)) + - type_convert(x(idx)); + auto re_ = type_convert(x_resi(idx)) + + type_convert(x(idx)); + x(idx) = type_convert(re_); }); if constexpr(kFusedAdd == Layernorm2dFusedAddEnum::PRE_ADD_STORE) { @@ -170,8 +171,9 @@ struct 
Layernorm2dFwdPipelineTwoPass { sweep_tile(x_resi, [&](auto idx) { // compute x = x_resi + x - x(idx) = type_convert(x_resi(idx)) + - type_convert(x(idx)); + auto re_ = type_convert(x_resi(idx)) + + type_convert(x(idx)); + x(idx) = type_convert(re_); }); } // load gamma/beta (TODO: support no gamma/beta?) -- GitLab From fbd654545a2644f99c3e7a493ebcc2169938583b Mon Sep 17 00:00:00 2001 From: rocking Date: Fri, 1 Nov 2024 13:51:56 +0800 Subject: [PATCH 031/153] [Ck_tile] smoothquant (#1617) * fix compile error * fix typo of padding * Add smoothquant op * Add smoothquant instance library * refine type * add test script * Re-generate smoothquant.hpp * Always use 'current year' in copyright * use Generic2dBlockShape instead * Add vector = 8 instance back * Find exe path automatically * Simplify the api condition * Remove debugging code * update year * Add blank line between function declaration * explicitly cast return value to dim3 * refine return value * Fix default warmup and repeat value * Add comment * refactor sommthquant cmake * Add README * Fix typo --------- Co-authored-by: Po Yen, Chen --- .../02_layernorm2d/script/perf_test.sh | 5 +- .../02_layernorm2d/script/smoke_test.sh | 3 +- .../10_rmsnorm2d/example_rmsnorm2d_fwd.cpp | 2 +- .../instances/rmsnorm2d_fwd_api.cpp | 9 +- .../ck_tile/10_rmsnorm2d/rmsnorm2d_fwd.hpp | 2 +- .../ck_tile/10_rmsnorm2d/script/perf_test.sh | 5 +- .../ck_tile/10_rmsnorm2d/script/smoke_test.sh | 3 +- .../add_rmsnorm2d_rdquant_fwd.hpp | 6 +- .../example_add_rmsnorm2d_rdquant_fwd.cpp | 8 +- .../add_rmsnorm2d_rdquant_fwd_api.cpp | 9 +- .../script/perf_test.sh | 5 +- .../script/smoke_test.sh | 3 +- example/ck_tile/12_smoothquant/CMakeLists.txt | 24 ++ example/ck_tile/12_smoothquant/README.md | 21 ++ .../12_smoothquant/example_smoothquant.cpp | 237 ++++++++++++++++++ .../smoothquant_bf16_n1024_instance.cpp | 22 ++ .../smoothquant_bf16_n1536_instance.cpp | 13 + .../smoothquant_bf16_n2048_instance.cpp | 14 ++ 
.../smoothquant_bf16_n256_instance.cpp | 12 + .../smoothquant_bf16_n3072_instance.cpp | 14 ++ .../smoothquant_bf16_n4096_instance.cpp | 14 ++ .../smoothquant_bf16_n4096_tp_instance.cpp | 14 ++ .../smoothquant_bf16_n512_instance.cpp | 13 + .../smoothquant_bf16_n64_n128_instance.cpp | 12 + .../smoothquant_bf16_n768_instance.cpp | 12 + .../smoothquant_fp16_n1024_instance.cpp | 22 ++ .../smoothquant_fp16_n1536_instance.cpp | 13 + .../smoothquant_fp16_n2048_instance.cpp | 14 ++ .../smoothquant_fp16_n256_instance.cpp | 12 + .../smoothquant_fp16_n3072_instance.cpp | 14 ++ .../smoothquant_fp16_n4096_instance.cpp | 14 ++ .../smoothquant_fp16_n4096_tp_instance.cpp | 14 ++ .../smoothquant_fp16_n512_instance.cpp | 13 + .../smoothquant_fp16_n64_n128_instance.cpp | 12 + .../smoothquant_fp16_n768_instance.cpp | 12 + .../instances/smoothquant_fwd_api.cpp | 143 +++++++++++ .../instances/smoothquant_instance_common.hpp | 62 +++++ .../12_smoothquant/script/perf_test.sh | 37 +++ .../12_smoothquant/script/smoke_test.sh | 30 +++ .../ck_tile/12_smoothquant/smoothquant.cpp | 218 ++++++++++++++++ .../ck_tile/12_smoothquant/smoothquant.hpp | 114 +++++++++ example/ck_tile/CMakeLists.txt | 1 + include/ck_tile/ops/add_rmsnorm2d_rdquant.hpp | 1 - .../add_rmsnorm2d_rdquant_fwd_kernel.hpp | 17 +- .../add_rmsnorm2d_rdquant_fwd_shape.hpp | 78 ------ ...2d_rdquant_fwd_pipeline_default_policy.hpp | 1 + .../kernel/layernorm2d_fwd_kernel.hpp | 4 +- ...ayernorm2d_fwd_pipeline_default_policy.hpp | 1 + .../layernorm2d_fwd_pipeline_problem.hpp | 2 +- .../pipeline/layernorm2d_fwd_traits.hpp | 2 +- .../ops/reduce/block/block_reduce2d.hpp | 3 +- include/ck_tile/ops/rmsnorm2d.hpp | 1 - .../rmsnorm2d/kernel/rmsnorm2d_fwd_kernel.hpp | 12 +- .../rmsnorm2d/kernel/rmsnorm2d_fwd_shape.hpp | 78 ------ .../rmsnorm2d_fwd_pipeline_default_policy.hpp | 1 + include/ck_tile/ops/smoothquant.hpp | 12 + .../smoothquant/kernel/smoothquant_kernel.hpp | 176 +++++++++++++ .../smoothquant_pipeline_default_policy.hpp | 95 +++++++ 
.../smoothquant_pipeline_one_pass.hpp | 94 +++++++ .../pipeline/smoothquant_pipeline_problem.hpp | 35 +++ .../smoothquant_pipeline_two_pass.hpp | 132 ++++++++++ include/ck_tile/remod.py | 5 +- 62 files changed, 1758 insertions(+), 219 deletions(-) create mode 100644 example/ck_tile/12_smoothquant/CMakeLists.txt create mode 100644 example/ck_tile/12_smoothquant/README.md create mode 100644 example/ck_tile/12_smoothquant/example_smoothquant.cpp create mode 100644 example/ck_tile/12_smoothquant/instances/smoothquant_bf16_n1024_instance.cpp create mode 100644 example/ck_tile/12_smoothquant/instances/smoothquant_bf16_n1536_instance.cpp create mode 100644 example/ck_tile/12_smoothquant/instances/smoothquant_bf16_n2048_instance.cpp create mode 100644 example/ck_tile/12_smoothquant/instances/smoothquant_bf16_n256_instance.cpp create mode 100644 example/ck_tile/12_smoothquant/instances/smoothquant_bf16_n3072_instance.cpp create mode 100644 example/ck_tile/12_smoothquant/instances/smoothquant_bf16_n4096_instance.cpp create mode 100644 example/ck_tile/12_smoothquant/instances/smoothquant_bf16_n4096_tp_instance.cpp create mode 100644 example/ck_tile/12_smoothquant/instances/smoothquant_bf16_n512_instance.cpp create mode 100644 example/ck_tile/12_smoothquant/instances/smoothquant_bf16_n64_n128_instance.cpp create mode 100644 example/ck_tile/12_smoothquant/instances/smoothquant_bf16_n768_instance.cpp create mode 100644 example/ck_tile/12_smoothquant/instances/smoothquant_fp16_n1024_instance.cpp create mode 100644 example/ck_tile/12_smoothquant/instances/smoothquant_fp16_n1536_instance.cpp create mode 100644 example/ck_tile/12_smoothquant/instances/smoothquant_fp16_n2048_instance.cpp create mode 100644 example/ck_tile/12_smoothquant/instances/smoothquant_fp16_n256_instance.cpp create mode 100644 example/ck_tile/12_smoothquant/instances/smoothquant_fp16_n3072_instance.cpp create mode 100644 example/ck_tile/12_smoothquant/instances/smoothquant_fp16_n4096_instance.cpp create mode 
100644 example/ck_tile/12_smoothquant/instances/smoothquant_fp16_n4096_tp_instance.cpp create mode 100644 example/ck_tile/12_smoothquant/instances/smoothquant_fp16_n512_instance.cpp create mode 100644 example/ck_tile/12_smoothquant/instances/smoothquant_fp16_n64_n128_instance.cpp create mode 100644 example/ck_tile/12_smoothquant/instances/smoothquant_fp16_n768_instance.cpp create mode 100644 example/ck_tile/12_smoothquant/instances/smoothquant_fwd_api.cpp create mode 100644 example/ck_tile/12_smoothquant/instances/smoothquant_instance_common.hpp create mode 100755 example/ck_tile/12_smoothquant/script/perf_test.sh create mode 100755 example/ck_tile/12_smoothquant/script/smoke_test.sh create mode 100644 example/ck_tile/12_smoothquant/smoothquant.cpp create mode 100644 example/ck_tile/12_smoothquant/smoothquant.hpp delete mode 100644 include/ck_tile/ops/add_rmsnorm2d_rdquant/kernel/add_rmsnorm2d_rdquant_fwd_shape.hpp delete mode 100644 include/ck_tile/ops/rmsnorm2d/kernel/rmsnorm2d_fwd_shape.hpp create mode 100644 include/ck_tile/ops/smoothquant.hpp create mode 100644 include/ck_tile/ops/smoothquant/kernel/smoothquant_kernel.hpp create mode 100644 include/ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_default_policy.hpp create mode 100644 include/ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_one_pass.hpp create mode 100644 include/ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_problem.hpp create mode 100644 include/ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_two_pass.hpp diff --git a/example/ck_tile/02_layernorm2d/script/perf_test.sh b/example/ck_tile/02_layernorm2d/script/perf_test.sh index a34624536..5a34e1928 100755 --- a/example/ck_tile/02_layernorm2d/script/perf_test.sh +++ b/example/ck_tile/02_layernorm2d/script/perf_test.sh @@ -1,6 +1,5 @@ - -# run from top of ck folder -EXE=build/bin/tile_example_layernorm2d_fwd +#!/bin/sh +EXE="$(find . 
-name tile_example_layernorm2d_fwd -type f | head -n 1)" $EXE -m=1 -n=1 -e=1e-12 -v=1 -prec_i=bf16 -repeat=1000 $EXE -m=700 -n=80 -e=1e-12 -v=1 -prec_i=bf16 -repeat=1000 diff --git a/example/ck_tile/02_layernorm2d/script/smoke_test.sh b/example/ck_tile/02_layernorm2d/script/smoke_test.sh index d56406b6f..b7fd354bb 100755 --- a/example/ck_tile/02_layernorm2d/script/smoke_test.sh +++ b/example/ck_tile/02_layernorm2d/script/smoke_test.sh @@ -1,6 +1,5 @@ #!/bin/sh -# call from top of CK folder -EXE=./build/bin/tile_example_layernorm2d_fwd +EXE="$(find . -name tile_example_layernorm2d_fwd -type f | head -n 1)" for fquant in "" "-fquant=1 -prec_o=int8"; do for pr_i in "fp16" "bf16" ; do diff --git a/example/ck_tile/10_rmsnorm2d/example_rmsnorm2d_fwd.cpp b/example/ck_tile/10_rmsnorm2d/example_rmsnorm2d_fwd.cpp index bb2c94901..34df7b74f 100644 --- a/example/ck_tile/10_rmsnorm2d/example_rmsnorm2d_fwd.cpp +++ b/example/ck_tile/10_rmsnorm2d/example_rmsnorm2d_fwd.cpp @@ -69,7 +69,7 @@ bool run(const ck_tile::ArgParser& arg_parser) using WarpTile = ck_tile::sequence<1, 64>; using Vector = ck_tile::sequence<1, 1>; - using Shape = ck_tile::Rmsnorm2dShape; + using Shape = ck_tile::Generic2dBlockShape; using Problem = ck_tile::Rmsnorm2dFwdPipelineProblem>(s, a); } return r; -#else - return rmsnorm2d_fwd_>(s, a); -#endif // clang-format on } float rmsnorm2d_fwd(rmsnorm2d_fwd_traits t, rmsnorm2d_fwd_args a, const ck_tile::stream_config& s) { - float r = -1; if(t.data_type.compare("fp16") == 0) { return rmsnorm2d_fwd_b16_(t, a, s); @@ -146,8 +141,6 @@ float rmsnorm2d_fwd(rmsnorm2d_fwd_traits t, rmsnorm2d_fwd_args a, const ck_tile: { return rmsnorm2d_fwd_b16_(t, a, s); } - if(r < 0) + else throw std::runtime_error("Without supported instances!"); - - return r; } diff --git a/example/ck_tile/10_rmsnorm2d/rmsnorm2d_fwd.hpp b/example/ck_tile/10_rmsnorm2d/rmsnorm2d_fwd.hpp index 756ecb2c4..b4d429d46 100644 --- a/example/ck_tile/10_rmsnorm2d/rmsnorm2d_fwd.hpp +++ 
b/example/ck_tile/10_rmsnorm2d/rmsnorm2d_fwd.hpp @@ -97,7 +97,7 @@ struct rmsnorm2d_fwd_traits_ using WarpTile = ck_tile::sequence; using Vector = ck_tile::sequence<1, Vector_N_>; - using Shape = ck_tile::Rmsnorm2dShape; + using Shape = ck_tile::Generic2dBlockShape; static constexpr bool kPadN = kPadN_; static constexpr bool kSaveInvRms = kSaveInvRms_; diff --git a/example/ck_tile/10_rmsnorm2d/script/perf_test.sh b/example/ck_tile/10_rmsnorm2d/script/perf_test.sh index f3cfcc4b8..7b9d0820f 100755 --- a/example/ck_tile/10_rmsnorm2d/script/perf_test.sh +++ b/example/ck_tile/10_rmsnorm2d/script/perf_test.sh @@ -1,6 +1,5 @@ - -# run from top of ck folder -EXE=build/bin/tile_rmsnorm2d_fwd +#!/bin/sh +EXE="$(find . -name tile_rmsnorm2d_fwd -type f | head -n 1)" $EXE -m=1 -n=1 -e=1e-12 -v=1 -prec=bf16 -repeat=1000 $EXE -m=700 -n=80 -e=1e-12 -v=1 -prec=bf16 -repeat=1000 diff --git a/example/ck_tile/10_rmsnorm2d/script/smoke_test.sh b/example/ck_tile/10_rmsnorm2d/script/smoke_test.sh index 6ec5e846c..758d6de54 100755 --- a/example/ck_tile/10_rmsnorm2d/script/smoke_test.sh +++ b/example/ck_tile/10_rmsnorm2d/script/smoke_test.sh @@ -1,6 +1,5 @@ #!/bin/sh -# call from top of CK folder -EXE=./build/bin/tile_rmsnorm2d_fwd +EXE="$(find . 
-name tile_rmsnorm2d_fwd -type f | head -n 1)" for pr_i in "fp16" "bf16" ; do $EXE -prec=$pr_i -m=99 -n=13 diff --git a/example/ck_tile/11_add_rmsnorm2d_rdquant/add_rmsnorm2d_rdquant_fwd.hpp b/example/ck_tile/11_add_rmsnorm2d_rdquant/add_rmsnorm2d_rdquant_fwd.hpp index bf70d9d23..443b9b102 100644 --- a/example/ck_tile/11_add_rmsnorm2d_rdquant/add_rmsnorm2d_rdquant_fwd.hpp +++ b/example/ck_tile/11_add_rmsnorm2d_rdquant/add_rmsnorm2d_rdquant_fwd.hpp @@ -18,7 +18,7 @@ struct AddRmsnormRdquantTypeConfig using BDataType = ck_tile::half_t; using GammaDataType = ck_tile::half_t; using XDataType = ck_tile::half_t; - using YScaleDataType = ck_tile::half_t; + using YScaleDataType = float; using QYDataType = ck_tile::int8_t; using ComputeDataType = float; }; @@ -30,7 +30,7 @@ struct AddRmsnormRdquantTypeConfig using BDataType = ck_tile::bf16_t; using GammaDataType = ck_tile::bf16_t; using XDataType = ck_tile::bf16_t; - using YScaleDataType = ck_tile::bf16_t; + using YScaleDataType = float; using QYDataType = ck_tile::int8_t; using ComputeDataType = float; }; @@ -101,7 +101,7 @@ struct add_rmsnorm2d_rdquant_fwd_traits_ using WarpTile = ck_tile::sequence; using Vector = ck_tile::sequence<1, Vector_N_>; - using Shape = ck_tile::AddRmsnorm2dRdquantShape; + using Shape = ck_tile::Generic2dBlockShape; static constexpr bool kPadN = kPadN_; static constexpr bool kSaveX = kSaveX_; diff --git a/example/ck_tile/11_add_rmsnorm2d_rdquant/example_add_rmsnorm2d_rdquant_fwd.cpp b/example/ck_tile/11_add_rmsnorm2d_rdquant/example_add_rmsnorm2d_rdquant_fwd.cpp index 40fabf7f5..ada4c6f2d 100644 --- a/example/ck_tile/11_add_rmsnorm2d_rdquant/example_add_rmsnorm2d_rdquant_fwd.cpp +++ b/example/ck_tile/11_add_rmsnorm2d_rdquant/example_add_rmsnorm2d_rdquant_fwd.cpp @@ -66,7 +66,7 @@ bool run(const ck_tile::ArgParser& arg_parser) using BDataType = DataType; using GammaDataType = DataType; using XDataType = DataType; - using YScaleDataType = DataType; + using YScaleDataType = float; using QYDataType = 
ck_tile::int8_t; using ComputeDataType = float; @@ -99,12 +99,12 @@ bool run(const ck_tile::ArgParser& arg_parser) constexpr bool kThreePass = true; - using BlockWarps = ck_tile::sequence<2, 2>; - using BlockTile = ck_tile::sequence<2, 128>; + using BlockWarps = ck_tile::sequence<4, 1>; + using BlockTile = ck_tile::sequence<4, 128>; using WarpTile = ck_tile::sequence<1, 64>; using Vector = ck_tile::sequence<1, 1>; - using Shape = ck_tile::AddRmsnorm2dRdquantShape; + using Shape = ck_tile::Generic2dBlockShape; using Problem = ck_tile::AddRmsnorm2dRdquantFwdPipelineProblem>(s, a); } return r; -#else - return add_rmsnorm2d_rdquant_fwd_>(s, a); -#endif // clang-format on } @@ -139,7 +135,6 @@ float add_rmsnorm2d_rdquant_fwd(add_rmsnorm2d_rdquant_fwd_traits t, const ck_tile::stream_config& s) { - float r = -1; // Only support instance of save_x == true for now assert(t.save_x); if(t.data_type.compare("fp16") == 0) @@ -150,8 +145,6 @@ float add_rmsnorm2d_rdquant_fwd(add_rmsnorm2d_rdquant_fwd_traits t, { return add_rmsnorm2d_rdquant_fwd_b16_(t, a, s); } - if(r < 0) + else throw std::runtime_error("Without supported instances!"); - - return r; } diff --git a/example/ck_tile/11_add_rmsnorm2d_rdquant/script/perf_test.sh b/example/ck_tile/11_add_rmsnorm2d_rdquant/script/perf_test.sh index 11fd36488..d02b0bab3 100755 --- a/example/ck_tile/11_add_rmsnorm2d_rdquant/script/perf_test.sh +++ b/example/ck_tile/11_add_rmsnorm2d_rdquant/script/perf_test.sh @@ -1,6 +1,5 @@ - -# run from top of ck folder -EXE=build/bin/tile_add_rmsnorm2d_rdquant_fwd +#!/bin/sh +EXE="$(find . 
-name tile_add_rmsnorm2d_rdquant_fwd -type f | head -n 1)" $EXE -m=1 -n=1 -e=1e-12 -v=1 -prec=bf16 -repeat=1000 $EXE -m=700 -n=80 -e=1e-12 -v=1 -prec=bf16 -repeat=1000 diff --git a/example/ck_tile/11_add_rmsnorm2d_rdquant/script/smoke_test.sh b/example/ck_tile/11_add_rmsnorm2d_rdquant/script/smoke_test.sh index 4a02cdcb6..b60f5fcf2 100755 --- a/example/ck_tile/11_add_rmsnorm2d_rdquant/script/smoke_test.sh +++ b/example/ck_tile/11_add_rmsnorm2d_rdquant/script/smoke_test.sh @@ -1,6 +1,5 @@ #!/bin/sh -# call from top of CK folder -EXE=./build/bin/tile_add_rmsnorm2d_rdquant_fwd +EXE="$(find . -name tile_add_rmsnorm2d_rdquant_fwd -type f | head -n 1)" for pr_i in "fp16" "bf16" ; do $EXE -prec=$pr_i -m=99 -n=13 diff --git a/example/ck_tile/12_smoothquant/CMakeLists.txt b/example/ck_tile/12_smoothquant/CMakeLists.txt new file mode 100644 index 000000000..09a56c6da --- /dev/null +++ b/example/ck_tile/12_smoothquant/CMakeLists.txt @@ -0,0 +1,24 @@ +function (add_smoothquant_example TARGET_NAME MAIN_SRC) + message("adding ${TARGET_NAME}") + # not using add_example_executable() to add target, since we don't want this to have + # to be included in "make all/install/check" + add_executable(${TARGET_NAME} EXCLUDE_FROM_ALL ${MAIN_SRC}) + target_include_directories(${TARGET_NAME} PRIVATE ${CMAKE_CURRENT_LIST_DIR}) + + foreach(source IN LISTS ARGN) + list(APPEND INSTANCE_SRCS ${source}) + endforeach() + + target_sources(${TARGET_NAME} PRIVATE ${INSTANCE_SRCS}) + + set(COMPILE_OPTIONS) + # NOTE: we turn off undefined-func-template to let source compile without explicit declare function specializations + list(APPEND COMPILE_OPTIONS -Wno-undefined-func-template -Wno-float-equal) + + target_compile_options(${TARGET_NAME} PRIVATE ${COMPILE_OPTIONS}) +endfunction(add_smoothquant_example TARGET_NAME MAIN_SRC) + +file(GLOB INSTANCE_SRCS instances/*.cpp) + +add_smoothquant_example(tile_smoothquant smoothquant.cpp ${INSTANCE_SRCS}) +add_smoothquant_example(tile_example_smoothquant 
example_smoothquant.cpp) diff --git a/example/ck_tile/12_smoothquant/README.md b/example/ck_tile/12_smoothquant/README.md new file mode 100644 index 000000000..d6b815f8c --- /dev/null +++ b/example/ck_tile/12_smoothquant/README.md @@ -0,0 +1,21 @@ +# smoothquant + +This folder contains an example of smoothquant using the ck_tile tile-programming implementation. + +## build +``` +# in the root of ck_tile +mkdir build && cd build +sh ../script/cmake-ck-dev.sh ../ <arch> # you can replace <arch> with gfx90a, gfx942... +make tile_smoothquant -j +``` +This will result in an executable `build/bin/tile_smoothquant` + +## cmdline +``` +args: + -m m dimension (default:3328) + -n n dimension (default:4096) + -v cpu validation or not (default:1) + -prec precision (default:fp16) +``` diff --git a/example/ck_tile/12_smoothquant/example_smoothquant.cpp b/example/ck_tile/12_smoothquant/example_smoothquant.cpp new file mode 100644 index 000000000..3a26eb6a7 --- /dev/null +++ b/example/ck_tile/12_smoothquant/example_smoothquant.cpp @@ -0,0 +1,237 @@ +#include "ck_tile/host.hpp" +#include "ck_tile/core.hpp" +#include "ck_tile/host/kernel_launch.hpp" +#include "ck_tile/ops/smoothquant.hpp" +#include + +// different threshold for different dtype +template +auto get_elimit() +{ + double rtol = 1e-5; + double atol = 1e-5; + return ck_tile::make_tuple(rtol, atol); +} + +template <> +auto get_elimit() +{ + double rtol = 1e-5; + double atol = 1e-5; + return ck_tile::make_tuple(rtol, atol); +} + +template <> +auto get_elimit() +{ + // due to rounding, int8 quantization might have 1 abs error + double rtol = 1; + double atol = 1; + return ck_tile::make_tuple(rtol, atol); +} + +auto create_args(int argc, char* argv[]) +{ + ck_tile::ArgParser arg_parser; + arg_parser.insert("m", "3328", "m dimension") + .insert("n", "4096", "n dimension") + .insert("stride", "-1", "stride per row, if -1 then equal to n") + .insert("e", "1e-5", "epsilon") + .insert("v", "1", "cpu validation or not") + .insert("prec", "fp16", 
"precision") + .insert("warmup", "0", "cold iter") + .insert("repeat", "1", "hot iter"); + + bool result = arg_parser.parse(argc, argv); + return std::make_tuple(result, arg_parser); +} + +template +bool run(const ck_tile::ArgParser& arg_parser) +{ + ck_tile::index_t m = arg_parser.get_int("m"); + ck_tile::index_t n = arg_parser.get_int("n"); + ck_tile::index_t stride = arg_parser.get_int("stride"); + if(stride < 0) + stride = n; + std::string data_type = arg_parser.get_str("prec"); + int do_validation = arg_parser.get_int("v"); + int warmup = arg_parser.get_int("warmup"); + int repeat = arg_parser.get_int("repeat"); + + assert(stride >= n); + + using XDataType = DataType; + using XScaleDataType = float; + using YScaleDataType = float; + using QYDataType = ck_tile::int8_t; + using ComputeDataType = float; + + // host verify + ck_tile::HostTensor x_host({m, n}, {stride, 1}); + ck_tile::HostTensor xscale_host({n}); + + ck_tile::HostTensor yscale_host_ref({m}, {1}); + ck_tile::HostTensor yscale_host_dev({m}, {1}); + + ck_tile::HostTensor qy_host_ref({m, n}, {stride, 1}); + ck_tile::HostTensor qy_host_dev({m, n}, {stride, 1}); + + ck_tile::FillUniformDistribution{-.5f, .5f}(x_host); + ck_tile::FillUniformDistribution{1e-3, .5f}(xscale_host); + + ck_tile::DeviceMem x_buf(x_host.get_element_space_size_in_bytes()); + ck_tile::DeviceMem xscale_buf(xscale_host.get_element_space_size_in_bytes()); + ck_tile::DeviceMem yscale_buf(yscale_host_dev.get_element_space_size_in_bytes()); + ck_tile::DeviceMem qy_buf(qy_host_dev.get_element_space_size_in_bytes()); + + x_buf.ToDevice(x_host.data()); + xscale_buf.ToDevice(xscale_host.data()); + + constexpr bool kTwoPass = true; + + using BlockWarps = ck_tile::sequence<2, 2>; + using BlockTile = ck_tile::sequence<2, 128>; + using WarpTile = ck_tile::sequence<1, 64>; + using Vector = ck_tile::sequence<1, 1>; + + using Shape = ck_tile::Generic2dBlockShape; + using Problem = ck_tile::SmoothquantPipelineProblem; + + using OnePassPipeline = 
ck_tile::SmoothquantPipelineOnePass; + using TwoPassPipeline = ck_tile::SmoothquantPipelineTwoPass; + using Pipeline = std::conditional_t; + using Kernel = ck_tile::Smoothquant; + + ck_tile::SmoothquantHostArgs args{x_buf.GetDeviceBuffer(), + xscale_buf.GetDeviceBuffer(), + yscale_buf.GetDeviceBuffer(), + qy_buf.GetDeviceBuffer(), + m, + n, + stride}; + + auto kargs = Kernel::MakeKargs(args); + + const dim3 grids = Kernel::GridSize(args); + constexpr dim3 blocks = Kernel::BlockSize(); + constexpr ck_tile::index_t kBlockPerCu = 1; + auto s = ck_tile::stream_config{nullptr, true, 1, warmup, repeat}; + + ck_tile::launch_kernel( + s, ck_tile::make_kernel(Kernel{}, grids, blocks, 0, kargs)); + + bool pass = true; + + if(do_validation) + { + using YDataType = ComputeDataType; + ck_tile::HostTensor y_host({m, n}, {stride, 1}); + // smooth outlier + { + auto f = [&](auto n_) { + auto v_xscale = ck_tile::type_convert(xscale_host(n_)); + + for(int m_ = 0; m_ < m; ++m_) + { + auto v_x = ck_tile::type_convert(x_host(m_, n_)); + y_host(m_, n_) = v_x * v_xscale; + } + }; + + ck_tile::make_ParallelTensorFunctor(f, xscale_host.get_element_space_size())( + std::thread::hardware_concurrency()); + } + + // yscale + { + ck_tile::HostTensor y_rowwise_amax_host({m}); + + using ReduceAmax = ck_tile::ReduceOp::AbsMax; + ck_tile::reference_reduce( + y_host, y_rowwise_amax_host, ReduceAmax{}); + + auto op = [](const auto& v0) { + return v0 / + ck_tile::type_convert(ck_tile::numeric::max()); + }; + ck_tile::reference_unary_elementwise( + y_rowwise_amax_host, yscale_host_ref, op); + + yscale_buf.FromDevice(yscale_host_dev.mData.data()); + + auto [rtol, atol] = get_elimit(); + pass &= ck_tile::check_err(yscale_host_dev, + yscale_host_ref, + std::string("yscale Error: Incorrect results!"), + rtol, + atol); + } + + // rowwise quantization + { + ck_tile::reference_rowwise_quantization2d( + y_host, yscale_host_ref, qy_host_ref); + + qy_buf.FromDevice(qy_host_dev.data()); + auto [rtol, atol] = 
get_elimit(); + + if(stride == n) + { + pass = ck_tile::check_err(qy_host_dev, + qy_host_ref, + std::string("qy Error: Incorrect results!"), + rtol, + atol); + } + else + { + for(int i_r = 0; i_r < m; i_r++) + { + std::vector qy_host_dev_row(qy_host_dev.begin() + i_r * stride, + qy_host_dev.begin() + i_r * stride + n); + std::vector qy_host_ref_row(qy_host_ref.begin() + i_r * stride, + qy_host_ref.begin() + i_r * stride + n); + pass &= ck_tile::check_err(qy_host_dev_row, + qy_host_ref_row, + std::string("qy[") + std::to_string(i_r) + + std::string("] Error: Incorrect results!"), + rtol, + atol); + } + } + } + + std::cout << "[" << data_type << "]" + << " m:" << m << ", n:" << n << ", stride:" << stride + << ", valid:" << (pass ? "y" : "n") << std::flush << std::endl; + } + + return pass; +} + +int main(int argc, char* argv[]) +{ + auto [result, arg_parser] = create_args(argc, argv); + if(!result) + return -1; + + const std::string data_type = arg_parser.get_str("prec"); + if(data_type == "fp16") + { + return run(arg_parser) ? 0 : -2; + } + /*else if(data_type == "bf16") + { + return run(arg_parser) ? 0 : -2; + }*/ + + return -3; +} diff --git a/example/ck_tile/12_smoothquant/instances/smoothquant_bf16_n1024_instance.cpp b/example/ck_tile/12_smoothquant/instances/smoothquant_bf16_n1024_instance.cpp new file mode 100644 index 000000000..b25361da2 --- /dev/null +++ b/example/ck_tile/12_smoothquant/instances/smoothquant_bf16_n1024_instance.cpp @@ -0,0 +1,22 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "smoothquant_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd 2p +#if 0 +template float smoothquant_>(const S&, A); +template float smoothquant_>(const S&, A); +template float smoothquant_>(const S&, A); +template float smoothquant_>(const S&, A); + +template float smoothquant_>(const S&, A); +#endif + +template float smoothquant_>(const S&, A); +template float smoothquant_>(const S&, A); +template float smoothquant_>(const S&, A); +template float smoothquant_>(const S&, A); +// clang-format on diff --git a/example/ck_tile/12_smoothquant/instances/smoothquant_bf16_n1536_instance.cpp b/example/ck_tile/12_smoothquant/instances/smoothquant_bf16_n1536_instance.cpp new file mode 100644 index 000000000..0a332fe41 --- /dev/null +++ b/example/ck_tile/12_smoothquant/instances/smoothquant_bf16_n1536_instance.cpp @@ -0,0 +1,13 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "smoothquant_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd 2p +template float smoothquant_>(const S&, A); +template float smoothquant_>(const S&, A); +template float smoothquant_>(const S&, A); +template float smoothquant_>(const S&, A); +// clang-format on diff --git a/example/ck_tile/12_smoothquant/instances/smoothquant_bf16_n2048_instance.cpp b/example/ck_tile/12_smoothquant/instances/smoothquant_bf16_n2048_instance.cpp new file mode 100644 index 000000000..bdf5804e4 --- /dev/null +++ b/example/ck_tile/12_smoothquant/instances/smoothquant_bf16_n2048_instance.cpp @@ -0,0 +1,14 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "smoothquant_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd 2p +template float smoothquant_>(const S&, A); +template float smoothquant_>(const S&, A); +template float smoothquant_>(const S&, A); +template float smoothquant_>(const S&, A); + +// clang-format on diff --git a/example/ck_tile/12_smoothquant/instances/smoothquant_bf16_n256_instance.cpp b/example/ck_tile/12_smoothquant/instances/smoothquant_bf16_n256_instance.cpp new file mode 100644 index 000000000..774c977f2 --- /dev/null +++ b/example/ck_tile/12_smoothquant/instances/smoothquant_bf16_n256_instance.cpp @@ -0,0 +1,12 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "smoothquant_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd 2p +template float smoothquant_>(const S&, A); +template float smoothquant_>(const S&, A); +template float smoothquant_>(const S&, A); +// clang-format on diff --git a/example/ck_tile/12_smoothquant/instances/smoothquant_bf16_n3072_instance.cpp b/example/ck_tile/12_smoothquant/instances/smoothquant_bf16_n3072_instance.cpp new file mode 100644 index 000000000..c571ef443 --- /dev/null +++ b/example/ck_tile/12_smoothquant/instances/smoothquant_bf16_n3072_instance.cpp @@ -0,0 +1,14 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "smoothquant_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd 2p +template float smoothquant_>(const S&, A); +template float smoothquant_>(const S&, A); +template float smoothquant_>(const S&, A); +template float smoothquant_>(const S&, A); + +// clang-format on diff --git a/example/ck_tile/12_smoothquant/instances/smoothquant_bf16_n4096_instance.cpp b/example/ck_tile/12_smoothquant/instances/smoothquant_bf16_n4096_instance.cpp new file mode 100644 index 000000000..80e4b3a29 --- /dev/null +++ b/example/ck_tile/12_smoothquant/instances/smoothquant_bf16_n4096_instance.cpp @@ -0,0 +1,14 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "smoothquant_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd 2p +template float smoothquant_>(const S&, A); +template float smoothquant_>(const S&, A); +template float smoothquant_>(const S&, A); +template float smoothquant_>(const S&, A); + +// clang-format on diff --git a/example/ck_tile/12_smoothquant/instances/smoothquant_bf16_n4096_tp_instance.cpp b/example/ck_tile/12_smoothquant/instances/smoothquant_bf16_n4096_tp_instance.cpp new file mode 100644 index 000000000..7f776a6e4 --- /dev/null +++ b/example/ck_tile/12_smoothquant/instances/smoothquant_bf16_n4096_tp_instance.cpp @@ -0,0 +1,14 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "smoothquant_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd 2p +template float smoothquant_>(const S&, A); +template float smoothquant_>(const S&, A); +template float smoothquant_>(const S&, A); +template float smoothquant_>(const S&, A); + +// clang-format on diff --git a/example/ck_tile/12_smoothquant/instances/smoothquant_bf16_n512_instance.cpp b/example/ck_tile/12_smoothquant/instances/smoothquant_bf16_n512_instance.cpp new file mode 100644 index 000000000..12bc90b66 --- /dev/null +++ b/example/ck_tile/12_smoothquant/instances/smoothquant_bf16_n512_instance.cpp @@ -0,0 +1,13 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "smoothquant_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd 2p +template float smoothquant_>(const S&, A); +template float smoothquant_>(const S&, A); +template float smoothquant_>(const S&, A); +template float smoothquant_>(const S&, A); +// clang-format on diff --git a/example/ck_tile/12_smoothquant/instances/smoothquant_bf16_n64_n128_instance.cpp b/example/ck_tile/12_smoothquant/instances/smoothquant_bf16_n64_n128_instance.cpp new file mode 100644 index 000000000..1cee18606 --- /dev/null +++ b/example/ck_tile/12_smoothquant/instances/smoothquant_bf16_n64_n128_instance.cpp @@ -0,0 +1,12 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "smoothquant_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd 2p +template float smoothquant_>(const S&, A); +template float smoothquant_>(const S&, A); +template float smoothquant_>(const S&, A); +// clang-format on diff --git a/example/ck_tile/12_smoothquant/instances/smoothquant_bf16_n768_instance.cpp b/example/ck_tile/12_smoothquant/instances/smoothquant_bf16_n768_instance.cpp new file mode 100644 index 000000000..aca7f7eb4 --- /dev/null +++ b/example/ck_tile/12_smoothquant/instances/smoothquant_bf16_n768_instance.cpp @@ -0,0 +1,12 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "smoothquant_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd 2p +template float smoothquant_>(const S&, A); +template float smoothquant_>(const S&, A); +template float smoothquant_>(const S&, A); +// clang-format on diff --git a/example/ck_tile/12_smoothquant/instances/smoothquant_fp16_n1024_instance.cpp b/example/ck_tile/12_smoothquant/instances/smoothquant_fp16_n1024_instance.cpp new file mode 100644 index 000000000..be5fecaca --- /dev/null +++ b/example/ck_tile/12_smoothquant/instances/smoothquant_fp16_n1024_instance.cpp @@ -0,0 +1,22 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "smoothquant_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd 2p +#if 0 +template float smoothquant_>(const S&, A); +template float smoothquant_>(const S&, A); +template float smoothquant_>(const S&, A); +template float smoothquant_>(const S&, A); + +template float smoothquant_>(const S&, A); +#endif + +template float smoothquant_>(const S&, A); +template float smoothquant_>(const S&, A); +template float smoothquant_>(const S&, A); +template float smoothquant_>(const S&, A); +// clang-format on diff --git a/example/ck_tile/12_smoothquant/instances/smoothquant_fp16_n1536_instance.cpp b/example/ck_tile/12_smoothquant/instances/smoothquant_fp16_n1536_instance.cpp new file mode 100644 index 000000000..59fe14875 --- /dev/null +++ b/example/ck_tile/12_smoothquant/instances/smoothquant_fp16_n1536_instance.cpp @@ -0,0 +1,13 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "smoothquant_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd 2p +template float smoothquant_>(const S&, A); +template float smoothquant_>(const S&, A); +template float smoothquant_>(const S&, A); +template float smoothquant_>(const S&, A); +// clang-format on diff --git a/example/ck_tile/12_smoothquant/instances/smoothquant_fp16_n2048_instance.cpp b/example/ck_tile/12_smoothquant/instances/smoothquant_fp16_n2048_instance.cpp new file mode 100644 index 000000000..a3710a6ab --- /dev/null +++ b/example/ck_tile/12_smoothquant/instances/smoothquant_fp16_n2048_instance.cpp @@ -0,0 +1,14 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "smoothquant_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd 2p +template float smoothquant_>(const S&, A); +template float smoothquant_>(const S&, A); +template float smoothquant_>(const S&, A); +template float smoothquant_>(const S&, A); + +// clang-format on diff --git a/example/ck_tile/12_smoothquant/instances/smoothquant_fp16_n256_instance.cpp b/example/ck_tile/12_smoothquant/instances/smoothquant_fp16_n256_instance.cpp new file mode 100644 index 000000000..2b1bca7aa --- /dev/null +++ b/example/ck_tile/12_smoothquant/instances/smoothquant_fp16_n256_instance.cpp @@ -0,0 +1,12 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "smoothquant_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd 2p +template float smoothquant_>(const S&, A); +template float smoothquant_>(const S&, A); +template float smoothquant_>(const S&, A); +// clang-format on diff --git a/example/ck_tile/12_smoothquant/instances/smoothquant_fp16_n3072_instance.cpp b/example/ck_tile/12_smoothquant/instances/smoothquant_fp16_n3072_instance.cpp new file mode 100644 index 000000000..205ba130e --- /dev/null +++ b/example/ck_tile/12_smoothquant/instances/smoothquant_fp16_n3072_instance.cpp @@ -0,0 +1,14 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "smoothquant_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd 2p +template float smoothquant_>(const S&, A); +template float smoothquant_>(const S&, A); +template float smoothquant_>(const S&, A); +template float smoothquant_>(const S&, A); + +// clang-format on diff --git a/example/ck_tile/12_smoothquant/instances/smoothquant_fp16_n4096_instance.cpp b/example/ck_tile/12_smoothquant/instances/smoothquant_fp16_n4096_instance.cpp new file mode 100644 index 000000000..96503ac91 --- /dev/null +++ b/example/ck_tile/12_smoothquant/instances/smoothquant_fp16_n4096_instance.cpp @@ -0,0 +1,14 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "smoothquant_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd 2p +template float smoothquant_>(const S&, A); +template float smoothquant_>(const S&, A); +template float smoothquant_>(const S&, A); +template float smoothquant_>(const S&, A); + +// clang-format on diff --git a/example/ck_tile/12_smoothquant/instances/smoothquant_fp16_n4096_tp_instance.cpp b/example/ck_tile/12_smoothquant/instances/smoothquant_fp16_n4096_tp_instance.cpp new file mode 100644 index 000000000..36e5e0bb1 --- /dev/null +++ b/example/ck_tile/12_smoothquant/instances/smoothquant_fp16_n4096_tp_instance.cpp @@ -0,0 +1,14 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "smoothquant_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd 2p +template float smoothquant_>(const S&, A); +template float smoothquant_>(const S&, A); +template float smoothquant_>(const S&, A); +template float smoothquant_>(const S&, A); + +// clang-format on diff --git a/example/ck_tile/12_smoothquant/instances/smoothquant_fp16_n512_instance.cpp b/example/ck_tile/12_smoothquant/instances/smoothquant_fp16_n512_instance.cpp new file mode 100644 index 000000000..f09932e29 --- /dev/null +++ b/example/ck_tile/12_smoothquant/instances/smoothquant_fp16_n512_instance.cpp @@ -0,0 +1,13 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "smoothquant_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd 2p +template float smoothquant_>(const S&, A); +template float smoothquant_>(const S&, A); +template float smoothquant_>(const S&, A); +template float smoothquant_>(const S&, A); +// clang-format on diff --git a/example/ck_tile/12_smoothquant/instances/smoothquant_fp16_n64_n128_instance.cpp b/example/ck_tile/12_smoothquant/instances/smoothquant_fp16_n64_n128_instance.cpp new file mode 100644 index 000000000..023cd0be6 --- /dev/null +++ b/example/ck_tile/12_smoothquant/instances/smoothquant_fp16_n64_n128_instance.cpp @@ -0,0 +1,12 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "smoothquant_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd 2p +template float smoothquant_>(const S&, A); +template float smoothquant_>(const S&, A); +template float smoothquant_>(const S&, A); +// clang-format on diff --git a/example/ck_tile/12_smoothquant/instances/smoothquant_fp16_n768_instance.cpp b/example/ck_tile/12_smoothquant/instances/smoothquant_fp16_n768_instance.cpp new file mode 100644 index 000000000..5dcf560c7 --- /dev/null +++ b/example/ck_tile/12_smoothquant/instances/smoothquant_fp16_n768_instance.cpp @@ -0,0 +1,12 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "smoothquant_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd 2p +template float smoothquant_>(const S&, A); +template float smoothquant_>(const S&, A); +template float smoothquant_>(const S&, A); +// clang-format on diff --git a/example/ck_tile/12_smoothquant/instances/smoothquant_fwd_api.cpp b/example/ck_tile/12_smoothquant/instances/smoothquant_fwd_api.cpp new file mode 100644 index 000000000..962755f6e --- /dev/null +++ b/example/ck_tile/12_smoothquant/instances/smoothquant_fwd_api.cpp @@ -0,0 +1,143 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include +#include "smoothquant.hpp" + +template +using trait_ = smoothquant_traits_; + +template +float smoothquant_dispatch(smoothquant_traits /*t*/, + smoothquant_args a, + const ck_tile::stream_config& s) +{ + float r = -1; + // clang-format off + // rm rn tm tn vn pd 2p + if(a.n <= 64) { + r = smoothquant_>(s, a); + } + else if(a.n <= 128) { + if (a.n % 2 == 0) + r = smoothquant_>(s, a); + else + r = smoothquant_>(s, a); + } + else if(a.n <= 256) { + if (a.n % 4 == 0) + r = smoothquant_>(s, a); + else if (a.n % 2 == 0) + r = smoothquant_>(s, a); + else + r = smoothquant_>(s, a); + } + else if(a.n <= 512) { + if (a.n % 8 == 0) + r = smoothquant_>(s, a); + else if (a.n % 4 == 0) + r = smoothquant_>(s, a); + else if (a.n % 2 == 0) + r = smoothquant_>(s, a); + else + r = smoothquant_>(s, a); + } + else if(a.n <= 768) { + if (a.n % 4 == 0) + r = smoothquant_>(s, a); + else if (a.n % 2 == 0) + r = smoothquant_>(s, a); + else + r = smoothquant_>(s, a); + } + else if(a.n <= 1024) { + if (a.n % 8 == 0) + r = smoothquant_>(s, a); + else if (a.n % 4 == 0) + r = smoothquant_>(s, a); + else if (a.n % 2 == 0) + r = smoothquant_>(s, a); + else + r = smoothquant_>(s, a); + } + else if(a.n <= 1536) { + if (a.n % 8 == 0) + r = smoothquant_>(s, a); + else if (a.n % 4 == 0) + r = smoothquant_>(s, a); + else if (a.n % 2 == 0) + r = smoothquant_>(s, a); + else + r = smoothquant_>(s, a); + } + else if(a.n <= 2048) { + if (a.n % 8 == 0) + r = smoothquant_>(s, a); + else if (a.n % 4 == 0) + r = smoothquant_>(s, a); + else if (a.n % 2 == 0) + r = smoothquant_>(s, a); + else + r = smoothquant_>(s, a); + } + else if(a.n <= 3072) { + if (a.n % 8 == 0) + r = smoothquant_>(s, a); + else if (a.n % 4 == 0) + r = smoothquant_>(s, a); + else if (a.n % 2 == 0) + r = smoothquant_>(s, a); + else + r = smoothquant_>(s, a); + } + else if(a.n <= 4096) { + if (a.n % 8 == 0) + r = smoothquant_>(s, a); + else if (a.n % 4 == 0) + r = smoothquant_>(s, a); + else if (a.n % 2 == 0) + r = smoothquant_>(s, 
a); + else + r = smoothquant_>(s, a); + } + else if(a.n > 4096) { + if (a.n % 8 == 0) + r = smoothquant_>(s, a); + else if (a.n % 4 == 0) + r = smoothquant_>(s, a); + else if (a.n % 2 == 0) + r = smoothquant_>(s, a); + else + r = smoothquant_>(s, a); + } + return r; + // clang-format on +} + +float smoothquant(smoothquant_traits t, smoothquant_args a, const ck_tile::stream_config& s) +{ + if(t.data_type.compare("fp16") == 0) + { + return smoothquant_dispatch(t, a, s); + } + else if(t.data_type.compare("bf16") == 0) + { + return smoothquant_dispatch(t, a, s); + } + else + throw std::runtime_error("Without supported instances!"); +} diff --git a/example/ck_tile/12_smoothquant/instances/smoothquant_instance_common.hpp b/example/ck_tile/12_smoothquant/instances/smoothquant_instance_common.hpp new file mode 100644 index 000000000..cdf93f6fc --- /dev/null +++ b/example/ck_tile/12_smoothquant/instances/smoothquant_instance_common.hpp @@ -0,0 +1,62 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include +#include "smoothquant.hpp" +#include + +#pragma once + +using S = ck_tile::stream_config; +using A = smoothquant_args; + +template +using trait_ = smoothquant_traits_; + +template +float smoothquant_(const S& s, A a) +{ + using DataType = typename Traits_::DataType; + + using PipelineProblem = ck_tile::SmoothquantPipelineProblem< + typename SmoothquantTypeConfig::XDataType, + typename SmoothquantTypeConfig::XScaleDataType, + typename SmoothquantTypeConfig::ComputeDataType, + typename SmoothquantTypeConfig::YScaleDataType, + typename SmoothquantTypeConfig::QYDataType, + typename Traits_::Shape, + Traits_::kPadN, + Traits_::kTwoPass>; + + using OnePassPipeline = ck_tile::SmoothquantPipelineOnePass; + using TwoPassPipeline = ck_tile::SmoothquantPipelineTwoPass; + using Pipeline = std::conditional_t; + + using Kernel = ck_tile::Smoothquant; + + const dim3 grids = Kernel::GridSize(a); + constexpr dim3 blocks = Kernel::BlockSize(); + constexpr ck_tile::index_t kBlockPerCu = 1; + + auto kargs = Kernel::MakeKargs(a); + if(s.log_level_ > 0) + std::cout << ", " << Kernel::GetName() << std::flush; + + return ck_tile::launch_kernel( + s, ck_tile::make_kernel(Kernel{}, grids, blocks, 0, kargs)); +} diff --git a/example/ck_tile/12_smoothquant/script/perf_test.sh b/example/ck_tile/12_smoothquant/script/perf_test.sh new file mode 100755 index 000000000..741eb32ec --- /dev/null +++ b/example/ck_tile/12_smoothquant/script/perf_test.sh @@ -0,0 +1,37 @@ + +EXE="$(find . 
-name tile_smoothquant -type f | head -n 1)" + +$EXE -m=1 -n=1 -e=1e-12 -v=1 -prec=bf16 -repeat=1000 +$EXE -m=700 -n=80 -e=1e-12 -v=1 -prec=bf16 -repeat=1000 +$EXE -m=700 -n=128 -e=1e-12 -v=1 -prec=bf16 -repeat=1000 +$EXE -m=700 -n=144 -e=1e-12 -v=1 -prec=bf16 -repeat=1000 +$EXE -m=700 -n=168 -e=1e-12 -v=1 -prec=bf16 -repeat=1000 +$EXE -m=700 -n=184 -e=1e-12 -v=1 -prec=bf16 -repeat=1000 +$EXE -m=700 -n=256 -e=1e-12 -v=1 -prec=bf16 -repeat=1000 +$EXE -m=700 -n=288 -e=1e-12 -v=1 -prec=bf16 -repeat=1000 +$EXE -m=700 -n=344 -e=1e-12 -v=1 -prec=bf16 -repeat=1000 +$EXE -m=700 -n=376 -e=1e-12 -v=1 -prec=bf16 -repeat=1000 +$EXE -m=700 -n=448 -e=1e-12 -v=1 -prec=bf16 -repeat=1000 +$EXE -m=700 -n=512 -e=1e-12 -v=1 -prec=bf16 -repeat=1000 +$EXE -m=700 -n=924 -e=1e-12 -v=1 -prec=bf16 -repeat=1000 +$EXE -m=700 -n=1024 -e=1e-12 -v=1 -prec=bf16 -repeat=1000 +$EXE -m=700 -n=1078 -e=1e-12 -v=1 -prec=bf16 -repeat=1000 +$EXE -m=700 -n=1996 -e=1e-12 -v=1 -prec=bf16 -repeat=1000 +$EXE -m=700 -n=4080 -e=1e-12 -v=1 -prec=bf16 -repeat=1000 + +$EXE -m=700 -n=80 -e=1e-12 -v=1 -prec=fp16 -repeat=1000 +$EXE -m=700 -n=128 -e=1e-12 -v=1 -prec=fp16 -repeat=1000 +$EXE -m=700 -n=144 -e=1e-12 -v=1 -prec=fp16 -repeat=1000 +$EXE -m=700 -n=168 -e=1e-12 -v=1 -prec=fp16 -repeat=1000 +$EXE -m=700 -n=184 -e=1e-12 -v=1 -prec=fp16 -repeat=1000 +$EXE -m=700 -n=256 -e=1e-12 -v=1 -prec=fp16 -repeat=1000 +$EXE -m=700 -n=288 -e=1e-12 -v=1 -prec=fp16 -repeat=1000 +$EXE -m=700 -n=344 -e=1e-12 -v=1 -prec=fp16 -repeat=1000 +$EXE -m=700 -n=376 -e=1e-12 -v=1 -prec=fp16 -repeat=1000 +$EXE -m=700 -n=448 -e=1e-12 -v=1 -prec=fp16 -repeat=1000 +$EXE -m=700 -n=512 -e=1e-12 -v=1 -prec=fp16 -repeat=1000 +$EXE -m=700 -n=924 -e=1e-12 -v=1 -prec=fp16 -repeat=1000 +$EXE -m=700 -n=1024 -e=1e-12 -v=1 -prec=fp16 -repeat=1000 +$EXE -m=700 -n=1078 -e=1e-12 -v=1 -prec=fp16 -repeat=1000 +$EXE -m=700 -n=1996 -e=1e-12 -v=1 -prec=fp16 -repeat=1000 +$EXE -m=700 -n=4080 -e=1e-12 -v=1 -prec=fp16 -repeat=1000 \ No newline at end of file diff 
--git a/example/ck_tile/12_smoothquant/script/smoke_test.sh b/example/ck_tile/12_smoothquant/script/smoke_test.sh new file mode 100755 index 000000000..d08e06396 --- /dev/null +++ b/example/ck_tile/12_smoothquant/script/smoke_test.sh @@ -0,0 +1,30 @@ +#!/bin/sh +EXE="$(find . -name tile_smoothquant -type f | head -n 1)" + +for pr_i in "fp16" "bf16" ; do +$EXE -prec=$pr_i -m=99 -n=13 +$EXE -prec=$pr_i -m=17 -n=16 +$EXE -prec=$pr_i -m=1 -n=100 +$EXE -prec=$pr_i -m=4 -n=128 +$EXE -prec=$pr_i -m=80 -n=127 +$EXE -prec=$pr_i -m=22 -n=255 -stride=256 +$EXE -prec=$pr_i -m=7 -n=599 +$EXE -prec=$pr_i -m=19 -n=512 +$EXE -prec=$pr_i -m=33 -n=313 -stride=1000 +$EXE -prec=$pr_i -m=11 -n=510 +$EXE -prec=$pr_i -m=171 -n=676 -stride=818 +$EXE -prec=$pr_i -m=91 -n=636 +$EXE -prec=$pr_i -m=12 -n=768 -stride=800 +$EXE -prec=$pr_i -m=100 -n=766 -stride=812 +$EXE -prec=$pr_i -m=31 -n=1024 +$EXE -prec=$pr_i -m=64 -n=1000 -stride=1004 +$EXE -prec=$pr_i -m=8 -n=1501 +$EXE -prec=$pr_i -m=3 -n=1826 +$EXE -prec=$pr_i -m=5 -n=2040 +$EXE -prec=$pr_i -m=7 -n=2734 +$EXE -prec=$pr_i -m=1 -n=3182 +$EXE -prec=$pr_i -m=9 -n=4096 +$EXE -prec=$pr_i -m=3 -n=8192 +$EXE -prec=$pr_i -m=1 -n=10547 +$EXE -prec=$pr_i -m=3 -n=17134 +done diff --git a/example/ck_tile/12_smoothquant/smoothquant.cpp b/example/ck_tile/12_smoothquant/smoothquant.cpp new file mode 100644 index 000000000..ed01d654f --- /dev/null +++ b/example/ck_tile/12_smoothquant/smoothquant.cpp @@ -0,0 +1,218 @@ +#include "ck_tile/host.hpp" +#include "smoothquant.hpp" +#include + +// different threshold for different dtype +template +auto get_elimit() +{ + double rtol = 1e-5; + double atol = 1e-5; + return ck_tile::make_tuple(rtol, atol); +} + +template <> +auto get_elimit() +{ + double rtol = 1e-5; + double atol = 1e-5; + return ck_tile::make_tuple(rtol, atol); +} + +template <> +auto get_elimit() +{ + // due to rounding, int8 quantization might have 1 abs error + double rtol = 1; + double atol = 1; + return ck_tile::make_tuple(rtol, atol); +} + 
+auto create_args(int argc, char* argv[]) +{ + ck_tile::ArgParser arg_parser; + arg_parser.insert("m", "3328", "m dimension") + .insert("n", "4096", "n dimension") + .insert("stride", "-1", "stride per row, if -1 then equal to n") + .insert("v", "1", "cpu validation or not") + .insert("kname", "1", "print kernel name or not") + .insert("prec", "fp16", "precision") + .insert("warmup", "5", "cold iter") + .insert("repeat", "20", "hot iter"); + + bool result = arg_parser.parse(argc, argv); + return std::make_tuple(result, arg_parser); +} + +template +bool run(const ck_tile::ArgParser& arg_parser) +{ + ck_tile::index_t m = arg_parser.get_int("m"); + ck_tile::index_t n = arg_parser.get_int("n"); + ck_tile::index_t stride = arg_parser.get_int("stride"); + if(stride < 0) + stride = n; + std::string data_type = arg_parser.get_str("prec"); + int kname = arg_parser.get_int("kname"); + int do_validation = arg_parser.get_int("v"); + int warmup = arg_parser.get_int("warmup"); + int repeat = arg_parser.get_int("repeat"); + + assert(stride >= n); + + using TypeConfig = SmoothquantTypeConfig; + + using XDataType = typename TypeConfig::XDataType; + using XScaleDataType = typename TypeConfig::XScaleDataType; + using YScaleDataType = typename TypeConfig::YScaleDataType; + using QYDataType = typename TypeConfig::QYDataType; + using ComputeDataType = typename TypeConfig::ComputeDataType; + + // host verify + ck_tile::HostTensor x_host({m, n}, {stride, 1}); + ck_tile::HostTensor xscale_host({n}); + + ck_tile::HostTensor yscale_host_ref({m}, {1}); + ck_tile::HostTensor yscale_host_dev({m}, {1}); + + ck_tile::HostTensor qy_host_ref({m, n}, {stride, 1}); + ck_tile::HostTensor qy_host_dev({m, n}, {stride, 1}); + + ck_tile::FillUniformDistribution{-.5f, .5f}(x_host); + ck_tile::FillUniformDistribution{1e-3, .5f}(xscale_host); + + ck_tile::DeviceMem x_buf(x_host.get_element_space_size_in_bytes()); + ck_tile::DeviceMem xscale_buf(xscale_host.get_element_space_size_in_bytes()); + 
ck_tile::DeviceMem yscale_buf(yscale_host_dev.get_element_space_size_in_bytes()); + ck_tile::DeviceMem qy_buf(qy_host_dev.get_element_space_size_in_bytes()); + + x_buf.ToDevice(x_host.data()); + xscale_buf.ToDevice(xscale_host.data()); + + std::cout << "[" << data_type << "]" + << " m:" << m << ", n:" << n << ", stride:" << stride << std::flush; + + smoothquant_traits traits{data_type}; + + smoothquant_args args{x_buf.GetDeviceBuffer(), + xscale_buf.GetDeviceBuffer(), + yscale_buf.GetDeviceBuffer(), + qy_buf.GetDeviceBuffer(), + m, + n, + stride}; + + float ave_time = smoothquant( + traits, args, ck_tile::stream_config{nullptr, true, kname ? 1 : 0, warmup, repeat}); + + std::size_t num_byte = sizeof(XDataType) * m * n + sizeof(XScaleDataType) * n + + sizeof(YScaleDataType) * m + sizeof(QYDataType) * m * n; + + float gb_per_sec = num_byte / 1.E6 / ave_time; + std::cout << ", " << ave_time * 1.E3 << " us, " << gb_per_sec << " GB/s" << std::flush; + + bool pass = true; + + if(do_validation) + { + using YDataType = ComputeDataType; + ck_tile::HostTensor y_host({m, n}, {stride, 1}); + // smooth outlier + { + auto f = [&](auto n_) { + auto v_xscale = ck_tile::type_convert(xscale_host(n_)); + + for(int m_ = 0; m_ < m; ++m_) + { + auto v_x = ck_tile::type_convert(x_host(m_, n_)); + y_host(m_, n_) = v_x * v_xscale; + } + }; + + ck_tile::make_ParallelTensorFunctor(f, xscale_host.get_element_space_size())( + std::thread::hardware_concurrency()); + } + + // yscale + { + ck_tile::HostTensor y_rowwise_amax_host({m}); + + using ReduceAmax = ck_tile::ReduceOp::AbsMax; + ck_tile::reference_reduce( + y_host, y_rowwise_amax_host, ReduceAmax{}); + + auto op = [](const auto& v0) { + return v0 / + ck_tile::type_convert(ck_tile::numeric::max()); + }; + ck_tile::reference_unary_elementwise( + y_rowwise_amax_host, yscale_host_ref, op); + + yscale_buf.FromDevice(yscale_host_dev.mData.data()); + + auto [rtol, atol] = get_elimit(); + pass &= ck_tile::check_err(yscale_host_dev, + 
yscale_host_ref, + std::string("yscale Error: Incorrect results!"), + rtol, + atol); + } + + // rowwise quantization + { + ck_tile::reference_rowwise_quantization2d( + y_host, yscale_host_ref, qy_host_ref); + + qy_buf.FromDevice(qy_host_dev.data()); + auto [rtol, atol] = get_elimit(); + + if(stride == n) + { + pass = ck_tile::check_err(qy_host_dev, + qy_host_ref, + std::string("qy Error: Incorrect results!"), + rtol, + atol); + } + else + { + for(int i_r = 0; i_r < m; i_r++) + { + std::vector qy_host_dev_row(qy_host_dev.begin() + i_r * stride, + qy_host_dev.begin() + i_r * stride + n); + std::vector qy_host_ref_row(qy_host_ref.begin() + i_r * stride, + qy_host_ref.begin() + i_r * stride + n); + pass &= ck_tile::check_err(qy_host_dev_row, + qy_host_ref_row, + std::string("qy[") + std::to_string(i_r) + + std::string("] Error: Incorrect results!"), + rtol, + atol); + } + } + } + + std::cout << ", valid:" << (pass ? "y" : "n") << std::flush << std::endl; + } + + return pass; +} + +int main(int argc, char* argv[]) +{ + auto [result, arg_parser] = create_args(argc, argv); + if(!result) + return -1; + + const std::string data_type = arg_parser.get_str("prec"); + if(data_type == "fp16") + { + return run(arg_parser) ? 0 : -2; + } + else if(data_type == "bf16") + { + return run(arg_parser) ? 0 : -2; + } + + return -3; +} diff --git a/example/ck_tile/12_smoothquant/smoothquant.hpp b/example/ck_tile/12_smoothquant/smoothquant.hpp new file mode 100644 index 000000000..26a598db5 --- /dev/null +++ b/example/ck_tile/12_smoothquant/smoothquant.hpp @@ -0,0 +1,114 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck_tile/core.hpp" +#include "ck_tile/host/kernel_launch.hpp" +#include "ck_tile/ops/smoothquant.hpp" +#include + +template +struct SmoothquantTypeConfig; + +template <> +struct SmoothquantTypeConfig +{ + using XDataType = ck_tile::half_t; + using XScaleDataType = float; + using YScaleDataType = float; + using QYDataType = ck_tile::int8_t; + using ComputeDataType = float; +}; + +template <> +struct SmoothquantTypeConfig +{ + using XDataType = ck_tile::bf16_t; + using XScaleDataType = float; + using YScaleDataType = float; + using QYDataType = ck_tile::int8_t; + using ComputeDataType = float; +}; + +// runtime args +struct smoothquant_args : public ck_tile::SmoothquantHostArgs +{ +}; + +// this is used to pattern-match internl kernel implementation, not to instantiate kernel +template +struct smoothquant_traits_ +{ + using DataType = ck_tile::remove_cvref_t; + + static constexpr bool is_warp_per_row = ThreadPerBlock_N_ <= warpSize; + static_assert((ThreadPerBlock_M_ * ThreadPerBlock_N_) % warpSize == 0); + static constexpr ck_tile::index_t total_warps = + (ThreadPerBlock_M_ * ThreadPerBlock_N_) / warpSize; + + // num of warps along m + static constexpr ck_tile::index_t BlockWarps_M = []() { + if constexpr(is_warp_per_row) + { + static_assert(warpSize % ThreadPerBlock_N_ == 0); + return total_warps * (warpSize / ThreadPerBlock_N_); + } + else + { + // static_assert(warpSize % ThreadPerBlock_M_ == 0); + return total_warps / (ThreadPerBlock_N_ / warpSize); + } + }(); + + // num of warps along n + static constexpr ck_tile::index_t BlockWarps_N = []() { + if constexpr(is_warp_per_row) + { + static_assert(warpSize % ThreadPerBlock_N_ == 0); + return 1; + } + else + { + static_assert(ThreadPerBlock_N_ % warpSize == 0); + return ThreadPerBlock_N_ / warpSize; + } + }(); + + static constexpr ck_tile::index_t Repeat_M = Repeat_M_; + static constexpr ck_tile::index_t Repeat_N = Repeat_N_; + + static constexpr ck_tile::index_t Block_M = Repeat_M_ * 
ThreadPerBlock_M_; + static constexpr ck_tile::index_t Block_N = Repeat_N_ * ThreadPerBlock_N_ * Vector_N_; + + static constexpr ck_tile::index_t Warp_M = ThreadPerBlock_M_ / BlockWarps_M; + static constexpr ck_tile::index_t Warp_N = ThreadPerBlock_N_ / BlockWarps_N * Vector_N_; + + using BlockTile = ck_tile::sequence; + using BlockWarps = ck_tile::sequence; + using WarpTile = ck_tile::sequence; + using Vector = ck_tile::sequence<1, Vector_N_>; + + using Shape = ck_tile::Generic2dBlockShape; + + static constexpr bool kPadN = kPadN_; + static constexpr bool kTwoPass = kTwoPass_; +}; + +template +float smoothquant_(const ck_tile::stream_config& s, smoothquant_args a); + +// This is the public API, will be generated by script +struct smoothquant_traits +{ + std::string data_type; +}; + +float smoothquant(smoothquant_traits, smoothquant_args, const ck_tile::stream_config&); diff --git a/example/ck_tile/CMakeLists.txt b/example/ck_tile/CMakeLists.txt index e404e5019..9dd9a6ca3 100644 --- a/example/ck_tile/CMakeLists.txt +++ b/example/ck_tile/CMakeLists.txt @@ -11,3 +11,4 @@ add_subdirectory(06_permute) add_subdirectory(09_topk_softmax) add_subdirectory(10_rmsnorm2d) add_subdirectory(11_add_rmsnorm2d_rdquant) +add_subdirectory(12_smoothquant) diff --git a/include/ck_tile/ops/add_rmsnorm2d_rdquant.hpp b/include/ck_tile/ops/add_rmsnorm2d_rdquant.hpp index fb8d7221b..d06d8529a 100644 --- a/include/ck_tile/ops/add_rmsnorm2d_rdquant.hpp +++ b/include/ck_tile/ops/add_rmsnorm2d_rdquant.hpp @@ -4,7 +4,6 @@ #pragma once #include "ck_tile/ops/add_rmsnorm2d_rdquant/kernel/add_rmsnorm2d_rdquant_fwd_kernel.hpp" -#include "ck_tile/ops/add_rmsnorm2d_rdquant/kernel/add_rmsnorm2d_rdquant_fwd_shape.hpp" #include "ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_default_policy.hpp" #include "ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_one_pass.hpp" #include 
"ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_problem.hpp" diff --git a/include/ck_tile/ops/add_rmsnorm2d_rdquant/kernel/add_rmsnorm2d_rdquant_fwd_kernel.hpp b/include/ck_tile/ops/add_rmsnorm2d_rdquant/kernel/add_rmsnorm2d_rdquant_fwd_kernel.hpp index 4a0e29035..f06910db3 100644 --- a/include/ck_tile/ops/add_rmsnorm2d_rdquant/kernel/add_rmsnorm2d_rdquant_fwd_kernel.hpp +++ b/include/ck_tile/ops/add_rmsnorm2d_rdquant/kernel/add_rmsnorm2d_rdquant_fwd_kernel.hpp @@ -9,15 +9,16 @@ namespace ck_tile { // host side args +// X = A + B, Y = Rmsnorm2d(X), QY = RowwiseDynamicQuant(Y) = SaturateCast(Y / YScale) struct AddRmsnorm2dRdquantFwdHostArgs { - const void* p_a; - const void* p_b; - const void* p_gamma; + const void* p_a; // [m ,n], input, fp16/bf16 + const void* p_b; // [m ,n], input, fp16/bf16 + const void* p_gamma; // [1, n], gamma, prec same as input - void* p_x; - void* p_yscale; - void* p_qy; + void* p_x; // [m, n], output, p_a + p_b, fp16/bf16 + void* p_yscale; // [m, 1], output, rowwise quant scale (amax / 127) of reuslt of rmsnorm2d(x) + void* p_qy; // [m, n], output, result of quant tensor of rmsnorm2d(x) int8 float epsilon; @@ -90,7 +91,7 @@ struct AddRmsnorm2dRdquantFwd CK_TILE_HOST static constexpr auto GridSize(const Hargs& hargs) { - return integer_divide_ceil(hargs.m, Block_M); + return dim3(integer_divide_ceil(hargs.m, Block_M)); } CK_TILE_HOST static constexpr auto BlockSize() { return Problem::BlockShape::BlockSize; } @@ -170,7 +171,7 @@ struct AddRmsnorm2dRdquantFwd number<1>{}); const auto tmp2_ = - pad_tensor_view(tmp_, make_tuple(number{}), sequence{}); + pad_tensor_view(tmp_, make_tuple(number{}), sequence{}); return make_tile_window(tmp2_, make_tuple(number{}), {0}); }(); diff --git a/include/ck_tile/ops/add_rmsnorm2d_rdquant/kernel/add_rmsnorm2d_rdquant_fwd_shape.hpp b/include/ck_tile/ops/add_rmsnorm2d_rdquant/kernel/add_rmsnorm2d_rdquant_fwd_shape.hpp deleted file mode 100644 index 4bc7db434..000000000 --- 
a/include/ck_tile/ops/add_rmsnorm2d_rdquant/kernel/add_rmsnorm2d_rdquant_fwd_shape.hpp +++ /dev/null @@ -1,78 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -#pragma once - -#include "ck_tile/core.hpp" - -namespace ck_tile { -/* -// clang-format off - -4-level descriptor: BlockTile-> WarpPerBlock-> WarpTile-> Vector - - Block_N (Warp_N * WarpPerBlock_N * Repeat_N ) - +<----------------------< Repeat_N(2)>--------------------->+ - | | - +<-- -->+ - Warp_N - +--------------+--------------+--------------+--------------+----+----------------+ - Warp_M | wrap_0 | wrap_1 | | ^ ^ - +--------------+--------------+ | | - | wrap_2 | wrap_3 | | v - +--------------+--------------+--------------+--------------+----+ Block_M - | | | - + + | - | | | v - +--------------+--------------+--------------+--------------+ + - - each Warp-tile (e.g 16 thrd per row) - - Vector_N (contiguous pixels each thrd holds along N, or vector size) - +-----------+-----------+-----------+-----------+-----------+ - | thrd_0 | thrd_1 | thrd_2 | thrd_3 | ... Vector_M - +-----------+-----------+-----------+-----------+-----------+ - | thrd_16 | thrd_17 | thrd_18 | thrd_19 | ... 
- +-----------+-----------+-----------+-----------+-----------+ -// clang-format on -*/ -template - typename WarpPerBlock_, // num warps along seq - typename WarpTile_, // warp size, seq - typename Vector_, // contiguous pixels(vector size) along seq - index_t BlockSize_ = - warpSize* reduce_on_sequence(WarpPerBlock_{}, multiplies{}, number<1>{})> -struct AddRmsnorm2dRdquantShape -{ - // block size - static constexpr index_t Block_M = BlockTile_::at(number<0>{}); - static constexpr index_t Block_N = BlockTile_::at(number<1>{}); - - // num warps along seq, within each block - static constexpr index_t WarpPerBlock_M = WarpPerBlock_::at(number<0>{}); - static constexpr index_t WarpPerBlock_N = WarpPerBlock_::at(number<1>{}); - - // warp size - static constexpr index_t Warp_M = WarpTile_::at(number<0>{}); - static constexpr index_t Warp_N = WarpTile_::at(number<1>{}); - - static_assert(Block_M % (WarpPerBlock_M * Warp_M) == 0); - static_assert(Block_N % (WarpPerBlock_N * Warp_N) == 0); - // repeat of each thread along seq - static constexpr index_t Repeat_M = Block_M / (WarpPerBlock_M * Warp_M); - static constexpr index_t Repeat_N = Block_N / (WarpPerBlock_N * Warp_N); - - // vector size along seq - static constexpr index_t Vector_M = Vector_::at(number<0>{}); - static constexpr index_t Vector_N = Vector_::at(number<1>{}); - - static_assert(Warp_M % Vector_M == 0); - static_assert(Warp_N % Vector_N == 0); - // num of threads along seq, within each warp - static constexpr index_t ThreadPerWarp_M = Warp_M / Vector_M; - static constexpr index_t ThreadPerWarp_N = Warp_N / Vector_N; - - static constexpr index_t BlockSize = BlockSize_; -}; - -} // namespace ck_tile diff --git a/include/ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_default_policy.hpp b/include/ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_default_policy.hpp index 73ba633b1..0b9bae4e9 100644 --- 
a/include/ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_default_policy.hpp +++ b/include/ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_default_policy.hpp @@ -26,6 +26,7 @@ struct AddRmsnorm2dRdquantFwdPipelineDefaultPolicy sequence<1, 1, 2, 2>, sequence<0, 3, 0, 3>>{}); } + template CK_TILE_DEVICE static constexpr auto MakeGammaBlockTileDistribution() { diff --git a/include/ck_tile/ops/layernorm2d/kernel/layernorm2d_fwd_kernel.hpp b/include/ck_tile/ops/layernorm2d/kernel/layernorm2d_fwd_kernel.hpp index 9a2e06d05..f5a214ba5 100644 --- a/include/ck_tile/ops/layernorm2d/kernel/layernorm2d_fwd_kernel.hpp +++ b/include/ck_tile/ops/layernorm2d/kernel/layernorm2d_fwd_kernel.hpp @@ -117,7 +117,7 @@ struct Layernorm2dFwd CK_TILE_HOST static constexpr auto GridSize(const Hargs& hargs) { - return (hargs.m + Block_M - 1) / Block_M; + return dim3(integer_divide_ceil(hargs.m, Block_M)); } CK_TILE_HOST static constexpr auto BlockSize() { return Problem::BlockShape::BlockSize; } @@ -165,7 +165,7 @@ struct Layernorm2dFwd return base_str; }(); - return _SS_("layernorm2d_fwd_") + _SS_(prec_str) + "_" + + return _SS_("layernorm2d_fwd_") + _SS_(prec_str) + "_" + _TS_(S_::Block_M) + "x" + _TS_(S_::Block_N) + "_" + _TS_(S_::WarpPerBlock_M) + "x" + _TS_(S_::WarpPerBlock_N) + "_" + _TS_(S_::Warp_M) + "x" + _TS_(S_::Warp_N) + "_" + _TS_(S_::Vector_M) + "x" + _TS_(S_::Vector_N) + "_" + _SS_(Pipeline::name) + surfix; diff --git a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_default_policy.hpp b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_default_policy.hpp index 6661cddf4..02fd5f7b9 100644 --- a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_default_policy.hpp +++ b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_default_policy.hpp @@ -26,6 +26,7 @@ struct Layernorm2dFwdPipelineDefaultPolicy sequence<1, 1, 2, 2>, sequence<0, 3, 0, 3>>{}); } + 
template CK_TILE_DEVICE static constexpr auto MakeGammaBetaBlockTileDistribution() { diff --git a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_problem.hpp b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_problem.hpp index 7ec830add..17ff80f47 100644 --- a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_problem.hpp +++ b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_problem.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #pragma once diff --git a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_traits.hpp b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_traits.hpp index fb327f74a..ed9e18be3 100644 --- a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_traits.hpp +++ b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_traits.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #pragma once diff --git a/include/ck_tile/ops/reduce/block/block_reduce2d.hpp b/include/ck_tile/ops/reduce/block/block_reduce2d.hpp index 3c6814711..d6ca98e7b 100644 --- a/include/ck_tile/ops/reduce/block/block_reduce2d.hpp +++ b/include/ck_tile/ops/reduce/block/block_reduce2d.hpp @@ -29,7 +29,8 @@ struct BlockReduce2d sweep_tile( [&](auto... 
idx_) { constexpr auto idx_0 = make_tuple(make_tuple(idx_[number<0>{}]...)[number<0>{}]); - y_tensor(idx_0) = reduce_func(y_tensor(idx_0), x_tensor[idx_]...); + y_tensor(idx_0) = reduce_func( + y_tensor(idx_0), ck_tile::type_convert(x_tensor[idx_])...); }, ReducePacksPerXDim{}); #if 0 diff --git a/include/ck_tile/ops/rmsnorm2d.hpp b/include/ck_tile/ops/rmsnorm2d.hpp index f0a6cf960..8d075dc5f 100644 --- a/include/ck_tile/ops/rmsnorm2d.hpp +++ b/include/ck_tile/ops/rmsnorm2d.hpp @@ -4,7 +4,6 @@ #pragma once #include "ck_tile/ops/rmsnorm2d/kernel/rmsnorm2d_fwd_kernel.hpp" -#include "ck_tile/ops/rmsnorm2d/kernel/rmsnorm2d_fwd_shape.hpp" #include "ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_default_policy.hpp" #include "ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_one_pass.hpp" #include "ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_problem.hpp" diff --git a/include/ck_tile/ops/rmsnorm2d/kernel/rmsnorm2d_fwd_kernel.hpp b/include/ck_tile/ops/rmsnorm2d/kernel/rmsnorm2d_fwd_kernel.hpp index 99084a25e..fd89cc36c 100644 --- a/include/ck_tile/ops/rmsnorm2d/kernel/rmsnorm2d_fwd_kernel.hpp +++ b/include/ck_tile/ops/rmsnorm2d/kernel/rmsnorm2d_fwd_kernel.hpp @@ -11,11 +11,11 @@ namespace ck_tile { // host side args struct Rmsnorm2dFwdHostArgs { - const void* p_x; - const void* p_gamma; + const void* p_x; // [m ,n], input, fp16/bf16 + const void* p_gamma; // [1, n], gamma, prec same as input - void* p_y; - void* p_invRms; + void* p_y; // [m, n], output, fp16/bf16 + void* p_invRms; // [m, 1], output inv-rms, prec same as input, nullptr if not used float epsilon; @@ -83,7 +83,7 @@ struct Rmsnorm2dFwd CK_TILE_HOST static constexpr auto GridSize(const Hargs& hargs) { - return (hargs.m + Block_M - 1) / Block_M; + return dim3(integer_divide_ceil(hargs.m, Block_M)); } CK_TILE_HOST static constexpr auto BlockSize() { return Problem::BlockShape::BlockSize; } @@ -149,7 +149,7 @@ struct Rmsnorm2dFwd number<1>{}); const auto tmp2_ = - pad_tensor_view(tmp_, 
make_tuple(number{}), sequence{}); + pad_tensor_view(tmp_, make_tuple(number{}), sequence{}); return make_tile_window(tmp2_, make_tuple(number{}), {0}); }(); diff --git a/include/ck_tile/ops/rmsnorm2d/kernel/rmsnorm2d_fwd_shape.hpp b/include/ck_tile/ops/rmsnorm2d/kernel/rmsnorm2d_fwd_shape.hpp deleted file mode 100644 index fc4b9f470..000000000 --- a/include/ck_tile/ops/rmsnorm2d/kernel/rmsnorm2d_fwd_shape.hpp +++ /dev/null @@ -1,78 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -#pragma once - -#include "ck_tile/core.hpp" - -namespace ck_tile { -/* -// clang-format off - -4-level descriptor: BlockTile-> WarpPerBlock-> WarpTile-> Vector - - Block_N (Warp_N * WarpPerBlock_N * Repeat_N ) - +<----------------------< Repeat_N(2)>--------------------->+ - | | - +<-- -->+ - Warp_N - +--------------+--------------+--------------+--------------+----+----------------+ - Warp_M | wrap_0 | wrap_1 | | ^ ^ - +--------------+--------------+ | | - | wrap_2 | wrap_3 | | v - +--------------+--------------+--------------+--------------+----+ Block_M - | | | - + + | - | | | v - +--------------+--------------+--------------+--------------+ + - - each Warp-tile (e.g 16 thrd per row) - - Vector_N (contiguous pixels each thrd holds along N, or vector size) - +-----------+-----------+-----------+-----------+-----------+ - | thrd_0 | thrd_1 | thrd_2 | thrd_3 | ... Vector_M - +-----------+-----------+-----------+-----------+-----------+ - | thrd_16 | thrd_17 | thrd_18 | thrd_19 | ... 
- +-----------+-----------+-----------+-----------+-----------+ -// clang-format on -*/ -template - typename WarpPerBlock_, // num warps along seq - typename WarpTile_, // warp size, seq - typename Vector_, // contiguous pixels(vector size) along seq - index_t BlockSize_ = - warpSize* reduce_on_sequence(WarpPerBlock_{}, multiplies{}, number<1>{})> -struct Rmsnorm2dShape -{ - // block size - static constexpr index_t Block_M = BlockTile_::at(number<0>{}); - static constexpr index_t Block_N = BlockTile_::at(number<1>{}); - - // num warps along seq, within each block - static constexpr index_t WarpPerBlock_M = WarpPerBlock_::at(number<0>{}); - static constexpr index_t WarpPerBlock_N = WarpPerBlock_::at(number<1>{}); - - // warp size - static constexpr index_t Warp_M = WarpTile_::at(number<0>{}); - static constexpr index_t Warp_N = WarpTile_::at(number<1>{}); - - static_assert(Block_M % (WarpPerBlock_M * Warp_M) == 0); - static_assert(Block_N % (WarpPerBlock_N * Warp_N) == 0); - // repeat of each thread along seq - static constexpr index_t Repeat_M = Block_M / (WarpPerBlock_M * Warp_M); - static constexpr index_t Repeat_N = Block_N / (WarpPerBlock_N * Warp_N); - - // vector size along seq - static constexpr index_t Vector_M = Vector_::at(number<0>{}); - static constexpr index_t Vector_N = Vector_::at(number<1>{}); - - static_assert(Warp_M % Vector_M == 0); - static_assert(Warp_N % Vector_N == 0); - // num of threads along seq, within each warp - static constexpr index_t ThreadPerWarp_M = Warp_M / Vector_M; - static constexpr index_t ThreadPerWarp_N = Warp_N / Vector_N; - - static constexpr index_t BlockSize = BlockSize_; -}; - -} // namespace ck_tile diff --git a/include/ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_default_policy.hpp b/include/ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_default_policy.hpp index e4814cf45..b258dcbae 100644 --- a/include/ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_default_policy.hpp +++ 
b/include/ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_default_policy.hpp @@ -26,6 +26,7 @@ struct Rmsnorm2dFwdPipelineDefaultPolicy sequence<1, 1, 2, 2>, sequence<0, 3, 0, 3>>{}); } + template CK_TILE_DEVICE static constexpr auto MakeGammaBlockTileDistribution() { diff --git a/include/ck_tile/ops/smoothquant.hpp b/include/ck_tile/ops/smoothquant.hpp new file mode 100644 index 000000000..c9e459765 --- /dev/null +++ b/include/ck_tile/ops/smoothquant.hpp @@ -0,0 +1,12 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck_tile/ops/smoothquant/kernel/smoothquant_kernel.hpp" +#include "ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_default_policy.hpp" +#include "ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_one_pass.hpp" +#include "ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_problem.hpp" +#include "ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_two_pass.hpp" +#include "ck_tile/ops/common/generic_2d_block_shape.hpp" +#include "ck_tile/ops/common/tensor_layout.hpp" diff --git a/include/ck_tile/ops/smoothquant/kernel/smoothquant_kernel.hpp b/include/ck_tile/ops/smoothquant/kernel/smoothquant_kernel.hpp new file mode 100644 index 000000000..6ec333516 --- /dev/null +++ b/include/ck_tile/ops/smoothquant/kernel/smoothquant_kernel.hpp @@ -0,0 +1,176 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck_tile/core.hpp" +#include "ck_tile/ops/common.hpp" + +namespace ck_tile { + +// host side args +struct SmoothquantHostArgs +{ + const void* p_x; // [m ,n], input, fp16/bf16 + const void* p_xscale; // [1, n], input, columnwise scale, fp32 + + void* p_yscale; // [m, 1], output, rowwise quant scale (amax / 127) of (p_x * p_xscale) + void* p_qy; // [m, n], output, p_x * p_xscale / p_yscale + + index_t m; + index_t n; + index_t stride; // row_stride +}; + +// TODO: Extract some type to wrapper class +template +struct Smoothquant +{ + using Pipeline = remove_cvref_t; + using Problem = typename Pipeline::Problem; + + using XDataType = remove_cvref_t; + using XScaleDataType = remove_cvref_t; + using ComputeDataType = remove_cvref_t; + using YScaleDataType = remove_cvref_t; + using QYDataType = remove_cvref_t; + + static constexpr index_t Block_M = Problem::BlockShape::Block_M; + static constexpr index_t Block_N = Problem::BlockShape::Block_N; + static constexpr bool kPadM = false; // always no need to pad along M + static constexpr bool kPadN = Problem::kPadN; + static constexpr bool kTwoPass = Problem::kTwoPass; + + static constexpr index_t ThreadPerWarp_N = Problem::BlockShape::ThreadPerWarp_N; + static constexpr index_t Vector_N = Problem::BlockShape::Vector_N; + static constexpr index_t Repeat_N = Problem::BlockShape::Repeat_N; + + static constexpr auto I0 = number<0>{}; + static constexpr auto I1 = number<1>{}; + + struct Kargs + { + const void* p_x; + const void* p_xscale; + + void* p_yscale; + void* p_qy; + + index_t m; + index_t n; + index_t stride; // row_stride + }; + using Hargs = SmoothquantHostArgs; + + CK_TILE_HOST static constexpr Kargs MakeKargs(const Hargs& hargs) + { + return Kargs{ + hargs.p_x, hargs.p_xscale, hargs.p_yscale, hargs.p_qy, hargs.m, hargs.n, hargs.stride}; + } + + CK_TILE_HOST static constexpr auto GridSize(const Hargs& hargs) + { + return dim3(integer_divide_ceil(hargs.m, Block_M)); + } + + CK_TILE_HOST 
static constexpr auto BlockSize() { return Problem::BlockShape::BlockSize; } + + // clang-format off + template struct t2s; + template <> struct t2s { static constexpr const char * name = "fp32"; }; + template <> struct t2s { static constexpr const char * name = "fp16"; }; + template <> struct t2s { static constexpr const char * name = "bf16"; }; + template <> struct t2s { static constexpr const char * name = "fp8"; }; + template <> struct t2s { static constexpr const char * name = "bf8"; }; + // clang-format on + + // in byte + CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize() { return Pipeline::GetSmemSize(); } + + CK_TILE_HOST static std::string GetName() + { + // clang-format off + using S_ = typename Problem::BlockShape; + auto surfix = [&] () { + std::string n; + if (kPadN) n += "_pn"; + if (kTwoPass) n += "_2p"; + return n; }(); + + #define _SS_ std::string + #define _TS_ std::to_string + return _SS_("smoothquant_fwd_") + _SS_(t2s::name) + "_" + + _TS_(S_::Block_M) + "x" + _TS_(S_::Block_N) + "_" + _TS_(S_::WarpPerBlock_M) + "x" + _TS_(S_::WarpPerBlock_N) + "_" + + _TS_(S_::Warp_M) + "x" + _TS_(S_::Warp_N) + "_" + _TS_(S_::Vector_M) + "x" + _TS_(S_::Vector_N) + "_" + + _SS_(Pipeline::name) + surfix; + #undef _SS_ + #undef _TS_ + // clang-format on + } + + CK_TILE_DEVICE void operator()(Kargs kargs) const + { + const auto iM = get_block_id() * Block_M; + + const auto x_window = [&]() { + const auto tmp_ = make_naive_tensor_view( + static_cast(kargs.p_x), + make_tuple(kargs.m, kargs.n), + make_tuple(kargs.stride, 1), + number{}, + number<1>{}); + + const auto tmp2_ = pad_tensor_view( + tmp_, make_tuple(number{}, number{}), sequence{}); + return make_tile_window( + tmp2_, make_tuple(number{}, number{}), {iM, 0}); + }(); + + const auto xscale_window = [&]() { + const auto tmp_ = make_naive_tensor_view( + static_cast(kargs.p_xscale), + make_tuple(kargs.n), + make_tuple(1), + number{}, + number<1>{}); + + const auto tmp2_ = + pad_tensor_view(tmp_, 
make_tuple(number{}), sequence{}); + + return make_tile_window(tmp2_, make_tuple(number{}), {0}); + }(); + + auto yscale_window = [&]() { + const auto tmp_ = make_naive_tensor_view( + static_cast(kargs.p_yscale), + make_tuple(kargs.m), + make_tuple(1), + number<1>{}); + + const auto tmp2_ = + pad_tensor_view(tmp_, make_tuple(number{}), sequence{}); + + return make_tile_window(tmp2_, make_tuple(number{}), {iM}); + }(); + + auto qy_window = [&]() { + auto tmp_ = make_naive_tensor_view( + static_cast(kargs.p_qy), + make_tuple(kargs.m, kargs.n), + make_tuple(kargs.stride, 1), + number{}, + number<1>{}); + + auto tmp2_ = pad_tensor_view( + tmp_, make_tuple(number{}, number{}), sequence{}); + return make_tile_window( + tmp2_, make_tuple(number{}, number{}), {iM, 0}); + }(); + + __shared__ char smem[GetSmemSize()]; + + Pipeline{}(x_window, xscale_window, yscale_window, qy_window, kargs.n, smem); + } +}; + +} // namespace ck_tile diff --git a/include/ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_default_policy.hpp b/include/ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_default_policy.hpp new file mode 100644 index 000000000..ff81e69f0 --- /dev/null +++ b/include/ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_default_policy.hpp @@ -0,0 +1,95 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck_tile/core.hpp" +#include "ck_tile/ops/reduce/block/block_reduce2d_problem.hpp" +#include "ck_tile/ops/reduce/block/block_reduce2d.hpp" + +namespace ck_tile { + +struct SmoothquantPipelineDefaultPolicy +{ + template + CK_TILE_DEVICE static constexpr auto MakeXBlockTileDistribution() + { + using S = typename Problem::BlockShape; + + return make_static_tile_distribution( + tile_distribution_encoding< + sequence<>, + tuple, + sequence>, + tuple, sequence<1, 2>>, + tuple, sequence<2, 2>>, + sequence<1, 1, 2, 2>, + sequence<0, 3, 0, 3>>{}); + } + + template + CK_TILE_DEVICE static constexpr auto MakeXScaleBlockTileDistribution() + { + using S = typename Problem::BlockShape; + + return make_static_tile_distribution( + tile_distribution_encoding< + sequence, + tuple>, + tuple, sequence<0, 1>>, + tuple, sequence<1, 2>>, + sequence<1, 1>, + sequence<0, 3>>{}); + } + + template + CK_TILE_HOST_DEVICE static constexpr auto GetBlockReduce2d() + { + using P_ = BlockReduce2dProblem; + return BlockReduce2d{}; + } + + template + CK_TILE_HOST_DEVICE static constexpr auto GetBlockReduce2dSync() + { + using P_ = BlockReduce2dProblem; + return BlockReduce2dSync{}; + } + + template + CK_TILE_HOST_DEVICE static constexpr auto GetBlockReduce2dCrossWarpSync() + { + using P_ = BlockReduce2dProblem; + return BlockReduce2dCrossWarpSync{}; + } + + template + CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize() + { + if constexpr(Problem::kNeedCrossWarpSync) + { + using P_ = BlockReduce2dProblem; + + using block_reduce2d = BlockReduce2d; + using x_block_tile = + decltype(make_static_distributed_tensor( + MakeXBlockTileDistribution())); + using y_block_tile = decltype(block_reduce2d::template MakeYBlockTile()); + + return GetBlockReduce2dCrossWarpSync().template GetSmemSize(); + } + else + { + return 1; // zero size arrays are an extension + } + } +}; +} // namespace ck_tile diff --git 
a/include/ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_one_pass.hpp b/include/ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_one_pass.hpp new file mode 100644 index 000000000..d5b3780de --- /dev/null +++ b/include/ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_one_pass.hpp @@ -0,0 +1,94 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck_tile/core.hpp" +#include "ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_default_policy.hpp" +#include +#include + +namespace ck_tile { + +template +struct SmoothquantPipelineOnePass +{ + using Problem = ck_tile::remove_cvref_t; + using Policy = ck_tile::remove_cvref_t; + + using XDataType = ck_tile::remove_cvref_t; + using XScaleDataType = ck_tile::remove_cvref_t; + using ComputeDataType = ck_tile::remove_cvref_t; + using QYDataType = ck_tile::remove_cvref_t; + using YScaleDataType = ck_tile::remove_cvref_t; + + static constexpr bool kNeedCrossWarpSync = Problem::kNeedCrossWarpSync; + static constexpr bool kPadM = false; // TODO - BlockSmoothquantProblem::kPadM + static constexpr bool kPadN = Problem::kPadN; + + static constexpr const char* name = []() { + if constexpr(kNeedCrossWarpSync) + return "bpr_op"; // block per row + else + return "wpr_op"; // warp per row + }(); + + CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize() + { + return Policy::template GetSmemSize(); + } + + template + CK_TILE_DEVICE auto operator()(const XWindow& x_window_, + const XScaleWindow& xscale_window_, + YScaleWindow& yscale_window, + QYWindow& qy_window, + ck_tile::index_t, + void* smem) const + { + auto x_window = + make_tile_window(x_window_, Policy::template MakeXBlockTileDistribution()); + auto xscale_window = make_tile_window( + xscale_window_, Policy::template MakeXScaleBlockTileDistribution()); + + auto reduce_absmax_func = ReduceOp::AbsMax{}; + auto reduce_max_func = ReduceOp::Max{}; + auto block_reduce2d = 
Policy::template GetBlockReduce2d(); + auto block_reduce2d_sync = Policy::template GetBlockReduce2dSync(); + auto block_reduce2d_cross_warp_sync = + Policy::template GetBlockReduce2dCrossWarpSync(); + + const auto x = load_tile(x_window); + const auto xscale = load_tile(xscale_window); + auto y = tile_elementwise_in( + [&](const auto& a, const auto& b) { + return type_convert(a) * type_convert(b); + }, + x, + xscale); + + // compute absmax, cross-lane->cross-warp + auto absmax = block_reduce2d( + y, reduce_absmax_func.GetIdentityValue(), reduce_absmax_func); + block_reduce2d_sync(absmax, reduce_max_func); + block_reduce2d_cross_warp_sync(absmax, smem, reduce_max_func); + + // ex: yscale = absmax / 127 if int8 + auto yscale = tile_elementwise_in( + [&](const auto& v_) { + return v_ / type_convert(numeric::max()); + }, + absmax); + store_tile(yscale_window, cast_tile(yscale)); + + // quantize y to qy + auto qy = make_static_distributed_tensor(y.get_tile_distribution()); + sweep_tile(qy, [&](auto idx) { + constexpr auto i_idx = make_tuple(idx[number<0>{}]); + auto qy_ = y[idx] / yscale[i_idx]; + qy(idx) = saturates{}(qy_); + }); + store_tile(qy_window, qy); + } +}; +} // namespace ck_tile diff --git a/include/ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_problem.hpp b/include/ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_problem.hpp new file mode 100644 index 000000000..37e09b58c --- /dev/null +++ b/include/ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_problem.hpp @@ -0,0 +1,35 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck_tile/core/utility/type_traits.hpp" + +namespace ck_tile { + +// Y = X * XScale, QY = RowwiseDynamicQuant(Y) = SaturateCast(Y / YScale) +template +struct SmoothquantPipelineProblem +{ + using XDataType = remove_cvref_t; + using XScaleDataType = remove_cvref_t; + using ComputeDataType = remove_cvref_t; + using YScaleDataType = remove_cvref_t; + using QYDataType = remove_cvref_t; + using BlockShape = remove_cvref_t; + + static constexpr bool kNeedCrossLaneSync = BlockShape::ThreadPerWarp_N > 1; + static constexpr bool kNeedCrossWarpSync = BlockShape::WarpPerBlock_N > 1; + + static constexpr bool kPadN = kPadN_; + static constexpr bool kTwoPass = kTwoPass_; +}; + +} // namespace ck_tile diff --git a/include/ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_two_pass.hpp b/include/ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_two_pass.hpp new file mode 100644 index 000000000..7878ef1d3 --- /dev/null +++ b/include/ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_two_pass.hpp @@ -0,0 +1,132 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck_tile/core.hpp" +#include "ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_default_policy.hpp" +#include +#include + +namespace ck_tile { +// Two-pass smoothquant: pass 1 reduces the row absmax, pass 2 re-reads x right to left and quantizes. +template +struct SmoothquantPipelineTwoPass +{ + using Problem = ck_tile::remove_cvref_t; + using Policy = ck_tile::remove_cvref_t; + + using XDataType = ck_tile::remove_cvref_t; + using XScaleDataType = ck_tile::remove_cvref_t; + using ComputeDataType = ck_tile::remove_cvref_t; + using QYDataType = ck_tile::remove_cvref_t; + using YScaleDataType = ck_tile::remove_cvref_t; + + static constexpr bool kNeedCrossWarpSync = Problem::kNeedCrossWarpSync; + static constexpr bool kPadM = false; // TODO - BlockSmoothquantProblem::kPadM + static constexpr bool kPadN = Problem::kPadN; + + static constexpr const char* name = []() { + if constexpr(kNeedCrossWarpSync) + return "bpr_tp"; // block per row + else + return "wpr_tp"; // warp per row + }(); + + CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize() + { + return Policy::template GetSmemSize(); + } + + template + CK_TILE_DEVICE auto operator()(const XWindow& x_window_, + const XScaleWindow& xscale_window_, + YScaleWindow& yscale_window, + QYWindow& qy_window, + ck_tile::index_t row_size, + void* smem) const + { + auto x_window = + make_tile_window(x_window_, Policy::template MakeXBlockTileDistribution()); + auto xscale_window = make_tile_window( + xscale_window_, Policy::template MakeXScaleBlockTileDistribution()); + + static constexpr index_t Block_N = Problem::BlockShape::Block_N; + index_t num_n_tile_iteration = + __builtin_amdgcn_readfirstlane(integer_divide_ceil(row_size, Block_N)); + + auto reduce_absmax_func = ReduceOp::AbsMax{}; + auto reduce_max_func = ReduceOp::Max{}; + auto block_reduce2d = Policy::template GetBlockReduce2d(); + auto block_reduce2d_sync = Policy::template GetBlockReduce2dSync(); + auto block_reduce2d_cross_warp_sync = + Policy::template GetBlockReduce2dCrossWarpSync(); + + using XTensorType = 
decltype(cast_tile(load_tile(x_window))); + auto absmax = block_reduce2d.template MakeYBlockTile(); + set_tile(absmax, reduce_absmax_func.GetIdentityValue()); + + for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN) + { + const auto x = load_tile(x_window); + const auto xscale = load_tile(xscale_window); + const auto y = tile_elementwise_in( + [&](const auto& a, const auto& b) { + return type_convert(a) * type_convert(b); + }, + x, + xscale); + + block_reduce2d(y, absmax, reduce_absmax_func); + + move_tile_window(x_window, {0, Block_N}); + move_tile_window(xscale_window, {Block_N}); + } + + // compute absmax, cross-lane->cross-warp + block_reduce2d_sync(absmax, reduce_max_func); + block_reduce2d_cross_warp_sync(absmax, smem, reduce_max_func); + + // ex: yscale = absmax / 127 if int8 + auto yscale = tile_elementwise_in( + [&](const auto& v_) { + return v_ / type_convert(numeric::max()); + }, + absmax); + store_tile(yscale_window, cast_tile(yscale)); + + // reverse read x to reuse cache + ck_tile::index_t stride_to_right_most_window = + row_size % Block_N == 0 ? 
row_size - Block_N : row_size - row_size % Block_N; + + move_tile_window(x_window, {0, -Block_N}); + move_tile_window(xscale_window, {-Block_N}); + move_tile_window(qy_window, {0, stride_to_right_most_window}); + + // recompute y and quantize y to qy + for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN) + { + const auto x = load_tile(x_window); + const auto xscale = load_tile(xscale_window); + const auto y = tile_elementwise_in( + [&](const auto& a, const auto& b) { + return type_convert(a) * type_convert(b); + }, + x, + xscale); + + auto qy = make_static_distributed_tensor(y.get_tile_distribution()); + sweep_tile(qy, [&](auto idx) { + constexpr auto i_idx = make_tuple(idx[number<0>{}]); + auto qy_ = y[idx] / yscale[i_idx]; + qy(idx) = saturates{}(qy_); + }); + store_tile(qy_window, qy); + + move_tile_window(x_window, {0, -Block_N}); + move_tile_window(xscale_window, {-Block_N}); // xscale window is 1-D: single-coordinate step + move_tile_window(qy_window, {0, -Block_N}); + } + } +}; +} // namespace ck_tile diff --git a/include/ck_tile/remod.py b/include/ck_tile/remod.py index 0612d4238..b0d2c36ef 100644 --- a/include/ck_tile/remod.py +++ b/include/ck_tile/remod.py @@ -1,3 +1,4 @@ +from datetime import datetime import pathlib from pathlib import Path import subprocess @@ -8,8 +9,8 @@ NS = 'ck_tile' OPS = 'ops' OPS_COMMON = 'common' # common header will be duplicated into ops/* other module -HEADER_COMMON = """// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.\n +HEADER_COMMON = f"""// SPDX-License-Identifier: MIT +// Copyright (c) 2018-{datetime.now().year}, Advanced Micro Devices, Inc. All rights reserved.\n """ # aa/bb/cc/file.hpp -> (aa, bb, cc, file.hpp) -- GitLab From 03c6448ba3c854195c61c817036b66af1fa0e844 Mon Sep 17 00:00:00 2001 From: Illia Silin <98187287+illsilin@users.noreply.github.com> Date: Thu, 31 Oct 2024 22:52:23 -0700 Subject: [PATCH 032/153] Reduce build time. 
(#1621) * disable fp8 gemm_universal on gfx90a and gfx908 by default * fix cmake syntax * fix clang format * add ifdefs in amd_xdlops * disable fp8 gemm instances on gfx90a by default * update readme --- CMakeLists.txt | 12 ++++++-- README.md | 14 +++++---- .../gpu/CMakeLists.txt | 30 +++++++++++++++++-- ...tiply_multiply_xdl_f8_f8_bf16_mk_nk_mn.hpp | 10 ++++--- ...gemm_xdl_universal_f8_f8_bf16_mk_kn_mn.hpp | 5 ++-- ...gemm_xdl_universal_f8_f8_bf16_mk_nk_mn.hpp | 10 ++++--- profiler/src/profile_gemm_universal.cpp | 8 ++++- .../test_gemm_universal_xdl.cpp | 4 +-- 8 files changed, 69 insertions(+), 24 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6a5180363..74628597a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -137,7 +137,7 @@ if(GPU_TARGETS) else() set(USER_GPU_TARGETS 0) endif() -find_package(hip) +find_package(hip REQUIRED) # No assumption that HIP kernels are launched with uniform block size for backward compatibility # SWDEV-413293 and https://reviews.llvm.org/D155213 math(EXPR hip_VERSION_FLAT "(${hip_VERSION_MAJOR} * 1000 + ${hip_VERSION_MINOR}) * 100000 + ${hip_VERSION_PATCH}") @@ -170,7 +170,10 @@ else() set(CK_GPU_TARGETS ${GPU_TARGETS}) endif() endif() - +#if the user did not set GPU_TARGETS, delete whatever was set by HIP package +if(NOT USER_GPU_TARGETS) + set(GPU_TARGETS "") +endif() #make sure all the targets on the list are actually supported by the current compiler rocm_check_target_ids(SUPPORTED_GPU_TARGETS TARGETS ${CK_GPU_TARGETS}) @@ -187,6 +190,10 @@ if (SUPPORTED_GPU_TARGETS MATCHES "gfx11" OR SUPPORTED_GPU_TARGETS MATCHES "gfx1 add_definitions(-DCK_USE_WMMA) set(CK_USE_WMMA "ON") endif() +option(CK_USE_FP8_ON_UNSUPPORTED_ARCH "Enable FP8 GEMM instances on older architectures" OFF) +if(CK_USE_FP8_ON_UNSUPPORTED_ARCH AND (SUPPORTED_GPU_TARGETS MATCHES "gfx90a" OR SUPPORTED_GPU_TARGETS MATCHES "gfx908")) + add_definitions(-DCK_USE_FP8_ON_UNSUPPORTED_ARCH) +endif() # CK config file to record supported datatypes, etc. 
configure_file(include/ck/config.h.in ${CMAKE_CURRENT_BINARY_DIR}/include/ck/config.h) @@ -314,7 +321,6 @@ link_libraries(${OpenMP_gomp_LIBRARY}) link_libraries(${OpenMP_pthread_LIBRARY}) ## HIP -find_package(HIP REQUIRED) # Override HIP version in config.h, if necessary. # The variables set by find_package() can't be overwritten, # therefore let's use intermediate variables. diff --git a/README.md b/README.md index 053406515..302173dc1 100644 --- a/README.md +++ b/README.md @@ -137,12 +137,11 @@ Docker images are available on [DockerHub](https://hub.docker.com/r/rocm/composa You can find instructions for running ckProfiler in [profiler](/profiler). -Note the `-j` option for building with multiple threads in parallel. This speeds up the build significantly. +Note the `-j` option for building with multiple threads in parallel, which speeds up the build significantly. +However, `-j` launches an unlimited number of threads, which can cause the build to run out of memory and +crash. On average, you should expect each thread to use ~2 GB of RAM. Depending on the number of CPU cores and the amount of RAM on your system, you may want to -limit the number of threads. For example, if you have a 128-core CPU and 64 Gb of RAM. - -By default, `-j` launches one thread per CPU core, which can cause the build to run out of memory and -crash. In such cases, you can reduce the number of threads to 32 by using `-j32`. +limit the number of threads. For example, if you have a 128-core CPU and 128 GB of RAM, it's advisable to use `-j32`. Additional cmake flags can be used to significantly speed-up the build: @@ -154,6 +153,11 @@ Additional cmake flags can be used to significantly speed-up the build: `batched_gemm_multi_d_dl`. These instances are useful on architectures like the NAVI2x, as most other platforms have faster instances, such as `xdl` or `wmma`, available. 
+* `CK_USE_FP8_ON_UNSUPPORTED_ARCH` (default is OFF) must be set to ON in order to build instances, + such as `gemm_universal` and `gemm_multiply_multiply` for fp8 data type for GPU targets which do not + have native support for fp8 data type, such as gfx908 or gfx90a. These instances are useful on + architectures like the MI100/MI200 for the functional support only. + ## Using sccache for building The default CK Docker images come with a pre-installed version of sccache, which supports clang diff --git a/library/src/tensor_operation_instance/gpu/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/CMakeLists.txt index f82176ffc..6756c3351 100644 --- a/library/src/tensor_operation_instance/gpu/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/CMakeLists.txt @@ -67,6 +67,21 @@ function(add_instance_library INSTANCE_NAME) list(REMOVE_ITEM ARGN "${source}") endif() endforeach() + # Do not build gemm_universal_f8 or gemm_multiply_multiply_f8 for any targets except gfx94 + if(NOT CK_USE_FP8_ON_UNSUPPORTED_ARCH) + foreach(source IN LISTS ARGN) + if(NOT INST_TARGETS MATCHES "gfx94" AND source MATCHES "gemm_multiply_multiply_xdl_f8") + message("removing gemm_multiply_multiply_f8 instance ${source} ") + list(REMOVE_ITEM ARGN "${source}") + endif() + endforeach() + foreach(source IN LISTS ARGN) + if(NOT INST_TARGETS MATCHES "gfx94" AND source MATCHES "gemm_xdl_universal" AND source MATCHES "_f8_") + message("removing gemm_universal_f8 instance ${source} ") + list(REMOVE_ITEM ARGN "${source}") + endif() + endforeach() + endif() #only continue if there are some source files left on the list if(ARGN) set(INST_OBJ) @@ -74,11 +89,20 @@ function(add_instance_library INSTANCE_NAME) set(INST_TARGETS ${SUPPORTED_GPU_TARGETS}) if(source MATCHES "_xdl") list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201) - elseif(ARGN MATCHES "_wmma") + elseif(source MATCHES "_wmma") list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx908 
gfx90a gfx940 gfx941 gfx942 gfx1030) - elseif(ARGN MATCHES "mha") + elseif(source MATCHES "mha") list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx908 gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201) endif() + #only build the fp8 gemm instances for gfx908/90a if the build argument is set + if(NOT CK_USE_FP8_ON_UNSUPPORTED_ARCH) + if(source MATCHES "gemm_xdl_universal" AND source MATCHES "f8") + list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx908 gfx90a gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201) + endif() + if(source MATCHES "gemm_multiply_multiply_f8") + list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx908 gfx90a gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201) + endif() + endif() set(offload_targets) foreach(target IN LISTS INST_TARGETS) string(APPEND offload_targets "--offload-arch=${target} ") @@ -108,7 +132,7 @@ function(add_instance_library INSTANCE_NAME) # flags to compress the library if(NOT WIN32 AND ${hip_VERSION_FLAT} GREATER 600241132) - message("Adding --offload-compress flag for ${INSTANCE_NAME}") + #message("Adding --offload-compress flag for ${INSTANCE_NAME}") target_compile_options(${INSTANCE_NAME} PRIVATE --offload-compress) endif() diff --git a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_f8_f8_bf16/device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_f8_f8_bf16/device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn.hpp index 8a24af1b8..b1b64ca85 100644 --- a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_f8_f8_bf16/device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn.hpp +++ b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_f8_f8_bf16/device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn.hpp @@ -36,12 +36,12 @@ static constexpr auto Interwave = 
BlockGemmPipelineScheduler::Interwave; template using device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn_comp_instances = std::tuple< - // clang-format off +// clang-format off //################################| ALayout| BLayout| DsLayout| ELayout|AData| BData| DsData| EData| AccData| Cshuffle| A| B| C| GEMM| Block| MPer| NPer| KPer| AK1| BK1|MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| Block-wiseGemm| Block-wiseGemm| //################################| | | | | Type| Type| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise|Specialization| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| Pipeline| Pipeline| //################################| | | | | | | | | | | Operation| Operation| Operation| | | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| Scheduler| Verision| //################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - +#if defined(__gfx94__) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) // Compute friendly DeviceGemmMultiD_Xdl_CShuffle_V3< Row, Col, Tuple, Row, F8, F8, Tuple, BF16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 256, 256, 64, 16, 16, 32, 32, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 
32, 1, 8>, S<8, 8, 1>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4, F8>, DeviceGemmMultiD_Xdl_CShuffle_V3< Row, Col, Tuple, Row, F8, F8, Tuple, BF16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 128, 128, 128, 16, 16, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, S<8, 8, 1>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4, F8>, @@ -58,17 +58,18 @@ using device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn_comp_instances = std DeviceGemmMultiD_Xdl_CShuffle_V3< Row, Col, Tuple, Row, F8, F8, Tuple, BF16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 128, 64, 128, 16, 16, 32, 32, 2, 1, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, S<8, 8, 1>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>, DeviceGemmMultiD_Xdl_CShuffle_V3< Row, Col, Tuple, Row, F8, F8, Tuple, BF16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 64, 128, 128, 16, 16, 32, 32, 1, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, S<8, 8, 1>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>, DeviceGemmMultiD_Xdl_CShuffle_V3< Row, Col, Tuple, Row, F8, F8, Tuple, BF16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 64, 64, 128, 16, 16, 32, 32, 1, 1, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, S<8, 8, 1>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8> +#endif // clang-format on >; template using device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn_mem_instances = std::tuple< - // clang-format off +// clang-format off //################################| ALayout| BLayout| DsLayout| 
ELayout|AData| BData| DsData| EData| AccData| Cshuffle| A| B| C| GEMM| Block| MPer| NPer| KPer| AK1| BK1|MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| Block-wiseGemm| Block-wiseGemm| //################################| | | | | Type| Type| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise|Specialization| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| Pipeline| Pipeline| //################################| | | | | | | | | | | Operation| Operation| Operation| | | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| Scheduler| Verision| //################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - +#if defined(__gfx94__) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) // Latency friendly DeviceGemmMultiD_Xdl_CShuffle_V3< Row, Col, Tuple, Row, F8, F8, Tuple, BF16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 128, 32, 16, 128, 16, 16, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, S<2, 2, 1>, BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>, DeviceGemmMultiD_Xdl_CShuffle_V3< Row, Col, Tuple, Row, F8, F8, Tuple, BF16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 64, 16, 16, 128, 16, 16, 16, 
16, 1, 1, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 4>, S<4, 4, 1>, BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>, @@ -90,6 +91,7 @@ using device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn_mem_instances = std: DeviceGemmMultiD_Xdl_CShuffle_V3< Row, Col, Tuple, Row, F8, F8, Tuple, BF16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 128, 32, 128, 128, 16, 16, 32, 32, 1, 2, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, S<8, 8, 1>, BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>, DeviceGemmMultiD_Xdl_CShuffle_V3< Row, Col, Tuple, Row, F8, F8, Tuple, BF16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 16, 256, 128, 16, 16, 16, 16, 1, 4, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 16>, S<4, 4, 1>, BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>, DeviceGemmMultiD_Xdl_CShuffle_V3< Row, Col, Tuple, Row, F8, F8, Tuple, BF16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 32, 256, 128, 16, 16, 32, 32, 1, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 16>, S<8, 8, 1>, BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8> +#endif // clang-format on >; } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn.hpp index 3b930e989..658714d35 100644 --- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn.hpp +++ 
b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn.hpp @@ -62,12 +62,12 @@ using device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn_comp_instances = std::tuple< template using device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn_mem_instances = std::tuple< - // clang-format off +// clang-format off //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle| A| B| C| GEMM| Block| MPer| NPer| KPer| AK1| BK1|MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| Block-wiseGemm| Block-wiseGemm| //#########################| | | | Type| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise|Specialization| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| Pipeline| Pipeline| //#########################| | | | | | | | | Operation| Operation| Operation| | | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| Scheduler| Verision| //#########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - +#if defined(__gfx94__) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) // Latency friendly DeviceGemm_Xdl_CShuffleV3< Row, Row, Row, F8, F8, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 16, 128, 16, 4, 16, 16, 1, 1, S<8, 
16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<32, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>, DeviceGemm_Xdl_CShuffleV3< Row, Row, Row, F8, F8, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 128, 16, 4, 16, 16, 1, 1, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<32, 2, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>, @@ -90,6 +90,7 @@ using device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn_mem_instances = std::tuple< DeviceGemm_Xdl_CShuffleV3< Row, Row, Row, F8, F8, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 64, 128, 16, 4, 16, 16, 1, 2, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<32, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 16, 4, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>, DeviceGemm_Xdl_CShuffleV3< Row, Row, Row, F8, F8, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 128, 128, 16, 8, 16, 16, 1, 4, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<16, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 16, 8, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>, DeviceGemm_Xdl_CShuffleV3< Row, Row, Row, F8, F8, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 16, 256, 128, 8, 8, 16, 16, 1, 4, S<16, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 16, 8, 0, 1, 1, S<1, 16, 1, 16>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8> +#endif // clang-format on >; } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn.hpp index b621cad94..382ed5b5a 100644 --- 
a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn.hpp +++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn.hpp @@ -35,12 +35,12 @@ static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave; template using device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_comp_instances = std::tuple< - // clang-format off +// clang-format off //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle| A| B| C| GEMM| Block| MPer| NPer| KPer| AK1| BK1|MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| Block-wiseGemm| Block-wiseGemm| //#########################| | | | Type| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise|Specialization| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| Pipeline| Pipeline| //#########################| | | | | | | | | Operation| Operation| Operation| | | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| Scheduler| Verision| //#########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - +#if defined(__gfx94__) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) // Compute friendly 
DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F8, F8, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 64, 16, 16, 32, 32, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4, F8>, DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F8, F8, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 128, 16, 16, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4, F8>, @@ -57,17 +57,18 @@ using device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_comp_instances = std::tuple< DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F8, F8, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 64, 128, 16, 16, 32, 32, 2, 1, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>, // DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F8, F8, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 64, 128, 128, 16, 16, 32, 32, 1, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>, DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F8, F8, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 64, 64, 128, 16, 16, 32, 32, 1, 1, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8> +#endif // clang-format on >; template using device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_instances = 
std::tuple< - // clang-format off +// clang-format off //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle| A| B| C| GEMM| Block| MPer| NPer| KPer| AK1| BK1|MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| Block-wiseGemm| Block-wiseGemm| //#########################| | | | Type| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise|Specialization| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| Pipeline| Pipeline| //#########################| | | | | | | | | Operation| Operation| Operation| | | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| Scheduler| Verision| //#########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - +#if defined(__gfx94__) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) // Latency friendly DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F8, F8, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 16, 128, 16, 16, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>, DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F8, F8, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 128, 16, 16, 16, 16, 
1, 1, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>, @@ -97,6 +98,7 @@ using device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_instances = std::tuple< DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F8, F8, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 128, 128, 16, 16, 32, 32, 1, 2, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>, DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F8, F8, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 16, 256, 128, 16, 16, 16, 16, 1, 4, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 16>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>, DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F8, F8, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 32, 256, 128, 16, 16, 32, 32, 1, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 16>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8> +#endif // clang-format on >; } // namespace instance diff --git a/profiler/src/profile_gemm_universal.cpp b/profiler/src/profile_gemm_universal.cpp index f86dddc72..576bd009b 100644 --- a/profiler/src/profile_gemm_universal.cpp +++ b/profiler/src/profile_gemm_universal.cpp @@ -101,7 +101,9 @@ int profile_gemm_universal(int argc, char* argv[]) using F32 = float; using F16 = ck::half_t; using BF16 = ck::bhalf_t; - using F8 = ck::f8_t; +#if defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) + using F8 = ck::f8_t; +#endif using Row = ck::tensor_layout::gemm::RowMajor; using Col = ck::tensor_layout::gemm::ColumnMajor; @@ -162,6 +164,7 @@ int profile_gemm_universal(int argc, char* argv[]) { return profile(F16{}, 
F16{}, F16{}, F32{}, F16{}, Row{}, Col{}, Row{}); } +#if defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) else if(data_type == GemmDataType::F16_F8_F16 && layout == GemmMatrixLayout::MK_KN_MN) { return profile(F16{}, F8{}, F16{}, F32{}, F16{}, Row{}, Row{}, Row{}); @@ -178,6 +181,7 @@ int profile_gemm_universal(int argc, char* argv[]) { return profile(F8{}, F16{}, F16{}, F32{}, F16{}, Row{}, Col{}, Row{}); } +#endif else if(data_type == GemmDataType::BF16_BF16_BF16 && layout == GemmMatrixLayout::MK_KN_MN) { return profile(BF16{}, BF16{}, BF16{}, F32{}, BF16{}, Row{}, Row{}, Row{}); @@ -194,6 +198,7 @@ int profile_gemm_universal(int argc, char* argv[]) { return profile(BF16{}, BF16{}, BF16{}, F32{}, BF16{}, Col{}, Row{}, Row{}); } +#if defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) else if(data_type == GemmDataType::F8_F8_BF16 && layout == GemmMatrixLayout::MK_KN_MN) { return profile(F8{}, F8{}, F8{}, F32{}, BF16{}, Row{}, Row{}, Row{}); @@ -202,6 +207,7 @@ int profile_gemm_universal(int argc, char* argv[]) { return profile(F8{}, F8{}, F8{}, F32{}, BF16{}, Row{}, Col{}, Row{}); } +#endif else { std::cout << "this data_type & layout is not implemented" << std::endl; diff --git a/test/gemm_universal/test_gemm_universal_xdl.cpp b/test/gemm_universal/test_gemm_universal_xdl.cpp index 0d29c5fb7..23b5c74dd 100644 --- a/test/gemm_universal/test_gemm_universal_xdl.cpp +++ b/test/gemm_universal/test_gemm_universal_xdl.cpp @@ -56,7 +56,7 @@ class TestGemmUniversal_KM_NK using KernelTypes_MK_KN = ::testing::Types< // ADataType, BDataType, ComputeDataType, CDataType std::tuple< F16, F16, F16, F16>, -#if (defined CK_ENABLE_FP8) +#if defined(CK_ENABLE_FP8) && defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) std::tuple< F16, F8, F16, F16>, std::tuple< F8, F16, F16, F16>, std::tuple< F8, F8, F8, BF16>, @@ -66,7 +66,7 @@ using KernelTypes_MK_KN = ::testing::Types< using KernelTypes_MK_NK = ::testing::Types< // ADataType, BDataType, ComputeDataType, CDataType std::tuple< F16, F16, F16, F16>, -#if (defined 
CK_ENABLE_FP8) +#if defined(CK_ENABLE_FP8) && defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) std::tuple< F16, F8, F16, F16>, std::tuple< F8, F16, F16, F16>, std::tuple< F8, F8, F8, BF16>, -- GitLab From cb6c5d39dcc76f370d06d0c4467a3650c8713c2b Mon Sep 17 00:00:00 2001 From: carlushuang Date: Sat, 2 Nov 2024 05:30:16 +0000 Subject: [PATCH 033/153] [CK_TILE] layernorm have more accurate residual (#1623) * more accurate residual * modify comment * Fix literal case in README.md --------- Co-authored-by: Po Yen Chen --- example/ck_tile/02_layernorm2d/README.md | 4 +- example/ck_tile/02_layernorm2d/generate.py | 5 +- .../ops/epilogue/dynamic_quant_epilogue.hpp | 84 +++++++++++++++---- ...ayernorm2d_fwd_pipeline_default_policy.hpp | 10 +-- .../layernorm2d_fwd_pipeline_one_pass.hpp | 34 +++----- .../layernorm2d_fwd_pipeline_two_pass.hpp | 23 +++-- 6 files changed, 97 insertions(+), 63 deletions(-) diff --git a/example/ck_tile/02_layernorm2d/README.md b/example/ck_tile/02_layernorm2d/README.md index 14c6fc0d6..3573d70cd 100644 --- a/example/ck_tile/02_layernorm2d/README.md +++ b/example/ck_tile/02_layernorm2d/README.md @@ -69,7 +69,7 @@ args: ``` ## limitations -Note that `fquant=2`, `fadd=2`, `prec_sx/prec_sy` other than `fp32` are not by default generated. though our kernel template suppor this. (TBD: add some flag in generate.py) to generate those instance on demand. Beside, N>8192 case will by default using two-pass pipeline, and `-fquant=1/2` are not supported yet. +Note that `fquant=2`, `fadd=2`, `prec_sx/prec_sy` other than `fp32` are not by default generated, though our kernel template supports this (TBD: add some flag in generate.py to generate those instances on demand). Besides, the `N>8192` case will by default use the two-pass pipeline, and `-fquant=1/2` are not supported yet. If you need to support `N>8192` and `fused+residual+store`, you can use this example together with `12_smoothquant` to construct layernorm+residual and smoothquant as 2 kernels for this purpose. 
``` # some case @@ -82,4 +82,4 @@ Note that `fquant=2`, `fadd=2`, `prec_sx/prec_sy` other than `fp32` are not by d # standard fp16 layernorm 2d, m=10. n=1024, fused-smooth-quant+fused-add-store, output in int8 ./build/bin/tile_example_layernorm2d_fwd -m=10 -n=1024 -prec_o=int8 -fquant=1 -fadd=1 -``` \ No newline at end of file +``` diff --git a/example/ck_tile/02_layernorm2d/generate.py b/example/ck_tile/02_layernorm2d/generate.py index 300f6c05e..bf576db97 100644 --- a/example/ck_tile/02_layernorm2d/generate.py +++ b/example/ck_tile/02_layernorm2d/generate.py @@ -202,8 +202,9 @@ float layernorm2d_fwd_(const S& s, A a) using Default2DEpilogueProblem = ck_tile::Default2DEpilogueProblem; using Default2DEpilogue = ck_tile::Default2DEpilogue; - using DynamicQuantEpilogueProblem = ck_tile::DynamicQuantEpilogueProblem>; + static constexpr bool UseSmoothInputScale = Traits_::kFusedQuant == 1; + using DynamicQuantEpilogueProblem = ck_tile::DynamicQuantEpilogueProblem>; using DynamicQuantEpilogue = ck_tile::DynamicQuantEpilogue; diff --git a/include/ck_tile/ops/epilogue/dynamic_quant_epilogue.hpp b/include/ck_tile/ops/epilogue/dynamic_quant_epilogue.hpp index 2e2960411..3dec404b4 100644 --- a/include/ck_tile/ops/epilogue/dynamic_quant_epilogue.hpp +++ b/include/ck_tile/ops/epilogue/dynamic_quant_epilogue.hpp @@ -8,17 +8,23 @@ namespace ck_tile { -template +template struct DynamicQuantEpilogueTraits { - static constexpr bool kPadM = kPadM_; - static constexpr bool kPadN = kPadN_; - static constexpr bool UseRawStore = UseRawStore_; - static constexpr bool UseMax3 = UseMax3_; + static constexpr bool kPadM = kPadM_; + static constexpr bool kPadN = kPadN_; + static constexpr bool UseSmoothInputScale = UseSmoothInputScale_; + static constexpr bool UseRawStore = UseRawStore_; + static constexpr bool UseMax3 = UseMax3_; }; // this epilogue just store out a M*N matrix, row major template ; + using XScaleDataType = remove_cvref_t; using YScaleDataType = remove_cvref_t; using 
ODataType = remove_cvref_t; using BlockShape = remove_cvref_t; // can consum generic 2d shape using Traits = remove_cvref_t; }; +// TODO: we should put descriptor creation function into policy template struct DynamicQuantEpilogue { using Problem = remove_cvref_t; using AccDataType = remove_cvref_t; + using XScaleDataType = remove_cvref_t; using YScaleDataType = remove_cvref_t; using ODataType = remove_cvref_t; using BlockShape = remove_cvref_t; @@ -63,6 +72,33 @@ struct DynamicQuantEpilogue return BlockReduce2dCrossWarpSync{}; } + CK_TILE_DEVICE static constexpr auto MakeSmoothInputScaleTileDistribution() + { + using S = BlockShape; +#if 0 + // don't remove this + // Note that if we set encoding purposely like this, you will result in compile fail + // TODO: x_scale create local-scratch to accept arbitrary acc input (with same length) + return make_static_tile_distribution( + tile_distribution_encoding< + sequence, + tuple>, + tuple, sequence<0, 1>>, + tuple, sequence<2, 2>>, + sequence<0, 1, 1>, + sequence<0, 0, 3>>{}); +#else + return make_static_tile_distribution( + tile_distribution_encoding< + sequence, + tuple>, + tuple, sequence<0, 1>>, + tuple, sequence<1, 2>>, + sequence<1, 1>, + sequence<0, 3>>{}); +#endif + } + CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize() { auto reduce_crosswarp_sync = GetBlockReduce2dCrossWarpSync(); @@ -71,8 +107,12 @@ struct DynamicQuantEpilogue // TODO: this function assume store out vector size is the same as OAccTile last dimension size // how do we fix this ? 
- template + template CK_TILE_DEVICE auto operator()(ODramWindowTmp& o_dram_window_tmp, + const XScaleWindow& x_scale_window_, YScaleWindow& y_scale_window, const OAccTile& o_acc_tile, void* smem) @@ -80,6 +120,18 @@ struct DynamicQuantEpilogue auto reduce = GetBlockReduce2d(); auto reduce_sync = GetBlockReduce2dSync(); auto reduce_crosswarp_sync = GetBlockReduce2dCrossWarpSync(); + const auto x_scale_window = + make_tile_window(x_scale_window_, MakeSmoothInputScaleTileDistribution()); + + auto x_scale = load_tile(x_scale_window); + + auto o_acc_tmp = o_acc_tile; + + sweep_tile(o_acc_tmp, [&](auto idx) { + constexpr auto j_idx = make_tuple(idx[number<1>{}]); + const auto xs_ = type_convert(x_scale[j_idx]); + o_acc_tmp(idx) = o_acc_tmp(idx) * xs_; + }); const auto f_absmax = [](auto acc_, auto v_0_) { return max(acc_, abs(v_0_)); }; @@ -87,10 +139,9 @@ struct DynamicQuantEpilogue constexpr auto y_size_per_row = OAccTile{}.get_tile_distribution().get_ys_to_d_descriptor().get_lengths().at( number<1>{}); - // constexpr auto y_size_per_row = OAccTile::get_lengths()[number<1>{}]; if constexpr(UseMax3 && std::is_same_v && y_size_per_row % 2 == 0) { - // fast max3 implementation + // fast max3+abs implementation const auto f_max3 = [](auto acc_, auto v_0_, auto v_1_) { float rtn; asm volatile("v_max3_f32 %0, %1, abs(%2), abs(%3)" @@ -98,11 +149,11 @@ struct DynamicQuantEpilogue : "v"(acc_), "v"(v_0_), "v"(v_1_)); return rtn; }; - return reduce(o_acc_tile, type_convert(0), f_max3, sequence<1, 2>{}); + return reduce(o_acc_tmp, type_convert(0), f_max3, sequence<1, 2>{}); } else { - return reduce(o_acc_tile, type_convert(0), f_absmax); + return reduce(o_acc_tmp, type_convert(0), f_absmax); } }(); reduce_sync(row_absmax, f_absmax); @@ -117,23 +168,20 @@ struct DynamicQuantEpilogue store_tile(y_scale_window, cast_tile(y_scale)); - auto o_acc_scaled_tile = - make_static_distributed_tensor(o_acc_tile.get_tile_distribution()); - - sweep_tile(o_acc_tile, [&](auto idx) { - constexpr 
auto row_id = make_tuple(idx[number<0>{}]); - o_acc_scaled_tile(idx) = o_acc_tile[idx] / y_scale(row_id); + sweep_tile(o_acc_tmp, [&](auto idx) { + constexpr auto row_id = make_tuple(idx[number<0>{}]); + o_acc_tmp(idx) = o_acc_tmp[idx] / y_scale(row_id); }); // TODO: this is ugly if constexpr(UseRawStore && (kPadM || kPadN)) { - store_tile_raw(o_dram_window_tmp, cast_tile(o_acc_scaled_tile)); + store_tile_raw(o_dram_window_tmp, cast_tile(o_acc_tmp)); buffer_store_fence(); } else { - store_tile(o_dram_window_tmp, cast_tile(o_acc_scaled_tile)); + store_tile(o_dram_window_tmp, cast_tile(o_acc_tmp)); } } }; diff --git a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_default_policy.hpp b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_default_policy.hpp index 02fd5f7b9..1de230c14 100644 --- a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_default_policy.hpp +++ b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_default_policy.hpp @@ -45,7 +45,7 @@ struct Layernorm2dFwdPipelineDefaultPolicy template CK_TILE_HOST_DEVICE static constexpr auto GetBlockWelford() { - using P_ = BlockWelfordProblem; @@ -55,7 +55,7 @@ struct Layernorm2dFwdPipelineDefaultPolicy template CK_TILE_HOST_DEVICE static constexpr auto GetBlockWelfordSync() { - using P_ = BlockWelfordProblem; @@ -65,7 +65,7 @@ struct Layernorm2dFwdPipelineDefaultPolicy template CK_TILE_HOST_DEVICE static constexpr auto GetBlockWelfordCrossWarpSync() { - using P_ = BlockWelfordProblem; @@ -77,13 +77,13 @@ struct Layernorm2dFwdPipelineDefaultPolicy { if constexpr(Problem::kNeedCrossWarpSync) { - using P_ = BlockWelfordProblem; using block_welford = BlockWelford; using x_block_tile = - decltype(make_static_distributed_tensor( + decltype(make_static_distributed_tensor( MakeXBlockTileDistribution())); using mean_var_block_tile = decltype(block_welford::template MakeMeanVarBlockTile()); diff --git 
a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_one_pass.hpp b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_one_pass.hpp index 5601f3a68..83cdab428 100644 --- a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_one_pass.hpp +++ b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_one_pass.hpp @@ -87,12 +87,9 @@ struct Layernorm2dFwdPipelineOnePass x_residual_window_, Policy::template MakeXBlockTileDistribution()); auto y_residual_window = make_tile_window( y_residual_window_, Policy::template MakeXBlockTileDistribution()); - const auto x_scale_window = make_tile_window( - x_scale_window_, Policy::template MakeGammaBetaBlockTileDistribution()); - auto x = load_tile(x_window); - auto x_resi = load_tile(x_residual_window); - auto x_scale = load_tile(x_scale_window); + auto x = load_tile(x_window); + auto x_resi = load_tile(x_residual_window); int cur_count = 0; int max_count = @@ -106,21 +103,21 @@ struct Layernorm2dFwdPipelineOnePass const auto gamma = load_tile(gamma_window); const auto beta = load_tile(beta_window); + auto acc = cast_tile(x); + if constexpr(kFusedAdd == Layernorm2dFusedAddEnum::PRE_ADD_STORE || kFusedAdd == Layernorm2dFusedAddEnum::PRE_ADD) { sweep_tile(x_resi, [&](auto idx) { // compute x = x_resi + x - auto re_ = type_convert(x_resi(idx)) + - type_convert(x(idx)); - x(idx) = type_convert(re_); + acc(idx) = type_convert(x_resi(idx)) + acc(idx); }); if constexpr(kFusedAdd == Layernorm2dFusedAddEnum::PRE_ADD_STORE) - store_tile(y_residual_window, x); + store_tile(y_residual_window, cast_tile(acc)); } // compute welford each-thread->cross-lane->cross-warp - auto [mean, var] = block_welford(x, cur_count, max_count); + auto [mean, var] = block_welford(acc, cur_count, max_count); block_welford_sync(mean, var, cur_count); block_welford_cross_warp_sync(mean, var, cur_count, smem); block_tile_welford_post_scale_var(var, cur_count); @@ -138,7 +135,7 @@ struct 
Layernorm2dFwdPipelineOnePass store_tile(inv_std_window, cast_tile(inv_std)); // layernorm computation - auto ln = make_static_distributed_tensor(x.get_tile_distribution()); + auto ln = make_static_distributed_tensor(acc.get_tile_distribution()); sweep_tile(ln, [&, mean_ = mean](auto idx) { constexpr auto i_idx = make_tuple(idx[number<0>{}]); constexpr auto j_idx = make_tuple(idx[number<1>{}]); @@ -146,26 +143,15 @@ struct Layernorm2dFwdPipelineOnePass const auto gamma_ = type_convert(gamma[j_idx]); const auto beta_ = type_convert(beta[j_idx]); - const auto x_ = type_convert(x[idx]); - auto ln_ = (x_ - mean_[i_idx]) * inv_std[i_idx] * gamma_ + beta_; + auto ln_ = (acc[idx] - mean_[i_idx]) * inv_std[i_idx] * gamma_ + beta_; ln(idx) = ln_; }); - if constexpr(kFusedQuant == Layernorm2dFusedQuantEnum::SMOOTH_DYNAMIC_QUANT) - { - // smooth-quant pre-scale, then run rowwise-quant - sweep_tile(ln, [&](auto idx) { - constexpr auto j_idx = make_tuple(idx[number<1>{}]); - const auto xs_ = type_convert(x_scale[j_idx]); - ln(idx) = ln(idx) * xs_; - }); - } - if constexpr(kFusedQuant == Layernorm2dFusedQuantEnum::DYNAMIC_QUANT || kFusedQuant == Layernorm2dFusedQuantEnum::SMOOTH_DYNAMIC_QUANT) { - Epilogue{}(y_window_, y_scale_window, ln, smem); + Epilogue{}(y_window_, x_scale_window_, y_scale_window, ln, smem); } else Epilogue{}(y_window_, ln); diff --git a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_two_pass.hpp b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_two_pass.hpp index 48f66739d..fadf56dfd 100644 --- a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_two_pass.hpp +++ b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_two_pass.hpp @@ -106,7 +106,7 @@ struct Layernorm2dFwdPipelineTwoPass auto block_welford_cross_warp_sync = Policy::template GetBlockWelfordCrossWarpSync(); - using XTensorType = decltype(load_tile(x_window)); + using XTensorType = decltype(cast_tile(load_tile(x_window))); auto 
mean = block_welford.template MakeMeanVarBlockTile(); auto var = block_welford.template MakeMeanVarBlockTile(); @@ -117,22 +117,22 @@ struct Layernorm2dFwdPipelineTwoPass move_tile_window(x_window, {0, Block_N}); move_tile_window(x_residual_window, {0, Block_N}); + auto acc = cast_tile(x); + if constexpr(kFusedAdd == Layernorm2dFusedAddEnum::PRE_ADD_STORE || kFusedAdd == Layernorm2dFusedAddEnum::PRE_ADD) { sweep_tile(x_resi, [&](auto idx) { // compute x = x_resi + x - auto re_ = type_convert(x_resi(idx)) + - type_convert(x(idx)); - x(idx) = type_convert(re_); + acc(idx) = type_convert(x_resi(idx)) + acc(idx); }); if constexpr(kFusedAdd == Layernorm2dFusedAddEnum::PRE_ADD_STORE) { - store_tile(y_residual_window, x); + store_tile(y_residual_window, cast_tile(acc)); move_tile_window(y_residual_window, {0, Block_N}); } } - block_welford(x, mean, var, cur_count, max_count); + block_welford(acc, mean, var, cur_count, max_count); } block_welford_sync(mean, var, cur_count); @@ -166,21 +166,21 @@ struct Layernorm2dFwdPipelineTwoPass { auto x = load_tile(x_window); auto x_resi = load_tile(x_residual_window); + auto acc = cast_tile(x); + if constexpr(kFusedAdd == Layernorm2dFusedAddEnum::PRE_ADD_STORE || kFusedAdd == Layernorm2dFusedAddEnum::PRE_ADD) { sweep_tile(x_resi, [&](auto idx) { // compute x = x_resi + x - auto re_ = type_convert(x_resi(idx)) + - type_convert(x(idx)); - x(idx) = type_convert(re_); + acc(idx) = type_convert(x_resi(idx)) + acc(idx); }); } // load gamma/beta (TODO: support no gamma/beta?) 
const auto gamma = load_tile(gamma_window); const auto beta = load_tile(beta_window); - auto ln = make_static_distributed_tensor(x.get_tile_distribution()); + auto ln = make_static_distributed_tensor(acc.get_tile_distribution()); sweep_tile(ln, [&, mean_ = mean](auto idx) { constexpr auto i_idx = make_tuple(idx[number<0>{}]); @@ -189,8 +189,7 @@ struct Layernorm2dFwdPipelineTwoPass const auto gamma_ = type_convert(gamma[j_idx]); const auto beta_ = type_convert(beta[j_idx]); - const auto x_ = type_convert(x[idx]); - auto ln_ = (x_ - mean_[i_idx]) * inv_std[i_idx] * gamma_ + beta_; + auto ln_ = (acc(idx) - mean_[i_idx]) * inv_std[i_idx] * gamma_ + beta_; ln(idx) = ln_; }); -- GitLab From 4f1fdbb6e3cae103eab134bb9c1b3001ee48f17f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bart=C5=82omiej=20Kocot?= Date: Mon, 4 Nov 2024 22:34:17 +0100 Subject: [PATCH 034/153] Temporary disable part of dynamic op conv instances (#1630) * Temporary disable part of dynamic op conv instances * fix --- ...ouped_conv_fwd_xdl_dynamic_op_instance.hpp | 20 +++++++++++-------- ...mic_op_nhwgc_gkyxc_nhwgk_bf16_instance.cpp | 3 +++ ...amic_op_nhwgc_gkyxc_nhwgk_f16_instance.cpp | 3 +++ ...amic_op_nhwgc_gkyxc_nhwgk_f32_instance.cpp | 3 +++ ...mic_op_nhwgc_gkyxc_nhwgk_int8_instance.cpp | 3 +++ ..._op_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp | 3 +++ ...c_op_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp | 3 +++ ...c_op_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp | 3 +++ ..._op_ndhwgc_gkzyxc_ndhwgk_int8_instance.cpp | 3 +++ 9 files changed, 36 insertions(+), 8 deletions(-) diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_dynamic_op_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_dynamic_op_instance.hpp index 9db675a51..82c01a634 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_dynamic_op_instance.hpp +++ 
b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_dynamic_op_instance.hpp @@ -53,8 +53,8 @@ using device_grouped_conv_fwd_xdl_dynamic_op_bf16_instances = std::tuple< DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, BF16, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 64, 64, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>, // instances for small conv.K and conv.C DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, BF16, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, BF16, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, - + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, BF16, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8> + #if 0 // Enable with dynamic op optimizations (at now generating a lot of virtual functions cause long compilation time) DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, BF16, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, BF16, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, 
S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, BF16, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, @@ -68,6 +68,7 @@ using device_grouped_conv_fwd_xdl_dynamic_op_bf16_instances = std::tuple< DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, BF16, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, BF16, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, BF16, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8> + #endif // clang-format on >; @@ -87,8 +88,8 @@ using device_grouped_conv_fwd_xdl_dynamic_op_f16_instances = std::tuple< DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, F16, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 64, 64, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>, // instances for small conv.K and conv.C DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, F16, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 
2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, F16, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, - + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, F16, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8> + #if 0 // Enable with dynamic op optimizations (at now generating a lot of virtual functions cause long compilation time) DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, F16, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, F16, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, F16, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, @@ -102,6 +103,7 @@ using device_grouped_conv_fwd_xdl_dynamic_op_f16_instances = std::tuple< DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, F16, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, 
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, F16, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, F16, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8> + #endif // clang-format on >; @@ -121,8 +123,8 @@ using device_grouped_conv_fwd_xdl_dynamic_op_f32_instances = std::tuple< DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, F32, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 64, 64, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, 1, 1, S<1, 8, 1, 8>, 1>, // instances for small conv.K and conv.C DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, F32, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 64, 64, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 1>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, F32, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, F32, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, 1, 1, S<1, 16, 1, 16>, 4> + #if 0 // Enable with dynamic op optimizations (at now generating a lot of virtual functions cause long compilation time) 
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, F32, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 256, 256, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, F32, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 256, 128, 256, 16, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, F32, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 128, 128, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, @@ -136,6 +138,7 @@ using device_grouped_conv_fwd_xdl_dynamic_op_f32_instances = std::tuple< DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, F32, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 128, 32, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, F32, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 64, 64, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4>, DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, F32, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 64, 32, 64, 16, 4, 4, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4> + #endif // clang-format on >; @@ -155,8 +158,8 @@ using device_grouped_conv_fwd_xdl_dynamic_op_int8_instances = std::tuple< DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, int8_t, PassThrough, PassThrough, 
DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 64, 64, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>, // instances for small conv.K and conv.C DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, int8_t, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, int8_t, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, - + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, int8_t, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8> + #if 0 // Enable with dynamic op optimizations (at now generating a lot of virtual functions cause long compilation time) DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, int8_t, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, int8_t, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, int8_t, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, 
S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, @@ -170,6 +173,7 @@ using device_grouped_conv_fwd_xdl_dynamic_op_int8_instances = std::tuple< DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, int8_t, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, int8_t, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, int8_t, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8> + #endif // clang-format on >; diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_dynamic_op/xdl/device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_dynamic_op/xdl/device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_bf16_instance.cpp index 853470e1c..4ee20a0ca 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_dynamic_op/xdl/device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_bf16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_dynamic_op/xdl/device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_bf16_instance.cpp @@ -31,6 +31,8 @@ void add_device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_bf16_instanc Tuple<>, NHWGK, ConvFwdDefault>{}); +#if 0 // Enable with dynamic op optimizations (at now generating a lot of virtual functions cause + // long 
compilation time) add_device_operation_instances( instances, device_grouped_conv_fwd_xdl_dynamic_op_bf16_instances<2, @@ -47,6 +49,7 @@ void add_device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_bf16_instanc Tuple<>, NHWGK, ConvFwd1x1S1P0>{}); +#endif } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_dynamic_op/xdl/device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_dynamic_op/xdl/device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_f16_instance.cpp index 725b9ca0d..18a616ef1 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_dynamic_op/xdl/device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_f16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_dynamic_op/xdl/device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_f16_instance.cpp @@ -31,6 +31,8 @@ void add_device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_f16_instance Tuple<>, NHWGK, ConvFwdDefault>{}); +#if 0 // Enable with dynamic op optimizations (at now generating a lot of virtual functions cause + // long compilation time) add_device_operation_instances( instances, device_grouped_conv_fwd_xdl_dynamic_op_f16_instances<2, @@ -47,6 +49,7 @@ void add_device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_f16_instance Tuple<>, NHWGK, ConvFwd1x1S1P0>{}); +#endif } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_dynamic_op/xdl/device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_dynamic_op/xdl/device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_f32_instance.cpp index fbd5fe370..850458f53 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_dynamic_op/xdl/device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_f32_instance.cpp 
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_dynamic_op/xdl/device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_f32_instance.cpp @@ -31,6 +31,8 @@ void add_device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_f32_instance Tuple<>, NHWGK, ConvFwdDefault>{}); +#if 0 // Enable with dynamic op optimizations (at now generating a lot of virtual functions cause + // long compilation time) add_device_operation_instances( instances, device_grouped_conv_fwd_xdl_dynamic_op_f32_instances<2, @@ -47,6 +49,7 @@ void add_device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_f32_instance Tuple<>, NHWGK, ConvFwd1x1S1P0>{}); +#endif } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_dynamic_op/xdl/device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_dynamic_op/xdl/device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_int8_instance.cpp index 6bfc29537..f69bcf1a7 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_dynamic_op/xdl/device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_int8_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_dynamic_op/xdl/device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_int8_instance.cpp @@ -30,6 +30,8 @@ void add_device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_int8_instanc Tuple<>, NHWGK, ConvFwdDefault>{}); +#if 0 // Enable with dynamic op optimizations (at now generating a lot of virtual functions cause + // long compilation time) add_device_operation_instances( instances, device_grouped_conv_fwd_xdl_dynamic_op_int8_instances<2, @@ -46,6 +48,7 @@ void add_device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_int8_instanc Tuple<>, NHWGK, ConvFwd1x1S1P0>{}); +#endif } } // namespace instance diff --git 
a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_dynamic_op/xdl/device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_dynamic_op/xdl/device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp index 249dfaa4d..00c0ba3ea 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_dynamic_op/xdl/device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_dynamic_op/xdl/device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp @@ -31,6 +31,8 @@ void add_device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_bf16_inst Tuple<>, NDHWGK, ConvFwdDefault>{}); +#if 0 // Enable with dynamic op optimizations (at now generating a lot of virtual functions cause + // long compilation time) add_device_operation_instances( instances, device_grouped_conv_fwd_xdl_dynamic_op_bf16_instances<3, @@ -47,6 +49,7 @@ void add_device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_bf16_inst Tuple<>, NDHWGK, ConvFwd1x1S1P0>{}); +#endif } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_dynamic_op/xdl/device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_dynamic_op/xdl/device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp index 75c4ddc35..aa47bbdbe 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_dynamic_op/xdl/device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_dynamic_op/xdl/device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp @@ -31,6 +31,8 @@ void add_device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_f16_insta Tuple<>, 
NDHWGK, ConvFwdDefault>{}); +#if 0 // Enable with dynamic op optimizations (at now generating a lot of virtual functions cause + // long compilation time) add_device_operation_instances( instances, device_grouped_conv_fwd_xdl_dynamic_op_f16_instances<3, @@ -47,6 +49,7 @@ void add_device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_f16_insta Tuple<>, NDHWGK, ConvFwd1x1S1P0>{}); +#endif } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_dynamic_op/xdl/device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_dynamic_op/xdl/device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp index 2e237e07b..8df05d9da 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_dynamic_op/xdl/device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_dynamic_op/xdl/device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp @@ -31,6 +31,8 @@ void add_device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_f32_insta Tuple<>, NDHWGK, ConvFwdDefault>{}); +#if 0 // Enable with dynamic op optimizations (at now generating a lot of virtual functions cause + // long compilation time) add_device_operation_instances( instances, device_grouped_conv_fwd_xdl_dynamic_op_f32_instances<3, @@ -47,6 +49,7 @@ void add_device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_f32_insta Tuple<>, NDHWGK, ConvFwd1x1S1P0>{}); +#endif } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_dynamic_op/xdl/device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_dynamic_op/xdl/device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_int8_instance.cpp index e38f1acbd..c50b64917 100644 
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_dynamic_op/xdl/device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_int8_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_dynamic_op/xdl/device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_int8_instance.cpp @@ -30,6 +30,8 @@ void add_device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_int8_inst Tuple<>, NDHWGK, ConvFwdDefault>{}); +#if 0 // Enable with dynamic op optimizations (at now generating a lot of virtual functions cause + // long compilation time) add_device_operation_instances( instances, device_grouped_conv_fwd_xdl_dynamic_op_int8_instances<3, @@ -46,6 +48,7 @@ void add_device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_int8_inst Tuple<>, NDHWGK, ConvFwd1x1S1P0>{}); +#endif } } // namespace instance -- GitLab From 0c9012fb70bcd2750ff0d5b8c23e4bc6f5937709 Mon Sep 17 00:00:00 2001 From: Lin Sun Date: Mon, 4 Nov 2024 16:33:20 -0800 Subject: [PATCH 035/153] Linsun/convint8 fwd instances (#1626) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add instances for int8 grouped conv2d fwd --------- Co-authored-by: root Co-authored-by: Bartłomiej Kocot --- ...ice_grouped_conv_fwd_xdl_comp_instance.hpp | 25 +++++++ ...ped_conv_fwd_xdl_large_tensor_instance.hpp | 19 ++++++ ...vice_grouped_conv_fwd_xdl_mem_instance.hpp | 37 +++++++++++ ...ed_conv_fwd_xdl_merged_groups_instance.hpp | 19 ++++++ .../gpu/grouped_convolution_forward.hpp | 45 ++++++++++++- .../grouped_convolution_forward_comp_xdl.inc | 32 +++++++++ ...uped_convolution_forward_mem_inter_xdl.inc | 32 +++++++++ ...uped_convolution_forward_mem_intra_xdl.inc | 32 +++++++++ .../gpu/grouped_convolution_forward_xdl.inc | 32 +++++++++ ...d_convolution_forward_xdl_large_tensor.inc | 16 +++++ ..._convolution_forward_xdl_merged_groups.inc | 30 +++++++++ .../gpu/grouped_conv2d_fwd/CMakeLists.txt | 11 ++++ 
...l_ngchw_gkyxc_ngkhw_int8_comp_instance.cpp | 39 +++++++++++ ...l_nhwgc_gkyxc_nhwgk_int8_comp_instance.cpp | 64 ++++++++++++++++++ ...wd_xdl_ngchw_gkyxc_ngkhw_int8_instance.cpp | 38 +++++++++++ ...wd_xdl_nhwgc_gkyxc_nhwgk_int8_instance.cpp | 62 +++++++++++++++++ ...tensor_nhwgc_gkyxc_nhwgk_int8_instance.cpp | 39 +++++++++++ ...hw_gkyxc_ngkhw_int8_mem_inter_instance.cpp | 39 +++++++++++ ...hw_gkyxc_ngkhw_int8_mem_intra_instance.cpp | 39 +++++++++++ ...gc_gkyxc_nhwgk_int8_mem_inter_instance.cpp | 66 +++++++++++++++++++ ...gc_gkyxc_nhwgk_int8_mem_intra_instance.cpp | 66 +++++++++++++++++++ ...groups_ngchw_gkyxc_ngkhw_int8_instance.cpp | 48 ++++++++++++++ ...groups_nhwgc_gkyxc_nhwgk_int8_instance.cpp | 48 ++++++++++++++ .../test_grouped_convnd_fwd.cpp | 4 +- ...est_grouped_convnd_fwd_large_cases_xdl.cpp | 3 +- 25 files changed, 880 insertions(+), 5 deletions(-) create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_comp_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_comp_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/large_tensor/device_grouped_conv2d_fwd_xdl_large_tensor_nhwgc_gkyxc_nhwgk_int8_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_mem_inter_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_mem_intra_instance.cpp 
create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_inter_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_intra_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkyxc_ngkhw_int8_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_int8_instance.cpp diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp index 7490ef223..dc4ee534b 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp @@ -131,6 +131,31 @@ using device_grouped_conv_fwd_xdl_f32_comp_instances = std::tuple< // clang-format on >; +template +using device_grouped_conv_fwd_xdl_int8_comp_instances = std::tuple< + // clang-format off + //########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //########################################| Spatial| Layout| Layout| 
Layout| Layout| Type| Type| Type| DataType| DataType| Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 2, S<1, 32, 1, 8>, 8, 
BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, + // AGPR Spill when use permuted lds layout. so, use padding for these two. + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1> + // clang-format on + >; + } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_large_tensor_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_large_tensor_instance.hpp index 05cb8d5d0..d317d270c 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_large_tensor_instance.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_large_tensor_instance.hpp @@ -87,6 +87,25 @@ using device_grouped_conv_fwd_xdl_large_tensor_f32_instances = std::tuple< DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 
1, 16>, 4> // clang-format on >; + +template +using device_grouped_conv_fwd_xdl_large_tensor_int8_instances = std::tuple< + // clang-format off + //########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //########################################| Spatial| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| DataType| Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + // generic instance + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>, + + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8> + // clang-format on + >; } 
// namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp index 2388c4db0..1f381af08 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp @@ -154,6 +154,43 @@ using device_grouped_conv_fwd_xdl_f32_mem_instances = std::tuple< // clang-format on >; +template +using device_grouped_conv_fwd_xdl_int8_mem_instances = std::tuple< + // clang-format off + //########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //########################################| Spatial| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| DataType| Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | 
PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + // Memory friendly + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + 
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, 
S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 16>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 16>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v2> + // clang-format on + >; + } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp index 96baf6bb0..242ad2f73 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp @@ -90,6 +90,25 @@ using device_grouped_conv_fwd_xdl_merged_groups_f32_instances = std::tuple< // clang-format on >; +template +using device_grouped_conv_fwd_xdl_merged_groups_int8_instances = std::tuple< + // clang-format off + //########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //########################################| Spatial| Layout| Layout| Layout| 
Layout| Type| Type| Type| DataType| DataType| Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + // Instances with NumGroupsPerBatch > 1 + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S< 4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 1, int8_t, int8_t, LoopScheduler::Default, 8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S< 4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 1, int8_t, int8_t, LoopScheduler::Default, 16>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S< 4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 1, int8_t, int8_t, LoopScheduler::Default, 32> + // clang-format on + >; + } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp index d884678de..8090b2449 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp +++ 
b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp @@ -122,6 +122,8 @@ struct DeviceOperationInstanceFactory && is_same_v && is_same_v) { @@ -160,7 +162,8 @@ struct DeviceOperationInstanceFactory && is_same_v && is_same_v) { @@ -191,7 +194,7 @@ struct DeviceOperationInstanceFactory && is_same_v && is_same_v) { @@ -247,8 +250,27 @@ struct DeviceOperationInstanceFactory && is_same_v && + is_same_v && is_same_v && + is_same_v) + { + add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_instances(op_ptrs); + add_device_grouped_conv2d_fwd_xdl_large_tensor_nhwgc_gkyxc_nhwgk_int8_instances( + op_ptrs); + add_device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_int8_instances( + op_ptrs); + add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_comp_instances(op_ptrs); + add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_intra_instances( + op_ptrs); + add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_inter_instances( + op_ptrs); + } #endif } + + // layout NGCHW/GKYXC/NGKHW if constexpr(NumDimSpatial == 2 && is_same_v && is_same_v && is_same_v) { @@ -282,8 +304,26 @@ struct DeviceOperationInstanceFactory && is_same_v && + is_same_v && is_same_v && + is_same_v) + { + add_device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkyxc_ngkhw_int8_instances( + op_ptrs); + add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_instances(op_ptrs); + add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_comp_instances(op_ptrs); + add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_mem_intra_instances( + op_ptrs); + add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_mem_inter_instances( + op_ptrs); + } +#endif } + // 3D + // layout GNDHWC/GKZYXC/GNDHWK if constexpr(NumDimSpatial == 3 && is_same_v && is_same_v && is_same_v) { @@ -323,6 +363,7 @@ struct DeviceOperationInstanceFactory && is_same_v && is_same_v) { diff --git 
a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_comp_xdl.inc b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_comp_xdl.inc index 9e4a0bbb6..e47a876e1 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_comp_xdl.inc +++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_comp_xdl.inc @@ -57,6 +57,22 @@ void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_comp_instances( PassThrough>>>& instances); #endif +#ifdef CK_ENABLE_INT8 +void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_comp_instances( + std::vector>>& instances); +#endif + // grouped conv2d forward, NGCHW/GKYXC/NGKHW #ifdef CK_ENABLE_FP16 void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f16_comp_instances( @@ -90,6 +106,22 @@ void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f32_comp_instances( PassThrough>>>& instances); #endif +#ifdef CK_ENABLE_INT8 +void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_comp_instances( + std::vector>>& instances); +#endif + #ifdef CK_ENABLE_BF16 // grouped conv3d forward, NDHWGC/GKZYXC/NDHWGK void add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_instances( diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_mem_inter_xdl.inc b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_mem_inter_xdl.inc index d9470fb3f..f74622ad4 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_mem_inter_xdl.inc +++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_mem_inter_xdl.inc @@ -57,6 +57,22 @@ void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_mem_inter_instances PassThrough>>>& instances); #endif +#ifdef CK_ENABLE_INT8 +void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_inter_instances( + 
std::vector>>& instances); +#endif + // grouped conv2d forward, NGCHW/GKYXC/NGKHW #ifdef CK_ENABLE_FP16 void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f16_mem_inter_instances( @@ -90,6 +106,22 @@ void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f32_mem_inter_instances PassThrough>>>& instances); #endif +#ifdef CK_ENABLE_INT8 +void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_mem_inter_instances( + std::vector>>& instances); +#endif + #ifdef CK_ENABLE_BF16 // grouped conv3d forward, NDHWGC/GKZYXC/NDHWGK void add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_mem_inter_instances( diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_mem_intra_xdl.inc b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_mem_intra_xdl.inc index 0b1914255..81737b614 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_mem_intra_xdl.inc +++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_mem_intra_xdl.inc @@ -57,6 +57,22 @@ void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_mem_intra_instances PassThrough>>>& instances); #endif +#ifdef CK_ENABLE_INT8 +void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_intra_instances( + std::vector>>& instances); +#endif + // grouped conv2d forward, NGCHW/GKYXC/NGKHW #ifdef CK_ENABLE_FP16 void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f16_mem_intra_instances( @@ -90,6 +106,22 @@ void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f32_mem_intra_instances PassThrough>>>& instances); #endif +#ifdef CK_ENABLE_INT8 +void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_mem_intra_instances( + std::vector>>& instances); +#endif + #ifdef CK_ENABLE_BF16 // grouped conv3d forward, NDHWGC/GKZYXC/NDHWGK void add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_mem_intra_instances( diff --git 
a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_xdl.inc b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_xdl.inc index b1c13696c..4cb2aae09 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_xdl.inc +++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_xdl.inc @@ -171,6 +171,22 @@ void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_instances( PassThrough>>>& instances); #endif +#ifdef CK_ENABLE_INT8 +void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_instances( + std::vector>>& instances); +#endif + // grouped conv2d forward, NGCHW/GKYXC/NGKHW #ifdef CK_ENABLE_FP16 void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f16_instances( @@ -204,6 +220,22 @@ void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f32_instances( PassThrough>>>& instances); #endif +#ifdef CK_ENABLE_INT8 +void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_instances( + std::vector>>& instances); +#endif + #ifdef CK_ENABLE_BF16 // grouped conv3d forward, GNDHWC/GKZYXC/GNDHWK void add_device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_bf16_instances( diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_xdl_large_tensor.inc b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_xdl_large_tensor.inc index 6a2c61d05..5f35ab5a4 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_xdl_large_tensor.inc +++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_xdl_large_tensor.inc @@ -57,6 +57,22 @@ void add_device_grouped_conv2d_fwd_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f32_instan PassThrough>>>& instances); #endif +#ifdef CK_ENABLE_INT8 +void add_device_grouped_conv2d_fwd_xdl_large_tensor_nhwgc_gkyxc_nhwgk_int8_instances( + std::vector>>& instances); 
+#endif + #ifdef CK_ENABLE_BF16 // grouped conv3d forward, NDHWGC/GKZYXC/NDHWGK void add_device_grouped_conv3d_fwd_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_bf16_instances( diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_xdl_merged_groups.inc b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_xdl_merged_groups.inc index 474a61e56..1bd2697b9 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_xdl_merged_groups.inc +++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_xdl_merged_groups.inc @@ -85,6 +85,36 @@ void add_device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkyxc_ngkhw_f32_insta PassThrough>>>& instances); #endif +#ifdef CK_ENABLE_INT8 +void add_device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_int8_instances( + std::vector>>& instances); + +void add_device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkyxc_ngkhw_int8_instances( + std::vector>>& instances); +#endif + #ifdef CK_ENABLE_BF16 // grouped conv3d forward, NDHWGC/GKZYXC/NDHWGK void add_device_grouped_conv3d_fwd_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_bf16_instances( diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/CMakeLists.txt index 083d92d09..98bee66a9 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/CMakeLists.txt @@ -9,45 +9,56 @@ add_instance_library(device_grouped_conv2d_fwd_instance xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp + xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_instance.cpp # NGCHW, GKYXC, NGKHW 
xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f16_instance.cpp xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f32_instance.cpp + xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_instance.cpp # large tensor # NHWGC, GKYXC, NHWGK xdl/large_tensor/device_grouped_conv2d_fwd_xdl_large_tensor_nhwgc_gkyxc_nhwgk_bf16_instance.cpp xdl/large_tensor/device_grouped_conv2d_fwd_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f16_instance.cpp xdl/large_tensor/device_grouped_conv2d_fwd_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f32_instance.cpp + xdl/large_tensor/device_grouped_conv2d_fwd_xdl_large_tensor_nhwgc_gkyxc_nhwgk_int8_instance.cpp # merged groups # NHWGC, GKYXC, NHWGK xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_bf16_instance.cpp xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f16_instance.cpp xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f32_instance.cpp + xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_int8_instance.cpp # NGCHW, GKYXC, NGKHW xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkyxc_ngkhw_f16_instance.cpp xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkyxc_ngkhw_f32_instance.cpp + xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkyxc_ngkhw_int8_instance.cpp #mem # NHWGC, GKYXC, NHWGK xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_intra_instance.cpp xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_mem_intra_instance.cpp xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_mem_intra_instance.cpp + xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_intra_instance.cpp # NHWGC, GKYXC, NHWGK xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_inter_instance.cpp xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_mem_inter_instance.cpp 
xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_mem_inter_instance.cpp + xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_inter_instance.cpp # NGCHW, GKYXC, NGKHW xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f16_mem_intra_instance.cpp xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f32_mem_intra_instance.cpp + xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_mem_intra_instance.cpp # NGCHW, GKYXC, NGKHW xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f16_mem_inter_instance.cpp xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f32_mem_inter_instance.cpp + xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_mem_inter_instance.cpp #comp # NHWGC, GKYXC, NHWGK xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_instance.cpp xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_comp_instance.cpp xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_comp_instance.cpp + xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_comp_instance.cpp # NGCHW, GKYXC, NGKHW xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f16_comp_instance.cpp xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f32_comp_instance.cpp + xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_comp_instance.cpp #dl # GNHWC, GKYXC, GNHWK dl/device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f16_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_comp_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_comp_instance.cpp new file mode 100644 index 000000000..d98b89c55 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_comp_instance.cpp @@ -0,0 +1,39 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, 
Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { +// Compilation parameters for in[n, g, c, hi, wi] * wei[g, k, y, x, c] = out[n, g, k, ho, wo] +void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_comp_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_int8_comp_instances<2, + NGCHW, + GKYXC, + Empty_Tuple, + NGKHW, + ConvFwdDefault>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_comp_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_comp_instance.cpp new file mode 100644 index 000000000..78c2257b9 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_comp_instance.cpp @@ -0,0 +1,64 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_comp_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_int8_comp_instances<2, + NHWGC, + GKYXC, + Empty_Tuple, + NHWGK, + ConvFwdDefault>{}); + + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_int8_comp_instances<2, + NHWGC, + GKYXC, + Empty_Tuple, + NHWGK, + ConvFwd1x1P0>{}); + + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_int8_comp_instances<2, + NHWGC, + GKYXC, + Empty_Tuple, + NHWGK, + ConvFwd1x1S1P0>{}); + + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_int8_comp_instances<2, + NHWGC, + GKYXC, + Empty_Tuple, + NHWGK, + ConvFwdOddC>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_instance.cpp new file mode 100644 index 000000000..65c75fa04 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_instance.cpp @@ -0,0 +1,38 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { +// Compilation parameters for in[n, g, c, hi, wi] * wei[g, k, y, x, c] = out[n, g, k, ho, wo] +void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_instances( + std::vector>>& instances) +{ + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_int8_instances<2, + NGCHW, + GKYXC, + Empty_Tuple, + NGKHW, + ConvFwdDefault>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_instance.cpp new file mode 100644 index 000000000..5c425effd --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_instance.cpp @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_instances( + std::vector>>& instances) +{ + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_int8_instances<2, + NHWGC, + GKYXC, + Empty_Tuple, + NHWGK, + ConvFwdDefault>{}); + + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_int8_instances<2, + NHWGC, + GKYXC, + Empty_Tuple, + NHWGK, + ConvFwd1x1P0>{}); + + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_int8_instances<2, + NHWGC, + GKYXC, + Empty_Tuple, + NHWGK, + ConvFwd1x1S1P0>{}); + + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_int8_instances<2, + NHWGC, + GKYXC, + Empty_Tuple, + NHWGK, + ConvFwdOddC>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/large_tensor/device_grouped_conv2d_fwd_xdl_large_tensor_nhwgc_gkyxc_nhwgk_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/large_tensor/device_grouped_conv2d_fwd_xdl_large_tensor_nhwgc_gkyxc_nhwgk_int8_instance.cpp new file mode 100644 index 000000000..4064c6634 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/large_tensor/device_grouped_conv2d_fwd_xdl_large_tensor_nhwgc_gkyxc_nhwgk_int8_instance.cpp @@ -0,0 +1,39 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_large_tensor_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +void add_device_grouped_conv2d_fwd_xdl_large_tensor_nhwgc_gkyxc_nhwgk_int8_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_large_tensor_int8_instances<2, + NHWGC, + GKYXC, + Empty_Tuple, + NHWGK, + ConvFwdDefault>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_mem_inter_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_mem_inter_instance.cpp new file mode 100644 index 000000000..9f0f9371b --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_mem_inter_instance.cpp @@ -0,0 +1,39 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { +// Compilation parameters for in[n, g, c, hi, wi] * wei[g, k, y, x, c] = out[n, g, k, ho, wo] +void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_mem_inter_instances( + std::vector>>& instances) +{ + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_int8_mem_instances<2, + NGCHW, + GKYXC, + Empty_Tuple, + NGKHW, + ConvFwdDefault, + Interwave>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_mem_intra_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_mem_intra_instance.cpp new file mode 100644 index 000000000..217f57d87 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_mem_intra_instance.cpp @@ -0,0 +1,39 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { +// Compilation parameters for in[n, g, c, hi, wi] * wei[g, k, y, x, c] = out[n, g, k, ho, wo] +void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_mem_intra_instances( + std::vector>>& instances) +{ + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_int8_mem_instances<2, + NGCHW, + GKYXC, + Empty_Tuple, + NGKHW, + ConvFwdDefault, + Intrawave>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_inter_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_inter_instance.cpp new file mode 100644 index 000000000..f667481fa --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_inter_instance.cpp @@ -0,0 +1,66 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_inter_instances( + std::vector>>& instances) +{ + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_int8_mem_instances<2, + NHWGC, + GKYXC, + Empty_Tuple, + NHWGK, + ConvFwdDefault, + Interwave>{}); + + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_int8_mem_instances<2, + NHWGC, + GKYXC, + Empty_Tuple, + NHWGK, + ConvFwd1x1P0, + Interwave>{}); + + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_int8_mem_instances<2, + NHWGC, + GKYXC, + Empty_Tuple, + NHWGK, + ConvFwd1x1S1P0, + Interwave>{}); + + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_int8_mem_instances<2, + NHWGC, + GKYXC, + Empty_Tuple, + NHWGK, + ConvFwdOddC, + Interwave>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_intra_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_intra_instance.cpp new file mode 100644 index 000000000..2ff2c7f51 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_intra_instance.cpp @@ -0,0 +1,66 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_intra_instances( + std::vector>>& instances) +{ + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_int8_mem_instances<2, + NHWGC, + GKYXC, + Empty_Tuple, + NHWGK, + ConvFwdDefault, + Intrawave>{}); + + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_int8_mem_instances<2, + NHWGC, + GKYXC, + Empty_Tuple, + NHWGK, + ConvFwd1x1P0, + Intrawave>{}); + + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_int8_mem_instances<2, + NHWGC, + GKYXC, + Empty_Tuple, + NHWGK, + ConvFwd1x1S1P0, + Intrawave>{}); + + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_int8_mem_instances<2, + NHWGC, + GKYXC, + Empty_Tuple, + NHWGK, + ConvFwdOddC, + Intrawave>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkyxc_ngkhw_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkyxc_ngkhw_int8_instance.cpp new file mode 100644 index 000000000..c66d48ed7 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkyxc_ngkhw_int8_instance.cpp @@ -0,0 +1,48 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { +// Compilation parameters for in[n, g, c, hi, wi] * wei[g, k, y, x, c] = out[n, g, k, ho, wo] +void add_device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkyxc_ngkhw_int8_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_merged_groups_int8_instances<2, + NGCHW, + GKYXC, + Empty_Tuple, + NGKHW, + ConvFwdDefault>{}); + + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_merged_groups_int8_instances<2, + NGCHW, + GKYXC, + Empty_Tuple, + NGKHW, + ConvFwd3x3>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_int8_instance.cpp new file mode 100644 index 000000000..8bdf5f527 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_int8_instance.cpp @@ -0,0 +1,48 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +void add_device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_int8_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_merged_groups_int8_instances<2, + NHWGC, + GKYXC, + Empty_Tuple, + NHWGK, + ConvFwdDefault>{}); + + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_merged_groups_int8_instances<2, + NHWGC, + GKYXC, + Empty_Tuple, + NHWGK, + ConvFwd3x3>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/test/grouped_convnd_fwd/test_grouped_convnd_fwd.cpp b/test/grouped_convnd_fwd/test_grouped_convnd_fwd.cpp index 29034afd9..1abd4fd9f 100644 --- a/test/grouped_convnd_fwd/test_grouped_convnd_fwd.cpp +++ b/test/grouped_convnd_fwd/test_grouped_convnd_fwd.cpp @@ -58,13 +58,13 @@ using KernelTypes1d = ::testing::Types, using KernelTypes2d = ::testing::Types, std::tuple, std::tuple, - std::tuple, std::tuple, std::tuple, std::tuple, std::tuple, std::tuple, - std::tuple>; + std::tuple, + std::tuple>; using KernelTypes3d = ::testing::Types, std::tuple, diff --git a/test/grouped_convnd_fwd/test_grouped_convnd_fwd_large_cases_xdl.cpp b/test/grouped_convnd_fwd/test_grouped_convnd_fwd_large_cases_xdl.cpp index 3d734fa5e..088fed89f 100644 --- a/test/grouped_convnd_fwd/test_grouped_convnd_fwd_large_cases_xdl.cpp +++ b/test/grouped_convnd_fwd/test_grouped_convnd_fwd_large_cases_xdl.cpp @@ -52,7 +52,8 @@ using namespace ck::tensor_layout::convolution; using KernelTypes2d = ::testing::Types, std::tuple, - std::tuple>; + 
std::tuple, + std::tuple>; using KernelTypes3d = ::testing::Types, std::tuple, -- GitLab From 464abd235e27c33422aa52ed2044af8fbcc3a88d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Juan=20Manuel=20Martinez=20Caama=C3=B1o?= Date: Tue, 5 Nov 2024 10:09:52 +0100 Subject: [PATCH 036/153] [generate.py] Override blob list if it already exists (#1635) Before, generate.py appended the list at the end of the output file. When running the cmake configuration steps multiple times on the examples, the blob list (such as fwd_blob_list.txt) would grow at every configuration. `library/src/tensor_operation_instance/gpu/mha/CMakeLists.txt` worked around this issue by removing the output file if it exists. Now, generate.py overrides the content of the output file. There is no need for the workaround in the CMakeLists.txt; and the issue is solved for the example projects too. --- example/ck_tile/01_fmha/generate.py | 3 +++ example/ck_tile/02_layernorm2d/generate.py | 2 +- library/src/tensor_operation_instance/gpu/mha/CMakeLists.txt | 5 ----- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/example/ck_tile/01_fmha/generate.py b/example/ck_tile/01_fmha/generate.py index 9b91d36fb..5b1b6664c 100644 --- a/example/ck_tile/01_fmha/generate.py +++ b/example/ck_tile/01_fmha/generate.py @@ -47,6 +47,9 @@ def list_blobs(output_file : Optional[str], api_list : List[str], kernel_filter assert output_file is not None file_path = Path(output_file) + # create an empty file / drop its contents if it exists + open(file_path, "w").close() + for api in api_list: handler = handlers[api][HandlerId.LIST_BLOBS] handler(file_path, kernel_filter, receipt, mask_impl) diff --git a/example/ck_tile/02_layernorm2d/generate.py b/example/ck_tile/02_layernorm2d/generate.py index bf576db97..09aa6b65f 100644 --- a/example/ck_tile/02_layernorm2d/generate.py +++ b/example/ck_tile/02_layernorm2d/generate.py @@ -559,7 +559,7 @@ float layernorm2d_fwd(layernorm2d_fwd_traits t, w_p = Path(self.working_path) list_p = 
w_p / 'layernorm2d_fwd_blobs.txt' blobs = self.get_blobs() - with list_p.open('a') as list_f: + with list_p.open('w') as list_f: # api related file list_f.write(str(w_p / (self.name_api + ".cpp")) + "\n") list_f.write(str(w_p / (self.name_common_header + ".hpp")) + "\n") diff --git a/library/src/tensor_operation_instance/gpu/mha/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/mha/CMakeLists.txt index 6d638b174..a53fde166 100644 --- a/library/src/tensor_operation_instance/gpu/mha/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/mha/CMakeLists.txt @@ -27,11 +27,6 @@ rocm_install(FILES ${MHA_HEADERS} DESTINATION include/ck_tile/ops) # headers for building lib file(COPY ${MHA_HEADERS} DESTINATION ${FMHA_CPP_FOLDER}) -# Delete the blob file if it exists to avoid append of old content. -if(EXISTS ${FMHA_CPP_FOLDER}/blob_list.txt) - file(REMOVE ${FMHA_CPP_FOLDER}/blob_list.txt) -endif() - set(FMHA_KNOWN_APIS "fwd,fwd_splitkv,fwd_appendkv,bwd") # generate a list of kernels, but not actually emit files at config stage -- GitLab From b6e74be1aa38396609bca91cba5f9e5f8665e4b0 Mon Sep 17 00:00:00 2001 From: Illia Silin <98187287+illsilin@users.noreply.github.com> Date: Tue, 5 Nov 2024 08:53:10 -0800 Subject: [PATCH 037/153] Make sure cmake can handle the xnack+/xnack- targets. 
(#1633) * make sure cmake can handle xnack targets * dont build xdl instances for gfx906:xnack- * dont build xdl tests for gfx906:xnack- --- example/CMakeLists.txt | 8 ++++---- .../src/tensor_operation_instance/gpu/CMakeLists.txt | 10 +++++----- test/CMakeLists.txt | 12 ++++++------ 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/example/CMakeLists.txt b/example/CMakeLists.txt index ad3f7c787..22af7b2d5 100644 --- a/example/CMakeLists.txt +++ b/example/CMakeLists.txt @@ -85,9 +85,9 @@ function(add_example_executable EXAMPLE_NAME FILE_NAME) #only continue if there are some source files left on the list if(FILE_NAME) if(FILE_NAME MATCHES "_xdl") - list(REMOVE_ITEM EX_TARGETS gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201) + list(REMOVE_ITEM EX_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201) elseif(FILE_NAME MATCHES "_wmma") - list(REMOVE_ITEM EX_TARGETS gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030) + list(REMOVE_ITEM EX_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack+ gfx908:xnack- gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030) endif() set_source_files_properties(${FILE_NAME} PROPERTIES LANGUAGE HIP) add_executable(${EXAMPLE_NAME} ${FILE_NAME}) @@ -169,9 +169,9 @@ function(add_example_executable_no_testing EXAMPLE_NAME FILE_NAME) #only continue if there are some source files left on the list if(FILE_NAME) if(FILE_NAME MATCHES "_xdl") - list(REMOVE_ITEM EX_TARGETS gfx900 gfx906 gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201) + list(REMOVE_ITEM EX_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201) elseif(FILE_NAME MATCHES "_wmma") - list(REMOVE_ITEM EX_TARGETS gfx900 gfx906 gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030) + list(REMOVE_ITEM EX_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack+ gfx908:xnack- gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030) endif() set_source_files_properties(${FILE_NAME} 
PROPERTIES LANGUAGE HIP) add_executable(${EXAMPLE_NAME} ${FILE_NAME}) diff --git a/library/src/tensor_operation_instance/gpu/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/CMakeLists.txt index 6756c3351..c8bbd6eb0 100644 --- a/library/src/tensor_operation_instance/gpu/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/CMakeLists.txt @@ -88,19 +88,19 @@ function(add_instance_library INSTANCE_NAME) foreach(source IN LISTS ARGN) set(INST_TARGETS ${SUPPORTED_GPU_TARGETS}) if(source MATCHES "_xdl") - list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201) + list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201) elseif(source MATCHES "_wmma") - list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030) + list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack+ gfx908:xnack- gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030) elseif(source MATCHES "mha") - list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx908 gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201) + list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack- gfx908:xnack+ gfx908 gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201) endif() #only build the fp8 gemm instances for gfx908/90a if the build argument is set if(NOT CK_USE_FP8_ON_UNSUPPORTED_ARCH) if(source MATCHES "gemm_xdl_universal" AND source MATCHES "f8") - list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx908 gfx90a gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201) + list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack- gfx908:xnack+ gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201) endif() if(source MATCHES "gemm_multiply_multiply_f8") - list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx908 gfx90a gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201) + list(REMOVE_ITEM 
INST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack- gfx908:xnack+ gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201) endif() endif() set(offload_targets) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index b12ced524..a81c5a96b 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -64,11 +64,11 @@ function(add_test_executable TEST_NAME) #only continue if there are some source files left on the list if(ARGN) if(ARGN MATCHES "_xdl") - list(REMOVE_ITEM TEST_TARGETS gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201) + list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201) elseif(ARGN MATCHES "_wmma") - list(REMOVE_ITEM TEST_TARGETS gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030) + list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack+ gfx908:xnack- gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030) elseif(ARGN MATCHES "_smfmac") - list(REMOVE_ITEM TEST_TARGETS gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx908 gfx90a gfx1200 gfx1201) + list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx908 gfx90a gfx1200 gfx1201) endif() set_source_files_properties(${ARGN} PROPERTIES LANGUAGE HIP) add_executable(${TEST_NAME} ${ARGN}) @@ -141,11 +141,11 @@ function(add_gtest_executable TEST_NAME) #only continue if there are some source files left on the list if(ARGN) if(ARGN MATCHES "_xdl") - list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201) + list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201) elseif(ARGN MATCHES "_wmma") - list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030) + list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack+ gfx908:xnack- gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx940 gfx941 gfx942 
gfx1030) elseif(ARGN MATCHES "_smfmac") - list(REMOVE_ITEM TEST_TARGETS gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx908 gfx90a gfx1200 gfx1201) + list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx908 gfx90a gfx1200 gfx1201) endif() set_source_files_properties(${ARGN} PROPERTIES LANGUAGE HIP) add_executable(${TEST_NAME} ${ARGN}) -- GitLab From d0e3a70a2e3ebb8f979c82309e3e58b5c23fe865 Mon Sep 17 00:00:00 2001 From: darren-amd Date: Tue, 5 Nov 2024 12:59:08 -0500 Subject: [PATCH 038/153] Statically Cast Pointer Offset (#1631) * explicit cast ptr offset * formating change --- ...nv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp | 12 +++++----- ...conv_bwd_weight_two_stage_xdl_cshuffle.hpp | 24 +++++++++---------- ..._conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp | 12 +++++----- ...gridwise_gemm_multiple_d_wmma_cshuffle.hpp | 24 +++++++++---------- .../gpu/grid/gridwise_tensor_rearrange.hpp | 8 +++---- 5 files changed, 40 insertions(+), 40 deletions(-) diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp index 5e9da459c..b544c925e 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp @@ -93,12 +93,12 @@ __global__ void __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count); const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch); - const long_index_t a_batch_offset = - amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx)); - const long_index_t b_batch_offset = - amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx)); - const long_index_t e_batch_offset = - 
amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx)); + const long_index_t a_batch_offset = amd_wave_read_first_lane( + static_cast(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx))); + const long_index_t b_batch_offset = amd_wave_read_first_lane( + static_cast(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx))); + const long_index_t e_batch_offset = amd_wave_read_first_lane( + static_cast(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx))); const auto ds_batch_offset = compute_ptr_offset_of_batch.GetDsPtrOffset(g_idx); diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_two_stage_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_two_stage_xdl_cshuffle.hpp index d3c0f84b9..c1f58ccda 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_two_stage_xdl_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_two_stage_xdl_cshuffle.hpp @@ -60,12 +60,12 @@ __global__ void const index_t g_idx = __builtin_amdgcn_readfirstlane(blockIdx.z * NumGroupsToMerge); const index_t k_idx = __builtin_amdgcn_readfirstlane(blockIdx.y * num_k_per_block); - const long_index_t a_batch_offset = - amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx)); - const long_index_t b_batch_offset = - amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx)); - const long_index_t e_batch_offset = - amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx)); + const long_index_t a_batch_offset = amd_wave_read_first_lane( + static_cast(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx))); + const long_index_t b_batch_offset = amd_wave_read_first_lane( + static_cast(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx))); + const long_index_t e_batch_offset = amd_wave_read_first_lane( + static_cast(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx))); __shared__ char 
p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()]; @@ -117,12 +117,12 @@ __global__ void const index_t g_idx = __builtin_amdgcn_readfirstlane(blockIdx.z * NumGroupsToMerge); const index_t k_idx = __builtin_amdgcn_readfirstlane(blockIdx.y * num_k_per_block); - const long_index_t a_batch_offset = - amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx)); - const long_index_t b_batch_offset = - amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx)); - const long_index_t e_batch_offset = - amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx)); + const long_index_t a_batch_offset = amd_wave_read_first_lane( + static_cast(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx))); + const long_index_t b_batch_offset = amd_wave_read_first_lane( + static_cast(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx))); + const long_index_t e_batch_offset = amd_wave_read_first_lane( + static_cast(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx))); // Pass two lds pointer is the key to tell compiler that ds_read/write // operate on different lds chunk at same time without order dependecy diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp index 65b7b6cb7..3e14f66a0 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp @@ -98,12 +98,12 @@ __global__ void __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count); const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch); - const long_index_t a_batch_offset = - amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx)); - const long_index_t b_batch_offset = - 
amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx)); - const long_index_t c_batch_offset = - amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx)); + const long_index_t a_batch_offset = amd_wave_read_first_lane( + static_cast(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx))); + const long_index_t b_batch_offset = amd_wave_read_first_lane( + static_cast(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx))); + const long_index_t c_batch_offset = amd_wave_read_first_lane( + static_cast(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx))); const auto ds_batch_offset = compute_ptr_offset_of_batch.GetDsPtrOffset(g_idx); diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp index b3b057c80..de6c9c160 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp @@ -60,12 +60,12 @@ __global__ void __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count); const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch); - const long_index_t a_batch_offset = - amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx)); - const long_index_t b_batch_offset = - amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx)); - const long_index_t e_batch_offset = - amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx)); + const long_index_t a_batch_offset = amd_wave_read_first_lane( + static_cast(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx))); + const long_index_t b_batch_offset = amd_wave_read_first_lane( + static_cast(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx))); + const long_index_t e_batch_offset = amd_wave_read_first_lane( + static_cast(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx))); const auto 
ds_batch_offset = compute_ptr_offset_of_batch.GetDsPtrOffset(g_idx); @@ -155,12 +155,12 @@ __global__ void __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count); const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch); - const long_index_t a_batch_offset = - amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx)); - const long_index_t b_batch_offset = - amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx)); - const long_index_t e_batch_offset = - amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx)); + const long_index_t a_batch_offset = amd_wave_read_first_lane( + static_cast(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx))); + const long_index_t b_batch_offset = amd_wave_read_first_lane( + static_cast(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx))); + const long_index_t e_batch_offset = amd_wave_read_first_lane( + static_cast(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx))); const auto ds_batch_offset = compute_ptr_offset_of_batch.GetDsPtrOffset(g_idx); diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_tensor_rearrange.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_tensor_rearrange.hpp index 174074990..ddf0b4a58 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_tensor_rearrange.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_tensor_rearrange.hpp @@ -121,10 +121,10 @@ struct GridwiseTensorRearrange __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch); // Global Memory - const index_t a_batch_offset = - __builtin_amdgcn_readfirstlane(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx)); - const index_t c_batch_offset = - __builtin_amdgcn_readfirstlane(compute_ptr_offset_of_batch.GetCPtrOffset(g_idx)); + const index_t a_batch_offset = __builtin_amdgcn_readfirstlane( + static_cast(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx))); + const index_t c_batch_offset = __builtin_amdgcn_readfirstlane( + 
static_cast(compute_ptr_offset_of_batch.GetCPtrOffset(g_idx))); const auto in_global_buf = make_dynamic_buffer( p_in_global + a_batch_offset, in_grid_desc.GetElementSpaceSize()); -- GitLab From 54440cf562b31eea6a158057fd8c41e9db1b4cc8 Mon Sep 17 00:00:00 2001 From: Illia Silin <98187287+illsilin@users.noreply.github.com> Date: Tue, 5 Nov 2024 13:56:20 -0800 Subject: [PATCH 039/153] remove gfx940;gfx941 from default target lists (#1640) --- CMakeLists.txt | 8 ++++---- Jenkinsfile | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 74628597a..bd2f60683 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -145,20 +145,20 @@ message("hip_version_flat=${hip_VERSION_FLAT}") message("checking which targets are supported") #In order to build just the CK library (without tests and examples) for all supported GPU targets -#use -D GPU_ARCHS="gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201" +#use -D GPU_ARCHS="gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201" #the GPU_TARGETS flag will be reset in this case in order to avoid conflicts. # #In order to build CK along with all tests and examples it should be OK to set GPU_TARGETS to just 1 or 2 similar architectures. 
if(NOT ENABLE_ASAN_PACKAGING) if(NOT WIN32 AND ${hip_VERSION_FLAT} LESS 600300000) # WORKAROUND: compiler does not yet fully support gfx12 targets, need to fix version above - set(CK_GPU_TARGETS "gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101;gfx1102") + set(CK_GPU_TARGETS "gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102") else() - set(CK_GPU_TARGETS "gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201") + set(CK_GPU_TARGETS "gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201") endif() else() #build CK only for xnack-supported targets when using ASAN - set(CK_GPU_TARGETS "gfx908:xnack+;gfx90a:xnack+;gfx940:xnack+;gfx941:xnack+;gfx942:xnack+") + set(CK_GPU_TARGETS "gfx908:xnack+;gfx90a:xnack+;gfx942:xnack+") endif() #if user set GPU_ARCHS on the cmake command line, overwrite default target list with user's list diff --git a/Jenkinsfile b/Jenkinsfile index 48b4c805c..b79b2045b 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -1101,11 +1101,11 @@ pipeline { agent{ label rocmnode("gfx90a") } environment{ setup_args = """ -DCMAKE_INSTALL_PREFIX=../install \ - -DGPU_TARGETS="gfx908;gfx90a;gfx940;gfx941;gfx942" \ + -DGPU_TARGETS="gfx908;gfx90a;gfx942" \ -DCMAKE_CXX_FLAGS=" -O3 " """ execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && \ cmake -DCMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" \ - -DGPU_TARGETS="gfx908;gfx90a;gfx940;gfx941;gfx942" \ + -DGPU_TARGETS="gfx908;gfx90a;gfx942" \ -DCMAKE_CXX_COMPILER="${build_compiler()}" \ -DCMAKE_CXX_FLAGS=" -O3 " .. && make -j """ } @@ -1165,7 +1165,7 @@ pipeline { execute_args = """ cmake -D CMAKE_PREFIX_PATH=/opt/rocm \ -D CMAKE_CXX_COMPILER="${build_compiler()}" \ -D CMAKE_BUILD_TYPE=Release \ - -D GPU_ARCHS="gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101;gfx1102" \ + -D GPU_ARCHS="gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102" \ -D CMAKE_CXX_FLAGS=" -O3 " .. 
&& make -j64 """ } steps{ -- GitLab From 365f39aed0d5335b6e39d5049231558128cfedd9 Mon Sep 17 00:00:00 2001 From: Andriy Roshchenko <107577548+andriy-ca@users.noreply.github.com> Date: Tue, 5 Nov 2024 14:58:29 -0700 Subject: [PATCH 040/153] Prevent instantiation of undefined FP8 operators. (#1639) --- .../elementwise_scale_permute_amax_2D_fp16_fp8.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/example/44_elementwise_permute/elementwise_scale_permute_amax_2D_fp16_fp8.cpp b/example/44_elementwise_permute/elementwise_scale_permute_amax_2D_fp16_fp8.cpp index 7ac3c4e23..9431a8cde 100644 --- a/example/44_elementwise_permute/elementwise_scale_permute_amax_2D_fp16_fp8.cpp +++ b/example/44_elementwise_permute/elementwise_scale_permute_amax_2D_fp16_fp8.cpp @@ -68,7 +68,7 @@ using DeviceElementwisePermuteInstance = ck::tensor_operation::device::DeviceEle using DeviceReduceInstance = ck::tensor_operation::device::DeviceReduceMultiBlock& input, host_output_scaled_casted_transposed(m, k) = y1; const OutputDataType y_fabs = ck::type_convert(ck::math::abs(ck::type_convert(y0))); - host_output_amax(0) = ck::math::max(y_fabs, host_output_amax(0)); + host_output_amax(0) = ck::type_convert(ck::math::max( + ck::type_convert(y_fabs), ck::type_convert(host_output_amax(0)))); } } } -- GitLab From dcafb1de15a8fd1de3496f19fd806ac9cb185012 Mon Sep 17 00:00:00 2001 From: aledudek Date: Wed, 6 Nov 2024 10:44:58 +0100 Subject: [PATCH 041/153] Generic threshold calculation after merge fixes (#1618) * Generic threshold calculation add passing num of accums * Generic threshold - after merge fixes * Fix cmakelists --------- Co-authored-by: Adam Osewski <19374865+aosewski@users.noreply.github.com> --- .../include/ck/library/utility/check_err.hpp | 8 ++++---- .../profiler/profile_pool3d_fwd_impl.hpp | 18 ++++++++++++++++-- 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/library/include/ck/library/utility/check_err.hpp 
b/library/include/ck/library/utility/check_err.hpp index 73ac2a189..88741c3b9 100644 --- a/library/include/ck/library/utility/check_err.hpp +++ b/library/include/ck/library/utility/check_err.hpp @@ -24,7 +24,7 @@ namespace ck { namespace utils { template -double get_relative_threshold(const int numberOfAccumulations = 1) +double get_relative_threshold(const int number_of_accumulations = 1) { using F8 = ck::f8_t; using F16 = ck::half_t; @@ -79,13 +79,13 @@ double get_relative_threshold(const int numberOfAccumulations = 1) } else { - acc_error = std::pow(2, -NumericUtils::mant) * 0.5 * numberOfAccumulations; + acc_error = std::pow(2, -NumericUtils::mant) * 0.5 * number_of_accumulations; } return std::max(acc_error, midway_error); } template -double get_absolute_threshold(const double max_possible_num, const int numberOfAccumulations = 1) +double get_absolute_threshold(const double max_possible_num, const int number_of_accumulations = 1) { using F8 = ck::f8_t; using F16 = ck::half_t; @@ -142,7 +142,7 @@ double get_absolute_threshold(const double max_possible_num, const int numberOfA else { acc_error = - std::pow(2, expo - NumericUtils::mant) * 0.5 * numberOfAccumulations; + std::pow(2, expo - NumericUtils::mant) * 0.5 * number_of_accumulations; } return std::max(acc_error, midway_error); } diff --git a/profiler/include/profiler/profile_pool3d_fwd_impl.hpp b/profiler/include/profiler/profile_pool3d_fwd_impl.hpp index a0890028a..cbdacad53 100644 --- a/profiler/include/profiler/profile_pool3d_fwd_impl.hpp +++ b/profiler/include/profiler/profile_pool3d_fwd_impl.hpp @@ -240,6 +240,19 @@ bool profile_pool3d_fwd_impl(PoolFwdInputParams& in_params, PoolFwdKernelParams& { out_device_buf.FromDevice(out_n_c_do_ho_wo_device.mData.data()); + auto number_of_accumulations = 1; + static_assert( + ReduceOpId == ck::ReduceTensorOp::AVG || ReduceOpId == ck::ReduceTensorOp::MAX, + "Warning: Unhandled ReduceOpId for setting up the number of accumulations!"); + + if constexpr(ReduceOpId == 
ck::ReduceTensorOp::AVG) + { + for(size_t i = 0; i < kernel_params.window_spatial_lengths.size(); ++i) + { + number_of_accumulations *= kernel_params.window_spatial_lengths.at(i); + } + } + auto absolute_error_threshold = 1.0; switch(in_params.init_method) { @@ -250,9 +263,10 @@ bool profile_pool3d_fwd_impl(PoolFwdInputParams& in_params, PoolFwdKernelParams& absolute_error_threshold = ck::utils::get_absolute_threshold( - absolute_error_threshold); + absolute_error_threshold, number_of_accumulations); auto relative_error_threshold = - ck::utils::get_relative_threshold(); + ck::utils::get_relative_threshold( + number_of_accumulations); bool pass = ck::utils::check_err(out_n_c_do_ho_wo_device.mData, out_n_c_do_ho_wo_host.mData, -- GitLab From 3599418aa8f6b19e94c09160a086030ed50c7184 Mon Sep 17 00:00:00 2001 From: rocking Date: Thu, 7 Nov 2024 03:32:44 +0800 Subject: [PATCH 042/153] Fix F16 type (#1583) --- profiler/src/profile_layernorm_fwd.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/profiler/src/profile_layernorm_fwd.cpp b/profiler/src/profile_layernorm_fwd.cpp index a261bd741..7031b3653 100644 --- a/profiler/src/profile_layernorm_fwd.cpp +++ b/profiler/src/profile_layernorm_fwd.cpp @@ -85,7 +85,7 @@ int profile_layernorm(int argc, char* argv[]) if(data_type == ck::DataTypeEnum::Half) { - ck::profiler::profile_layernorm_impl( + ck::profiler::profile_layernorm_impl( do_verification, init_method, do_log, time_kernel, length); } else if(data_type == ck::DataTypeEnum::Float) -- GitLab From 75c5bfa3642cb368acae5c7824aa7d6c506f5dae Mon Sep 17 00:00:00 2001 From: Illia Silin <98187287+illsilin@users.noreply.github.com> Date: Thu, 7 Nov 2024 14:14:42 -0800 Subject: [PATCH 043/153] enable compilation for generic navi targets (#1645) --- example/CMakeLists.txt | 4 ++-- include/ck/ck.hpp | 8 +++++--- include/ck/utility/amd_wmma.hpp | 5 +++-- include/ck_tile/core/config.hpp | 8 +++++--- library/src/tensor_operation_instance/gpu/CMakeLists.txt | 8 
++++---- test/CMakeLists.txt | 8 ++++---- 6 files changed, 23 insertions(+), 18 deletions(-) diff --git a/example/CMakeLists.txt b/example/CMakeLists.txt index 22af7b2d5..ea739c707 100644 --- a/example/CMakeLists.txt +++ b/example/CMakeLists.txt @@ -85,7 +85,7 @@ function(add_example_executable EXAMPLE_NAME FILE_NAME) #only continue if there are some source files left on the list if(FILE_NAME) if(FILE_NAME MATCHES "_xdl") - list(REMOVE_ITEM EX_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201) + list(REMOVE_ITEM EX_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201 gfx10.3-generic gfx11-generic gfx12-generic) elseif(FILE_NAME MATCHES "_wmma") list(REMOVE_ITEM EX_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack+ gfx908:xnack- gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030) endif() @@ -169,7 +169,7 @@ function(add_example_executable_no_testing EXAMPLE_NAME FILE_NAME) #only continue if there are some source files left on the list if(FILE_NAME) if(FILE_NAME MATCHES "_xdl") - list(REMOVE_ITEM EX_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201) + list(REMOVE_ITEM EX_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201 gfx10.3-generic gfx11-generic gfx12-generic) elseif(FILE_NAME MATCHES "_wmma") list(REMOVE_ITEM EX_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack+ gfx908:xnack- gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030) endif() diff --git a/include/ck/ck.hpp b/include/ck/ck.hpp index 5f74d51a6..999eb0229 100644 --- a/include/ck/ck.hpp +++ b/include/ck/ck.hpp @@ -63,13 +63,15 @@ CK_DECLARE_ENV_VAR_BOOL(CK_LOGGING) #define __gfx101__ #endif #if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || \ - defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) + defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) 
|| \ + defined(__gfx10_3_generic__) #define __gfx103__ #endif -#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) +#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || \ + defined(__gfx1103__) || defined(__gfx11_generic__) #define __gfx11__ #endif -#if defined(__gfx1200__) || defined(__gfx1201__) +#if defined(__gfx1200__) || defined(__gfx1201__) || defined(__gfx12_generic__) #define __gfx12__ #endif diff --git a/include/ck/utility/amd_wmma.hpp b/include/ck/utility/amd_wmma.hpp index 322a0f94b..d04513f3e 100644 --- a/include/ck/utility/amd_wmma.hpp +++ b/include/ck/utility/amd_wmma.hpp @@ -9,7 +9,8 @@ // TODO: Add arch limitation namespace ck { -#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) +#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || \ + defined(__gfx1103__) || defined(__gfx11_generic__) #define __gfx11__ #endif /********************************WAVE32 MODE***********************************************/ @@ -260,7 +261,7 @@ struct intrin_wmma_i32_16x16x16_iu8_w64<16, 16, neg_a, neg_b, clamp> // gfx12 /********************************WAVE32 MODE***********************************************/ -#if defined(__gfx1200__) || defined(__gfx1201__) +#if defined(__gfx1200__) || defined(__gfx1201__) || defined(__gfx12_generic__) #define __gfx12__ #endif diff --git a/include/ck_tile/core/config.hpp b/include/ck_tile/core/config.hpp index 4be50b865..604c9551f 100644 --- a/include/ck_tile/core/config.hpp +++ b/include/ck_tile/core/config.hpp @@ -11,13 +11,15 @@ #define __gfx94__ #endif #if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || \ - defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) + defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) || \ + defined(__gfx10_3_generic__) #define __gfx103__ #endif -#if defined(__gfx1100__) || defined(__gfx1101__) || 
defined(__gfx1102__) || defined(__gfx1103__) +#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || \ + defined(__gfx1103__) || defined(__gfx11_generic__) #define __gfx11__ #endif -#if defined(__gfx1200__) || defined(__gfx1201__) +#if defined(__gfx1200__) || defined(__gfx1201__) || defined(__gfx12_generic__) #define __gfx12__ #endif diff --git a/library/src/tensor_operation_instance/gpu/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/CMakeLists.txt index c8bbd6eb0..80f0fc306 100644 --- a/library/src/tensor_operation_instance/gpu/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/CMakeLists.txt @@ -88,19 +88,19 @@ function(add_instance_library INSTANCE_NAME) foreach(source IN LISTS ARGN) set(INST_TARGETS ${SUPPORTED_GPU_TARGETS}) if(source MATCHES "_xdl") - list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201) + list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201 gfx10.3-generic gfx11-generic gfx12-generic) elseif(source MATCHES "_wmma") list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack+ gfx908:xnack- gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030) elseif(source MATCHES "mha") - list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack- gfx908:xnack+ gfx908 gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201) + list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack- gfx908:xnack+ gfx908 gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201 gfx10.3-generic gfx11-generic gfx12-generic) endif() #only build the fp8 gemm instances for gfx908/90a if the build argument is set if(NOT CK_USE_FP8_ON_UNSUPPORTED_ARCH) if(source MATCHES "gemm_xdl_universal" AND source MATCHES "f8") - list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack- gfx908:xnack+ gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx1030 gfx1100 gfx1101 
gfx1102 gfx1103 gfx1200 gfx1201) + list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack- gfx908:xnack+ gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201 gfx10.3-generic gfx11-generic gfx12-generic) endif() if(source MATCHES "gemm_multiply_multiply_f8") - list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack- gfx908:xnack+ gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201) + list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack- gfx908:xnack+ gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201 gfx10.3-generic gfx11-generic gfx12-generic) endif() endif() set(offload_targets) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index a81c5a96b..498a20dc5 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -64,11 +64,11 @@ function(add_test_executable TEST_NAME) #only continue if there are some source files left on the list if(ARGN) if(ARGN MATCHES "_xdl") - list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201) + list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201 gfx10.3-generic gfx11-generic gfx12-generic) elseif(ARGN MATCHES "_wmma") list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack+ gfx908:xnack- gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030) elseif(ARGN MATCHES "_smfmac") - list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx908 gfx90a gfx1200 gfx1201) + list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx908 gfx90a gfx1200 gfx1201 gfx10.3-generic gfx11-generic gfx12-generic) endif() set_source_files_properties(${ARGN} PROPERTIES LANGUAGE HIP) add_executable(${TEST_NAME} ${ARGN}) @@ -141,11 +141,11 
@@ function(add_gtest_executable TEST_NAME) #only continue if there are some source files left on the list if(ARGN) if(ARGN MATCHES "_xdl") - list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201) + list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201 gfx10.3-generic gfx11-generic gfx12-generic) elseif(ARGN MATCHES "_wmma") list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack+ gfx908:xnack- gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030) elseif(ARGN MATCHES "_smfmac") - list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx908 gfx90a gfx1200 gfx1201) + list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx908 gfx90a gfx1200 gfx1201 gfx10.3-generic gfx11-generic gfx12-generic) endif() set_source_files_properties(${ARGN} PROPERTIES LANGUAGE HIP) add_executable(${TEST_NAME} ${ARGN}) -- GitLab From 686a58a912f6884a9b66841cf04b4b81ba35aa7f Mon Sep 17 00:00:00 2001 From: dummycoderfe Date: Fri, 8 Nov 2024 12:28:23 +0800 Subject: [PATCH 044/153] [Ck tile] layernorm2d fwd optimize (#1637) * optimze small N case using vec io and using rcp div * [Ck_tile] layernorm, add param to control fastdiv; change generate codes and test pass * [Ck_tile] fix blockSize compute in Generic2dBlockShape * [Ck_tile]fix kfastfdiv template style * [Ck_tile] layernorm, fix stype in review --------- Co-authored-by: dummycoderfe --- example/ck_tile/02_layernorm2d/generate.py | 105 ++++++++++-------- .../ops/common/generic_2d_block_shape.hpp | 12 +- ...ayernorm2d_fwd_pipeline_default_policy.hpp | 12 +- .../layernorm2d_fwd_pipeline_one_pass.hpp | 11 +- .../pipeline/layernorm2d_fwd_traits.hpp | 2 + .../ops/welford/block/block_welford.hpp | 34 ++++-- .../welford/block/block_welford_problem.hpp | 9 +- .../ops/welford/thread/thread_welford.hpp | 43 
+++++-- 8 files changed, 144 insertions(+), 84 deletions(-) diff --git a/example/ck_tile/02_layernorm2d/generate.py b/example/ck_tile/02_layernorm2d/generate.py index 09aa6b65f..ca9e432a4 100644 --- a/example/ck_tile/02_layernorm2d/generate.py +++ b/example/ck_tile/02_layernorm2d/generate.py @@ -57,6 +57,7 @@ template @@ -118,6 +119,7 @@ struct layernorm2d_fwd_traits_ static constexpr bool kPadN = kPadN_; static constexpr bool kSaveMeanInvStd = kSaveMeanInvStd_; + static constexpr bool kFastFDiv = kFastFDiv_; static constexpr bool kTwoPass = kTwoPass_; static constexpr ck_tile::index_t kFusedAdd = kFusedAdd_; static constexpr ck_tile::index_t kFusedQuant = kFusedQuant_; @@ -134,6 +136,7 @@ template @@ -148,6 +151,7 @@ using traits_ = layernorm2d_fwd_traits_; @@ -179,6 +183,7 @@ float layernorm2d_fwd_(const S& s, A a) using PipelineTraits = ck_tile::Layernorm2dFwdTraits(Traits_::kFusedAdd), static_cast(Traits_::kFusedQuant)>; @@ -269,7 +274,7 @@ float layernorm2d_fwd(layernorm2d_fwd_traits t, #include "layernorm2d_fwd_api_common.hpp" // clang-format off -// prec_i prec_o prec_sy rm rn tm tn vn pd mv 2p add sweep +// prec_i prec_o prec_sy rm rn tm tn vn pd mv rpcf 2p add sweep {F_instance_def} // clang-format on @@ -356,6 +361,7 @@ float layernorm2d_fwd(layernorm2d_fwd_traits t, F_Vector_N : int F_kPadN : bool F_kSaveMeanInvStd_ : bool + F_kFastFDiv_ : bool F_kTwoPass_ : bool F_kFusedAdd : int F_kFusedQuant : int @@ -363,7 +369,7 @@ float layernorm2d_fwd(layernorm2d_fwd_traits t, @property def trait_name(self) ->str: t_ = f'{DATA_TYPE_MAP[self.F_XDataType]}, {DATA_TYPE_MAP[self.F_YDataType]}, {DATA_TYPE_MAP[self.F_XScaleDataType]}, {DATA_TYPE_MAP[self.F_YScaleDataType]}, {self.F_Repeat_M:2}, {self.F_Repeat_N:2}, {self.F_ThreadPerBlock_M:2}, {self.F_ThreadPerBlock_N:4}' - t_ += f', {self.F_Vector_N:2}, {BOOL_MAP(self.F_kPadN):5}, {BOOL_MAP(self.F_kSaveMeanInvStd_):5}' + t_ += f', {self.F_Vector_N:2}, {BOOL_MAP(self.F_kPadN):5}, {BOOL_MAP(self.F_kSaveMeanInvStd_):5}, 
{BOOL_MAP(self.F_kFastFDiv_):5}' t_ += f', {BOOL_MAP(self.F_kTwoPass_):5}, {self.F_kFusedAdd:4}, {self.F_kFusedQuant:4}' return t_ @@ -483,52 +489,55 @@ float layernorm2d_fwd(layernorm2d_fwd_traits t, fused_add_list = [0, 1] fused_sweep_list = [0, 1] # NOTE: only single pass can use fused dynamic quant - # rm rn tm tn vn pd mv 2p add sweep - h_trait_dict = {'64' : [ h_traits('x', 'y', 'xs', 'ys', 1, 1, 4, 64, 1, True, False, False, 0, 0)], - '128' : [ h_traits('x', 'y', 'xs', 'ys', 1, 1, 4, 64, 2, True, False, False, 0, 0), - h_traits('x', 'y', 'xs', 'ys', 1, 2, 4, 64, 1, True, False, False, 0, 0)], - '256' : [ h_traits('x', 'y', 'xs', 'ys', 1, 1, 4, 64, 4, True, False, False, 0, 0), - h_traits('x', 'y', 'xs', 'ys', 1, 2, 4, 64, 2, True, False, False, 0, 0), - h_traits('x', 'y', 'xs', 'ys', 1, 4, 4, 64, 1, True, False, False, 0, 0)], - '512' : [ h_traits('x', 'y', 'xs', 'ys', 1, 1, 4, 64, 8, True, False, False, 0, 0), - h_traits('x', 'y', 'xs', 'ys', 1, 2, 4, 64, 4, True, False, False, 0, 0), - h_traits('x', 'y', 'xs', 'ys', 1, 4, 4, 64, 2, True, False, False, 0, 0), - h_traits('x', 'y', 'xs', 'ys', 1, 8, 4, 64, 1, True, False, False, 0, 0)], - '768' : [ h_traits('x', 'y', 'xs', 'ys', 1, 3, 4, 64, 4, True, False, False, 0, 0), - h_traits('x', 'y', 'xs', 'ys', 1, 6, 4, 64, 2, True, False, False, 0, 0), - h_traits('x', 'y', 'xs', 'ys', 1, 12, 4, 64, 1, True, False, False, 0, 0)], - '1024' :[ h_traits('x', 'y', 'xs', 'ys', 1, 1, 2, 128, 8, True, False, False, 0, 0), - h_traits('x', 'y', 'xs', 'ys', 1, 2, 2, 128, 4, True, False, False, 0, 0), - h_traits('x', 'y', 'xs', 'ys', 1, 4, 2, 128, 2, True, False, False, 0, 0), - h_traits('x', 'y', 'xs', 'ys', 1, 4, 1, 256, 1, True, False, False, 0, 0)], - '1536' :[ h_traits('x', 'y', 'xs', 'ys', 1, 3, 4, 64, 8, True, False, False, 0, 0), - h_traits('x', 'y', 'xs', 'ys', 1, 3, 2, 128, 4, True, False, False, 0, 0), - h_traits('x', 'y', 'xs', 'ys', 1, 3, 1, 256, 2, True, False, False, 0, 0), - h_traits('x', 'y', 'xs', 'ys', 1, 6, 
1, 256, 1, True, False, False, 0, 0)], - '2048' :[ h_traits('x', 'y', 'xs', 'ys', 1, 1, 1, 256, 8, True, False, False, 0, 0), - h_traits('x', 'y', 'xs', 'ys', 1, 2, 1, 256, 4, True, False, False, 0, 0), - h_traits('x', 'y', 'xs', 'ys', 1, 4, 1, 256, 2, True, False, False, 0, 0), - h_traits('x', 'y', 'xs', 'ys', 1, 8, 1, 256, 1, True, False, False, 0, 0)], - '3072' :[ h_traits('x', 'y', 'xs', 'ys', 1, 3, 1, 128, 8, True, False, False, 0, 0), - h_traits('x', 'y', 'xs', 'ys', 1, 3, 1, 256, 4, True, False, False, 0, 0), - h_traits('x', 'y', 'xs', 'ys', 1, 6, 1, 256, 2, True, False, False, 0, 0), - h_traits('x', 'y', 'xs', 'ys', 1, 3, 1,1024, 1, True, False, False, 0, 0)], - '4096' :[ h_traits('x', 'y', 'xs', 'ys', 1, 2, 1, 256, 8, True, False, False, 0, 0), - h_traits('x', 'y', 'xs', 'ys', 1, 4, 1, 256, 4, True, False, False, 0, 0), - h_traits('x', 'y', 'xs', 'ys', 1, 2, 1,1024, 2, True, False, False, 0, 0), - h_traits('x', 'y', 'xs', 'ys', 1, 4, 1,1024, 1, True, False, False, 0, 0)], - '6144' :[ h_traits('x', 'y', 'xs', 'ys', 1, 3, 1, 256, 8, True, False, False, 0, 0), - h_traits('x', 'y', 'xs', 'ys', 1, 3, 1, 512, 4, True, False, False, 0, 0), - h_traits('x', 'y', 'xs', 'ys', 1, 3, 1,1024, 2, True, False, False, 0, 0), - h_traits('x', 'y', 'xs', 'ys', 1, 6, 1,1024, 1, True, False, False, 0, 0)], - '8192' :[ h_traits('x', 'y', 'xs', 'ys', 1, 4, 1, 256, 8, True, False, False, 0, 0), - h_traits('x', 'y', 'xs', 'ys', 1, 4, 1, 512, 4, True, False, False, 0, 0), - h_traits('x', 'y', 'xs', 'ys', 1, 4, 1,1024, 2, True, False, False, 0, 0), - h_traits('x', 'y', 'xs', 'ys', 1, 8, 1,1024, 1, True, False, False, 0, 0)], - 'big' :[ h_traits('x', 'y', 'xs', 'ys', 1, 2, 1, 256, 8, True, False, True, 0, 0), - h_traits('x', 'y', 'xs', 'ys', 1, 4, 1, 256, 4, True, False, True, 0, 0), - h_traits('x', 'y', 'xs', 'ys', 1, 2, 1,1024, 2, True, False, True, 0, 0), - h_traits('x', 'y', 'xs', 'ys', 1, 4, 1,1024, 1, True, False, True, 0, 0)]} + # rm rn tm tn vn pd mv fdiv 2p add sweep + 
h_trait_dict = {'64' : [ h_traits('x', 'y', 'xs', 'ys', 1, 1, 8, 8, 8, True, False, True, False, 0, 0), + h_traits('x', 'y', 'xs', 'ys', 1, 1, 4, 16, 4, True, False, True, False, 0, 0), + h_traits('x', 'y', 'xs', 'ys', 1, 1, 4, 64, 1, True, False, True, False, 0, 0)], + '128' : [ h_traits('x', 'y', 'xs', 'ys', 1, 1, 4, 16, 8, True, False, True, False, 0, 0), + h_traits('x', 'y', 'xs', 'ys', 1, 1, 4, 64, 2, True, False, True, False, 0, 0), + h_traits('x', 'y', 'xs', 'ys', 1, 2, 4, 64, 1, True, False, True, False, 0, 0)], + '256' : [ h_traits('x', 'y', 'xs', 'ys', 1, 1, 4, 64, 4, True, False, True, False, 0, 0), + h_traits('x', 'y', 'xs', 'ys', 1, 2, 4, 64, 2, True, False, True, False, 0, 0), + h_traits('x', 'y', 'xs', 'ys', 1, 4, 4, 64, 1, True, False, True, False, 0, 0)], + '512' : [ h_traits('x', 'y', 'xs', 'ys', 1, 1, 4, 64, 8, True, False, True, False, 0, 0), + h_traits('x', 'y', 'xs', 'ys', 1, 2, 4, 64, 4, True, False, True, False, 0, 0), + h_traits('x', 'y', 'xs', 'ys', 1, 4, 4, 64, 2, True, False, True, False, 0, 0), + h_traits('x', 'y', 'xs', 'ys', 1, 8, 4, 64, 1, True, False, True, False, 0, 0)], + '768' : [ h_traits('x', 'y', 'xs', 'ys', 1, 3, 4, 64, 4, True, False, True, False, 0, 0), + h_traits('x', 'y', 'xs', 'ys', 1, 6, 4, 64, 2, True, False, True, False, 0, 0), + h_traits('x', 'y', 'xs', 'ys', 1, 12, 4, 64, 1, True, False, True, False, 0, 0)], + '1024' :[ h_traits('x', 'y', 'xs', 'ys', 1, 1, 2, 128, 8, True, False, True, False, 0, 0), + h_traits('x', 'y', 'xs', 'ys', 1, 2, 2, 128, 4, True, False, True, False, 0, 0), + h_traits('x', 'y', 'xs', 'ys', 1, 4, 2, 128, 2, True, False, True, False, 0, 0), + h_traits('x', 'y', 'xs', 'ys', 1, 4, 1, 256, 1, True, False, True, False, 0, 0)], + '1536' :[ h_traits('x', 'y', 'xs', 'ys', 1, 3, 4, 64, 8, True, False, True, False, 0, 0), + h_traits('x', 'y', 'xs', 'ys', 1, 3, 2, 128, 4, True, False, True, False, 0, 0), + h_traits('x', 'y', 'xs', 'ys', 1, 3, 1, 256, 2, True, False, True, False, 0, 0), + h_traits('x', 
'y', 'xs', 'ys', 1, 6, 1, 256, 1, True, False, True, False, 0, 0)], + '2048' :[ h_traits('x', 'y', 'xs', 'ys', 1, 1, 1, 256, 8, True, False, True, False, 0, 0), + h_traits('x', 'y', 'xs', 'ys', 1, 2, 1, 256, 4, True, False, True, False, 0, 0), + h_traits('x', 'y', 'xs', 'ys', 1, 4, 1, 256, 2, True, False, True, False, 0, 0), + h_traits('x', 'y', 'xs', 'ys', 1, 8, 1, 256, 1, True, False, True, False, 0, 0)], + '3072' :[ h_traits('x', 'y', 'xs', 'ys', 1, 3, 1, 128, 8, True, False, True, False, 0, 0), + h_traits('x', 'y', 'xs', 'ys', 1, 3, 1, 256, 4, True, False, True, False, 0, 0), + h_traits('x', 'y', 'xs', 'ys', 1, 6, 1, 256, 2, True, False, True, False, 0, 0), + h_traits('x', 'y', 'xs', 'ys', 1, 3, 1,1024, 1, True, False, True, False, 0, 0)], + '4096' :[ h_traits('x', 'y', 'xs', 'ys', 1, 2, 1, 256, 8, True, False, True, False, 0, 0), + h_traits('x', 'y', 'xs', 'ys', 1, 4, 1, 256, 4, True, False, True, False, 0, 0), + h_traits('x', 'y', 'xs', 'ys', 1, 2, 1,1024, 2, True, False, True, False, 0, 0), + h_traits('x', 'y', 'xs', 'ys', 1, 4, 1,1024, 1, True, False, True, False, 0, 0)], + '6144' :[ h_traits('x', 'y', 'xs', 'ys', 1, 3, 1, 256, 8, True, False, True, False, 0, 0), + h_traits('x', 'y', 'xs', 'ys', 1, 3, 1, 512, 4, True, False, True, False, 0, 0), + h_traits('x', 'y', 'xs', 'ys', 1, 3, 1,1024, 2, True, False, True, False, 0, 0), + h_traits('x', 'y', 'xs', 'ys', 1, 6, 1,1024, 1, True, False, True, False, 0, 0)], + '8192' :[ h_traits('x', 'y', 'xs', 'ys', 1, 4, 1, 256, 8, True, False, True, False, 0, 0), + h_traits('x', 'y', 'xs', 'ys', 1, 4, 1, 512, 4, True, False, True, False, 0, 0), + h_traits('x', 'y', 'xs', 'ys', 1, 4, 1,1024, 2, True, False, True, False, 0, 0), + h_traits('x', 'y', 'xs', 'ys', 1, 8, 1,1024, 1, True, False, True, False, 0, 0)], + 'big' :[ h_traits('x', 'y', 'xs', 'ys', 1, 2, 1, 256, 8, True, False, True, True, 0, 0), + h_traits('x', 'y', 'xs', 'ys', 1, 4, 1, 256, 4, True, False, True, True, 0, 0), + h_traits('x', 'y', 'xs', 'ys', 1, 2, 
1,1024, 2, True, False, True, True, 0, 0), + h_traits('x', 'y', 'xs', 'ys', 1, 4, 1,1024, 1, True, False, True, True, 0, 0)]} total_blob = list() for hs_key in h_trait_dict: hs = h_trait_dict[hs_key] diff --git a/include/ck_tile/ops/common/generic_2d_block_shape.hpp b/include/ck_tile/ops/common/generic_2d_block_shape.hpp index 64ad20c3b..c0bfd9319 100644 --- a/include/ck_tile/ops/common/generic_2d_block_shape.hpp +++ b/include/ck_tile/ops/common/generic_2d_block_shape.hpp @@ -38,9 +38,7 @@ namespace ck_tile { template typename WarpPerBlock_, // num warps along seq typename WarpTile_, // warp size, seq - typename Vector_, // contiguous pixels(vector size) along seq - index_t BlockSize_ = - warpSize* reduce_on_sequence(WarpPerBlock_{}, multiplies{}, number<1>{})> + typename Vector_> // contiguous pixels(vector size) along seq)> struct Generic2dBlockShape { // block size @@ -68,10 +66,12 @@ struct Generic2dBlockShape static_assert(Warp_M % Vector_M == 0); static_assert(Warp_N % Vector_N == 0); // num of threads along seq, within each warp - static constexpr index_t ThreadPerWarp_M = Warp_M / Vector_M; - static constexpr index_t ThreadPerWarp_N = Warp_N / Vector_N; + static constexpr index_t ThreadPerWarp_M = Warp_M / Vector_M; + static constexpr index_t ThreadPerWarp_N = Warp_N / Vector_N; + static constexpr index_t ThreadPerBlock_M = Block_M / Repeat_M / Vector_M; + static constexpr index_t ThreadPerBlock_N = Block_N / Repeat_N / Vector_N; - static constexpr index_t BlockSize = BlockSize_; + static constexpr index_t BlockSize = ThreadPerBlock_M * ThreadPerBlock_N; }; } // namespace ck_tile diff --git a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_default_policy.hpp b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_default_policy.hpp index 1de230c14..724f6261d 100644 --- a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_default_policy.hpp +++ 
b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_default_policy.hpp @@ -47,7 +47,8 @@ struct Layernorm2dFwdPipelineDefaultPolicy { using P_ = BlockWelfordProblem; + typename Problem::BlockShape, + Problem::Traits::kFastFDiv>; return BlockWelford{}; } @@ -57,7 +58,8 @@ struct Layernorm2dFwdPipelineDefaultPolicy { using P_ = BlockWelfordProblem; + typename Problem::BlockShape, + Problem::Traits::kFastFDiv>; return BlockWelfordSync{}; } @@ -67,7 +69,8 @@ struct Layernorm2dFwdPipelineDefaultPolicy { using P_ = BlockWelfordProblem; + typename Problem::BlockShape, + Problem::Traits::kFastFDiv>; return BlockWelfordCrossWarpSync{}; } @@ -79,7 +82,8 @@ struct Layernorm2dFwdPipelineDefaultPolicy { using P_ = BlockWelfordProblem; + typename Problem::BlockShape, + Problem::Traits::kFastFDiv>; using block_welford = BlockWelford; using x_block_tile = diff --git a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_one_pass.hpp b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_one_pass.hpp index 83cdab428..4b83ed4fb 100644 --- a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_one_pass.hpp +++ b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_one_pass.hpp @@ -36,6 +36,7 @@ struct Layernorm2dFwdPipelineOnePass static constexpr bool kNeedCrossWarpSync = Problem::kNeedCrossWarpSync; static constexpr bool kPadM = false; // TODO - BlockLayernorm2dFwdProblem::kPadM static constexpr bool kPadN = Problem::Traits::kPadN; + static constexpr bool kFastFDiv = Problem::Traits::kFastFDiv; static constexpr auto kFusedAdd = Problem::Traits::kFusedAdd; static constexpr auto kFusedQuant = Problem::Traits::kFusedQuant; @@ -125,7 +126,15 @@ struct Layernorm2dFwdPipelineOnePass // compute inv-std auto inv_std = tile_elementwise_in( [&](const auto& v_) { - return type_convert(1.0f) / (sqrt(v_ + epsilon)); + if(kFastFDiv && std::is_same_v) + { + return type_convert(1.0f) * + __builtin_amdgcn_rcpf(sqrt(v_ + epsilon)); 
+ } + else + { + return type_convert(1.0f) / sqrt(v_ + epsilon); + } }, var); diff --git a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_traits.hpp b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_traits.hpp index ed9e18be3..e8c22f8ab 100644 --- a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_traits.hpp +++ b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_traits.hpp @@ -39,6 +39,7 @@ template<> struct Layernorm2dFusedQuantEnumName @@ -46,6 +47,7 @@ struct Layernorm2dFwdTraits { static constexpr bool kPadN = kPadN_; static constexpr bool kSaveMeanInvStd = kSaveMeanInvStd_; + static constexpr bool kFastFDiv = kFastFDiv_; static constexpr bool kTwoPass = kTwoPass_; static constexpr Layernorm2dFusedAddEnum kFusedAdd = kFusedAdd_; static constexpr Layernorm2dFusedQuantEnum kFusedQuant = kFusedQuant_; diff --git a/include/ck_tile/ops/welford/block/block_welford.hpp b/include/ck_tile/ops/welford/block/block_welford.hpp index ce73c183e..968895e38 100644 --- a/include/ck_tile/ops/welford/block/block_welford.hpp +++ b/include/ck_tile/ops/welford/block/block_welford.hpp @@ -11,9 +11,10 @@ namespace ck_tile { template struct BlockWelford { - using Problem = remove_cvref_t; - using XDataType = typename Problem::XDataType; - using ComputeDataType = typename Problem::ComputeDataType; + using Problem = remove_cvref_t; + using XDataType = typename Problem::XDataType; + using ComputeDataType = typename Problem::ComputeDataType; + static constexpr bool kFastFDiv = Problem::kFastFDiv; CK_TILE_DEVICE constexpr BlockWelford() {} @@ -89,7 +90,8 @@ struct BlockWelford template struct BlockWelfordSync { - using Problem = remove_cvref_t; + using Problem = remove_cvref_t; + static constexpr bool kFastFDiv = Problem::kFastFDiv; template CK_TILE_DEVICE void @@ -173,8 +175,9 @@ struct BlockWelfordSync template struct BlockWelfordCrossWarpSync { - using Problem = remove_cvref_t; - using BlockShape = typename Problem::BlockShape; + using Problem = 
remove_cvref_t; + using BlockShape = typename Problem::BlockShape; + static constexpr bool kFastFDiv = Problem::kFastFDiv; template CK_TILE_DEVICE static constexpr index_t GetReduceWarps() @@ -351,12 +354,23 @@ CK_TILE_DEVICE constexpr index_t block_tile_welford_calculate_max_count(int row_ } // Note: this function must be called after all the computation -template +template CK_TILE_DEVICE constexpr void block_tile_welford_post_scale_var(VarDistributedTensor_& var_tensor, - int count) + int count, + bool_constant = {}) { using DataType = typename VarDistributedTensor_::DataType; - tile_elementwise_inout([&count](auto& x) { x = x / type_convert(count); }, - var_tensor); + tile_elementwise_inout( + [&count](auto& x) { + if(FastFdiv_ && std::is_same_v) + { + x = x * __builtin_amdgcn_rcpf(type_convert(count)); + } + else + { + x = x / type_convert(count); + } + }, + var_tensor); } } // namespace ck_tile diff --git a/include/ck_tile/ops/welford/block/block_welford_problem.hpp b/include/ck_tile/ops/welford/block/block_welford_problem.hpp index dcae1ef2e..bcbfb7d76 100644 --- a/include/ck_tile/ops/welford/block/block_welford_problem.hpp +++ b/include/ck_tile/ops/welford/block/block_welford_problem.hpp @@ -7,12 +7,13 @@ namespace ck_tile { -template +template struct BlockWelfordProblem { - using XDataType = remove_cvref_t; - using ComputeDataType = remove_cvref_t; - using BlockShape = remove_cvref_t; + using XDataType = remove_cvref_t; + using ComputeDataType = remove_cvref_t; + using BlockShape = remove_cvref_t; + static constexpr bool kFastFDiv = kFastFDiv_; }; } // namespace ck_tile diff --git a/include/ck_tile/ops/welford/thread/thread_welford.hpp b/include/ck_tile/ops/welford/thread/thread_welford.hpp index 4c61cdcf4..52b253e5f 100644 --- a/include/ck_tile/ops/welford/thread/thread_welford.hpp +++ b/include/ck_tile/ops/welford/thread/thread_welford.hpp @@ -7,25 +7,46 @@ namespace ck_tile { -template -CK_TILE_DEVICE void welford_update(T& mean, T& var, T x, int count) 
+template +CK_TILE_DEVICE void welford_update(T& mean, T& var, T x, int count, bool_constant = {}) { // TODO: check nan? maybe no T delta = x - mean; - mean += delta / count; + if(kFastFDiv && std::is_same_v) + { + mean += delta * __builtin_amdgcn_rcpf(count); + } + else + { + mean += delta / count; + } T delta2 = x - mean; var += delta * delta2; } -template -CK_TILE_DEVICE static void -welford_merge(T& mean_a, T& var_a, int& count_a, T mean_b, T var_b, int count_b) +template +CK_TILE_DEVICE static void welford_merge(T& mean_a, + T& var_a, + int& count_a, + T mean_b, + T var_b, + int count_b, + bool_constant = {}) { - int count = count_a + count_b; - T count_ = type_convert(count); - T count_a_ = type_convert(count_a); - T count_b_ = type_convert(count_b); - T count_b_over_count = count == 0 ? type_convert(0) : count_b_ / count_; + int count = count_a + count_b; + T count_ = type_convert(count); + T count_a_ = type_convert(count_a); + T count_b_ = type_convert(count_b); + T count_b_over_count; + if(kFastFDiv && std::is_same_v) + { + count_b_over_count = + count == 0 ? type_convert(0) : count_b_ * __builtin_amdgcn_rcpf(count_); + } + else + { + count_b_over_count = count == 0 ? 
type_convert(0) : count_b_ / count_; + } T delta = mean_b - mean_a; mean_a += delta * count_b_over_count; -- GitLab From ea3640fdea4b11178c1657feff4849ad011e5d26 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bart=C5=82omiej=20Kocot?= Date: Fri, 8 Nov 2024 10:04:33 +0100 Subject: [PATCH 045/153] Add generic instances for two stage conv bwd wei (#1643) * Add generic instances for two stage conv bwd wei * Update layout prefix --- ...conv_bwd_weight_two_stage_xdl_instance.hpp | 76 ++++++++++++- .../grouped_convolution_backward_weight.hpp | 16 +++ ...rouped_convolution_backward_weight_xdl.inc | 100 ++++++++++++++++++ .../grouped_conv2d_bwd_weight/CMakeLists.txt | 4 + ...ngchw_gkyxc_ngkhw_bf16_pipev1_instance.cpp | 41 +++++++ ..._ngchw_gkyxc_ngkhw_f16_pipev1_instance.cpp | 41 +++++++ ...nhwgc_gkyxc_nhwgk_bf16_pipev1_instance.cpp | 41 +++++++ ...nhwgc_gkyxc_nhwgk_bf16_pipev2_instance.cpp | 2 +- ...nhwgc_gkyxc_nhwgk_bf16_pipev5_instance.cpp | 2 +- ..._nhwgc_gkyxc_nhwgk_f16_pipev1_instance.cpp | 41 +++++++ ..._nhwgc_gkyxc_nhwgk_f16_pipev2_instance.cpp | 2 +- ..._nhwgc_gkyxc_nhwgk_f16_pipev5_instance.cpp | 2 +- .../grouped_conv3d_bwd_weight/CMakeLists.txt | 4 + ...wgc_gkzyxc_ndhwgk_bf16_pipev1_instance.cpp | 41 +++++++ ...wgc_gkzyxc_ndhwgk_bf16_pipev2_instance.cpp | 2 +- ...wgc_gkzyxc_ndhwgk_bf16_pipev5_instance.cpp | 2 +- ...hwgc_gkzyxc_ndhwgk_f16_pipev1_instance.cpp | 41 +++++++ ...hwgc_gkzyxc_ndhwgk_f16_pipev2_instance.cpp | 2 +- ...hwgc_gkzyxc_ndhwgk_f16_pipev5_instance.cpp | 2 +- ...dhw_gkzyxc_ngkdhw_bf16_pipev1_instance.cpp | 41 +++++++ ...cdhw_gkzyxc_ngkdhw_f16_pipev1_instance.cpp | 41 +++++++ 21 files changed, 534 insertions(+), 10 deletions(-) create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_bf16_pipev1_instance.cpp create mode 100644 
library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_f16_pipev1_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev1_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev1_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev1_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev1_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_pipev1_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_f16_pipev1_instance.cpp diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp index 5f6c340e4..d82f82cce 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp @@ -39,7 +39,25 @@ template -using device_grouped_conv_bwd_weight_two_stage_xdl_c_shuffle_f16_instances = std::tuple< +using 
device_grouped_conv_bwd_weight_two_stage_nhwgc_xdl_c_shuffle_f16_generic_instances = + std::tuple< + // clang-format off + //#########################################| Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| AccData| In| Wei| Out| ConvBackward| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransfer| CBlockTransfer| BlockGemm| BlockGemm| NumGroups| + //#########################################| Dim| | | | Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Weight| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| ClusterLengths| ScalarPerVector| Pipeline| Pipeline| ToMerge| + //#########################################| Spatial| | | | | | | | Operation| Operation| Operation| Specialization| | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| MBlock_MPerBlock| NWaveNPerXdl| Scheduler| Version| | + //#########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | NBlock_NPerBlock| | | | | + DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 64, 16, 16, 32, 8, 16, 16, 1, 1, S<4, 8, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 1, 4, false, S<4, 8, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 1, 4, false, 1, 1, S<1, 8, 1, 8>, 1, Scheduler, PipelineVersion, 1> + // clang-format on + >; + +template +using 
device_grouped_conv_bwd_weight_two_stage_nhwgc_xdl_c_shuffle_f16_instances = std::tuple< // clang-format off //#########################################| Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| AccData| In| Wei| Out| ConvBackward| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransfer| CBlockTransfer| BlockGemm| BlockGemm| NumGroups| //#########################################| Dim| | | | Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Weight| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| ClusterLengths| ScalarPerVector| Pipeline| Pipeline| ToMerge| @@ -64,7 +82,25 @@ template -using device_grouped_conv_bwd_weight_two_stage_xdl_c_shuffle_bf16_instances = std::tuple< +using device_grouped_conv_bwd_weight_two_stage_nhwgc_xdl_c_shuffle_bf16_generic_instances = + std::tuple< + // clang-format off + //#########################################| Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| AccData| In| Wei| Out| ConvBackward| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransfer| CBlockTransfer| BlockGemm| BlockGemm| NumGroups| + //#########################################| Dim| | | | Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Weight| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| 
ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| ClusterLengths| ScalarPerVector| Pipeline| Pipeline| ToMerge| + //#########################################| Spatial| | | | | | | | Operation| Operation| Operation| Specialization| | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| MBlock_MPerBlock| NWaveNPerXdl| Scheduler| Version| | + //#########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | NBlock_NPerBlock| | | | | + DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 64, 16, 16, 32, 8, 16, 16, 1, 1, S<4, 8, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 1, 4, false, S<4, 8, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 1, 4, false, 1, 1, S<1, 8, 1, 8>, 1, Scheduler, PipelineVersion, 1> + // clang-format on + >; + +template +using device_grouped_conv_bwd_weight_two_stage_nhwgc_xdl_c_shuffle_bf16_instances = std::tuple< // clang-format off //#########################################| Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| AccData| In| Wei| Out| ConvBackward| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransfer| CBlockTransfer| BlockGemm| BlockGemm| NumGroups| //#########################################| Dim| | | | Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Weight| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| 
SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| ClusterLengths| ScalarPerVector| Pipeline| Pipeline| ToMerge| @@ -82,6 +118,24 @@ using device_grouped_conv_bwd_weight_two_stage_xdl_c_shuffle_bf16_instances = st // clang-format on >; +template +using device_grouped_conv_bwd_weight_two_stage_ngchw_xdl_c_shuffle_f16_generic_instances = + std::tuple< + // clang-format off + //#########################################| Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| AccData| In| Wei| Out| ConvBackward| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransfer| CBlockTransfer| BlockGemm| BlockGemm| NumGroups| + //#########################################| Dim| | | | Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Weight| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| ClusterLengths| ScalarPerVector| Pipeline| Pipeline| ToMerge| + //#########################################| Spatial| | | | | | | | Operation| Operation| Operation| Specialization| | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| MBlock_MPerBlock| NWaveNPerXdl| Scheduler| Version| | + //#########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | NBlock_NPerBlock| | | | | + DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial, ALayout, 
BLayout, ELayout, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 64, 16, 16, 32, 8, 16, 16, 1, 1, S<4, 8, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 1, 4, false, S<4, 8, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 1, 4, false, 1, 1, S<1, 8, 1, 8>, 1, Scheduler, PipelineVersion, 1, F16, F16, 1, 1> + // clang-format on + >; + // NGCHW requires transpose, we use vector loads and stores params for them template ; +template +using device_grouped_conv_bwd_weight_two_stage_ngchw_xdl_c_shuffle_bf16_generic_instances = + std::tuple< + // clang-format off + //#########################################| Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| AccData| In| Wei| Out| ConvBackward| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransfer| CBlockTransfer| BlockGemm| BlockGemm| NumGroups| + //#########################################| Dim| | | | Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Weight| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| ClusterLengths| ScalarPerVector| Pipeline| Pipeline| ToMerge| + //#########################################| Spatial| | | | | | | | Operation| Operation| Operation| Specialization| | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| MBlock_MPerBlock| NWaveNPerXdl| Scheduler| Version| | + //#########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | NBlock_NPerBlock| | | | | + 
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 64, 16, 16, 32, 8, 16, 16, 1, 1, S<4, 8, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 1, 4, false, S<4, 8, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 1, 4, false, 1, 1, S<1, 8, 1, 8>, 1, Scheduler, PipelineVersion, 1, BF16, BF16, 1, 1> + // clang-format on + >; + template && is_same_v && is_same_v) { + add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_f16_pipev1_instances( + op_ptrs); add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_f16_pipev2_instances( op_ptrs); add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_f16_pipev5_instances( @@ -403,6 +409,8 @@ struct DeviceOperationInstanceFactory && is_same_v) { + add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_bf16_pipev1_instances( + op_ptrs); add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_bf16_pipev2_instances( op_ptrs); add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_bf16_pipev5_instances( @@ -464,6 +472,8 @@ struct DeviceOperationInstanceFactory && is_same_v && is_same_v) { + add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_f16_pipev1_instances( + op_ptrs); add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_f16_pipev2_instances( op_ptrs); add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_f16_pipev5_instances( @@ -524,6 +538,8 @@ struct DeviceOperationInstanceFactory && is_same_v) { + add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_pipev1_instances( + op_ptrs); add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_pipev2_instances( op_ptrs); add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_pipev5_instances( diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight_xdl.inc 
b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight_xdl.inc index 132dde81a..630eb8135 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight_xdl.inc +++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight_xdl.inc @@ -113,6 +113,18 @@ void add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_f32_bf16_in PassThrough, PassThrough>>>& instances); +void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev1_instances( + std::vector>>& instances); + void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev2_instances( std::vector>>& instances); + +void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_bf16_pipev1_instances( + std::vector>>& instances); + void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_bf16_pipev2_instances( std::vector>>& instances); +void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev1_instances( + std::vector>>& instances); + void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev2_instances( std::vector>>& instances); + +void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_f16_pipev1_instances( + std::vector>>& instances); + void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_f16_pipev2_instances( std::vector>>& instances); +void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev1_instances( + std::vector>>& instances); + void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev2_instances( std::vector>>& instances); + +void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_pipev1_instances( + std::vector>>& instances); + void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_pipev2_instances( 
std::vector>>& instances); +void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev1_instances( + std::vector>>& instances); + void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev2_instances( std::vector>>& instances); + +void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_f16_pipev1_instances( + std::vector>>& instances); + void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_f16_pipev2_instances( std::vector>>& instances) +{ + // 1. Default + add_device_operation_instances( + instances, + device_grouped_conv_bwd_weight_two_stage_ngchw_xdl_c_shuffle_bf16_generic_instances< + 2, + NGCHW, + GKYXC, + NGKHW, + ConvBwdWeightDefault, + BlockGemmPipelineScheduler::Intrawave, + BlockGemmPipelineVersion::v1>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_f16_pipev1_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_f16_pipev1_instance.cpp new file mode 100644 index 000000000..d70c95bf6 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_f16_pipev1_instance.cpp @@ -0,0 +1,41 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_f16_pipev1_instances( + std::vector>>& instances) +{ + // 1. Default + add_device_operation_instances( + instances, + device_grouped_conv_bwd_weight_two_stage_ngchw_xdl_c_shuffle_f16_generic_instances< + 2, + NGCHW, + GKYXC, + NGKHW, + ConvBwdWeightDefault, + BlockGemmPipelineScheduler::Intrawave, + BlockGemmPipelineVersion::v1>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev1_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev1_instance.cpp new file mode 100644 index 000000000..74ccc4c89 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev1_instance.cpp @@ -0,0 +1,41 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev1_instances( + std::vector>>& instances) +{ + // 1. Default + add_device_operation_instances( + instances, + device_grouped_conv_bwd_weight_two_stage_nhwgc_xdl_c_shuffle_bf16_generic_instances< + 2, + NHWGC, + GKYXC, + NHWGK, + ConvBwdWeightDefault, + BlockGemmPipelineScheduler::Intrawave, + BlockGemmPipelineVersion::v1>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev2_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev2_instance.cpp index 0e4d085de..fab289855 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev2_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev2_instance.cpp @@ -25,7 +25,7 @@ void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_p // 1. 
Default add_device_operation_instances( instances, - device_grouped_conv_bwd_weight_two_stage_xdl_c_shuffle_bf16_instances< + device_grouped_conv_bwd_weight_two_stage_nhwgc_xdl_c_shuffle_bf16_instances< 2, NHWGC, GKYXC, diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev5_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev5_instance.cpp index 680494cfd..407645e89 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev5_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev5_instance.cpp @@ -25,7 +25,7 @@ void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_p // 1. Default add_device_operation_instances( instances, - device_grouped_conv_bwd_weight_two_stage_xdl_c_shuffle_bf16_instances< + device_grouped_conv_bwd_weight_two_stage_nhwgc_xdl_c_shuffle_bf16_instances< 2, NHWGC, GKYXC, diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev1_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev1_instance.cpp new file mode 100644 index 000000000..807de66ca --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev1_instance.cpp @@ -0,0 +1,41 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev1_instances( + std::vector>>& instances) +{ + // 1. Default + add_device_operation_instances( + instances, + device_grouped_conv_bwd_weight_two_stage_nhwgc_xdl_c_shuffle_f16_generic_instances< + 2, + NHWGC, + GKYXC, + NHWGK, + ConvBwdWeightDefault, + BlockGemmPipelineScheduler::Intrawave, + BlockGemmPipelineVersion::v1>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev2_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev2_instance.cpp index 15401f0e1..084c83cd6 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev2_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev2_instance.cpp @@ -25,7 +25,7 @@ void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pi // 1. 
Default add_device_operation_instances( instances, - device_grouped_conv_bwd_weight_two_stage_xdl_c_shuffle_f16_instances< + device_grouped_conv_bwd_weight_two_stage_nhwgc_xdl_c_shuffle_f16_instances< 2, NHWGC, GKYXC, diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev5_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev5_instance.cpp index 398c14b11..d174e5b6c 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev5_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev5_instance.cpp @@ -25,7 +25,7 @@ void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pi // 1. 
Default add_device_operation_instances( instances, - device_grouped_conv_bwd_weight_two_stage_xdl_c_shuffle_f16_instances< + device_grouped_conv_bwd_weight_two_stage_nhwgc_xdl_c_shuffle_f16_instances< 2, NHWGC, GKYXC, diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/CMakeLists.txt index c8c30897c..cf4e323bf 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/CMakeLists.txt @@ -15,6 +15,10 @@ set(GROUPED_CONV3D_BWD_WEIGHT xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev5_instance.cpp xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_pipev2_instance.cpp xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_pipev5_instance.cpp + xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev1_instance.cpp + xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_f16_pipev1_instance.cpp + xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev1_instance.cpp + xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_pipev1_instance.cpp ) if(DL_KERNELS) diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev1_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev1_instance.cpp new file mode 100644 index 000000000..63249a1c1 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev1_instance.cpp @@ -0,0 +1,41 @@ +// SPDX-License-Identifier: MIT +// 
Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// Compilation parameters for in[n, di, hi, wi, g, c] * wei[g, k, z, y, x, c] = out[n, do, ho, wo, g, k] +void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev1_instances( + std::vector>>& instances) +{ + // 1. Default + add_device_operation_instances( + instances, + device_grouped_conv_bwd_weight_two_stage_nhwgc_xdl_c_shuffle_bf16_generic_instances< + 3, + NDHWGC, + GKZYXC, + NDHWGK, + ConvBwdWeightDefault, + BlockGemmPipelineScheduler::Intrawave, + BlockGemmPipelineVersion::v1>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev2_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev2_instance.cpp index 549716586..7841ddad9 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev2_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev2_instance.cpp @@ -25,7 +25,7 @@ void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf1 // 1.
Default add_device_operation_instances( instances, - device_grouped_conv_bwd_weight_two_stage_xdl_c_shuffle_bf16_instances< + device_grouped_conv_bwd_weight_two_stage_nhwgc_xdl_c_shuffle_bf16_instances< 3, NDHWGC, GKZYXC, diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev5_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev5_instance.cpp index 18a00c6ea..ba6285a38 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev5_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev5_instance.cpp @@ -25,7 +25,7 @@ void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf1 // 1. Default add_device_operation_instances( instances, - device_grouped_conv_bwd_weight_two_stage_xdl_c_shuffle_bf16_instances< + device_grouped_conv_bwd_weight_two_stage_nhwgc_xdl_c_shuffle_bf16_instances< 3, NDHWGC, GKZYXC, diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev1_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev1_instance.cpp new file mode 100644 index 000000000..a8fbefb5b --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev1_instance.cpp @@ -0,0 +1,41 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// Compilation parameters for in[n, di, hi, wi, g, c] * wei[g, k, z, y, x, c] = out[n, do, ho, wo, g, k] +void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev1_instances( + std::vector>>& instances) +{ + // 1. Default + add_device_operation_instances( + instances, + device_grouped_conv_bwd_weight_two_stage_nhwgc_xdl_c_shuffle_f16_generic_instances< + 3, + NDHWGC, + GKZYXC, + NDHWGK, + ConvBwdWeightDefault, + BlockGemmPipelineScheduler::Intrawave, + BlockGemmPipelineVersion::v1>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev2_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev2_instance.cpp index 4d0f1e68c..e4baafc0b 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev2_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev2_instance.cpp @@ -25,7 +25,7 @@ void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16 // 1.
Default add_device_operation_instances( instances, - device_grouped_conv_bwd_weight_two_stage_xdl_c_shuffle_f16_instances< + device_grouped_conv_bwd_weight_two_stage_nhwgc_xdl_c_shuffle_f16_instances< 3, NDHWGC, GKZYXC, diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev5_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev5_instance.cpp index c5cc062f2..f9bc5b134 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev5_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev5_instance.cpp @@ -25,7 +25,7 @@ void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16 // 1. Default add_device_operation_instances( instances, - device_grouped_conv_bwd_weight_two_stage_xdl_c_shuffle_f16_instances< + device_grouped_conv_bwd_weight_two_stage_nhwgc_xdl_c_shuffle_f16_instances< 3, NDHWGC, GKZYXC, diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_pipev1_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_pipev1_instance.cpp new file mode 100644 index 000000000..16221eb3e --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_pipev1_instance.cpp @@ -0,0 +1,41 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// Compilation parameters for in[n, g, c, di, hi, wi] * wei[g, k, z, y, x, c] = out[n, g, k, do, ho, wo] +void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_pipev1_instances( + std::vector>>& instances) +{ + // 1. Default + add_device_operation_instances( + instances, + device_grouped_conv_bwd_weight_two_stage_ngchw_xdl_c_shuffle_bf16_generic_instances< + 3, + NGCDHW, + GKZYXC, + NGKDHW, + ConvBwdWeightDefault, + BlockGemmPipelineScheduler::Intrawave, + BlockGemmPipelineVersion::v1>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_f16_pipev1_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_f16_pipev1_instance.cpp new file mode 100644 index 000000000..126e90f2c --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_f16_pipev1_instance.cpp @@ -0,0 +1,41 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+ +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// Compilation parameters for in[n, g, c, di, hi, wi] * wei[g, k, z, y, x, c] = out[n, g, k, do, ho, wo] +void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_f16_pipev1_instances( + std::vector>>& instances) +{ + // 1. Default + add_device_operation_instances( + instances, + device_grouped_conv_bwd_weight_two_stage_ngchw_xdl_c_shuffle_f16_generic_instances< + 3, + NGCDHW, + GKZYXC, + NGKDHW, + ConvBwdWeightDefault, + BlockGemmPipelineScheduler::Intrawave, + BlockGemmPipelineVersion::v1>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck -- GitLab From af9546d9f4dba6945e23e1c346f92678f0f208f9 Mon Sep 17 00:00:00 2001 From: Po Yen Chen Date: Sat, 9 Nov 2024 09:55:14 +0800 Subject: [PATCH 046/153] Fix 'sh' command compatibility of smoke_test_fwd.sh (#1553) --- .../ck_tile/01_fmha/script/smoke_test_fwd.sh | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/example/ck_tile/01_fmha/script/smoke_test_fwd.sh b/example/ck_tile/01_fmha/script/smoke_test_fwd.sh index 5dcc6ed42..b867cd6c0 100755 --- a/example/ck_tile/01_fmha/script/smoke_test_fwd.sh +++ b/example/ck_tile/01_fmha/script/smoke_test_fwd.sh @@ -29,14 +29,14 @@ while getopts ":sa" opt; do done run_fp16_bf16_tests() { - local NUM_SPLITS=(1) - local PAGE_BLOCK_SIZE=(0) - local CACHE_BATCH_IDX=(0) + local NUM_SPLITS="1" + local PAGE_BLOCK_SIZE="0" + local CACHE_BATCH_IDX="0" if [ $TEST_SPLITKV -eq 1 ] ; then - NUM_SPLITS+=(2 3) - PAGE_BLOCK_SIZE+=(128) - CACHE_BATCH_IDX+=(1) + NUM_SPLITS="$NUM_SPLITS 2 3" + PAGE_BLOCK_SIZE="$PAGE_BLOCK_SIZE 128" + CACHE_BATCH_IDX="$CACHE_BATCH_IDX 1" fi for prec in "fp16"
"bf16" ; do @@ -47,9 +47,9 @@ run_fp16_bf16_tests() { for lse in 0 1 ; do for bias in "n" "e" "a" ; do for p_drop in 0.0 0.2 ; do - for num_splits in "${NUM_SPLITS[@]}" ; do - for page_block_size in "${PAGE_BLOCK_SIZE[@]}" ; do - for cache_batch_idx in "${CACHE_BATCH_IDX[@]}" ; do + for num_splits in $NUM_SPLITS ; do + for page_block_size in $PAGE_BLOCK_SIZE ; do + for cache_batch_idx in $CACHE_BATCH_IDX ; do # $EXE -prec=$prec -mode=$mode -b=1 -h=1 -d=$hdim -s=1024 -bias=$bias -p_drop=$p_drop -lse=$lse -iperm=$perm -operm=$perm -vlayout=$vlayout -num_splits=$num_splits -page_block_size=$page_block_size -kname=$KNAME $COMMON_ARGS $EXE -prec=$prec -mode=$mode -b=2 -h=2 -h_k=1 -d=16, -d_v=$hdim -s=55 -s_k=256 -bias=$bias -p_drop=$p_drop -lse=$lse -iperm=$perm -operm=$perm -vlayout=$vlayout -num_splits=$num_splits -page_block_size=$page_block_size -cache_batch_idx=$cache_batch_idx -kname=$KNAME $COMMON_ARGS @@ -103,4 +103,4 @@ if [ $TEST_APPENDKV -eq 1 ] ; then run_fp16_appendkv_tests fi -set +x \ No newline at end of file +set +x -- GitLab From bec6fbc65fe766ab23fe563675703defdb0dd2be Mon Sep 17 00:00:00 2001 From: dummycoderfe Date: Sat, 9 Nov 2024 17:57:27 +0800 Subject: [PATCH 047/153] Ck tile/moe sorting (#1624) * add moe_sorting & check ok * fix comments & typo * Run remod.py under include/ck_tile & example/ck_tile directories * format codes * fix output ci check bug * fix moe sorting readme and error commit file * use magiv div to accelerate compute * add an loop unroll for moe lds ops * add extblocksnel to set zeros for moebufs * [Ck_tile] moe set zero run ok, add size check and fix ref check * [Ck_tile]fix moe_sorting fuse set_zero remod * [Ck_tile] change name style, fix zero buffer size err, change folder * [Ck_tile] moe_sorting: fix name style * [Ck_tile] moe_sorting, remove useless params in traits * [Ck_tile] change outputtile cnt * unit_size; change output buf alloc --------- Co-authored-by: dummycoderfe Co-authored-by: Po Yen, Chen Co-authored-by: 
carlushuang --- example/ck_tile/13_moe_sorting/CMakeLists.txt | 8 + example/ck_tile/13_moe_sorting/README.md | 27 ++ .../ck_tile/13_moe_sorting/moe_sorting.cpp | 223 +++++++++++++++++ .../13_moe_sorting/moe_sorting_api.cpp | 73 ++++++ .../13_moe_sorting/moe_sorting_api.hpp | 20 ++ .../13_moe_sorting/script/smoke_test.sh | 19 ++ example/ck_tile/CMakeLists.txt | 1 + include/ck_tile/host.hpp | 1 + .../host/reference/reference_moe_sorting.hpp | 78 ++++++ .../fused_moe/kernel/moe_sorting_kernel.hpp | 232 ++++++++++++++++++ .../pipeline/moe_sorting_pipeline.hpp | 39 +++ .../fused_moe/pipeline/moe_sorting_policy.hpp | 15 ++ .../pipeline/moe_sorting_problem.hpp | 23 ++ include/ck_tile/ops/moe_sorting.hpp | 11 + 14 files changed, 770 insertions(+) create mode 100644 example/ck_tile/13_moe_sorting/CMakeLists.txt create mode 100644 example/ck_tile/13_moe_sorting/README.md create mode 100644 example/ck_tile/13_moe_sorting/moe_sorting.cpp create mode 100644 example/ck_tile/13_moe_sorting/moe_sorting_api.cpp create mode 100644 example/ck_tile/13_moe_sorting/moe_sorting_api.hpp create mode 100644 example/ck_tile/13_moe_sorting/script/smoke_test.sh create mode 100644 include/ck_tile/host/reference/reference_moe_sorting.hpp create mode 100644 include/ck_tile/ops/fused_moe/kernel/moe_sorting_kernel.hpp create mode 100644 include/ck_tile/ops/fused_moe/pipeline/moe_sorting_pipeline.hpp create mode 100644 include/ck_tile/ops/fused_moe/pipeline/moe_sorting_policy.hpp create mode 100644 include/ck_tile/ops/fused_moe/pipeline/moe_sorting_problem.hpp create mode 100644 include/ck_tile/ops/moe_sorting.hpp diff --git a/example/ck_tile/13_moe_sorting/CMakeLists.txt b/example/ck_tile/13_moe_sorting/CMakeLists.txt new file mode 100644 index 000000000..09f3e4ac4 --- /dev/null +++ b/example/ck_tile/13_moe_sorting/CMakeLists.txt @@ -0,0 +1,8 @@ +add_executable(tile_example_moe_sorting EXCLUDE_FROM_ALL moe_sorting.cpp moe_sorting_api.cpp) +target_include_directories(tile_example_moe_sorting PRIVATE 
${CMAKE_CURRENT_SOURCE_DIR}/) + +set(EXAMPLE_MOE_SORTING_COMPILE_OPTIONS) +# NOTE: we turn off undefined-func-template to let source compile without explicit declare function specializations +list(APPEND EXAMPLE_MOE_SORTING_COMPILE_OPTIONS -Wno-undefined-func-template -Wno-float-equal) +# list(APPEND EXAMPLE_MOE_SORTING_COMPILE_OPTIONS -v --save-temps -Wno-gnu-line-marker) +target_compile_options(tile_example_moe_sorting PRIVATE ${EXAMPLE_MOE_SORTING_COMPILE_OPTIONS}) diff --git a/example/ck_tile/13_moe_sorting/README.md b/example/ck_tile/13_moe_sorting/README.md new file mode 100644 index 000000000..7b6792dd9 --- /dev/null +++ b/example/ck_tile/13_moe_sorting/README.md @@ -0,0 +1,27 @@ +# moe-sorting + +This folder contains an example of the moe-sorting kernel using the ck_tile tile-programming implementation. This kernel is often used in MoE models, before launching the fused-moe-gemm block. The input&weight is a `token*topk` 2d matrix. The op rearranges the input weight ids by expert and feeds them into the fused-moe-gemm kernel. + +## build +``` +# in the root of ck_tile +mkdir build && cd build +sh ../script/cmake-ck-dev.sh ../ # you can replace this to gfx90a, gfx942... +make tile_example_moe_sorting -j +``` +This will result in an executable `build/bin/tile_example_moe_sorting` + +## example +``` +args: + -v whether do CPU validation or not (default:1) + -pr_i index data type.
(currently only int32 supported now) (default:int32) + -pr_w output weight data type(currently only fp32 supported now) (default:fp32) + -t number of input tokens (default:128) + -e number of experts (default:8) + -k topk (default:4) + -st_i row stride of input, -1 means same as experts (default:-1) + -seed seed to be used, -1 means random every time (default:-1) + -kname when set to 1 it will print kernel name (default:0) + +``` diff --git a/example/ck_tile/13_moe_sorting/moe_sorting.cpp b/example/ck_tile/13_moe_sorting/moe_sorting.cpp new file mode 100644 index 000000000..d2c4df105 --- /dev/null +++ b/example/ck_tile/13_moe_sorting/moe_sorting.cpp @@ -0,0 +1,223 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ck_tile/core.hpp" +#include "ck_tile/ops/reduce.hpp" +#include "moe_sorting_api.hpp" + +auto create_args(int argc, char* argv[]) +{ + ck_tile::ArgParser arg_parser; + arg_parser.insert("v", "1", "weather do CPU validation or not") + .insert("pr_i", "int32", "index data type.
(currently only int32 supported now)") + .insert("pr_w", "fp32", "output weight data type(currently only fp32 supported now)") + .insert("t", "128", "number of input tokens") + .insert("e", "8", "number of num_experts") + .insert("k", "4", "topk") + .insert("unit", "32", "unit_size") + .insert("moe_buf_size", "0", "moe_buf_size") + .insert("seed", "-1", "seed to be used, -1 means random every time") + .insert("kname", "0", "when set to 1 it will print kernel name") + .insert("warmup", "5", "number of iterations before benchmark the kernel") + .insert("repeat", "20", "number of iterations to benchmark the kernel"); + + bool result = arg_parser.parse(argc, argv); + return std::make_tuple(result, arg_parser); +} + +template +void topid_unique_gen( + std::vector& host_tensor, int tokens, int topk, int num_expert, int seed) +{ + size_t total_size = topk * tokens; + std::srand(seed); + std::set unique_set; + IndexType current_v; + for(size_t i = 0; i < total_size; i++) + { + if(i % topk == 0) + { + unique_set.clear(); + } + current_v = std::rand() % num_expert; + while(unique_set.find(current_v) != unique_set.end()) + { + current_v = std::rand() % num_expert; + } + unique_set.insert(current_v); + host_tensor[i] = current_v; + } +} + +template +bool test_moe_sorting(ck_tile::ArgParser args) +{ + int validate = args.get_int("v"); + std::string index_prec = args.get_str("pr_i"); + std::string weight_prec = args.get_str("pr_w"); + int tokens = args.get_int("t"); + int num_experts = args.get_int("e"); + int topk = args.get_int("k"); + int seed = args.get_int("seed"); + int unit_size = args.get_int("unit"); + int moe_buf_size = args.get_int("moe_buf_size"); + int kname = args.get_int("kname"); + int warmup = args.get_int("warmup"); + int repeat = args.get_int("repeat"); + int max_output_ids = + ck_tile::integer_least_multiple(topk * tokens + num_experts * unit_size - topk, unit_size); + + if(seed < 0) + { + seed = std::time(nullptr); + } + + if(topk > num_experts) + { + 
printf("topk:%d value should be smaller than, or equal to number of num_experts:%d\n", + topk, + num_experts); + return false; + } + + // tokens already considered batch size + ck_tile::HostTensor topk_ids_host({tokens, topk}, {topk, 1}); + ck_tile::HostTensor weights_host({tokens, topk}, {topk, 1}); + ck_tile::HostTensor sorted_ids_host({max_output_ids}, {1}); + ck_tile::HostTensor sorted_weights_host({max_output_ids}, {1}); + ck_tile::HostTensor sorted_expert_ids_host({max_output_ids / unit_size}, {1}); + ck_tile::HostTensor sorted_id_cnt_host({1}, {1}); + ck_tile::HostTensor moe_buf_host({moe_buf_size}); + + ck_tile::FillUniformDistribution{-.5f, .5f}(weights_host); + ck_tile::FillUniformDistribution{-.5f, .5f}(moe_buf_host); + topid_unique_gen(topk_ids_host.mData, tokens, topk, num_experts, seed); + + ck_tile::DeviceMem topk_ids_dev(topk_ids_host.get_element_space_size_in_bytes()); + ck_tile::DeviceMem weights_dev(weights_host.get_element_space_size_in_bytes()); + ck_tile::DeviceMem sorted_ids_dev(sorted_ids_host.get_element_space_size_in_bytes()); + ck_tile::DeviceMem sorted_weights_dev(sorted_weights_host.get_element_space_size_in_bytes()); + ck_tile::DeviceMem sorted_expert_ids_dev( + sorted_expert_ids_host.get_element_space_size_in_bytes()); + ck_tile::DeviceMem sorted_id_cnt_dev(sorted_id_cnt_host.get_element_space_size_in_bytes()); + ck_tile::DeviceMem moe_buf_dev(moe_buf_host.get_element_space_size_in_bytes()); + + topk_ids_dev.ToDevice(topk_ids_host.data()); + weights_dev.ToDevice(weights_host.data()); + if(moe_buf_size > 0) + { + moe_buf_dev.ToDevice(moe_buf_host.data()); + } + + moe_sorting_trait trait{index_prec, weight_prec}; + + moe_sorting_args karg{topk_ids_dev.GetDeviceBuffer(), + weights_dev.GetDeviceBuffer(), + sorted_ids_dev.GetDeviceBuffer(), + sorted_weights_dev.GetDeviceBuffer(), + sorted_expert_ids_dev.GetDeviceBuffer(), + sorted_id_cnt_dev.GetDeviceBuffer(), + moe_buf_size > 0 ? 
moe_buf_dev.GetDeviceBuffer() : nullptr, + tokens, + unit_size, + num_experts, + topk, + static_cast(moe_buf_size * sizeof(float))}; + + ck_tile::stream_config sc{nullptr, + true, + /* log_level = */ (kname ? 1 : 0), + warmup, + repeat}; + auto ms = moe_sorting(trait, karg, sc); + printf("[%s|%s]tokens:%d, num_experts:%d, topk:%d, ms:%f , ", + index_prec.c_str(), + weight_prec.c_str(), + tokens, + num_experts, + topk, + ms); + if(ms < 0) + printf("not supported\n"); + fflush(stdout); + if(ms < 0) + { + return false; + } + + sorted_ids_dev.FromDevice(sorted_ids_host.data()); + sorted_weights_dev.FromDevice(sorted_weights_host.data()); + sorted_expert_ids_dev.FromDevice(sorted_expert_ids_host.data()); + sorted_id_cnt_dev.FromDevice(sorted_id_cnt_host.data()); + if(moe_buf_size > 0) + { + moe_buf_dev.FromDevice(moe_buf_host.data()); + } + + bool rtn = true; + if(validate) + { + ck_tile::HostTensor sorted_ids_ref({max_output_ids}, {1}); + ck_tile::HostTensor sorted_weights_ref({max_output_ids}, {1}); + ck_tile::HostTensor sorted_expert_ids_ref({max_output_ids / unit_size}, {1}); + + int32_t ref_total_tokens_post_pad = 0; + ck_tile::reference_moe_sorting(topk_ids_host, + weights_host, + sorted_ids_ref, + sorted_weights_ref, + sorted_expert_ids_ref, + ref_total_tokens_post_pad, + num_experts, + unit_size); + rtn &= ck_tile::check_err( + sorted_ids_host, sorted_ids_ref, std::string("OUT Error: Incorrect ids!"), 1e-6, 1e-6); + rtn &= ck_tile::check_err(sorted_weights_host, + sorted_weights_ref, + std::string("OUT Error: Incorrect w!"), + 1e-6, + 1e-6); + rtn &= ck_tile::check_err(sorted_expert_ids_host, + sorted_expert_ids_ref, + std::string("OUT Error: Incorrect eid!"), + 1e-6, + 1e-6); + if(moe_buf_size) + { + ck_tile::HostTensor moe_buf_ref({moe_buf_size}); + rtn &= ck_tile::check_err( + moe_buf_host, moe_buf_ref, std::string("OUT Error: Incorrect zero buf!"), 0, 0); + } + rtn &= ref_total_tokens_post_pad == sorted_id_cnt_host.mData[0]; + } + + printf("valid:%s\n", rtn 
? "y" : "n"); + fflush(stdout); + return rtn; +} + +int main(int argc, char** argv) +{ + auto [result, args] = create_args(argc, argv); + if(!result) + return -1; + std::string index_prec = args.get_str("pr_i"); + std::string weight_prec = args.get_str("pr_w"); + + bool r = true; + if(weight_prec.compare("fp32") == 0 && index_prec.compare("int32") == 0) + { + r &= test_moe_sorting(args); + } + return r ? 0 : -1; +} diff --git a/example/ck_tile/13_moe_sorting/moe_sorting_api.cpp b/example/ck_tile/13_moe_sorting/moe_sorting_api.cpp new file mode 100644 index 000000000..25e99c530 --- /dev/null +++ b/example/ck_tile/13_moe_sorting/moe_sorting_api.cpp @@ -0,0 +1,73 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "moe_sorting_api.hpp" + +#define MOE_SORTING_DISPATCH(unroll_num_) \ + constexpr ck_tile::index_t unroll_num = unroll_num_; \ + using ms_problem = ck_tile::MoeSortingProblem; \ + using kernel = ck_tile::MoeSortingKernel; \ + auto kargs = kernel::MakeKargs(a); \ + const dim3 grids = kernel::GridSize(a); \ + const dim3 blocks = kernel::BlockSize(a); \ + const auto lds_bytes = kernel::GetSmemSize(a); \ + float ave_time = ck_tile::launch_kernel( \ + s, ck_tile::make_kernel(kernel{}, grids, blocks, lds_bytes, kargs)); \ + return ave_time; + +float moe_sorting(moe_sorting_trait t, moe_sorting_args a, ck_tile::stream_config s) +{ + if(t.weight_type == "fp32" && t.index_type == "int32") + { + if(a.num_experts > 127) + { + printf("lds size exceed, only support experts <127 \n"); + return -1; + } + if(a.moe_buf_bytes % 16) + { + printf("buf set size %d unaligned, must be multiple of 16\n", a.moe_buf_bytes); + return -1; + } + using index_t = ck_tile::index_t; + using ms_weight_type = float; + index_t smem_io_unroll_num = ck_tile::integer_divide_ceil(a.tokens * a.topk, 64); + switch(smem_io_unroll_num) + { + case(1): { + MOE_SORTING_DISPATCH(1); + } + case(2): { + MOE_SORTING_DISPATCH(2); + } + 
case(3): { + MOE_SORTING_DISPATCH(3); + } + case(5): { + MOE_SORTING_DISPATCH(5); + } + case(6): { + MOE_SORTING_DISPATCH(6); + } + case(7): { + MOE_SORTING_DISPATCH(7); + } + case(8): { + MOE_SORTING_DISPATCH(8); + } + case(9): { + MOE_SORTING_DISPATCH(9); + } + case(10): { + MOE_SORTING_DISPATCH(10); + } + case(11): { + MOE_SORTING_DISPATCH(11); + } + default: { + MOE_SORTING_DISPATCH(4); + } + } + } + return -1; +} diff --git a/example/ck_tile/13_moe_sorting/moe_sorting_api.hpp b/example/ck_tile/13_moe_sorting/moe_sorting_api.hpp new file mode 100644 index 000000000..91b54932c --- /dev/null +++ b/example/ck_tile/13_moe_sorting/moe_sorting_api.hpp @@ -0,0 +1,20 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once +#include +#include "ck_tile/core.hpp" +#include "ck_tile/host.hpp" +#include "ck_tile/ops/moe_sorting.hpp" + +struct moe_sorting_trait +{ + std::string index_type; + std::string weight_type; // currently always float +}; + +struct moe_sorting_args : public ck_tile::MoeSortingHostArgs +{ +}; + +float moe_sorting(moe_sorting_trait t, moe_sorting_args a, ck_tile::stream_config s); diff --git a/example/ck_tile/13_moe_sorting/script/smoke_test.sh b/example/ck_tile/13_moe_sorting/script/smoke_test.sh new file mode 100644 index 000000000..1fc5eafcb --- /dev/null +++ b/example/ck_tile/13_moe_sorting/script/smoke_test.sh @@ -0,0 +1,19 @@ +# #!/bin/sh + +EXE=./build/bin/tile_example_moe_sorting + +$EXE -t=80 -e=17 -moe_buf_size=16 +$EXE -t=111 -e=117 -moe_buf_size=4 +$EXE -t=1000 -e=55 -moe_buf_size=1024 +$EXE -t=99 -e=120 -moe_buf_size=10244 +$EXE -t=175 -e=64 -k=8 +$EXE -t=65 -e=8 -k=2 +$EXE -t=1 -e=25 +$EXE -t=31 -e=19 -k=15 +$EXE -t=81 -e=37 -k=7 +$EXE -t=23 -e=1 -k=1 +$EXE -t=127 -e=99 -k=19 +$EXE -t=71 -e=11 -k=11 +$EXE -t=1 -e=1 -k=1 +$EXE -t=99 -e=2 -k=1 +$EXE -t=333 -e=99 -k=13 \ No newline at end of file diff --git a/example/ck_tile/CMakeLists.txt b/example/ck_tile/CMakeLists.txt 
index 9dd9a6ca3..15db0f46c 100644 --- a/example/ck_tile/CMakeLists.txt +++ b/example/ck_tile/CMakeLists.txt @@ -12,3 +12,4 @@ add_subdirectory(09_topk_softmax) add_subdirectory(10_rmsnorm2d) add_subdirectory(11_add_rmsnorm2d_rdquant) add_subdirectory(12_smoothquant) +add_subdirectory(13_moe_sorting) diff --git a/include/ck_tile/host.hpp b/include/ck_tile/host.hpp index c0ab13ce3..2e96009ac 100644 --- a/include/ck_tile/host.hpp +++ b/include/ck_tile/host.hpp @@ -23,6 +23,7 @@ #include "ck_tile/host/reference/reference_gemm.hpp" #include "ck_tile/host/reference/reference_im2col.hpp" #include "ck_tile/host/reference/reference_layernorm2d_fwd.hpp" +#include "ck_tile/host/reference/reference_moe_sorting.hpp" #include "ck_tile/host/reference/reference_permute.hpp" #include "ck_tile/host/reference/reference_reduce.hpp" #include "ck_tile/host/reference/reference_rmsnorm2d_fwd.hpp" diff --git a/include/ck_tile/host/reference/reference_moe_sorting.hpp b/include/ck_tile/host/reference/reference_moe_sorting.hpp new file mode 100644 index 000000000..c8eb7edb5 --- /dev/null +++ b/include/ck_tile/host/reference/reference_moe_sorting.hpp @@ -0,0 +1,78 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck_tile/core.hpp" +#include "ck_tile/host/host_tensor.hpp" + +namespace ck_tile { + +template +CK_TILE_HOST void reference_moe_sorting(const HostTensor& topk_ids, + const HostTensor& weights, + HostTensor& p_sorted_token_ids, + HostTensor& sorted_weight, + HostTensor& sorted_expert_ids, + index_t& unit_cnt, + const index_t experts, + const index_t unit_size) +{ + const index_t num_token = topk_ids.mDesc.get_lengths()[0]; + const index_t topk = topk_ids.mDesc.get_lengths()[1]; + std::vector> expert_tokens(experts, + std::vector(unit_size, num_token)); + std::vector> expert_token_weights( + experts, std::vector(unit_size, 0)); + std::vector expert_slices(experts, 1); + std::vector expert_slice_idxs(experts, 0); + + for(index_t t = 0; t < num_token; t++) + { + for(index_t k = 0; k < topk; k++) + { + IndexType e = topk_ids(t, k); + WeightType w = weights(t, k); + index_t idx = expert_slice_idxs[e]; + if(idx > expert_slices[e] * unit_size - 1) + { + expert_slices[e]++; + index_t new_size = expert_slices[e] * unit_size; + expert_tokens[e].resize(new_size); + expert_token_weights[e].resize(new_size); + for(index_t i = (expert_slices[e] - 1) * unit_size; i < new_size; i++) + { + expert_tokens[e][i] = num_token; + expert_token_weights[e][i] = 0; + } + } + + expert_tokens[e][idx] = t; + expert_token_weights[e][idx] = w; + expert_slice_idxs[e]++; + } + } + + IndexType* out_tokens = p_sorted_token_ids.data(); + WeightType* out_weights = sorted_weight.data(); + IndexType* out_expert_id = sorted_expert_ids.data(); + for(index_t e = 0; e < experts; e++) + { + memcpy(out_tokens, expert_tokens[e].data(), sizeof(index_t) * expert_slices[e] * unit_size); + out_tokens += expert_slices[e] * unit_size; + memcpy(out_weights, + expert_token_weights[e].data(), + sizeof(WeightType) * expert_slices[e] * unit_size); + out_weights += expert_slices[e] * unit_size; + + for(index_t s = 0; s < expert_slices[e]; s++) + { + out_expert_id[s] = e; + unit_cnt++; + } + 
out_expert_id += expert_slices[e]; + } + unit_cnt *= unit_size; + return; +} +} // namespace ck_tile diff --git a/include/ck_tile/ops/fused_moe/kernel/moe_sorting_kernel.hpp b/include/ck_tile/ops/fused_moe/kernel/moe_sorting_kernel.hpp new file mode 100644 index 000000000..1c6acec70 --- /dev/null +++ b/include/ck_tile/ops/fused_moe/kernel/moe_sorting_kernel.hpp @@ -0,0 +1,232 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck_tile/core.hpp" +#include "ck_tile/ops/common.hpp" +#include "ck_tile/ops/elementwise.hpp" +#include "ck_tile/host/hip_check_error.hpp" +#include +#include + +namespace ck_tile { + +struct MoeSortingHostArgs +{ + const void* p_topk_ids; + const void* p_weights; + void* p_sorted_token_ids; + void* p_sorted_weights; + void* p_sorted_expert_ids; + void* p_total_tokens_post_pad; + void* p_moe_buf; + index_t tokens; + index_t unit_size; + index_t num_experts; + index_t topk; + index_t moe_buf_bytes; +}; + +template +struct MoeSortingKernel +{ + using Problem = remove_cvref_t; + + using IndexType = typename Problem::IndexType; + using WeightType = typename Problem::WeightType; + + typedef MoeSortingHostArgs MoeSortingKargs; + + using Hargs = MoeSortingHostArgs; + + struct Kargs + { + const void* p_topk_ids; + const void* p_weights; + void* p_sorted_token_ids; + void* p_sorted_weights; + void* p_sorted_expert_ids; + void* p_total_tokens_post_pad; + void* p_moe_buf; + index_t tokens; + index_t num_experts; + index_t moe_buf_bytes; + + index_t tokens_per_thread; + mdiv unit_size_mdiv; + mdiv topk_mdiv; + }; + + CK_TILE_HOST static constexpr auto GridSize(const Hargs& h) + { + // TODO: assume num-experts not too much + return dim3(1 + ck_tile::integer_divide_ceil(h.moe_buf_bytes, BlockSize(h).x * 16)); + } + + CK_TILE_HOST static constexpr auto BlockSize(const Hargs& h) + { + return dim3(ck_tile::integer_least_multiple(h.num_experts, 
ck_tile::get_warp_size())); + } + + // in byte + CK_TILE_HOST static constexpr auto GetSmemSize(const Hargs& h) + { + const auto blocks = BlockSize(h); + return ((blocks.x + 1) * h.num_experts + (h.num_experts + 1)) * sizeof(index_t); + } + + CK_TILE_HOST static constexpr auto MakeKargs(const Hargs& h) + { + Kargs k; + k.p_topk_ids = h.p_topk_ids; + k.p_weights = h.p_weights; + k.p_sorted_token_ids = h.p_sorted_token_ids; + k.p_sorted_weights = h.p_sorted_weights; + k.p_sorted_expert_ids = h.p_sorted_expert_ids; + k.p_moe_buf = h.p_moe_buf; + k.p_total_tokens_post_pad = h.p_total_tokens_post_pad; + k.tokens = h.tokens; + k.num_experts = h.num_experts; + k.moe_buf_bytes = h.moe_buf_bytes; + + const auto blocks = BlockSize(h); + k.tokens_per_thread = integer_divide_ceil(h.tokens * h.topk, blocks.x); + k.unit_size_mdiv = mdiv{static_cast(h.unit_size)}; + k.topk_mdiv = mdiv{static_cast(h.topk)}; + return k; + } + + CK_TILE_DEVICE index_t calc_index(index_t total_col, index_t row, index_t col) const + { + return row * total_col + col; + } + + CK_TILE_DEVICE void moe_buf_set_zero_kernel(uint8x16_t* buf, index_t buf_bytes) const + { + const index_t offset = (blockIdx.x - 1) * blockDim.x + threadIdx.x; + if(offset < buf_bytes / 16) + { + buf[offset] = uint8x16_t{0}; + } + } + + CK_TILE_DEVICE void moe_align_block_size_kernel(const IndexType* __restrict__ topk_id, + const WeightType* __restrict__ weights, + index_t* p_sorted_token_ids, + WeightType* p_sorted_weights, + index_t* p_sorted_expert_ids, + index_t* p_total_tokens_post_pad, + const index_t num_experts, + const index_t tokens_per_thread, + const index_t numel, + const mdiv unit_size_mdiv, + const mdiv topk_mdiv, + void* smem) const + { + const index_t tid = static_cast(threadIdx.x); + const index_t start_idx = tid * tokens_per_thread; + + index_t* shared_mem = reinterpret_cast(smem); + + index_t* tokens_cnts = shared_mem; // 2d: (blockDim.x + 1, num_experts) + index_t* cumsum = shared_mem + (blockDim.x + 1) * 
num_experts; // 1: (num_experts + 1) + for(int i = 0; i < num_experts; ++i) + { + tokens_cnts[calc_index(num_experts, tid + 1, i)] = 0; + } +#pragma unroll Problem_::InternalLoadUnroll + for(int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) + { + ++tokens_cnts[calc_index(num_experts, tid + 1, topk_id[i])]; + } + __syncthreads(); + + if(tid < num_experts) + { + tokens_cnts[calc_index(num_experts, 0, tid)] = 0; + for(int i = 1; i <= static_cast(blockDim.x); ++i) + { + tokens_cnts[calc_index(num_experts, i, tid)] += + tokens_cnts[calc_index(num_experts, i - 1, tid)]; + } + } + + // __syncthreads(); + if(tid == 0) + { + cumsum[0] = 0; + for(int i = 1; i <= num_experts; ++i) + { + auto current_units = [&]() { + index_t x_ = tokens_cnts[calc_index(num_experts, blockDim.x, i - 1)] + + unit_size_mdiv.divisor - 1; + index_t y_ = unit_size_mdiv.div(x_); + return max(y_, 1) * unit_size_mdiv.divisor; + }(); + cumsum[i] = cumsum[i - 1] + current_units; + } + *p_total_tokens_post_pad = cumsum[num_experts]; + } + __syncthreads(); + if(tid < num_experts) + { + for(int i = cumsum[tid]; i < cumsum[tid + 1]; i += unit_size_mdiv.divisor) + { + p_sorted_expert_ids[unit_size_mdiv.div(i)] = tid; + } + } + +#pragma unroll Problem_::InternalLoadUnroll + for(int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) + { + index_t expert_id = topk_id[i]; + index_t rank_post_pad = + tokens_cnts[calc_index(num_experts, tid, expert_id)] + cumsum[expert_id]; + p_sorted_token_ids[rank_post_pad] = topk_mdiv.div(i); + p_sorted_weights[rank_post_pad] = weights[i]; + ++tokens_cnts[calc_index(num_experts, tid, expert_id)]; + } + + const index_t prefill_token = topk_mdiv.div(numel); + if(tid < num_experts) + { + index_t expert_offset = + cumsum[tid] + tokens_cnts[calc_index(num_experts, blockDim.x, tid)]; + while(expert_offset < cumsum[tid + 1]) + { + p_sorted_token_ids[expert_offset] = prefill_token; + p_sorted_weights[expert_offset] = static_cast(0.0); + 
expert_offset++; + } + } + } + + CK_TILE_DEVICE void operator()(Kargs kargs) const + { + if(blockIdx.x > 0) + { + if(kargs.p_moe_buf) + { + moe_buf_set_zero_kernel(reinterpret_cast(kargs.p_moe_buf), + kargs.moe_buf_bytes); + } + return; + } + const size_t numel = kargs.tokens * kargs.topk_mdiv.divisor; + extern __shared__ char smem[]; + return moe_align_block_size_kernel(static_cast(kargs.p_topk_ids), + static_cast(kargs.p_weights), + static_cast(kargs.p_sorted_token_ids), + static_cast(kargs.p_sorted_weights), + static_cast(kargs.p_sorted_expert_ids), + static_cast(kargs.p_total_tokens_post_pad), + kargs.num_experts, + kargs.tokens_per_thread, + numel, + kargs.unit_size_mdiv, + kargs.topk_mdiv, + smem); + } +}; +} // namespace ck_tile diff --git a/include/ck_tile/ops/fused_moe/pipeline/moe_sorting_pipeline.hpp b/include/ck_tile/ops/fused_moe/pipeline/moe_sorting_pipeline.hpp new file mode 100644 index 000000000..bbd47352d --- /dev/null +++ b/include/ck_tile/ops/fused_moe/pipeline/moe_sorting_pipeline.hpp @@ -0,0 +1,39 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck_tile/core.hpp" +#include "ck_tile/ops/fused_moe/pipeline/moe_sorting_policy.hpp" +#include +#include + +#ifndef TOPK_SOFTMAX_USE_RAW_TILE_WINDOW +#define TOPK_SOFTMAX_USE_RAW_TILE_WINDOW 0 +#endif + +namespace ck_tile { + +// template +// struct MoeSortingPipeline +// { +// // TODO: this kernel only support warp per row +// using Problem = remove_cvref_t; +// using Policy = remove_cvref_t; +// using WeightType = typename Problem::WeightType; + +// template +// CK_TILE_DEVICE auto operator()(const TopkIdWindow& topk_id_window, +// const WeightWindow& weight_window, +// index_t* p_sorted_token_ids, +// WeightType* p_sorted_weights, +// index_t* p_sorted_expert_ids, +// index_t* p_total_tokens_post_pad, +// const index_t num_experts, +// const index_t unit_size, +// const size_t numel, +// const index_t topk) +// { +// } +// }; +} // namespace ck_tile diff --git a/include/ck_tile/ops/fused_moe/pipeline/moe_sorting_policy.hpp b/include/ck_tile/ops/fused_moe/pipeline/moe_sorting_policy.hpp new file mode 100644 index 000000000..f5218a93e --- /dev/null +++ b/include/ck_tile/ops/fused_moe/pipeline/moe_sorting_policy.hpp @@ -0,0 +1,15 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck_tile/core.hpp" +#include "ck_tile/ops/softmax.hpp" +#include "ck_tile/ops/topk.hpp" + +namespace ck_tile { + +struct MoeSortingPolicy +{ +}; +} // namespace ck_tile diff --git a/include/ck_tile/ops/fused_moe/pipeline/moe_sorting_problem.hpp b/include/ck_tile/ops/fused_moe/pipeline/moe_sorting_problem.hpp new file mode 100644 index 000000000..adde59e35 --- /dev/null +++ b/include/ck_tile/ops/fused_moe/pipeline/moe_sorting_problem.hpp @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck_tile/core.hpp" +#include +#include + +namespace ck_tile { + +template +struct MoeSortingProblem +{ + // TODO: this kernel only support warp per row + using WeightType = remove_cvref_t; + using IndexType = remove_cvref_t; + + static constexpr index_t WarpSize = get_warp_size(); + static constexpr index_t WarpsPerBlock = 1; + static constexpr index_t InternalLoadUnroll = InternalLoadUnroll_; +}; +} // namespace ck_tile diff --git a/include/ck_tile/ops/moe_sorting.hpp b/include/ck_tile/ops/moe_sorting.hpp new file mode 100644 index 000000000..b74607f06 --- /dev/null +++ b/include/ck_tile/ops/moe_sorting.hpp @@ -0,0 +1,11 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck_tile/ops/fused_moe/kernel/moe_sorting_kernel.hpp" +#include "ck_tile/ops/fused_moe/pipeline/moe_sorting_pipeline.hpp" +#include "ck_tile/ops/fused_moe/pipeline/moe_sorting_policy.hpp" +#include "ck_tile/ops/fused_moe/pipeline/moe_sorting_problem.hpp" +#include "ck_tile/ops/common/generic_2d_block_shape.hpp" +#include "ck_tile/ops/common/tensor_layout.hpp" -- GitLab From 13332998a4ca6dcc8cc5fcd401ca900529e5e65c Mon Sep 17 00:00:00 2001 From: Po Yen Chen Date: Mon, 11 Nov 2024 09:28:32 +0800 Subject: [PATCH 048/153] Return nullptr when block index is invalid (#1649) --- .../ck_tile/ops/fmha/block/page_block_navigator.hpp | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/include/ck_tile/ops/fmha/block/page_block_navigator.hpp b/include/ck_tile/ops/fmha/block/page_block_navigator.hpp index e8abdc579..5d158f9fb 100644 --- a/include/ck_tile/ops/fmha/block/page_block_navigator.hpp +++ b/include/ck_tile/ops/fmha/block/page_block_navigator.hpp @@ -230,7 +230,15 @@ struct PageBlockNavigator CK_TILE_HOST_DEVICE DataType* get_block_ptr(index_t block_index) const { - return physical_blocks + physical_block_indices[block_index] * block_stride + fixed_offset; + 
if(block_index < num_blocks) + { + return physical_blocks + physical_block_indices[block_index] * block_stride + + fixed_offset; + } + else + { + return nullptr; + } } CK_TILE_HOST_DEVICE int32_t get_block_index(const WindowOrigin& global_window_origin) const -- GitLab From 8ef8a994e73370d69980a4df7377ed4ce8ed05c8 Mon Sep 17 00:00:00 2001 From: valarLip <103567126+valarLip@users.noreply.github.com> Date: Mon, 11 Nov 2024 16:02:28 +0800 Subject: [PATCH 049/153] [CK_TILE] add more stride for layernorm to support un-continuous Tensor (#1650) * [CK_TILE] add more stride for layernorm to support un-continuous Tensor * align CK coding style * extend strides to layernrom expample * clang-format... --- .../02_layernorm2d/layernorm2d_fwd.cpp | 63 ++++++++++++------- .../kernel/layernorm2d_fwd_kernel.hpp | 23 ++++--- 2 files changed, 56 insertions(+), 30 deletions(-) diff --git a/example/ck_tile/02_layernorm2d/layernorm2d_fwd.cpp b/example/ck_tile/02_layernorm2d/layernorm2d_fwd.cpp index 8f029c212..b49c04619 100644 --- a/example/ck_tile/02_layernorm2d/layernorm2d_fwd.cpp +++ b/example/ck_tile/02_layernorm2d/layernorm2d_fwd.cpp @@ -25,7 +25,10 @@ auto create_args(int argc, char* argv[]) ck_tile::ArgParser arg_parser; arg_parser.insert("m", "3328", "m dimension") .insert("n", "4096", "n dimension") - .insert("stride", "-1", "stride per row, if -1 then equal to n") + .insert("x_stride", "-1", "x row_stride, if -1 then equal to n") + .insert("xr_stride", "-1", "x residule row_stride, if -1 then equal to n") + .insert("y_stride", "-1", "y row_stride, if -1 then equal to n") + .insert("yr_stride", "-1", "y residule row_stride, if -1 then equal to n") .insert("e", "1e-5", "epsilon") .insert("save_mv", "0", "save mean/variance(invstd) or not. 
set to 1 in training case") .insert("v", "1", "cpu validation or not") @@ -54,11 +57,20 @@ template bool run(const ck_tile::ArgParser& arg_parser) { - ck_tile::index_t m = arg_parser.get_int("m"); - ck_tile::index_t n = arg_parser.get_int("n"); - ck_tile::index_t stride = arg_parser.get_int("stride"); - if(stride < 0) - stride = n; + ck_tile::index_t m = arg_parser.get_int("m"); + ck_tile::index_t n = arg_parser.get_int("n"); + ck_tile::index_t x_stride = arg_parser.get_int("x_stride"); + if(x_stride < 0) + x_stride = n; + ck_tile::index_t xr_stride = arg_parser.get_int("xr_stride"); + if(xr_stride < 0) + xr_stride = n; + ck_tile::index_t y_stride = arg_parser.get_int("y_stride"); + if(y_stride < 0) + y_stride = n; + ck_tile::index_t yr_stride = arg_parser.get_int("yr_stride"); + if(yr_stride < 0) + yr_stride = n; float epsilon = arg_parser.get_float("e"); std::string prec_i = arg_parser.get_str("prec_i"); std::string prec_o = arg_parser.get_str("prec_o"); @@ -89,7 +101,7 @@ bool run(const ck_tile::ArgParser& arg_parser) return false; } - assert(stride >= n); + assert(x_stride >= n); using TypeConfig = LayerNormTypeConfig; @@ -108,15 +120,15 @@ bool run(const ck_tile::ArgParser& arg_parser) using ComputeDataType = typename TypeConfig::ComputeDataType; // host verify - ck_tile::HostTensor x_host({m, n}, {stride, 1}); + ck_tile::HostTensor x_host({m, n}, {x_stride, 1}); ck_tile::HostTensor gamma_host({n}); ck_tile::HostTensor beta_host({n}); - ck_tile::HostTensor x_residual_host({m, n}, {stride, 1}); - ck_tile::HostTensor y_residual_host({m, n}, {stride, 1}); + ck_tile::HostTensor x_residual_host({m, n}, {xr_stride, 1}); + ck_tile::HostTensor y_residual_host({m, n}, {yr_stride, 1}); - ck_tile::HostTensor y_host_ref({m, n}, {stride, 1}); - ck_tile::HostTensor y_host_dev({m, n}, {stride, 1}); + ck_tile::HostTensor y_host_ref({m, n}, {y_stride, 1}); + ck_tile::HostTensor y_host_dev({m, n}, {y_stride, 1}); ck_tile::HostTensor mean_host_ref({m}); ck_tile::HostTensor 
invStd_host_ref({m}); @@ -162,7 +174,9 @@ bool run(const ck_tile::ArgParser& arg_parser) }(); std::cout << "[" << prec_str << "]" - << " m:" << m << ", n:" << n << ", stride:" << stride << std::flush; + << " m:" << m << ", n:" << n << ", x_stride:" << x_stride + << ", xr_stride:" << xr_stride << ", y_stride:" << y_stride + << ", yr_stride:" << yr_stride << std::flush; layernorm2d_fwd_traits traits{ prec_i, prec_o, prec_sx, prec_sy, SaveMeanVar, fused_add, fused_quant}; @@ -182,7 +196,10 @@ bool run(const ck_tile::ArgParser& arg_parser) epsilon, m, n, - stride}; + x_stride, // x row_stride + xr_stride, // x residule row stride + y_stride, // y row stride + yr_stride}; // y residule row stride float ave_time = layernorm2d_fwd( traits, args, ck_tile::stream_config{nullptr, true, kname ? 1 : 0, warmup, repeat}); @@ -285,7 +302,7 @@ bool run(const ck_tile::ArgParser& arg_parser) y_buf.FromDevice(y_host_dev.data()); - ck_tile::HostTensor y_residual_host_dev({m, n}, {stride, 1}); + ck_tile::HostTensor y_residual_host_dev({m, n}, {yr_stride, 1}); if(fused_add == 1) { y_residual_buf.FromDevice(y_residual_host_dev.data()); @@ -293,7 +310,7 @@ bool run(const ck_tile::ArgParser& arg_parser) auto [rtol, atol] = get_elimit(); - if(stride == n) + if(x_stride == n) { pass = ck_tile::check_err( y_host_dev, y_host_ref, std::string("OUT Error: Incorrect results!"), rtol, atol); @@ -310,10 +327,10 @@ bool run(const ck_tile::ArgParser& arg_parser) { for(int i_r = 0; i_r < m; i_r++) { - std::vector y_host_dev_row(y_host_dev.begin() + i_r * stride, - y_host_dev.begin() + i_r * stride + n); - std::vector y_host_ref_row(y_host_ref.begin() + i_r * stride, - y_host_ref.begin() + i_r * stride + n); + std::vector y_host_dev_row(y_host_dev.begin() + i_r * y_stride, + y_host_dev.begin() + i_r * y_stride + n); + std::vector y_host_ref_row(y_host_ref.begin() + i_r * y_stride, + y_host_ref.begin() + i_r * y_stride + n); pass &= ck_tile::check_err(y_host_dev_row, y_host_ref_row, std::string("OUT[") 
+ std::to_string(i_r) + @@ -323,10 +340,10 @@ bool run(const ck_tile::ArgParser& arg_parser) if(fused_add == 1) { std::vector y_residual_host_dev_row( - y_residual_host_dev.begin() + i_r * stride, - y_residual_host_dev.begin() + i_r * stride + n); + y_residual_host_dev.begin() + i_r * yr_stride, + y_residual_host_dev.begin() + i_r * yr_stride + n); std::vector y_residual_host_ref_row( - x_host.begin() + i_r * stride, x_host.begin() + i_r * stride + n); + x_host.begin() + i_r * yr_stride, x_host.begin() + i_r * yr_stride + n); pass &= ck_tile::check_err(y_residual_host_dev_row, y_residual_host_ref_row, std::string("ADD[") + std::to_string(i_r) + diff --git a/include/ck_tile/ops/layernorm2d/kernel/layernorm2d_fwd_kernel.hpp b/include/ck_tile/ops/layernorm2d/kernel/layernorm2d_fwd_kernel.hpp index f5a214ba5..10218e808 100644 --- a/include/ck_tile/ops/layernorm2d/kernel/layernorm2d_fwd_kernel.hpp +++ b/include/ck_tile/ops/layernorm2d/kernel/layernorm2d_fwd_kernel.hpp @@ -28,7 +28,10 @@ struct Layernorm2dFwdHostArgs index_t m; index_t n; - index_t stride; // row_stride + index_t x_stride; // x row_stride + index_t xr_stride; // x residule row stride + index_t y_stride; // y row stride + index_t yr_stride; // y residule row stride }; // TODO: Extract some type to wrapper class @@ -93,7 +96,10 @@ struct Layernorm2dFwd index_t m; index_t n; - index_t stride; // row_stride + index_t x_stride; // x row_stride + index_t xr_stride; // x residule row stride + index_t y_stride; // y row stride + index_t yr_stride; // y residule row stride }; using Hargs = Layernorm2dFwdHostArgs; @@ -112,7 +118,10 @@ struct Layernorm2dFwd hargs.epsilon, hargs.m, hargs.n, - hargs.stride}; + hargs.x_stride, + hargs.xr_stride, + hargs.y_stride, + hargs.yr_stride}; } CK_TILE_HOST static constexpr auto GridSize(const Hargs& hargs) @@ -182,7 +191,7 @@ struct Layernorm2dFwd const auto tmp_ = make_naive_tensor_view( static_cast(kargs.p_x), make_tuple(kargs.m, kargs.n), - make_tuple(kargs.stride, 1), + 
make_tuple(kargs.x_stride, 1), number{}, number<1>{}); @@ -201,7 +210,7 @@ struct Layernorm2dFwd const auto tmp_ = make_naive_tensor_view( static_cast(kargs.p_x_residual), make_tuple(kargs.m, kargs.n), - make_tuple(kargs.stride, 1), + make_tuple(kargs.xr_stride, 1), number{}, number<1>{}); @@ -250,7 +259,7 @@ struct Layernorm2dFwd auto tmp_ = make_naive_tensor_view( static_cast(kargs.p_y), make_tuple(kargs.m, kargs.n), - make_tuple(kargs.stride, 1), + make_tuple(kargs.y_stride, 1), number{}, number<1>{}); @@ -266,7 +275,7 @@ struct Layernorm2dFwd auto tmp_ = make_naive_tensor_view( static_cast(kargs.p_y_residual), make_tuple(kargs.m, kargs.n), - make_tuple(kargs.stride, 1), + make_tuple(kargs.yr_stride, 1), number{}, number<1>{}); -- GitLab From 5fb150dbe700eba180feb5b27973a8ba95fae2ce Mon Sep 17 00:00:00 2001 From: Illia Silin <98187287+illsilin@users.noreply.github.com> Date: Mon, 11 Nov 2024 09:25:08 -0800 Subject: [PATCH 050/153] restore collecting performance of mixed prec gemms (#1648) --- script/process_perf_data.py | 4 ++-- script/process_qa_data.sh | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/script/process_perf_data.py b/script/process_perf_data.py index b82a7c289..3892206e4 100644 --- a/script/process_perf_data.py +++ b/script/process_perf_data.py @@ -133,12 +133,12 @@ def parse_logfile(logfile): if 'Best Perf' in line: lst=line.split() res.append(lst[4]) - elif 'onnx_gemm' in logfile or 'mixed_gemm' in logfile: + elif 'onnx_gemm' in logfile: for line in open(logfile): if 'Best Perf' in line: lst=line.split() res.append(lst[33]) - elif 'splitK_gemm' in logfile: + elif 'splitK_gemm' in logfile or 'mixed_gemm' in logfile: for line in open(logfile): if 'Best Perf' in line: lst=line.split() diff --git a/script/process_qa_data.sh b/script/process_qa_data.sh index d6083d2fc..c9a1645f6 100755 --- a/script/process_qa_data.sh +++ b/script/process_qa_data.sh @@ -22,6 +22,7 @@ python3 process_perf_data.py perf_gemm_bilinear.log python3 
process_perf_data.py perf_reduction.log python3 process_perf_data.py perf_splitK_gemm.log python3 process_perf_data.py perf_onnx_gemm.log +python3 process_perf_data.py perf_mixed_gemm.log file=./perf_fmha_fwd_gfx942.log if [ -e "$file" ]; then -- GitLab From 2b6458ddf243904cecf4c54b48c9dafa60ff80df Mon Sep 17 00:00:00 2001 From: Thomas Ning Date: Tue, 12 Nov 2024 10:08:25 +0800 Subject: [PATCH 051/153] [CK Tile] Improve the Layout, Padding, and Alignment features of CK Tile GEMM (#1651) * Finished the feature * Modified the test file * Test case update * addresss comment * Addressed the review comment * Fixed the CI error --- example/ck_tile/03_gemm/README.md | 3 + example/ck_tile/03_gemm/gemm_basic.cpp | 19 +- example/ck_tile/03_gemm/gemm_mem_pipeline.cpp | 10 +- include/ck_tile/core/tensor/shuffle_tile.hpp | 2 +- ...k_fmha_pipeline_qx_ks_vs_custom_policy.hpp | 2 + .../ck_tile/ops/gemm/kernel/gemm_kernel.hpp | 70 ++-- .../pipeline/gemm_pipeline_ag_bg_cr_mem.hpp | 6 +- .../gemm_pipeline_agmem_bgmem_creg_v1.hpp | 63 +++- ...ine_agmem_bgmem_creg_v1_default_policy.hpp | 330 ++++++++++++++---- .../gemm/pipeline/gemm_pipeline_problem.hpp | 154 ++++++-- ...emm_universal_pipeline_ag_bg_cr_policy.hpp | 316 ++++++++++++++--- .../ops/gemm/pipeline/tile_gemm_traits.hpp | 16 +- .../gemm/test_gemm_mem_pipeline_util.hpp | 12 +- 13 files changed, 781 insertions(+), 222 deletions(-) diff --git a/example/ck_tile/03_gemm/README.md b/example/ck_tile/03_gemm/README.md index aacbdf686..e9ffe72a9 100644 --- a/example/ck_tile/03_gemm/README.md +++ b/example/ck_tile/03_gemm/README.md @@ -8,7 +8,10 @@ This folder contains example for GEMM using ck_tile tile-programming implementat mkdir build && cd build # you can replace with the appropriate architecture (for example gfx90a or gfx942) or leave it blank sh ../script/cmake-ck-dev.sh ../ +# The basic pipeline method on the gemm calculation make tile_example_gemm_basic -j +# The memory bound pipeline on the gemm calculation +make 
tile_example_gemm_mem_pipeline -j ``` This will result in an executable `build/bin/tile_example_gemm_basic` diff --git a/example/ck_tile/03_gemm/gemm_basic.cpp b/example/ck_tile/03_gemm/gemm_basic.cpp index 09427217c..b7d869344 100644 --- a/example/ck_tile/03_gemm/gemm_basic.cpp +++ b/example/ck_tile/03_gemm/gemm_basic.cpp @@ -17,10 +17,11 @@ template float gemm_calc(const gemm_basic_args& args, const ck_tile::stream_config& s) { - // The kPadA, kPadB, kPadC & kBlockPerCu should also come from the Codegen part. - constexpr bool kPadA = true; - constexpr bool kPadB = true; - constexpr bool kPadC = true; + // The kPadM, kPadN, kPadK & kBlockPerCu should also come from the Codegen part. + constexpr bool kPadM = false; + constexpr bool kPadN = false; + constexpr bool kPadK = false; + constexpr bool kTilePermute = false; // The rank and permutation will also be generate out by the CodeGen part. constexpr ck_tile::index_t kOutputRank = 2; @@ -56,8 +57,8 @@ float gemm_calc(const gemm_basic_args& args, const ck_tile::stream_config& s) CShuffleEpilogue, ck_tile::CShuffleEpilogue>, ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem>>; + ck_tile::Default2DEpilogueProblem>>; using CodegenGemmTraits = - ck_tile::TileGemmTraits; + ck_tile::TileGemmTraits; using CodegenPipelineProblem = ck_tile:: GemmPipelineProblem; - using CodegenGemmPolicy = ck_tile::UniversalGemmPipelineAgBgCrPolicy; + using CodegenGemmPolicy = ck_tile::UniversalGemmPipelineAgBgCrPolicy; using CodegenGemmPipeline = ck_tile::GemmPipelineAGmemBGmemCRegV1; // ToDo: Will add the codegen part to test different pipeline policies in GEMM. 
diff --git a/example/ck_tile/03_gemm/gemm_mem_pipeline.cpp b/example/ck_tile/03_gemm/gemm_mem_pipeline.cpp index 2ee0395e4..ff9d8bad3 100644 --- a/example/ck_tile/03_gemm/gemm_mem_pipeline.cpp +++ b/example/ck_tile/03_gemm/gemm_mem_pipeline.cpp @@ -31,9 +31,9 @@ float gemm_calc(const gemm_basic_args& args, const ck_tile::stream_config& s) constexpr ck_tile::index_t K_Warp_Tile = 8; // The kPadA, kPadB, kPadC & kBlockPerCu should also come from the Codegen part. - constexpr bool kPadA = true; - constexpr bool kPadB = true; - constexpr bool kPadC = true; + constexpr bool kPadM = true; + constexpr bool kPadN = true; + constexpr bool kPadK = true; constexpr int kBlockPerCu = 1; @@ -46,9 +46,9 @@ float gemm_calc(const gemm_basic_args& args, const ck_tile::stream_config& s) using TilePartitioner = ck_tile::GemmTilePartitioner; using GemmEpilogue = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem>; + ck_tile::Default2DEpilogueProblem>; - using Traits = ck_tile::TileGemmTraits; + using Traits = ck_tile::TileGemmTraits; using BaseGemmPipeline = ck_tile::BaseGemmPipelineAgBgCrMem< ck_tile::GemmPipelineProblem>; diff --git a/include/ck_tile/core/tensor/shuffle_tile.hpp b/include/ck_tile/core/tensor/shuffle_tile.hpp index da3c7117e..55e3274cd 100644 --- a/include/ck_tile/core/tensor/shuffle_tile.hpp +++ b/include/ck_tile/core/tensor/shuffle_tile.hpp @@ -170,7 +170,7 @@ CK_TILE_DEVICE void shuffle_tile(OutTensor& out, const InTensor& in) } else { - // NOT implemented + static_assert(false, "The shuffle should always happen!"); } } diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qx_ks_vs_custom_policy.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qx_ks_vs_custom_policy.hpp index fbb05e164..a3a29bb54 100644 --- a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qx_ks_vs_custom_policy.hpp +++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qx_ks_vs_custom_policy.hpp @@ -863,6 +863,8 @@ struct 
BlockFmhaPipelineQXKSVSCustomPolicy : BlockFmhaPipelineQXCustomPolicy{}, number{}), - // somehow clang-format is splitting below line into multiple. - // clang-format off - sequence{}); + auto a_pad_view = [&]() { + if constexpr(std::is_same_v) + { + return pad_tensor_view( + a_tensor_view, + make_tuple(number{}, number{}), + sequence{}); + } + else + { + return pad_tensor_view( + a_tensor_view, + make_tuple(number{}, number{}), + sequence{}); + } + }(); // clang-format on auto a_block_window = make_tile_window( @@ -128,12 +138,22 @@ struct GemmKernel make_tuple(number{}, number{}), {i_m, 0}); - auto b_pad_view = pad_tensor_view( - b_tensor_view, - make_tuple(number{}, number{}), - // clang-format off - sequence{}); - // clang-format on + auto b_pad_view = [&]() { + if constexpr(std::is_same_v) + { + return pad_tensor_view( + b_tensor_view, + make_tuple(number{}, number{}), + sequence{}); + } + else + { + return pad_tensor_view( + b_tensor_view, + make_tuple(number{}, number{}), + sequence{}); + } + }(); auto b_block_window = make_tile_window( b_pad_view, @@ -171,18 +191,28 @@ struct GemmKernel } }(); - auto c_pad_view = pad_tensor_view( - c_tensor_view, - make_tuple(number{}, number{}), - // clang-format off - sequence{}); - // clang-format on - auto c_block_window = make_tile_window( + auto c_pad_view = [&]() { + if constexpr(std::is_same_v) + { + return pad_tensor_view( + c_tensor_view, + make_tuple(number{}, number{}), + sequence{}); + } + else + { + return pad_tensor_view( + c_tensor_view, + make_tuple(number{}, number{}), + sequence{}); + } + }(); + auto CBlockWindow_pad = make_tile_window( c_pad_view, make_tuple(number{}, number{}), {i_m, i_n}); - EpiloguePipeline{}(c_block_window, c_block_tile); + EpiloguePipeline{}(CBlockWindow_pad, c_block_tile); } }; diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp index b9b45d3f4..85c5c5805 100644 --- 
a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp +++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp @@ -113,9 +113,9 @@ struct GemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem static constexpr index_t VectorSizeB = Problem::VectorSizeB; static constexpr index_t VectorSizeC = Problem::VectorSizeC; - static constexpr bool kPadA = Problem::kPadA; - static constexpr bool kPadB = Problem::kPadB; - static constexpr bool kPadC = Problem::kPadC; + static constexpr bool kPadM = Problem::kPadM; + static constexpr bool kPadN = Problem::kPadN; + static constexpr bool kPadK = Problem::kPadK; // Where is the right place for HasHotLoop and TailNum ??? static constexpr bool HasHotLoop = Problem::HasHotLoop; diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1.hpp index a2424290e..c0817e736 100644 --- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1.hpp +++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1.hpp @@ -33,9 +33,9 @@ struct GemmPipelineAGmemBGmemCRegV1 static constexpr index_t VectorSizeB = Problem::VectorSizeB; static constexpr index_t VectorSizeC = Problem::VectorSizeC; - static constexpr bool kPadA = Problem::kPadA; - static constexpr bool kPadB = Problem::kPadB; - static constexpr bool kPadC = Problem::kPadC; + static constexpr bool kPadM = Problem::kPadM; + static constexpr bool kPadN = Problem::kPadN; + static constexpr bool kPadK = Problem::kPadK; CK_TILE_HOST_DEVICE static constexpr index_t GetStaticLdsSize() { @@ -101,11 +101,8 @@ struct GemmPipelineAGmemBGmemCRegV1 Policy::template MakeADramTileDistribution()); // A LDS tile window for store - auto a_copy_lds_window = - make_tile_window(a_lds_block, - make_tuple(number{}, number{}), - {0, 0}, - a_copy_dram_window.get_tile_distribution()); + auto a_copy_lds_window = make_tile_window( + a_lds_block, make_tuple(number{}, 
number{}), {0, 0}); // B DRAM tile window for load auto b_copy_dram_window = @@ -115,11 +112,8 @@ struct GemmPipelineAGmemBGmemCRegV1 Policy::template MakeBDramTileDistribution()); // B LDS tile window for store - auto b_copy_lds_window = - make_tile_window(b_lds_block, - make_tuple(number{}, number{}), - {0, 0}, - b_copy_dram_window.get_tile_distribution()); + auto b_copy_lds_window = make_tile_window( + b_lds_block, make_tuple(number{}, number{}), {0, 0}); // A LDS tile for block GEMM auto a_lds_gemm_window = make_tile_window( @@ -149,12 +143,32 @@ struct GemmPipelineAGmemBGmemCRegV1 tile_elementwise_inout([](auto& c) { c = 0; }, c_block_tile); // LDS write 0 - const auto a_block_tile_tmp = tile_elementwise_in(a_element_func, a_block_tile); - store_tile(a_copy_lds_window, a_block_tile_tmp); + if constexpr(std::is_same_v) + { + auto a_shuffle_tmp = make_static_distributed_tensor( + Policy::template MakeShuffledARegBlockDescriptor()); + shuffle_tile(a_shuffle_tmp, a_block_tile); + const auto a_block_tile_tmp = tile_elementwise_in(a_element_func, a_shuffle_tmp); + store_tile(a_copy_lds_window, a_block_tile_tmp); + } + else + { + store_tile(a_copy_lds_window, tile_elementwise_in(a_element_func, a_block_tile)); + } // LDS write 0 - const auto b_block_tile_tmp = tile_elementwise_in(b_element_func, b_block_tile); - store_tile(b_copy_lds_window, b_block_tile_tmp); + if constexpr(std::is_same_v) + { + auto b_shuffle_tmp = make_static_distributed_tensor( + Policy::template MakeShuffledBRegBlockDescriptor()); + shuffle_tile(b_shuffle_tmp, b_block_tile); + const auto b_block_tile_tmp = tile_elementwise_in(b_element_func, b_shuffle_tmp); + store_tile(b_copy_lds_window, b_block_tile_tmp); + } + else + { + store_tile(b_copy_lds_window, tile_elementwise_in(b_element_func, b_block_tile)); + } } index_t iCounter = num_loop - 1; @@ -180,8 +194,19 @@ struct GemmPipelineAGmemBGmemCRegV1 store_tile(a_copy_lds_window, a_block_tile_tmp); // LDS write i + 1 - const auto b_block_tile_tmp 
= tile_elementwise_in(b_element_func, b_block_tile); - store_tile(b_copy_lds_window, b_block_tile_tmp); + if constexpr(std::is_same_v) + { + auto b_shuffle_tmp_loop = make_static_distributed_tensor( + Policy::template MakeShuffledBRegBlockDescriptor()); + shuffle_tile(b_shuffle_tmp_loop, b_block_tile); + store_tile(b_copy_lds_window, + tile_elementwise_in(b_element_func, b_shuffle_tmp_loop)); + } + else + { + const auto b_block_tile_tmp = tile_elementwise_in(b_element_func, b_block_tile); + store_tile(b_copy_lds_window, b_block_tile_tmp); + } iCounter--; } diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1_default_policy.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1_default_policy.hpp index 199ba56aa..c765b3ce9 100644 --- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1_default_policy.hpp +++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1_default_policy.hpp @@ -11,6 +11,7 @@ namespace ck_tile { // Default policy class should not be templated, put template on member functions instead struct GemmPipelineAGmemBGmemCRegV1DefaultPolicy { + #if 0 // 2d template @@ -116,6 +117,20 @@ struct GemmPipelineAGmemBGmemCRegV1DefaultPolicy return smem_size; } + + template + CK_TILE_HOST_DEVICE static constexpr auto GetSmemPackA() + { + using ADataType = remove_cvref_t; + return Problem::VectorLoadSize / sizeof(ADataType); + } + + template + CK_TILE_HOST_DEVICE static constexpr auto GetSmemPackB() + { + using BDataType = remove_cvref_t; + return Problem::VectorLoadSize / sizeof(BDataType); + } #elif 1 // fake XOR template @@ -192,80 +207,269 @@ struct GemmPipelineAGmemBGmemCRegV1DefaultPolicy CK_TILE_HOST_DEVICE static constexpr auto MakeADramTileDistribution() { using ADataType = remove_cvref_t; - - constexpr index_t kBlockSize = Problem::kBlockSize; - - constexpr index_t kMPerBlock = Problem::BlockGemmShape::kM; - constexpr index_t kKPerBlock = Problem::BlockGemmShape::kK; - - 
constexpr index_t K1 = 16 / sizeof(ADataType); - constexpr index_t K0 = kKPerBlock / K1; - constexpr index_t M2 = get_warp_size() / K0; -#if 1 // coalesce reading for each blocks - constexpr index_t M1 = kBlockSize / get_warp_size(); - static_assert(M2 != 0, "M2 is zero, which will lead to a division by zero error."); - static_assert(M1 != 0, "M1 is zero, which will lead to a division by zero error."); - constexpr index_t M0 = kMPerBlock / (M2 * M1); - - return make_static_tile_distribution( - tile_distribution_encoding, - tuple, sequence>, - tuple, sequence<1, 2>>, - tuple, sequence<2, 0>>, - sequence<1, 2>, - sequence<0, 1>>{}); -#else // coalesce reading for each warps - constexpr index_t M0 = kBlockSize / get_warp_size(); - constexpr index_t M1 = kMPerBlock / (M2 * M0); - - return make_static_tile_distribution( - tile_distribution_encoding, - tuple, sequence>, - tuple, sequence<1, 2>>, - tuple, sequence<2, 0>>, - sequence<1, 2>, - sequence<1, 1>>{}); -#endif + using ALayout = remove_cvref_t; + + constexpr index_t BlockSize = Problem::kBlockSize; + + constexpr index_t MPerBlock = Problem::BlockGemmShape::kM; + constexpr index_t KPerBlock = Problem::BlockGemmShape::kK; + + if constexpr(std::is_same_v) + { + constexpr index_t M1 = Problem::VectorLoadSize / sizeof(ADataType); + constexpr index_t M0 = MPerBlock / M1; + constexpr index_t total_pixels = MPerBlock * KPerBlock / BlockSize; + static_assert(total_pixels % M1 == 0); + constexpr index_t K3 = total_pixels / M1; + constexpr index_t KPack = GetSmemPackA(); + static_assert(KPack % K3 == 0); + constexpr index_t K2 = KPack / K3; + if constexpr(get_warp_size() % (K2 * M0)) + { + constexpr index_t K1 = get_warp_size() / (K2 * M0); + constexpr index_t K0 = BlockSize / get_warp_size(); + static_assert(KPerBlock == K0 * K1 * K2 * K3); + return make_static_tile_distribution( + tile_distribution_encoding, + tuple, sequence>, + tuple, sequence<2, 1, 2>>, + tuple, sequence<1, 0, 2>>, + sequence<2, 1>, + sequence<3, 
1>>{}); + } + else + { + constexpr index_t K1 = (K2 * M0) / get_warp_size(); + constexpr index_t K2_m = K2 / K1; + constexpr index_t K0 = BlockSize / get_warp_size() / K1; + static_assert(KPerBlock == K0 * K1 * K2_m * K3); + return make_static_tile_distribution( + tile_distribution_encoding, + tuple, sequence>, + tuple, sequence<1, 2>>, + tuple, sequence<0, 2>>, + sequence<2, 1>, + sequence<3, 1>>{}); + } + } + else + { + constexpr index_t K1 = 16 / sizeof(ADataType); + constexpr index_t K0 = KPerBlock / K1; + constexpr index_t M2 = get_warp_size() / K0; + // coalesce reading for each blocks + if constexpr(get_warp_size() % (M2 * K0) == 0) + { + constexpr index_t M1 = BlockSize / get_warp_size(); + static_assert(M2 != 0, "M2 is zero, which will lead to a division by zero error."); + static_assert(M1 != 0, "M1 is zero, which will lead to a division by zero error."); + constexpr index_t M0 = MPerBlock / (M2 * M1); + + return make_static_tile_distribution( + tile_distribution_encoding, + tuple, sequence>, + tuple, sequence<1, 2>>, + tuple, sequence<2, 0>>, + sequence<1, 2>, + sequence<0, 1>>{}); + } + else + { + constexpr index_t M0 = BlockSize / get_warp_size(); + constexpr index_t M1 = MPerBlock / (M2 * M0); + return make_static_tile_distribution( + tile_distribution_encoding, + tuple, sequence>, + tuple, sequence<1, 2>>, + tuple, sequence<2, 0>>, + sequence<1, 2>, + sequence<1, 1>>{}); + } + } } template CK_TILE_HOST_DEVICE static constexpr auto MakeBDramTileDistribution() { using BDataType = remove_cvref_t; + using BLayout = remove_cvref_t; + + constexpr index_t BlockSize = Problem::kBlockSize; + + constexpr index_t NPerBlock = Problem::BlockGemmShape::kN; + constexpr index_t KPerBlock = Problem::BlockGemmShape::kK; + + if constexpr(std::is_same_v) + { + constexpr index_t N1 = Problem::VectorLoadSize / sizeof(BDataType); + constexpr index_t N0 = NPerBlock / N1; + constexpr index_t total_pixels = NPerBlock * KPerBlock / BlockSize; + static_assert(total_pixels % N1 
== 0); + constexpr index_t K3 = total_pixels / N1; + constexpr index_t KPack = GetSmemPackB(); + static_assert(KPack % K3 == 0); + constexpr index_t K2 = KPack / K3; + if constexpr(get_warp_size() % (K2 * N0) == 0) + { + constexpr index_t K1 = get_warp_size() / (K2 * N0); + constexpr index_t K0 = BlockSize / get_warp_size(); + static_assert(KPerBlock == K0 * K1 * K2 * K3); + return make_static_tile_distribution( + tile_distribution_encoding, + tuple, sequence>, + tuple, sequence<2, 1, 2>>, + tuple, sequence<1, 0, 2>>, + sequence<2, 1>, + sequence<3, 1>>{}); + } + else + { + constexpr index_t K1 = (K2 * N0) / get_warp_size(); + constexpr index_t K2_m = K2 / K1; + constexpr index_t K0 = BlockSize / get_warp_size() / K1; + static_assert(KPerBlock == K0 * K1 * K2_m * K3); + return make_static_tile_distribution( + tile_distribution_encoding, + tuple, sequence>, + tuple, sequence<1, 2>>, + tuple, sequence<0, 2>>, + sequence<2, 1>, + sequence<3, 1>>{}); + } + } + else + { + + constexpr index_t K1 = Problem::VectorLoadSize / sizeof(BDataType); + constexpr index_t K0 = KPerBlock / K1; + constexpr index_t N2 = get_warp_size() / K0; + // coalesce reading for each blocks + if constexpr(get_warp_size() % (N2 * K0) == 0) + { + constexpr index_t N1 = BlockSize / get_warp_size(); + static_assert(N2 != 0, "N2 is zero, which will lead to a division by zero error."); + static_assert(N1 != 0, "N1 is zero, which will lead to a division by zero error."); + constexpr index_t N0 = NPerBlock / (N2 * N1); + + return make_static_tile_distribution( + tile_distribution_encoding, + tuple, sequence>, + tuple, sequence<1, 2>>, + tuple, sequence<2, 0>>, + sequence<1, 2>, + sequence<0, 1>>{}); + } + // coalesce reading for each warps + else + { + constexpr index_t N0 = BlockSize / get_warp_size(); + constexpr index_t N1 = NPerBlock / (N2 * N0); + + return make_static_tile_distribution( + tile_distribution_encoding, + tuple, sequence>, + tuple, sequence<1, 2>>, + tuple, sequence<2, 0>>, + 
sequence<1, 2>, + sequence<1, 1>>{}); + } + } + } + template + CK_TILE_HOST_DEVICE static constexpr auto MakeShuffledBRegBlockDescriptor() + { + using BLayout = remove_cvref_t; + using BDataType = remove_cvref_t; + static_assert(std::is_same_v); constexpr index_t kBlockSize = Problem::kBlockSize; - constexpr index_t kNPerBlock = Problem::BlockGemmShape::kN; constexpr index_t kKPerBlock = Problem::BlockGemmShape::kK; - constexpr index_t K1 = 16 / sizeof(BDataType); - constexpr index_t K0 = kKPerBlock / K1; - constexpr index_t N2 = get_warp_size() / K0; -#if 1 // coalesce reading for each blocks - constexpr index_t N1 = kBlockSize / get_warp_size(); - static_assert(N2 != 0, "M2 is zero, which will lead to a division by zero error."); - static_assert(N1 != 0, "M1 is zero, which will lead to a division by zero error."); - constexpr index_t N0 = kNPerBlock / (N2 * N1); - - return make_static_tile_distribution( - tile_distribution_encoding, - tuple, sequence>, - tuple, sequence<1, 2>>, - tuple, sequence<2, 0>>, - sequence<1, 2>, - sequence<0, 1>>{}); -#else // coalesce reading for each warps - constexpr index_t N0 = kBlockSize / get_warp_size(); - constexpr index_t N1 = kNPerBlock / (N2 * N0); - - return make_static_tile_distribution( - tile_distribution_encoding, - tuple, sequence>, - tuple, sequence<1, 2>>, - tuple, sequence<2, 0>>, - sequence<1, 2>, - sequence<1, 1>>{}); -#endif + constexpr index_t N1 = Problem::VectorLoadSize / sizeof(BDataType); + constexpr index_t N0 = kNPerBlock / N1; + constexpr index_t total_pixels = kNPerBlock * kKPerBlock / kBlockSize; + static_assert(total_pixels % N1 == 0); + constexpr index_t K3 = total_pixels / N1; + constexpr index_t kKPack = GetSmemPackB(); + static_assert(kKPack % K3 == 0); + constexpr index_t K2 = kKPack / K3; // TODO: this dimention could be outside single wave + constexpr index_t warp_size = get_warp_size(); + if constexpr(warp_size % (K2 * N0) == 0) + { + constexpr index_t K1 = warp_size / (K2 * N0); + constexpr 
index_t K0 = kBlockSize / warp_size; + + return make_static_tile_distribution( + tile_distribution_encoding, + tuple, sequence>, + tuple, sequence<2, 1, 2>>, + tuple, sequence<1, 0, 2>>, + sequence<1, 2>, + sequence<1, 3>>{}); + } + else + { + constexpr index_t K1 = (K2 * N0) / get_warp_size(); + constexpr index_t K2_m = K2 / K1; + constexpr index_t K0 = kBlockSize / get_warp_size() / K1; + static_assert(kKPerBlock == K0 * K1 * K2_m * K3); + return make_static_tile_distribution( + tile_distribution_encoding, + tuple, sequence>, + tuple, sequence<1, 2>>, + tuple, sequence<0, 2>>, + sequence<1, 2>, + sequence<1, 3>>{}); + } + } + + template + CK_TILE_HOST_DEVICE static constexpr auto MakeShuffledARegBlockDescriptor() + { + using ALayout = remove_cvref_t; + using ADataType = remove_cvref_t; + static_assert(std::is_same_v); + constexpr index_t kBlockSize = Problem::kBlockSize; + constexpr index_t kMPerBlock = Problem::BlockGemmShape::kM; + constexpr index_t kKPerBlock = Problem::BlockGemmShape::kK; + + constexpr index_t M1 = Problem::VectorLoadSize / sizeof(ADataType); + constexpr index_t M0 = kMPerBlock / M1; + constexpr index_t total_pixels = kMPerBlock * kKPerBlock / kBlockSize; + static_assert(total_pixels % M1 == 0); + constexpr index_t K3 = total_pixels / M1; + constexpr index_t kKPack = GetSmemPackA(); + static_assert(kKPack % K3 == 0); + constexpr index_t K2 = kKPack / K3; // TODO: this dimention could be outside single wave + constexpr index_t warp_size = get_warp_size(); + if constexpr(warp_size % (K2 * M0) == 0) + { + constexpr index_t K1 = warp_size / (K2 * M0); + constexpr index_t K0 = kBlockSize / warp_size; + + return make_static_tile_distribution( + tile_distribution_encoding, + tuple, sequence>, + tuple, sequence<2, 1, 2>>, + tuple, sequence<1, 0, 2>>, + sequence<1, 2>, + sequence<1, 3>>{}); + } + else + { + constexpr index_t K1 = (K2 * M0) / get_warp_size(); + constexpr index_t K2_m = K2 / K1; + constexpr index_t K0 = kBlockSize / get_warp_size() / 
K1; + static_assert(kKPerBlock == K0 * K1 * K2_m * K3); + return make_static_tile_distribution( + tile_distribution_encoding, + tuple, sequence>, + tuple, sequence<1, 2>>, + tuple, sequence<0, 2>>, + sequence<1, 2>, + sequence<1, 3>>{}); + } } template diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_problem.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_problem.hpp index 1156f549b..3c43790bd 100644 --- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_problem.hpp +++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_problem.hpp @@ -3,40 +3,133 @@ #pragma once -#include "ck_tile/core.hpp" #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_scheduler.hpp" namespace ck_tile { -static constexpr int _VectorSize = 16; - template -struct GemmPipelineProblem +struct GemmPipelineProblemBase { - using ADataType = remove_cvref_t; - using BDataType = remove_cvref_t; - using CDataType = remove_cvref_t; + using GemmTraits = remove_cvref_t; + + using ADataType = remove_cvref_t; + using BDataType = remove_cvref_t; + using CDataType = remove_cvref_t; + using BlockGemmShape = remove_cvref_t; - using GemmTraits = remove_cvref_t; using ALayout = remove_cvref_t; using BLayout = remove_cvref_t; using CLayout = remove_cvref_t; - static constexpr index_t kBlockSize = BlockGemmShape::NumWarps * get_warp_size(); - static constexpr bool kPadA = GemmTraits::kPadA; - static constexpr bool kPadB = GemmTraits::kPadB; - static constexpr bool kPadC = GemmTraits::kPadC; + static constexpr index_t VectorLoadSize = GemmTraits::_VectorSize; + static constexpr index_t kBlockSize = BlockGemmShape::NumWarps * get_warp_size(); + + static constexpr bool kPadM = GemmTraits::kPadM; + static constexpr bool kPadN = GemmTraits::kPadN; + static constexpr bool kPadK = GemmTraits::kPadK; + + CK_TILE_HOST_DEVICE static constexpr auto GetAlignmentA() + { + if constexpr(std::is_same_v) + { + constexpr index_t pixels_per_thread = + BlockGemmShape::kM * BlockGemmShape::kK / kBlockSize; + 
return pixels_per_thread < VectorLoadSize / sizeof(ADataType) + ? pixels_per_thread + : VectorLoadSize / sizeof(ADataType); + } + else + { + return VectorLoadSize / sizeof(ADataType); + } + } + + CK_TILE_HOST_DEVICE static constexpr auto GetAlignmentB() + { + if constexpr(std::is_same_v) + { + constexpr index_t pixels_per_thread = + BlockGemmShape::kN * BlockGemmShape::kK / kBlockSize; + return pixels_per_thread < VectorLoadSize / sizeof(BDataType) + ? pixels_per_thread + : VectorLoadSize / sizeof(BDataType); + } + else + { + return VectorLoadSize / sizeof(BDataType); + } + } + + CK_TILE_HOST_DEVICE static constexpr auto GetAlignmentC() + { + if constexpr(std::is_same_v) + { + constexpr index_t N1 = kBlockSize / get_warp_size(); + constexpr index_t N2 = std::min(BlockGemmShape::kN / N1, get_warp_size()); + constexpr index_t M0 = get_warp_size() / N2; + constexpr index_t M1 = BlockGemmShape::kM / M0; - static constexpr index_t VectorSizeA = kPadA ? 1 : _VectorSize / sizeof(ADataType); - static constexpr index_t VectorSizeB = kPadB ? 1 : _VectorSize / sizeof(BDataType); - static constexpr index_t VectorSizeC = kPadC ? 1 : _VectorSize / sizeof(CDataType); + return std::min(M1, static_cast(VectorLoadSize / sizeof(CDataType))); + } + else + { + constexpr index_t M1 = kBlockSize / get_warp_size(); + constexpr index_t M2 = std::min(BlockGemmShape::kM / M1, get_warp_size()); + constexpr index_t N0 = get_warp_size() / M2; + constexpr index_t N1 = BlockGemmShape::kN / N0; + + return std::min(N1, static_cast(VectorLoadSize / sizeof(CDataType))); + } + } + + static constexpr index_t VectorSizeA = []() { + if constexpr(std::is_same_v) + { + return kPadK ? 1 : GetAlignmentA(); + } + else + { + return kPadM ? 1 : GetAlignmentA(); + } + }(); + + static constexpr index_t VectorSizeB = []() { + if constexpr(std::is_same_v) + { + return kPadN ? 1 : GetAlignmentB(); + } + else + { + return kPadK ? 
1 : GetAlignmentB(); + } + }(); + + static constexpr index_t VectorSizeC = []() { + if constexpr(std::is_same_v) + { + return kPadN ? 1 : GetAlignmentC(); + } + else + { + return kPadM ? 1 : GetAlignmentC(); + } + }(); }; +// Alias for GemmPipelineProblem +template +using GemmPipelineProblem = + GemmPipelineProblemBase; + template -struct UniversalGemmPipelineProblem +struct UniversalGemmPipelineProblem : public GemmPipelineProblemBase { - using ADataType = remove_cvref_t; - using BDataType = remove_cvref_t; - using CDataType = remove_cvref_t; - using BlockGemmShape = remove_cvref_t; - using GemmTraits = remove_cvref_t; - - using ALayout = remove_cvref_t; - using BLayout = remove_cvref_t; - using CLayout = remove_cvref_t; - - static constexpr auto Scheduler = Scheduler_; - static constexpr auto HasHotLoop = HasHotLoop_; - static constexpr auto TailNum = TailNum_; - static constexpr index_t kBlockSize = BlockGemmShape::NumWarps * get_warp_size(); - - static constexpr bool kPadA = GemmTraits::kPadA; - static constexpr bool kPadB = GemmTraits::kPadB; - static constexpr bool kPadC = GemmTraits::kPadC; - - static constexpr index_t VectorSizeA = kPadA ? _VectorSize / sizeof(ADataType) : 1; - static constexpr index_t VectorSizeB = kPadB ? _VectorSize / sizeof(BDataType) : 1; - static constexpr index_t VectorSizeC = kPadC ? 
_VectorSize / sizeof(CDataType) : 1; + static constexpr auto Scheduler = Scheduler_; + static constexpr auto HasHotLoop = HasHotLoop_; + static constexpr auto TailNum = TailNum_; }; } // namespace ck_tile diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_universal_pipeline_ag_bg_cr_policy.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_universal_pipeline_ag_bg_cr_policy.hpp index 7044a5314..207f1f9e4 100644 --- a/include/ck_tile/ops/gemm/pipeline/gemm_universal_pipeline_ag_bg_cr_policy.hpp +++ b/include/ck_tile/ops/gemm/pipeline/gemm_universal_pipeline_ag_bg_cr_policy.hpp @@ -9,12 +9,8 @@ namespace ck_tile { // UniversalGemm Policy -template struct UniversalGemmPipelineAgBgCrPolicy { - using LayoutA = remove_cvref_t; - using LayoutB = remove_cvref_t; - using LayoutC = remove_cvref_t; static constexpr auto I0 = number<0>{}; static constexpr auto I1 = number<1>{}; @@ -34,13 +30,14 @@ struct UniversalGemmPipelineAgBgCrPolicy TransposeC>; using ADataType = remove_cvref_t; + using ALayout = remove_cvref_t; constexpr index_t MPerBlock = Problem::BlockGemmShape::kM; constexpr index_t KPerBlock = Problem::BlockGemmShape::kK; constexpr index_t K1 = WarpGemm::kK; constexpr index_t K0 = KPerBlock / K1; - if constexpr(std::is_same::value) + if constexpr(std::is_same::value) { constexpr auto MLdsLayer = 32 * 4 / KPerBlock / sizeof(ADataType) < 1 ? 
1 @@ -176,13 +173,15 @@ struct UniversalGemmPipelineAgBgCrPolicy using BDataType = remove_cvref_t; + using BLayout = remove_cvref_t; + constexpr index_t NPerBlock = Problem::BlockGemmShape::kN; constexpr index_t KPerBlock = Problem::BlockGemmShape::kK; constexpr index_t K1 = WarpGemm::kK; constexpr index_t K0 = KPerBlock / K1; - if constexpr(std::is_same::value) + if constexpr(std::is_same::value) { // NLdsLayer * K0 as logical Bank constexpr auto NLdsLayer = 32 * 4 / KPerBlock / sizeof(BDataType) < 1 @@ -331,72 +330,285 @@ struct UniversalGemmPipelineAgBgCrPolicy return smem_size; } + template + CK_TILE_HOST_DEVICE static constexpr auto GetSmemPackA() + { + using ADataType = remove_cvref_t; + return Problem::VectorLoadSize / sizeof(ADataType); + } + + template + CK_TILE_HOST_DEVICE static constexpr auto GetSmemPackB() + { + using BDataType = remove_cvref_t; + return Problem::VectorLoadSize / sizeof(BDataType); + } + template CK_TILE_HOST_DEVICE static constexpr auto MakeADramTileDistribution() { - using WarpGemm = WarpGemmMfmaDispatcher; + using ADataType = remove_cvref_t; + using ALayout = remove_cvref_t; constexpr index_t BlockSize = Problem::kBlockSize; constexpr index_t MPerBlock = Problem::BlockGemmShape::kM; constexpr index_t KPerBlock = Problem::BlockGemmShape::kK; - constexpr index_t K1 = WarpGemm::kK; - constexpr index_t K0 = KPerBlock / K1; - constexpr index_t M2 = get_warp_size() / K0; - - constexpr index_t M1 = BlockSize / get_warp_size(); - static_assert(M2 != 0, "M2 is zero, which will lead to a division by zero error."); - static_assert(M1 != 0, "M1 is zero, which will lead to a division by zero error."); - constexpr index_t M0 = MPerBlock / (M2 * M1); - - return make_static_tile_distribution( - tile_distribution_encoding, - tuple, sequence>, - tuple, sequence<1, 2>>, - tuple, sequence<2, 0>>, - sequence<1, 2>, - sequence<0, 1>>{}); + if constexpr(std::is_same_v) + { + constexpr index_t M1 = Problem::VectorLoadSize / sizeof(ADataType); + constexpr 
index_t M0 = MPerBlock / M1; + constexpr index_t total_pixels = MPerBlock * KPerBlock / BlockSize; + static_assert(total_pixels % M1 == 0); + constexpr index_t K3 = total_pixels / M1; + constexpr index_t KPack = GetSmemPackA(); + static_assert(KPack % K3 == 0); + constexpr index_t K2 = KPack / K3; + if constexpr(get_warp_size() % (K2 * M0) == 0) + { + constexpr index_t K1 = get_warp_size() / (K2 * M0); + constexpr index_t K0 = BlockSize / get_warp_size(); + static_assert(KPerBlock == K0 * K1 * K2 * K3); + return make_static_tile_distribution( + tile_distribution_encoding, + tuple, sequence>, + tuple, sequence<2, 1, 2>>, + tuple, sequence<1, 0, 2>>, + sequence<2, 1>, + sequence<3, 1>>{}); + } + else + { + constexpr index_t K1 = (K2 * M0) / get_warp_size(); + constexpr index_t K2_m = K2 / K1; + constexpr index_t K0 = BlockSize / get_warp_size() / K1; + static_assert(KPerBlock == K0 * K1 * K2_m * K3); + return make_static_tile_distribution( + tile_distribution_encoding, + tuple, sequence>, + tuple, sequence<1, 2>>, + tuple, sequence<0, 2>>, + sequence<2, 1>, + sequence<3, 1>>{}); + } + } + else + { + constexpr index_t K1 = Problem::VectorLoadSize / sizeof(ADataType); + constexpr index_t K0 = KPerBlock / K1; + constexpr index_t M2 = get_warp_size() / K0; + if constexpr(get_warp_size() % (M2 * K0) == 0) + { + constexpr index_t M1 = BlockSize / get_warp_size(); + static_assert(M2 != 0, "M2 is zero, which will lead to a division by zero error."); + static_assert(M1 != 0, "M1 is zero, which will lead to a division by zero error."); + constexpr index_t M0 = MPerBlock / (M2 * M1); + return make_static_tile_distribution( + tile_distribution_encoding, + tuple, sequence>, + tuple, sequence<1, 2>>, + tuple, sequence<2, 0>>, + sequence<1, 2>, + sequence<0, 1>>{}); + } + else + { + constexpr index_t M0 = BlockSize / get_warp_size(); + constexpr index_t M1 = MPerBlock / (M2 * M0); + return make_static_tile_distribution( + tile_distribution_encoding, + tuple, sequence>, + tuple, 
sequence<1, 2>>, + tuple, sequence<2, 0>>, + sequence<1, 2>, + sequence<1, 1>>{}); + } + } } template CK_TILE_HOST_DEVICE static constexpr auto MakeBDramTileDistribution() { - using WarpGemm = WarpGemmMfmaDispatcher; + using BDataType = remove_cvref_t; + using BLayout = remove_cvref_t; constexpr index_t BlockSize = Problem::kBlockSize; constexpr index_t NPerBlock = Problem::BlockGemmShape::kN; constexpr index_t KPerBlock = Problem::BlockGemmShape::kK; - constexpr index_t K1 = WarpGemm::kK; - constexpr index_t K0 = KPerBlock / K1; - constexpr index_t N2 = get_warp_size() / K0; - - constexpr index_t N1 = BlockSize / get_warp_size(); - static_assert(N2 != 0, "M2 is zero, which will lead to a division by zero error."); - static_assert(N1 != 0, "M1 is zero, which will lead to a division by zero error."); - constexpr index_t N0 = NPerBlock / (N2 * N1); - - return make_static_tile_distribution( - tile_distribution_encoding, - tuple, sequence>, - tuple, sequence<1, 2>>, - tuple, sequence<2, 0>>, - sequence<1, 2>, - sequence<0, 1>>{}); + if constexpr(std::is_same_v) + { + constexpr index_t N1 = Problem::VectorLoadSize / sizeof(BDataType); + constexpr index_t N0 = NPerBlock / N1; + constexpr index_t total_pixels = NPerBlock * KPerBlock / BlockSize; + static_assert(total_pixels % N1 == 0); + constexpr index_t K3 = total_pixels / N1; + constexpr index_t KPack = GetSmemPackB(); + static_assert(KPack % K3 == 0); + constexpr index_t K2 = KPack / K3; + if constexpr(get_warp_size() % (K2 * N0) == 0) + { + constexpr index_t K1 = get_warp_size() / (K2 * N0); + constexpr index_t K0 = BlockSize / get_warp_size(); + static_assert(KPerBlock == K0 * K1 * K2 * K3); + return make_static_tile_distribution( + tile_distribution_encoding, + tuple, sequence>, + tuple, sequence<2, 1, 2>>, + tuple, sequence<1, 0, 2>>, + sequence<2, 1>, + sequence<3, 1>>{}); + } + else + { + constexpr index_t K1 = (K2 * N0) / get_warp_size(); + constexpr index_t K2_m = K2 / K1; + constexpr index_t K0 = BlockSize / 
get_warp_size() / K1; + static_assert(KPerBlock == K0 * K1 * K2_m * K3); + return make_static_tile_distribution( + tile_distribution_encoding, + tuple, sequence>, + tuple, sequence<1, 2>>, + tuple, sequence<0, 2>>, + sequence<2, 1>, + sequence<3, 1>>{}); + } + } + else + { + + constexpr index_t K1 = Problem::VectorLoadSize / sizeof(BDataType); + constexpr index_t K0 = KPerBlock / K1; + constexpr index_t N2 = get_warp_size() / K0; + // coalesce reading for each blocks + if constexpr(get_warp_size() % (N2 * K0) == 0) + { + constexpr index_t N1 = BlockSize / get_warp_size(); + static_assert(N2 != 0, "N2 is zero, which will lead to a division by zero error."); + static_assert(N1 != 0, "N1 is zero, which will lead to a division by zero error."); + constexpr index_t N0 = NPerBlock / (N2 * N1); + + return make_static_tile_distribution( + tile_distribution_encoding, + tuple, sequence>, + tuple, sequence<1, 2>>, + tuple, sequence<2, 0>>, + sequence<1, 2>, + sequence<0, 1>>{}); + } + // coalesce reading for each warps + else + { + constexpr index_t N0 = BlockSize / get_warp_size(); + constexpr index_t N1 = NPerBlock / (N2 * N0); + + return make_static_tile_distribution( + tile_distribution_encoding, + tuple, sequence>, + tuple, sequence<1, 2>>, + tuple, sequence<2, 0>>, + sequence<1, 2>, + sequence<1, 1>>{}); + } + } + } + + template + CK_TILE_HOST_DEVICE static constexpr auto MakeShuffledARegBlockDescriptor() + { + using ALayout = remove_cvref_t; + using ADataType = remove_cvref_t; + static_assert(std::is_same_v); + constexpr index_t BlockSize = Problem::kBlockSize; + constexpr index_t MPerBlock = Problem::BlockGemmShape::kN; + constexpr index_t KPerBlock = Problem::BlockGemmShape::kK; + + constexpr index_t M1 = Problem::VectorLoadSize / sizeof(ADataType); + constexpr index_t M0 = MPerBlock / M1; + constexpr index_t total_pixels = MPerBlock * KPerBlock / BlockSize; + static_assert(total_pixels % M1 == 0); + constexpr index_t K3 = total_pixels / M1; + constexpr index_t 
kKPack = GetSmemPackB(); + static_assert(kKPack % K3 == 0); + constexpr index_t K2 = kKPack / K3; // TODO: this dimention could be outside single wave + constexpr index_t warp_size = get_warp_size(); + if constexpr(warp_size % (K2 * M0) == 0) + { + constexpr index_t K1 = warp_size / (K2 * M0); + constexpr index_t K0 = BlockSize / warp_size; + + return make_static_tile_distribution( + tile_distribution_encoding, + tuple, sequence>, + tuple, sequence<2, 1, 2>>, + tuple, sequence<1, 0, 2>>, + sequence<1, 2>, + sequence<1, 3>>{}); + } + else + { + constexpr index_t K1 = (K2 * M0) / get_warp_size(); + constexpr index_t K2_m = K2 / K1; + constexpr index_t K0 = BlockSize / get_warp_size() / K1; + static_assert(KPerBlock == K0 * K1 * K2_m * K3); + return make_static_tile_distribution( + tile_distribution_encoding, + tuple, sequence>, + tuple, sequence<1, 2>>, + tuple, sequence<0, 2>>, + sequence<1, 2>, + sequence<1, 3>>{}); + } + } + + template + CK_TILE_HOST_DEVICE static constexpr auto MakeShuffledBRegBlockDescriptor() + { + using BLayout = remove_cvref_t; + using BDataType = remove_cvref_t; + static_assert(std::is_same_v); + constexpr index_t BlockSize = Problem::kBlockSize; + constexpr index_t NPerBlock = Problem::BlockGemmShape::kN; + constexpr index_t KPerBlock = Problem::BlockGemmShape::kK; + + constexpr index_t N1 = Problem::VectorLoadSize / sizeof(BDataType); + constexpr index_t N0 = NPerBlock / N1; + constexpr index_t total_pixels = NPerBlock * KPerBlock / BlockSize; + static_assert(total_pixels % N1 == 0); + constexpr index_t K3 = total_pixels / N1; + constexpr index_t kKPack = GetSmemPackB(); + static_assert(kKPack % K3 == 0); + constexpr index_t K2 = kKPack / K3; // TODO: this dimention could be outside single wave + constexpr index_t warp_size = get_warp_size(); + if constexpr(warp_size % (K2 * N0) == 0) + { + constexpr index_t K1 = warp_size / (K2 * N0); + constexpr index_t K0 = BlockSize / warp_size; + + return make_static_tile_distribution( + 
tile_distribution_encoding, + tuple, sequence>, + tuple, sequence<2, 1, 2>>, + tuple, sequence<1, 0, 2>>, + sequence<1, 2>, + sequence<1, 3>>{}); + } + else + { + constexpr index_t K1 = (K2 * N0) / get_warp_size(); + constexpr index_t K2_m = K2 / K1; + constexpr index_t K0 = BlockSize / get_warp_size() / K1; + static_assert(KPerBlock == K0 * K1 * K2_m * K3); + return make_static_tile_distribution( + tile_distribution_encoding, + tuple, sequence>, + tuple, sequence<1, 2>>, + tuple, sequence<0, 2>>, + sequence<1, 2>, + sequence<1, 3>>{}); + } } template diff --git a/include/ck_tile/ops/gemm/pipeline/tile_gemm_traits.hpp b/include/ck_tile/ops/gemm/pipeline/tile_gemm_traits.hpp index 9d050be2f..34756c3ff 100644 --- a/include/ck_tile/ops/gemm/pipeline/tile_gemm_traits.hpp +++ b/include/ck_tile/ops/gemm/pipeline/tile_gemm_traits.hpp @@ -3,19 +3,23 @@ #pragma once +#include "ck_tile/core.hpp" + namespace ck_tile { -template struct TileGemmTraits { - static constexpr bool kPadA = kPadA_; - static constexpr bool kPadB = kPadB_; - static constexpr bool kPadC = kPadC_; + static constexpr bool kPadM = kPadM_; + static constexpr bool kPadN = kPadN_; + static constexpr bool kPadK = kPadK_; + + static constexpr int _VectorSize = 16; using ALayout = ALayout_; using BLayout = BLayout_; diff --git a/test/ck_tile/gemm/test_gemm_mem_pipeline_util.hpp b/test/ck_tile/gemm/test_gemm_mem_pipeline_util.hpp index 1b243ab43..6b4789833 100644 --- a/test/ck_tile/gemm/test_gemm_mem_pipeline_util.hpp +++ b/test/ck_tile/gemm/test_gemm_mem_pipeline_util.hpp @@ -53,9 +53,9 @@ class TestCkTileGemmMemPipeline : public ::testing::Test constexpr ck_tile::index_t N_Warp_Tile = 32; constexpr ck_tile::index_t K_Warp_Tile = 8; - constexpr bool kPadA = true; - constexpr bool kPadB = true; - constexpr bool kPadC = true; + constexpr bool kPadM = true; + constexpr bool kPadN = true; + constexpr bool kPadK = true; constexpr int kBlockPerCu = 1; @@ -68,9 +68,9 @@ class TestCkTileGemmMemPipeline : public 
::testing::Test using TilePartitioner = ck_tile::GemmTilePartitioner; using GemmEpilogue = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem>; + ck_tile::Default2DEpilogueProblem>; - using Traits = ck_tile::TileGemmTraits; + using Traits = ck_tile::TileGemmTraits; using BaseGemmPipeline = ck_tile::BaseGemmPipelineAgBgCrMem< ck_tile::GemmPipelineProblem>; @@ -108,7 +108,7 @@ class TestCkTileGemmMemPipeline : public ::testing::Test if(s.log_level_ > 0) { - std::cout << "Lunching kernel with args:" + std::cout << "Launching kernel with args:" << " grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}" << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z << "}" << std::endl; -- GitLab From 489c78d0735b7817859a22722e381f62f345cea7 Mon Sep 17 00:00:00 2001 From: Illia Silin <98187287+illsilin@users.noreply.github.com> Date: Tue, 12 Nov 2024 09:35:33 -0800 Subject: [PATCH 052/153] test rocm6.3 rc1 build 20 (#1659) --- Dockerfile | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Dockerfile b/Dockerfile index e2e2bc276..791d1d9f3 100644 --- a/Dockerfile +++ b/Dockerfile @@ -24,10 +24,10 @@ RUN if [ "$ROCMVERSION" != "6.3" ]; then \ sh -c "echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] $DEB_ROCM_REPO focal main > /etc/apt/sources.list.d/rocm.list" && \ sh -c 'echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] https://repo.radeon.com/amdgpu/$ROCMVERSION/ubuntu focal main > /etc/apt/sources.list.d/amdgpu.list'; \ elif [ "$ROCMVERSION" = "6.3" ] && [ "$compiler_version" = "rc1" ]; then \ - sh -c "wget http://artifactory-cdn.amd.com/artifactory/list/amdgpu-deb/amdgpu-install-internal_6.3.0.1-20.04-1_all.deb --no-check-certificate" && \ - apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install dialog libpopt0 rsync && DEBIAN_FRONTEND=noninteractive apt-get install ./amdgpu-install-internal_6.3.0.1-20.04-1_all.deb && \ - sh -c 'echo deb [arch=amd64 trusted=yes] 
http://compute-artifactory.amd.com/artifactory/list/rocm-release-archive-20.04-deb/ 6.3.0.1 rel-5 > /etc/apt/sources.list.d/rocm-build.list' && \ - amdgpu-repo --amdgpu-build=2033700; \ + sh -c "wget http://artifactory-cdn.amd.com/artifactory/list/amdgpu-deb/amdgpu-install-internal_6.3-20.04-1_all.deb --no-check-certificate" && \ + apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install dialog libpopt0 rsync && DEBIAN_FRONTEND=noninteractive apt-get install ./amdgpu-install-internal_6.3-20.04-1_all.deb && \ + sh -c 'echo deb [arch=amd64 trusted=yes] http://compute-artifactory.amd.com/artifactory/list/rocm-release-archive-20.04-deb/ 6.3 rel-20 > /etc/apt/sources.list.d/rocm-build.list' && \ + amdgpu-repo --amdgpu-build=2074281; \ fi RUN sh -c "echo deb http://mirrors.kernel.org/ubuntu focal main universe | tee -a /etc/apt/sources.list" -- GitLab From d20735691ccb9429ed66f42f831385c709707d62 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bart=C5=82omiej=20Kocot?= Date: Wed, 13 Nov 2024 11:46:18 +0100 Subject: [PATCH 053/153] [CK TILE] Update gemm universal pipeline (#1644) * [CK TILE] Update gemm universal pipeline * Fixes * fix * Rebase --- ...emm_universal_pipeline_ag_bg_cr_policy.hpp | 399 +++++------------- 1 file changed, 116 insertions(+), 283 deletions(-) diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_universal_pipeline_ag_bg_cr_policy.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_universal_pipeline_ag_bg_cr_policy.hpp index 207f1f9e4..94b0faf03 100644 --- a/include/ck_tile/ops/gemm/pipeline/gemm_universal_pipeline_ag_bg_cr_policy.hpp +++ b/include/ck_tile/ops/gemm/pipeline/gemm_universal_pipeline_ag_bg_cr_policy.hpp @@ -18,289 +18,136 @@ struct UniversalGemmPipelineAgBgCrPolicy static constexpr bool TransposeC = true; + template + CK_TILE_HOST_DEVICE static constexpr auto GetVectorLoadSize() + { + constexpr index_t BlockSize = Problem::kBlockSize; + constexpr index_t KPerBlock = Problem::BlockGemmShape::kK; + constexpr index_t elements_per_thread = 
MNPerBlock * KPerBlock / BlockSize; + + if constexpr(elements_per_thread % (16 / sizeof(DataType)) == 0) + { + return (16 / sizeof(DataType)); + } + else if constexpr(elements_per_thread % (8 / sizeof(DataType)) == 0) + { + return (8 / sizeof(DataType)); + } + else if constexpr(elements_per_thread % (4 / sizeof(DataType)) == 0 && + sizeof(DataType) >= 4) + { + return (4 / sizeof(DataType)); + } + else if constexpr(elements_per_thread % (2 / sizeof(DataType)) == 0 && + sizeof(DataType) >= 2) + { + return (2 / sizeof(DataType)); + } + else + { + return 1; + } + } + template CK_TILE_HOST_DEVICE static constexpr auto MakeALdsBlockDescriptor() { - using WarpGemm = WarpGemmMfmaDispatcher; using ADataType = remove_cvref_t; - using ALayout = remove_cvref_t; constexpr index_t MPerBlock = Problem::BlockGemmShape::kM; constexpr index_t KPerBlock = Problem::BlockGemmShape::kK; - constexpr index_t K1 = WarpGemm::kK; - constexpr index_t K0 = KPerBlock / K1; - - if constexpr(std::is_same::value) - { - constexpr auto MLdsLayer = 32 * 4 / KPerBlock / sizeof(ADataType) < 1 - ? 
1 - : 32 * 4 / KPerBlock / sizeof(ADataType); - constexpr auto a_lds_block_desc = make_naive_tensor_descriptor( - make_tuple(K0 * number{}, number{}, K1), - make_tuple(K1, number{}, I1)); - - constexpr auto a_lds_block_desc_permuted = transform_tensor_descriptor( - a_lds_block_desc, - make_tuple(make_xor_transform(make_tuple(number{}, - number{})), - make_pass_through_transform(K1)), - make_tuple(sequence<1, 0>{}, sequence<2>{}), - make_tuple(sequence<1, 0>{}, sequence<2>{})); - - constexpr auto a_lds_block_desc_ak0_kMLdsLayer_m_ak1 = transform_tensor_descriptor( - a_lds_block_desc_permuted, - make_tuple(make_unmerge_transform(make_tuple(K0, number{})), - make_pass_through_transform(number{}), - make_pass_through_transform(K1)), - make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{}), - make_tuple(sequence<0, 2>{}, sequence<1>{}, sequence<3>{})); - - constexpr auto a_lds_block_desc_m_k = transform_tensor_descriptor( - a_lds_block_desc_ak0_kMLdsLayer_m_ak1, - make_tuple(make_merge_transform_v3_division_mod(make_tuple(K0, K1)), - make_merge_transform_v3_division_mod( - make_tuple(number{}, number{}))), - make_tuple(sequence<0, 3>{}, sequence<1, 2>{}), - make_tuple(sequence<1>{}, sequence<0>{})); - - return a_lds_block_desc_m_k; - } - else // ColumnMajor A - { - // kfold and mpair dimension is not always required. - // more dimension in merge_transform increase the difficulty of generating immarg offset - // for compiler. - constexpr auto M0 = get_warp_size() * Problem::BlockGemmShape::BlockWarps::at(I0); - constexpr auto M1 = MPerBlock / M0; - - constexpr auto KThreadWrite = Problem::kBlockSize / M0; - constexpr auto K0PerThreadWrite = K0 / KThreadWrite; - constexpr auto KThreadRead = 64 / WarpGemm::kM; - constexpr auto K0PerThreadRead = K0 / KThreadRead; - - constexpr auto kfold = - (K1 * M0 * sizeof(ADataType) > 128) ? 1 : 128 / (K1 * M0 * sizeof(ADataType)); - constexpr auto KThreadReadPerm = - (kfold * K0PerThreadWrite / K0PerThreadRead) > 1 - ? 
KThreadRead / (kfold * K0PerThreadWrite / K0PerThreadRead) - : KThreadRead; - - // 1<=mpair<=kN0 - constexpr auto mpair = (K1 * WarpGemm::kM * sizeof(ADataType) > 128) - ? 1 - : ((128 / (K1 * WarpGemm::kM * sizeof(ADataType))) > M0 - ? M0 - : 128 / (K1 * WarpGemm::kM * sizeof(ADataType))); - - constexpr auto a_lds_block_desc = make_naive_tensor_descriptor_packed( - make_tuple(number{}, - number{}, - number{}, - number{}, - number{}, - K1)); - - constexpr auto a_lds_block_desc_permuted = transform_tensor_descriptor( - a_lds_block_desc, - make_tuple( - make_pass_through_transform(number{}), - make_pass_through_transform(number{}), - make_xor_transform( - make_tuple(number{}, number{})), - make_pass_through_transform(number{}), - make_pass_through_transform(K1)), - make_tuple( - sequence<0>{}, sequence<1>{}, sequence<2, 3>{}, sequence<4>{}, sequence<5>{}), - make_tuple( - sequence<0>{}, sequence<1>{}, sequence<2, 3>{}, sequence<4>{}, sequence<5>{})); - - constexpr auto a_lds_block_desc_unmerged = transform_tensor_descriptor( - a_lds_block_desc_permuted, - make_tuple( - make_pass_through_transform(number{}), - make_pass_through_transform(number{}), - make_unmerge_transform(make_tuple(number{}, number{})), - make_unmerge_transform(make_tuple(number{}, number{})), - make_pass_through_transform(number{}), - make_pass_through_transform(K1)), - make_tuple(sequence<0>{}, - sequence<1>{}, - sequence<2>{}, - sequence<3>{}, - sequence<4>{}, - sequence<5>{}), - make_tuple(sequence<1>{}, - sequence<2>{}, - sequence<0, 3>{}, - sequence<4, 5>{}, - sequence<6>{}, - sequence<7>{})); - - constexpr auto a_lds_block_desc_m_k = transform_tensor_descriptor( - a_lds_block_desc_unmerged, - make_tuple(make_merge_transform_v3_division_mod( - make_tuple(number{}, - number{}, - number{}, - number{}, - K1)), - make_merge_transform_v3_division_mod( - make_tuple(number{}, number{}, number{}))), - make_tuple(sequence<0, 1, 4, 2, 7>{}, sequence<5, 6, 3>{}), - make_tuple(sequence<1>{}, 
sequence<0>{})); - - return a_lds_block_desc_m_k; - } + constexpr index_t KPack = GetVectorLoadSize(); + + constexpr auto DataTypeSize = sizeof(ADataType); + constexpr auto MLdsLayer = + (32 * 4 / KPerBlock / DataTypeSize) < 1 ? 1 : (32 * 4 / KPerBlock / DataTypeSize); + + constexpr auto a_lds_block_desc_0 = make_naive_tensor_descriptor( + make_tuple(number{}, + number{}, + number{}), + make_tuple(number{}, number{}, number<1>{}), + number{}, + number<1>{}); + + constexpr auto a_lds_block_desc_permuted = transform_tensor_descriptor( + a_lds_block_desc_0, + make_tuple(make_xor_transform(make_tuple(number{}, + number{})), + make_pass_through_transform(number{})), + make_tuple(sequence<1, 0>{}, sequence<2>{}), + make_tuple(sequence<1, 0>{}, sequence<2>{})); + + constexpr auto a_lds_block_desc_xk0_mnldslayer_mn_xk1 = transform_tensor_descriptor( + a_lds_block_desc_permuted, + make_tuple(make_unmerge_transform( + make_tuple(number{}, number{})), + make_pass_through_transform(number{}), + make_pass_through_transform(number{})), + make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{}), + make_tuple(sequence<0, 2>{}, sequence<1>{}, sequence<3>{})); + + constexpr auto a_lds_block_desc = transform_tensor_descriptor( + a_lds_block_desc_xk0_mnldslayer_mn_xk1, + make_tuple(make_merge_transform_v3_division_mod( + make_tuple(number{}, number{})), + make_merge_transform_v3_division_mod( + make_tuple(number{}, number{}))), + make_tuple(sequence<1, 2>{}, sequence<0, 3>{}), + make_tuple(sequence<0>{}, sequence<1>{})); + + return a_lds_block_desc; } template CK_TILE_HOST_DEVICE static constexpr auto MakeBLdsBlockDescriptor() { - using WarpGemm = WarpGemmMfmaDispatcher; using BDataType = remove_cvref_t; - using BLayout = remove_cvref_t; - constexpr index_t NPerBlock = Problem::BlockGemmShape::kN; constexpr index_t KPerBlock = Problem::BlockGemmShape::kK; - - constexpr index_t K1 = WarpGemm::kK; - constexpr index_t K0 = KPerBlock / K1; - - if constexpr(std::is_same::value) - { - // 
NLdsLayer * K0 as logical Bank - constexpr auto NLdsLayer = 32 * 4 / KPerBlock / sizeof(BDataType) < 1 - ? 1 - : 32 * 4 / KPerBlock / sizeof(BDataType); - ; - constexpr auto b_lds_block_desc = make_naive_tensor_descriptor( - make_tuple(K0 * number{}, number{}, K1), - make_tuple(K1, number{}, I1)); - - constexpr auto b_lds_block_desc_permuted = transform_tensor_descriptor( - b_lds_block_desc, - make_tuple(make_xor_transform(make_tuple(number{}, - number{})), - make_pass_through_transform(K1)), - make_tuple(sequence<1, 0>{}, sequence<2>{}), - make_tuple(sequence<1, 0>{}, sequence<2>{})); - - constexpr auto b_lds_block_desc_bk0_kNLdsLayer_n_bk1 = transform_tensor_descriptor( - b_lds_block_desc_permuted, - make_tuple(make_unmerge_transform(make_tuple(K0, number{})), - make_pass_through_transform(number{}), - make_pass_through_transform(K1)), - make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{}), - make_tuple(sequence<0, 2>{}, sequence<1>{}, sequence<3>{})); - - constexpr auto b_lds_block_desc_n_k = transform_tensor_descriptor( - b_lds_block_desc_bk0_kNLdsLayer_n_bk1, - make_tuple(make_merge_transform_v3_division_mod(make_tuple(K0, K1)), - make_merge_transform_v3_division_mod( - make_tuple(number{}, number{}))), - make_tuple(sequence<0, 3>{}, sequence<1, 2>{}), - make_tuple(sequence<1>{}, sequence<0>{})); - - return b_lds_block_desc_n_k; - } - else // RowMajor B - { - constexpr auto N0 = get_warp_size() * Problem::BlockGemmShape::BlockWarps::at(I1); - constexpr auto N1 = NPerBlock / N0; - - constexpr auto KThreadWrite = Problem::kBlockSize / N0; - constexpr auto K0PerThreadWrite = K0 / KThreadWrite; - constexpr auto KThreadRead = 64 / WarpGemm::kN; - constexpr auto K0PerThreadRead = K0 / KThreadRead; - - constexpr auto kfold = - (K1 * N0 * sizeof(BDataType) > 128) ? 1 : 128 / (K1 * N0 * sizeof(BDataType)); - constexpr auto KThreadReadPerm = - (kfold * K0PerThreadWrite / K0PerThreadRead) > 1 - ? 
KThreadRead / (kfold * K0PerThreadWrite / K0PerThreadRead) - : KThreadRead; - - // 1<=npair<=kN0 - constexpr auto npair = (K1 * WarpGemm::kN * sizeof(BDataType) > 128) - ? 1 - : ((128 / (K1 * WarpGemm::kN * sizeof(BDataType))) > N0 - ? N0 - : 128 / (K1 * WarpGemm::kN * sizeof(BDataType))); - - constexpr auto b_lds_block_desc = make_naive_tensor_descriptor_packed( - make_tuple(number{}, - number{}, - number{}, - number{}, - number{}, - K1)); - - constexpr auto b_lds_block_desc_permuted = transform_tensor_descriptor( - b_lds_block_desc, - make_tuple( - make_pass_through_transform(number{}), - make_pass_through_transform(number{}), - make_xor_transform( - make_tuple(number{}, number{})), - make_pass_through_transform(number{}), - make_pass_through_transform(K1)), - make_tuple( - sequence<0>{}, sequence<1>{}, sequence<2, 3>{}, sequence<4>{}, sequence<5>{}), - make_tuple( - sequence<0>{}, sequence<1>{}, sequence<2, 3>{}, sequence<4>{}, sequence<5>{})); - - constexpr auto b_lds_block_desc_unmerged = transform_tensor_descriptor( - b_lds_block_desc_permuted, - make_tuple( - make_pass_through_transform(number{}), - make_pass_through_transform(number{}), - make_unmerge_transform(make_tuple(number{}, number{})), - make_unmerge_transform(make_tuple(number{}, number{})), - make_pass_through_transform(number{}), - make_pass_through_transform(K1)), - make_tuple(sequence<0>{}, - sequence<1>{}, - sequence<2>{}, - sequence<3>{}, - sequence<4>{}, - sequence<5>{}), - make_tuple(sequence<1>{}, - sequence<2>{}, - sequence<0, 3>{}, - sequence<4, 5>{}, - sequence<6>{}, - sequence<7>{})); - - constexpr auto b_lds_block_desc_n_k = transform_tensor_descriptor( - b_lds_block_desc_unmerged, - make_tuple(make_merge_transform_v3_division_mod( - make_tuple(number{}, - number{}, - number{}, - number{}, - K1)), - make_merge_transform_v3_division_mod( - make_tuple(number{}, number{}, number{}))), - make_tuple(sequence<0, 1, 4, 2, 7>{}, sequence<5, 6, 3>{}), - make_tuple(sequence<1>{}, 
sequence<0>{})); - - return b_lds_block_desc_n_k; - } + constexpr index_t KPack = GetVectorLoadSize(); + + constexpr auto DataTypeSize = sizeof(BDataType); + constexpr auto NLdsLayer = + (32 * 4 / KPerBlock / DataTypeSize) < 1 ? 1 : (32 * 4 / KPerBlock / DataTypeSize); + + constexpr auto b_lds_block_desc_0 = make_naive_tensor_descriptor( + make_tuple(number{}, + number{}, + number{}), + make_tuple(number{}, number{}, number<1>{}), + number{}, + number<1>{}); + + constexpr auto b_lds_block_desc_permuted = transform_tensor_descriptor( + b_lds_block_desc_0, + make_tuple(make_xor_transform(make_tuple(number{}, + number{})), + make_pass_through_transform(number{})), + make_tuple(sequence<1, 0>{}, sequence<2>{}), + make_tuple(sequence<1, 0>{}, sequence<2>{})); + + constexpr auto b_lds_block_desc_xk0_mnldslayer_mn_xk1 = transform_tensor_descriptor( + b_lds_block_desc_permuted, + make_tuple(make_unmerge_transform( + make_tuple(number{}, number{})), + make_pass_through_transform(number{}), + make_pass_through_transform(number{})), + make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{}), + make_tuple(sequence<0, 2>{}, sequence<1>{}, sequence<3>{})); + + constexpr auto b_lds_block_desc = transform_tensor_descriptor( + b_lds_block_desc_xk0_mnldslayer_mn_xk1, + make_tuple(make_merge_transform_v3_division_mod( + make_tuple(number{}, number{})), + make_merge_transform_v3_division_mod( + make_tuple(number{}, number{}))), + make_tuple(sequence<1, 2>{}, sequence<0, 3>{}), + make_tuple(sequence<0>{}, sequence<1>{})); + return b_lds_block_desc; } template @@ -330,20 +177,6 @@ struct UniversalGemmPipelineAgBgCrPolicy return smem_size; } - template - CK_TILE_HOST_DEVICE static constexpr auto GetSmemPackA() - { - using ADataType = remove_cvref_t; - return Problem::VectorLoadSize / sizeof(ADataType); - } - - template - CK_TILE_HOST_DEVICE static constexpr auto GetSmemPackB() - { - using BDataType = remove_cvref_t; - return Problem::VectorLoadSize / sizeof(BDataType); - } - template 
CK_TILE_HOST_DEVICE static constexpr auto MakeADramTileDistribution() { @@ -362,7 +195,7 @@ struct UniversalGemmPipelineAgBgCrPolicy constexpr index_t total_pixels = MPerBlock * KPerBlock / BlockSize; static_assert(total_pixels % M1 == 0); constexpr index_t K3 = total_pixels / M1; - constexpr index_t KPack = GetSmemPackA(); + constexpr index_t KPack = GetVectorLoadSize(); static_assert(KPack % K3 == 0); constexpr index_t K2 = KPack / K3; if constexpr(get_warp_size() % (K2 * M0) == 0) @@ -445,7 +278,7 @@ struct UniversalGemmPipelineAgBgCrPolicy constexpr index_t total_pixels = NPerBlock * KPerBlock / BlockSize; static_assert(total_pixels % N1 == 0); constexpr index_t K3 = total_pixels / N1; - constexpr index_t KPack = GetSmemPackB(); + constexpr index_t KPack = GetVectorLoadSize(); static_assert(KPack % K3 == 0); constexpr index_t K2 = KPack / K3; if constexpr(get_warp_size() % (K2 * N0) == 0) @@ -530,7 +363,7 @@ struct UniversalGemmPipelineAgBgCrPolicy constexpr index_t total_pixels = MPerBlock * KPerBlock / BlockSize; static_assert(total_pixels % M1 == 0); constexpr index_t K3 = total_pixels / M1; - constexpr index_t kKPack = GetSmemPackB(); + constexpr index_t kKPack = GetVectorLoadSize(); static_assert(kKPack % K3 == 0); constexpr index_t K2 = kKPack / K3; // TODO: this dimention could be outside single wave constexpr index_t warp_size = get_warp_size(); @@ -578,7 +411,7 @@ struct UniversalGemmPipelineAgBgCrPolicy constexpr index_t total_pixels = NPerBlock * KPerBlock / BlockSize; static_assert(total_pixels % N1 == 0); constexpr index_t K3 = total_pixels / N1; - constexpr index_t kKPack = GetSmemPackB(); + constexpr index_t kKPack = GetVectorLoadSize(); static_assert(kKPack % K3 == 0); constexpr index_t K2 = kKPack / K3; // TODO: this dimention could be outside single wave constexpr index_t warp_size = get_warp_size(); -- GitLab From 73f02a108347d626ee9b31789f0ff8b26ef87006 Mon Sep 17 00:00:00 2001 From: Taylor Ding Date: Wed, 13 Nov 2024 11:20:38 -0500 Subject: 
[PATCH 054/153] Move checks for compatibility from Argument() to IsSupportedArgument() (#1653) --- ..._grouped_conv_bwd_data_multiple_d_wmma_cshuffle.hpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_wmma_cshuffle.hpp index 6bb5d431c..17b7d962d 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_wmma_cshuffle.hpp @@ -381,10 +381,6 @@ struct DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle { tildes = {i_ztilde, i_ytilde, i_xtilde}; } - else - { - throw std::runtime_error("wrong! only implemented for 2D and 3D now"); - } const auto a_grid_desc_ak0_m_ak1 = transform_conv_to_gemm.template MakeADescriptor_AK0_M_AK1( @@ -749,6 +745,12 @@ struct DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle return false; } } + + // check number of dimension, only implemented for 2D and 3D now + if(NDimSpatial != 2 && NDimSpatial != 3) + { + return false; + } return true; } -- GitLab From efd92615459c83d1af3f226f846b395323374a74 Mon Sep 17 00:00:00 2001 From: Illia Silin <98187287+illsilin@users.noreply.github.com> Date: Wed, 13 Nov 2024 09:20:18 -0800 Subject: [PATCH 055/153] fix clang format (#1662) --- .../device_grouped_conv_bwd_data_multiple_d_wmma_cshuffle.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_wmma_cshuffle.hpp index 17b7d962d..3fb047f20 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_wmma_cshuffle.hpp +++ 
b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_wmma_cshuffle.hpp @@ -745,7 +745,7 @@ struct DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle return false; } } - + // check number of dimension, only implemented for 2D and 3D now if(NDimSpatial != 2 && NDimSpatial != 3) { -- GitLab From c1f8d53ce83c6ca6d15fec8d987974bc05008c16 Mon Sep 17 00:00:00 2001 From: feli Date: Thu, 14 Nov 2024 14:06:36 +0800 Subject: [PATCH 056/153] [Ck_tile] hot fix, fix rpcf param setting err (#1657) Co-authored-by: dummycoderfe --- .../pipeline/layernorm2d_fwd_pipeline_one_pass.hpp | 2 +- .../pipeline/layernorm2d_fwd_pipeline_two_pass.hpp | 14 +++++++++++--- .../ck_tile/ops/welford/block/block_welford.hpp | 13 +++++++++---- 3 files changed, 21 insertions(+), 8 deletions(-) diff --git a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_one_pass.hpp b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_one_pass.hpp index 4b83ed4fb..eefdaf917 100644 --- a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_one_pass.hpp +++ b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_one_pass.hpp @@ -121,7 +121,7 @@ struct Layernorm2dFwdPipelineOnePass auto [mean, var] = block_welford(acc, cur_count, max_count); block_welford_sync(mean, var, cur_count); block_welford_cross_warp_sync(mean, var, cur_count, smem); - block_tile_welford_post_scale_var(var, cur_count); + block_tile_welford_post_scale_var(var, cur_count, constant{}); // compute inv-std auto inv_std = tile_elementwise_in( diff --git a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_two_pass.hpp b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_two_pass.hpp index fadf56dfd..6a86cc43c 100644 --- a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_two_pass.hpp +++ b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_two_pass.hpp @@ -35,6 +35,7 @@ struct Layernorm2dFwdPipelineTwoPass 
static constexpr bool kNeedCrossWarpSync = Problem::kNeedCrossWarpSync; static constexpr bool kPadM = false; // TODO - BlockLayernorm2dFwdProblem::kPadM static constexpr bool kPadN = Problem::Traits::kPadN; + static constexpr bool kFastFDiv = Problem::Traits::kFastFDiv; static constexpr auto kFusedAdd = Problem::Traits::kFusedAdd; static constexpr auto kFusedQuant = Problem::Traits::kFusedQuant; @@ -137,15 +138,22 @@ struct Layernorm2dFwdPipelineTwoPass block_welford_sync(mean, var, cur_count); block_welford_cross_warp_sync(mean, var, cur_count, smem); - block_tile_welford_post_scale_var(var, cur_count); + block_tile_welford_post_scale_var(var, cur_count, constant{}); // compute inv-std auto inv_std = tile_elementwise_in( [&](const auto& v_) { - return type_convert(1.0f) / (sqrt(v_ + epsilon)); + if(kFastFDiv && std::is_same_v) + { + return type_convert(1.0f) * + __builtin_amdgcn_rcpf(sqrt(v_ + epsilon)); + } + else + { + return type_convert(1.0f) / sqrt(v_ + epsilon); + } }, var); - if constexpr(kSaveMean) store_tile(mean_window, cast_tile(mean)); if constexpr(kSaveInvStd) diff --git a/include/ck_tile/ops/welford/block/block_welford.hpp b/include/ck_tile/ops/welford/block/block_welford.hpp index 968895e38..56ca86d9d 100644 --- a/include/ck_tile/ops/welford/block/block_welford.hpp +++ b/include/ck_tile/ops/welford/block/block_welford.hpp @@ -47,8 +47,11 @@ struct BlockWelford auto x = ck_tile::type_convert(x_tensor[in_dstr_idx]); - welford_update( - mean_tensor(out_dstr_idx), var_tensor(out_dstr_idx), x, cur_count_); + welford_update(mean_tensor(out_dstr_idx), + var_tensor(out_dstr_idx), + x, + cur_count_, + constant{}); }); } }); @@ -159,7 +162,8 @@ struct BlockWelfordSync v_local_count, v_remote_mean, v_remote_var, - v_remote_count); + v_remote_count, + constant{}); }); } }); @@ -307,7 +311,8 @@ struct BlockWelfordCrossWarpSync v_local_count, v_remote_mean, v_remote_var, - v_remote_count); + v_remote_count, + constant{}); }); mean_tensor.get_thread_buffer()(i_0) 
= v_local_mean; -- GitLab From d805a461aae7454de448bc0305cce01192fbc198 Mon Sep 17 00:00:00 2001 From: Andriy Roshchenko <107577548+andriy-ca@users.noreply.github.com> Date: Thu, 14 Nov 2024 09:40:50 -0700 Subject: [PATCH 057/153] Fix example_convnd_fwd_max_xdl_int8 failures on MI300 (#1666) * Improve test verbosity. * BUGFIX: Add missing initialization for reduction buffer * Change default initialization method Performance may be affected for fp32 and int8 examples. * Improve test verbosity * Cleanup --- .../common.hpp | 2 +- .../run_convnd_fwd_max_example.inc | 57 +++++++++++++------ .../gemm_add_add_mean_meansquare_xdl_fp16.cpp | 2 +- 3 files changed, 43 insertions(+), 18 deletions(-) diff --git a/example/10_convnd_fwd_multiple_d_multiple_reduce/common.hpp b/example/10_convnd_fwd_multiple_d_multiple_reduce/common.hpp index 7e3130a1a..036f288d0 100644 --- a/example/10_convnd_fwd_multiple_d_multiple_reduce/common.hpp +++ b/example/10_convnd_fwd_multiple_d_multiple_reduce/common.hpp @@ -80,7 +80,7 @@ using RLayout = typename LayoutSettingSelector::RLayout; struct ExecutionConfig final { bool do_verification = true; - int init_method = 1; + int init_method = 2; bool time_kernel = false; }; diff --git a/example/10_convnd_fwd_multiple_d_multiple_reduce/run_convnd_fwd_max_example.inc b/example/10_convnd_fwd_multiple_d_multiple_reduce/run_convnd_fwd_max_example.inc index cebfeb51d..d61aee81a 100644 --- a/example/10_convnd_fwd_multiple_d_multiple_reduce/run_convnd_fwd_max_example.inc +++ b/example/10_convnd_fwd_multiple_d_multiple_reduce/run_convnd_fwd_max_example.inc @@ -73,16 +73,25 @@ bool run_convnd_fwd_max(const ck::utils::conv::ConvParam& problem_size, Tensor conv_output_device(conv_output_g_n_k_wos_desc); Tensor r0_device(r0_desc); + std::cout << "input: " << conv_input.mDesc << std::endl; + std::cout << "weight: " << conv_weight.mDesc << std::endl; + std::cout << "output: " << conv_output_device.mDesc << std::endl; + std::cout << "reduction: " << r0_device.mDesc 
<< std::endl << std::endl; + switch(config.init_method) { case 0: break; case 1: ck::utils::FillUniformDistributionIntegerValue{-8, 7}(conv_input); - ck::utils::FillUniformDistributionIntegerValue{-8, 7}(conv_weight); + ck::utils::FillUniformDistributionIntegerValue{-1, 1}(conv_weight); + break; + case 2: + ck::utils::FillUniformDistributionIntegerValue{-8, 7}(conv_input); + ck::utils::FillUniformDistribution{-1, 1}(conv_weight); break; default: - ck::utils::FillUniformDistribution{-5, 5}(conv_input); - ck::utils::FillUniformDistribution{-5, 5}(conv_weight); + ck::utils::FillUniformDistribution{-8, 7}(conv_input); + ck::utils::FillUniformDistribution{-1, 1}(conv_weight); } DeviceMem conv_input_device_buf(sizeof(ADataType) * conv_input.mDesc.GetElementSpaceSize()); @@ -161,15 +170,25 @@ bool run_convnd_fwd_max(const ck::utils::conv::ConvParam& problem_size, return false; } + // XXX: DeviceGroupedConvFwdMultipleDMultipleR_Xdl_CShuffle will not initialize r0. + r0_device_buf.SetValue(ck::NumericLimits::Lowest()); + const float avg_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel}); - const std::size_t flop = problem_size.GetFlops(); - const std::size_t num_btype = problem_size.GetByte(); + if(config.time_kernel) + { + const std::size_t flop = problem_size.GetFlops(); + const std::size_t num_btype = problem_size.GetByte(); - const float tflops = static_cast(flop) / 1.E9 / avg_time; - const float gb_per_sec = num_btype / 1.E6 / avg_time; - std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, " - << conv.GetTypeString() << std::endl; + const float tflops = static_cast(flop) / 1.E9 / avg_time; + const float gb_per_sec = num_btype / 1.E6 / avg_time; + std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec + << " GB/s, " << conv.GetTypeString() << std::endl; + } + else + { + std::cout << "FINISHED: " << conv.GetTypeString() << std::endl; + } if(config.do_verification) { @@ -189,6 
+208,7 @@ bool run_convnd_fwd_max(const ck::utils::conv::ConvParam& problem_size, BElementOp{}, PassThrough{}); + std::cout << "\nRunning verification on CPU." << std::endl; ref_invoker.Run(ref_argument); Tensor r0_host(r0_device.mDesc); @@ -273,13 +293,18 @@ bool run_convnd_fwd_max(const ck::utils::conv::ConvParam& problem_size, conv_output_device_buf.FromDevice(conv_output_device.mData.data()); r0_device_buf.FromDevice(r0_device.mData.data()); - return ck::utils::check_err(conv_output_device, - conv_output_host, - "Error: incorrect results! (Matrix E)", - 1e-5f, - 1e-4f) && - ck::utils::check_err( - r0_device, r0_host, "Error: incorrect results! (Matrix R0)", 1e-5f, 1e-4f); + auto pass = ck::utils::check_err(conv_output_device, + conv_output_host, + "Error: incorrect results! (Matrix E)", + 1e-3f, + 1e-3f); + pass = + pass && ck::utils::check_err( + r0_device, r0_host, "Error: incorrect results! (Matrix R0)", 1e-3f, 1e-3f); + if(pass) + std::cout << "Verification on CPU: PASS" << std::endl; + + return pass; } return true; diff --git a/example/16_gemm_multi_d_multi_reduces/gemm_add_add_mean_meansquare_xdl_fp16.cpp b/example/16_gemm_multi_d_multi_reduces/gemm_add_add_mean_meansquare_xdl_fp16.cpp index 2f6533d44..a46eaa481 100644 --- a/example/16_gemm_multi_d_multi_reduces/gemm_add_add_mean_meansquare_xdl_fp16.cpp +++ b/example/16_gemm_multi_d_multi_reduces/gemm_add_add_mean_meansquare_xdl_fp16.cpp @@ -198,7 +198,7 @@ int main() throw std::runtime_error("wrong! 
this device_op instance does not support this problem"); } - // init reducetion buffer to 0 + // init reduction buffer to 0 r0_device_buf.SetZero(); r1_device_buf.SetZero(); -- GitLab From 3b6a481e92d8ba2a9f9e87136678b05bcaf573a7 Mon Sep 17 00:00:00 2001 From: Illia Silin <98187287+illsilin@users.noreply.github.com> Date: Thu, 14 Nov 2024 16:14:50 -0800 Subject: [PATCH 058/153] re-enable coerce-illegal-types flag for rocm6.3 (#1668) --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index bd2f60683..4bb69300a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -221,7 +221,7 @@ if(NOT WIN32 AND ${hip_VERSION_FLAT} GREATER 600140090) endif() set(check-coerce) check_cxx_compiler_flag(" -mllvm -amdgpu-coerce-illegal-types=1" check-coerce) -if(NOT WIN32 AND check-coerce AND ${hip_VERSION_FLAT} GREATER 600241132 AND ${hip_VERSION_FLAT} LESS 600300000) +if(NOT WIN32 AND check-coerce AND ${hip_VERSION_FLAT} GREATER 600241132) message("Adding the amdgpu-coerce-illegal-types=1") add_compile_options("SHELL: -mllvm -amdgpu-coerce-illegal-types=1") endif() -- GitLab From b4a79045829b07f7e80603fb773c196e1f7a7214 Mon Sep 17 00:00:00 2001 From: Illia Silin <98187287+illsilin@users.noreply.github.com> Date: Thu, 14 Nov 2024 16:15:01 -0800 Subject: [PATCH 059/153] re-enable fp8 gemms in ckProfiler (#1667) --- CMakeLists.txt | 6 ++++-- profiler/src/profile_gemm_universal.cpp | 6 +++--- test/gemm_universal/test_gemm_universal_xdl.cpp | 4 ++-- 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 4bb69300a..b28a6d912 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -183,12 +183,14 @@ message("Building CK for the following targets: ${SUPPORTED_GPU_TARGETS}") if (SUPPORTED_GPU_TARGETS MATCHES "gfx9") message("Enabling XDL instances") add_definitions(-DCK_USE_XDL) - set(CK_USE_XDL "ON") +endif() +if (SUPPORTED_GPU_TARGETS MATCHES "gfx94") + message("Enabling FP8 gemms in 
ckProfiler") + add_definitions(-DCK_USE_GFX94) endif() if (SUPPORTED_GPU_TARGETS MATCHES "gfx11" OR SUPPORTED_GPU_TARGETS MATCHES "gfx12") message("Enabling WMMA instances") add_definitions(-DCK_USE_WMMA) - set(CK_USE_WMMA "ON") endif() option(CK_USE_FP8_ON_UNSUPPORTED_ARCH "Enable FP8 GEMM instances on older architectures" OFF) if(CK_USE_FP8_ON_UNSUPPORTED_ARCH AND (SUPPORTED_GPU_TARGETS MATCHES "gfx90a" OR SUPPORTED_GPU_TARGETS MATCHES "gfx908")) diff --git a/profiler/src/profile_gemm_universal.cpp b/profiler/src/profile_gemm_universal.cpp index 576bd009b..990cbd292 100644 --- a/profiler/src/profile_gemm_universal.cpp +++ b/profiler/src/profile_gemm_universal.cpp @@ -101,7 +101,7 @@ int profile_gemm_universal(int argc, char* argv[]) using F32 = float; using F16 = ck::half_t; using BF16 = ck::bhalf_t; -#if defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) +#if defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || defined(CK_USE_GFX94) using F8 = ck::f8_t; #endif @@ -164,7 +164,7 @@ int profile_gemm_universal(int argc, char* argv[]) { return profile(F16{}, F16{}, F16{}, F32{}, F16{}, Row{}, Col{}, Row{}); } -#if defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) +#if defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || defined(CK_USE_GFX94) else if(data_type == GemmDataType::F16_F8_F16 && layout == GemmMatrixLayout::MK_KN_MN) { return profile(F16{}, F8{}, F16{}, F32{}, F16{}, Row{}, Row{}, Row{}); @@ -198,7 +198,7 @@ int profile_gemm_universal(int argc, char* argv[]) { return profile(BF16{}, BF16{}, BF16{}, F32{}, BF16{}, Col{}, Row{}, Row{}); } -#if defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) +#if defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || defined(CK_USE_GFX94) else if(data_type == GemmDataType::F8_F8_BF16 && layout == GemmMatrixLayout::MK_KN_MN) { return profile(F8{}, F8{}, F8{}, F32{}, BF16{}, Row{}, Row{}, Row{}); diff --git a/test/gemm_universal/test_gemm_universal_xdl.cpp b/test/gemm_universal/test_gemm_universal_xdl.cpp index 23b5c74dd..b872d7089 100644 --- 
a/test/gemm_universal/test_gemm_universal_xdl.cpp +++ b/test/gemm_universal/test_gemm_universal_xdl.cpp @@ -56,7 +56,7 @@ class TestGemmUniversal_KM_NK using KernelTypes_MK_KN = ::testing::Types< // ADataType, BDataType, ComputeDataType, CDataType std::tuple< F16, F16, F16, F16>, -#if defined(CK_ENABLE_FP8) && defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) +#if defined(CK_ENABLE_FP8) && (defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || defined(CK_USE_GFX94)) std::tuple< F16, F8, F16, F16>, std::tuple< F8, F16, F16, F16>, std::tuple< F8, F8, F8, BF16>, @@ -66,7 +66,7 @@ using KernelTypes_MK_KN = ::testing::Types< using KernelTypes_MK_NK = ::testing::Types< // ADataType, BDataType, ComputeDataType, CDataType std::tuple< F16, F16, F16, F16>, -#if defined(CK_ENABLE_FP8) && defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) +#if defined(CK_ENABLE_FP8) && (defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || defined(CK_USE_GFX94)) std::tuple< F16, F8, F16, F16>, std::tuple< F8, F16, F16, F16>, std::tuple< F8, F8, F8, BF16>, -- GitLab From efb34741fe1f6af938e32b80fa5a30211d8dd71c Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 15 Nov 2024 18:30:58 -0500 Subject: [PATCH 060/153] Bump rocm-docs-core from 1.8.3 to 1.8.4 in /docs/sphinx (#1670) Bumps [rocm-docs-core](https://github.com/ROCm/rocm-docs-core) from 1.8.3 to 1.8.4. - [Release notes](https://github.com/ROCm/rocm-docs-core/releases) - [Changelog](https://github.com/ROCm/rocm-docs-core/blob/v1.8.4/CHANGELOG.md) - [Commits](https://github.com/ROCm/rocm-docs-core/compare/v1.8.3...v1.8.4) --- updated-dependencies: - dependency-name: rocm-docs-core dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- docs/sphinx/requirements.in | 2 +- docs/sphinx/requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/sphinx/requirements.in b/docs/sphinx/requirements.in index c2220e15d..9824df626 100644 --- a/docs/sphinx/requirements.in +++ b/docs/sphinx/requirements.in @@ -1,2 +1,2 @@ -rocm-docs-core==1.8.3 +rocm-docs-core==1.8.4 sphinxcontrib-bibtex==2.6.3 diff --git a/docs/sphinx/requirements.txt b/docs/sphinx/requirements.txt index 0dc2e70c5..f89fbcf27 100644 --- a/docs/sphinx/requirements.txt +++ b/docs/sphinx/requirements.txt @@ -103,7 +103,7 @@ requests==2.32.3 # via # pygithub # sphinx -rocm-docs-core==1.8.3 +rocm-docs-core==1.8.4 # via -r requirements.in six==1.16.0 # via pybtex -- GitLab From 754adc70e3c98c08dc64f7338d8a2e5e5f38dc3b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bart=C5=82omiej=20Kocot?= Date: Mon, 18 Nov 2024 14:03:45 +0100 Subject: [PATCH 061/153] Batched GEMM Multiple D based on Universal GEMM (#1655) * Batched GEMM Multiple D based on Universal GEMM Co-authored-by: Jing Zhang * CI fixes Co-authored-by: Jing Zhang --------- Co-authored-by: Jing Zhang --- example/24_batched_gemm/CMakeLists.txt | 6 + .../batched_gemm_xdl_bf16_v3.cpp | 99 ++ .../batched_gemm_xdl_fp8_rowwise_v3.cpp | 106 ++ .../run_batched_gemm_example.inc | 36 +- .../run_batched_gemm_example_rowwise.inc | 280 +++++ .../device/device_batched_gemm_multi_d.hpp | 43 +- ...atched_gemm_multiple_d_xdl_cshuffle_v3.hpp | 1014 +++++++++++++++++ .../gpu/gemm_universal_batched.hpp | 185 +++ .../gpu/CMakeLists.txt | 9 + .../gpu/gemm_universal_batched/CMakeLists.txt | 19 + ..._xdl_universal_bf16_bf16_bf16_mk_nk_mn.hpp | 95 ++ ...16_bf16_mk_nk_mn_comp_default_instance.cpp | 32 + ..._bf16_mk_nk_mn_mem_v1_default_instance.cpp | 33 + ..._bf16_mk_nk_mn_mem_v2_default_instance.cpp | 33 + ...gemm_xdl_universal_f8_f8_bf16_mk_nk_mn.hpp | 109 ++ 
...f8_bf16_mk_nk_mn_comp_default_instance.cpp | 32 + ..._bf16_mk_nk_mn_mem_v1_default_instance.cpp | 33 + ..._bf16_mk_nk_mn_mem_v2_default_instance.cpp | 33 + .../profile_gemm_universal_batched_impl.hpp | 280 +++++ profiler/src/CMakeLists.txt | 2 + .../src/profile_gemm_universal_batched.cpp | 187 +++ 21 files changed, 2655 insertions(+), 11 deletions(-) create mode 100644 example/24_batched_gemm/batched_gemm_xdl_bf16_v3.cpp create mode 100644 example/24_batched_gemm/batched_gemm_xdl_fp8_rowwise_v3.cpp create mode 100644 example/24_batched_gemm/run_batched_gemm_example_rowwise.inc create mode 100644 include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_xdl_cshuffle_v3.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/gemm_universal_batched.hpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_batched/CMakeLists.txt create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_bf16_bf16_bf16/device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn.hpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_bf16_bf16_bf16/device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_comp_default_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_bf16_bf16_bf16/device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_v1_default_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_bf16_bf16_bf16/device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_v2_default_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_f8_f8_bf16/device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn.hpp create mode 100644 
library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_f8_f8_bf16/device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_comp_default_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_f8_f8_bf16/device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_v1_default_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_f8_f8_bf16/device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_v2_default_instance.cpp create mode 100644 profiler/include/profiler/profile_gemm_universal_batched_impl.hpp create mode 100644 profiler/src/profile_gemm_universal_batched.cpp diff --git a/example/24_batched_gemm/CMakeLists.txt b/example/24_batched_gemm/CMakeLists.txt index 4cb45be7c..720af39af 100644 --- a/example/24_batched_gemm/CMakeLists.txt +++ b/example/24_batched_gemm/CMakeLists.txt @@ -9,6 +9,12 @@ add_example_dependencies(example_batched_gemm_xdl example_batched_gemm_xdl_fp16) add_example_executable(example_batched_gemm_xdl_bf16 batched_gemm_xdl_bf16.cpp) add_example_dependencies(example_batched_gemm_xdl example_batched_gemm_xdl_bf16) +add_example_executable(example_batched_gemm_xdl_bf16_v3 batched_gemm_xdl_bf16_v3.cpp) +add_example_dependencies(example_batched_gemm_xdl example_batched_gemm_xdl_bf16_v3) + +add_example_executable(example_batched_gemm_xdl_fp8_rowwise_v3 batched_gemm_xdl_fp8_rowwise_v3.cpp) +add_example_dependencies(example_batched_gemm_xdl example_batched_gemm_xdl_fp8_rowwise_v3) + add_example_executable(example_batched_gemm_xdl_int8 batched_gemm_xdl_int8.cpp) add_example_dependencies(example_batched_gemm_xdl example_batched_gemm_xdl_int8) diff --git a/example/24_batched_gemm/batched_gemm_xdl_bf16_v3.cpp b/example/24_batched_gemm/batched_gemm_xdl_bf16_v3.cpp new file mode 100644 index 000000000..fa8b75218 --- /dev/null +++ 
b/example/24_batched_gemm/batched_gemm_xdl_bf16_v3.cpp @@ -0,0 +1,99 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_xdl_cshuffle_v3.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp" +#include "ck/library/utility/literals.hpp" + +template +using S = ck::Sequence; + +using BF16 = ck::bhalf_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +using ADataType = BF16; +using BDataType = BF16; +using AccDataType = F32; +using CShuffleDataType = BF16; +using DsDataType = ck::Tuple<>; +using EDataType = BF16; + +using ALayout = Row; +using BLayout = Col; +using DsLayout = ck::Tuple<>; +using ELayout = Row; + +using AElementOp = PassThrough; +using BElementOp = PassThrough; +using CDEElementOp = PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +using DeviceGemmInstance = ck::tensor_operation::device::DeviceBatchedGemmMultiD_Xdl_CShuffle_V3< + ALayout, + BLayout, + DsLayout, + ELayout, + ADataType, + BDataType, + DsDataType, + EDataType, + AccDataType, + CShuffleDataType, + AElementOp, + BElementOp, + CDEElementOp, + GemmDefault, + 256, // BlockSize + 256, // MPerBlock + 128, // NPerBlock + 32, // KPerBlock + 8, // AK1 + 8, // BK1 + 32, // 
MPerXDL + 32, // NPerXDL + 4, // MXdlPerWave + 2, // NXdlPerWave + S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1 + S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder + S<1, 0, 2>, // ABlockTransferSrcAccessOrder + 2, // ABlockTransferSrcVectorDim + 8, // ABlockTransferSrcScalarPerVector + 8, // ABlockTransferDstScalarPerVector_AK1 + 1, // ABlockLdsExtraM + S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1 + S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder + S<1, 0, 2>, // BBlockTransferSrcAccessOrder + 2, // BBlockTransferSrcVectorDim + 8, // BBlockTransferSrcScalarPerVector + 8, // BBlockTransferDstScalarPerVector_BK1 + 1, // BBlockLdsExtraN + 1, // CShuffleMXdlPerWavePerShuffle + 1, // CShuffleNXdlPerWavePerShuffle + S<1, 32, 1, 8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock + S<8>, // CDEShuffleBlockTransferScalarPerVectors + ck::BlockGemmPipelineScheduler::Intrawave, // BlockGemmPipelineScheduler + ck::BlockGemmPipelineVersion::v3 // BlockGemmPipelineVersion + >; + +#include "run_batched_gemm_example.inc" + +int main(int argc, char* argv[]) { return !run_batched_gemm_example(argc, argv); } diff --git a/example/24_batched_gemm/batched_gemm_xdl_fp8_rowwise_v3.cpp b/example/24_batched_gemm/batched_gemm_xdl_fp8_rowwise_v3.cpp new file mode 100644 index 000000000..f0160b31c --- /dev/null +++ b/example/24_batched_gemm/batched_gemm_xdl_fp8_rowwise_v3.cpp @@ -0,0 +1,106 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_xdl_cshuffle_v3.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp" +#include "ck/library/utility/literals.hpp" + +template +using S = ck::Sequence; + +using F8 = ck::f8_t; +using BF16 = ck::bhalf_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using MultiplyMultiply = ck::tensor_operation::element_wise::MultiplyMultiply; + +using ADataType = F8; +using BDataType = F8; +using AccDataType = F32; +using CShuffleDataType = F32; +using D0DataType = F32; +using D1DataType = F32; +using DsDataType = ck::Tuple; +using EDataType = BF16; + +using ALayout = Row; +using BLayout = Col; +using D0Layout = Row; +using D1Layout = Col; +using DsLayout = ck::Tuple; +using ELayout = Row; + +using AElementOp = PassThrough; +using BElementOp = PassThrough; +using CDEElementOp = MultiplyMultiply; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +using DeviceGemmInstance = ck::tensor_operation::device::DeviceBatchedGemmMultiD_Xdl_CShuffle_V3< + ALayout, + BLayout, + DsLayout, + ELayout, + ADataType, + BDataType, + DsDataType, + EDataType, + AccDataType, + CShuffleDataType, + AElementOp, + BElementOp, + CDEElementOp, + GemmDefault, + 256, // BlockSize + 256, // MPerBlock + 128, // NPerBlock + 32, // KPerBlock + 8, // AK1 + 8, // 
BK1 + 32, // MPerXDL + 32, // NPerXDL + 4, // MXdlPerWave + 2, // NXdlPerWave + S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1 + S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder + S<1, 0, 2>, // ABlockTransferSrcAccessOrder + 2, // ABlockTransferSrcVectorDim + 8, // ABlockTransferSrcScalarPerVector + 8, // ABlockTransferDstScalarPerVector_AK1 + 1, // ABlockLdsExtraM + S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1 + S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder + S<1, 0, 2>, // BBlockTransferSrcAccessOrder + 2, // BBlockTransferSrcVectorDim + 8, // BBlockTransferSrcScalarPerVector + 8, // BBlockTransferDstScalarPerVector_BK1 + 1, // BBlockLdsExtraN + 1, // CShuffleMXdlPerWavePerShuffle + 1, // CShuffleNXdlPerWavePerShuffle + S<1, 32, 1, 8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock + S<8, 8, 1>, // CDEShuffleBlockTransferScalarPerVectors + ck::BlockGemmPipelineScheduler::Interwave, // BlockGemmPipelineScheduler + ck::BlockGemmPipelineVersion::v1, // BlockGemmPipelineVersion + F8 // ComputeTypeA + >; + +#include "run_batched_gemm_example_rowwise.inc" + +int main(int argc, char* argv[]) { return !run_batched_gemm_rowwise_example(argc, argv); } diff --git a/example/24_batched_gemm/run_batched_gemm_example.inc b/example/24_batched_gemm/run_batched_gemm_example.inc index 21934add3..741512bf0 100644 --- a/example/24_batched_gemm/run_batched_gemm_example.inc +++ b/example/24_batched_gemm/run_batched_gemm_example.inc @@ -210,17 +210,9 @@ bool run_batched_gemm_example(int argc, char* argv[]) problem_size.M = 256 * (dis(gen) + 1); problem_size.N = 128 * (dis(gen) + 1); - problem_size.K = 64 * (dis(gen) + 2); + problem_size.K = 128 * (dis(gen) + 2); - problem_size.stride_A = problem_size.K; - problem_size.stride_B = problem_size.K; - problem_size.stride_C = problem_size.N; - - problem_size.batch_stride_A = problem_size.M * problem_size.K; - problem_size.batch_stride_B = problem_size.K * problem_size.N; 
- problem_size.batch_stride_C = problem_size.M * problem_size.N; - - problem_size.batch_count = 16; + problem_size.batch_count = 2; if(argc == 4) { @@ -228,13 +220,37 @@ bool run_batched_gemm_example(int argc, char* argv[]) config.init_method = std::stoi(argv[2]); config.time_kernel = std::stoi(argv[3]); } + else if(argc == 8) + { + config.do_verification = std::stoi(argv[1]); + config.init_method = std::stoi(argv[2]); + config.time_kernel = std::stoi(argv[3]); + problem_size.M = std::stoi(argv[4]); + problem_size.N = std::stoi(argv[5]); + problem_size.K = std::stoi(argv[6]); + problem_size.batch_count = std::stoi(argv[7]); + } else { printf("arg1: verification (0=no, 1=yes)\n"); printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"); printf("arg3: time kernel (0=n0, 1=yes)\n"); + printf("optinal\n"); + printf("arg4-7: M = %d N = %d K = %d Batch = %d\n", + problem_size.M, + problem_size.N, + problem_size.K, + problem_size.batch_count); exit(0); } + problem_size.stride_A = problem_size.K; + problem_size.stride_B = problem_size.K; + problem_size.stride_C = problem_size.N; + + problem_size.batch_stride_A = problem_size.M * problem_size.K; + problem_size.batch_stride_B = problem_size.K * problem_size.N; + problem_size.batch_stride_C = problem_size.M * problem_size.N; + return run_batched_gemm(problem_size, config); } diff --git a/example/24_batched_gemm/run_batched_gemm_example_rowwise.inc b/example/24_batched_gemm/run_batched_gemm_example_rowwise.inc new file mode 100644 index 000000000..778be8ffd --- /dev/null +++ b/example/24_batched_gemm/run_batched_gemm_example_rowwise.inc @@ -0,0 +1,280 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+#include + +#pragma once + +struct ProblemSize final +{ + ck::index_t M = 3840; + ck::index_t N = 4096; + ck::index_t K = 4096; + + ck::index_t stride_A = K; + ck::index_t stride_B = K; + ck::index_t stride_C = N; + + ck::index_t stride_D0 = 0; + ck::index_t stride_D1 = 0; + + ck::index_t batch_stride_A = M * K; + ck::index_t batch_stride_B = K * N; + ck::index_t batch_stride_C = M * N; + + ck::index_t batch_stride_D0 = N; + ck::index_t batch_stride_D1 = M; + + ck::index_t batch_count = 16; +}; + +struct ExecutionConfig final +{ + bool do_verification = true; + int init_method = 1; + bool time_kernel = false; +}; + +bool run_batched_gemm_rowwise(const ProblemSize& problem_size, const ExecutionConfig& config) +{ + using namespace ck::literals; + + auto& [M, + N, + K, + stride_A, + stride_B, + stride_C, + stride_D0, + stride_D1, + batch_stride_A, + batch_stride_B, + batch_stride_C, + batch_stride_D0, + batch_stride_D1, + batch_count] = problem_size; + + // GEMM shape + auto f_host_tensor_descriptor = [](std::size_t batch_count_, + std::size_t row, + std::size_t col, + std::size_t stride, + std::size_t batch_stride, + auto layout) { + using namespace ck::literals; + + if(std::is_same::value) + { + return HostTensorDescriptor({batch_count_, row, col}, {batch_stride, stride, 1_uz}); + } + else + { + return HostTensorDescriptor({batch_count_, row, col}, {batch_stride, 1_uz, stride}); + } + }; + + Tensor a_g_m_k( + f_host_tensor_descriptor(batch_count, M, K, stride_A, batch_stride_A, ALayout{})); + Tensor b_g_k_n( + f_host_tensor_descriptor(batch_count, K, N, stride_B, batch_stride_B, BLayout{})); + Tensor d0_g_m_n( + f_host_tensor_descriptor(batch_count, M, N, stride_D0, batch_stride_D0, D0Layout{})); + Tensor d1_g_m_n( + f_host_tensor_descriptor(batch_count, M, N, stride_D1, batch_stride_D1, D1Layout{})); + Tensor e_g_m_n_device_result( + f_host_tensor_descriptor(batch_count, M, N, stride_C, batch_stride_C, ELayout{})); + + std::cout << "a_g_m_k: " << a_g_m_k.mDesc << 
std::endl; + std::cout << "b_g_k_n: " << b_g_k_n.mDesc << std::endl; + std::cout << "d0_g_m_n: " << d0_g_m_n.mDesc << std::endl; + std::cout << "d1_g_m_n: " << d1_g_m_n.mDesc << std::endl; + std::cout << "e_g_m_n: " << e_g_m_n_device_result.mDesc << std::endl; + + switch(config.init_method) + { + case 0: break; + case 1: + a_g_m_k.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + b_g_k_n.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + break; + default: + a_g_m_k.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + b_g_k_n.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + break; + } + + d0_g_m_n.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + d1_g_m_n.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + + DeviceMem a_device_buf(sizeof(ADataType) * a_g_m_k.mDesc.GetElementSpaceSize()); + DeviceMem b_device_buf(sizeof(BDataType) * b_g_k_n.mDesc.GetElementSpaceSize()); + DeviceMem d0_device_buf(sizeof(D0DataType) * d0_g_m_n.mDesc.GetElementSpaceSize()); + DeviceMem d1_device_buf(sizeof(D1DataType) * d1_g_m_n.mDesc.GetElementSpaceSize()); + DeviceMem c_device_buf(sizeof(EDataType) * e_g_m_n_device_result.mDesc.GetElementSpaceSize()); + + a_device_buf.ToDevice(a_g_m_k.mData.data()); + b_device_buf.ToDevice(b_g_k_n.mData.data()); + + d0_device_buf.ToDevice(d0_g_m_n.mData.data()); + d1_device_buf.ToDevice(d1_g_m_n.mData.data()); + + auto a_element_op = AElementOp{}; + auto b_element_op = BElementOp{}; + auto cde_element_op = CDEElementOp{}; + + auto gemm = DeviceGemmInstance{}; + auto invoker = gemm.MakeInvoker(); + + // do GEMM + auto argument = + gemm.MakeArgument(a_device_buf.GetDeviceBuffer(), + b_device_buf.GetDeviceBuffer(), + {d0_device_buf.GetDeviceBuffer(), d1_device_buf.GetDeviceBuffer()}, + c_device_buf.GetDeviceBuffer(), + M, + N, + K, + batch_count, + stride_A, + stride_B, + {stride_D0, stride_D1}, + stride_C, + batch_stride_A, + batch_stride_B, + {batch_stride_D0, batch_stride_D1}, + batch_stride_C, + a_element_op, + b_element_op, + cde_element_op); + + 
if(!gemm.IsSupportedArgument(argument)) + { + throw std::runtime_error( + "wrong! device_gemm with the specified compilation parameters does " + "not support this GEMM problem"); + } + + invoker.Run(argument, StreamConfig{nullptr, false}); + bool pass = true; + + if(config.do_verification) + { + c_device_buf.FromDevice(e_g_m_n_device_result.mData.data()); + + Tensor c_g_m_n({batch_count, M, N}); + + using ReferenceBatchedGemmInstance = + ck::tensor_operation::host::ReferenceBatchedGemm; + + auto ref_batched_gemm = ReferenceBatchedGemmInstance{}; + auto ref_invoker = ref_batched_gemm.MakeInvoker(); + + Tensor e_g_m_n_host_result( + f_host_tensor_descriptor(batch_count, M, N, stride_C, batch_stride_C, ELayout{})); + + auto ref_argument = ref_batched_gemm.MakeArgument( + a_g_m_k, b_g_k_n, c_g_m_n, a_element_op, b_element_op, PassThrough{}); + + ref_invoker.Run(ref_argument); + + for(int b = 0; b < batch_count; ++b) + { + for(int m = 0; m < M; ++m) + { + for(int n = 0; n < N; ++n) + { + cde_element_op(e_g_m_n_host_result(b, m, n), + c_g_m_n(b, m, n), + d0_g_m_n(b, m, n), + d1_g_m_n(b, m, n)); + } + } + } + + pass = ck::utils::check_err( + e_g_m_n_device_result, e_g_m_n_host_result, "Error: Incorrect results c"); + } + + if(config.time_kernel) + { + float ave_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel}); + + std::size_t flop = std::size_t(2) * batch_count * M * N * K; + std::size_t num_btype = sizeof(ADataType) * batch_count * M * K + + sizeof(BDataType) * batch_count * K * N + + sizeof(EDataType) * batch_count * M * N; + + float tflops = static_cast(flop) / 1.E9 / ave_time; + float gb_per_sec = num_btype / 1.E6 / ave_time; + std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec + << " GB/s, " << gemm.GetTypeString() << std::endl; + } + + return pass; // true == verification passed (or was skipped)
+} + +bool run_batched_gemm_rowwise_example(int argc, char* argv[]) +{ + ProblemSize problem_size; + ExecutionConfig config; + + std::mt19937 gen(11939); + std::uniform_int_distribution dis(0, 15); + + problem_size.M = 256 * (dis(gen) + 1); + problem_size.N = 128 * (dis(gen) + 1); + problem_size.K = 128 * (dis(gen) + 2); + + problem_size.batch_count = 2; + + if(argc == 4) + { + config.do_verification = std::stoi(argv[1]); + config.init_method = std::stoi(argv[2]); + config.time_kernel = std::stoi(argv[3]); + } + else if(argc == 8) + { + config.do_verification = std::stoi(argv[1]); + config.init_method = std::stoi(argv[2]); + config.time_kernel = std::stoi(argv[3]); + problem_size.M = std::stoi(argv[4]); + problem_size.N = std::stoi(argv[5]); + problem_size.K = std::stoi(argv[6]); + problem_size.batch_count = std::stoi(argv[7]); + } + else + { + printf("arg1: verification (0=no, 1=yes)\n"); + printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"); + printf("arg3: time kernel (0=n0, 1=yes)\n"); + printf("optinal\n"); + printf("arg4-7: M = %d N = %d K = %d Batch = %d\n", + problem_size.M, + problem_size.N, + problem_size.K, + problem_size.batch_count); + exit(0); + } + + problem_size.stride_A = problem_size.K; + problem_size.stride_B = problem_size.K; + problem_size.stride_C = problem_size.N; + + problem_size.stride_D0 = 0; + problem_size.stride_D1 = 0; + + problem_size.batch_stride_A = problem_size.M * problem_size.K; + problem_size.batch_stride_B = problem_size.K * problem_size.N; + problem_size.batch_stride_C = problem_size.M * problem_size.N; + + problem_size.batch_stride_D0 = problem_size.N; + problem_size.batch_stride_D1 = problem_size.M; + + return run_batched_gemm_rowwise(problem_size, config); +} diff --git a/include/ck/tensor_operation/gpu/device/device_batched_gemm_multi_d.hpp b/include/ck/tensor_operation/gpu/device/device_batched_gemm_multi_d.hpp index f18dc3290..58c0288e8 100644 --- 
a/include/ck/tensor_operation/gpu/device/device_batched_gemm_multi_d.hpp +++ b/include/ck/tensor_operation/gpu/device/device_batched_gemm_multi_d.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -53,6 +53,47 @@ struct DeviceBatchedGemmMultiD : public BaseOperator virtual std::unique_ptr MakeInvokerPointer() = 0; }; +template +struct DeviceBatchedGemmV2MultiD : public BaseOperator +{ + static constexpr index_t NumDTensor = DsDataType::Size(); + + static_assert(DsLayout::Size() == DsDataType::Size(), "wrong! inconsisiten NumDTensor"); + + virtual std::unique_ptr + MakeArgumentPointer(const void* p_a, + const void* p_b, + const std::array& p_ds, + void* p_e, + index_t M, + index_t N, + index_t K, + index_t Batch, + index_t StrideA, + index_t StrideB, + const std::array& StrideDs, + index_t StrideE, + index_t BatchStrideA, + index_t BatchStrideB, + const std::array& BatchStrideDs, + index_t BatchStrideE, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CDEElementwiseOperation cde_element_op) = 0; + + virtual std::unique_ptr MakeInvokerPointer() = 0; +}; + } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_xdl_cshuffle_v3.hpp new file mode 100644 index 000000000..314ecdf76 --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_xdl_cshuffle_v3.hpp @@ -0,0 +1,1014 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include +#include + +#include "ck/utility/common_header.hpp" +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_batched_gemm_multi_d.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d.hpp" +#include "ck/host_utility/device_prop.hpp" +#include "ck/host_utility/kernel_launch.hpp" +#include "ck/host_utility/flush_cache.hpp" + +namespace ck { + +// Currently we do not have an elegant way to put single lds buffer & double lds buffer pipe in same +// kernel function. Blockers: +// 1. Two separated declaration of __shared__ pointer is the key to make sure data access operate on +// two lds chunks. +// 2. Occupied __shared__ won't release until whole shader end, a.k.a AB and C may not use same lds +// buffer when we declare __shared__ inside blkgemmpipe +template +__global__ void +#if CK_USE_LAUNCH_BOUNDS + __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy) +#endif + kernel_batched_gemm_xdl_cshuffle_v3_multi_d(BatchedGemmArg karg) +{ +#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx9__)) + __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()]; + + const index_t g_idx = blockIdx.z % karg.Batch; + + const auto a_batch_offset = karg.compute_ptr_offset_of_batch.GetAPtrOffset(g_idx); + const auto b_batch_offset = karg.compute_ptr_offset_of_batch.GetBPtrOffset(g_idx); + const auto ds_batch_offset = karg.compute_ptr_offset_of_batch.GetDsPtrOffset(g_idx); + const auto c_batch_offset = karg.compute_ptr_offset_of_batch.GetCPtrOffset(g_idx); + + // populate pointer, desc for Ds + static_for<0, GridwiseGemm::NumDTensor, 1>{}([&](auto i) { + // D pointer + karg.p_ds_grid(i) = karg.p_ds_grid(i) + ds_batch_offset[i]; + }); + + GridwiseGemm::template Run( 
karg.p_a_grid + a_batch_offset, + karg.p_b_grid + b_batch_offset, + karg.p_ds_grid, + karg.p_c_grid + c_batch_offset, + p_shared, + karg, + karg.a_element_op, + karg.b_element_op, + karg.c_element_op); +#else + ignore = karg; +#endif // end of if (defined(__gfx9__)) +} + +template +__global__ void +#if CK_USE_LAUNCH_BOUNDS + __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy) +#endif + kernel_batched_gemm_xdl_cshuffle_v3_multi_d_2lds(BatchedGemmArg karg) +{ +#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx9__)) + // Pass two lds pointer is the key to tell compiler that ds_read/write + // operate on different lds chunk at same time without order dependency + __shared__ char p_shared_0[GridwiseGemm::GetSharedMemoryNumberOfByte()]; + __shared__ char p_shared_1[GridwiseGemm::GetSharedMemoryNumberOfByte()]; + + const index_t g_idx = blockIdx.z % karg.Batch; + + const auto a_batch_offset = karg.compute_ptr_offset_of_batch.GetAPtrOffset(g_idx); + const auto b_batch_offset = karg.compute_ptr_offset_of_batch.GetBPtrOffset(g_idx); + const auto ds_batch_offset = karg.compute_ptr_offset_of_batch.GetDsPtrOffset(g_idx); + const auto c_batch_offset = karg.compute_ptr_offset_of_batch.GetCPtrOffset(g_idx); + + // populate pointer, desc for Ds + static_for<0, GridwiseGemm::NumDTensor, 1>{}([&](auto i) { + // D pointer + karg.p_ds_grid(i) = karg.p_ds_grid(i) + ds_batch_offset[i]; + }); + + GridwiseGemm::template Run_2Lds( + karg.p_a_grid + a_batch_offset, + karg.p_b_grid + b_batch_offset, + karg.p_ds_grid, + karg.p_c_grid + c_batch_offset, + p_shared_0, + p_shared_1, + karg, + karg.a_element_op, + karg.b_element_op, + karg.c_element_op); +#else + ignore = karg; +#endif // end of if (defined(__gfx9__)) +} + +namespace tensor_operation { +namespace device { + +template +struct DeviceBatchedGemmMultiD_Xdl_CShuffle_V3 + : public DeviceBatchedGemmV2MultiD +{ + static constexpr index_t NumDTensor = DsDataType::Size(); + + // GridwiseGemm + using GridwiseGemm = 
GridwiseGemmMultiD_xdl_cshuffle_v3< + ALayout, + BLayout, + DsLayout, + CLayout, + ADataType, + BDataType, + GemmAccDataType, + CShuffleDataType, + DsDataType, + CDataType, + AElementwiseOperation, + BElementwiseOperation, + CElementwiseOperation, + GemmSpec, + BlockSize, + MPerBlock, + NPerBlock, + KPerBlock, + AK1, + BK1, + MPerXDL, + NPerXDL, + MXdlPerWave, + NXdlPerWave, + ABlockTransferThreadClusterLengths_AK0_M_AK1, + ABlockTransferThreadClusterArrangeOrder, + ABlockTransferSrcAccessOrder, + ABlockTransferSrcVectorDim, + ABlockTransferSrcScalarPerVector, + ABlockTransferDstScalarPerVector_AK1, + false, + ABlockLdsExtraM, + BBlockTransferThreadClusterLengths_BK0_N_BK1, + BBlockTransferThreadClusterArrangeOrder, + BBlockTransferSrcAccessOrder, + BBlockTransferSrcVectorDim, + BBlockTransferSrcScalarPerVector, + BBlockTransferDstScalarPerVector_BK1, + false, + BBlockLdsExtraN, + CShuffleMXdlPerWavePerShuffle, + CShuffleNXdlPerWavePerShuffle, + CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, + CDEShuffleBlockTransferScalarPerVectors, + BlkGemmPipeSched, + BlkGemmPipelineVer, + ComputeTypeA, + ComputeTypeB, + LDSTypeA, + LDSTypeB>; + + struct ComputePtrOffsetOfStridedBatch + { + ComputePtrOffsetOfStridedBatch(index_t BatchStrideA, + index_t BatchStrideB, + std::array BatchStrideDs, + index_t BatchStrideC) + : BatchStrideA_(BatchStrideA), + BatchStrideB_(BatchStrideB), + BatchStrideDs_(BatchStrideDs), + BatchStrideC_(BatchStrideC) + { + } + + __host__ __device__ constexpr long_index_t GetAPtrOffset(index_t g_idx) const + { + return static_cast(BatchStrideA_) * g_idx; + } + + __host__ __device__ constexpr long_index_t GetBPtrOffset(index_t g_idx) const + { + return static_cast(BatchStrideB_) * g_idx; + } + + __host__ __device__ constexpr auto GetDsPtrOffset(index_t g_idx) const + { + std::array ds_offset_; + + static_for<0, GridwiseGemm::NumDTensor, 1>{}([&](auto i) { + ds_offset_[i] = static_cast(BatchStrideDs_[i]) * g_idx; + }); + + return 
ds_offset_; + } + + __host__ __device__ constexpr long_index_t GetCPtrOffset(index_t g_idx) const + { + return static_cast(BatchStrideC_) * g_idx; + } + + private: + index_t BatchStrideA_; + index_t BatchStrideB_; + const std::array BatchStrideDs_; + index_t BatchStrideC_; + }; + + struct Argument : public GridwiseGemm::Argument + { + index_t Batch; + ComputePtrOffsetOfStridedBatch compute_ptr_offset_of_batch; + + Argument(const ADataType* p_a_grid_, + const BDataType* p_b_grid_, + std::array p_ds_grid_, + CDataType* p_e_grid_, + index_t M_, + index_t N_, + index_t K_, + index_t StrideA_, + index_t StrideB_, + std::array StrideDs_, + index_t StrideE_, + index_t BatchStrideA_, + index_t BatchStrideB_, + const std::array& BatchStrideDs_, + index_t BatchStrideE_, + index_t Batch_, + AElementwiseOperation a_element_op_, + BElementwiseOperation b_element_op_, + CElementwiseOperation c_element_op_) + : GridwiseGemm::Argument{p_a_grid_, + p_b_grid_, + p_ds_grid_, + p_e_grid_, + M_, + N_, + K_, + StrideA_, + StrideB_, + StrideDs_, + StrideE_, + 1, + a_element_op_, + b_element_op_, + c_element_op_}, + Batch{Batch_}, + compute_ptr_offset_of_batch{ + BatchStrideA_, BatchStrideB_, BatchStrideDs_, BatchStrideE_} + { + } + }; + + // Invoker + struct Invoker : public BaseInvoker + { + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) + { + if(stream_config.log_level_ > 0) + { + arg.Print(); + } + + if(!GridwiseGemm::CheckValidity(arg) || arg.KBatch > 1) + { + throw std::runtime_error("wrong! 
GridwiseGemm has invalid setting"); + } + + index_t gdx, gdy, gdz; + std::tie(gdx, gdy, gdz) = GridwiseGemm::CalculateGridSize(arg.M, arg.N, arg.Batch); + + float ave_time = 0; + + index_t k_grain = arg.KBatch * KPerBlock; + index_t K_split = (arg.K + k_grain - 1) / k_grain * KPerBlock; + + const bool has_main_k_block_loop = GridwiseGemm::CalculateHasMainKBlockLoop(K_split); + + const auto Run = [&](const auto& kernel) { + if(stream_config.flush_cache) + { + + std::array DsSize; + + Argument arg_ = arg; + + const auto a_grid_desc_ak0_m_ak1 = GridwiseGemm::MakeAGridDescriptor_AK0_M_AK1( + arg_.M, arg_.MPadded, arg_.K, arg_.KPadded, arg_.StrideA, arg_.AK0); + const auto b_grid_desc_bk0_n_bk1 = GridwiseGemm::MakeBGridDescriptor_BK0_N_BK1( + arg_.K, arg_.KPadded, arg_.N, arg_.NPadded, arg_.StrideB, arg_.BK0); + + auto size_a_buffer = + a_grid_desc_ak0_m_ak1.GetElementSpaceSize() * sizeof(ADataType) * arg.Batch; + auto size_b_buffer = + b_grid_desc_bk0_n_bk1.GetElementSpaceSize() * sizeof(BDataType) * arg.Batch; + + const auto ds_grid_desc_m_n = GridwiseGemm::MakeDsGridDescriptor_M_N( + arg_.M, arg_.MPadded, arg_.N, arg_.NPadded, arg_.StrideDs); + + static_for<0, NumDTensor, 1>{}([&](auto i) { + using DDataType = remove_cvref_t>; + DsSize[i] = ds_grid_desc_m_n[i].GetElementSpaceSize() * sizeof(DDataType); + }); + ck::utility::RotatingMemWrapperMultiD rotating_mem( + arg_, stream_config.rotating_count, size_a_buffer, size_b_buffer, DsSize); + rotating_mem.Print(); + + auto run_flush_cache = [&]() { + // flush icache + ck::utility::flush_icache(); + // rotating mem + rotating_mem.Next(); + // clear c mem + if(arg_.KBatch > 1) + hipGetErrorString(hipMemsetAsync(arg_.p_c_grid, + 0, + arg_.M * arg_.N * sizeof(CDataType), + stream_config.stream_id_)); + }; + + ave_time = ck::utility::launch_and_time_kernel_with_preprocess( + stream_config, + run_flush_cache, + kernel, + dim3(gdx, gdy, gdz), + dim3(BlockSize), + 0, + arg_); + } + else + { + if(arg.KBatch > 1) + 
hipGetErrorString(hipMemsetAsync(arg.p_c_grid, + 0, + arg.M * arg.N * sizeof(CDataType), + stream_config.stream_id_)); + + ave_time = launch_and_time_kernel( + stream_config, kernel, dim3(gdx, gdy, gdz), dim3(BlockSize), 0, arg); + } + }; + + constexpr index_t minimum_occupancy = + BlkGemmPipeSched == BlockGemmPipelineScheduler::Intrawave ? 1 : 2; + + if(has_main_k_block_loop) + { + // Tail number always full + if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v1 || + BlkGemmPipelineVer == BlockGemmPipelineVersion::v3) + { + if(arg.KBatch > 1) + { + const auto kernel = kernel_batched_gemm_xdl_cshuffle_v3_multi_d< + GridwiseGemm, + Argument, + true, + InMemoryDataOperationEnum::AtomicAdd, + minimum_occupancy>; + Run(kernel); + } + else + { + const auto kernel = kernel_batched_gemm_xdl_cshuffle_v3_multi_d< + GridwiseGemm, + Argument, + true, + InMemoryDataOperationEnum::Set, + minimum_occupancy>; + Run(kernel); + } + } + // Tail number could be One to Seven + else if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v2) + { + if(arg.KBatch > 1) + { + if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::One) + { + const auto kernel = kernel_batched_gemm_xdl_cshuffle_v3_multi_d< + GridwiseGemm, + Argument, + true, + InMemoryDataOperationEnum::AtomicAdd, + minimum_occupancy, + TailNumber::One>; + Run(kernel); + } + else if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == + TailNumber::Full) + { + const auto kernel = kernel_batched_gemm_xdl_cshuffle_v3_multi_d< + GridwiseGemm, + Argument, + true, + InMemoryDataOperationEnum::AtomicAdd, + minimum_occupancy, + TailNumber::Full>; + Run(kernel); + } + + if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 2) + { + if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Two) + { + const auto kernel = kernel_batched_gemm_xdl_cshuffle_v3_multi_d< + GridwiseGemm, + Argument, + true, + InMemoryDataOperationEnum::AtomicAdd, + minimum_occupancy, + TailNumber::Two>; + 
Run(kernel); + } + } + + if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 3) + { + if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == + TailNumber::Three) + { + const auto kernel = kernel_batched_gemm_xdl_cshuffle_v3_multi_d< + GridwiseGemm, + Argument, + true, + InMemoryDataOperationEnum::AtomicAdd, + minimum_occupancy, + TailNumber::Three>; + Run(kernel); + } + } + + if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 4) + { + if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == + TailNumber::Four) + { + const auto kernel = kernel_batched_gemm_xdl_cshuffle_v3_multi_d< + GridwiseGemm, + Argument, + true, + InMemoryDataOperationEnum::AtomicAdd, + minimum_occupancy, + TailNumber::Four>; + Run(kernel); + } + } + + if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 5) + { + if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == + TailNumber::Five) + { + const auto kernel = kernel_batched_gemm_xdl_cshuffle_v3_multi_d< + GridwiseGemm, + Argument, + true, + InMemoryDataOperationEnum::AtomicAdd, + minimum_occupancy, + TailNumber::Five>; + Run(kernel); + } + } + + if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 6) + { + if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Six) + { + const auto kernel = kernel_batched_gemm_xdl_cshuffle_v3_multi_d< + GridwiseGemm, + Argument, + true, + InMemoryDataOperationEnum::AtomicAdd, + minimum_occupancy, + TailNumber::Six>; + Run(kernel); + } + } + + if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 7) + { + if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == + TailNumber::Seven) + { + const auto kernel = kernel_batched_gemm_xdl_cshuffle_v3_multi_d< + GridwiseGemm, + Argument, + true, + InMemoryDataOperationEnum::AtomicAdd, + minimum_occupancy, + TailNumber::Seven>; + Run(kernel); + } + } + } + else + { + if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::One) + { + const auto kernel = 
kernel_batched_gemm_xdl_cshuffle_v3_multi_d< + GridwiseGemm, + Argument, + true, + InMemoryDataOperationEnum::Set, + minimum_occupancy, + TailNumber::One>; + Run(kernel); + } + else if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == + TailNumber::Full) + { + const auto kernel = kernel_batched_gemm_xdl_cshuffle_v3_multi_d< + GridwiseGemm, + Argument, + true, + InMemoryDataOperationEnum::Set, + minimum_occupancy, + TailNumber::Full>; + Run(kernel); + } + + if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 2) + { + if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Two) + { + const auto kernel = kernel_batched_gemm_xdl_cshuffle_v3_multi_d< + GridwiseGemm, + Argument, + true, + InMemoryDataOperationEnum::Set, + minimum_occupancy, + TailNumber::Two>; + Run(kernel); + } + } + + if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 3) + { + if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == + TailNumber::Three) + { + const auto kernel = kernel_batched_gemm_xdl_cshuffle_v3_multi_d< + GridwiseGemm, + Argument, + true, + InMemoryDataOperationEnum::Set, + minimum_occupancy, + TailNumber::Three>; + Run(kernel); + } + } + + if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 4) + { + if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == + TailNumber::Four) + { + const auto kernel = kernel_batched_gemm_xdl_cshuffle_v3_multi_d< + GridwiseGemm, + Argument, + true, + InMemoryDataOperationEnum::Set, + minimum_occupancy, + TailNumber::Four>; + Run(kernel); + } + } + + if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 5) + { + if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == + TailNumber::Five) + { + const auto kernel = kernel_batched_gemm_xdl_cshuffle_v3_multi_d< + GridwiseGemm, + Argument, + true, + InMemoryDataOperationEnum::Set, + minimum_occupancy, + TailNumber::Five>; + Run(kernel); + } + } + + if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 6) + { + 
if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Six) + { + const auto kernel = kernel_batched_gemm_xdl_cshuffle_v3_multi_d< + GridwiseGemm, + Argument, + true, + InMemoryDataOperationEnum::Set, + minimum_occupancy, + TailNumber::Six>; + Run(kernel); + } + } + + if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 7) + { + if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == + TailNumber::Seven) + { + const auto kernel = kernel_batched_gemm_xdl_cshuffle_v3_multi_d< + GridwiseGemm, + Argument, + true, + InMemoryDataOperationEnum::Set, + minimum_occupancy, + TailNumber::Seven>; + Run(kernel); + } + } + } + } + // Tail number could be Odd or Even + else if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v4) + { + if(arg.KBatch > 1) + { + if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Odd) + { + const auto kernel = kernel_batched_gemm_xdl_cshuffle_v3_multi_d_2lds< + GridwiseGemm, + Argument, + true, + InMemoryDataOperationEnum::AtomicAdd, + minimum_occupancy, + TailNumber::Odd>; + Run(kernel); + } + else + { + const auto kernel = kernel_batched_gemm_xdl_cshuffle_v3_multi_d_2lds< + GridwiseGemm, + Argument, + true, + InMemoryDataOperationEnum::AtomicAdd, + minimum_occupancy, + TailNumber::Even>; + Run(kernel); + } + } + else + { + if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Odd) + { + const auto kernel = kernel_batched_gemm_xdl_cshuffle_v3_multi_d_2lds< + GridwiseGemm, + Argument, + true, + InMemoryDataOperationEnum::Set, + minimum_occupancy, + TailNumber::Odd>; + Run(kernel); + } + else + { + const auto kernel = kernel_batched_gemm_xdl_cshuffle_v3_multi_d_2lds< + GridwiseGemm, + Argument, + true, + InMemoryDataOperationEnum::Set, + minimum_occupancy, + TailNumber::Even>; + Run(kernel); + } + } + } + else + { + if(arg.KBatch > 1) + { + if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Odd) + { + const auto kernel = kernel_batched_gemm_xdl_cshuffle_v3_multi_d< + 
GridwiseGemm, + Argument, + true, + InMemoryDataOperationEnum::AtomicAdd, + minimum_occupancy, + TailNumber::Odd>; + Run(kernel); + } + else + { + const auto kernel = kernel_batched_gemm_xdl_cshuffle_v3_multi_d< + GridwiseGemm, + Argument, + true, + InMemoryDataOperationEnum::AtomicAdd, + minimum_occupancy, + TailNumber::Even>; + Run(kernel); + } + } + else + { + if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Odd) + { + const auto kernel = kernel_batched_gemm_xdl_cshuffle_v3_multi_d< + GridwiseGemm, + Argument, + true, + InMemoryDataOperationEnum::Set, + minimum_occupancy, + TailNumber::Odd>; + Run(kernel); + } + else + { + const auto kernel = kernel_batched_gemm_xdl_cshuffle_v3_multi_d< + GridwiseGemm, + Argument, + true, + InMemoryDataOperationEnum::Set, + minimum_occupancy, + TailNumber::Even>; + Run(kernel); + } + } + } + } + else + { + // Tail number always 1 + if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v1) + { + if(arg.KBatch > 1) + { + const auto kernel = kernel_batched_gemm_xdl_cshuffle_v3_multi_d< + GridwiseGemm, + Argument, + false, + InMemoryDataOperationEnum::AtomicAdd, + minimum_occupancy>; + Run(kernel); + } + else + { + const auto kernel = kernel_batched_gemm_xdl_cshuffle_v3_multi_d< + GridwiseGemm, + Argument, + false, + InMemoryDataOperationEnum::Set, + minimum_occupancy>; + Run(kernel); + } + } + } + + return ave_time; + } + + // polymorphic + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg), stream_config); + } + }; + + static constexpr bool IsValidCompilationParameter() + { + // TODO: properly implement this check + return true; + } + + static bool IsSupportedArgument(const Argument& arg) + { + if(!ck::is_xdl_supported()) + { + return false; + } + + if(!is_bf16_atomic_supported() && std::is_same_v && arg.KBatch > 1) + { + return false; + } + + if((arg.K % AK1 != 0 || arg.K % BK1 != 0) && !(GemmSpec == 
GemmSpecialization::MKPadding || + GemmSpec == GemmSpecialization::NKPadding || + GemmSpec == GemmSpecialization::MNKPadding || + GemmSpec == GemmSpecialization::KPadding)) + { + return false; + } + + return GridwiseGemm::CheckValidity(arg); + } + + // polymorphic + bool IsSupportedArgument(const BaseArgument* p_arg) override + { + return IsSupportedArgument(*dynamic_cast(p_arg)); + } + + static auto MakeArgument(const void* p_a, + const void* p_b, + std::array p_ds, + void* p_e, + index_t M, + index_t N, + index_t K, + index_t Batch, + index_t StrideA, + index_t StrideB, + std::array StrideDs, + index_t StrideE, + index_t BatchStrideA, + index_t BatchStrideB, + const std::array& BatchStrideDs, + index_t BatchStrideE, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CElementwiseOperation c_element_op) + { + return Argument{static_cast(p_a), + static_cast(p_b), + p_ds, + static_cast(p_e), + M, + N, + K, + StrideA, + StrideB, + StrideDs, + StrideE, + BatchStrideA, + BatchStrideB, + BatchStrideDs, + BatchStrideE, + Batch, + a_element_op, + b_element_op, + c_element_op}; + } + + static auto MakeInvoker() { return Invoker{}; } + + // polymorphic + std::unique_ptr + MakeArgumentPointer(const void* p_a, + const void* p_b, + const std::array& p_ds, + void* p_e, + index_t M, + index_t N, + index_t K, + index_t Batch, + index_t StrideA, + index_t StrideB, + const std::array& StrideDs, + index_t StrideE, + index_t BatchStrideA, + index_t BatchStrideB, + const std::array& BatchStrideDs, + index_t BatchStrideE, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CElementwiseOperation c_element_op) override + { + return std::make_unique(static_cast(p_a), + static_cast(p_b), + p_ds, + static_cast(p_e), + M, + N, + K, + StrideA, + StrideB, + StrideDs, + StrideE, + BatchStrideA, + BatchStrideB, + BatchStrideDs, + BatchStrideE, + Batch, + a_element_op, + b_element_op, + c_element_op); + } + + // polymorphic + std::unique_ptr 
MakeInvokerPointer() override + { + return std::make_unique(Invoker{}); + } + + // polymorphic + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + std::map BlkGemmPipelineSchedulerToString{ + {BlockGemmPipelineScheduler::Intrawave, "Intrawave"}, + {BlockGemmPipelineScheduler::Interwave, "Interwave"}}; + + std::map BlkGemmPipelineVersionToString{ + {BlockGemmPipelineVersion::v1, "v1"}, + {BlockGemmPipelineVersion::v2, "v2"}, + {BlockGemmPipelineVersion::v3, "v3"}, + {BlockGemmPipelineVersion::v4, "v4"}, + {BlockGemmPipelineVersion::v5, "v5"}}; + + // clang-format off + str << "DeviceBatchedGemmXdlUniversal" + << "<" + << getGemmSpecializationString(GemmSpec) << ", " + << std::string(ALayout::name)[0] + << std::string(BLayout::name)[0] + << std::string(CLayout::name)[0] + << ">" + << " BlkSize: " + << BlockSize << ", " + << "BlkTile: " + << MPerBlock<<"x"< +#include +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_xdl_cshuffle_v3.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +#ifdef CK_ENABLE_BF16 +void add_device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_comp_default_instances( + std::vector>>& instances); + +void add_device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_v1_default_instances( + std::vector>>& instances); + +void add_device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_v2_default_instances( + std::vector>>& instances); + +#endif + +#ifdef CK_ENABLE_FP8 +void add_device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_comp_default_instances( + std::vector>>& instances); + +void add_device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_v1_default_instances( + std::vector>>& 
instances); + +void add_device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_v2_default_instances( + std::vector>>& instances); + +#endif + +template +struct DeviceOperationInstanceFactory> +{ + using DeviceOp = DeviceBatchedGemmV2MultiD; + + static auto GetInstances() + { + std::vector> op_ptrs; + +#ifdef CK_ENABLE_BF16 + if constexpr(is_same_v && is_same_v && + is_same_v) + { + if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_comp_default_instances( + op_ptrs); + + add_device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_v1_default_instances( + op_ptrs); + + add_device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_v2_default_instances( + op_ptrs); + } + } +#endif + +#ifdef CK_ENABLE_FP8 + if constexpr(is_same_v && is_same_v && + is_same_v) + { + if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_comp_default_instances( + op_ptrs); + + add_device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_v1_default_instances( + op_ptrs); + + add_device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_v2_default_instances( + op_ptrs); + } + } +#endif + return op_ptrs; + } +}; + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/CMakeLists.txt index 80f0fc306..6a1558a52 100644 --- a/library/src/tensor_operation_instance/gpu/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/CMakeLists.txt @@ -81,6 +81,12 @@ function(add_instance_library INSTANCE_NAME) list(REMOVE_ITEM ARGN "${source}") endif() endforeach() + foreach(source IN LISTS ARGN) + if(NOT INST_TARGETS MATCHES "gfx94" AND source MATCHES "batched_gemm_xdl_universal" AND source MATCHES "_f8_") + message("removing batched_gemm_universal_f8 instance ${source} ") + list(REMOVE_ITEM ARGN 
"${source}") + endif() + endforeach() endif() #only continue if there are some source files left on the list if(ARGN) @@ -102,6 +108,9 @@ function(add_instance_library INSTANCE_NAME) if(source MATCHES "gemm_multiply_multiply_f8") list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack- gfx908:xnack+ gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201 gfx10.3-generic gfx11-generic gfx12-generic) endif() + if(source MATCHES "bached_gemm_multiply_multiply_f8") + list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack- gfx908:xnack+ gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201 gfx10.3-generic gfx11-generic gfx12-generic) + endif() endif() set(offload_targets) foreach(target IN LISTS INST_TARGETS) diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_batched/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/gemm_universal_batched/CMakeLists.txt new file mode 100644 index 000000000..1affa12bb --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_batched/CMakeLists.txt @@ -0,0 +1,19 @@ +# ONLY XDL_KERNELS +set(GEMM_UNIVERSAL_BATCHED_INSTANCES) + +list(APPEND GEMM_UNIVERSAL_BATCHED_INSTANCES + device_batched_gemm_xdl_universal_bf16_bf16_bf16/device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_comp_default_instance.cpp + device_batched_gemm_xdl_universal_bf16_bf16_bf16/device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_v1_default_instance.cpp + device_batched_gemm_xdl_universal_bf16_bf16_bf16/device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_v2_default_instance.cpp + + device_batched_gemm_xdl_universal_f8_f8_bf16/device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_comp_default_instance.cpp + device_batched_gemm_xdl_universal_f8_f8_bf16/device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_v1_default_instance.cpp + 
device_batched_gemm_xdl_universal_f8_f8_bf16/device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_v2_default_instance.cpp + ) + + +set_source_files_properties(device_batched_gemm_xdl_universal_bf16_bf16_bf16/device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_comp_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1") +set_source_files_properties(device_batched_gemm_xdl_universal_f8_f8_bf16/device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_comp_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1") + + +add_instance_library(device_gemm_universal_batched_instance ${GEMM_UNIVERSAL_BATCHED_INSTANCES}) diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_bf16_bf16_bf16/device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_bf16_bf16_bf16/device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn.hpp new file mode 100644 index 000000000..5db041de0 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_bf16_bf16_bf16/device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn.hpp @@ -0,0 +1,95 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_xdl_cshuffle_v3.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using BF16 = bhalf_t; +using F32 = float; + +using Row = tensor_layout::gemm::RowMajor; +using Col = tensor_layout::gemm::ColumnMajor; + +template +using S = Sequence; + +using PassThrough = element_wise::PassThrough; + +static constexpr auto GemmDefault = GemmSpecialization::Default; +static constexpr auto GemmKPadding = GemmSpecialization::KPadding; +static constexpr auto GemmMPadding = GemmSpecialization::MPadding; +static constexpr auto GemmMNPadding = GemmSpecialization::MNPadding; +static constexpr auto GemmMKPadding = GemmSpecialization::MKPadding; +static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding; + +static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave; +static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave; + +template , + typename DsDataType = ck::Tuple<>> +using device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_comp_instances = std::tuple< + // clang-format off + //##################################| ALayout| BLayout| DsLayout| CLayout| AData| BData| DsData| CData| AccData| Cshuffle| A| B| C| GEMM| Block| MPer| NPer| KPer| AK1| BK1|MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| Block-wiseGemm| Block-wiseGemm| + //##################################| | | | | Type| Type| Type| Type| Type| Type| Elementwise| Elementwise| 
Elementwise|Specialization| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| Pipeline| Pipeline| + //##################################| | | | | | | | | | | Operation| Operation| Operation| | | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| Scheduler| Verision| + //##################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + + // Compute friendly + DeviceBatchedGemmMultiD_Xdl_CShuffle_V3< Row, Col, DsLayout, Row, BF16, BF16, DsDataType, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 8, 8, 32, 32, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 16>, S<4>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>, + DeviceBatchedGemmMultiD_Xdl_CShuffle_V3< Row, Col, DsLayout, Row, BF16, BF16, DsDataType, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 4, 4, 32, 32, 4, 4, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, 1, 1, S<1, 16, 1, 16>, S<4>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>, + DeviceBatchedGemmMultiD_Xdl_CShuffle_V3< Row, Col, DsLayout, Row, BF16, BF16, DsDataType, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 2, 2, 32, 32, 4, 4, S<16,16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, 0, S<16,16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, 0, 1, 1, S<1, 16, 1, 16>, S<4>, 
BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>, + DeviceBatchedGemmMultiD_Xdl_CShuffle_V3< Row, Col, DsLayout, Row, BF16, BF16, DsDataType, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 8, 8, 32, 32, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 16>, S<4>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, + DeviceBatchedGemmMultiD_Xdl_CShuffle_V3< Row, Col, DsLayout, Row, BF16, BF16, DsDataType, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 8, 8, 32, 32, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 16>, S<4>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>, + DeviceBatchedGemmMultiD_Xdl_CShuffle_V3< Row, Col, DsLayout, Row, BF16, BF16, DsDataType, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 224, 256, 64, 8, 8, 16, 16, 7, 8, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 2, S<1, 16, 1, 16>, S<4>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, + DeviceBatchedGemmMultiD_Xdl_CShuffle_V3< Row, Col, DsLayout, Row, BF16, BF16, DsDataType, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 224, 64, 8, 8, 16, 16, 8, 7, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 2, 1, S<1, 32, 1, 8>, S<4>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, + DeviceBatchedGemmMultiD_Xdl_CShuffle_V3< Row, Col, DsLayout, Row, BF16, BF16, DsDataType, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 8, 8, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 16>, S<4>, 
BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, + DeviceBatchedGemmMultiD_Xdl_CShuffle_V3< Row, Col, DsLayout, Row, BF16, BF16, DsDataType, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 8, 8, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 16>, S<4>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>, + DeviceBatchedGemmMultiD_Xdl_CShuffle_V3< Row, Col, DsLayout, Row, BF16, BF16, DsDataType, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 8, 8, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 16>, S<4>, BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1> + // clang-format on + >; + +template , + typename DsDataType = ck::Tuple<>> +using device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_instances = std::tuple< + // clang-format off + //##################################| ALayout| BLayout| DsLayout| CLayout| AData| BData| DsData| CData| AccData| Cshuffle| A| B| C| GEMM| Block| MPer| NPer| KPer| AK1| BK1|MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| Block-wiseGemm| Block-wiseGemm| + //##################################| | | | | Type| Type| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise|Specialization| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| 
ScalarPerVector| Pipeline| Pipeline| + //##################################| | | | | | | | | | | Operation| Operation| Operation| | | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| Scheduler| Verision| + //##################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + + // Latency friendly + DeviceBatchedGemmMultiD_Xdl_CShuffle_V3< Row, Col, DsLayout, Row, BF16, BF16, DsDataType, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 16, 64, 8, 8, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, S<2>, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + DeviceBatchedGemmMultiD_Xdl_CShuffle_V3< Row, Col, DsLayout, Row, BF16, BF16, DsDataType, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 64, 8, 8, 16, 16, 1, 1, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 4>, S<4>, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + DeviceBatchedGemmMultiD_Xdl_CShuffle_V3< Row, Col, DsLayout, Row, BF16, BF16, DsDataType, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 32, 64, 8, 8, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, S<4>, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + DeviceBatchedGemmMultiD_Xdl_CShuffle_V3< Row, Col, DsLayout, Row, BF16, BF16, DsDataType, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 32, 64, 4, 4, 16, 16, 1, 1, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, 1, 1, S<1, 16, 1, 8>, S<4>, BlkGemmPipeSched, 
BlockGemmPipelineVersion::v1>, + DeviceBatchedGemmMultiD_Xdl_CShuffle_V3< Row, Col, DsLayout, Row, BF16, BF16, DsDataType, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 32, 64, 2, 2, 16, 16, 1, 1, S<32, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, 0, S<32, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, 0, 1, 1, S<1, 16, 1, 8>, S<4>, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + // Memory friendly + DeviceBatchedGemmMultiD_Xdl_CShuffle_V3< Row, Col, DsLayout, Row, BF16, BF16, DsDataType, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 16, 64, 8, 8, 16, 16, 4, 1, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, S<2>, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceBatchedGemmMultiD_Xdl_CShuffle_V3< Row, Col, DsLayout, Row, BF16, BF16, DsDataType, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 16, 64, 4, 4, 16, 16, 4, 1, S<16,16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, 1, 1, S<1, 32, 1, 8>, S<2>, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceBatchedGemmMultiD_Xdl_CShuffle_V3< Row, Col, DsLayout, Row, BF16, BF16, DsDataType, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 16, 64, 2, 2, 16, 16, 4, 1, S<32, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, 0, S<32, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, 0, 1, 1, S<1, 32, 1, 8>, S<2>, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceBatchedGemmMultiD_Xdl_CShuffle_V3< Row, Col, DsLayout, Row, BF16, BF16, DsDataType, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 128, 16, 64, 8, 8, 16, 16, 4, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, S<2>, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceBatchedGemmMultiD_Xdl_CShuffle_V3< Row, Col, DsLayout, Row, BF16, BF16, DsDataType, 
BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 64, 16, 64, 8, 8, 16, 16, 2, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, S<2>, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceBatchedGemmMultiD_Xdl_CShuffle_V3< Row, Col, DsLayout, Row, BF16, BF16, DsDataType, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 16, 64, 8, 8, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, S<2>, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceBatchedGemmMultiD_Xdl_CShuffle_V3< Row, Col, DsLayout, Row, BF16, BF16, DsDataType, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 64, 8, 8, 16, 16, 1, 1, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 4>, S<4>, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceBatchedGemmMultiD_Xdl_CShuffle_V3< Row, Col, DsLayout, Row, BF16, BF16, DsDataType, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 32, 64, 8, 8, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, S<4>, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceBatchedGemmMultiD_Xdl_CShuffle_V3< Row, Col, DsLayout, Row, BF16, BF16, DsDataType, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 64, 64, 8, 8, 16, 16, 1, 2, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, S<4>, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceBatchedGemmMultiD_Xdl_CShuffle_V3< Row, Col, DsLayout, Row, BF16, BF16, DsDataType, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 128, 64, 8, 8, 16, 16, 1, 4, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 
S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, S<4>, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceBatchedGemmMultiD_Xdl_CShuffle_V3< Row, Col, DsLayout, Row, BF16, BF16, DsDataType, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 16, 256, 64, 8, 8, 16, 16, 1, 4, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 16>, S<4>, BlkGemmPipeSched, BlockGemmPipelineVersion::v2> + // clang-format on + >; +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_bf16_bf16_bf16/device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_comp_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_bf16_bf16_bf16/device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_comp_default_instance.cpp new file mode 100644 index 000000000..12aa7c380 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_bf16_bf16_bf16/device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_comp_default_instance.cpp @@ -0,0 +1,32 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_comp_default_instances( + std::vector, + Row, + BF16, + BF16, + ck::Tuple<>, + BF16, + PassThrough, + PassThrough, + PassThrough>>>& instances) +{ + add_device_operation_instances( + instances, + device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_comp_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_bf16_bf16_bf16/device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_v1_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_bf16_bf16_bf16/device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_v1_default_instance.cpp new file mode 100644 index 000000000..1dbf5f3d1 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_bf16_bf16_bf16/device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_v1_default_instance.cpp @@ -0,0 +1,33 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_v1_default_instances( + std::vector, + Row, + BF16, + BF16, + ck::Tuple<>, + BF16, + PassThrough, + PassThrough, + PassThrough>>>& instances) +{ + add_device_operation_instances( + instances, + device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_bf16_bf16_bf16/device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_v2_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_bf16_bf16_bf16/device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_v2_default_instance.cpp new file mode 100644 index 000000000..f532309a5 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_bf16_bf16_bf16/device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_v2_default_instance.cpp @@ -0,0 +1,33 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_v2_default_instances( + std::vector, + Row, + BF16, + BF16, + ck::Tuple<>, + BF16, + PassThrough, + PassThrough, + PassThrough>>>& instances) +{ + add_device_operation_instances( + instances, + device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_f8_f8_bf16/device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_f8_f8_bf16/device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn.hpp new file mode 100644 index 000000000..355dc3212 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_f8_f8_bf16/device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn.hpp @@ -0,0 +1,109 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_xdl_cshuffle_v3.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F8 = f8_t; +using BF16 = bhalf_t; +using F32 = float; + +using Row = tensor_layout::gemm::RowMajor; +using Col = tensor_layout::gemm::ColumnMajor; + +template +using S = Sequence; + +using PassThrough = element_wise::PassThrough; + +static constexpr auto GemmDefault = GemmSpecialization::Default; +static constexpr auto GemmKPadding = GemmSpecialization::KPadding; +static constexpr auto GemmMNPadding = GemmSpecialization::MNPadding; +static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding; + +static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave; +static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave; + +template , + typename DsDataType = ck::Tuple<>> +using device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_comp_instances = std::tuple< +// clang-format off + //##################################| ALayout| BLayout| DsLayout| CLayout|AData| BData| DsData| CData| AccData| Cshuffle| A| B| C| GEMM| Block| MPer| NPer| KPer| AK1| BK1|MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| Block-wiseGemm| Block-wiseGemm| + //##################################| | | | | Type| Type| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise|Specialization| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| 
SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| Pipeline| Pipeline| + //##################################| | | | | | | | | | | Operation| Operation| Operation| | | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| Scheduler| Verision| + //##################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | +#ifdef __gfx94__ + // Compute friendly + DeviceBatchedGemmMultiD_Xdl_CShuffle_V3< Row, Col, DsLayout, Row, F8, F8, DsDataType, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 64, 16, 16, 32, 32, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, S<8>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4, F8>, + DeviceBatchedGemmMultiD_Xdl_CShuffle_V3< Row, Col, DsLayout, Row, F8, F8, DsDataType, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 128, 16, 16, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, S<8>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4, F8>, + DeviceBatchedGemmMultiD_Xdl_CShuffle_V3< Row, Col, DsLayout, Row, F8, F8, DsDataType, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 16, 16, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, S<8>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4, F8>, + DeviceBatchedGemmMultiD_Xdl_CShuffle_V3< Row, Col, 
DsLayout, Row, F8, F8, DsDataType, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 128, 16, 16, 16, 16, 8, 8, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 2, S<1, 32, 1, 8>, S<8>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>, + DeviceBatchedGemmMultiD_Xdl_CShuffle_V3< Row, Col, DsLayout, Row, F8, F8, DsDataType, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 64, 16, 16, 16, 16, 8, 8, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 2, S<1, 32, 1, 8>, S<8>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>, + DeviceBatchedGemmMultiD_Xdl_CShuffle_V3< Row, Col, DsLayout, Row, F8, F8, DsDataType, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 224, 256, 128, 16, 16, 16, 16, 7, 8, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 2, S<1, 32, 1, 8>, S<8>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>, + DeviceBatchedGemmMultiD_Xdl_CShuffle_V3< Row, Col, DsLayout, Row, F8, F8, DsDataType, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 224, 128, 16, 16, 16, 16, 8, 7, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 2, 1, S<1, 64, 1, 4>, S<8>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>, + DeviceBatchedGemmMultiD_Xdl_CShuffle_V3< Row, Col, DsLayout, Row, F8, F8, DsDataType, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 128, 16, 16, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, S<8>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>, + DeviceBatchedGemmMultiD_Xdl_CShuffle_V3< Row, Col, 
DsLayout, Row, F8, F8, DsDataType, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 128, 16, 16, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, S<8>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5, F8>, + DeviceBatchedGemmMultiD_Xdl_CShuffle_V3< Row, Col, DsLayout, Row, F8, F8, DsDataType, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 256, 64, 16, 16, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, S<8>, BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1, F8>, + DeviceBatchedGemmMultiD_Xdl_CShuffle_V3< Row, Col, DsLayout, Row, F8, F8, DsDataType, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 128, 64, 16, 16, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, S<8>, BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1, F8>, + DeviceBatchedGemmMultiD_Xdl_CShuffle_V3< Row, Col, DsLayout, Row, F8, F8, DsDataType, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 128, 16, 16, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, S<8>, BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1, F8>, + DeviceBatchedGemmMultiD_Xdl_CShuffle_V3< Row, Col, DsLayout, Row, F8, F8, DsDataType, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 64, 128, 16, 16, 32, 32, 2, 1, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, S<8>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>, + DeviceBatchedGemmMultiD_Xdl_CShuffle_V3< Row, Col, 
DsLayout, Row, F8, F8, DsDataType, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 64, 64, 128, 16, 16, 32, 32, 1, 1, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, S<8>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8> +#endif + // clang-format on + >; + +template , + typename DsDataType = ck::Tuple<>> +using device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_instances = std::tuple< +// clang-format off + //##################################| ALayout| BLayout| DsLayout| CLayout|AData| BData| DsData| CData| AccData| Cshuffle| A| B| C| GEMM| Block| MPer| NPer| KPer| AK1| BK1|MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| Block-wiseGemm| Block-wiseGemm| + //##################################| | | | | Type| Type| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise|Specialization| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| Pipeline| Pipeline| + //##################################| | | | | | | | | | | Operation| Operation| Operation| | | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| Scheduler| Verision| + //##################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | +#if 
defined(__gfx94__) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) + DeviceBatchedGemmMultiD_Xdl_CShuffle_V3< Row, Col, DsLayout, Row, F8, F8, DsDataType, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 16, 128, 16, 16, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, S<2>, BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>, + DeviceBatchedGemmMultiD_Xdl_CShuffle_V3< Row, Col, DsLayout, Row, F8, F8, DsDataType, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 128, 16, 16, 16, 16, 1, 1, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 4>, S<4>, BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>, + DeviceBatchedGemmMultiD_Xdl_CShuffle_V3< Row, Col, DsLayout, Row, F8, F8, DsDataType, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 256, 16, 16, 16, 16, 1, 1, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 4>, S<4>, BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>, + DeviceBatchedGemmMultiD_Xdl_CShuffle_V3< Row, Col, DsLayout, Row, F8, F8, DsDataType, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 512, 16, 16, 16, 16, 1, 1, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 4>, S<4>, BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>, + DeviceBatchedGemmMultiD_Xdl_CShuffle_V3< Row, Col, DsLayout, Row, F8, F8, DsDataType, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 32, 128, 16, 16, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, S<4>, BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>, + DeviceBatchedGemmMultiD_Xdl_CShuffle_V3< Row, Col, 
DsLayout, Row, F8, F8, DsDataType, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 32, 256, 16, 16, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, S<4>, BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>, + DeviceBatchedGemmMultiD_Xdl_CShuffle_V3< Row, Col, DsLayout, Row, F8, F8, DsDataType, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 32, 512, 16, 16, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, S<4>, BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>, + DeviceBatchedGemmMultiD_Xdl_CShuffle_V3< Row, Col, DsLayout, Row, F8, F8, DsDataType, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 32, 128, 16, 16, 32, 32, 2, 1, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, S<4>, BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>, + DeviceBatchedGemmMultiD_Xdl_CShuffle_V3< Row, Col, DsLayout, Row, F8, F8, DsDataType, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 16, 128, 16, 16, 16, 16, 4, 1, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, S<2>, BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>, + DeviceBatchedGemmMultiD_Xdl_CShuffle_V3< Row, Col, DsLayout, Row, F8, F8, DsDataType, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 128, 32, 128, 16, 16, 32, 32, 2, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, S<4>, BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>, + DeviceBatchedGemmMultiD_Xdl_CShuffle_V3< Row, Col, DsLayout, Row, F8, F8, DsDataType, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 128, 
16, 128, 16, 16, 16, 16, 4, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, S<2>, BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>, + DeviceBatchedGemmMultiD_Xdl_CShuffle_V3< Row, Col, DsLayout, Row, F8, F8, DsDataType, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 64, 32, 128, 16, 16, 32, 32, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, S<4>, BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>, + DeviceBatchedGemmMultiD_Xdl_CShuffle_V3< Row, Col, DsLayout, Row, F8, F8, DsDataType, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 64, 16, 128, 16, 16, 16, 16, 2, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, S<2>, BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>, + DeviceBatchedGemmMultiD_Xdl_CShuffle_V3< Row, Col, DsLayout, Row, F8, F8, DsDataType, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 16, 128, 16, 16, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, S<2>, BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>, + DeviceBatchedGemmMultiD_Xdl_CShuffle_V3< Row, Col, DsLayout, Row, F8, F8, DsDataType, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 64, 16, 16, 16, 16, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 4>, S<4>, BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>, + DeviceBatchedGemmMultiD_Xdl_CShuffle_V3< Row, Col, DsLayout, Row, F8, F8, DsDataType, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 128, 16, 16, 16, 16, 1, 1, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 
2, 16, 16, 0, 1, 1, S<1, 16, 1, 4>, S<4>, BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>, + DeviceBatchedGemmMultiD_Xdl_CShuffle_V3< Row, Col, DsLayout, Row, F8, F8, DsDataType, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 256, 16, 16, 16, 16, 1, 1, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 4>, S<4>, BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>, + DeviceBatchedGemmMultiD_Xdl_CShuffle_V3< Row, Col, DsLayout, Row, F8, F8, DsDataType, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 512, 16, 16, 16, 16, 1, 1, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 4>, S<4>, BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>, + DeviceBatchedGemmMultiD_Xdl_CShuffle_V3< Row, Col, DsLayout, Row, F8, F8, DsDataType, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 32, 128, 16, 16, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, S<4>, BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>, + DeviceBatchedGemmMultiD_Xdl_CShuffle_V3< Row, Col, DsLayout, Row, F8, F8, DsDataType, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 32, 256, 16, 16, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, S<4>, BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>, + DeviceBatchedGemmMultiD_Xdl_CShuffle_V3< Row, Col, DsLayout, Row, F8, F8, DsDataType, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 32, 512, 16, 16, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, S<4>, BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>, + 
DeviceBatchedGemmMultiD_Xdl_CShuffle_V3< Row, Col, DsLayout, Row, F8, F8, DsDataType, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 64, 128, 16, 16, 16, 16, 1, 2, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, S<4>, BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>, + DeviceBatchedGemmMultiD_Xdl_CShuffle_V3< Row, Col, DsLayout, Row, F8, F8, DsDataType, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 64, 128, 16, 16, 32, 32, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, S<8>, BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>, + DeviceBatchedGemmMultiD_Xdl_CShuffle_V3< Row, Col, DsLayout, Row, F8, F8, DsDataType, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 128, 128, 16, 16, 16, 16, 1, 4, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, S<4>, BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>, + DeviceBatchedGemmMultiD_Xdl_CShuffle_V3< Row, Col, DsLayout, Row, F8, F8, DsDataType, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 128, 128, 16, 16, 32, 32, 1, 2, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, S<8>, BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>, + DeviceBatchedGemmMultiD_Xdl_CShuffle_V3< Row, Col, DsLayout, Row, F8, F8, DsDataType, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 16, 256, 128, 16, 16, 16, 16, 1, 4, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 16>, S<4>, BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>, + DeviceBatchedGemmMultiD_Xdl_CShuffle_V3< Row, Col, DsLayout, Row, F8, F8, DsDataType, BF16, F32, BF16, 
PassThrough, PassThrough, PassThrough, GemmSpec, 256, 32, 256, 128, 16, 16, 32, 32, 1, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 16>, S<8>, BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8> +#endif + // clang-format on + >; +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_f8_f8_bf16/device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_comp_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_f8_f8_bf16/device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_comp_default_instance.cpp new file mode 100644 index 000000000..7f19a0112 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_f8_f8_bf16/device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_comp_default_instance.cpp @@ -0,0 +1,32 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_comp_default_instances( + std::vector, + Row, + F8, + F8, + ck::Tuple<>, + BF16, + PassThrough, + PassThrough, + PassThrough>>>& instances) +{ + add_device_operation_instances( + instances, + device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_comp_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_f8_f8_bf16/device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_v1_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_f8_f8_bf16/device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_v1_default_instance.cpp new file mode 100644 index 000000000..4489a974b --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_f8_f8_bf16/device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_v1_default_instance.cpp @@ -0,0 +1,33 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_v1_default_instances( + std::vector, + Row, + F8, + F8, + ck::Tuple<>, + BF16, + PassThrough, + PassThrough, + PassThrough>>>& instances) +{ + add_device_operation_instances( + instances, + device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_f8_f8_bf16/device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_v2_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_f8_f8_bf16/device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_v2_default_instance.cpp new file mode 100644 index 000000000..afbc9afb9 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_f8_f8_bf16/device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_v2_default_instance.cpp @@ -0,0 +1,33 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_v2_default_instances( + std::vector, + Row, + F8, + F8, + ck::Tuple<>, + BF16, + PassThrough, + PassThrough, + PassThrough>>>& instances) +{ + add_device_operation_instances( + instances, + device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/profiler/include/profiler/profile_gemm_universal_batched_impl.hpp b/profiler/include/profiler/profile_gemm_universal_batched_impl.hpp new file mode 100644 index 000000000..53f81162a --- /dev/null +++ b/profiler/include/profiler/profile_gemm_universal_batched_impl.hpp @@ -0,0 +1,280 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_batched_gemm.hpp" +#include "ck/tensor_operation/gpu/device/device_batched_gemm_multi_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/gpu/batched_gemm.hpp" +#include "ck/library/tensor_operation_instance/gpu/batched_gemm_multi_d.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp" + +namespace ck { +namespace profiler { + +template +bool profile_gemm_universal_batched_impl(int do_verification, + int init_method, + bool do_log, + bool time_kernel, + int M, + int N, + int K, + int BatchStrideA, + int BatchStrideB, + int BatchStrideC, + int StrideA, + int StrideB, + int StrideC, + int BatchCount, + int n_warmup, + int n_iter, + uint64_t rotating = 0) +{ + bool pass = true; + + auto f_host_tensor_descriptor = [](std::size_t batch_count, + std::size_t row, + std::size_t col, + std::size_t stride, + std::size_t batch_stride, + auto layout) { + using namespace ck::literals; + + if(is_same::value) + { + return HostTensorDescriptor({batch_count, row, col}, {batch_stride, stride, 1_uz}); + } + else + { + return HostTensorDescriptor({batch_count, row, col}, {batch_stride, 1_uz, stride}); + } + }; + + Tensor a_g_m_k( + f_host_tensor_descriptor(BatchCount, M, K, StrideA, BatchStrideA, ALayout{})); + Tensor b_g_k_n( + f_host_tensor_descriptor(BatchCount, K, N, StrideB, BatchStrideB, BLayout{})); + Tensor c_g_m_n_host_result( + f_host_tensor_descriptor(BatchCount, M, N, StrideC, BatchStrideC, CLayout{})); + Tensor c_g_m_n_device_result( + 
f_host_tensor_descriptor(BatchCount, M, N, StrideC, BatchStrideC, CLayout{})); + + int total_gemm_needed = + a_g_m_k.GetElementSpaceSizeInBytes() + b_g_k_n.GetElementSpaceSizeInBytes(); + int rotating_count = std::max( + 1, + std::min(n_iter, + static_cast(std::ceil(static_cast(rotating) / total_gemm_needed)))); + + std::cout << "a_g_m_k: " << a_g_m_k.mDesc << std::endl; + std::cout << "b_g_k_n: " << b_g_k_n.mDesc << std::endl; + std::cout << "c_g_m_n: " << c_g_m_n_host_result.mDesc << std::endl; + std::cout << "rotating count: " << rotating_count << std::endl; + + switch(init_method) + { + case 0: break; + case 1: + a_g_m_k.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + b_g_k_n.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + break; + default: + a_g_m_k.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + b_g_k_n.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + } + + const auto a_element_op = AElementOp{}; + const auto b_element_op = BElementOp{}; + const auto c_element_op = CElementOp{}; + + if(do_verification) + { + using ReferenceBatchedGemmInstance = + ck::tensor_operation::host::ReferenceBatchedGemm; + + auto ref_batched_gemm = ReferenceBatchedGemmInstance{}; + auto ref_invoker = ref_batched_gemm.MakeInvoker(); + + auto ref_argument = ref_batched_gemm.MakeArgument( + a_g_m_k, b_g_k_n, c_g_m_n_host_result, a_element_op, b_element_op, c_element_op); + + ref_invoker.Run(ref_argument); + } + + DeviceMem a_device_buf(sizeof(ADataType) * a_g_m_k.mDesc.GetElementSpaceSize()); + DeviceMem b_device_buf(sizeof(BDataType) * b_g_k_n.mDesc.GetElementSpaceSize()); + DeviceMem c_device_buf(sizeof(CDataType) * c_g_m_n_device_result.mDesc.GetElementSpaceSize()); + + a_device_buf.ToDevice(a_g_m_k.mData.data()); + b_device_buf.ToDevice(b_g_k_n.mData.data()); + c_device_buf.ToDevice(c_g_m_n_device_result.mData.data()); + + // get device op instances + const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + 
+ std::cout << "found " << op_ptrs.size() << " instances" << std::endl; + + std::string best_op_name; + float best_ave_time = 0; + float best_tflops = 0; + float best_gb_per_sec = 0; + + // profile device op instances + for(auto& op_ptr : op_ptrs) + { + std::unique_ptr argument_ptr; + // false branch for multi d dl kernel + + argument_ptr = + op_ptr->MakeArgumentPointer(static_cast(a_device_buf.GetDeviceBuffer()), + static_cast(b_device_buf.GetDeviceBuffer()), + {}, + static_cast(c_device_buf.GetDeviceBuffer()), + M, + N, + K, + BatchCount, + StrideA, + StrideB, + {}, + StrideC, + BatchStrideA, + BatchStrideB, + {}, + BatchStrideC, + ck::tensor_operation::element_wise::PassThrough{}, + ck::tensor_operation::element_wise::PassThrough{}, + ck::tensor_operation::element_wise::PassThrough{}); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + // re-init C to zero before profiling next kernel + c_device_buf.SetZero(); + + std::string op_name = op_ptr->GetTypeString(); + + float ave_time = invoker_ptr->Run( + argument_ptr.get(), + StreamConfig{nullptr, time_kernel, 0, n_warmup, n_iter, true, rotating_count}); + + std::size_t flop = std::size_t(2) * BatchCount * M * N * K; + + std::size_t num_btype = (sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + + sizeof(CDataType) * M * N) * + BatchCount; + + float tflops = static_cast(flop) / 1.E9 / ave_time; + + float gb_per_sec = num_btype / 1.E6 / ave_time; + + std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec + << " GB/s, " << op_name << std::endl; + + if(tflops > best_tflops) + { + best_op_name = op_name; + best_tflops = tflops; + best_ave_time = ave_time; + best_gb_per_sec = gb_per_sec; + } + + if(do_verification) + { + c_device_buf.FromDevice(c_g_m_n_device_result.mData.data()); + + pass = pass & ck::utils::check_err(c_g_m_n_device_result, c_g_m_n_host_result); + + if(do_log) + { + LogRangeAsType(std::cout << "a : ", 
a_g_m_k.mData, ",") << std::endl; + LogRangeAsType(std::cout << "b: ", b_g_k_n.mData, ",") << std::endl; + LogRangeAsType(std::cout << "c_host: ", c_g_m_n_host_result.mData, ",") + << std::endl; + LogRangeAsType( + std::cout << "c_device: ", c_g_m_n_device_result.mData, ",") + << std::endl; + } + } + } + else + { + std::cout << op_ptr->GetTypeString() << " does not support this problem" << std::endl; + } + } + + if constexpr(is_same::value) + { + std::cout << "Best Perf for datatype = f32"; + } + else if constexpr(is_same::value) + { + std::cout << "Best Perf for datatype = f16"; + } + else if constexpr(is_same::value) + { + std::cout << "Best Perf for datatype = bf16"; + } + else if constexpr(is_same::value) + { + std::cout << "Best Perf for datatype = int8"; + } + + if constexpr(is_same::value) + { + std::cout << " ALayout = RowMajor"; + } + else if constexpr(is_same::value) + { + std::cout << " ALayout = ColumnMajor"; + } + + if constexpr(is_same::value) + { + std::cout << " BLayout = RowMajor"; + } + else if constexpr(is_same::value) + { + std::cout << " BLayout = ColumnMajor"; + } + + std::cout << " B = " << BatchCount << " M = " << M << " N = " << N << " K = " << K + << " StrideA = " << StrideA << " StrideB = " << StrideB << " StrideC = " << StrideC + << ": " << best_ave_time << " ms, " << best_tflops << " TFlops, " << best_gb_per_sec + << " GB/s, " << best_op_name << std::endl; + + return pass; +} + +} // namespace profiler +} // namespace ck diff --git a/profiler/src/CMakeLists.txt b/profiler/src/CMakeLists.txt index 7d4df3cf9..f079d554b 100644 --- a/profiler/src/CMakeLists.txt +++ b/profiler/src/CMakeLists.txt @@ -59,6 +59,7 @@ if(SUPPORTED_GPU_TARGETS MATCHES "gfx9") list(APPEND PROFILER_SOURCES profile_gemm_bias_add_reduce.cpp) list(APPEND PROFILER_SOURCES profile_gemm_splitk.cpp) list(APPEND PROFILER_SOURCES profile_gemm_universal.cpp) + list(APPEND PROFILER_SOURCES profile_gemm_universal_batched.cpp) list(APPEND PROFILER_SOURCES 
profile_gemm_universal_reduce.cpp) list(APPEND PROFILER_SOURCES profile_gemm_universal_streamk.cpp) list(APPEND PROFILER_SOURCES profile_conv_fwd_bias_relu.cpp) @@ -141,6 +142,7 @@ if(SUPPORTED_GPU_TARGETS MATCHES "gfx9") endif() target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_splitk_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_universal_instance) + target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_universal_batched_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_universal_reduce_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_universal_streamk_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_multiply_instance) diff --git a/profiler/src/profile_gemm_universal_batched.cpp b/profiler/src/profile_gemm_universal_batched.cpp new file mode 100644 index 000000000..4afef8e55 --- /dev/null +++ b/profiler/src/profile_gemm_universal_batched.cpp @@ -0,0 +1,187 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include +#include +#include +#include +#include + +#include "profiler/profile_gemm_universal_batched_impl.hpp" +#include "profiler_operation_registry.hpp" + +#include "ck/library/tensor_operation_instance/gpu/gemm_universal_batched.hpp" + +enum struct GemmMatrixLayout +{ + MK_KN_MN, // 0 + MK_NK_MN, // 1 + KM_KN_MN, // 2 + KM_NK_MN, // 3 +}; + +enum struct GemmDataType +{ + BF16_BF16_BF16, // 0 + F8_F8_BF16, // 1 +}; + +#define OP_NAME "gemm_universal_batched" +#define OP_DESC "Batched GEMM Universal" + +int profile_batched_gemm_universal(int argc, char* argv[]) +{ + if(argc != 18 && argc != 21) + { + // clang-format off + printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n"); + printf("arg2: data type (0: bf16, 1: fp8->bf16)\n"); + printf("arg3: matrix layout (0: A[g, m, k] * B[g, k, n] = C[g, m, n];\n"); + printf(" 1: A[g, m, k] * B[g, n, k] = C[g, m, n];\n"); + printf(" 2: A[g, k, m] * B[g, k, n] = C[g, m, n];\n"); + printf(" 3: A[g, k, m] * B[g, n, k] = C[g, m, n])\n"); + printf("arg4: verification (0: no; 1: yes)\n"); + printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n"); + printf("arg6: print tensor value (0: no; 1: yes)\n"); + printf("arg7: time kernel (0=n0, 1=yes)\n"); + printf("arg8 to 17: M, N, K, StrideA, StrideB, StrideC, BatchStrideA, BatchStrideB, BatchStrideC, BatchCount\n"); + printf("optional:\n"); + printf("arg18: number of warm-up cycles (default 1)\n"); + printf("arg19: number of iterations (default 10)\n"); + printf("arg20: memory for rotating buffer (default 0, size in MB)\n"); + // clang-format on + exit(1); + } + + int n_warmup = 1; + int n_iter = 10; + uint64_t rotating = 0; + if(argc == 21) + { + n_warmup = std::stoi(argv[18]); + n_iter = std::stoi(argv[19]); + rotating = std::stoull(argv[20]) * 1024 * 1024; + } + + const auto data_type = static_cast(std::stoi(argv[2])); + const auto layout = static_cast(std::stoi(argv[3])); + const bool do_verification = std::stoi(argv[4]); + const int 
init_method = std::stoi(argv[5]); + const bool do_log = std::stoi(argv[6]); + const bool time_kernel = std::stoi(argv[7]); + + const int M = std::stoi(argv[8]); + const int N = std::stoi(argv[9]); + const int K = std::stoi(argv[10]); + + const int StrideA = std::stoi(argv[11]); + const int StrideB = std::stoi(argv[12]); + const int StrideC = std::stoi(argv[13]); + + const int BatchStrideA = std::stoi(argv[14]); + const int BatchStrideB = std::stoi(argv[15]); + const int BatchStrideC = std::stoi(argv[16]); + + const int BatchCount = std::stoi(argv[17]); + +#if defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || defined(CK_USE_GFX94) + using F8 = ck::f8_t; +#endif + using BF16 = ck::bhalf_t; + + using Row = ck::tensor_layout::gemm::RowMajor; + using Col = ck::tensor_layout::gemm::ColumnMajor; + + auto profile = + [&](auto a_type, auto b_type, auto c_type, auto a_layout, auto b_layout, auto c_layout) { + using ADataType = decltype(a_type); + using BDataType = decltype(b_type); + using DsDataType = ck::Tuple<>; + using CDataType = decltype(c_type); + + using ALayout = decltype(a_layout); + using BLayout = decltype(b_layout); + using DsLayout = ck::Tuple<>; + using CLayout = decltype(c_layout); + + const int DefaultStrideA = ck::is_same_v ? K : M; + const int DefaultStrideB = ck::is_same_v ? N : K; + const int DefaultStrideC = ck::is_same_v ? N : M; + + const int StrideA_ = (StrideA < 0) ? DefaultStrideA : StrideA; + const int StrideB_ = (StrideB < 0) ? DefaultStrideB : StrideB; + const int StrideC_ = (StrideC < 0) ? DefaultStrideC : StrideC; + + const int DefaultBatchStrideA = (ck::is_same_v ? M : K) * StrideA_; + const int DefaultBatchStrideB = (ck::is_same_v ? K : N) * StrideB_; + const int DefaultBatchStrideC = (ck::is_same_v ? M : N) * StrideC_; + + const int BatchStrideA_ = (BatchStrideA < 0) ? DefaultBatchStrideA : BatchStrideA; + const int BatchStrideB_ = (BatchStrideB < 0) ? DefaultBatchStrideB : BatchStrideB; + const int BatchStrideC_ = (BatchStrideC < 0) ? 
DefaultBatchStrideC : BatchStrideC; + + using AElementOp = ck::tensor_operation::element_wise::PassThrough; + using BElementOp = ck::tensor_operation::element_wise::PassThrough; + using CElementOp = ck::tensor_operation::element_wise::PassThrough; + + using DeviceOp = ck::tensor_operation::device::DeviceBatchedGemmV2MultiD; + + bool pass = ck::profiler::profile_gemm_universal_batched_impl(do_verification, + init_method, + do_log, + time_kernel, + M, + N, + K, + BatchStrideA_, + BatchStrideB_, + BatchStrideC_, + StrideA_, + StrideB_, + StrideC_, + BatchCount, + n_warmup, + n_iter, + rotating); + + return pass ? 0 : 1; + }; + + if(data_type == GemmDataType::BF16_BF16_BF16 && layout == GemmMatrixLayout::MK_NK_MN) + { + return profile(BF16{}, BF16{}, BF16{}, Row{}, Col{}, Row{}); + } +#if defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || defined(CK_USE_GFX94) + else if(data_type == GemmDataType::F8_F8_BF16 && layout == GemmMatrixLayout::MK_NK_MN) + { + return profile(F8{}, F8{}, BF16{}, Row{}, Col{}, Row{}); + } +#endif + else + { + std::cout << "this data_type & layout is not implemented" << std::endl; + + return 1; + } +} + +REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_batched_gemm_universal); -- GitLab From 8aba2724cc9a3bc9ddaa7e26055169e014f8dab7 Mon Sep 17 00:00:00 2001 From: Illia Silin <98187287+illsilin@users.noreply.github.com> Date: Mon, 18 Nov 2024 14:07:04 -0800 Subject: [PATCH 062/153] Add bf16 and int8 wmma gemms for Navi3x and Navi4x. 
(#1671) * add bf16 gemms for gfx11/gfx12 * reduce the input values in test_gemm * add int8 wmma gemm instances for gfx11/gfx12 * add example gemm_wmma_int8 * fix bug in gemm_wmma_int8 test * increase bf16 gemm test tolerance * update the dates and clean-up commented-out instances --- example/01_gemm/CMakeLists.txt | 4 + example/01_gemm/gemm_wmma_bf16.cpp | 84 +++++++++++++++++++ example/01_gemm/gemm_wmma_int8.cpp | 84 +++++++++++++++++++ include/ck/utility/amd_wmma.hpp | 11 +-- .../tensor_operation_instance/gpu/gemm.hpp | 52 ++++++++++++ .../gpu/gemm_wmma.inc | 40 +++++++++ .../include/ck/library/utility/check_err.hpp | 2 +- .../gpu/gemm/CMakeLists.txt | 33 +++----- ..._wmma_bf16_bf16_bf16_km_kn_mn_instance.cpp | 77 +++++++++++++++++ ..._wmma_bf16_bf16_bf16_km_nk_mn_instance.cpp | 77 +++++++++++++++++ ..._wmma_bf16_bf16_bf16_mk_kn_mn_instance.cpp | 77 +++++++++++++++++ ..._wmma_bf16_bf16_bf16_mk_nk_mn_instance.cpp | 77 +++++++++++++++++ ..._wmma_int8_int8_int8_km_kn_mn_instance.cpp | 76 +++++++++++++++++ ..._wmma_int8_int8_int8_km_nk_mn_instance.cpp | 76 +++++++++++++++++ ..._wmma_int8_int8_int8_mk_kn_mn_instance.cpp | 76 +++++++++++++++++ ..._wmma_int8_int8_int8_mk_nk_mn_instance.cpp | 76 +++++++++++++++++ 16 files changed, 896 insertions(+), 26 deletions(-) create mode 100644 example/01_gemm/gemm_wmma_bf16.cpp create mode 100644 example/01_gemm/gemm_wmma_int8.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm/device_gemm_wmma_bf16_bf16_bf16_km_kn_mn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm/device_gemm_wmma_bf16_bf16_bf16_km_nk_mn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm/device_gemm_wmma_bf16_bf16_bf16_mk_kn_mn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm/device_gemm_wmma_bf16_bf16_bf16_mk_nk_mn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm/device_gemm_wmma_int8_int8_int8_km_kn_mn_instance.cpp create mode 
100644 library/src/tensor_operation_instance/gpu/gemm/device_gemm_wmma_int8_int8_int8_km_nk_mn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm/device_gemm_wmma_int8_int8_int8_mk_kn_mn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm/device_gemm_wmma_int8_int8_int8_mk_nk_mn_instance.cpp diff --git a/example/01_gemm/CMakeLists.txt b/example/01_gemm/CMakeLists.txt index 98fd9c6b7..52c8ab580 100644 --- a/example/01_gemm/CMakeLists.txt +++ b/example/01_gemm/CMakeLists.txt @@ -83,3 +83,7 @@ add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp16_fp8) add_custom_target(example_gemm_wmma) add_example_executable(example_gemm_wmma_fp16 gemm_wmma_fp16.cpp) add_example_dependencies(example_gemm_wmma example_gemm_wmma_fp16) +add_example_executable(example_gemm_wmma_bf16 gemm_wmma_bf16.cpp) +add_example_dependencies(example_gemm_wmma example_gemm_wmma_bf16) +add_example_executable(example_gemm_wmma_int8 gemm_wmma_int8.cpp) +add_example_dependencies(example_gemm_wmma example_gemm_wmma_int8) diff --git a/example/01_gemm/gemm_wmma_bf16.cpp b/example/01_gemm/gemm_wmma_bf16.cpp new file mode 100644 index 000000000..a87426094 --- /dev/null +++ b/example/01_gemm/gemm_wmma_bf16.cpp @@ -0,0 +1,84 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "common.hpp" + +#include "ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp" + +using ADataType = ck::bhalf_t; +using BDataType = ck::bhalf_t; +using AccDataType = float; +using CShuffleDataType = float; +using CDataType = ck::bhalf_t; + +using ALayout = Row; +using BLayout = Col; +using CLayout = Row; + +using AElementOp = PassThrough; +using BElementOp = PassThrough; +using CElementOp = PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::MNKPadding; + +// clang-format off +using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmWmma_CShuffle + < ALayout, + BLayout, + CLayout, + ADataType, + BDataType, + CDataType, + AccDataType, + CShuffleDataType, + AElementOp, + BElementOp, + CElementOp, + GemmDefault, + 1, // Prefetch stage + 128, // BlockSize + 64, // MPerBlock + 128, // NPerBlock + 64, // KPerBlock + 2, // K1 + 16, // MPerWmma + 16, // NPerWmma + 2, // M-Repeat // M-PerWmma / M-Repeat = M-Wave + 4, // N-Repeat // N-PerWmma / N-Repeat = N-Wave + S<4, 32, 1>, + S<1, 0, 2>, + S<1, 0, 2>, + 2, + 2, + 2, + true, + S<4, 32, 1>, + S<1, 0, 2>, + S<1, 0, 2>, + 2, + 2, + 2, + true, + 1, // C shuffle (M Repeat) Per store + 1, // C shuffle (N Repeat) Per store + S<1, 32, 1, 4>, + 8>; +// clang-format on + +using ReferenceGemmInstance = ck::tensor_operation::host:: + ReferenceGemm; + +using ReferenceGemmInstanceGPU = ck::tensor_operation::device::ReferenceGemm; + +#include "run_gemm_example.inc" + +int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); } diff --git a/example/01_gemm/gemm_wmma_int8.cpp b/example/01_gemm/gemm_wmma_int8.cpp new file mode 100644 index 000000000..a88e42d42 --- /dev/null +++ b/example/01_gemm/gemm_wmma_int8.cpp @@ -0,0 +1,84 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "common.hpp" + +#include "ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp" + +using ADataType = int8_t; +using BDataType = int8_t; +using AccDataType = int32_t; +using CShuffleDataType = int32_t; +using CDataType = int8_t; + +using ALayout = Row; +using BLayout = Col; +using CLayout = Row; + +using AElementOp = PassThrough; +using BElementOp = PassThrough; +using CElementOp = PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::MNKPadding; + +// clang-format off +using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmWmma_CShuffle + < ALayout, + BLayout, + CLayout, + ADataType, + BDataType, + CDataType, + AccDataType, + CShuffleDataType, + AElementOp, + BElementOp, + CElementOp, + GemmDefault, + 1, // Prefetch stage + 128, // BlockSize + 64, // MPerBlock + 128, // NPerBlock + 64, // KPerBlock + 2, // K1 + 16, // MPerWmma + 16, // NPerWmma + 2, // M-Repeat // M-PerWmma / M-Repeat = M-Wave + 4, // N-Repeat // N-PerWmma / N-Repeat = N-Wave + S<4, 32, 1>, + S<1, 0, 2>, + S<1, 0, 2>, + 2, + 2, + 2, + true, + S<4, 32, 1>, + S<1, 0, 2>, + S<1, 0, 2>, + 2, + 2, + 2, + true, + 1, // C shuffle (M Repeat) Per store + 1, // C shuffle (N Repeat) Per store + S<1, 32, 1, 4>, + 8>; +// clang-format on + +using ReferenceGemmInstance = ck::tensor_operation::host:: + ReferenceGemm; + +using ReferenceGemmInstanceGPU = ck::tensor_operation::device::ReferenceGemm; + +#include "run_gemm_example.inc" + +int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); } diff --git a/include/ck/utility/amd_wmma.hpp b/include/ck/utility/amd_wmma.hpp index d04513f3e..aa519fb2b 100644 --- a/include/ck/utility/amd_wmma.hpp +++ b/include/ck/utility/amd_wmma.hpp @@ -13,6 +13,11 @@ namespace ck { defined(__gfx1103__) || defined(__gfx11_generic__) #define __gfx11__ #endif + +#if defined(__gfx1200__) || defined(__gfx1201__) || defined(__gfx12_generic__) +#define __gfx12__ +#endif + 
/********************************WAVE32 MODE***********************************************/ // src: fp16, dst: fp32 @@ -99,7 +104,7 @@ struct intrin_wmma_bf16_16x16x16_bf16_w32<16, 16, Opsel> // opsel usage // false: D0.[0:15] = result // true : D0.[16:31]= result -#if defined(__gfx11__) +#if defined(__gfx11__) || defined(__gfx12__) reg_c.template AsType()(Number<0>{}) = __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32( reg_a, reg_b, reg_c.template AsType()[Number<0>{}], Opsel); @@ -261,10 +266,6 @@ struct intrin_wmma_i32_16x16x16_iu8_w64<16, 16, neg_a, neg_b, clamp> // gfx12 /********************************WAVE32 MODE***********************************************/ -#if defined(__gfx1200__) || defined(__gfx1201__) || defined(__gfx12_generic__) -#define __gfx12__ -#endif - // src: fp16, dst: fp32 template struct intrin_wmma_f32_16x16x16_f16_w32_gfx12; diff --git a/library/include/ck/library/tensor_operation_instance/gpu/gemm.hpp b/library/include/ck/library/tensor_operation_instance/gpu/gemm.hpp index 50c18fc22..3b3baf697 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/gemm.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm.hpp @@ -180,6 +180,58 @@ struct DeviceOperationInstanceFactory< } } #endif +#ifdef CK_ENABLE_BF16 + if constexpr(is_same_v && is_same_v && + is_same_v) + { + if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_gemm_wmma_bf16_bf16_bf16_mk_kn_mn_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_gemm_wmma_bf16_bf16_bf16_mk_nk_mn_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_gemm_wmma_bf16_bf16_bf16_km_kn_mn_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_gemm_wmma_bf16_bf16_bf16_km_nk_mn_instances(op_ptrs); + } + } +#endif +#ifdef CK_ENABLE_INT8 + if constexpr(is_same_v && is_same_v && + is_same_v) + { + if constexpr(is_same_v && 
is_same_v && + is_same_v) + { + add_device_gemm_wmma_int8_int8_int8_mk_kn_mn_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_gemm_wmma_int8_int8_int8_mk_nk_mn_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_gemm_wmma_int8_int8_int8_km_kn_mn_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_gemm_wmma_int8_int8_int8_km_nk_mn_instances(op_ptrs); + } + } +#endif #endif #ifdef CK_USE_XDL diff --git a/library/include/ck/library/tensor_operation_instance/gpu/gemm_wmma.inc b/library/include/ck/library/tensor_operation_instance/gpu/gemm_wmma.inc index c97298c25..c50226335 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/gemm_wmma.inc +++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm_wmma.inc @@ -28,6 +28,46 @@ void add_device_gemm_wmma_f16_f16_f16_mk_nk_mn_instances( DeviceGemm>>& instances); +void add_device_gemm_wmma_bf16_bf16_bf16_km_kn_mn_instances( + std::vector>>& + instances); + +void add_device_gemm_wmma_bf16_bf16_bf16_km_nk_mn_instances( + std::vector>>& + instances); + +void add_device_gemm_wmma_bf16_bf16_bf16_mk_kn_mn_instances( + std::vector>>& + instances); + +void add_device_gemm_wmma_bf16_bf16_bf16_mk_nk_mn_instances( + std::vector>>& + instances); + +void add_device_gemm_wmma_int8_int8_int8_km_kn_mn_instances( + std::vector>>& + instances); + +void add_device_gemm_wmma_int8_int8_int8_km_nk_mn_instances( + std::vector>>& + instances); + +void add_device_gemm_wmma_int8_int8_int8_mk_kn_mn_instances( + std::vector>>& + instances); + +void add_device_gemm_wmma_int8_int8_int8_mk_nk_mn_instances( + std::vector>>& + instances); + } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/include/ck/library/utility/check_err.hpp b/library/include/ck/library/utility/check_err.hpp index 88741c3b9..08bfefb87 100644 --- 
a/library/include/ck/library/utility/check_err.hpp +++ b/library/include/ck/library/utility/check_err.hpp @@ -206,7 +206,7 @@ typename std::enable_if< check_err(const Range& out, const RefRange& ref, const std::string& msg = "Error: Incorrect results!", - double rtol = 1e-3, + double rtol = 1e-1, double atol = 1e-3) { if(out.size() != ref.size()) diff --git a/library/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt index e4efae617..b8ecb4557 100644 --- a/library/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt @@ -2,9 +2,7 @@ set(GEMM_INSTANCES) list(APPEND GEMM_INSTANCES device_gemm_xdl_f64_f64_f64_mk_kn_mn_instance.cpp device_gemm_xdl_f64_f64_f64_mk_nk_mn_instance.cpp device_gemm_xdl_f64_f64_f64_km_kn_mn_instance.cpp - device_gemm_xdl_f64_f64_f64_km_nk_mn_instance.cpp) - -list(APPEND GEMM_INSTANCES + device_gemm_xdl_f64_f64_f64_km_nk_mn_instance.cpp device_gemm_xdl_f32_f32_f32_mk_kn_mn_instance.cpp device_gemm_xdl_f32_f32_f32_mk_nk_mn_instance.cpp device_gemm_xdl_f32_f32_f32_km_kn_mn_instance.cpp @@ -21,9 +19,6 @@ list(APPEND GEMM_INSTANCES device_gemm_dl_f32_f32_f32_mk_nk_mn_instance.cpp device_gemm_dl_f32_f32_f32_km_kn_mn_instance.cpp device_gemm_dl_f32_f32_f32_km_nk_mn_instance.cpp - ) - -list(APPEND GEMM_INSTANCES device_gemm_dl_f16_f16_f16_mk_kn_mn_instance.cpp device_gemm_dl_f16_f16_f16_mk_kn_mn_irregular_instance.cpp device_gemm_dl_f16_f16_f16_mk_nk_mn_instance.cpp @@ -78,9 +73,6 @@ list(APPEND GEMM_INSTANCES device_gemm_xdl_f16_f16_f16/mk_nk_mn_irregular_default_pipeline_v1_instance.cpp device_gemm_xdl_f16_f16_f16/mk_nk_mn_irregular_default_pipeline_v2_instance.cpp device_gemm_xdl_f16_f16_f16/mk_nk_mn_irregular_interwave_pipeline_v1_instance.cpp - ) - -list(APPEND GEMM_INSTANCES device_gemm_dl_i8_i8_i8_mk_kn_mn_instance.cpp device_gemm_dl_i8_i8_i8_mk_kn_mn_irregular_instance.cpp 
device_gemm_dl_i8_i8_i8_mk_nk_mn_instance.cpp @@ -92,15 +84,11 @@ list(APPEND GEMM_INSTANCES device_gemm_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instance.cpp device_gemm_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instance.cpp device_gemm_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instance.cpp - device_gemm_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instance.cpp) - -list(APPEND GEMM_INSTANCES + device_gemm_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instance.cpp device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_kn_mn_instance.cpp device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_nk_mn_instance.cpp device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_kn_mn_instance.cpp - device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_nk_mn_instance.cpp) - -list(APPEND GEMM_INSTANCES + device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_nk_mn_instance.cpp device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_kn_mn_v1_default_instance.cpp device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_kn_mn_v1_interwave_default_instance.cpp device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_kn_mn_v2_default_instance.cpp @@ -109,14 +97,19 @@ list(APPEND GEMM_INSTANCES device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_kn_mn_v2_padded_instance.cpp device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_nk_mn_instance.cpp device_gemm_xdl_c_shuffle_fp8_fp8_fp8_km_kn_mn_instance.cpp - device_gemm_xdl_c_shuffle_fp8_fp8_fp8_km_nk_mn_instance.cpp) - - -list(APPEND GEMM_INSTANCES + device_gemm_xdl_c_shuffle_fp8_fp8_fp8_km_nk_mn_instance.cpp device_gemm_wmma_f16_f16_f16_mk_kn_mn_instance.cpp device_gemm_wmma_f16_f16_f16_mk_nk_mn_instance.cpp device_gemm_wmma_f16_f16_f16_km_kn_mn_instance.cpp - device_gemm_wmma_f16_f16_f16_km_nk_mn_instance.cpp) + device_gemm_wmma_f16_f16_f16_km_nk_mn_instance.cpp + device_gemm_wmma_bf16_bf16_bf16_mk_kn_mn_instance.cpp + device_gemm_wmma_bf16_bf16_bf16_mk_nk_mn_instance.cpp + device_gemm_wmma_bf16_bf16_bf16_km_kn_mn_instance.cpp + device_gemm_wmma_bf16_bf16_bf16_km_nk_mn_instance.cpp + device_gemm_wmma_int8_int8_int8_mk_kn_mn_instance.cpp + device_gemm_wmma_int8_int8_int8_mk_nk_mn_instance.cpp + 
device_gemm_wmma_int8_int8_int8_km_kn_mn_instance.cpp + device_gemm_wmma_int8_int8_int8_km_nk_mn_instance.cpp) add_instance_library(device_gemm_instance ${GEMM_INSTANCES}) diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_wmma_bf16_bf16_bf16_km_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_wmma_bf16_bf16_bf16_km_kn_mn_instance.cpp new file mode 100644 index 000000000..7a952c44d --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_wmma_bf16_bf16_bf16_km_kn_mn_instance.cpp @@ -0,0 +1,77 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using BF16 = ck::bhalf_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding; + +// Compilation parameters for a[k, m] * b[k, n] = c[m, n] +using device_gemm_wmma_bf16_bf16_bf16_km_kn_mn_instances = std::tuple< + // clang-format off + //######################| ALayout| BLayout| CLayout| AData| BData| CData| AccData| CShuffle| A| B| C| GEMM| NumPrefetch| Block| MPer| NPer| KPer| K1| MPer| NPer| M| N| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| 
CShuffle| CShuffle| CShuffleBlockTransfer| CShuffleBlockTransfer| + //######################| | | | Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise|Specialization| | Size| Block| Block| Block| | WMMA| WMMA| Repeat| Repeat| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MRepeat| MRepeat| ClusterLengths| ScalarPerVector| + //######################| | | | | | | | | Operation| Operation| Operation| | | | | | | | | | | | Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerStore| PerStore| MBlock_MPerBlock| | + //######################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | NBlock_NPerBlock| | + /* Prefetch 2, consume enormous vgpr resource*/ + // 8 Waves + DeviceGemmWmma_CShuffle< Col, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 2, 256, 128, 128, 32, 8, 16, 16, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 32, 1, 8>, 8>, + // 4 Waves + DeviceGemmWmma_CShuffle< Col, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 2, 128, 128, 64, 64, 8, 16, 16, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 32, 1, 4>, 8>, + // 2 Waves + DeviceGemmWmma_CShuffle< Col, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 2, 64, 64, 32, 32, 8, 16, 16, 4, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 16, 1, 4>, 8>, + // 1 Wave + DeviceGemmWmma_CShuffle< Col, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 2, 32, 16, 16, 32, 
8, 16, 16, 1, 1, S<2, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<2, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 16, 1, 2>, 8>, + /* Prefetch 1, prefer larger KPerBlock value for better latency hiding*/ + // 8 Waves + DeviceGemmWmma_CShuffle< Col, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 256, 128, 256, 64, 8, 16, 16, 4, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemmWmma_CShuffle< Col, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 256, 128, 128, 64, 8, 16, 16, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemmWmma_CShuffle< Col, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 256, 128, 160, 64, 8, 16, 16, 2, 5, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 64, 1, 4>, 8>, + // 4 Waves + DeviceGemmWmma_CShuffle< Col, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 128, 128, 32, 8, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGemmWmma_CShuffle< Col, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 256, 64, 64, 8, 16, 16, 8, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGemmWmma_CShuffle< Col, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 64, 256, 64, 8, 16, 16, 2, 8, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, 
S<1, 32, 1, 4>, 8>, + DeviceGemmWmma_CShuffle< Col, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 64, 80, 64, 8, 16, 16, 1, 5, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 64, 1, 2>, 8>, + // 2 Waves + DeviceGemmWmma_CShuffle< Col, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 64, 16, 64, 64, 8, 16, 16, 1, 2, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGemmWmma_CShuffle< Col, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 64, 64, 32, 64, 8, 16, 16, 4, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGemmWmma_CShuffle< Col, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 64, 32, 64, 64, 8, 16, 16, 2, 2, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 16, 1, 4>, 8>, + // 1 Wave + DeviceGemmWmma_CShuffle< Col, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 32, 16, 32, 64, 8, 16, 16, 1, 2, S<2, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<2, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 16, 1, 2>, 8>, + DeviceGemmWmma_CShuffle< Col, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 32, 16, 16, 64, 8, 16, 16, 1, 1, S<2, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<2, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 16, 1, 2>, 8> + // clang-format on + >; + +void add_device_gemm_wmma_bf16_bf16_bf16_km_kn_mn_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, 
device_gemm_wmma_bf16_bf16_bf16_km_kn_mn_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_wmma_bf16_bf16_bf16_km_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_wmma_bf16_bf16_bf16_km_nk_mn_instance.cpp new file mode 100644 index 000000000..f0dbee5f5 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_wmma_bf16_bf16_bf16_km_nk_mn_instance.cpp @@ -0,0 +1,77 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using BF16 = ck::bhalf_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding; + +// Compilation parameters for a[k, m] * b[n, k] = c[m, n] +using device_gemm_wmma_bf16_bf16_bf16_km_nk_mn_instances = std::tuple< + // clang-format off + //######################| ALayout| BLayout| CLayout| AData| BData| CData| AccData| CShuffle| A| B| C| GEMM| NumPrefetch| Block| MPer| NPer| KPer| K1| MPer| NPer| M| N| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| 
CShuffle| CShuffleBlockTransfer| CShuffleBlockTransfer| + //######################| | | | Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise|Specialization| | Size| Block| Block| Block| | WMMA| WMMA| Repeat| Repeat| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MRepeat| MRepeat| ClusterLengths| ScalarPerVector| + //######################| | | | | | | | | Operation| Operation| Operation| | | | | | | | | | | | Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerStore| PerStore| MBlock_MPerBlock| | + //######################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | NBlock_NPerBlock| | + /* Prefetch 2, consume enormous vgpr resource*/ + // 8 Waves + DeviceGemmWmma_CShuffle< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 2, 256, 128, 128, 32, 8, 16, 16, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 8>, 8>, + // 4 Waves + DeviceGemmWmma_CShuffle< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 2, 128, 128, 64, 64, 8, 16, 16, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 4>, 8>, + // 2 Waves + DeviceGemmWmma_CShuffle< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 2, 64, 64, 32, 32, 8, 16, 16, 4, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 16, 1, 4>, 8>, + // 1 Wave + DeviceGemmWmma_CShuffle< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 2, 32, 16, 16, 32, 8, 16, 16, 
1, 1, S<2, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 16, 1, 2>, 8>, + /* Prefetch 1, prefer larger KPerBlock value for better latency hiding*/ + // 8 Waves + DeviceGemmWmma_CShuffle< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 256, 128, 256, 64, 8, 16, 16, 4, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemmWmma_CShuffle< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 256, 128, 128, 64, 8, 16, 16, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemmWmma_CShuffle< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 256, 128, 160, 64, 8, 16, 16, 2, 5, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 64, 1, 4>, 8>, + // 4 Waves + DeviceGemmWmma_CShuffle< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 128, 128, 32, 8, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGemmWmma_CShuffle< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 256, 64, 64, 8, 16, 16, 8, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGemmWmma_CShuffle< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 64, 256, 64, 8, 16, 16, 2, 8, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 
4>, 8>, + DeviceGemmWmma_CShuffle< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 64, 80, 64, 8, 16, 16, 1, 5, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 64, 1, 2>, 8>, + // 2 Waves + DeviceGemmWmma_CShuffle< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 64, 16, 64, 64, 8, 16, 16, 1, 2, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGemmWmma_CShuffle< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 64, 64, 32, 64, 8, 16, 16, 4, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGemmWmma_CShuffle< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 64, 32, 64, 64, 8, 16, 16, 2, 2, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 16, 1, 4>, 8>, + // 1 Wave + DeviceGemmWmma_CShuffle< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 32, 16, 32, 64, 8, 16, 16, 1, 2, S<2, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 16, 1, 2>, 8>, + DeviceGemmWmma_CShuffle< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 32, 16, 16, 64, 8, 16, 16, 1, 1, S<2, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 16, 1, 2>, 8> + // clang-format on + >; + +void add_device_gemm_wmma_bf16_bf16_bf16_km_nk_mn_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, 
device_gemm_wmma_bf16_bf16_bf16_km_nk_mn_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_wmma_bf16_bf16_bf16_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_wmma_bf16_bf16_bf16_mk_kn_mn_instance.cpp new file mode 100644 index 000000000..3db41222a --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_wmma_bf16_bf16_bf16_mk_kn_mn_instance.cpp @@ -0,0 +1,77 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using BF16 = ck::bhalf_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding; + +// Compilation parameters for a[m, k] * b[k, n] = c[m, n] +using device_gemm_wmma_bf16_bf16_bf16_mk_kn_mn_instances = std::tuple< + // clang-format off + //######################| ALayout| BLayout| CLayout| AData| BData| CData| AccData| CShuffle| A| B| C| GEMM| NumPrefetch| Block| MPer| NPer| KPer| K1| MPer| NPer| M| N| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| 
CShuffle| CShuffleBlockTransfer| CShuffleBlockTransfer| + //######################| | | | Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise|Specialization| | Size| Block| Block| Block| | WMMA| WMMA| Repeat| Repeat| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MRepeat| MRepeat| ClusterLengths| ScalarPerVector| + //######################| | | | | | | | | Operation| Operation| Operation| | | | | | | | | | | | Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerStore| PerStore| MBlock_MPerBlock| | + //######################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | NBlock_NPerBlock| | + /* Prefetch 2, consume enormous vgpr resource*/ + // 8 Waves + DeviceGemmWmma_CShuffle< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 2, 256, 128, 128, 32, 8, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 32, 1, 8>, 8>, + // 4 Waves + DeviceGemmWmma_CShuffle< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 2, 128, 128, 64, 64, 8, 16, 16, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 32, 1, 4>, 8>, + // 2 Waves + DeviceGemmWmma_CShuffle< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 2, 64, 64, 32, 32, 8, 16, 16, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 16, 1, 4>, 8>, + // 1 Wave + DeviceGemmWmma_CShuffle< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 2, 32, 16, 16, 32, 8, 16, 16, 
1, 1, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<2, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 16, 1, 2>, 8>, + /* Prefetch 1, prefer larger KPerBlock value for better latency hiding*/ + // 8 Waves + DeviceGemmWmma_CShuffle< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 256, 128, 256, 64, 8, 16, 16, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemmWmma_CShuffle< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 256, 128, 128, 64, 8, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemmWmma_CShuffle< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 256, 128, 160, 64, 8, 16, 16, 2, 5, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 64, 1, 4>, 8>, + // 4 Waves + DeviceGemmWmma_CShuffle< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 128, 128, 32, 8, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGemmWmma_CShuffle< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 256, 64, 64, 8, 16, 16, 8, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGemmWmma_CShuffle< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 64, 256, 64, 8, 16, 16, 2, 8, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 32, 1, 
4>, 8>, + DeviceGemmWmma_CShuffle< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 64, 80, 64, 8, 16, 16, 1, 5, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 64, 1, 2>, 8>, + // 2 Waves + DeviceGemmWmma_CShuffle< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 64, 16, 64, 64, 8, 16, 16, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGemmWmma_CShuffle< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 64, 64, 32, 64, 8, 16, 16, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGemmWmma_CShuffle< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 64, 32, 64, 64, 8, 16, 16, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 16, 1, 4>, 8>, + // 1 Wave + DeviceGemmWmma_CShuffle< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 32, 16, 32, 64, 8, 16, 16, 1, 2, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<2, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 16, 1, 2>, 8>, + DeviceGemmWmma_CShuffle< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 32, 16, 16, 64, 8, 16, 16, 1, 1, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<2, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 16, 1, 2>, 8> + // clang-format on + >; + +void add_device_gemm_wmma_bf16_bf16_bf16_mk_kn_mn_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, 
device_gemm_wmma_bf16_bf16_bf16_mk_kn_mn_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_wmma_bf16_bf16_bf16_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_wmma_bf16_bf16_bf16_mk_nk_mn_instance.cpp new file mode 100644 index 000000000..ee25b8f6d --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_wmma_bf16_bf16_bf16_mk_nk_mn_instance.cpp @@ -0,0 +1,77 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using BF16 = ck::bhalf_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding; + +// Compilation parameters for a[m, k] * b[n, k] = c[m, n] +using device_gemm_wmma_bf16_bf16_bf16_mk_nk_mn_instances = std::tuple< + // clang-format off + //######################| ALayout| BLayout| CLayout| AData| BData| CData| AccData| CShuffle| A| B| C| GEMM| NumPrefetch| Block| MPer| NPer| KPer| K1| MPer| NPer| M| N| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| 
CShuffle| CShuffleBlockTransfer| CShuffleBlockTransfer| + //######################| | | | Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| Specialization| | Size| Block| Block| Block| | WMMA| WMMA| Repeat| Repeat| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MRepeat| MRepeat| ClusterLengths| ScalarPerVector| + //######################| | | | | | | | | Operation| Operation| Operation| | | | | | | | | | | | Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerStore| PerStore| MBlock_MPerBlock| | + //######################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | NBlock_NPerBlock| | + /* Prefetch 2, consume enormous vgpr resource*/ + // 8 Waves + DeviceGemmWmma_CShuffle< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 2, 256, 128, 128, 32, 8, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 8>, 8>, + // 4 Waves + DeviceGemmWmma_CShuffle< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 2, 128, 128, 64, 64, 8, 16, 16, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 4>, 8>, + // 2 Waves + DeviceGemmWmma_CShuffle< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 2, 64, 64, 32, 32, 8, 16, 16, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 16, 1, 4>, 8>, + // 1 Wave + DeviceGemmWmma_CShuffle< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 2, 32, 16, 16, 32, 8, 16, 16, 
1, 1, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 16, 1, 2>, 8>, + /* Prefetch 1, prefer larger KPerBlock value for better latency hiding*/ + // 8 Waves + DeviceGemmWmma_CShuffle< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 256, 128, 256, 64, 8, 16, 16, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemmWmma_CShuffle< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 256, 128, 128, 64, 8, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemmWmma_CShuffle< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 256, 128, 160, 64, 8, 16, 16, 2, 5, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 64, 1, 4>, 8>, + // 4 Waves + DeviceGemmWmma_CShuffle< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 128, 128, 32, 8, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGemmWmma_CShuffle< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 256, 64, 64, 8, 16, 16, 8, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGemmWmma_CShuffle< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 64, 256, 64, 8, 16, 16, 2, 8, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 
4>, 8>, + DeviceGemmWmma_CShuffle< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 64, 80, 64, 8, 16, 16, 1, 5, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 64, 1, 2>, 8>, + // 2 Waves + DeviceGemmWmma_CShuffle< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 64, 16, 64, 64, 8, 16, 16, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGemmWmma_CShuffle< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 64, 64, 32, 64, 8, 16, 16, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGemmWmma_CShuffle< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 64, 32, 64, 64, 8, 16, 16, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 16, 1, 4>, 8>, + // 1 Wave + DeviceGemmWmma_CShuffle< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 32, 16, 32, 64, 8, 16, 16, 1, 2, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 16, 1, 2>, 8>, + DeviceGemmWmma_CShuffle< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 32, 16, 16, 64, 8, 16, 16, 1, 1, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 16, 1, 2>, 8> + // clang-format on + >; + +void add_device_gemm_wmma_bf16_bf16_bf16_mk_nk_mn_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, 
device_gemm_wmma_bf16_bf16_bf16_mk_nk_mn_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_wmma_int8_int8_int8_km_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_wmma_int8_int8_int8_km_kn_mn_instance.cpp new file mode 100644 index 000000000..dc763afa0 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_wmma_int8_int8_int8_km_kn_mn_instance.cpp @@ -0,0 +1,76 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using I8 = int8_t; +using I32 = int32_t; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding; + +// Compilation parameters for a[k, m] * b[k, n] = c[m, n] +using device_gemm_wmma_int8_int8_int8_km_kn_mn_instances = std::tuple< + // clang-format off + //######################| ALayout| BLayout| CLayout| AData| BData| CData| AccData| CShuffle| A| B| C| GEMM| NumPrefetch| Block| MPer| NPer| KPer| K1| MPer| NPer| M| N| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| 
CShuffleBlockTransfer| CShuffleBlockTransfer| + //######################| | | | Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise|Specialization| | Size| Block| Block| Block| | WMMA| WMMA| Repeat| Repeat| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MRepeat| MRepeat| ClusterLengths| ScalarPerVector| + //######################| | | | | | | | | Operation| Operation| Operation| | | | | | | | | | | | Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerStore| PerStore| MBlock_MPerBlock| | + //######################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | NBlock_NPerBlock| | + /* Prefetch 2, consume enormous vgpr resource*/ + // 8 Waves + DeviceGemmWmma_CShuffle< Col, Row, Row, I8, I8, I8, I32, I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 2, 256, 128, 128, 32, 8, 16, 16, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 32, 1, 8>, 8>, + // 4 Waves + DeviceGemmWmma_CShuffle< Col, Row, Row, I8, I8, I8, I32, I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 2, 128, 128, 64, 64, 8, 16, 16, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 32, 1, 4>, 8>, + // 2 Waves + DeviceGemmWmma_CShuffle< Col, Row, Row, I8, I8, I8, I32, I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 2, 64, 64, 32, 32, 8, 16, 16, 4, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 16, 1, 4>, 8>, + // 1 Wave + DeviceGemmWmma_CShuffle< Col, Row, Row, I8, I8, I8, I32, I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 2, 32, 16, 16, 32, 8, 16, 16, 1, 1, S<2, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 
1, 1, 8, true, S<2, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 16, 1, 2>, 8>, + /* Prefetch 1, prefer larger KPerBlock value for better latency hiding*/ + // 8 Waves + DeviceGemmWmma_CShuffle< Col, Row, Row, I8, I8, I8, I32, I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 256, 128, 256, 64, 8, 16, 16, 4, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemmWmma_CShuffle< Col, Row, Row, I8, I8, I8, I32, I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 256, 128, 128, 64, 8, 16, 16, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemmWmma_CShuffle< Col, Row, Row, I8, I8, I8, I32, I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 256, 128, 160, 64, 8, 16, 16, 2, 5, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 64, 1, 4>, 8>, + // 4 Waves + DeviceGemmWmma_CShuffle< Col, Row, Row, I8, I8, I8, I32, I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 128, 128, 32, 8, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGemmWmma_CShuffle< Col, Row, Row, I8, I8, I8, I32, I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 256, 64, 64, 8, 16, 16, 8, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGemmWmma_CShuffle< Col, Row, Row, I8, I8, I8, I32, I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 64, 256, 64, 8, 16, 16, 2, 8, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGemmWmma_CShuffle< Col, Row, Row, I8, I8, I8, I32, I8, PassThrough, 
PassThrough, PassThrough, GemmMNKPadding, 1, 128, 64, 80, 64, 8, 16, 16, 1, 5, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 64, 1, 2>, 8>, + // 2 Waves + DeviceGemmWmma_CShuffle< Col, Row, Row, I8, I8, I8, I32, I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 64, 16, 64, 64, 8, 16, 16, 1, 2, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGemmWmma_CShuffle< Col, Row, Row, I8, I8, I8, I32, I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 64, 64, 32, 64, 8, 16, 16, 4, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGemmWmma_CShuffle< Col, Row, Row, I8, I8, I8, I32, I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 64, 32, 64, 64, 8, 16, 16, 2, 2, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 16, 1, 4>, 8>, + // 1 Wave + DeviceGemmWmma_CShuffle< Col, Row, Row, I8, I8, I8, I32, I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 32, 16, 32, 64, 8, 16, 16, 1, 2, S<2, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<2, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 16, 1, 2>, 8>, + DeviceGemmWmma_CShuffle< Col, Row, Row, I8, I8, I8, I32, I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 32, 16, 16, 64, 8, 16, 16, 1, 1, S<2, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<2, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 16, 1, 2>, 8> + // clang-format on + >; + +void add_device_gemm_wmma_int8_int8_int8_km_kn_mn_instances( + std::vector>>& instances) +{ + add_device_operation_instances(instances, device_gemm_wmma_int8_int8_int8_km_kn_mn_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git 
a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_wmma_int8_int8_int8_km_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_wmma_int8_int8_int8_km_nk_mn_instance.cpp new file mode 100644 index 000000000..ec4541ed7 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_wmma_int8_int8_int8_km_nk_mn_instance.cpp @@ -0,0 +1,76 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using I8 = int8_t; +using I32 = int32_t; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding; + +// Compilation parameters for a[k, m] * b[n, k] = c[m, n] +using device_gemm_wmma_int8_int8_int8_km_nk_mn_instances = std::tuple< + // clang-format off + //######################| ALayout| BLayout| CLayout| AData| BData| CData| AccData| CShuffle| A| B| C| GEMM| NumPrefetch| Block| MPer| NPer| KPer| K1| MPer| NPer| M| N| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CShuffleBlockTransfer| CShuffleBlockTransfer| + //######################| | | | Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise|Specialization| | 
Size| Block| Block| Block| | WMMA| WMMA| Repeat| Repeat| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MRepeat| MRepeat| ClusterLengths| ScalarPerVector| + //######################| | | | | | | | | Operation| Operation| Operation| | | | | | | | | | | | Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerStore| PerStore| MBlock_MPerBlock| | + //######################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | NBlock_NPerBlock| | + /* Prefetch 2, consume enormous vgpr resource*/ + // 8 Waves + DeviceGemmWmma_CShuffle< Col, Col, Row, I8, I8, I8, I32, I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 2, 256, 128, 128, 32, 8, 16, 16, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 8>, 8>, + // 4 Waves + DeviceGemmWmma_CShuffle< Col, Col, Row, I8, I8, I8, I32, I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 2, 128, 128, 64, 64, 8, 16, 16, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 4>, 8>, + // 2 Waves + DeviceGemmWmma_CShuffle< Col, Col, Row, I8, I8, I8, I32, I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 2, 64, 64, 32, 32, 8, 16, 16, 4, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 16, 1, 4>, 8>, + // 1 Wave + DeviceGemmWmma_CShuffle< Col, Col, Row, I8, I8, I8, I32, I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 2, 32, 16, 16, 32, 8, 16, 16, 1, 1, S<2, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 16, 1, 2>, 8>, + /* Prefetch 1, prefer larger KPerBlock value for better latency hiding*/ + 
// 8 Waves + DeviceGemmWmma_CShuffle< Col, Col, Row, I8, I8, I8, I32, I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 256, 128, 256, 64, 8, 16, 16, 4, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemmWmma_CShuffle< Col, Col, Row, I8, I8, I8, I32, I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 256, 128, 128, 64, 8, 16, 16, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemmWmma_CShuffle< Col, Col, Row, I8, I8, I8, I32, I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 256, 128, 160, 64, 8, 16, 16, 2, 5, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 64, 1, 4>, 8>, + // 4 Waves + DeviceGemmWmma_CShuffle< Col, Col, Row, I8, I8, I8, I32, I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 128, 128, 32, 8, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGemmWmma_CShuffle< Col, Col, Row, I8, I8, I8, I32, I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 256, 64, 64, 8, 16, 16, 8, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGemmWmma_CShuffle< Col, Col, Row, I8, I8, I8, I32, I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 64, 256, 64, 8, 16, 16, 2, 8, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGemmWmma_CShuffle< Col, Col, Row, I8, I8, I8, I32, I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 64, 80, 64, 8, 16, 16, 1, 5, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 
true, 1, 1, S<1, 64, 1, 2>, 8>, + // 2 Waves + DeviceGemmWmma_CShuffle< Col, Col, Row, I8, I8, I8, I32, I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 64, 16, 64, 64, 8, 16, 16, 1, 2, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGemmWmma_CShuffle< Col, Col, Row, I8, I8, I8, I32, I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 64, 64, 32, 64, 8, 16, 16, 4, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGemmWmma_CShuffle< Col, Col, Row, I8, I8, I8, I32, I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 64, 32, 64, 64, 8, 16, 16, 2, 2, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 16, 1, 4>, 8>, + // 1 Wave + DeviceGemmWmma_CShuffle< Col, Col, Row, I8, I8, I8, I32, I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 32, 16, 32, 64, 8, 16, 16, 1, 2, S<2, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 16, 1, 2>, 8>, + DeviceGemmWmma_CShuffle< Col, Col, Row, I8, I8, I8, I32, I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 32, 16, 16, 64, 8, 16, 16, 1, 1, S<2, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 16, 1, 2>, 8> + // clang-format on + >; + +void add_device_gemm_wmma_int8_int8_int8_km_nk_mn_instances( + std::vector>>& instances) +{ + add_device_operation_instances(instances, device_gemm_wmma_int8_int8_int8_km_nk_mn_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_wmma_int8_int8_int8_mk_kn_mn_instance.cpp 
b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_wmma_int8_int8_int8_mk_kn_mn_instance.cpp new file mode 100644 index 000000000..a2166bdbc --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_wmma_int8_int8_int8_mk_kn_mn_instance.cpp @@ -0,0 +1,76 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using I8 = int8_t; +using I32 = int32_t; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding; + +// Compilation parameters for a[m, k] * b[k, n] = c[m, n] +using device_gemm_wmma_int8_int8_int8_mk_kn_mn_instances = std::tuple< + // clang-format off + //######################| ALayout| BLayout| CLayout| AData| BData| CData| AccData| CShuffle| A| B| C| GEMM| NumPrefetch| Block| MPer| NPer| KPer| K1| MPer| NPer| M| N| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CShuffleBlockTransfer| CShuffleBlockTransfer| + //######################| | | | Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise|Specialization| | Size| Block| Block| Block| | WMMA| WMMA| Repeat| Repeat| ThreadCluster| ThreadCluster| SrcAccessOrder| 
SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MRepeat| MRepeat| ClusterLengths| ScalarPerVector| + //######################| | | | | | | | | Operation| Operation| Operation| | | | | | | | | | | | Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerStore| PerStore| MBlock_MPerBlock| | + //######################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | NBlock_NPerBlock| | + /* Prefetch 2, consume enormous vgpr resource*/ + // 8 Waves + DeviceGemmWmma_CShuffle< Row, Row, Row, I8, I8, I8, I32, I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 2, 256, 128, 128, 32, 8, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 32, 1, 8>, 8>, + // 4 Waves + DeviceGemmWmma_CShuffle< Row, Row, Row, I8, I8, I8, I32, I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 2, 128, 128, 64, 64, 8, 16, 16, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 32, 1, 4>, 8>, + // 2 Waves + DeviceGemmWmma_CShuffle< Row, Row, Row, I8, I8, I8, I32, I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 2, 64, 64, 32, 32, 8, 16, 16, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 16, 1, 4>, 8>, + // 1 Wave + DeviceGemmWmma_CShuffle< Row, Row, Row, I8, I8, I8, I32, I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 2, 32, 16, 16, 32, 8, 16, 16, 1, 1, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<2, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 16, 1, 2>, 8>, + /* Prefetch 1, prefer larger KPerBlock value for better latency hiding*/ + // 8 Waves + DeviceGemmWmma_CShuffle< Row, Row, Row, I8, I8, I8, I32, I8, PassThrough, PassThrough, 
PassThrough, GemmMNKPadding, 1, 256, 128, 256, 64, 8, 16, 16, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemmWmma_CShuffle< Row, Row, Row, I8, I8, I8, I32, I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 256, 128, 128, 64, 8, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemmWmma_CShuffle< Row, Row, Row, I8, I8, I8, I32, I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 256, 128, 160, 64, 8, 16, 16, 2, 5, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 64, 1, 4>, 8>, + // 4 Waves + DeviceGemmWmma_CShuffle< Row, Row, Row, I8, I8, I8, I32, I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 128, 128, 32, 8, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGemmWmma_CShuffle< Row, Row, Row, I8, I8, I8, I32, I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 256, 64, 64, 8, 16, 16, 8, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGemmWmma_CShuffle< Row, Row, Row, I8, I8, I8, I32, I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 64, 256, 64, 8, 16, 16, 2, 8, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGemmWmma_CShuffle< Row, Row, Row, I8, I8, I8, I32, I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 64, 80, 64, 8, 16, 16, 1, 5, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 64, 1, 2>, 8>, + // 2 Waves + DeviceGemmWmma_CShuffle< Row, Row, Row, I8, I8, I8, 
I32, I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 64, 16, 64, 64, 8, 16, 16, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGemmWmma_CShuffle< Row, Row, Row, I8, I8, I8, I32, I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 64, 64, 32, 64, 8, 16, 16, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGemmWmma_CShuffle< Row, Row, Row, I8, I8, I8, I32, I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 64, 32, 64, 64, 8, 16, 16, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 16, 1, 4>, 8>, + // 1 Wave + DeviceGemmWmma_CShuffle< Row, Row, Row, I8, I8, I8, I32, I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 32, 16, 32, 64, 8, 16, 16, 1, 2, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<2, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 16, 1, 2>, 8>, + DeviceGemmWmma_CShuffle< Row, Row, Row, I8, I8, I8, I32, I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 32, 16, 16, 64, 8, 16, 16, 1, 1, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<2, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 16, 1, 2>, 8> + // clang-format on + >; + +void add_device_gemm_wmma_int8_int8_int8_mk_kn_mn_instances( + std::vector>>& instances) +{ + add_device_operation_instances(instances, device_gemm_wmma_int8_int8_int8_mk_kn_mn_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_wmma_int8_int8_int8_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_wmma_int8_int8_int8_mk_nk_mn_instance.cpp new file mode 100644 index 000000000..187a9c772 --- /dev/null +++ 
b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_wmma_int8_int8_int8_mk_nk_mn_instance.cpp @@ -0,0 +1,76 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using I8 = int8_t; +using I32 = int32_t; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding; + +// Compilation parameters for a[m, k] * b[n, k] = c[m, n] +using device_gemm_wmma_int8_int8_int8_mk_nk_mn_instances = std::tuple< + // clang-format off + //######################| ALayout| BLayout| CLayout| AData| BData| CData| AccData| CShuffle| A| B| C| GEMM| NumPrefetch| Block| MPer| NPer| KPer| K1| MPer| NPer| M| N| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CShuffleBlockTransfer| CShuffleBlockTransfer| + //######################| | | | Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| Specialization| | Size| Block| Block| Block| | WMMA| WMMA| Repeat| Repeat| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MRepeat| MRepeat| 
ClusterLengths| ScalarPerVector| + //######################| | | | | | | | | Operation| Operation| Operation| | | | | | | | | | | | Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerStore| PerStore| MBlock_MPerBlock| | + //######################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | NBlock_NPerBlock| | + /* Prefetch 2, consume enormous vgpr resource*/ + // 8 Waves + DeviceGemmWmma_CShuffle< Row, Col, Row, I8, I8, I8, I32, I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 2, 256, 128, 128, 32, 8, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 8>, 8>, + // 4 Waves + DeviceGemmWmma_CShuffle< Row, Col, Row, I8, I8, I8, I32, I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 2, 128, 128, 64, 64, 8, 16, 16, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 4>, 8>, + // 2 Waves + DeviceGemmWmma_CShuffle< Row, Col, Row, I8, I8, I8, I32, I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 2, 64, 64, 32, 32, 8, 16, 16, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 16, 1, 4>, 8>, + // 1 Wave + DeviceGemmWmma_CShuffle< Row, Col, Row, I8, I8, I8, I32, I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 2, 32, 16, 16, 32, 8, 16, 16, 1, 1, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 16, 1, 2>, 8>, + /* Prefetch 1, prefer larger KPerBlock value for better latency hiding*/ + // 8 Waves + DeviceGemmWmma_CShuffle< Row, Col, Row, I8, I8, I8, I32, I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 256, 128, 256, 64, 8, 16, 16, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 
8, 8, true, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemmWmma_CShuffle< Row, Col, Row, I8, I8, I8, I32, I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 256, 128, 128, 64, 8, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemmWmma_CShuffle< Row, Col, Row, I8, I8, I8, I32, I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 256, 128, 160, 64, 8, 16, 16, 2, 5, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 64, 1, 4>, 8>, + // 4 Waves + DeviceGemmWmma_CShuffle< Row, Col, Row, I8, I8, I8, I32, I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 128, 128, 32, 8, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGemmWmma_CShuffle< Row, Col, Row, I8, I8, I8, I32, I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 256, 64, 64, 8, 16, 16, 8, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGemmWmma_CShuffle< Row, Col, Row, I8, I8, I8, I32, I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 64, 256, 64, 8, 16, 16, 2, 8, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGemmWmma_CShuffle< Row, Col, Row, I8, I8, I8, I32, I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 64, 80, 64, 8, 16, 16, 1, 5, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 64, 1, 2>, 8>, + // 2 Waves + DeviceGemmWmma_CShuffle< Row, Col, Row, I8, I8, I8, I32, I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 64, 16, 64, 64, 8, 16, 16, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 
1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGemmWmma_CShuffle< Row, Col, Row, I8, I8, I8, I32, I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 64, 64, 32, 64, 8, 16, 16, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGemmWmma_CShuffle< Row, Col, Row, I8, I8, I8, I32, I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 64, 32, 64, 64, 8, 16, 16, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 16, 1, 4>, 8>, + // 1 Wave + DeviceGemmWmma_CShuffle< Row, Col, Row, I8, I8, I8, I32, I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 32, 16, 32, 64, 8, 16, 16, 1, 2, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 16, 1, 2>, 8>, + DeviceGemmWmma_CShuffle< Row, Col, Row, I8, I8, I8, I32, I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 32, 16, 16, 64, 8, 16, 16, 1, 1, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 16, 1, 2>, 8> + // clang-format on + >; + +void add_device_gemm_wmma_int8_int8_int8_mk_nk_mn_instances( + std::vector>>& instances) +{ + add_device_operation_instances(instances, device_gemm_wmma_int8_int8_int8_mk_nk_mn_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck -- GitLab From e4dfe4d892bfba901204b4975a478d4cce21e5a5 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 18 Nov 2024 22:00:18 -0800 Subject: [PATCH 063/153] Bump rocm-docs-core from 1.8.4 to 1.8.5 in /docs/sphinx (#1674) Bumps [rocm-docs-core](https://github.com/ROCm/rocm-docs-core) from 1.8.4 to 1.8.5. 
- [Release notes](https://github.com/ROCm/rocm-docs-core/releases) - [Changelog](https://github.com/ROCm/rocm-docs-core/blob/v1.8.5/CHANGELOG.md) - [Commits](https://github.com/ROCm/rocm-docs-core/compare/v1.8.4...v1.8.5) --- updated-dependencies: - dependency-name: rocm-docs-core dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- docs/sphinx/requirements.in | 2 +- docs/sphinx/requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/sphinx/requirements.in b/docs/sphinx/requirements.in index 9824df626..3a2e266ef 100644 --- a/docs/sphinx/requirements.in +++ b/docs/sphinx/requirements.in @@ -1,2 +1,2 @@ -rocm-docs-core==1.8.4 +rocm-docs-core==1.8.5 sphinxcontrib-bibtex==2.6.3 diff --git a/docs/sphinx/requirements.txt b/docs/sphinx/requirements.txt index f89fbcf27..b65d2391f 100644 --- a/docs/sphinx/requirements.txt +++ b/docs/sphinx/requirements.txt @@ -103,7 +103,7 @@ requests==2.32.3 # via # pygithub # sphinx -rocm-docs-core==1.8.4 +rocm-docs-core==1.8.5 # via -r requirements.in six==1.16.0 # via pybtex -- GitLab From da0c21f6610e4fa98cf7719e3f92410ffafc963f Mon Sep 17 00:00:00 2001 From: Illia Silin <98187287+illsilin@users.noreply.github.com> Date: Tue, 19 Nov 2024 10:00:17 -0800 Subject: [PATCH 064/153] add more fp32 dl gemm instances (#1675) * add more fp32 dl gemm instances * update the dates --- ..._gemm_dl_f32_f32_f32_km_kn_mn_instance.cpp | 41 ++++++++++++++++--- ..._gemm_dl_f32_f32_f32_km_nk_mn_instance.cpp | 41 ++++++++++++++++--- ..._gemm_dl_f32_f32_f32_mk_kn_mn_instance.cpp | 41 ++++++++++++++++--- ..._gemm_dl_f32_f32_f32_mk_nk_mn_instance.cpp | 41 ++++++++++++++++--- 4 files changed, 140 insertions(+), 24 deletions(-) diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_km_kn_mn_instance.cpp 
b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_km_kn_mn_instance.cpp index e696bfdcd..038234111 100644 --- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_km_kn_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_km_kn_mn_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #include @@ -14,15 +14,12 @@ namespace tensor_operation { namespace device { namespace instance { -using F16 = ck::half_t; using F32 = float; - using Row = ck::tensor_layout::gemm::RowMajor; using Col = ck::tensor_layout::gemm::ColumnMajor; template -using S = ck::Sequence; - +using S = ck::Sequence; using PassThrough = ck::tensor_operation::element_wise::PassThrough; static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; @@ -34,7 +31,39 @@ using device_gemm_dl_f32_f32_f32_km_kn_mn_instances = std::tuple< // ########| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise| Specialization| Size| Block| Block| Block| | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| SrcDstAccess| SrcDstVectorDim| DstScalarPerVector| // ########| | | | | | | | Operation| Operation| Operation| | | | | | | | | | | | K0_M0_M1_K1| K0_M0_M1_K1| ArrangeOrder| Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1| K0_N0_N1_K1| K0_N0_N1_K1| ArrangeOrder| Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder| Lengths_K0_N0_N1_K1| Order| | | // ########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceGemmDl< F32, F32, 
F32, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 16, 1, 4, 4, 1, S<8, 2>, S<8, 2>, S<2, 1, 4, 1>, S<8, 1, 32, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<2, 1, 4, 1>, S<8, 1, 32, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 1, 2, 3, 4, 5>, 5, 4> + // MPerBlock=128, NPerBlock=128 + DeviceGemmDl< F32, F32, F32, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 16, 2, 4, 4, 1, S<8, 2>, S<8, 2>, S<2, 1, 4, 2>, S<8, 1, 32, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 2>, S<2, 1, 4, 2>, S<8, 1, 32, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>, 5, 4>, + DeviceGemmDl< F32, F32, F32, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 16, 2, 4, 4, 1, S<4, 4>, S<4, 4>, S<2, 1, 4, 2>, S<8, 1, 32, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 2>, S<2, 1, 4, 2>, S<8, 1, 32, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>, 5, 4>, + DeviceGemmDl< F32, F32, F32, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 16, 2, 4, 4, 1, S<2, 8>, S<2, 8>, S<2, 1, 4, 2>, S<8, 1, 32, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 2>, S<2, 1, 4, 2>, S<8, 1, 32, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>, 5, 4>, + // MPerBlock=128, NPerBlock=64 + DeviceGemmDl< F32, F32, F32, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 128, 64, 16, 2, 4, 4, 1, S<8, 2>, S<4, 2>, S<2, 1, 8, 2>, S<8, 1, 16, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 2>, S<2, 1, 8, 2>, S<8, 1, 8, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>, 5, 4>, + 
DeviceGemmDl< F32, F32, F32, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 128, 64, 16, 2, 4, 4, 1, S<2, 8>, S<2, 4>, S<2, 1, 8, 2>, S<8, 1, 16, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 2>, S<2, 1, 8, 2>, S<8, 1, 8, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>, 5, 4>, + // MPerBlock=64, NPerBlock=128 + DeviceGemmDl< F32, F32, F32, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 64, 128, 16, 2, 4, 4, 1, S<4, 2>, S<8, 2>, S<2, 1, 8, 2>, S<8, 1, 8, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 2>, S<2, 1, 8, 2>, S<8, 1, 16, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>, 5, 4>, + DeviceGemmDl< F32, F32, F32, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 64, 128, 16, 2, 4, 4, 1, S<2, 4>, S<2, 8>, S<2, 1, 8, 2>, S<8, 1, 8, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 2>, S<2, 1, 8, 2>, S<8, 1, 16, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>, 5, 4>, + // MPerBlock=64, NPerBlock=64 + DeviceGemmDl< F32, F32, F32, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 64, 64, 8, 2, 4, 4, 1, S<4, 2>, S<4, 2>, S<2, 1, 4, 2>, S<4, 1, 16, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 2>, S<2, 1, 4, 2>, S<4, 1, 16, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>, 5, 4>, + DeviceGemmDl< F32, F32, F32, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 64, 64, 8, 2, 4, 4, 1, S<2, 4>, S<2, 4>, S<2, 1, 4, 2>, S<4, 1, 16, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 2>, S<2, 1, 4, 2>, S<4, 1, 16, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 
5>, 5, 4>, + DeviceGemmDl< F32, F32, F32, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 64, 64, 8, 2, 4, 4, 1, S<8, 1>, S<4, 2>, S<2, 1, 4, 2>, S<4, 1, 16, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 2>, S<2, 1, 4, 2>, S<4, 1, 16, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>, 5, 4>, + DeviceGemmDl< F32, F32, F32, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 64, 64, 8, 2, 4, 4, 1, S<4, 2>, S<8, 1>, S<2, 1, 4, 2>, S<4, 1, 16, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 2>, S<2, 1, 4, 2>, S<4, 1, 16, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>, 5, 4>, + // MPerBlock=16, NPerBlock=64 + DeviceGemmDl< F32, F32, F32, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 16, 64, 16, 2, 1, 4, 1, S<4, 2>, S<4, 2>, S<1, 1, 4, 2>, S<16, 1, 4, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 2>, S<4, 1, 4, 2>, S<4, 1, 16, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>, 5, 4>, + // MPerBlock=64, NPerBlock=16 + DeviceGemmDl< F32, F32, F32, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 64, 16, 16, 2, 4, 1, 1, S<4, 2>, S<4, 2>, S<4, 1, 4, 2>, S<4, 1, 16, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 2>, S<1, 1, 4, 2>, S<16, 1, 4, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>, 5, 1>, + // MPerBlock=16, NPerBlock=16 + DeviceGemmDl< F32, F32, F32, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 16, 16, 16, 16, 2, 2, 2, 1, S<2, 2>, S<2, 2>, S<4, 1, 4, 2>, S<4, 1, 4, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 2>, S<4, 1, 4, 2>, S<4, 1, 4, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 
1, 2>, S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>, 5, 2>, + DeviceGemmDl< F32, F32, F32, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 16, 16, 16, 16, 2, 2, 2, 1, S<1, 4>, S<1, 4>, S<4, 1, 4, 2>, S<4, 1, 4, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 2>, S<4, 1, 4, 2>, S<4, 1, 4, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>, 5, 2>, + // MPerBlock=8, NPerBlock=64 + DeviceGemmDl< F32, F32, F32, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 8, 64, 32, 2, 1, 2, 1, S<4, 1>, S<8, 2>, S<1, 1, 4, 2>, S<32, 1, 2, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 2>, S<8, 1, 4, 2>, S<4, 1, 16, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>, 5, 2>, + DeviceGemmDl< F32, F32, F32, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 8, 64, 32, 2, 1, 2, 1, S<2, 2>, S<8, 2>, S<1, 1, 4, 2>, S<32, 1, 2, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 2>, S<8, 1, 4, 2>, S<4, 1, 16, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>, 5, 2>, + // MPerBlock=64, NPerBlock=8 + DeviceGemmDl< F32, F32, F32, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 64, 8, 32, 2, 2, 1, 1, S<8, 2>, S<4, 1>, S<8, 1, 4, 2>, S<4, 1, 16, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 2>, S<1, 1, 4, 2>, S<32, 1, 2, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>, 5, 1>, + DeviceGemmDl< F32, F32, F32, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 64, 8, 32, 2, 2, 1, 1, S<8, 2>, S<2, 2>, S<8, 1, 4, 2>, S<4, 1, 16, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 2>, S<1, 1, 4, 2>, S<32, 1, 2, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 
3, 1, 2>, S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>, 5, 1>, + // MPerBlock=8, NPerBlock=8 + DeviceGemmDl< F32, F32, F32, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 8, 8, 8, 4, 2, 1, 2, 1, S<4, 1>, S<2, 1>, S<1, 1, 4, 2>, S<4, 1, 2, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 2>, S<1, 1, 4, 2>, S<4, 1, 2, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>, 5, 2>, + DeviceGemmDl< F32, F32, F32, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 8, 8, 8, 4, 2, 1, 2, 1, S<1, 4>, S<1, 2>, S<1, 1, 4, 2>, S<4, 1, 2, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 2>, S<1, 1, 4, 2>, S<4, 1, 2, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>, 5, 2>, + DeviceGemmDl< F32, F32, F32, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 8, 8, 8, 4, 2, 2, 1, 1, S<2, 1>, S<4, 1>, S<1, 1, 4, 2>, S<4, 1, 2, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 2>, S<1, 1, 4, 2>, S<4, 1, 2, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>, 5, 1>, + DeviceGemmDl< F32, F32, F32, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 8, 8, 8, 4, 2, 2, 1, 1, S<1, 2>, S<1, 4>, S<1, 1, 4, 2>, S<4, 1, 2, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 2>, S<1, 1, 4, 2>, S<4, 1, 2, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>, 5, 1> // clang-format on >; diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_km_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_km_nk_mn_instance.cpp index d3ad7c60e..f61ae84ba 100644 --- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_km_nk_mn_instance.cpp +++ 
b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_km_nk_mn_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #include @@ -14,15 +14,12 @@ namespace tensor_operation { namespace device { namespace instance { -using F16 = ck::half_t; using F32 = float; - using Row = ck::tensor_layout::gemm::RowMajor; using Col = ck::tensor_layout::gemm::ColumnMajor; template -using S = ck::Sequence; - +using S = ck::Sequence; using PassThrough = ck::tensor_operation::element_wise::PassThrough; static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; @@ -35,7 +32,39 @@ using device_gemm_dl_f32_f32_f32_km_nk_mn_instances = // ########| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise| Specialization| Size| Block| Block| Block| | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| SrcDstAccess| SrcDstVectorDim| DstScalarPerVector| // ########| | | | | | | | Operation| Operation| Operation| | | | | | | | | | | | K0_M0_M1_K1| K0_M0_M1_K1| ArrangeOrder| Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1| K0_N0_N1_K1| K0_N0_N1_K1| ArrangeOrder| Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder| Lengths_K0_N0_N1_K1| Order| | | // ########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceGemmDl< F32, F32, F32, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 16, 1, 4, 4, 1, S<8, 2>, S<8, 2>, S<2, 1, 4, 1>, S<8, 1, 32, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<8, 1, 1, 1>, S<2, 
1, 128, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 1>, S<1, 2, 0, 3>, S<1, 1, 1, 1>, S<0, 1, 2, 3, 4, 5>, 5, 4> + // MPerBlock=128, NPerBlock=128 + DeviceGemmDl< F32, F32, F32, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 16, 2, 4, 4, 1, S<8, 2>, S<8, 2>, S<2, 1, 4, 2>, S<8, 1, 32, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 2>, S<8, 1, 1, 2>, S<2, 1, 128, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 2>, S<1, 2, 0, 3>, S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>, 5, 4>, + DeviceGemmDl< F32, F32, F32, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 16, 2, 4, 4, 1, S<4, 4>, S<4, 4>, S<2, 1, 4, 2>, S<8, 1, 32, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 2>, S<8, 1, 1, 2>, S<2, 1, 128, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 2>, S<1, 2, 0, 3>, S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>, 5, 4>, + DeviceGemmDl< F32, F32, F32, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 16, 2, 4, 4, 1, S<2, 8>, S<2, 8>, S<2, 1, 4, 2>, S<8, 1, 32, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 2>, S<8, 1, 1, 2>, S<2, 1, 128, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 2>, S<1, 2, 0, 3>, S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>, 5, 4>, + // MPerBlock=128, NPerBlock=64 + DeviceGemmDl< F32, F32, F32, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 128, 64, 16, 2, 4, 4, 1, S<8, 2>, S<4, 2>, S<2, 1, 8, 2>, S<8, 1, 16, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 2>, S<8, 1, 2, 2>, S<2, 1, 32, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 2>, S<1, 2, 0, 3>, S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>, 5, 4>, + DeviceGemmDl< F32, F32, F32, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 128, 64, 16, 2, 4, 4, 1, S<2, 8>, S<2, 4>, S<2, 1, 8, 2>, S<8, 1, 16, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 
1, 4, 2>, S<8, 1, 2, 2>, S<2, 1, 32, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 2>, S<1, 2, 0, 3>, S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>, 5, 4>, + // MPerBlock=64, NPerBlock=128 + DeviceGemmDl< F32, F32, F32, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 64, 128, 16, 2, 4, 4, 1, S<4, 2>, S<8, 2>, S<2, 1, 8, 2>, S<8, 1, 8, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 2>, S<8, 1, 2, 2>, S<2, 1, 64, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 2>, S<1, 2, 0, 3>, S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>, 5, 4>, + DeviceGemmDl< F32, F32, F32, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 64, 128, 16, 2, 4, 4, 1, S<2, 4>, S<2, 8>, S<2, 1, 8, 2>, S<8, 1, 8, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 2>, S<8, 1, 2, 2>, S<2, 1, 64, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 2>, S<1, 2, 0, 3>, S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>, 5, 4>, + // MPerBlock=64, NPerBlock=64 + DeviceGemmDl< F32, F32, F32, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 64, 64, 8, 2, 4, 4, 1, S<4, 2>, S<4, 2>, S<2, 1, 4, 2>, S<4, 1, 16, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 2>, S<4, 1, 2, 2>, S<2, 1, 32, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 2>, S<1, 2, 0, 3>, S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>, 5, 4>, + DeviceGemmDl< F32, F32, F32, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 64, 64, 8, 2, 4, 4, 1, S<2, 4>, S<2, 4>, S<2, 1, 4, 2>, S<4, 1, 16, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 2>, S<4, 1, 2, 2>, S<2, 1, 32, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 2>, S<1, 2, 0, 3>, S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>, 5, 4>, + DeviceGemmDl< F32, F32, F32, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 64, 64, 8, 2, 4, 4, 1, S<8, 1>, S<4, 2>, S<2, 1, 4, 2>, S<4, 1, 16, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 
2>, S<1, 1, 4, 2>, S<4, 1, 2, 2>, S<2, 1, 32, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 2>, S<1, 2, 0, 3>, S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>, 5, 4>, + DeviceGemmDl< F32, F32, F32, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 64, 64, 8, 2, 4, 4, 1, S<4, 2>, S<8, 1>, S<2, 1, 4, 2>, S<4, 1, 16, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 2>, S<4, 1, 2, 2>, S<2, 1, 32, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 2>, S<1, 2, 0, 3>, S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>, 5, 4>, + // MPerBlock=16, NPerBlock=64 + DeviceGemmDl< F32, F32, F32, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 16, 64, 16, 2, 1, 4, 1, S<4, 2>, S<4, 2>, S<1, 1, 4, 2>, S<16, 1, 4, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 2>, S<4, 1, 4, 2>, S<4, 1, 16, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 2>, S<1, 2, 0, 3>, S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>, 5, 4>, + // MPerBlock=64, NPerBlock=16 + DeviceGemmDl< F32, F32, F32, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 64, 16, 16, 2, 4, 1, 1, S<4, 2>, S<4, 2>, S<4, 1, 4, 2>, S<4, 1, 16, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 2>, S<4, 1, 1, 2>, S<4, 1, 16, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 2>, S<1, 2, 0, 3>, S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>, 5, 1>, + // MPerBlock=16, NPerBlock=16 + DeviceGemmDl< F32, F32, F32, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 16, 16, 16, 16, 2, 2, 2, 1, S<2, 2>, S<2, 2>, S<4, 1, 4, 2>, S<4, 1, 4, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 2>, S<4, 1, 4, 2>, S<4, 1, 4, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 2>, S<1, 2, 0, 3>, S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>, 5, 2>, + DeviceGemmDl< F32, F32, F32, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 16, 16, 16, 16, 2, 2, 2, 1, S<1, 4>, S<1, 4>, S<4, 1, 4, 2>, S<4, 1, 4, 1>, S<0, 3, 1, 2>, 
S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 2>, S<4, 1, 4, 2>, S<4, 1, 4, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 2>, S<1, 2, 0, 3>, S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>, 5, 2>, + // MPerBlock=8, NPerBlock=64 + DeviceGemmDl< F32, F32, F32, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 8, 64, 32, 2, 1, 2, 1, S<4, 1>, S<8, 2>, S<1, 1, 4, 2>, S<32, 1, 2, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 2>, S<8, 1, 4, 2>, S<4, 1, 16, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 2>, S<1, 2, 0, 3>, S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>, 5, 2>, + DeviceGemmDl< F32, F32, F32, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 8, 64, 32, 2, 1, 2, 1, S<2, 2>, S<8, 2>, S<1, 1, 4, 2>, S<32, 1, 2, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 2>, S<8, 1, 4, 2>, S<4, 1, 16, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 2>, S<1, 2, 0, 3>, S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>, 5, 2>, + // MPerBlock=64, NPerBlock=8 + DeviceGemmDl< F32, F32, F32, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 64, 8, 32, 2, 2, 1, 1, S<8, 2>, S<4, 1>, S<8, 1, 4, 2>, S<4, 1, 16, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 2>, S<4, 1, 1, 2>, S<8, 1, 8, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 2>, S<1, 2, 0, 3>, S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>, 5, 1>, + DeviceGemmDl< F32, F32, F32, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 64, 8, 32, 2, 2, 1, 1, S<8, 2>, S<2, 2>, S<8, 1, 4, 2>, S<4, 1, 16, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 2>, S<4, 1, 1, 2>, S<8, 1, 8, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 2>, S<1, 2, 0, 3>, S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>, 5, 1>, + // MPerBlock=8, NPerBlock=8 + DeviceGemmDl< F32, F32, F32, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 8, 8, 8, 4, 2, 1, 2, 1, S<4, 1>, S<2, 1>, S<1, 1, 4, 2>, S<4, 
1, 2, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 2>, S<4, 1, 1, 2>, S<1, 1, 8, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 2>, S<1, 2, 0, 3>, S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>, 5, 2>, + DeviceGemmDl< F32, F32, F32, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 8, 8, 8, 4, 2, 1, 2, 1, S<1, 4>, S<1, 2>, S<1, 1, 4, 2>, S<4, 1, 2, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 2>, S<4, 1, 1, 2>, S<1, 1, 8, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 2>, S<1, 2, 0, 3>, S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>, 5, 2>, + DeviceGemmDl< F32, F32, F32, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 8, 8, 8, 4, 2, 2, 1, 1, S<2, 1>, S<4, 1>, S<1, 1, 4, 2>, S<4, 1, 2, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 2>, S<4, 1, 1, 2>, S<1, 1, 8, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 2>, S<1, 2, 0, 3>, S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>, 5, 1>, + DeviceGemmDl< F32, F32, F32, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 8, 8, 8, 4, 2, 2, 1, 1, S<1, 2>, S<1, 4>, S<1, 1, 4, 2>, S<4, 1, 2, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 2>, S<4, 1, 1, 2>, S<1, 1, 8, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 2>, S<1, 2, 0, 3>, S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>, 5, 1> // clang-format on >; diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_mk_kn_mn_instance.cpp index a56a36b0a..2aeaed1fe 100644 --- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_mk_kn_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_mk_kn_mn_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. 
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #include @@ -14,15 +14,12 @@ namespace tensor_operation { namespace device { namespace instance { -using F16 = ck::half_t; using F32 = float; - using Row = ck::tensor_layout::gemm::RowMajor; using Col = ck::tensor_layout::gemm::ColumnMajor; template -using S = ck::Sequence; - +using S = ck::Sequence; using PassThrough = ck::tensor_operation::element_wise::PassThrough; static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; @@ -35,7 +32,39 @@ using device_gemm_dl_f32_f32_f32_mk_kn_mn_instances = // ########| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise| Specialization| Size| Block| Block| Block| | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| SrcDstAccess| SrcDstVectorDim| DstScalarPerVector| // ########| | | | | | | | Operation| Operation| Operation| | | | | | | | | | | | K0_M0_M1_K1| K0_M0_M1_K1| ArrangeOrder| Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1| K0_N0_N1_K1| K0_N0_N1_K1| ArrangeOrder| Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder| Lengths_K0_N0_N1_K1| Order| | | // ########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceGemmDl< F32, F32, F32, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 16, 1, 4, 4, 1, S<8, 2>, S<8, 2>, S<8, 1, 1, 1>, S<2, 1, 128, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 1>, S<1, 2, 0, 3>, S<1, 1, 1, 1>, S<2, 1, 4, 1>, S<8, 1, 32, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 1, 2, 3, 4, 5>, 5, 4> + // MPerBlock=128, NPerBlock=128 + DeviceGemmDl< F32, F32, F32, F32, Row, Row, Row, PassThrough, PassThrough, 
PassThrough, GemmDefault, 256, 128, 128, 16, 2, 4, 4, 1, S<8, 2>, S<8, 2>, S<8, 1, 1, 2>, S<2, 1, 128, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 2>, S<1, 2, 0, 3>, S<1, 1, 1, 2>, S<2, 1, 4, 2>, S<8, 1, 32, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>, 5, 4>, + DeviceGemmDl< F32, F32, F32, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 16, 2, 4, 4, 1, S<4, 4>, S<4, 4>, S<8, 1, 1, 2>, S<2, 1, 128, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 2>, S<1, 2, 0, 3>, S<1, 1, 1, 2>, S<2, 1, 4, 2>, S<8, 1, 32, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>, 5, 4>, + DeviceGemmDl< F32, F32, F32, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 16, 2, 4, 4, 1, S<2, 8>, S<2, 8>, S<8, 1, 1, 2>, S<2, 1, 128, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 2>, S<1, 2, 0, 3>, S<1, 1, 1, 2>, S<2, 1, 4, 2>, S<8, 1, 32, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>, 5, 4>, + // MPerBlock=128, NPerBlock=64 + DeviceGemmDl< F32, F32, F32, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 128, 64, 16, 2, 4, 4, 1, S<8, 2>, S<4, 2>, S<8, 1, 2, 2>, S<2, 1, 64, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 2>, S<1, 2, 0, 3>, S<1, 1, 1, 2>, S<2, 1, 8, 2>, S<8, 1, 8, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>, 5, 4>, + DeviceGemmDl< F32, F32, F32, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 128, 64, 16, 2, 4, 4, 1, S<2, 8>, S<2, 4>, S<8, 1, 2, 2>, S<2, 1, 64, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 2>, S<1, 2, 0, 3>, S<1, 1, 1, 2>, S<2, 1, 8, 2>, S<8, 1, 8, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>, 5, 4>, + // MPerBlock=64, NPerBlock=128 + DeviceGemmDl< F32, F32, F32, F32, Row, Row, Row, 
PassThrough, PassThrough, PassThrough, GemmDefault, 128, 64, 128, 16, 2, 4, 4, 1, S<4, 2>, S<8, 2>, S<8, 1, 2, 2>, S<2, 1, 32, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 2>, S<1, 2, 0, 3>, S<1, 1, 1, 2>, S<2, 1, 8, 2>, S<8, 1, 16, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>, 5, 4>, + DeviceGemmDl< F32, F32, F32, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 64, 128, 16, 2, 4, 4, 1, S<2, 4>, S<2, 8>, S<8, 1, 2, 2>, S<2, 1, 32, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 2>, S<1, 2, 0, 3>, S<1, 1, 1, 2>, S<2, 1, 8, 2>, S<8, 1, 16, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>, 5, 4>, + // MPerBlock=64, NPerBlock=64 + DeviceGemmDl< F32, F32, F32, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 64, 64, 8, 2, 4, 4, 1, S<4, 2>, S<4, 2>, S<4, 1, 2, 2>, S<2, 1, 32, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 2>, S<1, 2, 0, 3>, S<1, 1, 1, 2>, S<2, 1, 4, 2>, S<4, 1, 16, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>, 5, 4>, + DeviceGemmDl< F32, F32, F32, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 64, 64, 8, 2, 4, 4, 1, S<2, 4>, S<2, 4>, S<4, 1, 2, 2>, S<2, 1, 32, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 2>, S<1, 2, 0, 3>, S<1, 1, 1, 2>, S<2, 1, 4, 2>, S<4, 1, 16, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>, 5, 4>, + DeviceGemmDl< F32, F32, F32, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 64, 64, 8, 2, 4, 4, 1, S<8, 1>, S<4, 2>, S<4, 1, 2, 2>, S<2, 1, 32, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 2>, S<1, 2, 0, 3>, S<1, 1, 1, 2>, S<2, 1, 4, 2>, S<4, 1, 16, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>, 5, 4>, + DeviceGemmDl< F32, F32, F32, F32, Row, Row, Row, PassThrough, 
PassThrough, PassThrough, GemmDefault, 64, 64, 64, 8, 2, 4, 4, 1, S<4, 2>, S<8, 1>, S<4, 1, 2, 2>, S<2, 1, 32, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 2>, S<1, 2, 0, 3>, S<1, 1, 1, 2>, S<2, 1, 4, 2>, S<4, 1, 16, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>, 5, 4>, + // MPerBlock=16, NPerBlock=64 + DeviceGemmDl< F32, F32, F32, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 16, 64, 16, 2, 1, 4, 1, S<4, 2>, S<4, 2>, S<4, 1, 1, 2>, S<4, 1, 16, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<4, 1, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 1, 2>, S<4, 1, 4, 2>, S<4, 1, 16, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>, 5, 4>, + // MPerBlock=64, NPerBlock=16 + DeviceGemmDl< F32, F32, F32, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 64, 16, 16, 2, 4, 1, 1, S<4, 2>, S<4, 2>, S<4, 1, 4, 2>, S<4, 1, 16, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<4, 1, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 1, 2>, S<1, 1, 4, 2>, S<16, 1, 4, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>, 5, 1>, + // MPerBlock=16, NPerBlock=16 + DeviceGemmDl< F32, F32, F32, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 16, 16, 16, 16, 2, 2, 2, 1, S<2, 2>, S<2, 2>, S<4, 1, 4, 2>, S<4, 1, 4, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<4, 1, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 1, 2>, S<4, 1, 4, 2>, S<4, 1, 4, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>, 5, 2>, + DeviceGemmDl< F32, F32, F32, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 16, 16, 16, 16, 2, 2, 2, 1, S<1, 4>, S<1, 4>, S<4, 1, 4, 2>, S<4, 1, 4, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<4, 1, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 1, 2>, S<4, 1, 4, 2>, S<4, 1, 4, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>, 5, 2>, + // MPerBlock=8, 
NPerBlock=64 + DeviceGemmDl< F32, F32, F32, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 8, 64, 32, 2, 1, 2, 1, S<4, 1>, S<8, 2>, S<4, 1, 1, 2>, S<8, 1, 8, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<4, 1, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 1, 2>, S<8, 1, 4, 2>, S<4, 1, 16, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>, 5, 2>, + DeviceGemmDl< F32, F32, F32, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 8, 64, 32, 2, 1, 2, 1, S<2, 2>, S<8, 2>, S<4, 1, 1, 2>, S<8, 1, 8, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<4, 1, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 1, 2>, S<8, 1, 4, 2>, S<4, 1, 16, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>, 5, 2>, + // MPerBlock=64, NPerBlock=8 + DeviceGemmDl< F32, F32, F32, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 64, 8, 32, 2, 2, 1, 1, S<8, 2>, S<4, 1>, S<8, 1, 4, 2>, S<4, 1, 16, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<4, 1, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 1, 2>, S<1, 1, 4, 2>, S<32, 1, 2, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>, 5, 1>, + DeviceGemmDl< F32, F32, F32, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 64, 8, 32, 2, 2, 1, 1, S<8, 2>, S<2, 2>, S<8, 1, 4, 2>, S<4, 1, 16, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<4, 1, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 1, 2>, S<1, 1, 4, 2>, S<32, 1, 2, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>, 5, 1>, + // MPerBlock=8, NPerBlock=8 + DeviceGemmDl< F32, F32, F32, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 8, 8, 8, 4, 2, 1, 2, 1, S<4, 1>, S<2, 1>, S<4, 1, 1, 2>, S<1, 1, 8, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<4, 1, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 1, 2>, S<1, 1, 4, 2>, S<4, 1, 2, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>, 
5, 2>, + DeviceGemmDl< F32, F32, F32, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 8, 8, 8, 4, 2, 1, 2, 1, S<1, 4>, S<1, 2>, S<4, 1, 1, 2>, S<1, 1, 8, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<4, 1, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 1, 2>, S<1, 1, 4, 2>, S<4, 1, 2, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>, 5, 2>, + DeviceGemmDl< F32, F32, F32, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 8, 8, 8, 4, 2, 2, 1, 1, S<2, 1>, S<4, 1>, S<4, 1, 1, 2>, S<1, 1, 8, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<4, 1, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 1, 2>, S<1, 1, 4, 2>, S<4, 1, 2, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>, 5, 1>, + DeviceGemmDl< F32, F32, F32, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 8, 8, 8, 4, 2, 2, 1, 1, S<1, 2>, S<1, 4>, S<4, 1, 1, 2>, S<1, 1, 8, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<4, 1, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 1, 2>, S<1, 1, 4, 2>, S<4, 1, 2, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>, 5, 1> // clang-format on >; diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_mk_nk_mn_instance.cpp index 63d55e81d..ff3394d83 100644 --- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_mk_nk_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_mk_nk_mn_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
#include @@ -14,15 +14,12 @@ namespace tensor_operation { namespace device { namespace instance { -using F16 = ck::half_t; using F32 = float; - using Row = ck::tensor_layout::gemm::RowMajor; using Col = ck::tensor_layout::gemm::ColumnMajor; template -using S = ck::Sequence; - +using S = ck::Sequence; using PassThrough = ck::tensor_operation::element_wise::PassThrough; static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; @@ -35,7 +32,39 @@ using device_gemm_dl_f32_f32_f32_mk_nk_mn_instances = // ########| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise| Specialization| Size| Block| Block| Block| | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| SrcDstAccess| SrcDstVectorDim| DstScalarPerVector| // ########| | | | | | | | Operation| Operation| Operation| | | | | | | | | | | | K0_M0_M1_K1| K0_M0_M1_K1| ArrangeOrder| Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1| K0_N0_N1_K1| K0_N0_N1_K1| ArrangeOrder| Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder| Lengths_K0_N0_N1_K1| Order| | | // ########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceGemmDl< F32, F32, F32, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 16, 1, 4, 4, 1, S<8, 2>, S<8, 2>, S<8, 1, 1, 1>, S<2, 1, 128, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 1>, S<1, 2, 0, 3>, S<1, 1, 1, 1>, S<8, 1, 1, 1>, S<2, 1, 128, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 1>, S<1, 2, 0, 3>, S<1, 1, 1, 1>, S<0, 1, 2, 3, 4, 5>, 5, 4> + // MPerBlock=128, NPerBlock=128 + DeviceGemmDl< F32, F32, F32, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 16, 2, 4, 4, 1, S<8, 2>, S<8, 2>, 
S<8, 1, 1, 2>, S<2, 1, 128, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 2>, S<1, 2, 0, 3>, S<1, 1, 1, 2>, S<8, 1, 1, 2>, S<2, 1, 128, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 2>, S<1, 2, 0, 3>, S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>, 5, 4>, + DeviceGemmDl< F32, F32, F32, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 16, 2, 4, 4, 1, S<4, 4>, S<4, 4>, S<8, 1, 1, 2>, S<2, 1, 128, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 2>, S<1, 2, 0, 3>, S<1, 1, 1, 2>, S<8, 1, 1, 2>, S<2, 1, 128, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 2>, S<1, 2, 0, 3>, S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>, 5, 4>, + DeviceGemmDl< F32, F32, F32, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 16, 2, 4, 4, 1, S<2, 8>, S<2, 8>, S<8, 1, 1, 2>, S<2, 1, 128, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 2>, S<1, 2, 0, 3>, S<1, 1, 1, 2>, S<8, 1, 1, 2>, S<2, 1, 128, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 2>, S<1, 2, 0, 3>, S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>, 5, 4>, + // // MPerBlock=128, NPerBlock=64 + DeviceGemmDl< F32, F32, F32, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 128, 64, 16, 2, 4, 4, 1, S<8, 2>, S<4, 2>, S<8, 1, 2, 2>, S<2, 1, 64, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 2>, S<1, 2, 0, 3>, S<1, 1, 1, 2>, S<8, 1, 2, 2>, S<2, 1, 32, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 2>, S<1, 2, 0, 3>, S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>, 5, 4>, + DeviceGemmDl< F32, F32, F32, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 128, 64, 16, 2, 4, 4, 1, S<2, 8>, S<2, 4>, S<8, 1, 2, 2>, S<2, 1, 64, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 2>, S<1, 2, 0, 3>, S<1, 1, 1, 2>, S<8, 1, 2, 2>, S<2, 1, 32, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 2>, S<1, 2, 0, 3>, S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>, 5, 4>, + // // MPerBlock=64, NPerBlock=128 + DeviceGemmDl< F32, F32, F32, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 64, 
128, 16, 2, 4, 4, 1, S<4, 2>, S<8, 2>, S<8, 1, 2, 2>, S<2, 1, 32, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 2>, S<1, 2, 0, 3>, S<1, 1, 1, 2>, S<8, 1, 2, 2>, S<2, 1, 64, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 2>, S<1, 2, 0, 3>, S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>, 5, 4>, + DeviceGemmDl< F32, F32, F32, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 64, 128, 16, 2, 4, 4, 1, S<2, 4>, S<2, 8>, S<8, 1, 2, 2>, S<2, 1, 32, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 2>, S<1, 2, 0, 3>, S<1, 1, 1, 2>, S<8, 1, 2, 2>, S<2, 1, 64, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 2>, S<1, 2, 0, 3>, S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>, 5, 4>, + // MPerBlock=64, NPerBlock=64 + DeviceGemmDl< F32, F32, F32, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 64, 64, 8, 2, 4, 4, 1, S<4, 2>, S<4, 2>, S<4, 1, 2, 2>, S<2, 1, 32, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 2>, S<1, 2, 0, 3>, S<1, 1, 1, 2>, S<4, 1, 2, 2>, S<2, 1, 32, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 2>, S<1, 2, 0, 3>, S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>, 5, 4>, + DeviceGemmDl< F32, F32, F32, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 64, 64, 8, 2, 4, 4, 1, S<2, 4>, S<2, 4>, S<4, 1, 2, 2>, S<2, 1, 32, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 2>, S<1, 2, 0, 3>, S<1, 1, 1, 2>, S<4, 1, 2, 2>, S<2, 1, 32, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 2>, S<1, 2, 0, 3>, S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>, 5, 4>, + DeviceGemmDl< F32, F32, F32, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 64, 64, 8, 2, 4, 4, 1, S<8, 1>, S<4, 2>, S<4, 1, 2, 2>, S<2, 1, 32, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 2>, S<1, 2, 0, 3>, S<1, 1, 1, 2>, S<4, 1, 2, 2>, S<2, 1, 32, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 2>, S<1, 2, 0, 3>, S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>, 5, 4>, + DeviceGemmDl< F32, F32, F32, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 64, 64, 8, 2, 4, 4, 1, 
S<4, 2>, S<8, 1>, S<4, 1, 2, 2>, S<2, 1, 32, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 2>, S<1, 2, 0, 3>, S<1, 1, 1, 2>, S<4, 1, 2, 2>, S<2, 1, 32, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 2>, S<1, 2, 0, 3>, S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>, 5, 4>, + // MPerBlock=16, NPerBlock=64 + DeviceGemmDl< F32, F32, F32, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 16, 64, 16, 2, 1, 4, 1, S<4, 2>, S<4, 2>, S<4, 1, 1, 2>, S<4, 1, 16, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 2>, S<1, 2, 0, 3>, S<1, 1, 1, 2>, S<4, 1, 4, 2>, S<4, 1, 16, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 2>, S<1, 2, 0, 3>, S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>, 5, 4>, + // MPerBlock=64, NPerBlock=16 + DeviceGemmDl< F32, F32, F32, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 64, 16, 16, 2, 4, 1, 1, S<4, 2>, S<4, 2>, S<4, 1, 4, 2>, S<4, 1, 16, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 2>, S<1, 2, 0, 3>, S<1, 1, 1, 2>, S<4, 1, 1, 2>, S<4, 1, 16, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 2>, S<1, 2, 0, 3>, S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>, 5, 1>, + // MPerBlock=16, NPerBlock=16 + DeviceGemmDl< F32, F32, F32, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 16, 16, 16, 16, 2, 2, 2, 1, S<2, 2>, S<2, 2>, S<4, 1, 4, 2>, S<4, 1, 4, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 2>, S<1, 2, 0, 3>, S<1, 1, 1, 2>, S<4, 1, 4, 2>, S<4, 1, 4, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 2>, S<1, 2, 0, 3>, S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>, 5, 2>, + DeviceGemmDl< F32, F32, F32, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 16, 16, 16, 16, 2, 2, 2, 1, S<1, 4>, S<1, 4>, S<4, 1, 4, 2>, S<4, 1, 4, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 2>, S<1, 2, 0, 3>, S<1, 1, 1, 2>, S<4, 1, 4, 2>, S<4, 1, 4, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 2>, S<1, 2, 0, 3>, S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>, 5, 2>, + // MPerBlock=8, NPerBlock=64 + DeviceGemmDl< F32, F32, F32, F32, Row, Col, Row, 
PassThrough, PassThrough, PassThrough, GemmDefault, 64, 8, 64, 32, 2, 1, 2, 1, S<4, 1>, S<8, 2>, S<4, 1, 1, 2>, S<8, 1, 8, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 2>, S<1, 2, 0, 3>, S<1, 1, 1, 2>, S<8, 1, 4, 2>, S<4, 1, 16, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 2>, S<1, 2, 0, 3>, S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>, 5, 2>, + DeviceGemmDl< F32, F32, F32, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 8, 64, 32, 2, 1, 2, 1, S<2, 2>, S<8, 2>, S<4, 1, 1, 2>, S<8, 1, 8, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 2>, S<1, 2, 0, 3>, S<1, 1, 1, 2>, S<8, 1, 4, 2>, S<4, 1, 16, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 2>, S<1, 2, 0, 3>, S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>, 5, 2>, + // MPerBlock=64, NPerBlock=8 + DeviceGemmDl< F32, F32, F32, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 64, 8, 32, 2, 2, 1, 1, S<8, 2>, S<4, 1>, S<8, 1, 4, 2>, S<4, 1, 16, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 2>, S<1, 2, 0, 3>, S<1, 1, 1, 2>, S<4, 1, 1, 2>, S<8, 1, 8, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 2>, S<1, 2, 0, 3>, S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>, 5, 1>, + DeviceGemmDl< F32, F32, F32, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 64, 8, 32, 2, 2, 1, 1, S<8, 2>, S<2, 2>, S<8, 1, 4, 2>, S<4, 1, 16, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 2>, S<1, 2, 0, 3>, S<1, 1, 1, 2>, S<4, 1, 1, 2>, S<8, 1, 8, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 2>, S<1, 2, 0, 3>, S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>, 5, 1>, + // MPerBlock=8, NPerBlock=8 + DeviceGemmDl< F32, F32, F32, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 8, 8, 8, 4, 2, 1, 2, 1, S<4, 1>, S<2, 1>, S<4, 1, 1, 2>, S<1, 1, 8, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 2>, S<1, 2, 0, 3>, S<1, 1, 1, 2>, S<4, 1, 1, 2>, S<1, 1, 8, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 2>, S<1, 2, 0, 3>, S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>, 5, 2>, + DeviceGemmDl< F32, F32, F32, F32, Row, Col, Row, 
PassThrough, PassThrough, PassThrough, GemmDefault, 8, 8, 8, 4, 2, 1, 2, 1, S<1, 4>, S<1, 2>, S<4, 1, 1, 2>, S<1, 1, 8, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 2>, S<1, 2, 0, 3>, S<1, 1, 1, 2>, S<4, 1, 1, 2>, S<1, 1, 8, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 2>, S<1, 2, 0, 3>, S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>, 5, 2>, + DeviceGemmDl< F32, F32, F32, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 8, 8, 8, 4, 2, 2, 1, 1, S<2, 1>, S<4, 1>, S<4, 1, 1, 2>, S<1, 1, 8, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 2>, S<1, 2, 0, 3>, S<1, 1, 1, 2>, S<4, 1, 1, 2>, S<1, 1, 8, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 2>, S<1, 2, 0, 3>, S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>, 5, 1>, + DeviceGemmDl< F32, F32, F32, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 8, 8, 8, 4, 2, 2, 1, 1, S<1, 2>, S<1, 4>, S<4, 1, 1, 2>, S<1, 1, 8, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 2>, S<1, 2, 0, 3>, S<1, 1, 1, 2>, S<4, 1, 1, 2>, S<1, 1, 8, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 2>, S<1, 2, 0, 3>, S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>, 5, 1> // clang-format on >; -- GitLab From 81ec5eff4a3cb64c6681043593862016193797d1 Mon Sep 17 00:00:00 2001 From: Haocong WANG Date: Wed, 20 Nov 2024 23:03:56 +0800 Subject: [PATCH 065/153] fix bug (#1680) --- .../device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn.hpp | 4 ++-- .../device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn.hpp | 4 ++-- .../device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn.hpp | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_f8_f8_bf16/device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_f8_f8_bf16/device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn.hpp index b1b64ca85..9555dffd2 100644 --- 
a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_f8_f8_bf16/device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn.hpp +++ b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_f8_f8_bf16/device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn.hpp @@ -41,7 +41,7 @@ using device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn_comp_instances = std //################################| | | | | Type| Type| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise|Specialization| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| Pipeline| Pipeline| //################################| | | | | | | | | | | Operation| Operation| Operation| | | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| Scheduler| Verision| //################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | -#if defined(__gfx94__) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) +#if defined(__gfx94__) || defined(CK_USE_GFX94) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) // Compute friendly DeviceGemmMultiD_Xdl_CShuffle_V3< Row, Col, Tuple, Row, F8, F8, Tuple, BF16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 256, 256, 64, 16, 16, 32, 32, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, S<8, 8, 1>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4, F8>, DeviceGemmMultiD_Xdl_CShuffle_V3< Row, Col, Tuple, Row, 
F8, F8, Tuple, BF16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 128, 128, 128, 16, 16, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, S<8, 8, 1>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4, F8>, @@ -69,7 +69,7 @@ using device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn_mem_instances = std: //################################| | | | | Type| Type| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise|Specialization| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| Pipeline| Pipeline| //################################| | | | | | | | | | | Operation| Operation| Operation| | | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| Scheduler| Verision| //################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | -#if defined(__gfx94__) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) +#if defined(__gfx94__) || defined(CK_USE_GFX94) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) // Latency friendly DeviceGemmMultiD_Xdl_CShuffle_V3< Row, Col, Tuple, Row, F8, F8, Tuple, BF16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 128, 32, 16, 128, 16, 16, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, S<2, 2, 1>, BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>, DeviceGemmMultiD_Xdl_CShuffle_V3< Row, Col, Tuple, Row, F8, F8, Tuple, BF16, F32, 
F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 64, 16, 16, 128, 16, 16, 16, 16, 1, 1, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 4>, S<4, 4, 1>, BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>, diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn.hpp index 658714d35..8666cf858 100644 --- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn.hpp +++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn.hpp @@ -40,7 +40,7 @@ using device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn_comp_instances = std::tuple< //#########################| | | | Type| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise|Specialization| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| Pipeline| Pipeline| //#########################| | | | | | | | | Operation| Operation| Operation| | | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| Scheduler| Verision| //#########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | -#ifdef __gfx94__ +#if defined(__gfx94__) || defined(CK_USE_GFX94) || 
defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) //Only enable these instances on gfx94x // Compute friendly DeviceGemm_Xdl_CShuffleV3< Row, Row, Row, F8, F8, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 64, 16, 4, 32, 32, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 16, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4, F8>, @@ -67,7 +67,7 @@ using device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn_mem_instances = std::tuple< //#########################| | | | Type| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise|Specialization| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| Pipeline| Pipeline| //#########################| | | | | | | | | Operation| Operation| Operation| | | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| Scheduler| Verision| //#########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | -#if defined(__gfx94__) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) +#if defined(__gfx94__) || defined(CK_USE_GFX94) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) // Latency friendly DeviceGemm_Xdl_CShuffleV3< Row, Row, Row, F8, F8, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 16, 128, 16, 4, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<32, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>, DeviceGemm_Xdl_CShuffleV3< Row, Row, 
Row, F8, F8, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 128, 16, 4, 16, 16, 1, 1, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<32, 2, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>, diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn.hpp index 382ed5b5a..f5e801c16 100644 --- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn.hpp +++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn.hpp @@ -40,7 +40,7 @@ using device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_comp_instances = std::tuple< //#########################| | | | Type| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise|Specialization| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| Pipeline| Pipeline| //#########################| | | | | | | | | Operation| Operation| Operation| | | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| Scheduler| Verision| //#########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | -#if defined(__gfx94__) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) +#if 
defined(__gfx94__) || defined(CK_USE_GFX94) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) // Compute friendly DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F8, F8, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 64, 16, 16, 32, 32, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4, F8>, DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F8, F8, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 128, 16, 16, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4, F8>, @@ -68,7 +68,7 @@ using device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_instances = std::tuple< //#########################| | | | Type| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise|Specialization| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| Pipeline| Pipeline| //#########################| | | | | | | | | Operation| Operation| Operation| | | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| Scheduler| Verision| //#########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | -#if defined(__gfx94__) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) +#if defined(__gfx94__) || defined(CK_USE_GFX94) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) // Latency friendly 
DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F8, F8, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 16, 128, 16, 16, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>, DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F8, F8, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 128, 16, 16, 16, 16, 1, 1, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>, -- GitLab From d31e8249c1be17aaada2a8e29df1c6495dc709f4 Mon Sep 17 00:00:00 2001 From: Illia Silin <98187287+illsilin@users.noreply.github.com> Date: Wed, 20 Nov 2024 14:01:04 -0800 Subject: [PATCH 066/153] Optimize docker file. (#1679) * reduce the docker image size and layers * clean up docker file * fix linker error for client example 24 * install CK into the default /opt/rocm/ path * restore installing CK to alternative path in CI * add linking for utility lib --- Dockerfile | 91 +++++++------------ .../24_grouped_conv_activation/CMakeLists.txt | 4 +- client_example/CMakeLists.txt | 2 +- 3 files changed, 35 insertions(+), 62 deletions(-) diff --git a/Dockerfile b/Dockerfile index 791d1d9f3..b06726335 100644 --- a/Dockerfile +++ b/Dockerfile @@ -4,18 +4,14 @@ ARG ROCMVERSION=6.2 ARG compiler_version="" ARG compiler_commit="" ARG CK_SCCACHE="" - -RUN set -xe - ARG DEB_ROCM_REPO=http://repo.radeon.com/rocm/apt/.apt_$ROCMVERSION/ -RUN useradd -rm -d /home/jenkins -s /bin/bash -u 1004 jenkins -# Add rocm repository -RUN chmod 1777 /tmp -RUN apt-get update -RUN apt-get install -y --allow-unauthenticated apt-utils wget gnupg2 curl - ENV APT_KEY_DONT_WARN_ON_DANGEROUS_USAGE=DontWarn -RUN curl -fsSL https://repo.radeon.com/rocm/rocm.gpg.key | gpg --dearmor -o /etc/apt/trusted.gpg.d/rocm-keyring.gpg + +# Add rocm 
repository +RUN set -xe && \ + useradd -rm -d /home/jenkins -s /bin/bash -u 1004 jenkins && \ + apt-get update && apt-get install -y --allow-unauthenticated apt-utils wget gnupg2 curl && \ + curl -fsSL https://repo.radeon.com/rocm/rocm.gpg.key | gpg --dearmor -o /etc/apt/trusted.gpg.d/rocm-keyring.gpg RUN if [ "$ROCMVERSION" != "6.3" ]; then \ sh -c "wget https://repo.radeon.com/amdgpu-install/$ROCMVERSION/ubuntu/focal/amdgpu-install_6.2.60200-1_all.deb --no-check-certificate" && \ @@ -30,8 +26,8 @@ RUN if [ "$ROCMVERSION" != "6.3" ]; then \ amdgpu-repo --amdgpu-build=2074281; \ fi -RUN sh -c "echo deb http://mirrors.kernel.org/ubuntu focal main universe | tee -a /etc/apt/sources.list" -RUN amdgpu-install -y --usecase=rocm --no-dkms +RUN sh -c "echo deb http://mirrors.kernel.org/ubuntu focal main universe | tee -a /etc/apt/sources.list" && \ + amdgpu-install -y --usecase=rocm --no-dkms ## Sccache binary built from source for ROCm, only install if CK_SCCACHE is defined ARG SCCACHE_REPO_URL=http://compute-artifactory.amd.com/artifactory/rocm-generic-experimental/rocm-sccache @@ -76,66 +72,49 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow- clang-format-12 \ kmod && \ apt-get clean && \ - rm -rf /var/lib/apt/lists/* + rm -rf /var/lib/apt/lists/* && \ + rm -rf amdgpu-install* && \ +# Remove unnecessary rocm components that take a lot of space + apt-get remove -y rocblas rocfft rocsparse composablekernel-dev # hipTensor requires rocm-llvm-dev for rocm versions > 6.0.1 RUN if [ "$ROCMVERSION" = "6.1" ]; then \ sh -c "apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated rocm-llvm-dev"; \ fi # Update the cmake to version 3.27.5 -RUN pip install --upgrade cmake==3.27.5 - +RUN pip install --upgrade cmake==3.27.5 && \ #Install latest ccache -RUN git clone https://github.com/ccache/ccache.git && \ - cd ccache && mkdir build && cd build && cmake .. 
&& make install - + git clone https://github.com/ccache/ccache.git && \ + cd ccache && mkdir build && cd build && cmake .. && make install && \ #Install ninja build tracing tools -RUN wget -qO /usr/local/bin/ninja.gz https://github.com/ninja-build/ninja/releases/latest/download/ninja-linux.zip -RUN gunzip /usr/local/bin/ninja.gz -RUN chmod a+x /usr/local/bin/ninja -RUN git clone https://github.com/nico/ninjatracing.git - + wget -qO /usr/local/bin/ninja.gz https://github.com/ninja-build/ninja/releases/latest/download/ninja-linux.zip && \ + gunzip /usr/local/bin/ninja.gz && \ + chmod a+x /usr/local/bin/ninja && \ + git clone https://github.com/nico/ninjatracing.git && \ #Install latest cppcheck -RUN git clone https://github.com/danmar/cppcheck.git && \ + git clone https://github.com/danmar/cppcheck.git && \ cd cppcheck && mkdir build && cd build && cmake .. && cmake --build . WORKDIR / -# Setup ubsan environment to printstacktrace -RUN ln -s /usr/bin/llvm-symbolizer-3.8 /usr/local/bin/llvm-symbolizer -ENV UBSAN_OPTIONS=print_stacktrace=1 - # Install an init system -RUN wget https://github.com/Yelp/dumb-init/releases/download/v1.2.0/dumb-init_1.2.0_amd64.deb -RUN dpkg -i dumb-init_*.deb && rm dumb-init_*.deb - -ARG PREFIX=/opt/rocm +RUN wget https://github.com/Yelp/dumb-init/releases/download/v1.2.0/dumb-init_1.2.0_amd64.deb && \ + dpkg -i dumb-init_*.deb && rm dumb-init_*.deb && \ # Install packages for processing the performance results -RUN pip3 install --upgrade pip -RUN pip3 install sqlalchemy==1.4.46 -RUN pip3 install pymysql -RUN pip3 install pandas==2.0.3 -RUN pip3 install setuptools-rust -RUN pip3 install sshtunnel==0.4.0 -# Setup ubsan environment to printstacktrace -ENV UBSAN_OPTIONS=print_stacktrace=1 - -ENV LC_ALL=C.UTF-8 -ENV LANG=C.UTF-8 -RUN groupadd -f render - + pip3 install --upgrade pip && \ + pip3 install sqlalchemy==1.4.46 pymysql pandas==2.0.3 setuptools-rust sshtunnel==0.4.0 && \ +# Add render group + groupadd -f render && \ # Install the new 
rocm-cmake version -RUN git clone -b master https://github.com/ROCm/rocm-cmake.git && \ - cd rocm-cmake && mkdir build && cd build && \ - cmake .. && cmake --build . && cmake --build . --target install + git clone -b master https://github.com/ROCm/rocm-cmake.git && \ + cd rocm-cmake && mkdir build && cd build && \ + cmake .. && cmake --build . && cmake --build . --target install WORKDIR / - +# Add alternative compilers, if necessary ENV compiler_version=$compiler_version ENV compiler_commit=$compiler_commit -RUN sh -c "echo compiler version = '$compiler_version'" -RUN sh -c "echo compiler commit = '$compiler_commit'" - -ARG DISABLE_CACHE=0 +RUN sh -c "echo compiler version = '$compiler_version'" && \ + sh -c "echo compiler commit = '$compiler_commit'" RUN if ( [ "$compiler_version" = "amd-staging" ] || [ "$compiler_version" = "amd-mainline-open" ] ) && [ "$compiler_commit" = "" ]; then \ git clone -b "$compiler_version" https://github.com/ROCm/llvm-project.git && \ @@ -152,9 +131,3 @@ RUN if ( [ "$compiler_version" = "amd-staging" ] || [ "$compiler_version" = "amd make -j 8 ; \ else echo "using the release compiler"; \ fi - -#clean-up the deb package -RUN sh -c "rm -rf amdgpu-install*" - -#ENV HIP_CLANG_PATH='/llvm-project/build/bin' -#RUN sh -c "echo HIP_CLANG_PATH = '$HIP_CLANG_PATH'" diff --git a/client_example/24_grouped_conv_activation/CMakeLists.txt b/client_example/24_grouped_conv_activation/CMakeLists.txt index dc55250bf..67bbdfec4 100644 --- a/client_example/24_grouped_conv_activation/CMakeLists.txt +++ b/client_example/24_grouped_conv_activation/CMakeLists.txt @@ -54,7 +54,7 @@ target_link_libraries(client_conv3d_fwd_convscale_relu_amax_fp8 PRIVATE composable_kernel::device_conv_operations composable_kernel::device_other_operations composable_kernel::device_reduction_operations - utility) + composable_kernel::utility) # Fwd convscale + AMAX add_executable(client_conv3d_fwd_convscale_amax_fp8 
grouped_convnd_fwd_convscale_reduce/conv3d_fwd_convscale_amax_fp8.cpp) @@ -62,7 +62,7 @@ target_link_libraries(client_conv3d_fwd_convscale_amax_fp8 PRIVATE composable_kernel::device_conv_operations composable_kernel::device_other_operations composable_kernel::device_reduction_operations - utility) + composable_kernel::utility) # Fwd convscale add_executable(client_conv3d_fwd_convscale_fp8 grouped_convnd_fwd_convscale/conv3d_fwd_convscale_fp8.cpp) diff --git a/client_example/CMakeLists.txt b/client_example/CMakeLists.txt index acb57d7bb..c393972b4 100644 --- a/client_example/CMakeLists.txt +++ b/client_example/CMakeLists.txt @@ -62,7 +62,7 @@ else() set(CK_USE_WMMA "ON") endif() -find_package(composable_kernel COMPONENTS device_other_operations device_gemm_operations device_conv_operations device_reduction_operations) +find_package(composable_kernel COMPONENTS device_other_operations device_gemm_operations device_conv_operations device_reduction_operations utility) if(GPU_TARGETS MATCHES "gfx9") find_package(composable_kernel COMPONENTS device_contraction_operations) endif() -- GitLab From 6916d8cc033543d1ea2028215d75409e11813dd9 Mon Sep 17 00:00:00 2001 From: Po Yen Chen Date: Thu, 21 Nov 2024 14:49:13 +0800 Subject: [PATCH 067/153] Add QianFeng to code owners (#1682) --- .github/CODEOWNERS | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 459315e58..5340be274 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -1,8 +1,8 @@ -* @junliume @illsilin @carlushuang @aosewski @poyenc @geyyer @bartekxk +* @junliume @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk # Documentation files -docs/ @ROCm/rocm-documentation @junliume @illsilin @carlushuang @aosewski @poyenc @geyyer @bartekxk -*.md @ROCm/rocm-documentation @junliume @illsilin @carlushuang @aosewski @poyenc @geyyer @bartekxk -*.rst @ROCm/rocm-documentation @junliume @illsilin @carlushuang @aosewski @poyenc @geyyer 
@bartekxk -.readthedocs.yaml @ROCm/rocm-documentation @junliume @illsilin @carlushuang @aosewski @poyenc @geyyer @bartekxk +docs/ @ROCm/rocm-documentation @junliume @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk +*.md @ROCm/rocm-documentation @junliume @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk +*.rst @ROCm/rocm-documentation @junliume @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk +.readthedocs.yaml @ROCm/rocm-documentation @junliume @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk # Header directory for Doxygen documentation -library/include/ @ROCm/rocm-documentation @junliume @illsilin @carlushuang @aosewski @poyenc @geyyer @bartekxk +library/include/ @ROCm/rocm-documentation @junliume @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk -- GitLab From fb1ccfa9df534c8c9f351dd959a0ff692d6f9210 Mon Sep 17 00:00:00 2001 From: Po Yen Chen Date: Thu, 21 Nov 2024 14:53:10 +0800 Subject: [PATCH 068/153] [CK_TILE] Add paged-kvcache support in group mode fmha fwd splitkv kernels (#1678) * Generate group mode paged-attn kernel * Enable paged-kvcache + group mode support * Add missing header: fused_moe.hpp * Add comment to explain kernel arg usage * Make error message more clear * Add comment for confusing data member names * Add more comment for confusing variable names * Fix typo in option description --- .../01_fmha/codegen/ops/fmha_fwd_splitkv.py | 3 - example/ck_tile/01_fmha/fmha_fwd.cpp | 59 ++++++++++++------- example/ck_tile/01_fmha/fmha_fwd.hpp | 10 +++- example/ck_tile/01_fmha/utils.hpp | 4 +- .../fmha/kernel/fmha_fwd_splitkv_kernel.hpp | 49 ++++++++++----- include/ck_tile/ops/fused_moe.hpp | 11 ++++ 6 files changed, 94 insertions(+), 42 deletions(-) create mode 100644 include/ck_tile/ops/fused_moe.hpp diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py index b084e9d0f..d1da95156 
100644 --- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py +++ b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py @@ -655,9 +655,6 @@ def get_fwd_splitkv_blobs(kernel_filter : Optional[str], receipt, mask_impl) -> if pipeline.F_spad != 't' or pipeline.F_skpad != 't': # in group mode, spad/skpad must be true, since we can't predict if seqlen of current batch need pad or not continue - if pipeline.F_pagedkv == 't': - # we only use batch mode kernels to handle (paged-) kvcache problems - continue k = Kernel(F_idx=0, F_hdim=hdim, F_dtype=dtype, diff --git a/example/ck_tile/01_fmha/fmha_fwd.cpp b/example/ck_tile/01_fmha/fmha_fwd.cpp index 14291715f..00e0a1653 100644 --- a/example/ck_tile/01_fmha/fmha_fwd.cpp +++ b/example/ck_tile/01_fmha/fmha_fwd.cpp @@ -62,7 +62,7 @@ auto create_args(int argc, char* argv[]) "-1 to choose s_knew in [1, s] randomly.") .insert("s_kpad", "-1", - "seqlen_k stride between 2 tokens, currently used in group-mode only\n" + "seqlen_k stride between 2 batches, currently used in group-mode only\n" "for kv-cache case, each batch [1,s,h,d]/[1,h,s,d] can have a stride\n" "along seqlen, instead of packed. same as xformer kv_padding") .insert("d", "128", "head dim for q, k") @@ -294,7 +294,8 @@ bool run(const ck_tile::ArgParser& arg_parser) #if !CK_TILE_FMHA_FWD_APPENDKV_API if(seqlen_knew != 0) { - std::cerr << "kvcache is not supported. ignoring the 's_knew' option" << std::endl; + std::cerr << "fmha_fwd_appendkv() is not enabled. ignoring the 's_knew' option" + << std::endl; seqlen_knew = 0; } #endif @@ -321,6 +322,13 @@ bool run(const ck_tile::ArgParser& arg_parser) rotary_dim = 0; } #endif + // to use fmha_fwd_appendkv(), make sure it's in batch mode + const bool need_append_kvcache = (0 < seqlen_knew || 0 < rotary_dim); + if(need_append_kvcache && mode == mode_enum::group) + { + std::cerr << "fmha_fwd_appendkv() will be invoked. 
ignoring the 'mode' option" << std::endl; + mode = mode_enum::batch; + } if(!(rotary_dim <= hdim_q)) { std::cerr << "rotary_dim should be less than or equal to head dim for q" << std::endl; @@ -356,22 +364,26 @@ bool run(const ck_tile::ArgParser& arg_parser) << std::endl; use_cache_batch_idx = false; } -#endif - if(0 < page_block_size && use_cache_batch_idx) +#else + if(use_cache_batch_idx) { - std::cerr << "paged-kvcache does not support cache_batch_idx. ignoring the " - "'cache_batch_idx' option" - << std::endl; - use_cache_batch_idx = false; + if(0 < page_block_size) + { + std::cerr << "paged-kvcache does not support cache_batch_idx. ignoring the " + "'cache_batch_idx' option" + << std::endl; + use_cache_batch_idx = false; + } + else if(mode == mode_enum::group) + { + std::cerr << "group mode will not use cache_batch_idx. ignoring the " + "'cache_batch_idx' option" + << std::endl; + use_cache_batch_idx = false; + } } - // the input tensor layout for kvcache is same as batch mode - const bool need_append_kvcache = (0 < seqlen_knew || 0 < rotary_dim); +#endif const bool use_kvcache = (need_append_kvcache || use_cache_batch_idx || 0 < page_block_size); - if(use_kvcache && mode != mode_enum::batch) - { - std::cerr << "kvcache enabled. ignoring the 'mode' option" << std::endl; - mode = mode_enum::batch; - } auto [seqlen_qs, seqlen_ks, seqlen_kpads] = decode_seqlen(mode, @@ -380,7 +392,7 @@ bool run(const ck_tile::ArgParser& arg_parser) arg_parser.get_str("s_k"), arg_parser.get_str("s_kpad"), /*seqlen_k_min=*/0 < seqlen_knew ? 
seqlen_knew : 0, - use_kvcache); + need_append_kvcache); // compute kvcache seqlen_k (before appending knew/vnew) auto cache_seqlen_ks = seqlen_ks; std::transform(cache_seqlen_ks.begin(), @@ -741,8 +753,10 @@ bool run(const ck_tile::ArgParser& arg_parser) ck_tile::DeviceMem o_buf(o_host.get_element_space_size_in_bytes()); ck_tile::DeviceMem seqstart_q(seqstart_q_host.size() * sizeof(int32_t)); ck_tile::DeviceMem seqstart_k(seqstart_k_host.size() * sizeof(int32_t)); - ck_tile::DeviceMem seqlen_k_buf( - use_kvcache || 0 <= seqlen_kpads[0] ? seqlen_ks.size() * sizeof(int32_t) : 0); + ck_tile::DeviceMem seqlen_k_buf((mode == mode_enum::batch && use_kvcache) || + 0 <= seqlen_kpads[0] + ? seqlen_ks.size() * sizeof(int32_t) + : 0); ck_tile::DeviceMem cache_seqlen_k_buf( need_append_kvcache ? cache_seqlen_ks.size() * sizeof(int32_t) : 0); ck_tile::DeviceMem rotary_cos_buf(rotary_cos_host.get_element_space_size_in_bytes()); @@ -763,7 +777,9 @@ bool run(const ck_tile::ArgParser& arg_parser) seqstart_q.ToDevice(seqstart_q_host.data()); seqstart_k.ToDevice(seqlen_kpads[0] < 0 ? seqstart_k_host.data() : seqstart_k_with_padding_host.data()); - seqlen_k_buf.ToDevice(use_kvcache || 0 <= seqlen_kpads[0] ? seqlen_ks.data() : nullptr); + seqlen_k_buf.ToDevice((mode == mode_enum::batch && use_kvcache) || 0 <= seqlen_kpads[0] + ? seqlen_ks.data() + : nullptr); cache_seqlen_k_buf.ToDevice(need_append_kvcache ? cache_seqlen_ks.data() : nullptr); rotary_cos_buf.ToDevice(rotary_cos_host.data()); rotary_sin_buf.ToDevice(rotary_sin_host.data()); @@ -976,8 +992,9 @@ bool run(const ck_tile::ArgParser& arg_parser) (mode == mode_enum::group ? seqstart_q.GetDeviceBuffer() : nullptr); args.seqstart_k_ptr = (mode == mode_enum::group ? seqstart_k.GetDeviceBuffer() : nullptr); - args.seqlen_k_ptr = - (use_kvcache || 0 <= k_paddings_[0] ? seqlen_k_buf.GetDeviceBuffer() : nullptr); + args.seqlen_k_ptr = ((mode == mode_enum::batch && use_kvcache) || 0 <= k_paddings_[0] + ? 
seqlen_k_buf.GetDeviceBuffer() + : nullptr); args.seqlen_k = shape_seqlen_k; // unused in group mode (or kvcache enabled) args.max_seqlen_q = max_seqlen_q; diff --git a/example/ck_tile/01_fmha/fmha_fwd.hpp b/example/ck_tile/01_fmha/fmha_fwd.hpp index 251e61bc7..41edac67b 100644 --- a/example/ck_tile/01_fmha/fmha_fwd.hpp +++ b/example/ck_tile/01_fmha/fmha_fwd.hpp @@ -173,8 +173,11 @@ struct fmha_fwd_splitkv_args // seqlen_k = kargs.seqlen_k // group mode: seqlen_q = kargs.seqstart_q_ptr[b + 1] - kargs.seqstart_q_ptr[b] // seqlen_k = kargs.seqstart_k_ptr[b + 1] - kargs.seqstart_k_ptr[b] - // kvcache mode (use same kernel as batch mode): + // batch mode (kvcache): // seqlen_q = kargs.seqlen_q + // seqlen_k = kargs.seqlen_k_ptr[b] + // group mode (kvcache): + // seqlen_q = kargs.seqstart_q_ptr[b + 1] - kargs.seqstart_q_ptr[b] // seqlen_k = kargs.seqstart_k_ptr[b + 1] - kargs.seqstart_k_ptr[b] const void* seqstart_q_ptr; const void* seqstart_k_ptr; @@ -251,7 +254,7 @@ struct fmha_fwd_appendkv_args ck_tile::index_t batch_stride_block_table; // only used if 'block_table_ptr' is not nullptr ck_tile::index_t page_block_size; // only used if 'block_table_ptr' is not nullptr - const void* cache_batch_idx; + const void* cache_batch_idx; // only used if block_table_ptr is nullptr -> batch mode (kvcache) ck_tile::index_t stride_q; ck_tile::index_t stride_k; @@ -389,6 +392,9 @@ auto fmha_fwd_splitkv_create_kargs_and_grids(fmha_fwd_splitkv_args args) args.nhead_q, args.nhead_q / args.nhead_k, args.num_splits, + args.block_table_ptr, + args.batch_stride_block_table, + args.page_block_size, args.scale_s, args.scale_p, args.stride_q, diff --git a/example/ck_tile/01_fmha/utils.hpp b/example/ck_tile/01_fmha/utils.hpp index 996032a71..faf3f0843 100644 --- a/example/ck_tile/01_fmha/utils.hpp +++ b/example/ck_tile/01_fmha/utils.hpp @@ -145,7 +145,7 @@ decode_seqlen(mode_enum mode, std::string k_val, std::string k_pad_val, ck_tile::index_t seqlen_k_min = 0, - bool use_kvcache = false, + 
bool need_append_kvcache = false, std::optional seed = std::nullopt) { #define _S2I_(str_) static_cast(std::atoi((str_).c_str())) @@ -159,7 +159,7 @@ decode_seqlen(mode_enum mode, const ck_tile::index_t seqlen_k_max = (k < 0 ? q : k); std::vector seqlen_ks(batch, seqlen_k_max); - if(1 < batch && use_kvcache) + if(1 < batch && need_append_kvcache) { // to keep the original s_k value, we always use seqlen_k_max in first batch randints(std::next(seqlen_ks.begin()), diff --git a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp index 4ffebc3c9..98a4329d7 100644 --- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp +++ b/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp @@ -46,8 +46,7 @@ struct FmhaFwdSplitKVKernel static constexpr auto BiasEnum = FmhaPipeline::BiasEnum; static constexpr bool kDoFp8StaticQuant = FmhaPipeline::Problem::kDoFp8StaticQuant; static constexpr bool kIsPagedKV = FmhaPipeline::Problem::kIsPagedKV; - static_assert(!kIsGroupMode || (kIsGroupMode && !kIsPagedKV), - "paged-kvcache only supported by batch mode kernels"); + using FmhaMask = ck_tile::remove_cvref_t; static constexpr bool kHasMask = FmhaMask::IsMasking; @@ -198,8 +197,10 @@ struct FmhaFwdSplitKVKernel const int32_t* seqlen_k_ptr; ck_tile::index_t batch_stride_q; - ck_tile::index_t batch_stride_k; - ck_tile::index_t batch_stride_v; + ck_tile::index_t batch_stride_k; // when using paged-kvcache, this will be stride/size for + // single kcache page-block + ck_tile::index_t batch_stride_v; // when using paged-kvcache, this will be stride/size for + // single vcache page-block ck_tile::index_t batch_stride_lse_acc; ck_tile::index_t batch_stride_o_acc; }; @@ -212,14 +213,17 @@ struct FmhaFwdSplitKVKernel AlibiKargs, EmptyKargs<0>>>, std::conditional_t>, - std::conditional_t> + std::conditional_t>, + std::conditional_t> { const int32_t* seqstart_q_ptr; const int32_t* seqstart_k_ptr; const int32_t* 
seqlen_k_ptr; - ck_tile::index_t batch_stride_k; // only used for paged-kvcache - ck_tile::index_t batch_stride_v; // only used for paged-kvcache + ck_tile::index_t batch_stride_k; // only used for paged-kvcache, this will be stride/size + // for single kcache page-block + ck_tile::index_t batch_stride_v; // only used for paged-kvcache, this will be stride/size + // for single vcache page-block }; using Kargs = std::conditional_t; @@ -363,6 +367,9 @@ struct FmhaFwdSplitKVKernel ck_tile::index_t num_head_q, ck_tile::index_t nhead_ratio_qk, ck_tile::index_t num_splits, + const void* block_table_ptr, + ck_tile::index_t batch_stride_block_table, + ck_tile::index_t page_block_size, float scale_s, float scale_p, ck_tile::index_t stride_q, @@ -416,6 +423,7 @@ struct FmhaFwdSplitKVKernel {}, // placeholder for bias {}, // placeholder for mask {}, // placeholder for fp8_static_quant args + {}, // placeholder for paged-block table reinterpret_cast(seqstart_q_ptr), reinterpret_cast(seqstart_k_ptr), reinterpret_cast(seqlen_k_ptr), @@ -443,6 +451,12 @@ struct FmhaFwdSplitKVKernel { kargs.scale_p = scale_p; } + if constexpr(kIsPagedKV) + { + kargs.block_table_ptr = reinterpret_cast(block_table_ptr); + kargs.batch_stride_block_table = batch_stride_block_table; + kargs.page_block_size = page_block_size; + } return kargs; } @@ -489,15 +503,22 @@ struct FmhaFwdSplitKVKernel const long_index_t key_start = kargs.seqstart_k_ptr[i_batch]; batch_offset_q = query_start * kargs.stride_q; - batch_offset_k = key_start * kargs.stride_k; - - if constexpr(std::is_same_v) + if constexpr(kIsPagedKV) { - batch_offset_v = key_start * kargs.stride_v; + batch_offset_k = static_cast(i_batch) * kargs.batch_stride_k; + batch_offset_v = static_cast(i_batch) * kargs.batch_stride_v; } else { - batch_offset_v = key_start; + batch_offset_k = key_start * kargs.stride_k; + if constexpr(std::is_same_v) + { + batch_offset_v = key_start * kargs.stride_v; + } + else + { + batch_offset_v = key_start; + } } if 
constexpr(BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS) { @@ -685,7 +706,7 @@ struct FmhaFwdSplitKVKernel return make_page_block_navigator( kargs.k_ptr, - kargs.batch_stride_k, + kargs.batch_stride_k, // kcache page-block stride/size fixed_offset, block_indices, num_blocks, @@ -715,7 +736,7 @@ struct FmhaFwdSplitKVKernel return make_page_block_navigator( kargs.v_ptr, - kargs.batch_stride_v, + kargs.batch_stride_v, // vcache page-block stride/size fixed_offset, block_indices, num_blocks, diff --git a/include/ck_tile/ops/fused_moe.hpp b/include/ck_tile/ops/fused_moe.hpp new file mode 100644 index 000000000..b74607f06 --- /dev/null +++ b/include/ck_tile/ops/fused_moe.hpp @@ -0,0 +1,11 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck_tile/ops/fused_moe/kernel/moe_sorting_kernel.hpp" +#include "ck_tile/ops/fused_moe/pipeline/moe_sorting_pipeline.hpp" +#include "ck_tile/ops/fused_moe/pipeline/moe_sorting_policy.hpp" +#include "ck_tile/ops/fused_moe/pipeline/moe_sorting_problem.hpp" +#include "ck_tile/ops/common/generic_2d_block_shape.hpp" +#include "ck_tile/ops/common/tensor_layout.hpp" -- GitLab From d6d4c2788bc66c7ead56f1d7b03b7c7b28c2b007 Mon Sep 17 00:00:00 2001 From: Harisankar Sadasivan <135730918+hsadasiv@users.noreply.github.com> Date: Thu, 21 Nov 2024 08:21:37 -0800 Subject: [PATCH 069/153] universal streamk fp8 changes (#1665) * universal streamk fp8 changes & ckprofiler instances * revert strides to -1 and verification options * fp8 exclusion on pre-gfx94 for universal_streamk * PR review based revisions: permissions reverted, removed hip err checks --------- Co-authored-by: Illia Silin <98187287+illsilin@users.noreply.github.com> --- README.md | 3 +- example/01_gemm/CMakeLists.txt | 3 + example/01_gemm/common.hpp | 2 +- example/01_gemm/gemm_xdl_fp16_streamk_v3.cpp | 13 +- example/01_gemm/gemm_xdl_fp8_streamk_v3.cpp | 58 ++ 
.../01_gemm/run_gemm_example_streamk_v2.inc | 40 + .../device_gemm_xdl_cshuffle_streamk_v3.hpp | 382 ++++++-- .../gridwise_gemm_xdl_cshuffle_streamk_v3.hpp | 818 ++++++++++++++++-- .../gpu/gemm_universal_streamk.hpp | 315 +++++++ .../gpu/CMakeLists.txt | 6 + .../gpu/gemm_universal_streamk/CMakeLists.txt | 45 +- ..._universal_streamk_f16_f8_f16_mk_kn_mn.hpp | 84 ++ ..._f8_f16_mk_kn_mn_comp_default_instance.cpp | 24 + ...f8_f16_mk_kn_mn_comp_kpadding_instance.cpp | 24 + ..._f16_mk_kn_mn_comp_mnkpadding_instance.cpp | 24 + ...8_f16_mk_kn_mn_comp_mnpadding_instance.cpp | 24 + ...8_f16_mk_kn_mn_mem_v1_default_instance.cpp | 25 + ..._f16_mk_kn_mn_mem_v1_kpadding_instance.cpp | 25 + ...16_mk_kn_mn_mem_v1_mnkpadding_instance.cpp | 25 + ...8_f16_mk_kn_mn_mem_v2_default_instance.cpp | 25 + ..._f16_mk_kn_mn_mem_v2_kpadding_instance.cpp | 25 + ...16_mk_kn_mn_mem_v2_mnkpadding_instance.cpp | 25 + ..._universal_streamk_f16_f8_f16_mk_nk_mn.hpp | 90 ++ ..._f8_f16_mk_nk_mn_comp_default_instance.cpp | 24 + ...f8_f16_mk_nk_mn_comp_kpadding_instance.cpp | 24 + ..._f16_mk_nk_mn_comp_mnkpadding_instance.cpp | 24 + ...8_f16_mk_nk_mn_comp_mnpadding_instance.cpp | 24 + ...8_f16_mk_nk_mn_mem_v1_default_instance.cpp | 25 + ..._f16_mk_nk_mn_mem_v1_kpadding_instance.cpp | 25 + ...16_mk_nk_mn_mem_v1_mnkpadding_instance.cpp | 25 + ...8_f16_mk_nk_mn_mem_v2_default_instance.cpp | 25 + ..._f16_mk_nk_mn_mem_v2_kpadding_instance.cpp | 25 + ...16_mk_nk_mn_mem_v2_mnkpadding_instance.cpp | 25 + ..._universal_streamk_f8_f16_f16_mk_kn_mn.hpp | 85 ++ ...f16_f16_mk_kn_mn_comp_default_instance.cpp | 24 + ...16_f16_mk_kn_mn_comp_kpadding_instance.cpp | 24 + ..._f16_mk_kn_mn_comp_mnkpadding_instance.cpp | 24 + ...6_f16_mk_kn_mn_comp_mnpadding_instance.cpp | 24 + ...6_f16_mk_kn_mn_mem_v1_default_instance.cpp | 25 + ..._f16_mk_kn_mn_mem_v1_kpadding_instance.cpp | 25 + ...16_mk_kn_mn_mem_v1_mnkpadding_instance.cpp | 25 + ...6_f16_mk_kn_mn_mem_v2_default_instance.cpp | 25 + 
..._f16_mk_kn_mn_mem_v2_kpadding_instance.cpp | 25 + ...16_mk_kn_mn_mem_v2_mnkpadding_instance.cpp | 25 + ..._universal_streamk_f8_f16_f16_mk_nk_mn.hpp | 90 ++ ...f16_f16_mk_nk_mn_comp_default_instance.cpp | 24 + ...16_f16_mk_nk_mn_comp_kpadding_instance.cpp | 24 + ..._f16_mk_nk_mn_comp_mnkpadding_instance.cpp | 24 + ...6_f16_mk_nk_mn_comp_mnpadding_instance.cpp | 24 + ...6_f16_mk_nk_mn_mem_v1_default_instance.cpp | 25 + ..._f16_mk_nk_mn_mem_v1_kpadding_instance.cpp | 25 + ...16_mk_nk_mn_mem_v1_mnkpadding_instance.cpp | 25 + ...6_f16_mk_nk_mn_mem_v2_default_instance.cpp | 25 + ..._f16_mk_nk_mn_mem_v2_kpadding_instance.cpp | 25 + ...16_mk_nk_mn_mem_v2_mnkpadding_instance.cpp | 25 + .../gemm_universal_streamk/CMakeLists.txt | 26 - ...universal_streamk_f16_f16_f16_mk_kn_mn.hpp | 91 -- ...f16_f16_mk_kn_mn_comp_default_instance.cpp | 30 - ...16_f16_mk_kn_mn_comp_kpadding_instance.cpp | 30 - ..._f16_mk_kn_mn_comp_mnkpadding_instance.cpp | 30 - ...6_f16_mk_kn_mn_comp_mnpadding_instance.cpp | 30 - ...6_f16_mk_kn_mn_mem_v1_default_instance.cpp | 31 - ..._f16_mk_kn_mn_mem_v1_kpadding_instance.cpp | 31 - ...16_mk_kn_mn_mem_v1_mnkpadding_instance.cpp | 31 - ...6_f16_mk_kn_mn_mem_v2_default_instance.cpp | 31 - ..._f16_mk_kn_mn_mem_v2_kpadding_instance.cpp | 31 - ...16_mk_kn_mn_mem_v2_mnkpadding_instance.cpp | 31 - ...universal_streamk_f16_f16_f16_mk_nk_mn.hpp | 98 --- ...f16_f16_mk_nk_mn_comp_default_instance.cpp | 30 - ...16_f16_mk_nk_mn_comp_kpadding_instance.cpp | 30 - ..._f16_mk_nk_mn_comp_mnkpadding_instance.cpp | 30 - ...6_f16_mk_nk_mn_comp_mnpadding_instance.cpp | 30 - ...6_f16_mk_nk_mn_mem_v1_default_instance.cpp | 31 - ..._f16_mk_nk_mn_mem_v1_kpadding_instance.cpp | 31 - ...16_mk_nk_mn_mem_v1_mnkpadding_instance.cpp | 31 - ...6_f16_mk_nk_mn_mem_v2_default_instance.cpp | 31 - ..._f16_mk_nk_mn_mem_v2_kpadding_instance.cpp | 31 - ...16_mk_nk_mn_mem_v2_mnkpadding_instance.cpp | 31 - modified_files.txt | 10 + .../src/profile_gemm_universal_streamk.cpp | 24 +- 80 files 
changed, 2887 insertions(+), 992 deletions(-) create mode 100755 example/01_gemm/gemm_xdl_fp8_streamk_v3.cpp mode change 100644 => 100755 example/01_gemm/run_gemm_example_streamk_v2.inc mode change 100644 => 100755 include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_streamk_v3.hpp mode change 100644 => 100755 include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_streamk_v3.hpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn.hpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_default_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_kpadding_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_mnkpadding_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_mnpadding_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v1_default_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v1_kpadding_instance.cpp create mode 100644 
library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v1_mnkpadding_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v2_default_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v2_kpadding_instance.cpp create mode 100755 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v2_mnkpadding_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn.hpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_default_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_kpadding_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_mnkpadding_instance.cpp create mode 100755 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_mnpadding_instance.cpp create mode 100644 
library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v1_default_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v1_kpadding_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v1_mnkpadding_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v2_default_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v2_kpadding_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v2_mnkpadding_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn.hpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_default_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_kpadding_instance.cpp create mode 100644 
library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_mnkpadding_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_mnpadding_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v1_default_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v1_kpadding_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v1_mnkpadding_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v2_default_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v2_kpadding_instance.cpp create mode 100755 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v2_mnkpadding_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn.hpp create mode 100644 
library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_default_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_kpadding_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_mnkpadding_instance.cpp create mode 100755 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_mnpadding_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v1_default_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v1_kpadding_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v1_mnkpadding_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v2_default_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v2_kpadding_instance.cpp create mode 100644 
library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v2_mnkpadding_instance.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/CMakeLists.txt delete mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn.hpp delete mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_comp_default_instance.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_comp_kpadding_instance.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_comp_mnkpadding_instance.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_comp_mnpadding_instance.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_v1_default_instance.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_v1_kpadding_instance.cpp delete mode 100644 
library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_v1_mnkpadding_instance.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_v2_default_instance.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_v2_kpadding_instance.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_v2_mnkpadding_instance.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn.hpp delete mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_default_instance.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_kpadding_instance.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_mnkpadding_instance.cpp delete mode 100644 
library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_mnpadding_instance.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v1_default_instance.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v1_kpadding_instance.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v1_mnkpadding_instance.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v2_default_instance.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v2_kpadding_instance.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v2_mnkpadding_instance.cpp create mode 100755 modified_files.txt mode change 100644 => 100755 profiler/src/profile_gemm_universal_streamk.cpp diff --git a/README.md b/README.md index 302173dc1..d8eb152ee 100644 --- a/README.md +++ b/README.md @@ -154,8 +154,7 @@ Additional cmake flags can be used to significantly speed-up the build: other platforms have faster instances, such as 
`xdl` or `wmma`, available. * `CK_USE_FP8_ON_UNSUPPORTED_ARCH` (default is OFF) must be set to ON in order to build instances, - such as `gemm_universal` and `gemm_multiply_multiply` for fp8 data type for GPU targets which do not - have native support for fp8 data type, such as gfx908 or gfx90a. These instances are useful on + such as `gemm_universal`, `gemm_universal_streamk` and `gemm_multiply_multiply` for fp8 data type for GPU targets which do not have native support for fp8 data type, such as gfx908 or gfx90a. These instances are useful on architectures like the MI100/MI200 for the functional support only. ## Using sccache for building diff --git a/example/01_gemm/CMakeLists.txt b/example/01_gemm/CMakeLists.txt index 52c8ab580..957acce16 100644 --- a/example/01_gemm/CMakeLists.txt +++ b/example/01_gemm/CMakeLists.txt @@ -77,6 +77,9 @@ add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp8) add_example_executable(example_gemm_xdl_fp8_bf8 gemm_xdl_fp8_bf8.cpp) add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp8_bf8) +add_example_executable(example_gemm_xdl_fp8_streamk_v3 gemm_xdl_fp8_streamk_v3.cpp) +add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp8_streamk_v3) + add_example_executable(example_gemm_xdl_fp16_fp8 gemm_xdl_fp16_fp8.cpp) add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp16_fp8) diff --git a/example/01_gemm/common.hpp b/example/01_gemm/common.hpp index 6e1c9f2a0..67bf92bbb 100644 --- a/example/01_gemm/common.hpp +++ b/example/01_gemm/common.hpp @@ -44,7 +44,7 @@ struct ProblemSizeStreamK final ck::index_t StrideB = -1; ck::index_t StrideC = -1; - ck::index_t NumSKBlocks = -1; + ck::index_t NumSKBlocks = -1; // number of stream-k blocks }; struct ProblemSizeStreamK_universal final { diff --git a/example/01_gemm/gemm_xdl_fp16_streamk_v3.cpp b/example/01_gemm/gemm_xdl_fp16_streamk_v3.cpp index 5b163962b..36ac51f1d 100644 --- a/example/01_gemm/gemm_xdl_fp16_streamk_v3.cpp +++ 
b/example/01_gemm/gemm_xdl_fp16_streamk_v3.cpp @@ -8,7 +8,7 @@ using ADataType = ck::half_t; using BDataType = ck::half_t; using AccDataType = float; -using CShuffleDataType = ck::half_t; +using CShuffleDataType = float; using CDataType = ck::half_t; using ALayout = Row; @@ -43,6 +43,17 @@ using DeviceGemmV2_Streamk_Instance = using ReferenceGemmInstance = ck::tensor_operation::host:: ReferenceGemm; +using ReferenceGemmInstanceGPU = ck::tensor_operation::device::ReferenceGemm; + #include "run_gemm_example_streamk_v2.inc" int main(int argc, char* argv[]) { return !run_gemm_universal_streamk_example(argc, argv); } diff --git a/example/01_gemm/gemm_xdl_fp8_streamk_v3.cpp b/example/01_gemm/gemm_xdl_fp8_streamk_v3.cpp new file mode 100755 index 000000000..3b79ae9b8 --- /dev/null +++ b/example/01_gemm/gemm_xdl_fp8_streamk_v3.cpp @@ -0,0 +1,58 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "common.hpp" + +#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_streamk_v3.hpp" + +using ADataType = ck::f8_t; +using BDataType = ck::f8_t; +using AccDataType = float; +using CShuffleDataType = ck::half_t; +using CDataType = ck::half_t; + +using ALayout = Row; +using BLayout = Col; +using CLayout = Row; + +using AElementOp = PassThrough; +using BElementOp = PassThrough; +using CElementOp = PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +// clang-format off +using DeviceGemmV2_Streamk_Instance = + ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle_Streamk_V3< + ALayout, BLayout, CLayout, + ADataType, BDataType, CDataType, AccDataType, CShuffleDataType, + PassThrough, PassThrough, PassThrough, GemmDefault, + 256, + 128, 256, + 128, 16, 16, + 16, 16, + 4, 8, + S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, + 2, 16, 16, 1, + S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, + 2, 16, 16, 1, + 1, 2, S<1, 32, 1, 8>, 8, + 
ck::BlockGemmPipelineScheduler::Intrawave,ck::BlockGemmPipelineVersion::v3, ck::f8_t>; +// clang-format on + +using ReferenceGemmInstance = ck::tensor_operation::host:: + ReferenceGemm; +using ReferenceGemmInstanceGPU = ck::tensor_operation::device::ReferenceGemm; + +#include "run_gemm_example_streamk_v2.inc" + +int main(int argc, char* argv[]) { return !run_gemm_universal_streamk_example(argc, argv); } diff --git a/example/01_gemm/run_gemm_example_streamk_v2.inc b/example/01_gemm/run_gemm_example_streamk_v2.inc old mode 100644 new mode 100755 index 8ed8b81be..04243b829 --- a/example/01_gemm/run_gemm_example_streamk_v2.inc +++ b/example/01_gemm/run_gemm_example_streamk_v2.inc @@ -176,6 +176,7 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config) Tensor c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{})); Tensor c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{})); + Tensor c_m_n_device_ref_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{})); std::cout << "a_m_k: " << a_m_k.mDesc << std::endl; std::cout << "b_k_n: " << b_k_n.mDesc << std::endl; @@ -196,6 +197,8 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config) DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize()); DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize()); DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize()); + DeviceMem c_m_n_device_ref_buf(sizeof(CDataType) * + c_m_n_device_ref_result.mDesc.GetElementSpaceSize()); a_m_k_device_buf.ToDevice(a_m_k.mData.data()); b_k_n_device_buf.ToDevice(b_k_n.mData.data()); @@ -240,6 +243,13 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config) return true; } + std::size_t workspace_size = gemm.GetWorkSpaceSize(&argument); + if(workspace_size != 0) + { + workspace.Realloc(workspace_size); + gemm.SetWorkSpacePointer(&argument, 
workspace.GetDeviceBuffer()); + } + bool pass = true; if((config.do_verification == 1) || (config.do_verification == 3)) { @@ -271,6 +281,36 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config) #endif } + if((config.do_verification == 2) || (config.do_verification == 3)) + { + // GPU verification + auto ref_gemm_gpu = ReferenceGemmInstanceGPU{}; + auto ref_invoker_gpu = ref_gemm_gpu.MakeInvoker(); + + auto ref_argument_gpu = ref_gemm_gpu.MakeArgument( + static_cast(a_m_k_device_buf.GetDeviceBuffer()), + static_cast(b_k_n_device_buf.GetDeviceBuffer()), + static_cast(c_m_n_device_ref_buf.GetDeviceBuffer()), + M, + N, + K, + a_element_op, + b_element_op, + c_element_op); + + std::cout << "Running verification on GPU." << std::endl; + ref_invoker_gpu.Run(ref_argument_gpu, StreamConfig{}); + + c_m_n_device_ref_buf.FromDevice(c_m_n_device_ref_result.mData.data()); + c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data()); + + pass &= ck::utils::check_err(c_m_n_device_result, + c_m_n_device_ref_result, + "Error: Incorrect results!", + get_rtol(), + get_atol()); + } + if(config.time_kernel) { ave_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel}); diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_streamk_v3.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_streamk_v3.hpp old mode 100644 new mode 100755 index 452063156..cfd9a1204 --- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_streamk_v3.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_streamk_v3.hpp @@ -131,6 +131,7 @@ struct DeviceGemm_Xdl_CShuffle_Streamk_V3 : public DeviceGemm_Streamk_V2 0) { arg.Print(); @@ -147,26 +148,27 @@ struct DeviceGemm_Xdl_CShuffle_Streamk_V3 : public DeviceGemm_Streamk_V2(arg.p_workspace_) + + arg.block_2_ctile_map_streamk.get_workspace_size_for_acc( + sizeof(GemmAccDataType)); + auto preprocess = [&]() { + hipMemsetAsync( + 
workspace_semaphore, + 0, + // sizeof(uint32_t), + arg.block_2_ctile_map_streamk.get_workspace_size_for_semaphore(), + stream_config.stream_id_); + }; + + ave_time = launch_and_time_kernel_with_preprocess( + stream_config, preprocess, kernel, grid_dim, dim3(BlockSize), 0, arg); + } } }; @@ -211,14 +236,12 @@ struct DeviceGemm_Xdl_CShuffle_Streamk_V3 : public DeviceGemm_Streamk_V2; - Run(kernel); - } + const auto kernel = kernel_gemm_xdl_cshuffle_v3; + + Run(kernel); } // Tail number could be One to Seven else if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v2) @@ -340,53 +363,49 @@ struct DeviceGemm_Xdl_CShuffle_Streamk_V3 : public DeviceGemm_Streamk_V2; - Run(kernel); - } - else - { - const auto kernel = - kernel_gemm_xdl_cshuffle_v3_2lds; - Run(kernel); - } + const auto kernel = + kernel_gemm_xdl_cshuffle_v3_2lds; + Run(kernel); + } + else + { + const auto kernel = + kernel_gemm_xdl_cshuffle_v3_2lds; + Run(kernel); } } else { + if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Odd) { - if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Odd) - { - const auto kernel = - kernel_gemm_xdl_cshuffle_v3; - Run(kernel); - } - else - { - const auto kernel = - kernel_gemm_xdl_cshuffle_v3; - Run(kernel); - } + const auto kernel = + kernel_gemm_xdl_cshuffle_v3; + Run(kernel); + } + else + { + const auto kernel = + kernel_gemm_xdl_cshuffle_v3; + Run(kernel); } } } @@ -396,14 +415,11 @@ struct DeviceGemm_Xdl_CShuffle_Streamk_V3 : public DeviceGemm_Streamk_V2; - Run(kernel); - } + const auto kernel = kernel_gemm_xdl_cshuffle_v3; + Run(kernel); } } @@ -418,6 +434,29 @@ struct DeviceGemm_Xdl_CShuffle_Streamk_V3 : public DeviceGemm_Streamk_V2(pArg); + if constexpr(GridwiseGemm::Block2CTileMap_streamk::ReductionStrategy == + StreamKReductionStrategy::Reduction) + { + return p_arg->block_2_ctile_map_streamk.get_workspace_size(sizeof(GemmAccDataType)); + } + else + { + return 0; + } + } + + void SetWorkSpacePointer(BaseArgument* pArg, + 
void* p_workspace, + const StreamConfig& = StreamConfig{}) const override + { + Argument* pArg_ = dynamic_cast(pArg); + + pArg_->p_workspace_ = p_workspace; + } + static constexpr bool IsValidCompilationParameter() { // TODO: properly implement this check @@ -464,8 +503,205 @@ struct DeviceGemm_Xdl_CShuffle_Streamk_V3 : public DeviceGemm_Streamk_V2; + calculate_grid_size(kernel); + } + // Tail number could be One to Seven + else if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v2) + { + + if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::One) + { + const auto kernel = kernel_gemm_xdl_cshuffle_v3; + calculate_grid_size(kernel); + } + else if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Full) + { + const auto kernel = kernel_gemm_xdl_cshuffle_v3; + calculate_grid_size(kernel); + } + + if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 2) + { + if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Two) + { + const auto kernel = + kernel_gemm_xdl_cshuffle_v3; + calculate_grid_size(kernel); + } + } + + if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 3) + { + if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Three) + { + const auto kernel = + kernel_gemm_xdl_cshuffle_v3; + calculate_grid_size(kernel); + } + } + + if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 4) + { + if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Four) + { + const auto kernel = + kernel_gemm_xdl_cshuffle_v3; + calculate_grid_size(kernel); + } + } + + if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 5) + { + if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Five) + { + const auto kernel = + kernel_gemm_xdl_cshuffle_v3; + calculate_grid_size(kernel); + } + } + + if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 6) + { + if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Six) + { + const auto kernel = 
+ kernel_gemm_xdl_cshuffle_v3; + calculate_grid_size(kernel); + } + } + + if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 7) + { + if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Seven) + { + const auto kernel = + kernel_gemm_xdl_cshuffle_v3; + calculate_grid_size(kernel); + } + } + } + // Tail number could be Odd or Even + else if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v4) + { + + if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Odd) + { + const auto kernel = + kernel_gemm_xdl_cshuffle_v3_2lds; + calculate_grid_size(kernel); + } + else + { + const auto kernel = + kernel_gemm_xdl_cshuffle_v3_2lds; + calculate_grid_size(kernel); + } + } + else + { + + if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Odd) + { + const auto kernel = kernel_gemm_xdl_cshuffle_v3; + calculate_grid_size(kernel); + } + else + { + const auto kernel = kernel_gemm_xdl_cshuffle_v3; + calculate_grid_size(kernel); + } + } + } + else + { + // Tail number always 1 + if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v1) + { + + const auto kernel = kernel_gemm_xdl_cshuffle_v3; + calculate_grid_size(kernel); + } + } + + return Argument{p_a, p_b, p_c, M, N, K, StrideA, StrideB, StrideC, streamk_sel, Grid_size}; } static auto MakeInvoker() { return Invoker{}; } diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_streamk_v3.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_streamk_v3.hpp old mode 100644 new mode 100755 index ff1021535..6ef35da48 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_streamk_v3.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_streamk_v3.hpp @@ -14,6 +14,8 @@ #include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r1r2.hpp" #include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp" #include 
"ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck/utility/workgroup_barrier.hpp" +#include "ck/utility/reduction_functions_accumulate.hpp" namespace ck { @@ -38,7 +40,7 @@ __global__ void __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()]; GridwiseGemm::template Run( - karg.p_a_grid, karg.p_b_grid, karg.p_c_grid, p_shared, karg); + karg.p_a_grid, karg.p_b_grid, karg.p_c_grid, p_shared, karg, karg.p_workspace_); #else ignore = karg; #endif // end of if (defined(__gfx9__)) @@ -62,7 +64,13 @@ __global__ void __shared__ char p_shared_1[GridwiseGemm::GetSharedMemoryNumberOfByte()]; GridwiseGemm::template Run_2Lds( - karg.p_a_grid, karg.p_b_grid, karg.p_c_grid, p_shared_0, p_shared_1, karg); + karg.p_a_grid, + karg.p_b_grid, + karg.p_c_grid, + p_shared_0, + p_shared_1, + karg, + karg.p_workspace_); #else ignore = karg; #endif // end of if (defined(__gfx9__)) @@ -521,7 +529,9 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3 : Problem{M_, N_, K_, StrideA_, StrideB_, StrideC_, Streamk_sel_, Grid_size_}, p_a_grid{p_a_grid_}, p_b_grid{p_b_grid_}, - p_c_grid{p_c_grid_} + p_c_grid{p_c_grid_}, + block_2_ctile_map_streamk( + M_, N_, AK0Number * CalculateKPadded(K_, 1), Grid_size_, Streamk_sel_) { } @@ -529,6 +539,13 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3 const ADataType* p_a_grid; const BDataType* p_b_grid; CDataType* p_c_grid; + BlockToCTileMap_GemmStreamK_v2 + block_2_ctile_map_streamk; }; struct SplitKBatchOffset @@ -853,6 +870,19 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3 return c_shuffle_block_desc_mblock_mperblock_nblock_nperblock; } + __host__ __device__ static constexpr auto + GetCBlockDescriptor_MShuffle_MPerShuffle_NShuffle_NPerShuffle() + { + constexpr index_t MWave = MPerBlock / (MXdlPerWave * MPerXdl); + constexpr index_t NWave = NPerBlock / (NXdlPerWave * NPerXdl); + + return make_naive_tensor_descriptor_packed( + make_tuple(Number{}, + Number{}, + Number{}, + Number{})); + } + using BlockwiseGemmPipe = 
remove_cvref_t(); + constexpr auto NPerBlockReduction = + NPerBlockPow2 / CShuffleBlockTransferScalarPerVector_NPerBlock; + constexpr auto MPerBlockReduction = + (BlockSize + NPerBlockReduction - 1) / NPerBlockReduction; + return Sequence{}; + } + + __host__ __device__ static constexpr auto GetPartialAccBlockDescriptor() + { + const auto c_partial_acc_block_m_n = [&]() { + if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(MPerBlock, NPerBlock), + make_tuple(NPerBlock, I1)); + } + else if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(MPerBlock, NPerBlock), + make_tuple(I1, MPerBlock)); + } + }(); + return c_partial_acc_block_m_n; + } using Block2CTileMap_streamk = BlockToCTileMap_GemmStreamK_v2( + p_a_grid, a_grid_desc_ak0_m_ak1.GetElementSpaceSize()); + + const auto b_grid_buf = make_dynamic_buffer( + p_b_grid, b_grid_desc_bk0_n_bk1.GetElementSpaceSize()); Block2CTileMap_streamk block_2_ctile_map_streamk(problem.M, problem.N, AK0Number * problem.KPadded, problem.Grid_size, problem.Streamk_sel); uint32_t iter_start, iter_end; - bool is_sk_block, is_dp_block; + bool is_sk_block, is_dp_block, is_reduction_block; index_t num_k_block_main_loop; - + const auto c_grid_desc_m_n = MakeCGridDescriptor_M_N( + problem.M, problem.MPadded, problem.N, problem.NPadded, problem.StrideC); + const auto c_grid_desc_mblock_mperblock_nblock_nperblock = + MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( + c_grid_desc_m_n, problem.MBlock, problem.NBlock); + auto c_grid_buf = make_dynamic_buffer( + p_c_grid, c_grid_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize()); + + uint32_t* p_semaphore = reinterpret_cast( + reinterpret_cast(p_workspace) + + block_2_ctile_map_streamk.get_workspace_size_for_acc(sizeof(AccDataType))); for(auto block_idx = get_block_1d_id(); block_idx < block_2_ctile_map_streamk.get_grid_dims(); block_idx += gridDim.x) @@ -1163,6 +1241,214 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3 
block_2_ctile_map_streamk.get_block_itr(block_idx, iter_start, iter_end); num_k_block_main_loop = iter_end - iter_start; + if constexpr(Block2CTileMap_streamk::ReductionStrategy == + StreamKReductionStrategy::Reduction) + { + is_reduction_block = static_cast(block_idx) >= + block_2_ctile_map_streamk.reduction_start_block_idx; + if(is_reduction_block) + { + // descriptors + constexpr auto cluster_length_reduce = GetClusterLengthReduction(); + constexpr auto reduce_desc = make_cluster_descriptor(cluster_length_reduce); + const auto reduce_thread_cluster_idx = + reduce_desc.CalculateBottomIndex(make_multi_index(block_idx)); + const auto thread_m_cluster_id = reduce_thread_cluster_idx[I0]; + const auto thread_n_cluster_id = reduce_thread_cluster_idx[I1]; + + constexpr auto MReduceIters = math::integer_divide_ceil( + Number{}, cluster_length_reduce.At(I0)); + constexpr auto NReduceIters = math::integer_divide_ceil( + Number{}, + cluster_length_reduce.At(I1) * + Number{}); + + constexpr auto acc_thread_buf_load_desc = make_naive_tensor_descriptor_packed( + make_tuple(I1, Number{})); + constexpr auto acc_thread_buf_store_desc = + make_naive_tensor_descriptor_packed(make_tuple( + I1, I1, I1, Number{})); + + constexpr auto c_partial_acc_block_m_n = GetPartialAccBlockDescriptor(); + + constexpr auto partial_acc_load_step_n = + make_multi_index(0, + cluster_length_reduce.At(I1) * + CShuffleBlockTransferScalarPerVector_NPerBlock); + constexpr auto partial_acc_load_step_n_reverse = make_multi_index( + 0, + -1 * cluster_length_reduce.At(I1).value * (NReduceIters - 1) * + CShuffleBlockTransferScalarPerVector_NPerBlock); + constexpr auto partial_acc_load_step_m = + make_multi_index(cluster_length_reduce.At(I0), 0); + + constexpr auto partial_acc_store_step_n = + make_multi_index(0, + 0, + 0, + cluster_length_reduce.At(I1) * + CShuffleBlockTransferScalarPerVector_NPerBlock); + constexpr auto partial_acc_store_step_n_reverse = make_multi_index( + 0, + 0, + 0, + -1 * 
cluster_length_reduce.At(I1).value * (NReduceIters - 1) * + CShuffleBlockTransferScalarPerVector_NPerBlock); + constexpr auto partial_acc_store_step_m = + make_multi_index(0, cluster_length_reduce.At(I0), 0, 0); + + StaticBuffer + parcial_acc_buf; + StaticBuffer + acc_buf; + + // start to compute + auto reduction_idx = + block_idx - block_2_ctile_map_streamk.reduction_start_block_idx; + auto spatial_idx = block_2_ctile_map_streamk.tile_to_spatial( + reduction_idx, problem.M, problem.N); + + workgroup_barrier wg_barrier(p_semaphore); + + uint32_t tile_acc_offset_start = + block_2_ctile_map_streamk.get_acc_buffer_offset_from_tile(reduction_idx); + uint32_t tile_acc_offset_end = + block_2_ctile_map_streamk.get_acc_buffer_offset_from_tile(reduction_idx + + 1); + __syncthreads(); + + auto acc_load = ThreadwiseTensorSliceTransfer_v2< + AccDataType, // SrcData, + AccDataType, // DstData, + decltype(c_partial_acc_block_m_n), // SrcDesc, + decltype(acc_thread_buf_load_desc), // DstDesc, + Sequence<1, + CShuffleBlockTransferScalarPerVector_NPerBlock>, // SliceLengths, + Sequence<0, 1>, // DimAccessOrder, + 1, // SrcVectorDim, + CShuffleBlockTransferScalarPerVector_NPerBlock, // SrcScalarPerVector, + 1, // SrcScalarStrideInVector, + false // SrcResetCoordinateAfterRun, + >{c_partial_acc_block_m_n, + make_multi_index(thread_m_cluster_id, + thread_n_cluster_id * + CShuffleBlockTransferScalarPerVector_NPerBlock)}; + + auto acc_store = ThreadwiseTensorSliceTransfer_v1r3< + AccDataType, // SrcData, + CDataType, // DstData, + decltype(acc_thread_buf_store_desc), // SrcDesc, + decltype(c_grid_desc_mblock_mperblock_nblock_nperblock), // DstDesc, + CElementwiseOperation, // ElementwiseOperation, + Sequence<1, + 1, + 1, + CShuffleBlockTransferScalarPerVector_NPerBlock>, // SliceLengths, + Sequence<0, 1, 2, 3>, // DimAccessOrder, + 3, // DstVectorDim, + CShuffleBlockTransferScalarPerVector_NPerBlock, // DstScalarPerVector, + InMemoryDataOperationEnum::Set, // InMemoryDataOperationEnum 
DstInMemOp, + 1, // DstScalarStrideInVector, + false // DstResetCoordinateAfterRun, + >{c_grid_desc_mblock_mperblock_nblock_nperblock, + make_multi_index(__builtin_amdgcn_readfirstlane(spatial_idx[I0]), + thread_m_cluster_id, + __builtin_amdgcn_readfirstlane(spatial_idx[I1]), + thread_n_cluster_id * + CShuffleBlockTransferScalarPerVector_NPerBlock), + CElementwiseOperation{}}; + + wg_barrier.wait_eq(reduction_idx, tile_acc_offset_end - tile_acc_offset_start); + + if(threadIdx.x == 0) + { + p_semaphore[reduction_idx] = 0; + } + using Accumulation = ck::detail:: + AccumulateWithNanCheck; + + for(int i_m = 0; i_m < MReduceIters; i_m++) + { + static_for<0, NReduceIters, 1>{}([&](auto i_n_reduce) { + acc_buf.Clear(); + for(auto i = tile_acc_offset_start; i < tile_acc_offset_end; i++) + { + auto c_partial_acc_buf = + make_dynamic_buffer( + reinterpret_cast(p_workspace) + + i * c_partial_acc_block_m_n.GetElementSpaceSize(), + c_partial_acc_block_m_n.GetElementSpaceSize()); + + acc_load.Run(c_partial_acc_block_m_n, + c_partial_acc_buf, + acc_thread_buf_load_desc, + make_tuple(I0, I0), + parcial_acc_buf); + + static_for<0, CShuffleBlockTransferScalarPerVector_NPerBlock, 1>{}( + [&](auto i_vec) { + constexpr auto offset = + acc_thread_buf_load_desc.CalculateOffset( + make_tuple(0, i_vec)); + Accumulation::Calculate(acc_buf(Number{}), + parcial_acc_buf[Number{}]); + }); + } + + if(thread_n_cluster_id * + CShuffleBlockTransferScalarPerVector_NPerBlock < + NPerBlock) + { + acc_store.Run(acc_thread_buf_store_desc, + make_tuple(I0, I0, I0, I0), + acc_buf, + c_grid_desc_mblock_mperblock_nblock_nperblock, + c_grid_buf); + } + if constexpr(NReduceIters != 1) + { + if constexpr(i_n_reduce != (NReduceIters - 1)) + { + acc_load.MoveSrcSliceWindow(c_partial_acc_block_m_n, + partial_acc_load_step_n); + acc_store.MoveDstSliceWindow( + c_grid_desc_mblock_mperblock_nblock_nperblock, + partial_acc_store_step_n); + } + else + { + acc_load.MoveSrcSliceWindow(c_partial_acc_block_m_n, + 
partial_acc_load_step_n_reverse); + acc_store.MoveDstSliceWindow( + c_grid_desc_mblock_mperblock_nblock_nperblock, + partial_acc_store_step_n_reverse); + } + } + }); + { + acc_load.MoveSrcSliceWindow(c_partial_acc_block_m_n, + partial_acc_load_step_m); + acc_store.MoveDstSliceWindow( + c_grid_desc_mblock_mperblock_nblock_nperblock, + partial_acc_store_step_m); + } + } + + continue; + } + } + + // offset for last acc buffer of this block + uint32_t block_acc_offset = + (block_2_ctile_map_streamk.get_acc_buffer_offset_from_block(block_idx + 1) - 1) * + MPerBlock * NPerBlock; while(true) { uint32_t current_iter_length = __builtin_amdgcn_readfirstlane( @@ -1173,33 +1459,6 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3 iter_end - 1, tile_idx, iter_offset); iter_offset = __builtin_amdgcn_readfirstlane(iter_offset - current_iter_length + 1); - const auto a_grid_desc_ak0_m_ak1 = MakeAGridDescriptor_AK0_M_AK1(problem.M, - problem.MPadded, - problem.K, - problem.KPadded, - problem.StrideA, - problem.AK0); - const auto b_grid_desc_bk0_n_bk1 = MakeBGridDescriptor_BK0_N_BK1(problem.K, - problem.KPadded, - problem.N, - problem.NPadded, - problem.StrideB, - problem.BK0); - const auto c_grid_desc_m_n = MakeCGridDescriptor_M_N( - problem.M, problem.MPadded, problem.N, problem.NPadded, problem.StrideC); - - const auto c_grid_desc_mblock_mperblock_nblock_nperblock = - MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( - c_grid_desc_m_n, problem.MBlock, problem.NBlock); - auto c_grid_buf = make_dynamic_buffer( - p_c_grid, c_grid_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize()); - - const auto a_grid_buf = make_dynamic_buffer( - p_a_grid, a_grid_desc_ak0_m_ak1.GetElementSpaceSize()); - - const auto b_grid_buf = make_dynamic_buffer( - p_b_grid, b_grid_desc_bk0_n_bk1.GetElementSpaceSize()); - auto block_work_idx = block_2_ctile_map_streamk.tile_to_spatial(tile_idx, problem.M, problem.N); @@ -1363,11 +1622,20 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3 constexpr auto 
c_shuffle_block_desc_mblock_mperblock_nblock_nperblock = GetCShuffleBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(); + constexpr auto c_block_desc_mshuffle_mpershuffle_nshuffle_npershuffle = + GetCBlockDescriptor_MShuffle_MPerShuffle_NShuffle_NPerShuffle(); + auto c_shuffle_block_buf = make_dynamic_buffer( static_cast(p_shared), c_shuffle_block_desc_mblock_mperblock_nblock_nperblock .GetElementSpaceSize()); + auto c_partial_acc_buf = + make_dynamic_buffer( + reinterpret_cast(p_workspace) + block_acc_offset, + c_block_desc_mshuffle_mpershuffle_nshuffle_npershuffle + .GetElementSpaceSize()); + constexpr auto c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2 = transform_tensor_descriptor( c_shuffle_block_desc_mblock_mperblock_nblock_nperblock, @@ -1477,7 +1745,34 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3 c_grid_desc_mblock_mperblock_nblock_nperblock, make_multi_index(block_m_id, 0, block_n_id, 0), c_element_op}; - + // LDS to global partial acc + auto c_block_copy_lds_to_partial_acc = ThreadGroupTensorSliceTransfer_v6r1r2< + ThisThreadBlock, // index_t BlockSize, + CElementwiseOperation, // ElementwiseOperation, + // InMemoryDataOperationEnum::Set, // DstInMemOp, + Sequence<1, + CShuffleMXdlPerWavePerShuffle * MWave * MPerXdl, + 1, + CShuffleNXdlPerWavePerShuffle * NWave * + NPerXdl>, // BlockSliceLengths, + CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, + Sequence<0, 1, 2, 3>, // typename ThreadClusterArrangeOrder, + CShuffleDataType, // typename SrcData, + CShuffleDataType, // typename DstData, + decltype(c_shuffle_block_desc_mblock_mperblock_nblock_nperblock), + decltype(c_block_desc_mshuffle_mpershuffle_nshuffle_npershuffle), + Sequence<0, 1, 2, 3>, // typename DimAccessOrder, + 3, // index_t VectorDim, + CShuffleBlockTransferScalarPerVector_NPerBlock, // index_t ScalarPerVector, + false, // bool ThreadTransferSrcResetCoordinateAfterRun, => need to be + // false, othre wise has scratch + false> // bool ThreadTransferDstResetCoordinateAfterRun, 
=> need to be + // false, othre wise has scratch + {c_shuffle_block_desc_mblock_mperblock_nblock_nperblock, + make_multi_index(0, 0, 0, 0), + c_block_desc_mshuffle_mpershuffle_nshuffle_npershuffle, + make_multi_index(0, 0, 0, 0), + c_element_op}; // space filling curve for threadwise C in VGPR constexpr auto sfc_c_vgpr = SpaceFillingCurve, @@ -1535,15 +1830,40 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3 } else if(is_sk_block) { - // each block copy its data from LDS to global - c_shuffle_block_copy_lds_to_global - .template Run( + if constexpr(Block2CTileMap_streamk::ReductionStrategy == + StreamKReductionStrategy::Atomic) + { + // each block copy its data from LDS to global + c_shuffle_block_copy_lds_to_global + .template Run( + c_shuffle_block_desc_mblock_mperblock_nblock_nperblock, + c_shuffle_block_buf, + c_grid_desc_mblock_mperblock_nblock_nperblock, + c_grid_buf); + } + else if constexpr(Block2CTileMap_streamk::ReductionStrategy == + StreamKReductionStrategy::Reduction) + { + // constexpr offset + c_block_copy_lds_to_partial_acc.SetSrcSliceOrigin( c_shuffle_block_desc_mblock_mperblock_nblock_nperblock, - c_shuffle_block_buf, - c_grid_desc_mblock_mperblock_nblock_nperblock, - c_grid_buf); + make_tuple(0, 0, 0, 0)); + + c_block_copy_lds_to_partial_acc.SetDstSliceOrigin( + c_block_desc_mshuffle_mpershuffle_nshuffle_npershuffle, + make_tuple(MXdlPerWave, 0, NXdlPerWave, 0)); + + c_block_copy_lds_to_partial_acc + .template Run( + c_shuffle_block_desc_mblock_mperblock_nblock_nperblock, + c_shuffle_block_buf, + c_block_desc_mshuffle_mpershuffle_nshuffle_npershuffle, + c_partial_acc_buf); + } } if constexpr(access_id < num_access - 1) @@ -1555,15 +1875,33 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3 c_grid_desc_mblock_mperblock_nblock_nperblock, c_global_step); } }); - } + + if constexpr(Block2CTileMap_streamk::ReductionStrategy == + StreamKReductionStrategy::Reduction) + { + if(is_sk_block) + { + // increase the counter for this tile + workgroup_barrier 
wg_barrier(p_semaphore); + wg_barrier.inc(tile_idx); + } + } + } // shuffle c and write-out end + // exit condition iter_end -= current_iter_length; if(iter_end <= iter_start) break; + if constexpr(Block2CTileMap_streamk::ReductionStrategy == + StreamKReductionStrategy::Reduction) + { + block_acc_offset -= MPerBlock * NPerBlock; + } // make sure next loop LDS is ready for use block_sync_lds(); - } - } + } // while loop + + } // for loop } template ( + p_a_grid, a_grid_desc_ak0_m_ak1.GetElementSpaceSize()); + const auto b_grid_buf = make_dynamic_buffer( + p_b_grid, b_grid_desc_bk0_n_bk1.GetElementSpaceSize()); + uint32_t iter_start, iter_end; - bool is_sk_block, is_dp_block; //, is_padding_block; //, is_reduction_block; + bool is_sk_block, is_dp_block, is_reduction_block; index_t num_k_block_main_loop; + const auto c_grid_desc_m_n = MakeCGridDescriptor_M_N( + problem.M, problem.MPadded, problem.N, problem.NPadded, problem.StrideC); + + const auto c_grid_desc_mblock_mperblock_nblock_nperblock = + MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( + c_grid_desc_m_n, problem.MBlock, problem.NBlock); + + auto c_grid_buf = make_dynamic_buffer( + p_c_grid, c_grid_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize()); + + Block2CTileMap_streamk block_2_ctile_map_streamk(problem.M, + problem.N, + AK0Number * problem.KPadded, + problem.Grid_size, + problem.Streamk_sel); for(auto block_idx = get_block_1d_id(); block_idx < block_2_ctile_map_streamk.get_grid_dims(); block_idx += gridDim.x) @@ -1601,6 +1963,235 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3 block_2_ctile_map_streamk.get_block_itr(block_idx, iter_start, iter_end); num_k_block_main_loop = iter_end - iter_start; + uint32_t* p_semaphore = reinterpret_cast( + reinterpret_cast(p_workspace) + + block_2_ctile_map_streamk.get_workspace_size_for_acc(sizeof(AccDataType))); + + if constexpr(Block2CTileMap_streamk::ReductionStrategy == + StreamKReductionStrategy::Reduction) + { + is_reduction_block = 
static_cast(block_idx) >= + block_2_ctile_map_streamk.reduction_start_block_idx; + if(is_reduction_block) + { + // descriptors + constexpr auto cluster_length_reduce = GetClusterLengthReduction(); + constexpr auto reduce_desc = make_cluster_descriptor(cluster_length_reduce); + const auto reduce_thread_cluster_idx = + reduce_desc.CalculateBottomIndex(make_multi_index(block_idx)); + const auto thread_m_cluster_id = reduce_thread_cluster_idx[I0]; + const auto thread_n_cluster_id = reduce_thread_cluster_idx[I1]; + + constexpr auto MReduceIters = math::integer_divide_ceil( + Number{}, cluster_length_reduce.At(I0)); + constexpr auto NReduceIters = math::integer_divide_ceil( + Number{}, + cluster_length_reduce.At(I1) * + Number{}); + + constexpr auto acc_thread_buf_load_desc = make_naive_tensor_descriptor_packed( + make_tuple(I1, Number{})); + constexpr auto acc_thread_buf_store_desc = + make_naive_tensor_descriptor_packed(make_tuple( + I1, I1, I1, Number{})); + + constexpr auto c_partial_acc_block_m_n = GetPartialAccBlockDescriptor(); + + constexpr auto partial_acc_load_step_n = + make_multi_index(0, + cluster_length_reduce.At(I1) * + CShuffleBlockTransferScalarPerVector_NPerBlock); + constexpr auto partial_acc_load_step_n_reverse = make_multi_index( + 0, + -1 * cluster_length_reduce.At(I1).value * (NReduceIters - 1) * + CShuffleBlockTransferScalarPerVector_NPerBlock); + constexpr auto partial_acc_load_step_m = + make_multi_index(cluster_length_reduce.At(I0), 0); + + constexpr auto partial_acc_store_step_n = + make_multi_index(0, + 0, + 0, + cluster_length_reduce.At(I1) * + CShuffleBlockTransferScalarPerVector_NPerBlock); + constexpr auto partial_acc_store_step_n_reverse = make_multi_index( + 0, + 0, + 0, + -1 * cluster_length_reduce.At(I1).value * (NReduceIters - 1) * + CShuffleBlockTransferScalarPerVector_NPerBlock); + constexpr auto partial_acc_store_step_m = + make_multi_index(0, cluster_length_reduce.At(I0), 0, 0); + + StaticBuffer + parcial_acc_buf; + StaticBuffer 
+ acc_buf; + + // start to compute + auto reduction_idx = + block_idx - block_2_ctile_map_streamk.reduction_start_block_idx; + auto spatial_idx = block_2_ctile_map_streamk.tile_to_spatial( + reduction_idx, problem.M, problem.N); + + workgroup_barrier wg_barrier(p_semaphore); + + uint32_t tile_acc_offset_start = + block_2_ctile_map_streamk.get_acc_buffer_offset_from_tile(reduction_idx); + uint32_t tile_acc_offset_end = + block_2_ctile_map_streamk.get_acc_buffer_offset_from_tile(reduction_idx + + 1); + + uint32_t expected_count = tile_acc_offset_end - tile_acc_offset_start; + + if(threadIdx.x == 0) + { + p_semaphore[reduction_idx] = 0; + } + + __syncthreads(); + + auto acc_load = ThreadwiseTensorSliceTransfer_v2< + AccDataType, // SrcData, + AccDataType, // DstData, + decltype(c_partial_acc_block_m_n), // SrcDesc, + decltype(acc_thread_buf_load_desc), // DstDesc, + Sequence<1, + CShuffleBlockTransferScalarPerVector_NPerBlock>, // SliceLengths, + Sequence<0, 1>, // DimAccessOrder, + 1, // SrcVectorDim, + CShuffleBlockTransferScalarPerVector_NPerBlock, // SrcScalarPerVector, + 1, // SrcScalarStrideInVector, + false // SrcResetCoordinateAfterRun, + >{c_partial_acc_block_m_n, + make_multi_index(thread_m_cluster_id, + thread_n_cluster_id * + CShuffleBlockTransferScalarPerVector_NPerBlock)}; + + auto acc_store = ThreadwiseTensorSliceTransfer_v1r3< + AccDataType, // SrcData, + CDataType, // DstData, + decltype(acc_thread_buf_store_desc), // SrcDesc, + decltype(c_grid_desc_mblock_mperblock_nblock_nperblock), // DstDesc, + CElementwiseOperation, // ElementwiseOperation, + Sequence<1, + 1, + 1, + CShuffleBlockTransferScalarPerVector_NPerBlock>, // SliceLengths, + Sequence<0, 1, 2, 3>, // DimAccessOrder, + 3, // DstVectorDim, + CShuffleBlockTransferScalarPerVector_NPerBlock, // DstScalarPerVector, + InMemoryDataOperationEnum::Set, // InMemoryDataOperationEnum DstInMemOp, + 1, // DstScalarStrideInVector, + false // DstResetCoordinateAfterRun, + 
>{c_grid_desc_mblock_mperblock_nblock_nperblock, + make_multi_index(__builtin_amdgcn_readfirstlane(spatial_idx[I0]), + thread_m_cluster_id, + __builtin_amdgcn_readfirstlane(spatial_idx[I1]), + thread_n_cluster_id * + CShuffleBlockTransferScalarPerVector_NPerBlock), + CElementwiseOperation{}}; + +#if 0 + if(threadIdx.x == 0) { + printf("bid:%d, rid:%d, os:%d,%d, spatial:%d,%d\n", static_cast(blockIdx.x), + reduction_idx, __builtin_amdgcn_readfirstlane(tile_acc_offset_start), __builtin_amdgcn_readfirstlane(tile_acc_offset_end), + __builtin_amdgcn_readfirstlane(spatial_idx[I0]), + __builtin_amdgcn_readfirstlane(spatial_idx[I1])); + } +#endif + if(threadIdx.x == 0) + { + atomicAdd(&p_semaphore[reduction_idx], 1); + } + + wg_barrier.wait_eq(p_semaphore[reduction_idx], expected_count); + using Accumulation = ck::detail:: + AccumulateWithNanCheck; + + for(int i_m = 0; i_m < MReduceIters; i_m++) + { + static_for<0, NReduceIters, 1>{}([&](auto i_n_reduce) { + acc_buf.Clear(); + for(auto i = tile_acc_offset_start; i < tile_acc_offset_end; i++) + { + auto c_partial_acc_buf = + make_dynamic_buffer( + reinterpret_cast(p_workspace) + + i * c_partial_acc_block_m_n.GetElementSpaceSize(), + c_partial_acc_block_m_n.GetElementSpaceSize()); + + acc_load.Run(c_partial_acc_block_m_n, + c_partial_acc_buf, + acc_thread_buf_load_desc, + make_tuple(I0, I0), + parcial_acc_buf); + + static_for<0, CShuffleBlockTransferScalarPerVector_NPerBlock, 1>{}( + [&](auto i_vec) { + constexpr auto offset = + acc_thread_buf_load_desc.CalculateOffset( + make_tuple(0, i_vec)); + Accumulation::Calculate(acc_buf(Number{}), + parcial_acc_buf[Number{}]); + }); + } + + if(thread_n_cluster_id * + CShuffleBlockTransferScalarPerVector_NPerBlock < + NPerBlock) + { + acc_store.Run(acc_thread_buf_store_desc, + make_tuple(I0, I0, I0, I0), + acc_buf, + c_grid_desc_mblock_mperblock_nblock_nperblock, + c_grid_buf); + } + if constexpr(NReduceIters != 1) + { + if constexpr(i_n_reduce != (NReduceIters - 1)) + { + 
acc_load.MoveSrcSliceWindow(c_partial_acc_block_m_n, + partial_acc_load_step_n); + acc_store.MoveDstSliceWindow( + c_grid_desc_mblock_mperblock_nblock_nperblock, + partial_acc_store_step_n); + } + else + { + acc_load.MoveSrcSliceWindow(c_partial_acc_block_m_n, + partial_acc_load_step_n_reverse); + acc_store.MoveDstSliceWindow( + c_grid_desc_mblock_mperblock_nblock_nperblock, + partial_acc_store_step_n_reverse); + } + } + }); + { + acc_load.MoveSrcSliceWindow(c_partial_acc_block_m_n, + partial_acc_load_step_m); + acc_store.MoveDstSliceWindow( + c_grid_desc_mblock_mperblock_nblock_nperblock, + partial_acc_store_step_m); + } + } + + continue; + } + } + + // offset for last acc buffer of this block + uint32_t block_acc_offset = + (block_2_ctile_map_streamk.get_acc_buffer_offset_from_block(block_idx + 1) - 1) * + MPerBlock * NPerBlock; + while(true) { uint32_t current_iter_length = __builtin_amdgcn_readfirstlane( @@ -1611,33 +2202,6 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3 iter_end - 1, tile_idx, iter_offset); iter_offset = __builtin_amdgcn_readfirstlane(iter_offset - current_iter_length + 1); - const auto a_grid_desc_ak0_m_ak1 = MakeAGridDescriptor_AK0_M_AK1(problem.M, - problem.MPadded, - problem.K, - problem.KPadded, - problem.StrideA, - problem.AK0); - const auto b_grid_desc_bk0_n_bk1 = MakeBGridDescriptor_BK0_N_BK1(problem.K, - problem.KPadded, - problem.N, - problem.NPadded, - problem.StrideB, - problem.BK0); - const auto c_grid_desc_m_n = MakeCGridDescriptor_M_N( - problem.M, problem.MPadded, problem.N, problem.NPadded, problem.StrideC); - - const auto c_grid_desc_mblock_mperblock_nblock_nperblock = - MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( - c_grid_desc_m_n, problem.MBlock, problem.NBlock); - - auto c_grid_buf = make_dynamic_buffer( - p_c_grid, c_grid_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize()); - - const auto a_grid_buf = make_dynamic_buffer( - p_a_grid, a_grid_desc_ak0_m_ak1.GetElementSpaceSize()); - const auto b_grid_buf 
= make_dynamic_buffer( - p_b_grid, b_grid_desc_bk0_n_bk1.GetElementSpaceSize()); - auto block_work_idx = block_2_ctile_map_streamk.tile_to_spatial(tile_idx, problem.M, problem.N); @@ -1811,11 +2375,20 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3 constexpr auto c_shuffle_block_desc_mblock_mperblock_nblock_nperblock = GetCShuffleBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(); + constexpr auto c_block_desc_mshuffle_mpershuffle_nshuffle_npershuffle = + GetCBlockDescriptor_MShuffle_MPerShuffle_NShuffle_NPerShuffle(); + auto c_shuffle_block_buf = make_dynamic_buffer( static_cast(p_shared_0), c_shuffle_block_desc_mblock_mperblock_nblock_nperblock .GetElementSpaceSize()); + auto c_partial_acc_buf = + make_dynamic_buffer( + reinterpret_cast(p_workspace) + block_acc_offset, + c_block_desc_mshuffle_mpershuffle_nshuffle_npershuffle + .GetElementSpaceSize()); + constexpr auto c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2 = transform_tensor_descriptor( c_shuffle_block_desc_mblock_mperblock_nblock_nperblock, @@ -1925,6 +2498,35 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3 make_multi_index(block_m_id, 0, block_n_id, 0), c_element_op}; + // LDS to global partial acc + auto c_block_copy_lds_to_partial_acc = ThreadGroupTensorSliceTransfer_v6r1r2< + ThisThreadBlock, // index_t BlockSize, + CElementwiseOperation, // ElementwiseOperation, + // InMemoryDataOperationEnum::Set, // DstInMemOp, + Sequence<1, + CShuffleMXdlPerWavePerShuffle * MWave * MPerXdl, + 1, + CShuffleNXdlPerWavePerShuffle * NWave * + NPerXdl>, // BlockSliceLengths, + CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, + Sequence<0, 1, 2, 3>, // typename ThreadClusterArrangeOrder, + CShuffleDataType, // typename SrcData, + CShuffleDataType, // typename DstData, + decltype(c_shuffle_block_desc_mblock_mperblock_nblock_nperblock), + decltype(c_block_desc_mshuffle_mpershuffle_nshuffle_npershuffle), + Sequence<0, 1, 2, 3>, // typename DimAccessOrder, + 3, // index_t VectorDim, + 
CShuffleBlockTransferScalarPerVector_NPerBlock, // index_t ScalarPerVector, + false, // bool ThreadTransferSrcResetCoordinateAfterRun, => need to be + // false, othre wise has scratch + false> // bool ThreadTransferDstResetCoordinateAfterRun, => need to be + // false, othre wise has scratch + {c_shuffle_block_desc_mblock_mperblock_nblock_nperblock, + make_multi_index(0, 0, 0, 0), + c_block_desc_mshuffle_mpershuffle_nshuffle_npershuffle, + make_multi_index(0, 0, 0, 0), + c_element_op}; + // space filling curve for threadwise C in VGPR constexpr auto sfc_c_vgpr = SpaceFillingCurve, @@ -1982,15 +2584,40 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3 } else if(is_sk_block) { - // each block copy its data from LDS to global - c_shuffle_block_copy_lds_to_global - .template Run( + if constexpr(Block2CTileMap_streamk::ReductionStrategy == + StreamKReductionStrategy::Atomic) + { + // each block copy its data from LDS to global + c_shuffle_block_copy_lds_to_global + .template Run( + c_shuffle_block_desc_mblock_mperblock_nblock_nperblock, + c_shuffle_block_buf, + c_grid_desc_mblock_mperblock_nblock_nperblock, + c_grid_buf); + } + else if constexpr(Block2CTileMap_streamk::ReductionStrategy == + StreamKReductionStrategy::Reduction) + { + // constexpr offset + c_block_copy_lds_to_partial_acc.SetSrcSliceOrigin( c_shuffle_block_desc_mblock_mperblock_nblock_nperblock, - c_shuffle_block_buf, - c_grid_desc_mblock_mperblock_nblock_nperblock, - c_grid_buf); + make_tuple(0, 0, 0, 0)); + + c_block_copy_lds_to_partial_acc.SetDstSliceOrigin( + c_block_desc_mshuffle_mpershuffle_nshuffle_npershuffle, + make_tuple(MXdlPerWave, 0, NXdlPerWave, 0)); + + c_block_copy_lds_to_partial_acc + .template Run( + c_shuffle_block_desc_mblock_mperblock_nblock_nperblock, + c_shuffle_block_buf, + c_block_desc_mshuffle_mpershuffle_nshuffle_npershuffle, + c_partial_acc_buf); + } } if constexpr(access_id < num_access - 1) { @@ -2002,6 +2629,27 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3 } }); } + // exit 
condition + iter_end -= current_iter_length; + if(iter_end <= iter_start) + break; + if constexpr(Block2CTileMap_streamk::ReductionStrategy == + StreamKReductionStrategy::Reduction) + { + block_acc_offset -= MPerBlock * NPerBlock; + } + // make sure next loop LDS is ready for use + block_sync_lds(); + } + if constexpr(Block2CTileMap_streamk::ReductionStrategy == + StreamKReductionStrategy::Reduction) + { + if(is_sk_block) + { + // increase the counter for this tile + workgroup_barrier wg_barrier(p_semaphore); + wg_barrier.inc(0); + } } } } diff --git a/library/include/ck/library/tensor_operation_instance/gpu/gemm_universal_streamk.hpp b/library/include/ck/library/tensor_operation_instance/gpu/gemm_universal_streamk.hpp index 19fa6c209..f44c02517 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/gemm_universal_streamk.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm_universal_streamk.hpp @@ -237,6 +237,206 @@ void add_device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v2_mnkpaddin PassThrough, PassThrough>>>& instances); #endif + +#if(defined(CK_ENABLE_FP8)) +void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_default_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_kpadding_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_mnpadding_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_mnkpadding_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v1_default_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v1_kpadding_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v1_mnkpadding_instances( + std::vector>>& + instances); + +void 
add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v2_default_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v2_kpadding_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v2_mnkpadding_instances( + std::vector>>& + instances); +void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_default_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_kpadding_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_mnpadding_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_mnkpadding_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v1_default_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v1_kpadding_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v1_mnkpadding_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v2_default_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v2_kpadding_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v2_mnkpadding_instances( + std::vector>>& + instances); +void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_default_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_kpadding_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_mnpadding_instances( + std::vector>>& + instances); + +void 
add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_mnkpadding_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v1_default_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v1_kpadding_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v1_mnkpadding_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v2_default_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v2_kpadding_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v2_mnkpadding_instances( + std::vector>>& + instances); +void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_default_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_kpadding_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_mnpadding_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_mnkpadding_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v1_default_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v1_kpadding_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v1_mnkpadding_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v2_default_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v2_kpadding_instances( + std::vector>>& + instances); + +void 
add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v2_mnkpadding_instances( + std::vector>>& + instances); +#endif + template && is_same_v && + is_same_v) + { + if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_default_instances( + op_ptrs); + add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_kpadding_instances( + op_ptrs); + add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_mnpadding_instances( + op_ptrs); + add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_mnkpadding_instances( + op_ptrs); + + add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v1_default_instances( + op_ptrs); + add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v1_kpadding_instances( + op_ptrs); + add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v1_mnkpadding_instances( + op_ptrs); + + add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v2_default_instances( + op_ptrs); + add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v2_kpadding_instances( + op_ptrs); + add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v2_mnkpadding_instances( + op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_default_instances( + op_ptrs); + add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_kpadding_instances( + op_ptrs); + add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_mnpadding_instances( + op_ptrs); + add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_mnkpadding_instances( + op_ptrs); + + add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v1_default_instances( + op_ptrs); + add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v1_kpadding_instances( + op_ptrs); + add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v1_mnkpadding_instances( + op_ptrs); + + 
add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v2_default_instances( + op_ptrs); + add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v2_kpadding_instances( + op_ptrs); + add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v2_mnkpadding_instances( + op_ptrs); + } + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_default_instances( + op_ptrs); + add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_kpadding_instances( + op_ptrs); + add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_mnpadding_instances( + op_ptrs); + add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_mnkpadding_instances( + op_ptrs); + + add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v1_default_instances( + op_ptrs); + add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v1_kpadding_instances( + op_ptrs); + add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v1_mnkpadding_instances( + op_ptrs); + + add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v2_default_instances( + op_ptrs); + add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v2_kpadding_instances( + op_ptrs); + add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v2_mnkpadding_instances( + op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_default_instances( + op_ptrs); + add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_kpadding_instances( + op_ptrs); + add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_mnpadding_instances( + op_ptrs); + add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_mnkpadding_instances( + op_ptrs); + + add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v1_default_instances( + op_ptrs); + 
add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v1_kpadding_instances( + op_ptrs); + add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v1_mnkpadding_instances( + op_ptrs); + + add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v2_default_instances( + op_ptrs); + add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v2_kpadding_instances( + op_ptrs); + add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v2_mnkpadding_instances( + op_ptrs); + } + } +#endif + return op_ptrs; } }; diff --git a/library/src/tensor_operation_instance/gpu/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/CMakeLists.txt index 6a1558a52..2c0b6c7b7 100644 --- a/library/src/tensor_operation_instance/gpu/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/CMakeLists.txt @@ -87,6 +87,12 @@ function(add_instance_library INSTANCE_NAME) list(REMOVE_ITEM ARGN "${source}") endif() endforeach() + foreach(source IN LISTS ARGN) + if(NOT INST_TARGETS MATCHES "gfx94" AND source MATCHES "gemm_xdl_universal_streamk" AND source MATCHES "_f8_") + message("removing gemm_universal_streamk_f8 instance ${source} ") + list(REMOVE_ITEM ARGN "${source}") + endif() + endforeach() endif() #only continue if there are some source files left on the list if(ARGN) diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/CMakeLists.txt index 2a930ab9a..08746a52d 100644 --- a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/CMakeLists.txt @@ -21,6 +21,49 @@ list(APPEND GEMM_UNIVERSAL_STREAMK_INSTANCES device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v1_mnkpadding_instance.cpp 
device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v2_default_instance.cpp device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v2_kpadding_instance.cpp - device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v2_mnkpadding_instance.cpp) + + device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v2_mnkpadding_instance.cpp + + device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_default_instance.cpp + device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_kpadding_instance.cpp + device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_mnpadding_instance.cpp + device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_mnkpadding_instance.cpp + device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v1_default_instance.cpp + device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v1_kpadding_instance.cpp + device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v1_mnkpadding_instance.cpp + device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v2_default_instance.cpp + device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v2_kpadding_instance.cpp + device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v2_mnkpadding_instance.cpp + device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_default_instance.cpp + 
device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_kpadding_instance.cpp + device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_mnpadding_instance.cpp + device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_mnkpadding_instance.cpp + device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v1_default_instance.cpp + device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v1_kpadding_instance.cpp + device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v1_mnkpadding_instance.cpp + device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v2_default_instance.cpp + device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v2_kpadding_instance.cpp + device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v2_mnkpadding_instance.cpp + + device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_default_instance.cpp + device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_kpadding_instance.cpp + device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_mnpadding_instance.cpp + device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_mnkpadding_instance.cpp + device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v1_default_instance.cpp + device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v1_kpadding_instance.cpp + 
device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v1_mnkpadding_instance.cpp + device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v2_default_instance.cpp + device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v2_kpadding_instance.cpp + device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v2_mnkpadding_instance.cpp + device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_default_instance.cpp + device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_kpadding_instance.cpp + device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_mnpadding_instance.cpp + device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_mnkpadding_instance.cpp + device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v1_default_instance.cpp + device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v1_kpadding_instance.cpp + device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v1_mnkpadding_instance.cpp + device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v2_default_instance.cpp + device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v2_kpadding_instance.cpp + device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v2_mnkpadding_instance.cpp) add_instance_library(device_gemm_universal_streamk_instance ${GEMM_UNIVERSAL_STREAMK_INSTANCES}) diff --git 
a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn.hpp new file mode 100644 index 000000000..d03002af5 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn.hpp @@ -0,0 +1,84 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_streamk_v3.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F8 = f8_t; +using F16 = half_t; +using F32 = float; + +using Row = tensor_layout::gemm::RowMajor; +using Col = tensor_layout::gemm::ColumnMajor; + +template +using S = Sequence; + +using PassThrough = element_wise::PassThrough; + +static constexpr auto GemmDefault = GemmSpecialization::Default; +static constexpr auto GemmKPadding = GemmSpecialization::KPadding; +static constexpr auto GemmMNPadding = GemmSpecialization::MNPadding; +static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding; + +static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave; +static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave; + +template +using device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_instances = std::tuple< +// clang-format off + #if defined(__gfx94__) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) + 
//#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle| A| B| C| GEMM| Block| MPer| NPer| KPer| AK1| BK1|MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| Block-wiseGemm| Block-wiseGemm| + //#########################| | | | Type| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise|Specialization| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| Pipeline| Pipeline| + //#########################| | | | | | | | | Operation| Operation| Operation| | | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| Scheduler| Version| + //#########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F8, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 8, 4, 32, 32, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F8, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 8, 4, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 
0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F8, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 8, 4, 32, 32, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F8, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 8, 4, 32, 32, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F8, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 224, 256, 64, 8, 4, 16, 16, 7, 8, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 16, 4, 0, 1, 2, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F8, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 8, 4, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F8, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 256, 32, 8, 4, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F8, F16, F32, F16, PassThrough, PassThrough, 
PassThrough, GemmSpec, 256, 128, 128, 64, 8, 4, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1> + #endif + // clang-format on + >; + +template +using device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_instances = std::tuple< +// clang-format off + #if defined(__gfx94__) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) + //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle| A| B| C| GEMM| Block| MPer| NPer| KPer| AK1| BK1|MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| Block-wiseGemm| Block-wiseGemm| + //#########################| | | | Type| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise|Specialization| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| Pipeline| Pipeline| + //#########################| | | | | | | | | Operation| Operation| Operation| | | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| Scheduler| Version| + //#########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + + // Latency friendly + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F8, F16, F32, F16, PassThrough, PassThrough, PassThrough, 
GemmSpec, 64, 16, 16, 256, 8, 4, 16, 16, 1, 1, S<32, 2, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<64, 1, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 16, 4, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F8, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 32, 256, 8, 4, 16, 16, 1, 1, S<32, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<64, 2, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 16, 4, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + // Memory friendly + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F8, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 256, 8, 4, 16, 16, 1, 1, S<32, 2, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<64, 1, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 16, 4, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F8, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 32, 256, 8, 4, 16, 16, 1, 1, S<32, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<64, 2, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 16, 4, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F8, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 64, 128, 8, 4, 16, 16, 1, 2, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<32, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 16, 4, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F8, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 64, 128, 8, 4, 32, 32, 1, 1, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<32, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 16, 4, 0, 1, 1, S<1, 16, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F8, F16, F32, 
F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 128, 64, 8, 4, 16, 16, 1, 4, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 16, 4, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F8, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 128, 64, 8, 4, 32, 32, 1, 2, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 16, 4, 0, 1, 1, S<1, 16, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F8, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 16, 256, 64, 8, 4, 16, 16, 1, 4, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 16, 4, 0, 1, 1, S<1, 16, 1, 16>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F8, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 32, 256, 64, 8, 4, 32, 32, 1, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 16, 4, 0, 1, 1, S<1, 16, 1, 16>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v2> + #endif + // clang-format on + >; +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_default_instance.cpp new file mode 100644 index 000000000..239d3a67f --- /dev/null +++ 
b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_default_instance.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_default_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_kpadding_instance.cpp new file mode 100644 index 000000000..9b65bbe9b --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_kpadding_instance.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_kpadding_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_mnkpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_mnkpadding_instance.cpp new file mode 100644 index 000000000..38cda9bf8 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_mnkpadding_instance.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_mnkpadding_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_mnpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_mnpadding_instance.cpp new file mode 100644 index 000000000..2afa4d5d6 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_mnpadding_instance.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_mnpadding_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v1_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v1_default_instance.cpp new file mode 100644 index 000000000..0f7dad4c5 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v1_default_instance.cpp @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v1_default_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v1_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v1_kpadding_instance.cpp new file mode 100644 index 000000000..596817694 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v1_kpadding_instance.cpp @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v1_kpadding_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v1_mnkpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v1_mnkpadding_instance.cpp new file mode 100644 index 000000000..c4423e457 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v1_mnkpadding_instance.cpp @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v1_mnkpadding_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v2_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v2_default_instance.cpp new file mode 100644 index 000000000..06f701f48 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v2_default_instance.cpp @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v2_default_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v2_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v2_kpadding_instance.cpp new file mode 100644 index 000000000..fda53c689 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v2_kpadding_instance.cpp @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v2_kpadding_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v2_mnkpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v2_mnkpadding_instance.cpp new file mode 100755 index 000000000..9272c74d7 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v2_mnkpadding_instance.cpp @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v2_mnkpadding_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn.hpp new file mode 100644 index 000000000..7736f38cb --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn.hpp @@ -0,0 +1,90 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_streamk_v3.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F8 = f8_t; +using F16 = half_t; +using F32 = float; + +using Row = tensor_layout::gemm::RowMajor; +using Col = tensor_layout::gemm::ColumnMajor; + +template +using S = Sequence; + +using PassThrough = element_wise::PassThrough; + +static constexpr auto GemmDefault = GemmSpecialization::Default; +static constexpr auto GemmKPadding = GemmSpecialization::KPadding; +static constexpr auto GemmMNPadding = GemmSpecialization::MNPadding; +static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding; + +static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave; +static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave; + +template +using device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_instances = std::tuple< +// clang-format off + #if defined(__gfx94__) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) + //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle| A| B| C| GEMM| Block| MPer| NPer| KPer| AK1| BK1|MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| Block-wiseGemm| Block-wiseGemm| + //#########################| | | | Type| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise|Specialization| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| 
DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| Pipeline| Pipeline| + //#########################| | | | | | | | | Operation| Operation| Operation| | | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| Scheduler| Version| + //#########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + + // Compute friendly + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F8, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 8, 16, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F8, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 224, 256, 64, 8, 16, 16, 16, 7, 8, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 2, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F8, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 8, 16, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F8, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 8, 16, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 
2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F8, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 8, 16, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1> + #endif + // clang-format on + >; + +template +using device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_instances = std::tuple< +// clang-format off + #if defined(__gfx94__) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) + //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle| A| B| C| GEMM| Block| MPer| NPer| KPer| AK1| BK1|MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| Block-wiseGemm| Block-wiseGemm| + //#########################| | | | Type| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise|Specialization| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| Pipeline| Pipeline| + //#########################| | | | | | | | | Operation| Operation| Operation| | | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| Scheduler| Version| + 
//#########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + + // Latency friendly + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F8, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 16, 128, 8, 16, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F8, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 128, 8, 16, 16, 16, 1, 1, S<16, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F8, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 128, 8, 16, 16, 16, 1, 1, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F8, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 32, 128, 8, 16, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + // Memory friendly + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F8, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 128, 32, 128, 8, 16, 32, 32, 2, 1, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F8, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 128, 16, 128, 8, 
16, 16, 16, 4, 1, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F8, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 64, 32, 128, 8, 16, 32, 32, 1, 1, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F8, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 64, 16, 128, 8, 16, 16, 16, 2, 1, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F8, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 16, 128, 8, 16, 16, 16, 1, 1, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F8, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 128, 8, 16, 16, 16, 1, 1, S<16, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F8, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 128, 8, 16, 16, 16, 1, 1, S<16, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F8, F16, F32, F16, PassThrough, PassThrough, PassThrough, 
GemmSpec, 128, 16, 32, 128, 8, 16, 16, 16, 1, 1, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F8, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 64, 128, 8, 16, 16, 16, 1, 2, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F8, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 64, 128, 8, 16, 32, 32, 1, 1, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F8, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 128, 128, 8, 16, 16, 16, 1, 4, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F8, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 128, 128, 8, 16, 32, 32, 1, 2, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F8, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 16, 256, 128, 8, 16, 16, 16, 1, 4, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 16>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F8, F16, F32, F16, 
PassThrough, PassThrough, PassThrough, GemmSpec, 256, 32, 256, 128, 8, 16, 32, 32, 1, 2, S<16, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 16>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v2> + #endif + // clang-format on + >; +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_default_instance.cpp new file mode 100644 index 000000000..4701d951a --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_default_instance.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_default_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_kpadding_instance.cpp new file mode 100644 index 000000000..cb57860da --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_kpadding_instance.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_kpadding_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_mnkpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_mnkpadding_instance.cpp new file mode 100644 index 000000000..67be95888 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_mnkpadding_instance.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_mnkpadding_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_mnpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_mnpadding_instance.cpp new file mode 100755 index 000000000..f9e46a5f2 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_mnpadding_instance.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_mnpadding_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v1_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v1_default_instance.cpp new file mode 100644 index 000000000..419fcebdd --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v1_default_instance.cpp @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v1_default_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v1_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v1_kpadding_instance.cpp new file mode 100644 index 000000000..7cbbc1813 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v1_kpadding_instance.cpp @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v1_kpadding_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v1_mnkpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v1_mnkpadding_instance.cpp new file mode 100644 index 000000000..e3ae25828 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v1_mnkpadding_instance.cpp @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v1_mnkpadding_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v2_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v2_default_instance.cpp new file mode 100644 index 000000000..0c6aa0a4e --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v2_default_instance.cpp @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v2_default_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v2_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v2_kpadding_instance.cpp new file mode 100644 index 000000000..75871166a --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v2_kpadding_instance.cpp @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v2_kpadding_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v2_mnkpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v2_mnkpadding_instance.cpp new file mode 100644 index 000000000..8c91bc877 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v2_mnkpadding_instance.cpp @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v2_mnkpadding_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn.hpp new file mode 100644 index 000000000..57b6ab3ae --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn.hpp @@ -0,0 +1,85 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_streamk_v3.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F8 = f8_t; +using F16 = half_t; +using F32 = float; + +using Row = tensor_layout::gemm::RowMajor; +using Col = tensor_layout::gemm::ColumnMajor; + +template +using S = Sequence; + +using PassThrough = element_wise::PassThrough; + +static constexpr auto GemmDefault = GemmSpecialization::Default; +static constexpr auto GemmKPadding = GemmSpecialization::KPadding; +static constexpr auto GemmMNPadding = GemmSpecialization::MNPadding; +static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding; + +static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave; +static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave; + +template +using device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_instances = std::tuple< +// clang-format off + #if defined(__gfx94__) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) + //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle| A| B| C| GEMM| Block| MPer| NPer| KPer| AK1| BK1|MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| Block-wiseGemm| Block-wiseGemm| + //#########################| | | | Type| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise|Specialization| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| 
DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| Pipeline| Pipeline| + //#########################| | | | | | | | | Operation| Operation| Operation| | | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| Scheduler| Verision| + //#########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F8, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 16, 4, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F8, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 128, 128, 16, 8, 32, 32, 4, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F8, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 192, 256, 64, 16, 8, 32, 32, 3, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F8, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 128, 16, 8, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 
1, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F8, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 16, 4, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F8, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 16, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>, + // We prefer following instance, however, existing compiler bug cause it failed to generate sanity code. + // DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F8, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 16, 4, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F8, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 16, 4, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1> + #endif + // clang-format on + >; + +template +using device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_instances = std::tuple< +// clang-format off + #if defined(__gfx94__) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) + //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle| A| B| C| GEMM| 
Block| MPer| NPer| KPer| AK1| BK1|MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| Block-wiseGemm| Block-wiseGemm| + //#########################| | | | Type| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise|Specialization| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| Pipeline| Pipeline| + //#########################| | | | | | | | | Operation| Operation| Operation| | | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| Scheduler| Verision| + //#########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + + // Latency friendly + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F8, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 16, 128, 16, 2, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<64, 2, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 2, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F8, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 128, 16, 4, 16, 16, 1, 1, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<32, 2, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + // Memory friendly + 
DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F8, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 32, 128, 16, 2, 32, 32, 2, 1, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<64, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 2, 0, 1, 1, S<1, 32, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F8, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 16, 128, 16, 2, 16, 16, 4, 1, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<64, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F8, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 128, 32, 128, 16, 4, 32, 32, 2, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<32, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F8, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 128, 16, 128, 16, 2, 16, 16, 4, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<64, 2, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 2, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F8, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 64, 32, 128, 16, 4, 32, 32, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<32, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F8, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 64, 16, 128, 16, 2, 16, 16, 2, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<64, 2, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 2, 0, 1, 1, S<1, 16, 1, 8>, 2, 
BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F8, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 16, 128, 16, 2, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<64, 2, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 2, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F8, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 128, 16, 4, 16, 16, 1, 1, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<32, 2, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2> + #endif + // clang-format on + >; +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_default_instance.cpp new file mode 100644 index 000000000..51a51d3c2 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_default_instance.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_default_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_kpadding_instance.cpp new file mode 100644 index 000000000..7613f5076 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_kpadding_instance.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_kpadding_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_mnkpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_mnkpadding_instance.cpp new file mode 100644 index 000000000..d015086f3 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_mnkpadding_instance.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_mnkpadding_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_mnpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_mnpadding_instance.cpp new file mode 100644 index 000000000..4cb327f4f --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_mnpadding_instance.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_mnpadding_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v1_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v1_default_instance.cpp new file mode 100644 index 000000000..19b49c1f3 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v1_default_instance.cpp @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v1_default_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v1_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v1_kpadding_instance.cpp new file mode 100644 index 000000000..9dd02b6e9 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v1_kpadding_instance.cpp @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v1_kpadding_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v1_mnkpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v1_mnkpadding_instance.cpp new file mode 100644 index 000000000..e54568eaa --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v1_mnkpadding_instance.cpp @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v1_mnkpadding_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v2_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v2_default_instance.cpp new file mode 100644 index 000000000..cd1e17648 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v2_default_instance.cpp @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v2_default_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v2_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v2_kpadding_instance.cpp new file mode 100644 index 000000000..7996c4441 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v2_kpadding_instance.cpp @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v2_kpadding_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v2_mnkpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v2_mnkpadding_instance.cpp new file mode 100755 index 000000000..c2544be5f --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v2_mnkpadding_instance.cpp @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v2_mnkpadding_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn.hpp new file mode 100644 index 000000000..14bd36d29 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn.hpp @@ -0,0 +1,90 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_streamk_v3.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F8 = f8_t; +using F16 = half_t; +using F32 = float; + +using Row = tensor_layout::gemm::RowMajor; +using Col = tensor_layout::gemm::ColumnMajor; + +template +using S = Sequence; + +using PassThrough = element_wise::PassThrough; + +static constexpr auto GemmDefault = GemmSpecialization::Default; +static constexpr auto GemmKPadding = GemmSpecialization::KPadding; +static constexpr auto GemmMNPadding = GemmSpecialization::MNPadding; +static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding; + +static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave; +static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave; + +template +using device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_instances = std::tuple< +// clang-format off + #if defined(__gfx94__) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) + //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle| A| B| C| GEMM| Block| MPer| NPer| KPer| AK1| BK1|MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| Block-wiseGemm| Block-wiseGemm| + //#########################| | | | Type| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise|Specialization| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| 
DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| Pipeline| Pipeline| + //#########################| | | | | | | | | Operation| Operation| Operation| | | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| Scheduler| Verision| + //#########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + + // Compute friendly + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F8, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 16, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F8, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 224, 64, 16, 8, 16, 16, 8, 7, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 2, 1, S<1, 64, 1, 4>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F8, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 16, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F8, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 16, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 
2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F8, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 16, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1> + #endif + // clang-format on + >; + +template +using device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_instances = std::tuple< +// clang-format off + #if defined(__gfx94__) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) + //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle| A| B| C| GEMM| Block| MPer| NPer| KPer| AK1| BK1|MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| Block-wiseGemm| Block-wiseGemm| + //#########################| | | | Type| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise|Specialization| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| Pipeline| Pipeline| + //#########################| | | | | | | | | Operation| Operation| Operation| | | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| Scheduler| Verision| + 
//#########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + + // Latency friendly + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F8, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 16, 128, 16, 8, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F8, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 128, 16, 8, 16, 16, 1, 1, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<16, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F8, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 128, 16, 8, 16, 16, 1, 1, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<16, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F8, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 32, 128, 16, 8, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + // Memory friendly + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F8, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 32, 128, 16, 8, 32, 32, 2, 1, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<16, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F8, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 16, 128, 16, 
8, 16, 16, 4, 1, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F8, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 128, 32, 128, 16, 8, 32, 32, 2, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F8, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 128, 16, 128, 16, 8, 16, 16, 4, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F8, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 64, 32, 128, 16, 8, 32, 32, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F8, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 64, 16, 128, 16, 8, 16, 16, 2, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F8, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 16, 128, 16, 8, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F8, F16, F16, F32, F16, PassThrough, PassThrough, 
PassThrough, GemmSpec, 64, 16, 16, 128, 16, 8, 16, 16, 1, 1, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<16, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F8, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 128, 16, 8, 16, 16, 1, 1, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<16, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F8, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 32, 128, 16, 8, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F8, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 64, 128, 16, 8, 16, 16, 1, 2, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F8, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 64, 128, 16, 8, 32, 32, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F8, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 128, 128, 16, 8, 16, 16, 1, 4, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F8, F16, F16, F32, 
F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 128, 128, 16, 8, 32, 32, 1, 2, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v2> + #endif + // clang-format on + >; +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_default_instance.cpp new file mode 100644 index 000000000..eefc77615 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_default_instance.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_default_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_kpadding_instance.cpp new file mode 100644 index 000000000..185874b24 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_kpadding_instance.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_kpadding_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_mnkpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_mnkpadding_instance.cpp new file mode 100644 index 000000000..a92181ccc --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_mnkpadding_instance.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_mnkpadding_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_mnpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_mnpadding_instance.cpp new file mode 100755 index 000000000..1551dba0f --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_mnpadding_instance.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_mnpadding_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v1_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v1_default_instance.cpp new file mode 100644 index 000000000..0f3e51db1 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v1_default_instance.cpp @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v1_default_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v1_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v1_kpadding_instance.cpp new file mode 100644 index 000000000..f87b8f670 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v1_kpadding_instance.cpp @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v1_kpadding_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v1_mnkpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v1_mnkpadding_instance.cpp new file mode 100644 index 000000000..0058a2ad6 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v1_mnkpadding_instance.cpp @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v1_mnkpadding_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v2_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v2_default_instance.cpp new file mode 100644 index 000000000..3a3bd5df9 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v2_default_instance.cpp @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v2_default_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v2_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v2_kpadding_instance.cpp new file mode 100644 index 000000000..fb50e2589 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v2_kpadding_instance.cpp @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v2_kpadding_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v2_mnkpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v2_mnkpadding_instance.cpp new file mode 100644 index 000000000..6413655b6 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v2_mnkpadding_instance.cpp @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v2_mnkpadding_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/CMakeLists.txt deleted file mode 100644 index 2a930ab9a..000000000 --- a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/CMakeLists.txt +++ /dev/null @@ -1,26 +0,0 @@ -# ONLY XDL_KERNELS -set(GEMM_UNIVERSAL_STREAMK_INSTANCES) - -list(APPEND GEMM_UNIVERSAL_STREAMK_INSTANCES - device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_comp_default_instance.cpp - device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_comp_kpadding_instance.cpp - device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_comp_mnpadding_instance.cpp - device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_comp_mnkpadding_instance.cpp - device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_v1_default_instance.cpp - device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_v1_kpadding_instance.cpp - device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_v1_mnkpadding_instance.cpp - 
device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_v2_default_instance.cpp - device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_v2_kpadding_instance.cpp - device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_v2_mnkpadding_instance.cpp - device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_default_instance.cpp - device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_kpadding_instance.cpp - device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_mnpadding_instance.cpp - device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_mnkpadding_instance.cpp - device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v1_default_instance.cpp - device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v1_kpadding_instance.cpp - device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v1_mnkpadding_instance.cpp - device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v2_default_instance.cpp - device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v2_kpadding_instance.cpp - device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v2_mnkpadding_instance.cpp) - -add_instance_library(device_gemm_universal_streamk_instance ${GEMM_UNIVERSAL_STREAMK_INSTANCES}) diff --git 
a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn.hpp deleted file mode 100644 index 6e8d5c798..000000000 --- a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn.hpp +++ /dev/null @@ -1,91 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" -#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" -#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_streamk_v3.hpp" - -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -using F16 = half_t; -using F32 = float; - -using Row = tensor_layout::gemm::RowMajor; -using Col = tensor_layout::gemm::ColumnMajor; - -template -using S = Sequence; - -using PassThrough = element_wise::PassThrough; - -static constexpr auto GemmDefault = GemmSpecialization::Default; -static constexpr auto GemmKPadding = GemmSpecialization::KPadding; -static constexpr auto GemmMNPadding = GemmSpecialization::MNPadding; -static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding; - -static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave; -static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave; - -template -using device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_comp_instances = std::tuple< - // clang-format off - //#########################| 
ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle| A| B| C| GEMM| Block| MPer| NPer| KPer| AK1| BK1|MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| Block-wiseGemm| Block-wiseGemm| - //#########################| | | | Type| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise|Specialization| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| Pipeline| Pipeline| - //#########################| | | | | | | | | Operation| Operation| Operation| | | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| Scheduler| Verision| - //#########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 8, 4, 32, 32, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 8, 4, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, 
BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 8, 4, 32, 32, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 8, 4, 32, 32, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 224, 256, 64, 8, 8, 16, 16, 7, 8, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, 0, 1, 2, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 8, 4, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 8, 4, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 
128, 256, 32, 8, 4, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 128, 32, 8, 4, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 8, 4, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1> - // clang-format on - >; - -template -using device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_instances = std::tuple< - // clang-format off - //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle| A| B| C| GEMM| Block| MPer| NPer| KPer| AK1| BK1|MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| Block-wiseGemm| Block-wiseGemm| - //#########################| | | | Type| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise|Specialization| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| 
ScalarPerVector| Pipeline| Pipeline| - //#########################| | | | | | | | | Operation| Operation| Operation| | | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| Scheduler| Verision| - //#########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - - // Latency friendly - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 16, 64, 8, 4, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 128, 8, 4, 16, 16, 1, 1, S<16, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 64, 8, 4, 16, 16, 1, 1, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 32, 64, 8, 4, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, - // Memory friendly - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 
256, 32, 64, 8, 2, 32, 32, 2, 1, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<32, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 16, 64, 8, 2, 16, 16, 4, 1, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<32, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, 0, 1, 1, S<1, 32, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 128, 32, 64, 8, 4, 32, 32, 2, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 128, 16, 64, 8, 4, 16, 16, 4, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 64, 32, 64, 8, 4, 32, 32, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 64, 16, 64, 8, 4, 16, 16, 2, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, 
PassThrough, GemmSpec, 128, 32, 16, 64, 8, 4, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 128, 8, 4, 16, 16, 1, 1, S<16, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 64, 8, 4, 16, 16, 1, 1, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 32, 64, 8, 4, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 64, 64, 8, 4, 16, 16, 1, 2, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 64, 64, 8, 4, 32, 32, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 16, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F16, F16, F32, F16, 
PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 128, 64, 8, 4, 16, 16, 1, 4, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 128, 64, 8, 4, 32, 32, 1, 2, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 16, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 16, 256, 64, 8, 4, 16, 16, 1, 4, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 16, 1, 16>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 32, 256, 64, 8, 4, 32, 32, 1, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 16, 1, 16>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v2> - // clang-format on - >; -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_comp_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_comp_default_instance.cpp deleted file mode 100644 index 6adcb8f4f..000000000 --- 
a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_comp_default_instance.cpp +++ /dev/null @@ -1,30 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. - -#include "device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -void add_device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_comp_default_instances( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_comp_instances{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_comp_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_comp_kpadding_instance.cpp deleted file mode 100644 index 631ae6872..000000000 --- a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_comp_kpadding_instance.cpp +++ /dev/null @@ -1,30 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
- -#include "device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -void add_device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_comp_kpadding_instances( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_comp_instances{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_comp_mnkpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_comp_mnkpadding_instance.cpp deleted file mode 100644 index 2c49773a6..000000000 --- a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_comp_mnkpadding_instance.cpp +++ /dev/null @@ -1,30 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
- -#include "device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -void add_device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_comp_mnkpadding_instances( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_comp_instances{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_comp_mnpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_comp_mnpadding_instance.cpp deleted file mode 100644 index 39d54fb88..000000000 --- a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_comp_mnpadding_instance.cpp +++ /dev/null @@ -1,30 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
- -#include "device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -void add_device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_comp_mnpadding_instances( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_comp_instances{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_v1_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_v1_default_instance.cpp deleted file mode 100644 index 8ee50d63c..000000000 --- a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_v1_default_instance.cpp +++ /dev/null @@ -1,31 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
- -#include "device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -void add_device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_v1_default_instances( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_instances{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_v1_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_v1_kpadding_instance.cpp deleted file mode 100644 index d31e0819a..000000000 --- a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_v1_kpadding_instance.cpp +++ /dev/null @@ -1,31 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
- -#include "device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -void add_device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_v1_kpadding_instances( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_instances{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_v1_mnkpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_v1_mnkpadding_instance.cpp deleted file mode 100644 index fe19f35e5..000000000 --- a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_v1_mnkpadding_instance.cpp +++ /dev/null @@ -1,31 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
- -#include "device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -void add_device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_v1_mnkpadding_instances( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_instances{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_v2_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_v2_default_instance.cpp deleted file mode 100644 index 6c1873b37..000000000 --- a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_v2_default_instance.cpp +++ /dev/null @@ -1,31 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
- -#include "device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -void add_device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_v2_default_instances( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_instances{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_v2_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_v2_kpadding_instance.cpp deleted file mode 100644 index ffd53f406..000000000 --- a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_v2_kpadding_instance.cpp +++ /dev/null @@ -1,31 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
- -#include "device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -void add_device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_v2_kpadding_instances( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_instances{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_v2_mnkpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_v2_mnkpadding_instance.cpp deleted file mode 100644 index 094b8f92f..000000000 --- a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_v2_mnkpadding_instance.cpp +++ /dev/null @@ -1,31 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
- -#include "device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -void add_device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_v2_mnkpadding_instances( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_instances{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn.hpp deleted file mode 100644 index e00c1733e..000000000 --- a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn.hpp +++ /dev/null @@ -1,98 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
- -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" -#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" -#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_streamk_v3.hpp" - -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -using F16 = half_t; -using F32 = float; - -using Row = tensor_layout::gemm::RowMajor; -using Col = tensor_layout::gemm::ColumnMajor; - -template -using S = Sequence; - -using PassThrough = element_wise::PassThrough; - -static constexpr auto GemmDefault = GemmSpecialization::Default; -static constexpr auto GemmKPadding = GemmSpecialization::KPadding; -static constexpr auto GemmMNPadding = GemmSpecialization::MNPadding; -static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding; - -static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave; -static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave; - -template -using device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_instances = std::tuple< - // clang-format off - //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle| A| B| C| GEMM| Block| MPer| NPer| KPer| AK1| BK1|MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| Block-wiseGemm| Block-wiseGemm| - //#########################| | | | Type| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise|Specialization| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| 
SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| Pipeline| Pipeline| - //#########################| | | | | | | | | Operation| Operation| Operation| | | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| Scheduler| Verision| - //#########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - - // Compute friendly - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 8, 8, 32, 32, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 8, 8, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 8, 8, 32, 32, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, 
BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 8, 8, 32, 32, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 8, 8, 16, 16, 8, 8, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 2, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, - // AGPR Spill - // DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 8, 8, 16, 16, 8, 8, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 2, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>, - // AGPR Spill when use permuted lds layout. so, use padding for these two. 
- DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 224, 256, 64, 8, 8, 16, 16, 7, 8, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 2, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 224, 64, 8, 8, 16, 16, 8, 7, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 2, 1, S<1, 64, 1, 4>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 8, 8, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 8, 8, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 
2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 8, 8, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1> - // clang-format on - >; - -template -using device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_instances = std::tuple< - // clang-format off - //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle| A| B| C| GEMM| Block| MPer| NPer| KPer| AK1| BK1|MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| Block-wiseGemm| Block-wiseGemm| - //#########################| | | | Type| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise|Specialization| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| Pipeline| Pipeline| - //#########################| | | | | | | | | Operation| Operation| Operation| | | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| Scheduler| Verision| - //#########################| | | | | | | | | | | | | | | | | | | | | 
| | | | | | | | | | | | | | | | | | | | | | - - // Latency friendly - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 16, 64, 8, 8, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 128, 8, 8, 16, 16, 1, 1, S<16, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 64, 8, 8, 16, 16, 1, 1, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 32, 64, 8, 8, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, - // Memory friendly - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 32, 64, 8, 8, 32, 32, 2, 1, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 16, 64, 8, 8, 16, 16, 4, 1, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 
0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 128, 32, 64, 8, 8, 32, 32, 2, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 128, 16, 64, 8, 8, 16, 16, 4, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 64, 32, 64, 8, 8, 32, 32, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 64, 16, 64, 8, 8, 16, 16, 2, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 16, 64, 8, 8, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 128, 8, 8, 16, 16, 1, 1, S<16, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 
S<16, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 64, 8, 8, 16, 16, 1, 1, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 32, 64, 8, 8, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 64, 64, 8, 8, 16, 16, 1, 2, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 64, 64, 8, 8, 32, 32, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 128, 64, 8, 8, 16, 16, 1, 4, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 128, 64, 8, 8, 32, 32, 1, 2, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 
2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 16, 256, 64, 8, 8, 16, 16, 1, 4, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 16>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 32, 256, 64, 8, 8, 32, 32, 1, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 16>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v2> - // clang-format on - >; -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_default_instance.cpp deleted file mode 100644 index 546f909b3..000000000 --- a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_default_instance.cpp +++ /dev/null @@ -1,30 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
- -#include "device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -void add_device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_default_instances( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_instances{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_kpadding_instance.cpp deleted file mode 100644 index d91de96be..000000000 --- a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_kpadding_instance.cpp +++ /dev/null @@ -1,30 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
- -#include "device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -void add_device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_kpadding_instances( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_instances{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_mnkpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_mnkpadding_instance.cpp deleted file mode 100644 index c70678b44..000000000 --- a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_mnkpadding_instance.cpp +++ /dev/null @@ -1,30 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
- -#include "device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -void add_device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_mnkpadding_instances( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_instances{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_mnpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_mnpadding_instance.cpp deleted file mode 100644 index 5410a0cc2..000000000 --- a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_mnpadding_instance.cpp +++ /dev/null @@ -1,30 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
- -#include "device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -void add_device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_mnpadding_instances( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_instances{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v1_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v1_default_instance.cpp deleted file mode 100644 index 4ae7329f9..000000000 --- a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v1_default_instance.cpp +++ /dev/null @@ -1,31 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
- -#include "device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -void add_device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v1_default_instances( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_instances{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v1_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v1_kpadding_instance.cpp deleted file mode 100644 index 4fc5458a9..000000000 --- a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v1_kpadding_instance.cpp +++ /dev/null @@ -1,31 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
- -#include "device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -void add_device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v1_kpadding_instances( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_instances{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v1_mnkpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v1_mnkpadding_instance.cpp deleted file mode 100644 index 7369f87a5..000000000 --- a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v1_mnkpadding_instance.cpp +++ /dev/null @@ -1,31 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
- -#include "device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -void add_device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v1_mnkpadding_instances( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_instances{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v2_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v2_default_instance.cpp deleted file mode 100644 index 45425a41a..000000000 --- a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v2_default_instance.cpp +++ /dev/null @@ -1,31 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
- -#include "device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -void add_device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v2_default_instances( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_instances{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v2_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v2_kpadding_instance.cpp deleted file mode 100644 index 3b5ac0366..000000000 --- a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v2_kpadding_instance.cpp +++ /dev/null @@ -1,31 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
- -#include "device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -void add_device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v2_kpadding_instances( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_instances{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v2_mnkpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v2_mnkpadding_instance.cpp deleted file mode 100644 index 53aa011a7..000000000 --- a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v2_mnkpadding_instance.cpp +++ /dev/null @@ -1,31 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
- -#include "device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -void add_device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v2_mnkpadding_instances( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_instances{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/modified_files.txt b/modified_files.txt new file mode 100755 index 000000000..34a42e3f3 --- /dev/null +++ b/modified_files.txt @@ -0,0 +1,10 @@ +example/01_gemm/gemm_xdl_fp8_streamk_v3.cpp +example/01_gemm/run_gemm_example_streamk_v2.inc +include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_streamk_v3.hpp +include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_streamk_v3.hpp +library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v2_mnkpadding_instance.cpp +library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_mnpadding_instance.cpp +library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v2_mnkpadding_instance.cpp +library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_mnpadding_instance.cpp +profiler/src/profile_gemm_universal_streamk.cpp +modified_files.txt diff --git a/profiler/src/profile_gemm_universal_streamk.cpp b/profiler/src/profile_gemm_universal_streamk.cpp old mode 100644 new mode 100755 index cd3f5787d..85f6c2577 --- 
a/profiler/src/profile_gemm_universal_streamk.cpp +++ b/profiler/src/profile_gemm_universal_streamk.cpp @@ -85,8 +85,10 @@ int profile_gemm_universal_streamk(int argc, char* argv[]) using F32 = float; using F16 = ck::half_t; - // using BF16 = ck::bhalf_t; - // using F8 = ck::f8_t; + +#if defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || defined(CK_USE_GFX94) + using F8 = ck::f8_t; +#endif using Row = ck::tensor_layout::gemm::RowMajor; using Col = ck::tensor_layout::gemm::ColumnMajor; @@ -145,6 +147,24 @@ int profile_gemm_universal_streamk(int argc, char* argv[]) { return profile(F16{}, F16{}, F32{}, F16{}, Row{}, Col{}, Row{}); } +#if defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || defined(CK_USE_GFX94) + else if(data_type == GemmDataType::F16_F8_F16 && layout == GemmMatrixLayout::MK_KN_MN) + { + return profile(F16{}, F8{}, F32{}, F16{}, Row{}, Row{}, Row{}); + } + else if(data_type == GemmDataType::F16_F8_F16 && layout == GemmMatrixLayout::MK_NK_MN) + { + return profile(F16{}, F8{}, F32{}, F16{}, Row{}, Col{}, Row{}); + } + else if(data_type == GemmDataType::F8_F16_F16 && layout == GemmMatrixLayout::MK_KN_MN) + { + return profile(F8{}, F16{}, F32{}, F16{}, Row{}, Row{}, Row{}); + } + else if(data_type == GemmDataType::F8_F16_F16 && layout == GemmMatrixLayout::MK_NK_MN) + { + return profile(F8{}, F16{}, F32{}, F16{}, Row{}, Col{}, Row{}); + } +#endif else { std::cout << "this data_type & layout is not implemented" << std::endl; -- GitLab From 4c7035ff08f17aa138a747b8ea00ccf47276d85c Mon Sep 17 00:00:00 2001 From: Illia Silin <98187287+illsilin@users.noreply.github.com> Date: Fri, 22 Nov 2024 08:30:01 -0800 Subject: [PATCH 070/153] fix path of ninjatracing (#1685) --- Dockerfile | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Dockerfile b/Dockerfile index b06726335..76e6f0ebe 100644 --- a/Dockerfile +++ b/Dockerfile @@ -87,17 +87,17 @@ RUN pip install --upgrade cmake==3.27.5 && \ git clone https://github.com/ccache/ccache.git && \ cd ccache && mkdir 
build && cd build && cmake .. && make install && \ #Install ninja build tracing tools + cd / && \ wget -qO /usr/local/bin/ninja.gz https://github.com/ninja-build/ninja/releases/latest/download/ninja-linux.zip && \ gunzip /usr/local/bin/ninja.gz && \ chmod a+x /usr/local/bin/ninja && \ git clone https://github.com/nico/ninjatracing.git && \ #Install latest cppcheck git clone https://github.com/danmar/cppcheck.git && \ - cd cppcheck && mkdir build && cd build && cmake .. && cmake --build . -WORKDIR / - + cd cppcheck && mkdir build && cd build && cmake .. && cmake --build . && \ + cd / && \ # Install an init system -RUN wget https://github.com/Yelp/dumb-init/releases/download/v1.2.0/dumb-init_1.2.0_amd64.deb && \ + wget https://github.com/Yelp/dumb-init/releases/download/v1.2.0/dumb-init_1.2.0_amd64.deb && \ dpkg -i dumb-init_*.deb && rm dumb-init_*.deb && \ # Install packages for processing the performance results pip3 install --upgrade pip && \ -- GitLab From ff92222f937b54955011d394f46130fc5002110c Mon Sep 17 00:00:00 2001 From: schung-amd Date: Fri, 22 Nov 2024 17:51:35 -0500 Subject: [PATCH 071/153] [CK_TILE] MakeKargs overloads for backward compatibility (#1681) * Add overloads for MakeKargs Overload MakeKargs to accept std::tuple and std::tuple to preserve functionality of code currently passing in list initializers or tuples. * Add overloads for MakeKargs Overload MakeKargs to accept std::tuple and std::tuple to preserve functionality of code currently passing in list initializers or tuples. 
* Re-format files using ck_tile remod.py --------- Co-authored-by: Po Yen Chen --- .../ops/fmha/kernel/fmha_bwd_kernel.hpp | 444 ++++++++++++++++++ .../ops/fmha/kernel/fmha_fwd_kernel.hpp | 338 +++++++++++++ 2 files changed, 782 insertions(+) diff --git a/include/ck_tile/ops/fmha/kernel/fmha_bwd_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_bwd_kernel.hpp index c5858a20f..ccf15ee60 100644 --- a/include/ck_tile/ops/fmha/kernel/fmha_bwd_kernel.hpp +++ b/include/ck_tile/ops/fmha/kernel/fmha_bwd_kernel.hpp @@ -470,6 +470,248 @@ struct FmhaBwdDQDKDVKernel return kargs; } + // std::variant can't take in a list initializer, overload for backward compatibility + template + CK_TILE_HOST static constexpr std::enable_if_t + MakeKargs(const void* q_ptr, + const void* k_ptr, + const void* v_ptr, + const void* bias_ptr, + const void* lse_ptr, + const void* do_ptr, + const void* d_ptr, + void* rand_val_ptr, + void* dk_ptr, + void* dv_ptr, + void* dbias_ptr, + void* dq_acc_ptr, + ck_tile::index_t seqlen_q, + ck_tile::index_t seqlen_k, + ck_tile::index_t hdim_q, + ck_tile::index_t hdim_v, + ck_tile::index_t num_head_q, + ck_tile::index_t nhead_ratio_qk, + float scale, + ck_tile::index_t stride_q, + ck_tile::index_t stride_k, + ck_tile::index_t stride_v, + ck_tile::index_t stride_bias, + ck_tile::index_t stride_randval, + ck_tile::index_t stride_do, + ck_tile::index_t stride_dq_acc, + ck_tile::index_t stride_dk, + ck_tile::index_t stride_dv, + ck_tile::index_t stride_dbias, + ck_tile::index_t nhead_stride_q, + ck_tile::index_t nhead_stride_k, + ck_tile::index_t nhead_stride_v, + ck_tile::index_t nhead_stride_bias, + ck_tile::index_t nhead_stride_randval, + ck_tile::index_t nhead_stride_do, + ck_tile::index_t nhead_stride_lsed, + ck_tile::index_t nhead_stride_dq_acc, + ck_tile::index_t nhead_stride_dk, + ck_tile::index_t nhead_stride_dv, + ck_tile::index_t nhead_stride_dbias, + ck_tile::index_t batch_stride_q, + ck_tile::index_t batch_stride_k, + ck_tile::index_t batch_stride_v, 
+ ck_tile::index_t batch_stride_bias, + ck_tile::index_t batch_stride_randval, + ck_tile::index_t batch_stride_do, + ck_tile::index_t batch_stride_lsed, + ck_tile::index_t batch_stride_dq_acc, + ck_tile::index_t batch_stride_dk, + ck_tile::index_t batch_stride_dv, + ck_tile::index_t batch_stride_dbias, + ck_tile::index_t split_stride_dq_acc, + ck_tile::index_t window_size_left, + ck_tile::index_t window_size_right, + ck_tile::index_t mask_type, + float p_drop, + const std::tuple& drop_seed_offset) + { + return MakeKargs( + q_ptr, + k_ptr, + v_ptr, + bias_ptr, + lse_ptr, + do_ptr, + d_ptr, + rand_val_ptr, + dk_ptr, + dv_ptr, + dbias_ptr, + dq_acc_ptr, + seqlen_q, + seqlen_k, + hdim_q, + hdim_v, + num_head_q, + nhead_ratio_qk, + scale, + stride_q, + stride_k, + stride_v, + stride_bias, + stride_randval, + stride_do, + stride_dq_acc, + stride_dk, + stride_dv, + stride_dbias, + nhead_stride_q, + nhead_stride_k, + nhead_stride_v, + nhead_stride_bias, + nhead_stride_randval, + nhead_stride_do, + nhead_stride_lsed, + nhead_stride_dq_acc, + nhead_stride_dk, + nhead_stride_dv, + nhead_stride_dbias, + batch_stride_q, + batch_stride_k, + batch_stride_v, + batch_stride_bias, + batch_stride_randval, + batch_stride_do, + batch_stride_lsed, + batch_stride_dq_acc, + batch_stride_dk, + batch_stride_dv, + batch_stride_dbias, + split_stride_dq_acc, + window_size_left, + window_size_right, + mask_type, + p_drop, + std::make_pair(std::get<0>(drop_seed_offset), std::get<1>(drop_seed_offset))); + } + + // std::variant can't take in a list initializer, overload for backward compatibility + template + CK_TILE_HOST static constexpr std::enable_if_t + MakeKargs(const void* q_ptr, + const void* k_ptr, + const void* v_ptr, + const void* bias_ptr, + const void* lse_ptr, + const void* do_ptr, + const void* d_ptr, + void* rand_val_ptr, + void* dk_ptr, + void* dv_ptr, + void* dbias_ptr, + void* dq_acc_ptr, + ck_tile::index_t seqlen_q, + ck_tile::index_t seqlen_k, + ck_tile::index_t hdim_q, + 
ck_tile::index_t hdim_v, + ck_tile::index_t num_head_q, + ck_tile::index_t nhead_ratio_qk, + float scale, + ck_tile::index_t stride_q, + ck_tile::index_t stride_k, + ck_tile::index_t stride_v, + ck_tile::index_t stride_bias, + ck_tile::index_t stride_randval, + ck_tile::index_t stride_do, + ck_tile::index_t stride_dq_acc, + ck_tile::index_t stride_dk, + ck_tile::index_t stride_dv, + ck_tile::index_t stride_dbias, + ck_tile::index_t nhead_stride_q, + ck_tile::index_t nhead_stride_k, + ck_tile::index_t nhead_stride_v, + ck_tile::index_t nhead_stride_bias, + ck_tile::index_t nhead_stride_randval, + ck_tile::index_t nhead_stride_do, + ck_tile::index_t nhead_stride_lsed, + ck_tile::index_t nhead_stride_dq_acc, + ck_tile::index_t nhead_stride_dk, + ck_tile::index_t nhead_stride_dv, + ck_tile::index_t nhead_stride_dbias, + ck_tile::index_t batch_stride_q, + ck_tile::index_t batch_stride_k, + ck_tile::index_t batch_stride_v, + ck_tile::index_t batch_stride_bias, + ck_tile::index_t batch_stride_randval, + ck_tile::index_t batch_stride_do, + ck_tile::index_t batch_stride_lsed, + ck_tile::index_t batch_stride_dq_acc, + ck_tile::index_t batch_stride_dk, + ck_tile::index_t batch_stride_dv, + ck_tile::index_t batch_stride_dbias, + ck_tile::index_t split_stride_dq_acc, + ck_tile::index_t window_size_left, + ck_tile::index_t window_size_right, + ck_tile::index_t mask_type, + float p_drop, + const std::tuple& drop_seed_offset) + { + return MakeKargs( + q_ptr, + k_ptr, + v_ptr, + bias_ptr, + lse_ptr, + do_ptr, + d_ptr, + rand_val_ptr, + dk_ptr, + dv_ptr, + dbias_ptr, + dq_acc_ptr, + seqlen_q, + seqlen_k, + hdim_q, + hdim_v, + num_head_q, + nhead_ratio_qk, + scale, + stride_q, + stride_k, + stride_v, + stride_bias, + stride_randval, + stride_do, + stride_dq_acc, + stride_dk, + stride_dv, + stride_dbias, + nhead_stride_q, + nhead_stride_k, + nhead_stride_v, + nhead_stride_bias, + nhead_stride_randval, + nhead_stride_do, + nhead_stride_lsed, + nhead_stride_dq_acc, + nhead_stride_dk, + 
nhead_stride_dv, + nhead_stride_dbias, + batch_stride_q, + batch_stride_k, + batch_stride_v, + batch_stride_bias, + batch_stride_randval, + batch_stride_do, + batch_stride_lsed, + batch_stride_dq_acc, + batch_stride_dk, + batch_stride_dv, + batch_stride_dbias, + split_stride_dq_acc, + window_size_left, + window_size_right, + mask_type, + p_drop, + std::make_pair(std::get<0>(drop_seed_offset), std::get<1>(drop_seed_offset))); + } + template CK_TILE_HOST static constexpr std::enable_if_t MakeKargs(const void* q_ptr, @@ -616,6 +858,208 @@ struct FmhaBwdDQDKDVKernel return kargs; } + // std::variant can't take in a list initializer, overload for backward compatibility + template + CK_TILE_HOST static constexpr std::enable_if_t + MakeKargs(const void* q_ptr, + const void* k_ptr, + const void* v_ptr, + const void* bias_ptr, + const void* lse_ptr, + const void* do_ptr, + const void* d_ptr, + void* rand_val_ptr, + void* dk_ptr, + void* dv_ptr, + void* dbias_ptr, + void* dq_acc_ptr, + const void* seqstart_q_ptr, + const void* seqstart_k_ptr, + const void* seqlen_k_ptr, + ck_tile::index_t hdim_q, + ck_tile::index_t hdim_v, + ck_tile::index_t num_head_q, + ck_tile::index_t nhead_ratio_qk, + float scale, + ck_tile::index_t stride_q, + ck_tile::index_t stride_k, + ck_tile::index_t stride_v, + ck_tile::index_t stride_bias, + ck_tile::index_t stride_randval, + ck_tile::index_t stride_do, + ck_tile::index_t stride_dq_acc, + ck_tile::index_t stride_dk, + ck_tile::index_t stride_dv, + ck_tile::index_t stride_dbias, + ck_tile::index_t nhead_stride_q, + ck_tile::index_t nhead_stride_k, + ck_tile::index_t nhead_stride_v, + ck_tile::index_t nhead_stride_bias, + ck_tile::index_t nhead_stride_randval, + ck_tile::index_t nhead_stride_do, + ck_tile::index_t nhead_stride_lsed, + ck_tile::index_t nhead_stride_dq_acc, + ck_tile::index_t nhead_stride_dk, + ck_tile::index_t nhead_stride_dv, + ck_tile::index_t nhead_stride_dbias, + ck_tile::index_t split_stride_dq_acc, + ck_tile::index_t 
window_size_left, + ck_tile::index_t window_size_right, + ck_tile::index_t mask_type, + float p_drop, + const std::tuple& drop_seed_offset) + { + return MakeKargs( + q_ptr, + k_ptr, + v_ptr, + bias_ptr, + lse_ptr, + do_ptr, + d_ptr, + rand_val_ptr, + dk_ptr, + dv_ptr, + dbias_ptr, + dq_acc_ptr, + seqstart_q_ptr, + seqstart_k_ptr, + seqlen_k_ptr, + hdim_q, + hdim_v, + num_head_q, + nhead_ratio_qk, + scale, + stride_q, + stride_k, + stride_v, + stride_bias, + stride_randval, + stride_do, + stride_dq_acc, + stride_dk, + stride_dv, + stride_dbias, + nhead_stride_q, + nhead_stride_k, + nhead_stride_v, + nhead_stride_bias, + nhead_stride_randval, + nhead_stride_do, + nhead_stride_lsed, + nhead_stride_dq_acc, + nhead_stride_dk, + nhead_stride_dv, + nhead_stride_dbias, + split_stride_dq_acc, + window_size_left, + window_size_right, + mask_type, + p_drop, + std::make_pair(std::get<0>(drop_seed_offset), std::get<1>(drop_seed_offset))); + } + + // std::variant can't take in a list initializer, overload for backward compatibility + template + CK_TILE_HOST static constexpr std::enable_if_t + MakeKargs(const void* q_ptr, + const void* k_ptr, + const void* v_ptr, + const void* bias_ptr, + const void* lse_ptr, + const void* do_ptr, + const void* d_ptr, + void* rand_val_ptr, + void* dk_ptr, + void* dv_ptr, + void* dbias_ptr, + void* dq_acc_ptr, + const void* seqstart_q_ptr, + const void* seqstart_k_ptr, + const void* seqlen_k_ptr, + ck_tile::index_t hdim_q, + ck_tile::index_t hdim_v, + ck_tile::index_t num_head_q, + ck_tile::index_t nhead_ratio_qk, + float scale, + ck_tile::index_t stride_q, + ck_tile::index_t stride_k, + ck_tile::index_t stride_v, + ck_tile::index_t stride_bias, + ck_tile::index_t stride_randval, + ck_tile::index_t stride_do, + ck_tile::index_t stride_dq_acc, + ck_tile::index_t stride_dk, + ck_tile::index_t stride_dv, + ck_tile::index_t stride_dbias, + ck_tile::index_t nhead_stride_q, + ck_tile::index_t nhead_stride_k, + ck_tile::index_t nhead_stride_v, + 
ck_tile::index_t nhead_stride_bias, + ck_tile::index_t nhead_stride_randval, + ck_tile::index_t nhead_stride_do, + ck_tile::index_t nhead_stride_lsed, + ck_tile::index_t nhead_stride_dq_acc, + ck_tile::index_t nhead_stride_dk, + ck_tile::index_t nhead_stride_dv, + ck_tile::index_t nhead_stride_dbias, + ck_tile::index_t split_stride_dq_acc, + ck_tile::index_t window_size_left, + ck_tile::index_t window_size_right, + ck_tile::index_t mask_type, + float p_drop, + const std::tuple& drop_seed_offset) + { + return MakeKargs( + q_ptr, + k_ptr, + v_ptr, + bias_ptr, + lse_ptr, + do_ptr, + d_ptr, + rand_val_ptr, + dk_ptr, + dv_ptr, + dbias_ptr, + dq_acc_ptr, + seqstart_q_ptr, + seqstart_k_ptr, + seqlen_k_ptr, + hdim_q, + hdim_v, + num_head_q, + nhead_ratio_qk, + scale, + stride_q, + stride_k, + stride_v, + stride_bias, + stride_randval, + stride_do, + stride_dq_acc, + stride_dk, + stride_dv, + stride_dbias, + nhead_stride_q, + nhead_stride_k, + nhead_stride_v, + nhead_stride_bias, + nhead_stride_randval, + nhead_stride_do, + nhead_stride_lsed, + nhead_stride_dq_acc, + nhead_stride_dk, + nhead_stride_dv, + nhead_stride_dbias, + split_stride_dq_acc, + window_size_left, + window_size_right, + mask_type, + p_drop, + std::make_pair(std::get<0>(drop_seed_offset), std::get<1>(drop_seed_offset))); + } + CK_TILE_HOST static constexpr auto GridSize(ck_tile::index_t batch_size_, ck_tile::index_t nhead_, ck_tile::index_t seqlen_k_) { diff --git a/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp index e0c145fde..4443a4503 100644 --- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp +++ b/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp @@ -399,6 +399,186 @@ struct FmhaFwdKernel return kargs; } + // std::variant can't take in a list initializer, overload for backward compatibility + template + __host__ static constexpr std::enable_if_t + MakeKargs(const void* q_ptr, + const void* k_ptr, + const void* v_ptr, + const void* 
bias_ptr, + void* rand_val_ptr, + void* lse_ptr, + void* o_ptr, + ck_tile::index_t seqlen_q, + ck_tile::index_t seqlen_k, + ck_tile::index_t hdim_q, + ck_tile::index_t hdim_v, + ck_tile::index_t num_head_q, + ck_tile::index_t nhead_ratio_qk, + float scale_s, + float scale_p, + float scale_o, + ck_tile::index_t stride_q, + ck_tile::index_t stride_k, + ck_tile::index_t stride_v, + ck_tile::index_t stride_bias, + ck_tile::index_t stride_randval, + ck_tile::index_t stride_o, + ck_tile::index_t nhead_stride_q, + ck_tile::index_t nhead_stride_k, + ck_tile::index_t nhead_stride_v, + ck_tile::index_t nhead_stride_bias, + ck_tile::index_t nhead_stride_randval, + ck_tile::index_t nhead_stride_lse, + ck_tile::index_t nhead_stride_o, + ck_tile::index_t batch_stride_q, + ck_tile::index_t batch_stride_k, + ck_tile::index_t batch_stride_v, + ck_tile::index_t batch_stride_bias, + ck_tile::index_t batch_stride_randval, + ck_tile::index_t batch_stride_lse, + ck_tile::index_t batch_stride_o, + ck_tile::index_t window_size_left, + ck_tile::index_t window_size_right, + ck_tile::index_t mask_type, + float p_drop, + bool s_randval, + const std::tuple& drop_seed_offset) + { + MakeKargs(q_ptr, + k_ptr, + v_ptr, + bias_ptr, + rand_val_ptr, + lse_ptr, + o_ptr, + seqlen_q, + seqlen_k, + hdim_q, + hdim_v, + num_head_q, + nhead_ratio_qk, + scale_s, + scale_p, + scale_o, + stride_q, + stride_k, + stride_v, + stride_bias, + stride_randval, + stride_o, + nhead_stride_q, + nhead_stride_k, + nhead_stride_v, + nhead_stride_bias, + nhead_stride_randval, + nhead_stride_lse, + nhead_stride_o, + batch_stride_q, + batch_stride_k, + batch_stride_v, + batch_stride_bias, + batch_stride_randval, + batch_stride_lse, + batch_stride_o, + window_size_left, + window_size_right, + mask_type, + p_drop, + s_randval, + std::make_pair(std::get<0>(drop_seed_offset), std::get<1>(drop_seed_offset))); + } + + // std::variant can't take in a list initializer, overload for backward compatibility + template + __host__ static 
constexpr std::enable_if_t + MakeKargs(const void* q_ptr, + const void* k_ptr, + const void* v_ptr, + const void* bias_ptr, + void* rand_val_ptr, + void* lse_ptr, + void* o_ptr, + ck_tile::index_t seqlen_q, + ck_tile::index_t seqlen_k, + ck_tile::index_t hdim_q, + ck_tile::index_t hdim_v, + ck_tile::index_t num_head_q, + ck_tile::index_t nhead_ratio_qk, + float scale_s, + float scale_p, + float scale_o, + ck_tile::index_t stride_q, + ck_tile::index_t stride_k, + ck_tile::index_t stride_v, + ck_tile::index_t stride_bias, + ck_tile::index_t stride_randval, + ck_tile::index_t stride_o, + ck_tile::index_t nhead_stride_q, + ck_tile::index_t nhead_stride_k, + ck_tile::index_t nhead_stride_v, + ck_tile::index_t nhead_stride_bias, + ck_tile::index_t nhead_stride_randval, + ck_tile::index_t nhead_stride_lse, + ck_tile::index_t nhead_stride_o, + ck_tile::index_t batch_stride_q, + ck_tile::index_t batch_stride_k, + ck_tile::index_t batch_stride_v, + ck_tile::index_t batch_stride_bias, + ck_tile::index_t batch_stride_randval, + ck_tile::index_t batch_stride_lse, + ck_tile::index_t batch_stride_o, + ck_tile::index_t window_size_left, + ck_tile::index_t window_size_right, + ck_tile::index_t mask_type, + float p_drop, + bool s_randval, + const std::tuple& drop_seed_offset) + { + MakeKargs(q_ptr, + k_ptr, + v_ptr, + bias_ptr, + rand_val_ptr, + lse_ptr, + o_ptr, + seqlen_q, + seqlen_k, + hdim_q, + hdim_v, + num_head_q, + nhead_ratio_qk, + scale_s, + scale_p, + scale_o, + stride_q, + stride_k, + stride_v, + stride_bias, + stride_randval, + stride_o, + nhead_stride_q, + nhead_stride_k, + nhead_stride_v, + nhead_stride_bias, + nhead_stride_randval, + nhead_stride_lse, + nhead_stride_o, + batch_stride_q, + batch_stride_k, + batch_stride_v, + batch_stride_bias, + batch_stride_randval, + batch_stride_lse, + batch_stride_o, + window_size_left, + window_size_right, + mask_type, + p_drop, + s_randval, + std::make_pair(std::get<0>(drop_seed_offset), std::get<1>(drop_seed_offset))); + } + 
template __host__ static constexpr std::enable_if_t MakeKargs(const void* q_ptr, @@ -522,6 +702,164 @@ struct FmhaFwdKernel return kargs; } + // std::variant can't take in a list initializer, overload for backward compatibility + template + __host__ static constexpr std::enable_if_t + MakeKargs(const void* q_ptr, + const void* k_ptr, + const void* v_ptr, + const void* bias_ptr, + void* rand_val_ptr, + void* lse_ptr, + void* o_ptr, + const void* seqstart_q_ptr, + const void* seqstart_k_ptr, + const void* seqlen_k_ptr, + ck_tile::index_t hdim_q, + ck_tile::index_t hdim_v, + ck_tile::index_t num_head_q, + ck_tile::index_t nhead_ratio_qk, + float scale_s, + float scale_p, + float scale_o, + ck_tile::index_t stride_q, + ck_tile::index_t stride_k, + ck_tile::index_t stride_v, + ck_tile::index_t stride_bias, + ck_tile::index_t stride_randval, + ck_tile::index_t stride_o, + ck_tile::index_t nhead_stride_q, + ck_tile::index_t nhead_stride_k, + ck_tile::index_t nhead_stride_v, + ck_tile::index_t nhead_stride_bias, + ck_tile::index_t nhead_stride_randval, + ck_tile::index_t nhead_stride_lse, + ck_tile::index_t nhead_stride_o, + ck_tile::index_t window_size_left, + ck_tile::index_t window_size_right, + ck_tile::index_t mask_type, + float p_drop, + bool s_randval, + const std::tuple& drop_seed_offset) + { + return MakeKargs( + q_ptr, + k_ptr, + v_ptr, + bias_ptr, + rand_val_ptr, + lse_ptr, + o_ptr, + seqstart_q_ptr, + seqstart_k_ptr, + seqlen_k_ptr, + hdim_q, + hdim_v, + num_head_q, + nhead_ratio_qk, + scale_s, + scale_p, + scale_o, + stride_q, + stride_k, + stride_v, + stride_bias, + stride_randval, + stride_o, + nhead_stride_q, + nhead_stride_k, + nhead_stride_v, + nhead_stride_bias, + nhead_stride_randval, + nhead_stride_lse, + nhead_stride_o, + window_size_left, + window_size_right, + mask_type, + p_drop, + s_randval, + std::make_pair(std::get<0>(drop_seed_offset), std::get<1>(drop_seed_offset))); + } + + // std::variant can't take in a list initializer, overload for 
backward compatibility + template + __host__ static constexpr std::enable_if_t + MakeKargs(const void* q_ptr, + const void* k_ptr, + const void* v_ptr, + const void* bias_ptr, + void* rand_val_ptr, + void* lse_ptr, + void* o_ptr, + const void* seqstart_q_ptr, + const void* seqstart_k_ptr, + const void* seqlen_k_ptr, + ck_tile::index_t hdim_q, + ck_tile::index_t hdim_v, + ck_tile::index_t num_head_q, + ck_tile::index_t nhead_ratio_qk, + float scale_s, + float scale_p, + float scale_o, + ck_tile::index_t stride_q, + ck_tile::index_t stride_k, + ck_tile::index_t stride_v, + ck_tile::index_t stride_bias, + ck_tile::index_t stride_randval, + ck_tile::index_t stride_o, + ck_tile::index_t nhead_stride_q, + ck_tile::index_t nhead_stride_k, + ck_tile::index_t nhead_stride_v, + ck_tile::index_t nhead_stride_bias, + ck_tile::index_t nhead_stride_randval, + ck_tile::index_t nhead_stride_lse, + ck_tile::index_t nhead_stride_o, + ck_tile::index_t window_size_left, + ck_tile::index_t window_size_right, + ck_tile::index_t mask_type, + float p_drop, + bool s_randval, + const std::tuple& drop_seed_offset) + { + return MakeKargs( + q_ptr, + k_ptr, + v_ptr, + bias_ptr, + rand_val_ptr, + lse_ptr, + o_ptr, + seqstart_q_ptr, + seqstart_k_ptr, + seqlen_k_ptr, + hdim_q, + hdim_v, + num_head_q, + nhead_ratio_qk, + scale_s, + scale_p, + scale_o, + stride_q, + stride_k, + stride_v, + stride_bias, + stride_randval, + stride_o, + nhead_stride_q, + nhead_stride_k, + nhead_stride_v, + nhead_stride_bias, + nhead_stride_randval, + nhead_stride_lse, + nhead_stride_o, + window_size_left, + window_size_right, + mask_type, + p_drop, + s_randval, + std::make_pair(std::get<0>(drop_seed_offset), std::get<1>(drop_seed_offset))); + } + __host__ static constexpr auto GridSize(ck_tile::index_t batch_size_, ck_tile::index_t nhead_, ck_tile::index_t seqlen_q_, -- GitLab From a420b3b34d2ad3e897aec824288182cf1e442dd6 Mon Sep 17 00:00:00 2001 From: Illia Silin <98187287+illsilin@users.noreply.github.com> Date: 
Fri, 22 Nov 2024 16:30:12 -0800 Subject: [PATCH 072/153] add Andriy to the code owners (#1687) --- .github/CODEOWNERS | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 5340be274..d7a6b1778 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -1,8 +1,8 @@ -* @junliume @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk +* @junliume @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca # Documentation files -docs/ @ROCm/rocm-documentation @junliume @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk -*.md @ROCm/rocm-documentation @junliume @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk -*.rst @ROCm/rocm-documentation @junliume @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk -.readthedocs.yaml @ROCm/rocm-documentation @junliume @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk +docs/ @ROCm/rocm-documentation @junliume @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca +*.md @ROCm/rocm-documentation @junliume @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca +*.rst @ROCm/rocm-documentation @junliume @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca +.readthedocs.yaml @ROCm/rocm-documentation @junliume @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca # Header directory for Doxygen documentation -library/include/ @ROCm/rocm-documentation @junliume @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk +library/include/ @ROCm/rocm-documentation @junliume @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca -- GitLab From 19d4b790399e479abd66d6555265fd7cd6389931 Mon Sep 17 00:00:00 2001 From: Illia Silin <98187287+illsilin@users.noreply.github.com> Date: Fri, 22 Nov 2024 17:16:08 -0800 Subject: [PATCH 
073/153] add --squash flag when building dockers (#1686) --- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index b79b2045b..2f790d8e5 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -132,7 +132,7 @@ def buildDocker(install_prefix){ checkout scm def image_name = getDockerImageName() echo "Building Docker for ${image_name}" - def dockerArgs = "--build-arg BUILDKIT_INLINE_CACHE=1 --build-arg PREFIX=${install_prefix} --build-arg CK_SCCACHE='${env.CK_SCCACHE}' --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' --build-arg DISABLE_CACHE='git rev-parse ${params.COMPILER_VERSION}' " + def dockerArgs = "--squash --build-arg BUILDKIT_INLINE_CACHE=1 --build-arg PREFIX=${install_prefix} --build-arg CK_SCCACHE='${env.CK_SCCACHE}' --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' --build-arg DISABLE_CACHE='git rev-parse ${params.COMPILER_VERSION}' " if(params.COMPILER_VERSION == "amd-staging" || params.COMPILER_VERSION == "amd-mainline-open" || params.COMPILER_COMMIT != ""){ dockerArgs = dockerArgs + " --no-cache " } -- GitLab From ce2bdf42a9c7d78e60d16cfb00581c83a0bfc49c Mon Sep 17 00:00:00 2001 From: Qianfeng Date: Mon, 25 Nov 2024 12:31:38 +0800 Subject: [PATCH 074/153] Change in fwd-splitkv kernel to support num_splits=1 case (#1690) * Change in fwd-splitkv kernel to support num_splits=1 case * Update in codegen fwd-splitkv to make num_splits > 1 cases pass * Specify instance traits in dispatch * Fix link error for fp8 kernels --------- Co-authored-by: Po Yen Chen --- .../01_fmha/codegen/ops/fmha_fwd_splitkv.py | 45 +++++++++++-------- .../fmha/kernel/fmha_fwd_splitkv_kernel.hpp | 19 +++++--- ...ock_fmha_fwd_splitkv_pipeline_qr_ks_vs.hpp | 3 +- .../ops/fmha/pipeline/tile_fmha_traits.hpp | 2 +- 4 files 
changed, 42 insertions(+), 27 deletions(-) diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py index d1da95156..1c40cf6f3 100644 --- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py +++ b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py @@ -247,12 +247,22 @@ float fmha_fwd_splitkv(fmha_fwd_splitkv_traits t, fmha_fwd_splitkv_args a, const }} """ -FMHA_FWD_SPLITKV_API_INNER_DISPATCH=""" {F_if}((t.is_group_mode == {F_mode}) && (t.is_v_rowmajor == {F_vlayout}) && ({F_mask_check}) && (t.bias_type == {F_bias_check}) && (t.has_lse == {F_lse}) && (t.do_fp8_static_quant == {F_squant}) && +FMHA_FWD_SPLITKV_API_INNER_DISPATCH=""" {F_if}((t.is_group_mode == {F_mode}) && (t.is_v_rowmajor == {F_vlayout}) && ({F_mask_check}) && (t.bias_type == {F_bias_check}) && (t.do_fp8_static_quant == {F_squant}) && ((a.block_table_ptr != nullptr) == {F_pagedkv}) && ({F_scheck}) && ({F_skcheck}) && ({F_dcheck}) && ({F_dvcheck})) {{ - using traits_ = fmha_fwd_splitkv_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0max}, {F_vlayout}, {F_pipeline_enum}, {F_mask}, {F_bias}, {F_lse}, {F_squant}, {F_pagedkv}, {F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}>; - using traits2_ = fmha_fwd_splitkv_combine_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_bm0}/2, {F_bn1}/2, {F_lse}, {F_squant}, {F_spad}, {F_dvpad}>; - - return fmha_fwd_splitkv_(s, a); + using traits_ = fmha_fwd_splitkv_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0max}, {F_vlayout}, {F_pipeline_enum}, {F_mask}, {F_bias}, true, {F_squant}, {F_pagedkv}, {F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}>; + if (t.has_lse) {{ + if constexpr (std::is_same_v<{F_dtype}, ck_tile::fp8_t>) {{ + return -1; + }} else {{ + using traits2_ = fmha_fwd_splitkv_combine_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_bm0}/2, {F_bn1}/2, true, {F_squant}, {F_spad}, {F_dvpad}>; + + return fmha_fwd_splitkv_(s, a); + 
}} + }} else {{ + using traits2_ = fmha_fwd_splitkv_combine_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_bm0}/2, {F_bn1}/2, false, {F_squant}, {F_spad}, {F_dvpad}>; + + return fmha_fwd_splitkv_(s, a); + }} }} """ @@ -614,27 +624,26 @@ def get_fwd_splitkv_blobs(kernel_filter : Optional[str], receipt, mask_impl) -> squant = 't' if dtype == 'fp8' else 'f' pipelines = [] if dtype in ['fp16', 'bf16']: - for mask, bias, lse, pagedkv in itertools.product(get_mask_map(mask_impl).keys(), BIAS_MAP.keys(), ["t", "f"], ["t", "f"]): + for mask, bias, pagedkv in itertools.product(get_mask_map(mask_impl).keys(), BIAS_MAP.keys(), ["t", "f"]): # TODO: use async pipeline when compiler is more stable if hdim == 256 or hdim in [32, 64, 128]: ### [32, 64, 96, 128]: # if True: - pipelines.append(Pipeline('qr', 'row', 'f', 't', 'f', 'f', bias, lse, squant, pagedkv, mask)) - pipelines.append(Pipeline('qr', 'col', 'f', 't', 'f', 'f', bias, lse, squant, pagedkv, mask)) + pipelines.append(Pipeline('qr', 'row', 'f', 't', 'f', 'f', bias, 't', squant, pagedkv, mask)) + pipelines.append(Pipeline('qr', 'col', 'f', 't', 'f', 'f', bias, 't', squant, pagedkv, mask)) - pipelines.append(Pipeline('qr', 'row', 't', 't', 't', 't', bias, lse, squant, pagedkv, mask)) - pipelines.append(Pipeline('qr', 'col', 't', 't', 't', 't', bias, lse, squant, pagedkv, mask)) + pipelines.append(Pipeline('qr', 'row', 't', 't', 't', 't', bias, 't', squant, pagedkv, mask)) + pipelines.append(Pipeline('qr', 'col', 't', 't', 't', 't', bias, 't', squant, pagedkv, mask)) else: - pipelines.append(Pipeline('qr_async', 'row', 't', 'f', 't', 't', bias, lse, squant, pagedkv, mask)) - pipelines.append(Pipeline('qr_async', 'row', 't', 't', 't', 't', bias, lse, squant, pagedkv, mask)) - pipelines.append(Pipeline('qr_async', 'col', 't', 'f', 't', 't', bias, lse, squant, pagedkv, mask)) - pipelines.append(Pipeline('qr_async', 'col', 't', 't', 't', 't', bias, lse, squant, pagedkv, mask)) + pipelines.append(Pipeline('qr_async', 'row', 't', 
'f', 't', 't', bias, 't', squant, pagedkv, mask)) + pipelines.append(Pipeline('qr_async', 'row', 't', 't', 't', 't', bias, 't', squant, pagedkv, mask)) + pipelines.append(Pipeline('qr_async', 'col', 't', 'f', 't', 't', bias, 't', squant, pagedkv, mask)) + pipelines.append(Pipeline('qr_async', 'col', 't', 't', 't', 't', bias, 't', squant, pagedkv, mask)) if receipt == 1: - pipelines.append(Pipeline('qr', 'row', 't', 't', 't', 't', bias, lse, squant, pagedkv, mask)) # TODO: cover arbitraty hdim - pipelines.append(Pipeline('qr', 'col', 't', 'f', 't', 't', bias, lse, squant, pagedkv, mask)) # TODO: cover arbitraty hdim + pipelines.append(Pipeline('qr', 'row', 't', 't', 't', 't', bias, 't', squant, pagedkv, mask)) # TODO: cover arbitraty hdim + pipelines.append(Pipeline('qr', 'col', 't', 'f', 't', 't', bias, 't', squant, pagedkv, mask)) # TODO: cover arbitraty hdim elif dtype in ['fp8', 'bf8']: - # no need lse/paged-kv kernels for mask, bias in itertools.product(get_mask_map(mask_impl).keys(), BIAS_MAP.keys()): - pipelines.append(Pipeline('qr', 'col', 'f', 'f', 'f', 'f', bias, 'f', squant, 'f', mask)) + pipelines.append(Pipeline('qr', 'col', 'f', 'f', 'f', 'f', bias, 't', squant, 'f', mask)) else: assert False return pipelines diff --git a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp index 98a4329d7..3c4e02d08 100644 --- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp +++ b/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp @@ -35,6 +35,7 @@ struct FmhaFwdSplitKVKernel using LSEDataType = ck_tile::remove_cvref_t; using SaccDataType = ck_tile::remove_cvref_t; using OaccDataType = remove_cvref_t; + using ODataType = remove_cvref_t; using VLayout = ck_tile::remove_cvref_t; @@ -234,8 +235,10 @@ struct FmhaFwdSplitKVKernel const void* k_ptr, const void* v_ptr, const void* bias_ptr, - void* lse_acc_ptr, - void* o_acc_ptr, + void* lse_acc_ptr, /* workspace for lse accumulation 
when num_splits > 1, otherwise + final lse */ + void* o_acc_ptr, /* workspace for o accumulation when num_splits > 1, otherwise final + o */ ck_tile::index_t batch, ck_tile::index_t seqlen_q, ck_tile::index_t seqlen_k, // only used if 'seqlen_k_ptr' is not specified @@ -356,8 +359,10 @@ struct FmhaFwdSplitKVKernel const void* k_ptr, const void* v_ptr, const void* bias_ptr, - void* lse_acc_ptr, - void* o_acc_ptr, + void* lse_acc_ptr, /* workspace for lse accumulation when num_splits > 1, otherwise + final lse */ + void* o_acc_ptr, /* workspace for o accumulation when num_splits > 1, otherwise final + o */ ck_tile::index_t batch, const void* seqstart_q_ptr, const void* seqstart_k_ptr, @@ -591,9 +596,9 @@ struct FmhaFwdSplitKVKernel static_cast(i_nhead / kargs.nhead_ratio_qk) * kargs.nhead_stride_v + batch_offset_v; - OaccDataType* o_acc_ptr = reinterpret_cast(kargs.o_acc_ptr) + - static_cast(i_nhead) * kargs.nhead_stride_o_acc + - batch_offset_o_acc + i_split * kargs.split_stride_o_acc; + ODataType* o_acc_ptr = reinterpret_cast(kargs.o_acc_ptr) + + static_cast(i_nhead) * kargs.nhead_stride_o_acc + + batch_offset_o_acc + i_split * kargs.split_stride_o_acc; // Q/K/V DRAM and DRAM window const auto q_dram = [&]() { diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_qr_ks_vs.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_qr_ks_vs.hpp index 71c3bd171..4e8d8694d 100644 --- a/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_qr_ks_vs.hpp +++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_qr_ks_vs.hpp @@ -25,6 +25,7 @@ struct BlockFmhaFwdSplitKVPipelineQRKSVS using LSEDataType = remove_cvref_t; using PDataType = remove_cvref_t; using OaccDataType = remove_cvref_t; + using ODataType = remove_cvref_t; using FmhaMask = remove_cvref_t; using BlockFmhaShape = remove_cvref_t; @@ -48,7 +49,7 @@ struct BlockFmhaFwdSplitKVPipelineQRKSVS static constexpr bool kPadHeadDimQ = Problem::kPadHeadDimQ; 
static constexpr bool kPadHeadDimV = Problem::kPadHeadDimV; static constexpr auto BiasEnum = Problem::BiasEnum; - static constexpr bool kStoreLSE = true; // always store LSE (acc) + static constexpr bool kStoreLSE = Problem::kStoreLSE; static constexpr bool kIsPagedKV = Problem::kIsPagedKV; static constexpr bool kHasUnevenSplits = Problem::kHasUnevenSplits; diff --git a/include/ck_tile/ops/fmha/pipeline/tile_fmha_traits.hpp b/include/ck_tile/ops/fmha/pipeline/tile_fmha_traits.hpp index e3187042d..d7bf8ea7e 100644 --- a/include/ck_tile/ops/fmha/pipeline/tile_fmha_traits.hpp +++ b/include/ck_tile/ops/fmha/pipeline/tile_fmha_traits.hpp @@ -39,7 +39,7 @@ template 1 or fwd training is running */ bool kDoFp8StaticQuant_, bool kIsPagedKV_, bool kHasUnevenSplits_, -- GitLab From 36c7ce4e0eef86df186f8d796d7e177b8b13df92 Mon Sep 17 00:00:00 2001 From: carlushuang Date: Mon, 25 Nov 2024 13:12:35 +0800 Subject: [PATCH 075/153] [CK_TILE]Moe update index (#1672) * update MOCK_ID for moe-sorting * add moe-smoothquant * update a comment * fix format * hot fix * update topk in overflow case * update comments * update bf16 cvt --------- Co-authored-by: valarLip <340077269@qq.com> --- .../ck_tile/14_moe_smoothquant/CMakeLists.txt | 25 ++ example/ck_tile/14_moe_smoothquant/README.md | 15 + .../moe_smoothquant_bf16_n1024_instance.cpp | 22 ++ .../moe_smoothquant_bf16_n1536_instance.cpp | 13 + .../moe_smoothquant_bf16_n2048_instance.cpp | 14 + .../moe_smoothquant_bf16_n256_instance.cpp | 12 + .../moe_smoothquant_bf16_n3072_instance.cpp | 14 + .../moe_smoothquant_bf16_n4096_instance.cpp | 14 + ...moe_smoothquant_bf16_n4096_tp_instance.cpp | 14 + .../moe_smoothquant_bf16_n512_instance.cpp | 13 + ...moe_smoothquant_bf16_n64_n128_instance.cpp | 12 + .../moe_smoothquant_bf16_n768_instance.cpp | 12 + .../moe_smoothquant_fp16_n1024_instance.cpp | 22 ++ .../moe_smoothquant_fp16_n1536_instance.cpp | 13 + .../moe_smoothquant_fp16_n2048_instance.cpp | 14 + .../moe_smoothquant_fp16_n256_instance.cpp 
| 12 + .../moe_smoothquant_fp16_n3072_instance.cpp | 14 + .../moe_smoothquant_fp16_n4096_instance.cpp | 14 + ...moe_smoothquant_fp16_n4096_tp_instance.cpp | 14 + .../moe_smoothquant_fp16_n512_instance.cpp | 13 + ...moe_smoothquant_fp16_n64_n128_instance.cpp | 12 + .../moe_smoothquant_fp16_n768_instance.cpp | 12 + .../instances/moe_smoothquant_fwd_api.cpp | 145 ++++++++++ .../moe_smoothquant_instance_common.hpp | 62 ++++ .../14_moe_smoothquant/misc/moe-sm.png | Bin 0 -> 206879 bytes .../14_moe_smoothquant/moe_smoothquant.cpp | 264 ++++++++++++++++++ .../14_moe_smoothquant/moe_smoothquant.hpp | 114 ++++++++ .../14_moe_smoothquant/script/perf_test.sh | 37 +++ .../14_moe_smoothquant/script/smoke_test.sh | 30 ++ example/ck_tile/CMakeLists.txt | 1 + include/ck_tile/core/config.hpp | 5 + include/ck_tile/core/numeric/bfloat16.hpp | 36 +++ .../host/reference/reference_moe_sorting.hpp | 29 +- .../fused_moe/kernel/moe_sorting_kernel.hpp | 83 +++++- include/ck_tile/ops/smoothquant.hpp | 1 + .../kernel/moe_smoothquant_kernel.hpp | 205 ++++++++++++++ 36 files changed, 1321 insertions(+), 11 deletions(-) create mode 100644 example/ck_tile/14_moe_smoothquant/CMakeLists.txt create mode 100644 example/ck_tile/14_moe_smoothquant/README.md create mode 100644 example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n1024_instance.cpp create mode 100644 example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n1536_instance.cpp create mode 100644 example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n2048_instance.cpp create mode 100644 example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n256_instance.cpp create mode 100644 example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n3072_instance.cpp create mode 100644 example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n4096_instance.cpp create mode 100644 example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n4096_tp_instance.cpp create mode 100644 
example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n512_instance.cpp create mode 100644 example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n64_n128_instance.cpp create mode 100644 example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n768_instance.cpp create mode 100644 example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n1024_instance.cpp create mode 100644 example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n1536_instance.cpp create mode 100644 example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n2048_instance.cpp create mode 100644 example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n256_instance.cpp create mode 100644 example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n3072_instance.cpp create mode 100644 example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n4096_instance.cpp create mode 100644 example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n4096_tp_instance.cpp create mode 100644 example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n512_instance.cpp create mode 100644 example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n64_n128_instance.cpp create mode 100644 example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n768_instance.cpp create mode 100644 example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fwd_api.cpp create mode 100644 example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_instance_common.hpp create mode 100644 example/ck_tile/14_moe_smoothquant/misc/moe-sm.png create mode 100644 example/ck_tile/14_moe_smoothquant/moe_smoothquant.cpp create mode 100644 example/ck_tile/14_moe_smoothquant/moe_smoothquant.hpp create mode 100755 example/ck_tile/14_moe_smoothquant/script/perf_test.sh create mode 100755 example/ck_tile/14_moe_smoothquant/script/smoke_test.sh create mode 100644 include/ck_tile/ops/smoothquant/kernel/moe_smoothquant_kernel.hpp diff --git 
a/example/ck_tile/14_moe_smoothquant/CMakeLists.txt b/example/ck_tile/14_moe_smoothquant/CMakeLists.txt new file mode 100644 index 000000000..12224a39a --- /dev/null +++ b/example/ck_tile/14_moe_smoothquant/CMakeLists.txt @@ -0,0 +1,25 @@ +function (add_moe_smoothquant_example TARGET_NAME MAIN_SRC) + message("adding ${TARGET_NAME}") + # not using add_example_executable() to add target, since we don't want this to have + # to be included in "make all/install/check" + add_executable(${TARGET_NAME} EXCLUDE_FROM_ALL ${MAIN_SRC}) + target_include_directories(${TARGET_NAME} PRIVATE ${CMAKE_CURRENT_LIST_DIR}) + + foreach(source IN LISTS ARGN) + list(APPEND INSTANCE_SRCS ${source}) + endforeach() + + target_sources(${TARGET_NAME} PRIVATE ${INSTANCE_SRCS}) + + set(COMPILE_OPTIONS) + # NOTE: we turn off undefined-func-template so the sources compile without explicitly declaring function specializations + list(APPEND COMPILE_OPTIONS -Wno-undefined-func-template -Wno-float-equal) + # list(APPEND COMPILE_OPTIONS -v --save-temps -Wno-gnu-line-marker) + + target_compile_options(${TARGET_NAME} PRIVATE ${COMPILE_OPTIONS}) +endfunction(add_moe_smoothquant_example TARGET_NAME MAIN_SRC) + +file(GLOB INSTANCE_SRCS instances/*.cpp) + +add_moe_smoothquant_example(tile_example_moe_smoothquant moe_smoothquant.cpp ${INSTANCE_SRCS}) + diff --git a/example/ck_tile/14_moe_smoothquant/README.md b/example/ck_tile/14_moe_smoothquant/README.md new file mode 100644 index 000000000..599b4c348 --- /dev/null +++ b/example/ck_tile/14_moe_smoothquant/README.md @@ -0,0 +1,15 @@ +# moe-smoothquant + +This folder contains an example of moe-smoothquant using the ck_tile tile-programming implementation.
+![](misc/moe-sm.png) + +Unlike the standard smoothquant op, the input scale comes from different experts (`[expert, hidden]`), so we need to reuse the `topk-id` from the previous `topk-softmax`, select the corresponding `expert` for the current topk, and expand the output/per-token-scale by `topk`. + +## build +``` +# in the root of ck_tile +mkdir build && cd build +sh ../script/cmake-ck-dev.sh ../ <arch> # you can replace <arch> with gfx90a, gfx942... +make tile_example_moe_smoothquant -j +``` +This will result in an executable `build/bin/tile_example_moe_smoothquant` diff --git a/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n1024_instance.cpp b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n1024_instance.cpp new file mode 100644 index 000000000..f43626147 --- /dev/null +++ b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n1024_instance.cpp @@ -0,0 +1,22 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+ +#include "moe_smoothquant_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd 2p +#if 0 +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); + +template float moe_smoothquant_>(const S&, A); +#endif + +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +// clang-format on diff --git a/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n1536_instance.cpp b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n1536_instance.cpp new file mode 100644 index 000000000..e380520fc --- /dev/null +++ b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n1536_instance.cpp @@ -0,0 +1,13 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "moe_smoothquant_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd 2p +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +// clang-format on diff --git a/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n2048_instance.cpp b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n2048_instance.cpp new file mode 100644 index 000000000..4d536cd61 --- /dev/null +++ b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n2048_instance.cpp @@ -0,0 +1,14 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "moe_smoothquant_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd 2p +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); + +// clang-format on diff --git a/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n256_instance.cpp b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n256_instance.cpp new file mode 100644 index 000000000..b38a4733a --- /dev/null +++ b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n256_instance.cpp @@ -0,0 +1,12 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "moe_smoothquant_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd 2p +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +// clang-format on diff --git a/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n3072_instance.cpp b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n3072_instance.cpp new file mode 100644 index 000000000..c5c170aef --- /dev/null +++ b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n3072_instance.cpp @@ -0,0 +1,14 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "moe_smoothquant_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd 2p +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); + +// clang-format on diff --git a/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n4096_instance.cpp b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n4096_instance.cpp new file mode 100644 index 000000000..0e48a1b69 --- /dev/null +++ b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n4096_instance.cpp @@ -0,0 +1,14 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "moe_smoothquant_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd 2p +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); + +// clang-format on diff --git a/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n4096_tp_instance.cpp b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n4096_tp_instance.cpp new file mode 100644 index 000000000..4af42c6c8 --- /dev/null +++ b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n4096_tp_instance.cpp @@ -0,0 +1,14 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "moe_smoothquant_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd 2p +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); + +// clang-format on diff --git a/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n512_instance.cpp b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n512_instance.cpp new file mode 100644 index 000000000..ea611a183 --- /dev/null +++ b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n512_instance.cpp @@ -0,0 +1,13 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "moe_smoothquant_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd 2p +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +// clang-format on diff --git a/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n64_n128_instance.cpp b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n64_n128_instance.cpp new file mode 100644 index 000000000..a6209820e --- /dev/null +++ b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n64_n128_instance.cpp @@ -0,0 +1,12 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "moe_smoothquant_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd 2p +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +// clang-format on diff --git a/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n768_instance.cpp b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n768_instance.cpp new file mode 100644 index 000000000..f569dedf3 --- /dev/null +++ b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n768_instance.cpp @@ -0,0 +1,12 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "moe_smoothquant_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd 2p +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +// clang-format on diff --git a/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n1024_instance.cpp b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n1024_instance.cpp new file mode 100644 index 000000000..3793adb5c --- /dev/null +++ b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n1024_instance.cpp @@ -0,0 +1,22 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "moe_smoothquant_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd 2p +#if 0 +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); + +template float moe_smoothquant_>(const S&, A); +#endif + +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +// clang-format on diff --git a/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n1536_instance.cpp b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n1536_instance.cpp new file mode 100644 index 000000000..4bf9cb1a4 --- /dev/null +++ b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n1536_instance.cpp @@ -0,0 +1,13 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "moe_smoothquant_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd 2p +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +// clang-format on diff --git a/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n2048_instance.cpp b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n2048_instance.cpp new file mode 100644 index 000000000..eb0d0fe10 --- /dev/null +++ b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n2048_instance.cpp @@ -0,0 +1,14 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "moe_smoothquant_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd 2p +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); + +// clang-format on diff --git a/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n256_instance.cpp b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n256_instance.cpp new file mode 100644 index 000000000..36bc0de15 --- /dev/null +++ b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n256_instance.cpp @@ -0,0 +1,12 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "moe_smoothquant_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd 2p +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +// clang-format on diff --git a/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n3072_instance.cpp b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n3072_instance.cpp new file mode 100644 index 000000000..fa6f53b2d --- /dev/null +++ b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n3072_instance.cpp @@ -0,0 +1,14 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "moe_smoothquant_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd 2p +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); + +// clang-format on diff --git a/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n4096_instance.cpp b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n4096_instance.cpp new file mode 100644 index 000000000..9b7462ab9 --- /dev/null +++ b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n4096_instance.cpp @@ -0,0 +1,14 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "moe_smoothquant_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd 2p +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); + +// clang-format on diff --git a/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n4096_tp_instance.cpp b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n4096_tp_instance.cpp new file mode 100644 index 000000000..8911bc229 --- /dev/null +++ b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n4096_tp_instance.cpp @@ -0,0 +1,14 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "moe_smoothquant_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd 2p +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); + +// clang-format on diff --git a/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n512_instance.cpp b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n512_instance.cpp new file mode 100644 index 000000000..07783ac16 --- /dev/null +++ b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n512_instance.cpp @@ -0,0 +1,13 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "moe_smoothquant_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd 2p +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +// clang-format on diff --git a/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n64_n128_instance.cpp b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n64_n128_instance.cpp new file mode 100644 index 000000000..a5ab56a76 --- /dev/null +++ b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n64_n128_instance.cpp @@ -0,0 +1,12 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "moe_smoothquant_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd 2p +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +// clang-format on diff --git a/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n768_instance.cpp b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n768_instance.cpp new file mode 100644 index 000000000..4272cbafc --- /dev/null +++ b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n768_instance.cpp @@ -0,0 +1,12 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "moe_smoothquant_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd 2p +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +// clang-format on diff --git a/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fwd_api.cpp b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fwd_api.cpp new file mode 100644 index 000000000..a65d3fde6 --- /dev/null +++ b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fwd_api.cpp @@ -0,0 +1,145 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include +#include "moe_smoothquant.hpp" + +template +using trait_ = moe_smoothquant_traits_; + +template +float moe_smoothquant_dispatch(moe_smoothquant_traits /*t*/, + moe_smoothquant_args a, + const ck_tile::stream_config& s) +{ + float r = -1; + // clang-format off + // rm rn tm tn vn pd 2p + if(a.hidden_size <= 64) { + r = moe_smoothquant_>(s, a); + } + else if(a.hidden_size <= 128) { + if (a.hidden_size % 2 == 0) + r = moe_smoothquant_>(s, a); + else + r = moe_smoothquant_>(s, a); + } + else if(a.hidden_size <= 256) { + if (a.hidden_size % 4 == 0) + r = moe_smoothquant_>(s, a); + else if (a.hidden_size % 2 == 0) + r = moe_smoothquant_>(s, a); + else + r = moe_smoothquant_>(s, a); + } + else if(a.hidden_size <= 512) { + if (a.hidden_size % 8 == 0) + r = moe_smoothquant_>(s, a); + else if (a.hidden_size % 4 == 0) + r = moe_smoothquant_>(s, a); + else if (a.hidden_size % 2 == 0) + r = moe_smoothquant_>(s, a); + else + r = moe_smoothquant_>(s, a); + } + else if(a.hidden_size <= 768) { + if (a.hidden_size % 4 == 0) + r = moe_smoothquant_>(s, a); + else if (a.hidden_size % 2 == 0) + r = moe_smoothquant_>(s, a); + else + r = moe_smoothquant_>(s, a); + } + else if(a.hidden_size <= 1024) { + if (a.hidden_size % 8 == 0) + r = moe_smoothquant_>(s, a); + else if (a.hidden_size % 4 == 0) + r = moe_smoothquant_>(s, a); + else if (a.hidden_size % 2 == 0) + r = moe_smoothquant_>(s, a); + else + r = moe_smoothquant_>(s, a); + } + else if(a.hidden_size <= 1536) { + if (a.hidden_size % 8 == 0) + r = moe_smoothquant_>(s, a); + else if (a.hidden_size % 4 == 0) + r = moe_smoothquant_>(s, a); + else if (a.hidden_size % 2 == 0) + r = moe_smoothquant_>(s, a); + else + r = moe_smoothquant_>(s, a); + } + else if(a.hidden_size <= 2048) { + if (a.hidden_size % 8 == 0) + r = moe_smoothquant_>(s, a); + else if (a.hidden_size % 4 == 0) + r = moe_smoothquant_>(s, a); + else if (a.hidden_size % 2 == 0) + r = moe_smoothquant_>(s, a); + else + r = moe_smoothquant_>(s, a); + } + else 
if(a.hidden_size <= 3072) { + if (a.hidden_size % 8 == 0) + r = moe_smoothquant_>(s, a); + else if (a.hidden_size % 4 == 0) + r = moe_smoothquant_>(s, a); + else if (a.hidden_size % 2 == 0) + r = moe_smoothquant_>(s, a); + else + r = moe_smoothquant_>(s, a); + } + else if(a.hidden_size <= 4096) { + if (a.hidden_size % 8 == 0) + r = moe_smoothquant_>(s, a); + else if (a.hidden_size % 4 == 0) + r = moe_smoothquant_>(s, a); + else if (a.hidden_size % 2 == 0) + r = moe_smoothquant_>(s, a); + else + r = moe_smoothquant_>(s, a); + } + else if(a.hidden_size > 4096) { + if (a.hidden_size % 8 == 0) + r = moe_smoothquant_>(s, a); + else if (a.hidden_size % 4 == 0) + r = moe_smoothquant_>(s, a); + else if (a.hidden_size % 2 == 0) + r = moe_smoothquant_>(s, a); + else + r = moe_smoothquant_>(s, a); + } + return r; + // clang-format on +} + +float moe_smoothquant(moe_smoothquant_traits t, + moe_smoothquant_args a, + const ck_tile::stream_config& s) +{ + if(t.data_type.compare("fp16") == 0) + { + return moe_smoothquant_dispatch(t, a, s); + } + else if(t.data_type.compare("bf16") == 0) + { + return moe_smoothquant_dispatch(t, a, s); + } + else + throw std::runtime_error("Without supported instances!"); +} diff --git a/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_instance_common.hpp b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_instance_common.hpp new file mode 100644 index 000000000..88d300091 --- /dev/null +++ b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_instance_common.hpp @@ -0,0 +1,62 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include +#include "moe_smoothquant.hpp" +#include + +#pragma once + +using S = ck_tile::stream_config; +using A = moe_smoothquant_args; + +template +using trait_ = moe_smoothquant_traits_; + +template +float moe_smoothquant_(const S& s, A a) +{ + using DataType = typename Traits_::DataType; + + using PipelineProblem = ck_tile::SmoothquantPipelineProblem< + typename MoeSmoothquantTypeConfig::XDataType, + typename MoeSmoothquantTypeConfig::XScaleDataType, + typename MoeSmoothquantTypeConfig::ComputeDataType, + typename MoeSmoothquantTypeConfig::YScaleDataType, + typename MoeSmoothquantTypeConfig::QYDataType, + typename Traits_::Shape, + Traits_::kPadN, + Traits_::kTwoPass>; + + using OnePassPipeline = ck_tile::SmoothquantPipelineOnePass; + using TwoPassPipeline = ck_tile::SmoothquantPipelineTwoPass; + using Pipeline = std::conditional_t; + + using Kernel = ck_tile::MoeSmoothquant; + + const dim3 grids = Kernel::GridSize(a); + constexpr dim3 blocks = Kernel::BlockSize(); + constexpr ck_tile::index_t kBlockPerCu = 1; + + auto kargs = Kernel::MakeKargs(a); + if(s.log_level_ > 0) + std::cout << ", " << Kernel::GetName() << std::flush; + + return ck_tile::launch_kernel( + s, ck_tile::make_kernel(Kernel{}, grids, blocks, 0, kargs)); +} diff --git a/example/ck_tile/14_moe_smoothquant/misc/moe-sm.png b/example/ck_tile/14_moe_smoothquant/misc/moe-sm.png new file mode 100644 index 0000000000000000000000000000000000000000..5a40099ef3ce3860ed133e4b150ad4785108f129 GIT binary patch literal 206879 zcmdSBWmjC$)-{+ABzW);EVx5(*WeDpHMmo_YjAfhoZv2n6&48Y?uEO%Ls#xIp8Iz9 z=s(c)ZJlvy>?3>0oNKNVuB0G|iu48P-Me?F(o$k7@7}@Dyn6=&`3M6&LX~5o_U_%= zo3xmSs{7J$7TnJnsb&8}o4F>4O0JBK46R*ZC?9rc&*~1hE;@JS2Q;^)%~KV^h#$GZ zD%vH&t1dp|YbL2%;J{TzLhVzZ*Qa~%;4yHJB7J)?ew|_12)e}o{=D(+KLP&Nc=zt! 
z(ck^Uf6XpD*uMn-wcd4n{R;U%);Zk!3;6$<{>&f4xc+Os^A-_qd;ec^9T{d5^}p6T zgHQjDC4c?@c*%$;cY4}g)9et%_1(n0tPSHdFr`7)7QTF3Wgg}Zwsab@D2t7jHAarP zx6UFRot89)8F?D=f6mkRNAQY}HU?$hVk@tWF)FO0^g7WNoHL5V>C zGK?oOjRJaxUr~z9z=01vtQT;+JiVB~;`WIN&EZaXC&5!Tu&QbwJ(^RKC;n=4D(4MP zvj&TOy-;Bl92Xhb6+IkHHoqT*>sM!%M9qd>H3*Tc<`gXntpO*9;JX9)@sF zf|LF2&t9A|B@M zPAls?5}o6*U-ZKQ)_KZP2U%YA-X=jb^@k=0<=1ZwctvQQDgm7RHpJJ;-wX)en|U(} z=4Q|xOH;}A>aFh|a`{>RbBYdn=q@slqp@&3r!?DqIDV_CNKSGqZr9MliruBpHi>G_ z3`=)c@eb6+6DVmbM*O3BOJV$cLlFf&%4oK&W1#&y2G`8xQ2VX*RY^`i6^lhmWMns> z71A!(o}f}deg79S((S`ZL78gDXT?N zn@FH<+@U4>qg4b30Bw)hYg8;ZO`)SPG^6#4UOL7614*R9$gELN%SXN8#bfJZtHba) z2T3u{fH7UzjM5W2@FHY*e@GWp_0TeZhqaxCZ}2<)(GIs*r$c$G%Oz4FSk8LXwMQ2FCy{TQ2CH$iQZ;JWWSo;VYUVv#ZnW#X?uhN_E zRn#`Vnc)}m2J0wCKcyiD>pPaB=(|C`M^O|wb;N7-gQN*`tI`(kOZkpOmX3jD}dW0M=H7T6De9@=WN)u|%R>z+DiVY2{Yje<`71H8cIBu2gYGrhl z5MIvOrIItLT2Kb>@*;iL?9~2HVzuLPR%}1boG#9vx+8*C6nb-WXBW+svdi1OwB|S~ zD<}V_+`2{y4Z%T`2cqc75*jDG)Rb)HrtRY|z0p-8N(T|PD0@}NZ25)8@ahf=Q(cSu zLkr&W(FK!>4`$5vL~JDyFT3zeMRQkN0)=s&z)F*h)JDOzvZTpmOBvhg>FVqv8kX#q zkMALK)Ie}N72W&!)p$fLD<=$#B5%aSX{N&HD%Jgcf41sh20K7-m*pHd?OsIE+2xDS zz7SDl$5U&UH>I)ppYdaZv3u>56UT#_ymP`R024Z;4J5NbTc#C- z;i6Q=?$JgH`w*>nn8N<5>T{^(yX;h5F(Np*TUr7Z@|tx{`mE>0&jje26(b5E4v5+d zFw|-=vi`8AK&&o-_0_h@fqTi@W4E-_m~mw(m6doyx~S}7Q3H;*Pc@N)7c_$nu28?g z*g1)&5*|7v{A@_O33PyXGYWvpdib%%|FU>w;F{}Gmea7=8qst&l%YAvi%m?)g?C9Y z_GDO2CLE=8m{v5+alNW$l}&)S3GT=gsn^pZzvV{FQZfCyU^l&!Y)|=NMpi;oR9b)S z?0->w=#4tNLz-BMx;LzlQn7lLU~0g>=uB!+C9V6@G%@$ z4Wo*cwZxQG0}JndswN82O=VOSV}EmM%El2uqJx!szBv&k?e>E)m zPNP0Qt>q`yroF;@(Xx+}Hx;2x)!}=WT_wi`zY-B)sO5C^*oPw!y)}&YcA5G>4^}E7 z1y~Njsu*;KGGGPmcLWdb&mBv>}G`vy6+yQ4Y2-1Inq(Pu(&qQXHV- zaO>D9fUltN-o|o~sVXD?u(Dfc5;Tu`6j|dJ#*~xwsX7bq_3@P}?W9!AsPPi^>=~Mk z=!mcc`q<#kFxat+>UI?M7Y4ynif`_cpH~iP7^cwi8A(^2)EaWysMBe{Q$& zXPeE{r1nV<%qd-1W?8{9x~$(YhwWc!(8&RmXIumU;AVGsi{dG=DR=2HzqAepepxeK znBF?dzZ64(P032;4Cu@GbqEHs2^WYsqnvIj*{uKdo8RtxmX=YR@wO?T__(HJuZ+W} z)=TNWH2UC1_aBoVhSRMs^j|5ASE5ZuY$R0z{J9C%=N1<4ul6_6+Qz7jSqw=%nkOse 
z^spEq2>DJER|tI>Cr2UmU|{nt`=q`PwVHDIIWEA{fZ)zfzmJ?>PGgQ2qkKoZO@vW@-F3E^fK4DZ-%CE#W~Wga;tVx=86$AqS{&XSR%zqezuv6E1~2x}H0A-1 zP5cLO&*(x6zW*4J_^FAmqbwK8+83E_<_nEEho+>AkZjlPa(+iK~Evu|-i!A5Dl$!*&5(w`%A@3;MJf_SEL+f!@;BL`F#c_bD@+xlLPI zs__Qr@*gYr0Hi`^YY;9miftw6c%4a(vNatpMp`2Cg0NXIUF=ZQ$cZ|K=i!Wt`{^5 zXT~xvB@XGI!D6{u65sI+A{AgsV}NJh0Aaa>(Ly-60J3NbB!c#zGzbN70b3PuUyp`!jV$M zW6UC%#V4(3D$ahUI-~L zJ4!`fJUbVT#laK4 zObCmReLO+Pp;@>647(GLG!ui(yW&uswdJ8^JsKAjI_aw(G-X&tdS6TWt?#d&Q`!?K z%#jZ)W0I2+aOAHKW+$AbT4fAfo!KMS87PN=VgAkWWE#L|f8v3Yzio$TeO|)3)}-t| z3ITI$cz^V}%WddcIXWKgGRW4BCnTN!NI!_3Z1(w~7-_*{*Yt3Q_HD%tlJf>8(3>WQg|R)PJr6_9<9s$cYWi zi|uAa=8}X&Xw>2Xtuz!>EP7aHgVix^WTi}D5;3}+8u(Kz5k^k3e>qjEx&QURpHby_+-TQ8snQm?H)4olA z_dyhH3wVS1584#RfP%$)Pl}VU!Il(-G(UOlci#Oq*yN6>9og2IhcY9)+9Iu=C9LeDn4-dM%pX2!lyOD~+!kY@i zMl_(0LOgkU)xqNre$Yqlwt#%30%Nf%=;fR<#>f4Q=r2Wtv&!hT;+A?X(R$X|`w0Cn zO(q#RBsCE!gU~&k)L2>B%`3H1OYXAL3HBWcY8aQ&%!A=y0!j>IDN#R0R)(TUR_xKX zMEHIdQt1Tmkxw!C-#{sJUz$epKLl(Uk;Z=grdmUX^Syqgxc_-$lEsD*eCC`D2VJHM zTU?@zgFUea$Fibc_1ccq*C@~g`Qd%9R{)$jyp`AP+WAwKgpxt?)(0`U@we6p%+w2Y2Od}_c$ zy0>3@#*-5JExaaCK5Hm*19ZeaAB+oIkhXR^Y8Krk(%~K(5|v;Rk}9}c)|)%{7M$@o zWAFFjKU-HB2{MwpSA=ar9b?GFS_`Yio2JigR&fE;4uuZ_y59IaVx2U~Mfm}2%vvUELmKXU|Yee^7c+1fkZGC z<6JtBi*|M?2KAZx)~cY8##uwxW&OBx{nM||tLv-q36!m$rY$DUfITDE5S|5hei&ni z3=z8CebYEf`(Kiuq~O*pYs9T2Fli`d^O6`+vG+R0zW3d}I@=Yo%GqPYfoek*xIu>jD~o+hm$f#!vC@u=wy03#$v?s zWlQFBFsa9WiQ`UeqcIIx()~%EBnc*49aSYHD;`*_VvQHGuUsv|Q>orhR6p*J{XfzE zU#wm~dcTQ^&Ev~|AyGx6QTjBl2mFyM%+*!oi$lyGJ32;h&t%u>arwzwc5 z4mqy*)x~%g$aqUbx4RigJ#K{QgpEWT$+x7%QK8;pT1l6}DxJ4lhB`!a$$vXi8H!k% zj_&~{g?Ca>N$1(xP^!`1;N;N$=nz==y+B`HtCQfx)|wW`RpBKib{*_xIGE0ZYb0G6 z0r{}R-T+EybI)>pG#hx?V@Yp;qtVKW5N!W&7#`&Poyy+lgPo`PY3&iZD48*v)0GRm zEr9aM!O8B_ptLoX+Htcf76s~K1tma3<xQJ1qL1ysf=dyPDz&_k&3HYst@_6i-DoG6*f*B^V$nQ7&l7zc8D7lY z2z@(&jQNWj`@Cy`hMP|HPqpEep;fD>k)FxLN{DbK9A_%+I$U94^143zgyPDkt6Az% z2eICK)QYdmtpe54^>Na7fYlPWVO~1a!aEbs0TO~K$1AB!u4cWh9D-R@YuX0O=$O`v z{l(~-l7+uBy1|&VEd|jP+FIjtpmH#FJ;yQIQYt|nMkGwEHSJPM+G^$>J*7uvx9`*S 
zPH>_kl}LqNT-9e5fau!`>Fc9(AbN?CQj(L;9a5fuET>+3)ZU0lmODSBSja8lKgGiT zYx~4Ja?_p87cK`;&|)H`L>+@(_k$-=t0yl@UwB(_>j|2y%HeYo>3fH4LbJ zf913L0X~chIJv~embvAn^no-MY&al3Voo;E8grvl$E9*!mES*gjiq&zCQo=BhZy6b z>$Vk49@tlzv-cfRVii{GZ*T(K?~FLrv27jqndjYQ*MYGH5CZI|>kR9q*+^b#d(`<} z8wToI3!UN!^h-?`>RW^6md!4$aH;+6pVp)pWMJF`@88}u=u$8rtkkNM8?Re?_I}hk zT@SZ>h`0v%mBV9sf|8Dk(?1~(Ub$s6toiZs8N4>4b3}}sAdASIPUjiFl!ZulPcW5$ zT1}tEGTW~PCN8ws4r9r=B!*wS+s#l9jm#-=5Z%ks^@L4nwy%_1@I$7pX^9@)l)!z7 zo*Y6>SBrl^;6k$a%HWaqpq`UrWvTf|5z-cJZNOeaq~vW~eWMs(7#i+U0Y1Uq-XIQt z!ZuR0VLYZ656_aej$mZCB#X{`DW{hdLUKrSm~3XVXR2tSvL%Smk7Q%(G^qPh8WK4_ z_#MyI>@$4wRKqLLks4EZ0e-YPnEQNt5gZ$b>GXp4*)HdSKU+bv1-I$V$pp}}iDu!% z%F^<<-n%Nor4+EBqvH-vX?7PPO#YCnl}V#isxwn@-G6?%zuaH73T%I41 zG-kFfc9*%(08_#5F2R7Q&U6x^dZJ}R*2)}nfT`3oZ#`3Y@_bR4o`1ihXvB7?9ajCg0NSqcFf@%vuTA(KX*~PYYNTT_X7czjnRI<$>Rd#+aAbGCl?lQTZ*6 zk4VH-Fz=bkm`a69C7yPE8n;+p{*+C}ZaGa3-H^S@9!f2rSnD7j!{J}i%=Gz$fdxzp zX;>RmX*{3TFVu@5e=64Ue2v*kJrff7MVAb%DT1UX>uSFfmGg5h?Fl(T>gu2+0q z#1PhwNZuR+;bgrCjzNZDB5PfL^+J3?RzLMeS!6_Bc3iT_lk<}85yJPeCJu7tv(blN zCfcB*`V*{b6D4(o^0GZ|nC&V|T!W}yK;T*bDN4hcZ z7@p-0E+YEgfDD)E1*915sGv>MH?a^j>je;qHqt7}PhCE|_z-tfXtqlRP#hkl{AP4J zF(z=BLjRM6G$y#u>@K4Psno{73kcdmSkTt`c;CFy{sSQ5X13kS9YXF0tl|A+Ny^&P zTi^+I|AZuUEbTlu)lN-Iu>kX$BuI2z$%t)ravluCoT|4s@yDI2UDf8eqVL}`maGmQ zU2{zv-?Mg6Eloh{bhXnet}BjkkyembDH05DSI_jcZ1((dTyDv|d1-O^#~v`J{i zQWiW)iNPkc?*BdB@Ut6Cq*o?Iu~@bL2@aw0GPet6T5uqwZWfux@@lkQ%VnL?ONeD$ zbTOQ7If4;fgA5rxEchy*al#13GixYngP}AIzvCrOAfKCsCJv>#wH7j>pwZwdpQ1q= zf3~LZt?lai9z4fMa4!3WuL&Ebc3g@&?0#!(-Quj!~Wx749g((rV`~ zf*~DM@7o-(LXbr7#lvrv%EjyD8l-F{-i#&B!=;!X8u0{A51ylhD_RVna;F*-5}cZH zcyS1Sy-M&kdsua5dTZxRT?4^XOteHe4{ZjXx3>Bq;Xp0q+BMEBLDwxoPhl%f&`J(D zcu-9)K$pk0H%6lrfB(#SO&ym|(B$7u%et9G zGk+307=QkeviJJ7(KYE~hfB=6^@4}57iP6u2+Z&T5twM zeLRJRujNRF(5WCL=kBn8g?#z=>coki*;dzNt-$4CvvS;HZddYap4NSK-l3+X>0cWH zaigc)PPSWL)FZ z9K4*2Nxv~qT=V$rm{ZJ&Dqpq24BT^%4d$)>ij{x{CCO=EF-qnB&Tl1UBT^66`;E&N 
zb7$;mSErusyqHGZrW`Rv{d(k5&Z(`+>5vrqornmS{`V?JRUHTLgEG5*So*0oh|$OrbJ_u{eW))!`XYbeNGS!6hfr{Yw)v0+&=c>y6) z(o()_607c-`9<_|-A zuj@)J+I+uDbMP+|21Zb06;VqLukg8E0>im@_XpPcsut3dsU_!^O2*!20q@DZTJFDk zA3nCD^epLUF+GDMXIbrd9?Bm0Vj)P8qP1mDo+Bhn-x)Cf7Mr3#fW)1Jei<` zaE+LwGZO%xtLe{$VQz}$5V@%pbla)fEqM24ZZa?$TBLT}or$9a8rL@GuWp0_REEzhZvOBs!UkBC#OF$V;NBA2_70UUk|V zM82+-U$t@yb#{-BW zBiDSt@K`On%9bubWE!32{LB#uw$Dpo^RZO=tC!NNl@cjftcCYH)rQIV#D}*M*dE9A zWcNcA$$Kl6=LC+$yZ_FDldVu=DRxm0)XVdm{uq0}Vz}Qf6>X(TCuo%ax{G?BMPX#< z+R=75GD}yfUR;0|1qHT_TG27Agl^arjC|;b4^aVG+Rv)elBDD2J;s=Jg5jRYN!dy1 zg-Yeq^~e)DR2Y|t#EJ!zFH42u)wm#hcreZv9@7{71_Mz+zwzUE&J@-LJZK=J;b6(5UZ2uBH6u9#t5YTAfegLaWy+^86oGaJ1i*W6ky9&)E|*g>CljUGSrC z^=A?LMB;Wm3wLV9I|;%Rjy9+JN(2eY!luklhE6<42+VrI9p372u?LJMO#PjFR0RNBdq$cy7cI$YnlK|YfV&1@Fcwgl}MBI>h_K| zhVL{{N-L&|)Iw%=13^UC^Bj^_Zug3Q3QR{IP!_m}FmqpB;LEbSC;M%JjC0gB3{H zAK%Tw?(k=F$r1mma;S<<&C@s)@*?Ag&0^yif;#wIG{wUTp#?09$Ia;z#$-v=zUCUh z>aDTG@uZtbl3B<*gscEFjqNvEtfV6VtEa`X&vRUxC?`AI3g52Fsi}_?193|oK3{%& zI2mC#nto^dr~4l6&|ok)Z<>oZE+Uu{H(|&45y~2b8qMZiJuyQ>lT|1c=i$gb!(qKu zvB}X+&zd+~N`mMDf6>Fa@E7D?$FM8G?gl0Zi8rGAw*~WW(W)n2-qT3eEFRwz;?cU-4#-jQ_1IXkiIL;U z#<15fyIfd7I|ZF$bTy}6&piLpj?qM5vs^S{CVJ2vDw{(gq10vVX8Y+%R!b;zu}zlU zeHbzR9#Mz)v2T#Vn*Vf@$W45uHR4UB36KfG=lix;&1f~s<4#*IY|4l;dbsr4<)&^y zsFy&%Nnwq^nUdeLXCg*TAQQ5gFOxBfgf0{KHDYfu?-ZJqn6E#|%){GLhM0?jv zOxJ-RG7!s?2#hCa_H?jX&^mLo^T>s^YJY9%eOOBCbHY%#*6NRppyqCsr66^QS$Da_ zys}vGYktbNvIW1`;n~(}OS1OFaUi3fi-eKr3f1kFbD!aLtD zl`&NuF1GbUg>U;04uNpK-!5=3q$%a((3BFx=Yf+Np2S2h9Z;bg)u71{>x2+Evt6oT%0##M?B*({4Y_}mI!+a85{il- zw{Nt~`Z5jQFsh;C$z0f1xq|UPmtK_q-j)E(a$c%*!`E4Md@!eh_Zz8zPU+kT;Y^F| zYMEoQ>*-w2pS*Dgm}=!YYe>zhw~yba(y+G6^M@7NP}DPcpkS`k)}bjezn_4^EEAFZ z9=$5q{7t*B7xBf`9p|>;<_Q|oMJcrkEB2lZ^?G4Q9bXa4T~}hm#OL+Q6 zCUVUY3ovgxnRa;U`teyG6h3WqRxVS2ic7(CSW~|N&Xmw#pU2ncbP_%8onr0V2)it$ zSoTyh{&r0E?Cbl0PHsFYa=e&77I!=Pt%t8KXnVhq56Vnxg`CXRK}yEtCn2O4#GO`W zl@ogmF`x=H)3Nm!kBITbal#M(q7Fx4U_@0fK)fRZ z+uc?5Xvqo8C27qCyGqn~<-B!S7|L0FV8N*4ef$UtTd4o>q;ciQ8Fshy=R{kTmd#^+ 
zv5-2D`BkvSP`vt%3okthVl;L;8@SX^x&dU=cvRF6gf-UcoP$ZwByipnK2NLy(VyZk z6IqG?;rQ5)x7~1~nm<40?+twbhQ6(gRmb@ruMyz4ToySsa#rekI^?7P@J?GUxbGk6 zGm6SRc+qVSLyvU6USXFN7Ta-F?)PqOfUAXy@GERlbJS;tzI0qFUYMoiv?iA;DW0vw zjX#C<0b1>uWvd96i(>jvm2G~!lK?GDZNhwV?5w`-p;D8|~m4#pzqUA$LP@2s7!A=53P z)j>=|^_wQ=i9zfL$kLq>97DW(E8!6azP$Hlf6F&H>&l1~D$5@lsnT znd+x}PKR^fw{vEPJfB%A5LnNf?*4EACPaOizQ*c#`I{(GVT8jz7ZyA|Hm8dD5(6_K zZgW-*AA8dm>TQc9W+ydhk#zj7H#?A<;`j^5L5?f_bdb~iU#lLseUIdLanECi_NCjkoBY+6OX2u_{${0K1-VzRZS>PwG|^ zxH`|L=w-~QmML?(uNGHJjgE;23)RRM#AhAt3~3RK(x@IjEXS;R^xgbCgEY*Y=jhZt zx@9bD$Gz;@d0-O74*hmY)ZS|4m_MvJ${RNa@Pyr>5rKq!4<>6esKt$G@h=a6x1sIc z(YI1mILxo0)Fx}h_ap`_LKccjHU5fAnnUWb1nXON!1(g-UQK7v3Yi5;uc;Sw2SHA3 z7=niH9B3zTRD1fZuNnlc7^yam@kqp&7aZ6XtHkijwK}e$tau?=)zf3w!_lOusu8l^ z!ECO!mJj09<}@o+)lzc2PHuto^pl&F8aP5-l76OS!nKd0C@(Qju^c|plXEQ5Nl)Q? ze4h5>|GlNDh;f!SBiA`-UF>YZUF##zvs4NmC+WcR-M9-l?X{UhSj}?VcRYVKB#v>l zz;SG&+fu)-&@dIL_AmJzHHYk4L9!yqQ~dMxkL;Bf`;mHa-Q=i4gdm{5EWlP&1+IIgbs(8)9%F_PQ-PM6wp6&JAJwELr{WiDOY9W*{ zllcqRsfn|6yZtvaHDsUE>%PxeBfF22>Fm$?{pen8@~W)e1Dt@nRd0n)7SYyJ-FKq|XntLt8&54d=qN`Y zoL$=b@tAv!9HYDFeBpc7np|0!f09b_=moYT8Eof!+|Md5Xw0tA7$dJ2wxq=rjFNS9 ztKne-892-A7RjQI@kcl}f+~K@RaR9?4xcWN$NhoOjT2DD)y0UwP^jSqNmE!K#)G$gafTS8kZW>|{CJ@_h8Vx|z&2 zoxJz~uJ~v-%ggBrS9wI@KYJHs2GsJ}?erRXm~m~6#ADE^b9YEh(Ri~!pZb8NZ%nk+ zMptM&A%+*Lz^_;Iva679_%>5w(uRQ%=9!-JraY-y^@P`poPifj{M?nc_NUl*!mWC2 zfqp@jsK@uatI8jt9^8Cicr^@=5p!xEK3xh(>I^^hc)D}*IAY}`wnd_M7u&ATV|+ZZ zW#E+iVq;7CUFLIQ9IxT%X(kL0(&pB#YGD+?(%&l$#CQj3hLD1Mpg^-zJ6(sAt~iY2 zPCz{Z^D1czUf$J_`gvnR91cy=A46pskalswk=jibS3~)0^cI8kCjqkxg~1_-#hRuD zdwM&Kv0+=y(2&|ep2Yc61-W6Bd@`cBO66!InsEzy569lPU?8j0l?Okh`LuFrrZS}N z&nSAQvrfwE$>yy(>)R#7F`DYx)LYv%+kbq^b2pWt(MHYyIs7xu%>gjh48UhXiaU*R zwEa`3dyJQojgOWsb-YJ81)Fa4p16{a^BNE7JX(?oAm|~^fI0BDTOCx!v!L0gFxvLK zPAd~=L-@ki?338>{>Gi+OiQji7mh#-rb9Ifr86`ZT%U(5JaCn^Hg~oY!dT^3cV zgMd2KiP28ei)#D99~6qNu&>;ixtCGjgud$G>2o^Kv{uPw+b&1O%UAg}oHWwtcYJ;4 z%45%GvsYr5Qi={=ZPhQ=eoq$fem1`)+`ur5dXbX6Tx3Em36J8*Vk?XqDn?3 zqignP^~IxpcY=H*xRb 
zhh^%9N}-)S_|g_QTfF03{Nt$+5l+^sUHs>ja?KMX0zzOUaJqwq>+-0d^r5Z zYr!so%XgNVCoiMH{l%)ETb{c!|H#uPqd`eZ^PTtabFLj67dTo8dTK_^0JKNCHV^)s zALfWvt(-&O=(h6+V)FmG;k`!C9g0}T4sa}e*z&TVEgK1=49u8In=56JaoRqezTi{H zf!2f-^;?r*MT9B={wmw{H9n{E7zi>6=wbWO@MKx_ZuXGot@H?n&SU&+cC!HKUrtWw zmz5ho6tKo=y({Ht6_UKEFj&;EmJ4MtHF@I!W{dR~B<%6>#~wrmA+0jT1I)K~<+Yl) zj(4oL^%{Iht4N&XKe-<*&Gz3|L}y07T8~^oa)AV~@Rm0ulcGK{zf3<{(=R4LP7XD0 zX2;J=39*&(W*8Pct`7^$r8a0XCY772R7iH99hRDq|0>9r0*f3`?3 zmLH$&YyD#{DRC0%?{el9b7II(41_GheB*eACl;oBs1+dYVcw8xXpy_Isw`|r>>}uK zdb@@5q_{_`3VUIOr$jtK64R*uVVT{^U=7{W|0sH#9BI2uNU4uF1o_^(DjZ0FI)($I ztutCYbNolI!jx6}t=md4_HHj53xOGuBK?;pUAz2bvhB3Nw^G!g=r*Sw-ZpF-mWlv%`0Lzm>at6|5qE|W zfxW3OU7d|eKr|G+O3sC>h1hg1o_2!blYXyDV0|1CXU-GER*?4^e!_MOPLlyoRcAC@ zliR)tx8n`uSC_HWfPf6P4v(*YuSsxS{oYE4uizoTjW?>>!a zd;fPWnRkRcf@E76m#fxhhlfIAN(g8^r`JWh&U$U(RGuzHV$)|`UGw~T1JJqb`RZC2 z)5>rBQ%>6z7|eHc-;AK|)9j$x?;Bl4fJxZ?;(T!Mvno9%zuc>}0)Iu2Q=q&z(-t0D zv0sgpdSP(kp)i|K_FUIzAdD>QkEmR#w>24{;e)r5fbBPhi-AN8rvFK-osnE+uzrj7 z*JHy$aYB-ZdxcvT9DZO4NxcjXZUIGz``j+6ymgF?)^N??{pxxAvO*>COjUc%*)2V3 z`8b4lrn&Ez@%-yC(M0AA}AJz~&xtGv%off`A zXcysLf6PBaN+)xvI3`1wg2nIjeb%>=1UcGt(N>Xw9_N?KEReifX~q&o8c0CO{sz2k;=RO%4PF{GxL9_bzAw9@& z<&I)<{7&8Ro0IGzLC~3CG_N1?v6>?l9c*6QnO44g;Ni~8u$MHfXol(Q5*$CDf$!sP z58np!w9}>3ug^0b(#x&Min@Lc-fx!SQkvb+Ch+=LuT4eVP=1|&AD8pV##a>M)d)TY zlGWJWmUQw-4CS(hl99kueud%D`nWK9&z8pOy(ag&!IFCuRhbXV9EbB|RVwoMXa>3} zOI25YFdV}7!G(G%hC%ya&bMAr4BV$u8;4 zSE|l+{<>SK46VD|Nl`h7dQa4@D}w}gpA%5bB!nUvtU5h!eVeNw!$>D>Hx(`4&#c#s z7javwpL_{zOZHJegscqVSZ#0Lk?EW5;10|l8t`!e)WE!OjQ_?9-)#RQO1hxkB>Do> zb=fRS?7ZQSOAlGVBjmN5Uxe0ps#o^KhkyIxH_@q(^O7 z+`RaN){e%Fj#BACzX6k{Rx!k3d<;v`Xake9KKE^0lF;<9)s(Mq0A!PsRYo<{4faY5h<>Xe<3G&iR=FWHsoc_z8l&O>7rTYhH!-u8 zYV4ix^VBFbN~M+7K_8&pn$@C&WsBIXq@=8FI2X~ycACS!aVk6gxZ6Ec?DQ=H4>6X z!=@$yeS1~3Nj1ae|Ce&1>S z$^?=6@L0)f$KJd8enj_e(7MRY>RGl+G2| zLJxY#(fDe5?VG4HAN^_NBxhZE7Vsg}L1cK%^1>GzIAgc5_Sb7}*9=FN`E>s9f6BBE z_XaAKue*1tKa7j)5&&4(Uk;72fWwX9ADhD~zUaN;yPdqz%1g3=lz$*}kG%K93hE7l 
zxsod{a)c3Kfh4Xy#CuYy)QDp3fc8gX${tFHq`95dTjb-`>_>#y&a8IMOlZ96k=A8 zz{>Zc>U*;8$}Pe#D`or5czSn3sOgTrc)|j{LeQK3lEV@3##(h|6C?ZZv@74U0f-$~dAI@q4ZX$zk2>qreS3wStO#bVf@ zK~c89&@Z<>cR+_bVC9x=Ts_(iw5NUVv|Xm%S!u01=`^~2YuN(FxNCcL7i+PLCICh0xtN~tlG2gnX%}&e1cV@D$zwNh=O(cy^DnFd7B|8 z>)_1l9zg8xfkmrZpj9XQ^KT_fFvD8l;dcEr{O69k^@Z8hdE%Tb$oS~So8G9?_7GKl z<1?m`_0O%tGn?)xcXN7G)FOa|_Mo+8Cnt%Tl2Eq$2cdUlf-Ohh=S$>!(!VrXY zz20sFo8Mri+Lm*|eNzhiG%M00*-JqEO=B3)NUy{#zAND(paogij8YY&Q=RFlf~bkK z1&|H&N4S|7Efq9x6T6hR=_RBkyHOvyxCTtkiB6ULO3e!XTjM?2vK$H!65#Ub3Xl6( zD6@BJ@uO3hHoF7eq)OC>^te|MQpI$e`FW34jrvqFsR3lCGP#w-^?8SOMkJ5fmu_p* z2+-qWRtr>iz(B-TH!>8SWo5BkOn-`C>e}{^vI=DU(YLv{>Hxaiwo+0+twQF+`PPbDI zKMz;Iien&3S0KqtGfYe|sIThtSDRe`vdE#YHP;?G&%7p+%q4g;q%bh7`s0q*Q>_u< z0zTrO2a=3I!S3A}68NEMaQ5BR@4dlLTkhSn6W-GAK0OH<2Ru+04egLI`aA~?dTAJo z7Kn7JeihwZH9DPxQq85H*79*9OQO}W&gU@PSC#lr)17~;$4C1^sjvSy*qz=%NwU!X z;>!KCzWLdUmuTWnJVDbt50m4NfJYA*7pQVIwE?ukQ#QQJZcZ#=2gXcD0WxpLGnS>` zLq-g!FjOAVKGLCrD*VciVR~~>p>q^hM~|&X7F4u+Zz6KPa#F7XF4~zwIy8MwZ%fx6 z@kIq>rHEQ~DC$m`Ky>-Bclv$IJj8^F89Zd`#R2lqf;G9&NtUxu*ZT>H%=0Qz{94H2 znp7Asp$$L+l^74{Z}@}lU!%JmFr>YY2>*RAA^3;MUp~d}LL;h3A9aUoe9?^3{hy)J zDgLp$!{K*1{nZHiUf3N*$4!u4l1~&6c{5w$>9VohA`-fCgQT6}K(0*i)6WYjSqcKr zxy%-htM#2!>%L`eDWRv`e;0!F0&BGrV3y{#*YNajt^|iZ`?L4Iksc{!6DaxU`>rF? 
zot$T6AXUTuW#YA`D>N}7(ZKByo&wVOdc#`_7>*8G_;VDK;>M`H_28os3MgzqX*d(TpXa^b@0`!S zP=BzmYpuEF9CM5@7fta7#qQ<%sG22f%NX6>(l>^-4NDi{r_>6*MQD`F-`)G$b7B3N z-*Fd)XrdOzUO1G|NL1&VI;@VbJTkOG7gzV7`|a5hZkXOE(C^Oh^>UexEq-|VEz=Nx zpl<~I(Kle)gYEQciu#>P|IX<2(~=RVgCMnnf9KTxvMun9LDZ+bOUTxdlW*#^H7MF_g(A3py^d=g_Bv_Es9SLF(w=w9F&4DIIdyW zy=eFFq!mLD(9kr^d00H$bO|2m?=QI|xZA7jYb zZ^dV)(>ikfkEuZcCa5wUvz2~*woZJiLNA!`ivCKmX25YKn^wUvc})xpQC}F1*`=!q ziBS&S<680qW${9O;cQ{|GF>DxCF{NR`9I5tPCNgc#O&;x44drbeiIV580=x!P~v@p zr1Md`saLw#yWDr*&c5&+bGTug7tKAkYm+rpdUeNdH#Rox7HXLaF8*1bKLa<6uIw{UYe4`NGqzcm=^Rv%}n&k$jA zZktbgtK^fo{Z5MPh<;65tl69O(Brbd(K-t$3WeO*T(RBxIh7z_>A6~RsCLCi;-yiz zT(GHuW&`W%2&=IY%Z+a?90%lYWc{~^rLkmW=rnTsx5i4%YHMp}su0Kcr;p_i0SzN} z!hidnV)RsvTT>_%Sv-NwalLolOyyRYX_uW@_hbT3+Q^5yO}8HG>G5J}JzCwi?|*DTJd54jef^W^iordb)?Z=dmp&i|t;4`u)tu z)F(f);^onIJl!jrgS}2Vd|aIuoeVf6HVe-Je@pin?wuzkdC?HJomwKzpRG zk8sSf{oQ(Pzv=h>OznynS8;F-Mg|=8tDUUtU!5f-C2d=YjErPeN)oc0Z|kx9|9Kw! zV8NFdbaH#7Mp~WcL4p2KH)n{;uggId^jtrq!`)6`IagmS!=OXFZLi@Rqwg|{t-IapXOV)5+@nS6~pe;Kbco^NTg(jE&7tD&L6 za^#a-7!46oiOozc7V&j4_g#WatEsB;2iI4}%U3sMo5HDuiO9%C>by=;?zl0wrieJM zz&kAUq}q(?ArOe6yeD)Lo>p-S)$iZGf7bpk@wWBKbj?9F9S*@d{^qE2hO=w=)ZjS*5#t!vzR?yX@oem``p9CG39 zAv4%JImy#Kx%p(O%CT>xHI!=Sd+RyW!TG&J9u2Xb&_R@TNWaeY@RB?|W686RU zd54o@4_PYVz4 z8OT=dh~*Cdu8r*TdpqI$y?>#`pHMRx^IfBxYlYv->;;E?_P@YpHR!p?3B*vUlsWL?%XG9prif!)19hp1B+ zR_lkTOnkQ9+gz|5{wS5_ zKHGTl!!iUJHSaUUkx#mXI+Z>|Icj+qaY)5H_RAE)Y5ly>%d4!)DM**C#fcF=h^2gZ z@!{cN#XkKS*K&x{YqO=)ILy<;tiLtOo@afpTou3YJ%-cU_i~T%SgR{0OM)Saho8Uf zUNkLafIojEXkmhQOJF;ay8KmrIj(lIIvdH`84-mu8+`yxw6!N3msKoRhbdfKHoa=( z0%gnGcE+T97i&B{##>r^(vNE>urC~%^%z+WuN|l(x3;#J7#Z`PaCtrE65!HqK7)fH_Y~4Ud?>P+*>l-)VrJRHy1=`)7hN~U z$;x`rN>}LmYyS-@k^YJg{S|a;#&Ce3)ZrQTSj0%ah6LA7V@I>f3!0cuL-ju(&Y8m% zR^B3rFGvwwkUB)bSF%GGU9@g?SWIqiU*j9%{(;haxb+KmyY3XU;TkF`Y}XcJ5u6Z!7W`4?2SH4!F zXjQU4T_eDMxYtTxL#tf$EO4qN)RM)$rd*{7Piv#25+tZ!;59-FFR)j2h=KdDUyd(xnL+1pj=(m39Wq^?*E?JT#}M zpqv`1#AK$6ysHmVyeL0kNk;g$qfwuDGm*Uo5m64_b|>CYSy#s?y0U$Iv-#2EBW2K^sKDwDdCJ~ 
z&+ra^w$@HlAtxkEd;gw(s)`=-iJp1-Ph1kblR-Zuc;a8yOm(H!{qE}mVUckTO&+-q zU2<$1$i0^@F~3%RZS6>m7EE}_R(yG)5E7S?ob0ynHNs0=PtPG|FGC?55LHA{(#M>f zfbfWj2&g5k3hwzFddSfhm~;9$r4UB#o$9r}ZNBlX0m`H0G;$A)qoDQpg@)n&exDdO z7nfu)m;S-Q!JZ!ce$Q|9XK`?FuGetu*M?9F^Fpxm^Q-CU=~X#Op>?Gckyz=M-lgiR zv>mKwkJM!?=eYE2!ucX-{=4ntXwJl_C(bFJYA`}^G~cSHpQegWEn;9s4r+zO)+l@F&a$j$wZMnm4_f__?1T&#(&50+oO|1+vi-OPJug@08cRV2ww#Rf->R=X0ug!;3 zS27C-()eLd%AMl&-djtprkE*6TU2aN6h478~9=J(l-TM97fqlkGHy$LrN8;u3$n|Cuhq`NNT&*WDoVb+c zR(YA4Q^Je^>+-JXu7ls4nX1{C+1d5$4|d}_7KZY(YY@dXB_)qXUDma(>q z87?dw`110|A77uG;(KMajoPC}$JuvdSf=mZy}P?UH)m{O(o7h*GhS|4sq?e(4bG3X zH8C+UBog`H!2^JCrk!!O?B*4Xjg5D2``p?o)35U^NK5k`#l^+NqY*(}zl+xWahjB` zqR>Y310O#6NQXIku&jGt%@-AgfRf!CWW*?~|0Kywr;m(b!7VYj$YWWEkGYbL{Wcwu zot0Ht(9VHGcz+oE5bx&YHF(aO3}^LwL4Wp0y+S|F-6PYzI`KPl6OBo+=DlR19in| zzAfTWhWy2g7r(Er&eVCqS^7>$NSKh208l{n$&+S1t9RK-$v!b&yBpruHS+&PnKYd1 z3y)TFO@X^ffn2y@Q2hm<4oC!~eXOf)L87`_50@nOYATIfKwr zKD?j-z-f2feoNNHa_i%Zk(>>?z^T(?5aqLFDzrCzE(zPOKWR( zbtDUegoM7lynLe~^I3F_?X0HRQZ!M}PMQpBmXfZ%COOCc4Ot>gjN7r;C<+w=^*X@r zPfE^l{7)G!$j^_Bi0CiNeWaIC)PG}h56P|n82XICs`6XmtvB9E;<|{vfSzl0C{d+h(uo5YZQ|Ni~!*RMm#I6m~pB!AzjMy8AP z;mW~;jhj$X|KZ!f?2S4vmUP6z*Glj6OV>zi39broA1n$hZ^!E2vnolI@c+KNy>|^#Zo?iFF0bq!wH#mtdVib3OB5*GF6cxoa2RomFh(}z^(ij6k{bPC&MPME-#allYg;aQo4V?<7W3P)B>BSsy8Q3pFR~45lI$x zriJFVw${@zcCwB#P34PmE8bwoJC^%-$5HghSO2^Q!%+Lsvs(8(5$o}+%=@fmHZxVD zxbtwel+F+CRlUzvg0b3P44ZWNv>Sp|9PG-qu7Wyi*%_!O(4?6|9yBsSL!)f){T;J^?RGF3_v6Qp zA$_PldD4Lw!o62%zqG%<&r1hg6CWR6F+CHQ1~2gG-@r()NJ!8<@q1?j=RtIyft%ZA zw8*d%=jnCj6rCxTC|)S-&5nI#=Bm27Q!Cvab!7j+0oY+-`CcbS$insY8aHiQA>qsq zPhVoxzu3-xZHoxLbEo3)&|c76Qe~}n$2r*9&TbPLkG3Z2 zGCplwVj>^=8|m4&TgF^EUu?IR`=BUlIMZfg;aTr)%)*h6IW)U2O>|n|R^I+xMd8tG zCW(7QhWCiyzi%Jdto`%&NWtTv+s-NF*L-{G{g{xG@7Y<+Et&sCT6M?!>{XPV^Yc)M zPB{d@bj1$`JL?iIoAW;=DojjF*forRHd_v6zJ6UxQ2dV9Bn=+YR!01M{21;b+!8?Fb+x>JXZZwiQJ zFr@SI&&tgChQH>`O;#FLJKBG7bFM3XIoFR8<>4uU1kbYqN%b3*J9LIsRaM5VVMBw1 z^N0572{WDtKY%z{ezvi92%GFJS$`aqnvkINOiWK*JqC#s^2etG-Z-R1Sx(Lb 
zU=|bv>6`s|I7I8%rKUGPZCc0G~mior1=# z><=F%Ydu88#c4wkfHALLy&BIWv;xgC13rNov#wJ+-_BbomEk7T8I_flett0A_h@f- z_q`$~2Zxc7(YbTyYCR6J3JYz_&D-%rBxxI)noOQO6LDDX{iIuszI)ngLx7u`o41+8 za_NS^0qn%;$HvB%E`kb%emgcmFDMwB%WGz~ghHWM?#H~p@4HFO|D52fArxx@n{8Vw zYwL)sQQELRHLkxt78LZ!F9uZ$ca4*B^J_9+a=yk!q7?~hh+jHfWalZ zG#A$v*4Ix`PY(UDuGb##Z7s#e#|ya-V0BwN5vPmTFX9rxJ|{sO@E&e1+*jid!ZW); zyx0;-#a0ISHe<3SgaVM!8zVX~1P>k(tstxTMvHlxVv1WXrvp=Gg5xVsnP_oC-8t+E zx;o+~jZGSlhU!57fPLaKbXP}vdkT-?Lt9%-$@*DA8>7~cr;AHFZjskR?-AQL#yiP5Jw`jZgunG?TC$vyc#Z@+$@zKgq^EKFRY%<^$Ku z1R?oD4nZO?jQfnw6<J1@uTtGsJcTdpS0HH* zO1>+7BmL;IQdsMg$ToWP$Ts7y=gLi=af>#r<$8HQ5?-J zEsVOOoBf&h$7Zq$_3MCHBd4M1a+h5VjBx;%-zHXii8I&N_q-b2W(bXh$3E(#AP>(M zhwnsasl}HeJZ7dl`d&wlP)tIqm{NOruJu6-Eq2DAzhnGc?)+80*QQLSHa0ekHyn68 z4-tZbf|@S__jWcm;skA`Arp@ZK#(X}HonH{Y=tAoOH6R=np-Xfr-(Xpi4pU5e)M+BD4ktdx%sKE=qm^dO?+btxYw>-%X$#+ zllMBo$gw{_W8>RdhgiaOIMEqgkkDF{h+jRHehm%(jh#uEL!mPJrS4l> z1!y)EZnnFFjlj+KwHtQdv9`7jTXx2FbaZsm-%hrD+|cr9x08}w)Go}zC@-8T0mq>K z!`&Qi>TAT&DJgoMp5iN%V=Y)oiHRL*+v#$_w!FrbupMhj=QxC@qns-JWfWtzn5F}t zN~hwR+wmgT4t>c>2=yuP0jlIO-)VECqB2a!MDf3!*jB;^VyKZ zDc+oI@_#=5{sBQZw7M166QnI~)JxjtG&W{|f`wYvz{5Tr17I1K;?++26g7{*nIvp^ ze>H>SvuDqCl7ENP(B$9up%MLFQ`7IARHv;@OiTVq8{#X{M9RP;nf{UbxWhwcY$mOqky1 zKW{Iq`o)--l)T2D6B@jU@$vNf*%m-?O|@{8!vg{&S8Pq=nD_CQ8R&068$zL8-#v$o z{X@a@EsL(ME z0`BqNlv;$!Q4KuFBL9oN^oz&F6tFLNaKya5z1#RyZ)9aRat`; zMkcBGvI)V7r9~)$7={1Jp;?rhTjObEe#MTB$G$_XB(n1PYiXH`zE1#!W4W1PxTc`} zPs@7P0kk03bZ1Y+n#XwnVP(Am1U1AQ8PUxncf@IhY|ki#l{p1T8o$!JsO8F!#bfL6 zvKN4FS==GUA4*M5E_7@3qlgN8Gpk#vSg`caE}9V0{96&MT%Yyf4G8$h+WiJ*s2N7^j} z<>#}z2kN$mD4JPE654Ros{uXQ{raw=qCrdi-1{)vJ71?IQiWbJigRgy4xx~02@w2x zA*L;dQh|RGH)Rt)Gj4E;A5kG4FzDv3r5^u<67~pX*$(;c zfQs~V#>4oUsi2$7gSo^C%$ByN0rked|GC3GF5-=?O9XOZ?FLxrQ}AjVy4A1yX!n9j)s zKCYOlDh<=7z0{p-xMng!)_wVc?|mlbgJq$|f_!`jyBI)M_8^}%nUs3#g__h!mHvsmyw(qxWip{cuv>sd{bnifTiE1D-UVehXA0! 
zmdck3AT|uFPSwECVy0_hzpBuvmX`E(U;C1#4;0^w-^y>d#NSYgOTF~XoaYm;;J|;> z(?aswbr9oCq5TyAIFo zQG>d%+4&PbIl$+kq9Qbic*fswmzlp*Rfz}-{}?OvKhl(4JDW#jgnvuk`5fJBe9|hT7l;l7C~!);OkNPFu&*_bXNLclJdN-|Z! z{f_FI)e6ZqoB6A5X|xe^w>zQ6+P*O6c4*oD_3I_&+~I?}ccqr34s(M-sRX(7Ym;hP z!y+%UV%@y9bh~?Fy)y2K`6*(4;t!-7(_<5NeY5BnQ%UPC$>iq`?|C=1bKH3L2=n{b zuO9>1_>}jpt$zRjcev)@=-87a?5j@7X=z|Uzs1}6m&SQd zPK_(cOHwS*N086^7*WF+3i!)+##I6sOR+3KO2jMl4N{PkqaY{eL5K1{K(*YvPEJkq z3E;j$`YoQTJ0I>wPEJk=osg_M{TNT99`!hTNI}WrxkUHvOd4Ei%J$;riEL!3v}EpW zi@_@o-65k+RM>F8h)y~Q+Cc>pH~W8(7cttKQ{CNoX3fA< z+tTu)Lz|?Q(!fMxg&U%pU- zbth3M-48n%q(92{nckk6uKYrYK z-b>FP3s@Un$f%HbWnHc#I7?MS10bJalV4hciH&S2`fAT>xzYo@I?oyqG30(CWU?RO zp#_U6FkG8LJCJ|A^=AeODmZ9(0WM;@aD`{7@I{CPbU^B#blQ~)y8?DzV-cfR?Y$99 z`5}nl8I0*qT0V3rk$gV`149nSGflBa{0VHzQB8@0HY&WDe%__Z(WUkA6XllKg@q?8 zL+lS9c9k?&x2>Xr0t7#Lg2e(I!SoC8{30t6K0ZD*jc6+Md37UI78KRxa+?|PYeMc1 z)h{TKm~Nv0QfOl2#n>)mu%;<6rk5)s5hDbD=!RWAOo~|LOn0&y^R;!$k?yPHJk>=T zEdCkS(}4N*Gq==^#(=^pMVG!SS)O+RU1PU^8p;>E$2Og%(;jRAD2(oAig$6ZRc7dc-qHY|T7uo)Fi=&rDtY0Lnk z`H@yPsL}v#dIIR|`QWTj2R&(N7^cRULU&3NM3@<5C6ia|SH5sB*jY&l@16XCrVf%K zV|ZXdX3VZq$27q6CC1MnADjuqkd#jk>ooYRoZZ1P=Ecl=J>l{ijR zE&^c4ewf%o*p>JM96teq3C%!eH5o$-ey?K)T5PAElYwWhs2jb?$zcS7 z{@Oh~E2t;8I|S#IyWg8PX^chgTZ>%~)#Ib10hdn`aDVela2v@%C^;`PWO|EI7-0Mn7 z2s8={L$Mo>cgo1-g{>3W(9r1Ov1ZHo#|R6vNHm6KUG1X!^o{qE5s1$iV`)Xj51A$k z6cl<=Qn?J3I#~ttL7p|73`%P$5fKyw8=D}ZnwXGWy}CDCkhrk8_(X)r`xPA()y}@- zgH-Y4N1xxhCpXTjst%m&&XN3gwSl0=qZRuC8Qsjx3^T=`)XU2DGMJQeW1$D>hy8T~ z5_!MfMQh)=hMz$*geC@hbDasoA=elwiHYYH76hKP5!}|4mA#OF#lph!;Zt_Dh75}8 zV%YCs7cY`(BHobAzQMs~)6Oh`h$d4|>rmb0EhPpGK0wBm_Jqplq!p^DU#UazTqDE| zD$rpQxwp15ln<<;nMEqoRg8Qt7-Y&ph?mp*aooXOw>DE(x4Vi`;aGE6u(}R6a49R( zLDzE=rodu={f=d0fg-kEHFwo^4N`~vQ)q<4Wz>j4rF`V*zOJ%$H*){@Hx*@E)MIoc zp^M#FwWr$}j6oWODF=H?!f)Pi%gS`!udeQRVAYLcVxo-DQ{hk=gIIHL*gqskcgt{xjd*3u;Bx+!^AezQYP_5vjpJ}`Dy(sk6u0`ZmGo17i0MK0PjYSAT#k}kXlyj>m`S8gRkJyQhR4-NLbiyUS9O!4#@!YOupmo3H}5l09vN7*bb=KcLC0tnwroK&vphp19rCa9`o$-a==A(U&1qHY7ZXZ 
zS|w44H0-Uvy21ekpGj_(i58PPu5v{)P!;2rin)|SmZT(?6~&qt2c5dP%N}>#vyRO# zm3wb~l+8z>vh!oKl)7vv(Uhwpk0(CWCL)mhy z|I6e^;E=_0yYjz0Ez@D{j2T$mpDe=_}p4$0T_1ZrNI&xK&>tgZdzc! zfm`ae!;n2?a1`2rG8{Ju94cVEkO!3H1yfuVkAoD&Me=JZ~vV_)SB_@FH*TOnqS%*C@ zJI7D{=1g6)LevkCF-8qW$vn!xH_XmQy)y;>1D{}a89?k zw=W;PXZeOf4C(%sWB6+qcyzw_3;7=Zjr{j-jdk_gtM!{sIbP(5=$(;ipg=jaJ906vn>7+je>uR6DXp0b2|W3T2ug%6P_5hY1?*wa9#I3YAF1g zbSSJ^)JMoTYke85=;q+&uE?|U*nvK}*&hz?9A3>szL*mEZ3wP04aIACX9G*Lu1viN zZ$_c6wPS5@y$HYnO^WGjZEYPNukt+F2Xz#E&m3spF5e`XzOm>KFUDU`Sn4kY(~(WO zfzwklLU#b{N`Tu+C0KaYC;cp3U0q8I(Q|X!WzW4ci0OV^wGCm{`1HC~+IKAaG9L9P zI4BSZAu5TmiUMy(FIMJI_3%J(76u5SK!# zYe&#Cp$mQ338TbNRJVAdCyET43bZS3-nfxQib@^sZfmPoSj>D;KR9^J-)Z4%H1wju zCp>2tT8f{Uni@S-R_uuP%Q#GC=eqDkG|dg4}+_S?6@u=A#w9&-uw4w##Ob>zvfw= zsq=qbuScG^zl$*j&GgeJuC1_6MrIZkDdZdJG)ej#)$FjUDpw3oz}YGmCMIJ81C`Hd z434r_CS>pXZ&+AZJlE{F5?fEqcH^R>NdU_Sn_lWHt>=jlj0(xiyID;XDt<_f1#7M>#_&>Q5xkPn zP-{w739(`9cK3g=2VH4qKMoVTc4HGpAu0I@@I$Hsf`etgG1lP7;!Ql84)sBx!j-7! zC2rj!>_acxA~P?_^f5?_r-t8@QCy^!R6^lXv)*-Z;8*$pykE4_H#y0EA@Bc_WbxNSZ4L+z-wZBfv}s#S(4!)A+i`ozE1Y#4Z>_< z@u1X|0PIZvbmH(Jrv3Qyt-<{=3qd`r@@iNA6ie7<%n*0Rrx%{Hjq2?`5gp9T?ze9h zzZv+wHdg!6*w^>+omlQvUgMVK{-21;moB9{DPByz zmpWg@Wor(unG4`#9WOR+1p~!mcQOvc&i1zZ$#D%((xs)P1qB5_N%K?ni-TDRL~*5f z-c0Bnfm41)=VD}#qS~nbTZ$uB@;`Q`Kg)Q<+1c5_?Zb2=g9Xjsjn+VJZQ4agSG&5M z3}-7lw_n#FwwhXTS`6v=Y;7MMz0ex2FYrvUCzQ&J#4SVfDLOeS>h+1{K&hFs6)!)( z4w&v?Q6eLWx2PlP>gpuvg+P%>5)&15J9R^SwszZ`ZwEcR_Znnb9@X&W@853~qMuc| zY+WBbf{qC2Q#VjkK0G1&->V03qX||T(&(uQveJ$7DEY6hKV^n|(})!JFnuw-2HJus z;Dwf^eNJtqq&urj>>U+FJWnh8rJg>`Eo&cw?h5viot+)1I)M3L7Y2@92z+Mu8Nn00 zibg~HhCplzn)jm`8qz>J1vU^^v0CplQzuLXXoOH|!Ey+RWNUM? 
z=&`>333#9Yxg~*$35px60>mC1uTrq>fH;jdmX(!-mN|?@^fvg9K8_3woH_;ppB3w= zs+NMboR^n3Jw5%vm6nzk;v3|7-=T@oQE)9w9sZ&vlK%I;gM;U?nXKV;rbDMjA+na;d`hS&@+g0I^Or+66nPT{X~Ef%hI_a{9|lpta{< zOHY+}hh-IT@Cc>H8g?cM;{ViDQ854~d404S=hHw*Bm{7UVUreC^TB9w2J!WeTpWk0 zMqt4?+85D0{Hc%e)7#&h_;^d$lhc!u=;^<`Nu0=>@zj2w!xo#W z@4`2~Wg_nRK&NIrn#AUATUS@X7rCXSrQo_!wVQ1n>~PBZr#%f!O&5YS^z?oY7qn(8 zP1(MD3dT@u?3*W!HwiYFm?W&m9Zt`8fZcaA35NL=m9CKG`31|PyMN*UoC2pkMl51= zbtE`EJpU(0>?<`?_4p%I)%y>lpRqNZV4qq`zh(9uh3!oYf)9H4EPSC zlnV%L>hQnU^!VEwa6D~n{oEz`{U7kP$AwW>9iZmRYCS&cCLzb+{AT-2Zj2?AC|1IL z5gm7{LsfM>Hzy}Xd@zaCwM;a-v!Nf1HXEdzeqj3HWMUHas_ly5WEW0yz)F8)>v%`N zW8d%aP?A=;-wb@AzM@WLtbbM&TlkHa1f^QrtX<_$G7Z>H)7?SHU^T@$63=Sy*eH2@ z@d#IKXdzSVXB2(jt96eSw>8aLuvb+T{l||RKo<^8#0AWG`X5ibyeA=!s+^sdM+!TBPT|PmOOuhTZ-U_$Z8LN; zw%iiG#$(moYVvIpVaz-AzNcLsETEP$G08l97}`4b#CZ(l2!X=?+q!zd-Su%5pq$UO7XJQSOGfIH_Hs@>vo2}P2IC24ROpj|IY8}MI^Eg> zAIRbQNMY)`caI}zfVW51)PTZu@tpT-xyutvc9oBj14@4n^Pi?4FE>b#Z1W2O zW5cSfSM6Y4ow>lRQTVg0Xqm&6tU`#yC0tN~ZF#835XZw~N5FdggU;zpK%RERPv{hk zjE#X`9EF|-Xn&xgS5{Za#tb3xf{O!8g)T2KErQC;srSX~7dyYaoa53%3K`U&F(3MW zdy7kc{+u4jvPK*qs?D|d>AX1b$Diu#6j8f}yl7}h=y4#jzpJK*Y@V63HZ+WgiMh_r z-9uQp4H&kF=#seGZz~8U@cz);ape~h5|WVc0wo*#&5%ez0RZxS9UiL6dHscr%?VIU z8ijfSR-*(?S5s3ZylSD>LFDzGRt#hPix=O3)BoOoGwrD@_x1eW?0*Nd2#l@6vH=wl z(NK-s0dyVN$})5k;^VoI<_joM{%z}%W4<~KB&eLAIMa}kWoKqSQBnB@-vo3qAWM+T z1i&H*S^v|gPXy~gZk*0WuFj4I1qA{65V(DtR={Eq{7Li^kgvfaL5-xO6^HeM)W*b| z_)J}09WtEOXDf3U0t3E&RX=x-(xBX9TJ@t}H+x>-5 zzxk#&8OMm3SzCv52Sn+=shIdDY{^JN7M+`4TIx@iiwx?q;NqHw-SOkQeBpw__x_8C zlmZrIKrpzt>;dyFVBRkaF7e}!@854s*wl3;i;`s}CnZ@~TH;_yfUOTEDHlJ3-9u{VQI-g>i6daHDLlTB-ey zk9@~h`FjnQg5}q(x@S14xR(8&K1n++1<=|nRjPCDBVxp2 zeu0FOL+fE24-X%owvXa75BHQLF_$0p-V0*174Id}X9O6TBtGhU#r;7D|G0z)DKWKV zvH!Pk8cO=`;77|B)9$}kpti)bT>55L=`OA>?{|5oqk}l+&-jjR&A!2rYel|>b8ZgX z;!>q=#=jMWkT+Ny5-m8Cq@IO}G9z;0Rh}%r8W#&{k*eSM^?pUd(1<@s2={F!DsVJON zep$BQ)Ki&t0N8Z!L}m-VXGVxCWCt>P>T4q#Wd>CLQczCvu5AZQl%Pduwn6!5Jtvt>5&WunuhYWh*}afQv`x|Lqlza6VSu{Bhya`^k8~M@&aQM`~z?j5|Sawd%?h*QEWODBMb*KBWO18E|bGCNH 
zdG==7e5J#p&Y$*|c-HEg49$*&c3rU#6%soyKQ{(W$x(v04p#%qN3Hnit7^c^74YZi8$IX z2d5IOD-wAK?H@Qyg3gT2udYtyszMHbs%P}M$f5hK{(4)5zBzvzh9b~tjejPGZqqm|_kJa0x)gxY?#=FD)}0b* z4PW1f$)Qxt@132Bk`kZWz|DP|gIxt&GF@&t0yXKWwsv(@)lU%S03U!`(QSYGMhNB9 z4(uX_&X*0S3CpM%FoOVg0^a1c7HB81oK$qh!BE&DX;IZsszGc~I9%T0)~&ksiL1bZ z)cpn#HRSOgHS@0{MNL$jDPTMLF^{cv z!Os4Tw4vw1LQu!jOP3R#^+umYX+zAFN@|MukhWf4*!g`&6!_2|K1iSuM@rMHu$h|* zC|yJ)YOB1pv8m%_n9VO-_k2`lqg7r~X=~Wgn|5^Uf>V=&Xo_SQ1$!<`%t16mD8o+Z zLE!?uPy4f#3z#97mzTkm^byFH<{*;OQ4xOge&7jaM{TB&eIp|p&@s4f=cK0_fq5Xl zdKqj?c^ZY&;K7aGwzIb{#v%0sWDLgue(?U@9#|C=GqwKEjLy*!U~{mi#evvCE1SKK zKjlWw-8%MWvQ23^49(@zJe_ZLgN=I7Z8LcEI$#V+|&=^zQ*lj1Gd$&ZHxT!4E83&|a2- zB0e)ix9a?3T)?awrp~~P8Cn3T21aOY!LAsTpO@EI&oB+cE-a+<=p=A(yGiIs|1se@ z4}Du%aa|iN2LGwy7)-N~g@MmvYg7*8CBXc%O#pRusvK@8Y$vJ#lR+>FMhj6q3SJa! zsMu{Cb#SttjsyXYgHhY!IS_ZiUK@7hbe0)9cTm(AM?vxht}Fp=K~SWorY^dE;|7-K zG}vS4ic4U;0bB*>TXT7!ZksvAk~UB{@xdKkPErin{|`tw~g;7>Vzi7$1kiNFbfFWS?As?FXO+2%M%+Jd9Z|2FGNDTUgy)?T37A9r{{i1 zMjUDCd)ax(cDMOO8X1WVq`?4ZcXu(AsIc%;Rn;wUyzrdW^iE5gg{%#8z%XCpw7vW& zpaEXEcTVAqo15FRGxQ<`bJTsiLy!igIQjYcfs8Af`vbNikii5%5!otr`^6*z&I25r zsC(t_PF;fQHw|Dsz}A?9?dHwRy~Dx5I<7t7N5KuKLr;JNLK1dfI?Waq9@yNu0Iti+ zr!W{KEZ7QmN=-n(W8M!Vhk`nl(4936g+7Ec4+F7q>VE*a=SW80 zuooAR$d0P;NJC1>D;K{j3_YFy&3gxA0NudWWK$&kDz*oP-X468G;(8qc1lVIsoX=w zS8_k`o&w9zNZ`vfi;dP3PY!}RDY}PtpWY+vWwW|+m^402Hq(^kAP#nSdC)M$6&w!J zB6r|+jwE4wQ2(NnlYgwP;xsrpI>Jx@7_Ckq(ZYDHfuW(Hxp^+sSzwOgb^t}8s$;O( zdVxJ-yS}C6(zpvuv&D%qzV-y?5&kL_|8ww|osZiCe)hwYfO@7?e2a7 znP|iC%5R8h;S-OKl<29fhg5NVZmm@4(EHyZYrWk%_2SdC%7F?`XEhEJm15ME^c$y+ z+}E0cI9+37Pai(K+J_ctrrhP*7G-As0Ip&R3W_h%e;gBma36~dC?!1s&frzxngA+U zx)N-w!bb3WwSuYDo#P2lM-+ej89-Mf8)BRO{JBO)=kt-h1!i3ibLgT>mB=D29RAGY z$vqdmq&FT8rKrM-WB=(=px=afAVc!eSUr}2$(aT{iKa&VL@Q^O&JYON9iv6bcJw+2L^tJU^6me zBz^+cZ~B*CTD;H0m-LZCf}g*o-%f;mx}4Hm@U#f&QEN$e^H2e>`2Cl4mv# zLIhK}RaJ8RPblW+FF+;(qc2|IPF*Ol(5Gbn!-r6`ZPva*R|KBAUu$DqP~gG81tdyB zVxsXZ+{psuiV3_&^^<&cFeRrisR#jU05(B47!|FlIfBi$Et%Eg(ni8v5a(vPdwR@L 
zs0x4fGr|i6jAKL9iIQ;)bcS0q;xF@BQUbm|6$aguy1Y>bGk%~hR96G=5ZPiAkbyxZ zMkXf15jdpA&G6&9fYDzJFlAaFFK2&N&4>>5@b&SDyW0Uh9>a71LBiEB)8@n}9I}h3 z14!DZ{Q$uQ2HjI&PMvojDIl<#5HFxz{*{4~vlMkkhf6~}Cj12EQo&0^#BJi@GPXK$ z{afEkagyAuBtxjI7JvJ0i-?j%*_9 zitN3!bID%WWOKi+&iD7d&;5A(?(@faJWh3VU7yc;yk5`sl4iVgF{PEAoed*6^hm&N zCIN=SgWo?0e_6<@_PML8#^2C_8ryms$UD%f@Vxq7Z@x}R^*uekDv#+)CiLf_vSSO;CS2H+n%R;BXC{VJ+go6ygIxF>$JJ~7_cHT zW+f9N(TyG?NwXpvL?H6nx8TSLmaO>7B z%`K1{1|nt?r-AK&zBc#&eF40pvk$Cw!CK-Jv;BtuZR50zOx@wp9v~(WU%!$BML!$( zG8PH|KLgCL#ii|R%lQC*Ed8C!x#;%JVi6`L>^K7X_-sW!zCQ&~{omoaxQNuxG&?2o z6+~@PZ7|40C?=GBD&BlQ*NX`9$EdJ=5DIU%1-|{SA@eX;VRayn)6#+j1r1PlzGwKe z&IwQ=>`m(H*Fk*vFcI$iJvhN4%opsXk@VAD7CgZ7!9(CT@4HajtyCC@N9oE%jQxR2 zA8jY#j19}Hgmnp}J6Pp{s}vLYRE|K53Cbc(At%FT>c=*Qh8MR2;~BOC38+KsaFYJU zIsDah%&R;#6E8 z6y7Kt<=(t-(|mK=uLWNQnKu<>PcT0tP2t&m=T)bDNKJvTq4!#lD#UhtNi4$r3BRCV z6-0tTF?2)Blg29bCz@w_NkbO`3;db0&p+oVy4&qY`xMF#08O;1X#t?84!;FfvQw_# z5dNCM7<5M;IXKAk;m(+_fp@@r^=uLb%IvHc2ZvR$4vD7#C~wc<2+e@%;NTBPIY@8= z4s6WL+ZK1ietSm(Hw}qFCLg&9XzqprKn2Pk2xKB42!GuAwCn)NEX)w+p-li*;ptC{ z*{`~RZ5?{3-@|2PWpLnu--=wv>+H~MbQ=TE?H1O9LqDB?s#MZpb^4ewQ1 z8VNE>XYhstDsy;zYz2n@dGCvtNAsZZa)kZ^z{e(a{P2pB5}!>d0zANQ1PTX}hVPrO zR2<<#gTj!WvGmzwmJ}LdDMu%DummMQbOkpT*WUJvbM-u3=ihN~p}l=P37QtvtxqiD znF5|CU2kclL7t%%h!Au8qX75Sjop|S@&j*ZY9k_gpu_ovl5!e_;s>f52?+@{4%Fr= zgfGaPkVtnp#2^o90K|ZgW9tO|Ub8iS8iM(ivcYsP&{^Si_h)0BM@N?=QID*;6M~`q zUl9ZL{#Q~;{oQXq(^SbRVqr)X#ZXoMVX*a~3b8c|>VuRnX_YtcLv#g)fMT{3Ym(zOMr&Y7I(RUu+MLQ)tQrkg$L9^Y0|i0fy@9>k9{s zpCVl2He;o?Z^iyk2{(?%j;h22x+Paw*#ACv74ew}eOM#wZ;MoSHYLkDK2Nwb;9SJNc9)KaX9rLu`1*;^7fm%&T!lEgXprF>kUc`Hi6bH+268q2 z`c0p^#lv9_wZz`;E~?fe50F0qCo@Buo7;dwyAnXNdh{4|dR@cW34Nt)qsw?4=>NAD4zf|(!gNC+t0<1vTHmQLzD64rIvufR@vBZ-x7*kliEYZ%)et zFMfUZf!jh|O%3g9NJ?^9jslhgU7U~Oao*SU1BEJ_F_~!#V%PrOpyd(TOejm-6Y}7F zSyG$ym7Mq)g8m!aMslX6A8KpAzU`&Oqa5Yb7cF=q*?mYtqT15Zc;J54m-e8v6ivsL z+A-|b9CS8zl0uA$m?RS9gsC0qE{~z@E7)*fYDq}}MeiSP;{pZH-NnIM7?P!H8_d5$%3urU1v*Cp6O#|_09wO6AfXT1X15rN{&{QvZzMyEWH`AEo%gD% 
z+p&yHtK-^uq0LmGLQJ#odv=tU;K|{R=OrJIsp~;}f~QA;&*g$gMcbSSMkHH*c6R&_ zAjOhILc|iBJy5m+<&p?7YbGXTKrMsy39jB7g0D^5BI$udNfB|`4k2R(0ejjM&`|Jy zNHKy{ z7#afd>qm3*3y_Mz9lo%z@W6G~+Zc{=5hf-~POv6+LbToY5{TmC0p}|yUyhHD0ke;l zdktmWsMX2A7FwPMPHt!g(q{gDACWh7KZf-b$`74t1dpw}sx@Z7>PYY2hFD(lCPh$& z#zOwW{H?^8U2XY1es;9h8+vpo;51ZK4JCdtywsJEF@t9C;2aX*yuH2Qpk0`s=Kx`QRxyvD7M zAhWNmtOQJ4&4d(#@tIRG_*Z`Zz5O_h_|%>=Ht77;XHFJR-krmH+}w6vCnWb?YAS0Q zS&Rr1*_~7r>1(Um9>l~z@WRbAi1x_L%+Je1RoSmXD#&A*Tg01Fwc!M~xL-<^Ii@$R zv!F6F)x_#r8Byna;34;LcZbjJbaG%-wQ~; z5@C+KUzbVpyA%h0+z$t=@>&0*X`SWM<&u~n~)qK_jSWoTA6KLIg zIy#=HtH1bp0K9PJ-RO7kh^a+CWo6MTgzy`Hlt%~_3l!NB{qbtsJ3CRhB1s-EF?3|Y zuLqnIyL@L_Sge9G0$Xt41@gwBxDqkZnehWsj>aL-;sd8oB3RmbX!R@PaY%h6&6}&| zP^NDzESSD})y?St4JwJ5nHhk2=;okLTnJuA89&$#yu)3C7a&1HBJm*^7q5eC9DZK} z+#NCPHg%^h)<*KwQsEiKmKUQkh>6AA*hL}Mqot(<^ubWDV*9cX{8OU|Rb6-e4pI&e zr!}F|J*(V=1mJ;Zg9ytx`M9*JqVuNZ>AX5Wkb(U3_1Q}C!s@D3@`D-(e{yZUbTwJG zK(|j_YM=Z;e+NPGX^KR2o;qA*=e)!=92^C3PC!~Jl%%r|`v5E+R5Iw&{9SZ3D6$S9 zp|nj8u%_p_x_}&Em0Lr=AyhuLxk!L-;cNk>9=eG)2(Qy4bxBEMP?Z7ySp)Gfc0@?` z!+ZI{LPFJ0a6or8UTi1{UWDOCkm=*-w!1X_`}ZAo_SM>_DtY1!Fk7S6xzyake=i@xOsr?A8dpv{nLP`_2E zU5<;w3xlZ81>hHmmEq%{Eqjq3N4s(g3Q(j&O)WUgpsKkKPu5+MLnknQ~W_oNny4_me@bYGMX7g zcO?}+L8cCANag1J{_B&9r>7}vqtoltCCvi^x(-B1S?*g4wzz9+&!nVl6NN`O`T3Pq zReQ$A6&z^u+hQB1W^P=&W^Q8QUt6of5)&QW+0!GPLh)S3(Q)sVYR0pb>1m%r9$)qJ z%Dz~sltlJRGijeb1&X>>F^EA713Jp?O#kv$V8OJJxP4Mol%b`iD@0R4B{iE=QBe_; z+;Gzl=ldyLJQN>(_>u*1z6ZA`Nh`qO+O zs0z5rAhmP^&+S>EE{;XeOG#(IBe8-Wk%BzmhloL(*n1aS*IePM$&&HF`zx(OjRWyBLI28Gt2RKo6S-98_u= z8ekqkmA(i|NdfJ17Brc*ws$`qx}QPkQKVRLP+{rssbEG!J>%;2)b z#EBTzr++r4AhS;_Lo+f9sG4Bqu{Kg-;y!RuLiiWdnlvLIJNW?WuQ@|_3SgE#*_@Vy zzdVS5I&h$c|K>ZtQOQ}bAAwkC2E1-f-iY+{^y1HK%{AX-4!$VIYZl*E+d8gN9b$Y8 zotLac0m;S$?SPxhBis2GxL^F0TliLU3efRsXBSC#nTpXg=u@50hbJx6IhIy;d zNEUfMQasb`?l0ZIW*8EQf}Wu90;q&~NVp=M(}!06LWMqEkg6YT8K;TLZXo>;ac@3Sz zLP9-97JHX3D?7XKu3b5Lw|6J-R^CnP2Fma%*BPyw*(GJEebLS7>cV7l4mKPu?I@HQ zo5Vn&8jCi$R4`?h$I#;tS9!cw<_r{B@A)6!?l+RleDde_#J7?2DxR&K4{Q1#kRLa) 
zN8C&ox&+M@Q1OCO!aTh=DyphJq^Tx)dMKc(MzqSavTo;J(o>97orlanf<u@|35%IL_M2N}N~#pbk zjYeN~&`so*FFd>Y5bcWq!5uuPmmqh6jTMKJY_UWp=}cLD+-Drfi2!~4?nh{Ch&+Y? zcG_F&pABN7yn0n~WDGJHp;xUE)ga$s@|~F+_(5q?dCTv$Uq34wTdnIJ?wjj`#E`k^ zD}oDEAch3?C7c<8Ao?udRCj~s&8rdU$YTj@35oO;#??6}@@?-wa6eFymBo%gL$BYU z-BVK|rxY*wwF7NCNQL_Dsu)G;-F3;m2%R4nE32xyde%p$i{C+*3HKJ>tH1?uK4#`@ zsIMTsSSU}g)I<&fxEKV!x)71gsA3iH6UZ!_aAv!?NEY5)(Uz!w|qn5DsH*<9jL%U0P&k$yC|` zUY)ya7&?{&e^=nj&|+gh1QfM2UlZt>VIAD^J&C%MfSX;tgXk-WsrhdU^bMrZ)$cUc~n(fn^juc<#U>EN5u#;g^}bj72KP2Mv9Yl$Jx_&9gcV6CD6S+MY~re5b8;2}d=d^o z)%wq>KF7d#KOE8agxiNCl~rFh0|5`bE6T z=4T~@rzuZvq5WGg5&@UGat<^WUodSj|G~`{?`(Ay68Q*D&|3e<^LjrlTSmzK%Ga%W z$9BG$Ef1<=$QtyjSDO~N!DEV#U=ZNDgNu*vZ_EQ+@2y)#JxPMVAi}1bDzg}bgh<2B zfX>}Vr!T7s8Z_8$`rfp%uW_Ff>yWC*OM^u0;hLg1yOI`=elz!TCAYql3y*VXxU4f^Ty+Q=yEEuJ4@dEL>gf zv4}oQ)m(L56MI98ibVJ^K730iH1*@VSRZxZO^zhJuhx^e$*-5FrjRaAwY86+g$JaE zje{fLBd5bBaZ=Esr)8uhB(OnAVqkC~c^m-Ap+X7`J6l`tlhj+?2PNqUeiHJH4+}vu z-P7im#y;MAfy=TAKSulnB5(~wgYQ@J3kdE-@Gs2+@ktLdur`Cc4=+It_W0W@K`zcM z+?#kur=ax!jouvy9KpSyL_$Hq3&H9FM@Rb`%J7Gx4Mt!q0t2O?u70HiqY=;=Qveic zkhTVBAe8b72FB?4xWsc%i_leo)}apxxg&5^gB^$0BK+9t(+@PvB0nVMhQtuGz|NkK)vDu(KWV5z5^Qh znit=@y13Lcfyl+D3m|Li+`wu&vEKlYR^JeHzcx_J517*>rHC0cAA0`^xebPeznD6b zYI^#uS(oOEm-$7xxd(IY;57}ZugAh7SUKO2X`=S}fy~TwXb!#nlFw?_yxgo8l_W;{ zdNHCE5PoWpAIPt0osch2P(XnART|1Y;C08wsN43Ue5Md4`}r z%CHyK)`U(MpIm$nk!T3WZy-3D>YFe4OnS82HLx{WFN>Y(X&qG7zYdF}bcZ zUy%$nCaYCST`Jcfyucke1j1(l*bl&xF*7s!aXh_9o{C|9fnq3Wz!Q!VXgD7707?w< z)9-?SNGnDfI}|*x(NI<<(nx}F2`MQszr^+zPrSnBBG9-3e*j*p0$XxIE)ZY!svrfz zc@i=m8T6?^8ZHUz(c+8alJgg_)<=qHlN7$^)NewN6>MSfXTS!qhsz^Q#BqiE&ZUe% ziHGm&BMqfSDS& z_y-0qd2R0P!HcdQfgkdVv@hNo^_HT!CL|S6ai1;)WW97by!N4CVbcS57#SZaQc_UV z>qFoR+mQiSHfikO+XJXyj9sd_WB49uw}hxH`Nxmx2*xb{+JhAi_rOqLOlYc7Mt_bS z(vURtl21OB$Vt!u9nYu$#W*pclZx}ji#`d=qn#Zc*YGii@HH`TG%*HhZKox25^xD+ zF3P8wu!JV~v(XV8(q9Z15s4)n62E@^@?ly|LUEW{VwyS_=OXL=xU+mUy;`DqXNAMT zQZkj8Wesk3SXVhz#x&cq^72n#`EmHljYdaOX=M6}H)yNjlvfb0WHmfEQ;`SD8e~|& 
zhYk3<1%F_L^84OzpuK~-;CF&W)zoINAQp7XCKPeG@2x+fGu0@s#oHH-^PeS=jt&kF zk*w^PWOuo_eQLQ7+tBhtx$g!Y3RwA|FL)#)1FBBQRCfj=CjbqwK>$^wBch|GTZE`Y zwDFN}6`(c{y8?;uij9tE*BNeqOro0#lZIA$D#%3aP-pp!qN0?zICC>ItRQ8NM*R^c z(1evrNlOoVx*|nEmyN~{3JP?*YT&j7^r94cis7Oov+t46YQ;xJs_YnF>{=cvngU$g ztIn21kdBTHDvTb}e{L2S>cJq(P?m;BOk4#cJjo+Z<16+fMedh;d}2Q-+UMuG5=xyp zh@D{NOvvmdlR=rGv@|xDDvg8-6-Z(z^Y!dg*tobh08s>z6aXanJm6+!y>llskP4*Q z$*?JRACJBK16?j;YJ!~Z@~VcK8h{_()lZ$j8Ia`b*yrKq-+Y``wKwv{8&RZV_7!9+ zm|i31TmwSl1c@L&&3Ee5-&WGk3rrJao7u)LaV*!bMDEW5>nzL_xh>ZL_ zDC55VQx34n^S4501YBGoW~`{Kt@vtPZgLkP?&?*=ay|S*R1}~KRoAzyM>coxaz3|H zQ!PU_==p2xaT#ARz_;wF@14c{-O8VW~0-dX;4~R@&0ti-=y>CQCV@ZcS=e<{PJ6ppYPu* zFnw`x;;MJ4uDt|5cHI9lCA)B)Y~)+|wpRJ$eHL>sI$@beyPMl(7KB4@6TYl)1&M#?``$RPtDHg#17J3%h;6o2cU%C>I`4Ky0%Y>a^lM4( z%GyTV>UaH20Z0@z=YihB+jJa9`cfeWm!1kpltL=2XY~ z!sGoE#mCjRm`5tkF1(z|NPvziEHso;t2DE+GVZ!-HbDQW#A2-hpqYf015>q#>cP@E zp5_wFg8{wqHT*yTs6aF7>9Tl!L7LYFeZM#P<6zR$*K2F%!W@P=eQo3}tK znm0b^i7?S=#iTIT#ZbLsEb^TRFQfDVVYi_0My)+_$AlZ4!iE0+mG7R2`R1IkGPz{T zbQ7GtH2yL%!oNGjuCFy#YeMh1{TvCkqmb>dhX==yNSpt>W*G9~fVl+3advSLuoq={ zdC1`ZQdudGZwqoQC|E&l_$DZb`N4x5CrS5giI)L-fb=Md*w}8> zF6Mr0Cz$%aazE^3ufSuMAmMUtTU#XP0qW&KuKQjTaF|WuHIu}kLWZnI%ryAEnrsky zxFt=DVd0#L;P>fG33-E`-edN{aHGEN;{zPa&7ljLBU_QlD3F`K2w-tr@8&Pu}_Xh(n4QOIB>`NECw%+M9m>YH&!12AHfC*k5m32|i72 zo>FU7)u`Ou6{xqN8_3wR#mzI$co19EaW?zFdqnNxhHe|u3 z%5y;bY)`+?%j>dLk$QAWG{8%;vdhci`J(iF@M@kv?+pr)k$>#;wLJNOX;<--n@ze$ z{Z-P}&J7qf&^VfuIP%@VbBliM79G%IaQM~Q@CIxmY~M0rxI@3cp=x^1PVi!L`xYg~ z4C9+O!Sb;yZFdoOxy=X-jdwECrYgm01RjdGK8ktP`L2-D@P*X1>u2oL=2byh(`$+l z%;!O9jx6CWd4bpZJ&JPSExFA)$mG=3pK6WW6c_)9K@XQVul){3NJy-IfYjOKp>-%? 
zaCUZTc~$4!>u=u-=m8Xlv5}E1931Kpr#v_a;GZU74PEC!O%THEKGD#S4nQejH(Og< z#p@tf0@f429q@~#W#h#|F$Wf#Tp&H!+3}VW*dB<8ynOi*B3-wk+=l|_U_$r5i!^}= zwV9QEm3;2*H6*^YnE(ZZzVTd>nBxyuS5{R}AsF{C>DZ1MrSnYhbg@wJdDBlDJ zPoDIV#a?XmmApZ7NK}=_!jGJ+=+%4p&|7# zG$}abGk^9cCX(xq?_Qf?76(AGPjej!eLVoosT1EqSn_|NK&szV3K1tjcN5TPpgE3?*UHQpU7W0ep*o-QG`h>b^7-sEM%<7o(kH@d1^!k9 z(bqpPaGQq*3zvlW7e9Z>eXlc9%`%hrHxJx~Ngr{~aj~&|>PH+LHV+q0!ic-oi3+0V z68s67{a+QIb&#(juv{@@Ds>b}xUY>3bL3x!Hc~v&2`kjun30i~LFV3Pdx*r_hx!-^z6)$M?^H$dL%W8of8|kl9~vZ z8?_Nj=p?dmGI8pk)mA6bM8)iSybr(&d753DJ@f;6i4y6l2r62So)EGmd|CNzcx80p zQauYIN;y2yDL`Q$JtA^)Y+b?gYvB_%vWQ{5MQ2 zE&Cu{0q$Z7>Q(36rTM|^q5yvyN;0w!QOni_22(})MIOh$1N<#}aCSk)C1{@nIVQj% zxcwgO&x~{?UZJ#Eo%Q)hhYHIcRrq^7h$mbm-)O-mxV^huo1G#;e;rXrj-m5x-|eWI zIs!40&+91?r5u5MB~J|T-6eKnG;VWmzKi?rFA-a%>9y_>b3W>ru;2(LlNTXPk1pXa zPVn~(BnyWOfkg{Z=PtLjN~T};+O#K5b-!^h>y2^xwNv6L^^3$Rri8j>7b1hc;{ z_6jbCkcF&{=>+FAhw3gB1pWO-OvmcV}m0%L=59@8!~BBrT1 z3Xmn-v){qhjUxC9RYEZZ^2IjTYTTL?UgsNn`)F=AIx0#$dl2?p{rAMx|G|SE8AEW8 zk$OjmlJULx!ECG5Wr7qDEoxKGgGfu0GA3^BS6fDn-`t)xxLj&-YsNLUTFu%(HKRux zqP{Wc>G_~gBUd?-?ztaau-|FFpHQ5YwF8^U1|_1Pw;LbLER0We3(E7*6)clVC}IJd z_Vxu!{8uw&zgR9V{Og&z=Z#$F-2dPz!xx%s#5M0UH2H7tFJ`K%3-$)>?2IH1{)pM! 
zna$+o7342rm1AmaTO1f@YJHnjRxM9X7#}GPi}5GfGuEj5zIwP^Qp%s2)|NWTZ2!q_*gzqA@1(>lIr4o zr!y9Yh^!(3=!DiwUvS29uE7Aeh~VI%eyXppvveLmK1vo&5=H^!H=NHaC^)5*G|I>^ z8Qww_DB0Fv6rO2f_I$FgP@f2lJj!x7FeH;5wRINV?AA2oto)_p*PF{T9)lPDX*^&7 z(=|6^?0OFl2rSD*2N&qz2qo{Z2#BxT=jUIvuvJpiytgSZm#OznukPdM*EKhsNSEH5 zMr-36(SQE<=dIS|B93D0_jP20cNUyXnOaN4b3sw(*_f0YmgkzI&NjWSG&=5?Vt@SS z+&YedLy|hQqN?g+O^ub`%p}XDix*=*DLVtUphVnIHxj*YdLozxuk-v3qUgERSL4>< zDKeY?+xwq(p<8mYTCS#& zCL%;`ckjU+uA=V7#=-JD0bP2hwbJ1}6+d3*H6d6x9OB|Ai@_ElL5|Uy5%3^RHu_bQ z+64qk-_ft_9YvgcQsgrscW_nv5w=V8^VMcEz@q=D#X?DNr|Hb#8mAe9U9`b zas00L?Kbo#FTY{vmeC2R@b}Lwb1<{tk-L5SMV&4KfG=?0fDXF+RU(C7eW&TUs~Ifa zKlAc1W5w)@dZxRV>#jo%GAc^XV!3ETZ#2u$kPA5OjTxiH?(S3JnpmYpImOeN_4S~c zNpbUSTO(f#Rg6~JcvBMp3xe&8WIt`6M@0`hyiTw&ALfP~Q606u#rfu@{IVKxm68%- zgcV_U55``svo@Z%sHNyz_aDe5{!kJ+THYyL^}Pfn^P$cL0u-+h52VF{x`9Ew;p+>s z5fVk>4NPJY2lI1?{yE;KRb_u9xP9)+b?=@@Q?S%O*J0Q|L{kmNHQ^74gUIM>qDeb% zoG+VO*tOtXzJFh!(RBp>CkNvr*~hPa<42I$1j0$B&MG8@+G4aASoVRG8L~=eBL%;^ z6pY61^{`BlYiJ~STROPvY$QiW=wLmYcse4C(!HaRmA85r6T{^o6dn`=^|FN__p3g- ztdt)KQzQJ}M-k8M>?*w6LibY3JSiF z#9pqhe-imw#l;JwnJi8R1W@E}f`fCmW?vD~*y3Rgp!+$`YoFoEaeE04hl+;FQJ7`{B0z9CQPMl+=+%D?jdN;n{x>(Xt`+iWE*V>ei zuv(qB^eo_r^sI31Cn**t z;atAv;Mhf+DP(-~1&hN?pQV1kXiTY!l4*YhtW++~b(+@AsNs2N&eMfFNiUPX?x6`w zN;=W05RtPxtv^j*K{;&ds1Lb(F9CET0RRx7 zjUa0UruaB*pkO|r9jLJYNBV}?O3%q@8y!W{gHWY{w^>O^3G{}3640PHgX|Wda5fee z$N*DT^M!}GO;D;x4<@CO;o?5BSl(wIn&#>{S)SI*96Nm4 zoak$_zn;+=@jN70V6r@KPB~WrPB_690hiL}hK9@aei4CzdA@!`e^!}Cau8sWwt=D^ z5=3&q?G3i~O?@}MKuY{u1bvU=Zx7~fQ$GCELbtM8|BHP#)a$3>fAXjC$c+nFa4^Kat zJI0A4aqtTRfmkggWI^qHv76nWt_<)G&d(Q0PaZ0^ATtZL@%=HI$>>{eo-_JWS4oX; zrOE+UD48s`fXQQ#aPjQ0+VT9$`qAdzE5;b10h*Kg8O7d?j>vZj5PxT4$X)dDWq2rm zws5Ui-P@qnEa#0&ukGtT=Os?ouF-Q%?VsQNPdf{In~z{|4hXLSG{-6e_}LEkLgEqpk6>+o*n0IxaxNEK_n`7;*U0Z18Ui;Y z(2P|uW)!rE|M$VkkTp@)!knFS8lmLadg7v7wwxS;d)F?_`E(WL_GuJt~zjRW4 zar+MG=bQ9zR5d>6YqgV7eqan9Sm9#EBYXfP6-UAC%b0Cz6J}g=bPo&G=jT^{*&1|G z<`oxT_3xESjEfSjCowuPkkquqdzP*R)OP*K5|tf#2i 
zZc(-}b+4ABE=`H;^nQaCk2&!of$b(G%Tq}|#PQcOiE78$7`od-!)_~ey(w%cx-JpG zQy$8IiJ8pZ{bWy5KVwGm_(Txo6JzlGb8~Uy;}4sgZ%0KjgYjhNoOA>1HQ;zEUIl$; z!!2;ks#pspTfcfmVE+&aDIacoD~>?OZ7x#rn0lK+MzAcTW=~B`4G;5)it0i1@nb&` zFhTRGw=^|XR#nwofUbNQ&I7b~;L^XItMQB?I~$vWt*yuYy0M{QdsqB;7^Nch16Qbw zF7fbudC$&2H%I{>pww6az{hG!us^dQ~mZf1<$<>NHPnk zmX~`dxB{{ehh`Q zN7uEo^>#AQ+z@1N8#H$;(o?N>kxFu9rm-+QQ`=gAgLn@Zf>3=V0Vee4Vp_L zIfHFjj|M-08iC3yA~Y2KOf?Li0sI&M%*sjxh^;E`$3eJE^27f6X^04s9cncyNiBp1 z)M<6Ndtw6ZwE*+*3G%-<+w1#GsC4-c`4CF87k2eoZ*mR(*Q8kIjo)L5tHUlg1%6$S z0HvlS8CjV1WX#aeG1ir;HIxTPKp*h%+`o0psr}vB=g;D_Vy56(V&@y)fH;e6Wql$X zyQ^emD}yqY1B`BhReHz!MZ~0}xh~^LC#@A$BZgOFpITV(Ka!FX?iyTJdxDe{qn!W1 zlXiJT(VOpU(0^D&izKM`)E}we&I?TpGk8Yg&?BJh^|SbUo7a2La2!5=u1xd5mG$0s zStTO-sHRQPr8O~(rj|N_^{B?#4qDrvNEa39PtF-L1bPo3f(*nLxfMr1BZF%Zg7$$U zg?tQDN5_f)P8(m_96*&Y&Hhq#F&JG{%xxV(QUUQ;;9r&KiHnPa_fk_?iPprzoenC0 zrZ#~^@Q1*(#GZI{j=hwE-2;-l|Ny z-#7OaHLMpe;-A;hSQ4Ce3*zY3ICXYdE}otKX=!;XG?4QPo+y^S!kccEfZF~zlmyYI zj<1gPaMwk)SO8yiRxZnaS1L)olhs%>k+o&xTmkMdC>gzU$(^8u*80fS z$Rv97CCi7Dx}IQ6u~7ab^xX{c|CPCZJ5PsUuz&(eZuVzspy0_#QUmg*gxs^RV7mYtL?=#xJqG$5E6kgIf>Hn|Eu$@yS^TS`d;j20=tothmH$|4%tC?5 z>d1lPMzZ14?3M3ebQJSeVrI06j2axPJOAyrAAxYW0s?c7 zvlCWesp7=EZV|oB`QH%^+=0T+-(Mnus`s(kj=0ge(3>K2ll;b<6F+pFIy`iT7EY`+ zq1UFK-Zl4C_=m;*yZ(pwdDd%QcIbVpTOe(7bJ#2A%Tm=a>-o@pry0c%FVy)a~ zRA=(Bh1aMGKIgl4H4auHqHRZ>GCjC``?JjyEAPx?;l(YzZ{t6jU+ma~?zcyahEcvM z$aOtDp1E;#->8qy7&v!H8+kCPs5}{IgXqd%)s6Ji>wXIyziS z5cPmK1U8rvxV`J3SNroe-7nbuea&HMe0*+&x?|a?eaHP7v9m1%I#mN45U+86m!yQzBdVDUf9X2!h`u=s@f>b(~SS1-1Wd}p(Q=5P>i-^FoMv07m ztLf@VUN0$;tE~;1pD(w#|MBL7YH!tz(GvGLSzb-32#v3D`eWm6E63k+TGQTvk*G+o z6wkRf?c8ylid(;b|G7r3JX+Qsh)+achA0_?jt&rQU9EN)?nFeeN2kxXOvnFC^{=Q1 zz%Od?>uJy?YH1z5YK&T&2nLTYKGnU<0-dcIkikGfQSQjTLdVUms;5^AzbFRE%ZN_| z%uKSN@vWEqRPTSKO^zx4m%k)L!2Z+lhtXX_H~d5m*zdU8+4Ibvh|`4T*%&>I&es;F z3TR8>73-pKxVFmlpLc>Lw$UHq7&xy!lFXs2W3?RKgIE0ORp30caOgsceFWE>iK*$C zPt7wJR-brGgnKgt9-=n_JVKt+5_nHSs5@lO3U-VpwN$r?DA*P5a1_YW+stVH82Dqi 
zXXA2kQv`cxpi}})qKa#i{}s%ACAGmgBzw4JEm&D;zOZl<&HR=Xda!a*|9~s6UhP|0 zU%foB`uzO+walk|FE_WiFI}cpNc>9r z-yVk~lSBm>6@ee>+<9@vV>c%{Yu@+BFVCYFzHY7(|5o{$(A3t#8*#`c*Lw1ViDNQv z_{lo-+=I&^8PcD>?&(>09Domv$MfX%e;Z1Wg#uq7bPsrY8M5>Q`>6UZ_I3goee|><|R9An=$tn7p<9gMtt5szkKO8)!YM;RL(E=w&R|$!C z=`#vuivgf7-;mwvV!fLyh<-rV|309y90cB9ULIMKNLz%3NUkMF;ZSYQ2 z_3eWPT&0@7f$LdWwtyZXD>w9gTwIGuDL%e*d|W(*j9;8a|Q4*#wP5cXuJMEPkkDcm2Kxk%lWw+o;-d06M zX{5t^mvpB0Q{hkB>=m2gR3~ zWTSc_9|tR!S)tw<kkJWQSP=zey?ZyK^Yu>u4;rOOA|Ur*a_Ued z{i)N&SDCF(2aV0gukHpWaO8mL4*jtM`ScS(n<`(F@!QL?OS0ls0 zjOwd;@nuZ?{o3rcM?&DNxK4DH`G^uH)R|`hvT{ChJX1>&cr~thHI`s!fx~NiZf~L@ zwz4v}tn9-#k7LgU|0tOV0UYez53#g{R#w3EZtrgxV-h5gU3VSr<+?9)^h$Fyd9tQr zeah=kH}yHhH?+29!_3`xcdCp)S#a^P$yXa^#^CK0nZ7(wj>W`oKyEZHxlQPc*20@O zF28FL3EddsU#;n3O>OJL+0E};M!~^?!pQd087eudloj@4_&GGJ2(k!v0^p;-ZCSdl zaPuahUFh+^Wv{b7zfsT^c%hI~xx3IUb9TllEgg*`@leUSEO|0%zOw*i#VYE_Zf7Ta zv?#TjAo?hskn0nnbw}XLSm<6*ZYF zS1iak5~?~QuF&9wIOvNEnKlIf&u0ZrfgZ3x5N|?|_jonEtj<6wRN-s5=az zyx}&!$A0+|AvPYK>5F;CHMUyw{;%8HxQgBMiF`NQ{&Z#A=sO-%nD?uhnK6T?4MHi} zyO6;C&HG_-iFmqeKYk3cEJZSJd*ox+mE?~}X?|Y}C>_hU7^DFw=kMiZ42|CA=I0zJ z!c2s4D&pxseHWZi&follg1EFw{gtxyD{a%fynerM(uJcq6BGT}xVIKSI&XNis}9n( zWZ^IfmUmrJnQxDNhV)o1$`LT{h-n@ha{>Lwy?Y5+S@V~%Iw196y!l1d`ym>pFZiVo zJZab&7_xG6m)2{?QJzy16|9DadM2axg??DQ;}aX8Nv*M&vfRF5!$}$TIom8F|IX}s zEoDZ+zdvD;kfSBoj3coL35Q2UY*rWUHv!YfteDVba&PkIPX$HAiE6=f-v54T9Mx`@ zT8~b7S=p??<;Rabez_*U-&54CrWSB9H1yQd6W>5R-rhD=RNP}$qPfjFCZ}C3eE9mp zx6DisViNf4s+KrsE6spe7>;5H3`(7j}L+}E=sRqr1cW}yDIioK(yQimX$BmVh=A-S=%Joe9<=D>wFU96c8 zxZBdTE%2Mz+c!plch&S>JdovSgUbL!fgscP?lqA0TDr!AV;4+qk-rufuTmDywm?IA z(ZqWNjGJ`5MDm~yGhi))bpw24jP&uk4TN1XNZYz%!WjB}$fC9^y0mtOiz}!AqP?eQ zXRTI~$2l#ew>W!Q%_uZpqdq^1{DY&*4HjK#XPUk(XP;F(U2&%kJ!@kg=s$Kxa7nIS zRafQa=1GK!$Xy^v{fheJghNXUmJnNGW3&I>T!pa^7N?MKABPka+f;lnH6i^MQu=Z> zz|{dFVu;F!j*Z3oQv=xm5G}>8eSMTG|1~&!Axb1J=G{AOSni-b@BX$6U=e+*)8h%zg%IoW=NN4ze5OQ3+2%!Rt;9%uR(*%pc<6Q9#!)?HW 
zA|aZr9GsIFyS=Hgm^ja$PeL;cHd*M}Bzhn)5mehJcTk{7M8lAHY&Yuw*9xcZ=-!&n^hQjh6ycvg=hZQwMEO{4_{B<0y{EacF}~fz zsjSTs!>|5$slT&3@!VQ}#$(l3T!|G=bnm9Gi1w2ko84d%i~J)49L-}5bq%WM7RBOG ze?~Dg{Fo1gQKUA25fR&{tE)#EqLq@+`M9|u5Z}`Q0ac2-d(kO1AfOpQbZQn{Mi-cV z{%Z(4gO@IT8eE3m1!-%xAb|z?X&iDC%dJMx*+f0SPIKrXuZg&T%w~Fe8VtCA8xn*y zUkVGPFzZgZ(n2@}BX;lx2ys}Rii=Of4~JPwk2tmfEqepz8(_ddGF3lh*n_siuZ`d4 z;7E>){BCe}{AlN;GWy*O{D!>A`FRhBM8BweOS{6f(p5uEox7^ivX1NfbG4XmqY^2p z+d`w7Mi-_|kJCT_H311>n2a(n?DQT`Pst0IOQifusG4F*NgeRX(Wvirrxrg`8$Pkk z*2w!+GMkQYob7-$h;|0Qvn6-7ZO8NPH7-_ zHml!KQR4@0VT)wX=X{Eg@GynD{4Z@|BZ(`@c^P)>b6Hqet_eAO3~4{;PKy^@e7JYb#cq|{^yRiE-+I*`|bT4Ed5itPp1l{oaa8-4SR%AXEaR2^mbV2eAmQg9-i~L|%Y<05(1|BV$*x@CZ~EJ+C44 zu?`jnWDYe1G%%B7A)t939U|$)X`|NL@*(eyM*YDvAr{ ziO^6V7iMNE=zIC+(`ODF zoYQk<4`s~Amr?GuwIgrBM+)@o%qRG)p8S}QX-$nckDV#NnMr=$!PG2lehJ-D$O`{2 zjqBrykO~l2!hrN(?U`3#x0;!Zxd#z@5XA&+(-P=1>KR!aAioBX9tcGR6H_oGahQ4P zt-@Uj%CrDcpZxsaFdX!I2}~dd8$u_z>6!r>I%^b0pB-Q-1HG@WD8`+FTX0{1p~m2s ziv0PR0OrQ|U$t05GE*?WjY1d9z)DtApT4xE3e(I-oU`8zbjd|aA0AX%nsdSsDP9HTYwCkKY&;y35#Z)0LQL3eTS;+gY?sHo@fS0-gb0KR>{poJ5nB>zEy zpJ(DxhTX&%mCa9cnFx~>uqt3ugi~kNTE*T7<-hVzuZ2uSjL7qZ?t{Dt9Axnd3lJy- zau@U(3l! 
zIL9)j{^RLVU?5>>={L|e`phVzKLdEdxB%iEcr%k69i8#%)1T=^tTs4JNw~SW`S~lr z7fUk7GY57Oc*T0f1{W11(8GBkY!H2uC|!u#0WqEU_+nT5WsWhpEMNro?92=ubqwYq z>sFux$fylSOCgZ>?RDN<>1H^PU|Q+-3UKBwt*sH*r)Ok9O2)qqPe5RQ!pYRpA?=6R z4*ffX6|4LouHHJTtF3$ERp}N%ltxjyr5iz%1}W*3MnF75(I{vp5S8QokFI-qZb)Tzsv;c z_k67rs380ih18G!{_&xq8oFLVk2}A*mD33*W|K)Ynf~mGEw7d!*Ps_ zjX{+M)uD`()Q9H${CurfuiXD0VN6MOFd%9r&PiT659s0_v+<3>Y5I=twgnltRUDu; z0QaBh;k!fhUoAo-XwmPr+Pk8f|I4IrcxzNDAU2l$O;Up2_f`lC3Z*E7T?{OGo=59q z=H`w-`GuKXQPCPIHlz=jbWT_ONE?UzLvg<@sA0cpJeeCBVy52^cR9ltUd!L!IZYOZ zPs#&AV+ryQiYOU%=lIA-h$pjDs;B?26}5_jXhk9VQSg4u;Z8yQFa;IuIVL)rNk7^1 zjG@*xy|}pbk&Q2Qc5s3QW@p>~JszRpSJ6;YlUA#oDA1_Ab;}#MB_Q0(<9O3&aj_J( zK^#iqM+^+Vr>kxWH7pp%!2j`Wz%&VN~4)L`TTKXG6gXt){*^*HV!9pOjM* zjc|7r;cC{?qrAf-uimnU5xDr&l#Y#XwpzREtH ze@p#{F~8>R$}hkud^%+O`g|cgl9+YYXSD3`(Hu)bT@H^ek{CRV~|A&;v{^A>>#TmB6;o&y50yfw(9tJET z%5+RUvvqPIp~RENv(#esWsSk7IRBFp-+&F8^WX8+*I#W{OHBO3&hCnj?@yC9I;6)0 z-^0+U3xcyMzQeHUn7cv-G&ekNeEZ^;el@mhG1C1pHz5wF(p6}=aVh+NTuF1++L<1I z$V0>ZRlD6S3fNSjS7!bg@%qf2AZmtHn|E%!Q1t5g`JLV3fsZwJnbw;fS8~GFX(sBCMt)A_=I`@=1wU6{y#C#jbNQ;9YN_W*L%_CnH7llnfVx| z+Xn}lO%TH|?jb~4l+p`RNtU8iO0p$CORJYJfmkkNULJP%iuiw}eJdZjoKkC7qt6ksixz}b$0=>+7& z0{#hf34Y3TCLDS6(yK zK-Q9xmj|z2=8#$6^cCE@a`WK=#q;LZfAf9^r0JL9hK7a)2I+wAg6boc_?#TEl4H^^ z7W<--l1T0z2es(0ujd{f41d$Rey;8u2Qz56@sySwa&h?!dyEcObaWa{E?{B6I1Pq42&s9++H*-s2t2q&M&WAIH|-hm2dqLp z5H*qNb@jm&SZ{NLpLD^*4I=@#c(yddL8;}*bKUgsgL@2;<##Zte{H;+nVlUMu_vaY z(lua{^JNiHR-K=1PYlMeWzC%dceW)5)*S6IQPKsmQajvd-pEz!w2SI)yH&Q zu}K&74Bc+uL&os&OOQ~HAccG+^0D9QDybPKh1N7*BKJ>qb-97AGt7^ynw+Y#vejv6 zE&~)fsJm^~vH#nL5I++5!ze)a>y3g1u2F+~N{I7OLsxeeMDt*ijO*>`fh_2|gv$~H zC`xFK$Tq$G{lAB^df<$K`mxz9FN-h&O%4C{?f2NDNf#IHU5c8T-6mz=8!vwW#6Iw> zY}R)o?7I{cDNwwNUEjFFwF0wDQO~cRph}N<+NK}o>)Q@`aIrFs14*88ISGkJfJq0) zVkJt-8JACJTcx;LT5^PcVxrc|m!$??!07U}-UT@yD3{ZXA=>9v@(<&uq-pHM zp~YHo$4GzpqS*vtPL>5{3P~T}R|BCmsOI0EuDn{_`gcahq_OW44UcjH_-(7ko>oWa zG;*lvO;)CRJnvAY*L9=0Iq}t19U-A)R~I%HH$@Y`^Y-?pGoG`09)5@1E=rqS%Wrzm 
z{XXhEVJi*~KD2i}b=#ZY$aqe>P%fJ>lp|ZBs`eX#2t{wpZi)eK`XzNnh2lzuC7Ma! z+gnj{4bypT*x3J>{1LNzN2CKj*CVsrI=5}&kmq!(-spgdev-Ot0hS$rp??EO+8$){ zR7hPrK#xyOPBs8`*#?Z%;d`U<04q9}D!Ob}y6-|(uA&Cp0F=Bxr>4A<pFZj5) zRJF7OXld7=?4xSauZP1K=I|m_7U2H}Yc5>1;8XyU?oV*ULJlhDT?c3A&M4YQN`42D zVaAs)kc6Ce2`RTOd-EUs{7`#SpP0%67Z182aFtL<;=Rd?Y=a=f0sOxlQ;_(e%?Xg; z;w8|mNCnRB=mBJbfCN2Gc`ghZ_LratNY6EB^P8!$?`&>vhFkEjbD(!iJ%axw+)G>~ z5%)j%dZ3GQPZM28o-TC?;q>yY%wJ`sgU(NRNO%=>#f>7GuF2ipjO-7I zkskL(V_YW7!Zs(07mZ&d$zS5-&clc_eJ5J%t6_ zo_{~vUH&yFUGAn=CX0@KZ4p>C53{=d{uHgsR}2g%KO1K(H%CjRD+{;h7p)zSHs`7+ z-I(GVWxZUqs&2zR`-k=Yq7Khz7uHwB2x2@9fNK}m)||obcH5|U9O^j{V6qrHf)Nqy zUN_MEveG^c1~0Cz0w<#zz!5OsXJTeHGBN@LLrV-G`Rx%@obC2@3FtPy@cBqWO#BQH z2jzxG!ubN0M}exkx-=^F!}zVbg2&4T+iJjTbbkf|k>37>f* z0Jgonye5$pN#ghj>8PP4;0Em)OkqId&p>Mf2C2jr2nI;<(!zorAhO{1WM^mN-e+qb z9JB?LiWN7h(^Giu(1<}fzzZFK3tdB zXm|C?I9J7m%$a45j5ABX9PtVZj!XbsK>HiSQ(vHY@w5rz44mPw3@Z`k97Jxfs%; zjXTGsr*K<^0CWR$7&t=zmlNCJ9tkQ9o;CO?Q3xHOt1>C-?fyXs z2ngUk%E`6j4#6vNc80WBez zAF_Tk)Be>H7w7x#P!?`7YRbxypFTxKN8frzixkWTlP|}W?n`LL2NG~S4^sj5`)lyt zGzuRVaWFr5vURkv1@5OPnvL5M?;LeTMV!pmmsoV#VM6euq?`YiWAJItFPkaSaF4Fu zRDRi?jxCS!`oCDu^n0-nitMXO0vaoeo|%kWwX!hL2hqHchu(2+h+*4cc(O~4eUHWi zR1PG8!wNG;k$_WRyqzC!ff`ocA`dQ@aI(R91HsAg2H`kEUjsg~{-@$%wzEg~=`;1} zU2X<~q5HNBBV&GfIj4n7A%M3doFILavfb@gU|}IM7t}BP`fL_em{UmP6@Y)dlj)+6 z%+Dn#2pk-7LzpCh%Nd#YumElXFzXn63x^o^o4%?J-Y;HEbi(W28JQ45_}iZ6@82 z>1Ppbenxb}4VeFt9BF5wSXfTqv#UkzXJP$1@-*wqX}_M9mL>r66uJ%gd`(|m_pDtD zRbP!6xlV+TNrvL|7nwG*>AuA;D|A-KSE_PBN@pdAbwly!y&BICHv7h6m}t5YqOv(M zUU(U1B6={g{nF_!dt;9NAaeX1E8(H)+Mk}p+68hrK;^I?POhcvz4{qw>&`^?+)`5Tu+H`Oz zq-CqZAY)JoArQ_4oR<)+JWvg{Fu?gZ2mMNDvZw*?QKscPD6b6qLB%OrFXnoWSrd83 zxA5#>17*9k}1SxzWKr5+W!d7OtE@{*gJ@; znZt{DGPl*u3_+>QPxH=}sfHua93W+R zu&WCl`L6Z^#MCM&K?!rLau=?8m8GRX!Mc@3Dd5YFX`oQ5rgB6)ZzmP;H@|yZMwDfDcdw$<-jNiYyAD+r*3I^gYsS1 zz(qPE0Z4=i0I^{&gFyv?xClKUN3RrDYf`lW`{$Q1_5{e`pdUch#f>1K4S-JX7TYn* z_^eQ@SUHiL$mxW{gk|soVVWmKVkNS7Jt=@!O;RWFUc`A$t19`?RMvxpmk)6`&8R+O 
zYGgf-@i-7U+%c2h8@Ur>w{So+kIInsZS?6Sv9|Q$wuXJM&OBti;^?z>wBHmhrg#Q z(oqVl+$Czf%5l+iRhdt%zTVG2-FftE<#GGN8LOnat#Su95$|@=5293>#H&DZL;UYFd2QES>IG?3tf{*yOwMEbMF-V~1g^u;)~bx5%yVvoLT zvcJCafJtC^7~;W{2RE=C9sl$vt8Q*J&BbpxFobvdUtg=!-@!QFUc{qsr=$IlR`KiJ z#r<5B?yU)%veLK01U6@@yCODiZC{+7P!p7>$sPDBXytJ+j$Np4qa!cYi7sBghL>+; zZB0+u4W)HU!{bMdTwgJRJEp2^QZq7^ysVm`rumwm4`EFg%BibWND3hQD9=MA#ei4$ z;xqvwR~?{{0N(^4!(v?R9m$zQj!fNER!t<+%tHgaA5U&c-}OB_$M^x+*CZo4Fce>aByC8c)jVZ z-$g>a(lcUrEn6;GvsQ1vT=QkW&~ULXQWHypiQw$ZS>pv3UGdfL552|x`y+omqNUyP zYgj%wz>|rw0-P6vPYXIW8Mk5kdv1bZvh%8W3wx)Ol%}oGf`!Y~y6doFmYUiiC|&cVAk_+7KjJbs&i5n6@|nF+9w=uWK-%d`V#FwejHF$P5wPQQ(?E1 zE|-X#D$*qO)EjG+{jkk9DlY(~{cjO`b%}BjOv%p9kBVC9 z`h?UO5q@KmwVj`GBLiK8zZP&*}!=9`W|vtU2S_W z_9~(LtMOMZvU$|kLUscqvdi`Ms1$h*UzVp8|V&j{WZixn<$ z{oQ5TC?DMl_phtlm6iJc{WadzyE;0?{&k%Vz&}Zwi-~Dq3jqm$5nebkBu2b>F8Vg3bCSZcpz8Bfa_DYF}Psqj*7K=*%xN_K?*r(mxk}wv0L#eeZb` zMJWfQr>}aWP|cF12eqyEL2%NiA5pQikNa#L$%71br)8aa#-%&zrhj+-?BLql)LQK_O0J9BEg8k7Wjh{p_0Br3h3zoUd~W);;dr9<;c(r< zz)XM^UI(+*s#0D#tpX+!Q?28!u6O)WPce35w!qwHdODlF7EmU9G?sdz= zzFAw|=5yyAnQ%5RT*S7vUMVObGd1lQ92^@+xeKi5wmnMwKNJ)rSeUZ~dX#w?8PXqQ zuxS7kzsYAG>4&$w_IvjF@*GdxZ}P>9fGXnCH_?k83#zKbXiYq3otbezf-4>DVcWhn zBv>d}ZxgF>{+du~kp8t}J@dG<=F1SuSARAqC+E|s`afhjYR{_6qc_&mf9&<2CThH~ z`#X?;e;hh_nkBq<7foB~^H@F@0fhC8T2${#<8BzSn@Q0YG4x0uJsJPH z;7)eXnW;RoD;TQKpmO$GD6xZNy6U`m_V2BNih9Ls40b6e+A8w~RBTDaMt$XyfK#NV zrS-&8t+mE}qN9EC>gvtTlvH1QRI%IXTRy&y>guM_(%-bS--ZW^@$n71x~19Uk2->p zu(2K3Ln=-BVhXjC%{ey*(>-!z_aRQhRIDX`eVx8_>prX+rsAU(AM1zIw3q@7KgGoC zS+uLE>3^)geMdF>JiJIv?f0mO8lh$Mzat|+?Rh@_ZfbW`08i-C>_~dN;GmU9lckFf zcNs1DjvWtPE;=6nN^oV#38STEd8w*J`~6E3Lt&Z{-Ol5md$*aiK6kjkJR$-VmvoZy zK;hcqB(~(fzcg={cS!Zoo`%ui)41Oz&vJ$WGDP<&(n~65IQ>t@o1)drjTKynA60#- zq*2t8NzF-xa0hr-PG^2A?R&q?(ji7@s;{+JI{i+WZ*V9(7pE)S^eukzvKpYuxBarh zU%3A^g-wCkm_SW6)FY|7-Ru=SA7PF~@r*%u)ui$pn|(7Hd*s0I!N zcj^m6bfZ<_x4l$(sn(HK#j{!8+|ziy{fH=8S6eM_EH7CmoTV+++Nf-ONLZfQI;7hD z1G*2l$m5;)XgBA@r)du!G5j?85wo`2{UqZSg9R_MM~Ww8F5{z1Oy6vGTl%gcEWG4v z>FU9pvA27Vb)2KN)Uy!xhKndjOpXf-G 
zeJECYeB|3tKE0z$uqg6mV|#{{&$5m7F~y|YhVJ`}KilLvMG|P;Qb7rR1^XM8rB>cs zzN++f{c|;>lqB|h?wiN{;nzXGoHi4@78~)-W7f}~m)$R4rvJ@}sZppbko5_x-a2H} z?c`ftTiv~C_r{5Xo$j5iMWkO043?9!fBlpvzMQ7Y=y51XfBxxk_Lmalzn4fAmFI(Y zx-mDG)b^+Urkqt}SAmA5+c8IZ3HyTWR9ES=$bJ4~*YlI5_I8i|9Tm#3og6yuXDiX; zVFlyl6a*bQszwkcCb+wMdxRH@sZoBlloL19t7jD8v=0ew{JV6|#`+;am0xHv`(XXm z`n%JJxZd3NB_(y|#7<6|=>(-$FBGh!GdE@>mtrP2Etyj1HFY$oAF_O16&jc;F`b+j zzRN{+C$N3(@VSJIdStXF>6udfL~3>IqW<@v9iEX^M0a+HVC5hRB9JEn7xki3`W2I; zhfr*fwE%US7t#ND@DWI#B_vApYj;rnv|E_Z-`bt)AI6c|Gwro<)ta1hs23|)J1vP_ z@$P85f-Ar|*S|2Ze+W1f{nHNCujk|9gkGzTo%rz=t?F!tR?so|eVXGS^$1-r>AyQx zVR*;t?(RF~BsYbd`$g?GO_NOD3ko|=b$dOl4frVxljX^4^+&c&@80`Di%)!I^{;HH zL<*G9ta|g;7en}3xbvJ=?C#YHw{@`XB99zzXwu8sil1C9n9luN9Kb_{cX4sgtp!wJ4!;NAiZj>_pd)-L&w<9WHfsDbHh8r%k@lP0+#}$7`EoQy;X;$Yy_O-P6zjDZbgVcup2~m7iA_X`SHA|hWj11l9XS_dp zs_sz~h5n)_23n4h4Q0T;OU7-HyN_mh`%u+06uWGs8CQSvI`Ukrx?v0vx19^SE4i=s zM=jI0EzBDChbiA$cV+FlapU@0N7C58TVl_N1OKErC17$;TJXx8 zCJA}5-jhG(J*QZLWnK7UNGURrbMTggrmxs^j3-jiz-g&{*&v*zu144lT!&&;2bXJ@ zb1~gPWtZE1Nv`YMI&0^6sEEa8Cg3ivV!vlxE2=2-P{~(lxV@Sdv!u0Bw3+r(K%*sz z-xwA_UMtRfT+&u-CW9pVAT}k@^`nRf3&{eE%o-elif^!uXlQpKs>ugNe8*dOj~=xv zq=aOQm|IKME2-E+F(7(@8R4?z-t*Q_1rr9zqjsBJ#vG~PtUu+llo;~)Y_%SBXBrV6 zw3&XhrY2B+Sns;Q5cT8pla9TN8iDVj>o;jig`M1o5e zKj*@P8Y*o%%qXl9Sn|w8`;DBA2kE=$|JJ0q{bHyV_~#EAJxp^|Kw>&eL!;9ipOKLh z7x$`2EB0vl$Km>VL|Pi1we=YFa}F3qsuf{#^YFB?sab3efu*6^_Uw!|lw81Jw!Xra zxYn69{AtUtnH|Z{1o3pm&QSZXuoU`jD_WQbVB!v?i=T2?5`EK6etUiOh9>ISx(8{R zYS3xu;%1Pm;O}6@T4`lk7K}D8dRj{i^x^ z(Eq%Sn{N#khM)$RnaQDjB>SzXh*-dBA(V8Qh~-zYQD@-#x;|Weu&_MTClpaoYEZB) zuyKw_T)WJ7=VC=Gfu_>_-4BQIHmH>mKnV^Wf`LT-!a#pK-a} zGCVttlI{YAOp;x-9z|^pl{V8;!`6RD5!bp!(>>Mw?{#%!V>{17Y+n^qxdZ9pd3Xw# z84@K7kG6PVz*$HA95*05oGNOCi_7h!pDSn+n_TP^)SbIZjM(Yvb3nWWc@$u_&MDIK z0*|{8T%LORJUq82Q4K1%d{93+ZG5SrXCP-`@(G_KU2rk z_p>Ja^Jf=O2a+ehX>5FwBI4z-{VHki%YdU3?#7fVXh-x7j@}9$k%V_gMnw0O=* zIi;^(0`={tNWgx=u!ivS{UUT(nvt%q7tpz+CAS0>6vX?~LEO;i&nhw^yR2;swq~3p z@fXyzoF1Q6{SwOlo*d`pEhO=l__d@7yPaq%m0T7ZjAUlETUpngpR@LKOOKE3WPP>R 
zURvS=cqo7%q@rR#2|z=?F&}hS%jVZ44J&I@?CHyA&xEP;HN;wa2x!L*&ul0KZ`%+P z?^jrwvcMnOcLpf)p+}v8DV#W_k?0jnBlqW(=w?z zx~R&N_181hR|Q+o#o&4N@$ReiT_VH85NSC^NcKSObKtHVTgZ1Y!u)K5Pk5Ud|{ zM?Y9tU=OFbn$UX@zSBCjhr%wiRx`r2AYCiP4YGsq%Lv>Y7~l(w4%ILNjt;>k4dL&97h^p^Vf zQU@R&6m`-CJnXfOZF3ih0inpm*x1J%V15%45{mLi{qXgN2Iz;-sVPl>UAVl`(0B*X zL71`KLFxh6JjB_}iz-SeD1^(>v?+m>RZCYFzT3j0B6&{KFmyh|%;Wi^%|En)6nF>k z=^>rd!XizGpPwIr3{QaYn1DHc2xkY&G7OeqlrIB;#FiKC@Q|W&=lO^6);5UrgSiF> zdFarOVP5-lV88|jwUBs3Lqo$!va^# ze?iL59+R1A+$Qk`CM;uZ{(m6z`R!W{Ysngn|MNa`{y5hEc*O9resl0+6c0tdV+5vM z$uh=SKAuVcVd)m#V+ybBFX*8rKUIy67Y>E^8MBOkMMfJx$1PLZrNZLjt>f6W5^$6G zEKhD@GyYtG*xZSqK!^LnK8XAB>4?O2vPW)iS2Z4%D1VY6Rb2~_i z2bsbYg=1 z>Ojma_EvNV<{rt3iCCnHL|?s2BdlRC1=ExFCqNN@D)!>q)aQYMyL&wZTqYzS%n|eJ zAYKyz+Odnv87#(i2mWQ?xTl7O_8o1EwA4cQA1o-E$B$9`v2bxyIL+^%GgtGE-iQYo z1scx#MTj7Q&l(^Z%AlED3l9!H&s$m$tB;B_3855`tj?Zj5+f>Vy|lyl?H64Yh;zqJ z%EBxTG}B;v?ko=S(~+Qmh@=qr0z1(rX+*-}6?VNjX3c|76XWdyur~CXZv1PQemKjc z+W-Ei)&=Wa{zcUI?;=dR8E*> zeu>+B@zGeWuNa|r%LHO|7Gt$*$gY$G19ZXc3&U-IT> zX-99b6)>Uo%OJ}Z2FalHLhFP6mOP-ey4cX!GI<3AAQ0Pmeeh;C@IydHw*a$Essa5W zkl4Uvt*X2nXyciHupAlpUV{vF3P}7A6k%n>(?lUoGXOrR&v>h@;cWq4FNkwU$AO6! 
zAeR*Jd|G%xhma=b14%y*SXh7{nwFPWW49zZiHn7Wbm=FkI0*cbhYwr(`@`0+uixd6 zGk`(|R%+4M*j$ZM?B~zbSu4nR;a5P(^m2Y|O-2(1lBhAeC8B?gWc2p`_nI#GV6DM( zj@h=O$d*+g3P{kFjsicjBwU2u4t2c_EKo!5@D3U#B+6ZFosK`bQ~BIQli1R9*7KKu z;ZD};D4NIQwrrybeD%A7ucw1%qi19jEI1x?mnQ!0ZcabGJI2Y5?>O)-3iTB--iET0 zhB%qL>`U1(x0KWd4PVkbyK`^?28g79FbHzId>%|pO!#;~R4zj2KP(mkdl9KR@eL7m zRaKpGvrnLYf=OYs+w+k7_=JSWx;Rg%JD{|c-j{yl4dw$kxl_PQ^D=UexujkrHD_hE8m1wG{I2^9Z+vDMRom%OU^?A9<;hsp7HM9{v9Fah2aCK47HuFYj5lF z929q{gocKmDrZCuojtDnOVZQHudXr?Bvq{XydTSU`!4F@%%H{xT5lJDIO_c?xijKL z9c5)Co?nmdV95H!kxes=jg3)JfTczx0}HnvWfKkM?*f-sPu`8h$cTuuJe}ZxZ%-iU z6U+uri^ZG~Q$s=E>q6==ysFr^xEIFu&cKWL6c`?kmWL`+Qc;022b+--2*lvr;CVp8()4x(bBJNj$?Q<1gLF9kGHo0_+hh>y zzknbmS9y6vz%x7zYKjf`XsrEm0x^sVeFjniQbQ2L!`U`8v~QZ~bSIGP&f>z#$}64A zt;WrT%*Y?MhEndr*CSME_d_MK6!$)I&$}Na1akgeYc?M4Pp-o!BJ%k^7j<}b{i#Rq)FQ_do+?Otf@iIN#Rvki$Zye7T(BW~|a z&d=*BE&`bXe}ifatbQP^g#>j&V1WZCucCsh{>jevHX@G4Ma=H;qemco0l6$FqFUlx zT*R8hoLfp3WiL9nu5a{ z-^rkrZ`Q)X0#vQ=rkTmd$kMnk7#MH+30YzXNw&)`VJQobPu!NqA)w(#Rk7?9A8o_K z(i%$lu`*j<_v5v2F|GiYs9Yk^x7^7_F#BoLK$dOK^2&5oXjZ+}YZd!LB!zE;6Z*G4 zB!8IR|f2eKN zy&aj2)C8_4yf0yNzQrcq-G>Z%x8xz!6_>HuP}uLA@v_wZZ}ehek*1LA3#C2=%lJSs(zj zap!936-#L;2Xa3g4iCIwiq_B|RC15Z4a|_Rg+P4+idB^8k5zNU25tKHCI$Y*UjwTP zBE7*H{!&E+nKB3}Xz$-oQC-ULlV|r(PfkyFA)ZNu6da$&V4Q^8&IfLBoIObpNKj@% z$;`veO_YE<cFOy{rK5 z5uXQtnKJDki2YVk8?UbU+1)K|bic5;c>Zi(X0OHBc0DgW-D#y;jzxWMbFA*~Usoib zerPjw|A4!tn}ERSy`m*Kh?>ePDOrPSwy>}~nS64CN1CrM=u=8sxi9u()79u_&kwri zR3_RiGupHH@?Mnx-??LBA+cknVx$=T(w@_)>iDOtZ1a*i^aryS)IBx7zR6i;MEub2 zIJ@8xrYu0d+COudyA>buZ7NyosQ>NMRG|U2RY=9VB>wt`2C9nEyy>`07_k?A{AAcBw$gGMTC6fg9h4dw8 zb%DC*g_IQ7+sC`RF_(SgoRec>xFO9K!nHvm?E;}z9UzH}j*V>#6CbJ@QcnK-`IwfLz-MydY7FGOBp917)znZQj`GG( zxc?X#DTB2?iq3>p+M>(*+f*k3y}b%a7;pFF`|rPM684LI)lNIvIcvR*9&!)?zqMKI zk6So6aavqFI9Ep7Kk88I{q)=ijVB;@D?hgjGPiOvlQ_A?=UPEk3v-}c#}r@zxMGv+3f^+u_(%R|FI>D3n5xx zX__Ybe6kt~(@`@t>SK9FZC#nvVzt2XxB#j7x1?+7^wU)p4q8{Mok6FN{-Uj;17X;l z5TgVNavYq{?>ZnM4+#pI0>3uIVFW-2*#ml#f%C(lT=olvedT>Q+=}OBXDOD6f87_p 
zr#Ju~AZYqpnp=7e)WM|L#r$goa5`9mHO0jdudpz;UExEYZ`Nk1+Th#;86n6Mo!}^f zeI(-ZXWvCdJ^;g}FFS^1ch~IyM`A7jvAnvsO>a9@N%akn+&5XqL0ln5-SHb?P>;lvLrFO~{i7chJ)Z ze~u0v8#kNE|+6V2OAsFwj>d%hM&lWGKwr_ZS$yhp>Z=oPh?zh&@K8@{| z7TFMO4P>AwmqMkTw=@s_ln*NaD1DG-Z$mB5^^)9U{8g#Ds<>r1S*% zgYc%&+o*>irS&FF6By!wdjS7idrKji^z%#@Pib~ z`C)uB$fmZ4zX+Xx5?%U0au`!c^b_=5KfM6s6RaJs?j0WwGN7L+T4 zCT6tN9uPcqAI$!osROLcfQ#b-Eq)xY-6D`mh(O7{ImQSq-bKjrJVo_-pHX!b9Aef{ zpjCz2K0PeqpqT|xq&k>{Bn3=g5Q31YWwGsj?NC7Y-?vH_*g(+#FLni zU~nJ60vvQ*LTrOo)w~0flLGYgZ+l`!w+cXW?+HZ%J>mSvIi zTgDGI6WFjKZZ!s;)Pw?|I~( z!aVa7@pWEcK=0&%YyAMr{?dU<)0un3$0z0@q^j~+5qZWOlmw&pUXE^>L{i*6p5Ha= z#M+!MYJ_G(fz0$4E}iP^(wA4#3rtZfhAP~BQz2RAh&2HHbk@LEtCiZfBs8>3Kao&RLX#S9_$ zjdwv;!&Ck6!e>BSChaElI$dv8?LdF??p@fVPoU?82wl$rYTawF4kaWcK>rvU2j_86 z+faHzN(#mP%&p%zC%7@fCcSa60n;%scn=fsL5B@~5!~U2cq{IM6w+O_et+=cOf8N9 z#|8$*NJWLJ#IyZ~hm2}gd=BgN*(NzKchYI74eV1J8DvZ3H0O1{KaeB^X{I6~*IiQ; zbv$nb>%F|ni;BJ$6cG9;xqATbrcu~aP`*0oRYRG5Ze`_L2ZsRBW&%lj{mdLwem*(A zcN~rCtOUG8b#AswLv@F16Q5pu<`(EYA0V_5rp(W6=9xc_|ItozudLu}>*m%z*9skB zNZ-}dmTu{go(>uI`*8v2Da0?k`a*?#ztVjkz`L0x@hp)w80H&ZWFXNB;G6Z%7!rK* zL?$-USIGdSl@x!HNZ4 z@SpsczQrlU{&=D|1=nkp%x2Jt?N#ws_U6{k4#M+wdkVe+Ma475TQ;Z^o;(P(^+RUx z=Rh(C7FIAgH*Ga<>)*W#2(>t{7ro^@v#jDBR=6*RdQF9lBA}jn3VMWE6le^$JuYCLD z{knGqbBc=1MYV2(T{Sb6MKsMjvHl7R*%3d|HmEjL z1c^vc6FD(awiB4Jn^7x!3BJo@L`u0haNYTS7Y6^4bnO=oytkEoSNLswGD=Qd&MEGW zkyT?DDbjbo5yqAllt_O$``L5Po=t0crP-RqWhZ=fAk4z^^tTR-85Dng?>CUG_I#_W zOVMQwW(t>MDD+AF4bV_g0UPpu(3=?mO>ktwJszUYWn?Vd{BS~h!Y`pq1D#Pt=IzMX zSg3QU*iT3SR4V+)yTs_sI+~r=%V_&Z5*Qjw8yhvS6uwUm5zxrqxJF}abl8Y2D?0>A ziD+D)4qV2l#z0g)*WkuOXr=AR1Xe>TsIxsCEyaWn0<^hh#Uv~CzApQ_c2Y@(&c?y`zZXL{3e_C2LmdYxH+vU-o zL1S52aCWxi?yR4y>m~yOIT<-~oGg(a`&;`|aMu8Jyb`3wfB!f+gcsR=+=kU|JN6f#Vr| zDX^d2g}tC&((i!m`N*iKp59*Hhrk~3bLAz&rE|8kV-F~aj2wml-j-$XimxN_2;I*H z62-B2vK4pT#&ElShpmL0+c^xfA%+2tAOd3-Fcv`b51!)IKI#tIPrnSGW+0>@k-0(D z2LR&_I)j3PjxM^;6oD2zpbIfn?t_9szvUJk%HPYg{UzujL*D~OQ-&rg9evdPU|Grx zDfb-O4={j%h=Rd0Eu>rf=gKdY9)C9;HvfFBJ62V=Bxm!=B9Zl18m4OeBvo5gcCG8~ 
zprV4;wH?T~0hy%Dlq;upvC74SI(wfcbhx-`$N3*tkw{hiK z&&yLquH#&m(YcM$)#mn2*w~smp6n0JhbD=HA8#Dmo&S6yt3k|Tzvoo!T%ABjc%c)W z{y^BB{P(No@|l^EJY~BqufK$I0Ka_awp5pPlT~+~UVl~bM~+W1qMk#@Z$Ks3m-f<0 zyIRP4Ny)}?X_S;tO=q;#|CJ+-HhWv2la+|F++0?Akq2tRrUPDJ#pc&>B}v2Q-wOjz zR@oa_CQ^xTG^_KP(!rv9hv8#VW0%6a?S~W!X|rOQ>Lo&5H6!(*J6Y}HXSKZdw+=pL z^gX-JSxrhklB*@JliaxCVxl7ICRM`CF?BIOWJATuY6GK&(3Szw_c_3!@w#$fT!8bY z+5Ew$7JVoMfm-DNw)r^Q8rFzc=H?q<*fBRVYo1BZ&u0uMf^d?>-^g$)K*$d3C3h~(#pR(kA zElG&soviP9Xydf)S1>=j?WJgg>1Q?{kXHf?AmyHrvohuC3))bpCSZ;XatM zAXH?2toWus$dHn~y`LJo!;KP90Z-C+%O&LHi?Z|#kQcA1PS!ip9MP7irG7doV^K^_zi6Pw zFIy70O+b({f>K&KEI8|YRW6;O^wvK3(c_M+%*i<&DN?k?GzP|e-3A789iiEQ{LRgp zvrEii$(Og&y_Dl2S~3qNOiR=HJ;C?9WS#BXPIjpKQ6w+AZH#=lE<{6eWRn6OKVFzN z`mVn<;dSR;VI@&nH>DK2OPrFu52J#8>mRH#8Zn zP1msE_^9awB&?SRW|es}T@3X8Nad>EWka*iC<#6%Hj@msM;$SdDOPP*I~O>?rX0@s z@^=ZFjwtygyS~+6994>Y>+Bc%BaW<42WKN|afCQHpXT>uRzzPONUh{_8s4+sw>ILa z{1zBkyXdR}=XuUU!&#<9o_PQy$x-WFtu7W;ZS7z3jK$q3Q#3KHwQyICPvjAi-~66% zWF8xB5EJvtiM!F-)IV7;&}Pu=jLMdPz`ZUTRedX8eSfE2qIcpniXTDlYz_+E-x*Y7 z;{O}f5FdYb-?7^NQ&iiVx$gmU-syMP-Zl4P_jEO{&zxty71;#yC?d~a8}&cDEpZ9IjuI7jU{AAjgmYGtIT6MNqJi>fRplH)z5 zRM15inmn??CNY^Sv+VtS6tAUVT$r%k2p-(YQ1nLyUq2gE4pV)gbS zb0F*QkKd3rl-fQS1fOhtA^UCkI&*ga`YUGRp^wjVmv?+B>JwMpIil_7%lpW3v6w#J z74rx&8_%NB8dEx{C2v*TjFbIaV_((Ure0N)zS@Ahs_UiRc2D0aSt6V_*HbxuEWc!4 z-`z*{JB3hdXDAXL#k_rC;Jm5+3_kv3vcMTI$cKz71^@%Bn@3hJc_Am#kU50Ack>Ooy&R%@^sd z%NhrotRg?R8J(V2VaG^}*R?J_$1mxYv)l%X1f0y@(^JfRO_c7$9X2$Y*#4nQaO;$I z%CpC`?v&ix(hwitreG*9C#*Yx`v0Npt)r^!wzyFR1nJIANOyOOq>|Fof^>I-fV6-h zA>AO|UD6HG-QC@F7w>z%d(QpFxc=uD4%s~WS#!-fe-&?V8_#rH$?tIS_=~flsNr{2 z91e9m(VVh%GP-@?1?SW)YFDSGOb468;KlqF?PncNWR8^#RUH+6hn>k#-r={Wbpk0# z6z(ax+Hkb2I&(qh;r~8z4s2^ErU=w26uz+4QSGUv#lnWJnoBRgy<%MwUaNU^q=H-j z;TOfGNuUz(B|n&w6;+;ZtaZO~;49MPmPHV+B$b;ZL1hx%E3G6F3*|I_Z;J5$?85hJ%(okvTtYBTtzB9HvTI@Uz=~%SI0s*~=kY z8h#u4p_961(e3thHPcrEWhy%`yg2GUFEJu(n(Q{8XxG)Fk+~^r|L?{fPJRA@GT&7Mn9-E&cV}cIyrBne zaHXMD{EZ53tAB5f2Z+c+efBu*Qr}n)P}u5#ts{m0-2dgXTp=lYne2FbM36cqGcyqe 
zn*`ogQf;jQFk?MFi3hcwBd%XXRh7q9-!c*2Vv#O0!8^hQg8#Y3Y6XIV0`}V^n7B+G zMuTnKGP1T1W~Q!+CRe{)ndA^jStm~)kPR+!g zj-qsO3uKNzse30AHLp5!dKms(ALBVBh-9Kc#Hva$AMNV|ic94o;qP)+UIjHNK<5 jU1fDu(cQg&ESK47m%{nXNKQrNV7ZkGTu*os?k5??(`BI-7h3%M z_Q=Q+^lDaTH}k+a2tegKd(KXd$d*ewR08vJGj+n-=!6>^{Zk7K?vi3Is03g9;HQMf zM@P}jJT%iRgRVkxucz(We71%=#-4; z;FzKRG?>A|3@6Yqfe8|t1r+8j+ z@c~XZ+UlTiIU&riCvEC)66xzrw5-V(4es^MxB8)`jnnxMSWC7|eNUEgfn<8LpH8`m*i8tgQTSe06mWD#_-=#0d(K#z;|O zV0QLnv9@hajvqGGEQvs^j@JBJaq(u!2&)FC z%HyU6&d>LZjI`&feiHIezS{l5vQ0`#-qscrGa$CQ_A`!YfkW?YNgVSM4sM*dM7Wb3 z!Pn0*jGpD*$lyXmCA{W&nJCdN?sk0~u@e$PqBfz^Oo)is&JC_6_pIRG#qjVhah&L% zEO!(eQ5$JNSogX0N=Ot*LpQ|r24ifj{e!eH^XnK0f!Dl{H}T#=!~Zb1vX#p?u*3?!mxr|&`%R!`~Z{O2u6`j5Uq(W>2(_x{ z272?<*8og$u#|oMXb)hliKkeu$)G+a@lsU=ti;V`O)IsDr#-rXhelc(rWeP zob_Q4;-SE|vYT3j6aKuX10pBe)wMNhP6;@XN)E8Nl#!FO+EIO0e5N%wr$}U0InbjO zrTlazb}d@e)|}GlgrxoGUD98GvGLzHd3Efu@{WIY*5!NA`?0^~y4cosz`0I@6L`o+2eV0nHo5AJ{% zJ#}~4!9xXL+}5j`?D5$Bm#d?}R?0PwX}Zas1hdAfQ!)ygh=28D7!P@`Gf5zLX3krY zaNr&!O0uIdXor-*p-8=7aVp9@yDG+}7rIOpjkMo-Od0ZA>*`kN`>K>H6ksX(TpeFx zSQBP+^IN7fr(PwC4z@tbZTPrTmmFkqJ(l@^LF?Jgg^RsLH{!YI5B&;4Bg$!HyCpz( zZ$XIY#$*z@Dsq0(1&VuM9%={{NRyM3y2HS43?B`x)O3t0vr8wG>iXse0H=(f%CoYE z5i+oCZV!Ksm|uPkQhc0isF?sW5O!6P(<)h$<9M4kL$H5$Gu@(%X`0g>*SFR*phB1! 
zRYb?c^czBS`j5n(PU?s3sn?M_pE{UVBEre??6 z&(BvB6ubgb>Dk#!oS_#_Yo^ev+NuE;`@tLQ%X6~1W^iz8=HFkcs!q>TcyDdl99K0k z>pt>#cEMhKLD=|d7Kqe(dDP9#!CBTFAv_Zk=~IFrCqsKQhdWn|j2e=X8sX}uKUKUE ze0u8gKu5>uNd|<OVRD68( z|9nkH% z1Y6$ef;d3g2ux4EoGjW?R;~x!q}ywx?-5JphK9d>@o)>b{8%HnS1Jw9* z^d~Bo1E!<2moxbd}Rszq1;7sgID}JR_ByYyPMy8K=JCnO~<%Oq@d}+FhowM zXl)fUZfr6Y=Y*@VV&mPoFZvsn*LOg%6s-!wcab18o>N|3oGt&v3?q07-JPE5(FW<@%KNaP^6 z)-u=Wzw_2qCnoXpog1+Qd0A}}N{)KGNk(KF9O?N51w(tkAQ4VOgg#7_2;10|XN`0a z3wr_Usps^!x{4})f1i|;gpiQ7uP>W^$<*?&ot~x;3VYSn*H?q|_kaNK(Ke&w)Xonr zWlS#buChU7mbnSdC(T5}1A?@#?j*c+kF9?5Oxoi_gbaWzKKBWLEVxHqJ!ZdbO1W8` z^3q=DzALQ7TLSSa5Z2W*MFIlvl9nc8)XmMyUY#GV?Cgaa*OOr(ZDeFZ%qek!k{^J4 z3h@9Qu|1Zt{crY5Fyyg<)`Ek6@f{_VwyUzsNh^D{6NZz7hDy~1{*|5fz_}QHE_xP_>-#NECNC$ z07nG?T^pnt0qu#2$?w2GxUxOat^yi1U_kN!dvSn!1205?tr%bKe8>evs8<_IjEob2 zOA25$j~ljm5J>090s#*pykdW{Vq*8#5?X=Awkx$n6fpk{DQmb(VL#cbiX=i+~zt5zyDmaVg3^I9= z!)y={d$Zpi4T_W-!!CvDRJ5S2VIY*3_|#QKcGT2sXJz7I$4`MMD|_xQ0Zbq2?s@%n zVtl#12OC?MIVSqsm0f!>QcQ^R*S#Ija&CIWL>}~|-}!A*r0m>Jr|GTJN=jwa^2eXI zY_x2^n^v|N@QDe$g_5{`v2pU7)#7 zDcksRoVK`j8=paZ6uSW!6oAV2c)M2uXbpg4{_JB8Oc_m0O{=S`OU=O@3GCbf70Joo zes4Sv6OU*9l}??V|L7MDZQyJPOgmAC1%9QZ$dN^!UtF}hgLNk{HueOFXYBzL+I3Fz zNwij|^v9Li)g+T=h*#iq{iqKzwCzu&GUHZf>si-0N?b z?0Uc*V}4*D*D?629*&Zk?qo@pqtleUD*JyuA*L3Uk%b<#Lgc)LSCVa%59r$OePX<| z8&_aY76R#hrl}|eY6ON8v(R`P+~wzg&wdHVyl|#f*3=L{hBeJTcqfY7Z8kd#1Osk7 zzbHxSc-$_E=U7QAs*}+$(=Ah3&4#+x5=Sc#Y6e+Pbc-`7QUyAjqLiqaD04OituvZF zT`SD>wxXJ?Gk1*X3R0Y$jsO?a0G`Rug4Kx;z>`#hNi7PvB>}P!Klr5~ zcT=3|uCcx!zL5xvv5fc!COO$XTLxXNtj?#pLU-l?<&Ed1Vmq|u;#RDQ*n64u|3`^~X=CDvx%u607r@dR)z>R1hdz<0S9@P0tL5ZEO9 z!6F0y9qcm1^9x`u3jixLbac}JAR}}Kwwx?mJ8j}xf9yVI0y?9OqaGmqrHuiE6ZR(w zi{96)taZSH2-XIi6o7sH5N2UvVPr%Wo1L5E2M+~gFgy@UI`{+v_f68-n79|L195QRcROABz<&TdQqMEv-jmj`c&w}j@`Ju`zpi`3Vo-x(6?wSd+_u8U<| zYQ^WTzo1gpl+|JVs4bcK<50le{^0%RibL6RJGBYBO1{3ZO19Sy`<)ry&2LD;B&Md+ zj%HOVnji1t`g2Bp3kZDed9qy$Dn*oKcKX6rXUYIOec`P#ou96lH=Nq?LB8dmedHT7 z;TToAMd4p1pBsJw(n4R8 
z5LwD<@~cUFinC$V*h0|Oa$`N%{Az04)(G{;c>dG-ZxZNnINayjMg=A4fdv$u4hh!U_Q&p%K6+LesX$qs zQOl=%;b17)Jvst?=p!g@rMAak{$2-na0noQK#8*IY7iJt1Tu&K3zfzVd~+bW2bi-h z;VC`K6@Xka2OeDAC>%zuzOMk=2>dVr8L_jld|iD$W19fs5=jJ?ZZPr1?yi~11gPz7 z?ChjX*}B{vM?JWmkmjYVnL=nKNhg(9>m69GVO`Ttm_JdZLLT=Va=Ub_UYVYxy~Obw ztnYarO<<}TJea-1^)eSpk@5=W$VlQzHF|@P*Th@%3-glCW zn?0^1bBBnH2dFyD+@w|!Dr#XUQ5HfHA*s9B?E2wV&6^ret+g(p!Ri6Ub@7^&f)--B z(6T(Lj@k>AgWcuCxcUA4U*^3xt@2vz3Nholt;v~3QyCWph==FrwZz4<%Md{g0*-qY zKp>m@Jm8F+7(kHXYfQ+{)@p7A%(jV`y=(>Gry=@K=xCS@8|n@IFK| zoxKPpFom|+#fi;+4Mq0)H&&MK94pfQGBXcrB(_P`=}STzmQ-fyeKf8SkL937Q9T0h85$QPi;gvOV}bcv=EABuG`cT5g3l zgk{!i1PX%|=c5$B=mfW&51uhSd&8U8e~sQux7?qGCMM41x=({xC%`!Rx@Tr;N*a>4 z2{b|*S4q!V$jbM?v=(sPpfzD(U|z63C!wgvBqV@;Hv{w|rG(qIfW<^5pF&7XjDqCa z5rht8UO>_dw3dL(PQYOT`fg6sQHq2d@F7sSz`Kk-AXb6{3=sDci%kmniaUvAKCfII~ov$DGU!m?`xRm=PTD1d2!Z?3dHfO zTwLipz>t;!6Vvm}1SO?7C8fTg;DWEO2Qu=?Tn#T<{J`|I!spqvt;iO)3kil;4s#;} z9qDYc`0qHu9og9!z-JdX4s0_ez$(~syhmYWu_Kuu^=RI;YwK)ubWvQ4qN6)7FaSjo zZDd$zdIAJBfH%lT8BU?`6J30FVn6Bh)+JiBzR7pl!LScs_79*?eEhuSm)3dYbQ>7w zyLKJFtFwtW;=#0_Y|Z=^pP-~k@-cQ0IKY4+q@W?las%K+Xk%Ee$4`DDh(5xA5Tq4Pfy8qUS||Yl zfoS$QA3I>$52hp*K0XieA8|{rnEwFS<(7ivRO1X1i4M$$lI}c7SuXB>Fq3 zs-!-X1;GyH2E~fk?Ofpa=0{A-Qk{K4JiFxV>}?p)95b7pZ;SV9n;q>2{i4D~Sy_lr zKX?Tt$tB%}6LAsT5>M7}7n>h`y?JeE8K&K^2?ryVoz=KvBjxBg@#l{Ya3}1tMcOxR z^_7C5RhfKM&y%*@9v}`FIro=yAEMSh8eMhvd{&VGk?hAVh}=L#J39WWaJ&D3<%L+Y z=UWA3R;-(DE|`?Kcwapf0D&|}znSmv^Zf>Bf-ee~Q`N03ERgO!0D%kS3&BAxRa?%1VdEzSWN`mX@@0w}@VJi3n0qo$KfC%}&ds9cnTD)c6PnVAm zxmdWmfC+pWgw8sa3pbQl<6OWJ^Q7z4Ixy5=4zav$qFg1RJDw;ohmDdgI~r&+Qr3^HP4%R7$GBqV&XEA&pKg4K6m4}V2?Gz5Pq|h~Pj985yw95dObuIW z>aPx4_3Hx3UNNzqogD)M14vhjfo~%H@$|Y%qA`ZFFXnyQt&>Ui-TSE0zBAdAE}9QU zX0vmHMFvlgLzq+y_IBlAp`kC_1RS=%b#!Ec*+DvYfl6 zq>`|na{^~+Dv?V(0#frOA2=_BV+3N+UC8~w$_!t=b2G-(gq~GyvtfAWf zpghJn(u;obEQ3*h6=3*e<;+Ql%@pr)JE^C;bTC)_uwQ*IE{k;pWYo8JcRZA(0AY5xSF>qjI*KJ%JLe|&r6lL0U?`0(f1EJVZo&8gBXW@7cnMZ zt?Lx{R}+9zzX{0Mv(=XUD>nJf+^^RqKO%Ox@6Bk&zY=dJv~b|~rZTT0t~B^+EQfKX 
zV(ZJhwl6lM(DuNo26!$XwfU0{4S{R#3>bfFDJTR*Mph+j@A)HkOG*^GoRA%C4q8&n z>!2bf2^tU88V!CwTyRIp@;qI?j*hM|9x7Cl5W2cbb$s&bt9Kl;v`C;;gx3d>W@Jdc zq}cUv67S{Dxn)NXQDZ#6Ai8-Z)qK1w=tI|hA@R4{+P?fWbbw$N&m`|*ip>oJ@3vfl zOm&`Cfz*av#kV+?)mL5O#;J;6_gMFy3JSbJk9XsECtgX+39jUtVI3h^0~q|;n50K= zhmCBl6F*SP23dK@bMasu26#d~hPbTZb=|VJ_t+v44x_fjUXAtfuh?CNLT|QfbjlxV z_)9YhIxH6YI7K)ON>h%??_t>R7DVXsx~+|LEx)|@cFz~3P&f&v=J$4&hf}-pDc-Jx zPMKm6PgV(p*@Jpso{;dt=J>6w?8<6qa3Z&)sTqOALOnpT_|2!7IN0rR5N-$v%rrE1 z&o*DhCvZuiv9X@~VQ!)DxkMq}=W=s+0!-2D+?LRA;)24C#xI!+qTz&1}i38s#AT^4MM=O&>l1`9UJz>1W-70k^ zMT2K#{`vRm>Q*6o`3IJ$B(hC{;NK17rwh;vDw5~1u6kZt2Cw~Jezd5a%kYJ>;jP@YXs zfxHg?umny)+ zv!1P#{<8izlKcS+5+uA>vgtifd`^!c-eI@@Q-h~3+8|X-0lEbN3o_2RC()OiklNtN z9M?sXWkm6Cax5-Ej7K8w;%q)wn;w1hWPZ>hmh2~6d&-op00qlX1l(&ZE<(OMb&EhH zViAiv>(o}9jV9xf#Nj_Ad0oEz3Qi`$`!Mt)MEvZ__q4kn846+xht{(>UR5F7!3NH{ zO^6FN>Pj(Jm!2b1Q8lG6 zXwtvY>zL}iYY)&Jw)t2h9(svMg@n(le;!Ack?zRE*>&}&!DfZq{q7^PZZ0E_*Iq}F zq`#p)Hj}Yl++=^?#r=%s9o?m*uI@0-%ehMyWm!9}-vI=fS4jy?XH$5>&NPeQl{)o& zdt!7`?P5Sf7uj1ct8G4^F3f7xHceYFEG*?(YjnCECRvlz%-PrU;Ol8@)#-17pu+d9 zgz#YU^N`K(3H_zD;6-;6_RrNQrlh*mQ$5iyt0hyusw8y0m1&ND6&SKKUZrP{g@QzLqsQS-KI(ro2`=_vyqh<+$_?hJ~Ofd=#{+%APF3 zoDL1BC1PW+p5|8B5=`^eh8sLXN#^LlC+~@X3UTHI%=4=9@>83dz1azk+osfGD3!+y z@nE7^%~?mYhXww;wI775EA4S}&PguU_Zx-nYD#S7iI80wo=SVGg>z%>15JnbAL>qY zI(pF6phe0X-gA!g*l$_eU){rYLO`i%hR`fBEa6y3{UOOVtWGZV`1mz9yk4)j^=wV> zpv!FpN466P~`jvl-u^N`HF~l zAZ-JV@#B^L8Rc|_130g*+3+cgAQc&pBx!gbJ|dClAe`CNXnCIY=Rp=UIS(d^( zJ;&dIS{VjqHO|Y>l;sJ>cd5mllw?Ta;%qb5Re_gUwVFD2)ixxl#j1`{jgm~RGt)W~ zS^i14xZZ3i;Y1<6nd6y7H;2ch!qqAYjif(FUSS>CY1KDmHF`%m7~A{Q{&q8$fRDO( z|3M;7UTo`>hvHOChXeoJ`;Mqo&u{tgOcCynJrhAX_WcNF+&W$u7%iA9X9|Qw`ylOl{rP?dHQDH4g>h?T&!)OUZm}BeMy6 zm63&rPfb_aa$|atitl1Q1Z_LswbQjT8^#AZ%ptz)bLTIemkXAA;Vzor)uR`)5{t8e z(av~jgy%PO(M}9}^0?3}tmmT9g7o|dk{41Mn_A8ecYTwrL%0(cLXaD2)nHzVu%5Rg zZoCtZcH|{fqxf@MIv8k&*YGM+D?_V@<`tKmXEasJ=%y-_MGi7=+lIZ|L6gJXyeDB2 
zcBiZ9;^KH`w+>w~<^#(0SqWha{zk*OrxRB@Zc6jn`r_VzpxR-}fXmX%=(WPPmgUEi{-?-{pDGq#7V(kwyWKns5#=d-Fk1mZ9%>X#qXIw1})Fo9^;p{y86 zA2&ijh}#j6*7At1WQDAi_`-^~ut_$KxGncqlj{l!3FJoTZ_|78$$O@!#ESE=)__ZvCi@Oj}I#{E-iR zyrnDTWnXhSwYAs1#DfxBf& z9>?dC_Qa;p*q^7+IFmSM_eEZ7G&PR+ja>(On){yI;1-hhrm$g}=MhRv$9|t{BEOZ_ zpV1#x0XernE1R;nuv!>y4R#mP>CUWLX=0;*GO6UqByC<{IpO zbHod%L*C)?Iyo-aN6+{n&y?D~VOzsATVp@d`fW+S5;AVE%@^5q3U+Wjo=5&uzJo@{ zj@C@XPdjrL!chNxJ|9DB?!OS?y&%ITL^CWt@V4+@p+I+n?aT+46 zVrHrRPSj-FUSE>?$UCMxQGa_ev^v@E`Hk2DZ-kuxDx-qZ42yer_WIt)9F?|^mx26! zWZ0Dc6@#$H#x+TLS!Qgw?tx&JfkSjj#pE9e9N5;-Kq4*oRgck$q8bbxL60U1m%+-5 zRzvX&*Gvr6q@)IG!_zuZHD6p*+}{NW)okG?L1U)s!`6K-49)^VsGB~0^tA7G{=>-FhpBcO-{O1-|T>H8@7*(n-ZX(rV zL1K*0kALKj@;W@DldSjlr#I5}XgP4M?i>$>yb>lPjet}=hzXM{5zW=8G(T65teh8} z4N2Ca+#wk}!En>x_|A7YEH=}67S9mu5LEAl;?`}A&UIb3PBT;5>roR~6$*S*m#BQ2 zBpdF;&)qM#`rmZZve(GK+&*w^EW}Em*h_S3%QRQt&bL`2UKutIeX=p(3rN$CkCAL8LW%4pGO zqCYdu^i!l{ygbw2wnE}iwBB_4H||!P1gf^v(dim1NC$wuP(VMu!f6wU3ZtvROLAZIm8^hB6V$_Kz=y6HT$=ib7JJ51)t@|*EKOs+fpcsPea%j24} ztL1&>HlaZN?;B3W30&g8#9clKYX1z~dcSRO*u}Ve;6?*3H*l zc?_ouGd3c+f=!1V7FPHh0o~#R>VJPf!%6`)q%Jd>&Yr3pir{o%n=KOG0!qf{GW)?6 zx3>zN+OJDfKC{=B&U}i7cqdH0`uO;T-0Pu=L;)d^yszxL{zl*c6CqT~{dyGwT#cE< zo5+uSlr>6=6e)F-v5I=3Y_!<(6+dN_{P-|*Wq*@$Y~#+%R6PIq|2&V*#5chMmHV~u zdljRL1!f5QP5c~emOS3$eG|D`@s%c(6ebr9Q#15(LkHGEAsHLDVU_la8*l@7(?x1< zA0#>hpX~Mqv}G-;aFP?@WQa;Vdmq0?7vd+r6G4T!%^Y)Qq7t*ZL~y_-I-p0FhR5)} z;9(r+q%$*D*Oa9Hpb=>0Os>#KkNn@q@l>MBmd_j5aTeOITubrSAd<`wF+y6AV{ey( za?92R_7{72%zMK>O?4D6*FW(-HKvKlpwUE7&~QmH^u6jdv{*FdI2!G?dZbbtJSs= z3txSnImxZ&{%&}!ie`ds3p{*TJNQxwxiL2Q?4sh0&zno`(V+$M+}NDhC=u;{Il|Y{ zylMOBb12vxi5v;LO+t`$-|5w6xc|)LQB_pv+lZn_bnrWfL952iv_%Of10FowU;Ecm zggrRh+?lYBtfBusB6V(&65pm&ocamtB}>x7L}cN`?FMGYo8!P>>-1kPge&!g#}TvmY(w{d>FZ@YiZ zYG4oj;UsCqxqZdfBq6<^`e!{-h(ShF`PC6Z+W-AqyeD*zkDs&i*k7b=zGNhD4%R|0 zhqLaDHq$zR#{`8Gq7I)DY!F_4eb!>PbvstiCI8|!&)XJNe*nwi>< zE<|M?-bi)8OpT>mNaKU4bG$U` zR*=!gBmJ{%76Z~CpAjYWf9GOCQ6M2nrYhN;%(uO#Il7N3uc`9~OQ(a}VyK~et{NJq 
zo%{BQpRKQR&!Q*#1Fi`7E!ZtiiltF?wRPC6R=e7n7dH+<@Cqrh=G{&v(>>27ea?~# zWeg-Q6+@i;eAwX}Oo;?Lu-m8c4?IxnPS!7yR!4=17(&HIKcBc0Y9=z_ANG#>-z_{h zr?V(v2BITh{Nn?U!92v=`Tku~ikaz0eDM(hAvhEE_BTkB7>jen zJ<|cYCA!)K43g_j`dr+T6A4^BW#;P|mZTBGYrccAVaK~xhPS8Mi@##v6wA+)T?vm+ zV(lfboD$Hi&5depeoK}~ktwgpEqHjW_*Cq5U0m3g=HT%Luc3~zQ9nf8U(0fZXB2(E zPAID1~!*7_7cjdKY?OHpQXp7&o~JyS!uWeyROBYPw~2`$kmlBa-P%+00+shpB;S zRaLFnRHdGkAK?!T9sY%)@Z*+2qQ|Q%hMi=3DaNQ;pEwQe?@L&z1|gAz9)=w`?03v1 zd<}2KV}}1$3A9NeV_UOcd?p~QjdI~pomfzT`ftHgrb@;OHed_r?PJ`uv9aXzQI@Dc zH-snr_$?&c)YZr-yO-ugp34{?)&^UV*3A#0j)B#YO3ziwXs`T0P82PboQr0lI{&@Z7h8NSrAIqjn& z>Aa}>a6TN@Pi>_hOQ`Zo6Kh3~dF86EH&HHS(cN^m{)CPIN^~q>L^N&(e-|lL=&iJF zXpV_bLf_~zXP*3MTxw8cP!>g;hs(kHUt9Z*aj|&uJ4)pmDFfvJAXT%3wnNRN=Ek6$w(|WwLu3m~llq`-2 zKRF}b$uG{Htl0@#%$yIDIZQl#~LNx&QT$^1Ov_rZ*{ z9qxW2Y&~I_6agmMT+jR)mcGC!#UR3uD!2pq#Bqs{E%#p>-izS(k}486qS*ba-<
mr!v5*oxE&__`| zl1y$IVzxJ)A0L({nkwg^qH{TSbq0-@vLsD}H#;Vr(ehS1TX!r{hg|EP>{hVuf^6FL zHtFD}_niIX{v-XR=l=IW2p{TU&RFTi<(LmM5V)Wut_J)x};Xzf2Xf5P||L6xua*I z9ew$Cm8%H-8=wQEnd3sV;>q$@A;0GyWaq}8B3G0D84RV)^>FhMLaC_9VA>(_P!RUtEWk#XpWeA~Md!SMMzvZT`p( zjmqJ@*xk2Uo|hLZHzzgB?A^|(hkRLyXKBS4kq%zVr{Pvy>9*u0sf~_T zr{t-@i-ah-Ip1*}89`eN|DU$_8Hy~{!Y3qTP+nnkqzIbv#>VJI5XTS}ZUG+seR00v zsElK!xx^`@yQ~3-T)Wh0c5+;DCqKdW@PTiSV246xwvxhBf*jG0Q>i9P(7l1ebn=f; z=K5<5UEP90F%FB_7Z7TCn8=dUrnTKN2-xtL2TuC6yC1#hiHb=eie(`%JjynRi+CzC z+Eq0sx%(6Em!;2T#Vdy)ZzZTJY%VDx_fmG`vgzQS{LZ#^{ZXY`?x5kd*?Ph0MpdQz zw^i-;|7{~MQh%_}azx)U5~sH~=ifMsqFJC!4qHX}k)vH07>cJfwY1+4Vx+ybF-ahumsqU&2& zz63W=DZgcqWFWQNL!=$q`aBc&#@uK2|9iHT?rdN11NcZFh5F z-1%Yx2D)COip}ax3GG&%*=x>kun*+??x61gRp|LV0+@?(L$F6!&D5mef(?I@XnPYf zQd9YR-#QzsX?)5`iQcc~!M?6Mq5wiP1efNb2J&0~PbaNRmCp;JDWn?Hvc3;f^71f6 zm8J}=MX`0t+J!`(y-s?`6t-rlNQ*aS^Zp$}HVf9PIoy~aTVHLct~fpT_pAD>iRt39 z+JC2oss3hbyBrmVP`8WlMt88LcFvLH7+;xL$49Cu>(cz%86#5G7L`+u`TB7XbS223 zEBWfu(&Lrn;0$)t#|_RIZA!t&y5?&phFhP>W2C$9C zchdDy`NP;)bxQ5yepPsa&Fx!B6>OdL&CGrY7f$ov?4Eb`{};@-_1Rh z?Y^OrRsTtud+zmYjmbQvT~$%uXZ>1vH#w2Szv@VxWOojTI};ZZ4O*a`cak93r9CBY{AW@VoAKc`O~vO5h4(?4J?&8!LX2VU%G zXwkt=GDeQPJZTuULy#LLD#|W!-I!F$#ouFQBsMn9Qo}8e{ugouqq(i4lXS1E$NJEu z$ES}ISE{ZGFr-pYjcGKy6^C+lA)z)McJRn*oksW7BIMavWq%z z8ipHWHQfz+YghWOVnebm6A3EWOj+9Mnm&_j?|no}*5KC0*jCY^!pSd4xR#8#+9q6_ zBEHXmT^j(~al|z_sYNg!L5H5bwO*52NyciCW?;_#o~V}opO$q2w5(f8uhc|^zP-Pr zr%53DLw!eJ$Zl{Cyex9F`)Z80tkZikA;Wn1V;Q3CbxAjsAmg>sE2-MEH~s4l^!axO zGxNvN;os@46BZ}}EisHxxy=pfyGMc$HPNvDyqSQ3Ae8!X^gtoxt7LS@Z%ydI!h(;D zwn&UL*v=`9`I`MyCz)>8AP=*b(QGzNXNuOm+jY8WkY3$u{nRgJ#ZGL|wQsj}MG%g6 z4u)yb@noa`F|=ED)@8l3;fkLMbp$7WUar^1WU28Y=tizc_J0N%OCrH(>~{5|P5Ypx zxWmB(d$+6gR@jRPHIjz<&Fcxx6U?s{BaV7VFwfol^P|YM+v#BO?Fc=SejlYUffbD% zN)xhrQilY&f&NT`-@69ox_QjAst7MT=7F6e)3>>?X~UaMX~;o2^%v)Zm}1hO^)wj9 z+$=1X(|937`2Q&VyVvn*VkgZb(j6q>~~vo`ObER-_}&v-rkm;EUQH<8%FomA#c^i03$Ls zvB_{bx=uJ_q5{&{(NkdpfSX=tJU{lntkoRl0-o(1ENllUVV7x_%+Bp&55BKid 
zh0~RzI;W$wh}_Dz>BtdL1LOoQ7h7h|JIiAEUo{3Taxm`+we0uzeduWdst(5otnE^( zKJxd4^qty=NTKYE*C<&`^I?e!dJuIp5~`|_>C=nOye98|H`U}8eaMvsk!5}8a)mN@ znt{vX15KLt!+&Zycy^lMJ%*KpqUaLgKk$4r)F-Y!g)25@gR65q?#w0g^@rtO-!s@w zKe!NTNouPMEOP&_m%=DCG=aT#Q}v2mJHMdsa?vE%HGx-AzR`L!%Huoo`mDKBnJg1j zxfLF&*xt8i@KDBQUgUaO-_~*BPNbrSNu#k1Xxxt)(!`?G6eks=i5QGqkl&hhndXz_ z6TI{DVu=9DEc1V1j-q_hpw!xxp&PjVZ~*>1op#OeV?N37)ZbjK54OM!BC6xp^)i>Y zJgq+t0diHf1y&uE$041flUcoR@5bWD%TiP5J?`-gT7q*MLX`N*$0^w}C+2zKhra=s8hiw6ILMeWLEe`l z`6&N$dxLiVN%!R{5i!AsRP~Ua^?@mrpw)mEzI3Sx23QgOqk_reo;W_^<#X66NPWyYOd^vJov6LaLMdK+ zULa_|gR8okaWTZWE)t!HdiJVpkA2o)_Up}Vf*Gqj)!8e$kIm9dRJtVC2XpR~iS@A> zOF86j5q4L}^V}OEyF9r&N>gFN_Q!T|ZdUUp6~?y=vfi;}_h0_sNhcNdk^$$vK|m)# zJU)!XO-~lqb_9>s>7_s1$fLAqY`DYK9+&5G25gu^H`8(8Dl#B~{I@+Ux7+9gF8tUi z@4b&kUWCiA+u_SX4{rW|5y)SH^ZA^Ok(ZjP3p_hFr2n}f5T22`w~egIRFPqagtVD7 zkZ#q%LRq5QxxC+}OUCPiM8r?`hXo>-jbOhEKYsm?5PnXuvWh!$bW(&%Iy7c%o$6tB zbyERX;&3q0I^Fiiu(T=M=kou5dPyzD_?^c<)N(mJsgzJpz!T8y9Vg7w=)BAToUarb zh5w}${IfWsS1m+>hbKNVm1G@oCPBjzZ~ftK^teD|rtHez*!X%H-W`*Wap~)4(be%7 zdx~XSj3_3Q^NWRMOU?iNmmSSvxgp@ zI2ya=v?C1wKVaoX>cT?1!cdu(^iHs{wb?~8A`=CWKr z;Qk|g#+=iJipmC$VvfK~0vZcHq02<##q=#zyc2 z8H#zTjN618*R-V-SwI|dcd}Y6Od_BlEXuf*8q5CS?LR>Za1~*~2+sIt)&re|`+7h( zVsJ$zK)wQxcu|0b;Jl`FbLOb8jb?49%P6o(l1W`)>L)eaYYB@KEA$gic~<$E4QtAz zG`OV$MK4fj1{To_cYu=z*|bJk5=!}rDLH}!Y7$KXjRdzi(af$6rHH-wZMUK|Q z_eGxXYctQBhi1@#Qi1iUr}lJV>8aTB{0V|G93hRH$)Pb^MTrtP^d*1a4D-JS*v`Su z<97c&q2x-$txsN?)GQ%Ic!d^WV(M@a|RnEeETiPHa*8{0uwi zyR20kcWrHEz59^-gzjcvyvjPB*1hGzs#$;Yd6|quxRgzYC%4zn8N&-}`s($Ufpf(z z`wff)O;)o?O4lqWb($z7drkwh!h^M{zMmp10iM>Y;`D%;Cv4@QbnB+kP{2;(fU-ne zGnE$moG`}gBkq6|hGHE^l^8~)r_k^hBZmsG8vM(fS+7(ZxB2sUOX?yPcMHc2bsnxF&e} z7(Gt=9~hr;aArxJ5HAquXmMq9GPZ=M0P7M6>zLH?w6N47%csukHr38t!Q?Q}C%5>^ z(e64S_w&m^16D)r58k`pP=7;~vXWf0ZFZB!nzK=XaZ>%zYGYI*e-Ami*Sp>`a<#;W z0}8UVk5hj=8DJ@AtvikAaCxM(Wb$I_^IPhkYTMzdw8tl`mFocF6&Ha$dc;Q;Rs&OA zm8f{wdvP`HgYN8)J#(+^N#~DOZXyD*K|v@MV@HBFWY1c*!3pa`OaN1fJi{ z7j2NR@XaiUmbL7FxbH7N18@?6N)!m-bnN)We)mcmN;j%_33Wx|mU(Nna)Q5KI;pNy 
zK5FK+as|S_;_@x?=h5I zoOvPAzcQg1Y~pp3^>_|ESKn*frjTcY%D^)C4~X;Em2Da1$C+E)mC!4;{Gg2bv0#o` z#`%SO(%GOmHJ_(9V{_wk29uROUtPJGYTSadB|f)Gf2*56H4PpU8}7txBn&@%Ld97s zYvz6eUVfvIornGIr}_i)GKpW@(Go)gkBddyYslSKg72&7>FAVHA<*yB+pdg^=`P)1 zg-&GY7{S9HqK$#d3ZbCa&0LTo=mwiLOM|t0FNLLwhRktp*ZrfF(|6}G6rnHnOQWBi z02qBw`R_d~yUF1T0+MfN`5btHc!c6{-?&(>WeoiB(69c zMc-tEy+I{N7|2O}-2B@xiY@P3x*@7E=~f97%a6A-8A4N}y}Nh%zm1QH!P`^>Q2mB_VrQ3#I@ij^%s z+*--H61;RPB1s^kf8knpXdn@TqeholHMUyin;IxjRwxKj@o(0=tFqwqwo+sUOQ-z- zrV8Xmr;|?-Ud>0Qh3$X_*Fw1h-P-{mf5;-PjQg}?zH{mmW=fSp9j}zez!xTWR?U|3 zq*m@xi+UiOSu5p>-zvKN+U*$;nn)WJ_G;71+YDFCRT zG}50%LrJa{Dr#2O`~d^Uv%|we?MI#+uV}Z_%(jsx4jan@<2gp_cly6_nc)z2`Sa6; zCdvmzPVH(p2#2rZ4;8%T))wXYMJe~UG7<{Oflf0R4@#^jC3(CP0OU@~m2~yU6VVIf z5T3Cv9ivdaUvHdpH@ESp)qoU;RtIk^Q@t?y?s6ls!h|m_S1Rfj=F-bU6P41 zgc=S;X8Gpw=m=G1y^Vqxog&5he{yjE*Vm(fy12Ftb|AOmwVsu+Y)Npyuh-u4a(&kQ z1HTY=?ec{{<4MFD-U=_jg4IU7Lz$+Il{syGLqX{`0#{#T$A{hHFc{WYfVD42;y=Tpd!td!sNZ^G#oy2YqerEKuJN`ZY~XbDqaLUd+LP*G!wZ#bu5?wO7-~E7afT^@`WG=n672d!VKtSc^6htx?t$sa0!$y4Y0E;zDqk-0xxE@tdu_^S?t4I=`*=i~~=#=KOb$y0XwR)94%K43<(cU7=b~{PHPNC_+WFh2izkyMMa|uObVP%j4A6& ziEZqeXlcg=3lB`T@qBLe7yEaAWuzxUxWApEtD^i(8{}I9vHiX$Vih0z?G5O&(XQ5* zjhCk*sCKx;Fp__rpPoN`dYe5u5n6U>xM#VoJDH->?^|Sspc})ibzKV-VIN=R&2W8| zAI_&9_(+;|5$Ha0Bd^bB_ImG{?x_*SAKF5m@~#4efy5>VAn7q_F(t(%G;lgDw~0z% zf;`-f%dMV|TxM&boGnDym`BsfXCKb6VJ@Wq5nsPv2ZW*sCM%8j_7HBke6yutrmiq> z?H(U$$158{GFwX_dhY?e_&frfZEu3M--HU&TyLyb8=YPsjew#%@3%BZ$67W^j@gEa zxD4GMpwtYoD7`(N+ndaG7)wb`<9iEMrnn4iwkr&7Oc>Xg6<-dOK+B*i72s{2!u%l?3OlK8WO#})J286 zT<2Bl!3W=;A|v$SggtC@F(_-3VHxoE)z(Q!wHT8(WIExmmQ4V50IXf7gi>;6EgUv- zh8~Iao*)esN8%SqbV6hgGG4KKU9?n=OCbO>*WsSTMlt$wxr{Q#ofMsXLcsoG?Iof$ zX$>g^_qRk*(vPahQCQGiPt5xkWAMheR-_STJ(CI^Rym@C;sq3U90Zp;hs)GeHKJIi zI^MD`|Gl#p8yhb}Q9K%IvfA0-nZt7Qp`4C&!_u3w6PhLQS~cE;Hthzz)bQakIcz7+ z*M8G~?t8IN7HoBskRTt3*5fef4++Qkkd_*emW?`Fav%~w3`U^0+ThEQ!YJ7rAqOh9 z#H6IG7Ax_Au5V-#+x?MNSXfKxTyO+jwGd(Y-#TnI)){R#XKPGs;IX!;P>a8S-=5A{ zonw+!(l?ntA4#7YHksQA~G}ybb{ofypp8?iSHE()*wm{ 
z+AnAK;0qcSbTpFhdkr@b^l~8tbBoiB5+kBaP;E4>GUF?)oGs)4YBO@kDU_I8%Yu!? zD%>r*EcgVvo)ORkNCC@vTM%C*hbVkfCp(k*?$_}gm~WI>m3<8F$3O7@o5KPsBH5fR z<$p*NF|{2%fDGG92udmdO<{3F>D2Zd3glAROeeGcvRQAeRLC~*`DWZLKkDlfsMW}T zGIv(aC!mE}%ghY(*_jv?_VF}}!{H<{Z`-rg@*^z%DW9ADA`m;PYr;3$5i?RAttK=3 zXg5^@y)Js<$mxAvUG>z}S7!<;CNkxO5s~~%n_3Wx^z@h+83pJjK3p}nY}Q*H?4YRV z=psWPA>r?zI9%H3KJ~qT{kIk zifVMc6!>1PafXt7nSUqOPDpBm41u6MA`IHm9+a4B zb--lxe61s$uMO}?(;yqMUZDFvn#7kY2C=`NZ)U;mw+A?mxLt0FJe=d%thr+%Zn8R` zg8O`IJw1IM8d^alArJ_#wtna0y8G!$EZ#M6%n=sc33S~{Kbbb#&GfNus5Z|wRoD3x z_5cu4f0@W!imJhakb%}GOFz{I?oA*a_F)P27A# zqsjvCtguzC&bTx?D;{$m9ucbH^Y#qp_4ZsKxAC|WO?!82rmls6 z#n)2s={(_I@#NB4er zTy(LH7yZ@J&z-?sPL|Kc$Oxz-emtCV=(D=9Vq+@@wO4(if?JkxtE57uvrRhwb?W5+p!DDPj+W+J}lV zq5&x%PtwVCM2&R_-jnSTE-) zu=1U1w$pwJO)%Pp0lP7ru3}B^%6{jK))J9zeKSt+`}+S!EPaDTx(1(Lcp1usysi zSZu7P3u7{J4Y=tNA%wYzcg~(}8epKq@Nkzk)z$gDU%!#WYHBqo&ju#M$CHuknJ>?F z&d)0+Tp!j_t@RKeX$NTM0l>}q1y3- z7tf0}rTnd^P1kqD<+(cnVEbjcAmjJ6C|G z;9HpalZOs`Cvg0HBBQsxI*{F3r=pu$y5zQZ?m5&&6#DAxg$5!|*!6oEL^fHD4dU$u z6(o}nQ=cC#o<>!qs{&MCmHem@D6;v_|3rPuFX*tIg$!dyz34dnZ)76%EkxD3Fqh!{ zqvL;q6+ooZ5TZXV^#?piF=AC*+}`fhqyh3;4deP2kwS34d0YhqKyoJ?z$Mr4e{tHMejqEbpwY_!oF0MRcL!gkcK3hUV(~}_U_F4iIg^JoB1e}pqpN(JT1NvTZ*WaV zmpB&fhFA0Mu+*jE^6T{CkofD;3>7e>7}R=nO1KwH3%JguS~C|32{Z9Q3UCEc%YMKpw!3FXF0C;lXRuU**x&DqSHzww z2B2x+|C|0o_oeP4(i7y-LTi>;WTmHPl&yWEW^vw4Dv!}|cM8$QL)~WbFX`5?Hw{Xh zs{GQ=g?%??o!(?}V8(2u6CzEM`melfFP1bg59UEmkHNj~=B3E}%W~)9|35ABBT>x! zLyO(AJfE%786j|rLh7{JtPBhTP+gYuxIi1_F@e7{yZKx}m&niVeYA@^B^1~5L*qj% zwtfiSP;lz!$rd7goO81QAmhL8cFrp@;hbrYhm4}aORajA&#AnwmpN`X{h0k@6BD!) 
z6l{!)(sDip6?z{_H4FMxr7}3-*SFsv&Uq8X|IlhCT|ZGZH8p+L>+1u2#EIlvWm3D2 zkGt`>^H;#h3qyoc=q*s;615VRT;@ zhbUQZb>$}ix?bJ(bi++wU=4Y`xYj>2xlk_F5`t#%Hz}~VjCI1m*x+&?cN;nSlbBkb z)(Al0>%~eA(c~HJUgurY#5Lk{Lt9Ayn_*>RiH;8{^s(GF^%&em-ht+R5xN2@K3vPguruOMp+4Tp*(6lt>KY%{XUq(1&{wrtR<^lEc4ukJ384PkF4#FiOQ_dHA7?4dqKq1_QHU>~Tc-?A3&WK%_mt9{S4h1@-WK+}FMQwr6}%UOgxX{kBJP zKlf=OE8v!Z90L~&K1ly?AsSysDKnF--b=fFR84XSi}CR>!N>Ww(@anT_AN!w4>Z;X z{6v98E-G8^C3kmINBCpsEQ~N>TjzE!tkW)T@W_Xy!Y@~hxHdxc!|k>)%%rGY8;K~< ztIDk~4dUHrZ+93?4B<)a26bU`A!R^5%tgWN<`2uXC}3R-K)9@bpAMjr4`R2Y-iC&` z`DSv;y4Yazz-o~%MM9*v zfyK%Bc{s~rAgb(e&Z2u~pePdi=-M?cD+}cPy(TP7GB-z5CX-LmP73$8WFEM{_oU~Q z2KtCd58amfI;)k0%s&JrfE?ic)4xbc%%HFl5`)tM7AvdiAk@Yw0u^{xX81Y3K1k`I{Pl;?76F(W)YzhFiMrVp-*2t2#6-!j zmc~OKNP!TR9%H<{Z^QVXn$+b}q#VLNUIA1ozQ5+i==67`+b|kv%(6pp!fTl#~ z>zQU`Q!Ul`Qbl>s^X$6UA1bs52zh1`DMx9V-DocCNPKwX4jYb!-o`S8^|#OR*gikw zl$89!ypd4pG!vj*a1aig#0JLV6O#RNrp$M9CU<~Obs#ITT1pQ?>ccH%ksevTHWz+F z(4YF{j~hqWzEt7Hdc62j1=*wzYAg(7sfSQD_JPe9n7hwG+vCK8KI}(#3nXN574|?~ zc5%r2NL!c3&sb)YjRd1mqmNhfPp#pdU3Cj+EQ))cR1H~f8b-H)rTSH;Fk@hT#eJl> z*wEjIlRMJ2Pk*!@8rLU1D&BOq%u4%Pl=OUzwsyVjem7T$xg9BhvRp08o|F%KylsO> zNz@?Yj#CgN9>rMSjnGaE_gCZreXDib#BNPia%R^Xed@-Lwzoy5Wo?>YZoM{uSSJjk z;y@5m0B%Yypscz=A;z_4qtwIw{coEwQ_UkV*XgzD2hsdZ@c1Gk_;}KgpI{n(9JYRq zw$xNqVbJHkC=cs6c3(p%sKTn1@7E`jZ>6-hp0Z8vC|if$_|{F_n^yD8;#MmXUp8PZ zkAODl!SGg4Fe?e^URzf?iL$aa^8C+ z9hud}GL}DnytcR``+t#fzdt<$x_bj2)RB7!_CL(b>b82?jEqR-(n5T*g6NBT-h0Vq zQXa02oE4`*aHA&8g51Ea-eD0h(A(+PTQeW7Dm#{N?IemP1QEbccvY-OzocXN++PjR zCjK$((~y?k?IW$0WU_8X(IH=X35=9!zw@kLn@)dN(BZSC-qPMa9gB1FFh9PV?zgv2 zX{u?hG@lD~Q%3_tv-??Dh zhz&g}O%Dd0?KGe{>F=jQnK#Mal7n@a<6CU;TvRVfXQ8i=J~2dV?bnVhyRISKy+M#? zkV`dXKDPGizT&`{bm*B|0KbJ@EM*um<==jspAjS9_$I+9{G^IO1QuF*!L*=WrDAV} zvU2FCYR37t(jwMx5^-WqA!Sv!Eo@wO5Glt~_B4O4VJ_9*a_H%G+iJ7dmzN+|YvbhZ zU^fC$Sk7S4=8**D67C^pv-@r1bVoR=m6DgWSE4Mop_K}uDgEP2hGX)p*U@9la#?9? 
zm#DV2ck^3&epy&O>`c;{R<_FZ3AFpgjE4M7gEK;)S?nK^^X4@}in4hcZ_U>GA5#q- zB9`a!Ldo3YTxGE*NrRo?ob)$1?QI^~kMTA0ro73jLr{TtLgk*qIEx#LVLfxo`Fyrh zXG9X#b!4v^M@*-uraOmENrMFVIUruZVnHwuQ!PKn_FM+WEdOmFTNX-hj%!Z`qwjMf zHNK5)8p4lVL@Q_Uwl(e=-@#jjO5_Fn9%zzGX?a%h7FutT^+&+D zs@==W(rPR{-Kx9iAp)D#@M`;LF|5kKA9&)uwuO5atIBa@%aW5fYR%5$$UZA8dn(IU zw)yxxlEk9wXxao<>%!&UvAbzq2wnfs_C`4gZr5iijQ`6$?57o55}0jOM&oA zY*cQ215BPA4%m{C?|i=+ixFfygM~bp{B&)&>=7F)Vi`_OSCj42#@b>JpprGJ;t+jn z8LQVquQw5`@LuyjON4W*OT%CulUUOd(sEM%^;Tr1)=S!(fCoP=JrbEzyQbe*rOF;T zJdJJQv$8lx9%e7Kkf|3D%I=rL-2+dyA)8v?zo73*5JGT^I&SZ?=IkC>}F3ZiiwnEiLY0&v3gvz7R&#A!1}#q z!JXRj!PIqiyB(T}MZZLa(n#h7D3r81%?QOc(nQc;YQH4P^J6|ZJjZLVh}mPX?ik#d z@UT6ixWMSv#wt(+T;`C9RlIEO+ztkt<78$SXOZGvOXuVomFIq|IHXQ>V1(dA)~Aqk ztG&v@5new}JatOyy~L5_*~v&0?k05rq-9Y0f^J1EX8Cv_pcK15ie077^7h2rp3Tn` zP^{ekN-RLuL~|kTI9GSZ3w>~UG^a+bQ$fM0p!feM?(7>P4RnMDd3TO}DAl!Z+tYmW z1T&gvVCyK}djxXi<;4j2E8eFo>u`cl_VW7r@!44%fkr;QsP%Q(=jVRJ*~>?} z=?E;~g|v9NW#n~kHpM`~rk#T<15)oYGhCtwg zQ`sk9$q^^-X{SVM8GGOYv82^$9Uiq~AtRNLa(((`h^t7qr0r@%BP`?Mk-j*Y!`I zK3#rX>QlT*%R5XVU9^0Lt2e3s3JsFukP*espp@fuix2Wn4NN2qVZUpfTKTH;?P1;C z2-uLJq>fHoUENTYfD$pW2^{A4*^-6EzHl%MnoDNJtJygfy&B`^*C(y;FsckbUe_1r zu!e`&NqKq$gPw?p)ukF{V5p9N(lKjZBQIv?vN;{9|5`}FA>HmufGaux(Pmv(pW7Li+xavoAYd^^1jLYu zk?venxYrR|=KlGP+hlxoslleGsRZUy;G@Lj^un#l*~1Z9E#%%i^&=dgW)w{>*nTr?fNw$xs$52~-M>u|fXYk~FR0K1GUfK*Ouo+MFF zdN;eF>yuGc<}!K+`?p5FA^Vyuj2D=l`BdN@gC2Bp~Q)Fe*pXsw+i{92vqjmYb8At1+a(AZ19C0A#B*<}yVL}1B^Bp-kgnW~bF0s$hVNsl%(?8&B1uEa|Fn9(q-EiX2HQI1#fQ4C}o4j3sOs9bC2O`it~qRqDh zun-q5z|MLigLh?SCaP!abvYFNh8SP3?RjugUP@0duIGryYqkRg7k*(;d;p&KUju`y z@zIO-ew`by_baLUnna>(ThpJwPk;HbGto`^_~CQy*@pIEbko66m{XV{VN!TV(eR2J zyfcP3D$|~P|1QFozX7I0Ti0U7kgA?D#rNoP=~tp7qrt{^ZE@AVX!j;D=sU7_HKn9- zSe}Ep&$y_rb(eVjGzLV7pBq2Y^~^t=K|OAMSOX`6i}hRU3K18V@%^brbF(=v{PTzj z*Su$eym*p*%~D#Fw@?oByt$ZywXQDIKVyGoDCX?tf3X$j3GVBE@AFLR`~G7+k`_k) zPc}usz)b{}SZ6MzpVz>5o`~z`DncHlVEaZ2zgRO8ws3!=O(JdSCM5WyK$M`iyoylO}7^k&~Yr>wbaO7Q^VA?~r(&f=Wfnx^-9Q-+&ar-D~i( 
z&vwT`?EHtHPP~raQq$4oI*y-njF~XGo>u15`J$>Y)74#E#1nB3ySBQy(FR5d$Ebmf zuIi85vpU!IWS86Q=NeDFR(4(KB$r2QD7n+TJi}QLqh^02g7o_&mcI$b`$RQinP*q7 zAY`=1e))nLl5`hw_u~L5uJlkuCZqnZ3}}e0agG<)hqFYjHi+@+qrkGY#7FDnq5o#?7F1yfFqeh`j)wc~e8g!`O2hQhBP0{rcH*2nIz0u9FJ$#8FZyjws zV+#^u(f2i*5R2A#9(V}7O!EQ~koM#bc;KiNd;yLcB!;%kr>IPUUk4*qOSD@vjT_ig za5o>5A97ut1%FzLOG~R72tWlQ_EmNd4~1B;Nf~TcF+$V95w*{lf$Qio+f}KLvYKW@ode8o;mL1LW{4w!~9Di%lDHfNcqH0cqN<&>-y(Fc$M_@Cy)`KDQ zl78&$VN(?rrG}PG*>)uoa&nUUh0)@n(+e_t@e$PxgvZC2r&#L*c}H^fi8fez)BK9} zw!9J)RbH;kJ;6`c50g-L4gBXuNV(p;h-u~+h)5Nr{*T2s|A!6S44%~i205`QLU~WoXdDzBAFw5ZBetW4k4DGW4Pq`~puuUVOh@b6u}$ z*=OA7QisM7Fh8;^b(WwC75w_zYjFSH4Igow?+HOZaA%u1n5+aAXwsL?oYS555 zOJE3d9N2PzgC=)Gbw`APVpBTay#PkSIpi+8Iuw`V1SalI08d_QQQ&Na$ za@HhoERBc9XT;x*4pVh5EF4V|(o~%ntcM(e^}hVG{JyR|h{=L!-?;{ct0p4gP}E#S zjPv%+rPUF3HY=x)o{~T33q|BtJg7}6+Rs2aCUbapZ!_8eCGt4_R1?GKk1Dk%t9cgr^jc4zxS{@WSv$(A-`=Uk)W`Mf zkQEihgZcq=cS~Al33>NzG+6+vA}otmH7Fs-Me_&p-N);S+u{yM_2gUQc9? zO&*?G!d&O(?OePbd-tfQ+-Hl#<%>pz<-tZ4yQBzAxADM&#C6Z^tL2iyj5OuJP5l1i z>o)eZx_Kot!^#YADjfYR{1Y>sxk2pKH%;ESz-mrcqO0TGE$5&Q9=)5KEc@}kKeMg& zWPbNEitFb;=5#djefy$h#iQLW>;L4)$|$%8`b%Wx5hh?!Gd~JW*R(Q00(WakwS%+| zt;SH}Ebc|_RSE7h9iDZpmPW(jaLtGfALC+(5PM|VxoWe0ES^`ZNYI?;)jmVapD7(* z;EBQmo+!aafL5!lv_c@rM}lk&MbQzB!eud^Z3Qf`O-u&LfU{3Qy7j;)Th zR>Eb4Rc?fAHa&fw^{CjtNV7iR{3V#%cp`_5#j)y<;UW86pm8sSVVUauFq#Ww4K$WA zz~i~UD+<aBd4(H2dNHjDNZ)dJl9z03y5|HdpSzc<1#KR^07x9d($i@)(g{o_tWt!7p#lk z19fU5Dia4$nl2A}QDRuBVj3wHLmqe%zlh?_Cu~bi^))QHYzk={mBn{@;d62B{f6wbdVAlRVwyB8 z&pl$v$@HHet#*4p7kTgO)l7AcBG5MY7rk~mW%2LFZmp!0k6c{Wp0J2Eo&5hKR=ufA zF4nzwCvv)NT;&PVoK+qVL{3HgGI}S$$(-;(p(NcewP|j2TXAY` z4=zk13g?oU6rY`%j35N7XZSsbDGm=zWxs9<|Eap1sr{RK-Q90hu}0p`IJAx4v2p>~ z1D|K^OZyBXw+{9wC{hx(W%n%PD&KLZqD|2CA zM4ts8uHBoRAEDHU&vJGB{bc#q;~vk~Va0HCe(OYU$nvK)_RX2^cj9Xiv_kPcT}}oN zwj1kYVkqn1ZU4ai9fFgU%XA<(ea~$sLCEE(Bu87C#O0O{1MVDsc4OSeQgE_K^g0E(H}knp(6ATBBd)%nhVXBC zAKMB?^Q{&78}Lr8Y>RD2mXws7tgSdq|51R!c)q(J$jbWa@^sB+8JN2h4}mb9K%2_q zBL9oik>ZPAaY{*Cb;Cff7T{uI`gI*NhmIe1%>@b|qb_WWzs80(+3=M^ 
zg#ej4a`pDUGntNuSw%{vn~9*X8p@gozJ&!f&ONEM;|z@9j7G{Di@mPbB@E6EvZ$ygKY>*%RyrkAyo6A$xYiT#!L8AT`*k~>NX;YQQ(v*VIssH?ARgxu5%un)F z-WOA`MFAdsIh#K~=#PYrLQ+6ePhMlg z)x12>XhHUN%o677x*%xpO9kN(U_qXugs-}`T@&7)a`@f)wDQ21I6TmQ({CA&qsB~J zQ}^SwFNkOg$^`WbG%52$i4s}k8U6mH!{YrANl_s{=KP`47 zE_eyc{C30s$iPX6Xv4`Q-J|+psDMZkBkWx|r&u43<^)1))pGwqmemKSlBY%u*u2$GYexvo*T$Mo~3_M2OfZ zk$pEoB@+_UxV&hR|G7Fl=eF%ADk;}q5|i6m z!QTG4I}k;+G-+&WHZ}R_^XDeHj8i@NUq#tlx0hS(Yb_JHjfni7uKdh3_$cID)D)P{ z&tCwqYyf7)^fUh|-aS4pr=gKRzdkoMCvn}U1s&odMTKYs0_@^p|AC!j3vB3WDM*sE zlqhWQ>^c=VXmfjEqU`eg?CgAn4RMdsu6Tj-x6vSMi6JTdlJAgYh)|JtG9m<$$&UHW zMO@b zvaqpC;b2xvR$p$sJQDu%;?a_Ph_*IqqXyY-nr%dF;yBoJP!zI+H&aNE#{PI5cO(}g zl_=nCK;e&{XI2jnN-ecxYKu|WtbHIeUck|z3A0G;F3%sFq5y)H(T-Qgfj9O<|I^^( zk}v5JG-Wo(yUdGC1;!I0H9CfU7t$_< z#H*b<2(}FD`T>DA?ON)QprI5ny zSSp}Yti!`INlkGcQ1+ZHbPElIC@qbMh#cVL)(Q@$*J^R@na*1o89wjtTT`zw_7UvX z6%sltkeeAB^LTuup`@>9wj(arX!Sx0@Bpls8LJYEga4G3Mf>^9UhS;>bcOk}P5UY2 z!KHfZl0&+qDjdh}GADe;mLq^zeRGJ3Y_5Fyy*%#J3nR+9&`J(;ofX*Vr2<38-awC@LlT>eU&$6338ifx!BPKNj1aVqNO@-E{|5X+O}UM`=EmlSW@f}VUL)}f9fL2G93fh# z=_QmJR8;m4CS1-czo;rq7SG!=G*ovCN>kNVT$$fy>`iL=dZcM`=#1lM4e9aHRTS7g z2O<^Pnv;r$vJlHls%xd>#vnNRgmNmL9@r!BOy1gC=S|-@g+z{OpK^;+lg_Ieb~g53 zG%3Vqz1#|%=V}}$v=fe$^+@l*YY^%w3mA0fB)vgV$#)9X`fr}PU+{wh&!UW zf6eX(bByPpXg=FYr9OH(7u6I1YlgXQr7c>&KeTce$iqxNG%m&azVv&17d%}yJ`0f@ zFknr3x0{+Xq1tAl0{O@;eit&E^X|xQi#O^i&mXNmKcAL5(J>plWsZh8Dgm+@zq*-o zLv$OD%OFT0#@rWbsrQ>b0r^{X^Yg2NB{=D*G!D10JYO#txKmqTSl@9{jwLx0W`w46#VFIt!CV>UjbGFkQ0sj8|>d_|EEdn;cX1 z@bNAwMf<+anuiO>+S(b|zsbdbR9V<)Gs1LTArfRra4Nl6s8`NT*X+TnvPH}0z`%)G zGlRER+)!aA{*LeNgvfrASg;96N#H)zzG7FZKh)4L$ynt`?vwY0t#9_zp}($Y0;sT` zf1AN{crY`&w0UvChdrS>bEr5`7G$FTSH#BT59CFVQRxxRh8r%#iwm$`EpdEQwu}$u z;HH^U-RtA+-psXE>FZRLU^?N&LUV~&bx5@U>5vf%h|PB|&NHacPwB&jTyX=OybCMx zXCJ*dB2N?0ZyVgZ@45M}Ms#7_jcgnC3RZDr3E}$6(_>#DNTD>9yH6(luaI=XFqW~| z6MgH#H|fJZIuiBp);{$5ZrEvjPL?~g1{>DJwJ3k`|L@E&TxD>xgA1&)hu^XzX`9=< zjy$u2Rtg}_%sRf-n&E?hpgb|VI&?IGKpJO!>k!}aRv=q_y&e&UJdW`GMgvGO{un~^ 
z;s9a!`1rV5x%Sav+oxBse?Ix;VC!8FgMn;4W}qU?_Be-DY|FlpXd#$JvlE=5q7CxO%s)k%InIhh_5IJ z?f~o)!t)n%CL=SRFV(w)7Y zIg=FrX~)9k8?fUQMiVb(V#m?3(#&Yfpe<%HPQYn99|Cgbu*$`$rKj@Uht={nN-2>q zOrO7w_TMKD+@Li)+{T_G&6#h;j_3RM$3tq2cOl;<-pBE8LX7$;goJ7_-sOrr%ztz#Z0Q?r zufP--K*>@Op4xydiip~BVw2P^)D#{&0m_a^4ZWvNO0Mfu&uJ*fK2sjJ)W<>O-LupO z1n-9zC$CTR#5oFMC+d(NCMY34{rf`C7`v!|wcm17e10}a+`+B8^>zJlW4PkG&9}dA z69fh6@43u0qwu=&I3&R2{3*d3?ysyiUv}%D&DMV}hfmvSqRL0D6Qc;t$Qhauq$l%aa?eD7GuIAMPdd$(*>NPEKupeoXx5Dc@;1xw*lVLIA&@ z%ZCCb0UEmZKgGw8|C}&>bQ?^fxX_0QQnEF70TQINS4%XOUHu+8S8z_O6sI;c482&r zbM#$T>j73)ygic#Js|QkFy?Y$R0ima34+!p=STScb$_vkno8DoZx;NU^PE-Wc>ff5 zR!xlBbV*`^&rz8t{icS9;sF>4(6h@14^swG{cALI8+<{vMWe07CELJ<8 z4|%HJtp)?;@2C3eoR6`wOa?nw#Gez0S#S z(2}A5Mb3{^1`^!WM^i)k%gri}4c^`cuJ)OQ`O#@^k%rPURz8|Cp7>Dhan7ODvirRy zydSR56(M1Pu!i5ZRk2MWMPOa=w(c&;enK|`?)U5sF`ai^V>>wiqj@c<%Ao}6Xu+H) z5orS8!t&Y-eQ;=Zjehs&d-49!vst4dOyw(cz;pNixO&I%I@hRMxNU6Pjcu#3Z8wc= ztBsw;wv#4}ZQI6*?WD2J)4kt)-t&Eb*PrKFH|9O(m}AVr#-#FaREqM^kM=Kc$#qrK zHz?R*ztxeDkn;O?d~mR|@h*y>pacR!W?vs=YO2T1;3{A_STBLJj>n0bDyy#eQzDr` z6A#_@{v0rcT&zk^%7@g_xd)VlG$8>=393xkvJ!(I0XmyvLaXgMJR>8e?|cFnlspvp zFL-GC z-0x%UIj(#0J1>8?2INN(0X!Jky;bh6!U=BBsd;dd5B zk5whdkzoM4(_~G7dfuqj`^9A@{7M$Gb+6}3Zft~LB7<`b6Tupz-EHOmT!m^`^^G&O zrkJuPi?2`5FV~&}%^y}XV^}8IMs6FKB;ORX^p$Vz??Q{SN0Rltd|gKDQL{M5jCTqH zJ#^&7Wbc7~wSY71m9OHQxf6**oN!6LQ&!A0r&5mVe+4xhNnbiDmYV>bwT|jshE8ML z%nS>lC_F!EE@^3b`*U;)Yy=_q4A19p!5tl~ZEdQOp@78R41l9&XQKU{RIQCzExJDm zf(rqB@AsESB>+rxyAS{jW$OaL5u98k2DNkX>h!%jn>h15HSF6EF*}_}j3Sxis zwjVSf6da5<81UGw&b68mQtJ7cT3X2xZvuQ59E8(m5_2yPLzjoEW+w33W%DM#grr>Y zE=PrgDCm_BWU$o_M$53-Q9S&uV_{k00bBAj>j)pOFmVB}#gVI)=UR*#28xKFY+5eq zx;EWFQW|N=S6akhwEO){)4&fm77p&mpb)u>tb`bUTC;rQLbHd25l;>WG-eFkw^VsDkg}>UJq0Kqfb8Es0!_mg$uuP@)A0N-<$%YGHx&W#MTtrr|RB^-xgW$ZO888wy2WdI6_PVyYg{gLHisDwW zMWgfyyN-^Tp_0IGn4NXmb;w?+KGA||} z6>SNuATtu0-B?NQE9WScq+G;hfI}P9SeWs4dVw1+7D3$4_>6ky@4_V0WENa&3S%*! 
zX8;NSDzJ5^sptuZo`C{=v<3S|Tt+w{fCBL}(m+R8& zma^F+e%>fBwd(H}yB=EXZzY_*W-Xu^=N%Q@h#<^CozP@^B^J|&MLArAcxXxM_a7w_ zfxf1#ODrzR#suY%y?wZlq9j_FN-d^b@-6x0w(55udf$~*);75F!@UszJU?av$LSwP zA~`7uK%u70(fRr1q@-5j$kAHeF2X9XxgC!SWsm@M7?))NZMt6c@Z(%5Fn-}dLv@#y z3clUVZZVpmcxNX&HT}LX!*hSE`tFXfqa(ho%o6bKgco8*CbPA>C@!J$da!4EFZ>`y zr=+j{zfe_Y>c^X{wLc2wU=2)1bW&O{H(L}>?e)xNAP>JxA)?G6TPw>l+;7>|jsVcf zQQhLKGAT8BtHWn&oI}taR&Cj{?>1SJz>eYU=?bHL(rC6k?LU@>fR+4(iP3TLk$>@? zE&`8znFNT15yMFd7)Ym_oHt^Xk$G(J4&|d(ZWO;RzU6?Vc-nus!vQ1qr_**9(}Mku!O}~exyOh23Fi(#SO>%H!sHn?K{6N+88xGcOLUoC*1M9 zxyVl>r5DaDqBZv$3+EjsYvKixB)HPktgfH7Qmu`jn){G2T)YOfPex?+)fc>on*I6) z#WmznKW(p{1D&n~0|H%yP|h6szl~XBeh?jAIyE}117Pb(LvxLf_?d)+!w$57L=^7q zth%#ZSzLgxrk2Rcad&-7@B_o;?FELj(PMcT3z4ub@->qSWxCOh6ElLIn)>2^vokO6 z=~(TDyUfz^qm*j6O#wH0IBC+6yOkH>Bt7Z34sckz-ykeXW^V zQDJVDI3ZzVQAeeY`aWeJ9r<(_n%xkSzNy$q2PT?ne}>Bc znxc9u@zUc11*0!F;iS%NhCk)zCgVQbz?zO|8f523me>UQ$6?)MwSTJ3zkNuAhcuw^ z+fQMV^#@ORwhy=Fb>$s`wmY{+Vt@r;_ZGOeU19r*m-?$Y zS=m(6O(nSiB2x;#04VoLDC46_WM+-c6Q&}Dm&AMmYMXQHi9u&$H3>8JAU@DgzJuO# zdFbn00}yCF1Ep>QGD8_Rx8VEiY_XjMTTP;x9s1t>LGs;s1ypar@$m?tz7gpcmXP2qJ48?CX=!=v{gTpaTmujPrwR$2Hlh2gfOkq! 
zW#v>wy6tF&@de0-81<^9MK4XQ?dHS3`~&PQw(aBka&5?C{JY>ycNDkOoS44=4DisK%W-K-u^~|uEn*Xp^fy1x0H*VYjN?ZBg$6s`(!jqgs^6g zwscEvLp8K?);yhLEe?KID?E)2*-^URK_rzTZsTE?PqUK(9xA%|BqBx*Q=XW-Bl9)M z;lEDabntY?IUyhqk_8=?FnBW?oA^^79`QBM2#v^29mE@)BsOo_ zdAuImV?@^c1*xL4aWuopBwK&o%*mNQg1NX(~p%W8wOq zpZ|TKzWHA={q!XLnALA)oJv^g>h&?4bDN%wnwV$N4mJ$E9d(&(M*s4Ps=%nD-hw49H9Tzy_q*pPRtAAb?|Mz&~p^7K4j>Xv`A!h1kCNGpE#n->1_l4DL zF!P45BR_=K{N%7|RniwSw6k_C{l zaEnM>t^Td7bR<6;B#rxN!lT$vuKjtU@gJc1@`3Yr%MaEiIRsNY;J&npK?A(nz`S6g1E)T!Kye;Q7wbg;w*3h34{ak-Qc|>POIXEw||Rai@9YuQ;}S5E%EQy?QV6ybSp(k@V2;`;5Mxdv>}6xKtYD-tmNgIM~9dH!Y4r z$}p)P`gu>V0|@Q9vY|p7p-EV_Uz-XH5BFjmjlCfP;!lg9Dows){srpx$9057L}20U z?72GEc3vYk-I0SW zst+Z>{PwE^0z9KLPMO^Lq?4uYKu!`BFp;J+sWNvE3PiX?W%Uq^w;S%4FIlL0@ed00J1||T<7j4Uj*J3hK@^uKKzp`^-qejW35qdnJNsh7u%;BY(G-#zOl4=(Aqjl z%cwdD5&nf+aHDE`ytu&U{Tz@6FiQZVZNG=h$MsfT3=G~__kfappiXGK^tlRlU~R4=F?_NP7fi=)u_;?$`tfb%8{t#DF6C+rlFRc0hh5V8 zOf$34l^GG151m$()xZMxvpv^ZIB70#4`LFQS|xWiGHvScZs4EU}{* zab&nQ6Sn>+!~hx$$q#|(fC^quJ70*u@9rq9rzJ#YSl;={Ot|Wc!s`t9|2!ygTYc${ z?{E1LSVWRiVIE$*mgZD;YwmrEZh^tU3-Of0T3Q}E0|{OihSQeMlx10^sLAcf zzh3AH@SGKN+@*;h#^s(FSZUb+Z0C`|sTnG}u9sHtogCAm*q1r2Q?`zd9>nGgREjId zv)W16<{EW4=_!jp68HqJApG0L=mAC~89GtirAJgv3!SSc^**Jo+=(wq-krq6&bq26 z&S`Sr0+eAXVdXVF!6}!x_Q{^<-)O8w2#Blp4#VT5_jcKyo4c%i=3m_K8hyb(`S#Wk z?1{~nM^6Tkr1Xlb%Z1LIm9jfDQ>J$htbEYabD2j_u0d60hK-e#Y`OAYPq1kvvd#$y z1Jd4CB%of6YSiMs(~rM#VEO6F34MgljL3_Mjl|CW1xLZP0y=Y>zajsAUjxVxCjyI; zDy8Xn09qXL6Q=b43pxWr1!>*2AhC#VpGgBQ);xebdw)EAm7gC=O=B^WACKGqYfgOp z?b9vm?XAS{=;&A$zwQ0GQ(|I0qXFUjpVFZL)9bZnfh7naYk`~09(;VDYxlaRF4{vy zt-IPeBOnN9)^!GYf(;Fr<7z5d{4(sUH>({!LfYCC`;W1)xtXT&0N?&^lc)!Yh`l&W zCGTK>V#AjZpB+Y_56g$@%+MD3w95O`+6*l{E0yw&ZUFkU@!qw#p@ z8LOH1qC%?UhQ-v^ddkv(T`w_Xk%<0O?wdvy)i7-g7aB!qTb1F{#EgPTEeE(cZV51K z1vT|yQ+a0Z)y-O5%Tej@PxPp``g+gZgk_1%>0!Ve;lH`W*+A>5?!kAQ=JpJ-rINzl3aiN`Zcn`qbM6m!^jAj-t*w#40rgnd;3U$Lrsnh zdsG&4(r`a+F4x}2#c!;2`Z;fOG^v%}nVM#$bF;Eg7H@*uNne^f3Qy146BCS)2*URr z%iUFQgaVI`EulhEDJUR$-74|eEo*WOlS4z7zqA3O8{p(1+sTHd%8fOZ-25hN&kdi= 
zcnOvs4jUBH(`<5TDQlP+K9otdAW9;D##uF@NtI4YO6&rkG0P3nXSDf*;^%vR%Ueu{ z0{!rDVxN=}p%dS$xGks0FO1Soo0%^m>%sP7#7O#4o&H8~nBzQBQ3$c+=cPsbdHTX^1AS*feN@@<`HW5Cf4A^Ewy_jnjKbXNNcWIcg~ za+XIU2PW2-X}h^d|J6Lef_ht6G^eH_MTt)@F80ZK9nFOCd;aNBkHp7uK2q)pf=dT@ zp2Mce>FEk7Jo7dq=Zg_k!5i|Kh$}G!Nn2Xxkm4r@lNy;uQ#Xd_=M2JAHWEu<_ZYs7 znK}?-=Y=a(jZr~$quA3yrpuBTim|iLKqP-e4GQhuYDONh{su$Svxc`C7?Fq1Azc)k zc6?-Boa&rob5-f2NuZ;p0QHv|(Kux<81uM${h8{e=Pui;&1(rDo$h?iMu{C_pchrJ z0TKt~xXJBt(RgtnfH({MZ}#m10`e8DwzJch zf`UHz%oTVc|0Mp$&2H9YM!`)-L-4J&j?A{NKZuYoRATg_*WKsZ4K7%3e0V&OR$&0y z&DZTrwWs~l^4EEn`i;q9KU46h{c&w^WlLj?_t-X=fi)(bH%CaOZgpg=r~SIGZS@8f zPx4}Z%;+?a-G#eMG4=mbBMw&>wf`zW`?{npCYu)d9)t?~TOz(IifS3Szb6U`fGJe| zDrp0;cjSO@^`rRUGXa5%iHVlcPUXS^Agc|~@8AQqx+##7x_NjEx4OCAAF+0HbWD%i zKvS#=2*{^%(9_Y4;qd{owC@)nJqg_m&&#h(#RvW`FEbJ`6JL-`QNGC}iB@W%wfeq} zJ>Gz&ou&C7aSDR-SRE=FkdnTZlsqmKFHS?1`aCqan_*~Rk1dtZ(0W*Te06i_j|L6j zeE`t+yF0`%Y9tDVn>h~djfY5ZpDC3N42=JW3@i2+6q0^PDfQi9?z7804h~UaL7%D`2G0|8Js#lGh1ru@ zeK7OGi+X)NF)5OeuTDcl3ODTgdm&eCIIMC@yi7V@&_oO=AE3S?rY7MXx7nG=-xTRpA!;M zPP^u8l=~Qd<)8XAB?xaa*{u81Myo<+X6ZL!A$et@RprId(~DDBmNx%oHQC5Qd}oBO3k4e7{j z&ZY}(#^>LSlT_aC4VMJ5Ujhw5V(9_)355xCw`2AOhbL}d9W)yaHZi68SFh2b+L(FOtA)6`WL8}R{LN`UZ|3tu=H^vX`1WyR z0^RiSRRdS@Yh7eCJ-h~>s1q(GumlGxOTH_4IttXyU8OH=?>GPKc&2dM1t!~4)`UOP zMv+roJN&rcwa=%wZ@`^^pa9+M2ibG|_9Fvrd~6l9r=x!@mUTda#vY8(M0|jN&KB6S zu;BKi2VobZ8bF@4bcs)k^VG&vuAJ@tnE6(*YDqg$q36MqAz6@z93F6G?HZpSHv9O~ zi0qS8$}#N;EyDgS!9wr;L!^izUbXKXC`!-SRL#bSAW<~+gv+T^XDOQtD8F^8@JzGy zvK&axz98hEZzNOj>4wU5?I@n6_la%o+kr7U;~~n3dWB8x(r}|qY)Ywmdb9WE^z^|Nz{RPk#PDc`w;EXguR8vkz&OX(FbAR zmh;I|1T#dg53oOv%Jebr&ZTZ0)NH9n<_MCsHRb?DZL=)HV_0Aih|$xTiRty9_`9{U zwI-`#EwarYQ3K=%+6A_SKYEs|O*-#B@vdLf`Q9ItgDKWU+VR%#9U5CoUAsmMmm+Gr z2h}rloaoQgd$z3xw8aIzB?RIAR=Z)A6JWvMlkCtvkvmLg`;3FMiZU^cWTx-U+riX@#&KdZ9%exFYIfTa~i z^kVBtyk)-Gfpg#=6Mf42$?LK7v)wl6C@fWq3!702!+^~Z6uG4$&&R!k!`+e{Yol)> zB-CI0V;w3h!$ULf^8)R!jVeNpRtj+ikl@KszL7-0U2}SCLxT6se5S)H1vp|B@=^2C zw`2B68{gKK#wSGl_f;g|>SG3`$-__bnn>S_NTSEZ6?yyE7#YdsNBXmkh%L(^#&;+D 
zO!~^K#pYlsXF?D%9IFy*$gH^(CQgPj8bPycEvlyxBxUw3IGVs1MBz-4Dy+?Bn^{y@ zwh`x0a~+qGY7`{cmVkcVX>tm9nN(G4ZgoC*N16kzH_3moIHgaaoSRsiLq3KV3H!F){8i(YOTFO{7m3 zeEe2@;o&i`u)W%0_qC_qwU4WwR;7H}tZM$!kA~FG7erZYe)~sS5^H0+RmpG)46B|t z5^IsKUZSZ{4*ztx{ReV|}pkaRMzQ z3eKT_qT`iTp-p2n@aYluncFwndhO$QRcv} zW{}@LCI24rzqI&dIB?}8`~olOWZHehQE;}3#Kz7HR6oPndXXApQc}R=v6FL<&&U=r zt-5R$=VT{$l8)Kts?zxVntWQ#*{LY+sL$+L5Oraq`)VLBFhg~Kv4eq;8teYS*nDj} z1txYzwRCH%H``8`l!1@NkC8d7N#!z^`Xp!FNfOj0mDfMO*2^=`c!0>^%nr)du2}~aKBd5!c8tv5&MQJJjvxft1&(p&E_6Ja9- zF>LLhu+c%VD@;J)j`2@z|NmcWt}6<4H8no3J9XJeOIuLD7h*n->lV8WTVs}rf`X>J zydpD&*T_g+Ik~;X-%i-@`&~Z$^N zR8$P4q_nM19YsWJ0J1_R?+z-OL8TV!8K~VXp7OWv)#pkHB$2mh7f8S&M=6;wct9T( zhns0+Ln}pDJFKxcSc!=C8Z~K6F+nx7qu%K(Xo&yEDwSx>x@E|!jNQr|eun<#0!NRB zy<7U3FWz$n{(?j~=N%6J^X1EKq=-*lR52m6K&CUMi~~DC^%VCUk9Ug5#THcYIS9+o ze5IS)VfHSn0sHpfy#a>zFd>R_OMdNa-tUzSjwD1zYcbf6F5@yO+$ege4a=R5 z5Ai^Z`-yoomA%We?CV{!I@st@8_by5N>2sM9?Xj=G;B4qhFPB!n^tqUaSbG(V=U8k z`bMJ85wr$)Mt2bp>Ovf0kXr4#NCGFS=~jpZ)@SR z!&rBe_$$ZlAiJCxp^~un*fZ3{N3h;Awp}(`dOn4|cLUKs-Q}73d!71w)y|XcOP=Dd zJ0@u`rJax&Ap+_7ls-_MbG&r_^eY2bKM*&bmt0t)e5+*nhpOWtX z#d7?of5)>HEeDEa8944Ny#k{@xSrsT);piSHu}1M7I0jNzRo*V8$9S)f&rj}|1XyF zx9)}mh@>RknJTUP?QwyApRr20hKK%eycYaLP~mOB8$J( zN@I7WzWQP#TS3mp%1+Sf&tWc5Of@-~HC9*u2pew76`|*_h3|B`7?{dcG3XTVcy@IJ z#`UbNrB7xTM`1JbvfnO^B(`P?_@t*a5#883II#1!UO76>3`X~VQCv@+H{3ouBS_0O zNc&mU>T!KJ(Pf8rQCeNAFu}7sfn_HfvuLp5y?Yf#fw}%g%dT)RE|=eWLa_C;dfsZ} z=l8(KzU$!dD|I7|D1+H9>(kabPeslXpC79eg}Q0Dc&DbvH9kG!d|6Y<4-rU=BHNf0 z2LdN$1$cYygtW&B`2Rc~mr71)TrM()pafnE5}jgX=!d*DfQj?ul#%2mOEeA0`lnFJ z{XcsTOycg9?60@_#SE9X-B)rwF|eBrzSdJW8Vrm@WOjfQjIMj5DsGL296V?0<&Kac@MW2pL6<%-@axd$d(qDxCasbOsnjSaW`p660F-jG-7CMnP1oPw`^(eru+YKeMu($GYk4_Zx}KHk zJ^?ZlXwptxO)abKW$5`H38=yW<%IR!X|X^wq<;dC^59&h@Gl;tbK7) z(&RzFJjOj^6OjY?!XbT-Kib_m*KT2`ApVLB9Tx%g42g>o(H#rzk8FF(M#FV4gG`=Q zgof6=Tcv5;-DV>Ca!=Y`54fYqd26)&K%KRSWL^EH&+jP{hwOCSkQf)wS9^P16L2%< zoG@`7KJJ-X`*aZ$5B~}y>R<4SD>m?6m!mr;n*lPU*?IG+{ZK*^e!;TtXh|ztBWAq_ 
zU2Q^>U5qIo8`2~5fy8oLO2!Lq(gY2Hc~Z%La3#jBqa=sogvEy}LtYjf{etoP_PXK! ztojOK`lyKmgyFapS9e`zNyAGAWIx`_9$>OW-$4m7r%B{7|Ct;a`H&~3-_TIx=-2`+ z91+&s;rq&LPtQOh4qzkOS53fdw{%{&xZl4G#0StsdZnnTi@&=M4p98k!~1H+sVPy+ zNVPUf3v_<(R@h_zG8R0_%4q>XUHW9PcLVQCjY&EZep5o4r?kRvLH{}D@{)Z)u{NFF z=3*@V9?G@#45qvb#%7b(fG=0JpF;0LW;mzq5a-nwd}xvE?NC$u(8rJ|9xM+WJVcp4 zMM)pPVs&YMBD^R%VP!V2^U8r6>M~|pmgN|dMy0%V_a;mZ82G9Q&tWgahAU?EmAJ>w zn-2wP=8=!E%2r8ilbAdlO$at%Lbzo}t$Wh{%V)l5zXr$aqDpemesXP=(&1)%%_4H= z(XZ8yfYstTQ;xeEKHx`ZL&?*6r>jm7HNI+6Xh|Fp0<`^Guz+FaVt^AMT7xZro15LA zHjj8RvOX=kiOVf88|xP&d`(-6Wng~3)W?tKR*R6hcna!DO7IA`2r$c&dyemq2UGbH zjBD%3o*vvt_)Dyujn4hB*`+EPiAl=m8-lcL{c>5%MpC+K`SFy!1o(787`d-6J^6Xx zbajL8PKuHAfhj8TDk>GrmluNR(P$UqkQH^6vU=jTi|h4oC^_z&TVF38Xn0*t_7p8` zcKi;|-<;>K-Rc7)AG{Bk@HpWpFHqodYaf*uUfSXMB_8MrpT1TQH(PnXziVHLu0=rE z-O+Kc)c1L|p+MR~QoOjo?7J6Efm?bM!Ij% z7p#bxeI}z5KH$q-u&PpCeS&!m_$43?-S8qq#ujzm* zWfEqUk}U=Fgk8ELn%(? zn49KlvP9Rm=hfw+?baw)q@qI{-LijZt8}or+}XTr(okSQ49Nr*bOX5N|M!fV=~+Ie z3F-sxiFQCJ*7I z`yu55Mh+pPDP6>SmuskYfwXPqZ|l2DVh z&msBWG1PM2oMp{`!`7_x9wOowc+ByGYY#pFT+&cQ2F)&5%nzH`2OpdKNeQUfw6U$# zeh%!xHK4wPQ(#{spdwZX(C3q}J7m!&_ge5Ue@DF(eYv_l_?`V?HWz&!uCqzBSSXQ_4f*B?CD-+gB~>XfvxNABd~-YB7V$*kUdsUnM6^c z@_#msdp?opL5w5Uf3yy@ss3X+f0>5SgXSWE%&MH?FARQr-8~#LxqIC$qlgPRf0xY- zJfQ!(bjV?iVNyg4$+PERdiLDWgAPI{Q!T4-io}=^H~VrN|14IWc$)+GX$Y3qZ&bm} zFiki>cj>U*emI!KU|O~UdlUmwS2?c8kj(-~gT=j&(5LqsBM{gyeDh1giMk7e3QV566fA0q&-rz@}-E$qPc?>~?jhZ95soijc3Ty2&3P0mH3BHj!@ zFS4XZy`Cd(;^I1zk^TDTT5Y#(BLp%%-bD2{%**wt~JuCGofL)W{b^rgx`c8bz) zipUD5!J!$QM8u#gFuH#D+{HC5IMa?n&8}zKJQH#Y4qw zBm*}WCEGyoKxtS?XWX#t8>PU)0|h!t`g(z_Cj(Jk!04XgRA}*!U*Xtyj)1k4HU;wO zy81}G=MbBV$9hC~qLfMdUf8g0i#6dl+;}}Cm9x^E0C zXy(S2OyXWop)ev~oN@{su`NsVpRf1R3RW!?D)~Dv_cs%b8XG1Bo9;m^v`}^(3gau@ zCK!+rLPm;)S*z>_s%AlQdsWSwxE}(978f}x(@{!u7+Q_py0kYW$>oB&5CB8hTz5{W za5vgVscnA7)hYcv`k1uEY3?o4k#bpdBc~H@)No79wOiAX(I|}WGB()tJyU%tQZsRd zBT^3B$J<|94$Y4UvN@%jvY&riBoGtR;09ejP|-owPpc{gMXsl)-}(!K$fq$71y 
zJ@OYqJ4G7fSidzSCobyMc}byEe5+Nag5R7ByfKc5!oBu8)fYe->2`A>V2fCQRD>(!+UI-2L7^<~LFP9p<*jBx6e$$&jJ zkQcS|(6od|9`@mx{t(Vtu=@8Mtdno+Zba|M&5ip)ylEc?-SYuC%s3arY1x_sQ34gj zgmrLLf{%z5lPrWhUDr#7S)SCt9XHIwsHd*;JSly}K>z#G)e7F+tBa?&@j+VHHcFW$O`%U9^ zdpVwKMEBBDEDYEptOF+V6}!cujkS{*FCEd_HmG)xX3f<-o=%B)5VvR-=rnOvNV+b& z(AcPuuVrbrMA;gX`~0r+tm?!Sk_@Ytd2KOt7HL1~!m*QjC6sB+i&?E8@_#?RkVtQH zSni_Y9xR2>Fc7ipYI$d56<|ZE4eo1Eg{4x3p82V`2sGY>XL2O=B1kcCsuRaK95(SC zpY=eJJ6Ko6X4DoXbg&7uhZDx@G`^aUGo9TABr5p)x@s{fp$fjaAXORN&V11XlNW${ zrj|rwiW$Xwl0-@DuQFk?-PMsMjSv*qaZKx|i%$X%8SGS7>^veb{LOY08s%wiTd~@2 zjev0#8TP~%Xor^koEO-dM~$h4>~@bZMsWW zt4(edSTbji9CyD6^-#{>{=CWT&e!pDI>MNPtt~E_wEWr|1E+o2KbP2mkm$Up=UOh^ zpN~%+7x%csJyuCs83%nA1(^v)$Q_T95MXgJ#|*M_UNqXQ)-2bG(9)E-^9jHPeR4h9 z!%#QaKt|oh!`x?VBag1Eti(VgMGmu+Ne(V5^l9?(`d#&LefI_M>i`_!(jV-uAO}$( zLsI#zsk1^eTe~IX6o3I`qPa>~_XC2RH_CDf( zMMlG`+c6uhK6l+^M%~ArG|G;s=5!Q}LgY6WgO4#{EP_>1bVbX3UU_ZM^^Yr>0Jl^%2Oh~rQg^<3cXkVn6O0NZWUD$!>w zacfN_;*hlHC+;tCn@7YohQVKX__pwYTRAd7-IZXLh69i7w4}b}Gt5nf@FMbHY3vmq z@MD3}vBg?DN7;|yYlTq@4h<(CFDX0hUvooS#fs)4YD3EIb?*V&0d_Q4h{}MNP}s=n z2tNz{Bb59B9iiK0p)MUHx1h3~M%<9Z!@RTnH*TNh*%}qnx#wZDACDW11w6-+dTw)- zc}{;OeY=d5O!V8kmn)UPb@eKz4Nu3nevrSv6*V=^Y*CQ0seZ+;UoQ43&o7T*Q&S?1w)|nypwPh` zD1>89=Mu_PI^Wx4SA4|hW(Hj`vHlVzC;iR|&j%)8O7`_l=FK_aO`rf2)OqYiZA7|K zEA@wxog~4^GsZRe%S$8nsn1UcSQ^-A_ZgflqUFg0MJdP{2#gL=4m$TFi}GuD#_ND^ z0jIEekG9x4N$Dpb_k>{cTKRTQx>X!eGtxLvAqvwAMg${BPolc){Hczh3&lusZbQKeEZqI(b;4Cevd?bkw z#_Ea-kLdSj{N>>Bmf6N4aKP z-bl!=AE|y!&SI}0A>(zyvL!C?yF`D;hFX+za@&4^-ddMyYhtp>r0Wz}Hes$uXDH8+ zz7lz6tzp$Qi%}31c~tN3{UxiPc04Ec?rmpBiJY*dJI>?>w`xu5iWiRT5?cOM>(c%C zA2jPo3KJK^!gz#%oxVtUUasq;K6-#PFTVfm}AP$bzzg6(eT2UzZv2= z_M2cx@mE_EXFT7hQMh>i^p>j()wJzT9rE8jHp$QqH}zC7}V zh5ZJU*gx_{M@L=K4S|&7pSMn3jD65Zpl!_Ob(i2rAS(`tDAZx$gbz)Jw5GsaUfkEg z^%2^wTq^j7&$!p|-GZWGGLgBFDDif8=gYmAMGUCsFw!?E*z>jCQ6Fmx&%9mM%OA*C zaK$QYyj^LLPd4<-Gv5nt{tmbc@+r9qb2^HgsD@tV=3NR_E8Y+XLU zdsVFQA7nzBN3Kn4t(3n&VWq5Cb;=Hd{TFsg)xC?_R6q~stY>-HR*&+Cn|A5`0_!+~ 
zS$U{ocGNuS_EfW~I%#^0M7}fGrT?U|_qh`+zlnDYh@}t3fx9s3w6EIJ9G&G!{B0I? zB^3qtyk(~29X)HJto6LWq@05Jz;L>wP8uF1p1C^Y2RtyFg}1cHq5DgaO%hGa2yRHf zaRAzcC2Ju&#mHw`aYLo){JJ%lLU4s+0qXT{6%8ew!pd5`mrZ<_@vYs`7+jB6_w2J5 zn1@5i`4SdsR6p13_6$7rj#wqHN}|NU((xWjc8%z7hV0ozv}VH13V~4CEoIgJ<*~kq z#!6*WgECR*>io3fONU4@%0&oy)IzmZ9Jn=qKs=9-UyktVK#j=TSKoT~#aZld0V{IJ zPgfmnb54;K*%RjuR*zZkMrGyY<6~{y_ze|jUnz@8&d>9^3VrqU@9HdQ0b|40*Y!qg zt${5?Ik|Q)u))bR_WYbpf~vv67$hPtdv1#Tj)+l3D=Rc2BD2p*y`!{MWw4?5|KPGP zUS5nC7>}LrklT7u&>bu8QBm+LT9`t8C&$+Xg*K>YF`C+GZ@j0EMDYI;x)N{PcP=%% z@z~Np||+gW21+MY+pC& zG9X1@jMetO@$sj8O|qjCXX@o_4KnDca;M84JaA1v0{h`E*qnvBGT-NQn0R`S=uAjd zt$`KeGhTjOvKtV%U!BY_U;`!xbCFRA*N3(;C(9P&y2zW3?VF*0UhcxKEXStB zdr6w9&RV7N$zWUoIat*!vgn7IXL0uK}$l>_5H+~^J`ETd%SGaH(-a?zSg%}ZyRP3~bBr@ylFx8aH@>mzzKY9b zvPKV>zVIsCSa_}|nz{viCnf6>)Z+8Bv9Krw7-uS;9`wi;MhTwUe|o4f`RjRFQw;zQ zr$&?Uo9imSaAf>Aw!-9Q<<(D&VdmNHv)pSR9SfXD*`3K?HcjSQ%N`X*%w^AgGzjq6 zVSZkhg&U$Js|@ZUDwB~n-{wp$H!v?>oTLi08)LyMHYKPu@-&Yr$y~S8$}8iHzj@jk z;PjNDq!AsbP-C#c-~%olALo)sHWBB4KA6?nn)1(XY1{fDcx@3sR$!J*(wgts9$?K^ zqYWHsm;ZEYD+&}~OztuI&1;D>9NTt*l{r*WH=PhkSV%3?rL8;cslJfHA>{YHZRrrN zdGWdwoQ}e(rUMtpjwl-#Dc7x~@fjpl{Cy-XR?(R;#`9YO(AZAJLVp5YZXS;NgbH;# zoE{LTniCQF?N3rZmwD2!Q<9==G?|VpjF&Mg)h(v(rky5cSBbfrZ7zvFuI} zuF>Q^13x(>RlOp(p02Ef2N`Z>t^WNM_9|4IGl% zWU1c z4e|&Y1qJlZ3CyvaR_;-wFn@Wg#-960s};8A%{c*tpmn1EG3Hg%D;XudT_( zlb^Qb%G+5?w$&8XLIe@=q}r8wpai@ZA; zJl6-Q2UOj@h##UHtutd|UfurGSy?-QsX{y=TxcmNRhmDuiQ(fZC}gc4bm>q~WZ_0E z)!7Nj9J%H->9jOa+H2huB!K(-x7L!Bl;rZ-r%N9f%3VQ5zO`b~WXq-uy_i`hIPS4; z#_#P2_Itg>B8;(`aT(*fKTiDaMq(noi;rVrUE!oe^kfs1Zf zia`~quyfg{QwASLiNRLoO@zKPMP<+5yL+uBb=WKpOL0wR|5W@o_ZM&kltkS|FM(J}a8a9ZX7}sEtC+^KViI+q?-zFte z8g@a5Att&!_}tby^56#PI!rmvB*1&>IuhrH2lm!I8p8rxSPk@1oIQLq-8;wVs}UkT z0ZkIA4})+Ky_qNG&9-GeX7_(}i~4m`DWl^r3Rjpvp2Q)i9EwUyCBAg^;jnslgo}Wp zi`B%O+}xsOCnmupa(Ag|qW_w2;?0n|KUIHS=!m!or@@RQY*=0`txcc(q}cZUb>;t% z^_5Xoc5S!PNJ)cqcSv_hgHjR#64Kq>At5a(U4kIeDV>5K-L-*DH=AzG@_oNE#`(r^ z=%44$1M6PviaF;sFGfhnH5OBsmRLOJb#zw{QHaq+6BQgz@8@Sjf@_!`6Oy9yngC%y 
zpOvJ-1pmH{s+;D8-5+@_?+^M~pY=x;%y~!Z0@0L6yn8f()1qdzRbBt`{h6*i)C*${ z6`uiPfaw~A&d5nyo|+Boh2LnQ6sejYLqx}K8S71|zD-qSf%d$F&sLuzaO>75`GWGY z^DsMe!>HLLS(s$zP{g}&c!`q-h#q+V5Y`dm7`UH&@`qcl*Pvc@ni%&CB|71`=?F$klX*~>F{ENTC-1_>+a$iinP6Sl_7Iyr6Zm8Fe5M=xu8m9rwBV2Odk z|8vTbVLP%%RA)C}-h1bm-dssrQ_$`E9MRKRoEQ5A6aJ=d)Bw&0Uf{T)atH zAUq_E@;QevaQh;`MLW_w(NEnKYqKuGTZ}-!E z$E;=K$gOOy&DT>+P0g|q{xlGv`>^o)k#C#$`jx^8MO<{fk&u~CZ56V7y0=FMa`m7( zmH6rreKi`wNyb${8!VuMkJpp07V>@4`v&u%zlhg~`jhQNthribBC&&dpto>DBYigb zb)62K)>Dn0Z-f(@?6(GA43T&i>x827zPVEh|A$4{Y7F*huwqj|k}&?}dp!x3mX>O3 zYrkROp`)Xtp(*IOeQ#huj)ylnH)mySZEa#Q@$tgLJ>bI4r2UQ~$2XZnt`klyAqoG< za+nSCX2svF5p}$9i&u{(dU`|za)=Q2`W5loxvZVanM}fKs#Rw}Ho-$6V&~6X)EbeH zK&6Nu9^v5T&cNscN9amMP7ZdB->i6VY-~D0n&3Kcgv0%mFa=o%iUu0we70^g242Lr zKh=YL7P{*Oll#<|EaWF1G4%8lYlRDh!K#)pAe$EK7w2voBgwL! z-WpT0v9a;;*7fWi?%mtK0(3MT8q7E;BJz23ZAA}=TCnyHC>nt^0Xh-#nU`Z&t;d6B z@A_6@z4|br%;%EPz%Qv=tk9>sBIEq__F|yPlYjAGkw1h~n?_GZ$3Wkfn2Zk$7eCe1 zAcwe((91@IvT}^*=y-w;)7K)zgg-dEFa|%F5oNIayg`pXo}#pGc*mkhG|%sFakH>FH@fG8Nz$AxRaC?d)SK1rG=%|G;HVRTML9d6;crb14 z2VlZ{cB(X*9Zk&Lt3gcr5RW3q_XHFvTS4gK2(0iHs=1M4dT6KdU7$mxk5FlLyE$#D zHWg{9PFO)d`})(8!0n{mvQnyj7o+b^pdhuY=+zt(zFueywef;rd0oPaeCw0nKPFT~ z0OuHa|0~&Z+;FAsZz{aR*Q|Y{?*$TUWCUeAj+TBIcMS~e3}*_&#l-~&2iIDTaxgQ) zea^|rQCC-|xq|$1WmGel5tROWx;d1TnAllURI~}q13QhXKB%kb|NMz@*-%+|yw(#T zg%;8CQUYwADboS7FJO8NSd@R|%n?Fdn&g$`k?EM22D=#N=lLoKN6okdwNook<#oQb^Mu=^BXKlM1Ws%clU);2J5^txlI2C zBM{wP&Bn+2cbWnD()Y`Q(ibY^*4C9+gGvDdy{V{&Eykxhu zL>Qv#<<;~A6c3;H;{qS=HvtA%VYtliw!=hqa&*Mw>Z<(i9VabndF|CZl#^Vjn9q)i z5ffxtvz3-;7$bu}4wC3)%Jua}vrme;A>hS~cn%F8%WN2~XGja!8UnxipiH3xI{YBO z;Ma!rOSotir}39#713Hu5ONeytu5{{K$oq^GwEeJXSf7Q@y=0L>dJHNLtf{f?@oTN zsW~0^Ld#?E_Zjod46r~Er~9Jth5?W9TfWpQBiQZ5ey~Mom~;dd95o8Ml9G~}@bcK$ z$>va+vDCYFcnb9m4Ib|9?qG^FH~C)cIS)x8D;^7^ps}EgeSc_@>oU_YaO5f&rkvyV^BL%1iyi=-1fmw-(OFy)Du8v{@&KT|<$3l4DxlXv0kL6u>XkTV zM_+;Z1ft!4kKfK;w_30ozpFr_fB!{t1oXidEX$toXb$tz856pSDv`m+=-L#rN-&RX zGSZoC?9F_`fFOp2ids@u#v>pQ`1B~I5FZcBR^LL!0`9$5M?vciPG2(f*M|y)ktb=; 
zJd-&rHNk1PxVXf`#8~kzgkzF|#}JN6>GL!nYVs&4EZ5pXvx4Wy8`|}X3+Cod-tJ)V zHC}afe^!^7nXz(Z$7W_5 zkamB!{+0z2ywSVQT+7{j zdV0dlSxe|8ra6-DJ6hR!4(z#mUubM;NeM6CMfW0U4<^={Y4X=m56b zXSb0yV|2W3&24}1I$w~u(tsqZ*$}f4c3aSrVTJsr5+oyxPr_%!JY2D@DKA=l^^OK^ zWnN@8(@(WkeO21Zl!r95D@(W@6mCD-+tsC{d`7ZjHCftpILikHTt{;$XJ@o|_;a9Z zYgJ}{3TiD?Rnl=2b16OVuj{2FIE>m`-8+MFz^;<0C>980h=tV_VA|Avi66Ql5d9BX zAldM%zjUyNg=o+6{DP@0AZ#pn!~nfiu;b^ejX%J^S&Vg%$LDorbK+{^LiuH^7bN7w|B1u(O*QABUs& z_3_CObcQ!$;^4^Pwbn2(F-h_QE)c*9Ma}cN#5?o)Gt_^3zN~{LzU}TSC-qwj0l%us z+P6kx?AA5!&%Rv7VpynqZ{FOgzUYN8vlk5vDwdy+5b;8AB5X&0QgJ_0eHlFSSru*i{;|L=Q6#RgADa(ty%uEf_09o|XP zW0J?Sl@TE!F0!)B(S7PVIylz+5k1~Fr-iMpbqx)AwLiE(S~6x85&{$M7YcMP0>sM? z){N3HLZB27`u8eP|DRWB8Vp^Am!N^r;ey#@e=72Uvrv>5l4t{42M1uLJT_L!LX_f# zHqPf>P$r(AyCp#AAfP2=VPU~Q?wOh4r=ycGGBT2uF0A#GBU-imd=<;c*&k;2aY_S) z#vO>RZySLQ#IcgGtsA=~lRpkf-&u9W{cH!y{ydP~W9?$hBW;53P z87>vt3Xiai0B=F95b8wGVO{PN%z7U|II_OXt4BI*x99jx*W{f6+1i2wpr!s{WB;^$ zMDT#w1HVWLAlP@`1Cf!WtZZ0Cg@dW-dN2ZtjIfy3$*SdOPAs{|C8)j%L<{9&n_ai# zVq*S+&z_u|oYA}3*N61gt2;`;y zM@I4A`&@Hl-Z#;!ff(u|uCgM71GsU_{~}i{NPByGV3H~=H@CxP`o|Yq`HubwFKYE8@4-=C3|9c(4;dSz?#d21a+!2pzh!ZGQW(`D}l zBdy%BLW{rnEz(f@(?dA}BjSDzPS7;%66bk6k!AF@R9E9Ld0Gm5_l2!P#aw%WX}KDG z_ia}e>-A(S#LhkqFFL!8{vno@4z-LBh z`gOi$ZNpc& zyd#@3c0cG!WSlKg11I(0Skld4mr2fo@Q!-@%<8wmz?sh z>zDQUt;I7R*TU0|T^kX-*)t9%;&2T&@K+Q^$sETUBWmtP7kCQfhe=5&YUjzux^lG+ zJf-rn>#)_1J0{h=Lpxsz%HKhHs zb?Jw-FCkqg89>Yyr_8)&7_}r2D2(qoP+2?_d9_UQWBKOIr%$pwS>GppFtu;nj#!SD ze(?wj60>RQ=nM^j*3h+?XobLWJ*)*nl*UOWX1ArlV_(F%k}d12;vnMkG3Fs*E>=0? 
zyQQUNuBdlnpNWv5AOU3`(l{o$~W!_Y&$TVp_8dvr@b@lJm_meH)O-oJNrym)d$ zF0B6XL;LO9+IhQ=R0!fcXAhw$oC9WD`uaSeWC@HgybJ9jAR;O$6Z!A+&6|a&N~nD4Lx1N@pWQEiFx|IKCSszb40@a2i}AX&45Mt7rnFHk3Mh|O zkhfBW*vJT_11pVObe*m)mW{jmTa+jOvOO1l`@rA&*)Do z&By(nEF!$~Sb~6%)O#5a8bF`l6%rf_N3xtlDM)=-{L8B)<5Hab049r^IUSK>Ne~k4 zi^N>hR^0{VlJ@=K@AE4Dj|r(a``};Z)<({ynQzA(QJYQ&hlj^29h|JJnTd%s8JHt~ z|FUs%j{fVP8ZdTEt1AmZos#ka}8Dj*1u}3~eUpkb%=J`A2)rc`>rUO1%OZ1F5DEor2(e=~EkPy}i|IrCg41-N; z!Q(sTKOM~^9fTO)Ca7ZXcPW+|f6J~ZJG)#~{gaD|E+1j&*6FnyAt_j!+`w}|Y7Wsi z;Np77zF6X8LE}DFIi!ll>0tWR)a2C}1P4m{B!;)bo<~bH=0p7U3ub0!=LhpVG+(aW zl|^YUbDF()>~cIyDn-1~6AiL*^kaGgBs58}LPyvu=RNnCL;IxiCF8v?F;Zg$xdyxk z+%l#1Tr>_BdCKt|tFNyYi8vuCiHRp?u9K7RfDK4q$CIGJvfCqMG^dB z6_;Ougnx|1KvOSzFN>53gpgf%;<{Kt5Y>4@?>`*+STz&V`C!<)(sHrlM6Zp{V@aZi9G)P zqpixa&&Ds8%rmF6reS3@rt_L40siccP|xD`k{UVt%>ZzUy@`!nQ7;$&cU7ziSKuP#S3>N!MOz`nr?P>jC&E?ln5qx_#{ z{DVT|XuCgAP?V+F2gH|haapw=d&saNTI?79BqvYuSXQ*TZtp_-a-dL?qa)gb!)+?m z+nsT(t*wVJ=vz>1?^hUD0@T}>dmagio`Yk?c6P+U!}ehmj3F@o-@Y3hqa+m#825R# zuQ4=>+M9Xh79tKZdt)eOuP-h3u^kNs3SHdQv^+_uoIebY;d+i|J`0&aKm1b}$6up; zKA%mDE60=$x7`VVJ4L4g)n9{gSBT@L-^uCni7qSh<^Bbm4f#n_)tTpv(XD)qJCyy} z`q_;Rj})zx(H`kbL^6jg@0GUN-wKiS6(ysfBN_)~U2pH?wEJAM`&xTDxM3?2fdS!4 zMS6N8&wT8xN_@B8eELc}KIgUYH{HQ?3z3kpeR|4GNlDn}Xem1;=3ue$DK18|#(lqZ zY3Y67Q!ElvQKMt-YruJaadSJ@)Jzr&@FU7KqCZIa_T5s=?x1IX*7L=Jk&W@7w1sqo>gW{2B5 z2x}Mlapi1?vOX`#^af-VVRY+s?n6cEr#Yx|1I!c8VOcc>4KqW?Vxw^ow3GY-0x0M* zG+fNg*5J9W4Bk{%efxeVN??A|s_c96ereCOan|5E;>|7a-G9A_e7F&s8e$P*nut#F zXDx(4^4G%wm>{rjH2rsETwqeDxY!URQ^AIsn)^h0!{+8&)NVJA9Nz~Y=}n~^mJrPjmJ>1yo-(f4vsyS#}YVSa~K-d{wT9uUst74zzk6k z6a?Es#GOruLZ;C66VFdGjCAAy3@e9?rdBQ$TKpz^ugSRUr$uPzlQEEw{)>bEIz~(Y zL!@e=vFA78M9Yc#C@@Doa7;#f2v1|y?ew~Fi3SW-!L4r0%n#ob{tJZQMgK7a@sfaG z+^UXRI$S|X$wiRN$@|mIr8WFP4+tAJR#pQ;Pj#R3+GPO=AS){ikco^HQY6gx4R-0t z$x~4TtnF5FWjd7rJJ;0G>R{TQueAgnXLr?vL{fKm_rii6Eo~tvvXBrFvjm-AzdO6r znYg*RdHM2XO-)UBc(@t&FY=3a){49-K9e!_`*0R^xzl;PY+b#u9+S=svFyHatyE{S zG1r`=u+)NsDZVe3CPx&DRmRPxLzb)@X(FJypPJHSXJ;Q%(br!DKS)$`BbgF)S<{=q 
z4|z3-evg~}UO>fSHaW-U#u_kBzgkr~Pd`aBsRYiz z%Td|zR}bquZDY~EWnK*FMLoHS%zX}`F^Ds&766C`8*l)G0u1}C01nX8-CeAb{iCw- zsr+7(WxBJ|cqo;9viR-Y^xem^tzkfcJb^hvil?VZLNC10Y!OCU+V$1dF;Sj~9yrWQ zA=fRU*Kw#@rCkThEwd{tPuu*C0OWuE{5hv#E9jQR-S?7nbB$~nbP!@!s!jWcz;C(I z;b?AtG3l9c@gsD!7Hys^hb^7g97G0nLa}M_VMM~Z^|_;W)tMpwsc&5Eofoyrv}I*v z&JP!_PS$!nJUmw03>zJmXYbz0fI)y%an{?5aYeD_(sGHN_7P6^I6-a06;z0dWs>1z zA#yy)*cpllB#H~t+aI5+l2P|YPEJEn(POSkijYw5^3rvsRmjn5uD`n*A*>s~VQ#CP z%+u5E9v)k1oGZYKhZ3i{so+&xqe1M4mWCY>YtmV5)*TU_NupdMsGRplYGykMEGB!{?UFuRYf=X zeMnzHZtVU?wilG6_q`FgQPWZWQ48wqkNVL+@q#Y>-`HXj?BvAC$|^EC8es^~ zaz{r;;5bnxit_V=`%GS9AUB0uaHj)awx%XzKF5kTKjKF{aUokc4(AWH}%QF(+w7DewH_u=2{Z?$jXu0uyDB5Q~azP&$!!IDRNgIhNYwsR2uptK9IbU z%pKZPXX0yaw&^4wB`FCD+eAhtv9K^Klz&0VZx>Wh@H0Q3qsrN7&&tlu>p{3IKX|62 zBlthK%=73za0Ym9hcU&USzA*Q;D4*HpU!xDK0Nf74OGvQlao91@ASXU@c|a}e{_fc zV2pni>|%{rcFl<*Ggh<`1$y7z(=!-Fjfn*Iaq`s_~KP*(Bg4_4g0Q| z3&8rOz$bA9A;0SrNSm+s>mVq+C4&CN-n6oml$5F}r?_~IQc;dXAY^zLGx^!lHDIPX z3xZWBMZK`67EKln* z8X6~F_xaKhojHD|8;aJ}yA=j4okzGSrdC#{x2LSVzQ?hQg>QXr2OFJ^ALKMLhM;e6 zNd`;h1)`or(DXDk5{ruH-+$#cN1MxyJRypb3PmIb)9N6010|lSZY7gLqUG}C*0rG@J zCWsx{6=Y6z25|W{znjx&B2I7t5V=ELSWIlI6F*!BW>b#D8vZt8D0OyGW6|w(zr2dLGXhM0QTJ7IdKO zNWCIf1zN{(ClMg(UWc>^3W;&=F& zUri&S6JFokd^V|tbAJIUg%>YSih}QPaBw1DeYlKy$pZ*Fbi`3GKQc8n)!&bzDMg3d zL6C%z9C*H?p0`tw_HcKFVvnE&tTgTCnw)8w-~eJJ7sZW1!aFcI>H2tov$eG)c_(`T zFC)+>rP`(Ke|L$L5~AAOA5Y0?(5z`-ke!orw6&$J&l)8k`>YLx5L4-~Sqxln-8LV0 zZSB;Zf5E0F;zh6lFOlPzdEP?-+({lB_(2PD2~JN>CDWv?WVs;1vaN9Kq*!FJ?c%02 zlV6G?c%fpKurnC7h+7H)#gS1V8|!jco7ER^Ur>;c-2ct^Z==H+^xB`aq?qAC*9x15 zrLRKZ$o+UvE35HEDJa9X9APf$4m>5bi?jK8uijIptvSpPx@n^)?R^~Exs}UsysiNG zK0XYJc>~{-^(Oaeg*u5x3g$o$J|1G!RV6}{=l3q6Ae*7N3GlJB+(wphS#UxvoV+h> z5+{0%1PP>EVHVZz0ho^*XJcv#0%JjOF>i?5pwJ5+yK>RWAlUQ z({+RWVi0m@haEquA2WO!KNlC*t5?qg0ncXCL9wBzs0h2bAP{RZeE%NQ+g;#|1E>q+ z6T)webNp)LRg)m)z_lQAqF|AsE+BJkg3UgGkgAUe@1OB=;M%FnI2gA|wgP(n373XN zUP|gC*tpbLvAMCa1p=aR=QD<+&;9G0o158LSx3t)0uD=pV^U)-C(q&LJIz->cIj{! 
z1)*HBHdr*;uz?BES&_!BzqPrE#QiBOjGlo3(&UT~-3M=g;>%4-i@=M#D+&uZ-Rp_G z!J!l7cNpSkE%f-!Q|9)<#u2o^TWTiploK!0PG^7nlB=-BCQ*t`)D6 zmxJ{pLwCvUQ{Y#!=ssS`D=cki!KD$1G&$mqudeQNmQZFpw&T#|9RxoWuZp;iR{`cDp+u?>Te*rfy zuTT<+mX4 zJ@Cm;a| z3$f)_MP#ptkdQ9^G^n|}fX;yiMz{>aLPLYcY;oJgrHLsfgnzpI`=@Jo?uXA zy!FlgOmwl?`-92eHNoPX+wbuzZ*i55aotK#R^xnh;v1)>E3j)C?V6KWn-uKEXdQJP zpUX8MVgLIXmB&d!-U@Gdu-9?XBcaa(uyCogQJ_to$NECVxz^6{YR z4Ux&Klq@H}#}^e9m0S;g=IiSV_=W_p%LCm6aeycT2a}K=AiKq*QWACe-Ef!aoxf>H zO(k?POtoCsnaNC+-V zP0oPtENG^orIo6VctQDl9oBimsFDQ$2u$_l8WnuU&J>im-X3E6{%@+9sG5%OW#v1dI6F8%#`_$CioSYwvkJ_Sx*F;ZzW?~~`1pX5r^?57;U?TO zH8s|4BFwZmmY)JFw+N9G00U83Qj(HDCFYa7bx7{OXLyk((tfTz#0@ z0115e4G(HQ;TEiTDR%O_Jo@0{gROfGfONS73gtB*jf4Lce{YdE7JdysW7$7Ge({q4 z)t*%V2LtwGE~O2g%+<~~?Rd6_%r#y0&z}dN0@a(xpgLiqUSWBf9bR8n!s>%utoTK$ z7RFaoHoI4(wKtyzZd<;ZCspnLDdY#2&%@X>mxGCpD<=PNH--jdX!Smu8MYtq3>K@c ziE?+-{PaWAy%)8%P=0o9?k`|fC`AYt%4WUX-Ra%k!+;VemiQwH5s_liOgu3T>Id>z zIUTL(qT-yI`t0nC+mD^k{JC}gAgw@Mn169Gb#fA9KIA7csu7N}Of4Xg3t(E$jR7CT zU2(1n&?5rNC@}0905rypsLIQ=FPityOh-05}z%vq;r1g1$6fsa7a1&RdF@MX{S^LXMC9g3^#DXD9u zp1IWJteM#6Rm41cHobNlm{KF3sRsN$y~+e?=ko}E$~Mq=J<2tYDnkI zjH<5g8pv5+zU+G(3a5Q>UK`HL`a+8UZV4lO+b>^F)zKW`-@hqFMx<$Ix^u+a06jq~ z_V72BR1wTHUmY!9FsddpsC)qY{yfl#JjJ)2J+q~;MmVwXoJ4K=5rkSYhgD5wavWvS zgJt5#R1g$GuGt$$M$ z?U{u|J16zpMNh<2q|9sKw(-@j%jV`iYU*HA4LVQ3rP1Tupr9(9N)-UM2z)(q02SIO zfUf&UHSU6;2B!DpD)_Qr60;o-f1yWuLOpcI${mkYst*JMSM23?7}7rH6#6hXvth#H zC`Ktn5mUcReB^hkpXo2z3QPt7DtMY`i12EYTS;iIq&Y`Ua!T)g7qBK(YBfP#;Y z6iBRydzzn5fD)#JH1RMUpPXmxC;oJ`wW9=KsQLn;6tBaQ*QuF2P&vA~y6T#Fg`Gg9 zfni|2j!B{n8J4tcUrma&WAqHNkKP{1UMsGLoZ-jUr~QktQ5W=3(^;v1KKb-Y{8 zcbe9LBbB|RLn9-HQ`RnN9QG~lJTK5&0PWDUJIP3cYU6l}OXzViQIyTkFa4>zmBXM| zNGKLayyOfa^7e;tjSh#4wcAs7SEX23(mg$u7so4ov$K|aQ$pU}vP2wYsTmEY3Doc0 z+<3o14c-Sdjpp*`HBz|lj14L#1<9VUTo~>Nt`Hq_Py|bJt zy@oN>7!6HSx!=4DwsJZaVocw&{xQ0(0llZnPE3V4(=J@T;OH>VIa6cR2f zRi8wTTafeeOuk4iew{6$KO555J7O$=1Xl3Dg;71XcdvgS)%M)CgB=dpuGh4#*J6vq zoLbY}3kP+d;X6L(mSAT7#ftm6{f?F_{3FKizz%N(Umv8WMg}W+mtTGyI(bD!O4wR< 
zGNJoO{mvwqD15+v1s5$0bMI~(WRqFSv9`NTJjOWIW;Rs8mo@0h6%szT_v_oP&vR?8 zKZ)bLyiIeaWsjB8J{-m%oh+&MPt-u#^5VHXmT34WJ+|>j8tCG?6d~VL``i7u!^3a& z^q;0HmanT#QLgVecjufFK}<-bj}sS1sjuf7$wv3OzrHC^KjP<~AtWpZ;^djiTnX>f z!SnN_#l{{pG053=CF=8Er=t=O)87~ga#Hf>=+T&%5k6jT4B8fuqyn;AngU?{FpoJL zLJ|T>=+>)Wo+z?uRpCu{du?qi&J~i8e}-4FalRo(JO7<-w^f#wC90AGai^5~QPowV z<|Z#}OfH+Nv0NLI3@9{WVKhSKRfX^3ImqpNJc&?0MRBO`1$|5=;}6`=_WXhIx6l*) z2c46nzvJ>u@-tTTYGudvPrn1nR=Nfnv}kim;O?WIBD$Op`WBOWzK?_<+mX$kiG2Ca z-)mX!pUA$E#I>&Xes}sE%mQ7e5)FwrnTD#*o{v~*YA?oQ^O$mVulYS&0CO6PA_bd- z#5Pde0c^T6>eq*y=H}BpsTpE9&gd-a)JLutlvwfcfxle0k<}nI_CwOP$(7mx%o>9q z6xbR+3dIsQv+CBq`$c9)vkuPghy?yEKk^QdCA8L ziMKg~UWr$+US$X{L(xAAKT8YH|MQ+S(^bAF=9($1WIl!u3WR4)b>ifM2gbS5PAcrW zw$OL>J?>@2PK_pMDH475vggi^GSM?vEjQgg(iV<>wj~#{mdv)r6N)GfWwABbgkh&M-|7od!_wT9 z>g~}az+j|u<`$RE|DAL-(41PIn;ZJz0F?28B0{l~vyQ1>V9v7@CsR{)`im5J40B*s zYPBb#uJ|o!Sh(T%ga#m_!i$TGHD%Q(;%A#Lh(%i37aJ>)Vk`Xbu-Nnoug|T$CW~(| z86I8B@s5-Kwzfm==08x!<-_CT7Y;_v3mPe>6z#ZZ0jJ=jqYAnks1Ec2Pp$g>*pKP2 zebJ0ULctXC#Rp;H{S}+F*V!$OS;}cUMN0) zojvOC0*qQ=OWicfa7my7Z3q4<<(ryHYez&kZZWuXZHnmbJ_MI)qrG>o4j=uLY#eYb zW#{3@eShFS~G+;QR!>ygoXcL4qPLNGLTX_)EIpIhJm;wMi(Z@u#qC zch|GPo6~RM$ZFM8{#_zqwE!ANMlRT4q5MZy?Q412J_$cB^GSx4)zvda#gDSG z^gy{qCfa1X(w_bO?O9ypoX1l@K}7!u68dk@V=c^kM>k$A(JOB$_=g9q|8ASF^{+5m znQZ;_JR||2yn1>Mv@{JbNM}?RROhfu?4j(Jf-c@yn^yF~1iW9m`ZRji>@)1v`3t@m zplef`L5J66(Zgt}n`|Zn=wPZ)s38#e?e5#+$RHg2tUxZ&+-vbHJzRJj*-Y*(k?X>7 zX-q7da7W|<4jTDLssOH>S@-gqn8p?Nn$L353t;LLu)wZ_> zu5?g-tEj+=qQ*YCxUj!}SSBYs!2YCP>Zxg2Y5;vWf47z39(eW%G46r&IXuqCZ-s>Q z0XfKJ!lRuMGRDSv(b3Tb%5XRF@-L2E`7+`zF^}eFN|HW5Za~Om-T4R%N=T5R8b#R% zqkC>Hm=Yr+Bh?G#n47B-7Jx2EFWN{W?ylzzi^^y6!v#`&x5qz8*e7f8 z#6ar=>F&2I)oir}Kwi$Pk9|eTaDflgSY(_60`F8EPo~RSkTG_i`?eBvhlJupIP{h> za}A~i?T0bqd&lh6`z$?^O*@(eFb$hwt5;uF7!8hYkB+cPirEY5!2(+iY&1Dp=D7~$ zZ**tf&l9n6+`rx47`(z@9bjJSV4=HQ1E#SbkLlrf84Nm!8eUBqa2L#GNolY4{-{PR ziTZbd_!ypK6O#>%1o2PVoqfdNsmU%`epjITwctGfvQMBa(L)DjK#C)O$y*M0?l^B= zhwMx*2>N}N&faM&VFfqQBY*?44=OikZJ!^F&+DNIwbn98q&~0=Xkyub0ip+F*25X| 
z6^3Gq)9!MbQ=!lCQu})mQBZO_0>t=*g@5|{wzT_;fI%HBOtJOVs=dJ!13)mE)q0bV zu(WvQ-rpS$Twi0Og>m~e*_LVlUTp9lgqJ$=TJkt#@YtQOczPtzWC2Ff`ALk9zkeeu z%QpxlP`B4Bw61GR$X!!#Ku@d$!%d0DXq>8_Nk4;@e;;YyUN0wX#q;Fb7Os%gfn;loo<&2;7iS$7Huglay7gN-93d@dXr?r=l zQh4Z1w}`rWo6l81QqBVn-+ph8Szfw-ye+!{DW35yNwfRD zk3i#Q8iyEQw)sUxt)RUGA3u`*jwKZgXx)CeQ%p^@FV(c$9-+4!g}}~4fJ*))O9~i) z46n5;#Kn!<-P==`&hWYOIKPGs5n~8jCA<8(M?lezCE;u8a`U9ES88pPhX|41g#@n` ziAuR>bI3_`)teN2Fa7&%lQ8X87^C`dNpIZz9*G;)o=U{95gp)GWGX0EwCNEn z&PNe}qNyuukWQtLNQLN!lO&4*8PPDWKAV3XtX5VnXU&B|j1VfzImkg6g~~35No^El z%yeQcZp5^K&UZ1jb$&WErlIdTM}VW;Q^-@Ijx+w*Gv{48FYr6%J0I~%!+Dn$pOpN= zVM?+CNu`fR++t)0`WWclx_uzzdZOp{oiN;4pL8(h8588Rvt%oQ1z7mb0Qk72b##!l z?lPhH$Ep3XE}NW|$%s?{5KP)^Z$ls2BXc#niLe!}XN8`0{MRdD|HoQ;?Q#il8Z&T9xXxda~^IL38uU*n5rEVD~iq)B}S6 zLB_&o1FkxmBhut|6&X$L3I509uE@vwFEx^lxLso(!qXp-*)z<(0n%5f|9AC@%-$MG}-MfJ7o0Z83P;1TGLb<9bE+S(Oc6o zVkKTMnzcgjfByW+?A0!vuE1XI%7#2xR;M=MzX|EqI5Nofw#T@@H~5?J48$uyZ)zcf zqM}IrbEDZgDbkluS`Te?ModUuOask$0&mZG8Ly70XrAs>k4z9Nv#_T>{tNCBM8`F5 zGst;1`Fd^@3=3l%eX z)c}9BpLRmy#&TcN3e9b7EUf3{*P!g9Mc1>Y4HQT-rQn%zdq_xXRYFb|}$wavbmIdSaI=Vt;mzD`0vDT`DDx}-& zzhe?NI(hB2xZQc6^XKRVT+ESsQ2TC?+;#YOj)Voz3jEC6(@EB8T_4o*kYaw9{+wTZ zQrnQU2A}M;haj5L8lmg5e8nMm-uj2!KdlqtJi*Wi=4y7QxAS@nU!;)Z z|2>NtoMVd36UjdDImw&TS4@!l$V0_>WeT$b!P~hC@BU^^h%EeuH_q!j`%t8}*cd5q zE*bg)KSU@Zs&^+eke~t+5)Cl2_rwbMf${UJvgk{rLn!;7W?rYf6wTI-7n}Q0(3V_M zED|N+!R;q)i5%WY9lk&xcO>LckDF6zl!!m_@vmVO$N|g{$&}@AZ}hNF<{g*I5$;D8 zri!>Sr?v5q(J#IQ4(tillr6@AJra)9v1x}>kLyI3sw&s73d5z~PTY?B!03?9CI?MIllI+pv{^I!w5K-@-tx@aZ#kp!L-0ZqPI`h=E|#7&m3)$X7V* zcPFCgc%dRYyE_Y!%+Yo-Cg0Qjdj8gF;b7lHha&+wEA!3yrm|r)VRn&gU)i%cc0>Mw zRtG=M>DW{NV=eGcwYEnI2#%L%E;+)2)8Z4;5J!4St!$LEGe?GXVwSCVic~W>%7a9G zkFr;6lj~idQp$xh1Rc|>Cuw)aFM6fN7fmXAQG6&oR=EBTTW1+oWfW~|X{15Al$1uK zq#NljNd@T=sZC2aNViCLcXyYR^rk_&Ti`A{=bpIF^@l%I zmu-7&ixajd4?g>A?L~W`sI!BGGUP^F8@b!8%SVgd+1T%@nS^hr6he6~I1L)z1QkJ- zboE(J6L1`!f)1Ekl0&WVtAgnsIb6%`%-{d&$?uUpopn9@xFRJEI}9AnD8O}ap922u zUs|Q+D301=h>SN*z*q5XBUwiQ8g0(QznUG>$J=8Dd_2}F8X7n#&q?2hg`IO-YyZs1 
zQ08jU&s{5yW2}QVya4WMdU$#U9Tr~|u7sw)1@&_4n|JqSeGwAS#tX=mjr7Me#2si2 zO?@)|g+_Bu$8^2b;i~-W7e{SM%If5trsW~Ig9S?ji+{KO5L%?qI|}YVL-7LfVc-a! zKgV;#SLh;Je4pZM%ea9`b*UG^OX3^UBM{)6)M_F6eJnV0UzDKjLn;o77Bj`}jzZ*0 z2Zl#4ALi@lIHZ3jQ>w-E`}PyUY%Hf`uHCXkcR`B9koOuh=708K{cl*0Riv1ky*-{t291GiY>Y*l15&f3o02hkmxb*AN(x3%N`zhx-}e=A)VH7P`Kc=8 zs=+Y$uzT?b@x4El%?YtQGlc1ijJ~-s(z8R_Id3vi+I(Gos3WiLh6~6Do(w|&^}U2E zNA{4Y{k@(cGtHL5;cRi$g9oo0H@LgyRooQhP-07Ko6#32?V0md2!RHFus6f|i{Ye6 z7M2BQTV(e5R7bM&VRwDQLHnB!;aBJ9sa)n0;)-LA;1_$kbpia&bzf^NlmIR)EcyOy ztN;#)(u_k+UHxWbK>p#OkJZ5QqYd;eE$z&bbFkO7q zMMX?3W{j1SgOtm$@ZfieK9H>wbC0f{ZHMwX`AJF$G&mp4{>rEVK`56;%g122$5>?k z{COV}6UZ=Mw4PzXy)M8JG)WQJC{&9;+n?AChwrDHW-I*J-YQry9A9r#+?(&Eyk=IvEmk9 zVdX;$Un+2dAZHAm=YL@YE+aA`G|7qXca{RMWZofToeXU8t zjmT}0ZERu!VBzHOA~R*ncvxTfSy(=wZFPa*Dj`<~YU+(OZ#ZnlA5z_4&&}$7m&m(& z2n=?o06%P08sEP-j#7Nkn%1Gz#HL(xE_Fc3r(+JOBcGikEkdqCBunx{&b2G z1Dum(m@5<1xlG@goR4to(d2+xcp>u@-)r^*c8Ixc_vR1ZfG<4LRAK1i%_1-MwyI(` zhMW1uKv1O=>GD#hEuU(PWb*yvTLf!niG)&{Gy4-LR(j2)bnSP=-Z0Bg2t7?GY9h@s zG0U~KwqAFao{UB0R!(4zWVb!)->-{^ zoJPG;czC|;?JXiA+iUA$-;%-zS1mq`TP!uwAX5%G{sR>`Ql_kvsk>@ZV?_zY>CUxC z>t1RJGW@)D+xu(y91GDqNM}%WQiWzP40ci8j+paIOvA|)BV2*~?W8FkW0a_}SpK(s z4*mm=03$bof6FI>jSx}0a3q-nAA5P=vvh`%luvW|IS%lnae)o0H~(l&pufNRX32Sg z1KuU|+@;jIY`9u+L98<{jA)GBe@&0)p#hIuV8yTUt*=PW4J>vF>nykPIVkNzSTxg+ zLax$td_W-8SCetdHs>Qs4$hkPWW6SZnQ`TX+L4RvlcJ7}Z6Wt}ZB}!mp!x$&f%cA$ zhnut5{Cv29f%LnJlse;~OuO~l<+k+M2K&7#)1&ht#EcvcSq_gY z^;2Yg4|5e%=OZcXZ|aku8u5;L&0aXux`+W;(HIsVheTnoc@(;kvFT}h=vWSF@T1;E zgknegWa93y7FXgf(n)qGp;^)B_$af5Fd+#be4-nNTg1p91d~b)JN;lQy4Yxx_2Jf` z4#C&4(@Fdh!ehd2Jt7H`l8uy~?jvguvyv0Q?UZ~#>1-O+bU5`mD`V^LUr|X-zIRik zDkR_5^h73P5nhJ+qrc^uo;QBkq^A<|t`< zWlkD?^LJ}{-K!C^kavfA#GxXrn&rV*Gh~mEsX#s)b&1A;BsA`t++r#gv)(V zL^EZ3k~?u63%uU~kb-VQhkeV^;@(Ct!CIr2_9MN?N9i8@cPbyDE%FmB+wyY6Eg`bl zC;obY4l0u>5=zwE0zwa$su{u`ZI*iL^(ON$4T7aBv&+@`@oEnzzTRjVq=t+vtjxB$ z=>!j}R-DJOk~;}H*$1H6?e{TnmZWc$m1fnROubc*z8~8Gz^bR4Q2UC-_orPRrIMN6 
zbjMeYRLpQc*?FQsH%nLY9{km`quDt;qWMA&jcO8`mBnKT>g5+1m&pW0QXW$*3d%UO#5>KGEnch2 zvu6USI2dBVcFar+{oD1AIgW+e4AjbFNY271EJ*)Zzi!PgbfT4FRpW-Kn;XebS*!#p zH8b{KX7AHrzGWlz_&n2`<)nT~zSjVWB$LZ{m21sJhihkJ;_9g%A;Lb%$>-a2zC(|r zrdU7{3$cgzMCn8eft$>n6I6SbUC zeSxi>MI2=B}LD3>U}Zm z1nRMx?01%%Y;Ab=Nghrd=ObkG-CLzpHJ6P0Ega>iYx)*LztV_}s84-hd~MCb%!J!5AjeW(b*2>vg zjTiyoQ9jo+oApA&*=w=0<0E{+C-GVC`3Ou{B>Wv-2Xe+|a5o_?%$7 zlBOqimVS1VeAyp;@hcuVS;)rqarEEtEMcvMg+k|9x0}^H-cia94P0joHe$e+V68q4 zJ%^$*g0wU~XTaq8lkU|lL8(l3!@v95!lIanz4?6@q5b|Wn?%B<5IFsf2$NIP>mkP$ z>%Rkf55jPc+6ddeKprault?d}X7TZ`58S?fz|<-FTP!;B3EGu=4p)}z)t@}|d*G_O z618Jya@o|8Y=YwO_vDYRCYHrF9y|3HH2(Vs1kJK2vrOvodrGX1)^RnUqT7~*MXH>N zt>ry%NjnzYkQV-=&z2_b<}NTkcF`Ws&X_{TP1{4QT($(<{eT4WA1wfZv0VPq&1l2V ziT(?R22=jA)7 zc3aP^3ZPe87#_=tt!%HgY}x(3eg6SDz24ULcT4NmNo$Jo{GC+@103sC@Q*OdagjV; zDx`g&kz4Kb5Ah8B?q%oT!2fk4C{y<2<%N2)a%|`3v4CW8)?4#QO@%i$mf3})u$ExX z$+P9^6Vj++@wK5VMDjOP!99%z9qhE%ly>|`!XT1B+-a$0%(3z20 z<>MU4)d6|~kxK&+f~3jq=X-e;#Q(1Jh`?PKgh~o!>NcY%!<=}EDQr*0p!oQx11qhFC7gfac*k}VAw_H_ zlr|Tkb!}eXn(hpJq=c!>RohY&@e&lKSptQz9v(}iGk~T+7bCDND~keOv*Zp8N3ry7 zwSr#xG@c%)5#WZHRGBO_Ivs$~cYfy3>C-oey+e9bl;QlKPw>0KnGeq~zOmMoZ!dnR z&qkY|wpu-J>&(@)_F$sZV}?J$yXXxX$=e2lZf{0f>l7BDKYt%?@;EOSl55N@=6`?h z!yEpc7{-=(U1_H86PXCidUE-c#2OA>=CZd(c&!<>)#hoFW^WC6#3N z&0K@T4bMsr*?X$Ha8im5O|Y*eLwpg*>r?t4G{KDfxe$RZxyV=lXpI`;a32+FSgjKQ zaPd^a*FCtw+=3Vt6qtiKfs(p0%>Bw&1XtJO$y{{Pw{aU9h?udkuoykv)c*bl65rQs zRbHuXO!yAy-*Tb8eb|}Y?g|`(Q6}P_1;$ksV&@VYcadp7r^(zj&)Z8|;GCXRg?pxX5HINPh!;6@FuTW^Aq>Bp5$*0YHD^Y}?s0Z#-7}G4RN2%S z(=@i+-J(9OR6{xoZO>*C`To)%$HaZOcHcBYwSrfLM4gxpx(T+nCw$xB(*zJm`mlwY z0sgNZ5?%6F6m?JED0h_HMkoo3a(Wuk%xvvm4ZEBFUTHxp>XN z*Wak6?{d85G6^LrH7f{UqlbuEN}bK?G}M7>OW=DVO*i zN5oh2+x7mfHWApS>V{AUYi$4wkDTm~pV z`4NzAQATAmXXoS?n_V|COL_|*18Eyq$`WuFfdPnSv(5q(O3uLpZg6YBuyAmoH~CVW z3>@c%JtR?sRg#}K3J_4afISjhVp=#Qsf+&P5}Fb}HI5cm6|sd;;qbqFVhD8pY@ z*VmsU);mTvil;6+dVol<`m*n2D)zi$WE*s(KqJCT=-@Q5>f^4K^3E0D>71g3R^Rb} z-*N}zKJB63-ByuzM92}n;pXE7GU3ype#6D>D2OJ{JC%HHr6z{$&t89FzPTO-NQ=vxvn&1PcQuARrA^N3hs$mC?l;LP 
z+(ubhG_6Z*ZIkFr{S%FMj+)JI&WMdnU&q;h(FQRfvcxP_JzbE~2QnFp7-!b6 zO})Sx$BtQRW<_|?(hiAb{Dj)m4wDM_b={9DY7++F#l~tWIKhRNj6XT9o`ykR{}^}N z`Y|EO@CeN2r_pAixaIe)F)wUu{)OSzCdR3G9;Xstj=`Hu!)++Su3_)J-d>rK96(J=AjtMD04YR@lh!AG>bO zwkLPalOMkAB7;{X1!CFe_1J2?Z~~%>51}o#c6JtbmoY%W4M3!~SIeeIn6(O05Ra2J z1?vM^Z*LJsjeQn+7x%54_U!!@3rW9DFIht`k35ScZ89~Guu zS7m>5N^=Emd4*t`K^`sfx%fN=#Dj@L*sCm?|0q4>{~86cGb@iJ%#Esu;G0MWN_954=} z!jlxAdAWf_hxW#rkB`sU`O0RgaR5KS-+yjlK}TCVIXQW2V`CrMlG0-chz7@yX5h2_ z{v8WoaOX>;TZZm(#E3a4P@-(4Ooc)Ky)HWlW8|EBd?$rvZno6yr7`$P=6f#HsC| z&t=-w8^M_LG9tq-NSJN^F@o_qdyrAH3OP&Kyx#l5^ksjUI5f`mUxUw|_?9kobb?;P zKkeY$zTPZ9-wMsp`eeQXKQ>a{M{){w(LFXWzzy}6`pPC+vZiI;BTR`5C#GFU$(gRV z*Aq&x!lIa?LIR#na&n{15-$=GlBK03J3G5lj4vJ_Dx;yH0Z^fKjRnH6%K?a7uc&Bm z1P?>VX@Az%xgIb~!b!OC2?)UZ0cN0j2Mcu|fPJ99A2=Y00zLs3I}oXJ5TjSKe1ihs zHnhnZbCDHnB?wIGkKGG%b3|IabmV4Q~AmtIm=&2c+9$uyjSD zN>TwBfYCgS!O8gc4i6u9r|E&E10!#?-z+h7sC&974OmdvjYFJORQ~)B$jhc09{Zq; z@wmqt^YdqTo}vLM-|rXKE^VIiL_~Ai+7?KdlBB%2Hygp?&DPcgHCE=;MtvyoFeNNZ ze0+VNBnl^qEY*E(1zmimD>XPceZ#c}*zrX%D2SRd8@HX*s-?(L)V0CgNi=o})c4iT zqF*WF#{FiB&J)fX;u8CO!Kt;ntZ96gDPZc&35JG#7@wu)Mo6?0=%RO2box++3&yOF0>O}jPzp}MSsBED z5FqIEmEF2Ysu8GFb9Sv9j+4hq5!~y6!%04c3*ms$TlfpecEt~!1~3QUu->1mLMr|R z5SZoSnws1I9#DN||Me^IGEq#L{QP~|6djp`kAqVQ5Huzu$r6&15kTUmr*{u*0-Kfr zfq}r20=7M%AA>l`Jr*Spr=Ld2^}yu`g;0-nOXjLe5nynz%o8E8iz`# z?MqJx(dg*Pu)g!nY1eB-8@)D9r(Zt-k6M2+H(VR^P{*?f(9zE2Xo#&Oo&!7E`Fe17%X)o1X={jyiPEi;#IL==Z@>;FtsqBI~(I z~N<| zuuyVbTzK*)o*Er4201?=zvuPwYN2`&a8|>sk?w=wY{X_)-~48navP_3rYP2gcs!mKKpul$4b10hCcv>gssfSCQ8ZkTUo49&Rqw)zCwE zo29HMm7Ggm)9yPB;-<$PXIwMH2RL)re@pl$nDe39`9S1tPIy$<1G%g@G((pC4PgMuk)^bJmeX5eJ zsp;U&7j6gXcA*s&Ftj_i;u zRBC`)xu;%}VzQC4J+%VoVHMr;1p<>&=%5NDV*fiEk<+yIfQ zsQ9xfh1<5(aSvZQ8ko{|c6JB@fRUe!=PNuNn;BYtef@9>_t^aWZCTqxBk&PE9-#EM z3qAgYvr6h({gwi7nTG(vEE!1-9Ju_}^VNVM37qw{2!fBGX81Vc;^G2MP5^zcsjhzV zF#-bb3mlwpyRv;_3~-xIjwx`b04Aca^>k>kG7`DLQz zi6`onqMqOb(4bgGvh2M5*`j)6EU!a!O={p|vP|POF@w=+TWvjZGw0I?%x zWyKUT056EfQJmpkRF|?~80~(=2*%Rp)nVZyCB?TugMbg(6?6gk 
zn|Q3m)rCVp6<4-7n}GT)Qrp{ssFFSeV-x2G9(?6XRtzn|+NoymaKPToe^lmGY; z%~P=SH#{%ifCpbaQOD|ti+7mSXCTv7;AnHRyyjhG%lhejG%S~D(hk?Kj=ZeP!FW#3 z4b6%c2kpiDIA2pd!q|E8l%rDL$eDB9fgzXm4diJ;g8lRpg(L8D;`zmXhROJL$*GW` zW*yCjYl|!na$g>*`YTz;cq%ohDOZp(h1JyKOh)fhpe}z(?9%w3Nj?Lj{naEFg4-e$_a&*Q@ z8eGqRr&Y0mTj~X=&9u>H0^ac2c~4noLAO4MvNd-u+3-z^^_*!fk9)Srzx$$Jm>O|B zLMIRR`i|mXE{z+Z9XtI$pJ78od!QhaMh&HMO1DoqhoOrd%+6i`0p?Y@F}U!c60oQ_ z(7$?I`|}5~(T^>Wu=dnMgfm0ih2p(8EPFkwv;B?zxH+toOHaF$aGMY6B5qlP&jP18 zc40@lH9O|A!jPpQZ&GHzSVu{{*%P?L?Y?d!bYbBB<+kwo$Vdv!YY$TXf~#7IJCw>$ zXFVMQ-FA}#hLRukH(h&bkxOSOb==r7)Bza{9}s#PBye$G1P7(r1TwkSrZkwqif{#4 zfrs%wr}_wUsfktlo`^mac1Ai569qEDlzc10j#NuL;5+W9pn?6<6{@9u}Qf8Y;XL_Ex%{5Gt z@yY>FUQcHPYVBZ;YQ#p-`gQ;E;2A2EyX18&op2bq%&tj9o13d1FXpi5of zD%7aBZ@7x$dBwg~v=9nH(A79|UUDMWuH}rF9M|!R1#AJN4KNvc?iMt`L$kmGz51Xh zd9g2ASco9`QnRm;HFTJPq+FG zm~FvfmM2Fv`k8^^(S#dU_VGU&RQ#x@3*oPNjE9Vsz~77$Vpa^LYr_Ei{FLwCWts3N zM@tQgG?)hsYdJ})Q@9SMN_4#5^)+dK4|Y}?pwCb`Uowcr8i*vr&3!9IN3Ddykd!DH zEnMD^!#@B=nfK#UlMsQ9=nE zi8;WxlFCu_hh423#r*6$kof|he&so_)P-2i+Rn?jWXu3Y1Uhd|9ViSSHT99|zWYRr zm)HID>33;Co9^W+=*b+VAU1m|4en9laZ&!(bqY7V=3LzdhBesxYy3a05LJI zaG12pw@i)#uF0h(kIYQ+zCHud@37KV==2}$%afJpeyYS}8%D}FCM!gf;!12%rmhbR z99diI>D0W%3GVmvn~5YRV|!!v*AeT&6*J62KMnpGiOET?c`u%o9yfR(U4N(Y*_-7l zL|k0F=MoD9WaYIx93OQt1vQVSd4*;RMa#IT-;fTC4jXXyVE!wAz$e?q0?!s1>@eA- zV92abkiL2UPG^`vAobToBX*IB8)%zk!0vQ^N!a&;|@3<(JhZ|Ppmc6B{GKTb3>oBcbPw=t4}nU}YX_7ZCJyEvGD zWl>5&!E#=0yFW@602^9cQGs_m+bJAyE`Zw>ppc@>&FumLx}eQGla)Eax|J0HAaHk= z`puhPJv|cRIp6+*hzu}?dv*fdWHarEt;jqI;f~nE$KN@(2W0Gja)ei4Cq*Y z)=m9pG|bCsm>~MyT@26OF}c<1-RW4*oekohf(gB%-@6)MJ?I?@c%!*51vE<`TmM^c z0hbSKTM8>(I(4Hv`*Q$FDhPS_yAgudbO`lue|*>khzf%i;uj=$iY6usj}K>giZ92@ z47Q`FmLgwXd7N&9r=+9@_DzusFaeWdmDL=FqT*}#RVhr0fgKM3*XOjn#3cM>JoZB! 
zo%JaPZ&&>HddtzMiO(y~@pZixC9GH<6*^=QcTPJvGcaE!jPSMyd{!y(9Xp#ZpN=ip_FHFKKR=?aVw}u#*c`8erzMlcE*f5~cxJmwAcaxMuspOlSuhhpm zlkZIhPQX{3lu!g@6$Y}mH+SEha^L#6KjOl2TdFXvA)_jz=!?BHaC{}%{S!OWuSioe z$g@E;T9#55+b09rY@Y5DxVuWy#a*KyM-Q{CdXuTs_oW(Q$-=IvIK=L#p%tyrvYwWv z%jlQN%@oMqii*^Cc4GGyjW`Aj;)9vnrVvGR_RqKmti3RZy~M^g4orlKcy@gfN?e5I zIa)q)9D(h@6CTx?@1Zw*30%9FndAHqVYBPo0#;HaJy0>>$oDvj<8%@2|2BGhttYC zV327WfBXfH!KwMw9y7LYwoORIK@k^1HKI+!K6%_0$R}!bv!^R#%PxCp%2*t%q=9Yy zrrJ3@G5Ni|&f&!utkek8=CRx3(-+|)wkMkG#`5urR#-;kcEmp(1np%4CH^(ivWkm! zwY6`Dra^J=pAr!Zg801U9%Fmqd!Bza%{uWrN-$DZp<1(!d^J5QmmKj*M9B5PUfQsK z=!U6LgKQTG2b+?NhN@4)ZG153qzo%C52aHD!Q!0?o#+H$;N3#Je&N5C2F; z)wG+BH`K^&CySPBX#IsURoF;Pp-RGi&|qlYD0uJh&Xe5EwpuR__>OOG@bNEUU}V7~ zuvv1=RxdK4jQZ2z`-)iJ}K%Dg`L@$tDyDT)quN%dyqYCqqEJnUNnO3Cy9YgY15yIol9RCzR~luztp zK3)4{PySSnbOm8gc04uWv_JJ%Zz;MPHsmgvEA|SlQ`MIgluBqh{l8yh(-_}wA-Ltg zJH&sJ2ZsH%sTtyJE^E-kxcBF1j#5!ls)`!3B3uGXS5DttY&|JUek0P8l@8o`C8ng*?2+ILdN)|Bus7Nx_3w0} zaZd-OlXCR}ZKkQ|e;E=gnzsb1q@@k5y-4@}WbV#MoILh=J6TP;ImSMO*M@Hh8;c`M z<+Y&PF;sXV3Z>g^wR|)H5AP^Jzn%3M&8B0s!_dSP$}oOU zh7jaz_UFSZXqkT6#IIj#UqZse0+_Ww12TT|P(oI&+=+r58hB%-Mp7!q$D`uoYr$ol z-@&YFAcmkX@~YXx%^+Ry|J%~3vzh}bGU_WN@3FKY*AfhPU`PPSs4nfhbvAk&*2!tr zQs_wF(MU^lr{s!zJscY^j!@I!<&byr7VQ)o2oO&%)!G|TwGdR;wz`k*@a7wxZ@%-z zmybSd!$JvqSZ_4nvxFH@kR5IIT8;YtdY`{NQ=Xt|meCHLf@P2q=ZpV=Tdkf^84I#?0rnvd{VK zo3Tk5ndm0|us_3iM3O7f#e$~jZe4JA((ZOSU!Ue&#g{c^<6BwpS&0b$BqtIA z?;dEMr@+8`YMK= z;M+FhFz?EpL*oJu^xg}Fy2Ql9S8bKwHSTx{hiD)%f34A*87Ze+5+y9Ajq+yymjOYAHBI>IZ2IV@ou3C%p%6uF=Ngx_$Hp)g5?n?W7HNeCsVrr1)yh>V za5R&Y=LDBMUZ71?RT}ttLPPaC`2z^;YO-~LlDSdY&0JjYBq0p-TK$@W%c~&#Q`cBq zIJCm)A^BlUd+)o$QG;swCLc>_cmr|5^~!W92}!BH1j3!)QB&2;_p6HP*kQ7S->2`~ z%Lgb^I4zP3LG4aI$cQ)-TFGm9UrpvbD$#V1I0Ga+^kafnKk8=g@-1~_*)x7V^c@8L zw<-i6HMJx6f9T+y_yW!Kl)?$ElpRbUW_PGcLhr5ge8esqZ-lx-EkEv2`jeX>3DOn~ zM$tw}6r1LCLwl-oICgWQYK`B&KSWA9OUMuy8V}{zo6~rGp1WN>RKn+0IG>E3(yR^- zz4iy{c82$>m62{N9u*S@? 
zC?y%=XwF_qON&zy{YLMp4(-4{MIfj0$ahGNDFd@7Nx}$abI6}dcH0+%W?U+YjaQvV z(RGE5r=1-|NDBL$Mw{J?8FekMZI``lhc3$b>J)@@_l26|U#pfkR5n%~$vt)ga{{V< z4}w)N1oI4*q?Bi^*XRsuAkpbI?)!o%raly>LR%v3;$eClZ=H)}d3&)`Cxw8GRVOiJ zjib4ANt(31E{T>7RQj2VVDLkohNa~dbl7ZSlH&`NN#J6ASmp)oCl{Aoh;H1D ztYFth)J*n2cwV3X2(_Q#2}?B+!^vjytvgC>uX}#9@FA%s$YZiHYv2?PHC+(0!!|gx zT_3pl43(k{+KcV?zt*)YSli3+A$73k$UvOLGQ`IsJN!E9b?ts35-|4lyEfY{Ls@Y= zmKBw7b5u@F^Qh?`pvlpwsni{V{kC3(xL@8yQ`7K#Cz;>?L@0FMLv*mPwPR{?8i7in=1qN5WEy#kdax%cA7+b=f zGT}ARjd6SD4xxs>00;6(t`viC5<%#Ig%jx5p=>yJ>m2f59~Cj`(6zxwp~QY3h*V4) z^tb_<*>r|;$WiOxV~frO&KHOdZ^s{R`W_iia*_Q-1q~qkvzws$6M$f4=jCN&cEYmi zHsRl2T3f95&UXb~Wsg_$3vi1|2t+TvDkLZK=H&Ful=gf!F}e6NKEBv| z0BF-~?gP^^GcBG1lB%jBziW|vJ7Q5kMgVZ7<01d_VBtZ&atlbFoBbJ(EOt_=5BnDm zzljA(Qlm^=d}|HVMJV>6iA=O<(nZVn6J8i-eP1W+Tgd7_t_6@x)UzEJuz8K1pY#L8 zM8_K^kqnu!6Hr_)zdfG_yard|*}I)0(wAB<2rxkTaypjcGZH+<5sSn1)0_ld@(Fv; zqZ78`5Q`P<&O(6`MTd~V#>Rk7B|iPT9W;8aHzoOpu6)(kR5Ua#u!hZfJAoeB|7C*$#PwG`JTO5hSMH={M`gZc<$m6L-EQm!uNusj<+V+8&&-$n=h18gW#a_VphyJ* z@hd7?-kZr&SLbYxMkl$nT5xihEho3LD=RKpKtQ@<)!DM&R?u}gPeUcZh$L@02TqG{ z5=)+U#R}5WRpn+xL@wK_T`L}U3f6O9KmYl4sK$U150B&q=8ELi?K$r|x(z!=ZOoOG zPXI3mTmgYUFxG& zUppHTRfzi5+fJO6Z^2=L0`Ut{%x+;}pvU^dVZ1}c82a4LhZ2D(b!tXp9&zyFK+|g% zfY^FC3Z?DQmjA$u$Yke#_I4%}(k3-Bp;Q{oaOeCpGuGMvZ##ht6DfjoGR=Wwa-ovH z(4l+QD_lgp1$!Ek`+*cD-jdj{nUC}rHkSK>nVPRBkIp?cLE7kfdASrf_hSO9BFJbk zHuhlSA}7bKHdzzzyE;mDIN$}N*Ck-k=5c1RSeHz|V}xyK*`Ax9Pko2Ix>6q9)nK5e zHeY*i{YwdOVD~qNoZr7kA|WEeB;#FJY~IY3AHwoumR&nouUZ~s<)B(WtuRWszwi1X zz{i}h*+lVVp1P^)yE`<=EEF3lLJN@zh>GggbR<`Zo7$rM#JQxshcQ8Z#9ubB@UKxA zSdiVl{x!*A-g#0><@4dl_Iq*hFk()baesdfho6uHRkcBvn5da~rT6|PX&BaR*`!y2 zNouJ$e2p$h)}Rx+P}iR;FQreOnHLCgp|};=cVx)h0@{am{lx(^o%F;6+F*Zi(^X0s zX=Lezo07nHHI%`-=K0{}Lktpl0d=!viWX4xfy5)mKm`Fv=jqF`>_?AWFWA^3W4TBk z`=Bn*3@!{(8-7Jb3p{gvyHz}2GtsK6D}-Yb1O^^ET{B=h#ShveLau@a9bD*0Bf8u949 zCeSFgOCL0yHgInCV6 zdGLyo{5^#`*VG@<^6eA|&#GS5_kK!T%80gFp3YS}BD9CdE)%i183nSO>Bq;}g3T4? 
zt2K$rq$E{`b7j0ZZ!4#g#>VivI@zzM3+N;m&sODK1($+M#SIaj6ZW# zXM~WtCSDc@w^s!B?T*d_B>#J7ZU7U(Uiru3r(m$b@34YsE|RyJiYOLO~P zp)JAGhJsvuIU3$rYN$o`<< zdWEU<{G8M2@pPo4^STcv_6a3=Z@tFRWM@ke%s|j6*c7Y@CsdAUvFvCXYFf|8)!j3to$+=ibeh*x zm-3j}DZ4+VU#=*l9Jg=eY7ov}B(3tqD)-So$39H=AgE+^$+zRyVGaLSMlfYT8yWt& zjlp!BC&V|q`?KFWtB80P#WeE}QWN_o0CUE0y895@(eA9%0sFdeW5fN-gP0zy<78aJ>@h&Dhvrv#* z05Cx37X;#xx02%B`fU$yC=JU>4^ufUO;~km|EzQv0XZSBeeY2F$NWq~Zg}j=D*!3m zn9k0wg7DjP!fQHSJS@~UWUP2Rdh&m9lb0t;85td71OW%0=X6+bM^2jWk&a9toZHUg zXOMT~ott0l;bF-s?RogGB|EeFyn>NoW!6rF$YEzT-K}V`D1shcJGiRtAh7x?Td=EZ zSlG&g=ef%GSoreqH0qr}LY%B-0m&C49BTrMs_}9-QBN(jVFL_jVTZa4O}p zByKr$8=VgFn~)i9AkGz%a6iTfp0}~BwV6-6WRc}VO}2?)ex3e#3+Fkc%e#eucu(#( zasSJ0nqM<3BkOk8wR981P-Qdx38;yDVw?NHS_2(K$o<~){zghxPyKFOEJs2D{)GVv z+z}D`cM@(aV`FLxm=CpRCud^Gony=JP->J1>F&Ns%NPB)f15q#iF@_t-*5zbSKth zy-1kovr7wS_2_&?BXUQNXl#SGv*lF>wBaupe;BvXy)>Sg`K`Ln4oP<-sO0?M5!RXc zj7-W`s#7yz;@Of=u4V&>&WPBNDf)EM{==C2X5%q0-rHG)?XMBryU3E;JtuPt2tlmo zKuFzwDh9Zr)IZL5)oj}YqeNNmgpI1;(C*&rzTQROAKzT)AAvt&Hx(sdo_{!5m;W%Q zfHl&!F0Ay688x&b@@3opgv8-Qju@k+O`{{DAoD>|TSEi8zvx@G z@tgC4>pP-thOJO-e+hDMek`a^I_2_SVQKQdTM`>tXum>Z&S={^!03-#i7qvEeW* zKckhUAGb{%DNJ$-ZqZQ?)(>jB5Wj}n?9}UnGd8LbPCQ(`Jb|IV=NVdk4rcD3Rjxcy z2JFh_$hYFvI`h7F7_Qnu4jz{DUhk3=TvEjZ8F*h6T_L219EL_OM=;}@-=kn*c)q}xdouWqRlvakVD_pHxigh)V6U3v9ATM81izPVXZ@kh3NayBc3?uJ$8^XJ7P zO

tjpC9%WjaHkzW@l*+}tdpu;li3%j9IZKy(sqU0p2flUJ|v+JAjX3=Z~ma#C=; zMPg^yBUJ#Q0gL(h5FMQtY;5DKH*3E(`=HGOe~(vVDk=`!#Fvl}q1HuU8-Tpy;(utr zo#Yygnf%ehe518yXOc6s?;(!Uai2Xj_&f#CoRN8v-!cQrU`Jn3F?o^YW4Q8Nabv4e zakGflmMgh5a^)nlOz5Ji#&~AUmvvn~@<%QsH7&nQ<3oJI-5gWI`sIg46K#GB&2lCp z)9_6${~updMvJ@KN$?P%`EQPU+es$V%N(~Uo%LOWsn_Nn-#B{V!?4oR%HglPP<(@5 z9MO@@u4SZl7f>%JWYBKsnP`9sm5Ad6LE3ipx+`Z}+47W80H4;juwZ&hZLj*mh&S;& z9S#|QsJvAwJDU)rs<&_=ZxGdpvMK^NDgsbUD8?4Uq40KHY&Ze0nW(Bb>$Ar)+zD|=%fLc z9qg3|D zJs3bpz1m+}DJ3y^#6hdxTh_iRYa?uq6c-4W9-24f(a3qf-D|{!*^J#WxbSQAEs5~k z9aIMPw!fF&1Xk+^3#1)Foe4Be*dn)oSX9eHOOHm(%Tu*lc4Kby!0kt;(C)Z*^~SeX zQ~q4#_n)SnsX{#k7{#MzhjJJ!<1qlw1knRFB1(h&Dk-*fgRbasCBa~VbVHp zP0|#zXJT}uSzqQLEG~I+zq{=Iz0$?T7h#JsI>fN(yc8;8Dm5VUE(IbrC<95;`Jk@h z%jFfF)~z?f-;*ugGFsaCzt<4^Rg5TkUVC173+lwE&rE!eiO|?+nNj&Mg!C6x+(7IL z0#4FJw~F9VWMheRO=HG26i<_gtEQ`$VFzT~OnPsYlONY!x3Tg&3XMa*&^7X6x)0k4 z5n7BIQjl;{HaSti!1Pb#1m)0xWl-tFTxs-vo#?^lSc90Q>2JMf3hT07nPhv)hzZb< zA|XARe%{^Lv1?d#-9`MMt^I_fPKj8~?YLX-i?rVw4vvT@#q3x=(LFRpH2^`!V`H|z zi$8c-MLL7|=qFX73HKbZ>Odk|(6A zZU;t0HB5q#qua*5yN(8Df(6{>Re{n{DzbM)i3Y-w$Y0vNe5O>^(n`d@XRcDl%3?o| za~Nf*v+>mTVup0x>GpjNdCu9gOL+Pc(iWh3>V~#R+j(!~F;fy#oS7L=dZ>lgydWO# z%k3S+%AD`OwlJHM*PN+Rw>a!>IYaNdrTuJ!oZYEUl7Gp`uH+$mbyFg)bnC}fRj-19 zAe&HLzti{i%kMQaa7SCs5APhNe%7(thgqR*H_esS&g<%ZsC(?CePyjIt^Co7{)1ha zeNoYn<*)tj9C5u7Zt4D@Pb+#aBfPm8(3;tfd;JQG%&Ysx{hVLkbm^X42Y0E(N6e5F zOrQ#E+*d*zbty#KY2^2W&bJd@yl`)B%LqF1b(TOdvV@x_Oiypmcz!zN!=2itd7FiFh@Ex(>kLn56&d>|^n0$EON zZigVDptGwBQ0m)wCW?8WAcpJ(t(;lVNxl?ybXgfNz!~Z2t(RdX(tWvM=H1LoEqdV+Ht#T5bl@sOg(2?9@S>xAe}CyX=MhF8F3U#sJgl}m(x$Ly*LA{= zcr*9!*&-a_56sMr_4G8uZPhNNL>A;k$HnbV_Wd8O-a0JG?cE=yySt z5Ky{H5Rj0Tkxpq)k&;en5Tu9h4k^hYBn5_U_$|NZocEmT{d4c_=E9lvtmnQzX=<^` zEGk;uI((A-pb!7}@<~jmslW$n)ySXrwPp|%T!k$Oq*3fsUeQ;)yLhO4*kS6+BCjS;D_we_SocB|j03inlGbh&Mlmm13>NarG zRoq!uzO?ck$#}4hL61~TG{Sl3g^Am_VM#sroVYDLW5dkM>;MLH-JL3|>H-A_@lPaJ zIXJFc6Tj`xKyz|Brl$p2SPHFRW0?}mOG^uzdjXlCPzS`Be*EbGBR(-a>5@Uc1-)Sx?%&JfaXYEmPVR7iD3mD@`X}-k$FK&p3g^5Ny^)H+=EFR)2l07Ubsx=aZi*%50!7 
z*k||-ZJS@EEk0M!Os~x_1f^#`ez*aBuYY85vAwBj2BlRh$l<^c1Uzv{vl0i*2XYTB# ztvb^C|2%8FJEv%r43iJKo&Hw;S6SW{nxBvV!!YAWAA>f5?ejzW7eqJ1@+#Hmt}(|U zpZ6-t^;Cit8E0g;OnFj9ZM{F;{u2I@Oa|o0X;Fg(IXR~?*~TWjyl5^2N|ARw2MT+M zI1CImeZ`&dkUww8ai{iPk};C|bC0S%iYf+9V4Z3p*WGem9I-Ce{UT@$CedP$L^L?d z6=acat^8IhEMM6EJX?gfhQc#>Xd=LgjM`Y+7*}d-1v}h4GwI`*QY|aNG39uz3bVXS z-O5gg&0~B3EwqVdEO=*Q?h9jYxMCE9F|oU}a883x`b}qQ(8aKiY&ooyLIVe>F5T6a^JoU^S?UM~Vg?BNP-YT68k|ICym2 zc7z8OEsM>Y#Uj>6!#c*CYIsRS?i*C-qxCKs@bol#%Z zf8RmMUc3J|#QHoy^2Z5fVjtm=BH0nMsh)gUnI-2Lv4x*yn5j1*St2?I0_0XglGD#% z!_%o?j~#V^7yamwR+p!PqYYJk!K&ldO0J@_E<=+{DW;d*|1S5mg1g7qo^ z{fMUJg0Uv22EP2ACeFNcy^AQ>I58c6W%q+itoO^B*}9YZ<(p*;b>aQlvSPD2zpQQM z7|f_Gbek_xqsBS~O8@rK2Q~OsfSVq-AysYDmI^5F_49)^HeOBSDzGp!FWfRNXoG&Q zvBAO5pviZ3_T84a)ad5`h`u}r?t)k1VcuNd<-4P@p6O(8zS!89Zf@>eeEcN&UFqqd zYxfd9ByuV7nlCocA7tCXh>Pf3Ym~9{b*OxkpDc-qS?+HMW1*hHFL0=Y z4R7OBB!`G}WrxZ_Qgn$voM~K~YNI775VDl+3{j67?B%nL+Md7_?|#AY6XaL7i}b6F zOrLSM(|M&G$LRT#s(GYZ)-gg5A#@%A-x5s^{Y+mfS!BumsrOX9A_JHQ=7ADNQs49P zTeYDW0^>6Y(+I?))Q|-&E2BL59&sv(LF2c@rqrR{`R1dlpzQW3_iPZUma*|lg^Asp zKUw_=zMhBNkG+LrjPN1W^}m0;9v^BAs`3a5kmLp|{J!LmjT6e*i7HpWAaKfWh(2Z^ z575TK+K~3=RJiPq`xO-}&d&a_zfswlVWjMX$QJ_pN1MJ?Ch@|5{kD>8&X9G^g+<^vrbClW2D1J_hw@4zWS$ zg{_HuJNgO)DHmE(`fG>s2{sfITLBjd7B=?39=E#Ry`V6uuDUuX4wzT>_v())+TN3- z4A<4xf{P?MDM>?969n%&niR$cUs{yU277x7u5W!TE@h0sJDxj!etygpD0K9dNRQH{ZeUrp9`uBXqS7< zN3lJ=dIA?EBKMZp)O@b}rTydrl!AkB_naJh5vvf|w57pO=fzNT%$M0KmOr8^EfSsx zIi&vlNt0y!gp}06(6AJcq@|@_GVdRLeV=-_ya^m#oKK%xy{lwnW{$;srF^y0``p)u zCqvj7SWdnsCx87y^q5jiE!XM0iIESJ!Gtm;@)sS5%@KAku8s#bot{*-il4UBnB3BL zddDcTx{1*UQ|BM*W9@zFUnAR2@kd>ue@stEp^K@e7CI~Us7=nFck?>>_({XS;G_Bk z1%8V`;yWJcCTN473#C>`576Q_CriI?;gM{!*_U{cpIxM8P#Bg$u1>b`dHV@#x^s== z)Ci&Qi&&bz!wh;3X3BNHqB7l-o3CYWgkWND1 zb*$p9V1)PynAAS)KNMNnctSzZ2psL;y)j>3Uk{pe=?O4E->|%Y1!zgUzPVXgTnseX z1lSM4Lr)Yj5Su_GX^GNg0;Rj?hQ4h8TgUB%v$eG~{b`oeE)5|Af!j`j z8D3)lhH$ajgBDG&%pY?6Poq|+kc_I9v?!YgdM@G6ZW$1#IRL7SD^!)zzeJ1Sj3w@> zbxXwjWM={cu0x;rimR^@k2#Kq*K*2dLQVz-RJq0aCl?pe5&?IS;H2&B;YoWrJTBCk 
zSidIP*s!j&kdgA*+gnbTZOjSMWdJEXLBwd{ODKXt{u& zy?yC>yNe;OkWEkc`LnSxa&K=qCd$-oMU1W?-qd5H&`59t zuZ%|80Xd`M9t1W@=g>BEPPxrWZCtXtUugj&4t|W$>y}0OEZg}R6C*eRZio>xXm!7( zU6&Hc{P4qV?G)f3?^eg()`kSCx_W9F#^}_v z%EKunzPLTp9ozrw-VS|arN#a)6n!`0dCNYXtE8}&^<85oaU6OAM=7WO>8?icaJckI z^lH(#jX=fy0bT0{V}hXIOWu@_kS+r}NLv{&(*gf#8|f}NutB1t=Ke)z2Jyy!_Vye) z7C>oRAn7Rp5CCGDl#~PrmD0Yd9 zlY+qMZA6?G1cB=n8ynkF^UNOfuc@o!+$DYbL?P^CsL=612K_ZP1?vKa>3<{lCPM)f zDhb|+yu2>~;5GCMe%8~YDEb4mZ+dKxfr~TXY=D&&OBC8|&+@*%u9> z^5qjN%F04p&`7WJvh{p?Zh%f1wDti;0sO+iQZ`ZOLyhPX`M5bq+}HtXdMT)=>VFE* zARo0GR=lsAHLgR3JZcN=njV0nI?xOcDyu`uh`v3+2=)#HsPBZ|6KTKsF%ag;CN$Ty z@UL+B%BaongZ}^!U4SBBgsX~zg6S2n)8QF4`17l)UWLb_miBHz$!Ndt>!_l`j_16h z;1D~jDw&VR7^xo;4?h4F@%bo%3jvF$D0sH+t1K+OBp>Q(pQviNtq&&B3JcfP)LQK9 zi~v)xCFCl!%zzGL2>Bf>1Zkv`X{K7fzjjZJ8%5(h*V52f?g*aU9(NfV=QV0L?AYjM z14Xq!>BDCec`}yiNLWDTAIbgH;Bs%qYv`*M1A{kxT~Chy*iAza_Z}l{bkP!SnI*~} zBPY4hT3fNZ$!QbH@@v&f_h1hQVcwGPcSB87oHVrkF4JuzBjpLP?>4SNJ5g{M4j=Uw zHnYP8F*^Mp5@b8Rvp_4JR#942_U8CMAfmqPF(5No7WF^!_u(;$l$RpL8eOHCTIm3AeN6@lZB`LlrHvSrK<)y2Z?!^}+C9IIx= zMD$s(`{ed$Z|@&q{eJT%d;3|i5ugTDzjVwHWomkRcp&TA1{#%;L=wspVvyB$`k}}J z`3K1Tz)?gf^90SVpyc&dD<0thArV&r92)^gveoSDhK9@JrzSynlded?2nhJ9xs@OH zI{)P)e6p64L;QLNRN1n(Ai(qEk)^9Ee5&xpq$Z~>sAT@GnMp);b_fCjK#ZH2u`yNb z?v2u$HKhCdd-7ZX(?ITGzv6#&b~kBh^uT^vO#Cw%p&rl{0YLG0gW>M(ZmHn=;X`f< zczD8JI6|7geKS^3!63Nu?xeQ#1)Zg!*S^H%zNe>$KE9&D`P796ycDYU{HyAwFsad2 zWJMSqH2fbP z7-(s+gY)$CZfT{V0`xG5rKRF)#=y+I!9+83{KE$4MJhxHQ^*g(ypY}XYOS)o*MI)h z+25Q+SyI(`pFau@;%E!_a}_d7DU(_~5=yN^2DKs*iJH;|d0#EG<3iV7$dWPYC(V3e zS0r?^g4qZsY7pkS{#{+N<~@h$|G@jnmPJ9=0(gehqv%(5Y`g3k#V4&6_ zOT^vnK#w2Yk~PDUvjEsXF{CqxzY-wN40CjJST657$T;sl9Q_D0dr_4MbJjCAICj6amc)~J z-1*WGA@||IB@62BK+Lzj^kqZ8`@7Qj-q>@R1F7+GW4nd^j6Hi4 zoy#wA+rKW6;3YVg+DcmYcs52*Kpr(aecd3L8+lFo80WrmlEJTZv^cCR{Xv=vXb`Vs zyayv`0*x5a@<;S>09f(x;AJjh42T0K{QUgSx5QYV9B0d)iHM*}gS%6-E1Q=JRJYm( zIV&hAsG!RSMbFI4APxjhJn1ojKtv@s!7zb_l_~C33LN+0;izyn6r^SalpqI;aZs8{ zA7O^D06NhbaXY9_^nkERSQu>r^ok&m7(RcFHW0Wok^71Q`0wrPw$IKy5teIe_}NcE 
z=P}GP=m3=e=DrCUMo9(Tvp#R$usF$@BW%V^Yu}F(1FyHMm+S+u87rXt5<+a>AEYEK zKgR9_2|4cC+C+c*`uq2Hc9@x&?HwJvc`Q%Au)O%!xxI@}228^QO8YLI)WrZ%r(OVE ziX!soz+5cv{Aq9#9iLV(FhBx40_9U&J5jx!NLh9Z3Z~B^=c7sZ;18-o=uneNvY^X$ zZU9Yy@ySn!7#bSFs9&krX;RUlSB}ym7+laN)tP69flUefQRYuSgjt&i_vB1dGUr?de0k~cevw$b$d6}X z(_Qh)KAs1Z>7z3fbb>DVajT_?LV9&LoW#-|8|AncVswbF8qB39-^HP>QhwySen@Im zIJCQa=dxQe9VbhFRVjIVb{L~@*}(KD``vp3_9XM<-R_Ubr71NI!svqH$W^H7^S%3k zuhU!AnOU9G-Ai3XI2+!h1+)Lt?@jjPwH^}El z&}IW&y%U=kg7Fd5b)%li4*C^qoW~U5 z3guIB;s&)%N5;RVa3Uf#b@ebCEqHQD3OWwj`oAm+kW{9tOY-TdB3d!pPkk*d#C4^X zMuuaMN2H()f(rEcz9Sy zNC+1P2jK)AD?<*XVo`TlAwvQ>f)JIIls=)--+-|nG%SPKdg$^iB1Kpk2RFA()#er$v@72{Ky>Gy(g75DavI=cdXZ~_< zAW~ESjfoNqsmua-@Kz3c%*=|1Q%Yo0g2J=`H)q1Zt99{i2@CT4_esj1Z}%3S8b7~Z z;fy?Bhv*D2cV#>?mU>gT(84{cmk79520BlPB#J~I*kOITU0!#B;?3%BDmO;X^{az7 zEnA0P;!sE#u(u*Bht{1)+03Dcnu}3=7(80o-46I&a^(F~8s_nh2ufOD(Y+nPLF^%tEGb3#cGDf!k!UH28 zdyuYR=Igz7ckm9p!H$-uuxg(MyT~LN?v^%dP)ijn+ud!i-LF1op!Y_g^r1M{&*%Kg zK06}$@T?EhlHL_jaQ?$;x0VRm+VFhNzjf$0Ur z?+9zd0CfTe?YzW;-ku(W^Y0iadMFGaFCbm;Em4$W$VqB?dj9u`T~BtjU|HFFS_RZL z$T8#7r?_82LcrpIz++@)CiUzYVe}0tL;-XPFqo2jf`VL>#KACi&_;zrnpaeW=8aW& z$r>JfNmtf7$WpeB1y}T2{~174VSG!5e3VUswsOG!Ki54z^q5YDtf&9GdfIChR-K8N z_IE3>8k%FeuRehsm7-V7+S=NuC(+>js_=a-L-K>3iSMVPt%>#ZyXtE6-@iZPD?6rr z+p~M)ukv!7O2#}vmLP=JQ&;ycmU5%FM-la!h(2a#Cv0Q{pNa~(NS#t?6aRd(e#cn6 zQT+UG?%QG)KVkOb))yxt#eg9vhS!mLX>JHpEK(VB3wWp3=&9+y_Nb<&S>W^c-Ahzo zB-N?NWaGEzo;%JW-l8&+czKg*WGOJLK{S(R6? zaCDKpP%6%##b6-tQq|~qbX(sb`Q*E2^oyvj(Tr#JFZ(cX%dW9Ovb7X z1ynxEH2t*xsZgQI1oPiBUzdC&5YR}XJ{&ds+@gxw8$Kcqg+&?s)X%m$@ZYt-Mn3v@ z;|zO3PL6IGhnCmcU#+F1lbD`fR#=EiaCOiGfKVjGztbheSSzDPvG-)qRnU#)cLfK? 
z!-vzYOi?$axC@XbX4QQ)zTM`3Yieeu1xMpSHIIvqmT!yoQbv)ULGYhk*>LgN9(%&X z#6$uG5=PuXHPz!lYANqS8u13GD*QQ0)(l971c+-O^RHH4Z+d49jZq-1Mqtok{|fGI zfA1^|(`AswG`WswhFl3U3BM*Km6VpsTeYDcfn(eKM=*oHoEK1i;H8lF$3^=;_hyP| z&50XBRHUysDe}<;&`AB?gY!=cq1d^%WKZK6{BHVEN6PsiP*?L)jxg)mo=3iY@~T1A~M=noJbn9wOSLp zIY&4a4j}N+81Aw{;w_(f?XnUc+pMh%>xv9k+wcCVWq+EvM($^S-ySy*MZ{|#$P^?* zRahMbONkgBm2*~|H?0|&nR9V+G8{pADGk%-aKS`O1M6+Q_KM_wrqia*muN?YJc$Li(T$fA#D8 zSEu?x|1?_ww2H;ZC`+Ig!K-nc-!QYN2cj8nFvX!B@t8H6w)%Jh!ME&iP@8P6l}m4KyO1I0!rr#kgw$Rb1_0gS-Tl4YNGjs4EKR9w&?V4dJF{|H!tVRAfNT^DjR(bn zh)^kFX=hDtd5R@b>OX{L;MLlM05zy^x2iow=~;Znxyh^dz0E(V?kCPO_&Z`DycM`y z7qKht`;}?Vy)BrKB&~=DQ+tGVgU0(6HH90TX{{CEaqsb&IAo!uWJ>1=VXAfL_}Zu2 zU5^&|P;ER(jD&U=ahrf=XrB9JY>)R)_l*y)l&+Ef2KBCmMebN%Xf5mmcxx9{_jcD&XWzZ*3fbOxRjiaC@7pseVk zIc_~*dnXFK9Vxt0Jj9)){hI9i87i^u8dX#{ZAo0>W~IQSea2+2wZ%>Xz5 z;=%%&Kc$@~NYeseS#fFU&CLx!&-ZtC>oES|L7twT2vDuO=O{9Kh%?9&NJU65@*R_M z?>jyPfYqVHqt!zwN5?WCM;}4UGfV8`jeH|Ou>fz2=*EB|oROLN=FJ;GqliJ^uxx<& zd;rs4j(yG>!JdHu0DSwpuDwbHazJ2p0CE~Y=Yb@wJ5{SU#cb~f+`vFJG&Gc!mIivp zIy>*i$tGIb+N39hOyxlUQIw)_z0+@yBnvAA31BRooLH)I=3wjsv!e3-e6XaXWKM>{ zN1Wq}3o^pD$&@0Z%mNjWSQTaFFOMU1Zt8bKqjvv^oLay=_ig2V;>Ti24({t7O=36QF6^`Tx=#=jtg+LobUS0-#hi4 zdQ0$Bct1NYn4MJ}9)9Im{NAyMN`0zub}HT%m2Jgem5kAmszl=8+y3_Z(!A=fjkD-x z2Tr3%Pnomhwm-^GQ@)Lx3dIetdA~^v8yNgi(hm(ORVzgheXcoCV4+c)n-tLcTjf$T^!D|nW<@cVPR5wI`$6UBz?^OT$O>Y zE&|c_q9Sefui1!40O05p>iXE!BsL*5jc9mr?wc6mWC(}xrKJw~ z`m?~XMG$P_1&|usm8B6Zv^ZokY(~w_%cPv2*WFPiGk~>R22yW{J#5IZcX_-zRoPVe zZ&(~$nztaPe*qP+{W4Ciszhl%3+L$UY_J>qDdBzS`_Lb~A zs-!X78YLFRd2YO5j03!^(M?}i1X+|~E?l%j=dQPNl9!kyNRgFjuwbf-UsOp&C34UP zDUopMco#38tl` z4O&8Il|72Ie8%^){mG{lfOZZdAvEb- zLnkSsCL0@Degr*%W(4N}fXZ!cSp_`{3kv}#-to8-Wv^Z>{{CnsGbJS=i-n5|n@!&N z=W=&=$@0IeV4pnrvd~nY4{b$*qvGr^=`ZfO;%_{(m5jZp)@8jn8k5Vc9k*;iC32hM z@EEX~#APO$fgh5)wR+LChVl|$nNN!Pk_G8T>Ybr)!TS2h-5n7w?me$gqY39y3grg3 z98UqS(VfJlO8rYZQs)D$`B$gHS{?v3cvO~Vn1KNwDbU z2C-~v<)x*0uh_x325>#|3A}aWV~}gAkkjmgZHk6~0NAH2wL%9+NAKM!R9qAiWwvDG 
z_>0|N=u_nh_(nbb{V`EdVB^Rg_5*vb>~)%YQAh~V-eX4Z>%xKp5>nF6_YLos5t>vB z#^Wy;coB@jpAC91NI*|1-)PDcHZU+CXk^KbKrd($jn{s(+Wu%ji1390DYP@lGzV`| zJ}-pJ2ZZsGkXDj7aImqV4`JTn8*QMDe=sux*-=oasku3=*%>v^NU;^4f|Agm!T?1- zCr%(Ll2P%4$8*ttf@8K5K%Wx^&pM8r068Q3nbAxg9h?HsAle^8v-BH=ggNEKZ^6)k6-f`xu zhtPTqdDZsVGDw+3q631VYBgculc0LkI zezzew|0h81@*_6K+Ky*)d*DD4*X7>c4wG`}(b6t63(GNYlVNv8SzX~3taVB#%Rg@P zwTB66GXk1r6YYDMGO2fwm4ke&8>>dzZBLaLiDlY`8)#vX!6N^@W=g1?M*fUHI#g9r z{_UH^pv&$Ed_Wy96pRF=>_GRcn!j`;!(2smiqz!{=5O**{6NO!sS9?WZEY|_tF%9? zq~hnc(%`p7B)(%J?j%8zKc9*zVN%#~V|9e*$KUpj13n1*M^Th0oI_4iYviMAb@A5S z!s=g;cBFq`NSKL=<+47Z@%hZ^>HK?hB~VV1Cm|wq9Px#$_!Cvhu!#OOw-6a1IBz%B z2o`C}%j19cOp0@Furo9T30bP;ebU^tZtg4QowL2WJ4P|Fx0sk>2)i?74gEdLlD_Ye zEyK01dmDaxCDC``@LE9-0txNuijj|I8X>0i^-3dP05 zz(@iR%6}L?^?TGuhUnN>Nsq1AYWN&Rj}8Btzo)CK{SngZn=+DQkj&^M78SHuRW@Nz zD)j3p9dYF?#ur{g^zCg_i}&^r^ngnW3Osd94Y_4%#!9AZLKuT7QPz}_Zqw6Nn) zI0a#2R{T4VFR$>Qy=EiA=PtNt(^qd-1pAD!khRz|NL>%F1)kUYi-1cWQ$s^6r`~iC zIuJCjrV?pjFefhdo7+@$eAF2`=rWN_tIE7rJ-wb){Kj=WD;g2eZall4g_RXW?+I<& z+}x$S;rzW-H;7L?2cIIF;I!}A&gYQa-JZL*%TtR(C^RNdGNOA|)Vk+Eehn_llVb4A;^TkLC<`}Z*XxPiuyE1U^QT>pbZtfP zRAgizuJ)^b0mx&00XW60Gtjl1A7g9X6}D>-GrIPyDaa<0kM`~7xk4WyWazWH1kO^2 zdei2H4nvbdySWP<>>>-W@ds9XYls*5zsy{6?k~rd_fOhwqW;WQWPvmtVz%bZP1DoM z_A8j&dUu3fU%VHEWMe{A)pyH+)w|f(;h`Y~5|WR%*A;oc(k?Eta&ue0Yd-m2A!U7L ze_+-!%b@bKfR6Io^2BDY&A-LOnCcnDh;@J0;j)CFXYoRN&kEoN3$GML4Vt7kmTWu% zs`mM5#56*> zY}oX)x0@Rn-y%f=Bqcebj!3QdbmxMRZ{ID}KSx-RXJDAquexXrSXS24BbFNw7RD~t zy_qTNCoE`gb=+ydL_Vt5JMCt&!vg2gR`Z^Oa2jf>&kNy%IgY|0Bfp&(GCgWDHkFi5|hiQb!`_rdd;^E5gb8eNtMp_sy|%UCVPS0&G?|^2BN<}E(!bSNJpghQ zA0JO(sP&>2E`19>zpd0>w{kp)xY*UtCW+`~NF0Pnnp^SDN8CZ@$ z$nZjoG~azT75Y&O*a>X)^lC-X&zd;4)d86{s{ zQO-nw$w<`IG04qoVi?wCb((@?T`OwCK$J2XPQVFf3Cw!|lL;*RSQHM)=^Nb7*N^o(R^09_l z+x(7{1Lz(Be)+lb=o3YSy-0a4Cmv~ezEPrcZ9NHEP|t3>E}Fd%Q+SW2=IG*a zh5Dz(0Sp8u=$d13^;Oxc#i>4tb5`C>wRiG>r$akDrv;a>Z@W%c$rr1Bz)k$~Qq;!3 zHqHqpd6B&s=`*61H74wLk--T>%Bis8gS53&9F-D#>3IoKJu#=ZPS^6y#7EA4>HSS~ 
zcr;{qjWF^G*fAhu==+?30|T{`J3IcP3VL=3bi&(BZ4+z@efT`m+&Nm&`RIiT(Uoc9`3I83kt$Nfx|=cmj}T+vp? zxk?b5$p1T|P`O)k`Q;ZfQd_r16q>`?BBigTAwT+90sY7)D2Vcj7I5sAk{O2P=3s3< zFE8iD#f?)ix)oeXd72fym#V zH$(J?mOPTQdn2GHEjK{HAtF6zmXusrU5!&D0{G_GXQo`Xmp(v0$>nOM35Mq7=JTssyAm7z zx4-!vxwwesApcQD0+Wn30IPrn({*{bXU72Xvi^Bi9G;B(@Z;Dl+*)hy<%)T;Z`)-} zXiUR=Lb?^qzFNPs?f+z&j{M3+YEfk(REC~wP?X+Y3b$V1@I6AYgN@jpyT1-C;|Ay zQvld?Q1DSqTmn!vk?qvC)}lS2qO`ObKz|5F%DYDumzm%B4Iwn~LH%xxlDBTnR2f~r zJ+{h?>xPz>&4648Vgme?KY#ucSh%^l0lpcesso%IgwkCCSP#7Btwh+^`bS4cXJ$wV z%k6>Y2RMsg!O_YMYiuwPpkZMse2*ZNI#>6=%<8fD8|n(qA#qtdIW(Je&PeHxnhwSv zoH32smC!d~3w+1Cu*^wzNT>=1()Fyr(P~T(eOHJx#&@hSQlj$1S6mnj*nUYKLGMsB zCyhcBa5^n&=n1pvr5QzaWng4eVJ0fXJ;4}fflir=7L01wd@);ppXU}zk10zR1WMj7dAEdGD5=(2OeZf4{r^DwfGl=IR06Wx0;yro+W9v7+U1@P z8#}tsedi`zi2i5w7J|5dSXID}n70D>Dkdg|oSb}kU?Awv+>|H3fIur0`cKwG=CJY1 zh?{IPFDEDH@rS^6SXh|Ocg+Lv!7rY-2AEqegpdL-!IUq>YHo4y7FcJ&^9njM*(f@l zv&lIygvXe;3R=~<~s)mV?d8m zQextg(}a_409H1r2e(t=>CvzKrF9WhmKJ$y!CS9Y{%%8UT}5+@qefRbLx^ZjU3hpe z$Z^m4yaZBlz6m)7t$=f|3cR__RiM5AK=eRSker_068IPZs4K9lGOPw&g7qJ$%|Ic* z&Drk8{{A8;WT6g~R&b0^X<$oq!-2dRS4scq2n@A}14)hkVsi$`Q zaR(YXK)3-lknuN7;2F%#wE|kbxk`=loYVDQY;^R9f`WqQ>)-&40M7Xe>Wr%EI z=LR0Nr@@2XKJWk)f=W<++YyTY3iTH(sHp+n#>n3qJ33_N!^!@FGX?hiyu3WzPU<*6 zk?Vt=C*S+=_1bEN@YpDPUleN7N&A7Ol;7YGgK`tHvG5CAstoY8&x*KFQc@D6rlha| z%mw#rWhGBgYh>hr0qg((Iv`u1Jup!4H+VmxqN3ivX{UHk0-R+%WYd%%?NqUd%k2Md z>)#74kU;TiM5~>C*IcwIs%O6e6}#ZXT=4X+=JbKUg8)gyZL4jKid<*0_HV-*Tec); zulAY;$4uU;WXmlKnLP2kG+h>brSmpz63uyoVQd92VtG7k*^7hy4=l90pWKh>y}6r# z?qTxrU1rU5i=)B_&S3{Xrq)s}hhAnSvr#Egnyh{f0|qVruI{y{tE+(m|6%;Hef+H2 zvFPr#A@4*|qFm3f7u#$AJ^Xh)7hB-2fHhr2Sa_x84K|-Z2itr&u(@}C#6>11Ae7}m z;B-<45pV!?kF+#j2PVY0*jW01@-tA-3Q)0Ncw?ielvFD?27q3X)&9q|3OSM)7FxWy z1VT&jlP#J(-vUPifb4b*HG-@#$jQh)>NI<8NFV~myIHlR+`D%?1qe++nc~3RfEBL& z>Xj=H9)U0KUo{3H&a)S4Y9XPafFS|@j2nrMiX3{bOzsP6pMYoqJn)x5ua1a_=(=Vv zO3`A8tMCjUFf%J6(5L@lXw7(I=K9F!6`4D0+KZs}T-qHSw{w`r6qdUnWkT`!bF~*Q z6uP@C`2bxFoF25`91`YC=KtE29b+ci1>^vrh)|vfv0_WzU&M$cdS_C~C|<9Zrr9tu 
zaBzSiVZ?i-so7aFs%!vlfsckx;ypJ2_>E!B&EiyX?~m7&a&V>*DEMtKI)tmm1$|t} zHfzB2LdvC2j|2zRBj?}%A~r3yWIcFU0Z6Bl`rueo6VJ9*Oy)#RDfdnU_?`anr&%Q8 zq`snQgOFhTP+HzPXC#0#6a8383ZW<0M{kmGW&+pWlP9;&tVcpa#~Zwj9?*ou`l*Ae ze^%eY7IchCnuz#cZ`Jk>c_m66dadj)?N$1V)2+A)pDN8>yx2EYm)dvOw`m!e*4MJD z{AMevgl>30UvwHZSrw)@@GXBP zsrOG$0^f@?IUbF1v!<&vG#^^bTeOPYt{Q6LARqaFo%~fh;y-H??f~u<6a6v3I?10J zqkvpGD?|dkwdCYvOHZW3kV1?iJ0QRXj+?r=I>g#FYqaN^;4uadWqgQ;rI;dG2AJkX zfPJ!U6whg*qoV_GK#fgJhzts|)FSUt)={f@d3ewih~q;BsmaK;U@(L&UOveJwChTf zhC={$gHKvSNC>qabVTrHv;bKfkO%{XF}n@ezZk-S`(b|GkbTXYx4yg_XT*q*@H1d+ zLhhq4AfaFj5DB=u0Pc@{9@f*k=d6uebDfcX=gUAY((92{H>#ZUzxXX7~Zd$1^)HF20i^}Wkfz*eP`}+R&k{AQU{xiLDhp&o`jt+4(jU~{af!q_&&vTA~jpmLWz&t~_+5 z*TaEKJNbA=cv*@tx5gbotTq#&2YuhUNf*M;>f8ey8Tq@>{{=WMc2E_F!&Lj@N3epWC=w*JDGlrl$#fmUdz0wZ|8|L;Z{kePV*&6RAxO4a zQoh2URl){>exc*huyb<@^6?=Apuwg6uJ}5Hdq7q;U=BVte*exU7zHj)Ct>Ougt2UK zF9&d&1o`OelZAFx*VZa1vsvBTgFUr@mDCw5Gghuyh8*Deq!RiwlW}$TVJ|~09id6a z?+OUkTv!Ut0|Lr6H#gfA`!HS2=6m0q-rtUYNpxXS3GaxaQYJv;bxrFdb*e@U{LCBZ zhU?C}+V;fyZBue*PB*rC;S|sce@XOEa4oi8%C9Hbnt?v8TkW6o3V&g9EGC?I$^`#t zN022?lN|y%ciW?LltdiQ;Wv1>{Q9-P(CZS)isXmidY#*~SBkiH$JfolI-%prZjIwv zSZzJZ4<;0$Y_QXd%A8de#;8x(IzKIY{^QMmSG)=|4C;w(Bo zQI>RTr%iyk|IN+jVE*PG($Z2jdHKVwZSsUCWffIbi5VHjuU;)xr28o<0_6mA274+8 zx?wzWXy|k6SL_9U|Av5qFK!dt7OQS~xz7*(jh|lj_i@~70l)P>2pTSZ*yi-$=Lp!k zI0B5H(Q1TdF%~E*VEzGCfB|y&$B0B(`dDfa9%g0~3hFMzUVs{b(Jux(A0T-GbOHW* zhNPw2K;@Op&@*A-_aI;bv@V^)GE-9{Z?qC3BMpDrYk_0Rybi8xN)`-epe}|<*-y}e zy9`)ErtI%Z*`Gc=n5&}WlX)fhRR_Ft2}Py$mRkVf0CHO%tifyE0?2D034!XNN^|^T!|+#7^oLiiN_Sf!|Qx-pb-F;xRW$bL{{7GZ3c;kCA_m3{mA)) z_h%2Hqg4IwL|OSJ=6ZW?yCbNXiQJ}`4Jau~wDJX^(D>CRVgg#)jolADLqmXoc3Z3` zMnrT>=E-C-jKh&@p&(TB8~S<;2x&Dn`L$Dv<2!Bw5N^XU{UNyQNFFcd*Ph6|N1$lC z1hk1GCv%aW=d%4f`S|6Zk5xx!Tkeg}*=zZopmkncV|~)brzprS04|AE%*8oe_J6f% z&Mkbqm=B@J>a>km_9dYr69%3P09TVjzL?i>%Nyf4R!Bf{x0nEoGfT#Z7uSn)U7A#tbPDm(|e);_4&H3Ye3wBU( zO&cz3d*$s%9xY5QzE^+oNS?j=TPHDFN7~ZTGCS{OVCFd5tNE9#=ea+r`lmyyn=#+9 z5e!irhP;iA$IHWQ@9nWTf!-xM%_lWAHDE$@_1OREebhCU+0y2FSsrj7%=WTU>cdH2 
z;GX4=hO3TQ7`PLC`dXCdXmjr)lGjBtUmioDl5Q0!ZMR&?S~3t zxD8cGjyz;bB@&5VyTIJoy!JD8wc#7<>Qm#NvU{{&z({_2d@T0u2yKD38nbtTb(7I_Css)0k?Ck7p zTU-DO?(BM_X>k(u>m0_DxbEqse1tr^Q3}=rlw1@AnSB{rl%uY%+r?W{++?G#YvIdkV!vZYt+=K`%ht|GG}hjD z^<%jcD}%cW=jAUFYzrgUbOF79`v%jiZ}H0ovEc zzS^1}fQ+i5YaXrmoBBR~OA{E_Mi}dmg&5+kPx4Zss5_AsFF!E+fesBl>=kYi2Mbe78S{^Pn`AN$Fi43_dXgw4?X!pM*vwNG7=w>^szH++O zjqY~`kPZ_=LsGzWtuCuSqKJh$7g?pHXMg{8*`M77 z_leJ)C};rDBIkW44uMaBeq3bahS}oD+lEzfjZE>ws`mG|PntmhW1+cuowM^(LvB#% z(gJki0gn&wGE5fgsE>Aciz<)FN?vv@x8B-6K6tFO%lz5YHI^bWaYOEG;rqh|sckC1 zD&90m=(<7*2>=?3Wr5$Wz828NLCuCsaG_nhzh#Shr*Tx;D|+}BbRcX{m25FJ!0 zT1~%sjdTpm({t5^FB|SMevHgJI|>dB@&F20pxq{6I!Rv}0TFTIN1M$&(1N`VSuXX#Q}1w#DBJ^gMgmN4DBD zIkj_ypagmt4Y7k@0z@Qtev7}N(V3_=$Tx^qWzh^5#=AyWbcAO68A&h*?=FOV#|Fk#;sa)JXWx#F$BmKBD(M;`?^?FDGMr z-v6bq0fT4bceqJ$a|tlG z6KKi=Cc_pjZ+W8QR2F^mTgv#gm&)?E4f`&7PATF;!gA+jOL~v*)o${YXw|y_7m_cBjo}7FeO)wijEdJ`1CW9uEZi{o_ z-@na3o@s|Su6kD|P@ZxEQF5Sru3F{%J?!4-h&$KEyRX7seSocG6bM{x-90)|>-0%C z<|@)bM23XWT|?z1g!3ma?1zKR11pPD9P}Iap?!(V_7$MV&BFr-17UqBzR&LI)YoY} zK|P84k&4BNp#n6DDv}P)%Ka&MTwLMitBcLC->Vp0LT&fXPXU~SW1m-=w*Nhogu>v* zSWxb#8clq6(sa|u*4TMJ^P=TMj2eXq`gYCv+pKCd91v9~90GHKv9Q@R?Z*tR6k(CI z*MT0uwQ2hvLmvTla&yzPP(>T)g*<@8nWX|ATx@K$dfGVio%zdTv0l;XS376C zyzmam?vyTUc^{IuoT(iou-a$q5GcT+n}$hlqEOKjUXZXd(y<;Uv#cP06XUQz|H1gz z_~9A;T483GKkr`S#?ZL8RS~;}PxhQ{;t6LPgK!Quv&$@Zq!EP+ny$sp();#zvPaz+ z=9AK_gD{^RwKAASWqg0Spr($y!<1V+sr) zy4yCILP~2S^uPGJ)bZyC0@-mMJ9-0ARkBw?i>5{3|IcqpGPV|ZP#{Rl+dy0fHtIJTD!h-ZPaOd!;I&6^(Dmlvlk@E|BYyLQa&PVK&3rAL z=2*T$mO1cP#Iw6~lM;^B%ws0>7TipA$3v4dS7gG)6C%UxADXr}V#GNzI?n#t7ABlt zXHDDxR6Lf}KQi><7_!)It$R0i(BiryC-uCmZ^Kc#vArAd95=Gx#K%i!cUam&_tr}E z9T}vzG3oM8MrM%!o{vdF8I>e$%x_TfSpms?@A1_mMUu||fxZzltNJwb#s z1wCFpLZ#~rI$}3P*`sqJB9y_41^jBp#WZj8vlGixMJo~Dh2H|Xvnp0LRyAg)BcTmt z7Hi*jgVGAgYD!hMvZ}WB7TaJN>fijcJx6ODB`)QnjKzfZ2Gb99n122ZH{ zGF|9vv-sNQa8qeeP7tNiyY6J4^;W3)No#S)%#zFZ6xwiP!skvpovqB;yM9fJ48e~Q z29>K!n~AaTQGi5}c#5{6Etw8QWO>-Q z=C&C*B-kkJK=%x!_c2mIF%`R+vS9wR5PnO6DqXUileb&2X!mi~^KzX<4#3{xwK9r} 
z0DXg9(;({hI&UU@0oUojNo#J{Ru+EOdQwfS#3PmR$A+Fhk2f7_+t>KV+~a?f`i<+d zhEn%fTRSTAKl>#j8v40f_xILs)@yXMFJstyI`@an6x5IPEXzz6@?=VOMlaPszeJm7 z+ud_o%tY7l2wWux@XyCUfK}*HclP4&I2tcvuslm+60Vy+=oHDJ{`1LEV$&qn5;v3T zpF2Vf1Rwu#n0*GD?zeQm#qhtlIxOBET_&getR%v*+AEF{k~X?cZd!VD zpkHC|X}Q)+aBnW8Xgh===NxoC_+UU!^e5T*Ocz68Ai=dXf^2q1K!}Ep$*p~DCFZNn zGh1FIHWGEZXz6d#SAVx$oF4l4H=d>&i7+2ZA9hi0-NpgA2!g|IH{7Y3ToY!t}cP{(8EJ`apng)5Xj=B(9%usm#8sh?60ON$MdK ziuPp^E3pd1oS^)p%yf`_-Deck4=qMI2fcyszF*$LX)gu*s<_M$K$tc;T=ES=|Lo%U zt5T~5HI*SMBEsjIsZ8wcJ|MeX=6xGV>eZ}91cIOeJOz5Z>&VaV`10j$B`mBfHYTQ8 zIiEa-IZ0+ZF;VsKu=1;@0g$qp!mx938&$8xSf$f|G`8Fm>elACe)Y>}_-JjdP{{cd zf0GC(wse}@A8+11?HFIvZ_0Ihkt^>#zw5Z3f=rGDw7KpBjxpNTbg9wiE zumDNrHBP#uSaJfBTP>%9YTd@)EBPV51)tem$CGTe@4K3V9hD6@bc1LGb;p7krUKj(IA4E=`+;7wR2@NU0Btj2s%}(Wp9VjfUHDXdN z{?CKE)7*f^g^ivb2(P_TExJ+6XgnE8{>kt0`goOptp*khv_bRoryA5-?$?-TI!(ny z%ky~HPcJn*A1%6+i3C|YtVyhN++W1Il#VLTFZvM3vgcbA=py2Uv~pK(*M|>fVPWV*6bi&wvik zyFwXXlSri`sVcA^GeRKu023vbpt=ZRas5VZ*^+4MeRzcz9{cS!4OkxGal-)10GtQ> z)t6p2P&tZ+dAkq1L)7(gS3k9!7A}F;z|asiC+B2$?q!}h#c7O)h^CKED3B!Qe}9FQ z%8Zo7*D$v1Ee6;jXcS3s@IWJF-cR?v9qtR?{;Vu7fA*vg0{xGbPGdMe;}h zmUp{7+eNn8=Ht?Cv^~Gr?V2x9e(DZ-zTDetb6DdG7V9m|;0Ew;z276E|FhGRP1uIk z&%2sqav4F3Rh*m}<*ot7N2=Fxbp=i$}ytU55DLpkc(y7c2tL?WF zhW^B8Zxn895wXe>5AUF>kM;AGq%;|b94^#7FX z`jCM^F$MIjFrmi{<9J@2GO6)w!lnn!P}ppjch096`8esKmDxUWZTA`-9aSvoUG@JO z5X#O665wZ{S(h~fNhpN5((|F&Jv#HlzW+EQn29zSkJZNH626iiW4BnJBvY20A}(_B z5`4s_>GL<#BQO?Vt13P);<#EhSY)sYc}s{$Me_Mf`b*@dwMISEnI*?-Yx2{s_g7Dx zHX2}Ie>PD%BZD1W?RjXYd#02XDv>?n1<#49f#aT<&@1SEUxlIlfQ9>EU13GP5| z1pgQE8sU<3?lrqw?7w`34QPrRTteZBDXcq7$+t`6&6DuQHt~5yl0A&@2c|XT1sUFG zUti$xo4zRR4Dt`0Fo}t|&4zv_(o}~8wGuM#la4D2W6Q zwK)(jQ1RTKqwv18p3V)L?+IgGZaC-jd)OE|ARMv=RH3$bNlAmiJe&3Eh!Fzl4z z&CD@`_^Xdi&;HFMCH#;>l(DC?c)L0r>t0y7CKwB>ZOQw|7A{WYKYPgsBkzrP%G=Z# zOw1hcKahk7^XHk?$dNPbuOF=0XNSLaSXs(dI4V4mjG%!UEcn~?4vc>G;>hms^|4lg zjM_wE0wvxBqQk@X-zNH$#wJc&djF$rWX6a~A*S8SVjcw{rj`>E>C_<^?euaQ5*w(& zl!nC{yjdb_Jow|BkMPIj2WN@D8f6>rH2|`-qEIH84h50H=+E|4y?bZ&!*{6sCNrom 
zD3HVs*g*3pLRs!cyL*S8&_Q>1uD!Q?JODzr zUAb`pfeM0x4i{=xXAeh=_*j*`cp^HW$DVUtJl1~5Aj-5eA;-)bP9@^%S_5^pH29Xq z=<$IRj&bxzyhqB&Oms(TU21!j9~LW9-E`D4TFunFz^3az8#dDl(Dho_zRgt_46gSw)nypP4zP4(&V4% z!+{w;0rcgG-dBA{f!wFOpf0so(rj7ZcSvNNUiIvwT#T3Uqk&pkakqPB@m?|ZRbJ7e z9ltf0r}tWI8t-^^_kzdo+f_=1&$>DspF9YrG_G!!Rd_RQ8(vSfa{MHPP=^x#OgX*}d2=T5evPkl&}IWo@L zplf#Ut(A@9C~!+yI>@I?SA`Wj#~DPV$t5I?Xo1}D4tZ%v4h7jJiu9TVg%R|rE`E(J zx4~0y1sQaJ%FFN_z;@a64+r;>vHF08CWcyS)r}Oo-=Lxga^Fb2Z+|Nv?IHFTKSZ?j z`mIroB}r_ClWM7{DO<}}3Xi^2AzO={l{q!7qc5ira_Vq*u?e08eKO&2xvG|}EhD|! zzs;@LyEXr7{Qp3pZ@cH~qo)Q*#e?Pv`)>@Lbee@?>U)}LlXkl$IBx{4+hQ$+>gJl1;*8-NNlZ{?N zn&Jc?xk&B$QG?(}`}x#XPtTge68gsXKSG7j5dTCDWZ`bu4*C}wUix#{a%$}1swIxZ zgdd>bAYBiecj5e9qsROZhwN~=SmPSK71?XNc}P|L=h3{t*Q8l1_-SCqv%}SjN2mr* zM=^ELBLEyJ^=0b*DLUjo>zp`LzF^~9mOZDk7M~-3UBy%R8^1R`K=UEHk}xp!eLY2m zfXU`mdR1r$BdgHLqAqPVO>IJc#&X{PG%Q+-ks1eDODm}5_CY@K*5BK+Srn+*-*jgT z*r>T&!TNLaB`T#9KCVlx{QO?r)^=WNW@2l1I77%V`=)Fiq5J0a9U$8Qa+7v&!i=;o zJ0fA_lVT}}hWx3gNs5zQ*^c*0GY$~_Xa{Lm0J^10RjyvWVwll=nmOOcrpKFY zXR!*rZn${f%%NEuQ%LL>lzPLVGKBwt^u5PScG`^Rvx_HDW(pQerLR71(JakjKK^ja zxkZjzb13;{HR9NKaueeRit>~zo#OA_us@}`0Bq)L@o5z88!)Ynip5?F4r@w@C=B&8 z_T%76CI0Ynv3L&;8Dc@&dC+0rl?Lx%((%o#PU|*DJTz)fAr#$+h;j6eqjIvzYSdYK z`vO3lD5!x=-My5B3J|v>(8AMea> zNMFe_w2|Z>cDLQ|-IHCe++)YT9xXA_?8^-5+%`c39&Nv?@eMByi2xpp-vc<686yL% zv(CbpDE)M|-&L&G<0MML-`LjD0-Gu20U-f`qz-wwR@c*)=O;fZS-!r%V)ko(jO65f z^G)A#42PGSjsdcH>z}3n&55|K?l7x~7XVYs^Et1V8u){dn2}LeREnZ9q_8zRys8Uj zB|Q$8(8%q~vdL!WjyX;< z%q$G#LVOOE;-Io;&X_KiHJr#^nJD=sJ{th<&>Ra@^7?%(Q=2ekKi(P-5?3|K*~szx zG05(1W8KoL(vlc0-XxJtp~LN3+c2?+h;)3qHQTLQ#SEIJ)y}i_rUvGtfvUT8&DgwN zP;PqN?$(tVnttX6Ew%WJ(cl>L*N>$w8TrG_L(TF(7*fW@>7rF}85x`IXd^5mOIX}< znav8S9Hoh2%a7V^-_wXJ#z!IF8bA;Qt^1dGLQ0X$XGst1JwcZI9fy3b50dmr!R=v54Db=nEZ1Ok+AagcP4HpZGWna`95V54y zP`3Ox+(xsK;P_ZrOt39&aH=o^*pkZm{>sWNq{S@=v2zqWtJCV*+8Mepj`M;Z0Y4j% zV?of0{Rs=G?x5r69+J;)D=KNQG-Zx~%jjTxsU;--T&9reXIYik91fTwJ`~Fb**`PGZuW$!O>O z`iGm2f^XPvtC{{b+D)mNm&ZE84|}RB!G|=h+@*7soisVaWYyk{q^$T7mzv)uLZ`V; 
z*WSY(Le6)3^DNUgouPu8;qvB-+;1*gIgs9{vuyP4HsqVfi^{kBiaIEl z4>ys*CMbm}GAp&u*kyXE}1G?6$aWDtr{I9yO_jIW*4DVY=caGnTQy z4LToknDpBLq4ECWl$h&a9*=yJDE4QXBjQrT~VovVY}uxva$j= zm7gANm*3&W;IS3pb_#J{uwVQKtH44K+(h&tv$K^z?!wMcC9^CKgF+JSlXKGxFsCWQ zMXwX>C4qWMH0N`UD3Dz-cuH55oQ1poK=E;_|Gl+jvqmcNbvO z@2}VT|KkYM+T3A+0z*|=tn#ZfMEQV;c^p<_1@3gEv+NG+sxo{oH17NijW5i66~N2i`bse&?@``{5&}|F{?fzpR8d zBYlUF+Ib$O_sN_b=C-lLY+)`<=eB!I)7VZfZMLH`LQz-PlV|%PHb71a(d5Qndyr|; zQY5OJuJPdcp#0bCf|7R8?KzetZ1iCk`QIF&+3bFu)F^h9OS^vC?2x*mWpcInd+M zb#<6wyV7j@a05iXuA-uKGyA!rJ>HE1QB;*7AuP1CuEQz#zIU!-C0b~&0P=`si@2Tc zFZa43uh;(eski7mtTiD>p`Pr4fnUL4!1h*IN{jE^)BUrr?P}ZNp#N)q2HrJZe?u~5 zR{}_*tr<;(xnS4z*+sUjOlh8S*qy=Qe&&MTE1L7>fAAq{lfRp;{*diE5*+jI;&zD$ zfn}T{@&L6XocMcN3dsR)RleKXVTr?ioJ7BHfMQ9Af@ZlT{4$(L4CY@Wr*Zf=jRpx)+H|D;{{JR#3b-G&o6BqGJhtHuX@msw3F-Fu31Un8m z7D{Bu8ohrvAr*B|XqbPAK?ix{1Hgk^p`-Z%Zhs_i?{crpvI!1ua&k5@vf1gnKoz<+ zXR2mfZAr~}Hge^&4)0u2VF`c=&6P<;0ouXh<5{z_k9b0X z{dEovRpDIOmU2~Q9bYtVIqnN92>24t;a38L$A}hPogYHiJ;p)=T2E~k|yu$#haxN>_FQoZw?S(Y7 zta-^nmfm$VV2-Dqgm=Iq;dV)2sovWnvVG?W$Z&9Y)g=WaSc3_+!y{03ch*(5C>Xy(!o5 z^Yd)i*j~%VYGmyG7}NVm%0#}A)B@RI&y@6LBc9a$%j>M!R)=%H#I|;?i$?#^8iv@> z7Wap%{`njY;;ar2x8LE1JI|eM=F>^UlOIk2PXKmtc!)W*$?`}|ZCzTEx2TFOiny{Z zI>VKTfWq0OF3bfjeZncXE~DpXy5Nf#%frRzP1R^o+4xotWf0?p)#h4n+$% zO&G);#4OEfQL+$(cYup%G-fTk^=NQEY<-K>SFWZwqm7&W{!J~DY7yX}g8+3Ax2tqS z;7Sm8q&STj&F%dtP=Yhz$Sfz--kOQ&?w}N!8Q%i5M-a0hxOu(FJxCXbaE!Xnm~YJS zXktASZhxAoUiw!UW}XSbxq&XR$MPM$W*2D_5t3i#&YZIYq}HV+)$;_HYRJbwZ~e|0 z4@W*8q(?cBtE`Ud@;aa2`d53+xZolq#M%q&vo?NRlkWNgcm?fTpHSMb&rA+V8ZU&F(yaH8q#TF^Jylu zFlHi0%jr|~{HCMkrtOVfLcm@CZnH6798w5(Dn(al91gRSk&)8(c4mrMGynL<$$0}n!QaFR@+6^+zV3&z`z4u>EsEtib6tnoKKFZSK5l#I(X^S7;0(pC0)P^#P7n?UMaf}E4Hb};nnE{7tVv&U5VnImqKns296p6BXd zd6+|L&QFF?TGelfy`zsa-)Hgo|6Xbu7hdAgY6T1vEp`WiA~q(%eGV$yvc zBnP%>pwr7jCr#cq#bwKtl})bjL%PyJK)24P5?~)?(^Uliu!jI zrd=c^&9U9Kp<-{7HOOdzae6LkqVk{adlQi9TA4{L(y<>QdM4WK+7;_+D?Ft}wpcd!n0V!4&5d6y(2Gs#@X#_+Z8v;eE4T3&Sw-Uq? 
zoNy)gh{DPic)(|<`b2QQ72#io%^U#8__{Ce6oW?10ZUq!_3TJM7MRJ3&1Qdjq2SyF zao){?%k@=vY#%pkgOQlRM&}`es^34 zTmazuc|}`$Qq0epESHbA_J|l}`0jx|Ej`^#zh!-M^zCydqlLqx?V}hMAz~Dx`;Dk- zuAZsQpQ+_Ecz;XvA1JcQ09Uj4pVA*Bl}w+A6BQvsCAXkQ;IHi8dr1fkb-0^uwOmr; zr;by{)j>iP{PX@FV?p3~F4y>gSC(KfMwM;6D(SJ<+v!a_4o)O5V5WnE3tG5fw1>fN zun}CAn&d94Q44^}goE~n%g7P`I)zSOPqGeLr4&6p$=g(s{43s-$A}=r(?bK~2;h7y zKM=Yn&yOU_3q~rBwg`EAR^cN4=kQy+oXxgxk`Y=eok^nw=<>cc8Z55i0o^YWG2)9y z1J(HS@!u^Ot1|#!GQ2x5|2xcrN|VLGc;l9+u_FXOXPC>?DS!;PVc>}So#MboIXG^cAF<*Y@X{mn^fDZ2?`$OWU&G-c01G)BTtGaq zW}#Gw{NCrEG!9nl95c57WGuO-Tf>bAq#7TthUMkK9kew?F4_pj3(aW5i%^7P<1G)Qar0T)nHv0?sB)=%< zqe0lV73T8@+mlf#R7xy^@o?C-M4Rw|gZcpa_y5l3z!P^l?*CMJrV)FF7GeGJ4&fpI zRS6DT5V%g-r4)f5+C-QcOzM!`4UOuOF&Ns*S*d_=Q1<(x+7uWF01xx0o(CUbMByl2 zcM}HW9h!gzL@85Q<>U#!$9<*PW7tLuhQ1tz2PgZ<){fzl{&NzmSLY8alyK}aGcqgS zJS^ldC)+#GY~-^oSBr|s>ngftfB7Wfe!~S974822l>q81^y2ooir0loPqrXd`W?s|?jns@Bx9@OIS3QqgKZygR5?u9~1?`%P7=ote-<|zMYPz>nlyM2av^E1$Qh*ANG0HY7a%Wq@ zzOOg;22|@XSL@WWsbp^MTl@)>58?UdoX^JnbTx1?*AN3A}~>PdLfobo%{6Y!Z^D%T;l0ct5k}Kf=2ajwKcaV z?9x`dO9;&Q?)G-Ks;PlWL=gAp*9}yAwIj0CRU|&%w8c!;^{M@csk_^9nadRl_jU&l zedngu`P5jgXr!r22@dWX_RE}J?gRDDo!pfQA!td1{M$7Dw-KOMVn8MRb%T=HmhXN? 
zue)R@&Dmz@yupOiOxvG^UQi?N+vFm>x7rVpvOaWC-R}xzn!ORp*Tz#gQP~9K5+RH& zu5XZDG23)RdR0%r?+%O6MHW7bnTAmBYHNd%$mA}mm~$f}HC)A0 z=X6-x(T&8hHM0eu3X@K)J{yY+3tfGD>~BKxJ)5so;%a2PBdy^08=?3&IjZy7iWcRR zZMV|c6m!>1dj^9j_Sl+4)!p^=NljOOOsD+daZ5V^j2Ch(^75z6Y@scKDfu`5~uKGX3B}yN#V+O z!ze+KP1#Zt!0!QRoe5Obbq%VE)a*{~MjS9R`C()@S>;R>{o4nYSUS+*Q&^W4&}4vKu3Gdn zgS)|IWo06ouz-Z;t1aW+Y6g)&TZ4^s_P7Ey1wg`&cwZ}J@tMk3>ipZCCL%>p=wdfF zg*>c1D>YcR2=0M72T$3px2LBjLeVVO|BUEsRD4OK@vj-~dd^$#QI?4~$`uftJcY$| zyn+Mh4xna{j|oNS4L`j!1ZtLWuK#M5(K5-JL}p(o*x5~J=f>H9+5%Cn7P5O-QYQ<{ zB2Oc2HaSx~jH|RE)yp`jSpPKV`lXwajiLQXN8d>?sZdkJcQa+H)YR+Nv(6b*|A;mD z{yk_@jug$~waqG!{1cn09CKN(yH=7kCRno@@sXy7v_G=NR$~rJnKGl)QKxfqvQk+( zcg!H{PXdzRV;qxeO&24HCrOfAV#1#&G6jZ6?wY+^ZJL(FH0=NhD$2xlGBJbE#5;{zW`eMvemr~dN zQR}K-zaz!yklA#+XqRgS*$^fXXpTzI@P*RI>gIUe^Qrgc)@4bRI$xECyHVMy1g7yb&&NTJX}T94Z{r|D1KF{%cPASwLj! zLJZU{HaMj!P#8&f?HuM@wcY9o>X~kK;wB%Yn7TfA)Bu9EG_B@v(Z0uP4PJEJte@t|scOp@0$><>dJ*v=p4KTnP=j)aa+oy)3>MNxwc!==ujn4^X* z2LDDtvvE*a3=q)cRErZ39!h)O5=m~MZh8qx~dot-}@|32;3=59HHSNpK}cBw9GrZ6nbUpcf~b1(*fEQba|zGpICEeckL1aCPH z{nBYR9P4NH3hwv&mq|1TYA2QwEH;k%@I!7<$}rgj10&<{bC#P140`CYA@3+gZ@jzz zuUIW>U#&5@u4QZwKyiFo-pq;g134I3F0Aw(y4{!PX0pg`HeSUm;d-9jh&7iXND`bC z;8l(4DZcXj-U!53)oXvH2MiWTlT_R)%e@m-S|?A}G{ZZX{nQMFEvjYBtdqIgW=k2b zhx-}SY5Uh=md;o{?v~f}(ty>+jf@15vSxFpHZNm4{79K+KWFnzrGp)xY!0WUl(|8q zD6&U>chT8c>p0Hp@{-fbixFYWvHZPP)uM24G`-SdB5Ikw9vg0KQ~bHuK^h!%ye+L+ z_9qJQ96|(7blOGUOg^%?wVXTI)&NWJc6IVqW0w@3^AYhBZtiJvOKkbp=^PXF;+a1z z--vsym@FkbD^Bg!W0@_8VGr?mLQFb~!e|$>#lGjQ+o|nHiwA$|XN&G>Po;8_Ub)VX zxm|{3qO;@E=HBd*S!FO+{At+6eIhf~ht#=qCS$dGUw!Z9eZ=GWLCkA^gFSz-sG+gU z^F|MGtQh+fgIgB&p;=_XaT2EZw9{&bu9Sz?3-Qy9zh_q#Ypb3)5j=MhVcaPL)t6N> z1)KBHux}jQ_c{e7yOS*MUh!*JAn$G)^c;mD9UJc)q*wdGY_QpACZTJeX-g_N zq!&BQGqy>jiTH)!^UorCmzXmEeLvl`7c+0tR%_rIlKN;LGgJb z0N>a|dW$Qjp|P=`yM=m(lqbOJit-)L1F3L~PW`80o(ILW051EPqaQ&Yy%9@h{{HoT zg4&LbpSp-Q;fqi2E|rzyu8Pv}tUqY08wp>aN9{KexCnaqYqS;=P2%VCbMe%H1{CdeDa>l4UR8(RspC;X#--f_^ zUtr0G;BuQttKIkOlJmN^*G5^!GJhG9#P|JRQ2wAHzhJLPTxGy)MM2yhTLvrrwp06s 
zdTr8JE-1MYlw8M&nL?Y!5TA$LlD$DUD?hrQ`hMYEZ9_2bpuI%6^rcy2$0oh$_`IsX zfqT-w4ym!hMvT&gd?h!V?6yLC37kQkdGcOzg#4~j-VAFs(Ql>8mhAnHBe|ET z9ED%XRH!o%uU>03u0&3&s!%Df9{Rqrnl z?hWF3D{FXqLO3-Vs5?C;X0gfeumOq2=~7H~u@W55#>@>xr;ZVIZ?uTzy=CImRrPrN zEQtNf%20CJ_K{lcSvEm93ajp-UQrTLV#})cynb?ehN!Dq8Iwx#c)|d=kPc56m@Yd% zgE=9H=ny)(5^Z+7aqGffon5Z&?aVk+;sW?wCmQh_=Fziw;yp*Yo~e7%Y8EL)VR9L>nY;yq z+>PH}fdzHjFqy}*WIxIzeQIga;XL&6k@G=`uIMIAm6>ktFz6T|xIX~+V=}{OG!o&w zUo7PIM36&J)Bx7ov)7-{jdQ15dY9;i`t3eRipQ+yv<6rCWol7;PCFU`IQe;& ze6eqE6iywDI(gzFjA%RD+FNj`#y8BCKM6D23qja#y-CiPvLS(+r%WoPCAQ4=5C>;O z&oAAsh|*(&+hEGkG#x92V^m=;p)h2|W!$qa)7 z7U{9;1cRGE3`VY@Kh-;vzBr@4HaCjss^`0RXv8ua;Pm+Rh}e9mYo=1gxn__uzn0FC zBSE!ZI?*C9@r&{M*aNA#c~>ms3n!vQbDUP^yIM%8sNVJV2iO0?C}q`AI%gxB*X z?n0-3os_goh3X6|V7oZ1(|2+#E1jamqMpEV0&cbS@~fc;0PhF?AAF8N2QIyvmB)nk zY_%=E$zvO8g!+|04dzpR`<==J+o|B_s=^8yJ-Aiiz}|`5BXTiDY*+O$oQ)yaX|HdQ zkUu=X&F! ze6o!@orwWAnLfU3`)tc?9`-3BsLA|jAD?{y*%}|rL8cmM_8WipPoP@yu<0iC*eqzV zhr%UjdsKRs5BG;WE^0is}nw*?ex{1kTo5vF88^k#GT{t);fs>G;*XaWW11e<~=;;x7Vj;z}Z(hIV z18}w>z_7x>NgxG!JSBZDx1%`R6nkApeM;`e@4`hGjVD4NyKkGWzdKbhEv(TF2RwalBld)q%R)o}MJ~=D55?jF3%i*-`MTgrq%F zY81Fn1A`|$CA7+LUWi;%Q{`(6>1&my=ziYYCC*w~J9$6s0lW&hE{l^Z9vTbO=M;4b z$s+ChF+5cpbG(=1UhO$uz9y+Sy7$#tP}& z<~|YQml8wdC@4VHI^ujLHh+8L!G|=817~^SOIL>=vvLj73pW>++DP+SrsxY-%mHS- zPs8$HORR>z$$V2+Zuxgt@f6bKpwX+w#wCSSlRnGh`ymV;l~Grlp3vM%UG{kk_+ypA z0jy+JS6-pBW+&5JS1Xko!=jN`FMXgcj030~Z~iyXGwDgs3c|Wv_EqjQ5QvAo+Z7!nicgmyJ16f+Lb6nP*>M1 zT3j+Q>hXqE+n`mA|4v7O-ND~~Rbzvd)wG~#L8~@1zKR{%TyAx?T1y2Wzy}q_EtFmI zs`v&8K@U>lyuI0mpjMbF6w9pHE4kiVlYTJm&|xh>#})0NddXY^~uMPwuJgczq_QUtP~)XBb|8o`iRPRl3l<%R6wwyquTp&N})Tt=c+{6 zw0k9Uy>`UP^d1`23VnaeJl={O)Aw|Bxnj@mY6Z93D^MUNXkA z{NnzaM*UH0^^`D&_Ws|NSEWIvT|_L*%atSD@^iWSq15JFl4n19LjChCl6-m|+MaAm zTo^zX_P+U#iu-xP0){?PC3~%;lakX%L@^Ylzh%5OYBDia>Ip3{zy-Gbsnos3u^3DX zj2xyqoNaTv-!N#&yr_MGVI0b5W^xjbadJ$Dn~k*SmtuxbYcq^+Ix6*l7w`x|vdQOH zc^*0es*>Lb?@&kP^M6Z|2#bSJ&do>bM`3lXz8Mb@@!3@!bni_SPz_o+c(W?)L<}f^ 
z@hbWs*BY?VBNcRw;ktwVww^$@Dd7g37`0fZjP5qpqSYv;yJ%x1A_^J>do>PPCS<*$ zIk`O_j8UOGyQJ!<)n-&NTdm2@US-gf$j#;b2$$pW0OIguOW%6ueHtjIz=RpURY5%L zf{PB0~1~od+;6Q=N(iTq$#EzQ25QrTz<6*a$B=2!|HD{Zwt}Bhk=0p_(v3a|BQPMoDFHJNCXw77 z2sVu1Gx{CodUe~xzS5-_`f=X&wx6^_G^CnmxG%Z@jjQvAlN%M;%8K4N*G=orKV{b= zBK+T|hX2$mBU)X_o{bf=;}9XBYwOtoACpBo>;f_`xhVT47rmugvq1x%rsZXes{RkY zV?@qW!D}Qu3l9A#&xQ5JXJlNNJF=vAu#t<7r|`*_ut}WGd{@1`+-GT{e5np!R2bk9!U4rD z8~*N%%fI*9jg8cYEqK)D2JIs|iE~=B<%tr|$Gi9xnY3gatMQO(q7(mPh}jjP^(sM@ zV7b2iT+|3@_)Na-T>9+f(4F@^Yyc>Zjn)BY?v%AX}KT=mP z9t@nMZw{Qx zEWMi+^oEX1o^ZS2Ma@9@PrLz`bD;T8?>j!!=%to;@z%gPHBb6PT4it!2Q2tK5_svH z&mMU3vsLT1)_eY?8nPXOc@szAr6Y#Pqk9NBK81m~qHR|i7?$ekTx2aQ4%?IOuk%R+ zr7Wv8~V0xz=7Z;wwLobvM{OKIf+0RakY6rfnV+^mZlH9FVz|5Mp@ zM@6wTdz2uG0VHQhk|gJ}AUPvxNdh7{XIMZMB?^)e5SMUa$-98Wm9Qiwset6XfCL4U zoa38?``z!p|K6K(_MDkBv)x_Q-PKiHzv{Z4F)z={F1Ce27KVwP>sT+Uy85rhkOO9} zU#LL78`7faE|8)4(Sk7ar}nZr6=-sIB|qS(E8*8^fR|=&qUPPfMzMOz*|(~y5uzY^ zP}0L^El{n}W)XB-Zi0WgWM%;AefG9=4@`m9!J#;9gE-KfZ9Bls56FY~J*kn1(oPcK zUQ73>1=c?SY!52rLgYh$BS`?S2BvUXIF;47gWq#CKu0)a`f zX=*&X9>NXgcvG`$^g~S}vh|uBI9FSJp7U$-c zXWtD82jWdh4sd(1L34az(b>~GfV&b0Cjh$)NidfkC5^d&%$UGKT1zNpsxJG`g%`QdqPr12TUfqZ9Xvz)d`$?Q#g_=xSy#i)tWcNceGks7^ z!L)a%mT;?w%4{F76G=0y zP`EvpXO?XC;um5`i5e+UIPWz#rYbcvGHYyhrp&l6jex5Rw7#0m=md8A_W%3}szLC1 zVG#x|2QKLYgaT8Te-6lD7F0F27>;S_ch$&DIrz;)WLTOA9jhn2tNb-BiD^g}H`V_2 z8$LzVY{(#=cjiXZtqR?S5(GTs8U;&QH&R+)4uTN-t|?RG!FvpBqV745CD3b> zjy`U_^p*iO2jE=A;l~NX-x#Y<%Z!g^`Z`nuXVdVS7JL+Y8_Gbglx9>WPs+aRH-4~@ z6m?@zGjX(>gzQ2Y$orguCq|$gYdr_V3 zhlfv;6Yf9u!ry|L2s2i}!pHZ=OHR%$xYSeLZ4jHdPta-fQ38wqpO|>M0p1-*e(zDE zKdY+q;R(FfdEo4Xbzh?(BWV6znGj?u{HDX+YWs_BrMv%8cdkS1bS=tQ`HjW%&TmfRjtD z{V+TJgctrEm&Isj6LCY}dA@vamKI-K6eA}f*$Als(8K+tWqRU=bJZPKz{PegcBJ9( z+mTs{cbi};Gcy$3nu92R>3EK_dKp|{^t9 zek3mm!09I*!j|J5k4M{0I1E>hm+;>LOA>(eMjuu_1K;9oe!A=iz=3719Ca9fw04>&mC052h#MwlXNLg>M>|837?*0HGMH2O?h88M-I}>j%Fke~jiD zuA`)CPz3%a2VngO@JAONjGts0lQ#Vy#NO>28`KP?Z449ufElKI6MJ@ZBm1UMElgad5!TzS6&O{%Uf;(y;H7d 
z9`_bIFfMug&$N$W&jhQ9d}k>FeCx^Kj)8RV-(!Pz|0TpO4YhwZSpc$LUi#>ymepc2_+EW4RC4t zic1|)Kv3}o%KvAZ?o)q;)*x+1;fC+g`ALc4tgNk6x5c0FU%w_O+V%XwAuiDTn?c!3;-!Xmx|)%9AOrA z?c;x)2xy~(OZec;QKw9cg2TV)u>?N@fOY^8gx6g^9%UFfYTJeD6uOK!hP$bqrF*es zXGQCS%W{bEl9H0pXms#;9gwmi;u5yPbQfF11Hzj5Wm^hQH}G4d!jP`1C8N>IHd>&U zUbf*}y}v%Y)u*3E`MqVKY|NqI#TU{;AQvT0>tBBtd}!zme0<|lOvwSRCxw;?>AYe* zZ5V%?D3d(n?|EQg&&qQ^Qaqg;F7Dyr5C;-B<<4eHjCypeGyUwXA=|kTk@oB!FgcDX z1lt-rm(E7g>FvVmR18F8HEQ0)wJ`wVL+%2=jJ9Y^dY^>$!U@~yG!}E~P1+a$8MVt} z6Y6iG-~TOn>b*zSYEqwUj;ufr*>4g+yzerGitSD<1Ud7L%fxU=TM2B0+y`iSfENyA zdU5^3Gi^p{Jt!qJSm3@Kub~KV@+C0<20R0PIK*C;5+D@d!7&iTiSuCba-85@14;y7 zcsO$1m-B+~I-dJ4AaWAtW#Xl@CdOlhw1DITUXeG|(i9nSf*e#fBQJ>lRluuG;`65s z#|D1kI(aT5PWb6M=0T(NjrlfPc}i^zkESiL^~wX=*8tJP`2__gWuqW}AWO0bfQ9~W zgXjQ2-2Jz*xVX5xQ3{BWD!&I}4rq%U7|fZUDpbb!V+*#m;aDgT&^J#6bMue$dRdCw zk-OdSo$4?dU>5x{>&DyAAI)AV5FvSpo?8U8d#>R zH*BR#0M|YkyHm>SP;+2#yl%dB5!7EHg}3M4?1|Z-G7cFX6Hh*X(h>YtjU1<1-?>Mn zM{dl%Irj16dhqRobNiW~rB}`*AzcFeT*WbY1E43as~~?_y2X~-&{8g%W101hiNf`> zb0Qp^TrQhXv98!N5ntY()@J)78($~&&wUxx!(}s|l>(trA)a;qtVn&Uz|y0$F8y8t zdf%u5mDLNE9S^S8G4E-ur3Lw5wZ1d;X@gR{DcMF=iZ0{v%o?H5=iE{inT{Bvsos(Y z2zkwS`-6E;J8wTeL$!=U^;h`UGNSbK@HE~+?R5hK^OG7rg@FA|*YM5WzS=Q+9BQe{ z%iE(xIHvDE?lsfAr5A)3bbC?%bJR+mrUrcu)#Di}_p+TF9W(3LCN<3{BTzMe<1qMhfI17Tr5cPZbgId-Gd}`H7V; z^fxT_*5VT_j&oe4#kSzhud6K4g@f#ox~M?bBPmPc1hIq2>aIDXVSz2H#qC(ZLB4Wwb{cKrMhWt6 z{A|0?kV+5n1-V3}OR;mcoJ>-aAFS;H)$!0;!4>VtYPE0lUp;*qn6enUr*mx#jVg{Q~4#!TV>M`uIoRpyJT!MK+)?;Vh%bQOn$1tdZzD!@M;PeD74QR z%hyt(6)`I;)S_`JT=)1mu-Cq|JkF1yq5~amP9wKsHw=Tp8-eINrMLJzjan|auB z5t2x4sO{z9<#tAT#t^u{0zvG{_9=SCH_x_$MS5qNbRI7=ML3d7+1mhG2l0CXh0VT` zMX)PswiQOoEi}ZO&r74TK%^SJYK-y9-p>do8}$iE*0x63R*{L)+xtQP-R|q8Kl< zu#xg2CftKQ8ks^ZG?{>qN~d=e_QWmf;t4)_!LUX|G=-q!gIbLcep@Un=2GL6dh-ZL z?V3?*Z|4{ny+wT?Ns!=k$XN9T_%ln~Oh1FW$D7hL0cuvtawM0HT}#7D5F97g9>hq|kal_SM7E(wk)Rs_YD= zL!`Cuq3_I`o7`}64(g6g6Czj|5%7c+cZ=39Y43uCYEG-t=Y+`6BJ+j*fGtYdez_r*xrCO~eGGz97l~jun?CZtF z5u;8p-t3hh?a3MfY4%-Eg7tJawW`6?qrWr4(o??@Vp{4kTiYde+A!vTfrZpn@Z!BU 
zc}9>8#J<74?);8MH7kfv?#2>xn_c%Bm8P2VU zFO0%;x>;X9`n_n$eOduMD{w*MC$$>d!!@_AA~!3^7v0uPpsVGzU|b#zPin>#Akzl zco;<4A{d(dr&nLhd!Tt^#b9q`22+<<80_#i%EY9=Ijd5T3s3Ozsb=6NXmnbraj`C5 z3Xz5$%#HZ;)8>2F!OuJ8Y$~*YY=%Wv`0lB<7f?fk7If4)GxN$t{=@q)$#taZeosq| zt4XR(g~!Z%MrB3a4EGK%Z_DXu{vxB+U;PgB3<6NI0%gQNRT%2g7d{!X7F(KNWwVWK zo!OA0aPR9Go6-Wg$b!eooKhq) zI64?->&zlc5!#||=z)R-(9G9pfy}3gS)GI!?k_YbBfo8~Vy=nty+680TRz!hhkz44 zQSFqZ4T6mnqN?rZD;lPlMmtOIJd>{AA?~2hlZGrpz--w`|5K6WII!B|s3lv{UG!SS zLF1>*H`M~mxdQ*pH}{732biS?C+xl1*@tLn%BRe~exYGYrEL;vS5aC)DhZIgqElqc z*5OK!f|(sbO8X)1r*8f_vzD}k7^Em09?dylQDKv4Btz3$V{ir|=6eR@!kETUaE+by z>?EITb+nY%eV|!xa~V}~hcp>uXRxWR{%w>N8ZqI5ORi`e&_?XVUt$Ie>c|TgL1mtb z79i=U8+`*MxgqzaJgak67Ezq1FC}o~RB3Ow1t};YdOw-{rQ}u&fBtDS0nTTzqq<#D zAyMwTUwO+%Q~r>fWx-^lCM2-OZ)jz~Ad!rz19@?CAzCVf^Spe}vn~!bE%V{q6!HQy z{uzh~_x@TN?=5P9Z$i$z_r6!ZXEE%Y6I8M;Io@i9)b31l7%XFib(?ZPi!rqeqV8;5 z5;LqyY}U`vZ%>ezJ;%7A;)U}0Vdt^f_^(#qO8aZ~U7DBYkc#BjbD~#UTXNnj%X@aX6;<; z(AFqu(W~B%aqh3%OrJi8`QoqKp<^rE?yycWTR#c)2lrAb%*m8B_&|i$t|=xIzd$u* zgp>BsXjdTn>J*{e?u5RG8E&+9!OU{f-gLwd??=nFuWs$i+PMZ)YZyonE%$h4zoR-| zRoHcz+wR#gM;UpJB_-cVN!H|xj~xq@^?;4~J?YRmOG zv=BSFAbfX?Q9JgTG%E0*^vo%$tJ_s@%W6_9VPI@1*?av{X&>pR#rgBST#{E50^ua* zA(kuR6YU5&UfwM5;U3v?-dVC#pG+<9xxdr%I8(vM+}NdcTdf&J1K$h@SFX)`g$Zf4 zh&^<+wbqOjLdy5|4fRF+cRue$;0rs{v-b54-tIqm-%+mDmE=gj7nY8+m=Enedt9*K zwrR(x7PqN`?@5~SYvY@9CPu7p>QH07hpov?vmK(0e;+1h?z)kRN8hNWR$3+P-~EW_ z>{|E?OE7Ahrp@I0{f{aLqUyYKn<1Dc!fe{7U}pYjFiWHXM3OsD0Dir`B}G(;mk|o% z(iFI0u@2BJ(qXp#&;ZEyA$=Bb+-qx@esfDJ*8`5Gj94Uy%-!^WAKFTX^T6N^)*`yR zJZ1s(!PPC5+@%$XeZVEeT8h}~84eTHBEVj^A6!}!z}?GhS7?9WmjC6|#tLvt@I3^V Ypy%Gr0K9yLTmS$7 literal 0 HcmV?d00001 diff --git a/example/ck_tile/14_moe_smoothquant/moe_smoothquant.cpp b/example/ck_tile/14_moe_smoothquant/moe_smoothquant.cpp new file mode 100644 index 000000000..f1b374adb --- /dev/null +++ b/example/ck_tile/14_moe_smoothquant/moe_smoothquant.cpp @@ -0,0 +1,264 @@ +#include "ck_tile/host.hpp" +#include "moe_smoothquant.hpp" +#include +#include + +// different threshold 
for different dtype +template +auto get_elimit() +{ + double rtol = 1e-5; + double atol = 1e-5; + return ck_tile::make_tuple(rtol, atol); +} + +template <> +auto get_elimit() +{ + double rtol = 1e-5; + double atol = 1e-5; + return ck_tile::make_tuple(rtol, atol); +} + +template <> +auto get_elimit() +{ + // due to rounding, int8 quantization might have 1 abs error + double rtol = 1; + double atol = 1; + return ck_tile::make_tuple(rtol, atol); +} + +template +void topid_unique_gen( + std::vector& host_tensor, int tokens, int topk, int num_expert, int seed) +{ + size_t total_size = topk * tokens; + std::srand(seed); + std::set unique_set; + IndexType current_v; + for(size_t i = 0; i < total_size; i++) + { + if(i % topk == 0) + { + unique_set.clear(); + } + current_v = std::rand() % num_expert; + while(unique_set.find(current_v) != unique_set.end()) + { + current_v = std::rand() % num_expert; + } + unique_set.insert(current_v); + host_tensor[i] = current_v; + } +} + +auto create_args(int argc, char* argv[]) +{ + ck_tile::ArgParser arg_parser; + arg_parser.insert("t", "3328", "tokens dimension") + .insert("h", "4096", "hidden_size dimension") + .insert("e", "32", "experts") + .insert("k", "5", "topk") + .insert("stride", "-1", "stride per row, if -1 then equal to hidden_size") + .insert("v", "1", "cpu validation or not") + .insert("kname", "1", "print kernel name or not") + .insert("prec", "fp16", "precision") + .insert("warmup", "5", "cold iter") + .insert("repeat", "20", "hot iter"); + + bool result = arg_parser.parse(argc, argv); + return std::make_tuple(result, arg_parser); +} + +template +bool run(const ck_tile::ArgParser& arg_parser) +{ + ck_tile::index_t tokens = arg_parser.get_int("t"); + ck_tile::index_t hidden_size = arg_parser.get_int("h"); + ck_tile::index_t stride = arg_parser.get_int("stride"); + if(stride < 0) + stride = hidden_size; + ck_tile::index_t experts = arg_parser.get_int("e"); + ck_tile::index_t topk = arg_parser.get_int("k"); + std::string 
data_type = arg_parser.get_str("prec"); + int kname = arg_parser.get_int("kname"); + int do_validation = arg_parser.get_int("v"); + int warmup = arg_parser.get_int("warmup"); + int repeat = arg_parser.get_int("repeat"); + + assert(stride >= hidden_size); + + using TypeConfig = MoeSmoothquantTypeConfig; + + using XDataType = typename TypeConfig::XDataType; + using XScaleDataType = typename TypeConfig::XScaleDataType; + using YScaleDataType = typename TypeConfig::YScaleDataType; + using QYDataType = typename TypeConfig::QYDataType; + using ComputeDataType = typename TypeConfig::ComputeDataType; + + // host verify + ck_tile::HostTensor x_host({tokens, hidden_size}, {stride, 1}); + ck_tile::HostTensor xscale_host({experts * hidden_size}); + ck_tile::HostTensor topk_ids_host({tokens, topk}); + + ck_tile::HostTensor yscale_host_ref({topk * tokens}, {1}); + ck_tile::HostTensor yscale_host_dev({topk * tokens}, {1}); + + ck_tile::HostTensor qy_host_ref({topk * tokens, hidden_size}, {stride, 1}); + ck_tile::HostTensor qy_host_dev({topk * tokens, hidden_size}, {stride, 1}); + + topid_unique_gen(topk_ids_host.mData, tokens, topk, experts, 11937); + ck_tile::FillUniformDistribution{-.5f, .5f}(x_host); + ck_tile::FillUniformDistribution{1e-3, .5f}(xscale_host); + + ck_tile::DeviceMem x_buf(x_host.get_element_space_size_in_bytes()); + ck_tile::DeviceMem xscale_buf(xscale_host.get_element_space_size_in_bytes()); + ck_tile::DeviceMem topk_ids_buf(topk_ids_host.get_element_space_size_in_bytes()); + ck_tile::DeviceMem yscale_buf(yscale_host_dev.get_element_space_size_in_bytes()); + ck_tile::DeviceMem qy_buf(qy_host_dev.get_element_space_size_in_bytes()); + + x_buf.ToDevice(x_host.data()); + xscale_buf.ToDevice(xscale_host.data()); + topk_ids_buf.ToDevice(topk_ids_host.data()); + + std::cout << "[" << data_type << "]" + << " tokens:" << tokens << ", hidden_size:" << hidden_size << ", stride:" << stride + << ", experts:" << experts << ", topk:" << topk << std::flush; + + 
moe_smoothquant_traits traits{data_type}; + + moe_smoothquant_args args{x_buf.GetDeviceBuffer(), + xscale_buf.GetDeviceBuffer(), + topk_ids_buf.GetDeviceBuffer(), + yscale_buf.GetDeviceBuffer(), + qy_buf.GetDeviceBuffer(), + tokens, + hidden_size, + experts, + topk, + stride, + stride}; + + float ave_time = moe_smoothquant( + traits, args, ck_tile::stream_config{nullptr, true, kname ? 1 : 0, warmup, repeat}); + + std::size_t num_byte = + sizeof(XDataType) * tokens * hidden_size + sizeof(XScaleDataType) * topk * hidden_size + + sizeof(YScaleDataType) * topk * tokens + sizeof(QYDataType) * topk * tokens * hidden_size; + + float gb_per_sec = num_byte / 1.E6 / ave_time; + std::cout << ", " << ave_time * 1.E3 << " us, " << gb_per_sec << " GB/s" << std::flush; + + bool pass = true; + + if(do_validation) + { + using YDataType = ComputeDataType; + ck_tile::HostTensor y_host({topk * tokens, hidden_size}, {stride, 1}); + // smooth outlier + { + auto f = [&](auto i_token) { + for(int i_topk = 0; i_topk < topk; i_topk++) + { + auto i_expert = topk_ids_host(i_token, i_topk); + + for(int i_h = 0; i_h < hidden_size; ++i_h) + { + auto v_xscale = ck_tile::type_convert( + xscale_host(i_expert * hidden_size + i_h)); + auto v_x = ck_tile::type_convert(x_host(i_token, i_h)); + // y_host(i_token * topk + i_topk, i_h) = v_x * v_xscale; + y_host(i_topk * tokens + i_token, i_h) = v_x * v_xscale; + } + } + }; + + ck_tile::make_ParallelTensorFunctor(f, tokens)(std::thread::hardware_concurrency()); + } + + // yscale + { + ck_tile::HostTensor y_rowwise_amax_host({topk * tokens}); + + using ReduceAmax = ck_tile::ReduceOp::AbsMax; + ck_tile::reference_reduce( + y_host, y_rowwise_amax_host, ReduceAmax{}); + + auto op = [](const auto& v0) { + return v0 / + ck_tile::type_convert(ck_tile::numeric::max()); + }; + ck_tile::reference_unary_elementwise( + y_rowwise_amax_host, yscale_host_ref, op); + + yscale_buf.FromDevice(yscale_host_dev.mData.data()); + + auto [rtol, atol] = get_elimit(); + pass &= 
ck_tile::check_err(yscale_host_dev, + yscale_host_ref, + std::string("yscale Error: Incorrect results!"), + rtol, + atol); + } + + // rowwise quantization + { + ck_tile::reference_rowwise_quantization2d( + y_host, yscale_host_ref, qy_host_ref); + + qy_buf.FromDevice(qy_host_dev.data()); + auto [rtol, atol] = get_elimit(); + + if(stride == hidden_size) + { + pass = ck_tile::check_err(qy_host_dev, + qy_host_ref, + std::string("qy Error: Incorrect results!"), + rtol, + atol); + } + else + { + for(int i_r = 0; i_r < topk * tokens; i_r++) + { + std::vector qy_host_dev_row(qy_host_dev.begin() + i_r * stride, + qy_host_dev.begin() + i_r * stride + + hidden_size); + std::vector qy_host_ref_row(qy_host_ref.begin() + i_r * stride, + qy_host_ref.begin() + i_r * stride + + hidden_size); + pass &= ck_tile::check_err(qy_host_dev_row, + qy_host_ref_row, + std::string("qy[") + std::to_string(i_r) + + std::string("] Error: Incorrect results!"), + rtol, + atol); + } + } + } + + std::cout << ", valid:" << (pass ? "y" : "n") << std::flush << std::endl; + } + + return pass; +} + +int main(int argc, char* argv[]) +{ + auto [result, arg_parser] = create_args(argc, argv); + if(!result) + return -1; + + const std::string data_type = arg_parser.get_str("prec"); + if(data_type == "fp16") + { + return run(arg_parser) ? 0 : -2; + } + else if(data_type == "bf16") + { + return run(arg_parser) ? 0 : -2; + } + + return -3; +} diff --git a/example/ck_tile/14_moe_smoothquant/moe_smoothquant.hpp b/example/ck_tile/14_moe_smoothquant/moe_smoothquant.hpp new file mode 100644 index 000000000..9f9adda90 --- /dev/null +++ b/example/ck_tile/14_moe_smoothquant/moe_smoothquant.hpp @@ -0,0 +1,114 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck_tile/core.hpp" +#include "ck_tile/host/kernel_launch.hpp" +#include "ck_tile/ops/smoothquant.hpp" +#include + +template +struct MoeSmoothquantTypeConfig; + +template <> +struct MoeSmoothquantTypeConfig +{ + using XDataType = ck_tile::half_t; + using XScaleDataType = float; + using YScaleDataType = float; + using QYDataType = ck_tile::int8_t; + using ComputeDataType = float; +}; + +template <> +struct MoeSmoothquantTypeConfig +{ + using XDataType = ck_tile::bf16_t; + using XScaleDataType = float; + using YScaleDataType = float; + using QYDataType = ck_tile::int8_t; + using ComputeDataType = float; +}; + +// runtime args +struct moe_smoothquant_args : public ck_tile::MoeSmoothquantHostArgs +{ +}; + +// this is used to pattern-match internal kernel implementation, not to instantiate kernel +template +struct moe_smoothquant_traits_ +{ + using DataType = ck_tile::remove_cvref_t; + + static constexpr bool is_warp_per_row = ThreadPerBlock_N_ <= warpSize; + static_assert((ThreadPerBlock_M_ * ThreadPerBlock_N_) % warpSize == 0); + static constexpr ck_tile::index_t total_warps = + (ThreadPerBlock_M_ * ThreadPerBlock_N_) / warpSize; + + // num of warps along m + static constexpr ck_tile::index_t BlockWarps_M = []() { + if constexpr(is_warp_per_row) + { + static_assert(warpSize % ThreadPerBlock_N_ == 0); + return total_warps * (warpSize / ThreadPerBlock_N_); + } + else + { + // static_assert(warpSize % ThreadPerBlock_M_ == 0); + return total_warps / (ThreadPerBlock_N_ / warpSize); + } + }(); + + // num of warps along n + static constexpr ck_tile::index_t BlockWarps_N = []() { + if constexpr(is_warp_per_row) + { + static_assert(warpSize % ThreadPerBlock_N_ == 0); + return 1; + } + else + { + static_assert(ThreadPerBlock_N_ % warpSize == 0); + return ThreadPerBlock_N_ / warpSize; + } + }(); + + static constexpr ck_tile::index_t Repeat_M = Repeat_M_; + static constexpr ck_tile::index_t Repeat_N = Repeat_N_; + + static constexpr ck_tile::index_t 
Block_M = Repeat_M_ * ThreadPerBlock_M_; + static constexpr ck_tile::index_t Block_N = Repeat_N_ * ThreadPerBlock_N_ * Vector_N_; + + static constexpr ck_tile::index_t Warp_M = ThreadPerBlock_M_ / BlockWarps_M; + static constexpr ck_tile::index_t Warp_N = ThreadPerBlock_N_ / BlockWarps_N * Vector_N_; + + using BlockTile = ck_tile::sequence; + using BlockWarps = ck_tile::sequence; + using WarpTile = ck_tile::sequence; + using Vector = ck_tile::sequence<1, Vector_N_>; + + using Shape = ck_tile::Generic2dBlockShape; + + static constexpr bool kPadN = kPadN_; + static constexpr bool kTwoPass = kTwoPass_; +}; + +template +float moe_smoothquant_(const ck_tile::stream_config& s, moe_smoothquant_args a); + +// This is the public API, will be generated by script +struct moe_smoothquant_traits +{ + std::string data_type; +}; + +float moe_smoothquant(moe_smoothquant_traits, moe_smoothquant_args, const ck_tile::stream_config&); diff --git a/example/ck_tile/14_moe_smoothquant/script/perf_test.sh b/example/ck_tile/14_moe_smoothquant/script/perf_test.sh new file mode 100755 index 000000000..d1e848b93 --- /dev/null +++ b/example/ck_tile/14_moe_smoothquant/script/perf_test.sh @@ -0,0 +1,37 @@ + +EXE=build/bin/tile_example_moe_smoothquant + +$EXE -t=1 -h=1 -v=1 -prec=bf16 -repeat=1000 +$EXE -t=700 -h=80 -v=1 -prec=bf16 -repeat=1000 +$EXE -t=700 -h=128 -v=1 -prec=bf16 -repeat=1000 +$EXE -t=700 -h=144 -v=1 -prec=bf16 -repeat=1000 +$EXE -t=700 -h=168 -v=1 -prec=bf16 -repeat=1000 +$EXE -t=700 -h=184 -v=1 -prec=bf16 -repeat=1000 +$EXE -t=700 -h=256 -v=1 -prec=bf16 -repeat=1000 +$EXE -t=700 -h=288 -v=1 -prec=bf16 -repeat=1000 +$EXE -t=700 -h=344 -v=1 -prec=bf16 -repeat=1000 +$EXE -t=700 -h=376 -v=1 -prec=bf16 -repeat=1000 +$EXE -t=700 -h=448 -v=1 -prec=bf16 -repeat=1000 +$EXE -t=700 -h=512 -v=1 -prec=bf16 -repeat=1000 +$EXE -t=700 -h=924 -v=1 -prec=bf16 -repeat=1000 +$EXE -t=700 -h=1024 -v=1 -prec=bf16 -repeat=1000 +$EXE -t=700 -h=1078 -v=1 -prec=bf16 -repeat=1000 +$EXE -t=700 -h=1996 -v=1 
-prec=bf16 -repeat=1000 +$EXE -t=700 -h=4080 -v=1 -prec=bf16 -repeat=1000 + +$EXE -t=700 -h=80 -v=1 -prec=fp16 -repeat=1000 +$EXE -t=700 -h=128 -v=1 -prec=fp16 -repeat=1000 +$EXE -t=700 -h=144 -v=1 -prec=fp16 -repeat=1000 +$EXE -t=700 -h=168 -v=1 -prec=fp16 -repeat=1000 +$EXE -t=700 -h=184 -v=1 -prec=fp16 -repeat=1000 +$EXE -t=700 -h=256 -v=1 -prec=fp16 -repeat=1000 +$EXE -t=700 -h=288 -v=1 -prec=fp16 -repeat=1000 +$EXE -t=700 -h=344 -v=1 -prec=fp16 -repeat=1000 +$EXE -t=700 -h=376 -v=1 -prec=fp16 -repeat=1000 +$EXE -t=700 -h=448 -v=1 -prec=fp16 -repeat=1000 +$EXE -t=700 -h=512 -v=1 -prec=fp16 -repeat=1000 +$EXE -t=700 -h=924 -v=1 -prec=fp16 -repeat=1000 +$EXE -t=700 -h=1024 -v=1 -prec=fp16 -repeat=1000 +$EXE -t=700 -h=1078 -v=1 -prec=fp16 -repeat=1000 +$EXE -t=700 -h=1996 -v=1 -prec=fp16 -repeat=1000 +$EXE -t=700 -h=4080 -v=1 -prec=fp16 -repeat=1000 \ No newline at end of file diff --git a/example/ck_tile/14_moe_smoothquant/script/smoke_test.sh b/example/ck_tile/14_moe_smoothquant/script/smoke_test.sh new file mode 100755 index 000000000..3bb62d37b --- /dev/null +++ b/example/ck_tile/14_moe_smoothquant/script/smoke_test.sh @@ -0,0 +1,30 @@ +#!/bin/sh +EXE=build/bin/tile_example_moe_smoothquant + +for pr_i in "fp16" "bf16" ; do +$EXE -prec=$pr_i -t=99 -h=13 +$EXE -prec=$pr_i -t=17 -h=16 +$EXE -prec=$pr_i -t=1 -h=100 +$EXE -prec=$pr_i -t=4 -h=128 +$EXE -prec=$pr_i -t=80 -h=127 +$EXE -prec=$pr_i -t=22 -h=255 -stride=256 +$EXE -prec=$pr_i -t=7 -h=599 +$EXE -prec=$pr_i -t=19 -h=512 +$EXE -prec=$pr_i -t=33 -h=313 -stride=1000 +$EXE -prec=$pr_i -t=11 -h=510 +$EXE -prec=$pr_i -t=171 -h=676 -stride=818 +$EXE -prec=$pr_i -t=91 -h=636 +$EXE -prec=$pr_i -t=12 -h=768 -stride=800 +$EXE -prec=$pr_i -t=100 -h=766 -stride=812 +$EXE -prec=$pr_i -t=31 -h=1024 +$EXE -prec=$pr_i -t=64 -h=1000 -stride=1004 +$EXE -prec=$pr_i -t=8 -h=1501 +$EXE -prec=$pr_i -t=3 -h=1826 +$EXE -prec=$pr_i -t=5 -h=2040 +$EXE -prec=$pr_i -t=7 -h=2734 +$EXE -prec=$pr_i -t=1 -h=3182 +$EXE -prec=$pr_i -t=9 
-h=4096 +$EXE -prec=$pr_i -t=3 -h=8192 +$EXE -prec=$pr_i -t=1 -h=10547 +$EXE -prec=$pr_i -t=3 -h=17134 +done diff --git a/example/ck_tile/CMakeLists.txt b/example/ck_tile/CMakeLists.txt index 15db0f46c..b6a44f76b 100644 --- a/example/ck_tile/CMakeLists.txt +++ b/example/ck_tile/CMakeLists.txt @@ -13,3 +13,4 @@ add_subdirectory(10_rmsnorm2d) add_subdirectory(11_add_rmsnorm2d_rdquant) add_subdirectory(12_smoothquant) add_subdirectory(13_moe_sorting) +add_subdirectory(14_moe_smoothquant) diff --git a/include/ck_tile/core/config.hpp b/include/ck_tile/core/config.hpp index 604c9551f..a15d2c040 100644 --- a/include/ck_tile/core/config.hpp +++ b/include/ck_tile/core/config.hpp @@ -64,6 +64,7 @@ #define CK_TILE_FLOAT_TO_BFLOAT16_TRUNCATE_WITH_NAN 1 #define CK_TILE_FLOAT_TO_BFLOAT16_TRUNCATE 2 #define CK_TILE_FLOAT_TO_BFLOAT16_STANDARD_ASM 3 +#define CK_TILE_FLOAT_TO_BFLOAT16_RTA_ASM 4 #ifndef CK_TILE_FLOAT_TO_BFLOAT16_DEFAULT #define CK_TILE_FLOAT_TO_BFLOAT16_DEFAULT CK_TILE_FLOAT_TO_BFLOAT16_TRUNCATE @@ -225,3 +226,7 @@ #ifndef CK_TILE_WORKAROUND_SWDEV_383542 #define CK_TILE_WORKAROUND_SWDEV_383542 1 #endif + +#ifndef CK_TILE_REFERENCE_MOE_SORTING_MOCK_ID +#define CK_TILE_REFERENCE_MOE_SORTING_MOCK_ID 1 +#endif diff --git a/include/ck_tile/core/numeric/bfloat16.hpp b/include/ck_tile/core/numeric/bfloat16.hpp index 5f4b64466..499ba80a8 100644 --- a/include/ck_tile/core/numeric/bfloat16.hpp +++ b/include/ck_tile/core/numeric/bfloat16.hpp @@ -18,6 +18,7 @@ enum class bf16_rounding_mode truncate_with_nan, truncate, standard_asm, + rta_asm, // round to nearest away }; template (((token_id_)&0x00ffffff) | (((topk_id_)&0xff) << 24)) + template CK_TILE_HOST void reference_moe_sorting(const HostTensor& topk_ids, const HostTensor& weights, @@ -20,8 +23,14 @@ CK_TILE_HOST void reference_moe_sorting(const HostTensor& topk_ids, { const index_t num_token = topk_ids.mDesc.get_lengths()[0]; const index_t topk = topk_ids.mDesc.get_lengths()[1]; - std::vector> expert_tokens(experts, - 
std::vector(unit_size, num_token)); + // allocate a temp buffer, and fill the value with [number_token|topk] + std::vector> expert_tokens( + experts, +#if CK_TILE_REFERENCE_MOE_SORTING_MOCK_ID + std::vector(unit_size, MOE_SORTING_MOCK_ID(num_token, topk))); +#else + std::vector(unit_size, num_token)); +#endif std::vector> expert_token_weights( experts, std::vector(unit_size, 0)); std::vector expert_slices(experts, 1); @@ -42,12 +51,19 @@ CK_TILE_HOST void reference_moe_sorting(const HostTensor& topk_ids, expert_token_weights[e].resize(new_size); for(index_t i = (expert_slices[e] - 1) * unit_size; i < new_size; i++) { - expert_tokens[e][i] = num_token; +#if CK_TILE_REFERENCE_MOE_SORTING_MOCK_ID + expert_tokens[e][i] = MOE_SORTING_MOCK_ID(num_token, topk); +#else + expert_tokens[e][i] = num_token; +#endif expert_token_weights[e][i] = 0; } } - - expert_tokens[e][idx] = t; +#if CK_TILE_REFERENCE_MOE_SORTING_MOCK_ID + expert_tokens[e][idx] = MOE_SORTING_MOCK_ID(t, k); +#else + expert_tokens[e][idx] = t; +#endif expert_token_weights[e][idx] = w; expert_slice_idxs[e]++; } @@ -75,4 +91,7 @@ CK_TILE_HOST void reference_moe_sorting(const HostTensor& topk_ids, unit_cnt *= unit_size; return; } + +#undef MOE_SORTING_MOCK_ID + } // namespace ck_tile diff --git a/include/ck_tile/ops/fused_moe/kernel/moe_sorting_kernel.hpp b/include/ck_tile/ops/fused_moe/kernel/moe_sorting_kernel.hpp index 1c6acec70..d9e28ceb5 100644 --- a/include/ck_tile/ops/fused_moe/kernel/moe_sorting_kernel.hpp +++ b/include/ck_tile/ops/fused_moe/kernel/moe_sorting_kernel.hpp @@ -12,20 +12,77 @@ namespace ck_tile { +#define MOE_SORTING_MOCK_ID(token_id_, topk_id_) \ + static_cast(((token_id_)&0x00ffffff) | (((topk_id_)&0xff) << 24)) + +// clang-format off +// [indexing implementation-1] +// using M_a as constexpr block_size to partition all tokens into different slices +// each slice map to one expert, and one expert can have multiple slices +// e.g. 
num_experts = 6, topk=3, M_a = 4, input_tokens = 5 +// before sort, topk_ids is : [[0, 3, 5], [2, 3, 5], [1, 3, 5], [1, 2, 3], [1, 3, 5]] +// tok-0 tok-1 tok-2 tok-3 tok-4 +// topk_weight is : [[a, b, c], [d, e, f], [g, h, i], [j, k, l], [m, n, o]] (some float number) +// +// token_id_per_expert is : [[0], [2, 3, 4], [1, 3], [0, 1, 2, 3, 4], [], [0, 1, 2, 5]] +// (only for reference) exp-0 exp-1 exp-2 exp-3 exp-4 exp-5 +// weight_id_per_expert is: [[a], [g, j, m], [d, k], [b, e, h, l, n], [], [c, f, i, o]] +// +// max_num_tokens_padded : topk * input_tokens + num_experts * (M_a - 1) +// * this could be larger than actual, since actual tokens are on GPU +// +// sorted_token_ids_ptr : [0, 6, 6, 6, 2, 3, 4, 6, 1, 3, 6, 6, 0, 1, 2, 3, 4, 6, 6, 6, 6, 6, 6, 6, 0, 1, 2, 5] +// |- exp-0 -|- exp-1 -|- exp-2 -|- exp-3 -|- exp-4 -|- exp-5 -| +// sorted_weight_ptr : [a, *, *, *, g, j, m, *, d, k, *, *, b, e, h, l, n, *, *, *, *, *, *, *, c, f, i, o] +// +// * length is max_num_tokens_padded, actual size is num_tokens_post_padded_ptr +// +// * Note on token_id_per_expert/sorted_token_ids_ptr data: +// currently we do not have topk information from the data of token_id_per_expert/sorted_token_ids_ptr. +// In some cases(like smooth-quant), we need topk information to indexing into tokens quant from +// different expert smooth quant. 
So we modify the number stored inside token_id_per_expert/sorted_token_ids_ptr +// +// 32bit 0........23 24.....31 bit +// (data) -> (token_id | topk_id) +// low 24 bit is for token id, top 8 bit is for topk id +// +// the input after smooth-quant is [topk, token, hidden_dim], originally it is [token, hidden_dim] +// the input scale for token is [topk, token, 1], the smooth-quant scale for first gemm is [expert, interm_dim] +// +// sorted_expert_ids_ptr : [0, 1, 2, 3, 3, 4, 5] +// * length is (max_num_tokens_padded + block_size - 1) / block_size +// +// num_tokens_post_padded_ptr : [28] +// num_sorted_tiles_ptr : [7] +// +// * different from vLLM +// 1) token_id stored in sorted_token_ids_ptr is actual token_id, not token_id*top_K expanded id +// 2) need sorted_weight_ptr +// 3) use num_sorted_tiles_ptr, already divided by M_a +// +// * below used for indexing +// 1) sorted_token_ids_ptr [max_num_tokens_padded] +// 2) sorted_weight_ptr +// 3) sorted_expert_ids_ptr +// 4) num_tokens_post_padded_ptr/num_sorted_tiles_ptr (select one) +// +// max_num_tokens_padded: topk_ids.numel() + num_experts * (block_size - 1) struct MoeSortingHostArgs { - const void* p_topk_ids; - const void* p_weights; + const void* p_topk_ids; // [token, topk] + const void* p_weights; // [token, topk] void* p_sorted_token_ids; void* p_sorted_weights; void* p_sorted_expert_ids; void* p_total_tokens_post_pad; + // we fused the setzero of output of fused-moe buffer + // set this pointer to nullptr will skip this operation void* p_moe_buf; index_t tokens; - index_t unit_size; + index_t unit_size; // this is the M_a of fused-moe kernel index_t num_experts; index_t topk; - index_t moe_buf_bytes; + index_t moe_buf_bytes; // byte size of p_moe_buf }; template @@ -183,8 +240,14 @@ struct MoeSortingKernel index_t expert_id = topk_id[i]; index_t rank_post_pad = tokens_cnts[calc_index(num_experts, tid, expert_id)] + cumsum[expert_id]; +#if CK_TILE_REFERENCE_MOE_SORTING_MOCK_ID + uint32_t curr_token_id, 
curr_topk_id; + topk_mdiv.divmod(i, curr_token_id, curr_topk_id); + p_sorted_token_ids[rank_post_pad] = MOE_SORTING_MOCK_ID(curr_token_id, curr_topk_id); +#else p_sorted_token_ids[rank_post_pad] = topk_mdiv.div(i); - p_sorted_weights[rank_post_pad] = weights[i]; +#endif + p_sorted_weights[rank_post_pad] = weights[i]; ++tokens_cnts[calc_index(num_experts, tid, expert_id)]; } @@ -195,8 +258,13 @@ struct MoeSortingKernel cumsum[tid] + tokens_cnts[calc_index(num_experts, blockDim.x, tid)]; while(expert_offset < cumsum[tid + 1]) { +#if CK_TILE_REFERENCE_MOE_SORTING_MOCK_ID + p_sorted_token_ids[expert_offset] = + MOE_SORTING_MOCK_ID(prefill_token, topk_mdiv.divisor); +#else p_sorted_token_ids[expert_offset] = prefill_token; - p_sorted_weights[expert_offset] = static_cast(0.0); +#endif + p_sorted_weights[expert_offset] = static_cast(0.0); expert_offset++; } } @@ -229,4 +297,7 @@ struct MoeSortingKernel smem); } }; + +#undef MOE_SORTING_MOCK_ID + } // namespace ck_tile diff --git a/include/ck_tile/ops/smoothquant.hpp b/include/ck_tile/ops/smoothquant.hpp index c9e459765..24a59b45b 100644 --- a/include/ck_tile/ops/smoothquant.hpp +++ b/include/ck_tile/ops/smoothquant.hpp @@ -3,6 +3,7 @@ #pragma once +#include "ck_tile/ops/smoothquant/kernel/moe_smoothquant_kernel.hpp" #include "ck_tile/ops/smoothquant/kernel/smoothquant_kernel.hpp" #include "ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_default_policy.hpp" #include "ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_one_pass.hpp" diff --git a/include/ck_tile/ops/smoothquant/kernel/moe_smoothquant_kernel.hpp b/include/ck_tile/ops/smoothquant/kernel/moe_smoothquant_kernel.hpp new file mode 100644 index 000000000..1bece521f --- /dev/null +++ b/include/ck_tile/ops/smoothquant/kernel/moe_smoothquant_kernel.hpp @@ -0,0 +1,205 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck_tile/core.hpp" +#include "ck_tile/ops/common.hpp" + +namespace ck_tile { + +// host side args +struct MoeSmoothquantHostArgs +{ + const void* p_x; // [tokens ,hidden_size], input, fp16/bf16 + const void* p_xscale; // [experts, hidden_size], input, columnwise scale, fp32 + const void* p_topk_ids; // [tokens, topk] + + void* p_yscale; // [topk * tokens, 1], output, rowwise quant scale + void* p_qy; // [topk * tokens, hidden_size], output + + index_t tokens; + index_t hidden_size; + index_t experts; + index_t topk; + index_t x_stride; // input x row stride + index_t y_stride; // output y stride(stride for topk) +}; + +// TODO: Extract some type to wrapper class +template +struct MoeSmoothquant +{ + using Pipeline = remove_cvref_t; + using Problem = typename Pipeline::Problem; + + using XDataType = remove_cvref_t; + using XScaleDataType = remove_cvref_t; + using ComputeDataType = remove_cvref_t; + using YScaleDataType = remove_cvref_t; + using QYDataType = remove_cvref_t; + + static constexpr index_t Block_M = Problem::BlockShape::Block_M; + static constexpr index_t Block_N = Problem::BlockShape::Block_N; + static constexpr bool kPadM = false; // always no need to pad along M + static constexpr bool kPadN = Problem::kPadN; + static constexpr bool kTwoPass = Problem::kTwoPass; + + static constexpr index_t ThreadPerWarp_N = Problem::BlockShape::ThreadPerWarp_N; + static constexpr index_t Vector_N = Problem::BlockShape::Vector_N; + static constexpr index_t Repeat_N = Problem::BlockShape::Repeat_N; + + static constexpr auto I0 = number<0>{}; + static constexpr auto I1 = number<1>{}; + + static_assert(Problem::BlockShape::Repeat_M == 1); + + struct Kargs + { + const void* p_x; // [tokens ,hidden_size], input, fp16/bf16 + const void* p_xscale; // [experts, hidden_size], input, columnwise scale, fp32 + const void* p_topk_ids; // [tokens, topk] + + void* p_yscale; // [topk, tokens, 1], output, rowwise quant scale + void* p_qy; // [topk, tokens, 
hidden_size], output + + index_t tokens; + index_t hidden_size; + index_t experts; + index_t topk; + index_t x_stride; // input x row stride + index_t y_stride; // output y stride(stride for topk) + }; + using Hargs = MoeSmoothquantHostArgs; + + CK_TILE_HOST static constexpr Kargs MakeKargs(const Hargs& hargs) + { + return Kargs{hargs.p_x, + hargs.p_xscale, + hargs.p_topk_ids, + hargs.p_yscale, + hargs.p_qy, + hargs.tokens, + hargs.hidden_size, + hargs.experts, + hargs.topk, + hargs.x_stride, + hargs.y_stride}; + } + + CK_TILE_HOST static constexpr auto GridSize(const Hargs& hargs) + { + return dim3(hargs.topk, integer_divide_ceil(hargs.tokens, Block_M), 1); + } + + CK_TILE_HOST static constexpr auto BlockSize() { return Problem::BlockShape::BlockSize; } + + // clang-format off + template struct t2s; + template <> struct t2s { static constexpr const char * name = "fp32"; }; + template <> struct t2s { static constexpr const char * name = "fp16"; }; + template <> struct t2s { static constexpr const char * name = "bf16"; }; + template <> struct t2s { static constexpr const char * name = "fp8"; }; + template <> struct t2s { static constexpr const char * name = "bf8"; }; + // clang-format on + + // in byte + CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize() { return Pipeline::GetSmemSize(); } + + CK_TILE_HOST static std::string GetName() + { + // clang-format off + using S_ = typename Problem::BlockShape; + auto surfix = [&] () { + std::string n; + if (kPadN) n += "_pn"; + if (kTwoPass) n += "_2p"; + return n; }(); + + #define _SS_ std::string + #define _TS_ std::to_string + return _SS_("moe_smoothquant_") + _SS_(t2s::name) + "_" + + _TS_(S_::Block_M) + "x" + _TS_(S_::Block_N) + "_" + _TS_(S_::WarpPerBlock_M) + "x" + _TS_(S_::WarpPerBlock_N) + "_" + + _TS_(S_::Warp_M) + "x" + _TS_(S_::Warp_N) + "_" + _TS_(S_::Vector_M) + "x" + _TS_(S_::Vector_N) + "_" + + _SS_(Pipeline::name) + surfix; + #undef _SS_ + #undef _TS_ + // clang-format on + } + + CK_TILE_DEVICE void 
operator()(Kargs kargs) const + { + const index_t i_topk = blockIdx.x; + const index_t i_token = blockIdx.y * Block_M; + const index_t i_token_in_thrd = + __builtin_amdgcn_readfirstlane(threadIdx.x / Problem::BlockShape::ThreadPerBlock_N); + + const index_t i_expert = reinterpret_cast( + kargs.p_topk_ids)[(i_token + i_token_in_thrd) * kargs.topk + i_topk]; + + // [tokens ,hidden_size] + const auto x_window = [&]() { + const auto tmp_ = make_naive_tensor_view( + static_cast(kargs.p_x), + make_tuple(kargs.tokens, kargs.hidden_size), + make_tuple(kargs.x_stride, 1), + number{}, + number<1>{}); + + const auto tmp2_ = pad_tensor_view( + tmp_, make_tuple(number{}, number{}), sequence{}); + return make_tile_window( + tmp2_, make_tuple(number{}, number{}), {i_token, 0}); + }(); + + // [experts, hidden_size], + const auto xscale_window = [&]() { + const auto tmp_ = make_naive_tensor_view( + static_cast(kargs.p_xscale) + i_expert * kargs.hidden_size, + make_tuple(kargs.hidden_size), + make_tuple(1), + number{}, + number<1>{}); + + const auto tmp2_ = + pad_tensor_view(tmp_, make_tuple(number{}), sequence{}); + + return make_tile_window(tmp2_, make_tuple(number{}), {0}); + }(); + + // [topk, tokens] + auto yscale_window = [&]() { + const auto tmp_ = make_naive_tensor_view( + static_cast(kargs.p_yscale) + i_topk * kargs.tokens, + make_tuple(kargs.tokens), + make_tuple(1), + number<1>{}); + + const auto tmp2_ = + pad_tensor_view(tmp_, make_tuple(number{}), sequence{}); + + return make_tile_window(tmp2_, make_tuple(number{}), {i_token}); + }(); + + // [topk, tokens, hidden_size] + auto qy_window = [&]() { + auto tmp_ = make_naive_tensor_view( + static_cast(kargs.p_qy) + i_topk * kargs.tokens * kargs.y_stride, + make_tuple(kargs.tokens, kargs.hidden_size), + make_tuple(kargs.y_stride, 1), + number{}, + number<1>{}); + + auto tmp2_ = pad_tensor_view( + tmp_, make_tuple(number{}, number{}), sequence{}); + return make_tile_window( + tmp2_, make_tuple(number{}, number{}), {i_token, 
0}); + }(); + + __shared__ char smem[GetSmemSize()]; + + Pipeline{}(x_window, xscale_window, yscale_window, qy_window, kargs.hidden_size, smem); + } +}; + +} // namespace ck_tile -- GitLab From c2bcbb1379c31a068234216a585027a91be57fee Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sun, 24 Nov 2024 21:41:52 -0800 Subject: [PATCH 076/153] Bump rocm-docs-core from 1.8.5 to 1.9.0 in /docs/sphinx (#1691) Bumps [rocm-docs-core](https://github.com/ROCm/rocm-docs-core) from 1.8.5 to 1.9.0. - [Release notes](https://github.com/ROCm/rocm-docs-core/releases) - [Changelog](https://github.com/ROCm/rocm-docs-core/blob/v1.9.0/CHANGELOG.md) - [Commits](https://github.com/ROCm/rocm-docs-core/compare/v1.8.5...v1.9.0) --- updated-dependencies: - dependency-name: rocm-docs-core dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- docs/sphinx/requirements.in | 2 +- docs/sphinx/requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/sphinx/requirements.in b/docs/sphinx/requirements.in index 3a2e266ef..5bec504a0 100644 --- a/docs/sphinx/requirements.in +++ b/docs/sphinx/requirements.in @@ -1,2 +1,2 @@ -rocm-docs-core==1.8.5 +rocm-docs-core==1.9.0 sphinxcontrib-bibtex==2.6.3 diff --git a/docs/sphinx/requirements.txt b/docs/sphinx/requirements.txt index b65d2391f..8881c0e74 100644 --- a/docs/sphinx/requirements.txt +++ b/docs/sphinx/requirements.txt @@ -103,7 +103,7 @@ requests==2.32.3 # via # pygithub # sphinx -rocm-docs-core==1.8.5 +rocm-docs-core==1.9.0 # via -r requirements.in six==1.16.0 # via pybtex -- GitLab From 645fe812f65db86a9eaca7ae00e0004c1634bc0a Mon Sep 17 00:00:00 2001 From: Po Yen Chen Date: Mon, 25 Nov 2024 15:30:35 +0800 Subject: [PATCH 077/153] [CK_TILE] Fix fMHA fwd MakeKargs() compilation errors (#1689) * Fix mis-matched tuple<> 
elem types * Rename MakeKargs() as MakeKargsImpl() --------- Co-authored-by: Qianfeng --- example/ck_tile/01_fmha/fmha_bwd.hpp | 208 +++++----- example/ck_tile/01_fmha/fmha_fwd.hpp | 156 ++++---- .../ops/fmha/kernel/fmha_bwd_kernel.hpp | 232 +++++------ .../ops/fmha/kernel/fmha_fwd_kernel.hpp | 370 +++++++++--------- 4 files changed, 484 insertions(+), 482 deletions(-) diff --git a/example/ck_tile/01_fmha/fmha_bwd.hpp b/example/ck_tile/01_fmha/fmha_bwd.hpp index 3b21a3257..722ef15a2 100644 --- a/example/ck_tile/01_fmha/fmha_bwd.hpp +++ b/example/ck_tile/01_fmha/fmha_bwd.hpp @@ -150,113 +150,113 @@ auto fmha_bwd_dq_dk_dv_create_kargs_and_grids(fmha_bwd_args args) // create group mode kernel arguments if constexpr(FmhaBwdDQDKDVKernel::kIsGroupMode) { - return FmhaBwdDQDKDVKernel::MakeKargs(args.q_ptr, - args.k_ptr, - args.v_ptr, - args.bias_ptr, - args.lse_ptr, - args.do_ptr, - args.d_ptr, - args.rand_val_ptr, - args.dk_ptr, - args.dv_ptr, - args.dbias_ptr, - args.dq_acc_ptr, - args.seqstart_q_ptr, - args.seqstart_k_ptr, - args.seqlen_k_ptr, - args.hdim_q, - args.hdim_v, - args.nhead_q, - args.nhead_q / args.nhead_k, - args.scale, - args.stride_q, - args.stride_k, - args.stride_v, - args.stride_bias, - args.stride_randval, - args.stride_do, - args.stride_dq_acc, - args.stride_dk, - args.stride_dv, - args.stride_dbias, - args.nhead_stride_q, - args.nhead_stride_k, - args.nhead_stride_v, - args.nhead_stride_bias, - args.nhead_stride_randval, - args.nhead_stride_do, - args.nhead_stride_lsed, - args.nhead_stride_dq_acc, - args.nhead_stride_dk, - args.nhead_stride_dv, - args.nhead_stride_dbias, - args.split_stride_dq_acc, - args.window_size_left, - args.window_size_right, - args.mask_type, - args.p_drop, - args.drop_seed_offset); + return FmhaBwdDQDKDVKernel::MakeKargsImpl(args.q_ptr, + args.k_ptr, + args.v_ptr, + args.bias_ptr, + args.lse_ptr, + args.do_ptr, + args.d_ptr, + args.rand_val_ptr, + args.dk_ptr, + args.dv_ptr, + args.dbias_ptr, + args.dq_acc_ptr, + 
args.seqstart_q_ptr, + args.seqstart_k_ptr, + args.seqlen_k_ptr, + args.hdim_q, + args.hdim_v, + args.nhead_q, + args.nhead_q / args.nhead_k, + args.scale, + args.stride_q, + args.stride_k, + args.stride_v, + args.stride_bias, + args.stride_randval, + args.stride_do, + args.stride_dq_acc, + args.stride_dk, + args.stride_dv, + args.stride_dbias, + args.nhead_stride_q, + args.nhead_stride_k, + args.nhead_stride_v, + args.nhead_stride_bias, + args.nhead_stride_randval, + args.nhead_stride_do, + args.nhead_stride_lsed, + args.nhead_stride_dq_acc, + args.nhead_stride_dk, + args.nhead_stride_dv, + args.nhead_stride_dbias, + args.split_stride_dq_acc, + args.window_size_left, + args.window_size_right, + args.mask_type, + args.p_drop, + args.drop_seed_offset); } else { // create batch mode kernel arguments - return FmhaBwdDQDKDVKernel::MakeKargs(args.q_ptr, - args.k_ptr, - args.v_ptr, - args.bias_ptr, - args.lse_ptr, - args.do_ptr, - args.d_ptr, - args.rand_val_ptr, - args.dk_ptr, - args.dv_ptr, - args.dbias_ptr, - args.dq_acc_ptr, - args.seqlen_q, - args.seqlen_k, - args.hdim_q, - args.hdim_v, - args.nhead_q, - args.nhead_q / args.nhead_k, - args.scale, - args.stride_q, - args.stride_k, - args.stride_v, - args.stride_bias, - args.stride_randval, - args.stride_do, - args.stride_dq_acc, - args.stride_dk, - args.stride_dv, - args.stride_dbias, - args.nhead_stride_q, - args.nhead_stride_k, - args.nhead_stride_v, - args.nhead_stride_bias, - args.nhead_stride_randval, - args.nhead_stride_do, - args.nhead_stride_lsed, - args.nhead_stride_dq_acc, - args.nhead_stride_dk, - args.nhead_stride_dv, - args.nhead_stride_dbias, - args.batch_stride_q, - args.batch_stride_k, - args.batch_stride_v, - args.batch_stride_bias, - args.batch_stride_randval, - args.batch_stride_do, - args.batch_stride_lsed, - args.batch_stride_dq_acc, - args.batch_stride_dk, - args.batch_stride_dv, - args.batch_stride_dbias, - args.split_stride_dq_acc, - args.window_size_left, - args.window_size_right, - 
args.mask_type, - args.p_drop, - args.drop_seed_offset); + return FmhaBwdDQDKDVKernel::MakeKargsImpl(args.q_ptr, + args.k_ptr, + args.v_ptr, + args.bias_ptr, + args.lse_ptr, + args.do_ptr, + args.d_ptr, + args.rand_val_ptr, + args.dk_ptr, + args.dv_ptr, + args.dbias_ptr, + args.dq_acc_ptr, + args.seqlen_q, + args.seqlen_k, + args.hdim_q, + args.hdim_v, + args.nhead_q, + args.nhead_q / args.nhead_k, + args.scale, + args.stride_q, + args.stride_k, + args.stride_v, + args.stride_bias, + args.stride_randval, + args.stride_do, + args.stride_dq_acc, + args.stride_dk, + args.stride_dv, + args.stride_dbias, + args.nhead_stride_q, + args.nhead_stride_k, + args.nhead_stride_v, + args.nhead_stride_bias, + args.nhead_stride_randval, + args.nhead_stride_do, + args.nhead_stride_lsed, + args.nhead_stride_dq_acc, + args.nhead_stride_dk, + args.nhead_stride_dv, + args.nhead_stride_dbias, + args.batch_stride_q, + args.batch_stride_k, + args.batch_stride_v, + args.batch_stride_bias, + args.batch_stride_randval, + args.batch_stride_do, + args.batch_stride_lsed, + args.batch_stride_dq_acc, + args.batch_stride_dk, + args.batch_stride_dv, + args.batch_stride_dbias, + args.split_stride_dq_acc, + args.window_size_left, + args.window_size_right, + args.mask_type, + args.p_drop, + args.drop_seed_offset); } }(); diff --git a/example/ck_tile/01_fmha/fmha_fwd.hpp b/example/ck_tile/01_fmha/fmha_fwd.hpp index 41edac67b..704453baa 100644 --- a/example/ck_tile/01_fmha/fmha_fwd.hpp +++ b/example/ck_tile/01_fmha/fmha_fwd.hpp @@ -281,87 +281,87 @@ auto fmha_fwd_create_kargs_and_grids(fmha_fwd_args args) // create group mode kernel arguments if constexpr(FmhaKernel::kIsGroupMode) { - return FmhaKernel::MakeKargs(args.q_ptr, - args.k_ptr, - args.v_ptr, - args.bias_ptr, - args.rand_val_ptr, - args.lse_ptr, - args.o_ptr, - args.seqstart_q_ptr, - args.seqstart_k_ptr, - args.seqlen_k_ptr, - args.hdim_q, - args.hdim_v, - args.nhead_q, - args.nhead_q / args.nhead_k, - args.scale_s, - args.scale_p, - 
args.scale_o, - args.stride_q, - args.stride_k, - args.stride_v, - args.stride_bias, - args.stride_randval, - args.stride_o, - args.nhead_stride_q, - args.nhead_stride_k, - args.nhead_stride_v, - args.nhead_stride_bias, - args.nhead_stride_randval, - args.nhead_stride_lse, - args.nhead_stride_o, - args.window_size_left, - args.window_size_right, - args.mask_type, - args.p_drop, - args.s_randval, - args.drop_seed_offset); + return FmhaKernel::MakeKargsImpl(args.q_ptr, + args.k_ptr, + args.v_ptr, + args.bias_ptr, + args.rand_val_ptr, + args.lse_ptr, + args.o_ptr, + args.seqstart_q_ptr, + args.seqstart_k_ptr, + args.seqlen_k_ptr, + args.hdim_q, + args.hdim_v, + args.nhead_q, + args.nhead_q / args.nhead_k, + args.scale_s, + args.scale_p, + args.scale_o, + args.stride_q, + args.stride_k, + args.stride_v, + args.stride_bias, + args.stride_randval, + args.stride_o, + args.nhead_stride_q, + args.nhead_stride_k, + args.nhead_stride_v, + args.nhead_stride_bias, + args.nhead_stride_randval, + args.nhead_stride_lse, + args.nhead_stride_o, + args.window_size_left, + args.window_size_right, + args.mask_type, + args.p_drop, + args.s_randval, + args.drop_seed_offset); } else { // create batch mode kernel arguments - return FmhaKernel::MakeKargs(args.q_ptr, - args.k_ptr, - args.v_ptr, - args.bias_ptr, - args.rand_val_ptr, - args.lse_ptr, - args.o_ptr, - args.seqlen_q, - args.seqlen_k, - args.hdim_q, - args.hdim_v, - args.nhead_q, - args.nhead_q / args.nhead_k, - args.scale_s, - args.scale_p, - args.scale_o, - args.stride_q, - args.stride_k, - args.stride_v, - args.stride_bias, - args.stride_randval, - args.stride_o, - args.nhead_stride_q, - args.nhead_stride_k, - args.nhead_stride_v, - args.nhead_stride_bias, - args.nhead_stride_randval, - args.nhead_stride_lse, - args.nhead_stride_o, - args.batch_stride_q, - args.batch_stride_k, - args.batch_stride_v, - args.batch_stride_bias, - args.batch_stride_randval, - args.batch_stride_lse, - args.batch_stride_o, - args.window_size_left, - 
args.window_size_right, - args.mask_type, - args.p_drop, - args.s_randval, - args.drop_seed_offset); + return FmhaKernel::MakeKargsImpl(args.q_ptr, + args.k_ptr, + args.v_ptr, + args.bias_ptr, + args.rand_val_ptr, + args.lse_ptr, + args.o_ptr, + args.seqlen_q, + args.seqlen_k, + args.hdim_q, + args.hdim_v, + args.nhead_q, + args.nhead_q / args.nhead_k, + args.scale_s, + args.scale_p, + args.scale_o, + args.stride_q, + args.stride_k, + args.stride_v, + args.stride_bias, + args.stride_randval, + args.stride_o, + args.nhead_stride_q, + args.nhead_stride_k, + args.nhead_stride_v, + args.nhead_stride_bias, + args.nhead_stride_randval, + args.nhead_stride_lse, + args.nhead_stride_o, + args.batch_stride_q, + args.batch_stride_k, + args.batch_stride_v, + args.batch_stride_bias, + args.batch_stride_randval, + args.batch_stride_lse, + args.batch_stride_o, + args.window_size_left, + args.window_size_right, + args.mask_type, + args.p_drop, + args.s_randval, + args.drop_seed_offset); } }(); diff --git a/include/ck_tile/ops/fmha/kernel/fmha_bwd_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_bwd_kernel.hpp index ccf15ee60..23174528e 100644 --- a/include/ck_tile/ops/fmha/kernel/fmha_bwd_kernel.hpp +++ b/include/ck_tile/ops/fmha/kernel/fmha_bwd_kernel.hpp @@ -304,64 +304,64 @@ struct FmhaBwdDQDKDVKernel template CK_TILE_HOST static constexpr std::enable_if_t - MakeKargs(const void* q_ptr, - const void* k_ptr, - const void* v_ptr, - const void* bias_ptr, - const void* lse_ptr, - const void* do_ptr, - const void* d_ptr, - void* rand_val_ptr, - void* dk_ptr, - void* dv_ptr, - void* dbias_ptr, - void* dq_acc_ptr, - ck_tile::index_t seqlen_q, - ck_tile::index_t seqlen_k, - ck_tile::index_t hdim_q, - ck_tile::index_t hdim_v, - ck_tile::index_t num_head_q, - ck_tile::index_t nhead_ratio_qk, - float scale, - ck_tile::index_t stride_q, - ck_tile::index_t stride_k, - ck_tile::index_t stride_v, - ck_tile::index_t stride_bias, - ck_tile::index_t stride_randval, - ck_tile::index_t stride_do, 
- ck_tile::index_t stride_dq_acc, - ck_tile::index_t stride_dk, - ck_tile::index_t stride_dv, - ck_tile::index_t stride_dbias, - ck_tile::index_t nhead_stride_q, - ck_tile::index_t nhead_stride_k, - ck_tile::index_t nhead_stride_v, - ck_tile::index_t nhead_stride_bias, - ck_tile::index_t nhead_stride_randval, - ck_tile::index_t nhead_stride_do, - ck_tile::index_t nhead_stride_lsed, - ck_tile::index_t nhead_stride_dq_acc, - ck_tile::index_t nhead_stride_dk, - ck_tile::index_t nhead_stride_dv, - ck_tile::index_t nhead_stride_dbias, - ck_tile::index_t batch_stride_q, - ck_tile::index_t batch_stride_k, - ck_tile::index_t batch_stride_v, - ck_tile::index_t batch_stride_bias, - ck_tile::index_t batch_stride_randval, - ck_tile::index_t batch_stride_do, - ck_tile::index_t batch_stride_lsed, - ck_tile::index_t batch_stride_dq_acc, - ck_tile::index_t batch_stride_dk, - ck_tile::index_t batch_stride_dv, - ck_tile::index_t batch_stride_dbias, - ck_tile::index_t split_stride_dq_acc, - ck_tile::index_t window_size_left, - ck_tile::index_t window_size_right, - ck_tile::index_t mask_type, - float p_drop, - std::variant, std::pair> - drop_seed_offset) + MakeKargsImpl(const void* q_ptr, + const void* k_ptr, + const void* v_ptr, + const void* bias_ptr, + const void* lse_ptr, + const void* do_ptr, + const void* d_ptr, + void* rand_val_ptr, + void* dk_ptr, + void* dv_ptr, + void* dbias_ptr, + void* dq_acc_ptr, + ck_tile::index_t seqlen_q, + ck_tile::index_t seqlen_k, + ck_tile::index_t hdim_q, + ck_tile::index_t hdim_v, + ck_tile::index_t num_head_q, + ck_tile::index_t nhead_ratio_qk, + float scale, + ck_tile::index_t stride_q, + ck_tile::index_t stride_k, + ck_tile::index_t stride_v, + ck_tile::index_t stride_bias, + ck_tile::index_t stride_randval, + ck_tile::index_t stride_do, + ck_tile::index_t stride_dq_acc, + ck_tile::index_t stride_dk, + ck_tile::index_t stride_dv, + ck_tile::index_t stride_dbias, + ck_tile::index_t nhead_stride_q, + ck_tile::index_t nhead_stride_k, + 
ck_tile::index_t nhead_stride_v, + ck_tile::index_t nhead_stride_bias, + ck_tile::index_t nhead_stride_randval, + ck_tile::index_t nhead_stride_do, + ck_tile::index_t nhead_stride_lsed, + ck_tile::index_t nhead_stride_dq_acc, + ck_tile::index_t nhead_stride_dk, + ck_tile::index_t nhead_stride_dv, + ck_tile::index_t nhead_stride_dbias, + ck_tile::index_t batch_stride_q, + ck_tile::index_t batch_stride_k, + ck_tile::index_t batch_stride_v, + ck_tile::index_t batch_stride_bias, + ck_tile::index_t batch_stride_randval, + ck_tile::index_t batch_stride_do, + ck_tile::index_t batch_stride_lsed, + ck_tile::index_t batch_stride_dq_acc, + ck_tile::index_t batch_stride_dk, + ck_tile::index_t batch_stride_dv, + ck_tile::index_t batch_stride_dbias, + ck_tile::index_t split_stride_dq_acc, + ck_tile::index_t window_size_left, + ck_tile::index_t window_size_right, + ck_tile::index_t mask_type, + float p_drop, + std::variant, std::pair> + drop_seed_offset) { Kargs kargs{{q_ptr, k_ptr, @@ -470,7 +470,7 @@ struct FmhaBwdDQDKDVKernel return kargs; } - // std::variant can't take in a list initializer, overload for backward compatibility + // std::variant<> can't take in a list initializer, overload for backward compatibility template CK_TILE_HOST static constexpr std::enable_if_t MakeKargs(const void* q_ptr, @@ -531,7 +531,7 @@ struct FmhaBwdDQDKDVKernel float p_drop, const std::tuple& drop_seed_offset) { - return MakeKargs( + return MakeKargsImpl( q_ptr, k_ptr, v_ptr, @@ -591,7 +591,7 @@ struct FmhaBwdDQDKDVKernel std::make_pair(std::get<0>(drop_seed_offset), std::get<1>(drop_seed_offset))); } - // std::variant can't take in a list initializer, overload for backward compatibility + // std::variant<> can't take in a list initializer, overload for backward compatibility template CK_TILE_HOST static constexpr std::enable_if_t MakeKargs(const void* q_ptr, @@ -650,9 +650,9 @@ struct FmhaBwdDQDKDVKernel ck_tile::index_t window_size_right, ck_tile::index_t mask_type, float p_drop, - const 
std::tuple& drop_seed_offset) + const std::tuple& drop_seed_offset) { - return MakeKargs( + return MakeKargsImpl( q_ptr, k_ptr, v_ptr, @@ -714,54 +714,54 @@ struct FmhaBwdDQDKDVKernel template CK_TILE_HOST static constexpr std::enable_if_t - MakeKargs(const void* q_ptr, - const void* k_ptr, - const void* v_ptr, - const void* bias_ptr, - const void* lse_ptr, - const void* do_ptr, - const void* d_ptr, - void* rand_val_ptr, - void* dk_ptr, - void* dv_ptr, - void* dbias_ptr, - void* dq_acc_ptr, - const void* seqstart_q_ptr, - const void* seqstart_k_ptr, - const void* seqlen_k_ptr, - ck_tile::index_t hdim_q, - ck_tile::index_t hdim_v, - ck_tile::index_t num_head_q, - ck_tile::index_t nhead_ratio_qk, - float scale, - ck_tile::index_t stride_q, - ck_tile::index_t stride_k, - ck_tile::index_t stride_v, - ck_tile::index_t stride_bias, - ck_tile::index_t stride_randval, - ck_tile::index_t stride_do, - ck_tile::index_t stride_dq_acc, - ck_tile::index_t stride_dk, - ck_tile::index_t stride_dv, - ck_tile::index_t stride_dbias, - ck_tile::index_t nhead_stride_q, - ck_tile::index_t nhead_stride_k, - ck_tile::index_t nhead_stride_v, - ck_tile::index_t nhead_stride_bias, - ck_tile::index_t nhead_stride_randval, - ck_tile::index_t nhead_stride_do, - ck_tile::index_t nhead_stride_lsed, - ck_tile::index_t nhead_stride_dq_acc, - ck_tile::index_t nhead_stride_dk, - ck_tile::index_t nhead_stride_dv, - ck_tile::index_t nhead_stride_dbias, - ck_tile::index_t split_stride_dq_acc, - ck_tile::index_t window_size_left, - ck_tile::index_t window_size_right, - ck_tile::index_t mask_type, - float p_drop, - std::variant, std::pair> - drop_seed_offset) + MakeKargsImpl(const void* q_ptr, + const void* k_ptr, + const void* v_ptr, + const void* bias_ptr, + const void* lse_ptr, + const void* do_ptr, + const void* d_ptr, + void* rand_val_ptr, + void* dk_ptr, + void* dv_ptr, + void* dbias_ptr, + void* dq_acc_ptr, + const void* seqstart_q_ptr, + const void* seqstart_k_ptr, + const void* seqlen_k_ptr, + 
ck_tile::index_t hdim_q, + ck_tile::index_t hdim_v, + ck_tile::index_t num_head_q, + ck_tile::index_t nhead_ratio_qk, + float scale, + ck_tile::index_t stride_q, + ck_tile::index_t stride_k, + ck_tile::index_t stride_v, + ck_tile::index_t stride_bias, + ck_tile::index_t stride_randval, + ck_tile::index_t stride_do, + ck_tile::index_t stride_dq_acc, + ck_tile::index_t stride_dk, + ck_tile::index_t stride_dv, + ck_tile::index_t stride_dbias, + ck_tile::index_t nhead_stride_q, + ck_tile::index_t nhead_stride_k, + ck_tile::index_t nhead_stride_v, + ck_tile::index_t nhead_stride_bias, + ck_tile::index_t nhead_stride_randval, + ck_tile::index_t nhead_stride_do, + ck_tile::index_t nhead_stride_lsed, + ck_tile::index_t nhead_stride_dq_acc, + ck_tile::index_t nhead_stride_dk, + ck_tile::index_t nhead_stride_dv, + ck_tile::index_t nhead_stride_dbias, + ck_tile::index_t split_stride_dq_acc, + ck_tile::index_t window_size_left, + ck_tile::index_t window_size_right, + ck_tile::index_t mask_type, + float p_drop, + std::variant, std::pair> + drop_seed_offset) { Kargs kargs{{q_ptr, k_ptr, @@ -858,7 +858,7 @@ struct FmhaBwdDQDKDVKernel return kargs; } - // std::variant can't take in a list initializer, overload for backward compatibility + // std::variant<> can't take in a list initializer, overload for backward compatibility template CK_TILE_HOST static constexpr std::enable_if_t MakeKargs(const void* q_ptr, @@ -909,7 +909,7 @@ struct FmhaBwdDQDKDVKernel float p_drop, const std::tuple& drop_seed_offset) { - return MakeKargs( + return MakeKargsImpl( q_ptr, k_ptr, v_ptr, @@ -959,7 +959,7 @@ struct FmhaBwdDQDKDVKernel std::make_pair(std::get<0>(drop_seed_offset), std::get<1>(drop_seed_offset))); } - // std::variant can't take in a list initializer, overload for backward compatibility + // std::variant<> can't take in a list initializer, overload for backward compatibility template CK_TILE_HOST static constexpr std::enable_if_t MakeKargs(const void* q_ptr, @@ -1008,9 +1008,9 @@ struct 
FmhaBwdDQDKDVKernel ck_tile::index_t window_size_right, ck_tile::index_t mask_type, float p_drop, - const std::tuple& drop_seed_offset) + const std::tuple& drop_seed_offset) { - return MakeKargs( + return MakeKargsImpl( q_ptr, k_ptr, v_ptr, diff --git a/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp index 4443a4503..3de433d6a 100644 --- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp +++ b/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp @@ -64,7 +64,7 @@ struct FmhaFwdKernel template <> struct t2s { static constexpr const char * name = "bf8"; }; // clang-format on - __host__ static std::string GetName() + CK_TILE_HOST static std::string GetName() { // sync with generate.py // clang-format off @@ -267,50 +267,50 @@ struct FmhaFwdKernel using Kargs = std::conditional_t; template - __host__ static constexpr std::enable_if_t - MakeKargs(const void* q_ptr, - const void* k_ptr, - const void* v_ptr, - const void* bias_ptr, - void* rand_val_ptr, - void* lse_ptr, - void* o_ptr, - ck_tile::index_t seqlen_q, - ck_tile::index_t seqlen_k, - ck_tile::index_t hdim_q, - ck_tile::index_t hdim_v, - ck_tile::index_t num_head_q, - ck_tile::index_t nhead_ratio_qk, - float scale_s, - float scale_p, - float scale_o, - ck_tile::index_t stride_q, - ck_tile::index_t stride_k, - ck_tile::index_t stride_v, - ck_tile::index_t stride_bias, - ck_tile::index_t stride_randval, - ck_tile::index_t stride_o, - ck_tile::index_t nhead_stride_q, - ck_tile::index_t nhead_stride_k, - ck_tile::index_t nhead_stride_v, - ck_tile::index_t nhead_stride_bias, - ck_tile::index_t nhead_stride_randval, - ck_tile::index_t nhead_stride_lse, - ck_tile::index_t nhead_stride_o, - ck_tile::index_t batch_stride_q, - ck_tile::index_t batch_stride_k, - ck_tile::index_t batch_stride_v, - ck_tile::index_t batch_stride_bias, - ck_tile::index_t batch_stride_randval, - ck_tile::index_t batch_stride_lse, - ck_tile::index_t batch_stride_o, - ck_tile::index_t 
window_size_left, - ck_tile::index_t window_size_right, - ck_tile::index_t mask_type, - float p_drop, - bool s_randval, - std::variant, std::pair> - drop_seed_offset) + CK_TILE_HOST static constexpr std::enable_if_t + MakeKargsImpl(const void* q_ptr, + const void* k_ptr, + const void* v_ptr, + const void* bias_ptr, + void* rand_val_ptr, + void* lse_ptr, + void* o_ptr, + ck_tile::index_t seqlen_q, + ck_tile::index_t seqlen_k, + ck_tile::index_t hdim_q, + ck_tile::index_t hdim_v, + ck_tile::index_t num_head_q, + ck_tile::index_t nhead_ratio_qk, + float scale_s, + float scale_p, + float scale_o, + ck_tile::index_t stride_q, + ck_tile::index_t stride_k, + ck_tile::index_t stride_v, + ck_tile::index_t stride_bias, + ck_tile::index_t stride_randval, + ck_tile::index_t stride_o, + ck_tile::index_t nhead_stride_q, + ck_tile::index_t nhead_stride_k, + ck_tile::index_t nhead_stride_v, + ck_tile::index_t nhead_stride_bias, + ck_tile::index_t nhead_stride_randval, + ck_tile::index_t nhead_stride_lse, + ck_tile::index_t nhead_stride_o, + ck_tile::index_t batch_stride_q, + ck_tile::index_t batch_stride_k, + ck_tile::index_t batch_stride_v, + ck_tile::index_t batch_stride_bias, + ck_tile::index_t batch_stride_randval, + ck_tile::index_t batch_stride_lse, + ck_tile::index_t batch_stride_o, + ck_tile::index_t window_size_left, + ck_tile::index_t window_size_right, + ck_tile::index_t mask_type, + float p_drop, + bool s_randval, + std::variant, std::pair> + drop_seed_offset) { Kargs kargs{{q_ptr, k_ptr, @@ -399,9 +399,9 @@ struct FmhaFwdKernel return kargs; } - // std::variant can't take in a list initializer, overload for backward compatibility + // std::variant<> can't take in a list initializer, overload for backward compatibility template - __host__ static constexpr std::enable_if_t + CK_TILE_HOST static constexpr std::enable_if_t MakeKargs(const void* q_ptr, const void* k_ptr, const void* v_ptr, @@ -445,53 +445,54 @@ struct FmhaFwdKernel bool s_randval, const std::tuple& 
drop_seed_offset) { - MakeKargs(q_ptr, - k_ptr, - v_ptr, - bias_ptr, - rand_val_ptr, - lse_ptr, - o_ptr, - seqlen_q, - seqlen_k, - hdim_q, - hdim_v, - num_head_q, - nhead_ratio_qk, - scale_s, - scale_p, - scale_o, - stride_q, - stride_k, - stride_v, - stride_bias, - stride_randval, - stride_o, - nhead_stride_q, - nhead_stride_k, - nhead_stride_v, - nhead_stride_bias, - nhead_stride_randval, - nhead_stride_lse, - nhead_stride_o, - batch_stride_q, - batch_stride_k, - batch_stride_v, - batch_stride_bias, - batch_stride_randval, - batch_stride_lse, - batch_stride_o, - window_size_left, - window_size_right, - mask_type, - p_drop, - s_randval, - std::make_pair(std::get<0>(drop_seed_offset), std::get<1>(drop_seed_offset))); + return MakeKargsImpl( + q_ptr, + k_ptr, + v_ptr, + bias_ptr, + rand_val_ptr, + lse_ptr, + o_ptr, + seqlen_q, + seqlen_k, + hdim_q, + hdim_v, + num_head_q, + nhead_ratio_qk, + scale_s, + scale_p, + scale_o, + stride_q, + stride_k, + stride_v, + stride_bias, + stride_randval, + stride_o, + nhead_stride_q, + nhead_stride_k, + nhead_stride_v, + nhead_stride_bias, + nhead_stride_randval, + nhead_stride_lse, + nhead_stride_o, + batch_stride_q, + batch_stride_k, + batch_stride_v, + batch_stride_bias, + batch_stride_randval, + batch_stride_lse, + batch_stride_o, + window_size_left, + window_size_right, + mask_type, + p_drop, + s_randval, + std::make_pair(std::get<0>(drop_seed_offset), std::get<1>(drop_seed_offset))); } - // std::variant can't take in a list initializer, overload for backward compatibility + // std::variant<> can't take in a list initializer, overload for backward compatibility template - __host__ static constexpr std::enable_if_t + CK_TILE_HOST static constexpr std::enable_if_t MakeKargs(const void* q_ptr, const void* k_ptr, const void* v_ptr, @@ -533,91 +534,92 @@ struct FmhaFwdKernel ck_tile::index_t mask_type, float p_drop, bool s_randval, - const std::tuple& drop_seed_offset) + const std::tuple& drop_seed_offset) { - MakeKargs(q_ptr, - 
k_ptr, - v_ptr, - bias_ptr, - rand_val_ptr, - lse_ptr, - o_ptr, - seqlen_q, - seqlen_k, - hdim_q, - hdim_v, - num_head_q, - nhead_ratio_qk, - scale_s, - scale_p, - scale_o, - stride_q, - stride_k, - stride_v, - stride_bias, - stride_randval, - stride_o, - nhead_stride_q, - nhead_stride_k, - nhead_stride_v, - nhead_stride_bias, - nhead_stride_randval, - nhead_stride_lse, - nhead_stride_o, - batch_stride_q, - batch_stride_k, - batch_stride_v, - batch_stride_bias, - batch_stride_randval, - batch_stride_lse, - batch_stride_o, - window_size_left, - window_size_right, - mask_type, - p_drop, - s_randval, - std::make_pair(std::get<0>(drop_seed_offset), std::get<1>(drop_seed_offset))); + return MakeKargsImpl( + q_ptr, + k_ptr, + v_ptr, + bias_ptr, + rand_val_ptr, + lse_ptr, + o_ptr, + seqlen_q, + seqlen_k, + hdim_q, + hdim_v, + num_head_q, + nhead_ratio_qk, + scale_s, + scale_p, + scale_o, + stride_q, + stride_k, + stride_v, + stride_bias, + stride_randval, + stride_o, + nhead_stride_q, + nhead_stride_k, + nhead_stride_v, + nhead_stride_bias, + nhead_stride_randval, + nhead_stride_lse, + nhead_stride_o, + batch_stride_q, + batch_stride_k, + batch_stride_v, + batch_stride_bias, + batch_stride_randval, + batch_stride_lse, + batch_stride_o, + window_size_left, + window_size_right, + mask_type, + p_drop, + s_randval, + std::make_pair(std::get<0>(drop_seed_offset), std::get<1>(drop_seed_offset))); } template - __host__ static constexpr std::enable_if_t - MakeKargs(const void* q_ptr, - const void* k_ptr, - const void* v_ptr, - const void* bias_ptr, - void* rand_val_ptr, - void* lse_ptr, - void* o_ptr, - const void* seqstart_q_ptr, - const void* seqstart_k_ptr, - const void* seqlen_k_ptr, - ck_tile::index_t hdim_q, - ck_tile::index_t hdim_v, - ck_tile::index_t num_head_q, - ck_tile::index_t nhead_ratio_qk, - float scale_s, - float scale_p, - float scale_o, - ck_tile::index_t stride_q, - ck_tile::index_t stride_k, - ck_tile::index_t stride_v, - ck_tile::index_t stride_bias, - 
ck_tile::index_t stride_randval, - ck_tile::index_t stride_o, - ck_tile::index_t nhead_stride_q, - ck_tile::index_t nhead_stride_k, - ck_tile::index_t nhead_stride_v, - ck_tile::index_t nhead_stride_bias, - ck_tile::index_t nhead_stride_randval, - ck_tile::index_t nhead_stride_lse, - ck_tile::index_t nhead_stride_o, - ck_tile::index_t window_size_left, - ck_tile::index_t window_size_right, - ck_tile::index_t mask_type, - float p_drop, - bool s_randval, - std::variant, std::pair> - drop_seed_offset) + CK_TILE_HOST static constexpr std::enable_if_t + MakeKargsImpl(const void* q_ptr, + const void* k_ptr, + const void* v_ptr, + const void* bias_ptr, + void* rand_val_ptr, + void* lse_ptr, + void* o_ptr, + const void* seqstart_q_ptr, + const void* seqstart_k_ptr, + const void* seqlen_k_ptr, + ck_tile::index_t hdim_q, + ck_tile::index_t hdim_v, + ck_tile::index_t num_head_q, + ck_tile::index_t nhead_ratio_qk, + float scale_s, + float scale_p, + float scale_o, + ck_tile::index_t stride_q, + ck_tile::index_t stride_k, + ck_tile::index_t stride_v, + ck_tile::index_t stride_bias, + ck_tile::index_t stride_randval, + ck_tile::index_t stride_o, + ck_tile::index_t nhead_stride_q, + ck_tile::index_t nhead_stride_k, + ck_tile::index_t nhead_stride_v, + ck_tile::index_t nhead_stride_bias, + ck_tile::index_t nhead_stride_randval, + ck_tile::index_t nhead_stride_lse, + ck_tile::index_t nhead_stride_o, + ck_tile::index_t window_size_left, + ck_tile::index_t window_size_right, + ck_tile::index_t mask_type, + float p_drop, + bool s_randval, + std::variant, std::pair> + drop_seed_offset) { Kargs kargs{{q_ptr, k_ptr, @@ -702,9 +704,9 @@ struct FmhaFwdKernel return kargs; } - // std::variant can't take in a list initializer, overload for backward compatibility + // std::variant<> can't take in a list initializer, overload for backward compatibility template - __host__ static constexpr std::enable_if_t + CK_TILE_HOST static constexpr std::enable_if_t MakeKargs(const void* q_ptr, const void* 
k_ptr, const void* v_ptr, @@ -742,7 +744,7 @@ struct FmhaFwdKernel bool s_randval, const std::tuple& drop_seed_offset) { - return MakeKargs( + return MakeKargsImpl( q_ptr, k_ptr, v_ptr, @@ -781,9 +783,9 @@ struct FmhaFwdKernel std::make_pair(std::get<0>(drop_seed_offset), std::get<1>(drop_seed_offset))); } - // std::variant can't take in a list initializer, overload for backward compatibility + // std::variant<> can't take in a list initializer, overload for backward compatibility template - __host__ static constexpr std::enable_if_t + CK_TILE_HOST static constexpr std::enable_if_t MakeKargs(const void* q_ptr, const void* k_ptr, const void* v_ptr, @@ -819,9 +821,9 @@ struct FmhaFwdKernel ck_tile::index_t mask_type, float p_drop, bool s_randval, - const std::tuple& drop_seed_offset) + const std::tuple& drop_seed_offset) { - return MakeKargs( + return MakeKargsImpl( q_ptr, k_ptr, v_ptr, @@ -860,15 +862,15 @@ struct FmhaFwdKernel std::make_pair(std::get<0>(drop_seed_offset), std::get<1>(drop_seed_offset))); } - __host__ static constexpr auto GridSize(ck_tile::index_t batch_size_, - ck_tile::index_t nhead_, - ck_tile::index_t seqlen_q_, - ck_tile::index_t hdim_v_) + CK_TILE_HOST static constexpr auto GridSize(ck_tile::index_t batch_size_, + ck_tile::index_t nhead_, + ck_tile::index_t seqlen_q_, + ck_tile::index_t hdim_v_) { return TilePartitioner::GridSize(batch_size_, nhead_, seqlen_q_, hdim_v_); } - __host__ static constexpr auto BlockSize() { return dim3(kBlockSize); } + CK_TILE_HOST static constexpr auto BlockSize() { return dim3(kBlockSize); } CK_TILE_HOST_DEVICE static constexpr ck_tile::index_t GetSmemSize() { -- GitLab From 440e28b08fa0f503c229f5787be4f775ad20484c Mon Sep 17 00:00:00 2001 From: carlushuang Date: Tue, 26 Nov 2024 11:14:56 +0800 Subject: [PATCH 078/153] [CK_TILE] fused-moe first version (#1634) * moe pipeline * update code * compile OK * update * update cpu reference * update pipeline_gemm0 * compiler ok * update pipeline * rename to ex pipeline 
* block-asm * update * update * update first gemm ok * compute correct * update file structure * update README * update * update * update code * update API * return unsupport case * add comment * update readme * update * uncomment * update * fix build err --------- Co-authored-by: valarLip <340077269@qq.com> --- .../alternative_impl/matrix_core_swizzle.cpp | 4 +- .../matrix_core_swizzle_kernel.hpp | 12 +- example/ck_tile/06_permute/permute.cpp | 2 +- .../13_moe_sorting/moe_sorting_api.hpp | 2 +- example/ck_tile/15_fused_moe/CMakeLists.txt | 19 + example/ck_tile/15_fused_moe/README.md | 69 ++ example/ck_tile/15_fused_moe/fused_moe.hpp | 52 ++ .../ck_tile/15_fused_moe/fused_moegemm.hpp | 84 ++ .../ck_tile/15_fused_moe/fused_moesorting.hpp | 20 + .../15_fused_moe/instances/fused_moe_api.cpp | 80 ++ .../instances/fused_moegemm_api.cpp | 33 + .../instances/fused_moegemm_api_internal.hpp | 60 ++ .../instances/fused_moegemm_api_traits.hpp | 53 ++ .../instances/fused_moegemm_bf16_m32.cpp | 14 + .../instances/fused_moegemm_fp16_m32.cpp | 14 + .../instances/fused_moesorting_api.cpp | 73 ++ example/ck_tile/15_fused_moe/main.cpp | 603 +++++++++++++ example/ck_tile/15_fused_moe/misc/moe-0.png | Bin 0 -> 76830 bytes example/ck_tile/15_fused_moe/misc/moe-1.png | Bin 0 -> 92535 bytes example/ck_tile/15_fused_moe/misc/moe-2.png | Bin 0 -> 126766 bytes example/ck_tile/15_fused_moe/misc/moe-3.png | Bin 0 -> 18655 bytes example/ck_tile/CMakeLists.txt | 2 + include/ck_tile/core.hpp | 2 + .../core/arch/amd_buffer_addressing.hpp | 103 +++ include/ck_tile/core/arch/arch.hpp | 18 + include/ck_tile/core/arch/utility.hpp | 24 + include/ck_tile/core/tensor/buffer_view.hpp | 86 +- include/ck_tile/core/tensor/load_tile.hpp | 54 +- .../core/tensor/static_distributed_tensor.hpp | 26 + include/ck_tile/core/tensor/tensor_view.hpp | 42 + include/ck_tile/core/tensor/tile_window.hpp | 74 +- .../core/tensor/tile_window_linear.hpp | 159 +++- .../ck_tile/core/tensor/tile_window_utils.hpp | 54 ++ 
include/ck_tile/core/tensor/update_tile.hpp | 56 +- .../ck_tile/core/utility/static_counter.hpp | 116 +++ include/ck_tile/host.hpp | 2 + include/ck_tile/host/device_memory.hpp | 35 + include/ck_tile/host/fill.hpp | 113 ++- include/ck_tile/host/host_tensor.hpp | 121 ++- include/ck_tile/host/joinable_thread.hpp | 27 + .../host/reference/reference_fused_moe.hpp | 196 +++++ .../host/reference/reference_permute.hpp | 23 +- .../unary_element_wise_operation.hpp | 99 +++ include/ck_tile/ops/flatmm.hpp | 10 + .../flatmm_32x512x128_1x4x1_16x16x32.hpp | 615 +++++++++++++ .../flatmm_sn_32x128x512_1x4x1_16x16x32.hpp | 562 ++++++++++++ .../ops/flatmm/block/flatmm_uk_config.hpp | 10 + include/ck_tile/ops/flatmm/block/uk/README.md | 1 + ...m_sn_uk_gfx9_32x128x512_1x4x1_16x16x16.inc | 613 +++++++++++++ ...atmm_uk_gfx9_32x512x128_1x1x1_16x16x16.inc | 516 +++++++++++ .../block_fmha_pipeline_qr_ks_vs_async.hpp | 19 +- include/ck_tile/ops/fused_moe.hpp | 8 + .../fused_moe/kernel/fused_moegemm_kernel.hpp | 421 +++++++++ .../fused_moe/kernel/fused_moegemm_shape.hpp | 125 +++ .../kernel/fused_moegemm_tile_partitioner.hpp | 33 + .../fused_moegemm_pipeline_flatmm_ex.hpp | 651 ++++++++++++++ .../fused_moegemm_pipeline_flatmm_policy.hpp | 831 ++++++++++++++++++ .../fused_moegemm_pipeline_flatmm_uk.hpp | 354 ++++++++ .../fused_moegemm_pipeline_problem.hpp | 46 + .../pipeline/fused_moegemm_traits.hpp | 48 + include/ck_tile/ops/gemm/warp/warp_gemm.hpp | 130 +-- .../gemm/warp/warp_gemm_attribute_mfma.hpp | 170 +++- .../warp/warp_gemm_attribute_mfma_impl.hpp | 457 +++++++--- .../ops/gemm/warp/warp_gemm_dispatcher.hpp | 58 +- .../ck_tile/ops/gemm/warp/warp_gemm_impl.hpp | 61 +- include/ck_tile/ops/moe_sorting.hpp | 11 - 66 files changed, 8067 insertions(+), 309 deletions(-) create mode 100644 example/ck_tile/15_fused_moe/CMakeLists.txt create mode 100644 example/ck_tile/15_fused_moe/README.md create mode 100644 example/ck_tile/15_fused_moe/fused_moe.hpp create mode 100644 
example/ck_tile/15_fused_moe/fused_moegemm.hpp create mode 100644 example/ck_tile/15_fused_moe/fused_moesorting.hpp create mode 100644 example/ck_tile/15_fused_moe/instances/fused_moe_api.cpp create mode 100644 example/ck_tile/15_fused_moe/instances/fused_moegemm_api.cpp create mode 100644 example/ck_tile/15_fused_moe/instances/fused_moegemm_api_internal.hpp create mode 100644 example/ck_tile/15_fused_moe/instances/fused_moegemm_api_traits.hpp create mode 100644 example/ck_tile/15_fused_moe/instances/fused_moegemm_bf16_m32.cpp create mode 100644 example/ck_tile/15_fused_moe/instances/fused_moegemm_fp16_m32.cpp create mode 100644 example/ck_tile/15_fused_moe/instances/fused_moesorting_api.cpp create mode 100644 example/ck_tile/15_fused_moe/main.cpp create mode 100644 example/ck_tile/15_fused_moe/misc/moe-0.png create mode 100644 example/ck_tile/15_fused_moe/misc/moe-1.png create mode 100644 example/ck_tile/15_fused_moe/misc/moe-2.png create mode 100644 example/ck_tile/15_fused_moe/misc/moe-3.png create mode 100644 include/ck_tile/core/tensor/tile_window_utils.hpp create mode 100644 include/ck_tile/core/utility/static_counter.hpp create mode 100644 include/ck_tile/host/joinable_thread.hpp create mode 100644 include/ck_tile/host/reference/reference_fused_moe.hpp create mode 100644 include/ck_tile/ops/flatmm.hpp create mode 100644 include/ck_tile/ops/flatmm/block/flatmm_32x512x128_1x4x1_16x16x32.hpp create mode 100644 include/ck_tile/ops/flatmm/block/flatmm_sn_32x128x512_1x4x1_16x16x32.hpp create mode 100644 include/ck_tile/ops/flatmm/block/flatmm_uk_config.hpp create mode 100644 include/ck_tile/ops/flatmm/block/uk/README.md create mode 100644 include/ck_tile/ops/flatmm/block/uk/flatmm_sn_uk_gfx9_32x128x512_1x4x1_16x16x16.inc create mode 100644 include/ck_tile/ops/flatmm/block/uk/flatmm_uk_gfx9_32x512x128_1x1x1_16x16x16.inc create mode 100644 include/ck_tile/ops/fused_moe/kernel/fused_moegemm_kernel.hpp create mode 100644 
include/ck_tile/ops/fused_moe/kernel/fused_moegemm_shape.hpp create mode 100644 include/ck_tile/ops/fused_moe/kernel/fused_moegemm_tile_partitioner.hpp create mode 100644 include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_ex.hpp create mode 100644 include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_policy.hpp create mode 100644 include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_uk.hpp create mode 100644 include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_problem.hpp create mode 100644 include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_traits.hpp delete mode 100644 include/ck_tile/ops/moe_sorting.hpp diff --git a/example/ck_tile/06_permute/alternative_impl/matrix_core_swizzle.cpp b/example/ck_tile/06_permute/alternative_impl/matrix_core_swizzle.cpp index 93c662a28..e5ded0ef3 100644 --- a/example/ck_tile/06_permute/alternative_impl/matrix_core_swizzle.cpp +++ b/example/ck_tile/06_permute/alternative_impl/matrix_core_swizzle.cpp @@ -40,7 +40,7 @@ float matrix_core_swizzle(matrix_core_swizzle_traits t, else if(t.permute.compare("0,1,3,4,2,5") == 0) { constexpr matrix_core_permute_style pstyle = - matrix_core_permute_style::permute_b_nr_kr_kw_nw_kv; + matrix_core_permute_style::b_nr_kr_kw_nw_kv; using Kernel = matrix_core_swizzle_kernel; @@ -83,7 +83,7 @@ float matrix_core_swizzle(matrix_core_swizzle_traits t, else if(t.permute.compare("0,1,3,4,2,5") == 0) { constexpr matrix_core_permute_style pstyle = - matrix_core_permute_style::permute_b_nr_kr_kw_nw_kv; + matrix_core_permute_style::b_nr_kr_kw_nw_kv; using Kernel = matrix_core_swizzle_kernel; diff --git a/example/ck_tile/06_permute/alternative_impl/matrix_core_swizzle_kernel.hpp b/example/ck_tile/06_permute/alternative_impl/matrix_core_swizzle_kernel.hpp index 60ac103ec..28f4c452b 100644 --- a/example/ck_tile/06_permute/alternative_impl/matrix_core_swizzle_kernel.hpp +++ b/example/ck_tile/06_permute/alternative_impl/matrix_core_swizzle_kernel.hpp @@ 
-42,8 +42,8 @@ enum class matrix_core_permute_style { permute_b_n0_k0_n1_k1_n2_k2 = 0, // 0,1,4,2,5,3,6 permute_b_n0_n1_k0_k1_n2_k2 = 1, // 0,1,2,4,5,3,6 - permute_b_nr_kr_kw_nw_kv = 2, // 0,1,3,4,2,5 - permute_b_nr_kr_waveflatten = permute_b_nr_kr_kw_nw_kv, + b_nr_kr_kw_nw_kv = 2, // 0,1,3,4,2,5 + b_nr_kr_waveflatten = b_nr_kr_kw_nw_kv, }; // assume this is B matrix, originally we have batch*n*k @@ -203,7 +203,7 @@ struct matrix_core_swizzle_kernel else { // clang-format off - // permute_b_nr_kr_kw_nw_kv or permute_b_nr_kr_waveflatten + // b_nr_kr_kw_nw_kv or b_nr_kr_waveflatten constexpr index_t Kv = Alignment; constexpr index_t Nw = WarpGemm::WarpGemmAttribute::Impl::kAMLane; constexpr index_t Kw = WarpGemm::WarpGemmAttribute::Impl::kABKLane; @@ -332,7 +332,7 @@ struct matrix_core_swizzle_kernel make_tuple(sequence<0>{}, sequence<1>{})); return tmp_1; #else - // permute_b_nr_kr_waveflatten = permute_b_nr_kr_kw_nw_kv, + // b_nr_kr_waveflatten = b_nr_kr_kw_nw_kv, constexpr index_t kv = Alignment; constexpr index_t nw = WarpGemm::WarpGemmAttribute::Impl::kAMLane; constexpr index_t kw = WarpGemm::WarpGemmAttribute::Impl::kABKLane; @@ -376,13 +376,13 @@ struct matrix_core_swizzle_kernel else { #if MERGE_2D_013425 - // permute_b_nr_kr_waveflatten = permute_b_nr_kr_kw_nw_kv + // b_nr_kr_waveflatten = b_nr_kr_kw_nw_kv return make_tile_window(dst_view, make_tuple(number{}, number{}), {i_n * NPerBlock, i_k * KPerBlock}, get_dst_dist()); #else - // permute_b_nr_kr_waveflatten = permute_b_nr_kr_kw_nw_kv + // b_nr_kr_waveflatten = b_nr_kr_kw_nw_kv constexpr index_t kv = Alignment; constexpr index_t nw = WarpGemm::WarpGemmAttribute::Impl::kAMLane; constexpr index_t kw = WarpGemm::WarpGemmAttribute::Impl::kABKLane; diff --git a/example/ck_tile/06_permute/permute.cpp b/example/ck_tile/06_permute/permute.cpp index af95b64e6..477ae370b 100644 --- a/example/ck_tile/06_permute/permute.cpp +++ b/example/ck_tile/06_permute/permute.cpp @@ -264,7 +264,7 @@ bool run(const 
ck_tile::ArgParser& arg_parser) { if(arg_parser.get_str("perm") == std::string("0,1,3,4,2,5")) { - // permute_b_nr_kr_kw_nw_kv = 2, // 0,1,3,4,2,5 + // b_nr_kr_kw_nw_kv = 2, // 0,1,3,4,2,5 matrix_core_swizzle_traits t; t.data_type = data_type; t.permute = arg_parser.get_str("perm"); diff --git a/example/ck_tile/13_moe_sorting/moe_sorting_api.hpp b/example/ck_tile/13_moe_sorting/moe_sorting_api.hpp index 91b54932c..0cb393f7d 100644 --- a/example/ck_tile/13_moe_sorting/moe_sorting_api.hpp +++ b/example/ck_tile/13_moe_sorting/moe_sorting_api.hpp @@ -5,7 +5,7 @@ #include #include "ck_tile/core.hpp" #include "ck_tile/host.hpp" -#include "ck_tile/ops/moe_sorting.hpp" +#include "ck_tile/ops/fused_moe.hpp" struct moe_sorting_trait { diff --git a/example/ck_tile/15_fused_moe/CMakeLists.txt b/example/ck_tile/15_fused_moe/CMakeLists.txt new file mode 100644 index 000000000..a716eef19 --- /dev/null +++ b/example/ck_tile/15_fused_moe/CMakeLists.txt @@ -0,0 +1,19 @@ +set(TILE_EXAPMLE_FUSED_MOE "tile_example_fused_moe") +# not using add_example_executable() to add this target, since we don't want this to have +# to be included in "make all/install/check" +message("adding ${TILE_EXAPMLE_FUSED_MOE}") +file(GLOB INSTANCE_SRCS instances/*.cpp) +add_executable(${TILE_EXAPMLE_FUSED_MOE} EXCLUDE_FROM_ALL main.cpp) +target_include_directories(${TILE_EXAPMLE_FUSED_MOE} PRIVATE ${CMAKE_CURRENT_LIST_DIR}) +target_sources(${TILE_EXAPMLE_FUSED_MOE} PRIVATE ${INSTANCE_SRCS}) + +set(TILE_EXAPMLE_FUSED_MOE_COMPILE_OPTIONS) + +# NOTE: we turn off undefined-func-template to let source compile without explicit declare function specializations +list(APPEND TILE_EXAPMLE_FUSED_MOE_COMPILE_OPTIONS -Wno-undefined-func-template -Wno-float-equal) +list(APPEND TILE_EXAPMLE_FUSED_MOE_COMPILE_OPTIONS -DCK_TILE_BUFFER_LOAD_AGPR=1) # TODO: enable load to a +list(APPEND TILE_EXAPMLE_FUSED_MOE_COMPILE_OPTIONS -DCK_TILE_FLOAT_TO_BFLOAT16_DEFAULT=4) # rta +# list(APPEND TILE_EXAPMLE_FUSED_MOE_COMPILE_OPTIONS 
-mllvm -greedy-reverse-local-assignment=1) +# list(APPEND TILE_EXAPMLE_FUSED_MOE_COMPILE_OPTIONS -v --save-temps -Wno-gnu-line-marker) + +target_compile_options(${TILE_EXAPMLE_FUSED_MOE} PRIVATE ${TILE_EXAPMLE_FUSED_MOE_COMPILE_OPTIONS}) diff --git a/example/ck_tile/15_fused_moe/README.md b/example/ck_tile/15_fused_moe/README.md new file mode 100644 index 000000000..dd566c166 --- /dev/null +++ b/example/ck_tile/15_fused_moe/README.md @@ -0,0 +1,69 @@ +# fused-moe +Implementing the fused-moe block operator using ck-tile. This is a scatter/gather-group-gemm based solution, similar to that of [vllm moe](https://github.com/vllm-project/vllm/blob/main/benchmarks/kernels/benchmark_moe.py), but we introduce more kernel fusion to boost performance +![](misc/moe-0.png) + +The benefits of this fused-moe: +* 1.5~2x perf boost compared with current vllm solution +* zero workspace to reduce memory footprint +* far fewer kernel instances, easier to maintain + +# Implementation and feature support +## moe-sorting +This is a common pre-processing step before the actual moe-gemm. The purpose is to transform the moe loop from token-by-token to expert-by-expert, making sure every workgroup is working on a single expert (B matrix). Besides, we extend this op to also zero the output buffer (which is used as the reduce buffer with atomics) + +## moe-gemm +`moe-gemm` is a group-gemm based back-to-back gemm, where the row-id of input token comes from another buffer.
A naive understanding of fused-moe is the token-by-token view, as in the picture below: +![](misc/moe-1.png) +After `moe-sorting`, we can view this algorithm as expert-by-expert, as below: +![](misc/moe-2.png) + +## optimization +summary of the key design of this fused-moe operator: +* fuse 2 group-gemm + activation + `topk-weight` multiply into a single kernel, using atomics for the 2nd gemm accumulation +* fuse buffer-zeroing in `moe-sorting`, users no longer need to call an extra torch.zero() for the out buffer +* fused scatter-gather for row index (same as vllm) +* pre-shuffle the B matrix (weight) to maximize memory throughput. The input (activation) keeps its original layout `[batch, hidden]`. +* extremely optimized pipeline using block-inline-asm (we call it `micro-kernel` or `uk`), while not breaking the *composable* design of ck + +## +``` +// [indexing implementation-1] +// using M_a as constexpr block_size to partition all tokens into different slices +// each slice maps to one expert, and one expert can have multiple slices +// e.g.
num_experts = 6, topk=3, M_a = 4, input_tokens = 5 +// before sort, topk_ids is : [[0, 3, 5], [2, 3, 5], [1, 3, 5], [1, 2, 3], [1, 3, 5]] +// tok-0 tok-1 tok-2 tok-3 tok-4 +// topk_weight is : [[a, b, c], [d, e, f], [g, h, i], [j, k, l], [m, n, o]] (some float number) +// +// token_id_per_expert is : [[0], [2, 3, 4], [1, 3], [0, 1, 2, 3, 4], [], [0, 1, 2, 4]] +// (only for reference) exp-0 exp-1 exp-2 exp-3 exp-4 exp-5 +// weight_id_per_expert is: [[a], [g, j, m], [d, k], [b, e, h, l, n], [], [c, f, i, o]] +// +// max_num_tokens_padded : topk * input_tokens + num_experts * (M_a - 1) +// * this could be larger than actual, since actual tokens are on GPU +// +// sorted_token_ids_ptr : [0, 6, 6, 6, 2, 3, 4, 6, 1, 3, 6, 6, 0, 1, 2, 3, 4, 6, 6, 6, 6, 6, 6, 6, 0, 1, 2, 4] +// |- exp-0 -|- exp-1 -|- exp-2 -|- exp-3 -|- exp-4 -|- exp-5 -| +// sorted_weight_ptr : [a, *, *, *, g, j, m, *, d, k, *, *, b, e, h, l, n, *, *, *, *, *, *, *, c, f, i, o] +// +// * length is max_num_tokens_padded, actual size is num_tokens_post_padded_ptr +// +// sorted_expert_ids_ptr : [0, 1, 2, 3, 3, 4, 5] +// * length is (max_num_tokens_padded + block_size - 1) / block_size +// +// num_tokens_post_padded_ptr : [28] +// num_sorted_tiles_ptr : [7] +// +// * different from vLLM +// 1) token_id stored in sorted_token_ids_ptr is actual token_id, not token_id*top_K expanded id +// 2) need sorted_weight_ptr +// 3) use num_sorted_tiles_ptr, already divided by M_a +// +// * below used for indexing +// 1) sorted_token_ids_ptr [max_num_tokens_padded] +// 2) sorted_weight_ptr +// 3) sorted_expert_ids_ptr +// 4) num_tokens_post_padded_ptr/num_sorted_tiles_ptr (select one) +// +// max_num_tokens_padded: topk_ids.numel() + num_experts * (block_size - 1) +``` \ No newline at end of file diff --git a/example/ck_tile/15_fused_moe/fused_moe.hpp b/example/ck_tile/15_fused_moe/fused_moe.hpp new file mode 100644 index 000000000..6bd7688d8 --- /dev/null +++ b/example/ck_tile/15_fused_moe/fused_moe.hpp @@ -0,0 +1,52 @@ +//
SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "fused_moesorting.hpp" +#include "fused_moegemm.hpp" + +struct fused_moe_args +{ + const void* a_ptr; // [m, k], input token + const void* a_scale_ptr; // [m, 1], token scale + const void* g_ptr; // [e, n, k]/[e, 2*n, k], pre-shuffle([e, nr, kr, w]) + const void* d_ptr; // [e, n, k], pre-shuffle([e, nr, kr, w]) + const void* g_scale_ptr; // [e, 1, n], gate(up) scale + const void* d_scale_ptr; // [e, 1, k], down scale + const void* y_smooth_scale_ptr; // [e, 1, n], smooth-quant-scale for 2nd gemm input + void* o_ptr; // [m, k], output token (no need to do zeroing) + + const void* topk_ids_ptr; // [tokens, topk] + const void* topk_weight_ptr; // [tokens, topk] + void* sorted_token_ids_ptr; // [max_num_tokens_padded] + void* sorted_weight_ptr; // [max_num_tokens_padded] + void* sorted_expert_ids_ptr; // [(max_num_tokens_padded + block_size - 1) / block_size] + void* num_sorted_tiles_ptr; // [1] + + ck_tile::index_t block_m; // block_m, used to devide the input + ck_tile::index_t hidden_size; // k + ck_tile::index_t intermediate_size; // n / TP, for Gate. if Gate+Up, Down need divide by 2 + ck_tile::index_t num_tokens; // input number of tokens for current iteration + ck_tile::index_t num_experts; // number of groups + ck_tile::index_t topk; // need this? 
+ + ck_tile::index_t stride_token; // for input/output, stride for each row, should >= hidden_size +}; + +// This is the public API, will be generated by script +struct fused_moe_traits +{ + std::string prec_i; // input precision + std::string prec_w; // weight precision + std::string prec_o; // output precision + std::string prec_st; // token scale data type + std::string prec_sw; // weight scale data type + std::string prec_sq; // smooth quant scale + std::string prec_kw; // topk-weight data type + int block_m; + int gate_only; + int fused_quant; // 0:no-sweep, 1:smooth-dynamic-quant, 2:dynamic-quant +}; + +float fused_moe(fused_moe_traits, fused_moe_args, const ck_tile::stream_config&); diff --git a/example/ck_tile/15_fused_moe/fused_moegemm.hpp b/example/ck_tile/15_fused_moe/fused_moegemm.hpp new file mode 100644 index 000000000..b8e51475a --- /dev/null +++ b/example/ck_tile/15_fused_moe/fused_moegemm.hpp @@ -0,0 +1,84 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck_tile/core.hpp" +#include "ck_tile/host/kernel_launch.hpp" +#include "ck_tile/ops/fused_moe.hpp" +#include + +// this is only a convenient structure for creating an example +// this is not part of the host API +template +struct FusedMoeGemmTypeConfig; + +template +struct FusedMoeGemmTypeConfig +{ + using ADataType = ck_tile::bf16_t; + using GDataType = ck_tile::bf16_t; + using DDataType = ck_tile::bf16_t; + using AccDataType = float; + using ODataType = ck_tile::bf16_t; + using AScaleDataType = ck_tile::remove_cvref_t; + using GScaleDataType = ck_tile::remove_cvref_t; + using DScaleDataType = ck_tile::remove_cvref_t; + using YSmoothScaleDataType = ck_tile::remove_cvref_t; + using TopkWeightDataType = ck_tile::remove_cvref_t; + using IndexDataType = ck_tile::index_t; +}; + +template +struct FusedMoeGemmTypeConfig +{ + using ADataType = ck_tile::fp16_t; + using GDataType = ck_tile::fp16_t; + using DDataType = ck_tile::fp16_t; + using AccDataType = float; + using ODataType = ck_tile::fp16_t; + using AScaleDataType = ck_tile::remove_cvref_t; + using GScaleDataType = ck_tile::remove_cvref_t; + using DScaleDataType = ck_tile::remove_cvref_t; + using YSmoothScaleDataType = ck_tile::remove_cvref_t; + using TopkWeightDataType = ck_tile::remove_cvref_t; + using IndexDataType = ck_tile::index_t; +}; + +template +struct FusedMoeGemmTypeConfig +{ + using ADataType = ck_tile::int8_t; + using GDataType = ck_tile::int8_t; + using DDataType = ck_tile::int8_t; + using AccDataType = int32_t; + using ODataType = ck_tile::bf16_t; + using AScaleDataType = ck_tile::remove_cvref_t; + using GScaleDataType = ck_tile::remove_cvref_t; + using DScaleDataType = ck_tile::remove_cvref_t; + using YSmoothScaleDataType = ck_tile::remove_cvref_t; + using TopkWeightDataType = ck_tile::remove_cvref_t; + using IndexDataType = ck_tile::index_t; +}; + +// runtime args +struct fused_moegemm_args : public ck_tile::FusedMoeGemmHostArgs +{ +}; + +// This is the public API, will 
be generated by script +struct fused_moegemm_traits +{ + std::string prec_i; // input precision + std::string prec_w; // weight precision + std::string prec_o; // output precision + std::string prec_st; // token scale data type + std::string prec_sw; // weight scale data type + std::string prec_sq; // smooth quant scale + std::string prec_kw; // topk-weight data type + int block_m; + int gate_only; + int fused_quant; // 0:no-sweep, 1:smooth-dynamic-quant, 2:dynamic-quant +}; + +float fused_moegemm(fused_moegemm_traits, fused_moegemm_args, const ck_tile::stream_config&); diff --git a/example/ck_tile/15_fused_moe/fused_moesorting.hpp b/example/ck_tile/15_fused_moe/fused_moesorting.hpp new file mode 100644 index 000000000..57dace9b4 --- /dev/null +++ b/example/ck_tile/15_fused_moe/fused_moesorting.hpp @@ -0,0 +1,20 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once +#include +#include "ck_tile/core.hpp" +#include "ck_tile/host.hpp" +#include "ck_tile/ops/fused_moe.hpp" + +struct fused_moesorting_trait +{ + std::string index_type; + std::string weight_type; // currently always float +}; + +struct fused_moesorting_args : public ck_tile::MoeSortingHostArgs +{ +}; + +float fused_moesorting(fused_moesorting_trait t, fused_moesorting_args a, ck_tile::stream_config s); diff --git a/example/ck_tile/15_fused_moe/instances/fused_moe_api.cpp b/example/ck_tile/15_fused_moe/instances/fused_moe_api.cpp new file mode 100644 index 000000000..bfc0ce409 --- /dev/null +++ b/example/ck_tile/15_fused_moe/instances/fused_moe_api.cpp @@ -0,0 +1,80 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "fused_moe.hpp" + +float fused_moe(fused_moe_traits t, fused_moe_args a, const ck_tile::stream_config& s) +{ + auto s_sub = ck_tile::stream_config{s.stream_id_, false, s.log_level_, 0, 1}; + + auto o_data_bytes = [&]() { + if(t.prec_o == "fp32") + return 4; + else if(t.prec_o == "fp16" || t.prec_o == "bf16") + return 2; + else if(t.prec_o == "int8" || t.prec_o == "fp8") + return 1; + return 1; + }(); + + auto t0 = fused_moesorting_trait{"int32", "fp32"}; + auto a0 = fused_moesorting_args{ + a.topk_ids_ptr, // const void* p_topk_ids; + a.topk_weight_ptr, // const void* p_weights; + a.sorted_token_ids_ptr, // void* p_sorted_token_ids; + a.sorted_weight_ptr, // void* p_sorted_weights; + a.sorted_expert_ids_ptr, // void* p_sorted_expert_ids; + a.num_sorted_tiles_ptr, // void* p_total_tokens_post_pad; + a.o_ptr, // void* p_moe_buf; + a.num_tokens, // index_t tokens; + a.block_m, // index_t unit_size; + a.num_experts, // index_t num_experts; + a.topk, // index_t topk; + a.num_tokens * a.stride_token * o_data_bytes // index_t moe_buf_bytes; + }; + + auto t1 = fused_moegemm_traits{t.prec_i, + t.prec_w, + t.prec_o, + t.prec_st, + t.prec_sw, + t.prec_sq, + t.prec_kw, + t.block_m, + t.gate_only, + t.fused_quant}; + auto a1 = fused_moegemm_args{ + a.a_ptr, // const void* a_ptr; + a.a_scale_ptr, // const void* a_scale_ptr; + a.g_ptr, // const void* g_ptr; + a.d_ptr, // const void* d_ptr; + a.g_scale_ptr, // const void* g_scale_ptr; + a.d_scale_ptr, // const void* d_scale_ptr; + a.y_smooth_scale_ptr, // const void* y_smooth_scale_ptr; + a.o_ptr, // void* o_ptr; + a.sorted_token_ids_ptr, // const void* sorted_token_ids_ptr; + a.sorted_weight_ptr, // const void* sorted_weight_ptr; + a.sorted_expert_ids_ptr, // const void* sorted_expert_ids_ptr; + a.num_sorted_tiles_ptr, // const void* num_sorted_tiles_ptr; + a.hidden_size, // index_t hidden_size; + a.intermediate_size, // index_t intermediate_size; + a.num_tokens, // index_t num_tokens; + a.num_experts, // index_t 
num_experts; + a.topk, // index_t topk; + a.stride_token // index_t stride_token; + }; + + float r0 = -1; + float r1 = -1; + + float r = ck_tile::launch_kernel( + s, + [=, &r0](const ck_tile::stream_config&) { r0 = fused_moesorting(t0, a0, s_sub); }, + [=, &r1](const ck_tile::stream_config&) { r1 = fused_moegemm(t1, a1, s_sub); }); + + // keep unsupported case return negative + if(r0 < 0 || r1 < 0) + return -1; + + return r; +} diff --git a/example/ck_tile/15_fused_moe/instances/fused_moegemm_api.cpp b/example/ck_tile/15_fused_moe/instances/fused_moegemm_api.cpp new file mode 100644 index 000000000..c1a4c495c --- /dev/null +++ b/example/ck_tile/15_fused_moe/instances/fused_moegemm_api.cpp @@ -0,0 +1,33 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include "fused_moegemm.hpp" +#include "fused_moegemm_api_traits.hpp" + +// Note: this internal API only declare, not define here, otherwise will block `make -j` +template +float fused_moegemm_(const ck_tile::stream_config& s, fused_moegemm_args a); + +template +using S = ck_tile::sequence; + +float fused_moegemm(fused_moegemm_traits t, fused_moegemm_args a, const ck_tile::stream_config& s) +{ + // clang-format off + float r = -1; + if(t.prec_i == "bf16" && t.prec_w == "bf16" && t.prec_o == "bf16" && t.prec_st == "fp32" && + t.prec_sw == "fp32" && t.prec_sq == "fp32" && t.prec_kw == "fp32" && t.block_m == 32 && t.gate_only == 1) + { + using t_ = fmoe_, S<1, 4, 1>, S<16, 16, 32>, 1, 0>; + r = fused_moegemm_(s, a); + } + else if(t.prec_i == "fp16" && t.prec_w == "fp16" && t.prec_o == "fp16" && t.prec_st == "fp32" && + t.prec_sw == "fp32" && t.prec_sq == "fp32" && t.prec_kw == "fp32" && t.block_m == 32 && t.gate_only == 1) + { + using t_ = fmoe_, S<1, 4, 1>, S<16, 16, 32>, 1, 0>; + r = fused_moegemm_(s, a); + } + // clang-format on + return r; +} diff --git a/example/ck_tile/15_fused_moe/instances/fused_moegemm_api_internal.hpp 
b/example/ck_tile/15_fused_moe/instances/fused_moegemm_api_internal.hpp new file mode 100644 index 000000000..5872179ef --- /dev/null +++ b/example/ck_tile/15_fused_moe/instances/fused_moegemm_api_internal.hpp @@ -0,0 +1,60 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "fused_moegemm_api_traits.hpp" +#include "ck_tile/ops/fused_moe.hpp" +#include + +template +using S = ck_tile::sequence; + +// do not the define of this tepmlate function inside the _api.cpp, otherwise will block make -j +template +float fused_moegemm_(const ck_tile::stream_config& s, fused_moegemm_args a) +{ + using f_traits = ck_tile::FusedMoeGemmTraits; + using f_shape = ck_tile::FusedMoeGemmShape; + using f_problem = + ck_tile::FusedMoeGemmPipelineProblem; + + // using f_pipeline = ck_tile::FusedMoeGemmPipeline_FlatmmEx; + using f_pipeline = ck_tile::FusedMoeGemmPipeline_FlatmmUk; + using f_partitioner = ck_tile::FusedMoeGemmTilePartitioner_Linear; + using f_kernel = ck_tile::FusedMoeGemmKernel; + + const dim3 grids = f_kernel::GridSize(a); + constexpr dim3 blocks = f_kernel::BlockSize(); + constexpr ck_tile::index_t kBlockPerCu = 1; + + static int printed = 0; + + auto kargs = f_kernel::MakeKargs(a); + if(s.log_level_ > 0 && printed == 0) + { + std::cout << ", " << f_kernel::GetName() << std::flush; + printed = 1; + } + + return ck_tile::launch_kernel( + s, ck_tile::make_kernel(f_kernel{}, grids, blocks, 0, kargs)); +} diff --git a/example/ck_tile/15_fused_moe/instances/fused_moegemm_api_traits.hpp b/example/ck_tile/15_fused_moe/instances/fused_moegemm_api_traits.hpp new file mode 100644 index 000000000..cc476685d --- /dev/null +++ b/example/ck_tile/15_fused_moe/instances/fused_moegemm_api_traits.hpp @@ -0,0 +1,53 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include + +// this is used to pattern-match internl kernel implementation, not to instantiate kernel +template + typename WarpPerBlock_, + typename WarpTile_, // seq<*,*,*>, used to select mfma + ck_tile::index_t GateOnly_ = 0, + ck_tile::index_t FusedQuant_ = 0> +struct fmoe_ // traits, ugly name, only used for internal +{ + using TypeConfig = FusedMoeGemmTypeConfig; + + using ADataType = ck_tile::remove_cvref_t; + using GDataType = ck_tile::remove_cvref_t; + using DDataType = ck_tile::remove_cvref_t; + using AccDataType = ck_tile::remove_cvref_t; + using ODataType = ck_tile::remove_cvref_t; + using AScaleDataType = ck_tile::remove_cvref_t; + using GScaleDataType = ck_tile::remove_cvref_t; + using DScaleDataType = ck_tile::remove_cvref_t; + using YSmoothScaleDataType = ck_tile::remove_cvref_t; + using TopkWeightDataType = ck_tile::remove_cvref_t; + using IndexDataType = ck_tile::remove_cvref_t; + + static constexpr ck_tile::index_t BT_ = BlockTIle_::at(ck_tile::number<0>{}); // block token + static constexpr ck_tile::index_t BI_ = + BlockTIle_::at(ck_tile::number<1>{}); // block intermediate + static constexpr ck_tile::index_t BH_ = BlockTIle_::at(ck_tile::number<2>{}); // block hidden + static constexpr ck_tile::index_t BD_ = BlockTIle_::at(ck_tile::number<3>{}); // block down + + using BlockTile_0 = ck_tile::sequence; + using WarpPerBlock_0 = ck_tile::remove_cvref_t; + using WarpTile_0 = ck_tile::remove_cvref_t; + + using BlockTile_1 = ck_tile::sequence; + using WarpPerBlock_1 = ck_tile::remove_cvref_t; + using WarpTile_1 = ck_tile::remove_cvref_t; + + static constexpr ck_tile::index_t GateOnly = GateOnly_; + static constexpr ck_tile::index_t FusedQuant = FusedQuant_; +}; diff --git a/example/ck_tile/15_fused_moe/instances/fused_moegemm_bf16_m32.cpp b/example/ck_tile/15_fused_moe/instances/fused_moegemm_bf16_m32.cpp new file mode 100644 index 000000000..93f9c7786 --- /dev/null +++ 
b/example/ck_tile/15_fused_moe/instances/fused_moegemm_bf16_m32.cpp @@ -0,0 +1,14 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include "fused_moegemm.hpp" +#include "fused_moegemm_api_traits.hpp" +#include "fused_moegemm_api_internal.hpp" + +// clang-format off +template float fused_moegemm_< + fmoe_, S<1, 4, 1>, S<16, 16, 32>, 1, 0> +>(const ck_tile::stream_config& s, fused_moegemm_args a); + +// clang-format on diff --git a/example/ck_tile/15_fused_moe/instances/fused_moegemm_fp16_m32.cpp b/example/ck_tile/15_fused_moe/instances/fused_moegemm_fp16_m32.cpp new file mode 100644 index 000000000..b8a823e8e --- /dev/null +++ b/example/ck_tile/15_fused_moe/instances/fused_moegemm_fp16_m32.cpp @@ -0,0 +1,14 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include "fused_moegemm.hpp" +#include "fused_moegemm_api_traits.hpp" +#include "fused_moegemm_api_internal.hpp" + +// clang-format off +template float fused_moegemm_< + fmoe_, S<1, 4, 1>, S<16, 16, 32>, 1, 0> +>(const ck_tile::stream_config& s, fused_moegemm_args a); + +// clang-format on diff --git a/example/ck_tile/15_fused_moe/instances/fused_moesorting_api.cpp b/example/ck_tile/15_fused_moe/instances/fused_moesorting_api.cpp new file mode 100644 index 000000000..75aaf86b7 --- /dev/null +++ b/example/ck_tile/15_fused_moe/instances/fused_moesorting_api.cpp @@ -0,0 +1,73 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "fused_moesorting.hpp" + +#define MOE_SORTING_DISPATCH(unroll_num_) \ + constexpr ck_tile::index_t unroll_num = unroll_num_; \ + using ms_problem = ck_tile::MoeSortingProblem; \ + using kernel = ck_tile::MoeSortingKernel; \ + auto kargs = kernel::MakeKargs(a); \ + const dim3 grids = kernel::GridSize(a); \ + const dim3 blocks = kernel::BlockSize(a); \ + const auto lds_bytes = kernel::GetSmemSize(a); \ + float ave_time = ck_tile::launch_kernel( \ + s, ck_tile::make_kernel(kernel{}, grids, blocks, lds_bytes, kargs)); \ + return ave_time; + +float fused_moesorting(fused_moesorting_trait t, fused_moesorting_args a, ck_tile::stream_config s) +{ + if(t.weight_type == "fp32" && t.index_type == "int32") + { + if(a.num_experts > 127) + { + printf("lds size exceed, only support experts <127 \n"); + return -1; + } + if(a.moe_buf_bytes % 16) + { + printf("buf set size %d unaligned, must be multiple of 16\n", a.moe_buf_bytes); + return -1; + } + using index_t = ck_tile::index_t; + using ms_weight_type = float; + index_t smem_io_unroll_num = ck_tile::integer_divide_ceil(a.tokens * a.topk, 64); + switch(smem_io_unroll_num) + { + case(1): { + MOE_SORTING_DISPATCH(1); + } + case(2): { + MOE_SORTING_DISPATCH(2); + } + case(3): { + MOE_SORTING_DISPATCH(3); + } + case(5): { + MOE_SORTING_DISPATCH(5); + } + case(6): { + MOE_SORTING_DISPATCH(6); + } + case(7): { + MOE_SORTING_DISPATCH(7); + } + case(8): { + MOE_SORTING_DISPATCH(8); + } + case(9): { + MOE_SORTING_DISPATCH(9); + } + case(10): { + MOE_SORTING_DISPATCH(10); + } + case(11): { + MOE_SORTING_DISPATCH(11); + } + default: { + MOE_SORTING_DISPATCH(4); + } + } + } + return -1; +} diff --git a/example/ck_tile/15_fused_moe/main.cpp b/example/ck_tile/15_fused_moe/main.cpp new file mode 100644 index 000000000..2f44f903e --- /dev/null +++ b/example/ck_tile/15_fused_moe/main.cpp @@ -0,0 +1,603 @@ +#include +#include +#include +#include +#include + +#include "ck_tile/host.hpp" +#include "fused_moe.hpp" + +// different 
threshold for different dtype +template +auto get_elimit() +{ + double rtol = 1e-2; + double atol = 1e-2; + return ck_tile::make_tuple(rtol, atol); +} + +template <> +auto get_elimit() +{ + double rtol = 1e-2; + double atol = 1e-2; + return ck_tile::make_tuple(rtol, atol); +} + +// mfma_type, 0:32x32, 1:16x16 +// TODO: padding? +template +auto shuffle_moe_weight(const ck_tile::HostTensor& t, std::string mfma_dtype, int mfma_type = 0) +{ + assert(t.get_lengths().size() == 3); + int b_ = t.get_lengths()[0]; + int n_ = t.get_lengths()[1]; + int k_ = t.get_lengths()[2]; + if((mfma_dtype == "bf16" || mfma_dtype == "fp16") && mfma_type == 0) + { + ck_tile::HostTensor t_view({b_, n_ / 32, 32, k_ / 16, 2, 8}); + std::copy(t.begin(), t.end(), t_view.begin()); + return ck_tile::reference_permute(t_view, {0, 1, 3, 4, 2, 5}); + } + else if((mfma_dtype == "bf16" || mfma_dtype == "fp16") && mfma_type == 1) + { + ck_tile::HostTensor t_view({b_, n_ / 16, 16, k_ / 32, 4, 8}); + std::copy(t.begin(), t.end(), t_view.begin()); + return ck_tile::reference_permute(t_view, {0, 1, 3, 4, 2, 5}); + } + else if((mfma_dtype == "int8" || mfma_dtype == "fp8") && mfma_type == 0) + { + ck_tile::HostTensor t_view({b_, n_ / 32, 32, k_ / 32, 2, 16}); + std::copy(t.begin(), t.end(), t_view.begin()); + return ck_tile::reference_permute(t_view, {0, 1, 3, 4, 2, 5}); + } + else if((mfma_dtype == "int8" || mfma_dtype == "fp8") && mfma_type == 1) + { + ck_tile::HostTensor t_view({b_, n_ / 16, 16, k_ / 64, 4, 16}); + std::copy(t.begin(), t.end(), t_view.begin()); + return ck_tile::reference_permute(t_view, {0, 1, 3, 4, 2, 5}); + } + return t; +} + +template +void topid_unique_gen( + std::vector& host_tensor, int tokens, int topk, int num_expert, int seed) +{ + size_t total_size = topk * tokens; + std::srand(seed); + std::set unique_set; + IndexType current_v; + for(size_t i = 0; i < total_size; i++) + { + if(i % topk == 0) + { + unique_set.clear(); + } + current_v = std::rand() % num_expert; + 
while(unique_set.find(current_v) != unique_set.end()) + { + current_v = std::rand() % num_expert; + } + unique_set.insert(current_v); + host_tensor[i] = current_v; + } +} + +auto create_args(int argc, char* argv[]) +{ + ck_tile::ArgParser arg_parser; + arg_parser.insert("t", "128", "num input tokens") + .insert("e", "32", "num of experts") + .insert("k", "5", "topk") + .insert("h", "8192", "hidden_size of this model") + .insert("i", "8192", "intermediate_size between 2 gemms of FFN") + .insert("stride", "-1", "stride per row, if -1 then equal to hidden_size") + .insert("bm", "32", "blocking factor for sorted tokens") + .insert("tp", "8", "tensor parallel size") + .insert("v", "1", "cpu validation or not") + .insert("kname", "1", "print kernel name or not") + .insert("prec_i", "bf16", "input precision") + .insert("prec_w", "bf16", "weight precision") + .insert("prec_o", "bf16", "output precision") + .insert("prec_st", "auto", "token scale data type. auto will set to fp32") + .insert("prec_sw", "auto", "weight scale data type. auto will set to fp32") + .insert("prec_sq", "auto", "(dynamic) smooth quant data type. auto will set to fp32") + .insert("prec_kw", "auto", "topk-weight data type. auto will set to fp32") + .insert("fquant", "0", "fused-quant, 0:no, 1:smooth-dynamic-quant, 2:dynamic-quant") + .insert( + "gate_only", "1", "w0(gate/up) style, 0:gate+up will double interm size, 1:only gate") + .insert("api", "0", "benchmark api set: 0:fused-moe(moe-gemm+moe-sorting), 1:moe-gemm") + .insert("balance", + "0", + "if set to 1, will try balance the expert in topk-ids(convenient for testing)") + .insert("init", + "2", + "init method. 0:random stepped float(fast). 
1: random uniform, 2:rand normalized" + "normalized(slow)") + .insert("seed", "11939", "seed used to do random") + .insert("warmup", "5", "cold iter") + .insert("repeat", "20", "hot iter"); + + bool result = arg_parser.parse(argc, argv); + return std::make_tuple(result, arg_parser); +} + +// I:input-type, W:weight-type, O:output-type, ST:toke-scale-tpye, SW:weight-scale-type, +// SQ:smooth-quant-type, KW:topk-weight-type +template +bool run(const ck_tile::ArgParser& arg_parser) +{ + ck_tile::index_t tokens = arg_parser.get_int("t"); + ck_tile::index_t experts = arg_parser.get_int("e"); + ck_tile::index_t topk = arg_parser.get_int("k"); + ck_tile::index_t hidden_size = arg_parser.get_int("h"); + ck_tile::index_t intermediate_size = arg_parser.get_int("i"); + ck_tile::index_t stride = arg_parser.get_int("stride"); + ck_tile::index_t block_m = arg_parser.get_int("bm"); + if(stride < 0) + stride = hidden_size; + std::string prec_i = arg_parser.get_str("prec_i"); + std::string prec_w = arg_parser.get_str("prec_w"); + std::string prec_o = arg_parser.get_str("prec_o"); + std::string prec_st = arg_parser.get_str("prec_st"); + std::string prec_sw = arg_parser.get_str("prec_sw"); + std::string prec_sq = arg_parser.get_str("prec_sq"); + std::string prec_kw = arg_parser.get_str("prec_kw"); + prec_st = (prec_st == "auto") ? "fp32" : prec_st; + prec_sw = (prec_sw == "auto") ? "fp32" : prec_sw; + prec_sq = (prec_sq == "auto") ? "fp32" : prec_sq; + prec_kw = (prec_kw == "auto") ? 
"fp32" : prec_kw; + int kname = arg_parser.get_int("kname"); + int do_validation = arg_parser.get_int("v"); + int warmup = arg_parser.get_int("warmup"); + int repeat = arg_parser.get_int("repeat"); + int fused_quant = arg_parser.get_int("fquant"); + int gate_only = arg_parser.get_int("gate_only"); + int api = arg_parser.get_int("api"); + int balance = arg_parser.get_int("balance"); + int tp = arg_parser.get_int("tp"); + int init = arg_parser.get_int("init"); + uint32_t seed = arg_parser.get_uint32("seed"); + + // w0 (Gate+Up or Gate only, N size) + ck_tile::index_t shared_intermediate_size_0 = intermediate_size * (gate_only ? 1 : 2) / tp; + // w1 (Down, N size) + ck_tile::index_t shared_intermediate_size_1 = intermediate_size / tp; + + auto prec_str = [&]() { + auto base_str = prec_i; + if(prec_i != prec_w) + base_str += "x" + prec_w; + if(prec_i != prec_o) + base_str += "=" + prec_o; + if(fused_quant != 0) + { + base_str += std::string("(") + prec_st + "|" + prec_sw + "|" + prec_sq + ")"; + } + return base_str; + }(); + auto api_str = [&]() { + if(api == 0) + return std::string("fmoe"); + else if(api == 1) + return std::string("moeg"); + else if(api == 2) + return std::string("moes"); + return std::string(""); + }(); + + auto stride_str = [&]() { + if(stride == hidden_size) + return std::string(""); + else + return std::string(", st:") + std::to_string(stride); + }(); + + std::cout << "[" << api_str << "|" << prec_str << "]" + << " t:" << tokens << ", e:" << experts << ", k:" << topk << stride_str + << ", hidden:" << hidden_size << ", interm:" << intermediate_size << ", tp:" << tp + << ", shrd_interm:" << shared_intermediate_size_0 << "|" << shared_intermediate_size_1 + << ", go:" << gate_only << ", q:" << fused_quant << std::flush; + + using TypeConfig = FusedMoeGemmTypeConfig; + using ADataType = typename TypeConfig::ADataType; + using GDataType = typename TypeConfig::GDataType; + using DDataType = typename TypeConfig::DDataType; + using AccDataType = typename 
TypeConfig::AccDataType; + using ODataType = typename TypeConfig::ODataType; + using AScaleDataType = typename TypeConfig::AScaleDataType; + using GScaleDataType = typename TypeConfig::GScaleDataType; + using DScaleDataType = typename TypeConfig::DScaleDataType; + using YSmoothScaleDataType = typename TypeConfig::YSmoothScaleDataType; + using TopkWeightDataType = typename TypeConfig::TopkWeightDataType; + using IndexDataType = typename TypeConfig::IndexDataType; + + // host verify + ck_tile::HostTensor a_host({tokens, hidden_size}, {stride, 1}); + ck_tile::HostTensor g_host({experts, shared_intermediate_size_0, hidden_size}); + ck_tile::HostTensor d_host({experts, hidden_size, shared_intermediate_size_1}); + ck_tile::HostTensor o_host({tokens, hidden_size}, {stride, 1}); + ck_tile::HostTensor sa_host({tokens}); + ck_tile::HostTensor sg_host({shared_intermediate_size_0}); + ck_tile::HostTensor sd_host({shared_intermediate_size_1}); + ck_tile::HostTensor sy_host({shared_intermediate_size_1}); // smooth-quant + ck_tile::HostTensor topk_ids_host({tokens, topk}); // to be sort + ck_tile::HostTensor topk_weight_host({tokens, topk}); // to be sort + + int max_num_tokens_padded = topk * tokens + experts * block_m - topk; + ck_tile::HostTensor sorted_token_ids_host({max_num_tokens_padded}); + ck_tile::HostTensor sorted_weight_host({max_num_tokens_padded}); + ck_tile::HostTensor sorted_expert_ids_host( + {(max_num_tokens_padded + block_m - 1) / block_m}); + ck_tile::HostTensor num_sorted_tiles_host({1}); + + if(init == 0) + { + ck_tile::FillStepRange{-.5f, .5f, 0.01f}(a_host); + ck_tile::FillStepRange{-.5f, .5f, 0.01f}(g_host); + ck_tile::FillStepRange{.5f, -.5f, -0.01f}(d_host); + ck_tile::FillStepRange{0.f, 1.f, 0.01f}(sa_host); + ck_tile::FillStepRange{0.f, 1.f, 0.01f}(sg_host); + ck_tile::FillStepRange{0.f, 1.f, 0.01f}(sd_host); + ck_tile::FillStepRange{0.f, 1.f, 0.01f}(sy_host); + ck_tile::FillStepRange{-.5f, .5f, 0.01f}(topk_weight_host); + } + else if(init == 1) + { + 
ck_tile::FillUniformDistribution{-.5f, .5f, seed, true}(a_host); + ck_tile::FillUniformDistribution{-.5f, .5f, seed, true}(g_host); + ck_tile::FillUniformDistribution{-.5f, .5f, seed, true}(d_host); + ck_tile::FillUniformDistribution{-.5f, .5f, seed, true}(sa_host); + ck_tile::FillUniformDistribution{-.5f, .5f, seed, true}(sg_host); + ck_tile::FillUniformDistribution{-.5f, .5f, seed, true}(sd_host); + ck_tile::FillUniformDistribution{-.5f, .5f, seed, true}(sy_host); + ck_tile::FillUniformDistribution{-.5f, .5f, seed, true}( + topk_weight_host); + } + else if(init == 2) + { + ck_tile::FillNormalDistribution{0.f, 1.f, seed, true}(a_host); + ck_tile::FillNormalDistribution{0.f, 1.f, seed, true}(g_host); + ck_tile::FillNormalDistribution{0.f, 1.f, seed, true}(d_host); + ck_tile::FillNormalDistribution{0.f, 1.f, seed, true}(sa_host); + ck_tile::FillNormalDistribution{0.f, 1.f, seed, true}(sg_host); + ck_tile::FillNormalDistribution{0.f, 1.f, seed, true}(sd_host); + ck_tile::FillNormalDistribution{0.f, 1.f, seed, true}(sy_host); + ck_tile::FillNormalDistribution{0.f, 1.f, seed, true}(topk_weight_host); + } + + // permute weight + ck_tile::HostTensor g_perm_host = shuffle_moe_weight(g_host, prec_w, 1); + ck_tile::HostTensor d_perm_host = shuffle_moe_weight(d_host, prec_w, 1); + + // do moe sorting + if(balance) + { + int e_cnt = 0; + for(int i = 0; i < static_cast(topk_ids_host.mData.size()); i++) + { + topk_ids_host.mData[i] = e_cnt; + e_cnt++; + if(e_cnt >= experts) + e_cnt = 0; + } + } + else + { + topid_unique_gen(topk_ids_host.mData, tokens, topk, experts, 11913); + } + +// leave it here for future debug purpose +#if 0 + a_host.loadtxt("../../ater/input_torch.txt"); + + topk_ids_host.loadtxt("../../ater/topk_ids_torch.txt", "int"); + // topk_ids_host.savetxt("topk_ids_2.txt"); + topk_weight_host.loadtxt("../../ater/topk_weights_torch.txt", "float"); + std::cout << "------- @@@ " << __LINE__ << std::flush << std::endl; + + g_host.loadtxt("../../ater/w1_torch.txt", 
"float"); + std::cout << "------- @@@ " << __LINE__ << std::flush << std::endl; + d_host.loadtxt("../../ater/w2_torch.txt", "float"); + std::cout << "------- @@@ " << __LINE__ << std::flush << std::endl; + + ck_tile::HostTensor g_perm_host = shuffle_moe_weight(g_host, prec_w, 1); + std::cout << "------- @@@ " << __LINE__ << std::flush << std::endl; + ck_tile::HostTensor d_perm_host = shuffle_moe_weight(d_host, prec_w, 1); + std::cout << "------- @@@ " << __LINE__ << std::flush << std::endl; +#endif + +#if 0 + std::cout << "sorted_token_ids_host:" << sorted_token_ids_host << std::endl; + std::cout << "num_sorted_tiles_host:" << num_sorted_tiles_host << std::endl; + std::cout << "sorted_expert_ids_host:" << sorted_expert_ids_host << std::endl; + std::cout << "topk_weight_host:" << topk_weight_host << std::endl; + std::cout << "sorted_weight_host:" << sorted_weight_host << std::endl; +#endif + auto cal_tflops = [&](auto ms) { + double flop_gemm_0 = + 2 * static_cast(tokens) * topk * shared_intermediate_size_0 * hidden_size; + double flop_gemm_1 = + 2 * static_cast(tokens) * topk * shared_intermediate_size_1 * hidden_size; + return (flop_gemm_0 + flop_gemm_1) / (static_cast(ms) * 1e-3) / 1e12; + }; + + // TODO: this method we use expert-by-expert view, just for reference + auto cal_tbps = [&](auto ms) { + double token_bytes = + static_cast(tokens) * topk / experts * hidden_size * sizeof(ADataType); + double w0_bytes = static_cast(shared_intermediate_size_0) * experts * hidden_size * + sizeof(GDataType); + double w1_bytes = static_cast(shared_intermediate_size_1) * experts * hidden_size * + sizeof(DDataType); + double o_bytes = + static_cast(tokens) * topk / experts * hidden_size * sizeof(ODataType); + double topk_weights_bytes = static_cast(tokens) * topk * sizeof(TopkWeightDataType); + // ignore index, they are too small + + return (token_bytes + w0_bytes + w1_bytes + o_bytes + topk_weights_bytes) / + (static_cast(ms) * 1e-3) / 1e12; + }; + + if(api == 0) + { + 
ck_tile::DeviceMem a_buf(a_host); + ck_tile::DeviceMem g_perm_buf(g_perm_host); + ck_tile::DeviceMem d_perm_buf(d_perm_host); + ck_tile::DeviceMem sa_buf(sa_host); + ck_tile::DeviceMem sg_buf(sg_host); + ck_tile::DeviceMem sd_buf(sd_host); + ck_tile::DeviceMem sy_buf(sy_host); + ck_tile::DeviceMem o_buf(o_host.get_element_space_size_in_bytes()); + + ck_tile::DeviceMem topk_ids_buf(topk_ids_host); + ck_tile::DeviceMem topk_weight_buf(topk_weight_host); + + ck_tile::DeviceMem sorted_token_ids_buf( + sorted_token_ids_host.get_element_space_size_in_bytes()); + ck_tile::DeviceMem sorted_weight_buf(sorted_weight_host.get_element_space_size_in_bytes()); + ck_tile::DeviceMem sorted_expert_ids_buf( + sorted_expert_ids_host.get_element_space_size_in_bytes()); + ck_tile::DeviceMem num_sorted_tiles_buf( + num_sorted_tiles_host.get_element_space_size_in_bytes()); + + fused_moe_traits traits{prec_i, + prec_w, + prec_o, + prec_st, + prec_sw, + prec_sq, + prec_kw, + block_m, + gate_only, + fused_quant}; + + fused_moe_args args{a_buf.GetDeviceBuffer(), + fused_quant != 0 ? sa_buf.GetDeviceBuffer() : nullptr, + g_perm_buf.GetDeviceBuffer(), + d_perm_buf.GetDeviceBuffer(), + fused_quant != 0 ? sg_buf.GetDeviceBuffer() : nullptr, + fused_quant != 0 ? sd_buf.GetDeviceBuffer() : nullptr, + fused_quant == 1 ? sy_buf.GetDeviceBuffer() : nullptr, + o_buf.GetDeviceBuffer(), + topk_ids_buf.GetDeviceBuffer(), + topk_weight_buf.GetDeviceBuffer(), + sorted_token_ids_buf.GetDeviceBuffer(), + sorted_weight_buf.GetDeviceBuffer(), + sorted_expert_ids_buf.GetDeviceBuffer(), + num_sorted_tiles_buf.GetDeviceBuffer(), + block_m, + hidden_size, + shared_intermediate_size_0, + tokens, + experts, + topk, + stride}; + float ave_time = fused_moe( + traits, args, ck_tile::stream_config{nullptr, true, kname ? 1 : 0, warmup, repeat}); + + if(ave_time < 0) + { + std::cout << " not supported!" 
<< std::endl << std::flush; + return false; + } + + // float gb_per_sec = num_byte / 1.E6 / ave_time; + std::cout << ", " << ave_time * 1.E3 << " us, " << cal_tflops(ave_time) << " tflops, " + << cal_tbps(ave_time) << " TB/s" << std::flush; + bool pass = true; + + if(do_validation) + { + ck_tile::reference_moe_sorting( + topk_ids_host, + topk_weight_host, + sorted_token_ids_host, + sorted_weight_host, + sorted_expert_ids_host, + num_sorted_tiles_host.mData[0], + experts, + block_m); + + ck_tile::reference_fused_moe( + a_host, + g_host, + d_host, + sa_host, + sg_host, + sd_host, + sy_host, + o_host, + sorted_token_ids_host, + sorted_weight_host, + sorted_expert_ids_host, + num_sorted_tiles_host, + topk_ids_host, + block_m, + tokens, + experts, + hidden_size, + shared_intermediate_size_0, + topk, + gate_only); + + auto o_dev = o_buf.ToHost(); + // o_dev.savetxt("gpu-out.txt", "float"); + auto [rtol, atol] = get_elimit(); + pass &= ck_tile::check_err( + o_dev, o_host, std::string("OUT Error: Incorrect results!"), rtol, atol); + std::cout << ", valid:" << (pass ? 
"y" : "n") << std::flush; + } + std::cout << std::flush << std::endl; + return pass; + } + else if(api == 1) + { + ck_tile::reference_moe_sorting( + topk_ids_host, + topk_weight_host, + sorted_token_ids_host, + sorted_weight_host, + sorted_expert_ids_host, + num_sorted_tiles_host.mData[0], + experts, + block_m); + + // done, preparing GPU buffer + ck_tile::DeviceMem a_buf(a_host); + ck_tile::DeviceMem g_perm_buf(g_perm_host); + ck_tile::DeviceMem d_perm_buf(d_perm_host); + ck_tile::DeviceMem sa_buf(sa_host); + ck_tile::DeviceMem sg_buf(sg_host); + ck_tile::DeviceMem sd_buf(sd_host); + ck_tile::DeviceMem sy_buf(sy_host); + ck_tile::DeviceMem o_buf(o_host); + + // manually clear output buffer for atomic + o_buf.SetZero(); + // + + ck_tile::DeviceMem sorted_token_ids_buf(sorted_token_ids_host); + ck_tile::DeviceMem sorted_weight_buf(sorted_weight_host); + ck_tile::DeviceMem sorted_expert_ids_buf(sorted_expert_ids_host); + ck_tile::DeviceMem num_sorted_tiles_buf(num_sorted_tiles_host); + + fused_moegemm_traits traits{prec_i, + prec_w, + prec_o, + prec_st, + prec_sw, + prec_sq, + prec_kw, + block_m, + gate_only, + fused_quant}; + + fused_moegemm_args args{a_buf.GetDeviceBuffer(), + fused_quant != 0 ? sa_buf.GetDeviceBuffer() : nullptr, + g_perm_buf.GetDeviceBuffer(), + d_perm_buf.GetDeviceBuffer(), + fused_quant != 0 ? sg_buf.GetDeviceBuffer() : nullptr, + fused_quant != 0 ? sd_buf.GetDeviceBuffer() : nullptr, + fused_quant == 1 ? sy_buf.GetDeviceBuffer() : nullptr, + o_buf.GetDeviceBuffer(), + sorted_token_ids_buf.GetDeviceBuffer(), + sorted_weight_buf.GetDeviceBuffer(), + sorted_expert_ids_buf.GetDeviceBuffer(), + num_sorted_tiles_buf.GetDeviceBuffer(), + hidden_size, + shared_intermediate_size_0, + tokens, + experts, + topk, + stride}; + + float ave_time = fused_moegemm( + traits, args, ck_tile::stream_config{nullptr, true, kname ? 1 : 0, warmup, repeat}); + + if(ave_time < 0) + { + std::cout << " not supported!" 
<< std::endl << std::flush; + return false; + } + + // float gb_per_sec = num_byte / 1.E6 / ave_time; + std::cout << ", " << ave_time * 1.E3 << " us, " << cal_tflops(ave_time) << " tflops, " + << cal_tbps(ave_time) << " TB/s" << std::flush; + bool pass = true; + + if(do_validation) + { + ck_tile::reference_fused_moe( + a_host, + g_host, + d_host, + sa_host, + sg_host, + sd_host, + sy_host, + o_host, + sorted_token_ids_host, + sorted_weight_host, + sorted_expert_ids_host, + num_sorted_tiles_host, + topk_ids_host, + block_m, + tokens, + experts, + hidden_size, + shared_intermediate_size_0, + topk, + gate_only); + + auto o_dev = o_buf.ToHost(); + // o_dev.savetxt("gpu-out.txt", "float"); + auto [rtol, atol] = get_elimit(); + pass &= ck_tile::check_err( + o_dev, o_host, std::string("OUT Error: Incorrect results!"), rtol, atol); + std::cout << ", valid:" << (pass ? "y" : "n") << std::flush; + } + std::cout << std::flush << std::endl; + + return pass; + } + return false; +} + +int main(int argc, char* argv[]) +{ + auto [result, arg_parser] = create_args(argc, argv); + if(!result) + return -1; + + std::string prec_i = arg_parser.get_str("prec_i"); + std::string prec_w = arg_parser.get_str("prec_w"); + std::string prec_o = arg_parser.get_str("prec_o"); + std::string prec_st = arg_parser.get_str("prec_st"); + std::string prec_sw = arg_parser.get_str("prec_sw"); + std::string prec_sq = arg_parser.get_str("prec_sq"); + std::string prec_kw = arg_parser.get_str("prec_kw"); + prec_st = (prec_st == "auto") ? "fp32" : prec_st; + prec_sw = (prec_sw == "auto") ? "fp32" : prec_sw; + prec_sq = (prec_sq == "auto") ? "fp32" : prec_sq; + prec_kw = (prec_kw == "auto") ? "fp32" : prec_kw; + + // no dynamic quant case + if(prec_i == "bf16" && prec_w == "bf16" && prec_o == "bf16" && prec_kw == "fp32") + { + return run( + arg_parser) + ? 0 + : -2; + } + else if(prec_i == "fp16" && prec_w == "fp16" && prec_o == "fp16" && prec_kw == "fp32") + { + return run( + arg_parser) + ? 
0 + : -2; + } + + return -3; +} diff --git a/example/ck_tile/15_fused_moe/misc/moe-0.png b/example/ck_tile/15_fused_moe/misc/moe-0.png new file mode 100644 index 0000000000000000000000000000000000000000..aed1964f2802c4e7f65d7080f338309c8c2287a6 GIT binary patch literal 76830 zcmdSA^;=t8)HPZ>c+lbyytuZwJ4K7TLveS4yB065#ibN0?ouoicXxM(FXx>1yZ8PH z@BM-2A$fMP*Pd&vF~=NpCQ?~Z8XbiQ<=wk?=&~{rs_)*xmO=mA0kF__u9#x1-@SW# zla&xv^ISa6M#{pKTpVci+1!1{gbGLu-liw$0?@7Nd&Bj__e_2b~xSX|8}W3GCaJ{=5;lm&A-v1l8B6q{QdiPyT$tM&Yc&t zkyPfZ-Lc?7)*4ku(xxRrv!R6aeG_BjjX6XA*!pbm!!l;w#+bkQzCvqHXPs{cCAo0~ z?3)9`4wWyqvmDTs|FWD|?eOE5YjD{c-*uXw2tmf#h2F{iSZmsIsK!XMc7AzT4|i_8 zJ6N~g3ajFp3VrH_LT~4L15e{$7GowUF^9D_bJc79Z=@%uiSjJ6pFhXAIa{Y6A0LOJ zzVERC1q%lE@kwqIL`2=D(XnY5u_*UxS-uf?p0(dqcfLMK#S(Rs-|SCrS%HJ<+V^sN z47?9$e1-0oT`qc~Cdhf6x24JbLNI}J7a;hWxu?_Cv)8-Mx0PDs-qV%Vv8@Qn_2miv zv$jn8mJd`^RL|iK?Kk)xi~wku>$))RRwxma|>Pd9}5* z=~mj#4O2V#Cr2oaO#l|^t@2|qw%M&@(;69E-J zLfH5tJ6a+z==x}m6$tQs*hxz>@cvqvs6yItzZpHk+DStG(QYwhUto5)OcppBL&&+C z>Cl1Y-_zc9F+luN5tkRSp!8**vdMl$2SqOLM)w(c!?4b5_@elQ-2eF^MbluRTA$yu zpz!^W|Laxm+v8kk-`;Q%-J17N)lN3A%kJo}aXxw9xcLP4<9Vks?z!jNC5KI=h++b; z$R~`Peu4YLiW;#EG4NCF+w)$_QMLHbE%<3vp^&TqDsyEzw6 z+>5Si{9kO`SVHY7#Y)vHGz4F+r$ELM1_rAu9&51#7;pbPRzvQXj>SZdpr)P^!ejP? 
zz}4v5+pEVFPuoW(!a+l?os=c})-i!Nt&O@77AQtkwgzI2O97POXoQp*=L50CSpA#f zICP2vC-qa-bT6e6^$}3$Q;3MBV;^l+qOAGdt*pJ?ehN8`us8i%S1?$rQn(6!vTIt3 z-3P{cbOpfRp{XbG$>27SpTdpPY!bY)_vSK%bBj8(>KwBOZf>-mLFtU;u~1f4X0w7& zxVrY`ZY*|!Ummfjq#_29=;xv6_~rL>yW6zi|GM_}vUW8g%qsDcW(Y?9=5KX4qiIM= zPL645CPm;DPBab1?U$v?uJ?a?z1SLrg8F5HE-ylv^yj#fopR1?nkEMHxP)PC_D+|5 znGk;SJ<^1pzt;VNX!o;K^r!CdMOop7&T&}z=IumzAnmH#oW61%s&wPA&eS`86 z{pc4ZLGDepFYuq)7}Ve^Q8mJIw>CEdb?og=k;r{(eIJ|w44W~}&(DV72lMeP-sTPh zI_d9fIx+Xd(1(bkKM5nYmywf0=@h~a_SHuEdX591SK9H7N_taix(B+f7Kb$GVOCa5$tKqWRXxM!L+Ei0#X4R7jk7`ZN+Rg(DkcUSLyj09^zwW)@ofVy@9`B}GM9(6CxzMWiV4pVVL0 zlgFX%75-I$1>CoIoW*@qy&wgtu}NO}*2^XPd(mxm2*2J_F#!hxu zf>f21l)||-ADep&UoX?3UxF9NRH@&33&|H}184dF`B0MO*n^ysidQZbLrXb3_yFbO zHQy`rXFU7qLg{#U)^@HwT^-SPNJ#-V1@9*7l;g3GoRvwx{P-N06hhqny;l8x0?Oh@ zjZsqY@&R<^& zcWI~`HzS;eiir$|XCcmAtooeg>*v8c!WN-a6W>H*mKt)A&?|a{q7mbOF_{r=mw%c{ zXBn>Q+2YpFGd{~D;~l?Q`O0ud{=6-`R#7q;;%nnaO@p6^u$DrtA5RVx?Rq&`Z6}5` zB9p-C70U5B{hbhGv_wLL+pSEPPMhNlLm4v%X(caKJkTR6 zZWN0eNBl_pt}M9|l&XJZM3`o1 zAs1$W^9GQm^VzxRquvw9sA&z%|JSyiN`OO*4_7ECK0!C{;cbijP2}n)A56WC1OaQ? 
zx!C>n{QDrSk|{aSsrI{7A1s1UOBlqk(alE-ZwL#-G!Q8~3}m>mi1|U%8uUCH983IZ zpoYwQKR-$y%_Lr#;4vyuQyot>NFigb;6wf5r{B~+X@BDoHb#==hPiUR(o`@jr%rj7Y^Zz=h;l=K6gs z;?0XK)VWAXNb27v?xURL)nX&VrFeAMAdM%Yh9QF5yn!ZbU{UAcM%aJ@IaE9|@%n$lqQ z&Q@CA8!BY;&FFNnU5v1_qBX>oi)}|3Zg3uwq2qPnot9ZI#w#KoBci~VN}8;PK>+>8 zP?;;#2KR`=1G9lzjb;Ow1p7w60yp)RTmEu;)@qv7LWRzLHAt@^mD&uMxje|R0ah7< zo<$9Q87BlD3z6t!)J|DR4)%sHQAxua3S+o(%i`a;pcO4Uj(>X zB2f7|RTP9!tscA{YM`V-jsW3hB#v^Pe6AiFp`lWd&Jf_ zk@g^;jSdSYrz8v24D>9-{>ETTY;s*4Bqm<1f=bVqj2QBTNmT6f`K+!3Xk5f4Kv-gM z_!-$5#R1tov^x)C#Cy)bs8QMVV3kNuxV*@B88r}18w04Rtp(CigqCVw8|XCI#{!XP zLuLcZyRqtFub))POZlIHq{Q@}>cZ5zM1WIheC59jZ-iQAYYls>m`zj978bPOt zq0TR+Fr+702AD^LbhCB~h>c7CSf_X7E`4_y?JSZ4vZ41)#)BoL#{3JcgO!BezkjzD zeNC}Yio1;}6%-6&CQ6f(TW?los?}9PISuOTb?dcVAHL|_ANGeTNq1|KwfCJs94Ec(fy-%R5TnjYSV!y;Yja$ zS0YX~Rv5Asg5^Qd0D0rI0`9295UB!qWjV+F*|=8RYt(bVIqim@2aXy{RtD~Mkp{W3 zP<*Y>L$FQosa&@g+H4ME^TTfDWEaHyLL4`mM~iBaHLmxmFwRkwhZUecq(o`mSxfIL zk`(-$KBz+4jL;*h1&cNyU~k4yhA~ympf9Ivegc*KD3o|MfqC%jzbRB$+pn=7 zMRwidRG$!b^@MpOu0twigIq-qF3zC3Rctq!qGZ3i+uj&7I~)vfbTU*sf)yaS*M61q%OhM1QmF= zM4!hG*(pLOSpwZa=|H4OIPup%GCb5T>V6*nClwQlu;|?HFC~gQctqPsYnyB5u06#RxZ(<+5_+#+ z6#@04+#C2nf_$T~D=er&yjQI^@llF zFp6j)qW`kTfO(LH1!jnkV0}s-G~PSM1M19oH9Kvsw^n95JTt!kDdq+G>U6!sSW0-*gw-F&-fGnrEM<8}+68pMzCgcFBpfQBCiz;fbRw2Qdy#Q^5^ zRBfs5n`wGW4}qLdoPX!NIEGrtMsb7B5AGR?{yLF(gyV(PJ2J{J>tLXupszlv;Wb3& z;P8VWQ1hPY2J`yM4YI#>7 zireQKy^=K}D1R~4MaWdrsib0nND3$M_c&U#*Kqg%2VGHab?GOkorHcOHXYZ@JjdDm zd(#;(r%5nRs6^mCMF9yUPb^HNOI=exlrq$LG>do2o8%yTFl0}%qAahSqFR#A_-E<2 z1j){Q>u6p^C`oP>$h^Gc4$6+QP2F!ch zR}v?Nz(Y>OTEdh`;eq$HR+E(1VO0+(bJWAw9aJLhnT-%|GeYwhEtMf9$Zu&-?+zGR zqRsAkK-ko17Lyy=(4Zr0D`$15S#6 z{OEBWl|>vPMME?v6T%NYUgs);3H(Ngkg@1s_l4FmC8AS&z~mcQ5ejYWEi6rx){sSg z$UWL{*q#J{SO}l7=l%7M1cW0B4Rq|xcE7wG6Bcp2dUA8RawH*uxt*UkI&z>?Pf}F+ zyKbg!xnr6D`o2dI1pgkJ-f3V^x2psNtrSL49$Cnpn$ehvYqhoM>yO*QZjlic2`lP| zkB+QGU$!X=ZhOtX32avc5x#0byh4XtR_r~u9Zzc3!$D&_@b`XLSX7Uq=`p+3-X! 
ztHwFCFraI5D*Y% z4X8#D-J|~!2zqmMa43aUclG9d0Ch=XeV^=vfJ6G}&(F1^cD{C|+t~hEsJvpR?8MHw zeYsnc&W{W>85L${SGmR)15?(M(}sKwoiF<8S=yI?TkwRTF6A2fZhSco_HUdHJqt@1 zdx)#tMhCoCB_isEH3hi|3y~Q6G&dKSJ_WoPXs>r@a!Yc!3m{qf zE*Pu0-TTJ+Ctz8C5Dcdo@XhsP5pPcT2k$RwB`7n6@;Gfu@VHhu$#;Lq+r5{O!l7_2@|uFn5Dxqpp81V$tE$%Olx_Hk4D$ftb=#C>mQ5CP{1)v;^X&WI(vp zbb=suhn7$Iti4sI7AY?OGmALKWS)gR{ zdJWUzerxmz@I-K#_~#q{xD|$as>;g=SL9yX-%4`4QOtu@e*?lO|AEtbehrXJJ%V`~ zvKN5#{)*B@`U!>smX!8_a3KCtqGNq9UkPuieBPLiiV7|gbBzOVO15TUZm!)xZf@`u z9?g_Z@WUaIXlDAbdUqib3>V%Q`CkI3ZOpEdt_OX_V{`Q*OO=MPxOJQk(I*P0oEnzNdWTlwx);UWM+tWbiN z<7hg2go}cF=!fX+mA-U!8O3ih1UAw5%;z{fOundtDAC6&t#<{I(LG5-9UbH;=*A+O z;^X!9vDhk4#_Co}#GmYKA$c%n6TUhmDo;_S93e31dvlaJAE3W@D%*{@83Wk_M&>$lIRqe|LJ zp;qUEq}79pwQxvPP~>;+>KEacYn)1qpG++DQXJL_PsK(`7zn7Jto1J)-(K$GN%=pT z;RUBa?Ykc7EqERt7uQAV%|~$l%h#`8&krD%3H11}MAC^#Nn_vaWX_=VAYw_(w)&#G zBdaGSm(o4{FH>ttv{&uH^Zp^uQ8;|_?9WCznnRD^FHiYq01wA9$Ig&})hgYlgh&pJ z&nMw&fU~c-3Sbph#yEbfK2UXM`N!=@}Fz0$*oWg1p#ASbyFKzSY zr$1E|*7Zo6g$i_AC~NHIsf}T8af8)o!5`+Mlp#c`IMq%vOFE^l41D4?ag&{6LK~1A`oBD%OkY zuO6c|>y;Q}gCwzxBYyHeK7ryd{Zhn50}#^;{q(ayZ$;)}{9iIYSNg}7<~+#c;xt&L zEdhR>chuqW`oP4UI*Q0xbSMiGhaCoeA^bKx{!VHdd|`x99+^xs<^y1b`Wv2>)MPhg zPLJR4uG8~f*yQUwBF;L2uS8)j;1}-;0Gb6Nae$K;Rj^C&S{Ld0GX=g+bL1u!1`8-kX>SCX zjAU3@`QN*DVg>Sab5jME8sav)-Gt!5pE`Db%tSZ#z6Zw*(MAcH#pZfmBc`Fte@v^* z=*rD|;32`dx**qsQ4PwFjMaSP5UIDCM#zdR%K%YGm-3jQR2h(nVv@xPLD{&U1;0m2 z)Cpv`Df{c7J47i!S_j&JxiyH+Lp>KT`6TYXwjS65f9ZbxCG zy5v5YUEw|MRg0LMl1}3KJw9%#=Ldk}RWt@;97ip{AhbtxH*Xprn#CmMsYZM#(zY?3 zB}w2k<;*^>_3|5x2kpG{(mP3Wx$V<<=g!{!)CsK8bDL35AmOneN@;kn>UZ*f!nLOL zl#ig@jW0$|(4V*HDI{)j<;r|YZ0P&wS!;HU;;=STDc)oiSmXJU^1aAU~`-M8Jf)08`^>y5by+_ zMO(Et&}hVoM!mi@6uv|pD)aAWg|Nze3*wi`g9}Kn;ag-)vFi!z3$b0Gj#0RF89fN8 zB}x8RP#uxfEcvkYNdK0wJcG+V3x0Pc_8|{0#G4})+ny+tK5Rwo@r5nM9M8g2ZaW4r?wd0xqFC z%LhV_@aZJT`v%%{Ei#%6517NzLDgiuO=DxfyX{8C@@Yy5QYKy5KhMT2T2Ey%IVQE! 
zEkb$4Wk5|4`wZk*`Ac#xjmlL55Ge*eSZCn>jo0o3x7R);C=+!9`#!hr@<=>QZisM( zNyd^162lAa|1Cc6eh0ofNnNyNl_qDJ5ZUO$T|=aEuYGF9Vf}m{i?+&`gi~0-8?awiR0%oKa?-)imnO z?+sDLwVJj%(BPFu?!rD#;r>zp7f|Q75&B*%RU;GAswyMue5WW$5xZtTG)DUdxg?*V zzX0v%@fF(l%JzSKfTp-!N#)&HZ;9%EoPIs#f>K=N9xiuLE+uOGniq9YxrU2I7qoe+;((7QF2K1-O!^FtTZcsPTK8 z`U(PCh$4_j2^rZMl9Yr9-%HSR$M!Tr?k6Jy6(WXdK{bf)xv3Fu&@d{re_7l!Tcc`* zIZ~d-r`%SQqs-uvYU2)RTV^8|BFc;1bhlS)|3YL|uoXq|3B?Z+zzplkYfgB#Zx8Kt zSNYhX^cC_D*n2tCq|(ZDzaT41wENsi?O`*(&R{u(&7iJZ??dut6V!0|`v{r_6eim? zRP(!mwp?UKE$Ykf`=kGeIf;6{-}){x6#5{4sh)I+BpaG41Yq(1>ZNR;*b79R=SN3oW8QWs}qbh`Yz(8Cz zpr1{Md9%gQY?U!56qy0Lk+hM%k?k?c>y(PvtAw8$O;S|v~_+LZBx$fkV zEmzyT3eHfEg7l#S8gRdZcPOlvo1F?ht3-y3;8f$t1S%1JQXLZg2%0j=b%dB`3sce^ zVL*OZY-yg)u_1dt{C3&f^nFJ05^b7rWo5%2LS%UVC(wvU4h;^-$}OVUf15#1e*{U3 za4lXhH|-NCRsP9Oe2k<)JzigZmKOArh>}=~_Ej<;G*>D;&r50B*9R4#`{Bt=r0r(k zTf-+xKm0th)6-jq>WgnOYDu`b4o9mRyJSK<=-n(abkO9u5Xc{@yb(0(#8XxdC9sp; zSj=WpS6mO#k63+$Toz*)2v_2EI&{1(Z2}zZTOt!BbD?TbS^1q|04@3m>cSq3d%&Z} zj9j-nYP2+34R%~ah7_mB72}5wq9_~pMnR77bUB`shlN3!0U;FbXMv?%oobJV zT0DLPFj+jGaZcEt;jPHpgmEc1CqM}a#?ATXJJTk6>rrQMxrLPsT4|xtd4zeTDq%o3 zGo;*p_ZJ=vl_ow39@cHKN)>!eZcwQe;-*;2z*Oz{I6iAS=Nkny@0Uv#FLc8f$HHeALm&(;up^t~z8$_?g zW8xAS>$y*b;4y?mFF1sa0_30?_|EuI;NC+|baWo-i2aTf493y{;Y;)7D}kaVzY(Yv z1KhuPD+Q;q;jSf~)u*vA4? 
z$mUd*kb-)Tk575Kf;}gVzy1kGlLuDmiBMM204ceDjtw~wqi}j3qSu`K8`u6Bm7;HxO}a5L8t{8Mw~JPZ7@-&sqt1ze3HoAaWzI?vEElk=E2hh zU;bQOS*FlKu^+*FqlytlIrT(*PxQ(MCqYnvA3~FP8KNuU>m{Lpg$G4*-g?3*utb|I z0lsh73hlhS7EcSWN42l#TrVP8r7nU+{B~Y!5Ecn5AiBvp-!yO*`?c{C40f;;YD2S0 zKT0!oL^n;Ep7V^lX38gjpdE4;f=%%62u+#pqkKa^MSrXj(gr6IRZ!hO9pjB|<0-3& z6l-B?Qbj^`?o+K*d~hxGMLDAtTnruyoB@G8xfwZ@iC0CsQQkQMo9O+I2=2I}JS|9K z?;eA0H)%e(l?qUPnhDy?A#=m!KwkUIpeOA5o+jQTJXjnpE=hd#k7?=*v}_G)gd&|z z>o}cp6~MyxZHVI#3j=pS!WZsafJ4w>JQ25C+tjExce74CN%HSf`y%ZaOi{Uwbmf~} z`9TFoS|*@Ws1Zr7%H)1r7k@av&`)z#j?i~pzHZ#rF>=ymppz6uiSXKP!tR&uu#GX} z)FBKrWSs~xDO+QG7$)dQ1UzcslH_%;i*gCu7IeZxES1upCgd_iFTXgiFxV&mSDc&# zx!>V*S%gaEIKptiFL5NS;`eoo-@oga*$pSsG=H$3YI#R$2*MEW68wH{aM9b#@`={bZf_YUjbt7MxEDJ^&4oD^~)eBV*I3e$Oh#M6i_Aow=AJQv!x zKc#VZ4R4hWls+~n4U&?8An!=E(XUzsEFq7uo*C>4_&@8#_-w#kh8m$tG3Ja(K7&|LI|GxxvDVWz zqh`hXhAQ!kjI_MHK5e`tlsO7V`iN3ZvQ0k3@!a9i<$cJ1D1@QSj~mZ;eCboC^~URsGTLw=y95dQ&a#xax2&L2!nhV8Kfa z;(*AW;C%XV{xg9-0|0Prc)HZ%8ZKWHa(|A@c#h`?+syNaZu_(u6_*r@IS|Pxe*AQM z;?s|(DWP}Hgs}Im>o3H8T`ix7%?PBlf%DELwbV)&R5ZZ8_aJUjz;%;G@?NcNAU>fm za8YUKLbU(rN%jT$Z#iZ|Zo(FvwJ?_Qlk83YEsnDHNJhNO&M^ zUM@9r{9c!47a9Cwad_6kpg|<)To>M>7qrnqEDZMbau}ex2@DDf`e`}{ohV&_PVLQj z2_ltE{BNCt{r|c55`z5Alw+pBM6Oai9~+F3L}Z zDF0m+yhyHI`tr{>Gs*O8sn`i!2vBt|?^gPstqw`xLuwE%|92;+c}rhZLjSw1=t(@o zG*}D(a>KCpSA$~bfB9F&fiXYpbnd;Y_y0rMTSGF>IXnFDOZ1;d#rBbF6LJvwVkVnf zROIQghD2+(>{Hu!l`OSG<)z*$Mxf~K6 zU$c9Cn!e&6nY|MJw~siUaDh3dhO0@LXZ1@}gWUSkdv7_>ivDKz3qKXNa^e5o!u5)UtdjS575?rSMkL?wfKf&SS4?LW$s^A+psT(X-FMl)%#zJEmW zr@U3ajWVSe0+OTY7q9NBcE;GByo*p%`%R@E0|E;2MKXIJyW%Az(8sOVc z*g;&mp&Hx_aq0iWL{&bS=}GbuSKjwUAdxxo#damPGi%c`a{|!u?x$RG^*J^^k{NE2|l>0=RTMI)!m;X|wck-y=i181AtdXsS%Mkv(C_L{rN$1~N zy8-^Uycv;mdz8UY#_-yic!eveah%9B*g_pA7~Dcz1|thdtsH7B9770E7TX|UfyumB zaz4$s;nC*z?X$qtNF_P=j2BAAbXLb9?VZ5ZuVLSF&Pautq4r^C<&J(N>Yq(;3#b1o z>?)&Zzdd9N_ce{d@gPcpaTMctfkJ#Wj`1PyHo6kPtMRJ|_RQ5|k=^&v+ zyuMy|y@}gq&vvE4BtM`A^H^_dtz%K93p(?jQBM8Q(M?z$V^?g9e_i(#wIJ}1|Nb4H z5|Ne7{AU(M7;ej4hn>$0rve4=u7y-*R^*dzmC{a 
zwQfR2(*E>4)W&(E`0>w`Y3rIjM8LeGLY4hGj!{BieJ_`EqAeuOD4^!O#(?|&AK)Bf zs7!)+oV1-A4~?ZeJ2!D?hMVl;yd5=v2d}!}h`YM!TG(i28c@DJTvg~$h(>#4`SJ{KTsMnyT0XtjRdGnp)h8AUALSGSZ~4@qje#$CnnA5cgwEPEg7}$Hfi)70AAnRy(i9^tC6Fe! zP}^yWe-<6*Ur_i`o|t}k=o?1jdyW=(XQy9Ax=0fWXwzH7Om$;thZuV6{nX}%6dR@} zlqK25^_Fp$^boFP;4I^Ap*&s-Eo+0FNU@d^3u9I538{I^CRO^$ zZEYNPZu1-InO`%%@>=8_5>S>NJ(EAfjg7Tnm;9RLZ3dw1h`eRoc`WcVvt3sA;;QTQ zB(O@p`1~A!&L*RpXhv;kkK-TOEEAWIe44aVusc5FkO72hB%kREj_k)ssbVagKV-CT zAji4CKyXsbbt!yG{Ci(0VT%P_&C^^m67I?Ng<%5d)XeW)u+&2bR$2pA>T#t;ij38S0m zTq4YNmXu?{?Pk%Y0v^ol{NB6ZiUKmfg*-Ibv%q z4>~Fzg&50XvrnEfd2j4=icVK3m+bOU&x07>tq6wGJDF!c2@=0#z9&Yo5=fKva1(BI z-7hHW`gJdD+8UI=L}6X4$q~6x8uMRt!b`0>i<+Md^57+>fx88;4Lq8Sia5c|2q_c$ zJ#q=){*^aX-cr7dHC>J~;(iN8R8+LsCd)<)11(fr!;Q7ytDXYnY1@as-d~&7ukIU} zr<^g@>ExWLq+5leOgMAi(r~>~-zaT}z*h1OyT`6_Y|0`zs4q*YM>%2ufyZm0X8*y* zIG-qp-=LGYckGQD=?Qads$3zF2sM_To@LXguc!wN4BINz{+!z%X?ho4&6dZTX%pKm z8WSh~OxEjvHT&fj+Wyt1KZPD4I499ucnSNjWS!{xx;<5h*kWGccjoL9#gDl>_p=ki zZRe~-EUqgvCuq8=;$>VIOZ#he^PbjuUvKgCvzYl(a`x{Koo|UUnY2YuJfog7)6@)K z%l^?u;iXO|pS(xiOP;o)8MWPZuaZZf772W3xdDw8M});&dy>Cb`-O*1dP(&DWbYrk zMq%eN5gG>$HW|Zs_uexNmbA2F0*F)mKya|eu*a94fwU9aae2-|AOOQ zn=L+E9it=(bmUvStuwFZ3I_^E5vE2&fO@2)?kSt=#vz?j%rV?<-G=4Ap9GF}>Fb!Q z*Ob})S3RG}W5NO~PtLsUQwzoYeS$5zYl^z6X9gSv&2&{_ZjxekWjDLKU#lQs6%*rP zTcbYXanCZfr@5ylCT^Aj`D*)MiOj!8zug>8QYRd%S;HnSZ+N7U?+-hrdus z`{!*1?^AmPILYp#{tds*O?@sTztW?13YyF3cjor##kX3sX%O3EYmNF{O;to3v0Ent z6^GecE(gTnwC{Xi`Ded|A~|IoGk*(Onjiky%C2}87k}g@df?Yi6X@pR7T)gasrSY1 zL+r^IA7BjqSnFi114X!=+v@PxHA9cjA9{}ZKNo-2y9t;`o&1wt;lPxdG`%UXR=yVV zJqDvGF#(&UtFC;5MEmBb#o7U!NTtOQW^uNqmAW+uv~Y&e#`wz%J}((lV!f` zi({)GoN3%@YHXF#{zG8DtlP&zV$?wW%f z8sUPL;`Kv+kF%?hRAK@bD%TIL$mHe7ZbkUBU8ku3C37fjeB|}oC%z;S0SGTn7I5d; zlu&X7Sq#9YS%$}l{J4S^p!1wi;r))pzkWy5E6~(p8O@Tozka$xGGAY-wjIO?kLOWP zsBcKaKE|Le1;2jxFgrGLc}x6LvTM(FPH7aS5_hdJB`~`pw!^cD(>C&8V1{eBb@$WJ z4<9MVsmA!KW00R?PWA;)my?(@Cq*qbPwi9_IEP&ll5J+??x@j;e@-;PT6aenjFYfI zwnrq3ULAg2m0^AyTqNHZM#saCLGOpFntr6uH%34!+Ocvmu2`}d^h8Vev#0t-$aSnY 
ze8;K0W`5y@$%bTC`#!sJK2`xPv(cmRRNH&`jQRVOq!}@gpImK?;opkMqg4Ck6XN$1 z7R`z7#FgTqFb=P@=gO4rv=Z`SrR&z*viV>fxEGKcgCGUpnXj z=N*{DrnnV8&wNT=I91zBjIn%-#)$lVHvkt@EKJPz{+yrxx`~^`PvA_OU7zs- zpNU7_Uq5h+$0{M?nycy3JC8Hez8C?0&FHtg->SEiS?jilX}_l4={h^!W|ez?8lA#8 zNhM*CKFYa89CMwy3=-OkayGi-dmHE8U!#(8auY6bMJqgh2zA2pO`8SiRBv1&c^uIo zbhCR6FI2}vs~lOtb0BF)gFgW*xi3uog}NhIfLXN#+I@VfgsEWuTIwXWbHzsVdQV_i zW6G<(O>8t3*`a4SbD^st1^1_sR%?yoBiwRxg8yNi+S`h5{r#moMtvSjThDNCTS?my z@yH#gF0alVKAjZ}!YUr8+l*zzkHhdR{??o_c)IU48J%XQO;PDjx-h{Ym8_K{)$#u5=nTdFu*KJH?u41h$sfZEs!{uuPYpHmq zW5&6tBV_dzIbLh~i5f(4R^`mU8m{_IogFjsHB zkM9}FkvtwWCQ^epLemws9x3$;bD0MOs6FCc`l+Q&Kd*XjP5${ShzSce?GX(J@0!AH zU3AAaB>vX)KUbBrruDv6^-ZMr9y_~W$^7x;J1~l97+Em~q$x8#m>*3o4qQ3vw+ID& zt+|YTq9pj&Cp!diSH2AVIv>AH)gM*gf^x{@Nb-(z!&>UX?7|PrgD)omYvEw8SJ4$b z2PJzmhZX%ud6tvRR9x@(FUovjYddH@-bw4FoqBEe(q%jo5auoBF1h70=r671ZhUrq z2}!7xV?WKh@-^6!=P*qi5_V%W*?zv}&rhmQQ2LfG@(Nn3`#P}bA?tiYW`bjLLJ^yp zRyq*;4$0owW)TA`O!c-y$aH08;GvE^x3*5-k9uXnSmdC{&oujm_HFla?FC1VEdlbT zT4pOPe0FKr(31s?4wt&3jEmcTO&?9v6n7_Utxr{|KVZk+SLUmf!Q3|p z8M95r>78tptpj9vwn(9D;Qb!-=$g4UgWNv0R!sCqRk@O8xA@m-u_ z@E+|#@qjqEnP zjRf-LUt_P}w3kMfBv_lfKx5{%EtYu*zBH&E>ZAH@{YQS8xn`&6Op($QIVM@wzV(B{ zifL{rMczi?9#bZgnlz@qltvEI%ae_U2dSd>9nH#S*8oRW^_$9cgRvD~`6-c;8f^?b z{0{@z*v!nJag75X0%Xv(jJ5JpFy_-r_l{*T|M2}sCgnSaDl1C@)0~!xnJc}YP1$CC zM{Pg8u2NC~0N{Gdn`6@uYULhugl){UvH6TrFk zR(m`24F1I%{!80Tj58a=Zk@#mnoK{;BouQ-1-E81Yg6@3!U;B8*n%sW3 zc$n_goQV(1MEQsuyh_SAPA;?J{azLfh_!puz2RxIbTqE^SeP(b+1g;VW+PP^K+X0z}oy0!n9N$c}jhA;&l z{)Ep-e$oBPa}(X zzT?koQF~&B3ZL|ZC$hqU38(4Sd~-jz$$NHib@F0+|4zxxt5O4Z-4d`{rWV8(hLP1v^_M_8{Ud zavNhNf8Lb{fvTw8;ZKF_=OcD<83pKSe!#^RJ+!Gwkt&uGyhJpuO2ny)_4 zGK_{r8$~go)?T>sNkB(b(P++^`7D8*zncj~!L3nvA=SyG*xMd$X}x3ml}}~>U8x*~ zxwY2wqVw!{rbW9lNgTjCUaE33w;iJ9(~IMJ5utU1gkk%M#b8)|%}p?XoR;xG0S}N~ zX|-y?>=SR0ZVDAx$BKa47Ch6SmcynFDm1^pH$tyJ~gUFJ#WJL0Fyur_tp zdoog97FpRbH9dPlo*D&>_KdlT9a2cNORV&n@gD}dqW{Cpdj>WAb&aAZA}UG|6r?L1 z1eD&JA_4-^J19smp?4CB2uSap&=C+2=`|GTy>~(nHK7LxNlyGf&-=d5nfv9=ojY^R 
z7iPlVd+pWss=vMGCo|rA2%9}%Z_UmqOx_et_aMDR_($*;Ai}<&;P))jrj)xpJbq!0 z3NQ9aD>L-+GXWYX_(@{Qshh@I*M2WB%b=s9&+y_!9I>z*kGXbGm$42pa2PQi;Sv=d z+w&x`>kWPn->r<+&#rgWnktgOU!0-F9jKXhH)&hYqshx}Z1|~ z^#HW(tyq?QxKsvNwG7SNbhs3#6vAGj%CG>`Kc zq!`wV52w^qNTyC#8cx`!+BXwcbWjHSMQl|8KdjEfp#_i#AX;?+XF{Vn;H;@5(wZcpL9 zCB>z|CB#!e4cu!2c2u~6Zh;uC5yp)>+y6%wy=l0h;q}tSuTrHH9jOxk3D79In0XnSnd$Og0%K!U%jRxHhG&41>IYjjT z;fL~el-dGKTQaG}Z0<#Z^9m?{aU0`mIW51m%A>LyZP z=^Q(e+_?>q*X6vL$!n1r4wkR9ggE&L#vw4_;sR9D#t^4w>nf z8(p*Yj!TR){mlO{*;V*JWMgeMVcqz<-=7~+1Lc2Uko`POaN9|XUU3|$iyuyVaQy+( z)63g|uK|9i9S4Qsj>YzIpi1d6p6azlPoov|AJ!^WrstcNxLSVRty*c$mvc3)m#z%k zDPD(l6Hm)_n~L^GOoVnilt;td_i&c-(G>9UBV0R8cDkkt;fKA5BsdfgAec<wS>@XJ>*r;my0^G5Fzq~b9i=TM^;s=MYJ>W8?o4+@ zPp51FG%ShzllFizJ(lmgWOiJNiO4}FzgZUUG~k3#Q2Iv>wbet%d{3IPjx6CO$XQMD zW^Os(Jd60h#^0h0&tsjnGaj>FD070zrU2^)9?g&YMnTu(gji=(RF&$Z=e^`O)1_@z zjBe6xwe{+a{N~KL1!;TCUJN!D=#G55n9?q5p3wCf1mm_J(OH)CdUD3mB zRCbtY6vnl@uXWCD{Mj*boWx^y2gVDQj{f-KPy|^wZ z2>6!WKv-B6$1@FgTT_(lgXweM+u_biDcjMH!F8uO$%X};-OSyK3ziE9LT_$dBF>CM zZ+Dw0CZFF8uhWTgaJe)zbawz$02$S|j(3IR)!Y`-i@5raUD6<8>r>n)d~f-9-ke1T zAA_L7uXlxt^G{~m>HBn%Fk5+gW1Rx8;w2Gz$3!zC2ryiC+5P<}@<(96flJ?R_`v-~ z6!N0$HBr7dghT>gNOk9c9~XVu$A0D-j~CK~-W~<&UruU3$J$1!-Xu8vQ2$--USynd zwq=M8p>Orq^nIsP67+)7*JbJ=Ejm(bO9uE!tqy)n;F|cO#_`~u8n)x69d6&+moI9s zQGNHq3c~4kniBs_@g!_=SuOg#2c7(Vt$VdtwMPfrvrpQB3HD5{dka~oI;>Rxs*Gy6 z9pt8aDQ^%wcnDoNhN333Z)!g6X+YtUxB6M@`)ZcGKd-lTJ>ITBwjMdG)jVmtc~kl& zMXc6k#Tj^04(m3Q*PT5R2610I)mdR!!NP_qMj8nZQKHMnBqs4=jUkf@>f-|FD4T9_ z;7fyMZpJ73@rHYI6W+W172-T$fOy9U3idmo!hIXRVAkdR3cF79>YGFcJ2E$GfYnU00WPD4)ljfp zulk#TADfVePO?Oifq2>s=LQ(mWW@75z*71I3rf2C?+;8DH#0Q7tdjj5RDrsd z+}qJT-F7%qp-ys{{N!8&oOcz4@*ZLK_X3UDH_OJY=WmKl3`@djxRg?2Wt3_%Vkkd!0^5nM~>9xDYU zA+uR@$R2Kvx~>T5<{N@Qp?ZsXhXDo8;)ifD*r^>-r5}8$J)YrN7%3*)FJ>v~@7+IU zIht)w_pylT(fORZ_0DgnwH)QnQ+W%g;t!wspD*0rtnsMdFeV;aq7=B-ihE+__ zm(yAl_X|VP(Z^T`HKRBY4{JWym7zjod1p~*7CF#}6GmIoX(oBipYMha(@#bRwomiI zfUieS5&Q9ordNgbE+zCA3Ev0HTWa)2$5%4cPY?hi-O6(;3~a+h$f`B^0M{J)hu={N 
zrw89_Q01&MgfLA?N>l#VmAZDQ8U-5t7zlssQBSzH719YKrb??j;XZhoC;ho?#bB<{Qm^1mVuMaplFqY>KO!0KflC92m^Kpn577xaZBsv-zq>!nzUehU z)F!z6J~_(*2q=Zu-#-L-enYk?uY;Re5Bo|`y+vqVkm=X3%|e<9Zpt{a`;uqi8@rJ( z->q$djg<6cKX;w|o`FR1$)x#cRdIqyQD9M0T=lc<{zZa-z z=aKB!Jl5<0*>>Au6ZFiXrkHwzxMg)MlP!=z~%aon%y>=L6>7lCR>SWi+yO+tCjUM?IA z56GVHW@Rkm4BDQX+~rSIG_tnsisojq`gVi!>D{`vLs~0$cYR)y&1n;`&TqGBz>Q#k zN%%nXWQ6YFk?9i7$m$;6d%1J~b(h&+8CWSxVuI*F+yd;Z!43ZNdxnr?dvQ`KS6QPN zokAx04G8C)_eUB@k&40akxH2FHsEagsZ#&_lbn$eBA>?SA>zCT=Uvd&!TPHQ-y_jn z=>?0~S9y5L=r)H#Pvg|F>e}iD?pC<3P~T67CtWO3*o?+4#B9u}4js-9s23d<_Cl{~ zkBSt%VHa+yzCX24&CqH9DEeaJ$0nDmWFG9}f+=>`bw9SDKVQ>nB#hGKDd5p2%qAZB z9`FzWkF^Lb@E;d2<6J8PU$>ARcdXzWn8hy%O|InSx~M}N1_>J`2K-R#!?@Ba$%Q>E z^+M05Ym=#mHwAERid9-EYQ-?nt4c3Zed3+Q`0_-9FilfTg=?v`*t*kowrHj5r{iVP zlv0;K%(TTq4Ay7a)2X+k-M7BjcnWzG@MnI5m+@9_F0uqN@6X-0;7aUJs61zyw|2*Ad9 zytC?3SI|bWVYM!i96E<7QkUhI$+IV7^}irOVzs(UPI5Wgb)6d#vFh z1J)r>B_8@L!3S6=LS~3=>j5nnWcoKQEK~Nx&{9EjOB}|x+xH<9K#|3axI8~(%o#nm zGF#F16B|i{bgFUAGSK8^F5)qD(OkXNmC1GPSed8&w=C2drl78;hm0hkJ{H<4t$7{PxZtBe8%{s5>{9N@~hJ1ECkRz>n z1hmD7B=jsw?{8S|UV`eN(uEU^J-qcyk@p`Rv@Xt}YTSKV7Tgm~+EPtVmZmv8NmMv# zW+!G5I+ba$)ou~k2*Sy47aY?V*B&~(I>Ar%is6o1$_s#)cx&oghU zOc(ZWkcAq%Hz=jN@er4<>Z1o=9LgasD~XcrA3Z!g3{p^0{l&e!w=Z}hQ~j+@epImU zgSgocXq*+{p5Wq1`!NO6te(MoVU0v})tH8IMrCKB_Bla}IayU2UpJDX0f(TS{F^C}#kq(L#Itm3)i5f|O(5dwHJ@hE@hmIay?u@6fI!U>TQ`F2 zGkuH!rXBw(_){F^&5zBp>7LEn03xH$e#2zm!LS3AuF~21E~f;}l_~H^T)5NKLfOVm zEy;{2q|q3rG=Rr0xYjpb->>h03ptBGecx&l;l$CDMyXKvJ*> zh6r#o|9P8oq+%(}#T_26J<{t*1T@V@w(=`(uYsoJ6{73`jw5SqBa{34S0-c&(soJp z4&=?{wG5dAYfU)2l{z#A|zF8)8qh$P&X(usraL0Yd2Ey5BUzEXr}W&4@0$# z|B2K#KBBg`Ob0N2s{Oc>F#DPJ*J!!i#i)Lv&@JaSlKtm@`Cw=ZqA z)X_S>Fae`+w{iZOgF(iu@3U>yS?}nHSH)dA5G;VsmA$cP#(X|bOJCC`G2$yD8BKGZ z=Q5s3LBCHK*QrWW%ojsimsKXXK!5zEXh#=7Si&px;=FW=c&km*{9!Wq>IGnlErf}% zyOp1H*rm;7F=HN}!qsN+zVoa4yRAd9g6#9YTnED2{Mpt{25iN(G9bOpXU7_<@rLtg zoo`|HRY8|yb?ChmlA?158mkiit>v_<`nF`@z06BR;ldEa^3HKdJ}V4#>D!-F+>48z zGn02n&zroe0lPvVsL%+!ry+>X!g7rc0Q0oly+oIMd+tqpwf_8Vho^C(2!^^qpL_v3 
zsex#Ku-p+}51sUTWlD#}9%K;&P*|Ya+3EpE$I1k*2Jr^*CK|E=c`XY2%LAPQh?%FE z`6sTN+m9!fAI%QCJHj@xS;itA?WfTeyxCwN_?5W$>Bzy*O}5p_^<2w?Mv!WIK0kID zoe@*bJ3K8RwSOSQbXg0a0WNL(*|ouMaU^#F#dlIMLCzz3Tb@WlAp{YDKrn-yQ^Ozb zQNm*RVAQR4u-jWCwx!p3%kCSd|MynRou1JCQ3=t0K>cC`XjX(V9CHzvlgM781Kf3- zy@~nIaqDv1O71b!m*LU5dfj>Yn@)+g7_*JeRmju0t&mM|nzDxy-=9~|E-V!U%y})n zz9M*oWuA}sbt@U?YfgZeS8ZiPMWc9hTlv*MtTCMgvZ?4~n=2348{JZ9!+6F))+ibY zbkwg|aNf80uI7l^5S@RDB)a|7MU#7=QVOD+u~Ob>t0%D!C6ep)fOxj+HKdW!}@ zUX1q$0*aSGf6yoAd%c#~{GDAO5A18pajcuCniMQ+u@#&~S^gX1T1G5Svr|bEEn3`S zow=mec5=MEYGD8C9zKK37ZK;d>g%Z;A)&Lbb~N7DQ?VS>>u=L?jB#%Em;(IEukMTkhM%sf2b~Bt zf|$WX^r1M`D#d^j^F<{hSyJMS`-UD%^cvg1e6v&)ExVZ`n6b5uMbexEF<=1$S(jR` z1L(_IEF1Vl;}W;gDBL#i9xEhiSAVdV#Hq3xzP7naf3p9spXr?Pi%U*40#gZ}mPb4D6l{^ad_4+z zdp_P+84fqGzwruo#dsAW&((oXGJJfyjvij8D4cx`giX!Bbex2_`RJv)F*onW5;3Lm zVjkCn-wc?*)GGZJU!9V(#vd_;>5WqDm2geNW~k5|aEa}#>AGO~%=wWF+PuupUdz6* zQE7*OPQ%=zGm>ps%DhQtnZMZ#n3MQOZDx`FW^N?HZdg0VJFR)Kid zl9W(1*M<0Yn_b?$p&N&(c@n)EzY9Z+nKo46FV}9%)_uCuBj9gi&`U~g;zXVdVOme& zbopemSC2Kh!W=(uCRE?vj+AJ&H5FL+8k(XvL+>uE_cl9@zRgyUj8bkzSlfT`PEzso zxhYs~LgZ?crxyKdP~VEeh=)wFNKurY`u&G}UeNhnhuG^FVHoN`Dhl!nrgfqd8s>ca zIg?z?WzLuR3}J$^Fx!fIjN2{KmtCx|EdFVR%teDGaqkXeus(1Rkyy7%;h0u&byLBZ z*kUC$6`Rz{1AOI#DJ<@VH4<%sxel=zdDmye==tKE<>?v1wqHwr;U}lfs=QF$z7g-i zb|<}918T>62@A>Qcgytb*40beniM>Bj2+5$7vKo+3)EqMPG64e5?D8U2_Rp zbXDc!e!znA-DL85sp6PN>Ni_S5%sz4-mG>fcT?f()Ih+kYVnl*igh=o^?hIW$pQ?` z*G)`e9h+r|X$xG9jHS&h_z=S~)6qXAj3kub@8530t~W5CQRh0ae5e6aaF}oLkG$Z% ze2b-hE_*&|?4{0OO`y`jZ9F((z>MoL>_pW2JK8ZcxcxY#ViIF>CA>)Y+GlU21om(l z!WMI7nbB&Md63-?dkGWyY)9LhMNROfkd$s*IJ@m&)IXKXOCG;@x1hNo+Owg+#+`3t*ZCP= zRFH%r!w%^MWtF*azvUxz2FCp^4a6;_xO1~1^ z30m2oOYV-`ksqb^P=)o-iB>!zTnbLuoo6c83YlKA7^Mjk5b;U%jA3qlSNPuMPJ0Td z&SBJyKBLgv#!)7PZWK}u=Pg~+fS1hL9pP7gW6tCceGIJm<3DWJvu=><;%xnv)WQ9T z32ggQC_Dl&HbUh!LSkyv^+o%hnBZ&1xCwSFEpWj|P9DE9*8KkGL@NO$o3}0a1P2k) zi$L+sY{9o}ez!R9AFq!lV^J&LF-`WeJB_dJ9N+L!Jk{f6_^EgO%77fw2d5-BZZ;lK zfYiAJeFQvh6;xsJ8DPjPN*C1&tUVt%z5Q2A8^xl2SR3218E(LK5J~CG32&X#$J(ck 
zYd68ASy+m-7?72ZZ_-9#V>)569k!;nUdMp`dctRe<$H|dBYRghNW~uJE|D~vWur~P za-K<6msp%F#JFVzxX-rzoa1*ZFWAw~6D{6j;hR+-sp;=!xH@mwcAkNr2fx%raX}i@ zkh9rvh41>C=G9rFzS0Lj)s9UsL@irwY3G6$urBOPHo6RMD>u0uFJhOFF2AlbKWWk; zKTKleE<)@I)hp5$udwB)jGs&83~A~|mjmt#1Sy0@q(}s>)2Pp~fRrX?M&(vZ*lXH% zWO_N@Vl^pCb2;wf=&oC3L*lpLgD$xyeaj9#RdgW!)yU_drkE$!NDza*l%W;q#+CyC zaJ`t;UXFR{f7Xo79_3zU3e#4*!}c|vBBWV#P3E~c0-s#z8q%P-Z8kz){D~ zx$*pp!NIksw5{1RRA}V5Gm$cKu{v#$jg!(O|B;q*S(^Pir6JuY##-e&^Q=S(Qpi#w z21OR*=KwS>L}&N!dBV`+r)`hm$eMHC{#Sh3z*&pZ>?#VjR?)6|w7MKepC09EZa7pp zb^hZB+tFDu!69%i6JbGvuY-hxi0mIaoXoCM8DANY4Q&hCREx@(afc>)b>=B#{R;Xn z4R%L(hF4g8qpMJe@%Xcr2SNQw&fupfFw;^WWHR2f^Ii#ZHJ}}EL%2D!vx4d;RmSs5 z{$uF_7do!b71TvHH$27xdx+kPE9c^kk?yC_l(1-xi|Ar)K6^W%Q?!TfM?6foaZ^89 zO&GaoxY}kD*VvFuHxZmYXBen@nVbMOp0|%pzPyQU=fGDL<`bzz9dDs%Pocsbo0qjr z6vw$|VcBljxB7-Bz-B|QYr=gt8d{GGuWL5B{ekX3?`tgfRyfkhY}_&`rJbblYUrIn z#b1V>!!jVEE?n~qwkAFod4!eJ`%hJ)-X}hy(;vn>JvF&5J$zw5n^AUvBAUOp-L>E8>hDtsRu9@EDqU0ey@#Gly;3`!QIVa0GPG4m_5HzoARWCIL})ie~(C_nB#4${I5BJtoVD9o}#>A@0ve(D)O`CNtos{1{F}^ z+FFsFU&w#*oSDC87_^c2py1`$1nT$aR_$0P)iW_rQ(e2oxx`<-36}A$vvv85T9OzBpK%H^eKheo5W<*taJJ*4ZZU!BoRw8`b1-9p#zx3w~WtPvLB%l&yB$zMN#|F5AVwto=~%&84P-2(mNLMCmI zm6x9vB9&1AsODEE3a0wVh33$9bbu+|U!-jcQTO>cKa@{Tv^afkj>MaAhL*M3 z&`q=h-QG>OM1ogd><%yfBG7xEDhy&5oFJn3ICGg8PRJwm&u-Xh^z}-9ltuBYXUh|G zcv~}aY5uk9zPmxOEH$ z9~Jl0RC?S1;mW6VJwz^gY6qcir`$Q4(M92=asBo zJr)NXd&=VFFx`R^`UO@lk$7?Z5M)GFOF;R>{y;5m;pJ@n9l6y2tsm2s?x>QmbUnZx z#mO$4qm60t@WiP8ViADySH9#Pb#3|dONK0E8nyV-l!oT2Z6)Siuk|35&`;wIff zqMqU`I-bL=C8$d~nw#)HA>2aN(>g%_IOWHse*X;qo%269{9gkg*%Ha%eY(zW;uLV?pDynj~U7GgX3v0Y%u zGVpDg#?3jC@L>11I~D$WMGdixAWHK8eP)8DzVyv8$-SVb=HKDalg@4LOH4A0_Qf)?WA<#hee)F&jo`lsUvl-7fVjO%9o3SX@q zgxd1sJngGyYdkzRs{dwU+xi#%!=J)!Hm?@d_hLp;6Fz6RT6Vq}Yhl0(z~25ZRH30u zsH;*IL*hZ3__a&N-0RlLrHbT1v@tvG2tNKV$pINO=B=sE<_LMdxmPXO&8ju|@=cxL z;~hb9s{e1bOTJ>j+;ndTOPY7hN;zv9QJw7);;lk%{Wp=^jZe<;=?!a+N{@+n7!DD? 
z@$klu|7ZPE$}a9@*A`DXEreqnhKC3bkB;P@6M@?R+l`l1wJnQdGM>S|B#o0B10G%u z`Zf+cPTZH|+ilpT|<@tb5inU%L)` z??zlU)8dVF;32`G-1_p;hZ50$R992w-c`<}Sh&BY<$b6azEA(G^H_p6%;mced_nn~ zn(E^@t=~FEO4E`-I_*vWR_0*H>qtlp*%6mZqZ^AzW_6L80cM;A!}AEw?NvJ(miw#d zb7!@wj!@c{>@-b^9qDZHElW{~Ta4>Fe-X*p@oyl-Eh{27B{2s)-(HONAH)?KJ@lPy zxD~NIdgS?Rk*Uoc>#ru|zWhiS_ed#-fpwKZfO@(a069JHWx!PybKC<^*d#Ux7G190 zEbw4{zC^m~Sl#C^hQuV}zIoXV9nQpu;%s+L)ZclX=|v^HKz*ya{t>M`CdEd5%(gRn zC5uoI3&r_Ezk4`=FWhUunyz_&Ui+;As9Y#X%&Itt5d<F4#+pa0@qz2rcC7vevuW%Gw&l>rwLNZrA?0f##{lTB~*`x;&A zYaUcPR*W^pS?d28QWZ?S=Ji4hJb%Ujm70WF*dYD`JeD^MQL(Zt&0+c0par5lvGN~F zvEZYf`>cm3Ph0>(g8RP=4en1H9?l7q{tf=goTOKY+R=26X}j3JGv5!Qypz0vHqHe7 z8z-Erb^`Tw8KxxZ>S)i8QB=%c9LCK#p8t)3a6BUPlht*D)IaNk(7hEWU_tT4FL=K| z27_l)v}U6ZQ!q1YfA5@Q*T`TobSE$(NDC-SA%PeXCw}oxvtW4l#vPSw zT6gIF>ZMpF*)tB_6%THl%*6f)C)`|+Op}^-6f>XAtFMMYkqlUGl$3Cn)GJ>DF#U21 zVDLUIaw=Q7EH`+ApymbWLjA?y3q;ddnbRcp`r(mBg*sFuYa z;pCJ>^_O8{)hJvkPe?)0-9Nwk`^`_Hk2EjqVnEA3;`=8a=U386=lR;-nW)GtiWE{% zmya5^ee%{wc9QIQ;C-@OW<{UE=q~_Q5w+yxMcRtNyXUf|{gppT_^INupFHq>aN?9h zyqsb=+V)VVjD6jpmXky5L$SK=XwnzQ>T^9={Q!g4j4b`#zeBdBS<8aSLu6_>YYl5u z^X1mwmUrlBXxsg6eosp>Q98M9M&Ytxc$b2UB2kz3#xJdV?Y&x9tIfb%uxJRla^7P% z;2}-}+{|!_!MjrfS5WkE)?P-^c+ib-RW%y1+bQTb$g!#&J{1I9<6g2XS5h3be)ikm z+qg>@f{=*SoSL(i3705LWb1w#$&mWmb?Q^dhi_96EfHGigOovKyFK03lbw1R&jYDA zQ*CJ2{ZySY>ezHYtabwKT6lz@Va(G+A!=@G#AhF41pJs}D7$o;-P=S)^cr)}NeD%i zs=3^#X!hFAdHQ+k1de}s_gUU4*0D2SXtAQ9=I3wtY3*Y4cFKaiP$V;x+!;Bc#S))4 zt*TFPVW)}*DsD-mQ7-u@<+P;GSvkWjw|=vKXZz5|8fK~V+ua0rF9Qkh?Pr8|c(^Fv z?+wfif~>tU1^`X8)Fq2duanFPxh#JBi^?n4ACSF0X!Ne;PuMVW>`91Tq(>cP{7&yt zzFVM1yQGk>v6)qRZ8J$Dqk zmp$HMF83Knu58mudJkQ1_5C2)C4LU1)|2S9d{jY+cIo%)p2N`U*S8^{i_QD^zRgrm zPqjXbQLrqdBp#UU!~l)4(@k_)J?K$h=-Jh`V6y9gT#jR1ViksLY6RjpzD)C9Ll88w z3oPiF_Kj{iMwdR{U1rD_IamirAHmMeug`rvFw;7rXMC&Y09{NSs!uS5 zI78Z@C6iTtlqmS-dpZFE6fAVompOt{`M|>d`Ie8fEhT^@p;py@*ZgKEj%Fpze$hMIKC#~nuSaGbXnwY zt2F;9d)F8#6>EHXA`H2E*Ba<|AyqSWXjy4|nJp!Ae%4ZkV(x87ovcMYpa!i^T9kRx z896OSa=jz@tVi64tp&uP>=@F-U3>a&J#EI%y3JgA5dWPr`9=f42Ra9N_J)KuEJf~w 
zOAv1*OQ5zi5_7OO>s=IzAL)_#Ev9Aal0N82*^++Z6q@pq#6h70Y0NdUY@MkqyJh#g zqcu@B1w#qCII3CNpz%3V)nbD1(3Fk(fy;;};tQ_EYUAU$TO>(PJg^t3M3EoI+|+a6 z_*7okpFF>UKKb=amRw)0Uncn{$XmDCyYK!({yqkwTe0kV_N_l3kj6K%=Ps?n(0A2^ z_DDf}QNLnh@Td8q{^*0s#1Cz&^sNnwK!$$aVUDsE+k-JfUIsNyEp>}wJ7~qrk_wO` zW}3;TxKY1u%0$yOVof5 zC->=_Jbd11{HzDvGB~xC-8E+brchI!_noIG?BKU@qdtE4Wk1*!qhtR*Jj(CjdXOnN z__Dg`;is$-^u04zTAqr9BwdeRzg*DAa&AB^ml8XI!TS$;754FcdKIEa52Tn`Doo94 zAS6V1JoJBaLN`@p{*dq1$QuB*#>eYjOm79aM2FMMvx27-$>j6rkP&`w2+*oE>i3F(Pd`l^KMq7Qp1kDvT_^pWGn|0mumxxpGx zH!2^fPms%qHuh+TGP10B4V5eE%{og5n&WKhmF==>w7GMn45ud_yy55<4O>+DEB49Z z69M_{WNoPbg`wUwVhmrF4udhPsOLier)$7XKeEtmAW$ogoaX$WXIcL_h_ z35tm8x9aqa{-KiF`n}uTw#33TKec1t9}1a&90e+l_pkYceAq?=-whyAhxX$oa zGA6EKcw)XVzxFXXF_qrUge%z2`TkyQwXnmPu7{)V%Z1bQRPRmwXqNxV{7&IhcHJd= zC5!+Fca6wYMM~3~m5-L!{;XTU@&8y5H|>k{nWE>L&7w`A+Nbb67(GJQ@moGs+PDMLoihyc(@QO{Pcd?|aK4q9+^=??s< zd2ac=U7q5$7tl0G?kU+1#fbZ@KLG=oWd37mXGl@J03lp;#Xu#+6%v`Jnw-dl64Eah z*&Yvqy}UzUZN~IQ!Kx_g5FD-XuIA^sz5m`M!vn5;4KpUIsArkkRG|V8`~8)l@!!{; zl^{(dMfJ)$oNsD2I?&j-h*w33s)@bAW;J^e^^aD+GY(Wg{3*D`Rn`lSNvn?26lbG% z#9s>Plzl8sA|=G5iy>MC-Xb5#$|=YSaMTX^`ZQdsV6YKf^gdzuc!mT*$Hlq-l%7?P z^t9%QC;vgo!d=S25o+6_i-H;Td+>#lkJDZ5|D+gr37sY9%+!vm3Vt ze((=w$ihLM^XUK`2Rh+`(G29$Skf{gWaFw=|sfc6E0?8I-MtOAZ?y`BOoT}QAr;m5s_db8^GzTjS_Y1y1q-WqD(yd`Y zD)XAZUaDzc@mYG$VFRKpo}8#%ifKvJHo}})DZg=B0Ax2Vvbe7xnTJJKA5n#VPN4Tw zXCVFc30^a_$?tmkc!IWVxVkUreVQbHWd&t{cVd2iPEec_DW+*jJkONhTC-1V=i1RE zIypyhyu_TDFx;v#JUO4m`f8r-X;Lpgb_|>X_D@Z;D(djy0uF&rFP&*k0abJ_OAUOP zJ-0AM&5KbRE2*S!bc1zRBBpHk*O@A5mOcF}A1lh#vX229N#VsoypK{sIMDv=umF17 zBIENJ@>L)MjtoSW2LdaFy>Dz3V=P^*hV)wCmwI@ct# z_7HF6))Glgdc_vd<_rCIFVNLBE+Hd&^f2|c%SX3V`oa@-D9_JQ<7QY`Q1Y^{@5$w@ zXSg`Q?CDRU3Hl}IhphR!W4*Ped~bQdZ&#h=Gbnw1 zo)@c_$$g?16^vt5@2@QgRtDAzU^)~M-~KI)O@+-?$I^16MYp@k{F8G5MK9{_ zW<1|m?&pEbrv9{Xm`^4&DTI8royPFI9Am*l`jRcRcExCz%H^-lu1A zy`E=nIfQriATm)8a` zE6o?*?-sO%Wq-$ur%)t2ByhMZ`SOmAt>L@8O0Sv3nO^72lRIP*->oE=M#)4J$+CI# zp%IjNHo(U<(S6`>(f1|x*?8~mP2_yUnZd#rM;Ryvlgk5@(+c;wTwrTN-uNn@>XEB 
zu{|Vpzef1Pt;|{u-D8tpKdVqBW(@u^k%MU2DRpg9I?T2O20ly-H6;tJ=@TC_v7Pgw zR3%?YC}E(i2-DmsnEUnnuEV4LSmJn&#{!Azv5~{~qBUqVVa)e5NC}kQ*zn;ye|Jls zA4^Vh=8S6>#Wx=n{w$IksXN$3@#6>Fpt!n^HH5Km*Qb$3Gt`W3|26po!7m*JSMQAPT0RjF6ympehnVSRVv*rEPOhxv!1pu_xPn9ELk?sIlQD^9xaZy)6{sx0n1%(Ke9 zyK_c7Mf~vM!QkRbM)j+ntVbWB$XWLKN}4)+Q~qUjNwo)!{S6_$tWpeOMis zK55M5N59WpfVxPJuf+Fw&xzA*Jn2Ji?cX}$j8vi~XBT+w&RSYsGh*9`Z%D_A_g0nj z3r@sEwCXI?45Mi{vOYYilyc>KJGsK@TX%m$e);HEfSZjUSx;YIkgA$A>#LUYRP`iU znfe42q*M(^c&Cx5m_2@-cYqAV)}$tB80Ms&&o?{zg6tMa20x zMAp!7@Z`4*NtAi@65z^TGM*b8Z%*hLLzn(|R`~V8Uk<%m1@5vN`K|u=s8T7j!ojkB zt2J0lk$R*j?=6FF?Dq2c*PxjTlDCef*uuw!vAa)i=+GTur7{vinw+nOh!8Dje%mR4nq3 zBE#KN^cA_wm8NpuR9mJEp=b(*u;4{nBYPw=U4#qfC3IJ_7=KtKliyO+_;_~_!_ym5 zJZ{Ddc;z_EbLF!iceCe{o>w4rE4zfiCjKGG6!W+JVv;>k*N>HbjOBklmP9+6eR12^ zXeW|NKJr1~WfbIp0Xjj&zE#O0mBHh}y&u}LOs3t9`f=f^?g1Q%3|3U~ZXx8dNO^>k zsdF|Wp_)07B=jRgQ0O4IxrUWhgpv%I7K3bRFR(ZZtw@hlpj zELhGO+)Z2RID;FfiJ*~T%H#lym1^v99Gz9H{^xQ{FKs{{y~qjXLq@jIh2fYgL8I!a zLy=0;x*8NT=*LH@djz1^!nCvAb;^M&h~RONiZ~T#LlI0B+r$tuB4kRGJY4Cjv>P_o zih>w6D~=A7xvHI}opr2{rFAm>_;6XM0!1X|-3^?Br_^Nl+-?L;SdSachYaXMwj=k? 
z_}~2}3JPUieoQo6nvIwCjtk*pS-uO8E8udur;i`KXyIf2E{G37Kk(t=iVa7L)*j8P zGMY_R=~>_zV@f1Rsm3w0TjcB!nO6)>*C^y_1w_U`ivafvynyT*HZz*m${nS4!`51Y zz{6(8l1kcLVQ<_}hjbV`DT>z7&I((@ni^%aGH6@`siw?(>a9mjxRStFQ9Dqt?H7#P zU%#pf<@CCN!Jc>co@RcyrwyI-2hnoSV9z#(&Wx1>$t3r0s^v@!E+>PAg~%dRq99NK zv@{)jt+Q!MZSc4V6k%YZ)HV<&J@4+NzJZwF;tU&Vmue{@7=^2nG{KifqMaGGHK1=1@w)<;JAO_`58?#be{~!e91g zJ!^Nl-^t1sz$7LpuOpEUx8H~YDLBM$d_X`X!i&2M8jKCr(4WCo|i_D_uIpBz0QvwKWPpuYldgU7{W8W)HtBpf%L3O(I}$Tf{VADFbZZ)J1AH;2*u?XQ|~hG zYh*1_Up{%b0xhI;GS131ZK`IQn9yla@^F=_+|jVET5zybkX+MKM+MU8TejmSb*e6K zY#2ID(~f$3o&_UuO@=OLs1!OhtgdoZJ5aEqN=HjVc$d($p~h8a4;mXDkP*n57{lf| zw#kJPxOQl0KrbC{=W3Uhi@bpGV$k?-b&}Rq?=o(z;hp^Xhc5*8`;G8hp67VMV09dR z3qrlwo>gAEJ-c|xmLmqclOm+%IcPb{3u-wX6Re934!mu`pt0#m8l~Ld z%#-XAo=h7P@SxFQ>K*}TgZ0bHnI<=-A%jLo zC}LE+n@2)LXF!>xp%fHrb{jU=v-K`nlrm&e6hRYAgA19P($R`|4ME|K0*hfs11cIx zllrNVa;blv)S_Nwc&Tx}fdugZyH)e%{yhVIZbva;hAwl5Cs`U(F8dvJ@K0186(mX0@Vt%#NXuw{7;v5zks1z zgHX`=F(L98HPgg4uBs43DcQ$K661NPsvg6_E{~!0GTzMAuc;9o95NIYtOy*Umq#fD zUL;6qkQ`T)!*bZj)j2T*g|mV(RU6Pdh*A?M9Tzz1{edmlY(3b7_J*V68d+9BXa@(& zV^k8Vyg+YwmIw!hAfA_I!hj|o#H zs@-+&#x>O_f z=e+k|T1>dVivzS5%y#FNtl||1a`LMhbINNukNZ&=z;K*{VIp+eD~F_y$wS zF`ZIqbc(?;!9s?pi2x;y+uWjF#?)Eslmz1-WH^VTLYv)Okb*FYL3)`Mm+oCwgPZGZ zMwLQl=LrQm;W7p5uqlc9S{J5O$XsNE2BTzz)nu(#DoGRU`A;MKfcq~V zn@<#08Z4A2nWHK4?oU_pf@pU$&s;SwF(R1d(ceUxB(EBlNw)8Spn;1+US-|E;;J3l zrJGL{mev_nav93^(M957ICK&9j*lGEE1`c_dc|uxGs;X273C0Kt=~jqO#Fd+$qH2m-1o;`5? 
z#bVj+6SsZxE9sp)n7}w*z@?$3MHHBMhk~-Y+}Drp_sQ)up@BEh48wBOM(fG4n%@rR z{j}>CD+r#c18~oj)0j>riwFw1G_(8UeyO9=6a8IS;G)vS-7laccUqM>+t|=VoN*F% z0~?VdFFWjxA4u002x_fBDP+b|?wH=v!l(?|RE-?P>Z0V;S!f-3`EfS18}70bZYvv- zD6c&3j_WV4&gOKX7)DXn8h2DLdD#(nOa|R}N|38WrH-yIa>n&lmK{PnLc7BSmJN!P zSDtXk_m@|mWc9H)#}T$hHln+{?5I1wFI`_KDD@&Gqm314JMxl4?zrBRu~H-mF(8bt z&hZrEC=-@UI8B7x#)c#-Di6E54WesugaDNI6STQn`XVZSOs1z8a#5kD8;aa^m<)UD!xQB-3eWHcQu^AC8@#hsV=D4?puEb~>>VMQPan z)8CU;6g{);U}wH_`b*w*q)r2v$8Vk$Jqx7|MZ(j(0PRr}n!krhAD{7H^FBPo`l8Z$ z>nfx&dS(>z&hc}4f6`XPGv)Md7d`*;P95ke)gK?6U4EYUXCs6AFDk1K z=gxfPn?SYV%nx)Lbl#k9_U3T|-kCR@Bp~Sr{soJ}wd!EbiUT>twM~W9jp(kE(ImS0 z+$_iOe7~gVF})JUXLiqsjfl_(_`9%xuju0L2igL-|Ds}cxTd|dfAS><&r4`lDmDQmPk)C!r5A~{}gFl27}G%WH^!}q;AawLE=QW zu)t{p)8>xq8?Mu6RSJKX7HAXTexTKW`~ASL`?GHUcm*f$Bq2SNoeX>9m;oQ$I>X=P z0y-6?b%qrOvbUfhpsuOB-b4^m&r(4UT`b4)!uZVO%lh;f+#@zMDlAy5@^@i@)&TAY zS`E1051{AoCC@HCTwE!qXh9Uv53uFA+j_@G`ny~}C&cAu4;EJ*EUws*Q?~VFvBS-( z;y%Gi_#tcu3_jh4||3;^Ezla&P)>@7lNd*Z;xiE+rPD-O2 z;MAF`1>El!e%f{7){mFd6lHU|uN&3xv%6;byF@@I$Kr71S2gS_EdOrD@e^e=(l`^I z7AKq_p#MY$288O=R}bxd^@z+Uz1H7_1zJR>bw4ID4Q|fFU?lWYy)fW7COCO}wuv>f z-Z_52{eENk6CWNetqD{setY7k{xF_Da1q&GP`>(b-o|5vb!Mx#p;Y```WYyULn+?_gwI8Ir*548P=H>FA{J%t$TE z3;r&V-(Xm_PtVlrXHM~VLAoxrMr(FO?S^ATyYtHS7M!9fN=Zut2fJ9#;bwb829HQf z7?#p)WLmc#(P92BEbs^5exT*}+5O$eO)oln5|hJB-J)gaAHe-U%V`(x zN5V@>PmfQC=Rp5m6=eN_+`QtVB8=t0{p}WXLvK7@uzYV;PGxO=b)&`MR!BVr36|$= zE+#HmH!d@IQooe+*sz|l5&kZ0mjm1nw4C z@fJps;QsbcQBBj)(y9Z66^mBqlbr^Y^_eeE{z7SX3D-x#e~1j^!`ibE3g) zmr*26NOb`mF9Zjuf&EeG5Wv!<{76Z) z+3qwu-4r3sq>kelg%c2Aj?$zloUIiJrA7~eV`#T>8iYl^B)$PkawdrJ0 zX`Ml>kjrUlq#zf|X%uwt_^7ezNh4C@ho{BMy8H+q7lbb8e!Ja%?cD1QtY0y*cMtx| zuqYktB>+hf4i~fTMA`8ZCo(fGUey@5A7~eV`#UQv$De969WJT-Wv}!y!0KRVO6nkp z5;b1nBZIUNftvn_G1rdlJuEfe--YeibV2vqY__ZCT%V5DzjoV9#?1*H;dG?@U&LfI z*pWpkJJl*;g{|`@!htgr40tN-DP(%o(Y7ckpgIv z3J=sw?w@kykj#{*kU+H(hHw9`bXoWJ#OhxE-%N*_L6?$@lxCsuce(xp1WlXFmh`*d z+^}>7H*bvIo#-OzHI2bAD-|6D+N3tocGhX(8pK^j29;iND0nia3a9pjBIqWu$ 
z<7kqsZZN2#ld`k2WWJm+3)~O13&8zd8Jp8xT-&t%c)`Y_MSF|NMP86*4#07aM~Eo< z*J~7G(vxo*(?2OP*x!Y1zjSf;qrVo_HjaAqqi-IXGb%0K-2q35MZ#2Tl&f|fduZw2 z{M;OcLgBl8f%}1W0k|K)JP#lg1gtrd_tUPERYtSb;bu4<1p(->L=uFIB4dNKH;(N; zE+aW1G)S$4Il#^V++SR481d)_%O1IYR7R4^&G@@?tw9x_T($Gqz2EQ3%R%n<;S~<< z2igVTe&A2+FD&1Dym)JNaY=1cmBB(&Bu$dMAUGM8<@ibcQl<|~>z^2%6d9t|sQg{n zb^*BG>;4}eo;xZ%(Zzaf(s3&OF;DMAom&3uuHz5B^VpMR@#T+67+z+%6 zK@f5)8@}6l^w)h^PL?y+oG6TzLTV91|F2fiLz3ezAJS_~uY`~QwNmcGmeTDFU%21x zd2=qOsq^o;qR~A6!{wLt?eW0W5$(qPtTrfM%Htnz%_EJonun7tNnGYskeLnGX9`_v^Kz?|&yVA!^Zc^ROoC#VptbgQm>&O__JRw(#wb zZodAy3%Fr$e=9)`nZtQSpZ>aumQfE~J1ssk^gJ)gX^JeZs(WVf(){x3QN7cWVk2`; zRUODK9GI5;+Ff&LnrbI6|B?F-_;Tn)2e33H?%DoqHL26wtJ^ znv3O}3?rv#m7H$g&LQN*Rrvmw$EtxQE`057&S3@LaRW+EHHnpN)(pYYe6l zk9_#@qK|L5Ztex#a+h;IO4Fsm#HFhM_a3kC4}SBkc6wFOL$jgLB9)Tbc`X0wPnV&m z%-av#m=YJsa>769AFaQ1%iedsUo*US%G2|&(rY#7=DF|nFG_0b7tXtCWS_M2r~cl` z1+;Ts5KwUl=?G4r-sUy~XB`*z*MlOY?+|F^KOL*LCc?XA4BQWNaB57Jk{ZJ&8xL$b zSy*SbD<}$G`J!h|fdHi}CRo=mF?wvTq-#cGl4K{GJ?O%^f5n09=ay|Mt22dYl?ma& zuU-T=Y>x9MOl2hdMWBc7PVF=;b-6LPG-JMsu>hYU`1Js{wKJdlnLw8Ra zHfLm?O(zQ9`E3{aEeIT&IVAo5DZ><=@oHZAdF$cQ%8afN~c!3 z8E(n8L*MNGWzOay>Gc>fcl@ix_RB7K{mX4>lK)YmDDs2eR$D#56wB-{R!dv z&lk+Kc;3m=tZPM6j}zT zmB^ZBXWo)u3Bu`g+gvV&<1tT`kFGzBLcxnDoE6d64OaV!!m>rnen+wJMuiHOZx991r zFBzH=cf7Rr$?w-cJY(ctlZK&(&*T@rNR1Bv;MU9ET)E@REr*|;eaY3sGB=+ny65wi z^TrOGG9azA!LVS-n%gH1xqI?3mz$mZ(icaHYi=CZZ}za>hl?wp`EkSIJFl8GG<`=- z$yKj^`^@Z1CuDZto>TJF4;!Dq=F<6>4*I9^TdmgFbFb6Z=PkH;0{Vd~8_YMn^V6?S z-aMjL!r!-QV_wPE9*HGJp{u&&r@0XT1 z`;wu(dnCvxGP|^D&h1ZQVfvr|4LU}X#frIEjOTa%@RpqVn-};EYlVnTJZhGfhf)GInfn;a41?~=Ba3tr#~oG6+lxqtk;@dE=C z@*j5{3l7kX&rJTOjNR>aKmYvm?|=LWj}ek_1m13J-2Td)LsH`Y-!gra)gkR2-8hGGOW*lnb$&%{-|lg}65~+Jd!nTL zWYMXJ;K2K@nKHU>FN4Lpa>tbOgkYVz z!Du;JP^wla@4aTy>Yay+Dy#dYB&rpPMziH`eyLh6fBl~8`=%!Fynx*Q!WS!k-nRd; zA$?!D>*}-Je<-*3;dj649vAh@ysIKZgTVbjXQi;FabLly1BDgavPyF*>lHFe;b~{% zWTdH6`X@yXOpYCumN+IOAyCy}dbj_H3-11_Ui(fLp!n>LS?JHp_nv(8>$N*xyggd4 z^M3m1H|q`-RxW>JZneQO>d8;iVnZ2@|Lo3Lz2l`SfAkC)^yvGsp@DJ1fh^A>@8(w3 
zXU0W-c-!Uq)lCx@F8=lL8%KC%Z9Z63Ip>`p{-3?`0F0vA`uOhDo$b5nJ%JD)Kg%bw%{>P11MOe*Rt=8=ge6f1V=f7+okeU9{ z-*1k#noN4_)`P_h7JW6Q$xdkxJ9XZ*szXa(o&r|k#c{dN_w10t>Fg3_wX z`*weF&UJw3yfUotdkVAvxkcXuSw6yQx_{clUg^mMhPi#gcbBf*cIc>4PsP~GPt3VK zAGJ#GPYckpIieHvYW-=;r7C zMko<~pkul2)h{HCy!?qd3)k#<^Ox<{5ALySPr=v!xUpa7L=7y}=pXRdJ74`{Mx;s4 zixP&Rk|0|RRC2WSonLl*uzL5V=kJQOnZTOg?#y}Wi`D;oWafa*NwUI>vU=B-TMsVY zxc|yt>95V1WH+9xHC94l9WT9o_w{qe4`KbG zwmw}aOIGp^YX5M0BuNn~wjY}N!HRv^xoN4%7qZJsh5dkk|NL<2&iuk##*e)9nvpaE zL)&yDQdZ;m#~Yuq0{`mYZ|)xBYB*mQt;TVE?gTc)Q4u7;_y)O_#P;}DpKKI=) zQAd&Bazqd31Hybx@CV9>yxjIvicIC-2GkzmB$9-hb?l z)VN4qfL{vmd+Da#FE3s;q-U2m9+-n7sF6ZD0Q+m)zANT0N{lc8!z;o>MVpO!l1PiT zQ6!cUZMRYs=j3>Wbz596OhG*_DP^8-*>i_6}KAd;` zxZYjB`V|h({IAx`y1d`mK3(R%`<>Gt`u5?Oq!Pye=w~Z;94lM@&pQgL8eU(q^_gj- zGm~O8%Jjlzd-K14bXJ$Ra~+N;Dk>T|di=L{j~UrJ19Ydj%{=!0H>5LFpVr}P)v)HT3M19T7zhRZ=2x7 zq}MIqe(3f^Kkqw`qq6^;1ej1#T@S4L*hpK1N#EcN@&eB?e67p#+LtT*LHf~~uN^lu z3*ht3uYX#yZs(YRy`H}HI{k?t0A&j(@b$q@fsh2YL&B->=fr*-MG7lw-}!FU?!4pO zQsZMIY};~79{wdY*tL_AdnN4{cfaJfhJy$K+=p zEqVIGukB{j;sy6;PlPK17Ovf!|L8lPcT0)?Mwyg->Uvl zMYWDi`6cUfju+NCj@LRtuLW$#a3n0vu$|+hZyVMBvdpxu2~p7&V_P5W+0^3fpZV4| zlLvL5F|7CS?x~&tJ>i8<>}JF5qxyM6?E7o>WF|&`de03giuSnwjY$K$|7+I75X+B# zc2VaT`@(rQ7)avoh0AsxE5Co@aEo5Ic=P@M&42aajfoM~Zuh-*{or0N-vaB?bAmA9 z=@0s*BrN><^~H6LYhGU5Eg^c^aL^TtAFkfhD<$^dw_atW&NZ$5@#Dv@96Nqus;OIA z94E*Y1Nr(7+s5_l`r$)2HM9TiyC%;WJJ1)Pe;f7>`xH?lD&3r<4c2)up7orHv`XYmUa=P9V zbiJ?S%W{KTzC?8<*uS6Ycfplu)R8N8<=npTm;Ko)`_EYh9airS?9MB!XmA!))-eq0 z^!Y*KJ3T&vz#o};?U=zmML~XP@v>!`_uM{t)LoOuo@E^V9N7XsH(Ejts`&(M@3NzvYej4 zuk6)1BR*10I0oF#Y|-3M+wJNsn-Z&^T0ky@i&BWq>OC%GG8 zXN=*9EnbTelB-H)LK=)fY_SMwket<^r$MZ-vOr6IH)@Ro2P}CTWZJ7madIjwn=O@t7?>$`n&NnNItLworWpzr9wwvQ3Z4Aq8I8Y#p;-fcD z4zvH|&z60+VfTXDr`|MnaDYDLcu*yUZ#aYRIkF$PnBQ;O_u5x0lA|M^zV*7^8OgjL zl~mWivE=7{M~X-F@A=~2W-Q;j@69DY_w1bV)$@0`e7|i0mHlD)g7oHs(gvU3O4Vk`!eK^JE+>q#Od+7f9rM3})U*q`~Ts@k`L4BOxf zF5g@5)4n4+j+BGz2Lm=Bf(W9-3PMtZB{j-6I3sz=;LO2YE~xWuakY4?AF!YOG>c=H 
z{zQ%)&;m3ofLjV!(&1;)k|+f@Uayo_2(RM~(mEWaa2(u%f&_Cc%96%F5twpd$`Y_X zilLaY3pqeD(9Ak9LdS9f!wcY+HR^N-a^7Qh4Djk$HvHaVv5XowzOzvO(UZ5Gh5Zs> zErJl;hU4k|wUkL=e~>NQ%OBpNK}l-xHF$DQ`rsNiXFb(#0^Vzs+*kn|bRw}kIb;fe zqRcUo&@HVnrzm{kp=v|1e!^!SLwK^l@H_LE^JGuNV@F4+dG+uqL7SJN-M9k3| z*c)lII=vDVrV*Sq^vc^uL1sG z^MpNMx;njn@Qos@CR~{ce=h7d>9OZO{$bgcy;lz(@Y1{+!rTS7$>Gw#Q*#5L;}gWx+X4HhoWgzy zpkE-n57+e_CwUs!!rdASBfE`2^ybj!FU6_?G^vMv=u|1_GXwzdbx;E) z5M77LlE|0mNdY&WmW3K4*kgN`o!{Zzhf{+mOMVx7beBc~Zv7Z7NzuE06iW}F2^ogb zbDi$FH7$AR{>rMYKC|Ap3!jcy{(bv*`Qf+YgK#)a%8& zW`x7^z;XkHA3unJxhrapssxQ`T8#^fLrMqV| z=MggKFwmzx`ra2ANx%C2`w9U2|L69HHop4|u#MU&`_ErA%QyN0z=^4K`x&0EaQbML zt8@FoUtmB0Tfs+jEYEX-6l6G>=NMM>(KI+;2*iC;en%vnJyFy2vsSsnp%o!FRb^}* zcL47|pAu;^B?6;>AY!a0;Mt@^T18!Ku|bJ{!tQBRc)dJ z#nnF_IJ)n6MPZ!-7z7jn!%D#o2V7Xt8D^!%-!iiIA<%8;29C!fQ<#FS?4ISiA=i_c!-d8wde4utvl4V3M8-hf@TrosF1Y#wnGNEd z`szpC1WqDXe2}j?BD1tcU?i5-3M}Z0jXIV6=Oe)Wtp|_1xcK`Bi|O+P_o5gId_WzE z(G35^>TOHbZ6{Fdk(;jt?B@j;c;pYfwU`(9f6kpYacCCHhXd%eKkvwS{JpQbCnvmm z|ExH>6<{*^SjjUVeGAC_@B3zVNr`3HGaNkN>j%p=eDd?Agh=aSv!;y5N)N9EZcmF| z^Wx$)U$5B)@Z6_s>idt}MziASd4VpR{Um`d+r0PnFMrBNiGT5LH>Jc{c>%1ZscsCs z^}k;>?$6Kcl=#oNQvv${*Yb-ip8I%dWutTbI}5BPiiHmHUxEL&o#O`_0*5~ogT+!58EN0hkJ&ARS;hUJ&a2L5DX;^SnDiga1L}AkdT7hE0NS$ zGcXo#)iP*H@q)DDXzAvHlI)V|ZHLRiRiOu`1VwpK1cqZmgf%NAZe;h=%QMsZrzigP zho`dtbX_@SoYhzM?|IXF;C7azu?s$a=dNq#j5&$@Jcrm~4HNH%&-f%>MlcxQ$1(tQ z^uaa!@qJ(wGHW!MH3~3~-TyORv`>Q&h|xlgz8%$@0_*=P)t3R->H1G3x((%y?`IEh z)sMX$(OWbE+!`ErWEe@}*<*Y7gRAso@4%CKaD{uB{VQY?*J@DRfQdxs{#?mHX7>-^ z+<{I)*YirO%OF%|06nizbDYUuC3_pR1f`DmpSJ-2N^9yLdh2svh@L)r$gByMMcb*( z*~fp{eqejfQAw0R&-?I9$bOC&L3jWD_iL7{*#TI1=e47T_R1hIZ1tYpFIR7Mdi?iH z9Y1sY2tkwp`<*`jJ^%XH;qgx&Gi=s`k#-9eq<=d}53Cjl(X*d_x54G@6dyC?@jP=J7-PA+g0{{Q#e;>Ya06dY>1>EBa=R8`>K< znmNbIw;Vd`_W1$Bx}_w9*$-}oihAdZpDfKQuDEN;*daaBBQ53#tLZPme|w(aAD;XT zEc775hd54r)pUQFBL*{DxKF4pBDxHu`c2T1aGobqu#L@K4{lv7zKj0yTO^(dt@&7TRcZ`&{be(- z=v4a98lk$-FnX>A#Q@7C8g%ano|vg2DWNo1s4p{2ybHC(vj^95M|R54Q9!BTQ}KkZ 
z^r5w2?}#-Dp)BA`fW81r=z@11h?;C%aW=E>C*V!U2x25v_MfMKm;dGJ9iRWQ33Suo zHigr`wVD{46|nq+Woyf88=t=IhVetP7)}7B^!fsyuimm|cP_(o;gMiT6io*Gn1NX{ z#|@9PnZpAC;D-Nd?T(L^Z)7+a4g@nzmk$Q-|NGNMu;0gTnS6Qw?l3^@q+bUc1SZ1M?| zY&25jh41k!Y9;oAZX8^kvIw}@iU$uad;-I9yoKXugTbKF<&t5Si-u@+$bi0Y&KWyr z{E(BL`SS>6qQ=Zc^j7xJT0m|ru^aI6rGPiM;lEf!l5W@y8lGixH<7(ZX=Rb#{*6#u ztm`*{=y?VBeYQATD9*-{GV$)iK;s8~`i~Yx^do1$A%p$D2sK4WTpHD9q7?Gc8$Z>M zdc&C8B}t$+eI~l=!QHDx2x{bXJhl^Ce2_2Ls?q87!>5DWo8GY$;Fjz=j!5pomE|zI ze^lB3C#6+V`)$i9+zlW^Z1esDM@wmrlO^%FJ7<6{_4IEIcuceW z_MD@+$IBZ%UP;#4OjPHD*kM`S(vxf~tND7(_Ps|+>}JEvaU+srqndXSKA7VbHA~j* z2r=AqcTNL6cX<2ZWnLYti%JFHGG(?Avl`dYjH6=P?-n8-IGodUr_~JGghf%e`>fy0}RD^s&Pt%tioyuqJ@Qy8Zb(4jvXIaq{JZ`*-Vf z_RNDVq7K0R^Oc`wfN|goFb!URh!ZOuF3`i4HvoRX(FAZs`DtKuiL4+6Xcm}af+Pde zL6XB2CYvG@5$#E_2ufcjofLuq!x2urk(G-oG0GZcHkc?~jMb>evE)dY94Xdjj+k>&*=hlzD4WN$Mx12w+M>ljb*=;yi zUC3`1z5l-O??|@^tL+jWh+px|93?rtC zephJYCsHT?ll5!0x_%Rh^!{Kqp|&Wr`3tiDBqC!FwE1~ni{nzzOYdGL)DJ-l#+!vkCgrm7ZB1=R#- z@%O$m=$b)SfSpME0M-I~3zz-XDvu?=Mnp-&u+quk4+~(PBH@Mln-PKVasY!6Bb>Zj z_{5mhf~}pr9L_Gme?3%Q1Yci-nWDiKBn{2|I<60nhg7~4JnN@6OY7+pWdGR(fZ_Ck z6UYS_(di4gLtuea@9}|a#^DbEyTKi#MM(yBgFDE8&m6u`fMIx1vEGyb#pcWh%0mK8 zcn+j^A|_GEigY3)f+Q>Hs^El1yHRhVNF9#Z4LV>VfcG|FDiXLdkwItF!Kd5NCPS13 zPTB%X!ZWp0HOmvEH|Le?E2-L;Ta;T?Bgz_z#KBnrW*ox{y;I_Pr^b!UOdFh@)IBM> zWp~|5>~CGus07Jb$5$Q(>u3!&yvqO$6W}Y4P$Q-zws`Kq zFM!QxRH7sblCxH>*?m@(0 z!{WMVbw3;rN;tj58KVuL5>@CDt$P?gi;XM|dphZQ9_hgvZL03c-0;er2@K!oPR zVJ8wHXqq!3z*k8I<{+g+8-O(pI^00ONOxdO0xJQ$nkh0Z#%`pD2%|p6YBK4F#7HYJ z8f@pBiM`_qMt%h8ueKcgZdYDuqnpCRQv+mPl=LJKXEXLniM_gi=G0+5$ref!(Ms%Z z^muQad*7~{ymjAw5Cum(Ti6)@rsk?Gd+xn|!Gb3re*V#WTXy3oE6o14=Zu~`euO`G zYCeD90LgHa9pYFqoHZI$qm`hGhQPc!;EsVS2%;>BSVX*j{5*}wg|>VRc&`aV5e}Z6 z~C?Mm|>uS84cibOgLl!tbuxOz(;dcPA|jr<@KH*%hq@T)ebMrKPJG(PQb8G zo(D*b0B0w#(8iFPvmS(>8=V|e2!B-KS`g*&5)46*;VYtqHUh8LZqz47SqwTN&T6Iz zEH&C@HR>!nU22RStOnB_AaIxsSLRIM1OfvhJCJCO%Pp&0wmW~#p~41lAfV*B(IUX{ zla+)w$Q3=)ZW__|vdmN+fm0VS9MF2~pML9od-HQ@4t-=X>R>vW78bQ~^S((_o}T~o 
zBQHF9U#px0MNNMHZR6%#HG&I-PK|OoLE`Y2Lr(e;(VWQVuZ2GfEb!gY5i@1KQ{1~$ zayM!aT#3;-DgAy^+M^TrLr(grY5Ql|FVeh3{z`0aq2+2i_pl56p$&@Pt@C$Z+6QdAcAmoZt)5EHAL41ndbV{gJjQ z4T7fmO;cP#IC)}oy5*B|Applniy$9g^zhM4?)@47qc zIBe-QEt5pV9!JD?69Z16`WQ^k57UO8VLQKWN&fBf-!9agRxPOc&`w!RCUER%MeTt~ zU*3_!Dak7PRbBj2+28)NmL8}f=bA1>Z=ysDtQ9>ZXP zcJQsCImHY8IwcJ1k~pw)(%9Zz;_c?MZ%UG+L#1_{<7^f^^;faK#r=L#VgEnpFL=0B z*$*zSS6_WKC%-^*J~I}=={f++ehr?20Wv%sD8pblr*vYl?3CYvb8t{#=ZeaXPg<@; zQLQqe>JMuJ{@&M+GP`ws@PP-6#tX_KrLtdDyO7HMi(PO^0x!TL>S6lTdv=z&&N41=Z)M3PUbzo0qM z)lwt@9}M6qfx(Fh@F@X;7%2irQHmgpI-*-*6oH|@P>3`cZAL07(hO_}i=n-QAc%rk zH2#c5kzU+^B-p>#=LRo zWGrVb8}Mmb>6w3CPQ`)$L|Dm|WFetSCKl_3+a_ zdV0zF^jQ0CSN8kux02=i7q7gmd&;ySJx^Zl4zYjxaM_!4uSu}BaZN~-{i@o9RQ6vA zI+5>Kg5hFXSX;pxWF5X>qc7kJ&^0a}!wVG+?jXyRIXrGZ4SFMv2XvF+(*s%JB~gGW zGBm6pYEODBdwLcG(D9(*`U^);Htl@Y~Yq0;`U3vL?cKu+r7zDnB z*>5x$eqOWXjvMb;F#nO~+a>$|{?YO*U4=u_QQM^il@g>=rKO~sNtb|>l#+t<=w_n? zC8fKO?rxBj?rxA~j2tnxeS6>U{SSWe+#l-0{`Zl}#S4ztLA>fB-g2*#%Wr!h z@Ly|`*tpMTK5es9d_k3bg`FTnif|6ZOtyT=z9soO8YXWs;G8z(OdhuX0;jfgs-A2j zI5&J)!`61tOgEld`3Q|<&&Jd?m1y9LA#7e$9fPf$ zfOclkl)iC&A1k5gT<$A$;@HzA$d$ zvD8j88HpSCz^7?zp{J zMQJTpm;LUqq2pBTzz{h3-gjtY@&adisKx``td4Rs{63pq7t(REmTA{;JEIe3L zw6)fji31p*sCsIG#EC-q`sZ#gwmCmU`*Y4@he^ZQ+jqYciLmf`BRiOFIj}auSDxs43w}XzQC#H8?=jROx+?9bS(h1u)^=^qQ2U!NoX1&JhYJ2(*WI`j8&~+^sg3Tr=?*{DyH1m*)9A`ZjAg#R7o9IpF?EodcHHIG*Wq*%xX%c%J(3L z_0h=1gD+z538Olm%?3FErVd8ft8r558xAU*GitKjBRc zU2tYRY)b@~t{zk)gfaGnP9dWJp-E}L-9M^HSOWm9lY84$*V8*2*E60MA!2U<*GHOy zL*Vn~6HS!ONV(}Xl&sloC3P0i=78)y>Z?D?621QSy@&1;84HF6+-v~G)}~hMEMZ;@ zJ|4q)ay0oKYQ{}|e?IXOc0d2~kvqZ%<~V(qnZkp#^?khND5vb+To}qz)l*aZG{VzU z>nqmpDj5X`R3R~tIiLAZ*|)ld?TKN$cYbGBHH}}-D-j@*0w`3F=`JT+{Dy^i8_{z* zBpUWMhCt@QBW z#$ET4xbl`s7tYuJMPlJNsDfzYSxq{4R`;pEX*yl1*7Ir}(e8tVwv5gFIO9MIw7&T; zWabhtoAY2+;_#L|-3M=ozmtlCtCbxz%Bj?a=A{Nb2;@zd>3U%N%&!PE z8207rBftZNm3#VjzDBp0xHc3^IPl0aPs&fdXXMEiGno^h?@0c>$a>zBE7;mFv&+7# z%)tZ_B6fe$1Qbkne6!hGUyA`>1TI}~MXj!^(sfgIt)EX;FH&JNn`?O)K;HCU#D138 
z8Ay8xMR^bO&Px#DKG)y(bj_#AU-p7#x}F?jx@{uPRv|u$QbCqKnC(PmdiWfWbDYwv z?&mh@ok6dHp1g|C{~U3-GIR zyA%P@n!kI>Eipt~V_J)x(TuqZSm%~uzl$7><8Blbq z8Z%y9(hW<}r0w`~t|XHT&7y(NCNwJ=MrB+^ItAQH#O7e0F81`EoQ8I-Cj-Rbl{w9T z`+2)gel6_B2oZVIW`zo|sYG36SJns;mTwJX^hO{-S)?lu&bp=A^VU00Y8Un@J>erX z9Uctetf9rNXLQwWy28E)!GmII1MRe)Ni0xmt1{VHTis-RJ=@9oCU=0l^eI5MasOqz z7H=eX=A>!)BBH@+v(YWb26&3+dk^ zu_>&8d|f?#KYuk25K2No6kOa)|H+fopqBh%zxHR5VLhyt%9iPt9@~_;!6Qz-JW0A4 zvd`R$b_xd^78?S8-Gm-ujH>2Gv0FZlCp>u%sKmj}Ze3#BlIB!)Ys)ZgbxGvtWZA#t z0DPR$@Rg6tH6S(HCKFigA!_eQ&<5-39hf)+5fi-h+yC-62?eK*_s| zZPx$r1%^;Z*zL~f)>fdi+jsFeHdSF`m~Yx}(#HvA@5cx3^e9@J#I z;YN*_0&|lc*Sz{1V;d!4oR1H;n#MVBYbLhmSH&^e01w@(@Z78%#OBIk3b5em<3QT( zzPcz&Hh5umlWQ5oKI7nhwNMVES^f&r6JJX(G4XGhnU>%_tA`hVYtl+IHFyhO{SaP$ z94>t!eMt0?rIr01ZlC74!Ef(p-Y*N&7J=;#XkgVoB~_~Z`od z+ObD?)@$_5$+lA91=g*2Oy1!$pvg-fjgq;&1fh5$rV2%aB+)@T!T7Jm27cDsQQv3K zch@oxdp&V_c=!s2ikjHn@!84@$!4EjJtDL?M=rRNRu8(eT3aPNc?C1r^L`^h1v1UP z>s{UI8Jp7f_qu|0Cwdu{@f1{HZ7)C1Sm)5S|MDDkL@nJnC(4h-@@{82b z^rHWC&Sv!akbd!m%CP(o#*r3|lDP>Z z357tS_nkg7pJ1;;lrZfnZkfG>Zst0f75|`-x(Z1$!|=2yuH5zFKAF=vuVIF~< zeS$#3%FNMX$tlp7o~x1p`47)G{x15SDEvfpdziM$`#4Ms?750wt9A-}9UniM7=N61 zE-b5^re$|@zqg687Sj*JWdNLSBpvvyI1jvv=HJu6aB6@0loKl!Fzp1Pvxjbqz2;sa zI)!_;8)|Nm{TwogVpBS&FRd_b;{?OwW->3u0e!jWd_l?N-jx-d9ZGEauM&+-)NGmBaEm{Y404_y+T`N( z@{VV!aAfF5V=8Wzr#sk@U?lV*K#_jhzb}M!SF}t2Io*q6|2N+9|M`ENfk1UP$@7UI zJkyQCGS|D9MKQDrvicS$NS`p-EttJGe9N_^rjlQhm=lp}gya1$@PaJJoZW8Wq>V-1 zQ~ifVY^|6257n5ubY+7(%Av`35RjGB`(@aJ8)_6I33-a0{oMO4* z*&1n=`;axNzi?Q3g%cP_th{-M|N1GNI`Pna$ZZdMxJDQA_mK55U;Y(~i01V80r6@FHo0$9b8T#2{YiT1t0|$yJ{Lj%S@& z@jGortUo%u`7BhFEZ{Lo_iJ6y?mOpRD>+{O5cJMt(vHsw^u6NL{Ds*`U{j0H2LM=( z73i#pKG#JbC`B0K7&9qz?N^*9XnnBVm#t5`pFwSZD7T*c-1m`(pWygLD{|y84fkl0 zd6T$XyL{5APHf@1AX!I?<O+vB9OaU!TAuhK9Dn?YUa~5vaKEwIrp`i;_&@d$VJQ$GIjxqfcG*&SP@fI zDsDVbfR%HsoBxk)hw(7lqRpGNf@W!-Nryp3n?Jj#NaoQZ@=9==(OU$C*?~Wy_vPIaF(tL--BH3>{E_7@8hWVJUL> zlt5-n=MJA}mQ=oUY#Zs`7nFoYq~DH(PNr}cWGp)2(~W^Dm_ww 
znsU_VMP?!L*|7>fNDCEHd*h{}>c^Kg{kyb7K+@9XBki1Df1#h}9^f^_Mrzan|C@HG5l;uao_Do61qr zc~3Md-|EquaIxV~FUSdaoCW5XqZM{yEJ6x56PJpyvy`q+<}0}pY;`tzL_sZuqQh0o^sO1;E|ccz2OL=XJFq8G zDxT5J#E0mpW`pXuG+VIKE`12X$Exube5q5g7@Jp&qY@Z)8E&BeW8p1{XFhwo-0SFy zG=BE0R_a~f-~HKEyyh?fYAFln^Z`t#0~vLO#YO zrjl!}rIT7Yp7d$`E|XHFj=Ej+^(O+CRQ&k@xomyRGirt^dxVJm#R3yiG=Y@{VO_!Cl!NjqU%f#bvivC~U4DZV;@t`Kx*rHJ82;!4u zN#Ns)_W`@MFAwdHX*cig++>UKWA|Q&f?8_4k0>`g&->}GpnA)jg~HY-E0Kxu34rPj zcsgLo>4^jn0dc%=mNHWmN?SNhmOub;?2^-olr?7jQP!>e<`?sH=F^93T+`f|1)r8W zC$JWIkAL*)kZW)WbK5~uFl;=l$u;&bPQ(t?U1XeSKHCq`@=~450JDR_vHlYFa*he> z;3qnb>6?;LNPM!~)&K=^QwOnz*SFq*+oCe>Ul_C|F*~;h9Ve3^mKswCCzQjiOpKk* zTl?x!2)*{~d+T*_DTz(tHq_qwEP?oCR5@gFoKf^^@Sp<@H<4$qLIbJB->#lW%{9bz zr%3rJae4IerhRY2;~Ey>GL|q9?}f8zBCI=fn3ycTI)mSKxmRT6K~%Rf0;4-<(F96} z@n)gTtSs|wc=g2Yrk;Vx##XYt6;WqAIUAh^vC<)nn50Im9mz|DRD!mrOjgr6TS82) z%iKQ5qv@S*+EMdUFgFW1F0YCr7afy}+xAOzTZ}c6Kt;nd^U_i1S~)69k{z8lzP@|9 zkH5KibbNlg2`Zd>X~ud3EmD}k1#W4=7=@ksZx+`jPgs+!l;cHP0DjY>DA2acJQStp zIEIu?a2gwyh(c3B&uyuSDvdtrwZBj>Cu0dcq^-(oxei%eeX!AOjKVmw|6GF=5Y4Y) z3b~$d3m9mO@im<~!gt9pnXAXF39qgr{ec&Lpb$sNGNr(#WTcbt$Y(8AF`r0d8QGYM zDX&xD_{`I--Ph!hC^Y$O1n^P2u`;)!WkY3)Dk#6=RUooKry@Z9R^#P^b=2_j2a?Le z19x}W=)qkSP91paFfCT{4m=4UB^AQey}D|1ymCrKJX#)mqpIyFPZxjeM7vqX(gU*f zF2JP``6a(MLG_nIsB z>X5vd!OVn|g~9)oTAVdgpm?Cw=G-06MVuCeT;j5s$Z<5fELf^nLSPKl25p?c#oCdz zIwP(4Uh?Mc2C3*z)CmmYHEPEac)%hUAMs5v;)XTP;r zRwii1^5hfH+Z(3hf?O#|%c}I|H`1klH_`nQ!3F=P#=8L;x;87RT=}F&J6gEdaS4Wk zuNNd}$k6L*;>iOj=D-J&Xb!uLvD}lc+Jj1rGxEjzEH-&Oo2rIChMy1KmQ?WCUz5#? 
z2jsmV&6bOOH9{rTagro|OM|IsWIIM?F7)ve)0iu8`lO9jzDRy~$1&N2V_GGS(lp@M zs2Nb5@3|LbIeJT1<=jfh{EUnITpd#m4X3~T9@IFT^*F9EaddU497ld>r=*GY%|mxL zUj={JFF?Cb)^;8bxF3Up!ji3>6=&9W@l4mN;YVpy6WpPW}+E~uht6=dJ@|iB^p9LPs`gooARHk z4H>%*`-M;KwG*j-)m3({<>-5=+`yQu%Ijhw+!aqc`Ux$Le6cdly zt@dVAjv&9ZinaPttW+kxYT$xfVgEL*wcg~OJ($cv{l$;n&_95oT(^`1w|fiN%rR%5 z%S}rSjeys@L^JjI{<}Lo?(x+zsFR(;fiZN4+abh1=$ zxtoo**aC$YC#+gvYmSCe)7+*xQhN`qu+eyV{GFo|(w0Z*uX03dSY7pcNdidE5u zl+$4@di0O~>(-97i|b)wj{ zwl_^@5Oam^OhvJSV(0pM7RB^k$U;S8s?3-83f$vf&gF)0nE0g?)1B2N$L$cYQDQn4 zK7$n{Y)jfFX4e>z-()wVXv1&LNEw$Glzed;h2f*O;1}}`r-0G2$OT9zv|z9HQ{UgS zET^`L$wK7%M9WHReK$=FD&s(4W|uk)g|G2(G-8HCDOP{oy-%mp_h#)b`uMiIP4ssi zN*3pA#aFYYWkCx96Y>oixY6kGP64=gOz1(bYl5j_2Kx8}C0jML!w#P8O-q%{FGYtL zwDXDe(tLAFmtwRR!i*JWnXwl6!hgeT<+FSiOg}UtyiY;CATGmspHBEU)Q&n_|5QryU<7GdHv{G^&Drl*+T6Ok1W$UQQ(Pu$>6|4< z%*suauwav!xJx;W7a6E*D1|Pp+>TOv8Ej&AEi;oM*nto(UO?327sZqg@oCMs956hnTwl<*`yNVS|YYWkdG$Va!h{Ki<<~ z9YeXeMtMpx$A7$i9hLdYIe1Ql8?-W&{qn&B!oc{w8EPAzt`>;cy2P!f=wnx5+yBCEQ*Z;=}gvaKYC@dKb4(z^n8Q+9GCmVIqCj!gSOEFyTx`wIf=~@L_RN{p7)H2}%dt|Mqk^cb| z?fL2MDJxu$6O>O0oS3 zlN8Xy9cI5_+6li>$7ATQzDup(A=%%|QpiW&f))Npb1}v87x}ihc!RdY)%%M~TgQgp zF43Hy|G>=|P0A!X5{1R_-S_JspH{D@Ot504z2nBoNPx}EOXFFVke+DG$fkkSaV_hR zO9E7|U5}R;`Mj9OKXRzpr!L_G?+}q(HuGSaxmUhw%v*Z*&`5(~`&)oJhlyKVWCYvT!q`k1Hrmc2^ z_8P|)Fy0m-cz#DsWOBbPe*NQ;PWn+9fS!4)9Iv9a(68*4zs}!a>}sSNs-S5=kZ4#P zDNOs~Up#Fdz|@hO9pS!<%&be__24m+F<0fKu<$W6Q>UM?96aCXoZU%yQ|TCxN?8_} zfO!8;tPWLP8GeSi!!ulP?RX!zw8b;*6;107>G>cZ`8R&9 z_MRxdh_s_0vZEnRT1b{yYt#TZ3!3js818tJSht}(ycraa3Tg%=DH!tDTyT1%#a5CT zETK8sOB{nlJ&fja_CiK<^Z8&Oj)zFycrHEh4pX4iHDqGVsF}C_ISq>d2?mbC2Kz}v zUS}0;X4(te0SVQFCSf?|Z8xR)Y$F!Dl$^GX&GdHi9h^96Zo;^sVi3YstK$l{c|kbz zTsm29s}s9kWoAn%IA-Gb@iBV$!BZT%CjB3!fR|w2)8Ic5CFw<64y2ilT#e9vJxV6H zVa)AJW*_5I{?9%I#T|bL&2(nA)&eR>xe~YOe980&kK$emooo^KvK*U)Tl&V=91a__ zFIO-XR1QwmlAw#^%kGGWnlGnnSyo-;xQg)x$)Z*%J#ZGp2Blln zX0GD`6g}0@t+j&>S_JNOuupb|V16sh`;IxeKrG+D6`SH z8;^R4)we+R_VH^U2*b%t@+uy!d5VT)x%B-q32A4=-fi(wM|+IkBq8Y4abm>QHXh^4 
zXu|PdF}B3ldY%NHWkrQ0ciS3&gybN$EUIpC={DXsuIf)8I-WlS*=z4!Q_f3#M*f;* z)Q<1^v+aW=mn3$LzO?t-o9-OQjjT^Yci7(T92?deKZA7+&__ph5B6{Ne=`||i!>{q zE$#=k5m2->q}&Ok25*j%CG8~+lL`SeTR0Cmh>j_ku{cFnTIKJBuc%H5x2r{n0Z`Cj zAo)y@0M`N$rjBYkYI<_TDDB6cgK`A8AB2zMXw;><_n{T2trk6ZpfAd&&2GH^Uw=7A zEsgK|S_u-e8t-N+7s$_9MXV^fkUnl61@(U*i$C%ywh1KQ_z3 zu#@$LeCtN->eb$6fr=V7jtKcM!>({_^k4M{$B+hDUX}1mqwIU>p6!pxG%BMzLyo}> z2_b1IGzImXqQ3UfgEgX#MtSR^`6lJ$Bl@nZYdNl6y7|!ADj*B*`g95lJ52wVxd3T? z5%D<9s)Q7$5JLmln9=s642=wSKmJ+CqK)#+U~h8ftrRh%6>$nM8?1POkIaqPlmGz1 zm*w*bxbE#YBbb4Rh0V0X>i_OnsD_XQnd%UuQ zqsmR<@wH*Rp^TZ2%*@~Tr)tyT^=oprLZT67MpV(&eJ=DtEb^)FLe{VSO77}E(8l2% z2W$GE9jdU>TJOpNjK2M(d2kntp9f5fx9?I_k?C1X@z$pT9+nl@^^HDEBeI#6x2XM2 zc5da=wAn3W*oJ(7;)d~S8vhopu!E1;;3?oMDyG0{WL^3Q(k*TT#8N93 zmb=}<89liD40PAGBGm3iAntm^8Jo*ailh*YurQe2!#w1OsLPdi5W;VvVa9A@ARLYS z@hQ^s9nxr3aE&N3GrNO#FE2O@8?>XMjw8+Dg5$l9&JR`O$;gCh!Gbj^{R=H2x>*8T!>83xnx&sEqwpplC$;{A08+%^gyIz~ zD`pq;b93Tn(XnV7Zthxt*4OW z(e)zwwUIN1)J%NPGdgM9a*_<#T)U3~dF*7HC#;YJ`BBI6*kg$a>fb5^|ND;<7wLAE zxW5Nct?UPFv0yBvNUSAAo9G8tx%x&NfAP>TO+w$L;)n0P-gcHueW$b_n3Nw*ku+Gs z^k(eACi3+kYxSQ^@~H&2e6Tro+P9%YWKI<=NZHOKP~WONjGe8k5kl*&sy%! 
z>L)KkFhv*uH64WD#B#J3to3ll3LrX}JGAN7a@P&0n&hmMZ~Sp7uP?0KRvvbnA!N>M zFoZLd&_Kb|0aldGh)KMmke*Ff3Zt#nZ)#2fD9=lnt6 zo{86)Hhlx!4;C-R(1Lur86uObt4lnlUei=i)7qv?bF5j#O^0id(o=r_+6MfR2-&1@bX`Qo0*W?c;r- z=j!ml!zk*(?9xzTx37yn8#&&uW#(T}BH>wVRKEX;%6Xxi*ymd*^mv=b zZLEAF$m_~QD^HED2AD%rXrFG^>98aX!|8Jz)?7ORoPxBLy_>5#IZJUL8M}-Ep{(-tf3!0bN!A=Vyn}znzJ1E=rU`P zzKb4oZ+o1CEmwiD)NSoGkpKl}#nC}8@Y)_HR*Fwg9qGYuPyCTuw{RK&6AgHY&1iNIX_ulcRc>n3yb99>wpJ1c@!Ya1#QKfE}`-n5%kjJ2OIEZzt*xgwNGVV*J$ zDg9{1wUtJDF&-T{k!eF2L9FEge zPUlm`g=gvMd%?}Wbg;n=zaE+kOQ_Ey2iM@wI)N^s9lQES8$jQ?c~Y@*6CLo|9)BLM z+uuvKx7j+SvBT_20MXop#r6W9z9}$t_)PI7vjo>6- zFDLZ=Jpkk0j)~ELa%V=LHM9L0;QK9mWM(hq|DKWI7TC>bTL4jt4e-eNH@it>+M3aD zvb%_o`qBvR(qZ;t=eOIfI^ULRt4bAHSu-Fb&*{2M37HV42nQJV@~3*z`UrA!l_myU zkLKAEYlDoW0aJ_EQb!-*`VJfR=62|~%YHL}NU3##xW_b5>h522f%F8-p5lXRQ=_)c zOymLuEHiqA&ayaAs3~RlwrU6T1Ma4%@|J%I}hPe)wGl zGVK;p%lj(GX4f~}e+#dv4}SxUfxK+kNpcYlLflL!UM-dxT|CeaRc)v-(9zU*BEOQy zAWlLoUWkMCQ4388n_;Fef8rdgbIezJ+#JBJp?(W4VujJQagAwz=U3J?*8UERtWAvl z{WC*`087}r-Gz1CtBrI193GKyXF^&V#pveEjU78*<6VnidNeR>%zOFY5`i?D!WgUt z=J{8vcb?u8uI41_UvHmXu?Tt0%?=sMSKmg6yT~YTKNqQ5&MWxz_CR}X=@$_+7CgK; zP=$=2Tn8DytS8a=q{!GAY%47-blz8rAJH52aZdHHP>)@z4QV-DzZ&_J~s-OKTLB!)V@aNQHI?f)>$l2slA+DK5R(>Fgn-O zUc0&Vg1WsL8EFfp-?Eogfc!?zZRPxGE7}mFBGATWLAW!lCGf%YZ=Qut-3HP+JSXES z-RGqP_c--9Ctf2M+G;}vytdb$%-S+@{PTJqVtUbM0`@Ad^25_d)tQ_r20^G)lxv%8;(SOUBA3v2TqpTCtsuaq2}MV%_AB3y3&3CE#LCLe7^Pm zmitK`f3by5pQR3IeI)<$QL=PP?b?bm4c2C1KG{x2vMutdiH+HbnJFg61tTT2=<{Tv zzi452kNH9uu7VY-F)HQ5rUGYR*J9@*-{|a|oy-2BhzDOzqh17Spx!{x>5T~)`cQ7L zUuJkcp*kl4;urE8vr1y}=z`MdkVPEItca}7G420ds-!_d&9lu}v*UmuTP1xZw5>fkzvv;2q5=5j>3PeMl~ zB*XEJFKZPnbhJG&>eWeKxw>lXBWAdoL37oT2;;g5>w2;UkgJZH3$6=Tgyys*Xn)pW zlY{Q3-vS#}?B4!sHOiQTPzsjR7l-%)JNJNeK&)CsR2pO>ioFN zSIBxEAm}$G)ze@-Mru1l0w5(e|ME|7xCK!CRae6*9*3f+_UL2;$1@C9o|K5+qd84!PJbmRN3N8b!qwCTeE&MHK3NK z)?~SokS{!uYn7sc8l|6Q>Z5AkQ_cKwn{El(;PKQRZ@Q%fvXwP=dbusN+VS~L*aJy? 
z#o4J?yGuHZ(E1z^m)gb|r;|mx6-}+LAoJxExaP(WwP!cChQa}fvGtBulG&PaQ^+O^ zJplh}0e<9mt6wSg_q{`v$n1?07K^mrftdzBxTl=BACdgD2+%Ej&z6k44q-#X`X18# zTMf?3_H{p5FBbnsgakRugIs}W%0f{86OK^Zn+bH~5$Y*3rgAL^Ob z@*=1GCe~9$%Brg~EEO85qUYBP{(W^c2RfEzu@u~l_}LOKI!Nj2uqjA6LG7y7gNc*6 z>J-sK0+!o{(;`_3sMjT4tWWJvfiP2KIhh$z6cqG18{?ZNA!(M9Q&HX0k8dl)oc4<` zA;9&zh!Lh8foj&T>l0&~6%|^HM+R3S6jzgX5MT1icv8d2Q9$R@prRZ@)3sJ+gKJt=Nq3jMb=B@-IbSz5|BTH=gV zY}>{o0$jiR&S{#130cI7zG}w6;9OCWm(fMP=+InF^vh;a0pYXflf0F=dl3)7OUDZC zOh{#z*9n>*W#ds5QLzT_n7~)#3YC1is)%;so?cD$Q4wlVK1@wT8g0&NxrR*eT7o{I z>s!R`X`Qw0$p-tPR*+n=Og$ z0Ie6w^4`%<)i$XO9FpN>4%3?^*mba47#KG<(9gMejU1HSh%}`vDW2TD?{?vQEq%Me zz^P+NzVH_^#E1N;Hf=e`W8C<#u4oH(-S6XYOVP%XhNIJVI8To2Ud}F5dXul1RzsQ? z#{s1Eu3c+hs#Use%d0j(GNaF0Itg*cp+#_@1_SbP%Y;4QjWl^id?#Wq>ECHbe4XkC znU`;Zn(G6)-=os{{M-xsXX@Wi*Uej#_5bW;1VxIwmY@J=+@01b= zC*MFeJ$H`v;TjKqZ4L&K^V5T*ji|Y3KSogF`gorYsu&p0thWiexl}jCyCy`OHrgJ3 zuXOB|EqxyX@PzrIvcCZQ0@?ycTm40j3QUQAvK%|=*m~ULbX`#|%#1i7c1qX9?`(z^ zD&E75!!u__4(CpP+*RNc&qS0DykSUr^DIsF=Y~uGU1hspyT%#y)avB;RG&lE^$IdI zwB=MH!;$;0p{(`*A&OcWfTKey4I3Xi6>S0K7A)Fg7w|k#s6P^^b5J~P@Hp&uwIzV^ zb!88Lw{+Rm!ElCx(u*3Bs2P3_Z*htYlQI zvJ4_rVLPGPv>kMdtJW<34AvUiwfq-#Q*lV<;h6zAS|jNikD>|n@7c)%CBu@6y8`(ECRHej`sRO5z| z%k{(Swd*1DykZtKWKZ}|CN0A+kQa_ZqcE9l4HMWWW;%BnAbOSEwMg)Fr_w~GDoPdc z`o@<`?ztf+GC9hT9>(1Vo{~G4EkE4=KEz#6lpL)iIyCNEzJ)%*-As3LYu=63xs`7J zme4>R>&;sp`+@z=rsSg4Pu$a)Lz+7F!Bd$_uE<8_3qA5SwPC!e>vJYukK>^3Llg&amss<4NP60t*}G^!W6fuGQ6LQy!xgjD#*TP9J*j? 
z<16&;_;X5!qq@QsdV{Cr#7gmLZf=Aim$7pg>GLF7V4Tc3-E&wxHvqi{qn{-7*y+=s`9dYU{GFNSs^o$pWw9nA zq^-TmJ@_~73?xfk;6NomC0e=IdhTEtirxcmYRK7A{0 z;;zItX2C4e9T9Ks`?p0bc-KI`(u~JcnU6{Ey`6}f{&X=E_~A8sxtUQt1-}e}iMV<2 z4_vJNB)Jth`vw3RXqSprvHYoV$ojDV)mFMxX$X^E3Tr*{lq<^gv90GS2!_g9AJUl2 zM7}NBz@h=W8d13OQq#8kwmYr2t&Pr-tZas@#M(}J_Pw)6lMR^gO0ovN$joJrpWJm- z2@azMzYK37(ZpGV2I94N4_k#u+(Jp32i}61J zx@FeB`KzA@RyHgCCb?_a&bPu#@tpeCNB$nOOiEB={jt*<1RJXLcI`dQ^&-oC}z{R*L0^g*u*J2Ev*Ql@-f zn#eKQ02Cja*;KO7xYEQ$v4Fvk7paKAWS<(=ez z565>UVnD1d&Z@jgDfr$b8L8!_ztN4vSlRpqTJ659C&b^^Ambzm=*Gn6+qXFBzPnGU z?n0Mc^iElN=-N*G?M!d}%ja+H`vIwoh`jut$jl;$j&aR495r*{xP)srPKAs(_L_|W(w4)%E8{66z6rQ z_&DA`S1QZ#&&B0)rxYMZ?Otq!;AltS?cd_a_Jq9fXGnq0>>((uvt{rr%37#e!odp%_JJbn8kbGf$4zvm<=0*K#L zh{yHzHVNM)LmJ=IEwQ6px~|(D*EgO9w9k$PHx^<^FJnzxrJT;5Bj_hl)cWPUpbfWo zB5bjJ2a8^+>{}=r3KM;8SL9zi8?aPo!=~3%%Ula{)Tq!k!ph)y7CW_DFzgk3=1$^z z>kqw@=IKW!?V}PGCQS##!qlz}OCmvno-T;}?Q1Uas)|o&c(f~8rJX~7C9;G6x$*;= zj-q;bdSYiey|eMJC+Dyou}re+C?DOK1@L|Q{BEdP*aME5IJ^h0xA?N~KkEGV5y0Cc z9=3+DqM5>-d+%aOm9b(6==p*_mN|o)4Ejfwl5aEZ>0Dd=%_f}~J6orwux*zOlE-;U zqlW)V(9wh62&eU`bUkdfzfHFa{an>vIq)|ZwkwB_Xj!86EzF%hgbSd8%l*dXWh-g{ zS3~lTvndveU3CugWnP3E$T+vnA8jB6X+t+=X9MG0eNMMYULmI)iy=BX!}!l@ogTM* z>V4Ijg3+Ib#n7UngOnG{!w1@eeGyjADLH4IHmU+7e4`GJcg-*-YZnx&A>D3(vxPWD zhp6xFmd&O;2j7*}vpWCn&U0osNuas~TqkI?L>eGoa`OVGX4&{@rUozh9iB9F57cDA z(b}~<*d}T-BWs|#4Bs$(cgh)%ULyert6`T6&p1W2e8Qm)+_ z-oBFd^e&8nV+voYvAeLtVyENxBCjcQYC2`EFGE~qi=5QwEToWlp>pLMsol2`UV>;Y zD6pvZhgd+D`Bybntdbypoz}K?#1LQ+Gyiv0H`UkIjNx*k6Ptk>Mdkf7CnpqMSDta4 z^WoxV1`EnPjzKvRK}9-j2}9o~1AKN?vc~*NKfl%~^P1AHeajOr`0}#-zNJH)U9e|< zVollRFb)ebqXS1nsN{ghf#0bH@uONt2WS<&in6`^COF^^ty>W_Oq!1G;6{LL^1N!Up=}fpwo(+uL@6R21*H>d0@9>~7C?$r z=}lSyQ9uzwZvvr6ZwjH8sHg}Cgf0n1dX01hLXv;6-QV+_|N7jVTQcVwWsUXDG2bzm zbCf{-FjJBGZHS@wlLsFvMn|_t!~hbK&da%f{WoW@#QY<@67p4rug?!BvOYQz-CpG4 zW>T<3aQqeTa*2SWtQenk&m@ETAIVK~y#0c!KR?9dJ7&)8l9z7QR0?jjZx%7|F+i&o z%dX{z4h=ii))=t%Eh3C8mlhunK7L*{< z+xDfuxVk+> 
z>Y^pm1<4-COngXJbz?+>rEfuGUoa`dWJz`<;3NnV_IjJ=r)rWzq4`o+nNwn`ftJaB`+zGAa4=j7+O) z_!@gno*vtTs{@xC{&FK-60nTGS)Utt_P`o^r$pn zot`P&xGc##R`jbQxaoX4{Gsx261JACO(m#3&c<}nR=1~fh6bI>}h&pn-7hSAR7^Q&!d^Y2Jtany0MaEg)% zk=Qm+UYe?W2Eg^o7tQmueYLjDlii=5*h*29EZbU|WN~JfFo%8-VpVgM}(S)iH|#b<}=_l`dbpjzb*&C^ImE!zIt7-gA$u;$;>i*2oovd>}xy`e8sDy z7NRHDYeN6xV10$z8Wa(BAWslW;(+wLIH1v~FxIUJBJHE{NI_`Zfo2pyEla0lJDhO2 zReP9zyKd831=G!c>(f)-Kcy$<$@9zqzG5`5bk=65(`gnYD~;c0*}8OPZh76_btUmx z+N=z&rZmCCN{p{bBsl#p_KeY5U$jWH?8*KX0~VnmSXp}3>b;q;$+v|b6h|Ctisc5W z%FH-NQt%E@8NLKiI>-k?ba3W!0I}F432q+j3#;;HjG-eC5ve^5C9NGU0{u^GCF1`2 zJderP)z>SKpP`6R;U99X^*th0@(4K@Zc!dlqL*3R^FqZrMNyNX+eNjGM1R)Squ(rB zJGFIxZ{ZHjsy^<9AJBwiwTSeXT4~Wcarm);*$T3r?v6b{Ho0h`V2xYbCu+YzQ!p6` zk$uz~H5vuhnAa~W=c`>oXN$#K3*D>{s9+aLMIWELY~rOcG(Bm5PaT;a?7tK69ykbS zEa&Hu5-~z?rHDjU;lJ^P&qlmgzl>$Ax6wE6%c!sA9S1Cr`IQV+S^td~JByCHd(xMc zy!i~X=-D$#>T*-Khzf64Wqu(qQyyD;u>RLq^aQ79wld0god%fzFYDJXZ_7PfD)@JWr zCp6S~Ww0%w^3%?nRT*NnLX7S}s$dE?n10`i+hnIX1y$IW&~L?~a?;Ww1EdOh#(=bb z*Y4u)C$`$fleJHdpDA6B0_)ZlcOBRX%el-wpAJ3##?GKJqupsx1_e1KCnFS5VM_~T zy%$9vRhFwYLT@WdocMb`fgxO!%Cd`_-v(iH-otG^FY6H4u{DK`d98)fIYmHnWPaTV z^Gi7`40X`N)Gi*-rs(`t%krE;wu26eklqWOe1{;nY1(Gyd+>?>ZKv2^Gp}dSLFVA; z<`_rN^|7TGLs~c?2qA%zwsIKvxZ8@Eq*B7z3AkR zbu?2{Ynd?L+gy`t6Xn%R z8pt1^pGg&fS8A)YX9uGLuHgFyx~cmyOj*o;7XM6+0N1A+q>pRpPU^(J?tR_YGayOOwEn?4R2?$ zij_S)5_$gY5oFnfKI$)YUEOVt)2xQ@>|mW4S0m>H#TG72AYLjbW?5fG@E?69EPT%9 z21gf2E?d}M`r4z)w(@HPIIJ@zm+D~Y29PI!uw)A8KNH8upUSi3IFEp`gx-vfNLe-O zb~BrjM(P9Vt~Nqd@NvVH$!;I}1XTX!T@V@c$J8#9)Xpb9KdVwk1P3TJ9 zPleJe&cY@`3{h7hI2+%0Vp+ceJl_v;#c+l@(4LCJ{K_NULo!VwQp=lKevM3xOr>uw zrkEIyJ#d1l{gjR^dk&AUs5hHob{G4-Hn-{0TljObCol8AX7Z@iWAX}akO${k0}Pp` zV^ln7ZxU!DggDc*5_HFkujaT}|C$Q)6%T1(F$eu4t|jt-M#1IyphQ7bAq-0T*7v*> z36@|ITJSPWeZXy!_3|(RtV=^JPf!2U&$nJ;?cdZoPz#!gIq#0G#sVhsOv?&WsUsO4K|hKp(nUb?S2Y~>FFc%$}Eyw2)`=C|d`$Hg>Q=|9F+@1r5=l=ITX zpG^C<&Tw_1OJK%jKW6AFJH96`mLF>=&YR3;Y2{m#H)pJEdc6CUe)0xoz@oS7qi18}31S>kIeQkW>xL(aY1$N#wIv#hlGavkA`=21?8>lQ>DzEjlaw&0Nea 
zpv%qAV0y843O@~cGe;Y(HYM$RvT7sUxe(}Uevm$EvfJzWz~wUoolYK6^@W$_A+;c2cYvx>58{3@%h1Fve`_hVO8<9MCGCW4W@tpmbR=jz9K+NVu>j60C&_KN10 zTbcXn+xN|MzxUuTdd(~0I(KSaUaPdo{McjJLCLhCUdABrD<|3qxqn=$37~L{`3rnkqoHFDSKa2*Y~bk(XfTyG~>*hpD@s)d;f+}5DvnQYH=r?KX3gv z?&JFBeK{-9gJN!_Kx4P1dE4{IM&F%x!-KyK!HdLZImV|(W{=$*lx}5lQfh6fT6kZH zRv*14t)yB&ueQb%_vf@v!7D`sUHG;+kq-o2qYfBCe1^n*?9Q9Z+%s|H&Fk;t4QV0V z%-W1T4wN;uFdBdGtN8IEhcVMC$2*sPTG8lZSGeW{yWQtmf}{MX;{FpTPv- z?f2}t+QfG1nGEsIKQ!+BRD~f1TL{sHKHxAi4o&s(>SIhJ#Bx=Ne!xTF{L%6Ae8#Td z7qy(3)uJq4Pd6>A&WLu{Hs6-OVDS^+sUMR`JK3KM#2kq)Nh(Nqi?M+vVSeL7Q7KoS zw}%O_ewV+#u5omlqdh48U^xYLzi1)5?Tz=6d%3nIV;~oG7oRsDKzGPaFt4mkV+z%w zVS1rRu5f_n`Ul3%^@xS@hU1&^Fhl@ltHb^geFeEA(>GZo5BSWE z_)5o+T0M?&8*+}SVSCiPXKPE_yeEFQ#`cs?=O{AxjZdmxOS@G{f=1_XT&tPhl%q$C|nrN;^3f#W2>p$zZ#>Vchl&1dbW0`^Chx8ScL zVbq@4KYe1f0+MHrgB<1PD>kQdV~Y>D7-qbV*O2PmH_ojzI81AlG1-|i@rzT@%LR%2 zinZ*bk;jpCW-H5aC3wA`#(r~1gbZ79Mu-2$l)+Hy`ZrTv0wJV>`L61zK%btiRh1yY zRSMAfK-85n&#bbLtWR!?H7FySa!sx2HGrYsi!gP)WuUU}-Pi~d7ml}Cfr(LOv8$y& zI8}l$ih-pLbxI^fQA!yJib9&54D0ipx31H?(ZUWlagEjgNZ=1(*?QM_?Gb$__QpTS z*tlhd3#9~)+-jcG(X!t5d^e1c8kjebnjao_KUY^TF4y9$X||Z0c;?8+bfKz-CELHg zUa@66PSB6oeHL<3gRWiboLD<*G$Jn~Idxc>S^WN*jwy10PVdZq!ZKSK`z6J-BCTKP zWJnZ*ZT5HEeqj)-$*fjnzw9pO_vO23Qy9TwEpGktr2b7bI%y(kXD)YFs#05(m3}Xz zEVd@0t4{pRU5>;IhU9MJ2lq*#lrT5SR|S6R(8)28poyOEs;wWdnIjgwzaJCS)%Tgr z740NZU8C-Zm|sP4t)EydTV!~7A02(a#jR*PGjmKSIap4G`B@!QWfzBXRYuBZJMI&P zI=kv;(=tG4rGG&UhZ~sp#5@g+DyhryXH+EY)MKvRCqPdo)>AP9U+CmV$Qd5Y*H z6i+x^psz#+H|vpV)E|iwU3;r@7MK@THUD<%h#vzN%nhgMUY2$Jti_jFkJ^+)@bhU2 zr@l8hGMvAMd7)yKbt!cJKsYQQ{kZB({llc)SG{5KA)hI(0~{iB#T;AqeO!yzD~ZjL7|}=bA1E;SFcPIt_Mq~GY0Biu5qXQnvdqTi+b*F zPVQM*8biEDn)W(!H>)~KRkX~m>f0w}Q{}0J|C`%VAcRc{&fL`}Io0Qe?~ZTP%v(r3 zrWSUxJr;TEEGJNx&yw=KXXZy(mP1ZyYoiq3B9RrLogZA6OX18Vu_khMi9c%|8;9*! 
z^K_)9cr7Bl&Ha^>%;;1kM41~%E&drYmB{9o=UcRS8p2eO)x8$FMB}W`J5-=^esL#ei#FSQriz5C!m>pH?PhJ3S?TdMg znGsvF+`YwS4Qt0MkX|R0jdacEZ&pUl+rB&o(H?zy>}UthM!rn$Ssaeuvy%3|X6~Ue zAOxqpB@G|bQ*K@0x$Z)FQ8k-bRZjwiWB!u>V;V@3YJb;+)N=MU&t%>WdO&f%y_l?3 zuz1AB&Up94$e4e+-oLD|P9Y102w3?(y(@Kp|EyKxqmZh-Za#f4*18?$`;$K(judQ1 ziuAp{0$xy;n6JV6kX!0AySH*9in*`1Tp4r$p<=@z9NWF`V?QEtQEjqczE0A{*^v6} zG+B1X#Ij^>E_4GIdy1Y3TreT_Z%7CGbw722g52o-ll+bn)O!V9O?q$44XBIlG45Gl zoU#%V_)Ts*4T=4nmD8R1(InJXQA2c$Eiucy>?8Ae;th-947JbZ*C@i6ROTQ7;L1i#3~hP}fqSK5f}Gi}|&>}nf*A~Q<;DuM>N_G|^(9lt6Ho9?R7<+j{){V^Kc4Rk&B zgD1pF+eOTC6O21-mfycQ>umVf{!`Z;4N0bjM%IgssFpbw$2(yg*VJ8o-Wzee$@ZIt z!PTlvyTyIf_kev2_WqwOg5|n1C9Wyu4hB5u(`?-?qZM4*al8mLX`$=H8E5ujod=MZ z81hVq;VlhbL|Bv(H;*mgh>JrQ5Q8;`W%Z zcoTZO_BpCIR*7cSPtr~BN zDj*hh9T4x8%g~LtAzT;4E29VEhZ_g)#IIkxT^x|0t(43QVBFlak5rrG#!}p6CmX>b zcbh!R4&p{Vn5;M)b)dYif=et+ebc_|0gu<&SlYbf_fSj`Rk3M!e2+-=R4L32+BK0r zEcyXE?-H*Jd!j3J)3`BgwO1l2;P#$p)}4vS*vh8bFO87l&!!&?A*H(9cJ;F!O@uKY{AZta@!h;Ioc;HuLQd^`Ba*XTsEBgtjeAKd3eCJN3@vI)D<&_kSPW#Jp~Q4 z%aG@=GQrzdV0E}cm>T?LUf!-b_mArC(B1@xyU&Xlk0bM=$J#vOjy^&lJr7a8lF5ES zH4_SWp#`19+_W?N01nZLa~Oz882 zgJm-Fu56Z8%0?^&2aA2X^%wnv=!3|wv;d(=5x;WPoiK2XMj+T^_VOHf%5sSq4oqLxe z2;_JM?)fj7vHfgqRfU zA6@$jf6{poO4%8e0=yUA0kEqNmH`GVGKz(fFs73qe%Pol58kx3U+AkC7mor+W6&Nz zAegi?pysxnGPOgOII(HlfoqP>R2c~LpcQtsbq%?lgox}ak=Ia7R#ahOm%MUeb!D5U zGmP{VINk_^HV0~^$G;B-%o}9+=Fdl~a0Zp;H79hls}OyyDbrV?z-K^tqrf6I3T$9N zAdr(SF#Z`J7v$s*7(fR|0bw`p5%(EzEFe(c1i(<9S_isUM?)9~Yz2YB9q0iK0dqj0 zK7ePd$s@^j3snTc_2cZCQeS?X0vcvJsX*Z{fDHpiG7A#GhphLAO4%4xdxt z0%R@zBWt`ni{|_lq~-Oqw&t6%t$cA^@xI83LvPUgJO2*XhAOwyx8jnx=F1kADB>gea$Z z=mx`I;V`>#-NQuRTYH}7SN=6K&qTO)PCCl&`LUQZN3E^T@%-(V@NK+nvehb{&M=>1 zQ~8STESxm(^9!&zojM8BF6$@tcsKJ+>85nNj{wlj5}~amA-wMFVdLuXnjB8Hb>=;> z`Dm)enwU?IpZnLfp!V`Kjn;O`aOa#iSgin4aKgGKpjiBtlMTTpn5o?C>DgGb9KVq8 z_J7U^#d|IcmDr@eK7>o|Q-+}ePG4~&PRmzVgAdUXd*T)}|FOZN-XJi#v7~{ zKw$mz;?BxfS8{f^;|u2=fTM>*l@gPLR>!0KC?Q|PRgl&DK;ru6WzSb6kHdboKcrMa z``&RYZuamq%5MmtQ$UZ=;A)cZ(2Kgu`|`lOs9Se?G`P|I12qiil(C$rKc;u 
zFBs~k@f)!E_Fj`YJ7CJ#3rq6=-lWejGecX+XwU5YXy6m7nm{glRO5nkubqi{9QSn6 zb^T)Vgz2TAhgA$f7w=B*X`#okRybllM0EME>hrhf(X{b&Ei(P!zj+gdOxf=zJq-)f z(>uJx00LcR4?+@-W(Qbed(F~9{X83W>Y`Ysxq54EDG^as?`oG8fy07YGi6wza!4!ykx4T@;dxH^Yq+DZjJhazl@%`9NXQ0h0 z%ynj;-oryqsZKEI8hKAoxZ=~T(Y0=x^!OF$6&q^MJs~5Z)L;yGab!BDBTHcjYv=qT zk3_Nm#*Z%;_Bmvm@YFN(4HHO3Ba{6WvX1UkcL3!|zzM^N8LD!Ig}LIZ<1|hbsEn7~ zYU(Dl_s#NI6P*<>YcHGiz*QiIq*Pe`27$~aU`5Y7j`yi>6tQP%q3QZYR$(b;wS7#T z<$UH#_hW_QX5j_^)R=>vC>Uw>H}XN9qFSu^2X)a z72Fqm)8y>o>k^;P^>Kth@h#AH@Vq9&+0Nm~N_(ydcxd#38i_VsV2FP+*s1mxg7%hiZ%s8kE@CNsM}*! z5LWNd6E3Cx;q56vGX6yvH^$tt5L|ydJ}x2|6?zOvL)&8!mRDcLSCpm-gNLTDG^ZL^ zok3fhFm;afEI4rI=hL4GZTtHa-$2-g)2If8H_)9z6bLQy56tGdokAH1akZy+a& oQ^*6|!~gSzgK!-G`>7|8Sa~Vj+tln#;K3kGHC@$eR literal 0 HcmV?d00001 diff --git a/example/ck_tile/15_fused_moe/misc/moe-1.png b/example/ck_tile/15_fused_moe/misc/moe-1.png new file mode 100644 index 0000000000000000000000000000000000000000..91a1f2d9dde2eb892ab621bb1fdaa9e1f7f23a8a GIT binary patch literal 92535 zcmce;WmsEV*Dg#;fffq1xRwIN-5px2xVt;W9fDI@pg?eUDDLjT3Iu`{cMa~ros&M# z-uvC>T<`by%dgC2Wvw~K9CD9)jG0g+h0o}yM5qV|2s3N3$7=BR03loFIT-Dqk(y#B;Mw1!fW21)EPoFWl!Tr^k^UK#&=`UK8GO8W0mT1j zRFoM2`?Ie9zNWBQ4T$1exW*>+|Nf{G(4=_6?S}sanSdqYpPL2{ArVwo1i;1#{+V5o zasX@(_5c5i#*9EG(*K=p_S-iQgt`|;XKqNqB=Fx0Q^SOdiRtT|kNz`gKHFY9;hO1x zFaOB$pD%QM|MSJ)BAP;EHTWpIs$rM09;KC(@_}3;V2_Zm{ z>R#Ai{BO_u>$bS;2F2p1%%RVYg%Q+z&Da}M?WjkGCLjJxf&T5EM|A@J^K-QxfO)Sc zQlBQ}u{mo>fiNZl0w+a+)_M9b_X^WJzdzR}yd+@BrZNJxJO|5~(p!LTf(mE8!nY4S zT4tzK-bi4I+f4+1S%3_Dhks+S8vMO3B}(`_5fF-=5exv9zyLPB%`(FSPU_bA^kPoV0MEPwE?HAO;0YKKR1sUZ>p_X z7etOT-;4ZB^3Nk?3d(aOC~d4|#^UZlX(_04=_oVnS}w(Oo^09@QD;E+nn+(kfWp0V zthCJj?H``p6HoQ@O7Yx_EKYL<-qfl3D^>_$k}*Wt zHJsG4UDb;ghJF^$9SN@$_hd^M7Y%LEM09&x>f+g1M5-T#5r)=E1YNnv#-c*vHu+DK zw#QP!T*Ud3#&NAVjMQ;afSP~lxl-@g0KojaN!T;VPlE%OsTPUD6p3_)0>hjS2PU3L zX7iW-@f+F5MsJ{04$otftB3h!UqSdeQSDcnqlHAZr}Lw}oHnu+h}NR4G)=%3)}> zze^MEF{(}`>GdZvH&K>ID|$cqE)k8up?+&_`zf^r?lqGGkV|#w^3)aq#Xq%fTL_Br 
z-?ag4PY#bX|E!=%=B(F3IFXlw>$eR0bIO=Fyg_JOo**=}cC>?vxPE^}Zu%P$Rz4cx zsS*tt&v*4`;JFGSs7aGQ<2~19xSUojgJjz}A6mZ=%M=qX9!%xc9-`X~Gd;3< zK=j}K(iLPNe$jfoDd>Cr;ek0GXL8pTt4NQVIs4D+brErQB6fA;M`x4vr(=Z*er1dc6gC?$y?B=MnmXJzkd}_Ny-rml{m#(II{U(RbLfepd!u$k z)y!qmG>BPwU%6jW%2S^IjHvFdP#d`yiT-1e1M9LiI^3mA%F%ZNiceGU<0Rsx4*3P3 zA$&>8+VyqaG16wiDTOMIx)*f z5;rK&LmQi-Wn^x{rkYXa#!smd%u>^%jxcYRd2?#JLszhf7{lLQsrRnRoSj!MUX_}j z%Vr{Le@PU;BXfP7s=eQ?xAO5Pt)fP9L|XJ#8}@Q&EdHm9-1Q>oVxM~8m3%f%ygUWH z4bt%vm9NOl6sAhUMLNS@yx)Q9EzV< zXUx#qisfEScUe{FO41YpT!Y|ShG0#dU<{gk^X2w&Mhc=AJVi~jZW2ZHtzu)xq@+tF zLd_ULyZ77tl+GEQUJcUP?NEfR0Vyf!jxC-57T1Oonu~_FmISbtzi55$SL4(ysCic8mArobPh8Co%N+xlvj@JZL*1N0dm{|`02E*$+hpKT zF$={?{6@1gbB+FVaP!^8bh6T2yvdxLksCB$1H zyc3Jp0U9ly`eAOY>*H6-DKHGWd!eQXsef44#2mOpv49M#(yswHH*Sz>84oukR5C-~ z>Bl7CvCT5pM&@d5O-qUMWuk|!`@nxwZW_B$D3yaW?ERjcg}(!ru;zUgS>IIa2AG_X ze2r)KSA*N9rT3>CK~WV_%haqlTNa%nx61f4LV?VJFELfSr7F zYIuFr5;PdR;Xcy_9+I;yR0zSc{N#K_$wXE?M%wvci{j_f)7gS|@kFB^B9i7Q-i^xr zs@^nd&W|*pw(O?QsvO{IBp?&*LM2Vy2rKsN|1>$t%b}ffdSZ2|p>3?genC=9{srzz z-ux8@;Lt#wL>?U8?#ZhcZjY_SJ+j`@Iu6vNO_QTf%oDUE*FWHogsg=dELu3<2(C=W z*5Ol)y_L)C1WkNNEU%?~Y>=Z^=lLRwvxR4P|Ct&u1qpMr>4u^-J^{zkh9^-|JzW#v zg%~$n|1u@)UiEsF!V9&TSy_cv+H;_MqNnWeqo@{z=UKAd@rZjFl|qZA^FHswz%r=s z1GVORY&_`+!=I0iZK-I->3F?;sUD<*?j_xC(8f?uUNuNWLB@x8sH$m(1fqmFC zTsma1mo?U_<}Vsks}T))NGTfAM8a1mz*7irLf1<8^cOPtkDKNsV(H?nZP?+$wIWZE z+_1#UnCo$vM=H*}o!f(cbra&D?QvEn|O39T*@y-Wd7|1^>6(pu^i=?kBWi z^FJ16K??wiEB)gq=z)oS%tsRvEwk-4UdedDh5@w;ogv5xpvsvbAffYem5z6;LkHKS_>PxGTb9K~7QN)$Fqgv&BkGDhzhwsru-5gnL5ZDT?r_S>)H)~S;ipN^CL;n_5 zeD)PrFTO>YbxsJby|n+g_A(?mTJvzD5e2fxgEPhPbM94ek_J#~hFT*n1!fW|XZ0HD zqXkJA89x!xog8;vSV5wHm;qiD)$}HYv)GCm3^<{xK3 zZ;*1)bYt)H&buV2b%o!(qGO9=sM@&*^?4}aAU3DS5qw8{OUFL-OV!`GO&2xw>WxSg zU9)c>WUDG;KYQfIx@odSgRQrw^Z-YuN9nk?a=)!|Bm9Be({=+k<0+7}x2Mv-7#rGJ zIT@OFh79j5^lu^rLNiwwA9!LRW~e$BTel6&ZNzk5Uf&>QKv%fex_{4R;&IA`1viqRONByJ=1;OGCwK1QHZV~9LIr5jEkWt$+xhsDpikLy_Y zRk0}~;+y!9F0y8)`$>GLOmux}J!ucjmKH=zMMgY?U3Iv)F$MDKcNh!bU;4SRXE0WZ 
z!dTG>8YE~?&4^74>6LccIt-zx#1n3!1^IsxR@7Q%?W(z|+u_qry=&mL&XuEBo3^u< z_M>H=T9*|Vp2ye#aMDKb(JQos2ou?&V4m4y#|*<=dyo@;^mj@RxTn`jyo7tBq8}2+ zgtWx`iFq`qKF?p+-G&>26Z8a(ru|+vR`~Ch=l=XI&>s>rn`^!45^4x$yTRKMqOP?| z4;y`mu%`9LVnGBprvAaae?-cnPCZy22GYzMnaM}yz?I}Gty9`7vIt6MEg2}uC%wV3 z3Z1NOpRCU=MuL!jBgAnll&nff0>KVhUzQZx$4_$^Vw0|#4ITwMFn)gbmo<%q5u=lmY4$CyKY@Ncs_DBNB(l^(IuYgc0jiWb)P2)!{>_Z1AGMM>mr8V@7-m)i{Zw9Do zq-I{X6Tj%#%d_SIf<|%NCfTur8%`jr>Gs_{3Z3LPkg69B5DFllE z$U=wnw=lI@buEWbJ?*fTOLlhMJ|z!la`O_$*ciE>a$tXX6wb7xq#_??nFJ~mc|2;I zta!>PRi%~>-b!~m$*ZC3SzPrI6^nR@s>9Z1+RC-Eh;VByn=twlILuP|H_Y0LNQ#c4 z!bpjH#4}uE_Psjo5PZ>9&%Hj&!%X=Q0J zaYII_c8FtQ5R-yWR`kUVh#^&9*;PMOJ)|x!f`p zm!)XT9OnSuKn)vpkl@m~Orc_6Vidff5u=b)q|oLdGqUHH!V=-z78YGyXha^&4&ZBa-WtmhDyGX_X-`0)%O8=v2|lfeW; z^s9D239+1LT357&o{5=5%LQY%2=%5oM>DT7*5_5+u$gXEbs)Q%-H;gRhq7npZFyjw zvD#Qcws9lt6}1Yk{0FRoNtZ$lG@k#*T_}r`XoHQMct_mEVm=7HlHUHjkBj|6)o7%_ z#jyDanLk&Tnz`3%GIEV_P#`Lfy?7>qVFDQe;y?WO{NesQYt zGP8nCEcHOF{1nJ*(NiaEm3T6`>ia6*tbXY@2d58JCJU{DJw>(ycjy3Zl7?KGxM7KU z$oJmY+)2>&pn-wAA=)$d%oHCqDm*l|V5%x{d*D4u0#jOu9{uqB33e+mn#a;A3M&Ri zA@Nx#X@Gc&qI0l_9(?y@S1U84^tjHv;4t{rrxC;GH#5$j@y4Y{aBN@=90Mkm#O!VL z#VX;et6Z|})uI+HntvAab32OgvN^JmNG(Z3!WA7>i%!Sy!;j8UV@gue;JtEADlaio zM}F~N*D4pmZcwe3nyI^E?C`+PF}gfHGypV&@xKyHjLkTJe?Z#cul=d++whH{d zG9X3V!3HE09yK?|<#)V8quW;+QE+MSbsuRnF0$BudLZ3cYFiMJOI*^k1hp^yu5O_o z2+Mxz7}eA8)Yeve6*4dF>2iGcK2BPsBKiPx4=AUgtq^R&?dhU&Q)|AINbW{MJQZAG z4C0~YKHCw>x%|yCMMeMpV+^?^SbW`-{q5$}yIR)G$;80x)*a*S(U}%k!eHm0rtBs@ zo&g*RGrtKp2b}iJx&&6vMdjG#?;x+2~(q)RDVX{ z%|3j`{!UuBo)7p;Y!OaLDMhfQ!c=f&jZ;h6HJ)raQ~{9MzQxsf31!ls@sl^~UN$WTVZ+R3!GScZs}jgR$K z>(J>U}vevdTn^md?#Jw37v^ zKYl`Y;lY=zFt>x3LCIN=;HNqdB`^DD zSd+i7$m1YGO!RM0niQt-2yY~)M5_yP(Zh>*MGMqEJTkAB1qJYcd~1W>jZ79+h-v^}4{qd794k4gT#Ux*QpRkKm1_bwzZIkQdp z6dcL&i{mU@sb0>rGx1}2c3?e;oiaM%O;AOHhJ|3o>F$pYPw!3Z3s#NU?y*=$YS3rY()G_Z@f+> zxhyj(m%<^{4as(8Odl{;e(WI?F|E$iD9WK3NKhrC=%wPq4Y7pWOEp)+R45^8Vv<~} z1Lp+rZ@Iy9q=aYxO-7Tg9YB>FF5AaIar~nEfU9-8>8tJ*$~e6y8td)twJZm| zV7)O2czvpM@#yg)T2fegr)Rh+Ni${Uc?r`Vkmk|?Qc4ue* 
ziAJPy^bR>n=k3?{i__emj?un`6LW8`qtdeqx@?_L4)}wCKdC{9;7~V+eG^L*AzKk@qJGUutS( z0E{S@-Uzf;P}RQN@qCv`fyP|L_%&;Bla@w8{@cM1i7GW%ZbHhMd~=HI&fz(dInp-6 zzK#hbqrla<*4SVm|0>s>?&e44DL-^HIB-=n?F4)sK={ik*7Gc<_>*3^xg--3o$p+< zS`lNCUR#{^hP-Mt{J=s?Jnz3NVt94@ySaLlXF(C~Rc|TvrNnLrS*x?_GOy&EkKBg{ z6S}U=ssj(lc29FPFa3m^ z&r8}!ANCY>u&TBuFKiSI?NZr_s>FHyhTAF6z;iwX^75ZM0qA0I7TIl3gFM8i&KB$R zqqBH*@Q|922Eyp^Fv#=gVsAAkV|I{RVNtc-_`=l1_r2wT1Wj;_`s+Wo zJ^gPT-LnHVOHfg2ikGDwYW*FQYgJgHBd^vs96@;Wqq}>?%W6X4Fb!#;q&5qTerS*$e04+T0hj(D*Rzrm0_Z zhv{wT=lklrzPCbYQ%(!ckC$hYLuA1oWS0xfA6i66n^*;ltd0}Wl{42mj+c4uRx!Ot zgX*m(S*nynl?1&mmU}6^g=8VTZrgNw@8|>HaaSC+2%lO=`mK3%x8#;=*@}ct&ZJV5 zQiQq=mHgNxKji1RlspphGVi$@4WSJ8%S_}e^e-Iik1oV0JNy!U-z;sHq)e(SE~21A z=H{|cW-N#i;a0tK?!{^#WX%4ktJi``P#H{D#!DaGhF@3|Cl-WTQnYwefWNXFM50sn z41H`XPhS@DcJMmDx@t1Nd~z61Be|(mz7@_3<0F)?8G$~U|HWGtco92D`i=GDdQU4H z9dDX+9Q1G2rQfS#-$m#&Z>6=c-qCGFY8R(QEse^Ye^CsSch|0#PB-94+?sC?O87#>2(g?X2)83f*NM z?@6HiXK$`6xJ}+WJvI9pMoIdIRU#u@43C%u=ik#MJ}9Tm<|EH)=3_)>ZX*fdGV8)07DR z4%W-yaONTN&&(1cp1b>7Z|*2Otw$S-%ZVcOYaM2`*Zecv?d_Nid+DhkhmZRzY}5FL z(nlrs?$kRl%J-*KibA3xVKNc}t%MmJPbS+5t-6*NaAl?`Tk0@!oUtq>OnOH}(I@${ z_2S-+jHj=$`Nl2dz@LoF{dA?UR6*~|seY86Q8F0hw3iLpzl_?O6_sJVG>BN>_09hN zKKP+w|H{euVv)pO*s&|U?~_Hv@AU?>>bEmTlpVeqG0@}9M^ix5*FkqiGGd~$`-`Q# z$rW%Ad0bXI?BsGZctJbWxA{WcNJz1za*Yui@1f8xfcb}^2xA*|??!yNzlAlF>-pqd z19IM?TUE*PTS7QKAWVz=Qq+*&=)Lh%8u@_6-btmgGe6huvCjnd6|GdkV1ISn?Idk1 z)rDX2`qhJPqg%`~Zwv%#cnzcI_1_gY2dH^GWUthN@AuR z%{Fsf%`(R^>5BPoD5`Ptv!j3!&DxUfK1fzM;`E79gEepAz@mu8HW3t;Zvv~Qsr+zi zr@2|pkM^um@gpU7_zg7&zVD7wP!X1m%O@}UoZNbeuRcpw!4b^}>IwQ<2PQVXv}jJc zuda7Ecs)BBMsPfhR_)@E{%(1>W04tNM%)Z!_B(g*U?9U(^$=(HzXO)R zZJ^$*whT49JGECa;p=xHOTW8Ai|bsl5Q@ao0gWHFr0RJL1nXu88YRqkn4#T)T7bC{ z`tVPZTp{;$n+hg-EsDsmt!CR1NA_l}a3V`{&Ca6TVuoK41O8gpyCL<2B zN|CxRN_+0JRB4LoZ;05Sg%oiG4|q8phpx4??G0uvm-%%(2JzSrT&ofo|L;wogLPes z)SyDd_#F(l%L$)D;c)J}>RZ;#4&%&;=e zTCv~iOxJ2{J#JATxKwE8w)y1!GehTj;>s?67dS0T53fJ}Q@Dk{Dd$tmgi+9?);{2z zO6OHxERkdtnlH=CeZs`Y*uB-(@(!w@g>56^M}Mk6Ic+-SedWzkFKTWxrQ)HUDA7Xv 
z*!OM{6>A=E`Sewal<(3(nNw??!_d5s?QuP?^{1ywje<>_gIkKlsxGE8<@N<39L84pJJab8A*((df@V9U9S1zYJ}X&qhyjrwSX$~{mwGVeiQn&Ex+co z#>8_t3%cB@>XEsoiPiQkhlv=DA7z*D!+Zpk_z5lg_;3xh%%QzWZ}_tfmnit$@L-DxC%Y*MQHzpBL1{96ji8cL)!#9Qx*;JSzp2zU8-nschiZq ziNM(K)|#JL7S|Lg%9&tY-sGIu`qnVww?(+VIrJj)ks&s20*yvRb5UY>{BFC0knHoU zqqdpw8H(GO>2pgri3}KqK|A&?iF0$L->JY?flxXrU)*InJv_o)QqLhnDe&x=z)z)| zK$vxI>uav=F__|qoz=6d%RDpofr=OB_$*=)j}+f36nS(RTpdqWXa;hzaDe!u2NEdo zFyLl=;C7>oMIZET-B8-`oh_(H$2C>M!e_Fv2LE&WYXZ+rC?SZJ_%^&kT`~nrbzusV z)gYVwe>ggXse}?M(2b(QN(R1#9JK9Bz7n3%ZirPXuS~zYrOK5970>D1JCC;*zaLrL z?H5Rw3J*#pOt1j)x)x-kzE}5PE&(CuA{7-;hL0V2BMF6nd>Y^cl65oEhJ4k zaiKi0h|}P}ZL*&a+MV!m>2=EDP|9sL3Pxe!CXP}i;a>gMUKQiX0REFvs!2mmrLg>f zsO3-AS+InU?ua~}(Vxe){nW@e0$sl-Zu>e$7A0~{8Z9trr#vbrT0a&!Za9@N#pD0dXZ3>Q)6j@ zL$SA~@$@UlyX0By3^L!A-VF~7%sC~kM{P4 zdhXVU2&u(BB$~DPdf0CMYCjx&x+*)&W@BS()EBHnO;AG`k8^2pJ#sa^q8^@W$ZX++ zwI&>myk-t(bxW!bfb_E%7~>;3>c$#KJs;jUrqM*9g#zQ3z;VL zaJd9a`9$n-`W%XPE3Yc0_}c21R<4Reck!=;^trIzqAa(=_QWx1E-mjKgC(PkKi=GF0^ zJ_nY>9kJ7zj#amt*hpwXYvW(LegvCBZpXaCe2FVFB+KcU8doPf6pP!Y761t%7y~oJ za2(}^0OfiQ@6|vCYLZg68+sl(t!LJE)XXtL4cGLTw@1FwsCL|}ZG;8o9O}fUbu4AS zqJZrahy03}OWwK|N4GfpEo46uF>E?um2=vnIt9eHV#3llpHidC+UrD?jg)I= zt4{fR{OVeZYLEVN#wUN=9%Azm5U$lG^uK(Z+I=t|JgKiscCg)09v(2r+b`N8B)6-6 z`a z4W-*5F!Z|~=pxss(Q#g_mf<9*Jk77LSK}aH`O^)B4sS4mH+iX}urKc`#$GyHFMR+H zLTgNqMN-Z;MGqr189lMRZ9nJ(8=XcoDir34j&oT?uA3*i0W*5`=Tj+0ZeZ`#t#4d3 z6^8e_*wyj8u4l$G1;WhSi3u|qB^Dq8+k}1^2+oH@S_xUGI_uORVtN9!QB7&HIA)&-Xf>5HK-AVTNfVhes(zd z`+OC<+K@6m#9yEG-lS(W2rz&r{BQo#V^WG?3%Z+rvY>!&_AL7g-jqu46CJ<2B{*h2 zjJR6Ewjr2VTAkN>L4}<6$8P)63i0LRWkOA>6-9NqS6%qLE0?a$T-(oCm1$$X zquk9f6WiXJNs&NRmtK5nSXIO!LDLeJV_L!!fzu4Gs1(`v8xr+?HIEP zEPyXB8`nc-=8v$}_z}3rUqaYMdL4q5FkXtG_o6`JM! 
z-%x1sLGl|jF!`EiM>f`0a}FS~;9ORvrlmyD(~9k8dmvQm@#dlr-ue{qheY{XrmV{Y zd8ZLGA8IS>IhOm6sdMg1si9JjQ##v8P zvN|B9pV{r*dv2QMrqe7#u5TR66nZ+}JNfqbI$_2y!k(o1^Np^T_xoNdT^h00yNZ=Y zoD;}Y?IfW9xAC4pg%PMIZs&mW=_S0S^EbQ^5b;k_UngKHrwb4h8AhcN-)~X5 zS|Xtnzy{BGyc);D$_|lwpErn$!h4IFSMax=?yUu3tzO75mUI>6I}wrBt2>xqC@){7Gs z7kj+3TH;H<+6GpKi5^XQuB!7un8`Vt0d=m@5T*#nZC@v?!+x0}Go~(%V?nVgQTNMm zp?!N0Jn8erb68?``4Q4Kd=?K3-N!h1@S4or@!qkoQ8m4>e53EZqv7Cu)!B1?P+#Z} zA4Vm}y+3`5)WgdBF*Ul?p6dSZeBDF~ z3w<3fWA@EmIqJA!6PcHRI7S%62^LuFDO#rnGg^$)GG>Vq!BpmRv*zB9KWko)Q+D^# zN8odP@_EU~pfnMI-}d3VVBH|2z&ZF#Z3ZDLDXd>KTZJhUb(sX%AnDH)bo0%WuED=Ep_{z4^Qi@jhG zaNZn%ihpQn((GQ~QT?re1eJ(vxzZs~@nD4XFF+^QGYc(#BC@Alg2VZ%;Hls{uk6#! zDcm^!&S$r9p8n&1`-!)<&Sn&2C*VkrXZy@C3>lhH;|gswMd2*jkBPpG>@)V}ofGm- zurRkzOwo+a%-Dyn>C6mEYiMlkmc}<~H@j=#4ew1Elh2v6yPTqt(NLlFn#A`lbmLB4 zv8%UBn%f)4FKHzz(UK8)kXcv_3a;D{tjG@kvRL*7?3Q@g?R zBF`^EQ2qD;4W6E}-k`G)ILM22>kQW+W`&=E?Ai5fPtg`O$Q%RO{|PkVd|vyo=uH6u z&Juxq%QU~Q5scR$Ws$O$TVCaFFz)j0=JPC51^4L*WYCc83PN$E^ZZGonDN5ecUN|2x|1xfj=z{wUkk4f zS^5TMlrzQFo@!(^-MV!|+PiCb#HsLPwd&Jj-L>$Iyhg9VIkE4MS_V)DkD7bf=p4I? 
zvU?@y9hUhCxmr0#P1lpys=1>88@G&3iA`9SQzhNl4M>79w*ne4 zj2(1>%4dI$C2-9F{P}#WE+94$f*X{gF5$9<<{MSXaxvmM^ocKSrw3%cRrT~hHp7+k z1jnQPWOnPSt%{D}>ngA(?qjqsYL%W3EiTgsWrlqjgBBbE7KaMNv)#jEmp%vGA+H>w zEktB|pxc`_hqHeE?|3cEnF*!@n$&2?b#G1(9zB0vrQC};<_sFjs2}E=f%<{PS%=-zfSp;mX@G*75okBB6t3)Ha-5s()tx;~AxD1~3U` zyvJgYi&oe)r(J(?xie9c^Q%lh4uQ<^PbmLSLQ)I?-%UKC1?A^=M81;f>vIqx1o?yz zMlJMDSotIcoUCqf_%Gl*(0uJPnOz~>&ac6{nVt24% zerY%LW=RTZy0m^@?mYQal50h-+(mfGrw85`@D-Cc zo9A!ibA%2vFyGke5%3GUo~?PjwX_UwAfB&C`If*ZHZh`ig=FUdTu}d0%i2^axQsd=2+M> zp~h&ja|3vQ`<8}y(%q0X=P9+#G6oj8KT)jrC(c+n2<+}3v6hITjk~eAavB7Af^=DW zMpkg=SbiVJy|e2VdR z%jW!Bsx`~2R`DRabN*x<$+}uS#?_+p^*fW}EL4=<(6w5s{B?eFj-Vl=dqrf|Ib2Wu z&TWhx99pcWLPL6ZyA?znt2kg#SvUzEV&P79Y_1+wE-_pR^~@-3we4244&&O~S@`~(1^er}DRq5d z(q0Pz>oPsY;v2+|T*$(!0n7gr?;gRslWDn#qSEx!52q) z@)kls%{e!Cy-=N`i5xk=`n6BzKPbFFC4xtQN0An5Edd~p)WS`AU<)v}p`^=hgt1p` zTF*S&OrMX3Bh)W8AoUj!;Ja&cXQ;BFu1;zi?8eK_bigcyP&>Dq|5%GtHSK_(u#$hK zHvp#L1ox}(3gyW&lyhX@44nHKPnO|Y_KOn{ak)uNHcHz&n6;%&l1`s{(7e-l92nK; z0Wc7=8pye^FBev-5JlZuFBf8?e0R+mnVY_(GG*<>g8WEv5qq9$tIsNyMxjGgX)(h^ zl!(}F;fz#pi0J#3*aY58(uqRy8-ADnW3RJ5pjX6&TE|MliI@O`N=u z4$5$+8Q-I&_n9xY;ET9V3&N=%nGY-c?5WJ)1x~ZpS}`%Eh^yX%3E`ugKm3>##MtFLoTPYGxt6!__WfpwczAAti=5j7OpFSU ze)hxG>;)?}Gb#Cf&n8DclAyvbmLP(IHoP32!;m~OBM@$3AWu)OWW;=%cVn^QuM;2% zpYu$Ge!-ChF4D3q5xhM14}$o!R(yolXz~#=FX8r)j=KCN&3)!v2I)v~@|gpF1pEl# z-V?D9&|K zF!2(8RPpbC46jnb%Ti{zI5TP0wiaK0D^hw9b-2O%IpEKQH(g!Wbv=YyugVv`H3I$P ztW+N&JZ2x0T~KryzZBe`vxG2P;@~`DfOSp#nQ1TY?xS|e?K;S&m#)jd!Ajt*$?F6w zfFz~`zqMb7@H0DPWTPn@AxJ1ZGX?P#r`Y&cK1YjO1H1dv!)KS5zblxUM3mv@n*P1U zGAQR-UjVR zQ4J9+OZhL-p@0HB_y#XAUGFXn#%P2Jlul&TLU0`Z z%y9AM^o*|E&00ENWzU5X&*dosML%saO{6KD?iM*2exj0fc`qy`!lNtDO+IZJ4KryJ zL#x5|Zn3#EHET$hrlzLBb`A>%2Zw~D%!5iD z56*1pg_UEBU}7~xl~X6 z@M$EY-2rrfyBD&iJd9$WxinLd-*GwtuN}Krv1f5)$NiaWK%hy^@z7>gaDy&qUK&gn z1Wg<*G%jgc3=@7aJYD3V$6Q3VC1Jo;SwMt9bdsGipTm zk=VacP`nL@gPU~D`LO{Wiep|qe1rTfDg#EpJ=J0khio{{@_vgqnoow{VbqhX^a9Nf z+TILkH(Otd&}7&#b^&aj-}L6Qa5%$@liv~#N$@6Jy($}gnEU#sx{`en{#s5b%zhED 
zbFvn$tju$HpdAQp)8WkPxL?hg)Ip9Tv<6;Hm+SKJ@wIuiWes5w5*FtK4{$FNa#)lZ zb}WyKRJiR=Yiny)S6A;`ulI&OK0e}B>V?ms0F$`kMMy8i|B?i9&0z1N$MBHKl^ShE z+?1aDUzR+C-vz;g(K2JhoPD{^Wc=W@=7(|89PwXfHFkj56kogG29DB0vwKz+0+y4^ z^Tl6Py}E{$TkHdVF)?p?%9an?$(Fm5eOa4fI+w}G;06&N=wkqPgwR2Ggy&HzyvWQ; z&1T?zSHZ`3;|@=c3o*4pTBSBB4mVxU6lF2EnN8IEkhyvF>Q!i1m`c0Vr(ZW45Ny}g zqay`L?Su&Yd`|1h{;stSvO%RV(c+#SabjZPcofpSf&v|F?P5+XJwE4S{Z{uiRHv}0 zsL^nI7FS>2a|B?L`@2 z;936h;|F}>Ha0d^L(aBG4M6}lF$h4p;Vq@4_~*}1t2QSmr*9~j3Q|%ZBf}ooRr2~8 ztf#CYu~ewIDz7jxS6V$-=%q-(z})t$qa_Z}Wc>eUx#ne4_l#7Od&_4rq`DXxr=A)k zXLriEyI(*w6%TL_6t=h8;1rEAlK)Mh@dD;Z$8&sST4+d9$R1kJIUNW8z@ zw{+P8P8DmuW&+|1SbzNZ(abFG+qZ6T1aYB!S};?RH3+WQ?d|R5x{YAO9BgcC<+tA6 zcge{FYGEan*S#ddKD^up@#xKUr*^Tvgs5wJj8nwiw#oc%C4R?2%$1k~EO!5jP|158 zWR>k;0oS!KL}kq;ysd(rW|H&jSR7n3{pKP=$D60jdP3e?jdS_n{FGv^QjVbB3~$%A zrfn9}hMT+UonwlTY3C~8ddHuQV_c)FfAYt(a7+U=p}tJU?)%XY__0bzw9}mPGitLM zlbl(w-ig(*J6eb<*WYp|fKgKyEVf+yNN-9@NZ8()Emw({SvK$s-NRWm|6&K?E%Q3v zfc*ab8*>=aK~hs&D-A_QMI{pP6O;*!it3|`PGZvK_BdLEJFE9IRHalAN&M{EOT5|_M%n%`y9*mN+#vJX8{r%I)K9Ot?gQaHN4qHSwtEGSMQI9>1S!5f$!tNj zG`BH#6Ko4VX)Ea^7fuYwPtA)Z48lx2B`3wm|Gk5pQSz~xgR*0`Yh18Qy<2+ed75`~ zt+u3rio#pg{i#Km8E4;TwHEd0nP|EDe2+Sk+A>ceVg}HtXLARn#MO6@{1RX z?Y?QLfW67$sw!62Wzu*)Xv+t&0k-kVK`RL zkejJ}0*XXwUY;62hPyWe4Qiw5Sb7s5@EojWvX3S+%XO`*lYmYLaF;0gZpvqGvVK%p zH#0gK7(1*#`UnF!mJ= zQD$wt#wrG(AV@5rARyAMh(mV`F@T5+(nt+3FbG2p>CiQB zkM4fo=dSL(_YZ)#&Uv0+J!cXNcedON_}%Z7`pXz7?tNTFe3)J$%jllGgx|GVpMz?& zJ$DoQa3^IPVkU*L3%AuN?q`VYj4Jn^7Rv2w?nx6XDk=i#x88HjSiDFFI(rLL7)NyV z86u)s*j#tY)hk!xQd7hG%VzThjTgnui1RN=P}0$f`R;Glxi7tqQ%(_Pxp5=6a-jq2 zYuAxtO!8yF0pt-k3R2jtu>wX_3_azF)q4>jNjWIj1y^~{o9AI0|cB9eszMSRF5-6lRLqX)PRZQ z7IvQqD|Al4Z?b%Tc}UUT|7(fP{G|VJRb{0^-;_?;6KR^@c%TjAQu1D;Q6$=*?pD9;0g)RNGSDWX=HF<1dG{16zDy* z;>zu%LRV|X^zu4iMW;+UExvujx&`OikGu9V!~Y0M*xwRy8sEj`WT>mFlQM`uGBXP( zSKha3c`*_!B1~Aeb1pDZ4Sg}KJz1#I%eCD9@gO$cu}*kP4F|gZ;LmJefhemo=h z^iRx5l!hm`rSU+}cA`1gM3UYLInOl-b<>4klj2deM672Pt(O%kvEwBd!&X^!*CR|` 
zuiJ#|*n74U=~v7cyo(tf;e-;Ds5EX7oPBxXksKTa8Z1J6kVJ;MO0oS_Hi@zDg)cyq z&rF^2`>-?|J6}Suo1ST0?-rM3QiDJs>gwcP1TuP2%^Vf_RL28cXRQB(+fqpe-C$mo zsi&oac4Kk<)=QTD6uh&-y$26q;-qZ3cURaY^miPLGurk(dH==SfO&j!hi&nxNZOCM z3LIf0jG;J|bSV{dM!g~3D-^{x>@3d)$=LX%Q>~iX$493*TV4NBI;sbKF-&Eh1U*BZ zH#qS*vUBQ-&P^@>xbDp-m1X4>)qhBb<=^7uR_^VQ!7MGBa(r=BGxzYj9y34w!Q4j7 zz9)F(jRKZ3l*L}v$XTnmA~qxOL-z&W2$7uJ8yrm!qP$C#ZJY=+(6nxIl&2>(3O4lh zjkSt!!XR;(D#a-_UvE=93Lu?kc7tk%Bys5aNSVd)-aI2ex@=+O$ThLm!MJ}Ur{|ND z2U}z3TpZy>ro?llZSInXOf2u7D=Az`kF1WZKcloXG#uUrvp)SNS`Sd`UPM1Tpcs0e zqVhM27DQK)k~V}Q%q(sC)o*Mf1>ixI_UrP!!Y3uyWyA0LHLWM#$GTU;lN#&}YD#-B z4BQt*R?7qhI4*Y(i60ycrwJ8`)Wwg`(D)v=a?mMmq&|DAcAb7~7uuxMsb4n#y!NTd zEpokcznO%eqxf1Qby!8#@Kib)GWERKV>sb9trG289_2-Ejr&47M|*22y~e?31XIT< zlpnX$y}Q|ug7}MhfJQgq!!3up1^Hc=L7X$;fnqE9v8WU9_A`&H@iSTqDeM=IL~Iul zTUz>Ba#S+_oP9i)t7ZUqBFrY51d-B*RuoB4-*G0a($V=lEovAv@st-nn^w6fg6QZf zTARCFF0=8~)L)f|5`FmWPs05apYQyg&;DI3c%AHkFmV*SPOd108%=)IE_|H6Ol86`2e{IUlm?UYeT1l`|fR?wnMOxDA-du>zkbHE1NhnQekZyZO%&}_cXGV zq95F-_gK*f)EUsvNP4lk<>m157u`oJcbGZ62vNgx*86En)^|VG3>}E=Ln9VP3}H*= z-!>gkO~RcKmoM8?JBp$fLPWo&xgl) z2Q~jcL>;(7IvI#+vHkQrCab5}N1>x z+*eve73HZG6^?%xwXg1F7~AI}1!Zf2vIk#KymAsg1{TBD5*x^#`kpK1J>;iuhEsOkMMy z`N(rlWse@|5OKKf)0SYg4xcRa^91v_+s_Hq8Z0?=FS08ow{oK09j3BhrcHL!0^>nV zEjlTQCjDMSukv7S+_*vX+pg^W;?JB7ntLHH zUyf8erbwSPFffQK&OE#5hZytJ^-kb2@R<5=<_RM+^FIX}R=SdV3NK-6gs;I3HmFoY zGq#+rMl1q1_~c&jH)|=^jA3mW$DF!cn1>XTEDURMPU{+fcE{e+M<9Q)V6#gDwaT4* zo0^iHGPJ*9{SYB>d9NQOW$aqq zP1IICPZJ>dNIJxFu;XPX83hm`_RsDrFEWgL=%k-72|#*_Z?oUM)LLB=Ci4&~5*(*gXM$sfnCI5v9>rCj^t;MQlD_9w|A0FBS|Z?16CLgCxS@ip zJz1l|977P7S>gd4%|GNdP!%+(mP7BI6@tkvnceL+beLVwa+=LQz?Wz*-w6mu4UA3| z#2+gy44k1^dq(2Tx`_*o*hvvWH!+N&!h4U+HYDBp-uk6!pw?tD2)NJi0*}tofJEtN z>1q*ndh)Pl_REZ)Q8eDK-bn`%1%ic&Mzsnzq9gUHP{>p%iFfqieb7Ea=#%eE4}DtG zX;A7-bdLLzuwD=xbZuh2Cu5HaKkKR z6(hQG_?n%wGhw>*vYq*#B0xn2&!f-7ayx9-uU}VFQ|l1n0-#tmUHmT5>gp;6vv$0v z0jO+$t{U?qvi7EIrlz_1+lUB~jzCg%4UMA0Le16TFz*ilal}>Uqm|iz5U9YQCPSm2 
zy^Y!2T^i;bCn?O^n10yt?{gb96>?ifbt_+UUqu?Ps!Td#M5;aKt+(u$v1v_yJ6+eW zcl#{;-bn2#yd97Rll60HFLhcGg*&(%E~W)uwGdBTF5>DUSMLvYvqfh@vt(14f;>p1 z`?ZSk3b2ppacMFl``M#FyxleSm5FJ8(zT{*jNa@w={_3Yy3$Tv-kdhZ`McCeTTAQh zd17P%={MRZjg5GDTfqF|A&}u5dQ!`P%9V2XiT|Kt7HDy>HR z{d@PiUh^@tu@On7O8FlFWs61u8CD)a&Br7vN~^jQ5f(-#4tXps{VXbqY(GqKim3re z3`6-tF)WH#ZoMma!6Iin6SJO~BI5hB-Rs%G0QwJ7wL3_S-yaRCYn4{>Qwzz8u)6D# zSNpWrtYRZHl=C+*y2Hg{2|Xj5FlYTh?=#_^hu>YbkV|_-Me#nKOq#+1C4bzc5*yPE z`)Dql645b3NtKdy>>{aOs0+i}od|~;rX!SU-R9NI$Hd%bhuz&*$NNjlbhs?i1*l6z z-$OVe^TI_qI7*aLTt-Lt@c7_PBZbh+`a{D8uM(Ny_72eqB4If>XHrt9;^M*tWEj2B zwQJY-`TLujPlha1+76V?0{4r9qZ5UK1MLrT&CT4L>6W>(Gb05>pu&sb;I)bRt2odr zMf~`J$rvLeu3E`(7p)7+8@(UB4Lq&CtL|FCqF`c4o<~~Y5+{}7{YwjX_5yq)*!L~Ic5izk8LCl*sZ}8Z*thn7>i1<6`Xt|Z^e9uIM4cQZ; za{Yb?U?Ro{NM^AtNg5yEl!B+gp@aYN#{SGPMgUm91;$9eF^0BXq~hXgx`g6~7H>N} z1+u-nvZqcRI!$6!l!=Ld4@(|4i~_GugI_?PeYDmR%rAI&cmN@FVXZanYCBEsZB;k7 z%KG|)!@Uh4rzw&e-RH3`F>d%IRQ4Xy4B;UC+a~+wAMk%ODF|eLN*KyYN;Hy(21f3Q znTdzjN+Lt+?vl(;zjv-JkXf=s+!il*75w|x@d}$>7R7{{-khR;u!g6n-En2Dwc(*I zQI`R?f9CHG2i|GYQ~Wdh(f3lDjc1wwNrl+h*r?PBtInX?WgdiOJc`EZm z*m>lE%P%e+5Wa&PNR$&Uv8S>)Z@*1fOojg2R7hew8I1BqazYGSnPI(5l5 z5Ljt>v+>hgv6SQ-&Yn!;)8n=FZ{Iq-awP08ady{r_KiXCw3-X1^F3+9!^8Hd;t3#J zrVke&p2Rma5UL%~&jJJ6)j{{Tc;SMAipqW5$P+DE9{2RppdRXPr-l9~WdNRa?$~Lz znFDTcAYwf&lpTICwzZ4rX5H}@JYI@w+jg>i{BY%LRQ=8uQ$O95RvV z*tcgyr#x{P8xXN`P?AluGyv1gb#-y032p5^ij`bCkGy!(Y>WV@%6Yc#y0s`lqNtT; zyOdRO50P6gZEk2dK3?nEuGXLDy$SMsyABaQ!fi z92KiLE+1KLT=;GGMNM5@YQ4}4)XuOf*(iUaDlql>;}Yic$EItDtd@7Io>w|m;BaSQ z$V6rN%Vg-Lhf?mAUmI()s)Wf^&pDW@g7r} z4E_l={z22>Jno|+++Wk_;}f36PWMWca1Zl+WT7eu2h+AS)yK{DNp+l}YpE#{4#9QP>S zGT8MHe@pPa#A_=8Gm-vhGcjKz!{N5zJEMX0qsHA#T?L=&*^!0FZnQ|D;v5a!YtqJd z48Feop6tA|uzhFQT}L{!-fKs3SNpAZjhZyWX7IlY z%5a^@`>enoP3%IuLK+LYG@yp8W+@iEF`tcOg#aF^`qea|`vp*i<@SjNX$p}s{a5l+kg3p8T4+ZD7<^DY{jnJ|xsZ*S;xWsj-!{S<@ z>t3r|DCM~j&9Bnl!A(26aYjT=WGNp0gQ>{j>T6Bs2XE7Zk@tXQd9F#x1{FUKY&ba`KO3gEP>zh}8Z#s&imy=mk>FU;p@hy;a+>Tdu~?8v%w@_gW@HXJkKGTKJ*5oD 
z@w=UCmTbkZukcQVNHM-%5V8GXzC^y=y>j|G1ksEPp<7*jenS{ zj$B#uy_aOb<6gL*A#pr;`gq&f)%u8Oi3fjio-+&~uWw|lwW8sD!IO1zUdl{7WR(PU zR-$Y*UtvKD0lodu?8xF@jg&~k&!my@ulTi4#UJYM$TwLB8oFf)dq9; z%Fp=I6}x%I>l#PB6QBKW?Qa&;Z6rlabdH_Ix-;CzKm_)PrX{WbdFL{9$$QZGspmxMFJ_6qm8E9~Akr7i4ru z-1Z(566_Q6nycrMS~;GtGn=pXoZzp3m%zVNfUUp~eKOXY~b3Ej+7wSH0ybE%|uy2LzU)MA74=MwaA&triaFCMZ&9 zEY4`pS#ocn&BV_W=O}^^@D~nQqGR-J+>lybT3)R^hQ>@;9Y{$W9nG_`?eB?in1=VX z9ki6F9M58Tyu1#;3cIsB_NI;hxZD}Lh+lPD^WCOAe;Db`;kT`h8LMZSFf`)7D2X8y zJRjcf5kjvYt>shK6&E=lw+wWz=y=LiY`=%31z#36=I@ma8y?e|0Ws#uhTll`SBzYn{Sr279^=xo^=>q0kF|0p!HAt!QoPBo3WrMq^{v0S^;iY&aQqov= zx6DYkSo58s-(0VWk+C1A)NyN;T!DtVHrg<89UszcI#TOuqp7J0Q0a7Rxc<-YRvk66 zzQA}CQ?-i0Q5>+{BC_|zUhVPya_sMGve+UtDV*K7kJ+L)Em~J!(Bycy$jcguqa;Cea2$nK$eJmhN1eyRqd_TOPdF8rgHa(P4rXtz#wHXV`zcKN{@+B;xq_ z+G+*(9xdG>!N^uZe}ce+SvC<75jHltgzML?J?$?_&3yOH^7C6+ zAY&@a%a_5MtKaL(8W#=VgdIjo-EjR=xvlF2&@4}fU%d%Zi%h)60Ri}DWeRdSJGICJ zTS!+nO6<6e$$OmtLx1=FNpud-rp5ZMz%>+4H0sCSJ9 z&UlLGNR{AiXp=|e8zr5IUw@fhlQ@$0mXlFL>&a*gw0bD4(NT-_WZc=J^=IrD7#Wki z^B!KGmR3EmlodS+hwTC zmw-dkyCYc|1nDcOiG#3hQ3H1$KYsk+(`<&61{M~lS7D8F6E%LNZV9uqvpXaZ*vaSz zhlcj{_klr&8Yc;dF(oA>okr=>FfcGMGCE7l6>@{`1c*rMdx+L=NDf{wV~+$wm-A+Y zv?a}bJ_jpO1;xc&+4%!a`nF$g^ib3~luP`*}cYpa7E zkL_YdJV!x60pO}Q9*)+~g{^()#vCjsr&Hw-;qpWFB|aN>utva1AOBe+Y6B4e$8Ea? zTak^|PQ16OQ_-sSU#AL3(NGtFdDZv9E-p=i$~vDRNm)S2c;&e0R=2#o4Cp#vXFRw= z({mRuhJ}Y)s_ScL$jDaeuS7G;~yu2TV~|0xG4i( zhpD6YkQaa({ij!fL0yT7Ra7rNm2!6B;;9pc*$3lI+`ejpJwpx0r^7v!1hgaIplm-` zujr5K@8^j?Qc*^T00PXxp;#A`Af9j}EG+y48xb0+UtvuG)6~)`WACN`0i(P;@9pL5 zsM(xZBpod+2qnlX3UypdY$R->VQh>ve=kgf{Fs2=&E6Bol!UqgTw`q`K)iec2 zu{bYBLF0t}zP|U4vGZm~I%?`ig0whiHMPqo(Vgp8u3jY~BFcfmW@l&dd7b5Z?ZXj? 
zZi)PHiHT8PtF>sUOCJ7!_Q2HY30}JZe57q^2Sib^$-zO~($z?qhUa2$$NJavy{hJH$f*anuDVr+Vk z9zML3lE7)y#?i@oaI-$C8Vm<1EBp8$vfOmAN@Tvoz}x*#s|&J3TutFw_P>n0`1Hi4 zo9atwmwUgP9K3~?7n6A!TV;s4hV0ya^EMDr--Eq9$H_(s9-gRfKH-U4g@U2EnHiAx z*-)nwaS9I&%`Yl~xmnj*N9YLKCQ*veM%-wCPk7KNB3A2`?yLKz&1XiP1mg{Vf%>k8 z;W`6DpY;fN*(GrQ+G{6F1qV<|lD<8+o&A5jz803EISKPMy5u{Q=k3w?2D;v`ojCFP#cmbnUM&L0vZ_r4Q_&EHNuwFtnIrbUZj!o!V%N#iL&?JGzN z1&w0IRHes+B*;3g$G<&hFG{j5oYbM<2UPga2e5e_0e|<-{PWu?56W8gX!1RI6+dAP zd*??BgGmB8pW8lsh#Kf;7)9I}t^Q6~!gav;0DtgN&lh~dM|~70PDGvo2_(?QEG!t= zRw@fV#fL5@Wc*o`Sy(0#lWL=3gM+CXJ3BN1)SyoM5)%CH!Job{gt%5@rJU%(dLL(B zE_Qhi^4AAN^XX_UBO|d>NHN#BhRvNX%~FRe&T`2q2DshEgVhV?lgwR>2-w*1@$uBu z-}Qo?K0ToTe-acE6C)c&n zhi`l@x(nw|oVyV2Imka3q2tQZB{q&U&~W7zbkn={(V^VFXxMqK%N7PRQQxm$YrI5H z?_^-4L*r~~oA>$A;L;YS>fO5m@dCsJ030EysHi}hTyPT|&R<+yeDx~&VD!ab?yr#5~@yuom}i_6jx!~z$%;4bQ^z?Kz@y1_W z?}vqOcM2)C?g!(xu%<6Lo$+Zz*mzV@i-x9NUBl!Sx8JDOYm^7JySrOiMMao4V%b7p ze+=YEgjkV^quNFKtqqms$_Vc^0Jc+H!WICA1DPO+_*$o7s=7GFkH^q>U@m%iZ%a_lbIU}0JEhgp-AW_#7 z7}B1X`oo70Ow7!q_ZE_qX>Yw9vWxotjx_9DM~9QOkB_*3amRgbN88u;hE~wnI9Xi9VjeR+*r!?mzO#LI?4u5*I8F#o(o&|c{1$VRn6 z*-J@HO$8!xMn;nxPCY@ILD3&1H*4|tWo3)>DuUGW^YcgItcA&v>~D#2iC{q7EtXAH zKVR*_kgVkI)YPxz;w&Xd3q}8G@206G-w@ed)f0a_R3cp9@Rp-a8vOuq%>CrFcjxh! 
zmYN!q#Vt>Vkyl^lMa&MiMb_6zJKi>GFVTL4=qbp^oHj}8+~CEs&|9hNYifSfnJE_s z4Zpd$+1Az;3WfSC@=qu=829dcEO$aA2Rz1O8ob*xh|&)K3om}WzA1u~2njY-0nIB6 z30*n>7%=SnKbj)wP{jH5v|e>Sgn%cGU1yq022D?R_%F66B#v*ExjLpZeE?JCmbGDJ5Q!3CblJPC9fL7 zhU{FQJb9ay1x_fI>~4Q zU9dRu_tN^?8RXJqQ^fZ4<2|#KTb<+_92^=N8hf0xU0q64zOc=&UmuU6Lms$wc65w@ zz+W+oh7}CB;W4jY=hBRXhdSd5aTzhb*SNVO6Q>{U2a(X_7Zg~v$`Q~1`=$6hX4_18 zzY+=@djDAL^sEFYpKoe8Iu>nBw|Mw}?fVXqe%!ACcrPwQ27RUf?%bOsBLqRhIl_e` zS-7XApd_aNp4W)o*zudA@awkdcT1F+5Yd^(D`teGZ{q~pE^}hyr?PiN5qWJEB`26_nbOHVR zlq{on%2K_GmYcQ_#w|xmmp0{g&yW$V-j+V3$>1xATi@2!XGPdrjdIFLm1(W@dp7Ts z#R;3P>kZ2p4;*fL<~cM8W_v-i@s!6m=vPEN-(oAIZM{c^)ZFRi$h>>>7pA510Gu4p zKKx^NnW3e&TIP;*$;)~d6B}>zq=))x$VJBxG6V3)JghaMl#qp=3I=g7KITGXhQG-9 zBF=^lUHP`e;8DIE^tS4@dxpdubRdnPB1?#TeiNa!#^-7}K04Y{nKYo`4-8BD<&kA2 z3q6M7>>Dk#WIV&X$w0Q>upv-p@4dtj9!#QLtf7 z;qKmQ+_;XC9}r@_=p^6TaJ6K3gqKT?mc?nJNjc%AUUzurr{2T8g3ivU+4-g#H=H!x zt#SiDetr*FcH=wLq}n$*EQrL&kN9j0pjcUN--1FwnPE8yl+|C{U%b=WAZnnST>jB$ zmt|h#I&X=!X&ZK%Z2N)9MvD$c!77HxZxs5jN9nlUFtH%TL53Z3!xob8MI#TY(&#P_ ztb*R}xj6n7ELa|f!>qy@LA8^-808fHfdLbFMemse1MBtx7|eV;KW8?{GTQ zejee!_pOh1x3@=IojZ5#jT?u}O?GxY5LyKD5E|<0%MAh^ckKv$GGs5FKex2DCIu%q zO)^rK@P4BWKW?0Sx)OKQSlSY|+7iKQh0z~vtqkN{JRLd4WWBnil3#dp+LOEL*q-I? 
z@(JKe)FrLZOV^-fnai}`DDbBo=d%&`i!?`vSz9}Wzl7{gc$&P7BxNjQs*muT@9_*` zz|Q_eqk%R70{%QaJiWcWCUtJOQf;@b|Edf>)CGPeq}akelhH_x@r`T)*rnqr?xJGLiJwvwQ7x`WD0P?C86J_y#J> z+Vs_XL@U82Iroe^Hj8RO;^i*S-hmE8dkerEJRg@@* ztE(&ULL=@oA3RvEg}NSR#nJFs8C2P6xw!Bd3>)a_)dExnuw!4}!~IT$GA6GEKj7HS zl$BE?f9R3l6A%z!W@5tl?o3=Z5Fq~LBA=xi10}O3>zI0eW&X}QcT%I@adt)k?xRu+ zZ1EGksQ-bmoks>=;`v)-?F#fYplckA>5DrZMJwIg8@I?)>*P*K{)`%BJDlt(GLq|^j;Rn>_@%mm50)jVJ_D@CF0jm|z zcEcRG6F*Ta&^dNInDj&^Lz6gen$+9Z#R`}}3tZe|(-f+wQOoL>#`I!EtAkM`3q`cH z?OCQ??xR@9#=E<=8jy!51&g?;Q~)CdiOx(|C3ENES=rcZtgPfweKBqQlSt%!Cj^MI z*bf(GUQyQ&hTUOi4*bo%H(g?;?yzzk0^}5sak0%qa=<_TVG^I1D9~xe_8gzDiPEKu z`|$HNwf@7ajYoHtoENt)*|owov1K#z#Bd;~oi^k~#rx$QwDwTOYT;g&>#+13%q_iG zBcp;V``s(v`m@Rwigxlb-YJdD6Y&lFC^z&i?Gl~M(^#KVqxJzX1Iwfrv}=;HXQ_Nv z9gEjQwfH_R6fgG}crh^}Kea(plDWS|fuwX%aWUoKkFb%1m{^cDViTU5`?BPd74*Tw zz`($P{{Hp#_1!_Mx+JfL^tBMHc8H$tYt^J6e7^p#o~p$RA@gyW&bN(M#U8=`Jbj3h zH*bEb?B+F&|1atOACLXb>IyB7$*kB%iC3~Gk)5Vs$^*YYUMoVZ z{W({_xLS)pBcsbOPFXRb7#nXD>l-(rL}!5j8pa{BmfSE2Rx9sUSn+-)ejR>v8^OYS zIvfSg;*iQdYX3>O;93&;k8<%UZvCIag=e4Vin(@0zI#N~UG{Gp*$?G`e=Ho;iuRWt z_llr(CR_%;@!0@Gmytz1J##B7mQmIWSZY#II}gcF z!na3PGb7A4K!w|cBAz> z1mA;DU3bNVF8|HXR&J&SM${RviT;Ia6ycr5;*!#j6#i5!D5?CE*!T8$#+Vu|%mj0a z2yf%p*DVU`V%d7Nbx`;3b#dPuu{QE*%RkIL^OKhG{k^#8V_D9SoKK2Ls#O~}79OLR znj%BCwav_BqaQ|3tj9)|aR_a}Dl6rLomCRDZ?H_X(emmVb+oaa4KxEq3lxvfpFfAH z@&j$}PkHr+rSt9C!LDI#W!~HuyKtd;G-d!3S~H>+WRO0PY4CXLNjiULJmDCr8}vp^nb{(vrQlc@m!L zDj3k~5d6gwU=uMed9hP}W~XgO^cJayc05V)sL&%?PouYhTx}JFU$1U%k4Ri&x55HV z7Y4ZH$4*lNn>;J9qQUEGzn>NDtE8xiMnEf&A0h1Me650-ENyM=V#C_EcqnqU`xdQtl=P#akGdV_wm)XDKumD2 z5;peXv46SKkra`*1mbD9^XK`56V$IsY2IKm_s|>Vww(shJkM@vzU-Snpick)tJ(l< z3iO^&-ZIkCBi-RqQ8qwR`UF!Bmd~z8hSE7S-H|!@i)cJf(;Wh66l;CJz4;XL9U@s zw;z9b55$0U=v)%x@ow9u%NnM^a!g)c9{51jld0f`z<$M3W={?^ZYON3Vpv;OOm@9G z&12O2Tf>NsZ+>HZ$1d+*!A{{SmQhM>?V*4{UTbd z>E>eLEmxA{xvDQ17h}KvJ)~i}(CN!xFk#9K%3Bg#^Ap7+^$s6SGoUA_J1J6dU6V^* zT^0`LTLyE|WTCFKJUA!dl;5|wcxT=bPI$^<>^Wg+FsKu7sHvT^Gv6GOHh}U!g#p)l>BQz}R*VscjNy=aHjsUc3MZ^aDmft^B 
z_q%EnkXjN2xZAWq0*tNc_qhi04=a4B_F{1@jZE=C&@dKwQ}1|Yw8634nyx(%-aq;& zThY~b)h|_9I^q1SC|NXJ{;aScpTC%Mih1e)Wb~-L{Nohp(gl1%?gnO4ZHVF`14el9 zH~A~^g^d+-_xb$m*RMA>-87S4`s1Awn`%B*01kx4?5CihP=&VB8=Dd zdcrU_^x#8vd5I-=hgNJ_aF`(F+7hnOFfsUQXA2=gNM8HJeg635`56Nd*LMs(kHJ@J zsPXSKd9pR^X-vJ({8$A`e?cZLBPQkohk=$}uKgwKM5QgoC}&27126TnXU|3|Epej8 zdcvb}qurIQz)i03Kg2RBdP1@luvlzS6O-lX!wi`=C)R@5VzePg4~XO3sST%7sH%B9s45d5mvMwVH^Mei_q% z?(KCx*j@nwq=Q}wu18D1NSKX{O;j{FajeG00we$&9l2lvokI4Ae)kXIz_*MsAfLkx zoh|-)u zhVq2gTyOHp^Pc$a*g4$cBE{&EM{Pt(`a>EZe7V#dmMX_vg#hIi5DA}zC)U=iVdLsv z5J=7P3OJz9%F1dPU1g6-NKC9y2lG_&N=k#o4nD1Lw_)Pthm=>Zwkf(i$jtCsWEW-g z=1>P4S&^sVM#Z#Qqg2m&UmG}f;%~M=XdSZC=5Tv3!%6IrR`o{himKospX^K>T8gfA zgS@@Sp@BQ2fzt@uDe`7Iz$8Ig*eJ6cOienWw7RzWGT6!I^qG+Dxmu zsQ*1=#U`$kOfPpU@pMx6cVSTd@fZ-j{r!(Gd$UK+907Fo-Q@oPVZ7ZJ#dK1Q4Ge7! zMMv_G?R2h;*T18a+j1^NSPG$4Y6*c7aXmMZf}wk$d$r&1K78><|0_=TXmLrD2Lqw) zwzE|49nZ2We!`88@ym6MlNxmuKY;kCb@u(3Sqpx*WsjIyxmRA7Yl_Y^;xu=#Mdu$S6h4@;g59;{ zxEV=oa1trD|1XF%7s%Nsxi(X)UU7XKBIc}Lkpu!?XA7;XK&%jJfq&Cv^`h>1f_nyE zFq9`P2fG_w{(w_{ZTC#4(J59wPnL?6VyTuyl>TV-;uA6F##m*w1jlHr^Nr|)h#-fH z7G03btFGXujDDs0kzTHF-lT5uIXoRLm-a)$OOfdc`iRX$e2<%4n)X98wR-wz*xRUeGZ<%KyLpTM6g70Sk{x zWRS>yBu(ltP~GS(yO72O& zgt~z4U}umi;9eVjWna8tTpNw;F)tO?6uj}5qgj%MpD753S5^)_5?tK!O2~Qh=6wfE z5E_!FRiIsTftVP0gO-jLNl3UjIOrnlx+Z=FsQFtF?L~wC({w*KE8&#hC|CFh+yR)3zoq@LVsc!Uq9tqG3U*hxC ze#OK|H#tnHYy^yzo2t{$fHe&cCieF;l*xKgc9P*%K~_bOkfJkM2|6*uPyQVtO5Ga4 zdBecauq$Efzwk8qWBgC(G#6P|SQr|5fqzB7rsrYDIUeeupNY*gKqy_-sY~8Lo=0Tr zSgp6X!*d@6N0hMkk2oN@)S!hz|8EQLU%?=_E>Qg}tCeyjpzAm2mq*H&`S{dEHA;{3 zn*Yp?{M{T8{e52W8;S!??fQ%Qvofv^`;1y_5McT3jBnUSq}3IajY9Mag`}kXLD-%1 z1;~xWME*TCLi6b#u@OZE#k2N#_i+-c3JPBPxSY{DurY@Nt6OJ(#-|s;fdy8C4+`RD zZZ#^ml=Su4=x$0(WbEwqcfs|)02?>s@b}wa%?!tRga{OYjDVY%g{Y~yE>F~kS}bO8 z!ZI@hN_u9YukmNko~4brzi;qQqt7z`2cOUUn?cv_n}6Tb{*m74WiKk3+;d1V&A+?! 
ze==JYU~#@&2qNj#(n0mRn7)}z;97Eo-YrJ@?V0g>iD$V55xd+LgI8cK;K#D?M1+LM ztE!4@jRbU(y>mvDnDi!o83V&)MG3vp)?jv}z`l?~!R&8gfd3F^D{A8Is;Wu$WZPxl zJ-)48aHv3Z_xfJ@cQNQCkaKS3a;7Y2En5SD2yiCAoP3fi=Q@;dV4YQXi;=bbN?HC+ zo}CaE9@iYc-e*GxhLU=$VOHkn;l2mk2!wcbANrH=!<=`}PSdrV`qk%K)I7Cio(@a{6Tl~Hj11%UyUQ&}aDIbZ z!rA$9$hgn}{P!z$=L0p?4OtfHx-#s4DkZenAHG9OzbpHw#1|5jl|p<4SOXjx1hnC# zd;Z&s#r5Y(PEq}z@=+fSCY~L@c^3Eg7hwIKUlvNO z*Q{=W7bT%&Z8i`>NTJ94R&SzwSGbJkRX#l{US7aQQZB7(J@U*kHZ8 z+dXiWGx-rpzUT0$0MnC>W3LFV%G<18V3?sH23OECT{x_Sv4E1(Gjv)g73_D1VrC+$Ijx>KppSl2^B5qFbvFwa@8ct zKH6J0NsqV!ugH(1DHohsYOstbaJnGID#j8>Kc8hqs$|$T+4BJb1cr~tt;ER&lXEx2eyS#CgQttt)345Vy&8<>7Q^mk(xk{ zE-}pD#6;d7M}{y^NQm`%3i|srRy104j6FoFTHQR%p8CV+{f7{g_~VJMLc8@MJ%YXv zzZ!d*7d{({>>~aguf&UBuXY z!U+uvo9pJ})@!^b*eNd$*>nvAD_6#S_Wo2iqpMAJW;t((PkJiFb>pj_sOl=xP9Co& z9~5uA24~8_okz6>pl(u)+RdEy!wzhQv4wT`C+sYND@!=*kQt3BGvw- z!M#+&5414~MsYuSpL7~lt8j$8IDgmQ&ChcC8Ew_>)&0eD7?qC?S*3uUJLz}82vW?V zY5u@3W=n^>J|fiyifM(d##~MYLm~?mKt@wFycic)*=paF37IzBZK3~K)&FF&G;0WYXP#A6gAxH?PiFtxVimaR* zhh%&1nSX+Y1k;noOAM`sovke|ETg)i))bjWgmJ5M2P+x&xwrkXrD-tE+5(nOvZ@lU zgE4~YTvr<(^%Efcdfe#VQNb%+gj6BxX$cMG-u8}|(?pm%A(S>S$jhMw?JrqwgMS@ z4s~PN-{|##^+v`Wi7jv->?YcLZ*^dW#d)N0!Lv`ydNw4Sn%UtSumbYz(9#-YrOL)l z)IXh^R&;Bo9bTmUt`9=F8>8v>Hz9k`DuGkK@)7U|+1c5*hn-s7Ta#IxMoP^97|Tal z{Wlc+Ur^UfzDyK!oWMI^(KddfApJ1>NWoLwdX6>aoN|XwqK1X7hhSdS=1`4WP*_`v z{6pOBZHo_T_Tglf?0w1xn#JzY3 zwK_0!|jq96yx8>zNV6LT7}>un4se)0lj3-pg&dfyUv$^2k>WKr`xYSoZ%5<3SRrTY5PL7Ox5-M|Hs3#)V~J1=Qb^q{*L zsK$;6w(5_)MW(~YcUl^*Z!g>ht>h}W{TfOJy%X3e@Fv~?gP?aqFCyj09%m$X^ zI#M%&CR+(TqM(m^s~s^!0#nlY8q?urxHRjn__Sf|z^YMnVaAvv({x`sK18tozTchY z#dop`R8Zxe^M*bVZUMzx-XE+LQfLnzrpHk@8Ng_I;1uosd)i%wclXAO2GUaZIt(HT zQkFH#DVwYl*A;eDs{v`VzP8>ap-mN+@PW=D;d*281MEknqH!i_@;X{FRM`RZCynMf zvy%Vv)2ETGxfK87i8b;>ZX;a{*VVmwlkSeOdK*dOEjQCd*uJhKaGUv6>oR zF5S1>@89jjt`E{B+zi~PoUR0?*PKTDw<_*NShlf_^1OoSeWZ<*EtLt7%_Jh=EtVXV zWtx{`tXRABdpuKPx;Lxc?to&wQc>dATxcP79>iLH_RDpz4Z2yE-)Cy)hI_dd0gT}yEF=iaSNsr@BH2CKG7ystmw)a~(>Xah}>Q-(DUb|DKy~V-SEWnXhj1WRx5DgV|F&_ 
zz_@L}4zMi2FEIb05+;~N%kNz44e?Zs=C+${mY>siILqTZ^vXnzUn#~RKdp<-D!}Ay z%2|hPl0UYkn_e1zmN&_hPa~nv<_Ly`KBzJgMI*Zg3eU&N)`xq}HtP!0jdpyrHCPR) zi|ja(I=4Eb()yTs?GoR?{Yk#?0a~6E+ws1ylIZHtGFuI6uYHczRvtOIV1wfS!`NGg zMWKD)!vh8?2q@A5N=t`yDblGZ-7PIKbczBJBHbV@Ee*pEf(+f#rF2LP-Mk0A*RSaP zJiqr(9~?Mya__a*T3h1MJ}a?q*%_t(7q_*mJdpf9su|`3tbJdAW#7ByjXs-cfaZe~ zJT09l{e|t^zBuSFdY<*J^0M-S39mM_DePjDOu=THy|Omfu*UPg@@)x^lxVS5(}Lkmk_mWHqTL z7%)!65cvS$Nc*O7#-_^k(6?A#yRN{Qv)>~gWnEfkSi0U1y{-rUkv%iN|08=gCMo{k zG}-p@eVN4&pt(8ExKq00!A_dL#yyDG$+f}Sup4x`Ja*%A!a4}dWEDDd+E95}$N=W=sqNk3XUtvMscFKnn?+F%FV_}CN4de)N@!H$l+#l?b) zmk$&bSHgUv@&*jUkVWgc3|L9)w{dX75)&<#3S&>AE}-?6FAo;Xx0+TV0Se^f>nqJ3 zO!*&FWwAIU%<)j(Prab!_Bt~9-+ej zmC5|hcLR`U0=6-y7AE21m#jeyBI{~}5S#sEe{)xywI8c)PR00J8Nwj>wN5fog<4Km#aHEh>Yx>5Fb2WlR{tM4KGIVX_dFMw3L&3%kb#I14z0$ z&m&36mK4zdpS6fe4i1hkp;K2?)fl-<^s84vv8UuDlH!*H?%xVZJ#F$*OP2sOROlEryyKa!tN!O*k!C8x0S{Q|VuLIZ&$Me{xAoPXz<$3AT#q|$2T>!;l z6k4q5T4Fag*w;V$Y!bPDC~)1_w-+3jGe9yqxs9CMDD3r;-AU}pi|2sdk+a0A#cQE5 zWRqGb%J>Jj!p29JL#uZjL@0SmhXLA0F9k_KSpm4f{=_Z+J5;_lua$3koYK-;J~-w; zqMAa7TdJtZU3L6w5F=^X9hw?pK0ldfQBXN#ou5};9CaIdgh4i`jv8M2&D=Hkjx zrzqX>Z~V_eU;Z^+!oLDqJ3#)* zzY;IJPo2^+R*Z>=KO?F%=r6o4D6T$?TdE0=RC}SuAMOCr_h}Qoghg6H7Jrmd`JihL-I`EB~2c25|Dv`%7A|?Z?s|wBO zKzzyW!0cZ7rmOg7(>!y3nY=V+aXIW~;rv^-eOH=lAK;~?61T~-%W3m_Wn;HP+IIc9gwsb(NAV+V0f3Lpv}yc z2q_7P=C(HdgWq*PyfS*mchpbD*WAU)JPt=EL9JI`saWdo>16z<+5}*gjLde<o5m`i0> zy^15?!slEn1oJ7aJ;>@>BubBZgn9}^ux2MH&McrYT?S;!#PTHpBeCY2hQ%H+lL~sv z>V#W80IHHO16qR_AIZqJXOVqBnOxt8EJnT~H`k~OA{A*m8fIg@giCUebZgr)ylyEQ z4oLCWK*%o$ZQ@4Fv8T8!$(NJ3%OY27egwsg~gSQgdqHF1#&plSl(d0_0O z&`YiQp(6WQ59A4+QL%`tj2$bghPX5Yc3YrQOFbzt?wlIeoy>d5OQ#NqMx z$$vOB>9>N_@xsFYfh1Kq6~<12FZeKA-#<(8aWspj@}EYy0ld8=EHo6zTHV~-&fLAOF)Ge}`>Xs^regvkb_p&ahuVF$3{!K_d@((Ttw*y{ z-+BSQ%F#eW%M|@N^y=N+d4L=*+n6p;R!2D>%`KziYd?BptbLGOl^$>!`Xmtmva&QG z1BS}dLo!Cv__W6@2AS8?h}tvcrFjypbC+GwI4E?!eV%wrn$ay-ezC$4*3D|`$cYly zSa`e^EdfMVRR@)}Uh|nS_d7tblh&)bd?B$N*8aH51hUR~D}5?f>Xn 
z|FNCMFlgqJ`l7i(UwS2whsa^(va05`>GzbGt5cbAnyD9j8v}%bM?IQBIK*e~48R+6 zvUh;KJC=S8JNqX=G56N(QbgNepQ%hB7dbgOH8r(>!1x#aMNgUN8qHdF0T1JXAOxS~ zoyS^AJIA%Ab9Jr>>n}k3cU~Z(t`p{OsIFM}ro!&Q2(@ff#{(mwXT9qT0WWN(4N(k& zD3{u9t8l)7mMU~sE88Nos9W8IWh?FU~QUG z>Z%Vbv*FKrX@lHiwzxXIAoI{hlT?cVYgc`MQWck|<(X+>nEbQ3=vAr4;}}i-!*7GS zaEwb8f($!JEx)vsH7I^<*lhP5O?F1|aEz@a45TNdsmX?NhG7RP-V}S^X z90n{4a+G(wE=<;X2)Tuey7}V%nek%yoA=!Fy~HW&o!;w~939Xa_!f7jrWG~Qjx_tV zptBruBX$+1VvmTvKu|Ap_MwitTD`8BPx@u1#@0A5)tQ(#^lgZj)d}-_assyDBykBh zEMW2xbFCxeiIGffTJXz~gy3!V?!<{);ihWq$`q3>51pc!4{aGScgr!z&&XJ%+cfU* zd{yOmD7k6pzAdS(9xxj2e!R-+PTz0SpE?zp*zCu;foQwCT`Cy1zvO?LL5Sfkoir%v z%gD;k7KZ~FiRw&bZ$m?ap%u-Pg485%jxU45<0D3j45b0EXrwbuUz5&m{k6`kqJsNr z1l4QzZ?Ey2ZT8hefVw)-{lH|g&E0qZUn7TKn62*PT=B;porH5pbpT+~(r=34W8pKU zY)K==A|eczo>xoC)K?-$dT*MnDB0JAc2sayW98!EwHX?<`AcxwMwY!=)1x|8jI>mQ zW|dWEmbQJT9;)L7bTzZ?`)iHIh182Xjo}hDqe;&{=y$e2%RM-4=BYx}hn+QnJO{QB zY4xD-LM=m_IgD+%2+^NckeVQB=X7tFT+7sZ@Z~HK@dV?RTln5mfS$4`uj+^vC};u^ zrZ2;nA0^Z!GdI$r?Y@C8Tc_7H6+cs4{xg-r@K3)&7Z=pNn`)W^)8W^Uzh_YFD!>>p zxj`_u?_kz~HQXuRo)p7m(`L9z7Zjqv$dt$=$c6Ti{T;p5p0U1dmX{oT96ksaFr;w; zd2$8pvRklZqb|jy2S8(n*}}NMgD>0hyt%+ksK1qSb*i?tC?O_>Utg~n^_g~)%doHu z3|7P2i=6%DUYXz$WjFodeda!|(Ujp5y3VV*3Lq8R=#49LHnU8+Q(vz(*n zv_O&dU5Kdh*8Jr|fpor6zF<)Cf<0fX7U|u2x78b?iRuj~p@wpngZauvUjL)+7lP5tkdy>%?;u1gG`YW$@J&Fkj%k7gc zo5^DiE4u&Mg~O%eW*nw$-cn_E$G}12{;@L2{D?~PNC}l)VZvldv0@DoIx~4F(^dPr z3wV(~nPq{Ie>2N!#BV)PlcI3q3&~}o8?t|nA3&f!QkGpyMX%tfy1e#(Pb&5+T_(Tg1YDmRK}@BG%hk6lz>W4~qGxF|r%icR zVjB6!z4+06yksCZ1u@`U*MP^JEEEQx_G}6+BWMpZ zlLM4@&dd#P?sB<<_4}yyIAaqH#&Gkx+K^a`SM<6f*2}Pb zd|uAZ)k#I)(x?bwu)PMvLL~IbwIHHkSxeuvyM6i??+w)=Nv%$$bRFefe%gB=F4}X} zq#sq%w4Cp7DC)>D05YK|Q2Vy9xzd(P7>(XRV}cdJY6RE6`Cqb*2Hyz$QQnKoJD9KZ zxdR>iqTN#02rmDR!ZVAjic{fAbAz^-f#D5i?0_@4j_L)7P2XCyG_xQA@o_W~uKfuU z5D4bNhwLeJtBecv>DQ=G>sdzvqt?9m7w}s3F9Rlp9~Drx{GWz`qWkNO%YSNDe6Di7 z9{L_&Ax^L(ZDMH6j2+%%1W)7|Uu3P)#_*{@E28~LnzM|p(0x?jn~si_^IT#Bh%1rp z?0iwo+VRm9^*#XyTIWkrrckN?JjSWDX~hJfM4wOcr~&t5W*?AYQc_c^d9;PV{;Un4 
zKpy^}xLwhOhn0_&0PD1_T>lunTRQiGt2a`A5X21L0M6WXx)eYT1G)c=8(4i0o%VC? z-hFx*Pym7Di*%4!s#bxq)3s~Ya^rPXfWc6a4RWf{?p=7eh{xu}2GEUfZf*iVKigRo zZxt(0nczP-JvSO18&0Nqc=uW?xYRZ#lAEz`p5zUzHHQcRUCjVHmgryJTv)M_xksOW zI8?-9ssXtyaIamw!_Cc2IkRDvl!a7i%~dV7<0WC1xL2eAU=z@PE8G_o2oYV~+&aW8 zv#8Q00n7cn`4`VotJ(xixkUb~uXD8jVFloS;rLp#VK^~!L+&o7sC%@Z(lfZA!|HW! zBP9l*|M0mQvi$#-B^yGa(0|k9YAqqF2{>S94`1b(DF>Kh(H7wG0j~>xfB%q}(f`mz zf^>QWyWWwIZB3Ek9{`E`Mk&=o_7l3Z8n$wtRNoIg36J>D5zxWuh?s4^fF@TjU!o?+ zMI|IuTaNP@RVqwViC_O|c!ZN|W@`GuPbo*>pCnYLFZ8^e!LZ|wIwlrYR7aE%qAkez zzZ9o0RXZ@nb5%PsUaTon3%Lqzo;iQ0i2NVH{+Z%R-{O%X7brf@mQ_wufBnmw1hnbiUZ|ItIv zzM&n3oz^a|3uI>&Y4ofp-D?G6kcRr6s$aG7KT(1?G;EGZk> zE2H417%I~0OUNlvvyhXMv$MO;TnfCx0XuhOL`}*#(rmy{R=Y%+MjI4Encb1-bSD!o zYlW&EAoscQm*|8B)U1FJnJ+~OvQ;BWx8TUYxVF?1n69hem!?zr^nXTG%0co&8Lb(7 zG?^J08>+29#KK@hKl&u501o)G`hP}|2m|Ttf0d566&5lRo&DeB&$Ui;x*Bc&&p`kc zR!0Z$t-4FyM;-=XjUEHD zr1N1wPNOI)PM8p5g@0KpBSL-T(DjLh5PgA0)zm zS#UQ+ejp5=t@Bs^CBs?0_Xc!)EjqU*ixKmJOKlt%m_|mQtqJOPax>Qg_Lf#dC6G4* zhYlxA5ZtNjYh0iKZ%W3mxngKMq%Z?l)Vori#?7-ZYy^Cz>MvuH&|y9Ax3B&~(+ zCO;p`Q(U$$1h9*|&*?|{`(7v^t6%<)C{GSEP+7yL!BJ8P5D(*q--mbVQ;}$s8a0hs zjS3A9y6q4QO033D7KE;t8;`Vybds+|%~R#XEUUEw0LnmR`dcHB7fg4=wdgeMKi}7w3fKq6#}4)Kp&G6wvGhe!nIJ^sp>Nhu zv)!gN2%R7W9vYennjEKgz@a4Le%LbyQu@_`g|Vw9TPi-!92VOr0uCJrfU3IIaXt7^ zr9RLGG*J>Hsi6KDiNxO~D5wBI+Zk0)+oO1HsXvcDW%P;lyDp2lHw^yLOd}u^#g|K} zA~OzAH@UG|&JcoTm0lbIyl|Z-_ul~&l_;zcmNpfY*Y9JIQ?XhEbA0UF{c-(0FeY+a zNcPnNtv#zuw=lK&@46xz{Zq>PQ@+)}>Rdv{OTvQIjLvRDSBk0vQ9#|VI!hk;Rgyqn#JCC$}~8;gm@Nn$Y^yl7=yi{ zbR*t%*$6YCY&!wp0G1P?QnKYA&T15>j6!Hd=6;fZm8i1Q5bm;edo@RWV^oM#U~?sT zQozWt%7lgOV-iNk`l+e;(7vUmBozsKYeC>@52m9_--k2z+P?;AHr|)!POG^}Ke`&- z^waRdFisR1Z1jz^E%x)6VyqWByS7d{Npv3|m}+exW}QY>7!$PJY${)KF@3$eFd>aU zyk(-eN)YG}4LoUX#*Q)-I`En4z$wsw*LVN8f}?cL$&L(deg4nHBBslgQ}YG=S=Mb* z+{~Eias1J@T*ljQXxoTF+M&G_hY*-e>0IO<1$k`bGI1{dRt?McHwuS?AMt($42m?ZmaH?mC=?%PdO900cIAIqpmH^hdyL8ZfHf! 
zmO$EW%q(pfYV~0I_{$Xie>hNlk#$Oa2Bmh^71sBhE9jYJHVD(I(ULT7h32mhvv9S1kANje+kR}J-V?d|Kv|; zZHFx7<#vNfx^337#?)q%q3x)}lUC}A9g9&e6R+a-d37!QODo}io#O%OMCz0 zN;%UJoU2s}FUQavQMNRSo5fE2G^p|)T|XUi!C9xxwmYpIy8A#__~K>N>&?8LL@!aN zaG4((GBsCVzj(YVs^nG4#vu2qcG^1yrCts#84Wv2~>?Prrb32eUK5vxl}PsitQ|GN$n*$YQ8O5Yp}{Sth#N~r)2lt5A30%o>X(O4WsHW9QoGR-}iDso~Pf15U{WOI- z{n*fk_QkVvg;qRQ<<`}yAx6ylM3Fr+IODWoJ%c>wcA0Jig!}v#k9tgC6;b7^WEuIv zzYRVbCzF}6f0ES^34@cp$?t6F4zD0*(u%h2&S=9Poka(Yg4 z&C+SB<{YRM%^Jnb{zxT%r0K}}(;#mf)iPOme{>mU`1-auu2pSv%^*|aZka7$iRr>I zeZU$kqpx8oM3-HIm_3ZQy33;V`l0j-5M4tH#~ek4w+~*YpijDr`AY{sJK6T8Odq@^ zpU~wfUru~)HAqOnhauC|Y>TihMOw*q|M9@KtrPC@WQ&rFxZv)-%m^(RF3$EkiyTGN zQAP4h-Oe@S^_=lT6);cr=^T=Y9P*df5hutpNMznNT~ z;T?dc>REt`A32NT{o@UjSU3Hq7Vh_{L=mEbKIMw;g`~q!d)Wr8;RAp@mVlGQel%~% z6S!_NQ)&+1UdR2lyj!dNV>pe=`0G#8iuzrGdii1MaQpnnQkr)~Hk|=NRbUHVF8}*z}v)H0keuCGW_QG*KUa%8QTtMqBVLwu$rpc8;=TjqdSQFFuU-IMl?p=`TMS%L!Z8)oZ1 z!~Cp%_>i_gR%*b}5Nqq$euUZgu=T30Y-`_AMks7Cp9;S-&sCbffEKek~Mdfb$ ze66EzUvs_JrKsTVUu46nu(PJuiJ7Zbdl`jp^F4iUSLKc5#m$b{^4dIj@brKNOM3NI zS$Ub^3=5?5uQRnQFQn^)xHo>1VphT*5zbvffYokTW6!j96gxaIA+hDk3CrnB;s|~f z73?3mdvAERV|$7`F?-4MM7}6eJrQ?zC#bnOzb%+{d{qZ7C)!TZpKdsPDK+Vu@t_{7 zpF`tkFHJSh!ox5~-PJpt<=6xT_3x=FK!^`hF@xhy5sL}zVOVGgKr8l^u^PQ8%VWP# zsVuApO*B*COIkvwp`vSsPBVGYo>G+VPF$J}hDC;X!i~#OOfSTg;Zj_F$`~EnjP0@T zn+0|Xr@d>vT3U4OLuC_wQkpn&&ibBq#7z3t1P3QE{VgT{GspFAy3;qRo!AR z^$f%N z^!6Mquy+L$c>RBm{pg<@$>t>KQMq<*6G|^A(e&Q+=>c~`tJc^Fk93AgTvQ)? 
z<3Maf;X$mpXJgKN6Daw(OAmU$AoEOMVYx|;W%x`v&S$gPqr4l_ds^d?5tqmi%cEc- z@U@4Xwv2f>2nxcmJ5Ac2wdi551%9f23M(r=Y=G3={bN-CTAtbPB0@>i@lO66p}ouO z=M(9wnt^s_YBQgn+SAxF(_N#3T7^FQ@6~qk(k^8SY=xB7ifb$CHA*D*%50kQ56)lTx@!f^kT9^ zlwVhIkLGR4uIz20u*u`m#Z+R%VA0{`UqgIV@7U(I=wE}0a40akHc7+W@>SAP#`?iv zM_JI4&qFyKc(T}Pmkim4|Cdxl8yE8 z^{%e=E3;xzMi2(HKR-G6`l2ZuW4Q6NsF~og5zROO013%wW* z7S3;OP+yY6Mo7O!?pBLu6pTq+^R=I+VVS_bi#3@ZJQ%t9d1iDsu(tAu0CxX^4$t|Z zfJJNcYO|x^;H-G}>d{36@ilX@njHGexOFJSQ5UdH&QHhQurJ=L>sFI~;N1O~J~d3I zxj1^R5kA+b#kjH;Z4(r+c>~uyz0Y5G58n>jOLZWfZ8a~ibjs_%2SP!Nu@Q)xRaJ(CC z(PV+{y@;5Uw+Z(aPBvUL2yU)1F~$uYBOzZOV9%=qw!UunO$`9zXzIYrrmK|MFKzWH zTi(i9BYLPne}}ZP>XLJH>v+16!#<+j(t4jzozLNqugC($)?(jd} zc44px=^iKab{VydHme!k{oZaEOEU^TiMZQ5aRHnB&$0Kz_*;na%Um^ zETH3H4XobJ?;<8ZZX8;w7VANj+B(e-wlhKJKdg3rTy6z{<|LW=j@Wxd+m*9lnoNya zTs6XsPo;L+itXZybK$G``EhDk!Xx4wa(nNpZg#1e9U0wUW?Do>B-ylzizRv4j;HL)%!j>o8d z?AftjG8LxFMwp8~J)7I;L6$u9t+p|Dc#)*28i?=tj=Z3X_z8?&Y+bDWq8fyo^!%tF z6EL@!_P?#Yo7^v;RphdnuBp9QwmynJnR`b$w|CB~HLuDv)K;l7$9UH0O^J2&460f2 z*QKn6Ms1Jg(Ftdww!STasw*(?-7dxxABRzYUJ567?b}-#lk4ZTGuXCDbj& zb0HHs`$C$v8a6!hRo4o)E2P;ap)ISjToVhQ-FaLX-aqwoUME~pLakItczdpg3x^&_Qbz&qN)!TS3vI|p!?OuT&sbufJQ_0pa8^G9CA5}vYa)$&QE zr&Ptjj(MdS+byoqJHC-BhC!?L@lYzkefnCFJw_qg&~h6qgX!+E4`_+`k2BvG5#-YI zmw^~lDI#OEOUumMU7PgijOS_axsKj2ifj@l zc6Vp9COzvNzdLV0^|}$i1=?uvbtVklGi)|?%3(L*4WWFpi%I5Z2TAb!L5d_Uc&ETN zw5(jP?oC-TMx`>K95UC>A10(ZR3Wm;$RIY+DtPZ^t?TXKn8nGC-R#F9imH2gw4u!y z?DhJMiF!*$C?q>$t>vGIO-QS14mR)R+4M`vj@o_`W#i#SWdpvoW1t6WflA?28?)6> zEHHcPF3_(1CwtTB(ovVyfVXCpRxB^nbg12R zd#S(naD45k9SMv_*5~EbNPUW4qxFunZYt;PTauj)|BMR#KR_*KFEVF5zkG-Pul1;5 z4=X~pHn`2c`ba%%0(~lZ6^L<4rsSW#YO)C#(wrrE)#Q(YKxnL%)<2zRpXlqzZj^uX zr7*as_MxkuU?50!WjNT`pYHDY-Jk%~DBSM2NP6YB=8PCEE#RNCPFGfE&F4Z0N#Bv~ zF7?whGFsHXx@sTQNCXUeOb%LugdKdT9Eb)p)kPNbeK;IZK3+I~^1ywO<>m%ums{My z*Gz@7IrV8;1)<>~mQ&kP=Gm)qj>6C4WRv30*YXp28EC>}t4x>e1S;g)o4Cj&Bqdd0 zb|#?D>B(M`P??p7ekGS7{Ulu`^)*R45L03X9G^MGjDQY&TxgE4tsRW%j z-bJ!=KU*YRrP(NHS4*=ez)KAX0oytLuj%8@M1m`OUNGq%X>KCFNX}#AK{|T;M_gE2 
zm{6fX>!Nm#iz=xTJ@xyWZ?p^J#xzxP3GBWt2QNHC(oOqUak%q&vA3ru7B%yloe8UOGTgL4>FgM^W0@}UvI<9{O2e0^v z3!ja>eOZz~{607EhLRW9{<2(9D`$0!n3zxRv)`ar)&7k_+ohu146Fms64w*ZPj#*% z^<@vzxFa?qG*k~Z4;pDsdYrm-7WO4>dAPd^dXS7Y#E|m8`0BZUSq+%mhikP-q+=g- zFPsYBhQ{p0# zS)fO@D%T#$km>YV#i?@fN2`5NF!O*Z1 zGLdpDvM|{$^{dCVgT*m@hc!q9@l52DHS+2@@gk6KbZbv0g$73F`LnKkU&#}4($kv) zCCJ%nzuj4rBom5ysKb0u5;cf)-RonG6SI+SFwssr5w>S?0~-K<8K`BW?1iac_^NaM zjutNG4Oz8j#VvmPxQB3vJFJ+%DM#_c2fpxp^uoo*6U)lX+90`@*Q}ORPj_PLGS7=8 zn=*wpe>@UPtVdndN5=KtW0q&*f!>|Co^_&lsMdo3+TvikO_wK#9mg>K{`kR)u5K04X()xty!u3w+DyVRl_J_aEd|V zvh?L!X#84E$r5={RutB8-BX#CcPz?&R?SycLTAh_3ALHcILDRwPnN3_Te!_L2j~@@)YEa%d>iu-1Tcd+Ai86-&344r|np!M29qD~(CF(gs>a`RB zNo^>FNz3xgOj@82+P6-`hTsJ4^pt@49HPF>Kd0D%NgeyCQw*twV8*Jetc%a4H=q4? z$JNUDW*CL4gW_TfgB0KMIW{?wu;ozXh$cB??9%znkT$S9Fgkx>S?8rnu(G2S5Pj^E zVg=0Rrdk`1b1U>E)}eY}){pzt?oFY0LR`Fk%Pun}#Wo2+DfENTQLRa%DZ|M-* z5)}~4sv`)KpWVGOZCH_$N1$bPTO|(I`o+iRs^=+ljXm=fLPOd%-tm}g+VINqAPR2r z?>y*WzUo|g=foh8+Uk2;$5b96f-<19-P+n3P~@*~t2KQBSYyEVCvOtl+Y8w*jcdDO z;V7Z#P@I1eZ#;+Vy`>$MzDfF$Q)@i!GlNk&Q)ENpcMC2L%&GZ>F*c@&20Hp$e$Xic zbln{$edTT&rGJ9W3*^qh(K`b>XyEVf>n3~)P|dXyXqYXm3I@+wck&|&iv+E4o2nZV zVXEsOa@H`CQMu%)dH${yiA6>R`)5sycLQ=FOW8I>A7uHd9#kyNc5TTkO6@-#{aJqsSel zo0IF|Z7?)RHld^SM+yGLA-!VV0w17FJqaP!=Iq1F39)%NkH-YF#za+ zDQRf12v{D_yn~B$_ohqFqCpcS<)nMtOpTpydteIp9(MArtqTI^%0$j_q{IZgDEijv zCP8-Nhba;27ak32u+xl~qg9+Jvq}s^ZDy*kPd?F`>t4f(}S@@we zB+3vn+!+8SvQ_mX*l+3;%zsRnPYespC2yL;|uK-tUWQ*mlqRV z<6g1r8T%InfQ&Ub=5$cs_}ZTQYlW)z87e}!*pBU{}Y%#7l z(sKWkZhnrF-EKz&vD0m~gaDgj3zV;pfBja)5&p-OQ00*d8=KSDjAVyH{{a2Fg^#+E zhy(HW5ZqYBbJpW`1wwtO927J$4ad*UDZr@|g(x8LXy>LR_$I9b;G4sl65&7EHu6bX-q?6Uaoadq)%cOjXr?+O>O}E9x z!V7L>jxUjFaAe7+BP=+A1C&1Wr3t$>*hJ)LGFQHsO&Z28&kLUnNX+hCP2S*`evWpq zsrD_a&DXy9fUcG#A`M`Lv)Q73oK^WUAl;gDi!*E$Zxq=ka7}%t%#2@<;F4FtQqgq{ zfo>O#Ind9)+=samlIBcigkpyeQ9QqyuD(XbVmeVV!arKSd8f6b?z}#h*|*RIQZwqj ze#-e<)qX8Ds)m=Ss<^aAwmGbn&4*y7O_T4GmnY;V#G5w(3c=7{^6+%COy+^Ubq)>I 
zw~JO;`?`~$`00s1sCjtop0K@SK*a7A7m2>kq2Qdi{$)RK*A67T6MKF7%{Hy&Af0A_88k*=8EOheDHon3m^a$Z*d}#-;ch$l zr;_4TQVizMbXQO*U#;YY$x76t7g-pCo%kB`+|bemyGCt}T9f+3lJ%haJ;wa{l!8QF zjiYT}!~M@XaL+K<=77cF;RFQ?v)zKYKcQDBCcb->P0gw3EMvzbCy&u)Ik>mL1PQ@7 zzoTXm^lV9u#CS_=FqrqG%;tpDx7J6xl7>0EMCnd;%Bw87^l9C4m0U&E&n<%{1F6vl zFj`bX0+^s9bR^`3GOLfBiuiUZ--ZZK2GAj}++ArSZT7t>AhX~xo|kCy*bSmH%+(eS6eD%%G;XA`v9 zM@Na@BzB2C?B6doD!cfkrN+tt7hh6m(uxRpCKxwpMwNdT(4&LRxZl$p-CgeO%u2~| z;`itc#pnr(k#jE}cV+8`Y-suqRoTJ{gy2cg~Aog^o=)vg}gwW%6V0 zIJby~s*gKr{+10+$PS*l=Khm0_TU!uUY|;5dj{Id;`vpz;uyAU%0+?pixicHm4_f% zR^<{?-c-?^j|^(sT^s-pm3P(l2BqQ#?JNLp!&Z6CU$8;mo`}}Yj9W{yZHHxA zpnJ;oN{iEX{MKKtTY=)Se5TBVL~t=>!2N>gGN~|XQ-HHv-L;K;T2yazE@AT{yCIIF zZwZ~>{Y6d2e5E_T4*_!4(WhGh&_RA|vvb-T~);RN(0g{_=M-K}Q4X;)W>mHj;Lu_jB8b?UTg!zLJic)l8Oy>N-sRjfR6p zlI2;kuqqG&K|g=-zqvn+TP&bnaU}oh0;rbc>WVhvJ9(cl)75+?KJq&?qVK+!?_yIq zg^w~aXWf<&z$jqj5Rfbos$qEY7lJCmu2uNeB8&cj_cDQ1mZQw})RSP#s}2G#-?UfN z>^471>lv>^u11m+arcgZ4bAm|51xR;owQGbckyhmhmoJ zNQx`$Z%UEk-FupH{p3lVKma2OZgTI zYu5aP{M(HBX$5`Q^DX7-eEh(%>ryXYSxXFrIbtV-p+IWRht1Z}+B^_6R|-MAyq=!2 zC)GYkRtutmmEI!j)IBu`N}sn0;vTUzHWY3Y8{S)mLaxdI;rT1b!V3`B^DoQGCsj8B z)DY4}(sHGSYd0pRg+ay8bo2ugJ(HN15846Kghu6)LEjj`QxH8D`d~xq$j(m-*rowD z5$dMAQSoKAZHUJjRYOI)>oZi#pE251nDHUl7teodmI*(q(4Z&`(`|>t0r%QTJ67UA z^q`B^0+{8rLP6Fl&Hob41i8cg`(MF_aNQXwMpB=}<0-ZQh%F(8L9GRJXZyJ(y;7P3 zUc~mI4-y9P{CaM~LvCX@XeQR!q*{CC$9IZuun0%T6>fv@vyn?Qg|9}QOG^qL0(lAf z{S1Pg$WPcmvbIZ%Z)w;=GLrff+|VJMfw#d~XVW?ftBD0gAt4(F^s2T?fADoK)k%Vc zEn36@d@+%76_kPEH!2biP3_uL17u)AC(u%pL`?A^kI!W=_z<)@NlhnHZIm}mFeY&} zm{%3}j(Xr#+EYY@=v@7M?$AAZ9pgg%!%ZZDDg7*8Q7kjbD{wGWhK;{Qg>a+&9%Ws6 z@I)ePE^jo0IIFF}E^|vBF(Mp-wP*=SPDk6iU%~X=`aK(GhO~b2h6d&HrHLz?=C81x zJiYq93gXFfZl^##ye$6&iw>3IIs4iA`Jd5#imdjP)%Uv~^41e@vHj`H+-310wv0hlpW zVr}jTjFBW#!VRw+dU$u(PmocmsiSo_m5({ICN-4l|b#+bcxqrmYD2Xaq); zgK?#-eObMU0?|PEJyt};zIv9yw;$DRyb{CE7-a2^0*|2Scdn+x4ADhIkBD*^i5G1y z$h-C7AlhoS=9)RUD4S@ z@4oy!qK3S}3{3)cl{cmKAtTTWdhSMB#+M;OnCI7z+)f~d)pYQB_ux2K2JR%Gf6KIB 
zXXKw_0ij&E1WRlO#V6f7O)eIr6S~BBj7|0n5OS1r=dw;MP#^{%6UWWtjNh5?ukdW8 z3xVt%Y&d|Fd_LPkW9NsUa~Evrt?%u@Jz>MI|KVTJdCo~L%AXavz%an=74eJThw{-M zyc<}t`>f7vRbeJcZpU-2L9sMBNnlshI6xjyo?k_km;5jA1!Bf8yR89)Fxd7m`r|bS z_BnnBGsIAQX7_$qxF{&G-4uw?eEk8kRbAj=c;zM>;(6u&;uj`T_q}h-wMRC<2?v-; z_Y0H)5N`5#)d66VNv<1`hdardFdPK>165jZew`#Q%Xj5GbkMxo?*(cNJk>(!9%osH zebf~Q#PZil>EMHkR8mS1Xo`3hhx%0s0ifr;6ysvFjfxDy4|Kn_FHr7vkr{GF@7LS- zF`&5}*{cp}Qvv0&quV7WeS)CU1|&@*axyYU0K|G18k#an!^PzQ3Mc8S)b?66uKWJD z^Z;&*OiC(v-MEQD%a>N=ZNxNMw0q_BjZ^m6oA|E@tQ8JYgWXz%D3CjMer?q6^}a|S zoMA^vK|bhGctS#lJmDx5Ao{?RODvFp6?DKx(vZB8%lk4O-4VwHY%$_QL0MRlDE|rD z+wTwGtG~VyhFgh8>4@@sj;RUwG&Nhj(g1K>87`*T%^E0O2Av^gUJ`>sU%Ax?IV`N) z1VH^rrPV(VYvu~i*oeF=w@7tf)+W>gC0)n^tbY%vFv0pjkMd!9-+>hzw3pVCxA_9( zhgnZEHxx~bq3OnYB@`cq(- zuL-le=d}G%x8c4#vmxoHvr-~h1ZN6&DLeO!rlug`CYvg7V!_a%wjqa7<@=cXjiraI#Z#etCHxVZovx ztfub)<82c-?|9=gMI8`l!)lKb=d!TC`uCoJxm2hS2+pqqC>$hVv;7&> zvYW6#s5-W|sL1(fjjn^;He$hgMgKOl)$a1(+PK3c?%`o^`^{l<(D;2Aw3$y_Q$}8s z^y;j=M2eJ`=>5AtnZ}rku|D~#*Zhyzfl)KADMI)fTSQ= z83u35;f>N1@c5n)r_Rn&9sRF2I@!liKJ!KHr`VW2wr`95-)oW1KzKv;u%orrdHw%m z>@9$*+}ihHLO`XvK~zGzyF^mD1WA$Z?p6_LY3XhRk%kS3gaXptu<3Twv4L-KJnxD3 z^*8^|8D^Z>u%BnGb;osIcdS-4SL-t}N^Owr5yL^ybcliF+xL-i`4iedaSzPA=-GqC z&8|bS5Z*cbeK44f*=|fiPyK;30P7I9tx6xrX+-snNhrj!+-xTyER-h!j9msT?@D9E zo-ns5BakD#x%Hs2$!?dGql7E12eiO;WpBi0A$N1BPG61&zHkVKQ3azN7 z3qZiT+sgtapBQiv?tT4-atiZm`kA!9!&OY?rfH1TACwU@S*+kCV-sL((l;~9-@Ppu zA1?#HtKeKSXXe^jrTKITV~KWMDi5dF9H@wkWWvlPL%e{g`0hu0ZJEoL*p&if=NT%00BXknvaC@eU0q<vARU%BjAUBtQfFRDXvks^LFqGg<3bbX9vu{GP+!vti5kUq0UpQq9Tr(Uq zbf~D?-saxa`;68seVC?SyN^^tb8Hg2feIW7n7(W0{`@|Q(%{xA0giq$M-@3j=D$Kc z^ll{g%Xr*L)j%aO8xq!0YLFkY zZqNl669dHu6+-JvDye956~@6)eN84Edo9_YNXL!JZJBiGiC&Dh@A z7|m=r_au4mSOma6ttjNZNNk87B7%6`UlM%NaiBs3!eL&2KQPxjma){|RJQF@2Q2p1 zB3E+1>$A)4vYFE7T%XnL`DE<5GAhM~D#0wETwQQ0%-PvlvUvi31&dk1fiI3?H`g^R z%71yBQ({@c{F(WwKBRXpSFTfjT39#!i;HW8nzZ#R5O@IijYb1mz4vZI_TZ+t}W+em44o} zXRCgniajDy=qyVlWq;}IIZ&EvM$qF@5dy=?Ednj$U{c~T2IC!FFux$kIl&8O=UKm% 
zVB`mZ$*RgS%PDnHTp}^iWd28=B?Uh##rQIRZ|`ih8f6@Q^HHoIC#jc|?t(q4d=>Am ze?HfWmLqsKQ+mEjh0_(Mu|yCwYPB)^>{gDV_J+q_CN$-gnG@E>_ZF$!$` zugkI9V^c>k>j*yxa?x}q@Apg|^5^|9_;x0D+^>e6Jv+qf>A@J5$7qlqS6iC)}{xHmoA437r#r?bgKJsrF-1ZOYJv!cKimsZQJ>4n$@qyLm!2(V5 z{sWgU&Au0+z_^vV` zE&vn)h)6kXf!YpuvA>YeP4^3uJOfWJc}4m?Wo@a3={!?>YN3k{U2=4+tda16>9pwF zu&Wct`P@W$TH1wMBPx#}y4;`r4JO@6zk6+&@AsMD9!riZ(n9$x;lV3gS<(nV4zHjusO|dB{%kC9} zV4N&(_jEIMwkNEgA6rn}bcsjgk*J}U`c(c2VJE-qrpA(Pk2Ut%6L^&*(af6k)_r%rA-#GM<-LD zdM)|6q3Q6WWbVq1%=X9aXP-Itak<+R^uz5gX)2;>dESR3hx-Cs=pp~MN5B1yE6z}g zQ<sagt6H2HjG}Jp+UQoqxD9-nn;#c$qkJLy+jy1<=MY+z)mur`yWxTqa-8 zV&LA~uuC=_9t6~PJg+@2t%c`8N+VN4jvRr4j%hRtRcdY$tquym1{gD4;oBOI_exO# z|0PqD#mXTM z{t$Zz3^nah(FLezp;!2D?}UcP4`B_ZTv-L`@mkGAqd`#wMimX~kvciL3qA?3XgmR0CJxy^;*34lPS|NIgqm7qvu$ z;dsMk{jEmPTMIgWA+H&Q*0V78$cCi2>Fd5zol#uaYfhtIX62>=U-%W@%QOl}Bn`R3 zVvF?d?S;pX3%~1+aQZVN1+|?i5r56jHmVfYObz&gU|ZzNo~V!hJub+ zyZ8}Sv;Mj`Y-|F)kKA{<(1ZI8MV&*gO2?|Cw~<}9&t=;>#fT}h)yoox_=`Oi8+)?< zZj`iJEqGPAV{mFXceS{4U8aT3N2v151#CjR9ETK_rNZjrASjWxe^)#N%3KNYGb$SS zX%@Fn&gh9h+(L#$jj`n%t}0B*#f(AwPbNGJYPQN4?j%{|Tw~Q~2{#>&6nrZq&`+Rd zHmvtqy7Mt@R#DJ1={Z#5g3(LPSQ_@Y`Uhnbg{wRp(`Sz^Z&P;9HAA!B z_MH!u2)(|DK=}T*;3pl4nH4~eaP14CCC^j;oW59$hy?b5o zQ4r>LTu+U@9L_4HR?=hZnql?Ot>`i^zFUH|fn{mNu|^5;LJxQMaS8lO2u zT$PgOZ#*3|&FV!-QM!h}3+?!s+)qY5GvU%M?+b;j$NcJ=Ybh1vvu8I|b{;5R9(>MO zJ0WxwGg^8ajuP9`Kkb@f;UaL2XPa2Av#8b2Ez(l(zQ(fO3ptsjk>c8UFf029fZ#Yq{+3Ae2R;J^mm`xDl;9K5B#A~rPg+zmxGQ8tCAKf36W3cmD1*UN7737?1SeEJ2LjN`is}6h36vXvUg)6)^m=>W&Iv#y!1R8OqxnS66AZ%XiU75?bOBdR2T_V2hLnVRR@R1Tii9f9A9^M;gg|VQ$=4F&L~P4E8!lpO zQMm<^;XezaunyR8kzsG;H5xm2S>u@_a?$F-^ATOy90Hh~? 
zp96JZ_lceKISqPEho4!d|0qsvT!QBcrx7v*J77d7Kd@i)DP8wr+IQ-FUOFKkq#okC zKAR)d*)iizPJg-IBC5H!K?lnJ5+p?6uPv@rnoP~(r;=kqb*AF{cd$rLpvd?km zQb&@i6aKXDF=L5mc!}AC_7rb#vy(uN9YilYU^~(_!6k7{WOdGiQA}iiI5?P%LXh5H z(nk&cPRU*BY$NWB|Bz+)^|v*O%yMmt_6&P^^X|S9sl<(YRk|}Rqj|*p_OuOLoQhZI zkym@7jKZLT8G9^`a^2Q`}ZILs_e+sQps z&^{bg2s*|q2VW2lIXkegqa%-S9?Lh}yrqX} zCPDslu{^=Bz_TmO>SD}dRaMj3_^l8)OHa)9T5X$H`S->oH}{iE%)3jNg@(04j>@Uj zsG!UcEcWM2sq!=1d0Y2K0Z$Jb3RbQoyoPqw5W+y_4KzClGn2OLBN&1JQV?`PuC7~_ z)K#+fAUC3QrBv{h!-7_r64ww*N(CFVNG}M<%`1jXUq-ZrD28q3JcQGue^i=qdDmB~ zk!z)Bi>ov0T~+EP!j6qk zXixcp+Ew>yF8s``u6W=S}ZU`cZZ!T&DDwY}oQdw?womS8*Z385x<0TGl#G zTz($o!vxShri!iHHTaZ{eiXn=7%d#y<0 z**E|5AgFE4L)fk8n+pqT$Uz%2Xlv6O!sFc)J#0Vhs8X$V${UPwt7+udS>`6)ofF8{ zT4Ri7oYYSsm;6qDFxOx!+;Dzfg_YMSqJ#svih3_*Upsk*b8+>&j&1VgnPM}q*sn}K{Sfu^h2BQ- zNX+^sJ5JrhK5T3-tiVxD-oxh{s@RAL6Eu?Q9c@xDlt(-1zl`P?a!>8vS@q|xcf#aF#u0ASR$_YB z%h#qoPmJW14A0M&a)kS0waSb13X0dsnIq*y)emi{T2wpMpK6k8jf1JNEO(aLN@agI zjb8%x=uDsSx>#q2>S!6mYB%wkdBP`Ixxi|e%r(QNy;2&s@t(_2#5)LEX*yBa5PBz7 zvEU>0YFaCd^&-8bi(pJ?xBy9!c$iopa?PB<`5=7UrT&4NQuHSAf$9P7sd163A!KmJ z*GDMdTJ%i)#jMVAjhX}Ru=ai;jr~oQoCr_XFaOw`y1l49vul^7;336wEPF>Gy2A@% zb_4a}G=AUfT!E@+6}DaKaxr^%NAaICTz+ObryoF_NCecHQbhfWesXjJ+FT~tevUcg zo^*QnE@+13?iV%(#5D9fMY+0B7?Uf+V&dt`snO>&@a+hU`#6d1XN!ox>|V>&UadbX z1eUXeZGY|Fq|+My#*q;`+K-WcdN=4gFQS2i#}7VF436eD6^CX#F5U-F;ZkF%?f%sm zdqQ6U=h3@JggJ3EMA`+HY`fOnpGz3CqwwATdWw5wvVp@@=R(uvUSxUDk?g}4kbD)} zEWsz_)x}z~*L@z1VaTrvKImFVSPa1d&(LG8pRt%AuTsrZ z>o7jJvH%;HbQ5d{iCk(#{0B;-+=`Y0MfJNsY;ui!J_7IC%Gp=!U^5m7JmDm5YH-0$ z*`(;$vE8U?u$K2-i!x>$fqxvG1AP0zW8|fy&Z_?A{A;EJHnEzMOSeRnxI6Yp$li$g zr|ZN4REQvwb@=mbf7RxHfqCG60NpM}-)HlbN$=`?2X*AmTs50ixlg(0#0I5pBn+;Hp0KC5W5$kR`Vw54iCBdk*QJm z^{37=48Qw`2!;PKV8{$fV$~a`mF`LX zTbU2<>d(A@fX~F+!4MkCUv~c!%%COR(*1D&XB<;vVUZ<-#r$D*G4gf2=BTQ9KnsJr zXVu{hu2R64iIa;IaQ_bg(Tw0lBuMO3>+6FK*2=eM>h;IVD7QuRUm$h|xqxzX3h=|w ztxv@w;{FEz9{USOGX7N6lw+GpkuDg5s=md$PyaI43q{>OHE7*>3(0 zCi!ZWy}$VfF@?lTJ=ua(m=fsRhqjZZ@HW|35&JRHWIe7l!^EZ%mNo)iZt65mG!isR 
zM~+EpLMOwEZAV0piMt?>Be%qZ=~vxVBzMS?AI z{4S^4`?EWB2}Fj9HHW9FjoZ;TaA!-LoRr$zM>2gamX^DQ0VSF==XAQ@QVIXrrgato zzx}AS^>xPN<%S^sibTU_5;iTdDV+VJ`05t3-c2VdaHX2CmXfNaWk#iWm}~sZp@omQ zmw=cQTb2x+a=N1Gc&b2=?bJ+GtifG;e>iNxx;$d!Ys{{LKr3l^9Y*Ngds=uE$o`d8 z`r00m#rRqXXJqR;7E>Pu+mvDQ+SW4R9xuNH1V<1mgVt9w#k6zcV*R$amt(uw6Te-y zwntdDw$Ji+kZcJ%?H?Jr=8WjEti8Ham%0j-sQpHr@jlHr)sFE{tmWwpXaXxBL@{qb z?gqIQR$#Q#n$)oH&&;PNg+#yZea;fEf6r>MWXiXYEO+g{-^E?53O0#>b(e00~F>G0W^{R42kXxBdQ3>ko7{(Z?u_kD!~y9kI2Q871p?dSMdS(A$+>IS+D3pWP&`;|3=((|@hG-JZU-SP3x z-_5#=KS>%bsXg30^7ayPvBbjtojN?5%|6YK`7q(NGnG_r{mp^7ua@?VC_Y737S}*c zjeNI0F4p-zhvPjIAKh9%@$#5GJ_D&wIN}UjW>Z z^`36U!xv-a5`Lgqg-zWKPN@Qn(fF)9R9&WCV7~FV7y+V4!{5<+c-!~lFv5PmS*6jH zC2!BMI}X?Tm|}C*n^}MT^hN-Dq(I8o0OIjfb@1%j$7B^|Rwsx?D`pJg$Rc5WZtSW0RdY{J%*>|3&>$Ea2G+?0E zL+ZjoJD=3*L9kw`|4%n=J0z#JL7&K__^49Pmv!#*X}7)a;0BVF`=Bg2=-nEC2IFyR zsu@PZ!A$h!g}Ld$x~ugVjVrywQ*&aMySE?xU#Az%W!{X}T00?Q@(>8G%cZaDWQ&!6| zsBWmHHYcguwy3kjbKiAq%r@_d>cfJP>o1H*JO&kFqZz~o9!y-$wG`Ery+b@IN)?-B zHclQpI~Msg9z~9xN_`=3-iUSvP-%xbCCbS6Q!d`R4Q@y@4*0`bzpAcf@I(mTC6eNc zSzh)oP`V6*b>?_V3lCN1NQ=-)_=wr7cIxK1F~u+Hl86rgLyKYyfhcO8bkSe?{#+2< zhdjm><)0yO+F<-x7@YjgLote|3dT7%h4Sgm0?3JCe91>ohLp+K+0pnuH~BclCbPNO z{8#{T>{bOn-gSI&vbJdw*C?#xwR@Xh`~{dJ_h$@M+90)Y-lOV4Onr@W@*w63$E=;G z*W{UJ*h{|mOTrw_Fs<$1+y*A>Ubt4-mxcmucVLSZlDR(^vqj?%XinYebf2ga4BysU zNGG#XUb*XO+ptBe#}i2~G{Z315y~*8UyPBI6o!Y7W~JtjI2*6H+rP16@tB0|q)7S| zAI9&@u(oR$RPKR=F%0@>aBy_q(|mF$kF-*i5x)RtEKS^oXQ@#ofr$ zM-qC4#Ai!*GR#ADjBgQFSfdisg}&aw0hLzYK&{dZI#{q%oq4Qu!g#lyPATU{bftfyG%RJezsd{eS!_WhU29NXq^_rEMAgorolqL z^k^@xJj?eSn;;Q7w)~+-c>k4dn#EA=a{^lHbZgvi(1sA_Q**2uy)rS#htSm`5=+Y* zDKXFCoQ<+Vq%oc1!2JN{xAa%71ieP$^*uRxYmp8}w@|~!IY7l+CoKz2oo<}{ditVJ z8Ru-^u4eOiBh%3*mp8zK-_J$oxlu-Pwpd_e>GI!-9X&O{^U*N7j#_Q}3Z}XvFLg^O z;&k>5xNB*9^i5~iao%qQqhC(_tJK(H?v4CTZr(tNP*NkDX!J=V#{CtDYY{f_s$l2X zt5nV;zq228Qi0;Ya=nJV0&O#|#8rO-O%`+s5CXFYIuv<9*FF*$cySkis#`#_!{H|HZ#X^sUQXQZ^}leEPnLstw`p#L&s{!&@f zOHs<-`$W^<4+LDTX~E>Zjv;&L-a0o>Zx+#%p)W<$m!=7)w(wJyg)Fh8rvFCXbB)V7 
zv!Uw@AI=D9ZX{(!*N>Sd<8Lw)Ts+ptBtrCBQ+}5*=y&^^CcqSoCZjeAT%Dx<0c8@6 zl)|S;QFt$T#^2+;f$f{Bzy++w!gAx_qy0&-f;JQO8;_@qj<;7dg2=cFry-v6b#)fS zyif^xT>#SR&=E8Vy%Nf5HRL!s=M-hvAGr1O89cJ)(l-`zA<67nS6f$R%Q`ul4I#J{ znw=;U*7sj7RI`?$x#%HX&FVHjJ;iCw!})9=5|6&hKecKTNo3f3sz4!&I6wB0dZ0v2 zO)ZhNcflc|{B^qcPtU2$+4WU(9i>~BM$&`tSqF+)bwjr?Ad0wzno}!1E!keIujsE# zPHAUuz4`1VL^h4ilUh4Z)o88Nhp#N0D<@!J|HP9tfhcsDeUn)dK(m>(RGt{FpNID} z5$5t`lbI8#vg|XOVLQ{rR<*2#8sNHc(hBqbthuIdLqU<{fpd|aS|bw^>m6YQ1cYh! zqs5?9*FBViO_N>!Dl~&L-?<}-$Ibsoq{LuPj?{k{(AHvI0&^D0a)JA-WkQ#1PKGXF zn=|#ZnUDRmNw&TO63(E7wE5zO!i+jto+>$<)nP9Z5tTbf$f|At6)!Xj=A`n@vP9x( zW+&5e!TG^gbbC{?-Gm&sn&%Cup(OfTdd((|M>l?&Wz@s9r5I;Mj9N?S_lk5(L>omt zJe!S&uS-uq4=RZb!}+LsxuDVMlsub{aG^^?gcS^ALnuc71-OI{KC&(kY3l5DF^7*I zT7Z#VTdb>7m^>f`TO|!pB(8E?LJi$A0+36;=sSD0VdTiDesdjtr46Jr2gM4KFJQ!| z>;r$Yj%P3MJF5nQ5B@shXo_IjG+3NWGyd=!ABtXXli(9Q1w8ljjQe z{n>6&g@S%1ISBC>4P(Q)kw4f{0U!~W<`WbEY>o6Jvr{#a0h(2pR8oi+Ng6Am>D8Ou z!e^xsbr>N0W>o4>F5#lDqf@$E=jd66M2RvGX18v44L0C(lK)7{lQB?fE_e5Ysp$?OkJn{;7V>922i7LK1U zU*Z_@K$KHN$YmX~Ma1Xyp=b36((k)bld23}o>l9A&?o;0^WDRskHZ_ZRKN2$LKOx- zz9SwQunT(GuyRXML#AK!WTs?hGRolSd@F~44d*Yswd|9OP+Q2J1EhCmQ}Q7yL;@8> zK_gE&U6@7LC~vmd52B4r&L;AmZSjsrkSa+OU4rT;Y56*$b?MI$*7i_JB#kQ^T=18% zbiLhuqGg_551g6HQut;1%AGD3J2beJJ7nd_S8J2E7g7OQjTm|<`@A!@EpI{pD8#JjW4VfigASn6pf$VBT zBwFsLYSN_`86#u|MAR6*?mS(y>}zSFCh28rPAQPzcCjMVMu}5e|#Mk@wkY< z+1%k_ky{4Sr?&tVJ$-5=-N|?3W<|ALVE*oHe8Q0Ksl+wb(U2hr;-M~~U?F#g9V5YS zxVv8gEtn10nl;mbsY!`|Lv<7OAeB|z;|FuZ|4fsesuBSoBZnbr3>vL&B4Rv;2$WVp zNg@16Nxg=xt`hZSw0s5;@P25HxPBb30lW?wj@Fd?IP;fGIYyua1{(DKAJp1;da?Pz zD&@)ib*eV!oKu|!1Ty!5;}LNFv>HOOewN{bdGCrk&x3RaJvK)AN?@a+Hyv3vql6fbTdfBo5P%k^;oJvG=(bf3aYapX?w~gAV`_%@Ydspb^RT*8C-3RoJ$v!k!Igy$r*r z=}#rtl6bbHruw?4=Cj%Y0IaAGR*(hjRsZf$ps3652n|7WtNbKdd?ns-4WsR+~1z8 zOawS*D!-Rf@S`bdJ|>Ah$?ZOm4s8EFQJ^99UnuZuw10ZAO4s#m1~le~N6w)N@i5iAV+J!gRv6BcUdzfho>)uum;P3RXnH~PWw z7Y@{WxI3i42By#k-V0CkKgIV~J4}s<`C)hoxJ%}wsy0B~7Tx8sP)RHcc0oMUTerKa zsx|4-@Cu2_ft6EfXs$nRU{s^+0!B(|z# 
z{$caJVXyy_$G-8MZ(|z~Iyzhnu7mCj{D-LuIRrOV62ilngYE8rU240#6yN4+FNTcH zkiIpt@%(Jzp5%?A*BLdHU!1?_;*b^aNMT$9TgYs9H42VM>SucfM#~%jLtNJhKtbIB zVAJnnr=rnqMHtrpBsp%0hq8Msr=ROnf_OyY1%|W1yK>oIC=e(04-{y4$h68tuKD(n zy7cy>bD{fQ1;NSz&%pHHX($=M)gG|8?Cb-eK$i1YO3_Pscg>Pppz_=$ND^lQzij*e zuJiN{imT?)F)oaB*4X7a+g$L=lk+2cc}3sEUFFZh-#L=(mk2k|!yE1(gIFihNB}hSRJ! zWr4ZPP|m`}&t0uewKZS_v!1Rg3qzUK3@nhMg}7TTczV znSJj6+^@Rfd9PTOVgT+ild{e;q+K+7`i>dn2|QqbnIUF3J_V zbhOn;+?MR$3|?jt>*v{yZC7k>F9GZ=1$Gg2DZ&I``Sj5r{Eh^{gBqlqT*NR!cTV*U(GVaWkl+iM*T-^g} z$my1&`nq}l{9%vpS|Qf|;=rT$zi?n@yx9wg(F1@b8MytAMx!ywMkcG7g3DB0XaT== zg`&deOsH&`-^dH%?4p@a$FG1O>8fjwhCv+CFmQjt1$Y3~Kj`g0Ad(C9zR2Y34J^o4 zljm6IUta@*=um>+o8FYF^*u8n#N;N8vrq7B>f=HPG1JSXJhU@fVZ`#ns?MAcQw`U1 zMz66AKd8|kZ#Ei2r8-#k=3>&l@l)*MS!(?qjW^UnB%pJ+QUAbe*7OhX z^h6ooUdud5tDyXy#QnizfdhD=DPyi0j0@N=3@5&I7ySG# zHa!1Z@DpQ`lm3Bn?JqN=!_0G)&Q8&a1JXQ2X`oFyMSP!fk>TR=dK@-y^J> zrw2O`^J%yuho=UNcNSu%-oyZ^jS~?l9(Y6O{U*e@y?(>0RJ>?4bj2P6jPHqtNDALx z-SyR4dkidr%XaGP(N^tn1UpSuD==S+(j->k3kf(4ihuMZ`j6(w&=1pUYn9J&8cTcW zOj*pME4})hlK>k!s(<`iaE>*g^sY})slJfeK;|`=rCQVY9ZT{zK|NqaM`~;vLBay? 
zHc`|kO%(F7_NtH?nU^oELl^5bCMTH`MG&=vvT4#JI^|Wz+e5f`ZN276>{S`YAfATB zYx(_1)~eqzKhlm%Sbhaz#UmQjqslgH4R|Ymy2Im{Y;L;AT(#O0zgi=`D~$ygVQ8<$u1|h93}g zV~Mmz>(2+@7HT|yeqlXU%BnR2kdyNYL43lofOn0yR+nN_Ct7aef%+kym&yL}WTiQ{ zrHH;-F-6EFK8Z=dHZwgq5SUgyR4J7AOd%72EuvGb%)&CGAxr^2n1^oMJI~S<3r=YP zL>2To9l?E&PQt!mWZXe6un%hb{jAYWn%%UMKyc9@V&J zvMp{BnjH~ukjj50G=*uh5HI`7p^1mvp7SCnRX51G0om&TEv@Uu5C)uq{O06e6>btB zs2(l-naun@QPU8JVf9oT5_Hf+2>)OF*)QDmn1Kit_@HKtP%-mfuV^HTtIf3^a+JK= zwNotDxr`&3k7ofq?(C3Kuf`=7oRkM!TEYL{{MoQR@}uqoa}j$Oe1FAo@BzUkXnlJk z$b_2k3pM?3md~CsB;+H=u*(G!{K=u!DVzO~>XgjVehC%7ds<_68DKy=Z+t5FU6sss zz|Q61;F*$FBOw#7O7!vz+&8yDWGP)J=z*-z!K#ENVzr`+8ym}{u0qH>yXprIjLUOL~qh;g|V2F@uQ0R)G;8@ ze1l-pw?Q&?B4e{3oeh$gLK$kCP!@P!dK}{~75TBLZdpl zm1WmA-t)bXzfGn)hx~AYGot3zoe2wRl0cI6Xc=_r;>gDo@6cjz=i=s!j+dh!bzRk5 ztuvZVPEys^zzBSZVt%6sjIt+jo~56h+xwW&yF2d#b(9HFD4&(33M7ZrJ#?JR3o(Ua z>inqZjTk{Ct!`pVQHcjHRTndpfwyHtXkvWLOLi5As8ON!;rd*(5`svk2Qo52dFz{8 zIuF8Z{(9mhiahtF&fXPANhij9ihLgk{HrvUYFgS_6tz9^+CH_^yM6K}>VholiJ01F zp(}QNes-oSPJKZDhr^TrVnDC}&Wmo{IqV6m${iQ{CC;t~+F z=}RR&gpIdc5i@fpAz&zO%=GF>ja9qZawHwLzLud^sPkLt_Ld9&w2V$2CMK*XL&c+0pOcoT5jo@Ese&HHvChXy{HR z8iD8tl|bO69HoHU`}9_w!&Cq6y)HtDnnQ0*fgm zKt{Tgf&(pwCl%gxY5DScs=DPKN_Hw1ylo(^Z+27C)wLH;e5Mj6L$k9PJ?h%e+q&@C zp0$T2y6ry#$Z(+Kc&MGTIP@6j!HXSK`^PXk9QP7@&1YkuWnNwu{m?bT6np%v-sUE4 z%40$@T>~z>S0gzR+L?f<=kEKx_>COZp{rJ8`G_26 z6RTopY7@Guo_d7ll{Yk8JviR@!URr1FU8Az?X0S*1p54&)ZxQ+_SI8)@(L=%7RM|P zQZKj5%S7n&E_D}&4)$>H!JrwGPiz6}$!w0yNu&$XI${SoNu~!T-9w(%C>)em5m}G) zkrXJo(r#glhw}Nczt;HjI~l^s4q$zo@1g<`CuT6n49nUb%<+~HL_7?DVq&O3Q1N4u z18`7WIJooV1B%Q|ezd}CAKym|t*xue&Rc1ln``T8)>^=P+aMFQOSy4t06CP8T^vTH zAZvvb%z)s+(O2I(f`|~(%73zc=V!$iN)-qq!Y|nc&lL)Hwqpm7@g;koh!>I%vjaGJ zgg;Ln;|Wp&iJ%R3K9-BzmqIff_Fv-R!#seTll&jWe0Wdt=us&)!+t*c6rf%TEfp**x7Y@|yF%9@(ia$C2;6KB+LP~p5FsJ)&7?FlWzu*v_>>mo#AM@y2^l?XWMJos%eLN zEw=yh&_>^ju$2{|ND5C@H?hvni0aYNek~1Q_;PGsSM&0Vm+zqwJ7%7kxwrU@ek*6O zh)UM7$A^dRE~z;Yu%&X`S@mNaDQ_nyy6z^mDj9*2$^##rDNPp)TNii#^qBo#%i(7H 
zr5MTM>stSmw2Ta)mun;rKz`DM)XfiaFlj`mr$puCNFR`IE-Vn+q<*#7_|V$QM))8~ zhm2UZ;6Tge?v5ukq@kfs!K2(a41zXYF?R?3rHZC~PeiCfTue<(hpv5yGC2LOJ`NYG zsVu18t!gtJf>l?O@9!t<%p1N=^!dX;1Qj{YwI#@1x14Xfb;s82T#pnb3V?2#=7L)H z%-R!na$8l^#g4Go8ba$Ohle94bJMhHC53s8n)j#8gy=gst;c#ze)!y_bUWTAKSs~d z;Q}7$X4Zr&qY(@$lxYJ05(G8nV-SszVPs`pmT9rp>gbL?(RpG#xV4$w^@fwHJVp3S zhLp6(g!e1pcePkLsbp9pf)U;q4b(2%cggDLr1y3==u!}cKpw5Fo3P{tZ?H+g=n#IV z`Xx9Wk7?QLyBqgJosUK>XBNL0$ziF2?)$jKm9@3~?|FTF2L!4Ni(m#kieL#)jHPeK-Qz^yQTXee5*fE1J!^)Lq6Z0DyW#B_oN@!Cf@_j7FR zL?zu=W9Y9;PYKG9Bt8qqcWwqT_6;kSTifsHRaunL9>Ju2;}aQiHO19tP1X)MIE>+6 zL&~5lk?#9C5fP(@AI$oE-nqpf{^ZSD$aS*B7V-6h(knqMFfY_N-xe2-Qc|uFvFgc0 zM*3!|TW+fk537@t6{DgyfA6YyoT}P;Cu+dS?tQKKweuCy|FI8EMEmk=q@>R4{(~n{ zG{u=OtmL}#=jISYX@yZkhJ6PMT025=y9h`s#@+?`=Qu zF#0ZI*j&MNfrl?J=4V+_wY@iK*DoX7hrahsYyV4cxm^0lL& z(ixwa%b<$H)A-!-b-0#rO6rVId?&X4MsBF>olmiE{ttlR(I12!+`h#PB^i7$IL?;H-iuyGgB{Vwk<413)C?@iB zo9u*Miy$7ICn6Q9@?lmL96z+**B7qN9XAB_S1=} z6bxmHx-iVCzWR;fU$A`_Gf%B9p9!xP4)o5ATmei} zZAc{%SN@P$QsnRlli2CpJb}M|vP|N~j~<>W3hqD!64+XF)A=lpw>6LBLRco z8FC?HYmqA5eiH+`^I!OER%UF)ev>ghLWZ*^bliZ9V9x?Ilz8LTJ8FixF9;ph>@-Dp z41ySxcBSaPwZ8s_+troe^kE;ij~-U5dJOG&41;Y?ib>zszpdIF^9hINvY;Oh``U3W zxOgtJx?)(rJId)g84zEty}P=$_Usw!+Dw??_bMCdlr&%L=S$!qET(S|n7kn<_>gE; z!c$e@6wNB}s27n);_oI)Xf(k9TYj*g@O;Pvv7@qUNg(a%Yba%}IP88J0u1?YHw<(U zq+q3=Jkhk;{E#MOXlZ3t5!H%spEEjcpUrT;Z|YN4E|5yCar2`idq?e1vTSKR3hcJ` zc;c@J+3dl4x&nHPbaAS>!(<2zn<|Zc@jRYskCyW5kW{!_!neM>f`Z+7!`N7^iu_et z&k8+om^#eZ{w$1)%uJ2*&F#?`apC3vPiQ7r`Uf<_*8dMQ+kR;DKTFYn2OZast7KSV zWb>ECV1fp5mo923JXaDoaXs7#B~0=BM4eV`Do@I`5FO1S^P|;5t|Th-s;bc89Ujv+ z1TooAsA-wxNrP@$VOd#{*7t=D#)d|*#DY(s-^SAO3Os{U#VX&VS`DfbL8?_VhLHow zP4e{`T$$M$8PWj&Tzq_Wv}_^|H(8RiqbZcaO&vZm;RiT6>+kF}wz>~yv_^&qq8Jyh zj6RE>1v%e9(%&p06mw;Z!z(Wp3!pEdvT$u?SqwQ1NVw4K*3^iQP=7(5K4H{ONctO+ zP6~fZ_S4)Y2^i<$_W1?dq*lV`2ZD7kNu=RM$ zy~E%~e<2a;<-w=rAP?X!cUd1j=NUdT71@!i5d<^ThfJ;_dkg4 z8o2I3EMui;LZQ_?v-4GH&+E5%h8o}?U7r?9dFa2AEnrPNLzno(JP&?kIB1xKWQna$ zc$t$%tx--CT=BF3xtR}Uz1Ta%_a*H|jvkCttIV4Nqsg(*@ 
zSC1#$E61u$*b^yC?O16!jOvh}kEMn;ZhEOzqA^hu1v`>GJ$L)I9|TdSpzEWB!M}*{ zNRNJjMRgy0D;QhU%A3r2~Lv#2y!^42-u!9)S4YYO|E(0E=t1^w&6#CvD8enpW1y}Ecl(yQz3>72D?3Z zB2M(|)+5se3O+{#VhKHqSj!pAb+${{0@)N&^7z*2)ANUqQ~1MO>#{xMf~il}mm2M^ z%AAUn#9`;5LbD}=H}GKA}lW(qinV@CSWsl?d=T)vswf!)#5DSMG`wRXHOpvxdJQ(M%?|((LwBF?vU?e z=Q_}#ryZE|Owaw=6-#NNZz4cTE5^*qs≫hVy`cZK-}&N)XY7G%Ymo`E&kCE4lr& z<$kicS?BdPTaPmE-ja6++0Kgnf7P93Sd?4%?h!7uf6rQso^NoN zKi6b?SJW-CfSg9E_VF?fXMo^5j?O+|hd<(ZcQ}o` z-QIEto0i5?xFSk*^;d%cK`&@+k)1KnI_}o~7^7%mB0 zPy%D-Ehi+lV7E1VW^sP1!3Q6JL$Z$^-S_fh({6-7{8Jy7aDR&>SHMw&77yFp8Kf_@ zEV^#p){!XT5X)-$eO?Mk?+DNFn~m3C9d55YfX!kcT?$qP{jHSXB)h|5%DU(j0?U<` z5@4?I;B&vk?B}6tVb#7edidIItVi2BX3u77v~^CoMg>}LCA40@d8DOKDTN=FS@h+D zrV13ZK^w+gx&D#oXuqqzG!+c4(Qd?dC+mq8BkJm$MJmYVibF89>>A6*!4E6Y!F&z$1(m)XD7=8iXh?My}jy6P5ahen%C+jp?SfL(+{-3@<0!A_L5{H1NqrmSceJ?Zq2pN6eTkUZ z5XyhM+_z3nS=n=?*5GUqxyTb~^&y>>|Kc;w6Hxmoe1~m}sGZ?z~ z!p&@#&P7D+&$=$<>Qt>64#;BkeddY8w76@?=e`!k#iiC}^DHA$^Il(1|6KSfHB-mk zwPL>EGL5;T{6V;zi5H{c!RmQq+Phw<(9iv}Mie!}P&yky29=VX>B1TtnBwx8w7``0|xA z93fivgyUi0$vXth&&}oFu#1Vs%CM<^iH@xZCa5!OF??AH31Uu2(Bn5S6XTCMFDggo zM2ar9=RDv~6$q+!GfWmtb*S~$V%Nc@vmvc{bv9cbk?xIEUJDKLGBC5Wf~~om>Ty!v zYl`320%gdPwtJ~wj|@CKX40h`Szm{Jozpb*lft%eC~i4}`V|x<6pkOcB0R>d=r5z# zsbeWI#cqD;&7t>#HwArp!%l1VDpzl~+y24$d*{j7ehg5?QSgX}l&7WLHy#U0N+Qcq z+zXwZphZ*PE!t-Ga4o!-5BAX_X9!k-rdSV;A3H@H@(3=(g5j6yb|qb%@Q8uKW;<_> z>d3FpX|HNK4Qg1dmKq>>bYVPuVEKG1TuvBRmyPe$)zKn-^%-+}nqGb06e+v6yiCt` z2Jdh-kI_BbqXG9r@UYj-lX-uymF4UIT3$Xe^+9vt*M*W&il0?4P2;P@r`Qocy2}LxXJGd3G1(x$A z=q0$=HE^8Z%xT2Ll2h>Qz*!<)Z-;i+0lrdJpdq@deSRgz*S0D8y|1`*)e!AX(lD!)RH2jK(}yJnJi zT&yw{M+%?d4z6K}XrLy;Qc{XDT99Zl3m)dMt~y*CwUui$r!B_8r~|@xt*I zOb1Ixcgnfu=Oy9vFT5FRa4z|`Rgi?ir?6G-dc9T7CZ>_D0uxO^2e7}7t5ZqqSNzf& zw>Q4yGE4~XkjlZD!|Jos;kt68Q^@MHDK)7x4sne(YPNDUEHlUH0sEcCKA1Rvi8yCT zS-Er3ti46~>y08NY38VNwnEXDG((y)+yoe#$}~^PQ**H}5v|^VM|vLzbUP#Pi3F-G z)@@22Iy156R$f&_FcmO;d6AdNmF8AUJGb0D$hge2f^;0gQj(YDY?1T`LpP)C>I;dC zHpd%`kgc~6K|yiRpbkpamx14sC*SXI?C2Ykwif!xEDn={<0s|xoE+EN#mg*#QnU+E 
zG$~f1jQ$o!oW-Pv-PFmjgi$urAVWDghz}?K{Gq;t0Yb)FHZs8zq?+ee+fyA}K_%zq zt$?Wt>@6&8yVr$YBNT4=X?zrOgC!+L8NQ;>SC~ zp!Exo$Po^V(jCnz2aB>L18_aaN9%=8B=UC z|BNa4?x%7XbXD3ra?*@AMl4hOstUjpSy;|SsL3f9(l`}uF?CP^tQIw?rebN&Im#f5 z=eHH~YC{fJ@|j~hRHp^#jBZl+W+ox!4_9WaOV5#ZrrQpd&U3xkF0Jo*jRkh8`*Hrd zr5gt8zNk_<1JO+EA3Wyr#Dcsg+?oSgUc0@1HQ*|?zclA)8n?YgH*8#=+rWWETm(*! zLNH0%JFs&JDCM}3hbKW8{TV6Q>)%vL^eqn;wXF!63dWp$u@Uug$-+9W+NPmho5gi~ z-I19&)^QoeZm!3h(fP5Co?x!};rInGNV4K_~|V2USg&A$`5*Gsy*2GAX(OJN4QN%r56SWG-2b!{H*u$F4V)z{iMIwdF3BuI1UTPUs}3sZ$QCd=zEzK z^3@JR0R1Zc0m-+eM4p&x_DPLTuxPPxraqx$oum=Ta`gDdrYPdoOCW;D87y$vBb_hP zUSM)`r=SV&7v1&SgG z`xM3Yn}30y-Hb{4Ge|LuVldQQSknfzdJa%Kdlp$tbiy%~TrPTjG0|^cytZdD+~wc7 zGa>rQkd>0qnbQCrd~zQ+`ZcE0@}=9=@ZlrMT4KSL=-7AQKQ$2q^BW=8Rt zrSY$OBdZNnGICFxDXpvp*2{__)JKxqT1oAFQ0o24jnAp>-@bx6X)^Y1blOLktoESU zh-Y3k>Cg7;+{1MGv{)#at;hI@*~3gON$EB^icW}(Tq9(HVLxGO7T}8$M7hREIzu6H zM)Xo8^(thG&(_kVoP2`VGTOpOM*k*nuG&=i2dBpq)wC>+Wc2%Z54YSBy|?^5S1Q8y zr{@x6UyVAiyGG}gpq+dHO%41r=hQJ!XwFRxWzaXYpt_zlMEIQn_YqT8v4MjIcD<7qZ*j*x0fN?FqYxop}2P$cF zP)VDf5=!2C{;Ih-W_PzDD%q?hyD24i=3q;C*lQ1~*c?jvDYYM@%C|EY=JR^#6st`R zb}bP32AD%8_v9>Jz_aoTt9Cv?-EU+hAbij$7yG+g7i43i_TN-ia(w$rrt?7!I++Rr z;gH<3px6@o-Zw%>uT<)vpQ{&AF75gL`09VSOy^EM;zN@3P)fU%1+qA?Bdrli4}nYd zw}=crN6-?_&Ar~)sd{Xys~ZnOg1CN4E=kgap39kT6?Wdm(8n#|)Lq{`V|zKjO9b8=%GI~Z02gP!fi`QISdUgvV@R{ z9bV~XxsR_}Mo({!P;A=v|9pMNIPNvTQF3 z(?F(m#o_>&Iwv2Oj~7yHZBsKdgTry1AFg8aDw&iyeHa^yg@l8E2RpNOhgtCXrZ7U% zzyNG~Xh2+56-m@BkNJ)A@Zn|~y?g;UQ2rUYD@42UrU^<(cmUdV8 z#POo-sNou;JLS1xH)tvO*b%F(LTr*k4^UV=J_CahD=M<)DAkgG&Unh)bjrDEMKxpY5nW&f9J-M0p zeS8n{Y1hpw1)PnGab7TNRbohojHBehdai5dL z8=;G&J}ntr^ZhFsYj%s?TB->9#HoL~#CxCt#4~eMEIO-%$H~~_!p~%^icT&YnT&B3 z#(6S|wEUl?VsHABD;95bCAnWMTHEzGE!WWufH!x$j_gbsxE@$U&x7@MqkeWT{eq?n zM|q}sWRi`2HTy1irOw6lNj$b3ALT_h0i#K-DgM|+BfdEB*v%zW+xXZtN*Fqv;|1C3 zeoQPWs!Yx&B?QHl#OqsP{L#&t;)VQL)t7H;}xiK zWeQo60Rfp~K#jgviTAMaX%p4YGGvW4^r8rXrzcuFzp|(w*B%JCU%zsTkq81&RG2S{TnpGqvX(EtmS+hzT535JfhzLV|z@s;#zllA)H2?*UsFb%dGUkD!cMGCJl#mV(mq4U_UG 
z@wbsvmQqKb1)5f!X_b7S>M^<`nfk=^C%HtPZkvOQ>^x~_424535fvf7P%rXL&KE4M zZKQg4w!jQqnu$P#SVp!u3Pw*$ARM;1CLf^wT8Dq-CI1&PdaXKxR#B_i-O=jqbh%8} zVG2Bw>kE-v5crmB;`3T#z7(?3MT)wu&OXEZTBPsJK6fF8%~YX8*o93*#QEqjAt@=4 zhljm;?uQEgLN6ut+`?ju8Qbm!B>RZceR3gQLv(l6%zct#mE09t%8dKO^ibWYq+iz;k>(`kkg7|lWcF!g3yB2ih zB%^p4gQ@j_b9#04T|+A0H43gdu=GqZ=I4)31`3@y7Qa%QQXW=d0|+SunNi{DuL~_y5k#9!wiYcWgHHHtOB-E4CfH(w#G_Yy})Af9gf{G^^x zVAw+@Nshj+(NG9{?U%a=$3P4PLOhfcCh(=|rCHN^^yf-APvw`M0VxWUgowuo3qu^_ zwS})wH-sca=$aP+NABgbILA*a9wqih!>j3k@@ze-+~VRa!0B1c^cOQ08|g>I25;@} zu0S0_JZC@z+}^&{aO+=y6Yi`y{;x%CnP>&rQ($UbpBZi6GgDSt$b)yb#@*6RUmPmW zd-^pQ13936mu|^1##w^j*E&{G`0aOr%4-f)WMr#-68fRzPE2_Cqs4ygVv<;lW7gy*#C30zq@%-?TwqOKDqmcYH}g>NDp^1}?xO zS$JtJf`_<)|Bs%SbT2#2>>NhDMuKyCOf!1V+&*!#ghz^7dLhWSxeA7cWFfOlb6~gG z9htK3f$G>IA0-x+$5Hc2G;7)FsT>de0Sk5I@Y(Ni!0fx?Gpz+)N?E#%yp2W(#v0f9 zxA!%_gR{9}*lK$pxalwonw)m;k$5WJ)RbacCgeR@juCmx9Ey>V=FmSAPc`!SW_BB($=x21P`KbNU~W z$UXVT%<3oa#wt5Fzh1j zDTFp99M`FOQm?zijr^2Nte9;2fkZ_`Qfz;C?1xN6LrgR-5eJcD@Ax&lx;3E~c#q#0 zzf<16xv{a)m9%qVi3V-VR_-06PqxU$##~@3KO&uH@mnitZmt%f^6peVylSzt|2SFB zEDYS$fln$t9IH}FA48&|f{wYS&pYnDB1mye^hO znz8W+!6_3e%;&YwLrmPI_BaCOD~!Yb7H(0yv@t@xdg(-|4(j*&>1OCPUM(~=F#in{+6X`Oc%MqXvjF^#

d*HLMALN;HftG9OiS&=MMZ=oe&i=I?VQu^ zpmUJ~5&qfq5I?`%zCKPN*E$KNz7cAw=4m7-bdTmQP$)C-N_v#!=oK&DZqax6tN16> zd7@3!X+V=Q$ezdFQw_KPQ@(2zM6gcIp|%J4$} zZF}Bv1{0X>w}Z{-yE=s`svt~%8Rn0lI`%qj`>NR()iT(_tMBppbkBzm)`NpWZS~!b z<2mdVC`kSs#SF$L2oi@O$kwxk1wKZ3a{JP2_-rr3z#&@2p5mgbSf_m_u2f9CME<5l zMkL=+3{BSLAn1YbPH6H>R`=&Pa&fn906xp6Y+_+BQs4d$O3tD8OO0*PQTREneJS{^ zPjn#FzWND1fw8PbpPQc_B3YfE*4>&E#7sEz@CHVn{#!~z1=tm?VB5hj=aoszxwCLp z8x*O`@z=0tZ~|vij~?^!vFP!3oTWIajQ*Ms{Nd>DkhpUBYgd;PW%(K=3ioS|2u!(0 zn~Vkm6ooa24#MVK4jKW3I{WSWmJwv8I{>ec)bFCV74^V}0NkK4tP^DU&-L>O&{aU& z(KhrIOR6+HEO48k-z+O`;Qt?V)<070C-U^8g90#2uB01}4_99)`SNHIqR+BH%i-CqeL%qgl9wC%|x(<+3LM27D|(ipFn zaK6MQV*dm?R`7pKV99Wi-wBD=j~;76p|g*rrRS-`9@j38j5pG!7qZ|BpYYrAx(1Hx zY3}QH1zB%XSF8nJO;poz5nB@z15s&&_s0)V-dx%j@1kNJurlsYvERqqKXLN@eovf- z2}V&|-$O$k%Q(T*eqFc1pJ4#deyWDJVVew=fu9+zGHUkLqoVAjafMHoaLPbv8+{LY zFc}}o`V{h0CI2O;dVu;_{FWyf+54^3J6N)sO`reS6s<_hH5kxSZ(25$=}~@I255r6 z(fm*SA+e}BR&!cd2K&Y7@NxQ7mU0^^2;QBNObPS#sJcGgxTGH>Tm~>jqM76k++&)eC!esKo9%_tOwlaSz0cmu ztw{I~qpkQ)V)|cmZ%Y~!%1A`UQ21~chQy_bBewx0 zo4%lM{QMa>LRY2r2E3B>QD09T?TBgjw1B77)q25-yZ!jSW2?`c4Hz_?J<>KBR&wtN zzKkCc0vf5fW)$+pK+RE_90~iyLPR@T^?@A)H8q4kunHZUVeSBB7k&PZsCpLwTkX!M z&k-S{(+^9_6&(k~bz|zb4oG-X@WcRs!>sK!>?-*ai;y{dK|b~(V^6+!1h}AZAL*Yo zIq@%=oKDTfZ<15Md^ZS5&YU*tk?Jamh35v>^rs^UOqBp8hB!U7lEBG#@i&SbgIIDN zTyLB(eGZTAJs|CDz!z3&L-nQld@ux#c3fLSu1bKT{?i_t5O!I7K0SR4U@k$Ef!>!z zpObk$!v%b_=9A#fxty=%6|QW0++QPh-Ij{h9HcU6f^AX`>G$Y&6dy5ecGxvst%hu! 
zP)C?zc#pVmSc;l^iY)Hqj${)hX|OG&v#qy4Np}KR(({v9Y-yb z0&L*~2*i?6v<-gq0q8=5vY_yH$;)28h4pO1x7HHJE#O31+OpX+=Bv(8TN?i*w)QBK zSn4Ec%f0*1;j-IV}Z+D@+m^4zAn zognb6R@Hw?EDfkr+$^gJzm8z%E?Ob^v0xxU8*fF0wf21ZQu`jN+sTABLX<)LKKkvk zyCI_iZ5P&TA4N9=+%pVuoXuxAT_{eVHKKQL>X`m!ZZ!`y*$x&==tl_k=i0H#B#C&b z*bQxGZ+Q6XMI`GK$JE3W$)zH=CMtpaMMMJaD)J&o04BP=md#@LxMgk1wPXDp!lq5G zJ&Tkzu|fr^Di1PUn~`<{0OUOrf?4C%J^?!yNkYsjNe9zRy@HO>K22e=F!LDV zqFQc-k?!CVm*CH-OB*ziJxTjbTe(-&%18uv<`s=fQb{J2o7|EGSV&=E7oh#%Z-{s# zzm2{cf1;tGu|4LCJv%XIN0}BT_X%~x^qyRf#k@mg$Qp6S@c#Z#d;3(92d0^NFt5-a zg&|)6xO3o&+Z9;9yotM5di@MOC<}!1;<@ajcz!r!B`t&#=tM?LVKYulxDZ$F69eS0 zVaFyZwRG<~t%~7c@ICV>E2)BUlKv>LU4zm~JMH}b4+fOS9dAjMpc(7p4Kj9y4+|&mTU>D~m0Nsb~FIiTxkbkKMdSbmTEhSHmwi2%O#@;~z$#Ug82}Foe zmQ-{RTx5KAS8+ zvQ`2T1$3rL4pvQ>VwS9=Srx5r5k0I^y2Mcb@FQ3QPRy6kKi2>;BKGsVK=BtU)}Hi6 zA!tO>>8YgG75B43`W?7UsV`Pa{7hrru8jQ?#$L2Nw;giIs6$06iQ_ai^l~PupEZn3 zOsR+n_10;{I~yvc^=u# z;jszTRpHPy^hS>Z*6N>{=+CU~6P7BV8bIR$qfkqzqLod4sC~?+{7n~^6z}=hwe(6! zb-VLrvXKvv#2(8xI5W7r=NJv!=Mxg~!5P=Rb;8KTHhN_j2BDb`@jl?3&IZ869tzx8 zAGjFZ0AP{Y!(ET)=#I3@`~Tso{*e?q=o#J8H~~&e{InoBM~Mk~*TV}>LXP2>*;Fp> zf8~WdviA1qm#a!jU%ldRNjN`*j0$@@Dfn04Jw+D-(5eGqTiz3`0u=^rVQe96#?v4C?@cUzK5G}!g? 
zKkx40^z}7jlBW0%eFPc4$Y<8-At!NW)yzNsJN%602qw_Qh^&r zs`D`fp&Wwz`0v2*a`bhs z($bZ=rK__`A|IshXlQ_QX0Lt5+hr0FiOzifrt!Gnj_jX)JB^#$M1m~RB{1{|4nHRZ zCRj>_Um^ujT4kk*yV@_SDU$w(ftot{gEW6Nun*^=NVAHAsMXW#+D0S7%gTfkqn#`G zSqR*FMJZCCw3o@^*>*<2WSJuT&F6W>sct@4+U!-3NV9I=XYCw&e>8tu0ZCPrv}4jU zl6Bef<+GLkVv<=FisH7pD|_$KBv(}jUN$mpGk%pFUjj{ZOlV1_!ROIafD@ZzJzpYc zB2OK1GXiY9@T;wk=C`)GX*uGVKcrDcU3*6WH(oVN45Tq(m#1$|J78QyqMfgA00qR} zOk8>z))^-HJJ!i~mBbUOWJgi*;jW$R7J3H)^FP9)xv2eSdC*(EYqa61dX4c|0KTqp z6%W1we|Y`(jBrC+vZ2&3!HV&$8po}Gf8^hB8(l6B_ZGf|apq<4#GdrgnNz)EBuPwH zx6((C(uTohODppP?%Pou3m^}UO{pmF^;;ykw3!;RyO59Gpv9`KR^7@*OwLkcaf_^q zS%H+EsL1=YAqtwpMjV0Nfe;KqvWhB`D$VX|oJ2}LP{jxg2iN$dNP*`JA1Bw2g@=sy z5iM3^86j1>Ht^QV6bzPc5>w0HxwD1k-gaLr92e4!H7+*@7C7PxY9>Qok0Ux3^x#4` zaZBWZ@TQI;(E9?`mOS#JDwDG@ouzH>KOg6cesGIo)*}N3!=qEvFI-iJut&J0vJ>v90C`kUC4pr?iJw|6Xcwz;vw!hcUe0T~nd~U%N4NE%6&f<~0+GsK z5(=x4SB0?MCs5BVGRB&g2}kH6U8yQC$uwv#521*T|H%}>dGr?6_?(HmnyKS0VAkxw z5GuLKB$VcMDvN;}=W4WE0x4&of(ENH1S_Zen01MF-;<0yw`9MB(ji*2d$z+JrL;jm14F?2$x5yCcQdS{)t`qg z5dP$je)W_;)uxim_vrql5}hD;cjluupr{wkR+|RzegB@wkVs-_+pnSqBJ8|{QoysM z*I6}tch_-I5F4tM%!SJ=XY~TWuD$War>!aEVgwAM>(u%+Z{rja@~z)LThK0q#{P7m zPABiI2h!8u1HyBOx%177W-?GUf!#}^66{9w6F?uhhX*}vXB69dZ|p%D7jj!Gx^tRq z+Q&l&E@I+P0Ho347W#jDvYHDpcFutuL6(2o>sSVf0_3!0MhkU{<1zNSb?>~5LZDK# zlM@{8aDW37PU(*kTQd(^{L>OCSj1-}sR)U_qk`i8mxe`BVYDcz0g`~z#@_dJHzjfC&x5==w&S#ceK088 zcBWz@J&7AQV%Q9Jd*s<^_n0=sC-49)_Uy)%FX##byDOl8-4%QjP{dyx|HGNM{%glu zB%sUYv$w8`oCUiis1PFep9e=NQT#uA!L;w}$zK&;eFFcC4&LUa8#oFI`8LEVP|$67 z%+?*hkcJA;2Ae`48NTD6uW5)Q=21^bwc{6_VKEqj%@%M$Z|?EWPgKsp@~}Zb3jDs~ z7cMZpbaVEB^N=zeZ>5idB7qJ%_rM27gJB>4g7~ZBi#Yl3;3y@If8Glmn&C@Edi(%mr(F^C8#-RTh04bnLX(k0DMLpKcF z1Ml*_ysziFpXYtPAK#B}`}o7Q);4Rc^E}oO`?2rG`l75TjgL!#i-CcGFDvs}6$9hW zF$MK;?uushzwoz$HUgA66FBk3(aHZ<7m&l})oD^g80 zSC8rzv8yr>gMq)G7aw2W-CuSVp7nCkuC+%+FFlq1o^a_SQMxg=+qd18f`5|xpedK6A5B~cA<16JqFI)S6eOb^JJx4J6 z2Fu@fzVkqor7^1zBNY?g=JsDtbbNG1wTtopms7T9RE;u!mK@}N6G1*nSf4#}3mUZ5 zg8Fkh7+COoiS>AAoHMEtEI8I*rex<%h>PdVZqyeuR<{jtyt~6mIgdl 
z=oWTU<1CY(Z(VfBzuZ8N>h~u0K>yy_Jj-AE`#=f&zw|>c5bS~UI)6Zea>6PzvZim# zQO3Tw>$i^a*L!Qh|Gn83l;dv`EWLo6@R5D`5%<%mO1@qpy)vV@aU!p0QROPSH z7gluO2|bamdsm7of1FtG!+-4=u>8EZ`#BYzU0vs`!@5w>5kayz8gvlidhni{I7?){ zoRpUh=3g)EKh|_c)#8(i0y6~@$$8wUJc(7wWqxtd_`v%X198%~z^y7#@id}f_HatK zsc)&@K9|{?>HXUjPgcODM)q^XSba1u4(1jxId-mW3loVq{Dg;{jb1&WbV|W>V~a2H zzXb<%89s}|W6sz}C{%Ia%R8zkRyYZ8^;b3QJ9Z-d^OAu*&8!JSg9Pqkwwfwj9^8Jl ziAbc%jMAjK3Q|;B+&>pxmH+H{F$m{ED#l-iz4gp|$yqY`LbVXKJ9p0j$<;}j<~rOQ|) z7%uL6P4SoK*7{{$g$Gm(W!tC;bm}rz{Ce>zR8r|+F}8i&z%S)jVG^AFo#s2t%(=Ya zZsxa&bwdB;NxLS~s6-mkOp%}gIeys(2YlA*n|KWh2XCG(-usl}iml)LthncB}F`06hDozYtV+`8NG*&hV$pq z;C!Mq2RX?t5AO`-CIg2l$>=DyDN#tu!;d@*1$3mfyroRxkurbrS zAuA=qKF8NB|1ENK_bJ<>m(*6XXx7&6V`sRWW7ls2kLV$nDp_9}mu%b!OH?s4^1a2= z=pO~DlSm7!H9%6n)6a|>NEoZH9o+WT?xppv3!>$gBGZY}G>dUQ7_c4cL=pu|M2+vh zAI)<<=9*WxDSIKB|Zb1)X$?-2E{k&p+vz78Y_4|HIgGwR(Ew}2J+UAu=C4Q zU^X&Jzve`k^AU>kj|-GSx%0Ds6+D-SOHH4GpIOD4P5(Rf3iY7gf!|q%E$j#5tVWDu zdEZU`*;qL%yDb(hqoiG%@=LNSk5u)uxNe${K)*5515i_Abi*&npZD>2mN(t3riw=L z$E`bR)FL4|@hT2|qVdMuoNwo!AEC&NHARIJc$_)R%N}_MoWy+U>Z(hOxUhvir!8$U z5??LMfaX1m+qYe2B#4cS?uNLLPlZZ$zdcUu2ySf8I>^eNGF7;^?kp|I({f69DSPX$ zYW>^eE#+1-*7;J2uV7;;*!ep$9fdjP zvXcv1AGyZHu?LwV2}JVhLGi&o6T})tX<*lE+LeqvITLC_ncVi6)yU*SODro)WWV&B zyAUiU9_(FAD<=CyQ+FyWf>v13&Ge2^X^a4hlZ7(d%TeFQdG?UeV*gL(Cwa06CXPx? 
z4f9U2)34WDK6GWi?_x{k?n>>Bi5JwW$wm53DUqo5p3hVq_v7}!akB4wVLmd_s3`bgeoj=+_a`=4i)PKFD;`#}BwU z8kGl+1qJbshn81%lQ@gVUJ|P$PAeX<*kqz*Ac#!HIiN3vSSt1jJbhcd?Eq`9rH+OT@a~ym zy4bP54Stiifk}&B$;EFTJt59F{S1e6<-tFSb!Q-wTcNCAPDeE}jAPYx4zV+;)^Efk za&TgDTl@!vabhGX0`e@sMQ}8)f20!`%fCx^zqp{TcABqOmW{*%Cn*@>sGyn~!_=ra zS3Wj{qjml)zp zX1x!>OP*;H>pzb+HPuaf!Wy2(X*VhlzHy%0cl&OReobPXKWIBX*YM(A`}Z8QOnDC3 zi&IWDqJYM>cYe8WYa}x3(;5zP;m5x)@ds9+pQthoJmy~wW*^vE!%o?P!-R8q=rlhD z25{3L9@Wllc{EI%m{%jN19k!@ZJf*7al9-EL1JEx4?stQAtjS$q)Ma}^?m)z=!vX% zZ*TLKcC=T&Z}i5>Zj5gyV?DWJYQy&5te&e3kAYLi7is&dZ7;}s@e4*s#Bqwo^hOG7 zTj`)TFCL=Ntl!FX08gIsS{uu)k@=P^Z|w$@mC-p7+Peiz#bX9H^c<*P&ayiMiv9u-&WLGY;tm5#YgxQ`Bga?Ub!@JS|f{wp7MR#*&2{NPl9A8GuaD8$*wufEAN9KWL7*lB z=lfm1b}0=~6ry0K8UVY$k-nhgXCyZ(rTJ)Rivro?%?9d8=3JNf&s4_t6Mpj>W2#A^ zbO-H2CTbVE-uoPb_jSs{f7x;bi~Z=CN&Eg8_h&Dd;f{bDgk23lI9iGNPdR<{BI=TC zV~k5awDWO?XxnZdFChxnNl5;~*K-}=k}2kI+)`g$7Z7l8i`LyZP$aKA5er3wWZ|1LbS0)@}9Ur2dOoX-pK)chVv6fX5Xvri@=)zyk{nmrNpI zg2RC`Fkdp4io?fj7S0r(NUUXTz)vj%$J*0JW`#4GJzvZ}OkWpG1Ug$aij9lsm6ee4_66x1nVXY>!s?8-xnHkh zvOoV=o*nu|h}NAF(H8RLU92R~|NU;MaG*GjX{~D($@{?bRwTg``jI3=` zJd6_252pj)ol%`+V3vl4MGo5b*%>{yEPo~HWf~R$DNmd?*ROd-y2A7)r#kV&GkM9P zdE;ossR`98yk|^zd1Qxrv#d;BdAfhdef7*!q)D@QjM2q}zVcwvxvlIH)L;;R8y8CgE=XORHy6-<2 z9iSyCQ%jyGeouMaaxnzqqx>huM^MWk$QB_42kc4o6DORzN$4XGjfwUO8xKdFt`L`68qB*MW0&U zB-UHpueR;f&aL6Kwo&bQg3sB*Dr8Cr$4NA~4;NPiuEGImpg9|Y$oUjoN4qMkv|T}| z9?-!uP#5`wZ>Hmj$sNNgq}O2Hl@~~c|1ej@Qfo1OZpolm=Bc#2`;qMf1WV5xC*E+# zpz_LjJ;_sL>TL67Zt&8#nhP9J7e4MU`4aXbqAqU<1D5kUn)jI?Jw0L}jK`;u68aq= zZaRyrug6>BPwKC_vh~7Fic1s>zX`9Q+HcDCe+8dnqracaKzva@*sT+e?m!ttJ5d;* zfA`@`^~mxQ&imiF1e@8AsIs4*>I~Zqkik1PZ1LX4oweMc$6MsOg8{Q?EVdfV?&ao; zX7(^x{m(!MH_k{oQ}C>m`)8VeZtWpFrf}%F1xpjj_c|}dx?Xw*cdO*DNss0?a{5M; ziQ|$|7)e5|#E6*@br1^uytG`%Oc;5U=4pFD6K=l8`-)@@NzBP0#Th7<$yPbC^H_E3 zD5E*2QLaiebL6tV_ZVA5oL!!U)Sx-7a^vV&c*%>>(!^UQ=N(h1f?`_5*77hoQ?0^9 zwmjB~i$G!E6;J%};WLITQ`TOOy5679wpJ~V13w+H1-$mY8u!l>T!d1N&5nu)k|UJn zYPUu4^Ccr|Kvr7jCCTctOl1z&1X_Mbr~7|#K=asdPC=_BJudiny`JEM1_->;gWIo) 
zFkyIJ(D-%_n!yfebY!lHqs+N^L|bZ=`uxTEwh+ix#f%zeN|l)Ogs7JLM}(D&pZTG2 z`3unHEA5|T#)k2j6IU#rD&d%aXcnzK5TP2cgl=_+=x2M?{n<_{Ljp^~j$WUhp4r~H zFHnnx_~p0R@Z7~8F?}ahEYGtnWa(%Vq$U;Uc%<5*&siIU39W~skZoM zgar6BpCO1-pn%vDZR8w*9p&I=C*rfE5-c5t8eLyv0I7YCbN(X4E4lGk2+b4&`l%m zFKqhx7q;iSwsAVoNyWNVo5z_7Ec~rY2N7j^{nx&`B9Mb|f9G2Ur_sYxqo2E!S7$%_ zs%GjXmEyQ@4dS^-&4Zr+gxT*hS^M%Knn0IH6x?!mdWpC?Pgs{I3T@DjE`F?L5us|- zOW~A0nydfbuOfzRaXT_I`ID%*eZntE`g>7H7Ge9>Hd45u9HyBICy9?KA4VbSd;H!A z9n8S6I@aV)L zn!2Ab=}A8#4uzk}eXpPSz(G>_OJ zSLqOZ&V8Jj4HAHgULCG9u-v(<%BS96HB6cC6;gjj6rNaR?%Hrs{H=8&t=@fpb5^3G zN$P-QY^ro(J4AjUPdlS;4LxPfbSRKk2R-UQ_t0KrC69&P*Fot~M``uQo?xaRA|Ws#q41w$N;uC0%o?T-tNZI#)TGATuD zjw^q_>>0DRYaK(>BTQ*&GJE2ZDz?<+st-bL?Z~Roup^q6cl@Z?_%p{S1N-&(GhAZE zRTi{aj(?FC#y9dhj%;j!qg5p~$E?@;G+a7PIo8&WH@@qS_1+<{(q3B+s}*igVu#v% zc(?x3nI63M1WJOV&+jJVGUft(6!VmuVxm*)cwtIj|)Mo`{ z;uUb`;;M4PQB~@sZ%8bt@7+0cFn-@k8>eLM(nPP+HtzXF4ap(n(Y@p2E6eWi&pgdb zAc4gdPV20pt!`en`9rJZ)bGwpA99V~SLm#uZ}iFc6^%D>LTcE3EYcHRo$MlYlhv7# z>BidM%Qw-qCfDx-^B~3c$Au;0P_y`1(m*!}?=Zii?O?dFx z+Wa}yWBcMHQPah%ewejZ0rB$b@sy8^IGNn0sPXD%O+CMjL2)ZhgJO>93Y5RhM;00M zEDC5;ML>6;Yi3--qRG&D{6B26@@u#WDem}!72A8I0+v2;t|6B`NfQ|!{ov=9uRM*W zx}J&gMl!Cc28Kao0Xo>r&LJbWYdcV;AHNPe`VEA=qU=#rkC(u~qw`<{7NltFgs4}i z>9ym^WA=YW0RaTVcgl5rz5g=qqxax!7!9)Asm4?A3#+>G| z6r$Ce>wkAZOrG9m^CJJ1AP6`O`$avXX-PuppZV!MBfWQKC0u}^BzU03LdrsK94D1f zICSB~B(u28mK~9zXL|Zk4S)iL6ESSqQ;EqjTWcJHW9ybv>)Bii1#&XI&n}y0O#>~w zh#07vg;bZVQUqVEVu>lJ#acZHe8sYB z08jp)BqrBS&-s`+l6a9e0RS;*QDWhosuqt_XJ%#={e0Xb=d~3=L+1oJ8D9B|=O3b9 zJ_a+RMy2EuCCZ=tp;ZA=dXn%DMOrcs{`$Dsz!yXtCqTjMoDUH#S61f#nf5L3Tf%+4 z+xpw>jUy?4KqvD*)ER&xdcpih#C!5D+39c26(iR8&jm04Kfb`kzmD=Wwbos_g|qy) zi#d3ha-9&{tyH7@%H@&LrOA(e&}nFFdY1KNYcVf{wX_E8>f-Knces9&nD2;z-wWGi z;m+1Q;s{c_e1}fLgHu<|jru82!p;E zDBS}A!OS**daWFlrGJPn!M{>uUONG;$=Bmzd|1rs>07&@$M|sh-&Ww?{MG47YP$#N zOMLgN{f0i|bt|5hL-@^J;m$~|tDWjv-A2k0dG?r?XdP|x(USbCXA_zFP6?|&X2Fi3 zc&B@Pc0&Zg%|%3r11f|qX5lvbqy1|ko#RZOeq+tIg~}bsi?cW!*-i)X#9wtzB~q9B 
z!!VSa1GP%W>6iu%G}jeyfDd?p;w6Xk4=K?PS^guP)8cC_d5!ySDZ?W8=d&Mmtb(15 zg6&Xv=7@wyjaga687wcnx94nn3tR$kVC0z;z8ctW8=FIpG;+Z!c1On$(~3{7R^4EJ zbB>!j(-aT=QteAi0Kqc*w}Pw(ln-T*|4`<_@yXy7<^!Az%ivFIB{O7~*Q1Y*K>lAl zQEW9xl%5w@Nj=i#45lqz;GU%lO$ zJ+?>vtlV4j(Z={(J!>bCGuSXbP&GO3UJ)=+{kPY)2P%!$LV@Mzt)qRQ788$6VyzWw z93B_LM1LaNisz&&S~0zoHQ0TbIaD%vi#!~ZrR?V>CC2$9?z1kfH196Q-BPf@x6<|k ziWVFt#9iXk7G#waRMJs$?pP-Cbo-$H6=y(^PqEyR2;W}h4^%BF&=}M{-2EsQ^N(@te|QrHQyg)*E^hE&8n_7- zC3J0aMVFs7{`6RVTigvHN$a5A6#ZlVeLEmV@9{u`7Mb0m*Tp2^_GO?Mx}D=TtekPA zS{vsVt^0V6O|{5-FV8j5W-FZQeoH%jGbaj^(xo;k8}6L#C%X!3k^GKYbA6MgDDQC-YzOy8u%k zDspGx<`NykjkbOrJL|X}n(}r$$G|O^XD63Qt|j+~&u(1o)va!YA*oEo8+FCKb7F+- zO2oy`8Y6G^QwO{^LzP3K#Cb2^l8P@SDEUnN3NNXgWi3a_s91dc1ysMVC`lyoNV20E zY?g~iDYtOM7e68qju%r`VixWSRK=#}F#8cwG=I5+ute$fJ!X>w^uTUN&8c3v_VM-m zXa-ubt-Y$r2?`CGv#o;@e%5d+q;l6z4+v~n6Ihi4^?wsMZ&oy z28(g-MU3z?&5{6ba_#V(_>Gw)=rlbf>(#R6PFYYTO)M+^d1&GsdiqEVpgflQXJhX|zwpg5M`OzW&nP6u!C>+I$(o zbB{r%F~xH^4IlTguLy4ws+&)z;m6jL&Nb3M-E@?EybGk4T56j~N=4pXhz_EjG(c|p z;Q|2aebLU-EC#=^7`ycMnO&i}u6e(&i|)Pjiecrl9p9{q2@1Y_bq!~uK~x*e?nNd| zho5`ZqPZ$vN@)`Kid8dgnpkj3W-gsZGSG%$g%8$8P`TL^9Y;l^6w2>`krX+>KS7A` zLG|8MxY}P+DSfu$E}6)|ULEy9t`d{XRQv*F`bAm*YPi#l2O{%TxQ{M&<;N+Bsd9a z2x~&Xi*xHE`EPbUYZsnUHF=29|0*?1={PP3+`@kzrm}ZNm0CVs`39%7WBeW-^+@we zWCwpEu@buy^B%|-C2={n=>xPcPbQj;;w-RKfhmTwM#_9Y{}X)ODf@s@HgM|#l&&y! 
zN9|m0klm!Y0N&8(C56(@B!iFGbgqjpXi2|@WxI`j5Tg@`ZQ^h4V8$^lUOT5Mp4*!q z&twTZX)`E>&Ujclm&OC#X+q&i<7^Y;L(nsEcAJiB6PhDi^5q>UH;2=Y<_hcL64gj# zR*AxF)?x*CX>CWC{o%c-u{@6x1JOD;fTHuAakR7z3N-T(!z&`VQrzeWM7|_^01PKN z2m-S!ACOMC01Fr%atV{R(gt{{jw`cx#Tn1D!&OP|kt1pY*xvjXSPC1W@^A|MQ^WV# zwheXl-Mb+z;WCk$9j@CLAi9Ub-;^R!Os&eYLvQX-jCvfd-Jxr!%GaS45*J1%km$nG zgc5T$@PSzh>KQfKH3i*DSWZ}4=(@O9fAeT)QA7}2YM=S=XGt#9;vj()j-KC}UDn<2 zeswENfWjnZBo0E+tn=c^a2RQOhQqP#CpS2rK7)C-nos%`<% zqky(1vC%5o8`?ID>%|GVjcEX_gb8|r?;adZ`R-0tc}0YZE*(!wM;T?L5unaH>wR{e zHzGOu$0(Z4*=~+O5e^*^50e+q_7BqTwF79AI)@{Lw{OXgR<_rEA{P!-HG&yxwrJ#WhY>3(dO46Tu9JtWzyuK+T%vq<4gYlicH_k9 z5%{r{uU%8!;CL=HcWr>%MYp(kA#D`g*>#cuem)9{ieuk_TbR233>X?41W?S{o&V`0 z^Wg955P0aiRJGvO`V)X3jj0#dKxXBXHhi^UZRT;fB0R*dv>7he-Z9kL-8*!pnJ65a zz!T8KCl z6s{zj)@wm!Z=1`GI+^*kQ$@OQRH3Xvs#(GOGr)Ox}@wKiw>yz3VK`uJ18;>_jPa zj<-)C>%0yzK1*0|?OvyGI6wI$?C2=wfM!(W^FC;>Jx|RB^)=(Qs?$m!!z{)7d!8RK zVm0^Nu`u-T&!}=S{x#2+h*ortkG5F8<5`y!LAnaQ?R|an=&?JjkE1C|Pi+Zk_E^me{L>Jpl>xCD4Z*PgrSjD@|Gkxi>{a!A$D z67Py2W=N1T@Gb5?&}Ymd8nP*-uyniW3B*Jb*zs8(W9Q3Q&c>_GgE=}FqR06Q)Z<|T zCC5X|1|NqHU7aE9?0K6oy=l&LiO6M^UT?Y=^!rTHew^CDjP=Fy zf+J&^F9qMUSO&`tRkFKKT;|d@^SH1ShLRa&)j~eCkMQ(7mjmv1?NL;M6Z3k!e6S8?5m#FrZBjc4)T>i z9Dx8l0Y_Vi*)5Y>C0jt+cG}-(rHtkAe3NJOB7%QK8l_N~13j)FPzF8>sH zUm^y&dYIM@c$l|>PlvriL{;`>>PtfhB`4@@Lp*~i)E-%`iQ%0hwW*lL(4Zd%@V1Ax z7ck1xrOZI@`PJtMvh8yx&%t1L@>rpnPbCv{O#?u&%E#W8^X>t8PZds`x&-sbLo*;I6?0iw+2pDx4OUDumVy244=0aFBQ zUBH-8jdP{CpsLmOPSo-xta%9uE<(~;qP`C4&FPA2-Gz4gHR=)WjhHo%?n>BXPIB*1 zcuR~%MU5u%UY}psaAsItTsLk5yd=1u|FY3gm$G{t)+pk1om0Q?qai2U>3#o8W)sv3 z8I52a#7Bzb_dKX0KR{++0;8j9_o0kphXZl?l5#yL+iBlY>7&4e6A+`rEV9ut@I?#j zHI6phBCL*>_o`TF`xJyNa^07cF|Y`>b~PZH*Bw)YyMPY{KF`}SS|)uKJXMaXaj$j- zT=IR}fzaz7u9Y^FP-Mc^t$``1cV~d^YQ$q>g{}TEd_n=#hEC1q^lY+lS?Zg8kOTYV z_lvU?pl*1_#HFY|0j-%_x{$1Fu;cz# za{dui0#^g`rRKZBV08b(A#dnC(e77%^HA@_);P-Qn~NkdXs$MV=$>p%Zu3a(r<~R% zE;V1_hZ@m!n+1V7K zES#>RE2_CB3$^=Vre{>suGi)n5D+t6&=qIbH0)}yab#g4%pbaT)$|EK!FUHJLf5mm zFI^BfN71p%rI{VgzXvw{kpY3;Qu0R7>wL~cU3Pqn#6{A5Ymc7Y_%(D 
zM3fG0YJEwxhYtf)&efd}PUpvOi|{O_Ael2-E!ny?Cby+q1Z!pUc0vIQ3OgQCyZCJTdQx4o{Lh8#D-BmTyPT`bQaqZHNx(Y- zC&;Iz@a(F5WZ{g^`X)aFCzHxdd8Hyw-{oHLHk*Z(q1Y7F8p0o9k;PPCER!z7R4KIl z8d!iW1aRzb)?-iz!PjZ`>+vBS1jA>bYsCyQlj9cvrcn-eo+vBhQq)}_xOWq39iBcruc)2 zlL)*my~(8`G2srp?WF+>X>@g$&gNs|3}**}-e(1Rnzg2(T*+d&?ZzxZ!;a~tuH#9O z1==5`OHynm5CUJ`xgrlX&#seRDub6zTTKZj9fPy!>Nc!*&H=xc?duZvqzku=!9={u zKv=kRQM+^;o)Xo2UxFiRE~^KnzR$+ssU{+6Pgimkq!U3h;?eRVW!efMl%~y8sm}S@ zsB<^^Lj#H!y@rh(_$DLfPnEEi?C&6a2j#eWh?P(Mqc`8vLb8QlNgU zaGZr@d+%5%I>g&l;{0ulY`JtvXMHdI+h5O~v(zCZxud{|W*c2@^DB7i&#;+;fKi@W zian4bFb6x~hXmwd=3!@b62@!3e47ymY&W*+iZ&DgZ{xL=!HMV01!*R3>o2tbYov~zqiLlObVM-p3A;ZiG71J5Vhri&U!et>i&ET>i+Bb-a@gd>PAAG^F0LCs7 zYjV(U6X7o=yeV67lnX!5I-^RU6_X}8A8>diB6#im;&M>}^!u%wK;A%u$e*_DlJLHZ zC}v{okURTC6K7pqce^4J0faAJH&tpJlG)K4r!?}c=PjWGqtX&bOeB1p*`y@23Rzyw za(`T)xQt9Hl*x$l!C}RZ&>~tdw&$lL*t|NOS9@HmyPZb*&DsrVO-Ceju8F3kCXhZZ zC3(LpV{x6nJ)Ychd!B{|2G^Mys5S(tTF1Xamg!GRu|{$Z8{MAS7FF;_SDJpgq{b^Z zjIx~t*Pyfrb!r?oR(8#O&djf3Qt_YV;_s>xxnVJ20DDrjS_Cd^t)C~yIWc@OSmcc4DEZpjqXJip{bDm}Ses~0Wy%bkdeF|)f6XZaSIPCwPzH>d-) zMkPzR*=W-|mv{gWMl?<11@xQPavz?=NzNzV2th=P;eBeTNlA(vZgW zD>kLN*n1ysZY-0(^$qf%*p_yT9-6nuF$=)k?&r~N;N35LD`)OoM7->fdT2(i*;Y&4 zvBVYAm8d%i?4^?GQp?+S8&D4nQ? zRg3hpix`V}*FHL>y3wmF?MqnAV|$eX{$*jKacPIYYO8IQe^ z+Gtya?;~yDn?;&d04=6Qf6d<(up4R{eQVI{J34ZttDo#GT=wpbpl4ckR^>p>&OXY_tt@#$kS+r(2gIYpzW(4?0yPCCcTZD(7|t2ll> zXBwk8L?ju3BnY;Q%PDk>jxFp(K-mFES`xa!=zw&#J?G!Rrlk-Yx>mP*nUpXnf6aXH z4(~VA9evy#X)!4>^2AsAT`_ZMpcP_6-s#KEO-;ISq;G3KtJ&>+i;2_4)r<8#5J&r9 zN`!ZldD>gHVM*Tm?0Ewxn&~jMy7@!>P%m(hCJJ zV&_8RGkBZVc$9e{?`=q(VW|B3S9H2$-zA`3aRgpW-g$pj&c%L3g9lgM|C z#7;UXwi-jBi%I53XH}v1O*Y4?492L|9JGHrmap&e#^0)_v@MDlAmtNKr-BecDdxt! 
zrEy(+SEr3QdZ6vUL;0x#VRV3^RCQxcTBo&)RTj}KRJv;DGg44Fcw zj6S7o|KvPf*U7=_M#m6phy36=kI)C3huMYbt*THD#>r);01UvRFM%^J zJCYg9R=ElWTD!O){Q|{`@9r*5cxbZ5Y9|t#*MDtqVWs54`q&j1&}sAvq+Qx-&XJZc zfdB^m*tafpM}xW5NOR4n_{&{PR%wBDlZ&|waT2AS-&+DwngxCEFkrUEsg;Sy?)`P2qH{26>r@wfmmG);$ z88oIJOXXY7X6*ck^Cejm_MLS;3eoWDz3|*lq|vQxAm;-LnZ91s14u2i%x_{x^F=PW?^A^;2njU=flJv`M zE&C4dvt!O_-^)gu7?D8*WF?BpY!_|k3-4GW-)3t5nK4C(c*j9+jr=a$4jCW)Iuoz9 zdTdY3Z|9i1=upuHe1RzkgXNY-y}8+|zZkZci2p>Q@$C%!_3(@u2a-LjqVjn6$m3}h}5th5!!N&Mtj17p} zHxy5LuGSE=4QU? zc)ZQgJk7z?V=N!hkmCM!8sAQj8`Zh|u+=lr_kpF|tr<^6-N`JufPE^;aeVNL$h*KN zq)tK%b6B5{PW*B)NTZK00HG*xY@99U0|_s~HNaB5;Qrk$VKTl$)QuS1YZ=|cebQ?Z zbsu2ni2ae>UG*-IwX)~~5QuPgXUf+{`7q>P9Y*}X}vnS)_NQd;PJ)+EGN^fz?f_W-J zvpxV^8u)z7 zew#=QTcAKfEi$Txl3|Zi>pnX>AgTxXQ$J-g2W+|^hKF#W74W-9|N8EMd+8(U;U?G@omv@jV;7d82vP zXQSem6df|$<7f}G2MYgB+8H1CIEoB664Z#fC|JP*1V2gX6w2{y!XYRtg?;|;7&}Jl z;cs>k&64T@Y>I1tB7pUj8gz82RmBo7;d|YflA;^OqQS$%Q(_1=C^ZmEdh5=7#)ezbhA zN5kXzrwqIe-DmjRKQOH4xL$j=m%sV)==~c$l?v5+2)0R3L z#x-Bvj?S$W>ud&$BHm%BtdrD(Nf#YWLXIZ(%Cy`~Y&)Z%WK|<J} z|AM&4nRoU8#whNd!5$xjUS-TL7S~ElpspACX?eb!t(m3ZRN7FGlgd$s<;aTUXp6-9 zK}Eg=H9q$q86JtH?@J10)#A-fKHrJs)EKJW-=^5j?tHzOS_ z+lN9*!Ipe^Z`dWk0d5}w;|n~wM4(U+)YF;l=Xqn+nVFeOi;G>y$TB0z^MjQnQ+ao4 z>dU1oUSA*GY;VM%);2IC1sytWb@Ob?Pn3q}P$3@R1t=88Qch$Tbxy^#x8A&?{ui+R zPZApYPG@KN!?!_N>(<(X`ubbKD*#A*az<5w1p2+4(v(KoxP#TqfOD994Qg<+Hl1ziE74+$oT9dQvp`{7x&z^xsD&ly)2&v`c?wvdkr#w|jM-HtY0Ye!orN)+pU0j=Rhm6AuSm;b+ z<+ff=_!&QGAJSztLZ9)r_sf;{$cJ{D__jJD$dNGE{7kVIeusCx{S=OmuAuP=n_jKs z_GEcm5aG>Qnr~ls^%Jp)t)(S6@Xyc<(EZLw3sB_*`y<*#eFHR$Z+6 z$OI!qNv77W_?~Nw#qu)2#>_Vr(=&8d4D8=cXl`4*R6-Bn7#5+gGd;3*jjRJdEK-IZrOI;yO|mBzI|<(O zSe{V5oj+-m5m8hNVAqvPSqo$8eM3h%Vk6W_;_PG8FVW1m^wyfitz81gQ|b|pF~R-! 
zVj-KF4$zgYjb@9BHJxJjnOY~$CMXcL66m9mKzzz-TLjUA2QQ-CQm!Pu<)(&&4ZXU| zI42ST!1AzSygKwg4;JxPgxG`W9XydieVS^EWF}=n)}1mQthMCPFC6oR_M(}9h6a&{(VoQhKXl&;X@Sx zb~RzhNp!cQXNQDr3f#&NkPDY*yGE?w=#tP9;`l?&biNn?w z2iO39IQ-_SY~dFnC1F(|Id`>wn>#=0h#^%f;ebJITwdt39SX&(9tr z;^nfJb8LmYJ@+2-=7#T{o_`kS)PHo)XIeN4$hp(q^I+in0=)mJC<5*^L}>|I$j3jo zKAuBND+Ry#$nxX5O=pa1^xH|6;VR-YoY$`PsjmZJHIf`rtWYesqQ>hYe&MqbBU^ldN|n;^n2ashl;27Xraq?YtaMOjWJ{DqeFUIUoi3jOIA1sW zMMYa=RLt}W8nqa$N~)AJneAj3lIb;QKz`j%^8-5PW&U>m*m0=|m=674H)?cvexR>A zI7=~+vG5`1|8Vx!aZ#<^`!I@%f`Edek|I)4Dj>oTA`+4c(xuWZIpm-qAd-TVlyrCJ zNI7(O4c#zw4E5fF=Q$7O9MAK8KR^HC4`J_p@B3csTGw^0wK=gXB-IZA=b-6nci(Bc z(17@Y+lAdqfywBXBSRlO&N=c!r;}GqZoxOkmuRUjuTTey0kPfPxI0A+hVYaqlU7IX zWyGLbc9Wc3zhMt@m6LDGExIitwvC5!rKggdS^ZYpf$6ZuH!J)Dr`GjV#y{n_;WZ4T3lsO}6Rh07-H7?Sm` zS@%SU0#I~jum&-Tt9+5kR{R6ZV>8EcqZ@gg_wo*j6lLe+ejS;ImFX>#rQ>NA&YH6P zOywLkL)}(AO9jb=z7pMpbuz-JQHPb(qVPMaL0sR}1ADF0O!#cx99DHvAMsP-4pe-c zf|-yOC99h(9LNJzIYX6psRV|2{hdnho{%G7`XV7vREy*3yN(8mw|_j(Woe1_P3qP6 z;nhMa_#}ESkzFNH;{9K@?+Pl*4>B+B`8ZA@1LYMJ2hj#d+i|`v_~JJg#D)i5?u-jL^U3eqlA=ul z53kkH3~b2p@rfS=)a>m`d8}`cvPhqS%c0+&ygT8gGqT-yY$~(NV@0BG#u6cTf7ju% z$DyCk0Ba}CNoF+%ZL*|o6$|#VuMQ`;uP&8m}sLxi+P&w$v;rn zV5|q*3^#|Z^gulPFQ1Ar6a!x<5eC$}c=dSz6`!CZ70ENZG(c8a88tpm9Y{||@aOpSI1cQQq@5DI{b+RUSFubn+K>DgOBvF$UrF-a{!o-HYeSC!GE3-j0SzKt;;8 zQVwB=Cy3|)j&9`xX&-5cvj?fwD2yRI`}^ZVX%!WP1n~;?hq(}I!>9Z!qfQU++}T_i zk=mYe>c~7Qw<8<1t&yF!b#RDJqRQSsdF)llKOZ|59-4DhKj2;v%RonDvG#N=Whr8q zNzu&s&_sMk*E%}9@;UNqNT!BiZ#HuY3$0~^hCdBGRiXZ{)$zI=t@@V<*ENV{=k! 
zsLFkZmEpFZekiS-5{)CH`z$ObE$_Ip(Z-~qg`}Y zK&IH5IVcN#VyL{y2D5~`L{ygX2=gB9QB;gq93Kv6yvz`emdU2a1177)_zBFb|`S=>i0gl~a<#{`1P`Cu;2 zPb0P(Op3`i=t^|hS=mFQ-90>HqIkm&oP=j{OZmZT0D{kn`jtZkb`k^F!Pm&RRuH22 zcF27BuPEV2Q++=h33#ZsqRMWhtkl|mWPX0S8y;6%G@P1`Z-1<+q*;j)BE1=)?xZVw zbKsNbz}({E-kn*r9}<)#k|H7^Nz##_9A!4kxi72Jwjspy(h=->zC}gQ8h8Az&d%0I zh-e`pC9hJNLWZI_Pue0je3BekhfcI%P`aOS1BUS@IbRZ9nef4)9O)}J{qNs%7E^pC z_BxMLe(5j~pz(3H15&soe}7d6QzKBT9{Jq(@IzH$!bNGLnAcdS?%6 zxMX8xq%6bPKRzL0q}(ph_#6&SxM;i5K1FVlN`YZzW#x>wwx4ijf4`E9j7)&L@m)H) z(z3D}b4H=n5bK3}$32V0C%Q7vp4E<&K{Yqg<>yZA;~c6D=k?^b(fYrJiakl8S4|+f zbpBz^r#-RCfX-aY5p5#iwvi4DOAGdNBkGNc;PrX&V^;3kz^b)k`EZDyh-p;#tMsle z!^l>I)QbPioI#ww3}`tFIQtymgx(#=IP$goAiXIB<#>EJ(UZ1gjO5QV>eID}nA0gm z=j2?vdbNVb+A!Cs&lu?JD5ye)3a9x*t+)5(nMpk`rp08C1ISm@4Hxi9tv2Ug(vi1K zH-+qPFFT+{D?WX?xE|7=Q}}3Wp=W=k48@GAO-w;iVZUV<@P&`s>sE*5d}qSjn}7tL zWd8fh5$DTl8)?sa?&Di?SLk{1b>**RxT&H! zeAgA~XO_9yjU)*R(K*EHBV5|(#;iEVs6#{{%CH%95e@`0DaOf*)w)rrTa67*dPnTo z=i551ppS*27x7ke)Jabc#sNXwT|w3Y{Ri6!D&)~z-1x7B=;(;f(5RT5>p0&M-d89P z5()Eo@NjFf4_ROol9h!jx7!%Coy7k+pN)xIDhc92q>qg2CFei;_|PglIyy>6^GjLe z=-^vEz#$;7-=zp(rlF}p)BDTtlL%SgbmM@o46jWh;I$RD)PD&GAX>i|gFcz3;}qpI z-2Mh6)5}{Mn*mK9&+&$m*ffbI86{ z%Au}4^_bp;@?mXlwzj*PR+Xj?EjU<2X+u6WYq>DjH@DCr9m7(~p~s`8@7@aJybvSv ziL&riNuuB1$mh|7at{Q?aQLj`+!xI&q>^cs72(Q~k5jv~Asz4l=#-2huzYRNIB}hQ zu8gqES9^90Vi^Dnao2to`X)pas6FEis{HVYO$fw>obb{b7tZjTYDzfp2S}cC=%|A5 zmK_qXYwLpDqF!dBv%R|;YHVO{Upj!|Fd4L(s{f)bBPn^AJ!By%8iwAT^i#}KyG21^ z!rHq8?YB(F4ZkW9V7<`Ya0lzfgJ)K{ietj4x~8a$uq3g6>?JPH8vvXS9hC}op|#yF zG~2?o?4!!a1kBpz_lVKyp!IIqlWFCl=aV1wnt@()`27~!GqKou*JX2()X;b3O15ia zkQo9>Rmd{)pxnt>Q}4_8pBw^S()P2}_H7JT?5Y*m%*;hiA5<{67QaIdn9Q2l*VFRS z$k1JFqd+=zy0o}{>kF26hB~T9fIXXQ*`;quRk*Uc7qUXX&SUKEIo3kZl%&n z4;YDL+|p$xlR-DzsO{#|koCU7;M^oDJn-xKPVEsiu&Xetg4^j8w3CA{>{&`L?%qN0 z%Qv$tw6*-w4n?9Hnby&)#TPK)OB$YM*b^n_${&6|M$-+qYHE^DfSJy@EmD>A$hbSa zg}CCKqVq&8)l9SU&eJc@BZth{T?$XcA>V0QRVpres zu#-jA$I5Hb0v5gB3aAJMY*u<(L>p)Zd!3D{X6z>S&)G849p}{`g&xGnOVfC9Y({G{lpGLs#a21Xgc4L=Dm 
zE{~YDs6JC@zoox^$V}|DyQkR&xVxcXn*Getp@W9i2hQmzg{(#+pI95sa&`yAnz|gK zdDQM{5|_8>(EG2vCSpmT2PHnrx<+Vpan!M+YNLo)?udMcRE`QjHK_+}ZRq(VT7iS? z$22tIAyk@nYZF;Z5!_ou>P}SAh2=MtA2ltgfw_J*GEc?3(IH+OOv54PdF*LkdmavFb$>*v)7TQU=8+xGm?yxxCnDzK*Uw_<|3GYLsTqPFEl>c5QZlXUyWeo)t;-ZmxUa z_-J?I=tC6(6-Kq^+8#Y~ytA^^CjD)0qpCyB8FElpn2&JIO|WXg?%*HQ*qf$8?eRQ~w8kEN{>^eu5<^dea8(WJ z%+>Vu8Ps3bX|m8J)9o@8hshogsE^kX_QWv4Ue!p{L|8bFLBF10sjm0PFMxl)0^V-% z18i*9He*I9ANZ`thAxW#KzJaD-g&1>G#b6!>MR3WjbnrJAnoT1k}k6>D+V~{Z4l1; zIonP&C;3RJI+soiZ8HhjEskhrlsdZ@R)_2K92c<8?`5F)citZzL1vfIAHq6Ud&1lJ zEIt^?unnt+Zrg^^eL_ymj2sIcbvW#;=}4JS9Ir>yLSRt`Df87hqS6(62Q)I3YqN@q z2&CP{rW8sZZSQbA^qGoM0J;_wc|D$BPcb*CsI94q37AiyRE}2bPbFlBs~yZL2Ko^D zk`hCB=a1Eo8cEl)pfk?PWsKU5UQJq5Z6ilX1!>}hPP&zyTZ{3>^2gST@IwPs8QgKO z51}Vjl^L|dET-n@v}1)?=bs0SHq`sCke!9?k_}U@5xx|Hf`Y0=rYtNh3G=qh8%#zV z-zFv|ii;oH^rE*GwcfsERV#kdn9`NA0NvelVwiH7N-Hfb-Q3(9$kPi9Kpr3M`1trJ zM0V^BQ_0xPiG$%bSYrl1P-5VRPXIv}-JD4zVOdy>SV*UDBKj!|r7 zc~K9Xx`MY_akymgS5_8{4icA9Y{wtVI!m?*9drrUE^yl%QIUhYC20aQT0{b;cap$#scy!x0JP}`ncx61R-zo z`h6etSV?nvWlzBQz(Eg++VuZ{9nED5n~4eU4g|;WSp0Z7WfO^laiD!ggPVt(cD>+z zQM;3a^25`udZWyHBY1uz4_Q7&5x^%$f1o}US=W@|jLeC@(B~k{%gyb#PdhkpG*l9^ zd-dw;YiFxq4iCpN-Ii~qr9Ae%iBjQniHf8??k0H&Xx0Q@om7gR>jx zZ$Z^^RomJ6M|_j|NCrjZY>7ljrp=f2s&QwHkmfMEKzrPjE}H}+`**K3kDu5kqj~$J z7i2cKOZ#Cu+ui;js;aV7nz_!)Tw2{YES$Wzcf;bF{pV#(^J$f~lo_)EIrsRsS6kW^ z${RhI$}v@Xs?A+OXX&Ljso}BCuV2r3Y;Fb4+%ZMNx-h%<0!*}=1v1L_*mqK|=8IC5 zK1XhOQ(PHyNp*7=yGh?Ik9}8<%*z$m)K%W!Kw!+WvhH$LYKSgjH|vyK z3n`Q1!UC@a3@t0BwcWXbO{mWJw?)Kyr!@v01~%#kt1HwOC`)BA8N?Q3kWNq=#Xjyw z4Z~3J!WT-MiLu6bD)hY7qswvNlbNrbf^Ai8cd`lo@f)3}%^j(7yAR0Wo44+0({TEbs6?BG73mb`-TjY9LZ<{(jo!$mSU}Pj6vf; zCVXa|jmeA5;PM`Uhphp$ZM{8_?TfQtv@EU=5xr8}k$zTt`4avBUk?}QVhD=_dpJbQ zQYE7_aR$#KBI@WF?8x50#OG+lb+R}%Ha0Xgw79tVrDm(F=4)5~eivVJ(2;4&Av=}8qxAml94egrSPo^7H*H7-VZ3k$J{2200BXsD@UVq>e<-9Anh2hz&~ zxMpW(fBLj|+?2NoiPv5kE>Xz{juaC=rxzwkSD+%b;O{y!GxOue4>!CUDmH%G&(6cy zSxriM@aQqL!=I@V`x4d|Ibe8w)nFb!`)*#|2@ZkNCRA3)9hR3bD=VuqPhVBf*V*Uk 
zJd}R^{JDsT`+#vGRbE~mm+5eF?kXgHaG@tHEj3l2hcd3C*kZcz=TDJHY0;z3yg9}T zu5Z>R>bSYNUGUZ!nVB6AxB419S0R(_!KxMSs!t-8q(lN{6IMMri}e_E5qr>hA!EY) zYET=Yh1Oq@=hwE~A0I{&%mu_4W1K++3TL;joB^X4a+yU+E->6FOhX z>k-eSB!Da@D z-h2r+xPMb=Mi=mvkq?qI34k%0lD6hbRZ zQ#q~!#1%wcT~1zamL?T3vE_L6d0FX5UwL><8%isVz4h5uLZ|%y9-x0bO4JxsWT+P( zMQIhsBpB17-`v>vb}?gpW5eJT0i)mN&qPapnSF~!Mn>~RM!o4NX=!nKCWHBmoSgEU z5m8|v&y(Wv4~B$>*0)Yz!`V&$$CXwg6D9*|D^=#?O}We!E*<*Xb|IorW#u12ihl&% zzb^y^eYimh7sq0lPMFlB4+beG+3_Zrb3Y3b;XB~e7$z!r`1HXe^V+ZcnV4{r=w9DG zqg;HjB2rR@l}=o2So2u@7Fb#E2R}dwAFQ?q{e2tX23$NV-hr>n4>a>y;p)m83SO&! zoVNP22fj^b;c>UeptVEQWggbQ{X@+t?`SEkM{|5e*S@Ny9YXj*^QRD#-TKExv-WMU z3-HM&5h!RtV4ihp7#Wvte^1*3=`B1w zrnJ-^@Q5WPC1GJ@k23FP z!t?;Y45B_yq-kpe_x0=7iBBu) zJ`oYoy?ghVHN_tLLhRP>a$m&5^ZHbL1m~523yIIPMyB-`4tT~tdl%fGEt}R8Pr$8j zYD#eVvRZL_BhRx=VheE?jGy}vsSj9IkhKaYJ;}k2{M{x%Tz`*z$+NNm_5O)YPELCI z)PA8uk|(u}IDt&NY5_YZJ)zfk&^iH(hoiOFcTHL{_hVJWv6HD^`Yl_cX; z!0>*uxC~Vp?raVs$!{Na%}-sJu#}(z@>VAcs{k2mi)vrJRC(ljSb-g)QYx^o=e)T0 zU>h9Cd2m~|B+7^m30MqFOms}${?GJwNkL^`8$yowf@^JUEiIjy?t**?W~}}t8@y+> z1;a&fU&u(SNf2Df%vV4B{<^mD%#Hx0O5l^`<9_6jvc}`~0sdf9%vSVUhdh72MoY^b zFcdHa)t~eEPM}(Por)?tFi^|Dq+U==?Bbpc$Zi~revOpynkfP=Yzw+SVc+S)L%0K} z{33A>JJBGh35M#gK?147cfLK$x6sZ`i$fmxKQ$;8AvjJ*nY*kDS|Q5405bK8}bJ*u!~X8M=NlyS&E^ zlydk(fgdDTh|QGmR|)9rKfl&zm0`mspfQ8~mVz>FdaWpGCg-%%feUk<%c5zqrV^Or z;_8*)%gCCSh`v>XS31(hX-&yci1rG-M7lJ;a3Feh9&0h?Yd;kjY|j9~$y%YvDuhGs zA$Na{mgtKY?ZEjXAt5nu$k=qyNa~Xa=mu6+-^c>kAsQ4BcsE%6Ef%cO(G*qdOn~e* zmKGN;UCb=gA5T}#i(s_posD4ve-rEdsXX?Cg9Crol0oto0sanr_9dy4<#YwFU;!*MB`L-o zeH(WB@&;q-n;Tt73!Ca|_?-M7({xk~E)uG;3-??uS&Da{jH8#ejGsuk(*u=AyWrqp zFC|(ol$hrq5k)L7gwNUIrPCROH2(oQl7{UTD;K#Am6;uSS~=~gO$i!!&JFM6@8l2M z^LoZT8Z&;TDN6pi((5`utkT>a7QvXM#XNqPsbDQTywqSIP;1^bmU}&b`bE+Ghb%Ig z1Po26$HV~DocUc`dRL4br>6$<9pc=;X_+Tws!V@F&6_%|o?Oa5+n3~F3+May!ck@{I8>1$i~d?F zmOq7#!$q*w(+?Iy%rXhfGv)#>=lM4X%lxRQi9wNE4m?kAVM;oQ6WgqgWsq&Mp2@B@d# z))J`m&}LQi%HgD*%et$dblin!oaqwvNQlO$bW;tHtbQ_H7uOA@V?+B{PT8lE@0GX( 
zSVbC18PaYj;v@Q+m5z8TEcC6;W<0?yE6H-b6DNk)g}ZzWgy&NWwqeuU=r~$!${=N> zk7I_rz97KhkbmNb`_eGIT3QL0@_HAv@t2S%k!oC=$mz8=FoZ}nTvJ%*st!%IaO#_^ zXF>RJ@M&WiA(ll#BuY|w4UhC|EzzO(wC{lJox+%o2TL?w8kn8K!SjJK@4T9$s8GR0 z>nF3Lhs$9CX1Tl}-KYo7LY%nAE^qmA?9F@LEy*m>JCqHvq76)A72_(_yN=%XCb`Uy z3c%<-5xwRAq;Y?9<(xeaTdAj$N@H@xLC`;a%!E4Ny0nj8MPo~9D$>5=#Ml>|6 z1)LyuwSySH43;8$(5dvdWczp+b$_;Vzh^CK@|d0acD-2@_F3hPLygHAytrhd2`{2x zHRZY?UZV*XycFLzizfbRSFOtXm|QlMM+l`Iqs&oh^eL? zX^_ll*(p2RibGzN$aGv8L2B?jZTf~3XyStQjcF%|gXTPT)i@ZglwFY@4+oqO&VANF^+dg;B&`v)l6nA{K@#?$P z4ucpdy^=NSS>tC!)*R$j0FitMftM!IWnI1j1G0 zFj3BQTe^{rP{q~!MF%Zw>&=3QI0Gx@KBW{}@lO{<9HZ4rrj7mPlaa$phHvVv?b{qh zh^aRKvYVzLm6jX7ylsWAR%#0{gBbNbXt`;?5t`-YV++5&H{} z{JdF0Q@rt3ruG7dS?f*++O+8&u9R3aTMYjvr+A5E<5~JdW&Z^2B5=?ORY?|!)Gg{X zJ?gT)mzqc3P>t59dD8LXwtX{+BloS-<&_Jjys|324!|2Xx4=-rVs!kojHvZmVnY1_ z!v+)l3Wu-%gF93gkh^1NX9uhYV~?eix-!(udK*YC{Wj!K!t}he#cp{#LlpCCsYP>+ zuI5cW_P?6P$%IG09~m3V*XoI6WE_CoTR$>Lj-Ajd<@X|=vtlQtI?B3w-A^7aCnu-% zC>;L+c7~jWhUp~k3GB~OwrP$h%Tx?a%I((?7|v1&Iwma?KZ zHSQ2pIXgGE-LBq@j*bqffVkN|{LM$Hh=qj(;BO|NY&76WOig@VM$6|Fd;1$GXK8{uhRG??T26FI7nN@Ydjj19R-C>)mtPE=kK&E zk5+_R8Gy=&BNzdQ-``a4YAq;RFxb)beHohKnNLC3Uk8TzUw z<_Ykb0Ya42{8ot8|Ga^p;07Yx^fqSR=9RdTFIuq|8QXbA`|8CP-rb{st&n{{b}VkK zt${Pl9o*|q4sO8TY_BXF3?NO9nm+d%-t;zJ_NC09b5Zqmjd0lB%%Ex%Y{0~ZT#*Fq zGzL1kKovfnT9XczLDH)KK_oU`J@jqRPEAR9GutUL{%;%V{Vv6Tx!3^)_&Ch8g-6Uo z3HW(@j?Ft}a=#CSylAP#b>i$o0XURz@p>w@OoIxy_#M0@@0G=FWXJ!D<{J4+bJdUI ziS&I{c6_wofPf;MiZU~$U+5j6{E`5(2I%v@FuE9KZHd9$DL6EaduPQ=)u=HA@#g-# z(4uNa9ANxvhk%mcYHCHxz(MMxtgLKYArGh$pLFVJK$W;0c5K-@nV7H~+O&M*1oiZ0 zI&fhoBqpZ1%hBbT6kY@_Bne4LZM_F#d%(*=N=9}*34-7Dr|5gS_5~{iQgzH1$kdOA zwzHG7B%Aq8LVjk)d%fYodj>YW8QdY*@tQ zgzxiAglT>;3h?8bF*|4dtK70FcX&sR$YqwuI;k~u9M~i(OW%#VEO#N6j{vuc{g0Y( z463c?4V(3nY8zc!w~@$sD_`(QkoKy&mavxPT#k=OhQB%^%kKwA7AlkI@jIjc1r6M7HgmMU_B#)&q8#*@X%Z!W+d$&hbR|wJ_qx^&M6%uAb)}b1&@dc)qCu0ePRWFK>g{iSAF zPHInuY-F~i5)%Q&`lY58rD>UR_dr2N|3mIOwfEi3*CC)%h2c@I;&w}}ZqZZW4653< zkuMq~Cm`+5Cu0-I(%Z^RiMC{4m#NUCfGgx7=pCbu`TSDcTbh0(kla%M5E7?2(Wt-y 
zhi6Dxb;=OtHBljyD7W#jzo%p972Btj7`QJ-pVPs20LIw1@+%YBoOFj-=WGP}y6wIz zEs09O7$V)5{pmudZ|yHzd_taCzZ3E&`K&$yxq6lTE{FEcD*@{9y3e2eEKl@KBSqv- z{+`*#naRlvbaZsAtft*J%z^j%A41h_yvcE>WA&_*%KGX#l&8yj6midpCV4!fd8U2$ z1{|`o5y{2jrsZB|_N5Biy80?6$xHms1%c@n|9hp!%7T)BP?HmORj0V8={%TIIb%O~ zhW{0d=OL?wS28bYd8bGmEIw-?_YBBZw?mN1xl;qBrNlZqW|RV>40Z5+NKaNKf1%Yp zsl|N}xZGc8?m~?UtNG61;o;_HNe7g-`aANS%`O?yXV0c)XZ?(XxJH4|TIZMg{a*>J z-lgV{z^mof5^8y`JU1z(7s4@o93^V7*%Gv~ey#^yk{IPqy5PE7dUx#Tn%VM}?Z?U) z^igmgaiV_m8nqhBkhqYok|e%7`&4e=ik){@Dfa%Z9?Mw6 zrR!PyiZ@p`l^3ZeG>v@79e?O(tqp1~9#b09p{L0VHb*VWaAEH!c4?@9VKoF~h&> z>7kr2y_0ij97Mttf0YNkg-D!2_KJto<2Qgzb$m_NQlA>Hc|0pES2Bh89kOe*OE-<# z${VV0`y)4oF|Fdw<7U1!ng9`e43=;N>!0hqwP5TzKDxeMG)I)WwhBo_J+&n-;Ai@+ zTKx(2jpV;*b5WyUj5)9&ULox{g+=@DF!{}!#qY+Sk{u3%Zc}M#X&amENa!f=v$kZi z0|U)(U={faboaU8h5dc~@pGEar9VHj87W-^UpzV*%_M|w10tjE%`MB_88^@C_7s)W zcaGJ~nua_+Q|nqPg(!6sy0PGDaSqt1T(%fsa)8n7`p799xEw{aHZC9V`w{F?aQ$e2 zA@tKV3z>a>39vol|3s^CjnyddIYY&)%+%D_dE5U5SoJSe(4|C2M`!gdBAt(&lBUtW zMM|K4?k^;AgxAP)(e&#?)Qw#vqGXp7s>9vazpiVdAHpn!-zCI++2a<~&nFH|J%`R? 
zA@^~#X$0Lp8RC@@J!M4=j9B~Lz)je`EW_Ek~(O}l6RP#AC0!`UZ3(+g^73py3n zOZczumZt2n`VnqG9-*6i-zk41{lFrN??77`;MIY&2)@LKYbg_agS?3%Z|&Lp#Ystc zo*6Qxwl-Qdn5i3xRNB|BU0bz8>T?Li_q}n85DoAo$c*UZs-A_3LKZ^;Sx~J zgsBJbQAA2Z(VGT2B#b~u$XZxfNbZWc@ez5;K^5~zM2G&^S+^)jdDCGB$!jqW^}%D< zCfo0tt=b-AzOMdP5*qBtxxs|0OkXAk^e=* zsc*=1+i+kiSw~vmD~2GH!?{44)l_C>3k}>wFQ}QpMsNS?D?4(WdA&&K)KJf<% z!?|1$|LD+r)Ub<$AU#i*L~*VKyS+kWzay($n?G8Lc8#Q{+;Dz|?cCp^WTA7K&v|>% zg4gfxDkk1p0MY;DCYl-cuNq5plh_&%cYqY4vJGTsYVe6wp0D^BaiAT{n2KNjcts0L zu7F3p%iJQ`bNE1Ju{rDE*H%y%*rn|DvZlL@Vw~$Gir8V_*n#cu&WHNy-DN*k2E8Gz z2jkTiD@rSwbq{Bf%bm`J5NR*ud~?yuhVQ5>Ym{=Ag8T1z_v%ZCJ?SE)z*zl-I4?A4 zhXcu}3<+LDY{v5YkM3%%sP^TwesLy4jk&W-upPU)mT1c(DU z1!S0JCfmWSikwm&v`T9S2T!n3`YT@AXRfA|GMU2`g$S!|GgIb*`%}P?#-BIrH!NTi z1l4~$(r%cYWHi=U2QWogBOuTuhqnA2EULd=>CC_VCGV!Og!MeXJ~)=cSayfLR=;$p zZ;WB&ptGHZfkA(G+7qsi~0zuSBZ~9Ofh`*$fOuV%*j|3Wo#&a6kEM z{-zic?kzVV|*(3?~zx{Fmz*;5R{R?$u8J!j~rNncUNWE$X<+C?yL{JHG|t7rOz+Ce{OMn z77Od$%~N>3s^`F0@x8spgvxa@Y~~`+pNJ1W3u-R$3;%m}ouWyV=((n>K}eQokMmsSkgnSW zdgjvWnZT*gc>3D?=M=GJo_YR1*djq`hYj?hC5jPn8kZp!;-42wXbuvmp(E~&e38ujd1vwu6_Vx8i0U<0`vfZ*Ur>Pa;+ zth|i?WXsfSR!A?Viosm%=flz=NS4#PsqP!IY?SDArX0SK?lVBwT`UY8GL`mKW!N!0 zkM*AV6i~%f_L#5il2mj{Ynb#X|>n&>r^2@ z6(x>#%qE9=0!=I|BXCPS-(`lV9GNX{U#5HaK6`q4dU=s;39FF+_4)qKj+R+hnk0UK z9jUzz`v-ll+L$NDI`!nh=z<}TZfhMyE-YvP3vfe=dPvw_`bYPra126y`BLUmthHn! 
zU8Je7tgPen=e{2c-ADO=hvKuB*ZJ!%uCC8ZTA~&HMCi>ghi?rjEfX0SjX$NYdx3#> zIB1FsKA`)dTamU$B!`D^#Ck}`%aq2s=S<*KB@Px=+u7gTPaK#N)UWm;J1uo#xwx1C zcu2W=lBxuQV|XlQcn5oXJ9=%NJXPh|jr{hFG;hH8{!RVJ^>yGJy{QNf1YV1vAX)}q zo28ovzBGdtX*XH{f_2n?;pgL249?u%=#Z&MA|=-QQ#L(}zbO}V<6WqqhUz=|RJ}8^ z+f$WY7=rY){bJ>1DLaH!3$wJWQ=|R;U%Y(zX76dW`EWxZeS@HqlG3xbCs|6<_Z zFs|R_bwECsmk(0&f`js_+IF9UpE(&}_;zM5FA*^cvy@#r`-u|kNAc-s1xNzS$@i~A zwKpSDW3Dv?0QEs+5en#r3sS2~446_W;LDdckHY5>OG`^WWF8{mvwA1MRafMvK1Zfn zNp4Qg%cL$)m@wOau6nIhZfg!~!(%R7NBPunIRyohYu7eAM5(w1nh1e}0NhM($kFRd zYnC^f{GxJ6yWX~wnLwQieIu-Tw0nH^b=zk^-tfq*cll;!P2)F zKT@qctDM>P*2(|q+D~wteoMiL#lgByc{;iPD~HCJg$7R}`I^L; z1)TGRO#7#;21$#hehS{D7#H{Zz(Z;4tsGIxJ-0^bk~oZsbZ6lX4nBTafeI%}{D1tL zzcA|8pdjGCiUEoLu8ob21Y}zd#)PB21G!8V>)}u83~e^y60qtsk**ZL!rD0<%P^)G zAGL`aoL?UoW|fTEVA2TBZR()LNLZ#p_I|43i{!qqF`jd;ol&3^78wDySuTJ=^^lc9 zo}L1V_$&~6>5QnScv^2syhUL;J?*KfYQ~peN#UZ!8Zhsr9@V=qQ(&kJO860 zK){~3R!JV$EWCgd6A`_A6yAVrY!nZE?$Mi)nu?2u*VNhyUYCwnc6|%S&CAS|d7;Q0 zZBXEX4ijKV>NIOr0(LFj%t`3Y!miEupGY6fIZ!&Cz$U?Wr4~c~%eOpKtoHkcQK^U@ zUGr=QqNEY01K=+9knGnUxA&-7i#=0_9l1)ty_TE~~HOyV5Q{IpL=+TwJYXP>kYEZQ$gY{jlqaQ=e4nerH+*+EC4fq7gsKSl0==rr1R>sh#cHzD} zM}JSu`c@LfqETiN#h6e}Ko1osqrF?vBX4rGw2>jC{pbfSd`y%vceX0K+4L%Ua*$1} zSl>A_`szU|7%&6$7=VF&C!Y!l&7X$B6HxGOfjv?1C{s1Q16axe0s_`xxarYxFcOif^BfQR`69M(u`7f%}=CPMP>eBkJ+GcHLhWWw2xX?A5)1qAuv!`W)%`=Vss8J z#7-ONN&&GscmBM|NNEA6xDqFGnzPb7^g-$aqX7Q!6xJ zU_jNMj`7=gr#&_4HOwkW`~9#&aj)y07bC&0ZN(l6(+FE%_2Rha;f$Ndo$r5sX0=aV zIkEdvG0FfmKFE_iw<#B@!4Ae}G58Xori|tFl5`1Gp->j`IlcpYf#JfxbuEzw2{p#W z#-7`cTiikq4yvG+zi}FY!g)>^Ol=UCl9IPI2F0pr@nD&AUmCs90}gn)+s? zY4b<#5;p1Vuiho#(YbGYS4+eNzFi=d5?#KOI0TSj?#IxresAQGfeV;94PrcaQdv!Gn68tff7lF|LOm z!lfe)3-`B0!AJ1^Fi=Mzg8<^RIhy!{Vf)1_Qe0pH^N3up7-3OoPhszD}Tf^;4Z z)L1asbm{Z`>PKsVSfFr4zP!`84r#^602nZouusvr*DJB_8-YF)Fk~j)d_6*f^#tn_ zY(CAIkldybvvnNlb__vy>`{<8=^7O?=S^KS{$kkA)DfiS4G2rFiz#J9{l2f&dz^!! 
z()DT~x|5z*eLz`2uvC?K+_aq&ui^t=8DJJ9+#WiMb^ot#qV?-_`1b~@%q1tU>(ai* zDUCyV*=1fAChwOB?Ld;m-KYj@df%+E&tyCPWKQ;wpJXdv2Ql~hHF1Jcf+M$UDkFUd z_KH*(QbQKDj&GFPdh`5#mLDh>xSlA(aAL?au`rl|9bR)qqTTf!CD!TZ{4M?9-V<}; zCh=I!2K>nAj4~kn#jG#m6^)Zcue2R;_YaIijzznxv(s{>8XcYQ?l#1ex$;H2nB6+y zRL;++T1i_Psv4!tVdP2m@iJQj-`Z`uIA2?ZAD)!iiQ((YtZHP+! zMQb7AZMEC!vwCGI-Yz-*W}z8QAZ_@jE6q~q5FdZuq>uq8)*orO=(=cJGN?JYYEL8`6Wp}y!A zpNgoq;0KRkYjEab=Hzj@)ETa9ufD(m(V)9 zZcM2GeCFhRM~zJ9n@bMXL)!e|veNm`RI`Vi^IL79Efg1ODw1TP&5X;x_qC;EjF`&& z$8NuR1un!5TM2gpU9UN_LV|k-%3TJJ{e~&dVLiq^1rG0@cqt^gTwYK>L6bH03Wu(ZIvGH@-G4DGmn{$zS18o+XFm5;T;Ex|~fEx(5 zUAdWr{?#7vO1^^P#tqt%j2NgNNMI8aZw1}P#$L+;3r9~+k3;E z$`JT79TOl5kWM=?^;M7SP!~gg^@h2-kI2fdD*e-i4>r)IAuy^_Q&V?zbmW`yL}nHi z+BDw?J_pX(s`g1>>wk#DZZmlHhx+MN0R%w#+3rB%zEoSX*Cl|sFntetq7Vf)t}pK$ zqL4;Li?0`x?s9OTQ($k#9SjV(OG*|WvPTyrVJMs5*!H!qas$<-zc?I@jCZz-k|h$pFZ;=@-*7} z4_)zN4@1=ZeYXFMaoT7)u+VUGYcwjyMS*_7CCv&RReKAaAn^4SV4nsE=@kjr+oqx}t1P>P%%p0&-pK7q2ZT&(-PyYvdf3}vyB;hCWaKZ5T@s%@J zYUHQ(Hx`@*Op`KwueO*68VsbuKTAmL{Hn`A-7Lc1qi2zXad3slrQO{J^t=+e6fjt* zsE}Io0Q%ot4L}4dD-QKeorhudjHEs@tK&6O=~Giv0PX+$`4fzev1bnq2uMa!_W)67 znyEd4+j4MlFq!O52%i(0gS)@bL=}vxsE~PM^-DcTi!iI$L`M;3s8HU0ec{Rcsi1cL z5B>yHwXbSa)mYR^MJ94nUP^+`jx#$<`-*#l1=N7wlq^SlV5pDtR*&L5=U#yRHNt9*6Is#Mq*ivLypV?y?0vV>#q4Wh zHGJNyv=hH*{afsbxgQ}@TA&>V(EvOg{?UOO2!H?YTZ?i^5+&T^cUA+sKE-28@G*E?R#?(L_nZe>R&TqGu?ys8zC(C+K<_d*F zgjaV8>?A6|1jiY+MfE-bLrb^!4{#Cm?(QyV3z0AqB6#}rX?>AVU#3oqHTIfW&3S`Y0Uos%9_9%j;PE^ShTN zKQK-G|Lm2+yYUAZJ%Q<41bj@t3tJD=s1j^uU2AxTdGRm35TTJy_Zl3qCxWMU#UG5G zb^~1NA?FvkY_*Loen|vq(hO**0)GJ5l?qKqR8&;NyVjIu6nphK>gwvu&CQq6!Q2x> zZ7rCM`r)fwl2#9Dbz^^Zi1vo#azUSNYs4NP62N9!lOaFe2W%MV*cQP7kJ|_WpfDf| zPL}%*R`AIm0}G;O?vq?2_4%2e&RjTpPe34AWZHinNYzzGbIb+=d#4>M9T{2Y@Upg- zhk(DHL99S)AamiTT9b5%K-79W==TIgBzR2E*D9$O$DnEym+$^b>0scl4x$VPVP3fP zxd#_C!fby|y~+lexc+nYm%k)32fGRTeJ+SW7zKmprr3!+1Cu=d%i&_0@O;mLJ)Q90 zAmt8d^|8M1k7*R!MP$a}sC$f3u z|MXJ$-M#kOYpd$fiHXHd2-@V-x0~^Md@#2*DxoSh*XYtKRYm zq1>F)yRwcMKkw0@pRT+4;P3ccuEaLW5%JK7-|VeMo0VK8D4vn`bi1*#v|y2r5K^l{ 
z;Q#ONoB(@lkyo>sRS22tPSP(k0rxThofj(Z$*n*H>Ztj1yrV=*k73hD%44$5ScM8@ z1^}UI1{FO0Tz767x%7 z&&WmKOTel}gam0G-*XaNM~76mW0PhdRBIppTHkbr4B=LwIN&!p==7J`?qSU<(wxax z-l^tX8oq+$5XM23|6#V~=}1*Y=ndndr9*`F#?dCz_c}3TQ9dY^my^wRS|0&=X8NPs z+%Zf1fV~IhmeV&R(gSl4`?5ODTAIf?E58b6S1C}a0P7wAn6_R}aD7%+qTSrH>1-$? zk_GC9fB>ayA({y>i}{ZrIx!<&p`P{Yc?cbGGG{udbXWD3@P}2-r%4CW7h>pi%FQ_( zM~AKG8uFE2i=2>kyeMfhBitSP?U+<~@QuHy7ucu!FsmssPf5wPu)Wy|AJ5NndUn4zC1p)qB7p5qONi5y3eo`L-#BbA z=;v6w)G)(&7z~qZ0JV1i;z79>DLXu8R`bnytU?pmNe=dpKyK_JN%5Bd{|=D_GeRRY zqUy?}Ks9J;BB{w=`1l+ff-3RmDNr`$azM1dzih$9K~bZBZ$w?o68eVXe^+Df6bsQ} zVq!*kgB+WBwOd*JFyZgYz2^Oy<6je$u0k`BI_>W+$GMv5UYTzP%`4H&mv94b-%y$51>wFF%c09UwFV#yzqkt ziUSk%2jtuJv!sNo&(e9(s>8~FHqsztMFS|ohRWC(d7Wnj-(^W}119qSTd}G28|JJp zU$t`j&ujtA<;6-0AY z%-=zOG!b3O4rV_ioRcN{XXdiuKind;r6Y|M&tsLbPU8(G3=E9wBimL`5I_;QZPr(* zqzwI>XVc*)C-9f41Nz&qEoai}{|7P4?WRx$NkI7k6u|Wv)An2mC+f(|Js=UJzSk8% ztye~hGJW~)Jsa9SSV=T0rCB_l!K&!)YSYhad+kgA(L|K`%xeDflywceVI!OI1IS5) z{C`qSs1p|By^X)`n=JnFkzgi;$7DEoGIA1RZ95&-*uK1vWe}@o@9Ax3H`(e0#W3Z{ zpM=>$Nd?Sdf1LyVKB^GdLDR;Lo)WF9;mM*D(84iaR zsagC-6wcsRD*j*63ByiTfF27D^v@}ZI$Q{3ln35%0_GQNDagoRDMTOh63PyMxMy&{ zfLd(4+9MHeUsRE1mtgeDZL7vmqEk<5tt-t@b7sUNj$&akK9<)?qe4Qf_?FMX)5-_q z&tlts;M!Kg?ABS->-@zw|4EW@&WYTf{NJ1i$C|HZU@|=KUB0exl)Zj#O7}AyyycFE zu9Dg>s#>9;9a~o^D)p5uUuK*!0PQs*FMqlmR}KyRI{L;S8JCs8uX{+VAHVeWDZYOR zrx!@SeZLLW9HHJ0n}$2Nlel{Cu&iTZ`R#EE~FUexP#g zOhMV~V2r5TX>C7N$-YwSM0Z$7EC%x8y&*O?HHA!19IakhnUy4rSozdwe*Ic7xv`(M zeNyr~YCESSP}|BxN4nPH(Ik1uw2QjHoBz$tbbwq&(NL8=OZXqE4ldDGhFVl`pL1E8 zK}{Ga33}Std?X7=*L*foHZqo0{ZqwbV@4HL47sBUvouGNzLIL{>Z((eO1=lN!0)19 z@cD_7l2ZQmi$ajPQv{hlM@L6coS^>ZspZ!?9i6CX3lQN&4t!!WIz7l>l$YCaJ2~}sLt5|;`xQgTDhTNUjJgDa zov+ALRC6mSi)zDpKRs*y%@r~KxWE-9llo|+0IGA1Iay$UwR0IaRbw?MJ4dSD`g#M&rFBVX-7n?MwV)pYyPi{$mKx$8hs85 zz&!7T$6^8|0~|7`Tn+43>%eB{3}o8k(sySxyQ)`+;`mP9t*&;-Pg?##V!JZC%xRN8 z{&4rzoGAAdwpa=8x4i{!UbmLCG;4*lgiNcqY?h1ImC;?$zbl7X#tAqyJ|9nmIXdjQ z!9?cMf^t7x&B(~Ec0c`QIL$!umSeVEM^gA{y@!{63i=cDPx!^1+B4ks9z*(*U*2ek 
z9c5>v@GT^Zc~7{eU@2t`mNFJHTAWrobrTi1rN~U3tm7W{&hQOkc-#xv)+-#I4v{n# zIBLJH+Y){yk#7|XsjM_OQ~Ri@S}7{HJs0j``N7cKGw>7GIQqhlm%o*lFF!H`f&xGt zmMdjMtR>O!0~-M@Kn&!DEB$-nI53OJ6wJSjNCiPR8Yn|7$X;+ivCRUC93<}qFG9Ia zx5j@43quTV%3<8sW2H=2V1`@m_qu^e(5|ujnv~tPRmeg~Lf^I5YUB>7QqoRW zkv4p`H>A#zg^PQ-m^y3xl?Ka0m3-6{Zd*4bCG$fspm|474%c`1{jlC(=WW=7Wygh$ zds&khJ8@=|WLx~q{4yXr)m*wIex?R%-{pS?lm%Rbch!;f5;$?fpM~@L`jD0_wKV_F zmM!%|y|~jyKlsN-&sD$EQ##;5!UGjcXB#CShaTK=!xXXw6SK=CmG*ZvtDZ*MJ7Y)@ ztI$xI1gAMqYQLEiCS~W;+j1LrpDUWy3wTp#yT2(pgpq>r1ieeJn7Q8VW8ml5sDtm% z;bvWS$5v5|6a^8Ip7riHG$&e1ZN)r6ppX`q52%l%&6o3YdX|zHeF}$(zvp1Ikfp|KVsB@%X8r zw~l!KY|`AEUf2L18#qjd{4Rq}zmsIgnb#{yR{u#fJ#WJ6rH^Xs=t&B0o9&Xx~Uq_952eIrcr3ELp+M-tP zPh4nqF>X!SaGnnfq&+D3qpWmmPpU=H!;rkpdg)ZsOD|329}YUzV@DDt!()Hpe&P z2JX9`sGA@B@I5i`+)WfTat^s-*4F?F`sGUN4v8Ypq+k%e6g^=&;eVUpqD7wE?Fi?{ zJWoCpLV$&l*f)q7&Hq{KbDZtygEbu30}4{7$8D&ZZ=Y0~vY4&%*)_uZ;qv*ilC)Qs z5<9O*Dec}YA=~)W_@-wz>19`PT%RTUov<6tW3?K`>M$45jIor-;gbcL&~x0XomAGImOQj;h3IV&R_LbUHOP(TIwZP0XFSv11)N!wdP zUP**r6d&V6aY9^9MJ1a1<&1LXUF`XqU0D-aV?$yBK_xO7)zxH%f&AFsJdG!}#Int& z<8tHaX5)AqE0{JiTC{zcxAQMA7Z_kH7mR?#xZrq{WPj~D+N-L+eqpw!{eJ48TW)ei zTG&gaxa&*Ya?50HR=3%20xUQw-&-=HzM(krZ34X6^a#EPrR4eH)whVa3dcFD3OwPG zyb1Nz+edhdm?fuX=}uxqFyS@)GOJI?B8%pwM^Y`yRav82)mkuiR4-C9V+DfV=SO4@ zOzke0eHPbs#bvxEe^l_Sk})vHjmtVK-U4^7^)ETsTV$!Kw#E@O$gJPAcXzyIW`AD! 
z{p%2ld{<BlsvAiU!yde^VyE zi9Ep3Gt%kX0T>W#ITyn_Iin9C*5F`=#mL9|^2qucP!mc|)^V;X)7*U{+{y>uOt*=Bn-L1i)avE- zxV+XMfW|lNesmkCbnLss##S+aE+dH!b?q zLxTbGQJ$WjN=dd{fHe;b>mC>olc9%P#l_{kj#C^ShISsVu8O+O7dbpdcv#O6GdVjr zxS1Y}B5w@kOBZ;rzZUUKg?VeqcUUb`*KB(kq3Zzsd1w9$nt6Us_Lme<(aWoZ%l!JgT+rP*uPC$Vt!q3;)B8VY}&qxhU%w)@R z<>v4ztH5$JWbDU&jHk|CWTEb8x-^|m!~GLjMhnwmq{y6Fd_fAg|99B8&1UYAn?v&Z}3nv?CE<)tN{fPQrD-WEhjH=rg7xn~033>E+y zEabOGUHo>yxH?@)Wn|1|z(7+T%7)3_JbrQudi9vah#&vNxyKzz1_O^XP%yo@QEX&r zk(TZY2-Q9H;wle0N^?LPUxBAEZ108F7iBiSW->B=|n6TU8u-SyWn zJN>Gj?t@R&TTwO@3B$)4FXlbG`L7^&9Sv z(Q&MI{jVQw%Zj8goEW7I5@ggX{M1qQZS9YD%*-i^b9{xbC@Bw90~MS81*Ek{I0s8#M#YOkGw(5;k?{egAC+8>j3q9r$DoLGD(}=^J(L~edy)h%hXr6 zUwj@Xc(U9{F9d}7cI|{L=VZ0q2L;N0lBy1&_IS?Y8Z8YHDhg(Rp{(zIjJS(@tufFr z|7)2Z?hBesHBk82Ei00YDICLiUZr;&clc@fQ2oSXryZSj>F)Tc@WZFFkEocH8!>5qq5w;fXpf2s`M0=^tWs`W}yPm42y>;GQ<&^6zcP)B&7xBr^tIQ;u)*8`w*9m^p zP9xt<$E|_I@H#p%@#PbYYL}gUE1i$zbCbJk&NVxY%Rj8z5i#tq8DaBP74xE8Zg`KY+B_{lG%X>>fU>1??!KA|pl59K8VnXf;;J%f%HjUSvL)k4bb3nU~XK zOzBP>&wC#8{3jm}gWJs&x@QZz?&L;>`cXxma}WhX<0?{*u-Be~f=QQ7rd%LHSi z?0Om^d8`ork7BiSJfn!M>Qn!?`lhSG?Gp7#4%yUqrYZ{eo(6Q9Sfpm1+GF=KxqIaK zeYR~Vv5poYbJhuR(hfJ2fSmdlxLZ=Ps?ZOvW8zjU8k>t{|Cq(wZbS!yFA?#7#0)UPJ-jQyFVWf zEb5XflybCUV`7*KQ;g=C%`q)#nLU`% z0tWIyTp=-WK~-uGU+AbJGlMfhq_?_Sl&g1Ee-Y)`W)8a?H6Nx>wrg0Vm$$c$RaG7e zbLiAgiwGQZe#cHZgkU^4`1uJZpU^a8Q8r&HN|2kRAWkX6CcVE<8%Sjf$ zh>q*YiU;@M#^NJAzu_A~^i`(91i*yT@Rm+P3=_41Z|WbFqCQ zSQsDfo<^;mHP%&4M@0@N<(B+x1&Q*BR`*~- z^3d!k83m4Vj&ffKN3qlH#;V|FtD5*7s5#Djfpi~r4vF?6HT%R4xk#Eh^O0!mSkHVm zN+2aQn1E%a$L1mpCNA`%vg_Axh=0?!G`*dC-(!$wWT^A~_Z~kcYrzI0G@I#qll$7H zw+CT%&8i|6)9Y-OL2K<8PEK<0qKra9uW zJOMRYG)(Y7#7f3DmZ?45KF8K&(`o+pw>Ag1TYcKcAnQvXwb0yRyF&8Ts1^6g8hsiG zCF$xH2s*nx-^ar%*c48;F~;j)G^ZP3u-(fJef$Zs!_PPTSqO)!2%pYc4{e-mZA6MuV>-rWjSX_YoaIKH!9S9o zcb^o#k)W7jj}s_~QO z3YIG7)dStbvQMugU40JF$EXPjPl+6p*EbwC=Oc48jju{V46UH^Cr*`?8nuMZEgEjbSSlOX1UA^1BOWL-E92tqNN`x1yC%6)*;>8L* zTA{h)YYa&&<5_pAH4YUW>KBUOMM*4|1C54O?Om=UcWkpI-fc3-p}E|SA%P&@bp0*aj~Cdk7wf& 
zt3~F5r`*Z0Pi?k`)hI9zD+Mw3iZ!e-PX>Q+oCJ`F4jK`A#5BSCJ6=-hSG^Fm+>0IZ z>ORJUvPv1jDdDGcv(yv(J97S+lMcrE#CK?HuHwsNGyCCnEb-C?*yG@r5KzW1i=Gat zX%CzI=(xS+Tk?`Y4)654`Sq;8?PC5<9+oQy>%|Y6fg=6V8h@;(QE+=L+nI~}l$&Ck z9i`CJylEggpY?HNbFMOWDq%32q$djqyCA_b^h2G*3D_qs1$6XQ4y6{Yr|l?IO8P3Q z?XVr3^>76u1WZpxOi;v~!C@eLjYPkp$}hU8;Xq8R&EAo%aYMyfmX1v1u5VEbKVf89 z&Z0HFm$9h2)cP#7W6ZR^S*6?Yx27;D#ePEu=w;=&VAits;O41s-f#kVcac-mjk*-y zFL(Uq7w=t>F|+F~>Jt1wlu|I$`q)(=1Ug~i<}~$+Zdt5L0J^CR7z5N?;f&4dwUwXu z!LzfM%#Q=wYZU0I%Sr#FIk5#J3)4#~|4D^n+b|86*8!C=w$0N-X$8kG=%rjPZ{b|0 zey->BVDTz#WL+Gr($yJUDSLqnlza~FPg7xEgiBe{4~Oh+3OqTxY_oWE+{8M+3=cML zE(edNT1*9tX14EA$LR+89p;w6yE1yjUbd0U9_W+fPN)CfU5BXbwEY|zIg`uBq&96dBi#9);)NAbgl3Z7fVQM>x;k1S9XIsw(_jJqEN_t|a1smzNqbvOk!S z)BZLPVpQ*YFfT96%?;h0YYW9XKHN3x&nlFam*40V!8bH8U{=j<$h)N8taC3@y(AmB z+5)E{0RaJC-oXRj<@PJ*7F&|79qy+iKEdtEqXHg|KRP}6BZb~3b$gnw?d1~>fER9L zeea2$%)JOkMY)84*+gY}q3^-(TL}<_gqBbHtZPK-9<3OPeM>k%nC4LBWC?36j}Ffl zr+i}$&LLE%qSDici_w#1-8fOqo4S2CIGLxhCHRJ=I&4&u%J&^%MPrI~@TBCkZqcyR z8-ywm_5AS;8$*jsw(v5#XFf1r1k7|#&j{KM9Hfww(u%bfCa&z`vr|G$`Q z-?`}skflTkvu{W#r~o3-F}(6oDBp_(WkBTg?WwJYQA7&AqfWR0C z2QJ4Tj|QWwpe=*z+`Ej9p3j$Q4H^glOIU~N;q}ipW+S!Z1e`a zTaZx03Cj~t(jBj6TOceYA3@xh-Q1=E?RUtHBz;$OLJJs7Wi{{Q4dI z;o@JAa?tIIG0>bHwz#xbN$@}@B5)6H zhjJ_9s{Fm;cW)L}Zr;PQs3g>$B=yI0NeR?6S7>wKzAf5OCQMbMr7ANi7D1;=UA#_> z`yp0zVdl2I&0T8cQ?6gAu;ytb;IBBsqbU{$V%at^Qj%!x2Dh$xWus>@%+m7bk0_Io zMG$X&`TF&xkk)TBQKJ(z!efqyS=K&_D5KWI7ekB=i*vfYwrQr<$XY-7P}$NYBsbw7qTfYa;(p zxL8pBR1dLs_Hp{Bb_fDs=+ht=4JPPOoS<^u@%Whqk>)z(&k`AfS3KN-v*@e=MvLT2 z{01kt`-siHtRgrAPmL3vfp`B9Z)X=9LFez7Cq#%hNV_bfX^uin{bY38!x&||k!rd$ zO3yGZ$(K?2>#*}X!o#b_pAQOL_t806q4b9}3zE6{_kR^@XW1`Wm9gLb>hyw!wx*E| z4TTT{EyM=SFNJzY-SrVzZZqE*GcwF%;zms1?abfCl7(h%@vyKas=$Z~n{xNo3=E^+xRv-ln2E&T;A4A=GA za<)V}xR+3C2&6zPb~wvOCGEUoloi;&qUJe2<0jw%s?zN(OCYWY<=e5SXxl zO|Wx>3i)lvSn;sQFaedgoVgdMD{Ms*i55qwgSlg>X?f#DUaY9w0`RQY$3gQxF}W zpSNVL)nQOyzI?fIe=Ztm%|8S+Kd7;_wqBcSi`LiI-`9jkuteX}1%8!e0}i%)kOPPf zjM|G+2s92Q({>9Pt+pB$8j5e+JS@F*MmwS1co=FEbrxFJ 
z^aD?bF*~u0{Pd%Rb-08))jqh}ffJU(t3X(D*UGgW*6py+u8oZVb73yYo)DrVX+1p+ zOUv>$_v3}Z*Ldam0&S)jC<-y@vMF*qu)`Kd=i@!9Gc(_RpxEXmbg2VOhD5f2Oac$Sy4G^IF1inYz1u{>@jSb3~y5<5a%)@ z6=}8@MB?DOF$T5&$OnC|Z=kiJ-V*U+<(kTp*|jrH0dL_JQ!&XbG34`ub}5Vi6gW}? zO9#Cd!U1dmUV>ZV(x}EIy|kQ1=a@F(&=0>C3;S1%*{O8ZSI(Fje{*~3hJA$cKdDl=qXT-=h06MM%Uv6t^J&@t%7eEICM5*a^ z)^^%ixPCq^YtkvZZnR@pD=iMex$H?K_=ddto6({GT5h^63eGX8#to*CQm4=GJ;*_T zTDC$-kCJ~5w>Dq5Q@(;VXlF$-L0pq00U{&v-=F*h31Pn@#wXdv)j9Mxs;Gp#NKqs& z%y;S!;7{+fALQOsp`EMDCcKm1=$5v)MZo%~@)+%yD74DWB_QeL-fGI%g4zD+qc5TZ z#x)pUF@Mb^C(p@u5a?GOp{hkktGx^Fm)I@riFVe`a|dQy=aHVE^&4swrP_R?oclZEIelQ#t(tB+b9=;k z&DzG|ow6~f%$Wo3!?Yh3()DEgRx!eJuDcCdN%~hpbl0mGtNXw2$euQ#hdzm!lQde| z9oG;pO#3W4_kM6O6HobR7H-jFHerf^z_*OdVTj6gNo8B{pTwEMfOqR>38*g|Xh~YRim|_VDmX_^yv~GZY zbysw*Tz#chw}PJi@UxQ^Y-*B0MtTd7CZ(7WNG-PLst&Emjo!W3v0dSxb_3Uyj0F^% zxW(5e$kyj>!9r;50jszyJo;^0VqBZ~bLx$CmXW#6r&J&5#*_`CNQagIH}CjN}I0&3WQeS^MQQ^f1>*yIPq`0w%moGxG|VUcQK zQT>Rl%4Pp26;od3B#nSYcyu%cp1D~zU&rtyxn0AYH^#Oszn(}Bsj7(k{?^U=4HkIR zguw?idtdKqg;A$8(zs6(QS`YUu?=bU^~>+X7bkrAxEMj*dI?;gC`zwTQb~STPRJ)X zy~3%xu#tdTdQ`$s(T_(em}fxrTH}NmA(h>W7pH>}bLpG05EW@_GZB$|oWe;LKSuPa zCf{zoYWt#vSiIqR-ABM%hv^|o57tX=Yfin!up4Amwl8B+L@yWOU6|nT{a9A0fcOTj z$~YU6R5PLD%V@tUY0{2-e%%hIeo7=0~HTNT>bokPZKs1 zttGyd*bohmpI;zo878P zTy`V-oF6_NBYl($-}Acep%MEmd;DL?OUz~Yom7f55GP9NIGzbHFCesR@m*@?Mn{rM zzlR1z1w%vmb7+a0>sv6LPten0%ecT^S=Z?fMzLD*$q!9(K_TjUp z_ujg{IAVn){NFe3H9p_P#7#2()Ys$_@rioDk7<2=X^q$pwDelR$*F#nxy)K`J&@`W zbX_#uwi56z)dqHdTpjPR2EW%L@0lK z^e$Q>ngI5NkKTjeb+mV{HA=5LE1Cjc+@yeU;-#QK+k+cLpd zjZvA;#0?{LPr7*4_&*_kH$Ytpk0wAT(2o&$vw*Wr!)Zhtxww0N<1JwJ)(N!RwSs;m z9qq{-O(UEm-E^pTG5-Cz2j}eUE2(~L_X7r48@B4|g4u|sfqP*J_Q5Dr*C^#^|N6&V z4FQauHM`&^2a)r2cE3hVzw)o29OQO{2ilI|OkEflk=-%pC==y?%%uQ~((C_yT+zQX z9^&h7ymwBw@YyaO>;jhk71o7o!}pBp?KB zgdU-RcYgp5(9-z>ztWC^TvlW=Qk3vKr|#7eLc#EJ!~u12dGe6qk8QU84~PU+9Xt); zHE?VI(B>N{cMo~raeE!2J@aq|<#EmC#MNq z0x1yutkSt{Kevt>X+!y0l+&%;tk!n$7w#|M?c^7gEw(mC;RPqM*Vt{P6!g2>zKR>p 
zCB`5GbfxBybG1<}Vq%`oJPjXMSEDS|Yj{`ym;u(uX!hah7?^SI+|Cn#%47vDzE5*d z@W8n>3oVVZ4P-=8yAJAA_t+nZO}EIql(1a8MKcIbN#QYVVtdKA82W#UpNdzaKFc(h zS7=p_b3bU}Un`~zAK$+27HKeW4j-X={x0LEPdaq<-%yJlnLyI!=H$jIcCW{AA|RxPjc|&Rg}_-wNe$gp_z-%x{V}m{i+;g`Fzhg%%r0X!vG)?(?^9E^64qqv@y8m>wfuE-0qEf-6KKYv0uILoO&HvjBE3QlNAdLI(M>(VFR$$rS+k& z>C2fT>^&tNv6n*kJ{^QPG@b6B0N$1*kOkytU4s`WH`vgQMYpFxxAV;>uUL8Lb>HVr z=pFl)Qx-`HqPpw^L#}~To6_WYc-*Die*#5+lN_{Uu7fn}OVHba2%U`h{8Eq(0?lFA z+rin!LjTv<4l+H$8*lD$N`Ge+I`xyUKbUx3Q|*ZM=|Uk_PoGD*cLibG$}>0Y5?=cd zj5)tur0>dlJP)9yC%vAALM$!HpJ)H5f6JW5aKFHz!Pzn93T>g_x2PZb{Y;efWL6$7 z#;3mMvQb=$;&->ME|BNhGKZT5u$|^kYmCS5)SLtZB>e`;hR~IKwo5TFtV;$4xX0KL zv|w(2_VJe1F{IX{7mv_1^iPO4ck2w|J$4P4@;1K09SM_1q} z82&Vtc6;xgdhqMAaC2IMyQgwr=D2O3BgBlM;%&k)q%Pa5AX|psf#!W9iGFHPxr8J( zC*5W$hXmGq+~aOhUOL#x-UF>=L+_5ev{j zwZ0G5B=H5hoKw(w;#M^_qKPzMN|%hyTwqYaL372w6g~fb8R!@YZVd_)aF$XoL~Wv+ zFwsyJ6|Vt=U-y>ZYxm1?I6?H2_qjPgT=uP5xn^l$EM{QZKvG&X>F>!rZLjttGMluD zuI2c-)q3ttCRJDS(+&C-_N#3qu}R&~Li_xV`p^u3Js`b*w4^Gm0kRF=0)0 zr8W_(1qPA=Uu?W~H1<93~D z)FWX^q1@()%-eEFt$?;^~c{`dwXr_PI5V z7J*b>Wg@G}DWeGPzhP@3?sggjv0%@v=-m1fJ@E$i%MNxP5wrUXr<<>HH&ikPHUO#W z{b>sB;E`i;Bw8OhbPJ(129i)IRcarcn;$&Io0Xhv)*)Al;8xT`$d4#w@(kH#IfLxD zIrVkqC2D+m-A0W~czrFDkrDaC>0Wr^0Pdb_PhK>4*54nmJVFC6#N80iN<52qv<&ok zlaotN00?f}99^@f%v&#mZRKQb7gaSKy+ieIGIJvfAD|W&dO-QiOuuSI6Z6Q;zO|-X ze+@F`S%Ue?#HZmF{1WK8Nz&l#JOL#Z^`*?Ahxv9bAlcq(0}f5zljs~5p97zq>-tqT zN>Xl-Ah`>$faCX(B9Bo;-r#T66l~huq#DH75Q&%(x`cTP8Sgy4Os~ZPYE#KTa3c!P1^-eF z3qGA=>g$o2dslnH48RTqjQn!;&Ckp;-q9$fHEU-rzzR)11J{?4!1ipNocwoT~H)YinK2Tnd(hkxcMGOEMOLSpvz!q$b*V;G4vA(Wv zY@a8BLwgA3#fk;ZY#RfTroNi{&z@lvDc$UvrZ|k-Sl9iXUTI$34`Jhrw*<;;xi?9X zL=uUtM}GUB@c4@%$WFfK!wxOsb?p5}B%bh}?$;oyn^zp+YmD_kKVvqgX`m}IG% zD{4pEl*ip$OU>tKh8Pk$lCP&8GSdq3v{R9uCsiQhEpYr4X~Ie|^+ce>=4a^X3R)-m zg=0PeF=9hVSE}=Yh2S08Kc3Sn6{nogv(e4o1r$1OVJYS;+J9}ay zXdqzV1GGyLX=YKCP(C}@RBFza^*ipq!Pt8;g1O`up6E6HHzgiA@oCsseqK^Z2hD9u zQ7axhU8@9lr0LTbKa60ETG9&?)H&rhfn|2A#i?2B;@DUIqR8r&GRv zgPi99uPPWLjc&+nW8>Sfuh0GSV9L6a^I2Jj*?U*fMGQ@sK9$)~OG*UT?e8L6$i8-p 
zR04u8M^)Dj zl!RPI?sa^DHyCyNOUVBAprw5QV^vR253ndrNJuzD)BxpMM0j}KhEcpbDG7;Tm1^^j zQTT@&&I^h2{0etn0G2CvQSu9fFj5=^9|hIPswzV{?=InKd1{t9t>@3M_7V5$j^@RO zgwQ>HjLLrllL-{H;(|61dFLXQOJmeWe@2;9iGXsfo*p_0o~0y{Q3)Q`~D1!nofs=-THCZQ8nE~1rf&L4}2iwy-z}b)Rgu3)hisaE85dRmh?k-6*ZO6Ui0{ve@skpK>_sl$UQaH5=1WC3IC0jrW%_9h7IxW zS3q!bgL9gUy{b8(*vNfop@+`Aa2&1sjpJsQyj|M7U!E51QdXN3OH(Vj^wpNyD%tuz zoky;!S3oqFSe&-eu$_b5IGy;vg010``-crxwVw#f2`T{}gTftJW+r^-KLH`r<)AR? zh|^S20j-Wfm!d}!jpkg-_T}ekUhg+=uH&HZSb`o|mX?+;U(R(UOLljG1KV;*&+_6yT6A#G;&gqa*kafd4I}l3n7DXLYb#-JK<}5GM#}u#pXa}a z(DjKp#VkO-e>WeFi2Fcn#PKO9DLx}RThQ6oNK7nY0AcXbQnlRP-DbUlI9vk5!!rWi%lD^32(JX zUqFWOd>QqMa%Lh~o$z3E$LT3didWYwY#06#WR*r#*RR^Y!juJcblq@gW^h^OLFwg9yPx8=xp%aRTMm{}-}Pph9FqkZ|$ zY9onsPD&<98JM_`*p2q- z)9)Jb@1sL@{>ABcys~pZL=%7OK2H!$Mtst3+kxp&jEc>P?b$5F0D9$R_imO{pp;KN zpj=Gsl+5%+rt_wXgYL@s(Y?;R>G(`}$9w@vTTa`69I=4lgJaY0$ zrE5}pdbEcv%bhHso1Fj~NKEFo80cvyh4+0&C@3oW3kHQB@vp=~-Q3(BJ@TA2Zq$VO z6&&Gdic21Drz@sKIDAI{1JRGqi@sPjI804bGv6L~AFcl3Q@M}k7b%s{d!USUAG{=W z6>_&^fj0GFKtVFtC9bAd>WlUBZg|R7owthOF5$UwGWu@?C@2Zw=?(F_^Eu4q+jg9g1Nfb z%?kq#yEekm?|cN6D9)R67DI*p;SPM7CW~#Sj^u$&3kwSY0a$tRc^{ZHx>ZcFM8fq) z5q_eNK`S1aSiasikVqZ%JC;@(%6S`5eVd8@Rvwpvb_yl%E$f(XbM*?nU61$<5zPW+E15o4Dl*z3|{26@Noq{7I9Hk&zy29d)~2=P^8h zt-07h)G|asNxM@M{U37{i}!3EkUhhT_h&^vQ(^FAu(1f6ybnpV1u`2@S9QnMH{n?J zTQK_yXq7k=)_5%Kn>Dx$<$JfQTJ7fS;^JaslV|wM=zjYKcqDl_w&be2B}&6pe7RKC zkND9=lBIXet*ucM3o0eIEWogcc~?5h-HxX)jK*J$=aKEK)l$)1rdLA;(ChN=#4gBQ zWeRus9|!%%gvgPh?^!;CgHr>4z2V`=p7rl|w))$$O7-jqkn(9h=9z%@8Co5zOe9uO zS6WzU@9;&EM0p6fDX0xL!G0|*FXTK&evjk^xi| z+~IVPdeU%BqFFymJ8JZBt2@}o7yoMo*v=Z2Ja=v;bbumyZu^B=aAERo@dM4+;vzP^ zsBs;l6bJxjZL8Dgs+BEQxkW!4241A#4Phy8h)ihZ_s|2kut@rcb6?ZS!kc{$Ax!#n zuBFTl?;1!kArP-Ix5amRI+?{9kMTRctUTVn!pn7QId7J-V1qIbNS`or`w=&o1&<0+ z;_{YBgQ{RgOYCfNcGa(~z_ z1^$V^sWig}YHv>V27m<_~(YEFOvDTaF%3SQG$rmlQ&#HBUt6pD5EpeAyOU!Uw8sjtUNp$*a)=e; zT)hCm0DXt#0U5dpj3BE-yIvFBN!u=HSEl_o`wOo#iKqFAxb6ut+ce#XyA_48y=FQU 
zs0@O*DzwL{iMpr0E3`H6jSm^sGAxM;G{eFVlpjwO)RqI&1r~r`fDfWd3rT2nbcIDdjOGRW6C4-X04j~*tf$+LHe$lk-(9rA~OE}$r`nJpQ7x6IAnFt>* z6z3h@{O8EbX5cLs_B208AC{6~)$&}eIhzjvV;vpsW$vM_2iBZGX8$b4cpnIEv^Hj* zfT+R={m)q$DdUJ`v0^@rj`Vni7!YO1_vxpRd@Lb$<6C_g6ZYr&0+%2^T1b7pMMKm4 z!OxoFzWyDGO?IrK$*gq^UOf-`{c{a?D3oRLu&lv12MVrc_1yg9C&u`0I%dkn0@a{Z zzD;>zM<|LLOQnEdi`+0HObunt4$_b z`=Z@^Qp$7uRcfa-HfNegT!w>h#tDEcjm3Tm<5;}mEGeZr(U4%!jI}-ds}ByjURBU_ z>--{IK!43Z#-)p9=Rit)zjTkz_UF>>m%09D+r+6Ihz5ckqJG5Q4_C=SB zi%NbKnaKi`)TtK8r64eIG<*!p92rWhN2YYCQWPzB2Lrr!MXBw#75BwZ8I(=87?m z>5t=yv8u)D^fuZ~UAfp-_G4d>=eW2qKP2z5^aupXq1G$@5tlB|e4P8SmfymdBp*BU z1b7|cF}N}OhJS9YcOM(t@)@W%Eb71h3-pPTKP*jH%;X#jcxqx;0|ZwyGX^$CK|Hd4 zS1m;e(?u9bEk;U1M#pC7Lkq@3bU1o(Zt&MN74igaJ1CpGIRx$xPoYvmU-}<@3-i$g z2ADR}O@n(mJxFBl3hpWlvBFH73qeRN!rIA^Dn+#QJ5ll;1c|D|@|!~Y#6h=FVP^<@ z?F}&64QpvQp;9#oJVN%y7?@URshOUgC84I)mX+1vRimV%1Iv=?UFx4#d?RG_p*F%T zS8yA8TFv(>Ir7CE?;6-li4OpoFo3;dl!j(^TqK9xC+~b#!(btC30>jOOdMXadPKe0 z$ZPo{ynjF(2xOnR(5g5fCQnc)X-0q*=w@_BINW6k&haNIkXEc#3Rc zdz%Ng#WWDC+hi)c4FEi!gw@xV zjEy-Cdbm&nI$VDBzdhun+9rvKNLFU%c)l4a>jmn2GO2x%5)!^}^>-Vr`S6YPb?@!F znY-Gth2%6eqlvAFha%v0F5i?3@}369B+kSVeP zIxK$Un8+ByK2tF=k*2h&@Hj!B4#vmQ(qI5b=n_5A(j3n9^HaRoFm(egGyzy>!oMBm zfd^1}KG3m`zXyf?P?-B9V%ia12=F6N3|{7M18V1j0{soI^!ndQN@iCDTieH$i`34+ zyoR#1ZW+7K^Lu*)sR~CVE=-CZ<%>YT97gXgZS@;kigQWzJACuyQHlK&ukBc5Oblir zSEKF-C`vC2^_CFK?L!~Ife7Ei0>yEA|5FAgvVKI#J=S(}b3rto4}1RMCrl1HxANw* z3%k%KHz1RSiWG*4N5aB4zH9w#M`b`n&EV^$?;2RHu+vhp>kWGQebkQ=vUu?!hglnFY=KOB^~Spz6MP z<0)bC=@^gp75DuG>AE_CK3%Z=IP_w!TZ3;!x%DMrL!v<`4Q6+M0X2fx34YJwW@3#D|%4E1Qz#kaS& zD~cW!BqUVlHbPfnF6A4=)@YM^{od!c^mmucZi=qw1n`ZdKhU5+=@(aFWCWJX>vOJi zGM!k~ci*AdVwyErnA|)W*)^{1(tlz4)?e_FhYznaBf$SCnhqgQM-wXf(5O-&IkSBaff89kMnhH2*%~`>0#1J^a|-d{Vc4$juNE+sm!>Ol z%BiLrm+0;RdM>AU-UO6@tm-LBsOqcW0;8m>&Hqc|;)AR6_s~7n_I_K01E+W} z5(aiCE~Ods#x`Sbc?-tMeNqB$BeJ^M8*fQjNAzp6FbKP(0KX=Vt55IsONe|h;2vz0 ze{aI6sh`n@eMn?On_7pM+brGaQAx?KO*{;=?vaH!N}vtbV?Ixc<9}RQ1nJUSQkwmS 
ziDlmHH`3HQ-@!B?@Nz54*%|}jxW&ai&_eaRY$9Md@cr=&1d6e}=atE+Eg%UGFh!!urSaPUW^}=j#>E=(!$s{+MBn;Bd3Y z%)`E*KOHd@1@;-DHmbi-=|kDkM-z>9$apsalej+E(|D`zL+4wbhs?vS*e=AO+z-Xt zcU^53hy(Af)Lcanq=9LLPIclhKY2pK%uIjzG^yQB zdz|E5#7+aS)!~dB04CxRK$JAiNA-e3z<4#fCkhIkBZXF)#I_r-OE2FBAU{%;?|2p4 z05(BiN=3fi=v~J5#ogF9Z6^$7k5(27B37!Nf1B#+D4uZ!y9W&J#DE)o+<0XKwGxV( zkx+HVkp6Ef3(9N z8YhI$6*(!X#tN4WUYn8nRxnlqOgoz3p$-63Cazw+`t+mG0=&dGw^z$24|Gm8nSVWv z`46d$PIG4VNIF>D+U)4)NnV#zX^auwqYQr6IJ*#sShRrq$|00%4J4x2%&v-)xLkdN z`zC2Ns->!hbV6$|xzZDDe`^eTuKh)SpAk-Lg7bluDo8z9=h7`#TupGpK>n%rj|26) zB}#k-#B}NNIpt|c@W+p^=d*H?O;pZ+(O+))fk>?zxH1VZ8OxQF^<9{anh|wfq@q~H z%fFCzU1#zqat0Re>0`;prto`GtFRRyx;$Mx^1tMin%4z5Ue2`hE0PlMU;A?b|3W#r zD|4ScS^=w;!_B0sSNPRNlX(sH;bqhh$BcrT9;oO0V|!67oC5Ula|f|)-(MGRc*8y$ z03(7SAFaqDWw+$b=EtF>15OC(!{uSb1*$_Ntmc@_fO9B58Sj@gsmRd+EK}w`mx*LA zjAg^##%4KA*j2-@J3&k^mgB~a2NmTHWAvjcKSo3}O@JXyGyg)Y^4|Unv8q`Qa)R&2 z_RV!lD@NIJ4|cw=UKbXH%2U+gpVUkB7)1~zS~r1%=PfB!yPj#)7UaN%`6P)bhM0(V4Rv6 zjC`tWqi4{>uGMV`8d|Dm-j!XkAgT^`@hquxmyHDBm@JLdyM0o76$%AdU@t0rZ_!+! 
z()il4=!sD&yUUx?$Neu9tim-9(1$nM;SsEZL8J6R%a4B**Xf)5>Q*7YSevB$&p1>@ z)d7a^@nhYS6Q>_XnrgPWxjc!w=QZh&6YgctManAqXLS_fVO@w3Fw?H1YHT~v*DKk_ zhEEMNJ-gcfR_0hAP=MhYMj+E2z&Zq@TUz#B9J_sKJx@fGmUaA%9UV{p8r?$(lmlhz zygBh_^VrhK{u!E0*dbpq!`J182Z~{%YV0I&Rc4Cr|FbgLdd;pM7Z!Jn25CRjEzy~U zY_&olG$5e;koEK*UjvxModJ2?Updz%fXiLBG3vy`qNx#}kUKt{le={966&tQbSQ)L z+pAZ`gkD8`zL2i}*#0$IveRLk zb#n9bXMT`(LXUKuoR(WcVd4+YHXv@@ecslayyWkXPF8~2*9Yi%JRK|1WMNS>Ght7% z6cfYAXs7;c36;7)meKS$X{6WmabVoBUjKzdWXk<;lfr-AzM6Zd9{vCNVK$KZsY*!* zCH4hyu#Z1bXTA&hUn+(m_DdNngbXrg*7mEN!29|EL9v4$B)9foDhhBS{VrFo1}%8} zXZD~8^rwprxVia$?YTk~Gefsu|2Xn)wHpF{R zV+Up&kp@-tNLmEFg({#|G(amy%^dydNs%+z!NLL=&fMQbc;K(Zbz>#pw|hu~#`({PyXj5!V7eUCeu5N#=UaBy-m=7bfb#3xLL3PMl3*DQx3}JbeJBLd0$X5t zYBe1`PhmIGK?EdOqos}t#iO`+zo~DKLSXZp{dXz%ZZAxB{_|%cVJkJzP#B@CGAJQ& zA<%92FW#84*`?-@%)Fi5ccf z!2MHkkXch~&Q+yV&dgST0>bZ@zhjrXP#4fyfJeIuTgu~ecV6wDAe|^G=5&p1^J^ES z6S@TKbXCj+Z(BG->-&aK>?)_~{0WF2Nz;bjQOg zT|-X9O?q?OTTvAL+Y!Fm=kV}#PLE=DE^>~Oz!>(@U)jGAr{-WAd1Z-13uVK*U*7(8&jMb5pwibRCknVV`vTFfdmSWYGAHanF)ARki*eLqhySfPy z-e3mlJV6YwadKT&76M4riq6<9oOaa6#7-ZSTk)E!4xozQQ$M271-XgaeH6kM-NTl( z3cEiJBJiShkMgzOgi_Oh7hwWZX#ppuk3u|t%%|AO3}(%~e}Ccp`AGnhPfkv%rdR@$ zoC<(QCRSjuj5Tlm=mNX5e~p2RoE#e~D=RR%9ApGfsxbp5+*kr|)jq?));4Q7zF7~o z(Md3>k1qN01LIP>7kin27zK0MuWi64VFRdP`wA4Nn5)o1DYg!NvQs@bTPCP>JoBvt zT+7zjQ;g)V#FMv%oIUJs`t#H63MSrgQ3vFkb#bt_RC>F{%5bW(bk(HaikLvnizWuS_kM}Y$M-v#a#GBb{EBO)lQfZ6rIoCEy{vd)la939Vu%(j23pV}3RVz_x+m1}gttP5|4 z^gX3W2FQAx-RI)-o;ISbjDBu(AlL#EqY7H8>YCz5b9Ad&v@CIclRAODOLiA=xC=xz z4AMvmNy(Y%>GpV0k6)eesy9>X^k!#fdLUmw@#aJK64=$7@2rdm2M4pNWi)qo&U&Wg z516x(r}J73WY|^h-4qbeq(w16uH<=>7|gau0mLuXWO3mlz$cxCKRRPr4v!uY-x6J4 z1$>Qu*Pcpe_!!7jBX(a5(sW}M{cl}u)jGm*3f=X(`aUH(vgwlMWO>K~no}se^*iF0 zL1TIx9kCLA;b?A9_x_nPD~qxcVygN{;KEP2XxAgSH^2(w8K4B+?7v-~&6h!kQ46)L z88ffX`}f^+wx{!}BO{v;>w=E{Lq)(1;G^8l?VaX_R?~1W@T#7!>ESUH)VuEiQk|70 z;`~jp)`yl5%C0{tiUw~$#^ESH}wwlpc7%HyPcxXbQBVA zAStc0#;)4Ayp%Px>pNLjKkMt>-pe3uBY=bgGz?TkKHyfM{#9pqy;A9Y@^?n%*Uz?e 
zb;S!hS^6vlxD3#!rFgfds}-^nGa-DGIaOkiVFQ>gm=G+`$U!=&7@c@GGeuNHAV**H zx$by0i5&qRzMHm=!9mna(M}<@4>wShPt^7)rdD6{a(A}fKzxzyNt=kQMBo-UOfopd%r{eya^<|S-+Hc?49Or%T>XkSUVEo-z+ z`%Y3nv;JPAtOKa;f+SjLb_$G(JN5jlO%U^P6ohIE`~45@J){k+Zx1s&wfqADmX&6d z5b;3i0lWi!Nj3Q>FJ!ofwtR?;e1Wax+qp;r75v80!(aybeTLTF^Pwk>BiuK+!Ha-m z-dgC!`0Gl80}8g?#R z7HbS#DvcVbrHxv@q`x^ru>j+Kmup->|vSZ zZ-Pn&#qaQX%78{ZV)CF2GF#{q4%veC#crHAz%xw4=XakwGl3&uIM2i_Tg@l)e6cEW z+Adr#JN}+UNQ?208nMHDfOj%ZIo9pXL#Cb`pRy98X-;M6h#plHZFRn<=8=`B(6~8i zvY{ve3H24aiS6Q#N3fIaQM(4^kSE`ps&hT6fmTg56<28k~f9?6v8usj3F`F;d8Cll|f^-)MgX z?nbAmuUw*~wP%KAexot>`5SG+npXExflbD+4l@}0(LZ$RR`FShDn!>WgL;M&9*Dk< z{1Ef_M z%$#p?yaxVkPBG$OSDQxE^_^u3Q>DrW_3g>vt3MiFKL>k}o1aOS=eK${_|__HyoMW< z2QC47HGM5aZ$hT$zi+O@I|TsTT3vm1+it96Z6P%t`SUd70M9+hvK?S&a3wVu04Yb$ zO)}rSDa6x|kK&g6dG!C#7_Kyd-r9%euPvtM^SSHol2p=O;-cT^+GxoK+(#&bJ_(J> z=v&W*O>V2btP=%v)1rP_Njs2X_VRX4OklbDz`))%3?*)(fkHM4qA;9^YYvgU3SL z-kZFK8Uo3V$5ivgK-hk+7(^A#aJ5*LP?7I{udkzXawlv1Fs zAbqzyNJBgSeeo|4{oa}SSJJLvIFO;KU2MCx$t;9VMU^NV_(Df#3?%Li+R4f)D*5^O zZ@^^`ybC~2N^0uc+c0QR3aIzg5wW}iPW_=jwRh+C2BQ~c(Y{MUlpZ@^StU!8t&~NL zDJ(9BGq$HbLe<$Se%#{-VJ!#lApr~uH4GbbOa^%W+-cIpX|5jR$NS|zGi?#6si}{8 zX)ZIufd)d~b-56Lt-s`zk~ypY!huqBP(};3^@zDai%=A^pmPQ$15hYOJ!OWX@d)(g zji(k?VY&#PwpbE8R81$FIxY(L1rgmgZEc*CkTC6c!hc@GNOI5`I_|@LBfmD!r0u3$ z}zHyt1OD6U-T-qSR;nJ;iqlaA3BU}7?vX$?ao4wiw!&1G?>Q|q!|FwakH(E>YB z=|w^X1=_9LDDLu|o>VLi zE!{)7Fr^2)_lNZUxy@sg`wXrRzG&FtGatk)j>pCH?$Nu1>}A6A%^oXx=MceK&KoF_ zWzdJl4PVyZX~9v8bi>AC*`p^HCeKL!aSa)J)))-s8cCbYJkB+}eMP%`Ha8`ghmrxlrN+x|&jb)-7yf z6)6b=s$H$5U>4QLfXfx7(C|%#$9hzIr)S$I?-M~a=WWzBhhG%8lsWt!EN^F{Jlv#% z8hjR@4-M^{d#^}=mQ-pBnVfWU1kS*?K~FXD{y2iUUwhIE0zSuX!BMa%?`XY^Q~u`V zz_1)LKy1|X)&YDq3P>}wd5x86Np#;)l5gwECT_U`-siv7TaR(V=geA2nhudp?TzWW zE1`KADi&#rH1lTvD*Xt2RoLR)mcl^WFz8*$q50=tY`Rpbr~gZ{M5h7_2`^DmwcrwH z;B9kIk<-1b2CPl^l1AGAP%}1gcWYzFy=sCYBQ@028r5olMAxs+w0Z3J#Vb>5HIcUi zJ;d-APMedqDo}l$3dLJz(`#daP#k1;@P;DE`s1(varVLYb6c=%;7tttSjuy9g1Fz-H#OOfm-YXta?j$g 
zl=;QLm{1z>0F8O`Ij14&a)tBeU<~lQ&;p=dp#0BXwblj(MihCEOS+fdL(CO%k)GQj z{ouW*lpsH_=;1L*Wwn40Y?gj2y>jUo0O=;|@Ie>7mzn{@SI#c5Lu5N(LZU=HjzP6- zJ%*S8KFOyltTQ+);bElzOurdS4|mtVOxFV1e-i z?RS+G%-}8ksVlN;_}UT8vjlvq!B92wCaf>+oBAAb=asP*Tg5U$I;?;a@#gO#n=4`Hn+PnipB-o8*Bs(yiyLwxj8n3w1#t|DicmJIn z6~DUD=Jz)O7zif8#WDF8%~B$FJu~#LI_i;>6n>Y#Tq=J)N?J{e0m#*#Fm3aztz&() zftT1*n=Gxu)~vwUSLjkZOeE*FErtHr&cf@a_>Wj36hli&1&3=NwpagTwCS27RV_HG zfnXAn1AGYS$6j6`1N$cTk9|X{d!tE;N|+8~qB zlhHsf1N|DqafE!f^qzf$%rxRh?jTmqeYOG6gy?HWqV^$EK-{a7Ke^0N=&y^I1LEGD z2E|Z*20*v+{-t@s`H#!?x2nmnSZro?wlhXRW!L^K^btzjHLnD{ki+A11Od+V4b;X) zIY&uA_uQQ0Tj<}TdU;RjEbA$HvtJ3BR!35b$W>5S13_&#oxq5R^w~qmJnzaHj9{aU zwUw30=L_x|aB>iCGuP6|XuK8<$T7Qgx)BKe8Af&gQ%=eLWKQz<-K2iNnYVy1riMia z2dZx55S%TgG~Cny!1XW-`2VSNd^6KKTm;I%pTuuIE7o>g&wl%mz%CQ`B>Me4WD1s9 z|5;a6all^#d78e*_d8&BOP*h1=e)^|U^H!|T!MpaPhQgs$tqHN{^(6d`VL6CX(@Z* zhYWBLE>3c@v{PRHzFgao!)kf{S!B2J%|C8oO24Ya_NKzZLcp;ao*G>)ID&Qu!nKv# z_6U=0Tc8PaW#!=sL-lJ~0qPj~m9`Zd8XU}JSbJ{Guy^`BNUpEBOp|S|Ck2W;wOQw| zuGa(JZ>(|XZ0-1brU*T$3&aOqCmJF^8(3b^^Xtz}cI?t`cgC5`fZK$LST9F%g>V&? 
zFv>p*=0vhKwRS?@QC{R-=iOe`4m1po*XIIjx+mmjLMEj%i{8f%+{by;C@Eke`X6>;O=#{9u)TaV7j|fHI8H-E?z->#1T>kv`m0S> zWAc~6=fn(a{r%I|^+wU?SynIHNX~?tUgT=EvuRP0>EsC_$)(JMrnZrUH~Uk*q$wWg zU&ZNo=pUh`9U(!$aF>_N+*PGrr_n*D{)(9!ZifLSe*IwH$!jpOG=B$D*M6sN47Rg# zv^)8Hwo_m(&ebeyCk`940h`htr>*DB(Y{U8jU|Lu1TjMPp@yD_CN`~1US{UBerwDj z8?j_YhWtQ=cRL3XVh1emwKPe6iguyexue*jO3sP{2Ir~!U^e%?EWcsrlKFELn=0E1 z2QRJ@QX|n(bUBvytKGlVe$E~{5L-utpp@0+ABhEjZjZ!}G#JBLZmK(;DCPGKqEnOA zVh7Zu2(oL|V4e6!kh7SR<8K?<(P~lkOuY?+&W|K<_MwSh6e0CuC+Ct zfmr*G`zTzgn=XTg-D$x$#iurs1_uXu5z4s(edPp8@Vq`hUe3|{t?$Z|UvV97vD`J4 z8oQj4@Y>oz5*~8%>I2?yAKtahhHR{WJakZXGf*z2*~j;m)|>IglQ2B zs<=Z?PqG`!X~{cdy&scgp$h$R_@QzqPV-T^$q{7eMpOfpp`$*d72l8*B%#gAv* zSo$-%H^%c_2mN-!%gO|NZ48Q4uYlhYd~sctb`w^6?nptq+p-X($N&c|!mqb5tH#pX zFbWC^-n!M)pLw75UD6@M+U2^1Lv7i@p%Ad#_gmR&w89*+G~<#z_v=q)IS*@*iu7)v znl%~*$P!U&-b8cuw~^nQw_=m|vU)>Ino(Jyil)jCtC^TPQT> zoD2Q+K&C8P$Wz=Q_N#dFaBqm7%q4;&z@=iQ)Oygrjld*Zs^u{g0zK70y zIS{#=Piye6n><}#23jg#-seTI_=O?N6bD|*k};T9Lx))Qp{XDi5`{iSyU#amFs`-a z%{7X=K>aU|KSCj5Q9=cIpMAlW=?22>#b#POYXJB>M;|J$(~^o-akIc!o_c4aNIy_t z-d0lmQBSv?gVDXJ6_NTx0=U8zq&_SWBqgev0!!VIR?Iowa5DNq#TLHC`W)n z(-<$WT92JG!xPVrQlyJHvK6mk7fo+uz8Y74{0bpG|NRZ4Jvdb^Z^F)sWe?8H%oLVE zB5DkL-@RMg*#X0LimM#7k2)6%idBj#vT5(#cR5M;VfU~({19cP_2mU-8)n9-Ln~}@ zQk6wIYMebjYsFcZ|NRdB$_!Cda@rK z7dOsEQJ}P8VPkSl6ff)j1SA0+%>Qn5y=I}HKEy&1Y$=gHcNZ_+qHgR8Bn)PM@I>o)qEF2V6zVNLqzltmJ&nx`EhYEp~opaCd=EfK>1jw%@>-- z=(~zBUo|dWTUvFedhhgls#Y4G6kR;!K6KWHoR5zP`7PRgGW6}o$qOj2625sjBSe>c zoGos=Jj5pyM?{T;SW@TDqi(gwvXF#?gpQ6*OiU~%88rCbxzjy5`q7jgyMJ&XPMsGl zG;Jp%Ba>c8bpb!LMZ&L6`?IfJcHsN>-^}k3kkT1HchD$xFmZ5jP*(0`2o&PB{Kjnl zil>0JkAA?7}j7jGI-2M8Wb&n8=TUq zsRUY+x=hfTrJQcP&+dZLst9a4+0lcJMn%#{cdPXz(nI&$Cma4`gPwoH0Nb5|FKKBq zUw|)n8R)T&iJ=8E%#^=?Mn)v9z>nqS%)zCWazKYz7779bYJiJ^Mp?k*F^Yb9Jc*+FGwIi&mQLA0Qm*cu;G=PJx&JQrrz5RaP3H^MYaFGwxHeGpCTVkoa6 zqJ3z+<1)Srq*Ozt$%B}p_Y(0Rky3XyKOWGHW_CoWBU|);Z8{UTNMF{<-Hbr59C6|y_y8K|fTeGRD zLT?`(@7$f$(8S$`7N0wUJY@yNEWCA+z zRYsySe&g(`;9wroVqp9o@cuochXoyTNObhWsT-~?E>r 
z(B^Uj573dH@YQ)0hHo_fO@NsGCNgxO>#WCTm`5-q-98Q-JRXb3yR^`Pay1?*ZM_|$ z_M$Z)<>e+UGFXKs;E%6P=B0T%exqaxnyH6L$J1@qeIOBNsdI)sJIqGl!BMrHs zjAF1tKAo?&)OWM|fY}TN!f`Oqy|J;emXa#$;S((_A`tL(^F2vUf>5jHy9ggk>YO{K zR(I1K*mj{xj~qFdr{HO0!)p_b&1TlVGuF~_w8zf3_MpS#PH5Rr5n~LKfm~h3T$5jJ zH3H;`8k{2_oOAzv@#~kN^GH`n{&4};FTrUa>CBr4`Zob75I$#JZ7uNsC18M`A@pE4 zz!JjTP`J`|@zDiQS*ZtgFkDzdk0QQ}0^r(<9FrXN*7Xh~G4yKgvR@MQ@TN@FUGutN(_z zA5DNFwe8zXgqi)W>2YIP{X3&nA?wm_lbT4=5WB(7=HUyC78U^NL&D6u=$*lsf!)56-zE>0#ksm_XJQRwU zulNDzwXamm)GBoRi9n1ic}9@QJ?e$;3E{7{tMhc!=+w0bq>ICN_hYw*2iMkB>U@UD z@h<}RcU+u8+)CqSKg!0kvWVkxLOJQ z7e+|>d)f7mQ4|Eb8e1wjH!l8TAN@+_2dG_ZJG=6f>RU&QJ0AL?_+O5Sp{CM)UPp%L z!we%Pv)RDWph!V4(V4}9qIE6tzH`dX=SaXL$^|QG8rusb>q%72;fs%)Vw6AMZWecz zK88Rra|`21$v1Er*PySp&?+Ff^?=sy%0sqCv$ zti<3QkbmW3n=p{Re~q;9@mHvRBx-^!2&C{#nB0wBz)8nHH&&c{hfFW!|5iuH6*%QB z{mOCdgiAI@k4$a_>Lr9RKnRW?o+UVc9OZrF-Pl!d?)wrF>_hKxx0KKNNCn)ZqK>~- zSyffE7V1|RHzBNn3XN4Oa2bxvB#5Y0f(F54qqy$=h~v-9d=JsPOeTtOd3nz`l(j7M z-j|DcQBW=k8QGzzh&7Um^u>#OH!;^YVV$`D?dJ=$Y@R)PW@>7xqZ5-p`t6Rgiy@BD#1Z*NOq*e$j3jw7w}a)CO&Ux;;xlNVFnsUn=zlCAwxqq z`kpJRli)l&hGZXl&K~!VP>?2ig%%ruoaVmZ2XbcKd~P|vl(Mk_hN-cZ4bo)4t}@(s zg=A5SNMXb<(imPMH;Yt!gdli(7!+-o5Om=PxdA>Fz3qRl2_V=1R*4uLLD1tlh&k{J z=&KRpA7}?g?jIX|6;te_)BK#Z7MX7QPb3pX*bxRD96357~?+ z+ws;!U8eq`_2DpMn5lr7U$kxbzdiRWaO5n;;2j2VR^BmNufL_m?vEfKBir7C`o65Y zCVv`D!pS)kFXRV=WG!tJ*yM^H`t4iUXW;`ghPK04cmE~@7mWe7x9i+hiOJ!q(g*to z_g|PU!U>;C-^@?edeDEJEGz^yqKl^I}tjoX-TkLK#tFp%k8DtBHM1kJUd z&#$emfhh{s=?0t!+smPCGpBUdRG_GB`aHl-tlNi*ZtQhz$!4>^3oOhd+=og?fS9tz zJ3`(siPY=q&(_A89gcVm7Js$V+ne`H-$wGE`^QxZ#<)`nIDD2$di_NaG<$$C4W|Q4 zRaLKiu2~3*04X&1-H+TR4}r}LP!wDtl0+a71_s|3`_u1obEhR(eBq7slgyKpmX7DQ zzrn$Qgre#oSI+Kkx`32(0;=ncRk$>*@9ht zYnI(9r-OtB*m(_M~Y%}^>(-Xwpk69JC?criMeZs*-QuSDZ&+LDczz{rR zlES+NPpX@nqkTrFXJ@6Pq(V^lNUzAs%O~3hS5{O2cBJ&0W*W(%0+`RrYR*=VW?^Oq znl&BpkyR9hB~OJ_>^dbuqU}v*xA$Tr)Q*{0wU4&qphnUnmhLQ@Fkj#MAhY+=IP9Du z?HaJMTxdh?IAa7$%;KYPvCHGh?B!KzJr!N7}#eb!?g<3$RV?FHA74YK#8W}1H1S2COD{$*pw){41J!3`D 
z!LWJqC!dF(o?FcDrDu`qZQY68#=9KBKj1Y^<}z&aOmUBp zbKs(8-=#=uQ$s@KI?~3nIv&1GJtlVVJ@d-OU8OLCB17u~S*?L@Opo>UKO(I^q*qkb z9f+45;I8AG7vFUi+FI`^B7JAM(3gu3f}=5X>yDCt!RIDi2f=&z3KUs^skmDfzJOyR}Bz2gJyElw3><@Zua@@>hZi}2$zBY3T!=xBj8Io=WE#tqB{>}FlR zLx&4U5OHvJf+G~WXf9KO@qH|Fd$F5guA(v%*T_VxPL3u38n?k+b)BS*L*8=kE(=s* z63#FFsQFPJ>cZ#IcBGrdZq06A=X1F2xFK<#;47)JN!3nj5=4dxrQeUc$)cOf+Q6}= zG^IcM3`HMnR_s?hi}{O@#GO*koZWXi>p6OlU8iFwpKV&T8Zc}T-($Bj9J|c&fV;q6 z<^yJ>o=H22y5{_(wXx{aZobW+Nukhl&7kna;i+ugzG?P0(>wM-|6;0m!*EC2;IJKW z2kxZ~%=Mg1U|r$D(D3I+JsnK0RX}JQfLT$ViMij#NXqt1Pg|Q9K@cu_OZy5@1E7i* zblCF?TUNf(b_zfG2`JQ<-|MXrvrY%IXfxH)Tgs=IhR4Bf7Q3$bZhR?e6aeZH5<`5V zOByXvdsDoODV^UHO9eACD+Zotj9hsvPIWbA8|i%4dS z87b$J?!h(w7Fo%|o_LUVv5Qh`)XLhf!CPc!d*!Ta>0_5z2v(m2I404WT3-BUDn0Nj#5@U|nA&_Z0_Pgno}-Yoi`T;`AGf#9M@!)p zd2cx3<_9g0(G1uN@!XL&wchc>Lz*R&&l_(*u!_8%RK|hLm|d1_MxBwYTRwdU9&U*) zcc|N@g7xO%pD+BzO+*4b2OkfwFghSM-XzgCtgt2e3G&U8uC14qK#sH3_DSV53(=&V zKh0^MaE(GJVqZldS|=>7ah?;aTZ8rCs*wl$Oo#>%EU

7} zeY(Z&#q5@J=gxVz^=FD8<{GoVy)B@Il=g?;=gm*D(?~;GSV#pTgy~}2Uot5KF~AZWno@jw9FL(#jxxgPZHtRMg|y@NO){Ov zTV_wDS2at;QMY0gqiA7UumLwg)th-f)$t`}dzU|>i>n@cjhMB#{BeF;5FhTg0)7MH z8QJg}LGWH~J!>!VF@yZu#yRQi>};FE&rdT`;iLmt|@?5)QW`DL>hfc<8tsYC~CgLo#fx*Pn97}YEiLvXcr|$oTPGpx7f661 z&mX=heccUJ7tChBI%Eh9dd+@T?0Na#PR^ZU&+s4T)J>(c?^DE%TnxHN9$uc8!o!@$7M*;X6mY zag!rC$tri+6FlcIOUy#>Csu4D1~w1@ftjgkV?#q}8GATr+6tGDmY$oQE-Aa}t6~2E z{hFPVQC>lVFE*aw5UJBsIq@PO_7R>7sM-pW@8p6yI3y?3+FMp5#!!TP54!sB^DGbS zDX8~KFwMgmeCusIb6+SQ&>~llATJ08OrE8Xt>G4~PVQczwu*wjuhSGJ`u6I5RH~7{ zv|kuIjQ7rp2O8JuD~ypC4lwzW9+``eH4Lsuk4CH9(*@NXkq0t_awDF9qKuLr0=LC0F6u2zz8r zk=GxPxpH;D5%NM4(RG{ZTrdLERs*&1pm!SU*9M+mU;uxeRE6ZF!>q4G?)8fj!w*ro z!xO{BksRHSMuM2&^sX$|8({V0dGT&~7SfmvhC7A8w;{`eA@paw&cu53c4Z&!6R`!< zJNH;L#i^PzS4c1j_UYCU27}a?k#pgaf|HNc{3|jmt=gid4D@TUYvW(EAC&=`5=N>BjCYSx$+u~$usyz+) z2~?W}-HCeOK*+o;M6_45I&~js`oJYyks=Q&wbY$`8IVKkP9-zcb?cS&q4JfFhY*++f#*{#|g*x{Ean9npr zXWd;D#Evb0V9NDE_n+OKJZJWr{ix4Yt?FZ`5S>C%J7r znKW6iRkZHilC-b!wGM&igE` z;{0@g)>d{_t|tU^95c^NOdegu(YEx7zn(yJh-6O`8D>V*W5AVnB51 zM9Jgm-J_fda@Z#Vdp7Xqca%J#4rjgO`Dr*kUTlvnybUiK<4&8qF%bT5< zSXmzIIQUYfpesZCCnWR=iZC;3EXsyJxhR)Dc;Pc1a+v}OCzH+G2LPn|?)zv5$2rcI ztS^NH-G*2>I~mR|?Yx0|)Z2o@@s3m@#yonN#f}_DxR=<>wbBadG(&c3V;DEi&Od|y1Pl-`U0W}BHzVz}I?bTkzVb_IO<`1w= z@m+w|{XbpU*+)g_Zw#xcx)4)()smn6oQS#Ga|ut;8tZ3qsTz3`{ekwT$;ad zpsCxoPtZc~1CC}?oBMYnO2W##`axl8OAj}On`ox{N0)Q`v4QHO)BAbv-Fxxk1u!eE z2?zaD)z$ay?ChdnR}j~k$^1X?p=YOcla;?He*yjYk2+~k zUI+sF>wren1_1d3YQ!CC5!Bd8;KPXdtH0_eccT@G^08o^Y`zn#Bby}#!SZW=J|r-o z1MV9@`2XQyvO+2ViP~CPxT554hqbn~EdhsLu;d2XHy9ZDEuFNqMu1KlyobZ{=duW8 z+@{)9D~3^@3l>^o41f}3VDMseULJ7*-E1-6;W34kYI>;Z-yK<6iZ2?$v}CxkVb#G^~GlTGhaUcN~c zvbFsdJ*c0X_aXBq{GmyF$!U(WmmLu96)xkJRu9+!oHs(a1ta{)c1l$ID|lDzuZ!`u zU?NBmEk5z{0^@fj-Xf&a{>6A^ncv~go1NqZkQe|Br#U-tB>^g9p<~R+$(ht6bL^k< z7vs4tN<_Q{|Biwx_eb+sB@o_|O6}7WkZfDo(*%-)!x#E%Aj%^+^$e1Kypz4j|8=Q%IhSbESvf$@NriwNnxV zc*+<4&sM*SeN0i1IrmfG1Hro@XK86^ZH)m`u#8r+at~!Aa7W*q?-7he77Dnb#CZbV 
zjI@8HZA19jyc<5!N&&HJ+(FF*4udKvRy%5V-~BIA_}MfWpeQmqj^lE%rAwG+ev}e8kt2BlLdIFjwlK zfO(!;eoy=7wwIgpk9!&3uxDzq|Mk2ahBBd#EG8;k?l=&qUTL$72$FZpgdEenWZ`9I3YxUcEXyAzRvKI47YH17vsC^s{Ph|0&@W7 z0(n~9gMA-=a`v;D4CUpj0=K%kca=iYGBe*ix}a8cL%X>Ubap!knzdTn%W z6_+L;u%BSENjT}x$qmn}soDj)3Eln~);b?y0~HqSBgo8)Jr*>=Jwub%KOYDwAK|P+ zZCflnW!|`6mRYtbENE_FY9@9%lv^PTJb z+m~C{nrqH6=eWlh_qYcU(TDebLuiKGnKGmQsmdZDVH_VHbNqM^E({&EN}0{oI2UU# zJb5yXW_|N2+3m~}f7a>*Dwu!eY>9(EBj|XwHF}#FeOs`ldDPNuVc*|mle2 z$hO{+u~6IdvM)&2-{LeeFQhpvphTNq1paspzLI2=(+me(Z1@|>k1^y6+Q*T!Y0O^} zMRGEtVPHJ=>;`?%!nf4>7Ma&*_=Xv@Eo{h3G@)-*xJa(u$G>`e?2;^~!s-SHV9ozSnv~#2Sv5tZ%Nb)=>r`XcbC7 z^hx>w1aSJenHz5(pn_d!eFCU)3(>f^xrHw*F+RD&4T+LI8q>Be;WcvhGw)Tz>6310J{!)xA%h_aX{1d1BfcXhz~n3QmNyR&Hc#gMWh z4LiG8v!6xjtFerbWujjH>+1#!ner(xCEXo7mw=Z_oOS@ou)mhMvJ7{bOT@OEJ*xQR z_ZgP7!&SZ7sBWQhW0}39cNA5JDnP!z4AGs8c4Gn<{k*0PWJVHbb$uR-qdki;zd4KW zTy%zGquRo)Vp)%CKKU&Ly6JkVYQl*keQH7hm?6CA9qJ9=ZylvWK}H@l~e{HMHJHb zTSNhBrdMBkejh|)rJi|$fjUUU*rI3^;Q1wfkL5hwyR{SOKtYd&a<^I3`v(|X@a_V` zJ)KpX`0CZWY2SHTtbH|8(#2i{ZWNFjCX;O5Yyb9ZcJR^3teDdw3ij7p^{>I7loRi=i0-puWTm^XD zOgW@%85IRP<_U|*Ev?=-g_ovk-E0dR(b9OJ&>`^>|G^W?>mEAb>$Q#`s*%Evc5ihN zLJaCbk93)yMX)AZ-1Qk*t{Ilvw;nuKLBFWCbBt4V`{HES*z?=la%A~JHftD<2zjjFQ-_u6jYD8+~7)_ zk=8dz=O$z+qd#Tr^wbA4sXjg zOZaMk2(&E9-gej+aNP?%!efJHW^E60-7g$Bmj-inRLp0mNiZlhNtB}s8Z)xFw7xop zGBc^__1zM*O{Tl5Z$8)Tc+8gv=)1XwaG%`bNLfOuI=OAR2RT%i$jQjc^5Y+=x@4UC0f?&_hZdIQy@oZNT*dM=Nx%$!M}26$y6Vj6#K&bP!d1ti;t(XAyD zO|{+^jrYV`F;+Q#mB;*S>;PN?#lTc|A(3pt*{+;oSqphgur+hje#bFwTJ{E8c-$lf|klj`g zbcm*0hEy+FqjU`1Prh)5Hn?vrr)O}_aIaiHC4W_Id%TgT?abE+arZx742XYP%gmAV z75d6PA_Fq>Bkb}KZ$ZMwT`{3^akM(iI;kO$^cuF_(HkNNe+)YgY9N>GttsD+e=&|+ zJ?vJCO%i`E8z;&%_QQo zUx_Fw8KwweJ11?T@;TGYCN95V@;04>h3;IPmG37tyC1n*vCN_hSJkaj*IVf{V34|} zEY_3e3R7~b8mkq8{8a#jz%6Bh?6&i@{l23@|zqS2TIe`^g>Le6a=48|9Lc4h{|`5pLTy zli%vUeTS-e4C&7KZu|NGMI7F9%G6(t92tNYl(8+0$X!O3{JdVX)`<~P8M)|8amW@p z3~aRkjEnS0c3mO-vTY}^GH}{O;jpksotEDG^r^SJq@)u46~*tC0b*>{&e>UfXr2Bmefj 
zm*DzY*;6JFx;}np)J9W$0<=4VX%Te?Z(EQ-4NOX4j!t;mW@AaAwUq6?Y@UY3G%4EM z_V0~e>tB5M6NbMhY150ZVT<6>*zwFwt=;~fW{m3u%@CcSPO#yNLnm&+~F*2g&X~25b9YNB*6GZtt?Jy0VlJDX#e<}kmq{Ldlrs$oDru(QL zny|>oFot7ZLWH{Apa_`5mnQn=4rq}tp5St{g!Clr-R{|7!T1g0-rZ6m?bQiidcSO4 z{y2xXRS+5xuN$g+>brCK%=%eT(}zo)&lihx?+~;YjsBCf(~}6nAU_7AP08^isI}men9NNiLbZRN`?rwf> z@VxNw^wj()HeNx4zI%A+6O;QU$f$O3g2RA{!9rHPPb<~}P<0~X zwSn%W9*eI(^O$MZ$+o*mS7b@4F>7R4n1oDbHuK5JiSKUD2&^Nnt+USvev!)A({>?n z++4On^b}X}Bt9q}Vd_0PNwX#r@d%np3H>hWScuqd>|5AjldKGim>hoO#gkjA3^`we z_FkqKmo0{ziX}|oJnKh5^VxfaB^=+xYfT{zI1Wu4@i9da!!DpUm$1v7K_-u|mMTIm zg1N3nPOI&e3q@c3RJUbgEpao7*(^+deLK8zFz0z>^jK8YF7`0R7)kvz;{G!Sh6If! zs((0WrP|xnx`2(Y<9YOAUsbv?R6;hUeRkf!$t!xR&^0x$E<$1>h?x6(3YprpqNE?W z_oXdp1WgI$!Oof|keWO^bWr5iDsTdV+`tcSv27Rw#}C@CP!Labzi-_z4wNlhl+)#u zk$SCWMtOI3sXgUirvBhh9Ox(!5)*bPB!%>4@{8Dcucg=i~LEOkM8SOd}&|GsO+vqAO)}4h%KXxQE-eMyv9TmK-X+FSz(UP zK-lLs$hb?_uKt)cZ$RwF z&(FW69_OG5`U`R~9}#xc4#Yn)>v+B&3cXZ9xST@Eh%sghTiC^;6IHLKzL6>BJ+o1Z zs@x9%2@Zd0=*?9iZ|b60X;ns(BGE(|3CCP}xEd1X)Dk>1(`Y*nP7(HFPtbz*ce9JF zefrF+6KDaR1ay&pZG48!!3(d(V-7MK4bEdc_(<2?+{SNKWSSUonbsoW872KYRT-}4 z!Nbob%hrC!rW_T$#oW`zDDjum_|`lsKo;ptI`Dx6;ySqt<0Fdb*itQxzV)n? z{Cs3cqUYnImb0YB0m{A{a#UKi;{OCpAXr`EwF7yrIW|fEje@F~v=VFaC6F~E{SIvZ z>*$;G!5qaWX=!O7P052fJOX6!w8ko9Q}hT-neVP_x%JFwpi~dH&1(i$n7S zA3yeciORBYD>{MtN?9#86!lYGH-^I7QExq;JDpO6BGn{4W6vv_5%KIEefMVmLkJwr zvT)H7oJ*XHg5@`r*^!zggkQsNW~GT-GcEv^<7ayPhQrqAl?`Hi~DNKh<2vGiD`@C_rJdy8vfq5q3`(6`#}O{V^g@o>e%uoi_XUpDSFDvmIT*4T(z-XR_^UmoierzV2jir*IBJ(?n*xxwV2{XOy9OG5#zsIt+FU>f ztiz}4MW)W@Yj zrxG-QgK}I%gB=%ZiSv?h9Oa{H!s9wy41eEY3%8k5a$Astp`j&hHtIde7zLa2--Mq# zrG8U{YjH!S3n`AEO3OzMfK&wJrv*D}4y? 
zv}LM&)~(pI3c>Y?M|&mF-C9uQ*r3+2d+sHrNWgB8%d-lad#6B98c={T2`k4Hf9~Y$ zaX$x{h1mj=WJSR?4ONUw)4mEi9=mch_~n*BfgMx9<+z`&9^!qe*}{f<%5TvN(a`gr z<|26yI@D7;?Spe}i%J6QSMTo=vl-nFgaOh8B*^W3Gic^5!=&uhW}mC?ty&9s;j zizDF{AG#1h(CnqGJT913^&-_IcElC1GPXnpLt;G+_px=3B+MsSU#fX2XbF^jNBpmv z;S&c3*}wDYQPI)gYyYQ=dV<;&#;f;J!KqkIr-|6jW{T zwS55#AJT@|cf6V-u-JlL3~2i3n-3p8^de1^B7x*!Pw4sjY1J0ra^CO^7*r%^!t{_F zGA7aA%iJ+jA4B8TDe&N$;9!d!C&^9ULjF9y08bnw`ojSA^6p*#t}cgauNrq?_HnU6 zPZ{1Y)p>r6_r~mJtp|Kpc=fusL1w&P$I;+j>+S#EFQBLhh9hwj&k!uhzj;G1B__P= zxEc4&mjsHC8uL0K*mr7c7s3MR+2MH%%r1!VhZ+sE6Jzw9Z=qz`XmNYxIjJ(cc^Cy% zfA9&{b3yjZ(b8X4$-LQnn7Tc)v9Sc4MqL`+nV&vk+#JV0UqL*TBVwhp`S}geWD7Mc zbvBINc=7&c{BO8?sKhcI+fDx`KZX?V|JFIAKF06+sbN&v(l};5;wL8$AQ*-SUGqpR zyyc>rz;;CjaD*VxQDv2Z$E}V5RGY568VtL~5`KO=z%LkN`u&?4_#6z~P9waoE_N;! zHuEhwg$_fOe-|$=KyRMjkZD(xH1zG;th*~Kz3zPU^drA0h!~`*35)>gzi})*P)HI7 za7hkO1icENy&u6jY+$H~HIi!Dwd3FY`BEU~q`qvkqY2;{sD3<855vGn6AsmaME9mmYyju> zX*DP)zN~a~R~cX~ZEwxaLj9Qs;I}OZTe zVl1Ys&vJ7=Uec*+ZLjg!4*)e8}_Qpi3x zi%oG=6)lvNlrQJ|_cmNSE8C-*$ors*(}MnA3@6eu(+VwfwaeG5CYn$5jVw!zAB_wx ztXVc;K*a#y=&I5sf9m_Wr|=ZwAn_|(4(eBu#I`A5M2##fYaTG1W|_<(*kZ`A@PN&w zyl~^;+qbtzI`aQ-soI3Lx3f}muRQjbc=zo_a{NzSv5-N_m&o+3|Amm2!RXbl23nDw zV5qR`fS)`Au(x586;FA=Y3s2BkzILMT6#WfrJb2c$_KK--8z*P2TdQeU*}+SB=+=V zR96#%-J6?RJ3S3JJX{WyOAY<2MW`h?`L3nSQLdx*|I(50CubNC0LU*O*B{wsMn76S zG-NRm9#vs7|J@J^Oka8PreUI-&OhCZ&q0w6G{pC9fve`_A4i1B{#|mp-pw^%gO&M- zX^xt^+x2Aykz9@Z30?4l?SudMP!cO|bw~frxdPyV!uS`{N=k}(z&CFK$O+)yuNl<9 zJooEG7_=YRSn%<|EyaHP+Scz3@YYH0-^Y+kJZE!xq@Mih1kU99AKI<{BEM`ppYHe_ zLBJ>D;(KQgM%w42FRADH^(LGdT5NHkq*+wWmI%`7lmFLB=dhz?-4~A5fp7)h(hZ=^ z-32p6p2CZUaM1ugma|jYUoAoin~jK1iM#=KP&qg0U_V^!w`PDuo`5Gfe6A^ZQBmpV z&u2;uAd^Qm|7`?x_KRcN?`-Kp8$v7UHtQ2#{iIqF0zKq+SXo(t0J?+Y(*#im2M2li z;5_Yj0S}Se)R9tg$jPxuNp&>O>_Ng&c^!G)<#aDfYBC@|3KR(x=|R+~uOInx#^C1K z8X04Y7@xa2fA(V_nw-2&3({Dst+#QXM|j=k^#!@T{b$=62e_cYzYLR3#d*7J;2AN6 z=Xrf*{WoVv$IqcRpjaq^)d>h6;PE|8Hyau%RZ*@a4mzSC#p8NRYr-g7`RS8%S1%(H zyuS9m8Q+{^`Qo^q9lXDqWdvOv9qE}v!!Z{Z7oxw~i>M{wK`uDR`n~6PuBbX~?GpK# 
zIG7jD5&&8FU+rWWJqnKK&AOh%L+!I}nbyph>QyLbnBce&!x%u*NH+mmmfF#Mp%5M2 z<@L#7JCSZq^NMU+c{K=p!T-E&boMK>&5KWmD6h89ecm?A6k1{LlMleE1>*NHukBVb zr1aLFmb@KY`+@#Hwhl$|+K7^u>8Z9+bk$=gHUtCdyQYTk*jarJdh+OSdgu<`P0%-sEOD3c%!)yO2{tTJ&YKna^ zTHwO{&n8VLN$9w4ep;uOx;*G-#vmUVAMF*N;_GlVwr$0O`F?(KWJ^pa$^UQy8QHVc!s^F{{nM8hpdW1UnEz}QN+xuJBZXV& zHLqlBWF#KzQB-ta_u#e^8u`dcvNvcQ13Li{jrvb{giyg&NIA;1{z?TWqS8SRm%0*t z)i(;ieWMKEAsEpPtK#sxbnKiN1n1IkJyU<gnJoAMCQro*GEmbJ)`6&!#IT2&X2B(t^Y-@sF%>n|n4tamQI%C2sxQRq`$ zI*XOzP`@fQKZIEIZ@g;NY?YAIe*yF+a}9t%O~ymzV~_fEa=9{1e16V1mJYG-&j&sZ z*z*DR^W1H4GK_+?h7=4qzz0sW0>(F zCp-WbXUd2>Z+p#bZRhyZMZla&^zGY2c}fnK zCuiCoTjhDg3%J+ELp{0FWZ)5)Cixc>xV>bQC)k?n<{Ge@<0RN< z+20`Rm}dEzD#@vJnhYN-LNO#dQLEElO%vat_$rqviTfvv;Xe3*p24^k%`w1+3)j7gC_&$IgZoO7- z#lL0h!|5Sm9lqsUuu_laQ_?H3Fg*G_)t4_)#9W3E+cfc_l;fs?hc_^+b&$+a=4E7KG0SZWP8N}r_Yzwq4*J$KP&9BQDH?kj-RoVen|--wGD!G52hy<6%-WZ zoVNV<5pe{ji%oV%TMD|^JVhJ5T@&%;3obslSe^hW2?;4Vd00{dNhuw!-7h_%>j27% zZ!X7M8O6n{8D6o;&mKr;&W2HYOSCT1{MUlz6gsbuRfiqo>mIQx@Gl9rk2(#&Vwd&K zCWR~}L>`0)v6#Z_@MYKFT~(3o(a>zDvdn{P&$mtlyCEECgE#UgymK1JFi=$ss8rV z57whUk;1E@RQN+VMzExdNy!JZg$gKZo7)>JC@6;zW8Q;^`MrnhG`2UMpS&|0My}C; z!30ERU(c#-*L2fhJ9tC;XJcW1t*R1Ka*C~9-4VhBBeX@WIL69W?(X~e2GKE0r>Wu7 z(qx1NvE2L2-qT(>J1?%{bZqwu$N<4H!#`KDt3G?3TlkJ971)4|n= zij>jgc@H`#Cd^ohx&UiB-N%#*7_-W+rqq9$KgVGvx?>QxUd=1GhX!tLY zLRgpshr=nwlF-pTWeen;jwT%y{>whl17&Ik>oE3w^B=x~ZG$8Kbrh8FUrzavDG~C@ z;aQ*t@DY-(Tu05o7eoL$(bU`D{mmgE0l^qW`Io$ul+SVxKiKet4vP#6@fI3yZ`~&k z0X$%3m1csKc;8eZARh;Pj(|H-Vf!8{S^>f{YUC2$L#JkVeUv{}I zBez3oDf9jw5Ksea0z|stGBCMovc^6ej#EQGHVu?=yIudvH+_65>B9Zyl zjjlSkcz}leQ#*KsfP5$2cnDv1RUxnoa%sIkUgoYoc51`ly*hQjDCwsrwOD0r;ZHr8 zSNXXYlOvM*jc5#|ty?q|jo$gkZGSKmq_vQgRNpbHTQpYljq*_PDISRu=L_rbEr_INr-E z>(C7k+FIKP(xwr}NbXNva_MGFtr$defEq-w(KC%N)jhZ?PrIgI&HWOZ+pn>*VgE5! 
zq(JTB?9A84XLok!O~vMT(fBW#Hb_PW1_eF(sP+_OTi`C;1VLd04m%vhuviCE#fCF1 z0MP@kRYx6$dJpzIpg_a*&wheR1itx3V}~JQ!k+3tD6k4v>%3g;*H`kVahBQ4qaJ0C zF-9=+UX3k#p7k||RANk0E^C3@DC(oyOodb-GY_{9t{ zonsXjk7y3&i%nQ(FjYH}<(Hely#7bufl!no7~ug@A4SmXZj`>!Qi&pA1L6==BunaiqnxB@h$(>Ib^zO@;*(MnC|YmIkAudVF?q zQCZK`vYY}2p}`>>u$^stZu(yk&;9DCrcm%@%WYtE(?|`olXb9xsXAs&gMEK=2u5@= z===c_4;)Sbs?5*ogj~XWPEOe~p^ z+dI7SWqiIXKSnP4%zLWe9r1x7AtAxR#`ax17&ET0`V&3s8f<@G--md3m2MOuCDJ+{ ze@2}CfHpXBzj#?0=#Q9sb=wWKD`gg)IcRrwrsK>0>iA7g;(pBWCL`k57i zl7IuEnGf`&WCQkc_qV-tj@2r;oa$e@w84u9X`A5L#R+{GroJMml#*kT8qx6 zmMjM{#!L-t8nkHNEYN|>7A57o23ZA#5TJPWTORC}2norgBpaZXkpLNo{(ftqx22<_ z1L)E;2)OW46`g)+if#pQ{wAp5H}8YV8r0@yhfGzK?0vO~DAnFB zk&(@8y&6G>4Lq=wn9T2PD;!>~CnurNbBH|Z1B${q$ayg-Y47#M1~s+pN`#r1D z0V8XKwF%E7$BAssc^b&tz@cThr*nFExWQsuuY#Q15+;=|1X-$>n>cuu(mCJH?|Jp` z0HY12fy`hn_GID~)LNB6@vOlU)<{5QXP{mFgp<32kG4@*}x-kF7QAhrR`s*}W@I-SDh3fHbvjWa=4(U0)qsjjs-&V)g}-%3BJ zVP6f8K5t)Lh2C0oCb_%md@Fd_HXrEiu?g^=AxJ*O-3M93UD@tE$h~`XfY=5ykGnpy zLx!jYh|)^t_J?ayzztguwj1Ly7k$*x*=IL7E+s~|lLaV-<6~ay8w!mAAUPoa2giXQ zfrg|a+e7cRs`xkaHB|eVaN{jE+7Ipo|3!KAa33}mT)Lb~`9;)Rxhzf@SEq{_rlJ-b z-7r*7*5d|QzXh(?3PuVCog}{F+p{yk`_mwxw>N(*f_vl*`-o%`GM2_{{Kz=Ag{B+A zJ-z0m_}R$hNB5KqG@%IWH4`bkTTYq>U$xb(r}cjfD9o3*t>4VmsAiCrSh|Yv`HC#_6sNi(1xhnYa<2? 
z6eD9pBut;+%v3@GPM}#UDCFkpX}L)I&|IM2z1WKuQeb*eDxi(Y9fq&C#wsZsu`aFcxySjKRovrMB-O2#b(1|>QzNfLIpaZ4K~=NR?gr=zBZ zO_U^V!Ab+th5vO{jUA0Sgw;hOV`I$<_W@h(a{rbcEjv3MlNFM?`8$h4TGWm>QiPTDuY z;Se3MD|L1#{oB_s#LffC4%x+tNHWIbrryA>>iQ;>`a5j{yYR%de0D6=hwc&i9 zfle*j7s4!}$X9WQ+KqMpFVa@23t0ddkZ6nBI*FUmu^*;yWXr@g^pLtA*}fq5K<4Bp zvRfb5h2Z8CJV4|`4-(?MfzeX&*E*lAcL)`cT}@MZ`SZ7II7D%B>Ne((xbP#{iY2Eh zBn%X3?Ck)jGWdSCxM0|G0|t{xYW7ES#Y0+b6oWjq0!`#^%q$wpsQF|-5^e{;AvmJz z1o5_C&!2c3H5MO>+oqe6`|B^_t=}ayOZB`-L-7~`u?mOY%A$Ug2a;A6381{B4}hi7 z*cC| zfJO`Evr$pK4<>l8*T22)tk(?`cX7t#R#$KTpi*~O6&Lj5H~3CAr5vk~&m<=`A;tBb zh9S7RWXsufm$Yj-tP^-*APUwr5b`zZuJg+uey^LozHk-n9a~Pe1z8cNkAntdi`U6H zll+IP3IRYPSg_&3C5y7Zr>NnZR63o%l*XlmDXN!0dILfub3XVaupiEMey53+oCNv--Se!pp6$~#6aa8#goK3o{pv}(Q`LnK+X8X>q%it%Hm?NC|=OPL4+P9QXV1Y6MP_B1>q-jr+ z0;I$4ST1eAJ`=M63)(-=x^N81Ua9?xuD+AN=e0>rYLxic(X%z~($&9v2o+Nz(3Qut zsjTn~d-8EYa_jYRXD32`eOQcY{Ia}q(0 z@ju)0XLVB2ZmHF$5W9#ukbF?sdBScYNJtDp_X;Q;g2i4{H2tbsu(@xFP66$0L=kZx ztpc5jH&OE05uE*bQw^T(l9DJjCt)0jTG2lfDeuZ716D5nC(s`Rv@nO}+KtT-qMFIS zA{cn%-oHy*2?~=MU_c4T-G{%*$j+`J@{m~Jfg(uz_LYbKM9)XS$zQX`rT6T$|LSO0 zL>3yP`$<1@mXebnV)#%>IEB2DK^EQo#M=LYy6qGKvao6}`e0PjdgIEsYMrM2r(G$k zdhjl6JyItZ2tS#9sjZH3{t0Bax(`%;O9=*tz>D*r=m5}CQ&cB(iS_ttfm~Zz=HWSg z)CMn6Lh`Tge0Q&nMr0f##ppH27Ah1I!xQpYgRd!^Q6oBu{6$wy){opkkW$tit*uMk zY4&>FzCaBaj7e|T+MoQ!|9|~929tXQ`i^EpTLp%??0A9H%*=`GOGX~-vYe(e^WDbd zDiTvbic4ZJX*2!9fKsks{n1$R#wPKs3$;vk-V&yD_P$Z;Csz>>u~b~w9GzOOrhZrh z4(%Zbm4JUXzQU-zf8Td0nI+Xs33_^pqR^l8cz850kbi$auZWGcx-{aS4?DaK9HZuu zxP-sD+I2!`sHhbgOGgB`?w81*fLL)WRmD%!6Ay6Gm-fjnPP`#Ddb6OY2)?4m{}C8e-cO8fpx9{f)v3jrKY?<+n1lar(T40$o104)r?g zYs`Y)k9V8HagW9qUZS6pMx@%>opS& zov$sW#$XuGhP33VWezLPJ_XExL_4UbK zcy%hmxk#1BAL*>3*)@U<{Hj*!qK1c@Tpjt7w?X+AkD|Y)K(KfJfFp#tuIr3*WGqfb zI{3BlvJ=Q0Lf!$QtmKmDgT9}VeEr79IeM3(8d4KO1S^v|xaW7|!pi0Zkn-<*)J;R?DM* z)t_z{oWQ*E$wnN`_Y)uiaRQbG&*J<1(X@R}!35e+;2pD5=pb|i-JrIZf8ux-ZqY4( zTHaGgLOvXnVQB;D*=?k2{ZA9O`}*zv3id%t=mK6MKpE--r5&I}n)Nhq;ZJF2`}xHw z2&^2G2A#37BN&)Nd;{+O?KJOsTMAp;1tb}i_tl6%@_S*o_cBQ7BMuNboUqGnen!gy 
zHR4@=0>p3>s-hyjT)PabH`ae1th*YBre6!WTO&Lib!7HXm<{2O!~O1Gqr@77Hj!dh z^7<_Sr|bKBSQcrlE+!+7Q^R$9REL&`25%=HGKJkbX8oAuaoFZV#3=T@E#vk(4&xV1 zAuLOL1;T}V2HGOC+^>a&ho>9Ne_mYA-&Dhn6XqKOR}**>4sHq<#=fw-!wy~_Z+UuM zKu@GpkJ|_d_91q*J6-FsbjlCgXUQC2zXpnVReC1U(hF@t3NlOd%(E(r#RqGH+z;Jc zYkG~j3j-!~jW4MU);Gnp=?r z@67|FB(A4GGM}eVL1_g3XN3yhqv$H}@4# z+)Z_IU>i8avu0L;WO#VtksO$nI1d&wett^xLPk}P(H$ zcWNko;H9m5<{v&~^2I=(@ZKW|u>7JV*L&xJ17;Y|b*JQL+Bs~MHMKKU`T=OYi|tY} zGn18*xcHMDFyN%fxw!U%ALeEgLJ`yL_*l>j!wL=4Tfm@=WABiugMj#=rLhs~jFgiS znoipRGbR0?n$dWPQGQ$ABxaK2Ar3pLVDP3I?h!=8MN?C-6*(8-jPyN4Kng|rEa9OJ zqgL64i#@GZAe+L|Sty3U=SCPq2wmJm`-{RXk4p_Dm)(MJvCxM0tB2jbnp-c-k1ed< zU4=5xY(+aK__C){pRunW9Wt`xXK69^NAd4Tc_MFpd2pWi=*vApIV?)lCFdoOtG2o* z=ifh1{Qmt@czBf!gn=O=O&{kNOs}A2E*hCP<#j*yROKy3ch{hdMJ=1yHiz(iOKlF+ z<>aViB8c;CxZ3$iu6P;T6*@bJv*cVJEJClWSD<9RncMm~`6yS_#_$%`X%phR**zCG zfB&Kd$vZzj7~py^aZTOwXMZJxMahFRK>O1zrq|t*a;}yZ`Tz$^RsP}5`R;CKe zJ_f+pw)}^BeGP>rS!BpTnI3f)|28)*cf^7c3b-Xr7o_5Pu7-yYQ$ss@_NVk*U!IS* zxvwsG=W2lnTX;xvjqMs)!oI7p( z*eekp)@X>rF-mgUeaZSX`UGMzHmbq?Jr-$dZ{Ge%po^$CM>&zsDMxeQ}>=&&TT!&mEKK)ubQcvTSFhYqVnHRYfoyl5wAgoy)U4rD+!E z@Z|br-C0mZhARX0al+3X%#^&qh=T9UVqgkgWd{wNZ3ispA1?>kfZ|uV1&k$=%C^%f zP;V5m=dMud0ETBZDld%sJ2l^-6I2Sv2Kr{`vn4-}po$Gpyt5BM6u}Y^VS;{eADbfY z{ZaWc0qII!a)MC|LHA(g?l6O7d{rJDv~^G??nHayV?;6HYr!EeiQ>4@p`uMTo~b=Z zQv*)doy7Hs3kAqlZ|N!Ngh5GAS9qMvN8Rf##hv9ao?}9wQcL2z z&C!wZJsGE!@XW}_1($`(eS?`!m?_~t#O*y8^UEsvEW8^wM^@{}=w3jb43r3hxpr@- z#Uc?usC-N!|NVi_8MuoWe-`-C%VfaaQEKV4MOM`M)OS9frcd``bQES)qAe-stMu7y znc4sl>qEr>Gd)V4on-bg+Fo)@IIAa{Ho?M20yDU^;LS&A;=$jeO=GZ}L;%$p1hT$NE~xnzfTd=qc--S#}w!iPo1w#{6bXZ~YUEP}$uu;9a`^ zejvjC?3zJUk);{$ZOf3xDLuE(Mmt?mnBayr3#;ry;(UL@?5~X-XjZgBx%z=#pLs`bJ4Rx|V&CpVUd+=&$Y~No)6??P zMIF-ftn+2eGb~C`CyoW@Oa0RPqMn|b&@7A>O zSw*@|$v`X@IDB4)&W54Rz)*TH9uxx#tHOMY)N|m+_YbNTQJ=fuf5;=>g-I0U=bOwQ zU`WU)Dc$XUp>lvK>6fRye`UCpb|7*x{UUT$>EStjoOF10M_T06U0{)q1X-v#N_^0bCDKGC+k8{#I&Wens8rwzrd(SALVpxdS zxOWa9L*PLVzOb^hjRU3^IRKM1iDYu6dW6M|aYkALVvXS~irW{Y_v&UuDn0gFh8vTb 
znx4L&vAgvMhhErALIIc#A5yO5yNc%vskbufJQU2#dVyQD$j)S(s45~L#*FXPQ6GOV z@ec2ifTw`==qv}lAN|u!&B9NgM<^1pnW^es%{8>sr)Hf-^@#4)NCCjzY;?{1o-KlLtUR1peQdsbrkkx0aj_ zfj=oQ2>m2pfIZLr`~KcJBc;;ofIUG!&bdc*3m_zf-w5fW7Bv+M_y$A~_#6EDuN>uk z1kfuT^Eb=@O9^PCfG;H#JXUZ}md=5Tt$=u|w}0r5qkd5F+y^>v$p8l&!e0I;06|Kp z%ZE%Y^9@gXQ2jdP9z9)FN`$|cmLYFGGBM$FUhclwmi@uA#s}@szVyIkUwsh_J6vDi zY=s;gSP~MVX4-RdldI<#%~-fmi_N8qaOg+TO3UeoXied)=?% zOzWc(`OgKLQ|evcdY@ep8>_j?`gD^up78}=jHnz}`8Zy;3UCc(W~hCMwR27<8VvS7 zew^fZ(*^q$UpAgv;9%IEVZOO0;72>hz@Djf?{Eb$rjAKNVuY|<0^xRNWBmFxH;F#L(kqC!EyZ;^Tt zB|vNPtk}9B3E04YK&Te=1_KfBDZmBeG|k(HKYPZgRW8Y=>F9WL0fXwQs&ZIK6JBBP zcNb`?Aj+|Oam1$|!a>`7xepz~;jJf&dLhI-5t}dWA@+2(3q{J5Vx#4}ztgrW+W-~g3OmvKmMK>;z zbIf*j_8*>L&Z|((8lq~wuqd*XoL%N)_rBhko|bj-YcbrS!;J{KFhohb?KGfwt+GzE z5BW-&KHGHPH>%sga`!J0E zIXwplj*J?Yy@fzRc0KKH9K({0xv0|3MJzJlFupl%0?{ono~Ji^>7AGc12r@!+3S2A zC{}@SNg2%IOwh%>!?@n=)AgRqmuMEH7w3YLB?ieU%f7STWxV&32sBGPB7#V74YxeM z-V4AbP2pH_Ruy{=MX1-Tszu(QI5sYE4_xp&N`pWI;<=$=ksKjLEpeMp-dLmpIJDZn z{{Ae`W{WVhEyCAm{0r3>0o4Hb{S~N5F~JRg5j-9g0a8BA^Bvt6up{ZG^z;^7+OJ=~ zmbzE2_##J@Ld3{uIaehcw6B2;h7b)$Q>olt4Q%tmlBv$G+IeL#ntjL#t_Jv}`)SL$2~K(^|3E8T82yX9fM2vR&e zHWro^Asl@ok0pa)VU|KQ7CUyU2=qzyg3@6(SGW)11V>UVp$O3+;<~&yiZj$m#+Bw; z;J)H>|MWX5Bmqcw`#dTP8S-jh<~tY=+ms!D<6I2O>-+o0(9~RFIO4${iT4zbQF$&p zL4BG8qi8%nA;E&^OR)_`)laLFTovlg+TLWLg(OPUaEM~#>ABVyq@(~4&8{PE^gZKjI>3=h>2%i4(?Hk^^iYD}q z_&B#&KXirE#Jz49OHFfhB@M3c-$uH01UJ$)wF)L(XAKg*I9ToH zj_(DB7&fc=QW|V%#LG>7-1Jg5H zdT@RF0j-EZ4@10ikabQA9hsm#>*Ej4tLT-BNtP=E(h zfxE}ru}?{Lh67*3DMV4vJ1a8owZE8z#THf!?zU9BIl3wI$GEy?4&WKw%6IAH?}-0W zgGffiiqB8Dvid_*)Jni%Z6eWmCM=@tyGev)MOJhqf4k5d{bMj}ST4SFLY{%Q`X+e| zxJdXrA(Dz9@IzhK)7Ertqnla1_NTzYu9Mmzmv!H#kmL_FDOC9QtIwYOoS9s=1<{?V z#XbOxRVlm3zUZ}{mZodC)2=sHLMca45C3r-hA(x$@O?)5m2-)>Yq2Oxjf2+%xG}_B z7J8d=#mm;R<3+{PyAU4Q&nJ+Dq01vk^?8V1ww)JpjPygyP+^F+mKHk;3y0g`-HYRg ziv;%?@{!{$ikz{uY$`+B*E-cz)yB$R?jE9`GhvkP7ELuN-f4owmKZjz52Cuk?cxpM z7T_zZoEn9PwY61VUT&Y>(VS^T*&u3I{=^tEXH$;*@I~=UPEP1s!v)=2Tl@!~>5Ief 
zu0H>)MUVdXi+y*DNXcZdC^A7b{QL*?sKsOHNxiTc?q)F)y%HX z6Tz&kM2Pdg`&>ZR8L3%{5*hd5i-fktb#(MZszh6=M>x^|rE%UlhCD!26Ofp_?efem zBjRiJ^M7Ej?ega%nh6ZQzwzxLw{clbbu|yjW|ox=FWUhUpB^H*P_1-v16zX@s(R;Z z-5|J$G>@Ht&BcZqzIvN>P=*4Ww*+V8ni9@8|G#fY4ky*+uRtUM#lKleRdva!RxbVE zD-gfKW7I;lB)3=;?am6D8xF%yNFNq~BFySX1L{d>DckWKhUCoDE^Pk|y)6X|4FXgQ zB2*r755}0o3b0J@nZa)qL*h_w{cY)KV*$JPaBnQjR|mRGGajLSUommzPZ}6dtlY8F z%M_7LeuN0X(sx}Db|V3(_dlUdpJ42KXgXc&?7g%0G`(b0R$ch|ob`KS9BX1l1lI0++q^tG5aH#3zkj%` z#$^@d<%cIH$QR-s!eINnQ_nQNieT@F-vW5@uVmt&DFQYD?r#XXaSD#Qaya`ZF}EP; zR`(;rw_p!)7M(#n)!R$U#vUUHy-tV)hxk_(=|K<|z|Wg-h>LjF`%2#J+--rn{XYXl%|;&OAP@cC0IfQdziC1?OkkM8~(Octnhcc0+mZt=Sv$~}4$G!CgY zk}-03R~HvIbvW!nCr+aJoB=%IU(h=vcs8|382o_00=2m>07tULr|at*H?`;Iw_8-I zuD_%|C0!$6cy(Ua%2rkH8N%n>9}6DgptnE{t_r*tAZ1dV7>ZS8shuRww0wziso2sc z$jZVJ8C`j+|GpOoNG1>jx|>n_?MFevQ}V-ycTD47)OB)gYu%}+L{*d@l$FIa`7Lnw zHNs6B#2)W@``=g!RJiPw85x}|DB+n3`};%v(62R>1`OT#>zkX^7Z+EH5sO#Hq1!)kKZbV=R5di}UmF|!bkWz*mB&0*Sq(nlbQ9@$q?naOyq>-U>fNu}R z?fpE@`^EbBhqYY8c+Nii?0sFo+A~cBm=&pP3K)Lga67w_YhR7n>WrN6g(Z(aEjI=X zfJV*B-SQ=Rv~ZmoFMd%4#c)D`Bi?RFVDrdz0M)tRR95zE-5}e;wJ%x@P=z%ITg=qb zweC+iJO#zK;r*eReL~Q={eI3^^UzO%m;Jw~(e2fiw0CvoW{$ootMi6s zV5&uPSZ^v4p})3O!E@bI^wZAjbV*_1yZ7I@?;;)d3U%T#a@s*c2E!;xTdJ<37-B=T z0)v=nDXrSqY!LLAFJHU}O0mMLK$pY6%St?#($N9R{iLd@M}FyZt7CgAbG5?YENKIt zPNTAd*I$}*++-S7$Yn~D??pzYcoGF7R6wcs8%M#DkvU}}RTW$y6EZ=F5drWlJPa=%ma49+_L0|{_4I|Z!BqAX>F`I1Nj8uu%C zV1n3u7*hLFO*Ub3omKJ#YBDwV@pFW`oj++krVl08>opJ<^^F=E>j3I1qPFH7=o`De zH4@nI3dXwk`cjsvn|=}+ANzfw5q(HZJ3i9sqlCJe0+(ON_nQu+lX$i1mH_NW;_@=a z%5ZdQdEG96@gnIVyLduGsFRXwqJ9{|tjg6ID105H-G6#2vhW)6P znTYj_PAQv+wjfDK$z6$vyY~}X1dO^~R!Y6f7vRaHU|^_65hEmGvY}2~1Likt%0-mK z#8duN2Fk%H8tZPg?yptC0m|SvjGMrzGy`d)%p2D1TEAfAK4{Wn%M8=yDr2jV}By@H6 zGdDM@Z8Y5^0G`aP)$`rrs(TGUB&Yr(A5fuy>(2HC7}fe3hbN*)OMXp3S2rkCK7{to z%^N$CyN+AuP@xu8p*NYS{#iJ##v<^B4VaXSjmLi$1pNc${t;{olUvJ%Y>ehGY**@I52qmXTaTKex{fXJ3)>(&dNDI`- zqoF#wr}Lc_R{j@B?!)9Q{#ilp>A}-a3{uZY^W4V!HSE+Q>T+oqdArL)OA-?L^K;P& 
zT`HG7Ig=j|ISUHL*2=KVd$Bfv74>dwo*yxN&5-tT^k`}7qbsjn8k#*}ATyB%o=Ik6f1{Keq!Opie z!q1Ir&wHLAwojq)qrY7`Bzh#V#11b^$5htub^4z=ftzh*-qb8(r zi16kWdb+&~%o!mDC0?P*ernc70%_=EAB21^IS>QRo_ucvnnY|DTW%l0LQ`SnWtb4~A)`h02W2wzlm-;q7W zAi9(Tu%%xB4!VE<6}I31tSd;RL1l@Cm7aLJ4Bk2S*G7v*fqt!eok$;)u}Wdj>o_{X zYVBa!@6af0*FnrPaO(Uoa0=%4vrzf#3sJ;meBcUyeXr6OwxH8|DLORju*@RcGIFJK{J^ zUsIUN)nIlL$J^mCpopUG(8CAL5Gd_+n1WFR8etpUfPkHIex4#e{=uSd(IF@;eq;OQ zpFG#8q8H4rRef5i-(eOf?+mQ^u?|NGZO0^fzjQ!tN>B8<*KMRd^Xg}zM{tt!5OH^pffMt<57Omm} zw+TZvl_ zWu>WE;}q2d;1Y4v4XgE#OaNN3R!>iV+}Lo?xG?IlF{AFKqkJ*bv!M3-6Qrt?yD4a@ z5Lqf55TFD}(j_7)7+A~&>&@>1@R0#b{;{`bFen={9;9T2-^x%yAQkw0{Ja3F(t8tZ zU*;_%x$It<)Ka&-YdkzeCC}q0tOCZZceuo_=O>-XR6YK!!QL%@u5Nqddk_!6T5kwK>E!ta>u=O5Z^q!p_mA&WpUv{IetncXShzJ&;T=a< z%KX%Ew-}_*$`luIBsKIEKZ~Y^t8yPsf)?%fP)7vASlPTiS9{+~0?-s2*_&oePycQ3 z@srTh{Uke?pELFwODvki?V5k;R?gWSdE3tWhs{4E*|DD~4H|2zLKqI@Hq&6%r>{To zGmzVawEt@PdzF0id4pca#0Jjz9wc=u_RM$LT)oz1OkFhp$6vG4(A`*?Cf8DJr8NoA)nwb*b{LzPnUiIP z4UaExNdgt;KxJc=JSboQ_nSJon%+S;M0(0*`ram~0t zUw%R_9azf>eQ>Z&z0@tEH(a3~3Z-3jMv1eW_B)%_JCN%SmZs~kh@@Xd^Mbz$f|{DZ zAp#-Zf0ZVKP?)vpxrP^$gUW%1r2HmK`m z>L8fRyviMGr;||Ypc!xXD?X&4W@#R#(E2g-alqB3#%DncK$tAHO^?+;@hk{Cx9kQ1+k4zp*f!m|U{n zn$O+e7mXv}u(z|*douVVDTyI=h44j(`yo9ny0a+tK$9aCjH|*I{A?^Vud)dtH#g%W zr71~*(4!bcfU*O9GyDNRS9KHCxj7Gs;}JmWLXi7e@)50@=7E6a!Tu76E@>V-u*Amp zRY;$`f(B^z@lRtocMmV@3O_ZU-~?TZ# z2ZRJ4pvzLZS{SRi_-p;^)MSGf=Z!F55>cg^{F~Ifeu7oYBgXgwNs)b&;yyfv@DLd- zQ|mDwMlwa_(_n)dK?^_}m-HRfWo9h}KMRwqe@ zXkCR0Bdxz4-*at};_lH|KQ_=$Pz2f<&rEC#S#^^-^}EN5mba%lS=JJlMNK6G~Yz(|1 z+%E{jp9{i$rdjL9zUhXX<2c72(YFmI#1y3Vm zZEWyTMK&SD>Gbfe&U~mf!7O9-p6mUKifB$wdJmG>N6N7O0L7=^~L7Jm`4uiH;hRIk2Q?Sv0hqx_#_odNLX>ay`pn@l0aHC zt!zSP$e!GW=)I3NlgO1xNV!23i&SuO#WR(vGs_$Ug-| z?|D!!Srp1YBt6eZeXO|1ee-lX{Hhab;h`61`KIFRf_^O`*rzfJAw1NW_n;*2WPOFm z8jy=0Js^mqiiIm~xq4+Y;rkkD{!a!5mNdTZwxvLmy{Q+i9oLn^i z;YGGX`D-_nDc>GqIG8nPzLW7{8A4fI)4za(y0s_e-kkL2ypxB)xY4`;^@+xK&A7C! 
zaNJ$S%)fePOl@~CUFu%Z`rd0eG`nW|;;r`&;ohz0=rQ<-!QpB$3<0VZ6HvKlTrgi` z2Xf!i(aEj5$H-{9nn=pjQ`r1jrr*Qk*5oGQ#-k*l$RL;$ZCtp`BDe!V%$K;WidD|v zz=@juy5VqQTNoa0yQenJ5=R~>ZIRr5LBY4ZEo+0?^2^Yj1+!^g(u*N$^9LCO1cHxk zvw))Plltuol2n4KNBW2SoOC^j>arTdR_m{g1tgJ?<9iH0#CU}zwUWsectLu*KUa+3 zZ|d1|!eN1a__b6acVN~|VH7|m&|N^3@0A%ae|5G>o;R*CNgfk7r>?88{mMmy<3z^f>g^#ZsgrYzs*M0x zTe97g)Iw46z9lX4pE)L;WRZRqHdTMQuj1)?Y&-{-ysbXZwyvU+1r_h7b%gI8q@3a; z0ui26gd(>0Mqe$u%b=7d#f?w0+i()!+1UYk6c;__Y()T$iwOa2;>>JIu+NoW z|HD2Bh!w-*E3S%(y%~8xp=DfL#dzG@MLaWrY=rNB&G4L-k+sD7_`{zC#I2&XW=Z_# zLYER)jUwTfMJxj7qw?S^ai-TqZf^5s_VK0QEOf*6arkHTjsP5heH~AMh&)N#5zM|4 z+B#IXJiR`cu9bEo2SGSybw%ExpA}1{CiW|=NxBmyHbgbIUuBV=*JgOBmHrQYntLRJ zCP}@RT2~d`Y;SMR&zq)5#m&vlfmk%Jq%6gx1b>8RiT&3*p*523Sx#BpQcn$=CHgBu zZ$iaRKWgn(_RW}|>UP=Hb$=*x&uZLwJQgO)`~D0>h>V`L5|m<;yl+u*Q5H_8Jbuc|82NVrqV- z1-z_EChkvNY9B|P;|hc4sr)DSd#U*D%;Woa{l`nWl>&^(u3J|zCNbg8UEVuhj2v7o z*U1M2D@9XSq0GSSUwfdL09svSMiI*H8Zf;zb(mC8JKt}*uxU~}=@v@#B zo7u38Zp}k=*O&}EkTdxJi6|G>Rd$PHiwhDOvd6@ZBL@<;M#sl~LtkFY-RXeI>y>y< zlp{RDq~1z6fc3>o*5@Gw?BKBRn0Ry7kahKRVj!p>+iSKo`uic5CK>!OSUj{5N8j6T zdQd4DBu;gVu&zNB(zi+{rN#~p4uV!c8+x%nGXk~2EBzQ&J~&2_Mpj9ORcK&9JQia~ zF+<7NqYMZ31?YNgc+0bgFGcQ0iBKAX$xiBG#swF2?g|}Q(Je83=-^N?X60D<&BDS0 zv`WAs`$>Zf3?ewACD`A8-AQ~s4;%v}LT9eG>Ijqx^zZ%1nVvp=Op(Bpl2LauIvUPq zm)NDIsfkAmSY7#0x;ks7n>Wkez4Mad6u%FQ(0A{)78$p!!B0=@WW&Id79Da~8*3Wh z>QED}=6|?8M6KR4(!fGdyTBW02*HA7jSN6v?k9?FsBoQMJy@Q3D$T=&KTtb8gTwyqavl|Ts7?hArK`f_k003vXa@!89 zbU*bH|4XZzGEjN(M?QsQUnWAOew7K<)kEH6Vzh#sdcx&7Lv2Z)LJUP7@E(~V7((Zc zR-l)`I`(}3A;#MrgAkhLknSGx3EY$Fc~MoI#z3=NS{zm}u;sTEm8=i%&q1AR@m9hi z zk>1`n;o*H>zkUIdg2KW=5-AiP56|=;X~6Z%Cu zaZP%kO?!4JM{Q_A+MQc8MFhp{b4#9CiXWr;bw8*PJnVi1a6NQe&CZp}#_mi|)=+;N z^w!gNb83;_=c2{vxXpTM!X!*aeI;^9JAs6G_cQ1_xx};5reE4EN;r-H)oIQCOb-ljk;wClNT-V-v3-v>?C!1!o?N1chxwag3Iq-Jdj$V*!XD4>uzS!>ga1$ zrR)i;(i@`^WQ@OwY)h^VvMxexHxrqJEw1pZ0H{^=^?|0p5`1=tIA8N|qZVug2aO+jYjZuH%FgH<1pB8@k@}=6xODDxbaT&R}!FfS+ z3=BXHa1QQDo{@%FT3MOJ3TZrqgoK2EcN&~+6(+uk>y}$kpkMYpNu<+HUJmXpm-&8o 
z?1}bF>fW+oTLIyYzWb?_oq1zk6=Ll1WL3#jZKAw>y)wPz%zW@ivmI0(5KTVM(Fkd~ zUH{zEnvFE_Ao%j_DSxF#$~tTY{?XI0MIin;fC219YQr_;7 zU!JHWBCK1Fz1xS_)k9QYvc`hIQQ#Xt=&70>tE1Zz#<6w5aWa3a>NxE z7q70aeyfBwlr|Yz*x0n#RO{K;+ZT`jn70lcMe;plQTeGSXuNv;dVM^gJj0@I@xTXG zmb6WI-dY~4B|Ada>Z0wGCsGjy?`>LY=otxwLgutZflL*r=TeE0Gu#`frA1i?rO!9^ zQ~X9l`1w1&ttA5#ijM=QvW4xr2Je@WffDB}S}>P1Dk+!}! zd2|d%Jx~N3Xjs->EZi-txHwsCDDWLUJe|1nF4seUZ@9wKT#v1WFTzx-&IWqh+vAxt zv;~+m^)IRs{B|dvAIRA}W|N!22JOBvG3JmdUoR&kUsk)XfI8OX^r(ov^z_Rv1JCI& zLb^;q!s=ObHMBur98XOsYNvK$ES864*65~JFH2Eu-Oa#_>x6rSv^cUi9Nn19?Lb_w z2yM{WpXb|+AmZS?>Nuhw&c54;>s@bCX}vt(r3nDXR6MucF(aTTPP}>JG(H%uzUjSrP$~8H z<_hBr14mLA%@ZvxEj2YtxiuzYKjWbs)y$Ozxd5POZZiVY%Hrl4wD}=dETT@r@<-Ub zwKk8UH;fhdBhowzR1oVHl<&L7hoTTO=Db};+^dcEG`#p2E#TgccpWO+Dam}{2o*9w zeymQr4!Qx|kSHG?riB5CcIA}TUMHUWJeA()`6NRi99p}rWcw7O0Lmv|kH`J&1Do6y zBvWPl)nx69=<+Z5j=sIDZ>?-SV z|64#~qvJw7`{L^Ttr6rCb#r%3vNW+F*v&;d(Xa`<(nJ5y}j~`?wGWKrk#ZvQ_St1J^3>u>b^EB%gYKZ zoN@C-1ySWWFZOoH-a(25x1<%sTj(Wg%@3=D1^zFY!WACPEY3+FA4 zSIb;sRRCIVU^G(FEPd^R2yAPtxJ&{1kY@nr%?5ba(3?KAfV9i+-E8YMAb9#_d`9Mh zitl3g;dEx-Py$+~=KtZ{ZrZW z_(`1;I0Nde?qh<=347ygTS7Xa@>Mkg-*qG6`2@wt9k}l%nsq2u-1t6k?5BvZGusb* zi{o~f^yK4QMFcBmL9Z9akot+I+N286uk9xdcsBlx%n^fSv}h%xYvHmg1{Wvh7>2#-)Y{7R$2T$ zZ&y#%^eA<=8s zdNyXst-hs1b29pd{p4W?UPo|%Iuj@W*?D3)B_&}KCMXnF02c*Umr#!V)k&AN4X4{O zGS|rYfav|Q083~l*qe18e-#p1zhT9DH&MNcmzGs~8pXa1@t*v+GN6yNKbhO{-60TM z;|~c;J8YLK)Jfz}>j@5RoQpqT|0(pp%s$Nl<2Kztbi3 zwe!|&BT!aZ1%@WGo7XCg%qVK|U<-Ky`vj5xbj!pPVp|L?d`QZ#R>LbIwuamJ_O{ItK3{}gF40UT~yrG@HreyLiBkC4$g zOJ0lWgvn#6o|Y$$3Ag^{Z&CiQ-+#;Y`8Ih+U)OXn=3uyC3Yv7^Khxu@)#^ zeu6A=3IXQ;I{jN9f<`ZV4L#}r5>omjTf+5&O?%hPr@Z$RPN(Z*k(nEQ7 z;oGwYDXGkhwtK^jrxX(homdEx-3n447ceFlfP%UI|51}=w0@wufiH-{ybH|6NFAY)oW{iGeiF?97PTp6~q{K7`gW$Sh-qIn)=O`3nLjfRPuTOc1 z2rC^?Z?Bd2LMu>H;Vilgqf&G8iLlEO12r?u7qy+#Y-y;D{Pd2pUoJyyj)xsM$7}Wl zts2UPR@B7F83{%%jGrbPeOKb#Ymfya)A;~SQZ`*Wh_cFnvv%eup@^JmAV7J4;n{P^eC3vN$%^{U8O=Y|0{n3w-n-Y?t%pq;h!DXLMCnmHN6!Ke z*Fg~11=<&{BxQEt=Wbe8)c=K>yyj_~HV2`XNh6TE(N6%G@JB=ahtU4gbmcmM+%ANE 
zZBjl6kzl{|5NW;7|CJ~uHG#cdfjwWU6WOB|vELphpWLoN`i6m5IR8*@(A<-U~n!6=GU3m4eZ?a?->3$FNrkFPt)8YkUb-EJ+7(3vHdd8^Mq0{qQ!fQfqwkN8>-Cz@83XFTM+& z2lP&g4q}y^^V$FQSHJ-KJvh_U z$G_ffe}EyQUUT3I?v08;JE~&p2txZcaD@axE0J#TpmZtWoL5+nPv>ql3@a5%2Bm;DefX?(sM+kbd`actvhN`*Z-1AozF&9tQ)| zHYX>7BG#ikuFERr@n$@`&Kv}&V?WZf5j+7w8hQ8s&9wIfdR-r%U7}~Gp<$HryY@_O9s7s*rQWqHb=|5g6{Hfpb zgxNs%oNKPnFi7*LNffpB#PN46Z1eb~KON$L*yv8v-ImD$mR<-YAFzB=3l^{MXnf$!>D5S4 zq*6XbeB>|RR=)Qfb(dg+6k8e?8La0LZpiiu&B~ki?g57i>n-U>dWsUduP(dmni?7^ zk2WdaLn~c1csd`cK1I~HldSrg7#cDIkp%ezpIV1`Amd{{lTx8IUpSmJ&tnS0-fBc zgBPcPa-eBlzD{P#0u#GkNX$7f56WVD6ITweXdtMYo1(LmJjy!(Z(ufK+F~n+6fB}l z%~#S?>IEA)jU0yGlLc`wz%$>I`6M`rCm$Xjf)j*7qfHoe`^pxC+#Hd-8TA|?fTZ(J-(3AK4i_2>syQqJ)bVpqymKr+DYPl zDR4~brvq>3O+kcL&bQIg(iclxYL+<-DqhhvX!A5_WS#JPuxNtA;4SW67rx(n>1JZA zCYJ+fg4fr9N?a2#m3T)|C|f^!%VW}iR)DMN2)uR!rc$}yxacTxb^TMkRXn1ki1-Sh z!yt4oU8qOsf-jsFkS8xVTwJ%oTjOY9prMQ#NFvNkkb#$i{SvQ)IZn^^ViXgFzL92? zk0517J{fOFk5>?Ieki(q{8#4S0hAA#c_A|hgH)-1){R5)>be7uF@A#m_AeE&MWHk3U6TKt9vVmx62FWl3Jjch1&IRa_tp<@4G$T_F` zbTP7}S(5BbkjcooAZX-o!Gxmc3vpbHzly*64Xu3py>$K>IK@l8YWVFf2bzWFt9v3@ zJ!AsqnS)oMd~vb{l)d_eY?nWgkbkYux=P-O@)APfw{W?Pf@*5}VRczAd~GSszKHvDebrToP|T0) zAJ@O|c>g8U0-YUDaTGg-z7Fe;f@Sa*BkGtE*l!{}J)I*TyxAFm8zT`@NCLLTHw*|e zVhc&fq+@+XhPt_!g9`OU=xBdOc?urW?k3mU*e4z+rGk_EOM#cT+PH{Zd(=<*U*}>D z>>v;BJ6Viu7k;p{Y1^pSSgMN`M}O>o`671L zh9*F$qPyPzf%kDy5sTiCX^E;@C}zRX@Gyg@J!YBb#Kc5kApS`%uwPA$gR^vALZPQS z1s<;@_7EO$jjJ22E6~O@fM%}8jr7}JU?9T+#&#J8IOwfco7&Fr(^E)@ z@0hwghI~=(s(xZ$;3R<_?Zy}hhP@%h%Se_{c8%{Eh08>4t~T49UxZ_8uiT>&;F%Gu zxaop@u>*%O?Jl{A3CFWsCpux)m9_W*H|(9b{FKqT5vjWLCYyT-9xS;0XK4A>;D^Ux zLJkcLtqkXV%**>cW>p@dh$A^XGz3!UftLuZ(>!`rB3xXn4J1;*K|$liz$)MUq4sL5WRyJZL)U z#(|y#Kd6nCzn=Kl2^0;mEwuuA}tRP-_&7yxmwv8SL)?}WA??(XhYE#V+c zU|gT=8F(;pSW3-59K&wNJ^-D2U*4^c)qkv8rVbb7rUJ8`$mqF+5dR;v%XSvxCn^+( z9gKhEx(c_MO@QD9E$&iYy!FHBrwC|Mkh*DwQ$}{YN|Y0Xw#<74>gc?sy7bc3EDOLY z7~kvx&Y*wov8|3;9l~i=SK6PH8S>Za`cv(Vj!n2{wMqfNDp9&?bv2cTjZ-jrf^{41 z*rHJa_gKb>N8I#ocSs|NpYT0>`gCk;tP+a!wGL{kALiS2scvg)dtR1UCTaw45DK|t 
zA*mYr2{GN%xL}6`s7E)&%FES)zZ!7^+T_LEADqn_m)tZ-90^nJy{3H2{oRnp`fuTo z27xa*?mNr0PkkEME~wUGoFxw;86ZRXZ_J<$7m1b*A(%Vr7l6R=-8=l4kD2pSZxm1j zuXJD3J%Y^4f(qX1Ym}@#S`7cEQvr{}Ti`OA2v4Z}I`eIx)i(${F|SKW81rpETp8a& zjsm_rgHB?_b}xo*f7^V-dF(TpG$xghKzUM3OiXn2>u?BeOmrHkC9yC# z=RT>=u7#DA&X*S-oScmZZd3rI4|}{kb+go(<$~;#=4ep<*YWXdXpHmw+*}>rTM5^g zu7%sqKAcDURh0kMC;c-keo3?68sIgC$iP?^fwBTfYm7L>a1@R=q^7;V+xgmuA&$1% z?f~N|6!-XlAd+0BCGjb`O$7yn2(T{2%gGWRJYJV{kDyFnEQ9gIDv~!TkAiQ`w@= zG4Ay!#a1awtfNfwNbfXgkR6au_8WvHFkJxR>)#$Tfc}641hh3cuw#zKa+!ieO6tLt z4$KFFAsozjmVz7aetts!#xMB~BU&m5f*uBG$ULRt2AI-|(RL%E#FRo(LF-frDH$|E zM)&^}U;v^I_!g;xnQ*I(Y=6nZ6ZHaf0hW^OD|UvDPPkprm!@6iypA@U{Yw>|kRl7< zBfAP6;B+m>$!TwE!`rRU0jE~7{)(P@cz7xD&Buy^a(j9VfU0HAL-*@_Sd*RDSj}D+ zL%T*n*H83U*U!!#075fCIE8eb;>k|2{j_}&JKN~B$*_XYs%tgWe;HIBwIF*9Qo+Ui zj>phLrVflf6_#&m?DPuL)C?Q5`p{nywFOyM(0;V3_Wb$0F0|`1UDYE^O%m4O$r|}N zhy~qGF(6KkwuZ*tNB!cXia*s5S7P76%uY~4Gsjtq$HJ1ZP>!-*0O|jc&%*P@ z&VS|>K(1VQVk>gSecVG&j;8Aq-e2Sc{8Z2E|L4QvZkKVOd(}ujN8g_^aN@xaYinC> zJ9QFXI=L=J(JB1KU(1T1u5YCPIxV6QtlX~>z7 zVgWPjT=GQCp83duyHuG3IpfE9yIFlV>wTNk=?`4EyF^U>Odv+f)6uOf;fi^}28+-l zh-I3kNKI`E4bUXEmT=v_!-DZ&^e#~mpm(RJBT%7-=m!jp)oMf&zF#p_v0cAjK*JTw zwx8Zg97*QT{hPJUZNCW;vG|2nHnz36lz?sit@r_LQT85>xQo1tM(sIAV*=8U-e+yE=sKobCNT%jwKV=i<&gXh>hEpoWBV0Y8 z<5%CfC)Z>CH6pJ&n%pN7@Cyc&I@;P7!aRTl+kR8<_*iB?s`ciWm*2gb@TM_3HcLKg zSNtXuNeJHR$0OuvU6HYej}B#r4tS~P{&aEi%71rpUGftK8NNvi-Smwng+%PYjF=?u zOcMcUWTUCDa{=n;#rRj6#8n{J1KLYxwx}ocU|zO$`bBJ+N)EgY zF=Uqcr1!a>Ct$xJ^uekSx;N_iiq?0Wke%Qv=?$+`3zs+5)2n(m7{t z`p6v{n8wI_L>%^OX6~Sc?#a{sup#k`Ko6~C0526&@wkir!YWrk$a_Q>sVo@b-G1CU z2el3RRuhx=y*;b$I_+14rzi6XHOat~F>@le*DQ}$j5`&lduhoUXo_{I*64E1Mc6CB`t0Wo-UGkl~_=lJ1IkY;i-_!*d z8f?8aoQLG1qj2S~;8Oo)8GqXNrpQzbtobB__Bhog-zHTPrkn#h4?b%6I*ud5 z`>a4vjsDf8s9pJAT}rU?UI)?une=Y?3E0Pe?-C$nqK<~}PPTXU_3zOBvbB>52bIFd zq&;5bhPgsO8$0ee_zr(MAch;Ts6T?)hRH|`J4jkz51%H*KjYqhGY1hP$ZV<0{_C7QSVy7DjNV7| z!9w7k4hU8nM9lCT9>Jn)Z*D=dF9Qs|3xLmeHmd;MFP)GR6IpQlT|Qfuj~ENzd3X9w 
zu!Qt6vwu)HZj51S-r{JD-l`FJ>0!1Hb^y*yoOOO7#vmEb6?U>KZ)nm%FlYS#=X%_ie<&=q60Hssw$XcGmW2~L z8VQRgv137R1waXa4}9OhY>@h}3Ywe%r~OFPPhE*6GqVc@JlU;3W5Q&P?JK%(l}%(J zrq$a_M9zG-7IS}RK>o5oJ{*xdWphH8KdC@M>8{G<2}InI3rG73uL9H91h084c}RV? z`Y45Rkx4YjI!9b)`INg{?gxi2uJdAtM`!=TVe)@hCm($moXuIq7 zqHM1)sl3u|c?rgr(>+mH1)&8v3A+Dm?x?mE|0A^WemFA#@e@r>_HCjLht?jA62Yyo;-RALqt6}RbO?F;ui z0ZNU1{8sbfOe5vIlCe&%Vp55CbxqNrn@XhO7lGfq&CE>Bn-AWK#E{pqvw~vSSZ{6( z--Q!0kTj;`q?&BBLqI2w3`vgPQNGaOC4I}7{i;{52s+CQfu6JIR>k$N%?^@IbW`&3 z-E1JFVX#vUXv7zAyO3!=I@36_qrX3OF(CcOO|`=58+A%8^SG-?w@}$W^MTKX7m-3+L{0&inQvSrq1dgT$i=cO| zHgy|Cx^pBb;-ZN2ZvHn!tqdR)ND>bLi`Rmm zzajb*&#>fOX|9ey3^H!&i=b^i;8Q|wq}_89-M_-^sOEH02Z=2?L*H;I2{?CBV!ipL zy~=^Q`iAg`6|j!JpG)v@XTjzEV+4_Sb{3}gD{dBqsnzQPRMf4eRHra5s7Kv>tNV79 z0-@W=faOoUx8be|*Q-We`h^g@L*lyl_16j}$EFH`!*AJBVln()RW`P?;;LN;Ok!}B z9~}ggE!I}wRi2iGLGC)#XCNJI*I&EDE82*l2-r!)yL|-G!p@@Me!3mNVEx_q7!j8ZB;dB#!!aO z8?UY`$5`}L&#lBR*F+krQ?H~R-W1LDg5FM#pmkWw!n4Ym(@6cT;ZvyEM{jP`yhoml%hESP2x0kEj7`J22LB-4~?#^$T z7#&{%!f?=DTXZ5NAlq>X53vL<^b3}_+5a?Du*>-}MZNk>SfgD5e-YMa>^w_PkHE$K z9Ci?exGrAdj`VgglL{BNQTuKUs^SQY6yQCO3a84a@L994w*IlSR5WJgb<7j{F}1eE zb%TS0<6U}&*yT>-TTRi26$8&L#eYbCJKAm*`i!JXJNHMvfQO`x_ulV^y}thO4A9Jq!lNJtSxxXr7zso$u%U7WsXqP%U;)OLPqVzngoxm z1MumC(AdR)n1@|EfYbsX-;qJ$aQ>a_)uP)1gq{{>YKQ(a z8aaP{ezsln>aYHEcl%%cDYH|vu9ZL2r(2emoTwyzV{Gi!voNwX*&xadykB`_=_l7D zB^d@3lEhS3bucr|Gla1U30*91P>;j97TRRwyfdVJ5M@L|OB>6h`|Pt8J+a@th+0&4 z>fZg>Enc8jyP~5h(hA}55}BFyF1-&A2&U_Ml25& zBy>i~_8XwqiHMl5bG&_fqSV@%yaMDO|Lp!!VoI5=J^ju)GXj_FMM!;rQ6p1y8_o0l zUQPh>TmbF;??e_59%Rn+;Qk~?AE{c&r}p+%2lR)Vd#78L>WFeQ=Z#vAR13UHBP*cYrJN1~|fDFs?ODTkuCoT;x!OS!u0@`)0 zjQeFb2&ZNbln2QUV80>T_{Uj|m|gmvNG(k84v6y>-$0=$3%Vq0WZyv%(%8~d7cxCv z0FnXcACCbIngkyo0IN@7IBd(wKz@#2t4{onVg|&a{K}V+^w=nxdqq@J1isO{$k(sH z=t0A-0d4bwQ63px_X~HKrdOt10``A3{1vULKXqqVo)X|S1#)LVR)~qwlzq&}5fBuN zjEn>@Q<5U6q?y1rXurH%Y38$;UN!rOdQrog=G_~M0(4ITU$$SS0BQG&_4|FyNs{il-rqR<661x1NC zaQ9ob6%u*>ib(_lwO5G(dfv@zY{W7;HG@w>I(7LE*eG{8eAr0WV{-37MU&A_5U(5`@G(;LGhbH 
zZ39~TMw1!;G>jqn`@=sd{FcFZdZjb2dECG~4O#>~*Pqhx2C=d;$2pd3X%&KOyIG}0 zBku#TJl=~ibeztAf&;9E0W7{6cv44WQoGx30XhEjX6!&N_d6%3Z4uhr`aeY5*~b48 zZSNBP6m2^Ydb+;^IcXw}t6^_CjN%_`m4M1jq!%=9Cak1S4qvvFy9DbP7M*V} z!-TaoT7cQ)mh;vQbjb<|9v-J4#HuAJ{x#^$`AkefPQB~bALTe*1c5;RowEI7v~1Sc zGqcEzlNIEHPgNM9k%qW(YD>AeT1x&Xy}q&5PG{stMi?rka3k;ZXx<;L>91?!k zgqKxjfynIO_PRCxK-%G;wZG5@$oTU;oW4H6G{!&h14?-jzzy(H_~iP|5~Pk;<-^U2b=86k6RU>P9GbqoG)dGrN4mLJg38Axyz#RFiB z0?j5S*k{`(_EYL4gKqH^BdCdRs-9;@WEeK20Pj7>-{w28N3*VyTLiqTwjj}lJf@sL zA_gH{fT_DD&v1`{q2>_i5<%c~)#WQ_n&z+O`z+xx=6GQMxRQtcZCLq;O8Kq2`LMn4 z>)o0+A&b$e->Y5AuCpQ>Hoq^$>gW4j5BZ6tpx-7oE-o&1_Kk`v{`Mpg9!W_>brbxY z@qBT2KxCd?vj}}a|Alp+rkq}f&eTD!R~(0qep`o`)J!*O$oGE;!Uzk!n? z<|%KzY!tv18ui+QzL57uv{I?ivTwsZ!{fIm)n!f*t<6z$r*x|h8;QNz*B^I=SHb*3 z-0}Mj_C}x4FU<@CRy3FBpFHmc)rY-Ef7i`S%Vzlsswwh+3X-b@2(n7rN#w1>zJB}q zrB!kbS_PY;PTh*l>DASzBBW^JMBBdVOz-|aT`bej0~#W+bjYZMg{{B7I6dCm+uj}= z92B~7W4ivPxixs`R#v$dCNHMpBkF&$D4{?vukGTT+pl1p{dRr*6kc8FX}v7AeGXRe zL*yio0-}gpM8R~=nIw)u6>oK8_x_EzSEb`x%EHB|j;c%A_rGM)XsOb?5wj5Ly7f%; z3ze4jw=35*;zoyaFzzS&4yv1%MX24m@WnCUp!nYUo*7+>KJw~<9n_4qS@>l4Gz(Wi__Utn8 z;ltub+*y82F*8@=kZ%O7D$IDtQVEXNJNiEuj-4E{oqt_^qUd?wxTeHqXZC%Wf|(Y7 z8BWfYU)jK#-Jqe0lzl`y^qt&F_RvaOitu>6j{diIuidiZm=JByYxXP|oQSj)x-GbM zYR62kozTP~#HG6E^A5ilqf_^zx9>OUPt560xaPoL&G@+CoWln@RqJCV30X$%{6`=3 zoaZjBF)zNbFfq*xh zl^d+(g38-t+IW(}L6Lx3vq&8%-SX_Bw^YD()gMvObs*9WWp#9a`p8jcoWnZOH%6<{ zLe_Q9rQ*?M--n==Agk2zR%B-Do;TAP(NM03XW zKYIVT+D1QCQQdVZBKbwY(Gj#K9=V?4RiBpr(#0j&ZN2W4@3I002C~n_?106Esrn7_ zBE@&eLH?cR?}hRU6flyKAmd_U!UL*%_hzM8Nm{_awU>UjW+J-8ElX3nw&s}e2(~;Z z9}@C1DM@>w6MC0+c>IX))D`bJY?!kEW!n^Zy+5vR!Mf{ULjEqncgQ(qe>pk1ci4TM z0f{NV$RLn4a1SFjs;9X#e=2gHFn9k^2XxUpe_n^kEEVY<>mFfM@a>ZkBAi~xaLMAd zskg{50V$FtF^Tw?` zCih^mKJY-2q}(y56@9@xNr>8_|Lh{+5pS~mz0Kj`fSxtYlMG?^+gVB_-aJ>c~56?i3b$Z#Q#)XAk-OS-;#3 zgU%mzSw(e3Wx;gE;h)TKNF>8-SsmUW&)p&ma)+s%+~r0D^jPdn@@of3G-sEwab#=| zG)oB{jB8^b^+$i<(kXl7>&9yNs2e)Ns>-RP0+j1r85)hL#o3+rmX z@Ey{Gd7xOs8BWXYp~X@?siFD7U!U(Jz9+f5_O71q1~I=^_bLl-@*YK&hccx*!mGhtQjXh=PCtLkmTk 
zA|SoDC`Ed)P$ME$Ktdp)MId|!{JwkddvCq<{(G#IKR7EnduH~`Z+?59J#(b?E}y61 z?iW>}NdgM~%%bQwpPw2S7UKK+8)j)aXDOaDPWA>2Wwprqe`MJf^%KzV38>N->C^-(vCIet+Cl!s`OZON1=j$WZN2!=F|FGqW4Z@gu%Y4815iHT>@mENJ* z*_r#L`b*q1g8wo8n?-r!65qf*ac2s1bGDvDpx4&hot^kiZTlNDE32#Wh~>Rkx~}8p z)`ydh5%T_SMYVn;5;3Malvn#t0w=|TObz$&vFPf2z3GUT`;HcNuPwvmY|{{W$oP<> z>S!!03eU35Tpr@GqQx0NF=^Mf@#K-H~WVHOF{F3AA`7<3+IuC3!;TuJLp-3DL&dG)JRoCT}xkLxcTHzJk~tq z&!cPOCiSw%_SjmI582t;-66+%l)(DPF#*(=N394 zv$Oe?4!3+ivA>S+o9p~(0Kw~J-`d^i7*@yebM6837hav!``?VgM&v9RZ4BGDDqD2& z7&|2I?mH_pqVP2}x28UQdRGgtp#^m(#C6rl`#MPzGqpvhxakoJZE$s2yo(o{5b@`K0acbDRpqa8-p(?O(7DqI`;Lhgw9^T z9Bc&LgvC%wDtLO#Of|0e(?hyjmE5tUck5)j;50*C5O(3wR>cH8L;;6kxhc&dMJz$# zESsIrre0AAJ~oWxlDs5xmU5`3ewRaDT1|p8yp3^`{q-NTIErLNk_qNlSd}kgi!+#I zlJCy?#Ac86Q81E&87=S;e<$A|+&=i>S>-k(o#*(4W`;)Or!@SC^aHxgTwLtnQl7Qo z20Et5lgmk&xP(>!vLxxW%8Jcb6Xp|vY6jzi4~g-J=o;k{WmozFo(1=V21E8}W=4Vx zB^z={Lpj&Wxq`MM@@zAV>x{-Ni9^eZ>oo_VF=z*=nP;NrG^xyD^+eEue0mgsK)~RD zxo=84d~a(hn6_j4>eafeow_B`w&zPm2u)3Lv4_;Z(bVR*cr&VXV`f&bI-@#A$8~nM zrt$tc67-)z4T!Q8Z0Ab^9YlLB?@|Dw=+ZeCS%qzDz;qzwd#)a1$fX#0py+D$mD!*S zNO%WJX@2qK7JpFmJVh+ZBZiX|F8H?|IIW3E&KMNKZ|sBPJ@>jtgdLnVLIGw}l?-i< zWm}T4KKA^a>4PSlfffL?4(C>hj%^KFy-bEQgscG|_ecRQ(34$M2yV1%C|APyUzU5k zHcMGCvz|?xtVm1@-&)<+u)jGeR7ayQv-=XE-tUf;{uVHUv1$!YcKaL)BpkYbz$?H? 
z*H>18k}P&|BF+y6IDy_DoqEUx!LoP^iDo@wk{WPq3+;CCOG*YjbqXtU!^$#ZX@CSc zyqXIj%W;aN@Zo7CLEZd2h6A$Kf;L1TsUjDwu=_WTo5&q(9KyX)1(kjgW zy7@RQi*$6P<{PpHhr0eG;6s0~%lqF_#XQjvow0EMW!KMy?+&CB;piIp%)!zj&{=@v zFNr897Sx&uoUcclRjT9A&Rt)vOkjvdvYLzX2VJO@*jxC7 zR*qT*i2`7*N;UrrYri};3%tQ{dcLS5FMI)J)GPI6Z6@n+$8~@Qh!My;V8L4h(UUD}ZjEk|WMQbT1`H@kOI=ie3DV{E*XGV`Gcut#$p2gto0u7It=zCuVUqoE51rE30I3YUX!niN+_LGT8u4A^9|aY`pKVtI0dn`afr&D zk5T-tgx#-(xci4~^Tx6Yt}S`>gU-{@;ij>}C^nWoDmh{QS@_iMl1Q;K5(H2m+FxKx zl$K`GR_jb>MN>A*2z3PB*9aQr)+JjW~lJpwBsO9u-y*9JP=_#O(q z+&~}@9CAKjvKTL+vStKGWyg!)SuCXnE&1$eiXL%bP~bxmU81I#!hG}h$_^Zgr4ZCu zD50b{{%g?w>!%IKlPeV5F+hnl_wkDQ|LaPD#0*tT$EkC}rwiZ9!4h*nywRIpDE_7CUeG6qk>oL` z@OfHf*%|xE2ZUtn#d;6sH8QEV6yT99ECjCJqrR=SZsBnaDA&0&J*3{dyqGe$1^310 z*8N_x!od4(41qV!@RE|QdzN->Z}YCzxNaJR-%OP&`ms%>Wn3Nz(g z)!scy8$p1-)BJkXVnpxRpk*mY|7{d6bfT8>3b4*oGVBcyX6}D$QD0;JRyFtHQ#B&z z?I8~8Xb!sW&Q3KuyL`hpIyl~>ikJDZI(*4koK+LTIWyHnK#FXn z-zQ%Bz`pR*E3Ny|=~>3zJV8uVO>K%a-Qm9fLx zwNMR0wX45fNQ;_~b%3;NW2BA-#OaiKKlh7HkIQ~in~sE8%jZAPN%%Q zYj*EVO7SVrimgRu5B(&rdTu1BKcc43 zzhAER_2Rx}n9Egkd{@E0ef{!pLG46Y)OZYZb>sEZz6QqL7Y!sX={{96w6H)2zR?#z z;HGMei-k~Hs5LMrp)C{odeFhu%}pm%N0)0%7O!^ak87nDb*X_;uKrU7x^ww~?Z^$a zcd%Av&;{+NGVbI(kp~Z$1l5xx1^XWbshH``*|}6%gKTyz`e2Kr;(imXv1i(;Qf&Lr zvwUhvyrDBXJ~OD`UNK^=e-6m8{lX( zPxl=jBtTeI3k`t>b9`3$UAC-Ou2@&x|3grH=)N@CT^jqTP@9oernCIx&u!rCJ>;1r z34u%0RZ5qrH+M~7FfaVFhs(3z#04MVh&5&gA2ck)q_rd;r7!5CW|`vMZyN;RF%fK6 zD+{8{h|Ul)W<^jkv=oQ(*rz9Zqdha21=&pb(%GU>D*Fg#7qG(|YcB#)djn1@+6z^N zvm%?;9#5MtF*^5+3bx!oQR)H1!%+T+iuw8Zr#fm?lWxW(wT+D=kCcnq+1XrxURCg? zFrn7@_YT!TKu?;=O0oA|{2X*>9X=p%ztII5t(qY<7Jx1Mf~Jg&i~<))mh%gv#xU4M z`2!t3A}Le|6^mWj3T6N!X=^b2oRiqq#P0ekf zf;)xYxs$QXLRALcQ!KXQe8;p`7&fMBaiAkxc;|KHvxeK^>zNlD(&4UK8m-?}dR=bw zQiCPP*Lv9${wPP)$~dFj$CSZoeIao5QhlRWGFApLC70|8urh`dWtFLpquI38s(jR? 
zYc4PjeI`mJ>b<^+2|rSUL6OM8gM5O;`QOaCnFW-VGM+o55g!n06AIX-02UV3sxMG8 z=|&fm-`JR*esB8m0h8R_gR@b2E?_JG_H2Vn4$EH~yrWti2+VuVbZ?w|e3CGL!s(Mw zV!x=8!Q<(c0>3oG`zO8M81ga1uF1G>N1o+T(dh)idp+i9?c$Xc?of0d9kEcEw@bU{ z-?&Bp^4fPDaYcf4q>1SVjoyfXwN<6aCh>W{{M#?Jia%)fzL2|r-Lb1yHU3>rr$9xg zYi2sllFX=#tC=rYJZrh8RLk{M-GZe5*pI1FBo4O6rR>SH23qgrSP)WE~&Xw}>q{P@pp zIJ(T2D-K?tMRIO?< z5pQ!g&}Z0?I#+$I>c`_&ZI47M7#>2_7$(xFkv!1&^_TpXT}7a~Uu`=m9d0E}9Ps)) ze_-P`p!zFcSqSl>(bNJw$;Mx3#D`Jup&o4)a(%}PCyPzS+ndUPO5$O|LB(fWx6X!f zrF+__-y6y3vlLadv1e^0P$#MhH4(ie{_XLPjy;|$(b3j6lFl!Ec(lZ%q@;wDjLJ*X zd1X=WfwXgRb)6Tv4(KP$Ow(U=kSkVMX4$Ap_rwSk1MSbN&U9+9lVlrpky3K=X`U|l z#H*bGwgRQ%M=BhGgKu6uCxFuO5h6e5sWQ5e+70#6&b@7mK)`bKWjPX3Qkd7pza7*n zwyO!Npmj>p2qdt7rKLceV(J50fhdRMLaP>TX#D)=dY4s~{>)SBn|H@q^D_(tSy+Af zxjwksal}7MDG@VetxFs5v)^PMNO=2EAoC)XdfWFGYWg=E?Tu8F0z`w*Rt{B32E`zW zpRA;`H_m+A_I+K?WH;G7bix)n0dH{C0ssI2 literal 0 HcmV?d00001 diff --git a/example/ck_tile/15_fused_moe/misc/moe-3.png b/example/ck_tile/15_fused_moe/misc/moe-3.png new file mode 100644 index 0000000000000000000000000000000000000000..77c6d9b6e43ea2c2ef9087eadff6028b6af3f113 GIT binary patch literal 18655 zcmeIZc{tSV|2I6PlFF7O$r7O`E?br?St84jJ%l9tnz3XVOZEztgt0YbWCjyr>{Akj zVHo?GWb7u)Scc*LP}lYQey{KSJMQB?j^{Y;=eeKjFEiuwd7tO`KHukid%e#1Tl!jz z$Ic!Dfk2GfH#LkvAi9Sj(81?N4+6i$|B*Qj0^LMtYp9w$99kSbo~iG*^Itc6&zJu#9Z0mwxK6E!HNBtAFdw$e#vcNK?yr++Z7eTS_$^pn<}!mE9fxWoK*rqLZ+imP~5snrM` zZl1Y8LjyLL=lMNRcX^BM?0)0ob>QuwWEePW(y6flhEw5eO{IwINr`usSWVWfT{+ux z(`;87X*$|2EIq2WC-`~8NEvNtdQ34hKo8uKW4RPanTH53=0edx}J?mEva6wVD8n( z@yFTn;HAioQ3$J}U=WC}^DXnMr?86K;P-AuUVZQQkG;VA9)~l4KtCklF5Jy;|M>Gk zX85Ip`@f%;zQMP*PQ@t1(LFv8T)x78w8ja6D?phxQizMcw)FzQqO}1+(st z_K%Q%bn=g#{D0%@WKTO@3Ehblbp8G@o11r`>->K?jem~uKgan0(qnw@=Jy|z2r0sg z*08>E_;@?KEudk?`W0-3z-M=_MWK(kJ~>;;PMt_CsuS207HLlL$U3Pq*9c2sYptbC zgb%3ghrGD+6H!D&W*xR2hyg;%UqJN z#RK_5o=T$HpYzjmeCICp%jn?=N<>fViuJb@0yMSNMvGalvXvf<7Rwa0!98hkOoBJaIu2HbNGHzpljrkbbnKMECmmODN*gyVaTzt@T1-9SN@2^@&{AgaDS*1?v!nQslN2d5>Z8CjX3jLXY4}KINWt{^l;9DnJV$y 
z;21U-pn~512mVE-vypA{)wIpO^lMFQ3i!tU>PC!E3PaVC{RD)CaI7{(Si{+~^=qv| zmx1Fm0TDLu!~0napVWb|!?h^|@9-umgTbLO$`kqy2gUx} zkJD0DGp^U=+N3PhjlgWZw5k4%2G)&)YhEKXGRQObfCY+3|&|`3- z|1DCMG!LfP!b%DXMgpLiR|%tI0(^5(oUlpR`^vi%W$SdnY8NB|=D%mf(kk%YJCj$%$DD1Ng-sO8KUsKqE#5-=(P8`xeSy^To7YGkyFp$3bpvKByzQH( z$G+SL7S>uKN`OOSd(8KYYD>XFgt$aI1aK(pr?6o}_I?+|SuI37v zVc!Ydu}MF2Zb8g!2)Dd0M71Dr8Sn1q!#0K!tVwf0sx#4fSELg>lYFrcy`3;?$WwcR zqY1m<4gX9)Nr?{`{6|%26OAjXTJu&{JJv z_Xsvq>ph(LG_v9D^f;Yg=)LcAw(Bqlx_+cGDl zFle0Irczx%PHd9fNf_woHImpaaVfsMM`KXZLI2b?W+~->QZUACX)W|X*Ns@mIKl|^ zdzeW4f$jEsSg%IP#tK)b^C)F7j+?e||7{*jH zm0`)v!%0pvf=tatDwH zzN(9Jca~r`hIxB5whO0TC7cNfsyKvMXNu!b7#n&oD~mPN*`C;DtKk4n-nwF*E8Z&b zQH&!W{gpt@1wjdrhu^l9rt_EM4T{E_uAAw~cvMQoSFZlGs*^ zs+@@n5t$Uk9?tgM`O=jsKsQ!v0G@aF&L)DCcRYZVz2dMpG1tkpdQ3vr^9DC8Ouxv) zZmL{;NKx_XmUl_e;5)~Puj*9kLbg8KyX%*4X4WasY@f#=m3nJp#@oHs0JoNISv@kO zobX6&wWF+JayqP}GffmK?kP`QLwyCNgO@tS$Q#l*f<-mLgF7yg~ z=S^79=zx{JmxTfE&WRJa79-N?@T2a@^m)s5H*``Y^;qJA8oyh~c#UOfwZj&IdrNZ7 zp|)ON%p9S|i3#{P`O0IvRNtqUvkfZ&5n^hn)XG2rg{aEikzhwhpz6ao@i+&VDPh+8 zCpS16PW2SO!dLlIR?nV-LIUi3onaXfCR+P}?(G@#HNp~w<~emqQhHpwld~pd`uR4S zE2V4E)q>f{kw?*okTy%7_+zPE;o(o`?DQ~G)O+o9G84orPfB|Cn&(8FlQNn8ey(yR zXU?r<@o`1_(+`)w3rk^^+(IeU{n!u`lTON0cBILf2OW8?)Gir;9<-|^+Chyla`!&5 z>LKSvc>MQU@tZ%MMz>w{^b6Js|AI2v=>Zgfr+cA)_oBFEYUtKL0P{3ABp4h1#clk} z*d$aT^@411dpzlb=w)@l2cc+C_>ap&!+xN{zPf8bl#HwUmJ4)k6^B>~E)j z-xB0Xa^&q2cz}G_gJENZ{e1L6D*l=zbER8jx`xxTaLK7RW(FsqLWZ`YWgcOPS;e_> z8ysUUZP?+nWXtD2F?V4r&mqR`qbjMm9XrZcGP5UR-LKFk??3t#m3thex7b=BJbmu# zaC+dkyCnhoStLs=bYZGF z6g(TZetPrF;$e{DM}_SEyng?)K`9BzU8*@e76TDqU&Ad1pz{rnp1(N8ML#AS_)q|w zj+7*&DZg}mOx>Ruqp5g!S?$?-;~z30n+6(RG=^?F+oS(d&k2D?2cetI%Jf&I~I z&%+9P=CLA-iUn^bB?+$_2*;#3l_W0g#tw^lOsy?N>Gla+v_B1x_g32J+g@H(F=zYa zH6vMNWrYc#2eEKY5{n;xUrdf%I58LWF*dH8JbeTDh4#eE``PJ=9q1RKddXjVpw)C8J{-qT(@Ot<4K~P<*Cree_ zUh)>j2)N#dT=4iW{L*%jJCQJX`-tId2SIV~ekfaqL<#XH{CjUa2Snkj9{}KL=TK;Y=@o_g72l%$Gul;b^4fMdcjf`;&NKq*gyX)Py 
zxVDZ(v8>INT2Im`{>Xi^EByB~*ch9`bi0nWs-z{!#BEL}OlhOIy_73*coiu8)!{wa?-oe~CZk+Fgs@4enZqZc{NNQ0DtcBb9`DS!Vf3vQ*>vqt+*% zQLlX6N9|^c*I#)%jb*IwcrcHr-8_g9>E;Kahk-29J(T zqbRnUILt7;X(!kWyR+DUeN`8#ur^<`*5ckO1?P;mt;h=+C0J3U;j@DKNAOtQ$<-;p zEec%c(^;5ngq^5=Vf@Qu4^n$deC2kV3a&BnepP`Ji|P<|lTjB?RL@@;dS#M)>e$%*$%i7jTwqkkk z94VpE!D)Yj%>Y8=Ld~x0Q|8jo8psB!B4+c?2;nmR*DQVrM(o=W|R&+LV2>{)yi;`0R0Y znj?}?ES(xp`G6c+T!w2B7?fy}1_KgwiHlv7PMF(zvx$ux)qU1F3m> z<{DNF_Wf-+eEcjdqFpe9c&5NcB42HU;NkkFsYR*ACpE}IqGSQ+kjz8oqt8n_EbAU{ zYfVc}9)r>ftW_v;L&i=KLGt8TmCb`^#hNxphItw$>jR}sm$u!0xQUn%#jWvMN&Q=o z1Yz;=ZfBx-VXawE&#;ATEevVw!BqGXGOA>;&uyg~&AzF0pLoA=VNB~sE?jm%!;x`ht=wrfM<%SyO?u9W&*!peFUpoYS@*3_k_C4a-&V8mMIRsir?^2(9 zoc=GnKdtW!%S!tT|q{3ck1(S z@c5jp)VolV+nEo(rZ+rFE+%SJhUR@Zc0vJZxB+(=eJ%g!OvjB(Rp#>9H=!*PV+q?G!_?>-b`^Fd;1j3FZ;%XU zv9#rAR#;yE#t(B#&dacII4O4yCH8flWn;--o$xZh!(udRMCt+%UV(u8D4Ge^C<6J> zADpx4ibmg6%roTZbI-gr_SFAA_rXgS)r?O=5xpFjFSpxn*U%m8Pk2b%NN~NM_R0Oc zza&*%3fVCCB7|9x8N3=?+?gUJhvoaNReoulMB4|*_g}eho^qSBmQEh#>LLzjP{@MW zY;Hb3_8kB6vvei}9q2|&tA&sc2U0Z?UfUVo_DH*OyL0-;v2(o_e`-ZcL)4=XC7FM8 zx@Dlw7n4c|cz1XdoKwlxj{zGS7ljBFP?Ee(QnBGBda>Qn+RG0RccY^b$oA6twXI{- zE8RshOqIj~biH_DRJ9H*&OFD>UQSlPBc&-KNkJ3NAx%9$0d2b8`=FH7z#a7-J8yG^`<9%q(txYQh?KeBSTI8(qo5Dg&X)L6h&X4n`n&Yn* zWyi@84FYoBH?VR?6&Kw+VV4%J6h*EaN^~XHu2aXel+5}v2)JT==<)Ut2hFrkQ>WHW zJ02J0%alg37p)@ z>|vj0e5Q%{s%n7D%1^88 zemp6&e77@^d2NOz$WD!L<^Kh>#f-{Z&|ZneYVmtA!$-jyr~iKo;77eZ3Z@M+V+N>; zq<-}Hdl$mU*wpP)0mMyD(!ucCcCyFs>;o2!V_sqDk`5zd<7aL-AG|07l%9f*)m9&G z$?Ka95x48n=-cqk(Fip6P<}bG(${k<(Q{;WF$X%vJ=dT21R?i+^`c%#!OmQN^pn)8 zdV$h>NK5jlcgX5POuz-=OE>8+TF(4FM{#MI&c}w7;j^dJ2<-a=3t($F2}i-I!~1NI znn!yijJ;ym{w(~G^Va?&<}7diR^)XR+sm#=+?rS~QLO**@ZG_O@f@InnOROMDRkmOv+dV!P-?(F;` z=R4w+{tJhy>^Q&IjQHDH&6d^6fy`fP#o%pOUA?x^uWpipy>24<+qK|)!oFPjLps#Tc%G+j==jK-rFm-=j>lIB7C)Abg$x*`?roUIfVikq0RMzf+YOY zv;U!gfF=VJ7GD7{P2SH_sd=`b%PM1yNVH0N;98QYAg+D+BuvdJ0k0v!0xRL$@7a_A z{28L0!!H;D=qcg^GtA$ZrT0~rP_zus!05!4i#=$qy&^4zvys&*q%ZP2+uK>eOgzjr 
zOD?9Z3H7Bmn^k3jo5IZ@Wx=!S^ptz%6*m`NyzgvKwY${CHa!EqAz6WDArXT`psXUh^CL~I)xA`%XzTax>ImG>E4=+x$ zh+YR*uhYc|^Afj)klc8Ac7nQ|qd6EnWYpL`Uw8iVbc;ATk8RExP#?9WkIwlat+hKD zPvCeU7+9$k(9;FhO{5QyHN(Qr^=zNS7oj#y3p-CDBDU2CR82!3SgNwb1MB7zx}3U{ z)KXPv6f5lFk-xl&nS}c4AbL%4wb#a0RdvnnW!mkXmmaZ+219B223cW4^$o5%#_5;X zzc1 zVW$u>uXNo^9cCJg^h@{!Wt$FAL)H0im^wEE27eM^*I&MNv#>L0dMc;NvvIx8573vY z?d{nB?B;k)PH4w0I~1$p$Ky-?Cdv{0-0Sji6$PNw#ZYRR{=P{ljeh67swR0cww=%p zB(&=;X|D}nZH!>!;0Rf|!(%OZef2Z#BFz59Q=)PaI|ZDa3ZiJ&=W^e22_c7L6`ew! z7t)#F7VvN9Uf}2*(YUqvV>kz{umu?(9ktKLP+ByJQlmNp_rK}#g4Yp|%D2bB^WHbi zA_Umd{lI+4qB<|CS$zmz;>F8&-SxQRxM16@pNsi!JTqMr@N90*u`k7ccgCtn{R(3E z6XR~z=J}5?Cif7eHpX*qqctTIW#83vZ4Dks>yfvQvvX3z?m2ZVdj*`6N%hfTzQyy!PAVDi>-Hb< z?l>!qn4X+-g7?|2Wy}H9|J1<^I0NPoZBe!}Gvuw4)_e#z>>~7mQ#NT$3>3!>#Bd-S zJ{u36ml7_Hl}ZVvQNbC_rA|OA45Y&dkG~oJ7_afZR2kB}2t5GeJG#$a+}X~SpIuu5 zfmjahQwYLBJ2MY~_>S%Ox;MC2+qLKU7C|5luKlO6Y#Z&`AW$yL-m~%JhoHm@AdumH zYrDydEG3+g-?x2#1B6*W3<|%skAZIs%}5C!0L7{8vnWGN2mae{`j)As^WDNiNB&i) zwj$&ti0}CR)UxFn{v=^WzNZ8I*e7V(XN@}P>;DPp^SWc!m0~hCrcz-Tm3*N!z0eV0 zt>JI4?~yc`6>4Lp#6y0hkgd0Ro61-txq6XHYfCMUao5%>!hADf&bI^tKbNmp0sa4P z%6sdTLa)Fkpb|!echCg^u{ufBVW+Tk|i&%&HD~C_#LFLnbh}TF-cqBUT^<8xv3(H4f7?!1zf3r0^-v@8v;_p zAnQ#QuKd+LpS^aJ-vgMpCy5nQm9z&#JhlEDO8hZ1sH;_nxe0E{vGg2_`&eME>%Eot zaL`~2GNhugG*dbWfR=9880+WwG3!S`xj+;H#>~@MY5zK6-(t1x;=49+M`rX-tT_4m z8vP0)VR(J)s((xaPNmc+xfYMDVTaafA^{WX-0 z##*sny5w_enjcC~y2P4Rcf{|Q)wi}Q!?KG5siorAj7Vn+vuqn4xGnw=k$M$oPt-dX z;*oR}EKrdH;^W?**b{zWVmIcyy?D2lChF`x!(h1;UPh#4a^H!RqtZG zsAc|2fj&?7^b1GEk+J7#?>1J&#cEw{NtkVK;Fh_?<;E9B>F*u=EtAr&ZBcyHw0Ju^H71UNuLA`;VB0aLNa@Mhe_CZh z>~BjYvF5V=CHvVu6-YN15n5lkt4pAeUv8C<`pQa&gQJ`^{t};i0T3Va`wV50qKJqLp!tGx&Jx+8kJ z4JGG0-#!H@+h0?gl>v^iGIVe#!g^@NX}76EO4w9r$B&XN&q+_w#R9yl!m}oV2tRhy zK zvB&y@Rk^P*qt3UKANgq+&X^{M#JO8`o-%jXek(B@a?=I?R4nNq*41bK!Dy#!!V77ZG-V5iTRjnU04WBEFk zfMO3K3GElFZha05-3A%tJP}}$qe?1`hH~&h`z13dcR|G-#4uVmBW#_c2^25|!#&?h zR?~o+4Ua+qzDh+NaIYcw&wC9e3!^20c)|UG!-h>-dM`Zo^KLaY8h1Q7fqR`&_&%*v 
zeiFa@_<=)?v)|C`_dfvs+&IQPHrqbty4qy`x3xH3tc%@=b|Fp;`nlcdZ@3Y8_Ul|u zj0`H}im_(rei0pntAq|tt86gOXR2C@KYXnf8}*odWUQi_gGXakr_A?==jCuOVj&tXa7D^NoxC+>)#7u$e zgeA`W0{e+vJ(c}7@z|;q0Pz8=LZDOZKzr_Y9|!Pds(%a09kJg;ci1k^)pbfSbC-4q zOXnSe&vFY%+y4OGS5+rirn_iQnaWw3&jM5f02!+&aPOI{N3mOFoyIQWm-o8&nGD=> z-sz4Dz~ZYDr+2rBFVK|>9VWYPNJHpQ890N*QQS|iDF_51sf z+>s>8eT91mfaXt$fOMWEkYv~Gw9Mbz8~Za8xPIv=Ig?@$NWJc8X|&nlCj#+vECSs4 z2frvslQ#JmTp1}0FVFpY`M^WAfBEqGR9Xc1ApK;pu#!__j2@KR{BuI53qH3Lx3%_)<5({CNo13huLTmB>20Wv z*rB^X^CAh2B&|(9N$Y09O=kytHbybb&dzFUYs-#!e@Zq`wkWogq2zH3|qNIuwlECNh-==J0bNSV|hb2;ITX+n+0YW^?)#P_N}$+v%p@+ z2|ri~Qi3ZOsW1Q6%byDI&#+yn|P%36TG`Qb7AmN;R*^lBv|Aj^1hgF>Hd`}^yLw@?@5C>C} zq#9c|#PwhO5N{bhIpGkCQN<1uG8A@Fr z4!fkp`XGwAEz}SHS~VzfPMcN4i!#3Uh>7|-JGBw!(H?B_BTdJ2;-DUOEJ95-E*amHWgEg-Wu`dP5#4_}QErr(Pshh(Nsql`tyC7plFbo{1po=$W2!N)h+O zk?PkUW$W;$gl=00j8Z85wx}HBlz79&J!qd*PUKi!V0p!R{wh^SVy$a064>f6lcQpm z?=N~sB2Sru{TGM)zC1fWSL{_Yi@54B^frK3&iihy{~`jMfiUYKRXE~cfvZGDuB&>2 zh#^NyOW0$3$`bY9t%`Tbs$n!rKPu4#8c3ZjK#iDey-H6n9G|bA&NfD2sdFgn8sEXH z2YEr_kU&s43qX#^=gZuq+oBp02}93;VCZQ(^jNh5Dzis_bDA{l0Sn1JIr?HySTXYt*_kpUS{}#?p0W+sw&=$iT} z(XFJ!yZ|wnm=cuNHiYpAP{xoCY?#L3-}97{QSAZUkRxJ)OXW}f-zZkPVjA=yH_KfO zDoQ5SgrA(~Z|>@8SuCPo3)ywM48`U8*+I*!Qf$9Xy?8$K+NL|mDhSU=4GnASLfhW^ zD3eOODu6u+*4M9OO75u%m#;*wH<=WY)&-F(>wr_ql7|pmQ$$Tq&mZRZ5m5Zbh?EIF zaSjIkb~nyp0RZq$qRck$Had&2g`3*J3q{J;x1T;$Z-IU+sgriI2)oa10r9Q1^b9Ne zaxouYHxa{CR$gA*GO>M7m-%ktNicgFMfuG5u6_tJkK#k%cnKX6Xi!ZFm~2ic59Nwr zcpI%8I=eN_NUG7G^GaZbZX3x1t`;z2q^^lzA+`4@2uaEkUw9|RlJVFb7Z(MP`T5=g zAbW8;z?l{~SB&ySS{FW0+Z-~!I=4L5`?y&D$+-W_#+w8SAbDl-Q-q{DUW!IEYz_t0 z3@T{l6m+z<9;-{yqI^Hhap(0XUG_A~_8{4SL$Ys4)uo<)cz2*}2T4h`b4KsQ9P7=k zc}Jz?xz3&rechcgP6r+PpGDjn`jnnLA z%2iZPiM7?#%g0yQy{TXF8{Qd>pXXU-oQsTBl|8eedhA;0=6rywykp@?z)01D(13&N z{@dN_*pt1}^As^g9!0=j8|Jj^TM~N(&RL8f4|h~5u@EY`GllS&L<(^PyKT}-G8fw} z>+XEbV>J;9XL-yG3mWB|l?Rf2a>Sl$v*u0K+I!P2XRY&Gn&ZjrBjriQl?_A{I`wltE(1YU9C+gTPl4Vliq(dvU zDx>SzPj88`H=<_+k`!O0q~sre*q$cQSH1=+;_ro-2!S*nyUEfWxZadfCsU{gisJ)f 
zQo+kN`=2iZpZiat_dm-4pXmoZz#?q>mz{qiep)5B9sxMJ_-h;+&XOi-O7qz2=0HntQ zoB#;aVugY3#Z~N$=?x(0e3dhOCTx2W6Yz&vASP}|+u#2IZe42ugM&ccK+gVu=u6Yg zpODp)GrbtE@j~N*ATd{3%7=#wlWVva$PXa%W7+lm;lk#77Vh1UIuNaG~kzhl7PSLvlS)O z2$a|1!mO~*n$D-Ip=xXs!0qY}$wzTO&H7|S85kPS`#0^9sQ(^>I{4p!P>nR-C=takA%Yv!-p0(@SGXDMbvw0;Ru1jLgk6GYk^*vfPP$!XOji-n+{uRDD$kqiV z6@f|o1FVB)WwHFlD}!SN0@Bu_fxo8M)e$_|%(#_|e5JB>V;5t9_PMyt-V*YlD!CNKU)Gb=^Pz8D4Wm!?Al8w<^Z)q&{oAv@1`^=TL43+o< z(}&q7))%%G8gKrLp=m3tNx36CI&*wJMbsJ7CEYo~(_?Zx02Z1sKGDvPp_{#P0tZCn zvP7D@eii`DQC-M=2mt-;v(KCeRZt`JX6+vWf<1Y*i-0<3q{RANquTX3hgnuF;A=!q z^afhbIc!%`%&Fn8I{4VFViQv`$YA4&=K(dl(>ZRPeFU_>MNjqERqp%c=*@NO;|hfs zE6^)s3D92Di8e)4-(P*)BSKK< z|F_*1m!OeTkDEu!J*@V28*SkAGmdx3aw%CQ48m<-hl>at2*jz7yK+squ%jsVlFXKazqfAB#4I!3!}(I_sZo%IU&fmGn2=W2b^@-bLeB@%Y`L z>zU3+!M3Vy%A*L)Zy-(LX$pY!jxEB5FDa{OSbvD@}!va#o7A?PA%@Vh3WH@^NlhM25)MLI};~d^f(XZ4uW$ z@8++5?p@EWj`!M#{T=MTBb2G{Bmo>{_?KToq(-s9j$``j=V&MZ)LL?0;x(GJXBsWk zk+Ql)K>7FL2}078;?|mcGABgI8U}Y+8?iJrD0Zfior-aN{9Ld;Q`b750tR2(OEHK| z@QG!=^hg-hVJ7ZGP0q6p3f_deU#`6-tqVB7(m$8z@xo`#*`yQWs64F*Z{w}T$e|bem zRZVIe-xHayHO&gP{iur5+^rjQs&?fk_uq65(z_H({DNLrUhVz5n{Ie`5N&wx-llOS zw}P^Z{VQd^9hY?~@)FnqVG>s>bw&G%y_va+OyJqRe{s$AZ{EFo2jr=d;%8Y;;C`}S zla5T}bm=JQY#@#k*MxEe7upXBGT=TKMs1s!AHTEX$vj_8X}oHB!Y;AtKCPI!G)>I0 zq>G%^t>WidPyfaBSDq~qs!jxM{2jBBZDP6kQH2{mJNH|Q5<>9yFV{jTE@Gpi?ytcX zz!r)8bmTKO4-Htnom75=5v-J(JRa|el?XS{7?W6+#Fnby=H{T-HoYM4^l(9^yl8y~>Y4JSUBdmCUl zWu?9SXqur}991fn%4o-u3P$L%gB;vNd5CyzJgy`5W$we=2}eHqe_f$AlFKo49RCB+ znuX|H8}D9OZEWb?%F1Aa32d+R7o}L++divBRq4Ku43LZ7z*F*nQ z)=?;3oKW+rucek_a+P{nHihG=0)oQD#0pd7{POAFl7s7V{FLyOqmuM8x<0K|ywMd* z*O)u`{w=+jh-<`~)O6OleoQLtn?K|(QE2%zR{lKo-;$09Zw)8s9p86lfsz2zhj$>z zD9|p_?xWP-2FsO_hVItVQ2pt%pW{AfD9;Tch@K(@)W1CjBrH*mrL3?Uo;mv=5cKnc zTKT|H96#pIe97o?l+eiDotwSn=>!r;|73=_t>UwA=pI$&Y8xOGqoZs;P literal 0 HcmV?d00001 diff --git a/example/ck_tile/CMakeLists.txt b/example/ck_tile/CMakeLists.txt index b6a44f76b..29305405b 100644 --- a/example/ck_tile/CMakeLists.txt +++ b/example/ck_tile/CMakeLists.txt @@ 
-14,3 +14,5 @@ add_subdirectory(11_add_rmsnorm2d_rdquant) add_subdirectory(12_smoothquant) add_subdirectory(13_moe_sorting) add_subdirectory(14_moe_smoothquant) +add_subdirectory(15_fused_moe) + diff --git a/include/ck_tile/core.hpp b/include/ck_tile/core.hpp index 3b198502d..3cf0c2595 100644 --- a/include/ck_tile/core.hpp +++ b/include/ck_tile/core.hpp @@ -52,6 +52,7 @@ #include "ck_tile/core/tensor/tile_elementwise.hpp" #include "ck_tile/core/tensor/tile_window.hpp" #include "ck_tile/core/tensor/tile_window_linear.hpp" +#include "ck_tile/core/tensor/tile_window_utils.hpp" #include "ck_tile/core/tensor/update_tile.hpp" #include "ck_tile/core/utility/bit_cast.hpp" #include "ck_tile/core/utility/functional.hpp" @@ -62,6 +63,7 @@ #include "ck_tile/core/utility/philox_rand.hpp" #include "ck_tile/core/utility/random.hpp" #include "ck_tile/core/utility/reduce_operator.hpp" +#include "ck_tile/core/utility/static_counter.hpp" #include "ck_tile/core/utility/to_sequence.hpp" #include "ck_tile/core/utility/transpose_vectors.hpp" #include "ck_tile/core/utility/type_traits.hpp" diff --git a/include/ck_tile/core/arch/amd_buffer_addressing.hpp b/include/ck_tile/core/arch/amd_buffer_addressing.hpp index 3feede4d2..bebf035e9 100644 --- a/include/ck_tile/core/arch/amd_buffer_addressing.hpp +++ b/include/ck_tile/core/arch/amd_buffer_addressing.hpp @@ -621,6 +621,65 @@ CK_TILE_DEVICE void buffer_load_fence(index_t cnt = 0) asm volatile("s_waitcnt vmcnt(%0)" : : "n"(cnt) : "memory"); } +CK_TILE_DEVICE void lds_load_fence(index_t cnt = 0) +{ + asm volatile("s_waitcnt lgkmcnt(%0)" : : "n"(cnt) : "memory"); +} + +template +struct buffer_atomic_add_if; + +template +struct buffer_atomic_add_if +{ + template + CK_TILE_DEVICE void operator()(const T& value, + int32x4_t res /*buffer resource*/, + index_t v_offset, + index_t /*s_offset*/, + index_t i_offset /*max 0xFFF*/, + index_t flag = 1) + { + static_assert(sizeof(T) == 4); + auto save_exec = __builtin_amdgcn_read_exec(); + using mbuf_t = 
float; + asm volatile("v_cmpx_le_u32 exec, 1, %4\n" + "global_atomic_pk_add_bf16 %0, %1, %2 offset:%3\n" + "s_mov_b64 exec %5" + : + : "v"(v_offset), + "v"(bit_cast(value)), + "s"(res.xy), + "n"(i_offset), + "v"(flag), + "s"(save_exec) + : "memory"); + } +}; + +template +struct buffer_atomic_add; + +template +struct buffer_atomic_add +{ + template + CK_TILE_DEVICE void operator()(const T& value, + int32x4_t res /*buffer resource*/, + index_t v_offset, + index_t /*s_offset*/, + index_t i_offset /*max 0xFFF*/, + index_t /*flag = 1*/) + { + static_assert(sizeof(T) == 4); + using mbuf_t = float; + asm volatile("global_atomic_pk_add_bf16 %0, %1, %2 offset:%3" + : + : "v"(v_offset), "v"(bit_cast(value)), "s"(res.xy), "n"(i_offset) + : "memory"); + } +}; + namespace impl { // below type indicate the data type used for buffer load inline asm // clang-format off @@ -810,6 +869,11 @@ CK_TILE_DEVICE void buffer_store_fence(index_t cnt = 0) asm volatile("s_waitcnt vmcnt(%0)" : : "n"(cnt) : "memory"); } +CK_TILE_DEVICE auto async_load_fence_raw(index_t cnt = 0) +{ + asm volatile("s_waitcnt vmcnt(%0)" : : "n"(cnt) : "memory"); +} + // buffer load i8 CK_TILE_DEVICE_EXTERN int8_t llvm_amdgcn_raw_buffer_load_i8(int32x4_t srsrc, @@ -2378,6 +2442,45 @@ CK_TILE_DEVICE void amd_buffer_atomic_add(const thread_buffer& src_thread_ #endif } +template +CK_TILE_DEVICE void amd_buffer_atomic_add_raw(const thread_buffer& src_thread_data, + T* p_dst_wave, + const index_t dst_thread_element_offset, + const index_t dst_linear_element_offset, + const bool dst_thread_element_valid, + const index_t dst_element_space_size, + bool_constant = {}) +{ + const int32x4_t dst_wave_buffer_resource = + make_wave_buffer_resource(p_dst_wave, dst_element_space_size * sizeof(T)); + + index_t dst_thread_addr_offset = dst_thread_element_offset * sizeof(T); + index_t dst_linear_addr_offset = dst_linear_element_offset * sizeof(T); + + if constexpr(oob_conditional_check) + { + buffer_atomic_add_if{}(src_thread_data, + 
dst_wave_buffer_resource, + dst_thread_addr_offset, + 0, + dst_linear_addr_offset, + dst_thread_element_valid); + } + else + { + buffer_atomic_add{}(src_thread_data, + dst_wave_buffer_resource, + dst_thread_addr_offset, + 0, + dst_linear_addr_offset, + 1); + } +} + // buffer_atomic_max requires: // 1) p_dst_wave must point to global memory // 2) p_dst_wave must be a wavewise pointer. diff --git a/include/ck_tile/core/arch/arch.hpp b/include/ck_tile/core/arch/arch.hpp index 65a3a4e2f..afcf982a6 100644 --- a/include/ck_tile/core/arch/arch.hpp +++ b/include/ck_tile/core/arch/arch.hpp @@ -73,6 +73,24 @@ CK_TILE_DEVICE void block_sync_lds() #endif } +CK_TILE_DEVICE void block_sync_load_raw(index_t cnt = 0) +{ +#ifdef __gfx12__ + asm volatile("s_wait_loadcnt %0 \n" + "s_barrier_signal -1 \n" + "s_barrier_wait -1" + : + : "n"(cnt) + : "memory"); +#else + asm volatile("s_waitcnt vmcnt(%0) \n" + "s_barrier" + : + : "n"(cnt) + : "memory"); +#endif +} + CK_TILE_DEVICE void block_sync_lds_direct_load() { asm volatile("\ diff --git a/include/ck_tile/core/arch/utility.hpp b/include/ck_tile/core/arch/utility.hpp index a88780459..df0f54c5e 100644 --- a/include/ck_tile/core/arch/utility.hpp +++ b/include/ck_tile/core/arch/utility.hpp @@ -102,4 +102,28 @@ CK_TILE_DEVICE T warp_shuffle(const T& v_local, uint32_t src_lane) #endif } +template +CK_TILE_DEVICE auto flag_to_exec(const T& v_flag) +{ + static_assert(sizeof(T) == 4); + // per-thread v_flag store into 2x sgpr + uint32x2_t exec_flag; + asm volatile("v_cmp_ge_u32 %[s_exec_flag], %[v_flag], 1" + : [s_exec_flag] "=s"(exec_flag) + : [v_flag] "v"(v_flag)); + return exec_flag; +} + +template +CK_TILE_DEVICE auto cmp_lt_to_exec(const X& x, const Y& y) +{ + static_assert(sizeof(X) == 4 && sizeof(Y) == 4); + // per-thread cmp store into 2x sgpr + uint32x2_t exec_flag; + asm volatile("v_cmp_lt_u32 %[s_exec_flag], %[v_x], %[v_y]" + : [s_exec_flag] "=s"(exec_flag) + : [v_x] "v"(x), [v_y] "v"(y)); + return exec_flag; +} + } // namespace 
ck_tile diff --git a/include/ck_tile/core/tensor/buffer_view.hpp b/include/ck_tile/core/tensor/buffer_view.hpp index 2cc788d42..7dffa0e55 100644 --- a/include/ck_tile/core/tensor/buffer_view.hpp +++ b/include/ck_tile/core/tensor/buffer_view.hpp @@ -437,34 +437,74 @@ struct buffer_view>::scalar_type, typename vector_traits>::scalar_type>::value, bool>::type = false> - CK_TILE_DEVICE void update(index_t i, index_t linear_offset, bool is_valid_element, const X& x) + CK_TILE_DEVICE void update(index_t i, + index_t linear_offset, + bool is_valid_element, + const X& x, + bool_constant = {}) { if constexpr(Op == memory_operation_enum::set) { - this->template set(i, linear_offset, is_valid_element, x); + this->template set(i, linear_offset, is_valid_element, x); } else if constexpr(Op == memory_operation_enum::atomic_add) { - this->template atomic_add(i, linear_offset, is_valid_element, x); + this->template atomic_add( + i, linear_offset, is_valid_element, x); } else if constexpr(Op == memory_operation_enum::atomic_max) { - this->template atomic_max(i, linear_offset, is_valid_element, x); + this->template atomic_max( + i, linear_offset, is_valid_element, x); } // FIXME: remove memory_operation_enum::add else if constexpr(Op == memory_operation_enum::add) { - auto tmp = this->template get(i, linear_offset, is_valid_element); - this->template set(i, linear_offset, is_valid_element, x + tmp); + auto tmp = + this->template get(i, linear_offset, is_valid_element); + this->template set( + i, linear_offset, is_valid_element, x + tmp); // tmp += x; // this->template set(i, is_valid_element, tmp); } } + // i is offset of T, not X. 
i should be aligned to X + template >::scalar_type, + typename vector_traits>::scalar_type>::value, + bool>::type = false> + CK_TILE_DEVICE void update_raw(index_t i, + index_t linear_offset, + bool is_valid_element, + const X& x, + bool_constant = {}, + bool_constant = {}) + { + if constexpr(Op == memory_operation_enum::set) + { + this->template set_raw(i, linear_offset, is_valid_element, x); + } + else if constexpr(Op == memory_operation_enum::atomic_add) + { + this->template atomic_add_raw( + i, linear_offset, is_valid_element, x); + } + else if constexpr(Op == memory_operation_enum::atomic_max) + { + // this->template atomic_max_raw(i, linear_offset, is_valid_element, x); + } + } + // i is offset of T, not X. i should be aligned to X template >::scalar_type, typename vector_traits>::scalar_type>::value, @@ -585,6 +626,39 @@ struct buffer_view>::scalar_type, + typename vector_traits>::scalar_type>::value, + bool>::type = false> + CK_TILE_DEVICE void + atomic_add_raw(index_t i, index_t linear_offset, bool is_valid_element, const X& x) + { + // using scalar_t = typename vector_traits>::scalar_type; + + // X contains multiple T + constexpr index_t scalar_per_t_vector = vector_traits>::vector_size; + + constexpr index_t scalar_per_x_vector = vector_traits>::vector_size; + + static_assert(scalar_per_x_vector % scalar_per_t_vector == 0, + "wrong! 
X should contain multiple T"); + + static_assert(get_address_space() == address_space_enum::global, "only support global mem"); + + constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector; + + amd_buffer_atomic_add_raw, + t_per_x, + Coherence, + oob_conditional_check, + pre_nop>( + x, p_data_, i, linear_offset, is_valid_element, buffer_size_); + } + + template >::scalar_type, typename vector_traits>::scalar_type>::value, diff --git a/include/ck_tile/core/tensor/load_tile.hpp b/include/ck_tile/core/tensor/load_tile.hpp index f150fc54c..b280a1725 100644 --- a/include/ck_tile/core/tensor/load_tile.hpp +++ b/include/ck_tile/core/tensor/load_tile.hpp @@ -22,28 +22,32 @@ template CK_TILE_DEVICE auto load_tile(const tile_window_with_static_distribution& tile_window, + number = {}, bool_constant = {}) { - return tile_window.load(number<-1>{}, bool_constant{}); + return tile_window.load(number{}, bool_constant{}); } template CK_TILE_DEVICE auto load_tile(const tile_window_linear& tile_window, + number = {}, bool_constant = {}) { - return tile_window.load(number<-1>{}, bool_constant{}); + return tile_window.load(number{}, bool_constant{}); } template CK_TILE_DEVICE auto load_tile(DistributedTensor_& dst_tile, const tile_window_with_static_distribution& tile_window, + number = {}, bool_constant = {}) { - return tile_window.load(dst_tile, bool_constant{}); + return tile_window.load(dst_tile, number{}, bool_constant{}); +} + +template +CK_TILE_DEVICE auto load_tile(DistributedTensor_& dst_tile, + const tile_window_linear& tile_window, + number = {}, + bool_constant = {}) +{ + return tile_window.load(dst_tile, number{}, bool_constant{}); } /** @@ -76,6 +100,7 @@ template CK_TILE_DEVICE auto load_tile_raw(T& tile, @@ -83,11 +108,12 @@ CK_TILE_DEVICE auto load_tile_raw(T& tile, WindowLengths_, TileDistribution_, NumCoord>& tile_window, + number = {}, bool_constant = {}, bool_constant = {}) { tile_window.load_raw( - tile, number<-1>{}, bool_constant{}, 
bool_constant{}); + tile, number{}, bool_constant{}, bool_constant{}); } template CK_TILE_DEVICE auto load_tile_raw(T& tile, @@ -102,11 +129,12 @@ CK_TILE_DEVICE auto load_tile_raw(T& tile, WindowLengths_, TileDistribution_, LinearBottomDims_>& tile_window, + number = {}, bool_constant = {}, bool_constant = {}) { tile_window.load_raw( - tile, number<-1>{}, bool_constant{}, bool_constant{}); + tile, number{}, bool_constant{}, bool_constant{}); } template CK_TILE_DEVICE auto @@ -122,11 +151,14 @@ async_load_tile_raw(LdsTileWindow_&& lds_tile, WindowLengths_, TileDistribution_, NumCoord>& tile_window, + number = {}, bool_constant = {}, bool_constant = {}) { - return tile_window.async_load_raw( - lds_tile, number<-1>{}, bool_constant{}, bool_constant{}); + return tile_window.async_load_raw(lds_tile, + number{}, + bool_constant{}, + bool_constant{}); } template CK_TILE_DEVICE auto async_load_tile_raw(LdsTileWindow_&& lds_tile, @@ -141,11 +174,14 @@ CK_TILE_DEVICE auto async_load_tile_raw(LdsTileWindow_&& lds_tile, WindowLengths_, TileDistribution_, LinearBottomDims_>& tile_window, + number = {}, bool_constant = {}, bool_constant = {}) { - return tile_window.async_load_raw( - lds_tile, number<-1>{}, bool_constant{}, bool_constant{}); + return tile_window.async_load_raw(lds_tile, + number{}, + bool_constant{}, + bool_constant{}); } CK_TILE_DEVICE auto async_load_fence(index_t cnt = 0) diff --git a/include/ck_tile/core/tensor/static_distributed_tensor.hpp b/include/ck_tile/core/tensor/static_distributed_tensor.hpp index 29c20bed0..568d618ec 100644 --- a/include/ck_tile/core/tensor/static_distributed_tensor.hpp +++ b/include/ck_tile/core/tensor/static_distributed_tensor.hpp @@ -201,4 +201,30 @@ CK_TILE_HOST_DEVICE constexpr auto get_y_unpacks_from_x_unpacks(YLengths, number return unpacks; } +namespace detail { + +// check if 2 static_distributed_tensor has same data type and size of element +// but only difference in distribution +template +struct 
is_similiar_distributed_tensor +{ + static constexpr bool value = false; +}; + +template +struct is_similiar_distributed_tensor, + static_distributed_tensor> +{ + using Tx = static_distributed_tensor; + using Ty = static_distributed_tensor; + static constexpr bool value = std::is_same_v && + Tx::get_thread_buffer_size() == Ty::get_thread_buffer_size(); +}; + +template +inline constexpr bool is_similiar_distributed_tensor_v = + is_similiar_distributed_tensor::value; + +} // namespace detail + } // namespace ck_tile diff --git a/include/ck_tile/core/tensor/tensor_view.hpp b/include/ck_tile/core/tensor/tensor_view.hpp index 698ce5378..4c72ed085 100644 --- a/include/ck_tile/core/tensor/tensor_view.hpp +++ b/include/ck_tile/core/tensor/tensor_view.hpp @@ -333,6 +333,48 @@ struct tensor_view coord.get_offset(), linear_offset, is_valid_element, x); } + // X is vector of DataType. + // "coord" is coordinate of DataType, not X. "coord" should be aligned to X + template >::scalar_type, + typename vector_traits>::scalar_type>, + bool>::type = false> + CK_TILE_HOST_DEVICE constexpr void + update_vectorized_elements_raw(const TensorCoord& coord, + index_t linear_offset, + const X& x, + bool_constant = {}, + bool_constant = {}) + { + buf_.template update_raw( + coord.get_offset(), + linear_offset, + coordinate_has_valid_offset_assuming_top_index_is_valid(desc_, coord), + x); + } + + template >::scalar_type, + typename vector_traits>::scalar_type>, + bool>::type = false> + CK_TILE_HOST_DEVICE constexpr void + update_vectorized_elements_raw(const TensorCoord& coord, + index_t linear_offset, + bool is_valid_element, + const X& x, + bool_constant = {}, + bool_constant = {}) + { + buf_.template update_raw( + coord.get_offset(), linear_offset, is_valid_element, x); + } + CK_TILE_HOST_DEVICE void print() const { printf("tensor_view{"); diff --git a/include/ck_tile/core/tensor/tile_window.hpp b/include/ck_tile/core/tensor/tile_window.hpp index e41024698..caeb03852 100644 --- 
a/include/ck_tile/core/tensor/tile_window.hpp +++ b/include/ck_tile/core/tensor/tile_window.hpp @@ -292,12 +292,15 @@ struct tile_window_with_static_distribution { constexpr auto tile_dstr = TileDstr{}; auto dst_tensor = make_static_distributed_tensor(tile_dstr); - load(dst_tensor, bool_constant{}); + load(dst_tensor, number{}, bool_constant{}); return dst_tensor; } - template + template CK_TILE_DEVICE auto load(DistributedTensor& dst_tensor, + number = {}, bool_constant = {}) const { using Traits = load_store_traits; @@ -785,6 +788,73 @@ struct tile_window_with_static_distribution }); } + template + CK_TILE_DEVICE void update_raw(const static_distributed_tensor& dstr_tensor, + number = {}, + bool_constant = {}, + bool_constant = {}) const + { + using Traits = load_store_traits; + + using vector_t = typename Traits::vector_t; + using SFC_Ys = typename Traits::SFC_Ys; + + constexpr auto tile_dstr = TileDstr{}; + + // loop over thread tensor space [y0, y1, ...] + static_for<0, NumCoord, 1>{}([&](auto iCoord) { + /// TODO: use structure binding (to be captured later) if compiled in C++20 + auto window_adaptor_thread_coord = pre_computed_coords_[iCoord][I0]; + auto bottom_tensor_thread_coord = pre_computed_coords_[iCoord][I1]; + + static_for<0, NumAccessPerCoord, 1>{}([&](auto iCoordAccess) { + constexpr auto iAccess = number{}; + + // data index [y0, y1, ...] + constexpr auto idx_ys_start = SFC_Ys::get_index(iAccess); + + // read from distributed tensor + vector_t vec_value; + + static_for<0, Traits::ScalarPerVector, 1>{}([&](auto j) { + constexpr auto idx_ys = generate_tuple( + [&](auto jj) { + return jj == Traits::VectorDimY ? 
(idx_ys_start[jj] + j) + : idx_ys_start[jj]; + }, + number{}); + + constexpr index_t d = + tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys); + + vec_value.template get_as()(j) = + dstr_tensor.get_thread_buffer().template at(); + }); + + // write into bottom tensor + get_bottom_tensor_view().template update_vectorized_elements_raw( + bottom_tensor_thread_coord, + 0, + vec_value, + bool_constant{}, + bool_constant{}); + + // move thread coordinate + if constexpr(iCoordAccess != (NumAccessPerCoord - 1)) + { + constexpr auto idx_diff_ys = SFC_Ys::get_forward_step(iAccess); + + constexpr auto idx_diff_ps_ys = container_concat( + generate_tuple([&](auto) { return number<0>{}; }, number{}), + idx_diff_ys); + + move_window_adaptor_and_bottom_tensor_thread_coordinate( + window_adaptor_thread_coord, bottom_tensor_thread_coord, idx_diff_ps_ys); + } + }); + }); + } + // move thread's botom tensor coordiante // [x0', x1', ... ] ==> [offset] // also move window-origin diff --git a/include/ck_tile/core/tensor/tile_window_linear.hpp b/include/ck_tile/core/tensor/tile_window_linear.hpp index 4b921ec5b..96a8352c0 100644 --- a/include/ck_tile/core/tensor/tile_window_linear.hpp +++ b/include/ck_tile/core/tensor/tile_window_linear.hpp @@ -432,23 +432,38 @@ struct tile_window_linear CK_TILE_DEVICE static constexpr index_t get_bottom_linear_offset(number) { constexpr auto linear_coord = get_bottom_linear_coordinate(number{}); - // since this is linear offset, we assum bottom X tensor is always linear - constexpr index_t linear_offset = [&]() { - constexpr auto x_idx_ = linear_coord; - constexpr auto x_len_ = TileDstr{}.get_lengths(); - static_assert(x_idx_.size() == x_len_.size()); - constexpr index_t x_dims_ = x_idx_.size(); - index_t cu_stride_ = 1; - index_t cu_offset_ = 0; - static_for<0, x_dims_, 1>{}([&](auto i_) { - auto r_i_ = number{}; - cu_offset_ += x_idx_[r_i_] * cu_stride_; - cu_stride_ *= x_len_[r_i_]; - }); - return cu_offset_; - }(); - - return linear_offset; + 
constexpr auto is_pure_linear_tensor = + reduce_on_sequence(LinearBottomDims{}, multiplies{}, number<1>{}); + if constexpr(is_pure_linear_tensor) + { + // this case usually is a LDS window, everything is known at compile time. + // we directly use BottomTensorView transform to compute the offset, in case padding + auto bottom_tensor_coord = + make_tensor_coordinate(BottomTensorView{}.get_tensor_descriptor(), linear_coord); + return bottom_tensor_coord.get_offset(); + } + else + { + // this case usually is a global window, where last dim can be linear + // we hack here, that use the original TileDstr to compute the linear offset + // ... hoping that there is no extra padding between other dims, which make sense + // since that would introduce runtime length (so can't use linear offset) + constexpr index_t linear_offset = [&]() { + constexpr auto x_idx_ = linear_coord; + constexpr auto x_len_ = TileDstr{}.get_lengths(); + static_assert(x_idx_.size() == x_len_.size()); + constexpr index_t x_dims_ = x_idx_.size(); + index_t cu_stride_ = 1; + index_t cu_offset_ = 0; + static_for<0, x_dims_, 1>{}([&](auto i_) { + auto r_i_ = number{}; + cu_offset_ += x_idx_[r_i_] * cu_stride_; + cu_stride_ *= x_len_[r_i_]; + }); + return cu_offset_; + }(); + return linear_offset; + } } CK_TILE_DEVICE constexpr auto get_num_of_access() const { return traits::NumAccess; } @@ -509,6 +524,64 @@ struct tile_window_linear return dst_tensor; } + template + CK_TILE_DEVICE auto load(DstTile& dst_tensor, + number = {}, + bool_constant = {}) const + { + using vector_t = typename traits::vector_t; + using SFC_Ys = typename traits::SFC_Ys; + + constexpr auto tile_dstr = TileDstr{}; + + // auto dst_tensor = make_static_distributed_tensor(tile_dstr); + + auto issue = [&](auto i_access_) { + constexpr auto IAccess = number{}; + + constexpr auto non_linear_id = number{}; + auto bottom_tensor_thread_coord = cached_coords_[non_linear_id]; + auto bottom_tensor_flag = cached_flags_[IAccess]; + + constexpr
auto linear_offset = get_bottom_linear_offset(IAccess); + + // read from bottom tensor + const vector_t vec_value = + get_bottom_tensor_view().template get_vectorized_elements( + bottom_tensor_thread_coord, + linear_offset, + bottom_tensor_flag, + bool_constant{}); +#if 1 + // data index [y0, y1, ...] + constexpr auto idx_diff_ys = SFC_Ys::get_index(IAccess); + // write into distributed tensor + static_for<0, traits::ScalarPerVector, 1>{}([&](auto j) { + constexpr auto idx_ys = generate_tuple( + [&](auto jj) { + return jj == traits::VectorDimY ? (idx_diff_ys[jj] + j) : idx_diff_ys[jj]; + }, + number{}); + + constexpr index_t d = tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys); + + dst_tensor.get_thread_buffer().template at() = + vec_value.template get_as()[j]; + }); +#else + constexpr index_t d = tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys_start); + static_assert(d % traits::ScalarPerVector == 0); + + dst_tensor.get_thread_buffer().template get_as()( + number{}) = bit_cast(vec_value); +#endif + }; + + WINDOW_DISPATCH_ISSUE(); + + return dst_tensor; + } + template + CK_TILE_DEVICE void update_raw(const static_distributed_tensor& dstr_tensor, + number = {}, + bool_constant = {}, + bool_constant = {}) const + { + + using vector_t = typename traits::vector_t; + using SFC_Ys = typename traits::SFC_Ys; + + constexpr auto tile_dstr = TileDstr{}; + + // loop over thread tensor space [y0, y1, ...] + auto issue = [&](auto i_access_) { + constexpr auto IAccess = number{}; + constexpr auto non_linear_id = number{}; + auto bottom_tensor_thread_coord = cached_coords_[non_linear_id]; + constexpr auto linear_offset = get_bottom_linear_offset(IAccess); + auto bottom_tensor_flag = cached_flags_[IAccess]; + + // data index [y0, y1, ...] 
+ constexpr auto idx_ys_start = SFC_Ys::get_index(IAccess); + + // read from distributed tensor + vector_t vec_value; + + static_for<0, traits::ScalarPerVector, 1>{}([&](auto j) { + constexpr auto idx_ys = generate_tuple( + [&](auto jj) { + return jj == traits::VectorDimY ? (idx_ys_start[jj] + j) : idx_ys_start[jj]; + }, + number{}); + + constexpr index_t d = tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys); + + vec_value.template get_as()(j) = + dstr_tensor.get_thread_buffer().template at(); + }); + + // write into bottom tensor + get_bottom_tensor_view().template update_vectorized_elements_raw( + bottom_tensor_thread_coord, + linear_offset, + bottom_tensor_flag, + vec_value, + bool_constant{}, + bool_constant{}); + }; + + WINDOW_DISPATCH_ISSUE(); + } + // move thread's botom tensor coordiante // [x0', x1', ... ] ==> [offset] // also move window-origin diff --git a/include/ck_tile/core/tensor/tile_window_utils.hpp b/include/ck_tile/core/tensor/tile_window_utils.hpp new file mode 100644 index 000000000..71a72329f --- /dev/null +++ b/include/ck_tile/core/tensor/tile_window_utils.hpp @@ -0,0 +1,54 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck_tile/core/arch/arch.hpp" +#include "ck_tile/core/arch/utility.hpp" +#include "ck_tile/core/algorithm/space_filling_curve.hpp" +#include "ck_tile/core/config.hpp" +#include "ck_tile/core/container/array.hpp" +#include "ck_tile/core/container/sequence.hpp" +#include "ck_tile/core/container/tuple.hpp" +#include "ck_tile/core/container/container_helper.hpp" +#include "ck_tile/core/tensor/static_distributed_tensor.hpp" +#include "ck_tile/core/tensor/tensor_adaptor.hpp" +#include "ck_tile/core/tensor/tile_distribution.hpp" +#include "ck_tile/core/utility/functional.hpp" +#include "ck_tile/core/utility/type_traits.hpp" + +#pragma once +namespace ck_tile { + +// input a lds store tile, extract some information from it +// used to set m0 value for gfx9 series +template +CK_TILE_DEVICE auto get_async_store_smem_info(LdsTileWindow_&& lds_tile) +{ + using LdsTileWindow = remove_cvref_t; + using LdsDataType = typename LdsTileWindow::DataType; + + // issues * warps * lanes + static_assert(LdsTileWindow::get_num_of_dimension() == 3); // TODO: hard coded + + const index_t size_per_buf = + lds_tile.get_bottom_tensor_view().get_tensor_descriptor().calculate_offset( + make_tuple(number<0>{}, number<0>{}, number<0>{})) * + sizeof(LdsDataType); + + const index_t size_per_wave = + lds_tile.get_bottom_tensor_view().get_tensor_descriptor().calculate_offset( + make_tuple(number<0>{}, number<1>{}, number<0>{})) * + sizeof(LdsDataType) - + size_per_buf; + + const index_t size_per_issue = + lds_tile.get_bottom_tensor_view().get_tensor_descriptor().calculate_offset( + make_tuple(number<1>{}, number<0>{}, number<0>{})) * + sizeof(LdsDataType) - + size_per_buf; + + const index_t m0_init_value = size_per_buf + size_per_wave * get_warp_id(); + + return make_tuple(m0_init_value, size_per_issue); +} + +} // namespace ck_tile diff --git a/include/ck_tile/core/tensor/update_tile.hpp b/include/ck_tile/core/tensor/update_tile.hpp index fbce7c408..570abde18 100644 ---
a/include/ck_tile/core/tensor/update_tile.hpp +++ b/include/ck_tile/core/tensor/update_tile.hpp @@ -41,15 +41,65 @@ template + typename DataType_, + index_t i_access = -1, + bool oob_conditional_check = true> CK_TILE_DEVICE void update_tile(tile_window_with_static_distribution& tile_window, - const static_distributed_tensor& dstr_tensor) + const static_distributed_tensor& dstr_tensor, + number = {}, + bool_constant = {}) { - tile_window.update(dstr_tensor); + tile_window.update(dstr_tensor, number{}, bool_constant{}); +} + +template +CK_TILE_DEVICE void +update_tile_raw(tile_window_with_static_distribution& tile_window, + const static_distributed_tensor& dstr_tensor, + number = {}, + bool_constant = {}, + bool_constant = {}) +{ + tile_window.update_raw(dstr_tensor, + number{}, + bool_constant{}, + bool_constant{}); +} + +template +CK_TILE_DEVICE auto update_tile_raw( + tile_window_linear& + tile_window, + const static_distributed_tensor& dstr_tensor, + number = {}, + bool_constant = {}, + bool_constant = {}) +{ + tile_window.update_raw(dstr_tensor, + number{}, + bool_constant{}, + bool_constant{}); } } // namespace ck_tile diff --git a/include/ck_tile/core/utility/static_counter.hpp b/include/ck_tile/core/utility/static_counter.hpp new file mode 100644 index 000000000..84af3dd52 --- /dev/null +++ b/include/ck_tile/core/utility/static_counter.hpp @@ -0,0 +1,116 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck_tile/core/config.hpp" + +namespace ck_tile { + +template +struct static_counter +{ + public: + template + static constexpr index_t next() + { + return next(0) * Step + Start; + } + + template + static constexpr index_t next() + { + struct Unique + { + }; + return next(0) * Step + Start; + } + + template + static constexpr index_t current() + { + return current(0) * Step + Start; + } + + template + static constexpr index_t current() + { + struct Unique + { + }; + return current(0) * Step + Start; + } + + private: + template + struct slot + { + _Pragma("GCC diagnostic push"); + _Pragma("GCC diagnostic ignored \"-Wundefined-internal\""); + friend constexpr bool slot_allocated(slot); + _Pragma("GCC diagnostic pop"); + }; + + template + struct allocate_slot + { + friend constexpr bool slot_allocated(slot) { return true; } + enum + { + value = I + }; + }; + + // If slot_allocated(slot) has NOT been defined, then SFINAE will keep this function out of + // the overload set... + template ())> + static constexpr index_t next(index_t) + { + return next(0); + } + + // ...And this function will be used, instead, which will define slot_allocated(slot) via + // allocate_slot. + template + static constexpr index_t next(double) + { + return allocate_slot::value; + } + + // If slot_allocated(slot) has NOT been defined, then SFINAE will keep this function out of + // the overload set... + template ())> + static constexpr index_t current(index_t) + { + return current(0); + } + + // ...And this function will be used, instead, which will return the current counter, or assert + // in case next() hasn't been called yet. 
+ template + static constexpr index_t current(double) + { + static_assert(I != 0, "You must invoke next() first"); + + return I - 1; + } +}; + +namespace impl { +template +struct static_counter_uniq_; +} + +#define MAKE_SC() \ + ck_tile::static_counter> {} +#define MAKE_SC_WITH(start_, step_) \ + ck_tile::static_counter, start_, step_> {} +#define NEXT_SC(c_) c_.next<__COUNTER__>() +#define NEXT_SCI(c_, static_i_) c_.next<__COUNTER__ + static_i_>() + +// Usage: +// constexpr auto c = MAKE_SC() +// NEXT_SC(c) // -> constexpr 0 +// NEXT_SC(c) // -> constexpr 1 +// NEXT_SC(c) // -> constexpr 2 +} // namespace ck_tile diff --git a/include/ck_tile/host.hpp b/include/ck_tile/host.hpp index 2e96009ac..2f3a302ee 100644 --- a/include/ck_tile/host.hpp +++ b/include/ck_tile/host.hpp @@ -11,6 +11,7 @@ #include "ck_tile/host/fill.hpp" #include "ck_tile/host/hip_check_error.hpp" #include "ck_tile/host/host_tensor.hpp" +#include "ck_tile/host/joinable_thread.hpp" #include "ck_tile/host/kernel_launch.hpp" #include "ck_tile/host/ranges.hpp" #include "ck_tile/host/reference/reference_batched_dropout.hpp" @@ -20,6 +21,7 @@ #include "ck_tile/host/reference/reference_batched_rotary_position_embedding.hpp" #include "ck_tile/host/reference/reference_batched_softmax.hpp" #include "ck_tile/host/reference/reference_elementwise.hpp" +#include "ck_tile/host/reference/reference_fused_moe.hpp" #include "ck_tile/host/reference/reference_gemm.hpp" #include "ck_tile/host/reference/reference_im2col.hpp" #include "ck_tile/host/reference/reference_layernorm2d_fwd.hpp" diff --git a/include/ck_tile/host/device_memory.hpp b/include/ck_tile/host/device_memory.hpp index 7c8549f74..13684c0e2 100644 --- a/include/ck_tile/host/device_memory.hpp +++ b/include/ck_tile/host/device_memory.hpp @@ -7,6 +7,7 @@ #include #include #include "ck_tile/host/hip_check_error.hpp" +#include "ck_tile/host/host_tensor.hpp" namespace ck_tile { template @@ -36,6 +37,19 @@ struct DeviceMem mpDeviceBuf = nullptr; } } + template + 
DeviceMem(const HostTensor& t) : mMemSize(t.get_element_space_size_in_bytes()) + { + if(mMemSize != 0) + { + HIP_CHECK_ERROR(hipMalloc(static_cast(&mpDeviceBuf), mMemSize)); + } + else + { + mpDeviceBuf = nullptr; + } + ToDevice(t.data()); + } void Realloc(std::size_t mem_size) { if(mpDeviceBuf) @@ -92,6 +106,27 @@ struct DeviceMem HIP_CHECK_ERROR(hipMemcpy(p, mpDeviceBuf, cpySize, hipMemcpyDeviceToHost)); } } + + // construct a host tensor with type T + template + HostTensor ToHost(std::size_t cpySize) + { + // TODO: host tensor could be slightly larger than the device tensor + // we just copy all data from GPU buffer + std::size_t host_elements = (cpySize + sizeof(T) - 1) / sizeof(T); + HostTensor h_({host_elements}); + if(mpDeviceBuf) + { + HIP_CHECK_ERROR(hipMemcpy(h_.data(), mpDeviceBuf, cpySize, hipMemcpyDeviceToHost)); + } + return h_; + } + template + HostTensor ToHost() + { + return ToHost(mMemSize); + } + void SetZero() const { if(mpDeviceBuf) diff --git a/include/ck_tile/host/fill.hpp b/include/ck_tile/host/fill.hpp index 335911860..f24c33875 100644 --- a/include/ck_tile/host/fill.hpp +++ b/include/ck_tile/host/fill.hpp @@ -13,6 +13,7 @@ #include #include "ck_tile/core.hpp" +#include "ck_tile/host/joinable_thread.hpp" namespace ck_tile { @@ -22,13 +23,44 @@ struct FillUniformDistribution float a_{-5.f}; float b_{5.f}; std::optional seed_{11939}; + // ATTENTION: threaded does not guarantee the distribution between thread + bool threaded = false; template void operator()(ForwardIter first, ForwardIter last) const { - std::mt19937 gen(seed_.has_value() ? 
*seed_ : std::random_device{}()); - std::uniform_real_distribution dis(a_, b_); - std::generate(first, last, [&dis, &gen]() { return ck_tile::type_convert(dis(gen)); }); + if(threaded) + { + uint32_t num_thread = std::thread::hardware_concurrency(); + auto total = static_cast(std::distance(first, last)); + auto work_per_thread = static_cast((total + num_thread - 1) / num_thread); + + std::vector threads(num_thread); + for(std::size_t it = 0; it < num_thread; ++it) + { + std::size_t iw_begin = it * work_per_thread; + std::size_t iw_end = std::min((it + 1) * work_per_thread, total); + auto thread_f = [this, total, iw_begin, iw_end, &first] { + if(iw_begin > total || iw_end > total) + return; + // need to make each thread unique, add an offset to current seed + std::mt19937 gen(seed_.has_value() ? (*seed_ + iw_begin) + : std::random_device{}()); + std::uniform_real_distribution dis(a_, b_); + std::generate(first + iw_begin, first + iw_end, [&dis, &gen]() { + return ck_tile::type_convert(dis(gen)); + }); + }; + threads[it] = joinable_thread(thread_f); + } + } + else + { + std::mt19937 gen(seed_.has_value() ? *seed_ : std::random_device{}()); + std::uniform_real_distribution dis(a_, b_); + std::generate( + first, last, [&dis, &gen]() { return ck_tile::type_convert(dis(gen)); }); + } } template @@ -115,13 +147,44 @@ struct FillNormalDistribution float mean_{0.f}; float variance_{1.f}; std::optional seed_{11939}; + // ATTENTION: threaded does not guarantee the distribution between thread + bool threaded = false; template void operator()(ForwardIter first, ForwardIter last) const { - std::mt19937 gen(seed_.has_value() ? 
*seed_ : std::random_device{}()); - std::normal_distribution dis(mean_, std::sqrt(variance_)); - std::generate(first, last, [&dis, &gen]() { return ck_tile::type_convert(dis(gen)); }); + if(threaded) + { + uint32_t num_thread = std::thread::hardware_concurrency(); + auto total = static_cast(std::distance(first, last)); + auto work_per_thread = static_cast((total + num_thread - 1) / num_thread); + + std::vector threads(num_thread); + for(std::size_t it = 0; it < num_thread; ++it) + { + std::size_t iw_begin = it * work_per_thread; + std::size_t iw_end = std::min((it + 1) * work_per_thread, total); + auto thread_f = [this, total, iw_begin, iw_end, &first] { + if(iw_begin > total || iw_end > total) + return; + // need to make each thread unique, add an offset to current seed + std::mt19937 gen(seed_.has_value() ? (*seed_ + iw_begin) + : std::random_device{}()); + std::normal_distribution dis(mean_, std::sqrt(variance_)); + std::generate(first + iw_begin, first + iw_end, [&dis, &gen]() { + return ck_tile::type_convert(dis(gen)); + }); + }; + threads[it] = joinable_thread(thread_f); + } + } + else + { + std::mt19937 gen(seed_.has_value() ? 
*seed_ : std::random_device{}()); + std::normal_distribution dis(mean_, std::sqrt(variance_)); + std::generate( + first, last, [&dis, &gen]() { return ck_tile::type_convert(dis(gen)); }); + } } template @@ -235,6 +298,44 @@ struct FillMonotonicSeq } }; +template +struct FillStepRange +{ + float start_value_{0}; + float end_value_{3}; + float step_{1}; + + template + void operator()(ForwardIter first, ForwardIter last) const + { + std::generate(first, last, [=, n = start_value_]() mutable { + auto tmp = n; + n += step_; + if constexpr(IsAscending) + { + if(n > end_value_) + n = start_value_; + } + else + { + if(n < end_value_) + n = start_value_; + } + + return type_convert(tmp); + }); + } + + template + auto operator()(ForwardRange&& range) const -> std::void_t< + decltype(std::declval()(std::begin(std::forward(range)), + std::end(std::forward(range))))> + { + (*this)(std::begin(std::forward(range)), + std::end(std::forward(range))); + } +}; + template struct FillConstant { diff --git a/include/ck_tile/host/host_tensor.hpp b/include/ck_tile/host/host_tensor.hpp index 5610ba324..3902cad17 100644 --- a/include/ck_tile/host/host_tensor.hpp +++ b/include/ck_tile/host/host_tensor.hpp @@ -8,12 +8,13 @@ #include #include #include -#include #include #include #include +#include #include "ck_tile/core.hpp" +#include "ck_tile/host/joinable_thread.hpp" #include "ck_tile/host/ranges.hpp" namespace ck_tile { @@ -213,23 +214,6 @@ CK_TILE_HOST HostTensorDescriptor transpose_host_tensor_descriptor_given_new2old return HostTensorDescriptor(new_lengths, new_strides); } -struct joinable_thread : std::thread -{ - template - joinable_thread(Xs&&... xs) : std::thread(std::forward(xs)...) 
- { - } - - joinable_thread(joinable_thread&&) = default; - joinable_thread& operator=(joinable_thread&&) = default; - - ~joinable_thread() - { - if(this->joinable()) - this->join(); - } -}; - template struct ParallelTensorFunctor { @@ -590,6 +574,107 @@ struct HostTensor size() * FromSize / ToSize}; } + friend std::ostream& operator<<(std::ostream& os, const HostTensor& t) + { + os << t.mDesc; + os << "["; + for(typename Data::size_type idx = 0; idx < t.mData.size(); ++idx) + { + if(0 < idx) + { + os << ", "; + } + if constexpr(std::is_same_v || std::is_same_v) + { + os << type_convert(t.mData[idx]) << " #### "; + } + else + { + os << t.mData[idx]; + } + } + os << "]"; + return os; + } + + // read data from a file, as dtype + // the file could be dumped from torch as follows (the target tensor is t here) + // numpy.savetxt("f.txt", t.view(-1).numpy()) + // numpy.savetxt("f.txt", t.cpu().view(-1).numpy()) # from cuda to cpu to save + // numpy.savetxt("f.txt", t.cpu().view(-1).numpy(), fmt="%d") # save as int + // will output f.txt, each line is a value + // dtype=float or int, internally will cast to real type + void loadtxt(std::string file_name, std::string dtype = "float") + { + std::ifstream file(file_name); + + if(file.is_open()) + { + std::string line; + + index_t cnt = 0; + while(std::getline(file, line)) + { + if(cnt >= static_cast(mData.size())) + { + throw std::runtime_error(std::string("data read from file:") + file_name + + " is too big"); + } + + if(dtype == "float") + { + mData[cnt] = type_convert(std::stof(line)); + } + else if(dtype == "int" || dtype == "int32") + { + mData[cnt] = type_convert(std::stoi(line)); + } + cnt++; + } + file.close(); + if(cnt < static_cast(mData.size())) + { + std::cerr << "Warning! reading from file:" << file_name + << ", does not match the size of this tensor" << std::endl; + } + } + else + { + // Throw std::runtime_error (nothing is printed to the + // standard error stream) if the file cannot be opened. 
+ throw std::runtime_error(std::string("unable to open file:") + file_name); + } + } + + // can save to a txt file and read from torch as: + // torch.from_numpy(np.loadtxt('f.txt', dtype=np.int32/np.float32...)).view([...]).contiguous() + void savetxt(std::string file_name, std::string dtype = "float") + { + std::ofstream file(file_name); + + if(file.is_open()) + { + for(auto& itm : mData) + { + if(dtype == "float") + file << type_convert(itm) << std::endl; + else if(dtype == "int") + file << type_convert(itm) << std::endl; + else + // TODO: we didn't implement operator<< for all custom + // data types, so fall back to float here to avoid a compile error + file << type_convert(itm) << std::endl; + } + file.close(); + } + else + { + // Throw std::runtime_error (nothing is printed to the + // standard error stream) if the file cannot be opened. + throw std::runtime_error(std::string("unable to open file:") + file_name); + } + } + Descriptor mDesc; Data mData; }; diff --git a/include/ck_tile/host/joinable_thread.hpp b/include/ck_tile/host/joinable_thread.hpp new file mode 100644 index 000000000..a822f967d --- /dev/null +++ b/include/ck_tile/host/joinable_thread.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include + +namespace ck_tile { + +struct joinable_thread : std::thread +{ + template + joinable_thread(Xs&&... xs) : std::thread(std::forward(xs)...) 
+ { + } + + joinable_thread(joinable_thread&&) = default; + joinable_thread& operator=(joinable_thread&&) = default; + + ~joinable_thread() + { + if(this->joinable()) + this->join(); + } +}; +} // namespace ck_tile diff --git a/include/ck_tile/host/reference/reference_fused_moe.hpp b/include/ck_tile/host/reference/reference_fused_moe.hpp new file mode 100644 index 000000000..bf89f9275 --- /dev/null +++ b/include/ck_tile/host/reference/reference_fused_moe.hpp @@ -0,0 +1,196 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck_tile/core.hpp" +#include "ck_tile/host/host_tensor.hpp" + +namespace ck_tile { +// [indexing implementation-1] +// using M_a as constexpr block_size to partition all tokens into different slices +// each slice map to one expert, and one expert can have multiple slices +// e.g. num_experts = 6, topk=3, M_a = 4, input_tokens = 5 +// before sort, topk_ids is : [[0, 3, 5], [2, 3, 5], [1, 3, 5], [1, 2, 3], [1, 3, 5]] +// tok-0 tok-1 tok-2 tok-3 tok-4 +// topk_weight is : [[a, b, c], [d, e, f], [g, h, i], [j, k, l], [m, n, o]] (some float +// number) +// +// token_id_per_expert is : [[0], [2, 3, 4], [1, 3], [0, 1, 2, 3, 4], [], [0, 1, 2, 5]] +// (only for reference) exp-0 exp-1 exp-2 exp-3 exp-4 exp-5 +// weight_id_per_expert is: [[a], [g, j, m], [d, k], [b, e, h, l, n], [], [c, f, i, o]] +// +// max_num_tokens_padded : topk * input_tokens + num_experts * (M_a - 1) +// max_num_tokens_padded : topk * input_tokens + num_experts * M_a - topk (updated) +// * this could be larger than actual, since actual tokens are on GPU +// +// sorted_token_ids_ptr : [0, 6, 6, 6, 2, 3, 4, 6, 1, 3, 6, 6, 0, 1, 2, 3, 4, 6, 6, 6, 6, 6, 6, 6, +// 0, 1, 2, 5] +// |- exp-0 -|- exp-1 -|- exp-2 -|- exp-3 -|- exp-4 +// -|- exp-5 -| +// sorted_weight_ptr : [a, *, *, *, g, j, m, *, d, k, *, *, b, e, h, l, n, *, *, *, *, *, *, *, +// c, f, i, o] +// +// * length is max_num_tokens_padded, 
actual size is num_tokens_post_padded_ptr +// +// sorted_expert_ids_ptr : [0, 1, 2, 3, 3, 4, 5] +// * length is (max_num_tokens_padded + block_size - 1) / block_size +/// +// num_tokens_post_padded_ptr : [28] +// num_sorted_tiles_ptr : [7] + +template +void reference_fused_moe( + const ck_tile::HostTensor& a_host, // [tokens, hidden_size] + const ck_tile::HostTensor& g_host, // [experts, interme_size_0, hidden_size] + const ck_tile::HostTensor& d_host, // [experts, hidden_size, interme_size_1] + const ck_tile::HostTensor& sa_host, // [tokens, 1], + const ck_tile::HostTensor& sg_host, // [experts, 1, interme_size_0] + const ck_tile::HostTensor& sd_host, // [experts, 1, hidden_size], + const ck_tile::HostTensor& sy_host, // [experts, 1, interme_size_0] + ck_tile::HostTensor& o_host, // [tokens, hidden_size] + const ck_tile::HostTensor& sorted_token_ids_host, // [max_num_tokens_padded] + const ck_tile::HostTensor& sorted_weight_host, // [max_num_tokens_padded] + const ck_tile::HostTensor& + sorted_expert_ids_host, // [(max_num_tokens_padded + block_size - 1) / block_size] + const ck_tile::HostTensor& num_sorted_tiles_host, // [1] + + const ck_tile::HostTensor& + token_ids_host, // [tokens, topk] --> ugly!!! remove in the future + + ck_tile::index_t block_m, + ck_tile::index_t tokens, + ck_tile::index_t experts, + ck_tile::index_t hidden_size, + ck_tile::index_t intermediate_size, // this size is for gate/up + ck_tile::index_t topk, + ck_tile::index_t gate_only) +{ + assert(sorted_token_ids_host.get_num_of_dimension() == 1); + assert(sorted_weight_host.get_num_of_dimension() == 1); + assert(sorted_expert_ids_host.get_num_of_dimension() == 1); + assert(num_sorted_tiles_host.get_element_size() == 1); + ck_tile::index_t num_sorted_tiles = num_sorted_tiles_host.mData[0] / block_m; + ck_tile::index_t intermediate_size_0 = intermediate_size; + ck_tile::index_t intermediate_size_1 = intermediate_size / (gate_only ? 
1 : 2); + + // TODO: better remove this in the future, or modify the token_id value + auto get_topk_id = [&](ck_tile::index_t token_id_, ck_tile::index_t expert_id_) { + for(ck_tile::index_t i_ = 0; i_ < topk; i_++) + { + if(token_ids_host(token_id_, i_) == expert_id_) + return i_; + } + throw std::runtime_error("not correct token/expert pair\n"); + return -1; // TODO: not correct!! + }; + + ck_tile::HostTensor out_topk_tokens({tokens, topk, hidden_size}); + + int max_num_tokens_padded = topk * tokens + experts * block_m - topk; + // assert(); + auto f = [&](auto i_flatten) { + ck_tile::index_t i_tile = i_flatten / block_m; + if(i_tile >= num_sorted_tiles) + return; + ck_tile::index_t i_expert = sorted_expert_ids_host.mData[i_tile]; + ck_tile::index_t i_token = sorted_token_ids_host.mData[i_flatten]; + if(i_token >= tokens) + return; + ck_tile::index_t i_topk = get_topk_id(i_token, i_expert); // TODO: ugly + auto weight = sorted_weight_host.mData[i_flatten]; + + ck_tile::HostTensor acc_0({1, intermediate_size_0}); + // first gemm + for(ck_tile::index_t i_n = 0; i_n < intermediate_size_0; i_n++) + { + AccDataType acc = static_cast(0); + for(ck_tile::index_t i_k = 0; i_k < hidden_size; i_k++) + { + acc += type_convert(a_host(i_token, i_k)) * + type_convert(g_host(i_expert, i_n, i_k)); + } + acc_0(0, i_n) = acc; + // printf("ie:%2d, it:%3d, in:%d, %f\n", i_expert, i_token, i_n, acc); + } + + ck_tile::HostTensor y({1, intermediate_size_1}); + if(gate_only) + { + if(intermediate_size_1 != intermediate_size_0) + throw std::runtime_error( + "intermediate_size not correct, 0:" + std::to_string(intermediate_size_0) + + ", 1:" + std::to_string(intermediate_size_1)); + for(ck_tile::index_t i_n = 0; i_n < intermediate_size_1; i_n++) + { + Activation{}(y(0, i_n), acc_0(0, i_n)); + // printf("ie:%2d, it:%3d, in:%d, %f\n", i_expert, i_token, i_n, y(0, i_n)); + } + } + else + { + if(intermediate_size_1 * 2 != intermediate_size_0) + throw std::runtime_error( + "intermediate_size 
not correct, 0:" + std::to_string(intermediate_size_0) + + ", 1:" + std::to_string(intermediate_size_1)); + for(ck_tile::index_t i_n = 0; i_n < intermediate_size_1; i_n++) + { + AccDataType tmp; + Activation{}(tmp, acc_0(0, i_n)); + y(0, i_n) = tmp * acc_0(0, i_n + intermediate_size_1); // TODO: elementwise mul + } + } + + // second gemm, loop along gemm-n + ck_tile::HostTensor acc_1({1, hidden_size}); + for(ck_tile::index_t i_n = 0; i_n < hidden_size; i_n++) + { + AccDataType acc = static_cast(0); + for(ck_tile::index_t i_k = 0; i_k < intermediate_size_1; i_k++) + { + acc += y(0, i_k) * type_convert(d_host(i_expert, i_n, i_k)); + } + acc_1(0, i_n) = acc * weight; // multiple weight here + } + + for(ck_tile::index_t i_n = 0; i_n < hidden_size; i_n++) + { + out_topk_tokens(i_token, i_topk, i_n) = acc_1(0, i_n); + } + }; + + // make_ParallelTensorFunctor(f, max_num_tokens_padded)(std::thread::hardware_concurrency()); + make_ParallelTensorFunctor(f, max_num_tokens_padded)(1); + + // reduce + auto r = [&](auto i_token) { + for(ck_tile::index_t i_n = 0; i_n < hidden_size; i_n++) + { + AccDataType acc = type_convert(0); + for(ck_tile::index_t i_topk = 0; i_topk < topk; i_topk++) + { + acc += out_topk_tokens(i_token, i_topk, i_n); + } + o_host(i_token, i_n) = type_convert(acc); + } + }; + make_ParallelTensorFunctor(r, tokens)(std::thread::hardware_concurrency()); + + (void)num_sorted_tiles_host; + (void)sa_host; + (void)sg_host; + (void)sd_host; + (void)sy_host; +} +} // namespace ck_tile diff --git a/include/ck_tile/host/reference/reference_permute.hpp b/include/ck_tile/host/reference/reference_permute.hpp index 14ed4f815..4e0f1a877 100644 --- a/include/ck_tile/host/reference/reference_permute.hpp +++ b/include/ck_tile/host/reference/reference_permute.hpp @@ -16,7 +16,7 @@ namespace ck_tile { */ template CK_TILE_HOST void -reference_permute(const HostTensor& x, HostTensor& y, std::vector dims) +reference_permute(const HostTensor& x, HostTensor& y, std::vector perm) { 
const auto x_len = x.mDesc.get_lengths(); const auto y_len = y.mDesc.get_lengths(); @@ -43,7 +43,7 @@ reference_permute(const HostTensor& x, HostTensor& y, std::v std::vector tmp(rank, 0); for(index_t i = 0; i < rank; i++) { - tmp[dims[i]] = y_coord[i]; + tmp[perm[i]] = y_coord[i]; } return tmp; }(); @@ -54,4 +54,23 @@ reference_permute(const HostTensor& x, HostTensor& y, std::v make_ParallelTensorFunctor(f, x_elm)(std::thread::hardware_concurrency()); } + +template +CK_TILE_HOST auto reference_permute(const HostTensor& x, std::vector perm) +{ + auto x_shape = x.get_lengths(); + ck_tile::index_t rank = perm.size(); + std::vector y_shape = [&]() { + std::vector tmp(rank, 0); + for(int i = 0; i < static_cast(rank); i++) + { + tmp[i] = x_shape[perm[i]]; + } + return tmp; + }(); + + HostTensor y(y_shape); + reference_permute(x, y, perm); + return y; +} } // namespace ck_tile diff --git a/include/ck_tile/ops/elementwise/unary_element_wise_operation.hpp b/include/ck_tile/ops/elementwise/unary_element_wise_operation.hpp index 01217e16c..e24b1ba76 100644 --- a/include/ck_tile/ops/elementwise/unary_element_wise_operation.hpp +++ b/include/ck_tile/ops/elementwise/unary_element_wise_operation.hpp @@ -572,6 +572,105 @@ struct FastGelu } }; +struct FastGeluAsm +{ + template + CK_TILE_HOST void operator()(Y& y, const X& x) const; + + template + CK_TILE_DEVICE void operator()(Y& y, const X& x) const; + + template <> + CK_TILE_HOST void operator()(float& y, const float& x) const + { + // const float u = -2.f * x * (0.035677f * x * x + 0.797885f); + const float c1 = -2.0 * 0.035677f; + const float c2 = -2.0 * 0.797885f; + const float u = x * (c1 * x * x + c2); + const float emu = exp(u); + y = x / (1.f + emu); + } + + // device code, use lower precision "__ocml_exp_f32" and "rcp" + template <> + CK_TILE_DEVICE void operator()(float& y, const float& x) const + { + const uint32_t c1 = 0xbd92220c; // -2.0 * 0.035677f; + const float c2 = -2.0 * 0.797885f; + const uint32_t log2e_ = 
0x3fb8aa3b; // log2e_v; + float tmp; + + asm volatile("v_mul_f32 %[v_tmp], %[v_x], %[v_x] ; x*x\n" + "v_fma_f32 %[v_tmp], %[v_tmp], %[s_c1], %[v_c2] ; c1*x*x+c2\n" + "v_mul_f32 %[v_tmp], %[v_tmp], %[v_x] ; x*(c1*x*x+c2)\n" + "v_mul_f32 %[v_tmp], %[v_tmp], %[s_log2e] ; log2e*x*(c1*x*x+c2)\n" + "v_exp_f32 %[v_tmp], %[v_tmp] ; emu = exp2(log2e*x*(c1*x*x+c2))\n" + "s_nop 0 ; hazard for exp\n" + "v_add_f32 %[v_tmp], %[v_tmp], 1.0 ; emu+1.0f\n" + "v_rcp_f32 %[v_tmp], %[v_tmp] ; 1/(emu+1.0f)\n" + "s_nop 0 ; hazard for rcp \n" + "v_mul_f32 %[v_y], %[v_tmp], %[v_x] ; x * 1/(emu+1f)\n" + : [v_y] "=v"(y), [v_tmp] "+v"(tmp) + : [v_x] "v"(x), [s_c1] "s"(c1), [v_c2] "v"(c2), [s_log2e] "s"(log2e_) + :); + } + + template <> + CK_TILE_HOST void operator()(fp32x2_t& y, const fp32x2_t& x) const + { + const float c1 = -2.0 * 0.035677f; + const float c2 = -2.0 * 0.797885f; + const float u0 = x.x * (c1 * x.x * x.x + c2); + const float emu0 = exp(u0); + y.x = x.x / (1.f + emu0); + const float u1 = x.y * (c1 * x.y * x.y + c2); + const float emu1 = exp(u1); + y.y = x.y / (1.f + emu1); + } + + // this is the packed version, to remove the data hazard for trans + template <> + CK_TILE_DEVICE void operator()(fp32x2_t& y, const fp32x2_t& x) const + { + const uint32_t c1 = 0xbd92220c; // -2.0 * 0.035677f; + float c2 = -2.0 * 0.797885f; + const uint32_t log2e_ = 0x3fb8aa3b; // log2e_v; + float tmp0, tmp1; + float y0 = x.x, y1 = x.y; + + asm volatile( + "v_mul_f32 %[v_tmp0], %[v_y0], %[v_y0] ; x*x\n" + "v_mul_f32 %[v_tmp1], %[v_y1], %[v_y1] ; x*x\n" + "v_fma_f32 %[v_tmp0], %[v_tmp0], %[s_c1], %[v_c2] ; c1*x*x+c2\n" + "v_fma_f32 %[v_tmp1], %[v_tmp1], %[s_c1], %[v_c2] ; c1*x*x+c2\n" + "v_mul_f32 %[v_tmp0], %[v_tmp0], %[v_y0] ; x*(c1*x*x+c2)\n" + "v_mul_f32 %[v_tmp1], %[v_tmp1], %[v_y1] ; x*(c1*x*x+c2)\n" + "v_mul_f32 %[v_tmp0], %[v_tmp0], %[s_log2e] ; log2e*x*(c1*x*x+c2)\n" + "v_mul_f32 %[v_tmp1], %[v_tmp1], %[s_log2e] ; log2e*x*(c1*x*x+c2)\n" + "v_exp_f32 %[v_tmp0], %[v_tmp0] ; emu = 
exp2(log2e*x*(c1*x*x+c2))\n" + "v_exp_f32 %[v_tmp1], %[v_tmp1] ; emu = exp2(log2e*x*(c1*x*x+c2))\n" + "v_add_f32 %[v_tmp0], %[v_tmp0], 1.0 ; emu+1.0f\n" + "v_add_f32 %[v_tmp1], %[v_tmp1], 1.0 ; emu+1.0f\n" + "v_rcp_f32 %[v_tmp0], %[v_tmp0] ; 1/(emu+1.0f)\n" + "v_rcp_f32 %[v_tmp1], %[v_tmp1] ; 1/(emu+1.0f)\n" + "v_mul_f32 %[v_y0], %[v_tmp0], %[v_y0] ; x * 1/(emu+1f)\n" + "v_mul_f32 %[v_y1], %[v_tmp1], %[v_y1] ; x * 1/(emu+1f)\n" + : [v_y0] "+v"(y0), + [v_y1] "+v"(y1), + [v_c2] "+v"(c2), + // NOTE! it is entirely possible for c2/y0/y1 to share the same register, since they are all local + // tmp variables. We need to explicitly hint to the compiler that they are read+write, so that + // it allocates different registers. The side effect is that c2=** may be issued for every such + // inline asm block + [v_tmp0] "+v"(tmp0), + [v_tmp1] "+v"(tmp1) + : [s_c1] "s"(c1), [s_log2e] "s"(log2e_) + :); + y.x = y0; + y.y = y1; + } +}; + // https://paperswithcode.com/method/gelu // y = 0.5*x*(1+erf(x/sqrt(2))) struct Gelu diff --git a/include/ck_tile/ops/flatmm.hpp b/include/ck_tile/ops/flatmm.hpp new file mode 100644 index 000000000..eee80cda4 --- /dev/null +++ b/include/ck_tile/ops/flatmm.hpp @@ -0,0 +1,10 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck_tile/ops/flatmm/block/flatmm_32x512x128_1x4x1_16x16x32.hpp" +#include "ck_tile/ops/flatmm/block/flatmm_sn_32x128x512_1x4x1_16x16x32.hpp" +#include "ck_tile/ops/flatmm/block/flatmm_uk_config.hpp" +#include "ck_tile/ops/common/generic_2d_block_shape.hpp" +#include "ck_tile/ops/common/tensor_layout.hpp" diff --git a/include/ck_tile/ops/flatmm/block/flatmm_32x512x128_1x4x1_16x16x32.hpp b/include/ck_tile/ops/flatmm/block/flatmm_32x512x128_1x4x1_16x16x32.hpp new file mode 100644 index 000000000..f5c7caf7d --- /dev/null +++ b/include/ck_tile/ops/flatmm/block/flatmm_32x512x128_1x4x1_16x16x32.hpp @@ -0,0 +1,615 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck_tile/core.hpp" +#include "ck_tile/ops/gemm/warp/warp_gemm.hpp" +#include "ck_tile/ops/flatmm/block/flatmm_uk_config.hpp" + +namespace ck_tile { + +// A async load to LDS, B direct to AGPR +// B matrix preshuffled in br*kr*w +// requires 4 waves, occupancy=1c +// agpr usage:256 +// vgpr usage:64(A local) + 64(acc) + 8(os_a) + 8(os_b) = 144 (rem:112) +// +// for this gemm, 4 16x16x16 transposed layout +// input A vgpr layout +// v0-v15: [ 0:15](gemm_m)x128(gemm_k) +// v16-v31: [16:31](gemm_m)x128(gemm_k) + +// input B vgpr layout +// v0-v15: [ 0: 15](gemm_n)x128(gemm_k) +// v16-v31: [ 64: 79](gemm_n)x128(gemm_k) +// ...................... +// v112-v127: [448:463](gemm_n)x128(gemm_k) + +// output C vgpr layout +// v0-v3 : [ 0:15](gemm_m)x[ 0: 15](gemm_n) +// v4-v7 : [16:31](gemm_m)x[ 0: 15](gemm_n) +// v8-v11: [ 0:15](gemm_m)x[64: 79](gemm_n) +// v12-v15: [16:31](gemm_m)x[64: 79](gemm_n) +// ...................... 
+// v56-v59: [ 0:15](gemm_m)x[448:463](gemm_n) +// v60-v63: [16:31](gemm_m)x[448:463](gemm_n) +struct Flatmm_32x512x128_1x4x1_16x16x32_Base // for f16/bf16 +{ + static constexpr index_t Block_M = 32; + static constexpr index_t Block_N = 512; + static constexpr index_t Block_K = 128; + + static constexpr index_t WarpPerBlock_M = 1; + static constexpr index_t WarpPerBlock_N = 4; + static constexpr index_t WarpPerBlock_K = 1; + + static constexpr index_t NumWarps = 4; + + static constexpr index_t Warp_M = 16; + static constexpr index_t Warp_N = 16; + static constexpr index_t Warp_K = 32; // 16 * SubKPacks + + static constexpr index_t BlockSize = 256; + + static constexpr index_t SubKPacks = 2; // this is used to guarantee every thread can do dwordx4 + + // TODO: note Nr/Kr/W need consider SubKPacks + static constexpr index_t Block_W = Warp_N * Warp_K; // 512 element + static constexpr index_t Block_Nr = Block_N / Warp_N; // 32 element, 4 per wave + static constexpr index_t Block_Kr = Block_K / Warp_K; // 4 + + static constexpr index_t Repeat_M = Block_M / (Warp_M * WarpPerBlock_M); // 2 + static constexpr index_t Repeat_N = Block_N / (Warp_N * WarpPerBlock_N); // 8 + static constexpr index_t Repeat_K = Block_K / (Warp_K * WarpPerBlock_K); // 8/2=4 + + static CK_TILE_DEVICE constexpr auto MakeCBlockDist() + { + constexpr auto c_block_outer_dstr_encoding = tile_distribution_encoding< + sequence<>, + tuple, sequence>, + tuple>, + tuple>, + sequence<2, 1>, // !! 
note here is different + sequence<0, 0>>{}; + + using WG = WarpGemmMfmaF16F16F32M16N16K32TransposedCDistribution; + + constexpr auto c_block_dstr_encode = detail::make_embed_tile_distribution_encoding( + c_block_outer_dstr_encoding, typename WG::CWarpDstrEncoding{}); + constexpr auto c_block_dstr = make_static_tile_distribution(c_block_dstr_encode); + return c_block_dstr; + } + + static CK_TILE_DEVICE constexpr auto MakeCBlockTile() + { + using CDataType = float; + constexpr auto c_block_dstr = MakeCBlockDist(); + auto c_block_tensor = make_static_distributed_tensor(c_block_dstr); + return c_block_tensor; + } + + CK_TILE_HOST_DEVICE static constexpr auto MakeLdsStoreDesc_A() + { + // A async->LDS + // constexpr index_t Block_M = Problem::BlockShape::Block_M0; + // constexpr index_t Block_K = Problem::BlockShape::Block_K0; + // constexpr index_t BlockSize = Problem::BlockShape::BlockSize; + constexpr index_t warpSize = ck_tile::get_warp_size(); + // constexpr index_t NumWarps = Problem::BlockShape::NumWarps; + + constexpr index_t KPack_ = 8; // GetSmemKPack_A(); // LDS + constexpr index_t KVector = 2; // GetAlignment_A(); // async copy 1 dword + constexpr index_t KPad = KPack_; // pad between warps + + static_assert(Block_K % KVector == 0); + constexpr index_t LanesPerK = Block_K / KVector; // how many thread loading K + if constexpr(LanesPerK >= warpSize) + { + // need multiple waves to load K + static_assert(LanesPerK % warpSize == 0); + constexpr index_t wavesPerK = LanesPerK / warpSize; + if constexpr(wavesPerK > NumWarps) + { + // TODO: need multiple issues along K to load all data + } + else + { + constexpr index_t wavesPerM = NumWarps / wavesPerK; + constexpr index_t NumIssues = Block_M / wavesPerM; + constexpr auto lds_block_desc_0 = make_naive_tensor_descriptor( + make_tuple(number{}, // m0 + number{}, // m1 + number{}, // k0 + number{}, // k1 + number{}), // k2 + make_tuple(number{}, // m0 + number{}, // m1 + number{}, // k0 + number{}, // k1 + 
number<1>{}), // k2 + number{}, // lds store vector(actually no explicit store) + number<1>{}); + + constexpr auto lds_block_desc_issues_warps_lanes = transform_tensor_descriptor( + lds_block_desc_0, + make_tuple( + make_pass_through_transform(number{}), + make_merge_transform(make_tuple(number{}, number{})), + make_merge_transform(make_tuple(number{}, number{}))), + make_tuple(sequence<0>{}, sequence<1, 2>{}, sequence<3, 4>{}), + make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{})); + + return lds_block_desc_issues_warps_lanes; + } + } + else + { + // lanes within a wave load different M but same K + static_assert(warpSize % LanesPerK == 0); + constexpr index_t LaneGroups = warpSize / LanesPerK; // along m + constexpr index_t NumIssues = Block_M / (LaneGroups * NumWarps); + + constexpr auto lds_block_desc_0 = make_naive_tensor_descriptor( + make_tuple(number{}, // m0 + number{}, // m1 + number{}, // m2 + number{}, // k0 + number{}), // k1 + make_tuple(number{}, // m0 + number{}, // m1 + number{}, // m2 + number{}, // k0 + number<1>{}), // k1 + number{}, // lds store vector(actually no explicit store) + number<1>{}); + + constexpr auto lds_block_desc_issues_warps_lanes = transform_tensor_descriptor( + lds_block_desc_0, + make_tuple(make_pass_through_transform(number{}), + make_pass_through_transform(number{}), + make_merge_transform(make_tuple( + number{}, number{}, number{}))), + make_tuple(sequence<0>{}, sequence<2>{}, sequence<1, 3, 4>{}), + make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{})); + + return lds_block_desc_issues_warps_lanes; + } + } + + // template + CK_TILE_HOST_DEVICE static constexpr auto MakeLdsLoadDesc_A() + { + // load from LDS to register, every wave has same layout + constexpr index_t KPack_ = 8; // GetSmemKPack_A(); // LDS + constexpr index_t KPad = KPack_; // pad between warps + + constexpr index_t kAMLane = 16; + constexpr index_t kABKLane = 4; + constexpr index_t kABKPerLane = 4; + constexpr index_t kKIter = 2; + 
static_assert(KPack_ == (kABKPerLane * kKIter)); + + constexpr auto lds_block_desc_0 = + make_naive_tensor_descriptor(make_tuple(number{}, // m0 y + number{}, // m1 p + number{}, // k0 y + number{}, // k1 p + number{}), // k2 y-vector + make_tuple(number{}, // m0 + number{}, // m1 + number{}, // k0 + number{}, // k1 + number<1>{}), // k2 + number{}, // lds load vector + number<1>{}); + + constexpr auto lds_desc_m_k = transform_tensor_descriptor( + lds_block_desc_0, + make_tuple(make_merge_transform(make_tuple(number{}, number{})), + make_merge_transform( + make_tuple(number{}, number{}, number{}))), + make_tuple(sequence<0, 1>{}, sequence<2, 3, 4>{}), + make_tuple(sequence<0>{}, sequence<1>{})); + + return lds_desc_m_k; + } + + static constexpr auto GetGemm_AWarpEnc() + { + constexpr index_t kAMLane = 16; + constexpr index_t kABKLane = 4; + constexpr index_t kABKPerLane = 4; + constexpr index_t kKIter = 2; + + using enc_ = tile_distribution_encoding< + sequence<>, + tuple, sequence>, + tuple>, + tuple>, + sequence<2>, + sequence<1>>; + return enc_{}; + } + + CK_TILE_HOST_DEVICE static constexpr ck_tile::index_t GetSmemSize() + { + return 32 * (128 + 8) * sizeof(bf16_t); + } +}; + +struct Flatmm_32x512x128_1x4x1_16x16x32_BF16 : public Flatmm_32x512x128_1x4x1_16x16x32_Base +{ + using ADataType = bf16_t; + using BDataType = bf16_t; + + // TODO: need paired with tile_window_linear! + // TODO: need call init_raw() before call this function! 
+ template + CK_TILE_DEVICE auto + operator()(const ARes& res_a, + const ACoords& cached_coords_a, + const BRes& res_b, + const BCoords& cached_coords_b, + CK_TILE_LDS_ADDR void* smem, + index_t k, + index_t tile_offset_a, // for each tile, the offset to move for each unroll + index_t tile_offset_b) // for each tile, the offset to move for each unroll + { + static_assert(ACoords::size() == Block_M * Block_K / BlockSize / 2 /*2x per dword*/); // 8 + static_assert(BCoords::size() == Repeat_N); + + auto a_sst = make_tile_window( + make_tensor_view( + reinterpret_cast(smem), MakeLdsStoreDesc_A()), + MakeLdsStoreDesc_A().get_lengths(), + {0, 0, 0}); + + auto a_sld = [&]() { + constexpr auto a_warp_enc_ = GetGemm_AWarpEnc(); + constexpr auto a_outer_dstr_enc = tile_distribution_encoding< + sequence, + tuple, sequence>, + tuple>, + tuple>, + sequence<1, 2>, + sequence<0, 0>>{}; + constexpr auto a_block_dstr_encode = + detail::make_embed_tile_distribution_encoding(a_outer_dstr_enc, a_warp_enc_); + return make_tile_window_linear( + make_tensor_view( + reinterpret_cast(smem), MakeLdsLoadDesc_A()), + MakeLdsLoadDesc_A().get_lengths(), + {0, 0}, + make_static_tile_distribution(a_block_dstr_encode)); + }(); + + const index_t tile_offset_a_bytes = tile_offset_a * sizeof(ADataType); + const index_t tile_offset_b_bytes = tile_offset_b * sizeof(BDataType); + + const auto [m0_init_value, size_per_issue] = get_async_store_smem_info(a_sst); + constexpr auto smem_buf_size = + MakeLdsLoadDesc_A().get_element_space_size() * sizeof(ADataType); + static_assert(a_sld.get_num_of_access() == 8); + constexpr auto sld_os = generate_tuple( + [&](auto i_access) { + return number{}; + }, + number{}); + + index_t loop_cnt = k / Block_K; + + // this is the acc thread buffer + fp32x4_t v_acc[16]{.0f}; + + // B nr->kr +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Winline-asm" + // clang-format off + asm volatile( +#define CK_TILE_FLATMM_UK_MFMA CK_TILE_FLATMM_UK_MFMA_BF16 
+#include "uk/flatmm_uk_gfx9_32x512x128_1x1x1_16x16x16.inc" +#undef CK_TILE_FLATMM_UK_MFMA + : [s_loop_cnt]"+s"(loop_cnt), + [v_acc_0]"+v"(v_acc[0]), + [v_acc_1]"+v"(v_acc[1]), + [v_acc_2]"+v"(v_acc[2]), + [v_acc_3]"+v"(v_acc[3]), + [v_acc_4]"+v"(v_acc[4]), + [v_acc_5]"+v"(v_acc[5]), + [v_acc_6]"+v"(v_acc[6]), + [v_acc_7]"+v"(v_acc[7]), + [v_acc_8]"+v"(v_acc[8]), + [v_acc_9]"+v"(v_acc[9]), + [v_acc_10]"+v"(v_acc[10]), + [v_acc_11]"+v"(v_acc[11]), + [v_acc_12]"+v"(v_acc[12]), + [v_acc_13]"+v"(v_acc[13]), + [v_acc_14]"+v"(v_acc[14]), + [v_acc_15]"+v"(v_acc[15]), + [s_mem_]"+r"(smem) + : [s_res_a0]"s"(res_a[0]), + [s_res_a1]"s"(res_a[1]), + [s_res_a2]"s"(res_a[2]), + [s_res_a3]"s"(res_a[3]), + [s_res_b0]"s"(res_b[0]), + [s_res_b1]"s"(res_b[1]), + [s_res_b2]"s"(res_b[2]), + [s_res_b3]"s"(res_b[3]), + [v_os_a0]"v"(static_cast(cached_coords_a[number<0>{}] * sizeof(ADataType))), + [v_os_a1]"v"(static_cast(cached_coords_a[number<1>{}] * sizeof(ADataType))), + [v_os_a2]"v"(static_cast(cached_coords_a[number<2>{}] * sizeof(ADataType))), + [v_os_a3]"v"(static_cast(cached_coords_a[number<3>{}] * sizeof(ADataType))), + [v_os_a4]"v"(static_cast(cached_coords_a[number<4>{}] * sizeof(ADataType))), + [v_os_a5]"v"(static_cast(cached_coords_a[number<5>{}] * sizeof(ADataType))), + [v_os_a6]"v"(static_cast(cached_coords_a[number<6>{}] * sizeof(ADataType))), + [v_os_a7]"v"(static_cast(cached_coords_a[number<7>{}] * sizeof(ADataType))), + + [v_os_b0]"v"(static_cast(cached_coords_b[number<0>{}] * sizeof(BDataType))), + [v_os_b1]"v"(static_cast(cached_coords_b[number<1>{}] * sizeof(BDataType))), + [v_os_b2]"v"(static_cast(cached_coords_b[number<2>{}] * sizeof(BDataType))), + [v_os_b3]"v"(static_cast(cached_coords_b[number<3>{}] * sizeof(BDataType))), + [v_os_b4]"v"(static_cast(cached_coords_b[number<4>{}] * sizeof(BDataType))), + [v_os_b5]"v"(static_cast(cached_coords_b[number<5>{}] * sizeof(BDataType))), + [v_os_b6]"v"(static_cast(cached_coords_b[number<6>{}] * sizeof(BDataType))), + 
[v_os_b7]"v"(static_cast(cached_coords_b[number<7>{}] * sizeof(BDataType))), + + [v_os_slda]"v"(static_cast(a_sld.cached_coords_[number<0>{}].get_offset() * sizeof(ADataType))), + [s_m0_init]"s"(m0_init_value), + [s_size_per_issue]"s"(size_per_issue), + [smem_sz]"n"(smem_buf_size), //(smem_buf_size), + [sld_os_0]"n"(sld_os[number<0>{}].value), + [sld_os_1]"n"(sld_os[number<1>{}].value), + [sld_os_2]"n"(sld_os[number<2>{}].value), + [sld_os_3]"n"(sld_os[number<3>{}].value), + [sld_os_4]"n"(sld_os[number<4>{}].value), + [sld_os_5]"n"(sld_os[number<5>{}].value), + [sld_os_6]"n"(sld_os[number<6>{}].value), + [sld_os_7]"n"(sld_os[number<7>{}].value), + [s_tile_os_a]"s"(tile_offset_a_bytes), + [s_tile_os_b]"s"(tile_offset_b_bytes) + : "memory", "a0", "a1", "a2", "a3", "a4", "a5", "a6", "a7", "a8", "a9", + "a10", "a11", "a12", "a13", "a14", "a15", "a16", "a17", "a18", "a19", + "a20", "a21", "a22", "a23", "a24", "a25", "a26", "a27", "a28", "a29", + "a30", "a31", "a32", "a33", "a34", "a35", "a36", "a37", "a38", "a39", + "a40", "a41", "a42", "a43", "a44", "a45", "a46", "a47", "a48", "a49", + "a50", "a51", "a52", "a53", "a54", "a55", "a56", "a57", "a58", "a59", + "a60", "a61", "a62", "a63", "a64", "a65", "a66", "a67", "a68", "a69", + "a70", "a71", "a72", "a73", "a74", "a75", "a76", "a77", "a78", "a79", + "a80", "a81", "a82", "a83", "a84", "a85", "a86", "a87", "a88", "a89", + "a90", "a91", "a92", "a93", "a94", "a95", "a96", "a97", "a98", "a99", + "a100", "a101", "a102", "a103", "a104", "a105", "a106", "a107", + "a108", "a109", "a110", "a111", "a112", "a113", "a114", "a115", + "a116", "a117", "a118", "a119", "a120", "a121", "a122", "a123", + "a124", "a125", "a126", "a127", "a128", "a129", "a130", "a131", + "a132", "a133", "a134", "a135", "a136", "a137", "a138", "a139", + "a140", "a141", "a142", "a143", "a144", "a145", "a146", "a147", + "a148", "a149", "a150", "a151", "a152", "a153", "a154", "a155", + "a156", "a157", "a158", "a159", "a160", "a161", "a162", "a163", + "a164", 
"a165", "a166", "a167", "a168", "a169", "a170", "a171", + "a172", "a173", "a174", "a175", "a176", "a177", "a178", "a179", + "a180", "a181", "a182", "a183", "a184", "a185", "a186", "a187", + "a188", "a189", "a190", "a191", "a192", "a193", "a194", "a195", + "a196", "a197", "a198", "a199", "a200", "a201", "a202", "a203", + "a204", "a205", "a206", "a207", "a208", "a209", "a210", "a211", + "a212", "a213", "a214", "a215", "a216", "a217", "a218", "a219", + "a220", "a221", "a222", "a223", "a224", "a225", "a226", "a227", + "a228", "a229", "a230", "a231", "a232", "a233", "a234", "a235", + "a236", "a237", "a238", "a239", "a240", "a241", "a242", "a243", + "a244", "a245", "a246", "a247", "a248", "a249", "a250", "a251", + "a252", "a253", "a254", "a255", + "s16", "s17", "s18", "s19", "s20", "s21", "s22", "s23", + "s86", // s86 as tmp + "v64", "v65", "v66", "v67", "v68", "v69", + "v70", "v71", "v72", "v73", "v74", "v75", "v76", "v77", "v78", "v79", + "v80", "v81", "v82", "v83", "v84", "v85", "v86", "v87", "v88", "v89", + "v90", "v91", "v92", "v93", "v94", "v95", "v96", "v97", "v98", "v99", + "v100", "v101", "v102", "v103", "v104", "v105", "v106", "v107", + "v108", "v109", "v110", "v111", "v112", "v113", "v114", "v115", + "v116", "v117", "v118", "v119", "v120", "v121", "v122", "v123", + "v124", "v125", "v126", "v127" + ); + // clang-format on +#pragma clang diagnostic pop + + // return local scratch + auto c = MakeCBlockTile(); + for(auto i = 0; i < 16; i++) + { + c.get_thread_buffer()[4 * i + 0] = v_acc[i].x; + c.get_thread_buffer()[4 * i + 1] = v_acc[i].y; + c.get_thread_buffer()[4 * i + 2] = v_acc[i].z; + c.get_thread_buffer()[4 * i + 3] = v_acc[i].w; + } + return c; + } +}; + +struct Flatmm_32x512x128_1x4x1_16x16x32_FP16 : public Flatmm_32x512x128_1x4x1_16x16x32_Base +{ + using ADataType = fp16_t; + using BDataType = fp16_t; + + // TODO: need paired with tile_window_linear! + // TODO: need call init_raw() before call this function! 
+ template + CK_TILE_DEVICE auto + operator()(const ARes& res_a, + const ACoords& cached_coords_a, + const BRes& res_b, + const BCoords& cached_coords_b, + CK_TILE_LDS_ADDR void* smem, + index_t k, + index_t tile_offset_a, // for each tile, the offset to move for each unroll + index_t tile_offset_b) // for each tile, the offset to move for each unroll + { + static_assert(ACoords::size() == Block_M * Block_K / BlockSize / 2 /*2x per dword*/); // 8 + static_assert(BCoords::size() == Repeat_N); + + auto a_sst = make_tile_window( + make_tensor_view( + reinterpret_cast(smem), MakeLdsStoreDesc_A()), + MakeLdsStoreDesc_A().get_lengths(), + {0, 0, 0}); + + auto a_sld = [&]() { + constexpr auto a_warp_enc_ = GetGemm_AWarpEnc(); + constexpr auto a_outer_dstr_enc = tile_distribution_encoding< + sequence, + tuple, sequence>, + tuple>, + tuple>, + sequence<1, 2>, + sequence<0, 0>>{}; + constexpr auto a_block_dstr_encode = + detail::make_embed_tile_distribution_encoding(a_outer_dstr_enc, a_warp_enc_); + return make_tile_window_linear( + make_tensor_view( + reinterpret_cast(smem), MakeLdsLoadDesc_A()), + MakeLdsLoadDesc_A().get_lengths(), + {0, 0}, + make_static_tile_distribution(a_block_dstr_encode)); + }(); + + const index_t tile_offset_a_bytes = tile_offset_a * sizeof(ADataType); + const index_t tile_offset_b_bytes = tile_offset_b * sizeof(BDataType); + + const auto [m0_init_value, size_per_issue] = get_async_store_smem_info(a_sst); + constexpr auto smem_buf_size = + MakeLdsLoadDesc_A().get_element_space_size() * sizeof(ADataType); + static_assert(a_sld.get_num_of_access() == 8); + constexpr auto sld_os = generate_tuple( + [&](auto i_access) { + return number{}; + }, + number{}); + + index_t loop_cnt = k / Block_K; + + // this is the acc thread buffer + fp32x4_t v_acc[16]{.0f}; + + // B nr->kr +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Winline-asm" + // clang-format off + asm volatile( +#define CK_TILE_FLATMM_UK_MFMA CK_TILE_FLATMM_UK_MFMA_FP16 
+#include "uk/flatmm_uk_gfx9_32x512x128_1x1x1_16x16x16.inc" +#undef CK_TILE_FLATMM_UK_MFMA + : [s_loop_cnt]"+s"(loop_cnt), + [v_acc_0]"+v"(v_acc[0]), + [v_acc_1]"+v"(v_acc[1]), + [v_acc_2]"+v"(v_acc[2]), + [v_acc_3]"+v"(v_acc[3]), + [v_acc_4]"+v"(v_acc[4]), + [v_acc_5]"+v"(v_acc[5]), + [v_acc_6]"+v"(v_acc[6]), + [v_acc_7]"+v"(v_acc[7]), + [v_acc_8]"+v"(v_acc[8]), + [v_acc_9]"+v"(v_acc[9]), + [v_acc_10]"+v"(v_acc[10]), + [v_acc_11]"+v"(v_acc[11]), + [v_acc_12]"+v"(v_acc[12]), + [v_acc_13]"+v"(v_acc[13]), + [v_acc_14]"+v"(v_acc[14]), + [v_acc_15]"+v"(v_acc[15]), + [s_mem_]"+r"(smem) + : [s_res_a0]"s"(res_a[0]), + [s_res_a1]"s"(res_a[1]), + [s_res_a2]"s"(res_a[2]), + [s_res_a3]"s"(res_a[3]), + [s_res_b0]"s"(res_b[0]), + [s_res_b1]"s"(res_b[1]), + [s_res_b2]"s"(res_b[2]), + [s_res_b3]"s"(res_b[3]), + [v_os_a0]"v"(static_cast(cached_coords_a[number<0>{}] * sizeof(ADataType))), + [v_os_a1]"v"(static_cast(cached_coords_a[number<1>{}] * sizeof(ADataType))), + [v_os_a2]"v"(static_cast(cached_coords_a[number<2>{}] * sizeof(ADataType))), + [v_os_a3]"v"(static_cast(cached_coords_a[number<3>{}] * sizeof(ADataType))), + [v_os_a4]"v"(static_cast(cached_coords_a[number<4>{}] * sizeof(ADataType))), + [v_os_a5]"v"(static_cast(cached_coords_a[number<5>{}] * sizeof(ADataType))), + [v_os_a6]"v"(static_cast(cached_coords_a[number<6>{}] * sizeof(ADataType))), + [v_os_a7]"v"(static_cast(cached_coords_a[number<7>{}] * sizeof(ADataType))), + + [v_os_b0]"v"(static_cast(cached_coords_b[number<0>{}] * sizeof(BDataType))), + [v_os_b1]"v"(static_cast(cached_coords_b[number<1>{}] * sizeof(BDataType))), + [v_os_b2]"v"(static_cast(cached_coords_b[number<2>{}] * sizeof(BDataType))), + [v_os_b3]"v"(static_cast(cached_coords_b[number<3>{}] * sizeof(BDataType))), + [v_os_b4]"v"(static_cast(cached_coords_b[number<4>{}] * sizeof(BDataType))), + [v_os_b5]"v"(static_cast(cached_coords_b[number<5>{}] * sizeof(BDataType))), + [v_os_b6]"v"(static_cast(cached_coords_b[number<6>{}] * sizeof(BDataType))), + 
[v_os_b7]"v"(static_cast(cached_coords_b[number<7>{}] * sizeof(BDataType))), + + [v_os_slda]"v"(static_cast(a_sld.cached_coords_[number<0>{}].get_offset() * sizeof(ADataType))), + [s_m0_init]"s"(m0_init_value), + [s_size_per_issue]"s"(size_per_issue), + [smem_sz]"n"(smem_buf_size), //(smem_buf_size), + [sld_os_0]"n"(sld_os[number<0>{}].value), + [sld_os_1]"n"(sld_os[number<1>{}].value), + [sld_os_2]"n"(sld_os[number<2>{}].value), + [sld_os_3]"n"(sld_os[number<3>{}].value), + [sld_os_4]"n"(sld_os[number<4>{}].value), + [sld_os_5]"n"(sld_os[number<5>{}].value), + [sld_os_6]"n"(sld_os[number<6>{}].value), + [sld_os_7]"n"(sld_os[number<7>{}].value), + [s_tile_os_a]"s"(tile_offset_a_bytes), + [s_tile_os_b]"s"(tile_offset_b_bytes) + : "memory", "a0", "a1", "a2", "a3", "a4", "a5", "a6", "a7", "a8", "a9", + "a10", "a11", "a12", "a13", "a14", "a15", "a16", "a17", "a18", "a19", + "a20", "a21", "a22", "a23", "a24", "a25", "a26", "a27", "a28", "a29", + "a30", "a31", "a32", "a33", "a34", "a35", "a36", "a37", "a38", "a39", + "a40", "a41", "a42", "a43", "a44", "a45", "a46", "a47", "a48", "a49", + "a50", "a51", "a52", "a53", "a54", "a55", "a56", "a57", "a58", "a59", + "a60", "a61", "a62", "a63", "a64", "a65", "a66", "a67", "a68", "a69", + "a70", "a71", "a72", "a73", "a74", "a75", "a76", "a77", "a78", "a79", + "a80", "a81", "a82", "a83", "a84", "a85", "a86", "a87", "a88", "a89", + "a90", "a91", "a92", "a93", "a94", "a95", "a96", "a97", "a98", "a99", + "a100", "a101", "a102", "a103", "a104", "a105", "a106", "a107", + "a108", "a109", "a110", "a111", "a112", "a113", "a114", "a115", + "a116", "a117", "a118", "a119", "a120", "a121", "a122", "a123", + "a124", "a125", "a126", "a127", "a128", "a129", "a130", "a131", + "a132", "a133", "a134", "a135", "a136", "a137", "a138", "a139", + "a140", "a141", "a142", "a143", "a144", "a145", "a146", "a147", + "a148", "a149", "a150", "a151", "a152", "a153", "a154", "a155", + "a156", "a157", "a158", "a159", "a160", "a161", "a162", "a163", + "a164", 
"a165", "a166", "a167", "a168", "a169", "a170", "a171", + "a172", "a173", "a174", "a175", "a176", "a177", "a178", "a179", + "a180", "a181", "a182", "a183", "a184", "a185", "a186", "a187", + "a188", "a189", "a190", "a191", "a192", "a193", "a194", "a195", + "a196", "a197", "a198", "a199", "a200", "a201", "a202", "a203", + "a204", "a205", "a206", "a207", "a208", "a209", "a210", "a211", + "a212", "a213", "a214", "a215", "a216", "a217", "a218", "a219", + "a220", "a221", "a222", "a223", "a224", "a225", "a226", "a227", + "a228", "a229", "a230", "a231", "a232", "a233", "a234", "a235", + "a236", "a237", "a238", "a239", "a240", "a241", "a242", "a243", + "a244", "a245", "a246", "a247", "a248", "a249", "a250", "a251", + "a252", "a253", "a254", "a255", + "s16", "s17", "s18", "s19", "s20", "s21", "s22", "s23", + "s86", // s86 as tmp + "v64", "v65", "v66", "v67", "v68", "v69", + "v70", "v71", "v72", "v73", "v74", "v75", "v76", "v77", "v78", "v79", + "v80", "v81", "v82", "v83", "v84", "v85", "v86", "v87", "v88", "v89", + "v90", "v91", "v92", "v93", "v94", "v95", "v96", "v97", "v98", "v99", + "v100", "v101", "v102", "v103", "v104", "v105", "v106", "v107", + "v108", "v109", "v110", "v111", "v112", "v113", "v114", "v115", + "v116", "v117", "v118", "v119", "v120", "v121", "v122", "v123", + "v124", "v125", "v126", "v127" + ); + // clang-format on +#pragma clang diagnostic pop + + // return local scratch + auto c = MakeCBlockTile(); + for(auto i = 0; i < 16; i++) + { + c.get_thread_buffer()[4 * i + 0] = v_acc[i].x; + c.get_thread_buffer()[4 * i + 1] = v_acc[i].y; + c.get_thread_buffer()[4 * i + 2] = v_acc[i].z; + c.get_thread_buffer()[4 * i + 3] = v_acc[i].w; + } + return c; + } +}; + +} // namespace ck_tile diff --git a/include/ck_tile/ops/flatmm/block/flatmm_sn_32x128x512_1x4x1_16x16x32.hpp b/include/ck_tile/ops/flatmm/block/flatmm_sn_32x128x512_1x4x1_16x16x32.hpp new file mode 100644 index 000000000..203c87b9c --- /dev/null +++ 
b/include/ck_tile/ops/flatmm/block/flatmm_sn_32x128x512_1x4x1_16x16x32.hpp @@ -0,0 +1,562 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck_tile/core.hpp" +#include "ck_tile/ops/gemm/warp/warp_gemm.hpp" +#include "ck_tile/ops/flatmm/block/flatmm_uk_config.hpp" + +namespace ck_tile { + +// "S"tream update output along "N" +// A in smem, B load from global +// require 4 wave, occupancy=1c +struct FlatmmSn_32x128x512_1x4x1_16x16x32_Base +{ + static constexpr index_t Block_M = 32; + static constexpr index_t Block_N = 128; + static constexpr index_t Block_K = 512; + + static constexpr index_t WarpPerBlock_M = 1; + static constexpr index_t WarpPerBlock_N = 4; + static constexpr index_t WarpPerBlock_K = 1; + + static constexpr index_t Warp_M = 16; + static constexpr index_t Warp_N = 16; + static constexpr index_t Warp_K = 32; + + static constexpr index_t BlockSize = 256; + + // static constexpr index_t KPack = 2; // this is used to guarantee every thread can do dwordx4 + + // TODO: note Nr/Kr/W need consider KPack + static constexpr index_t Block_W = Warp_N * Warp_K; // 512 element + static constexpr index_t Block_Nr = Block_N / Warp_N; // 8 element, 2 per wave + static constexpr index_t Block_Kr = Block_K / Warp_K; // 16 + + static constexpr index_t Repeat_M = Block_M / (Warp_M * WarpPerBlock_M); // 2 + static constexpr index_t Repeat_N = Block_N / (Warp_N * WarpPerBlock_N); // 2 + static constexpr index_t Repeat_K = Block_K / (Warp_K * WarpPerBlock_K); // 16 + + static CK_TILE_DEVICE constexpr auto MakeCBlockDist() + { + constexpr auto c_block_outer_dstr_encoding = tile_distribution_encoding< + sequence<>, + tuple, sequence>, + tuple>, + tuple>, + sequence<2, 1>, // !! 
note here is different + sequence<0, 0>>{}; + + using WG = WarpGemmMfmaF16F16F32M16N16K32TransposedCDistribution; + + constexpr auto c_block_dstr_encode = detail::make_embed_tile_distribution_encoding( + c_block_outer_dstr_encoding, typename WG::CWarpDstrEncoding{}); + constexpr auto c_block_dstr = make_static_tile_distribution(c_block_dstr_encode); + return c_block_dstr; + } + + CK_TILE_HOST_DEVICE static constexpr ck_tile::index_t GetSmemSize() + { + // y y p p p y + // reg before shfl M0(2)*N0(2)*Nl(4)*Nw(4)*Mw(16)*Nv(4) + // but order is N0*M0*Nv + // in LDS we need store as + // M0(2)* N0(2) * Nl(4) * Nw(4) * (Mw(16)*Nv(4) + 4) + // y y wave-id lid/16 lid%16 v + return 2 * 2 * 4 * 4 * (16 * 4 + 4) * sizeof(bf16_t); + } +}; + +struct FlatmmSn_32x128x512_1x4x1_16x16x32_BF16 : public FlatmmSn_32x128x512_1x4x1_16x16x32_Base +{ + using BDataType = bf16_t; + using ODataType = bf16_t; + + // TODO: need paired with tile_window_linear! + // TODO: need call init_raw() before call this function! 
+ // template + template + CK_TILE_DEVICE auto + operator()(const BRes& res_b, + const BCoords& cached_coords_b, + const ORes& res_o, + const OCoords& cached_coords_o, + const OFlags& o_flags, // this should be in sgpr + CK_TILE_LDS_ADDR void* smem, + index_t n, // loop along n dim + const ScaleTensor& scale_, + index_t tile_offset_b, // stride b is fixed to blockKr * blockW, but still can adjust + index_t tile_offset_o) + { + static_assert(BCoords::size() == 8); // 8 + static_assert(OCoords::size() == 8); + + const index_t tile_stride_b_bytes = tile_offset_b * sizeof(BDataType); + const index_t tile_stride_o_bytes = tile_offset_o * sizeof(ODataType); + + static_assert(ScaleTensor::size() == 2); + float s0 = scale_[number<0>{}]; + float s1 = scale_[number<1>{}]; + + index_t loop_cnt = n / Block_N; + + register float v_c0 asm("v64"); + register float v_c1 asm("v65"); + register float v_c2 asm("v66"); + register float v_c3 asm("v67"); + register float v_c4 asm("v68"); + register float v_c5 asm("v69"); + register float v_c6 asm("v70"); + register float v_c7 asm("v71"); + register float v_c8 asm("v72"); + register float v_c9 asm("v73"); + register float v_c10 asm("v74"); + register float v_c11 asm("v75"); + register float v_c12 asm("v76"); + register float v_c13 asm("v77"); + register float v_c14 asm("v78"); + register float v_c15 asm("v79"); + register float v_c16 asm("v80"); + register float v_c17 asm("v81"); + register float v_c18 asm("v82"); + register float v_c19 asm("v83"); + register float v_c20 asm("v84"); + register float v_c21 asm("v85"); + register float v_c22 asm("v86"); + register float v_c23 asm("v87"); + register float v_c24 asm("v88"); + register float v_c25 asm("v89"); + register float v_c26 asm("v90"); + register float v_c27 asm("v91"); + register float v_c28 asm("v92"); + register float v_c29 asm("v93"); + register float v_c30 asm("v94"); + register float v_c31 asm("v95"); + int32_t nan_hi = 0x7fff0000; + int32_t nan_lo = 0x00007fff; + + // in smem, 
the layout is M0(2)*K0(128)*M1(16)*K1(4) + // every threads need 8xK in contiguous register + // ... and every wave need the same data + int lane_id = threadIdx.x % 64; + int sld_y_os = (lane_id % 16) * 4 + (lane_id / 16) * 128; + sld_y_os *= 2; + + // y y p p p y + // reg before shfl M0(2)*N0(2)*Nl(4)*Nw(4)*Mw(16)*Nv(4) + // but order is N0*M0*Nv + // in LDS we need store as + // M0(2)* N0(2) * Nl(4) * Nw(4) * (Mw(16)*Nv(4) + 4) + // y y wave-id lid/16 lid%16 v + // sst(v3) = (v0/16*34 + v0%16 * 2 + wid*136) * 4 + int sfl_sst = (threadIdx.x % 16 * 4) + (threadIdx.x / 16) * (64 + 4); + sfl_sst *= 2; + + // from LDS we need load as + // M0(2)* N0(2) * Nl(4) * Nw(4) * (Mw(16) * Nv(4) + 4) + // ( 2 issue) (rem 32-lane) (4 wave*4issue) 2lane*1issue(pk2) + // sld(v4) = v0/2 *34*4 + v0 % 2 *4 + wid*2 *4 + int sfl_sld = (lane_id % 2) * 2 + (lane_id / 2) * (64 + 4) + (threadIdx.x / 64) * 4; + sfl_sld *= 2; + + // B nr->kr + // clang-format off +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Winline-asm" + asm volatile( +#define CK_TILE_FLATMM_UK_MFMA CK_TILE_FLATMM_UK_MFMA_BF16 +#include "uk/flatmm_sn_uk_gfx9_32x128x512_1x4x1_16x16x16.inc" +#undef CK_TILE_FLATMM_UK_MFMA + :[smem_]"+r"(smem), + [s_loop_cnt]"+s"(loop_cnt), + [c0]"+v" (v_c0), + [c1]"+v" (v_c1), + [c2]"+v" (v_c2), + [c3]"+v" (v_c3), + [c4]"+v" (v_c4), + [c5]"+v" (v_c5), + [c6]"+v" (v_c6), + [c7]"+v" (v_c7), + [c8]"+v" (v_c8), + [c9]"+v" (v_c9), + [c10]"+v"(v_c10), + [c11]"+v"(v_c11), + [c12]"+v"(v_c12), + [c13]"+v"(v_c13), + [c14]"+v"(v_c14), + [c15]"+v"(v_c15), + [c16]"+v"(v_c16), + [c17]"+v"(v_c17), + [c18]"+v"(v_c18), + [c19]"+v"(v_c19), + [c20]"+v"(v_c20), + [c21]"+v"(v_c21), + [c22]"+v"(v_c22), + [c23]"+v"(v_c23), + [c24]"+v"(v_c24), + [c25]"+v"(v_c25), + [c26]"+v"(v_c26), + [c27]"+v"(v_c27), + [c28]"+v"(v_c28), + [c29]"+v"(v_c29), + [c30]"+v"(v_c30), + [c31]"+v"(v_c31) + : + [sld_a_base]"n"(0), + [shfl_base]"n"(0), + [v_sld_y_os]"v"(sld_y_os), + [v_sfl_sld]"v"(sfl_sld), + 
[v_sfl_sst]"v"(sfl_sst), + [s_res_o0]"s"(res_o[0]), + [s_res_o1]"s"(res_o[1]), + //[s_res_o2]"s"(res_o[2]), + //[s_res_o3]"s"(res_o[3]), + [s_res_b0]"s"(res_b[0]), + [s_res_b1]"s"(res_b[1]), + [s_res_b2]"s"(res_b[2]), + [s_res_b3]"s"(res_b[3]), + [v_os_o0]"v"(static_cast(cached_coords_o[number<0>{}] * sizeof(ODataType))), + [v_os_o1]"v"(static_cast(cached_coords_o[number<1>{}] * sizeof(ODataType))), + [v_os_o2]"v"(static_cast(cached_coords_o[number<2>{}] * sizeof(ODataType))), + [v_os_o3]"v"(static_cast(cached_coords_o[number<3>{}] * sizeof(ODataType))), + [v_os_o4]"v"(static_cast(cached_coords_o[number<4>{}] * sizeof(ODataType))), + [v_os_o5]"v"(static_cast(cached_coords_o[number<5>{}] * sizeof(ODataType))), + [v_os_o6]"v"(static_cast(cached_coords_o[number<6>{}] * sizeof(ODataType))), + [v_os_o7]"v"(static_cast(cached_coords_o[number<7>{}] * sizeof(ODataType))), + [v_os_b0]"v"(static_cast(cached_coords_b[number<0>{}] * sizeof(BDataType))), + [v_os_b1]"v"(static_cast(cached_coords_b[number<1>{}] * sizeof(BDataType))), + [v_os_b2]"v"(static_cast(cached_coords_b[number<2>{}] * sizeof(BDataType))), + [v_os_b3]"v"(static_cast(cached_coords_b[number<3>{}] * sizeof(BDataType))), + [v_os_b4]"v"(static_cast(cached_coords_b[number<4>{}] * sizeof(BDataType))), + [v_os_b5]"v"(static_cast(cached_coords_b[number<5>{}] * sizeof(BDataType))), + [v_os_b6]"v"(static_cast(cached_coords_b[number<6>{}] * sizeof(BDataType))), + [v_os_b7]"v"(static_cast(cached_coords_b[number<7>{}] * sizeof(BDataType))), + + [s_tile_os_o]"s"(tile_stride_o_bytes), + [s_tile_os_b]"s"(tile_stride_b_bytes), + [scale_0]"v"(s0), + [scale_1]"v"(s1), + [v_nan_lo]"v"(nan_lo), + [v_nan_hi]"v"(nan_hi), + [s_execflag_0]"s"(o_flags[number<0>{}]), + [s_execflag_1]"s"(o_flags[number<1>{}]), + [s_execflag_2]"s"(o_flags[number<2>{}]), + [s_execflag_3]"s"(o_flags[number<3>{}]), + [s_execflag_4]"s"(o_flags[number<4>{}]), + [s_execflag_5]"s"(o_flags[number<5>{}]), + [s_execflag_6]"s"(o_flags[number<6>{}]), + 
[s_execflag_7]"s"(o_flags[number<7>{}]) + : + "memory", "a0", "a1", "a2", "a3", "a4", "a5", "a6", "a7", "a8", "a9", + "a10", "a11", "a12", "a13", "a14", "a15", "a16", "a17", "a18", "a19", + "a20", "a21", "a22", "a23", "a24", "a25", "a26", "a27", "a28", "a29", + "a30", "a31", "a32", "a33", "a34", "a35", "a36", "a37", "a38", "a39", + "a40", "a41", "a42", "a43", "a44", "a45", "a46", "a47", "a48", "a49", + "a50", "a51", "a52", "a53", "a54", "a55", "a56", "a57", "a58", "a59", + "a60", "a61", "a62", "a63", "a64", "a65", "a66", "a67", "a68", "a69", + "a70", "a71", "a72", "a73", "a74", "a75", "a76", "a77", "a78", "a79", + "a80", "a81", "a82", "a83", "a84", "a85", "a86", "a87", "a88", "a89", + "a90", "a91", "a92", "a93", "a94", "a95", "a96", "a97", "a98", "a99", + "a100", "a101", "a102", "a103", "a104", "a105", "a106", "a107", + "a108", "a109", "a110", "a111", "a112", "a113", "a114", "a115", + "a116", "a117", "a118", "a119", "a120", "a121", "a122", "a123", + "a124", "a125", "a126", "a127", "a128", "a129", "a130", "a131", + "a132", "a133", "a134", "a135", "a136", "a137", "a138", "a139", + "a140", "a141", "a142", "a143", "a144", "a145", "a146", "a147", + "a148", "a149", "a150", "a151", "a152", "a153", "a154", "a155", + "a156", "a157", "a158", "a159", "a160", "a161", "a162", "a163", + "a164", "a165", "a166", "a167", "a168", "a169", "a170", "a171", + "a172", "a173", "a174", "a175", "a176", "a177", "a178", "a179", + "a180", "a181", "a182", "a183", "a184", "a185", "a186", "a187", + "a188", "a189", "a190", "a191", "a192", "a193", "a194", "a195", + "a196", "a197", "a198", "a199", "a200", "a201", "a202", "a203", + "a204", "a205", "a206", "a207", "a208", "a209", "a210", "a211", + "a212", "a213", "a214", "a215", "a216", "a217", "a218", "a219", + "a220", "a221", "a222", "a223", "a224", "a225", "a226", "a227", + "a228", "a229", "a230", "a231", "a232", "a233", "a234", "a235", + "a236", "a237", "a238", "a239", "a240", "a241", "a242", "a243", + "a244", "a245", "a246", "a247", "a248", 
"a249", "a250", "a251", + "a252", "a253", "a254", "a255", + "s8", "s9", "s12", "s13", "s14", "s15", "s38", "s39", "s52", "s86", + "s36", "s37", + "v50", "v54", "v55", + "v64","v65","v66","v67","v68","v69","v70","v71", + "v72","v73","v74","v75","v76","v77","v78","v79", + "v80","v81","v82","v83","v84","v85","v86","v87", + "v88","v89","v90","v91","v92","v93","v94","v95", + "v128", "v129", "v130", "v131", + "v132", "v133", "v134", "v135", "v136", "v137", "v138", "v139", + "v140", "v141", "v142", "v143", "v144", "v145", "v146", "v147", + "v148", "v149", "v150", "v151", "v152", "v153", "v154", "v155", + "v156", "v157", "v158", "v159", "v160", "v161", "v162", "v163", + "v164", "v165", "v166", "v167", "v168", "v169", "v170", "v171", + "v172", "v173", "v174", "v175", "v176", "v177", "v178", "v179", + "v180", "v181", "v182", "v183", "v184", "v185", "v186", "v187", + "v188", "v189", "v190", "v191", "v192", "v193", "v194", "v195", + "v196", "v197", "v198", "v199", "v200", "v201", "v202", "v203", + "v204", "v205", "v206", "v207", "v208", "v209", "v210", "v211", + "v212", "v213", "v214", "v215", "v216", "v217", "v218", "v219", + "v220", "v221", "v222", "v223", "v224", "v225", "v226", "v227", + "v228", "v229", "v230", "v231", "v232", "v233", "v234", "v235", + "v236", "v237", "v238", "v239", "v240", "v241", "v242", "v243", + "v244", "v245", "v246", "v247", "v248", "v249", "v250", "v251", + "v252", "v253", "v254", "v255" + ); +#pragma clang diagnostic pop + // clang-format on + } +}; + +struct FlatmmSn_32x128x512_1x4x1_16x16x32_FP16 : public FlatmmSn_32x128x512_1x4x1_16x16x32_Base +{ + using BDataType = bf16_t; + using ODataType = bf16_t; + + // TODO: need paired with tile_window_linear! + // TODO: need call init_raw() before call this function! 
+ // template + template + CK_TILE_DEVICE auto + operator()(const BRes& res_b, + const BCoords& cached_coords_b, + const ORes& res_o, + const OCoords& cached_coords_o, + const OFlags& o_flags, // this should be in sgpr + CK_TILE_LDS_ADDR void* smem, + index_t n, // loop along n dim + const ScaleTensor& scale_, + index_t tile_offset_b, // stride b is fixed to blockKr * blockW, but still can adjust + index_t tile_offset_o) + { + static_assert(BCoords::size() == 8); // 8 + static_assert(OCoords::size() == 8); + + const index_t tile_stride_b_bytes = tile_offset_b * sizeof(BDataType); + const index_t tile_stride_o_bytes = tile_offset_o * sizeof(ODataType); + + static_assert(ScaleTensor::size() == 2); + float s0 = scale_[number<0>{}]; + float s1 = scale_[number<1>{}]; + + index_t loop_cnt = n / Block_N; + + register float v_c0 asm("v64"); + register float v_c1 asm("v65"); + register float v_c2 asm("v66"); + register float v_c3 asm("v67"); + register float v_c4 asm("v68"); + register float v_c5 asm("v69"); + register float v_c6 asm("v70"); + register float v_c7 asm("v71"); + register float v_c8 asm("v72"); + register float v_c9 asm("v73"); + register float v_c10 asm("v74"); + register float v_c11 asm("v75"); + register float v_c12 asm("v76"); + register float v_c13 asm("v77"); + register float v_c14 asm("v78"); + register float v_c15 asm("v79"); + register float v_c16 asm("v80"); + register float v_c17 asm("v81"); + register float v_c18 asm("v82"); + register float v_c19 asm("v83"); + register float v_c20 asm("v84"); + register float v_c21 asm("v85"); + register float v_c22 asm("v86"); + register float v_c23 asm("v87"); + register float v_c24 asm("v88"); + register float v_c25 asm("v89"); + register float v_c26 asm("v90"); + register float v_c27 asm("v91"); + register float v_c28 asm("v92"); + register float v_c29 asm("v93"); + register float v_c30 asm("v94"); + register float v_c31 asm("v95"); + int32_t nan_hi = 0x7fff0000; + int32_t nan_lo = 0x00007fff; + + // in smem, 
the layout is M0(2)*K0(128)*M1(16)*K1(4) + // every threads need 8xK in contiguous register + // ... and every wave need the same data + int lane_id = threadIdx.x % 64; + int sld_y_os = (lane_id % 16) * 4 + (lane_id / 16) * 128; + sld_y_os *= 2; + + // y y p p p y + // reg before shfl M0(2)*N0(2)*Nl(4)*Nw(4)*Mw(16)*Nv(4) + // but order is N0*M0*Nv + // in LDS we need store as + // M0(2)* N0(2) * Nl(4) * Nw(4) * (Mw(16)*Nv(4) + 4) + // y y wave-id lid/16 lid%16 v + // sst(v3) = (v0/16*34 + v0%16 * 2 + wid*136) * 4 + int sfl_sst = (threadIdx.x % 16 * 4) + (threadIdx.x / 16) * (64 + 4); + sfl_sst *= 2; + + // from LDS we need load as + // M0(2)* N0(2) * Nl(4) * Nw(4) * (Mw(16) * Nv(4) + 4) + // ( 2 issue) (rem 32-lane) (4 wave*4issue) 2lane*1issue(pk2) + // sld(v4) = v0/2 *34*4 + v0 % 2 *4 + wid*2 *4 + int sfl_sld = (lane_id % 2) * 2 + (lane_id / 2) * (64 + 4) + (threadIdx.x / 64) * 4; + sfl_sld *= 2; + + // B nr->kr + // clang-format off +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Winline-asm" + asm volatile( +#define CK_TILE_FLATMM_UK_MFMA CK_TILE_FLATMM_UK_MFMA_FP16 +#include "uk/flatmm_sn_uk_gfx9_32x128x512_1x4x1_16x16x16.inc" +#undef CK_TILE_FLATMM_UK_MFMA + :[smem_]"+r"(smem), + [s_loop_cnt]"+s"(loop_cnt), + [c0]"+v" (v_c0), + [c1]"+v" (v_c1), + [c2]"+v" (v_c2), + [c3]"+v" (v_c3), + [c4]"+v" (v_c4), + [c5]"+v" (v_c5), + [c6]"+v" (v_c6), + [c7]"+v" (v_c7), + [c8]"+v" (v_c8), + [c9]"+v" (v_c9), + [c10]"+v"(v_c10), + [c11]"+v"(v_c11), + [c12]"+v"(v_c12), + [c13]"+v"(v_c13), + [c14]"+v"(v_c14), + [c15]"+v"(v_c15), + [c16]"+v"(v_c16), + [c17]"+v"(v_c17), + [c18]"+v"(v_c18), + [c19]"+v"(v_c19), + [c20]"+v"(v_c20), + [c21]"+v"(v_c21), + [c22]"+v"(v_c22), + [c23]"+v"(v_c23), + [c24]"+v"(v_c24), + [c25]"+v"(v_c25), + [c26]"+v"(v_c26), + [c27]"+v"(v_c27), + [c28]"+v"(v_c28), + [c29]"+v"(v_c29), + [c30]"+v"(v_c30), + [c31]"+v"(v_c31) + : + [sld_a_base]"n"(0), + [shfl_base]"n"(0), + [v_sld_y_os]"v"(sld_y_os), + [v_sfl_sld]"v"(sfl_sld), + 
[v_sfl_sst]"v"(sfl_sst), + [s_res_o0]"s"(res_o[0]), + [s_res_o1]"s"(res_o[1]), + //[s_res_o2]"s"(res_o[2]), + //[s_res_o3]"s"(res_o[3]), + [s_res_b0]"s"(res_b[0]), + [s_res_b1]"s"(res_b[1]), + [s_res_b2]"s"(res_b[2]), + [s_res_b3]"s"(res_b[3]), + [v_os_o0]"v"(static_cast(cached_coords_o[number<0>{}] * sizeof(ODataType))), + [v_os_o1]"v"(static_cast(cached_coords_o[number<1>{}] * sizeof(ODataType))), + [v_os_o2]"v"(static_cast(cached_coords_o[number<2>{}] * sizeof(ODataType))), + [v_os_o3]"v"(static_cast(cached_coords_o[number<3>{}] * sizeof(ODataType))), + [v_os_o4]"v"(static_cast(cached_coords_o[number<4>{}] * sizeof(ODataType))), + [v_os_o5]"v"(static_cast(cached_coords_o[number<5>{}] * sizeof(ODataType))), + [v_os_o6]"v"(static_cast(cached_coords_o[number<6>{}] * sizeof(ODataType))), + [v_os_o7]"v"(static_cast(cached_coords_o[number<7>{}] * sizeof(ODataType))), + [v_os_b0]"v"(static_cast(cached_coords_b[number<0>{}] * sizeof(BDataType))), + [v_os_b1]"v"(static_cast(cached_coords_b[number<1>{}] * sizeof(BDataType))), + [v_os_b2]"v"(static_cast(cached_coords_b[number<2>{}] * sizeof(BDataType))), + [v_os_b3]"v"(static_cast(cached_coords_b[number<3>{}] * sizeof(BDataType))), + [v_os_b4]"v"(static_cast(cached_coords_b[number<4>{}] * sizeof(BDataType))), + [v_os_b5]"v"(static_cast(cached_coords_b[number<5>{}] * sizeof(BDataType))), + [v_os_b6]"v"(static_cast(cached_coords_b[number<6>{}] * sizeof(BDataType))), + [v_os_b7]"v"(static_cast(cached_coords_b[number<7>{}] * sizeof(BDataType))), + + [s_tile_os_o]"s"(tile_stride_o_bytes), + [s_tile_os_b]"s"(tile_stride_b_bytes), + [scale_0]"v"(s0), + [scale_1]"v"(s1), + [v_nan_lo]"v"(nan_lo), + [v_nan_hi]"v"(nan_hi), + [s_execflag_0]"s"(o_flags[number<0>{}]), + [s_execflag_1]"s"(o_flags[number<1>{}]), + [s_execflag_2]"s"(o_flags[number<2>{}]), + [s_execflag_3]"s"(o_flags[number<3>{}]), + [s_execflag_4]"s"(o_flags[number<4>{}]), + [s_execflag_5]"s"(o_flags[number<5>{}]), + [s_execflag_6]"s"(o_flags[number<6>{}]), + 
[s_execflag_7]"s"(o_flags[number<7>{}]) + : + "memory", "a0", "a1", "a2", "a3", "a4", "a5", "a6", "a7", "a8", "a9", + "a10", "a11", "a12", "a13", "a14", "a15", "a16", "a17", "a18", "a19", + "a20", "a21", "a22", "a23", "a24", "a25", "a26", "a27", "a28", "a29", + "a30", "a31", "a32", "a33", "a34", "a35", "a36", "a37", "a38", "a39", + "a40", "a41", "a42", "a43", "a44", "a45", "a46", "a47", "a48", "a49", + "a50", "a51", "a52", "a53", "a54", "a55", "a56", "a57", "a58", "a59", + "a60", "a61", "a62", "a63", "a64", "a65", "a66", "a67", "a68", "a69", + "a70", "a71", "a72", "a73", "a74", "a75", "a76", "a77", "a78", "a79", + "a80", "a81", "a82", "a83", "a84", "a85", "a86", "a87", "a88", "a89", + "a90", "a91", "a92", "a93", "a94", "a95", "a96", "a97", "a98", "a99", + "a100", "a101", "a102", "a103", "a104", "a105", "a106", "a107", + "a108", "a109", "a110", "a111", "a112", "a113", "a114", "a115", + "a116", "a117", "a118", "a119", "a120", "a121", "a122", "a123", + "a124", "a125", "a126", "a127", "a128", "a129", "a130", "a131", + "a132", "a133", "a134", "a135", "a136", "a137", "a138", "a139", + "a140", "a141", "a142", "a143", "a144", "a145", "a146", "a147", + "a148", "a149", "a150", "a151", "a152", "a153", "a154", "a155", + "a156", "a157", "a158", "a159", "a160", "a161", "a162", "a163", + "a164", "a165", "a166", "a167", "a168", "a169", "a170", "a171", + "a172", "a173", "a174", "a175", "a176", "a177", "a178", "a179", + "a180", "a181", "a182", "a183", "a184", "a185", "a186", "a187", + "a188", "a189", "a190", "a191", "a192", "a193", "a194", "a195", + "a196", "a197", "a198", "a199", "a200", "a201", "a202", "a203", + "a204", "a205", "a206", "a207", "a208", "a209", "a210", "a211", + "a212", "a213", "a214", "a215", "a216", "a217", "a218", "a219", + "a220", "a221", "a222", "a223", "a224", "a225", "a226", "a227", + "a228", "a229", "a230", "a231", "a232", "a233", "a234", "a235", + "a236", "a237", "a238", "a239", "a240", "a241", "a242", "a243", + "a244", "a245", "a246", "a247", "a248", 
"a249", "a250", "a251", + "a252", "a253", "a254", "a255", + "s8", "s9", "s12", "s13", "s14", "s15", "s38", "s39", "s52", "s86", + "s36", "s37", + "v50", "v54", "v55", + "v64","v65","v66","v67","v68","v69","v70","v71", + "v72","v73","v74","v75","v76","v77","v78","v79", + "v80","v81","v82","v83","v84","v85","v86","v87", + "v88","v89","v90","v91","v92","v93","v94","v95", + "v128", "v129", "v130", "v131", + "v132", "v133", "v134", "v135", "v136", "v137", "v138", "v139", + "v140", "v141", "v142", "v143", "v144", "v145", "v146", "v147", + "v148", "v149", "v150", "v151", "v152", "v153", "v154", "v155", + "v156", "v157", "v158", "v159", "v160", "v161", "v162", "v163", + "v164", "v165", "v166", "v167", "v168", "v169", "v170", "v171", + "v172", "v173", "v174", "v175", "v176", "v177", "v178", "v179", + "v180", "v181", "v182", "v183", "v184", "v185", "v186", "v187", + "v188", "v189", "v190", "v191", "v192", "v193", "v194", "v195", + "v196", "v197", "v198", "v199", "v200", "v201", "v202", "v203", + "v204", "v205", "v206", "v207", "v208", "v209", "v210", "v211", + "v212", "v213", "v214", "v215", "v216", "v217", "v218", "v219", + "v220", "v221", "v222", "v223", "v224", "v225", "v226", "v227", + "v228", "v229", "v230", "v231", "v232", "v233", "v234", "v235", + "v236", "v237", "v238", "v239", "v240", "v241", "v242", "v243", + "v244", "v245", "v246", "v247", "v248", "v249", "v250", "v251", + "v252", "v253", "v254", "v255" + ); +#pragma clang diagnostic pop + // clang-format on + } +}; + +} // namespace ck_tile diff --git a/include/ck_tile/ops/flatmm/block/flatmm_uk_config.hpp b/include/ck_tile/ops/flatmm/block/flatmm_uk_config.hpp new file mode 100644 index 000000000..003335c0e --- /dev/null +++ b/include/ck_tile/ops/flatmm/block/flatmm_uk_config.hpp @@ -0,0 +1,10 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#define CK_TILE_FLATMM_UK_MFMA_FP16 0 +#define CK_TILE_FLATMM_UK_MFMA_BF16 1 +#define CK_TILE_FLATMM_UK_MFMA_INT8 2 +#define CK_TILE_FLATMM_UK_MFMA_FP8 3 +#define CK_TILE_FLATMM_UK_MFMA_BF8 4 diff --git a/include/ck_tile/ops/flatmm/block/uk/README.md b/include/ck_tile/ops/flatmm/block/uk/README.md new file mode 100644 index 000000000..84fa13229 --- /dev/null +++ b/include/ck_tile/ops/flatmm/block/uk/README.md @@ -0,0 +1 @@ +The files in this folder are inline-assembly fragments that the flatmm block headers #include inside an `asm volatile(...)` statement; they must not be included directly! \ No newline at end of file diff --git a/include/ck_tile/ops/flatmm/block/uk/flatmm_sn_uk_gfx9_32x128x512_1x4x1_16x16x16.inc b/include/ck_tile/ops/flatmm/block/uk/flatmm_sn_uk_gfx9_32x128x512_1x4x1_16x16x16.inc new file mode 100644 index 000000000..8b57611f0 --- /dev/null +++ b/include/ck_tile/ops/flatmm/block/uk/flatmm_sn_uk_gfx9_32x128x512_1x4x1_16x16x16.inc @@ -0,0 +1,613 @@ +#ifndef CK_TILE_FLATMM_UK_MFMA +#define CK_TILE_FLATMM_UK_MFMA CK_TILE_FLATMM_UK_MFMA_BF16 +#endif + +#if CK_TILE_FLATMM_UK_MFMA == CK_TILE_FLATMM_UK_MFMA_BF16 +# define _UK_MFMA_ "v_mfma_f32_16x16x16_bf16" + +# define _UK_PK_CVT_(x0_, x1_, y_) \ + " v_cmp_u_f32 s[36:37], " x0_ ", " x0_ " \n" \ + " v_add3_u32 v50, " x0_ ", %[v_nan_lo], 1 \n" \ + " v_cndmask_b32 v54, v50, %[v_nan_hi], s[36:37] \n" \ + " v_cmp_u_f32 s[36:37], " x1_ ", " x1_ " \n" \ + " v_add3_u32 v50, " x1_ ", %[v_nan_lo], 1 \n" \ + " v_cndmask_b32 v55, v50, %[v_nan_hi], s[36:37] \n" \ + " v_perm_b32 " y_ ", v55, v54, s52 \n" + +# define _UK_ATOMIC_ADD_ "global_atomic_pk_add_bf16" + +#elif CK_TILE_FLATMM_UK_MFMA == CK_TILE_FLATMM_UK_MFMA_FP16 +#define _UK_MFMA_ "v_mfma_f32_16x16x16_f16" + +# define _UK_PK_CVT_(x0_, x1_, y_) \ + " v_cvt_f16_f32 v54, " x0_ " \n" \ + " v_cvt_f16_f32 v55, " x1_ " \n" \ + " v_pack_b32_f16 " y_ ", v54, v55 \n" + +# define _UK_ATOMIC_ADD_ "global_atomic_pk_add_f16" + +#endif + + +";-------------------------------------------------------------\n" +" s_mov_b32 s52, 0x07060302 ; v_perm\n" +" s_mov_b64
s[38:39], exec ; save current exec\n" +" s_mov_b32 s8, %[s_res_o0] \n" +" s_mov_b32 s9, %[s_res_o1] \n" +" s_mov_b32 s12, %[s_res_b0] \n" +" s_mov_b32 s13, %[s_res_b1] \n" +" s_mov_b32 s14, %[s_res_b2] \n" +" s_mov_b32 s15, %[s_res_b3] \n" +" ds_read_b64 v[128:129], %[v_sld_y_os] offset:0 + %[sld_a_base] \n" +" ds_read_b64 v[130:131], %[v_sld_y_os] offset:128 + %[sld_a_base] \n" +" ds_read_b64 v[132:133], %[v_sld_y_os] offset:1024 + %[sld_a_base] \n" +" ds_read_b64 v[134:135], %[v_sld_y_os] offset:1152 + %[sld_a_base] \n" +" ds_read_b64 v[136:137], %[v_sld_y_os] offset:2048 + %[sld_a_base] \n" +" ds_read_b64 v[138:139], %[v_sld_y_os] offset:2176 + %[sld_a_base] \n" +" ds_read_b64 v[140:141], %[v_sld_y_os] offset:3072 + %[sld_a_base] \n" +" ds_read_b64 v[142:143], %[v_sld_y_os] offset:3200 + %[sld_a_base] \n" +" ds_read_b64 v[144:145], %[v_sld_y_os] offset:4096 + %[sld_a_base] \n" +" ds_read_b64 v[146:147], %[v_sld_y_os] offset:4224 + %[sld_a_base] \n" +" ds_read_b64 v[148:149], %[v_sld_y_os] offset:5120 + %[sld_a_base] \n" +" ds_read_b64 v[150:151], %[v_sld_y_os] offset:5248 + %[sld_a_base] \n" +" ds_read_b64 v[152:153], %[v_sld_y_os] offset:6144 + %[sld_a_base] \n" +" ds_read_b64 v[154:155], %[v_sld_y_os] offset:6272 + %[sld_a_base] \n" +" ds_read_b64 v[156:157], %[v_sld_y_os] offset:7168 + %[sld_a_base] \n" +" ds_read_b64 v[158:159], %[v_sld_y_os] offset:7296 + %[sld_a_base] \n" +" ds_read_b64 v[160:161], %[v_sld_y_os] offset:8192 + %[sld_a_base] \n" +" ds_read_b64 v[162:163], %[v_sld_y_os] offset:8320 + %[sld_a_base] \n" +" ds_read_b64 v[164:165], %[v_sld_y_os] offset:9216 + %[sld_a_base] \n" +" ds_read_b64 v[166:167], %[v_sld_y_os] offset:9344 + %[sld_a_base] \n" +" ds_read_b64 v[168:169], %[v_sld_y_os] offset:10240 + %[sld_a_base] \n" +" ds_read_b64 v[170:171], %[v_sld_y_os] offset:10368 + %[sld_a_base] \n" +" ds_read_b64 v[172:173], %[v_sld_y_os] offset:11264 + %[sld_a_base] \n" +" ds_read_b64 v[174:175], %[v_sld_y_os] offset:11392 + %[sld_a_base] \n" +" 
ds_read_b64 v[176:177], %[v_sld_y_os] offset:12288 + %[sld_a_base] \n" +" ds_read_b64 v[178:179], %[v_sld_y_os] offset:12416 + %[sld_a_base] \n" +" ds_read_b64 v[180:181], %[v_sld_y_os] offset:13312 + %[sld_a_base] \n" +" ds_read_b64 v[182:183], %[v_sld_y_os] offset:13440 + %[sld_a_base] \n" +" ds_read_b64 v[184:185], %[v_sld_y_os] offset:14336 + %[sld_a_base] \n" +" ds_read_b64 v[186:187], %[v_sld_y_os] offset:14464 + %[sld_a_base] \n" +" ds_read_b64 v[188:189], %[v_sld_y_os] offset:15360 + %[sld_a_base] \n" +" ds_read_b64 v[190:191], %[v_sld_y_os] offset:15488 + %[sld_a_base] \n" +" ds_read_b64 v[192:193], %[v_sld_y_os] offset:16384 + %[sld_a_base] \n" +" ds_read_b64 v[194:195], %[v_sld_y_os] offset:16512 + %[sld_a_base] \n" +" ds_read_b64 v[196:197], %[v_sld_y_os] offset:17408 + %[sld_a_base] \n" +" ds_read_b64 v[198:199], %[v_sld_y_os] offset:17536 + %[sld_a_base] \n" +" ds_read_b64 v[200:201], %[v_sld_y_os] offset:18432 + %[sld_a_base] \n" +" ds_read_b64 v[202:203], %[v_sld_y_os] offset:18560 + %[sld_a_base] \n" +" ds_read_b64 v[204:205], %[v_sld_y_os] offset:19456 + %[sld_a_base] \n" +" ds_read_b64 v[206:207], %[v_sld_y_os] offset:19584 + %[sld_a_base] \n" +" ds_read_b64 v[208:209], %[v_sld_y_os] offset:20480 + %[sld_a_base] \n" +" ds_read_b64 v[210:211], %[v_sld_y_os] offset:20608 + %[sld_a_base] \n" +" ds_read_b64 v[212:213], %[v_sld_y_os] offset:21504 + %[sld_a_base] \n" +" ds_read_b64 v[214:215], %[v_sld_y_os] offset:21632 + %[sld_a_base] \n" +" ds_read_b64 v[216:217], %[v_sld_y_os] offset:22528 + %[sld_a_base] \n" +" ds_read_b64 v[218:219], %[v_sld_y_os] offset:22656 + %[sld_a_base] \n" +" ds_read_b64 v[220:221], %[v_sld_y_os] offset:23552 + %[sld_a_base] \n" +" ds_read_b64 v[222:223], %[v_sld_y_os] offset:23680 + %[sld_a_base] \n" +" ds_read_b64 v[224:225], %[v_sld_y_os] offset:24576 + %[sld_a_base] \n" +" ds_read_b64 v[226:227], %[v_sld_y_os] offset:24704 + %[sld_a_base] \n" +" ds_read_b64 v[228:229], %[v_sld_y_os] offset:25600 + %[sld_a_base] \n" +" 
ds_read_b64 v[230:231], %[v_sld_y_os] offset:25728 + %[sld_a_base] \n" +" ds_read_b64 v[232:233], %[v_sld_y_os] offset:26624 + %[sld_a_base] \n" +" ds_read_b64 v[234:235], %[v_sld_y_os] offset:26752 + %[sld_a_base] \n" +" ds_read_b64 v[236:237], %[v_sld_y_os] offset:27648 + %[sld_a_base] \n" +" ds_read_b64 v[238:239], %[v_sld_y_os] offset:27776 + %[sld_a_base] \n" +" ds_read_b64 v[240:241], %[v_sld_y_os] offset:28672 + %[sld_a_base] \n" +" ds_read_b64 v[242:243], %[v_sld_y_os] offset:28800 + %[sld_a_base] \n" +" ds_read_b64 v[244:245], %[v_sld_y_os] offset:29696 + %[sld_a_base] \n" +" ds_read_b64 v[246:247], %[v_sld_y_os] offset:29824 + %[sld_a_base] \n" +" ds_read_b64 v[248:249], %[v_sld_y_os] offset:30720 + %[sld_a_base] \n" +" ds_read_b64 v[250:251], %[v_sld_y_os] offset:30848 + %[sld_a_base] \n" +" ds_read_b64 v[252:253], %[v_sld_y_os] offset:31744 + %[sld_a_base] \n" +" ds_read_b64 v[254:255], %[v_sld_y_os] offset:31872 + %[sld_a_base] \n" +" s_waitcnt 0 \n" +" buffer_load_dwordx4 acc[0:3], %[v_os_b0], s[12:15], 0 offen \n" +" buffer_load_dwordx4 acc[4:7], %[v_os_b0], s[12:15], 0 offen offset:1024 \n" +" buffer_load_dwordx4 acc[8:11], %[v_os_b0], s[12:15], 0 offen offset:2048 \n" +" buffer_load_dwordx4 acc[12:15], %[v_os_b0], s[12:15], 0 offen offset:3072 \n" +" buffer_load_dwordx4 acc[16:19], %[v_os_b1], s[12:15], 0 offen \n" +" buffer_load_dwordx4 acc[20:23], %[v_os_b1], s[12:15], 0 offen offset:1024 \n" +" buffer_load_dwordx4 acc[24:27], %[v_os_b1], s[12:15], 0 offen offset:2048 \n" +" buffer_load_dwordx4 acc[28:31], %[v_os_b1], s[12:15], 0 offen offset:3072 \n" +" buffer_load_dwordx4 acc[32:35], %[v_os_b2], s[12:15], 0 offen \n" +" buffer_load_dwordx4 acc[36:39], %[v_os_b2], s[12:15], 0 offen offset:1024 \n" +" buffer_load_dwordx4 acc[40:43], %[v_os_b2], s[12:15], 0 offen offset:2048 \n" +" buffer_load_dwordx4 acc[44:47], %[v_os_b2], s[12:15], 0 offen offset:3072 \n" +" buffer_load_dwordx4 acc[48:51], %[v_os_b3], s[12:15], 0 offen \n" +" 
buffer_load_dwordx4 acc[52:55], %[v_os_b3], s[12:15], 0 offen offset:1024 \n" +" buffer_load_dwordx4 acc[56:59], %[v_os_b3], s[12:15], 0 offen offset:2048 \n" +" buffer_load_dwordx4 acc[60:63], %[v_os_b3], s[12:15], 0 offen offset:3072 \n" +" buffer_load_dwordx4 acc[64:67], %[v_os_b4], s[12:15], 0 offen \n" +" buffer_load_dwordx4 acc[68:71], %[v_os_b4], s[12:15], 0 offen offset:1024 \n" +" buffer_load_dwordx4 acc[72:75], %[v_os_b4], s[12:15], 0 offen offset:2048 \n" +" buffer_load_dwordx4 acc[76:79], %[v_os_b4], s[12:15], 0 offen offset:3072 \n" +" buffer_load_dwordx4 acc[80:83], %[v_os_b5], s[12:15], 0 offen \n" +" buffer_load_dwordx4 acc[84:87], %[v_os_b5], s[12:15], 0 offen offset:1024 \n" +" buffer_load_dwordx4 acc[88:91], %[v_os_b5], s[12:15], 0 offen offset:2048 \n" +" buffer_load_dwordx4 acc[92:95], %[v_os_b5], s[12:15], 0 offen offset:3072 \n" +" buffer_load_dwordx4 acc[96:99], %[v_os_b6], s[12:15], 0 offen \n" +" buffer_load_dwordx4 acc[100:103], %[v_os_b6], s[12:15], 0 offen offset:1024 \n" +" buffer_load_dwordx4 acc[104:107], %[v_os_b6], s[12:15], 0 offen offset:2048 \n" +" buffer_load_dwordx4 acc[108:111], %[v_os_b6], s[12:15], 0 offen offset:3072 \n" +" buffer_load_dwordx4 acc[112:115], %[v_os_b7], s[12:15], 0 offen \n" +" buffer_load_dwordx4 acc[116:119], %[v_os_b7], s[12:15], 0 offen offset:1024 \n" +" buffer_load_dwordx4 acc[120:123], %[v_os_b7], s[12:15], 0 offen offset:2048 \n" +" buffer_load_dwordx4 acc[124:127], %[v_os_b7], s[12:15], 0 offen offset:3072 \n" +" s_cmp_gt_i32 %[s_loop_cnt] 1 ; move b with cond \n" +" s_cselect_b32 s86, %[s_tile_os_b], 0 \n" +" s_add_u32 s12, s86, s12 \n" +" s_addc_u32 s13, 0, s13 \n" +" s_waitcnt 0 \n" +"L_start%=: \n" +" s_waitcnt vmcnt(32) \n" +" s_barrier \n" +_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[0:1], v[128:129], 0 \n" +" buffer_load_dwordx4 acc[128:131], %[v_os_b0], s[12:15], 0 offen \n" +_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[2:3], v[130:131], [%[c0], %[c1], %[c2], %[c3]] \n" +_UK_MFMA_ " 
[%[c0], %[c1], %[c2], %[c3]], acc[4:5], v[132:133], [%[c0], %[c1], %[c2], %[c3]] \n" +_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[6:7], v[134:135], [%[c0], %[c1], %[c2], %[c3]] \n" +_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[8:9], v[136:137], [%[c0], %[c1], %[c2], %[c3]] \n" +" buffer_load_dwordx4 acc[132:135], %[v_os_b0], s[12:15], 0 offen offset:1024 \n" +_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[10:11], v[138:139], [%[c0], %[c1], %[c2], %[c3]] \n" +_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[12:13], v[140:141], [%[c0], %[c1], %[c2], %[c3]] \n" +_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[14:15], v[142:143], [%[c0], %[c1], %[c2], %[c3]] \n" +_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[0:1], v[192:193], 0 \n" +" buffer_load_dwordx4 acc[136:139], %[v_os_b0], s[12:15], 0 offen offset:2048 \n" +_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[2:3], v[194:195], [%[c4], %[c5], %[c6], %[c7]] \n" +_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[4:5], v[196:197], [%[c4], %[c5], %[c6], %[c7]] \n" +_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[6:7], v[198:199], [%[c4], %[c5], %[c6], %[c7]] \n" +_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[8:9], v[200:201], [%[c4], %[c5], %[c6], %[c7]] \n" +" buffer_load_dwordx4 acc[140:143], %[v_os_b0], s[12:15], 0 offen offset:3072 \n" +_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[10:11], v[202:203], [%[c4], %[c5], %[c6], %[c7]] \n" +_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[12:13], v[204:205], [%[c4], %[c5], %[c6], %[c7]] \n" +_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[14:15], v[206:207], [%[c4], %[c5], %[c6], %[c7]] \n" +_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[16:17], v[128:129], 0 \n" +" buffer_load_dwordx4 acc[144:147], %[v_os_b1], s[12:15], 0 offen \n" +_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[18:19], v[130:131], [%[c8], %[c9], %[c10], %[c11]] \n" +_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[20:21], v[132:133], [%[c8], %[c9], %[c10], %[c11]] \n" +_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], 
acc[22:23], v[134:135], [%[c8], %[c9], %[c10], %[c11]] \n" +_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[24:25], v[136:137], [%[c8], %[c9], %[c10], %[c11]] \n" +" buffer_load_dwordx4 acc[148:151], %[v_os_b1], s[12:15], 0 offen offset:1024 \n" +_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[26:27], v[138:139], [%[c8], %[c9], %[c10], %[c11]] \n" +_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[28:29], v[140:141], [%[c8], %[c9], %[c10], %[c11]] \n" +_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[30:31], v[142:143], [%[c8], %[c9], %[c10], %[c11]] \n" +_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[16:17], v[192:193], 0 \n" +" buffer_load_dwordx4 acc[152:155], %[v_os_b1], s[12:15], 0 offen offset:2048 \n" +_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[18:19], v[194:195], [%[c12], %[c13], %[c14], %[c15]] \n" +_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[20:21], v[196:197], [%[c12], %[c13], %[c14], %[c15]] \n" +_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[22:23], v[198:199], [%[c12], %[c13], %[c14], %[c15]] \n" +_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[24:25], v[200:201], [%[c12], %[c13], %[c14], %[c15]] \n" +" buffer_load_dwordx4 acc[156:159], %[v_os_b1], s[12:15], 0 offen offset:3072 \n" +_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[26:27], v[202:203], [%[c12], %[c13], %[c14], %[c15]] \n" +_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[28:29], v[204:205], [%[c12], %[c13], %[c14], %[c15]] \n" +_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[30:31], v[206:207], [%[c12], %[c13], %[c14], %[c15]] \n" +" s_waitcnt vmcnt(32) \n" +_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[32:33], v[144:145], [%[c0], %[c1], %[c2], %[c3]] \n" +" buffer_load_dwordx4 acc[160:163], %[v_os_b2], s[12:15], 0 offen \n" +_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[34:35], v[146:147], [%[c0], %[c1], %[c2], %[c3]] \n" +_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[36:37], v[148:149], [%[c0], %[c1], %[c2], %[c3]] \n" +_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], 
acc[38:39], v[150:151], [%[c0], %[c1], %[c2], %[c3]] \n" +_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[40:41], v[152:153], [%[c0], %[c1], %[c2], %[c3]] \n" +" buffer_load_dwordx4 acc[164:167], %[v_os_b2], s[12:15], 0 offen offset:1024 \n" +_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[42:43], v[154:155], [%[c0], %[c1], %[c2], %[c3]] \n" +_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[44:45], v[156:157], [%[c0], %[c1], %[c2], %[c3]] \n" +_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[46:47], v[158:159], [%[c0], %[c1], %[c2], %[c3]] \n" +_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[32:33], v[208:209], [%[c4], %[c5], %[c6], %[c7]] \n" +" buffer_load_dwordx4 acc[168:171], %[v_os_b2], s[12:15], 0 offen offset:2048 \n" +_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[34:35], v[210:211], [%[c4], %[c5], %[c6], %[c7]] \n" +_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[36:37], v[212:213], [%[c4], %[c5], %[c6], %[c7]] \n" +_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[38:39], v[214:215], [%[c4], %[c5], %[c6], %[c7]] \n" +_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[40:41], v[216:217], [%[c4], %[c5], %[c6], %[c7]] \n" +" buffer_load_dwordx4 acc[172:175], %[v_os_b2], s[12:15], 0 offen offset:3072 \n" +_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[42:43], v[218:219], [%[c4], %[c5], %[c6], %[c7]] \n" +_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[44:45], v[220:221], [%[c4], %[c5], %[c6], %[c7]] \n" +_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[46:47], v[222:223], [%[c4], %[c5], %[c6], %[c7]] \n" +_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[48:49], v[144:145], [%[c8], %[c9], %[c10], %[c11]] \n" +" buffer_load_dwordx4 acc[176:179], %[v_os_b3], s[12:15], 0 offen \n" +_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[50:51], v[146:147], [%[c8], %[c9], %[c10], %[c11]] \n" +_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[52:53], v[148:149], [%[c8], %[c9], %[c10], %[c11]] \n" +_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[54:55], v[150:151], [%[c8], %[c9], %[c10], %[c11]] \n" 
+_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[56:57], v[152:153], [%[c8], %[c9], %[c10], %[c11]] \n" +" buffer_load_dwordx4 acc[180:183], %[v_os_b3], s[12:15], 0 offen offset:1024 \n" +_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[58:59], v[154:155], [%[c8], %[c9], %[c10], %[c11]] \n" +_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[60:61], v[156:157], [%[c8], %[c9], %[c10], %[c11]] \n" +_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[62:63], v[158:159], [%[c8], %[c9], %[c10], %[c11]] \n" +_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[48:49], v[208:209], [%[c12], %[c13], %[c14], %[c15]] \n" +" buffer_load_dwordx4 acc[184:187], %[v_os_b3], s[12:15], 0 offen offset:2048 \n" +_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[50:51], v[210:211], [%[c12], %[c13], %[c14], %[c15]] \n" +_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[52:53], v[212:213], [%[c12], %[c13], %[c14], %[c15]] \n" +_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[54:55], v[214:215], [%[c12], %[c13], %[c14], %[c15]] \n" +_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[56:57], v[216:217], [%[c12], %[c13], %[c14], %[c15]] \n" +" buffer_load_dwordx4 acc[188:191], %[v_os_b3], s[12:15], 0 offen offset:3072 \n" +_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[58:59], v[218:219], [%[c12], %[c13], %[c14], %[c15]] \n" +_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[60:61], v[220:221], [%[c12], %[c13], %[c14], %[c15]] \n" +_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[62:63], v[222:223], [%[c12], %[c13], %[c14], %[c15]] \n" +" s_waitcnt vmcnt(32) \n" +_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[64:65], v[160:161], [%[c0], %[c1], %[c2], %[c3]] \n" +" buffer_load_dwordx4 acc[192:195], %[v_os_b4], s[12:15], 0 offen \n" +_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[66:67], v[162:163], [%[c0], %[c1], %[c2], %[c3]] \n" +_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[68:69], v[164:165], [%[c0], %[c1], %[c2], %[c3]] \n" +_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[70:71], v[166:167], 
[%[c0], %[c1], %[c2], %[c3]] \n" +_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[72:73], v[168:169], [%[c0], %[c1], %[c2], %[c3]] \n" +" buffer_load_dwordx4 acc[196:199], %[v_os_b4], s[12:15], 0 offen offset:1024 \n" +_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[74:75], v[170:171], [%[c0], %[c1], %[c2], %[c3]] \n" +_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[76:77], v[172:173], [%[c0], %[c1], %[c2], %[c3]] \n" +_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[78:79], v[174:175], [%[c0], %[c1], %[c2], %[c3]] \n" +_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[64:65], v[224:225], [%[c4], %[c5], %[c6], %[c7]] \n" +" buffer_load_dwordx4 acc[200:203], %[v_os_b4], s[12:15], 0 offen offset:2048 \n" +_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[66:67], v[226:227], [%[c4], %[c5], %[c6], %[c7]] \n" +_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[68:69], v[228:229], [%[c4], %[c5], %[c6], %[c7]] \n" +_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[70:71], v[230:231], [%[c4], %[c5], %[c6], %[c7]] \n" +_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[72:73], v[232:233], [%[c4], %[c5], %[c6], %[c7]] \n" +" buffer_load_dwordx4 acc[204:207], %[v_os_b4], s[12:15], 0 offen offset:3072 \n" +_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[74:75], v[234:235], [%[c4], %[c5], %[c6], %[c7]] \n" +_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[76:77], v[236:237], [%[c4], %[c5], %[c6], %[c7]] \n" +_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[78:79], v[238:239], [%[c4], %[c5], %[c6], %[c7]] \n" +_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[80:81], v[160:161], [%[c8], %[c9], %[c10], %[c11]] \n" +" buffer_load_dwordx4 acc[208:211], %[v_os_b5], s[12:15], 0 offen \n" +_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[82:83], v[162:163], [%[c8], %[c9], %[c10], %[c11]] \n" +_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[84:85], v[164:165], [%[c8], %[c9], %[c10], %[c11]] \n" +_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[86:87], v[166:167], [%[c8], %[c9], %[c10], %[c11]] \n" +_UK_MFMA_ " [%[c8], %[c9], 
%[c10], %[c11]], acc[88:89], v[168:169], [%[c8], %[c9], %[c10], %[c11]] \n" +" buffer_load_dwordx4 acc[212:215], %[v_os_b5], s[12:15], 0 offen offset:1024 \n" +_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[90:91], v[170:171], [%[c8], %[c9], %[c10], %[c11]] \n" +_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[92:93], v[172:173], [%[c8], %[c9], %[c10], %[c11]] \n" +_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[94:95], v[174:175], [%[c8], %[c9], %[c10], %[c11]] \n" +_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[80:81], v[224:225], [%[c12], %[c13], %[c14], %[c15]] \n" +" buffer_load_dwordx4 acc[216:219], %[v_os_b5], s[12:15], 0 offen offset:2048 \n" +_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[82:83], v[226:227], [%[c12], %[c13], %[c14], %[c15]] \n" +_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[84:85], v[228:229], [%[c12], %[c13], %[c14], %[c15]] \n" +_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[86:87], v[230:231], [%[c12], %[c13], %[c14], %[c15]] \n" +_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[88:89], v[232:233], [%[c12], %[c13], %[c14], %[c15]] \n" +" buffer_load_dwordx4 acc[220:223], %[v_os_b5], s[12:15], 0 offen offset:3072 \n" +_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[90:91], v[234:235], [%[c12], %[c13], %[c14], %[c15]] \n" +_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[92:93], v[236:237], [%[c12], %[c13], %[c14], %[c15]] \n" +_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[94:95], v[238:239], [%[c12], %[c13], %[c14], %[c15]] \n" +" s_waitcnt vmcnt(32) \n" +_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[96:97], v[176:177], [%[c0], %[c1], %[c2], %[c3]] \n" +" buffer_load_dwordx4 acc[224:227], %[v_os_b6], s[12:15], 0 offen \n" +_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[98:99], v[178:179], [%[c0], %[c1], %[c2], %[c3]] \n" +_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[100:101], v[180:181], [%[c0], %[c1], %[c2], %[c3]] \n" +_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[102:103], v[182:183], [%[c0], %[c1], %[c2], %[c3]] 
\n" +_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[104:105], v[184:185], [%[c0], %[c1], %[c2], %[c3]] \n" +" buffer_load_dwordx4 acc[228:231], %[v_os_b6], s[12:15], 0 offen offset:1024 \n" +_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[106:107], v[186:187], [%[c0], %[c1], %[c2], %[c3]] \n" +_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[108:109], v[188:189], [%[c0], %[c1], %[c2], %[c3]] \n" +_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[110:111], v[190:191], [%[c0], %[c1], %[c2], %[c3]] \n" +_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[96:97], v[240:241], [%[c4], %[c5], %[c6], %[c7]] \n" +" buffer_load_dwordx4 acc[232:235], %[v_os_b6], s[12:15], 0 offen offset:2048 \n" +_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[98:99], v[242:243], [%[c4], %[c5], %[c6], %[c7]] \n" +_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[100:101], v[244:245], [%[c4], %[c5], %[c6], %[c7]] \n" +_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[102:103], v[246:247], [%[c4], %[c5], %[c6], %[c7]] \n" +_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[104:105], v[248:249], [%[c4], %[c5], %[c6], %[c7]] \n" +" buffer_load_dwordx4 acc[236:239], %[v_os_b6], s[12:15], 0 offen offset:3072 \n" +_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[106:107], v[250:251], [%[c4], %[c5], %[c6], %[c7]] \n" +_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[108:109], v[252:253], [%[c4], %[c5], %[c6], %[c7]] \n" +_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[110:111], v[254:255], [%[c4], %[c5], %[c6], %[c7]] \n" +_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[112:113], v[176:177], [%[c8], %[c9], %[c10], %[c11]] \n" +" buffer_load_dwordx4 acc[240:243], %[v_os_b7], s[12:15], 0 offen \n" +_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[114:115], v[178:179], [%[c8], %[c9], %[c10], %[c11]] \n" +_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[116:117], v[180:181], [%[c8], %[c9], %[c10], %[c11]] \n" +_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[118:119], v[182:183], [%[c8], %[c9], %[c10], %[c11]] \n" +_UK_MFMA_ " [%[c8], %[c9], 
%[c10], %[c11]], acc[120:121], v[184:185], [%[c8], %[c9], %[c10], %[c11]] \n" +" buffer_load_dwordx4 acc[244:247], %[v_os_b7], s[12:15], 0 offen offset:1024 \n" +_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[122:123], v[186:187], [%[c8], %[c9], %[c10], %[c11]] \n" +_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[124:125], v[188:189], [%[c8], %[c9], %[c10], %[c11]] \n" +_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[126:127], v[190:191], [%[c8], %[c9], %[c10], %[c11]] \n" +_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[112:113], v[240:241], [%[c12], %[c13], %[c14], %[c15]] \n" +" buffer_load_dwordx4 acc[248:251], %[v_os_b7], s[12:15], 0 offen offset:2048 \n" +_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[114:115], v[242:243], [%[c12], %[c13], %[c14], %[c15]] \n" +_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[116:117], v[244:245], [%[c12], %[c13], %[c14], %[c15]] \n" +_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[118:119], v[246:247], [%[c12], %[c13], %[c14], %[c15]] \n" +_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[120:121], v[248:249], [%[c12], %[c13], %[c14], %[c15]] \n" +" buffer_load_dwordx4 acc[252:255], %[v_os_b7], s[12:15], 0 offen offset:3072 \n" +_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[122:123], v[250:251], [%[c12], %[c13], %[c14], %[c15]] \n" +_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[124:125], v[252:253], [%[c12], %[c13], %[c14], %[c15]] \n" +_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[126:127], v[254:255], [%[c12], %[c13], %[c14], %[c15]]\n" +" v_mul_f32 %[c0], %[scale_0], %[c0] \n" +" v_mul_f32 %[c1], %[scale_0], %[c1] \n" +" v_mul_f32 %[c2], %[scale_0], %[c2] \n" +" v_mul_f32 %[c3], %[scale_0], %[c3] \n" +" v_mul_f32 %[c4], %[scale_1], %[c4] \n" +" v_mul_f32 %[c5], %[scale_1], %[c5] \n" +" v_mul_f32 %[c6], %[scale_1], %[c6] \n" +" v_mul_f32 %[c7], %[scale_1], %[c7] \n" +" v_mul_f32 %[c8], %[scale_0], %[c8] \n" +" v_mul_f32 %[c9], %[scale_0], %[c9] \n" +" v_mul_f32 %[c10], %[scale_0], %[c10] \n" +" 
v_mul_f32 %[c11], %[scale_0], %[c11] \n" +" v_mul_f32 %[c12], %[scale_1], %[c12] \n" +" v_mul_f32 %[c13], %[scale_1], %[c13] \n" +" v_mul_f32 %[c14], %[scale_1], %[c14] \n" +" v_mul_f32 %[c15], %[scale_1], %[c15] \n" +_UK_PK_CVT_("%[c0]", "%[c1]", "%[c0]") +_UK_PK_CVT_("%[c2]", "%[c3]", "%[c1]") +_UK_PK_CVT_("%[c4]", "%[c5]", "%[c2]") +_UK_PK_CVT_("%[c6]", "%[c7]", "%[c3]") +_UK_PK_CVT_("%[c8]", "%[c9]", "%[c4]") +_UK_PK_CVT_("%[c10]", "%[c11]", "%[c5]") +_UK_PK_CVT_("%[c12]", "%[c13]", "%[c6]") +_UK_PK_CVT_("%[c14]", "%[c15]", "%[c7]") +" ;------------------------------ \n" +" ds_write_b64 %[v_sfl_sst], [%[c0],%[c1]] offset:0 + %[shfl_base] \n" +" ds_write_b64 %[v_sfl_sst], [%[c2],%[c3]] offset:4352 + %[shfl_base] \n" +" ds_write_b64 %[v_sfl_sst], [%[c4],%[c5]] offset:2176 + %[shfl_base] \n" +" ds_write_b64 %[v_sfl_sst], [%[c6],%[c7]] offset:6528 + %[shfl_base] \n" +" s_waitcnt lgkmcnt(0) \n" +" s_barrier \n" +" ds_read_b32 %[c0], %[v_sfl_sld] offset:0 + %[shfl_base] \n" +" ds_read_b32 %[c1], %[v_sfl_sld] offset:32 + %[shfl_base] \n" +" ds_read_b32 %[c2], %[v_sfl_sld] offset:64 + %[shfl_base] \n" +" ds_read_b32 %[c3], %[v_sfl_sld] offset:96 + %[shfl_base] \n" +" ds_read_b32 %[c4], %[v_sfl_sld] offset:4352 + %[shfl_base] \n" +" ds_read_b32 %[c5], %[v_sfl_sld] offset:4384 + %[shfl_base] \n" +" ds_read_b32 %[c6], %[v_sfl_sld] offset:4416 + %[shfl_base] \n" +" ds_read_b32 %[c7], %[v_sfl_sld] offset:4448 + %[shfl_base] \n" +" s_waitcnt lgkmcnt(0) \n" +" s_mov_b64 exec, %[s_execflag_0] \n" +_UK_ATOMIC_ADD_ " %[v_os_o0], %[c0], s[8:9] \n" +" s_mov_b64 exec, %[s_execflag_1] \n" +_UK_ATOMIC_ADD_ " %[v_os_o1], %[c1], s[8:9] \n" +" s_mov_b64 exec, %[s_execflag_2] \n" +_UK_ATOMIC_ADD_ " %[v_os_o2], %[c2], s[8:9] \n" +" s_mov_b64 exec, %[s_execflag_3] \n" +_UK_ATOMIC_ADD_ " %[v_os_o3], %[c3], s[8:9] \n" +" s_mov_b64 exec, %[s_execflag_4] \n" +_UK_ATOMIC_ADD_ " %[v_os_o4], %[c4], s[8:9] \n" +" s_mov_b64 exec, %[s_execflag_5] \n" +_UK_ATOMIC_ADD_ " %[v_os_o5], %[c5], s[8:9] \n" 
+" s_mov_b64 exec, %[s_execflag_6] \n" +_UK_ATOMIC_ADD_ " %[v_os_o6], %[c6], s[8:9] \n" +" s_mov_b64 exec, %[s_execflag_7] \n" +_UK_ATOMIC_ADD_ " %[v_os_o7], %[c7], s[8:9] \n" +" s_mov_b64 exec, s[38:39] \n" +" s_sub_i32 %[s_loop_cnt], %[s_loop_cnt], 1 ; k-- \n" +" s_cmp_gt_i32 %[s_loop_cnt] 0 \n" +" s_cbranch_scc0 L_end%= \n" +" s_cmp_gt_i32 %[s_loop_cnt] 1 ; move b with cond \n" +" s_cselect_b32 s86, %[s_tile_os_b], 0 \n" +" s_add_u32 s12, s86, s12 \n" +" s_addc_u32 s13, 0, s13 \n" +" s_add_u32 s8, %[s_tile_os_o], s8 \n" +" s_addc_u32 s9, 0, s9 \n" +" s_waitcnt vmcnt(32) \n" +" s_barrier \n" +_UK_MFMA_ " [%[c16],%[c17],%[c18],%[c19]], acc[128:129], v[128:129], 0 \n" +" buffer_load_dwordx4 acc[0:3], %[v_os_b0], s[12:15], 0 offen \n" +_UK_MFMA_ " [%[c16],%[c17],%[c18],%[c19]], acc[130:131], v[130:131], [%[c16],%[c17],%[c18],%[c19]] \n" +_UK_MFMA_ " [%[c16],%[c17],%[c18],%[c19]], acc[132:133], v[132:133], [%[c16],%[c17],%[c18],%[c19]] \n" +_UK_MFMA_ " [%[c16],%[c17],%[c18],%[c19]], acc[134:135], v[134:135], [%[c16],%[c17],%[c18],%[c19]] \n" +_UK_MFMA_ " [%[c16],%[c17],%[c18],%[c19]], acc[136:137], v[136:137], [%[c16],%[c17],%[c18],%[c19]] \n" +" buffer_load_dwordx4 acc[4:7], %[v_os_b0], s[12:15], 0 offen offset:1024 \n" +_UK_MFMA_ " [%[c16],%[c17],%[c18],%[c19]], acc[138:139], v[138:139], [%[c16],%[c17],%[c18],%[c19]] \n" +_UK_MFMA_ " [%[c16],%[c17],%[c18],%[c19]], acc[140:141], v[140:141], [%[c16],%[c17],%[c18],%[c19]] \n" +_UK_MFMA_ " [%[c16],%[c17],%[c18],%[c19]], acc[142:143], v[142:143], [%[c16],%[c17],%[c18],%[c19]] \n" +_UK_MFMA_ " [%[c20],%[c21],%[c22],%[c23]], acc[128:129], v[192:193], 0 \n" +" buffer_load_dwordx4 acc[8:11], %[v_os_b0], s[12:15], 0 offen offset:2048 \n" +_UK_MFMA_ " [%[c20],%[c21],%[c22],%[c23]], acc[130:131], v[194:195], [%[c20],%[c21],%[c22],%[c23]] \n" +_UK_MFMA_ " [%[c20],%[c21],%[c22],%[c23]], acc[132:133], v[196:197], [%[c20],%[c21],%[c22],%[c23]] \n" +_UK_MFMA_ " [%[c20],%[c21],%[c22],%[c23]], acc[134:135], v[198:199], 
[%[c20],%[c21],%[c22],%[c23]] \n" +_UK_MFMA_ " [%[c20],%[c21],%[c22],%[c23]], acc[136:137], v[200:201], [%[c20],%[c21],%[c22],%[c23]] \n" +" buffer_load_dwordx4 acc[12:15], %[v_os_b0], s[12:15], 0 offen offset:3072 \n" +_UK_MFMA_ " [%[c20],%[c21],%[c22],%[c23]], acc[138:139], v[202:203], [%[c20],%[c21],%[c22],%[c23]] \n" +_UK_MFMA_ " [%[c20],%[c21],%[c22],%[c23]], acc[140:141], v[204:205], [%[c20],%[c21],%[c22],%[c23]] \n" +_UK_MFMA_ " [%[c20],%[c21],%[c22],%[c23]], acc[142:143], v[206:207], [%[c20],%[c21],%[c22],%[c23]] \n" +_UK_MFMA_ " [%[c24],%[c25],%[c26],%[c27]], acc[144:145], v[128:129], 0 \n" +" buffer_load_dwordx4 acc[16:19], %[v_os_b1], s[12:15], 0 offen \n" +_UK_MFMA_ " [%[c24],%[c25],%[c26],%[c27]], acc[146:147], v[130:131], [%[c24],%[c25],%[c26],%[c27]] \n" +_UK_MFMA_ " [%[c24],%[c25],%[c26],%[c27]], acc[148:149], v[132:133], [%[c24],%[c25],%[c26],%[c27]] \n" +_UK_MFMA_ " [%[c24],%[c25],%[c26],%[c27]], acc[150:151], v[134:135], [%[c24],%[c25],%[c26],%[c27]] \n" +_UK_MFMA_ " [%[c24],%[c25],%[c26],%[c27]], acc[152:153], v[136:137], [%[c24],%[c25],%[c26],%[c27]] \n" +" buffer_load_dwordx4 acc[20:23], %[v_os_b1], s[12:15], 0 offen offset:1024 \n" +_UK_MFMA_ " [%[c24],%[c25],%[c26],%[c27]], acc[154:155], v[138:139], [%[c24],%[c25],%[c26],%[c27]] \n" +_UK_MFMA_ " [%[c24],%[c25],%[c26],%[c27]], acc[156:157], v[140:141], [%[c24],%[c25],%[c26],%[c27]] \n" +_UK_MFMA_ " [%[c24],%[c25],%[c26],%[c27]], acc[158:159], v[142:143], [%[c24],%[c25],%[c26],%[c27]] \n" +_UK_MFMA_ " [%[c28],%[c29],%[c30],%[c31]], acc[144:145], v[192:193], 0 \n" +" buffer_load_dwordx4 acc[24:27], %[v_os_b1], s[12:15], 0 offen offset:2048 \n" +_UK_MFMA_ " [%[c28],%[c29],%[c30],%[c31]], acc[146:147], v[194:195], [%[c28],%[c29],%[c30],%[c31]] \n" +_UK_MFMA_ " [%[c28],%[c29],%[c30],%[c31]], acc[148:149], v[196:197], [%[c28],%[c29],%[c30],%[c31]] \n" +_UK_MFMA_ " [%[c28],%[c29],%[c30],%[c31]], acc[150:151], v[198:199], [%[c28],%[c29],%[c30],%[c31]] \n" +_UK_MFMA_ " [%[c28],%[c29],%[c30],%[c31]], 
acc[152:153], v[200:201], [%[c28],%[c29],%[c30],%[c31]] \n" +" buffer_load_dwordx4 acc[28:31], %[v_os_b1], s[12:15], 0 offen offset:3072 \n" +_UK_MFMA_ " [%[c28],%[c29],%[c30],%[c31]], acc[154:155], v[202:203], [%[c28],%[c29],%[c30],%[c31]] \n" +_UK_MFMA_ " [%[c28],%[c29],%[c30],%[c31]], acc[156:157], v[204:205], [%[c28],%[c29],%[c30],%[c31]] \n" +_UK_MFMA_ " [%[c28],%[c29],%[c30],%[c31]], acc[158:159], v[206:207], [%[c28],%[c29],%[c30],%[c31]] \n" +" s_waitcnt vmcnt(32) \n" +_UK_MFMA_ " [%[c16],%[c17],%[c18],%[c19]], acc[160:161], v[144:145], [%[c16],%[c17],%[c18],%[c19]] \n" +" buffer_load_dwordx4 acc[32:35], %[v_os_b2], s[12:15], 0 offen \n" +_UK_MFMA_ " [%[c16],%[c17],%[c18],%[c19]], acc[162:163], v[146:147], [%[c16],%[c17],%[c18],%[c19]] \n" +_UK_MFMA_ " [%[c16],%[c17],%[c18],%[c19]], acc[164:165], v[148:149], [%[c16],%[c17],%[c18],%[c19]] \n" +_UK_MFMA_ " [%[c16],%[c17],%[c18],%[c19]], acc[166:167], v[150:151], [%[c16],%[c17],%[c18],%[c19]] \n" +_UK_MFMA_ " [%[c16],%[c17],%[c18],%[c19]], acc[168:169], v[152:153], [%[c16],%[c17],%[c18],%[c19]] \n" +" buffer_load_dwordx4 acc[36:39], %[v_os_b2], s[12:15], 0 offen offset:1024 \n" +_UK_MFMA_ " [%[c16],%[c17],%[c18],%[c19]], acc[170:171], v[154:155], [%[c16],%[c17],%[c18],%[c19]] \n" +_UK_MFMA_ " [%[c16],%[c17],%[c18],%[c19]], acc[172:173], v[156:157], [%[c16],%[c17],%[c18],%[c19]] \n" +_UK_MFMA_ " [%[c16],%[c17],%[c18],%[c19]], acc[174:175], v[158:159], [%[c16],%[c17],%[c18],%[c19]] \n" +_UK_MFMA_ " [%[c20],%[c21],%[c22],%[c23]], acc[160:161], v[208:209], [%[c20],%[c21],%[c22],%[c23]] \n" +" buffer_load_dwordx4 acc[40:43], %[v_os_b2], s[12:15], 0 offen offset:2048 \n" +_UK_MFMA_ " [%[c20],%[c21],%[c22],%[c23]], acc[162:163], v[210:211], [%[c20],%[c21],%[c22],%[c23]] \n" +_UK_MFMA_ " [%[c20],%[c21],%[c22],%[c23]], acc[164:165], v[212:213], [%[c20],%[c21],%[c22],%[c23]] \n" +_UK_MFMA_ " [%[c20],%[c21],%[c22],%[c23]], acc[166:167], v[214:215], [%[c20],%[c21],%[c22],%[c23]] \n" +_UK_MFMA_ " 
[%[c20],%[c21],%[c22],%[c23]], acc[168:169], v[216:217], [%[c20],%[c21],%[c22],%[c23]] \n" +" buffer_load_dwordx4 acc[44:47], %[v_os_b2], s[12:15], 0 offen offset:3072 \n" +_UK_MFMA_ " [%[c20],%[c21],%[c22],%[c23]], acc[170:171], v[218:219], [%[c20],%[c21],%[c22],%[c23]] \n" +_UK_MFMA_ " [%[c20],%[c21],%[c22],%[c23]], acc[172:173], v[220:221], [%[c20],%[c21],%[c22],%[c23]] \n" +_UK_MFMA_ " [%[c20],%[c21],%[c22],%[c23]], acc[174:175], v[222:223], [%[c20],%[c21],%[c22],%[c23]] \n" +_UK_MFMA_ " [%[c24],%[c25],%[c26],%[c27]], acc[176:177], v[144:145], [%[c24],%[c25],%[c26],%[c27]] \n" +" buffer_load_dwordx4 acc[48:51], %[v_os_b3], s[12:15], 0 offen \n" +_UK_MFMA_ " [%[c24],%[c25],%[c26],%[c27]], acc[178:179], v[146:147], [%[c24],%[c25],%[c26],%[c27]] \n" +_UK_MFMA_ " [%[c24],%[c25],%[c26],%[c27]], acc[180:181], v[148:149], [%[c24],%[c25],%[c26],%[c27]] \n" +_UK_MFMA_ " [%[c24],%[c25],%[c26],%[c27]], acc[182:183], v[150:151], [%[c24],%[c25],%[c26],%[c27]] \n" +_UK_MFMA_ " [%[c24],%[c25],%[c26],%[c27]], acc[184:185], v[152:153], [%[c24],%[c25],%[c26],%[c27]] \n" +" buffer_load_dwordx4 acc[52:55], %[v_os_b3], s[12:15], 0 offen offset:1024 \n" +_UK_MFMA_ " [%[c24],%[c25],%[c26],%[c27]], acc[186:187], v[154:155], [%[c24],%[c25],%[c26],%[c27]] \n" +_UK_MFMA_ " [%[c24],%[c25],%[c26],%[c27]], acc[188:189], v[156:157], [%[c24],%[c25],%[c26],%[c27]] \n" +_UK_MFMA_ " [%[c24],%[c25],%[c26],%[c27]], acc[190:191], v[158:159], [%[c24],%[c25],%[c26],%[c27]] \n" +_UK_MFMA_ " [%[c28],%[c29],%[c30],%[c31]], acc[176:177], v[208:209], [%[c28],%[c29],%[c30],%[c31]] \n" +" buffer_load_dwordx4 acc[56:59], %[v_os_b3], s[12:15], 0 offen offset:2048 \n" +_UK_MFMA_ " [%[c28],%[c29],%[c30],%[c31]], acc[178:179], v[210:211], [%[c28],%[c29],%[c30],%[c31]] \n" +_UK_MFMA_ " [%[c28],%[c29],%[c30],%[c31]], acc[180:181], v[212:213], [%[c28],%[c29],%[c30],%[c31]] \n" +_UK_MFMA_ " [%[c28],%[c29],%[c30],%[c31]], acc[182:183], v[214:215], [%[c28],%[c29],%[c30],%[c31]] \n" +_UK_MFMA_ " 
[%[c28],%[c29],%[c30],%[c31]], acc[184:185], v[216:217], [%[c28],%[c29],%[c30],%[c31]] \n" +" buffer_load_dwordx4 acc[60:63], %[v_os_b3], s[12:15], 0 offen offset:3072 \n" +_UK_MFMA_ " [%[c28],%[c29],%[c30],%[c31]], acc[186:187], v[218:219], [%[c28],%[c29],%[c30],%[c31]] \n" +_UK_MFMA_ " [%[c28],%[c29],%[c30],%[c31]], acc[188:189], v[220:221], [%[c28],%[c29],%[c30],%[c31]] \n" +_UK_MFMA_ " [%[c28],%[c29],%[c30],%[c31]], acc[190:191], v[222:223], [%[c28],%[c29],%[c30],%[c31]] \n" +" s_waitcnt vmcnt(32) \n" +_UK_MFMA_ " [%[c16],%[c17],%[c18],%[c19]], acc[192:193], v[160:161], [%[c16],%[c17],%[c18],%[c19]] \n" +" buffer_load_dwordx4 acc[64:67], %[v_os_b4], s[12:15], 0 offen \n" +_UK_MFMA_ " [%[c16],%[c17],%[c18],%[c19]], acc[194:195], v[162:163], [%[c16],%[c17],%[c18],%[c19]] \n" +_UK_MFMA_ " [%[c16],%[c17],%[c18],%[c19]], acc[196:197], v[164:165], [%[c16],%[c17],%[c18],%[c19]] \n" +_UK_MFMA_ " [%[c16],%[c17],%[c18],%[c19]], acc[198:199], v[166:167], [%[c16],%[c17],%[c18],%[c19]] \n" +_UK_MFMA_ " [%[c16],%[c17],%[c18],%[c19]], acc[200:201], v[168:169], [%[c16],%[c17],%[c18],%[c19]] \n" +" buffer_load_dwordx4 acc[68:71], %[v_os_b4], s[12:15], 0 offen offset:1024 \n" +_UK_MFMA_ " [%[c16],%[c17],%[c18],%[c19]], acc[202:203], v[170:171], [%[c16],%[c17],%[c18],%[c19]] \n" +_UK_MFMA_ " [%[c16],%[c17],%[c18],%[c19]], acc[204:205], v[172:173], [%[c16],%[c17],%[c18],%[c19]] \n" +_UK_MFMA_ " [%[c16],%[c17],%[c18],%[c19]], acc[206:207], v[174:175], [%[c16],%[c17],%[c18],%[c19]] \n" +_UK_MFMA_ " [%[c20],%[c21],%[c22],%[c23]], acc[192:193], v[224:225], [%[c20],%[c21],%[c22],%[c23]] \n" +" buffer_load_dwordx4 acc[72:75], %[v_os_b4], s[12:15], 0 offen offset:2048 \n" +_UK_MFMA_ " [%[c20],%[c21],%[c22],%[c23]], acc[194:195], v[226:227], [%[c20],%[c21],%[c22],%[c23]] \n" +_UK_MFMA_ " [%[c20],%[c21],%[c22],%[c23]], acc[196:197], v[228:229], [%[c20],%[c21],%[c22],%[c23]] \n" +_UK_MFMA_ " [%[c20],%[c21],%[c22],%[c23]], acc[198:199], v[230:231], [%[c20],%[c21],%[c22],%[c23]] \n" 
+_UK_MFMA_ " [%[c20],%[c21],%[c22],%[c23]], acc[200:201], v[232:233], [%[c20],%[c21],%[c22],%[c23]] \n" +" buffer_load_dwordx4 acc[76:79], %[v_os_b4], s[12:15], 0 offen offset:3072 \n" +_UK_MFMA_ " [%[c20],%[c21],%[c22],%[c23]], acc[202:203], v[234:235], [%[c20],%[c21],%[c22],%[c23]] \n" +_UK_MFMA_ " [%[c20],%[c21],%[c22],%[c23]], acc[204:205], v[236:237], [%[c20],%[c21],%[c22],%[c23]] \n" +_UK_MFMA_ " [%[c20],%[c21],%[c22],%[c23]], acc[206:207], v[238:239], [%[c20],%[c21],%[c22],%[c23]] \n" +_UK_MFMA_ " [%[c24],%[c25],%[c26],%[c27]], acc[208:209], v[160:161], [%[c24],%[c25],%[c26],%[c27]] \n" +" buffer_load_dwordx4 acc[80:83], %[v_os_b5], s[12:15], 0 offen \n" +_UK_MFMA_ " [%[c24],%[c25],%[c26],%[c27]], acc[210:211], v[162:163], [%[c24],%[c25],%[c26],%[c27]] \n" +_UK_MFMA_ " [%[c24],%[c25],%[c26],%[c27]], acc[212:213], v[164:165], [%[c24],%[c25],%[c26],%[c27]] \n" +_UK_MFMA_ " [%[c24],%[c25],%[c26],%[c27]], acc[214:215], v[166:167], [%[c24],%[c25],%[c26],%[c27]] \n" +_UK_MFMA_ " [%[c24],%[c25],%[c26],%[c27]], acc[216:217], v[168:169], [%[c24],%[c25],%[c26],%[c27]] \n" +" buffer_load_dwordx4 acc[84:87], %[v_os_b5], s[12:15], 0 offen offset:1024 \n" +_UK_MFMA_ " [%[c24],%[c25],%[c26],%[c27]], acc[218:219], v[170:171], [%[c24],%[c25],%[c26],%[c27]] \n" +_UK_MFMA_ " [%[c24],%[c25],%[c26],%[c27]], acc[220:221], v[172:173], [%[c24],%[c25],%[c26],%[c27]] \n" +_UK_MFMA_ " [%[c24],%[c25],%[c26],%[c27]], acc[222:223], v[174:175], [%[c24],%[c25],%[c26],%[c27]] \n" +_UK_MFMA_ " [%[c28],%[c29],%[c30],%[c31]], acc[208:209], v[224:225], [%[c28],%[c29],%[c30],%[c31]] \n" +" buffer_load_dwordx4 acc[88:91], %[v_os_b5], s[12:15], 0 offen offset:2048 \n" +_UK_MFMA_ " [%[c28],%[c29],%[c30],%[c31]], acc[210:211], v[226:227], [%[c28],%[c29],%[c30],%[c31]] \n" +_UK_MFMA_ " [%[c28],%[c29],%[c30],%[c31]], acc[212:213], v[228:229], [%[c28],%[c29],%[c30],%[c31]] \n" +_UK_MFMA_ " [%[c28],%[c29],%[c30],%[c31]], acc[214:215], v[230:231], [%[c28],%[c29],%[c30],%[c31]] \n" +_UK_MFMA_ " 
[%[c28],%[c29],%[c30],%[c31]], acc[216:217], v[232:233], [%[c28],%[c29],%[c30],%[c31]] \n" +" buffer_load_dwordx4 acc[92:95], %[v_os_b5], s[12:15], 0 offen offset:3072 \n" +_UK_MFMA_ " [%[c28],%[c29],%[c30],%[c31]], acc[218:219], v[234:235], [%[c28],%[c29],%[c30],%[c31]] \n" +_UK_MFMA_ " [%[c28],%[c29],%[c30],%[c31]], acc[220:221], v[236:237], [%[c28],%[c29],%[c30],%[c31]] \n" +_UK_MFMA_ " [%[c28],%[c29],%[c30],%[c31]], acc[222:223], v[238:239], [%[c28],%[c29],%[c30],%[c31]] \n" +" s_waitcnt vmcnt(32) \n" +_UK_MFMA_ " [%[c16],%[c17],%[c18],%[c19]], acc[224:225], v[176:177], [%[c16],%[c17],%[c18],%[c19]] \n" +" buffer_load_dwordx4 acc[96:99], %[v_os_b6], s[12:15], 0 offen \n" +_UK_MFMA_ " [%[c16],%[c17],%[c18],%[c19]], acc[226:227], v[178:179], [%[c16],%[c17],%[c18],%[c19]] \n" +_UK_MFMA_ " [%[c16],%[c17],%[c18],%[c19]], acc[228:229], v[180:181], [%[c16],%[c17],%[c18],%[c19]] \n" +_UK_MFMA_ " [%[c16],%[c17],%[c18],%[c19]], acc[230:231], v[182:183], [%[c16],%[c17],%[c18],%[c19]] \n" +_UK_MFMA_ " [%[c16],%[c17],%[c18],%[c19]], acc[232:233], v[184:185], [%[c16],%[c17],%[c18],%[c19]] \n" +" buffer_load_dwordx4 acc[100:103], %[v_os_b6], s[12:15], 0 offen offset:1024 \n" +_UK_MFMA_ " [%[c16],%[c17],%[c18],%[c19]], acc[234:235], v[186:187], [%[c16],%[c17],%[c18],%[c19]] \n" +_UK_MFMA_ " [%[c16],%[c17],%[c18],%[c19]], acc[236:237], v[188:189], [%[c16],%[c17],%[c18],%[c19]] \n" +_UK_MFMA_ " [%[c16],%[c17],%[c18],%[c19]], acc[238:239], v[190:191], [%[c16],%[c17],%[c18],%[c19]] \n" +_UK_MFMA_ " [%[c20],%[c21],%[c22],%[c23]], acc[224:225], v[240:241], [%[c20],%[c21],%[c22],%[c23]] \n" +" buffer_load_dwordx4 acc[104:107], %[v_os_b6], s[12:15], 0 offen offset:2048 \n" +_UK_MFMA_ " [%[c20],%[c21],%[c22],%[c23]], acc[226:227], v[242:243], [%[c20],%[c21],%[c22],%[c23]] \n" +_UK_MFMA_ " [%[c20],%[c21],%[c22],%[c23]], acc[228:229], v[244:245], [%[c20],%[c21],%[c22],%[c23]] \n" +_UK_MFMA_ " [%[c20],%[c21],%[c22],%[c23]], acc[230:231], v[246:247], [%[c20],%[c21],%[c22],%[c23]] \n" 
+_UK_MFMA_ " [%[c20],%[c21],%[c22],%[c23]], acc[232:233], v[248:249], [%[c20],%[c21],%[c22],%[c23]] \n" +" buffer_load_dwordx4 acc[108:111], %[v_os_b6], s[12:15], 0 offen offset:3072 \n" +_UK_MFMA_ " [%[c20],%[c21],%[c22],%[c23]], acc[234:235], v[250:251], [%[c20],%[c21],%[c22],%[c23]] \n" +_UK_MFMA_ " [%[c20],%[c21],%[c22],%[c23]], acc[236:237], v[252:253], [%[c20],%[c21],%[c22],%[c23]] \n" +_UK_MFMA_ " [%[c20],%[c21],%[c22],%[c23]], acc[238:239], v[254:255], [%[c20],%[c21],%[c22],%[c23]] \n" +_UK_MFMA_ " [%[c24],%[c25],%[c26],%[c27]], acc[240:241], v[176:177], [%[c24],%[c25],%[c26],%[c27]] \n" +" buffer_load_dwordx4 acc[112:115], %[v_os_b7], s[12:15], 0 offen \n" +_UK_MFMA_ " [%[c24],%[c25],%[c26],%[c27]], acc[242:243], v[178:179], [%[c24],%[c25],%[c26],%[c27]] \n" +_UK_MFMA_ " [%[c24],%[c25],%[c26],%[c27]], acc[244:245], v[180:181], [%[c24],%[c25],%[c26],%[c27]] \n" +_UK_MFMA_ " [%[c24],%[c25],%[c26],%[c27]], acc[246:247], v[182:183], [%[c24],%[c25],%[c26],%[c27]] \n" +_UK_MFMA_ " [%[c24],%[c25],%[c26],%[c27]], acc[248:249], v[184:185], [%[c24],%[c25],%[c26],%[c27]] \n" +" buffer_load_dwordx4 acc[116:119], %[v_os_b7], s[12:15], 0 offen offset:1024 \n" +_UK_MFMA_ " [%[c24],%[c25],%[c26],%[c27]], acc[250:251], v[186:187], [%[c24],%[c25],%[c26],%[c27]] \n" +_UK_MFMA_ " [%[c24],%[c25],%[c26],%[c27]], acc[252:253], v[188:189], [%[c24],%[c25],%[c26],%[c27]] \n" +_UK_MFMA_ " [%[c24],%[c25],%[c26],%[c27]], acc[254:255], v[190:191], [%[c24],%[c25],%[c26],%[c27]] \n" +_UK_MFMA_ " [%[c28],%[c29],%[c30],%[c31]], acc[240:241], v[240:241], [%[c28],%[c29],%[c30],%[c31]] \n" +" buffer_load_dwordx4 acc[120:123], %[v_os_b7], s[12:15], 0 offen offset:2048 \n" +_UK_MFMA_ " [%[c28],%[c29],%[c30],%[c31]], acc[242:243], v[242:243], [%[c28],%[c29],%[c30],%[c31]] \n" +_UK_MFMA_ " [%[c28],%[c29],%[c30],%[c31]], acc[244:245], v[244:245], [%[c28],%[c29],%[c30],%[c31]] \n" +_UK_MFMA_ " [%[c28],%[c29],%[c30],%[c31]], acc[246:247], v[246:247], [%[c28],%[c29],%[c30],%[c31]] \n" +_UK_MFMA_ " 
[%[c28],%[c29],%[c30],%[c31]], acc[248:249], v[248:249], [%[c28],%[c29],%[c30],%[c31]] \n" +" buffer_load_dwordx4 acc[124:127], %[v_os_b7], s[12:15], 0 offen offset:3072 \n" +_UK_MFMA_ " [%[c28],%[c29],%[c30],%[c31]], acc[250:251], v[250:251], [%[c28],%[c29],%[c30],%[c31]] \n" +_UK_MFMA_ " [%[c28],%[c29],%[c30],%[c31]], acc[252:253], v[252:253], [%[c28],%[c29],%[c30],%[c31]] \n" +_UK_MFMA_ " [%[c28],%[c29],%[c30],%[c31]], acc[254:255], v[254:255], [%[c28],%[c29],%[c30],%[c31]]\n" +" v_mul_f32 %[c16], %[scale_0], %[c16] \n" +" v_mul_f32 %[c17], %[scale_0], %[c17] \n" +" v_mul_f32 %[c18], %[scale_0], %[c18] \n" +" v_mul_f32 %[c19], %[scale_0], %[c19] \n" +" v_mul_f32 %[c20], %[scale_1], %[c20] \n" +" v_mul_f32 %[c21], %[scale_1], %[c21] \n" +" v_mul_f32 %[c22], %[scale_1], %[c22] \n" +" v_mul_f32 %[c23], %[scale_1], %[c23] \n" +" v_mul_f32 %[c24], %[scale_0], %[c24] \n" +" v_mul_f32 %[c25], %[scale_0], %[c25] \n" +" v_mul_f32 %[c26], %[scale_0], %[c26] \n" +" v_mul_f32 %[c27], %[scale_0], %[c27] \n" +" v_mul_f32 %[c28], %[scale_1], %[c28] \n" +" v_mul_f32 %[c29], %[scale_1], %[c29] \n" +" v_mul_f32 %[c30], %[scale_1], %[c30] \n" +" v_mul_f32 %[c31], %[scale_1], %[c31] \n" + +_UK_PK_CVT_("%[c16]", "%[c17]", "%[c16]") +_UK_PK_CVT_("%[c18]", "%[c19]", "%[c17]") +_UK_PK_CVT_("%[c20]", "%[c21]", "%[c18]") +_UK_PK_CVT_("%[c22]", "%[c23]", "%[c19]") +_UK_PK_CVT_("%[c24]", "%[c25]", "%[c20]") +_UK_PK_CVT_("%[c26]", "%[c27]", "%[c21]") +_UK_PK_CVT_("%[c28]", "%[c29]", "%[c22]") +_UK_PK_CVT_("%[c30]", "%[c31]", "%[c23]") + +" ;------------------------------ \n" +" ds_write_b64 %[v_sfl_sst], [%[c16],%[c17]] offset:0 + %[shfl_base] \n" +" ds_write_b64 %[v_sfl_sst], [%[c18],%[c19]] offset:4352 + %[shfl_base] \n" +" ds_write_b64 %[v_sfl_sst], [%[c20],%[c21]] offset:2176 + %[shfl_base] \n" +" ds_write_b64 %[v_sfl_sst], [%[c22],%[c23]] offset:6528 + %[shfl_base] \n" +" s_waitcnt lgkmcnt(0) \n" +" s_barrier \n" +" ds_read_b32 %[c16], %[v_sfl_sld] offset:0 + %[shfl_base] \n" +" 
ds_read_b32 %[c17], %[v_sfl_sld] offset:32 + %[shfl_base] \n" +" ds_read_b32 %[c18], %[v_sfl_sld] offset:64 + %[shfl_base] \n" +" ds_read_b32 %[c19], %[v_sfl_sld] offset:96 + %[shfl_base] \n" +" ds_read_b32 %[c20], %[v_sfl_sld] offset:4352 + %[shfl_base] \n" +" ds_read_b32 %[c21], %[v_sfl_sld] offset:4384 + %[shfl_base] \n" +" ds_read_b32 %[c22], %[v_sfl_sld] offset:4416 + %[shfl_base] \n" +" ds_read_b32 %[c23], %[v_sfl_sld] offset:4448 + %[shfl_base] \n" +" s_waitcnt lgkmcnt(0) \n" +" s_mov_b64 exec, %[s_execflag_0] \n" +_UK_ATOMIC_ADD_ " %[v_os_o0], %[c16], s[8:9] \n" +" s_mov_b64 exec, %[s_execflag_1] \n" +_UK_ATOMIC_ADD_ " %[v_os_o1], %[c17], s[8:9] \n" +" s_mov_b64 exec, %[s_execflag_2] \n" +_UK_ATOMIC_ADD_ " %[v_os_o2], %[c18], s[8:9] \n" +" s_mov_b64 exec, %[s_execflag_3] \n" +_UK_ATOMIC_ADD_ " %[v_os_o3], %[c19], s[8:9] \n" +" s_mov_b64 exec, %[s_execflag_4] \n" +_UK_ATOMIC_ADD_ " %[v_os_o4], %[c20], s[8:9] \n" +" s_mov_b64 exec, %[s_execflag_5] \n" +_UK_ATOMIC_ADD_ " %[v_os_o5], %[c21], s[8:9] \n" +" s_mov_b64 exec, %[s_execflag_6] \n" +_UK_ATOMIC_ADD_ " %[v_os_o6], %[c22], s[8:9] \n" +" s_mov_b64 exec, %[s_execflag_7] \n" +_UK_ATOMIC_ADD_ " %[v_os_o7], %[c23], s[8:9] \n" +" s_mov_b64 exec, s[38:39] \n" +" s_sub_i32 %[s_loop_cnt], %[s_loop_cnt], 1 ; k-- \n" +" s_cmp_gt_i32 %[s_loop_cnt] 0 \n" +" s_cbranch_scc0 L_end%= \n" +" s_cmp_gt_i32 %[s_loop_cnt] 1 ; move b with cond \n" +" s_cselect_b32 s86, %[s_tile_os_b], 0 \n" +" s_add_u32 s12, s86, s12 \n" +" s_addc_u32 s13, 0, s13 \n" +" s_add_u32 s8, %[s_tile_os_o], s8 \n" +" s_addc_u32 s9, 0, s9 \n" +" s_branch L_start%= \n" +"L_end%=: \n" + +#undef _UK_MFMA_ +#undef _UK_PK_CVT_ +#undef _UK_ATOMIC_ADD_ diff --git a/include/ck_tile/ops/flatmm/block/uk/flatmm_uk_gfx9_32x512x128_1x1x1_16x16x16.inc b/include/ck_tile/ops/flatmm/block/uk/flatmm_uk_gfx9_32x512x128_1x1x1_16x16x16.inc new file mode 100644 index 000000000..a34a21d39 --- /dev/null +++ 
b/include/ck_tile/ops/flatmm/block/uk/flatmm_uk_gfx9_32x512x128_1x1x1_16x16x16.inc @@ -0,0 +1,516 @@ +#ifndef CK_TILE_FLATMM_UK_MFMA +#define CK_TILE_FLATMM_UK_MFMA CK_TILE_FLATMM_UK_MFMA_BF16 +#endif + +#if CK_TILE_FLATMM_UK_MFMA == CK_TILE_FLATMM_UK_MFMA_BF16 +#define _UK_MFMA_ "v_mfma_f32_16x16x16_bf16" +#elif CK_TILE_FLATMM_UK_MFMA == CK_TILE_FLATMM_UK_MFMA_FP16 +#define _UK_MFMA_ "v_mfma_f32_16x16x16_f16" +#endif + +"s_mov_b32 s16, %[s_res_a0] \n" +"s_mov_b32 s17, %[s_res_a1] \n" +"s_mov_b32 s18, %[s_res_a2] \n" +"s_mov_b32 s19, %[s_res_a3] \n" +"s_mov_b32 s20, %[s_res_b0] \n" +"s_mov_b32 s21, %[s_res_b1] \n" +"s_mov_b32 s22, %[s_res_b2] \n" +"s_mov_b32 s23, %[s_res_b3] \n" +// "s_nop 4\n" +"; -- prefetch A0\n" +"s_add_u32 m0, 0, %[s_m0_init] \n" +"buffer_load_dword %[v_os_a0], s[16:19], 0 offen lds \n" +"s_add_u32 m0, %[s_size_per_issue], m0 \n" +"buffer_load_dword %[v_os_a1], s[16:19], 0 offen lds \n" +"s_add_u32 m0, %[s_size_per_issue], m0 \n" +"buffer_load_dword %[v_os_a2], s[16:19], 0 offen lds \n" +"s_add_u32 m0, %[s_size_per_issue], m0 \n" +"buffer_load_dword %[v_os_a3], s[16:19], 0 offen lds \n" +"s_add_u32 m0, %[s_size_per_issue], m0 \n" +"buffer_load_dword %[v_os_a4], s[16:19], 0 offen lds \n" +"s_add_u32 m0, %[s_size_per_issue], m0 \n" +"buffer_load_dword %[v_os_a5], s[16:19], 0 offen lds \n" +"s_add_u32 m0, %[s_size_per_issue], m0 \n" +"buffer_load_dword %[v_os_a6], s[16:19], 0 offen lds \n" +"s_add_u32 m0, %[s_size_per_issue], m0 \n" +"buffer_load_dword %[v_os_a7], s[16:19], 0 offen lds \n" +"s_add_u32 m0, %[smem_sz], %[s_m0_init] \n" +"s_cmp_gt_i32 %[s_loop_cnt] 1 ; move a with cond \n" +"s_cselect_b32 s86, %[s_tile_os_a], 0 ; move a with cond \n" +"s_add_u32 s16, s86, s16 ; move a with cond \n" +"s_addc_u32 s17, 0, s17 ; move a with cond \n" +"; -- prefetch A1\n" +"buffer_load_dword %[v_os_a0], s[16:19], 0 offen lds \n" +"s_add_u32 m0, %[s_size_per_issue], m0 \n" +"buffer_load_dword %[v_os_a1], s[16:19], 0 offen lds \n" +"s_add_u32 m0, 
%[s_size_per_issue], m0 \n" +"buffer_load_dword %[v_os_a2], s[16:19], 0 offen lds \n" +"s_add_u32 m0, %[s_size_per_issue], m0 \n" +"buffer_load_dword %[v_os_a3], s[16:19], 0 offen lds \n" +"s_add_u32 m0, %[s_size_per_issue], m0 \n" +"buffer_load_dword %[v_os_a4], s[16:19], 0 offen lds \n" +"s_add_u32 m0, %[s_size_per_issue], m0 \n" +"buffer_load_dword %[v_os_a5], s[16:19], 0 offen lds \n" +"s_add_u32 m0, %[s_size_per_issue], m0 \n" +"buffer_load_dword %[v_os_a6], s[16:19], 0 offen lds \n" +"s_add_u32 m0, %[s_size_per_issue], m0 \n" +"buffer_load_dword %[v_os_a7], s[16:19], 0 offen lds \n" +"s_add_u32 m0, 0, %[s_m0_init] \n" +"s_cmp_gt_i32 %[s_loop_cnt] 2 ; move a with cond \n" +"s_cselect_b32 s86, %[s_tile_os_a], 0 ; move a with cond \n" +"s_add_u32 s16, s86, s16 ; move a with cond \n" +"s_addc_u32 s17, 0, s17 ; move a with cond \n" +"; -- prefetch B0\n" +"buffer_load_dwordx4 acc[0:3], %[v_os_b0], s[20:23], 0 offen \n" +"buffer_load_dwordx4 acc[4:7], %[v_os_b0], s[20:23], 0 offen offset:1024 \n" +"buffer_load_dwordx4 acc[8:11], %[v_os_b0], s[20:23], 0 offen offset:2048 \n" +"buffer_load_dwordx4 acc[12:15], %[v_os_b0], s[20:23], 0 offen offset:3072 \n" +"buffer_load_dwordx4 acc[16:19], %[v_os_b1], s[20:23], 0 offen \n" +"buffer_load_dwordx4 acc[20:23], %[v_os_b1], s[20:23], 0 offen offset:1024 \n" +"buffer_load_dwordx4 acc[24:27], %[v_os_b1], s[20:23], 0 offen offset:2048 \n" +"buffer_load_dwordx4 acc[28:31], %[v_os_b1], s[20:23], 0 offen offset:3072 \n" +"buffer_load_dwordx4 acc[32:35], %[v_os_b2], s[20:23], 0 offen \n" +"buffer_load_dwordx4 acc[36:39], %[v_os_b2], s[20:23], 0 offen offset:1024 \n" +"buffer_load_dwordx4 acc[40:43], %[v_os_b2], s[20:23], 0 offen offset:2048 \n" +"buffer_load_dwordx4 acc[44:47], %[v_os_b2], s[20:23], 0 offen offset:3072 \n" +"buffer_load_dwordx4 acc[48:51], %[v_os_b3], s[20:23], 0 offen \n" +"buffer_load_dwordx4 acc[52:55], %[v_os_b3], s[20:23], 0 offen offset:1024 \n" +"buffer_load_dwordx4 acc[56:59], %[v_os_b3], s[20:23], 0 offen 
offset:2048 \n" +"buffer_load_dwordx4 acc[60:63], %[v_os_b3], s[20:23], 0 offen offset:3072 \n" +"buffer_load_dwordx4 acc[64:67], %[v_os_b4], s[20:23], 0 offen \n" +"buffer_load_dwordx4 acc[68:71], %[v_os_b4], s[20:23], 0 offen offset:1024 \n" +"buffer_load_dwordx4 acc[72:75], %[v_os_b4], s[20:23], 0 offen offset:2048 \n" +"buffer_load_dwordx4 acc[76:79], %[v_os_b4], s[20:23], 0 offen offset:3072 \n" +"buffer_load_dwordx4 acc[80:83], %[v_os_b5], s[20:23], 0 offen \n" +"buffer_load_dwordx4 acc[84:87], %[v_os_b5], s[20:23], 0 offen offset:1024 \n" +"buffer_load_dwordx4 acc[88:91], %[v_os_b5], s[20:23], 0 offen offset:2048 \n" +"buffer_load_dwordx4 acc[92:95], %[v_os_b5], s[20:23], 0 offen offset:3072 \n" +"buffer_load_dwordx4 acc[96:99], %[v_os_b6], s[20:23], 0 offen \n" +"buffer_load_dwordx4 acc[100:103], %[v_os_b6], s[20:23], 0 offen offset:1024 \n" +"buffer_load_dwordx4 acc[104:107], %[v_os_b6], s[20:23], 0 offen offset:2048 \n" +"buffer_load_dwordx4 acc[108:111], %[v_os_b6], s[20:23], 0 offen offset:3072 \n" +"buffer_load_dwordx4 acc[112:115], %[v_os_b7], s[20:23], 0 offen \n" +"buffer_load_dwordx4 acc[116:119], %[v_os_b7], s[20:23], 0 offen offset:1024 \n" +"buffer_load_dwordx4 acc[120:123], %[v_os_b7], s[20:23], 0 offen offset:2048 \n" +"buffer_load_dwordx4 acc[124:127], %[v_os_b7], s[20:23], 0 offen offset:3072 \n" +"s_cmp_gt_i32 %[s_loop_cnt] 1 ; move b with cond \n" +"s_cselect_b32 s86, %[s_tile_os_b], 0 ; move b with cond \n" +"s_add_u32 s20, s86, s20 ; move b with cond \n" +"s_addc_u32 s21, 0, s21 ; move b with cond \n" +"s_waitcnt vmcnt(40) \n" +"s_barrier \n" +"ds_read_b128 v[64:67], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_0]\n" // 1024: N stride, 64 K stride +"ds_read_b128 v[68:71], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_1]\n" +"ds_read_b128 v[72:75], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_2]\n" +"ds_read_b128 v[76:79], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_3]\n" +"ds_read_b128 v[80:83], %[v_os_slda] offset:0*%[smem_sz] + 
%[sld_os_4]\n" +"ds_read_b128 v[84:87], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_5]\n" +"ds_read_b128 v[88:91], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_6]\n" +"ds_read_b128 v[92:95], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_7]\n" +"L_start%=: \n" +" s_waitcnt vmcnt(24) & lgkmcnt(0) \n" +" s_barrier \n" +_UK_MFMA_ " %[v_acc_0], acc[0:1], v[64:65], %[v_acc_0] \n" +_UK_MFMA_ " %[v_acc_0], acc[2:3], v[66:67], %[v_acc_0] \n" +" buffer_load_dwordx4 acc[128:131], %[v_os_b0], s[20:23], 0 offen \n" +_UK_MFMA_ " %[v_acc_0], acc[4:5], v[68:69], %[v_acc_0] \n" +_UK_MFMA_ " %[v_acc_0], acc[6:7], v[70:71], %[v_acc_0] \n" +" buffer_load_dword %[v_os_a0], s[16:19], 0 offen lds \n" +" s_add_u32 m0, %[s_size_per_issue], m0 \n" +_UK_MFMA_ " %[v_acc_0], acc[8:9], v[72:73], %[v_acc_0] \n" +_UK_MFMA_ " %[v_acc_0], acc[10:11], v[74:75], %[v_acc_0] \n" +" buffer_load_dwordx4 acc[132:135], %[v_os_b0], s[20:23], 0 offen offset:1024 \n" +_UK_MFMA_ " %[v_acc_0], acc[12:13], v[76:77], %[v_acc_0] \n" +_UK_MFMA_ " %[v_acc_0], acc[14:15], v[78:79], %[v_acc_0] \n" +" buffer_load_dword %[v_os_a1], s[16:19], 0 offen lds \n" +" s_add_u32 m0, %[s_size_per_issue], m0 \n" +_UK_MFMA_ " %[v_acc_1], acc[0:1], v[80:81], %[v_acc_1] \n" +_UK_MFMA_ " %[v_acc_1], acc[2:3], v[82:83], %[v_acc_1] \n" +" buffer_load_dwordx4 acc[136:139], %[v_os_b0], s[20:23], 0 offen offset:2048 \n" +_UK_MFMA_ " %[v_acc_1], acc[4:5], v[84:85], %[v_acc_1] \n" +_UK_MFMA_ " %[v_acc_1], acc[6:7], v[86:87], %[v_acc_1] \n" +" buffer_load_dword %[v_os_a2], s[16:19], 0 offen lds \n" +" s_add_u32 m0, %[s_size_per_issue], m0 \n" +_UK_MFMA_ " %[v_acc_1], acc[8:9], v[88:89], %[v_acc_1] \n" +_UK_MFMA_ " %[v_acc_1], acc[10:11], v[90:91], %[v_acc_1] \n" +" buffer_load_dwordx4 acc[140:143], %[v_os_b0], s[20:23], 0 offen offset:3072 \n" +_UK_MFMA_ " %[v_acc_1], acc[12:13], v[92:93], %[v_acc_1] \n" +_UK_MFMA_ " %[v_acc_1], acc[14:15], v[94:95], %[v_acc_1] \n" +" buffer_load_dword %[v_os_a3], s[16:19], 0 offen lds \n" +" s_add_u32 m0, 
%[s_size_per_issue], m0 \n" +_UK_MFMA_ " %[v_acc_2], acc[16:17], v[64:65], %[v_acc_2] \n" +_UK_MFMA_ " %[v_acc_2], acc[18:19], v[66:67], %[v_acc_2] \n" +" buffer_load_dwordx4 acc[144:147], %[v_os_b1], s[20:23], 0 offen \n" +_UK_MFMA_ " %[v_acc_2], acc[20:21], v[68:69], %[v_acc_2] \n" +_UK_MFMA_ " %[v_acc_2], acc[22:23], v[70:71], %[v_acc_2] \n" +" buffer_load_dword %[v_os_a4], s[16:19], 0 offen lds \n" +" s_add_u32 m0, %[s_size_per_issue], m0 \n" +_UK_MFMA_ " %[v_acc_2], acc[24:25], v[72:73], %[v_acc_2] \n" +_UK_MFMA_ " %[v_acc_2], acc[26:27], v[74:75], %[v_acc_2] \n" +" buffer_load_dwordx4 acc[148:151], %[v_os_b1], s[20:23], 0 offen offset:1024 \n" +_UK_MFMA_ " %[v_acc_2], acc[28:29], v[76:77], %[v_acc_2] \n" +_UK_MFMA_ " %[v_acc_2], acc[30:31], v[78:79], %[v_acc_2] \n" +" buffer_load_dword %[v_os_a5], s[16:19], 0 offen lds \n" +" s_add_u32 m0, %[s_size_per_issue], m0 \n" +_UK_MFMA_ " %[v_acc_3], acc[16:17], v[80:81], %[v_acc_3] \n" +_UK_MFMA_ " %[v_acc_3], acc[18:19], v[82:83], %[v_acc_3] \n" +" buffer_load_dwordx4 acc[152:155], %[v_os_b1], s[20:23], 0 offen offset:2048 \n" +_UK_MFMA_ " %[v_acc_3], acc[20:21], v[84:85], %[v_acc_3] \n" +_UK_MFMA_ " %[v_acc_3], acc[22:23], v[86:87], %[v_acc_3] \n" +" buffer_load_dword %[v_os_a6], s[16:19], 0 offen lds \n" +" s_add_u32 m0, %[s_size_per_issue], m0 \n" +_UK_MFMA_ " %[v_acc_3], acc[24:25], v[88:89], %[v_acc_3] \n" +_UK_MFMA_ " %[v_acc_3], acc[26:27], v[90:91], %[v_acc_3] \n" +" buffer_load_dwordx4 acc[156:159], %[v_os_b1], s[20:23], 0 offen offset:3072 \n" +_UK_MFMA_ " %[v_acc_3], acc[28:29], v[92:93], %[v_acc_3] \n" +_UK_MFMA_ " %[v_acc_3], acc[30:31], v[94:95], %[v_acc_3] \n" +" buffer_load_dword %[v_os_a7], s[16:19], 0 offen lds \n" +" s_add_u32 m0, %[smem_sz], %[s_m0_init] \n" +" s_waitcnt vmcnt(32) \n" +_UK_MFMA_ " %[v_acc_4], acc[32:33], v[64:65], %[v_acc_4] \n" +_UK_MFMA_ " %[v_acc_4], acc[34:35], v[66:67], %[v_acc_4] \n" +" buffer_load_dwordx4 acc[160:163], %[v_os_b2], s[20:23], 0 offen \n" +_UK_MFMA_ " 
%[v_acc_4], acc[36:37], v[68:69], %[v_acc_4] \n" +_UK_MFMA_ " %[v_acc_4], acc[38:39], v[70:71], %[v_acc_4] \n" +" ds_read_b128 v[96:99], %[v_os_slda], offset:1*%[smem_sz] + %[sld_os_0] \n" +_UK_MFMA_ " %[v_acc_4], acc[40:41], v[72:73], %[v_acc_4] \n" +_UK_MFMA_ " %[v_acc_4], acc[42:43], v[74:75], %[v_acc_4] \n" +" buffer_load_dwordx4 acc[164:167], %[v_os_b2], s[20:23], 0 offen offset:1024 \n" +_UK_MFMA_ " %[v_acc_4], acc[44:45], v[76:77], %[v_acc_4] \n" +_UK_MFMA_ " %[v_acc_4], acc[46:47], v[78:79], %[v_acc_4] \n" +" ds_read_b128 v[100:103], %[v_os_slda], offset:1*%[smem_sz] + %[sld_os_1] \n" +_UK_MFMA_ " %[v_acc_5], acc[32:33], v[80:81], %[v_acc_5] \n" +_UK_MFMA_ " %[v_acc_5], acc[34:35], v[82:83], %[v_acc_5] \n" +" buffer_load_dwordx4 acc[168:171], %[v_os_b2], s[20:23], 0 offen offset:2048 \n" +_UK_MFMA_ " %[v_acc_5], acc[36:37], v[84:85], %[v_acc_5] \n" +_UK_MFMA_ " %[v_acc_5], acc[38:39], v[86:87], %[v_acc_5] \n" +" ds_read_b128 v[104:107], %[v_os_slda], offset:1*%[smem_sz] + %[sld_os_2] \n" +_UK_MFMA_ " %[v_acc_5], acc[40:41], v[88:89], %[v_acc_5] \n" +_UK_MFMA_ " %[v_acc_5], acc[42:43], v[90:91], %[v_acc_5] \n" +" buffer_load_dwordx4 acc[172:175], %[v_os_b2], s[20:23], 0 offen offset:3072 \n" +_UK_MFMA_ " %[v_acc_5], acc[44:45], v[92:93], %[v_acc_5] \n" +_UK_MFMA_ " %[v_acc_5], acc[46:47], v[94:95], %[v_acc_5] \n" +" ds_read_b128 v[108:111], %[v_os_slda], offset:1*%[smem_sz] + %[sld_os_3] \n" +_UK_MFMA_ " %[v_acc_6], acc[48:49], v[64:65], %[v_acc_6] \n" +_UK_MFMA_ " %[v_acc_6], acc[50:51], v[66:67], %[v_acc_6] \n" +" buffer_load_dwordx4 acc[176:179], %[v_os_b3], s[20:23], 0 offen \n" +_UK_MFMA_ " %[v_acc_6], acc[52:53], v[68:69], %[v_acc_6] \n" +_UK_MFMA_ " %[v_acc_6], acc[54:55], v[70:71], %[v_acc_6] \n" +" ds_read_b128 v[112:115], %[v_os_slda], offset:1*%[smem_sz] + %[sld_os_4] \n" +_UK_MFMA_ " %[v_acc_6], acc[56:57], v[72:73], %[v_acc_6] \n" +_UK_MFMA_ " %[v_acc_6], acc[58:59], v[74:75], %[v_acc_6] \n" +" buffer_load_dwordx4 acc[180:183], %[v_os_b3], 
s[20:23], 0 offen offset:1024 \n" +_UK_MFMA_ " %[v_acc_6], acc[60:61], v[76:77], %[v_acc_6] \n" +_UK_MFMA_ " %[v_acc_6], acc[62:63], v[78:79], %[v_acc_6] \n" +" ds_read_b128 v[116:119], %[v_os_slda], offset:1*%[smem_sz] + %[sld_os_5] \n" +_UK_MFMA_ " %[v_acc_7], acc[48:49], v[80:81], %[v_acc_7] \n" +_UK_MFMA_ " %[v_acc_7], acc[50:51], v[82:83], %[v_acc_7] \n" +" buffer_load_dwordx4 acc[184:187], %[v_os_b3], s[20:23], 0 offen offset:2048 \n" +_UK_MFMA_ " %[v_acc_7], acc[52:53], v[84:85], %[v_acc_7] \n" +_UK_MFMA_ " %[v_acc_7], acc[54:55], v[86:87], %[v_acc_7] \n" +" ds_read_b128 v[120:123], %[v_os_slda], offset:1*%[smem_sz] + %[sld_os_6] \n" +_UK_MFMA_ " %[v_acc_7], acc[56:57], v[88:89], %[v_acc_7] \n" +_UK_MFMA_ " %[v_acc_7], acc[58:59], v[90:91], %[v_acc_7] \n" +" buffer_load_dwordx4 acc[188:191], %[v_os_b3], s[20:23], 0 offen offset:3072 \n" +_UK_MFMA_ " %[v_acc_7], acc[60:61], v[92:93], %[v_acc_7] \n" +_UK_MFMA_ " %[v_acc_7], acc[62:63], v[94:95], %[v_acc_7] \n" +" ds_read_b128 v[124:127], %[v_os_slda], offset:1*%[smem_sz] + %[sld_os_7] \n" +" s_waitcnt vmcnt(32) \n" +_UK_MFMA_ " %[v_acc_8], acc[64:65], v[64:65], %[v_acc_8] \n" +_UK_MFMA_ " %[v_acc_8], acc[66:67], v[66:67], %[v_acc_8] \n" +" buffer_load_dwordx4 acc[192:195], %[v_os_b4], s[20:23], 0 offen \n" +_UK_MFMA_ " %[v_acc_8], acc[68:69], v[68:69], %[v_acc_8] \n" +_UK_MFMA_ " %[v_acc_8], acc[70:71], v[70:71], %[v_acc_8] \n" +_UK_MFMA_ " %[v_acc_8], acc[72:73], v[72:73], %[v_acc_8] \n" +_UK_MFMA_ " %[v_acc_8], acc[74:75], v[74:75], %[v_acc_8] \n" +" buffer_load_dwordx4 acc[196:199], %[v_os_b4], s[20:23], 0 offen offset:1024 \n" +_UK_MFMA_ " %[v_acc_8], acc[76:77], v[76:77], %[v_acc_8] \n" +_UK_MFMA_ " %[v_acc_8], acc[78:79], v[78:79], %[v_acc_8] \n" +_UK_MFMA_ " %[v_acc_9], acc[64:65], v[80:81], %[v_acc_9] \n" +_UK_MFMA_ " %[v_acc_9], acc[66:67], v[82:83], %[v_acc_9] \n" +" buffer_load_dwordx4 acc[200:203], %[v_os_b4], s[20:23], 0 offen offset:2048 \n" +_UK_MFMA_ " %[v_acc_9], acc[68:69], v[84:85], 
%[v_acc_9] \n" +_UK_MFMA_ " %[v_acc_9], acc[70:71], v[86:87], %[v_acc_9] \n" +_UK_MFMA_ " %[v_acc_9], acc[72:73], v[88:89], %[v_acc_9] \n" +_UK_MFMA_ " %[v_acc_9], acc[74:75], v[90:91], %[v_acc_9] \n" +" buffer_load_dwordx4 acc[204:207], %[v_os_b4], s[20:23], 0 offen offset:3072 \n" +_UK_MFMA_ " %[v_acc_9], acc[76:77], v[92:93], %[v_acc_9] \n" +_UK_MFMA_ " %[v_acc_9], acc[78:79], v[94:95], %[v_acc_9] \n" +_UK_MFMA_ " %[v_acc_10], acc[80:81], v[64:65], %[v_acc_10] \n" +_UK_MFMA_ " %[v_acc_10], acc[82:83], v[66:67], %[v_acc_10] \n" +" buffer_load_dwordx4 acc[208:211], %[v_os_b5], s[20:23], 0 offen \n" +_UK_MFMA_ " %[v_acc_10], acc[84:85], v[68:69], %[v_acc_10] \n" +_UK_MFMA_ " %[v_acc_10], acc[86:87], v[70:71], %[v_acc_10] \n" +_UK_MFMA_ " %[v_acc_10], acc[88:89], v[72:73], %[v_acc_10] \n" +_UK_MFMA_ " %[v_acc_10], acc[90:91], v[74:75], %[v_acc_10] \n" +" buffer_load_dwordx4 acc[212:215], %[v_os_b5], s[20:23], 0 offen offset:1024 \n" +_UK_MFMA_ " %[v_acc_10], acc[92:93], v[76:77], %[v_acc_10] \n" +_UK_MFMA_ " %[v_acc_10], acc[94:95], v[78:79], %[v_acc_10] \n" +_UK_MFMA_ " %[v_acc_11], acc[80:81], v[80:81], %[v_acc_11] \n" +_UK_MFMA_ " %[v_acc_11], acc[82:83], v[82:83], %[v_acc_11] \n" +" buffer_load_dwordx4 acc[216:219], %[v_os_b5], s[20:23], 0 offen offset:2048 \n" +_UK_MFMA_ " %[v_acc_11], acc[84:85], v[84:85], %[v_acc_11] \n" +_UK_MFMA_ " %[v_acc_11], acc[86:87], v[86:87], %[v_acc_11] \n" +_UK_MFMA_ " %[v_acc_11], acc[88:89], v[88:89], %[v_acc_11] \n" +_UK_MFMA_ " %[v_acc_11], acc[90:91], v[90:91], %[v_acc_11] \n" +" buffer_load_dwordx4 acc[220:223], %[v_os_b5], s[20:23], 0 offen offset:3072 \n" +_UK_MFMA_ " %[v_acc_11], acc[92:93], v[92:93], %[v_acc_11] \n" +_UK_MFMA_ " %[v_acc_11], acc[94:95], v[94:95], %[v_acc_11] \n" +" s_waitcnt vmcnt(32) \n" +_UK_MFMA_ " %[v_acc_12], acc[96:97], v[64:65], %[v_acc_12] \n" +_UK_MFMA_ " %[v_acc_12], acc[98:99], v[66:67], %[v_acc_12] \n" +" buffer_load_dwordx4 acc[224:227], %[v_os_b6], s[20:23], 0 offen \n" +_UK_MFMA_ " 
%[v_acc_12], acc[100:101], v[68:69], %[v_acc_12] \n" +_UK_MFMA_ " %[v_acc_12], acc[102:103], v[70:71], %[v_acc_12] \n" +_UK_MFMA_ " %[v_acc_12], acc[104:105], v[72:73], %[v_acc_12] \n" +_UK_MFMA_ " %[v_acc_12], acc[106:107], v[74:75], %[v_acc_12] \n" +" buffer_load_dwordx4 acc[228:231], %[v_os_b6], s[20:23], 0 offen offset:1024 \n" +_UK_MFMA_ " %[v_acc_12], acc[108:109], v[76:77], %[v_acc_12] \n" +_UK_MFMA_ " %[v_acc_12], acc[110:111], v[78:79], %[v_acc_12] \n" +_UK_MFMA_ " %[v_acc_13], acc[96:97], v[80:81], %[v_acc_13] \n" +_UK_MFMA_ " %[v_acc_13], acc[98:99], v[82:83], %[v_acc_13] \n" +" buffer_load_dwordx4 acc[232:235], %[v_os_b6], s[20:23], 0 offen offset:2048 \n" +_UK_MFMA_ " %[v_acc_13], acc[100:101], v[84:85], %[v_acc_13] \n" +_UK_MFMA_ " %[v_acc_13], acc[102:103], v[86:87], %[v_acc_13] \n" +_UK_MFMA_ " %[v_acc_13], acc[104:105], v[88:89], %[v_acc_13] \n" +_UK_MFMA_ " %[v_acc_13], acc[106:107], v[90:91], %[v_acc_13] \n" +" buffer_load_dwordx4 acc[236:239], %[v_os_b6], s[20:23], 0 offen offset:3072 \n" +_UK_MFMA_ " %[v_acc_13], acc[108:109], v[92:93], %[v_acc_13] \n" +_UK_MFMA_ " %[v_acc_13], acc[110:111], v[94:95], %[v_acc_13] \n" +_UK_MFMA_ " %[v_acc_14], acc[112:113], v[64:65], %[v_acc_14] \n" +_UK_MFMA_ " %[v_acc_14], acc[114:115], v[66:67], %[v_acc_14] \n" +" buffer_load_dwordx4 acc[240:243], %[v_os_b7], s[20:23], 0 offen \n" +_UK_MFMA_ " %[v_acc_14], acc[116:117], v[68:69], %[v_acc_14] \n" +_UK_MFMA_ " %[v_acc_14], acc[118:119], v[70:71], %[v_acc_14] \n" +_UK_MFMA_ " %[v_acc_14], acc[120:121], v[72:73], %[v_acc_14] \n" +_UK_MFMA_ " %[v_acc_14], acc[122:123], v[74:75], %[v_acc_14] \n" +" buffer_load_dwordx4 acc[244:247], %[v_os_b7], s[20:23], 0 offen offset:1024 \n" +_UK_MFMA_ " %[v_acc_14], acc[124:125], v[76:77], %[v_acc_14] \n" +_UK_MFMA_ " %[v_acc_14], acc[126:127], v[78:79], %[v_acc_14] \n" +_UK_MFMA_ " %[v_acc_15], acc[112:113], v[80:81], %[v_acc_15] \n" +_UK_MFMA_ " %[v_acc_15], acc[114:115], v[82:83], %[v_acc_15] \n" +" buffer_load_dwordx4 
acc[248:251], %[v_os_b7], s[20:23], 0 offen offset:2048 \n" +_UK_MFMA_ " %[v_acc_15], acc[116:117], v[84:85], %[v_acc_15] \n" +_UK_MFMA_ " %[v_acc_15], acc[118:119], v[86:87], %[v_acc_15] \n" +_UK_MFMA_ " %[v_acc_15], acc[120:121], v[88:89], %[v_acc_15] \n" +_UK_MFMA_ " %[v_acc_15], acc[122:123], v[90:91], %[v_acc_15] \n" +" buffer_load_dwordx4 acc[252:255], %[v_os_b7], s[20:23], 0 offen offset:3072\n" +_UK_MFMA_ " %[v_acc_15], acc[124:125], v[92:93], %[v_acc_15] \n" +_UK_MFMA_ " %[v_acc_15], acc[126:127], v[94:95], %[v_acc_15] \n" +" s_sub_i32 %[s_loop_cnt], %[s_loop_cnt], 1 \n" +" s_cmp_gt_i32 %[s_loop_cnt] 0 \n" +" s_cbranch_scc0 L_end%= \n" +" s_cmp_gt_i32 %[s_loop_cnt] 2 ; move a with cond \n" +" s_cselect_b32 s86, %[s_tile_os_a], 0 \n" +" s_add_u32 s16, s86, s16 \n" +" s_addc_u32 s17, 0, s17 \n" +" s_cmp_gt_i32 %[s_loop_cnt] 1 ; move b with cond \n" +" s_cselect_b32 s86, %[s_tile_os_b], 0 \n" +" s_add_u32 s20, s86, s20 \n" +" s_addc_u32 s21, 0, s21 \n" +" ;------------------------------------------ \n" +" s_waitcnt vmcnt(24) & lgkmcnt(0) \n" +" s_barrier \n" +_UK_MFMA_ " %[v_acc_0], acc[128:129], v[96:97], %[v_acc_0] \n" +_UK_MFMA_ " %[v_acc_0], acc[130:131], v[98:99], %[v_acc_0] \n" +" buffer_load_dwordx4 acc[0:3], %[v_os_b0], s[20:23], 0 offen \n" +_UK_MFMA_ " %[v_acc_0], acc[132:133], v[100:101], %[v_acc_0] \n" +_UK_MFMA_ " %[v_acc_0], acc[134:135], v[102:103], %[v_acc_0] \n" +" buffer_load_dword %[v_os_a0], s[16:19], 0 offen lds \n" +" s_add_u32 m0, %[s_size_per_issue], m0 \n" +_UK_MFMA_ " %[v_acc_0], acc[136:137], v[104:105], %[v_acc_0] \n" +_UK_MFMA_ " %[v_acc_0], acc[138:139], v[106:107], %[v_acc_0] \n" +" buffer_load_dwordx4 acc[4:7], %[v_os_b0], s[20:23], 0 offen offset:1024 \n" +_UK_MFMA_ " %[v_acc_0], acc[140:141], v[108:109], %[v_acc_0] \n" +_UK_MFMA_ " %[v_acc_0], acc[142:143], v[110:111], %[v_acc_0] \n" +" buffer_load_dword %[v_os_a1], s[16:19], 0 offen lds \n" +" s_add_u32 m0, %[s_size_per_issue], m0 \n" +_UK_MFMA_ " %[v_acc_1], acc[128:129], 
v[112:113], %[v_acc_1] \n" +_UK_MFMA_ " %[v_acc_1], acc[130:131], v[114:115], %[v_acc_1] \n" +" buffer_load_dwordx4 acc[8:11], %[v_os_b0], s[20:23], 0 offen offset:2048 \n" +_UK_MFMA_ " %[v_acc_1], acc[132:133], v[116:117], %[v_acc_1] \n" +_UK_MFMA_ " %[v_acc_1], acc[134:135], v[118:119], %[v_acc_1] \n" +" buffer_load_dword %[v_os_a2], s[16:19], 0 offen lds \n" +" s_add_u32 m0, %[s_size_per_issue], m0 \n" +_UK_MFMA_ " %[v_acc_1], acc[136:137], v[120:121], %[v_acc_1] \n" +_UK_MFMA_ " %[v_acc_1], acc[138:139], v[122:123], %[v_acc_1] \n" +" buffer_load_dwordx4 acc[12:15], %[v_os_b0], s[20:23], 0 offen offset:3072 \n" +_UK_MFMA_ " %[v_acc_1], acc[140:141], v[124:125], %[v_acc_1] \n" +_UK_MFMA_ " %[v_acc_1], acc[142:143], v[126:127], %[v_acc_1] \n" +" buffer_load_dword %[v_os_a3], s[16:19], 0 offen lds \n" +" s_add_u32 m0, %[s_size_per_issue], m0 \n" +_UK_MFMA_ " %[v_acc_2], acc[144:145], v[96:97], %[v_acc_2] \n" +_UK_MFMA_ " %[v_acc_2], acc[146:147], v[98:99], %[v_acc_2] \n" +" buffer_load_dwordx4 acc[16:19], %[v_os_b1], s[20:23], 0 offen \n" +_UK_MFMA_ " %[v_acc_2], acc[148:149], v[100:101], %[v_acc_2] \n" +_UK_MFMA_ " %[v_acc_2], acc[150:151], v[102:103], %[v_acc_2] \n" +" buffer_load_dword %[v_os_a4], s[16:19], 0 offen lds \n" +" s_add_u32 m0, %[s_size_per_issue], m0 \n" +_UK_MFMA_ " %[v_acc_2], acc[152:153], v[104:105], %[v_acc_2] \n" +_UK_MFMA_ " %[v_acc_2], acc[154:155], v[106:107], %[v_acc_2] \n" +" buffer_load_dwordx4 acc[20:23], %[v_os_b1], s[20:23], 0 offen offset:1024 \n" +_UK_MFMA_ " %[v_acc_2], acc[156:157], v[108:109], %[v_acc_2] \n" +_UK_MFMA_ " %[v_acc_2], acc[158:159], v[110:111], %[v_acc_2] \n" +" buffer_load_dword %[v_os_a5], s[16:19], 0 offen lds \n" +" s_add_u32 m0, %[s_size_per_issue], m0 \n" +_UK_MFMA_ " %[v_acc_3], acc[144:145], v[112:113], %[v_acc_3] \n" +_UK_MFMA_ " %[v_acc_3], acc[146:147], v[114:115], %[v_acc_3] \n" +" buffer_load_dwordx4 acc[24:27], %[v_os_b1], s[20:23], 0 offen offset:2048 \n" +_UK_MFMA_ " %[v_acc_3], acc[148:149], 
v[116:117], %[v_acc_3] \n" +_UK_MFMA_ " %[v_acc_3], acc[150:151], v[118:119], %[v_acc_3] \n" +" buffer_load_dword %[v_os_a6], s[16:19], 0 offen lds \n" +" s_add_u32 m0, %[s_size_per_issue], m0 \n" +_UK_MFMA_ " %[v_acc_3], acc[152:153], v[120:121], %[v_acc_3] \n" +_UK_MFMA_ " %[v_acc_3], acc[154:155], v[122:123], %[v_acc_3] \n" +" buffer_load_dwordx4 acc[28:31], %[v_os_b1], s[20:23], 0 offen offset:3072 \n" +_UK_MFMA_ " %[v_acc_3], acc[156:157], v[124:125], %[v_acc_3] \n" +_UK_MFMA_ " %[v_acc_3], acc[158:159], v[126:127], %[v_acc_3] \n" +" buffer_load_dword %[v_os_a7], s[16:19], 0 offen lds \n" +" s_add_u32 m0, 0, %[s_m0_init] \n" +" s_waitcnt vmcnt(32) \n" +_UK_MFMA_ " %[v_acc_4], acc[160:161], v[96:97], %[v_acc_4] \n" +_UK_MFMA_ " %[v_acc_4], acc[162:163], v[98:99], %[v_acc_4] \n" +" buffer_load_dwordx4 acc[32:35], %[v_os_b2], s[20:23], 0 offen \n" +_UK_MFMA_ " %[v_acc_4], acc[164:165], v[100:101], %[v_acc_4] \n" +_UK_MFMA_ " %[v_acc_4], acc[166:167], v[102:103], %[v_acc_4] \n" +" ds_read_b128 v[64:67], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_0] \n" +_UK_MFMA_ " %[v_acc_4], acc[168:169], v[104:105], %[v_acc_4] \n" +_UK_MFMA_ " %[v_acc_4], acc[170:171], v[106:107], %[v_acc_4] \n" +" buffer_load_dwordx4 acc[36:39], %[v_os_b2], s[20:23], 0 offen offset:1024 \n" +_UK_MFMA_ " %[v_acc_4], acc[172:173], v[108:109], %[v_acc_4] \n" +_UK_MFMA_ " %[v_acc_4], acc[174:175], v[110:111], %[v_acc_4] \n" +" ds_read_b128 v[68:71], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_1] \n" +_UK_MFMA_ " %[v_acc_5], acc[160:161], v[112:113], %[v_acc_5] \n" +_UK_MFMA_ " %[v_acc_5], acc[162:163], v[114:115], %[v_acc_5] \n" +" buffer_load_dwordx4 acc[40:43], %[v_os_b2], s[20:23], 0 offen offset:2048 \n" +_UK_MFMA_ " %[v_acc_5], acc[164:165], v[116:117], %[v_acc_5] \n" +_UK_MFMA_ " %[v_acc_5], acc[166:167], v[118:119], %[v_acc_5] \n" +" ds_read_b128 v[72:75], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_2] \n" +_UK_MFMA_ " %[v_acc_5], acc[168:169], v[120:121], %[v_acc_5] \n" +_UK_MFMA_ " 
%[v_acc_5], acc[170:171], v[122:123], %[v_acc_5] \n" +" buffer_load_dwordx4 acc[44:47], %[v_os_b2], s[20:23], 0 offen offset:3072 \n" +_UK_MFMA_ " %[v_acc_5], acc[172:173], v[124:125], %[v_acc_5] \n" +_UK_MFMA_ " %[v_acc_5], acc[174:175], v[126:127], %[v_acc_5] \n" +" ds_read_b128 v[76:79], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_3] \n" +_UK_MFMA_ " %[v_acc_6], acc[176:177], v[96:97], %[v_acc_6] \n" +_UK_MFMA_ " %[v_acc_6], acc[178:179], v[98:99], %[v_acc_6] \n" +" buffer_load_dwordx4 acc[48:51], %[v_os_b3], s[20:23], 0 offen \n" +_UK_MFMA_ " %[v_acc_6], acc[180:181], v[100:101], %[v_acc_6] \n" +_UK_MFMA_ " %[v_acc_6], acc[182:183], v[102:103], %[v_acc_6] \n" +" ds_read_b128 v[80:83], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_4] \n" +_UK_MFMA_ " %[v_acc_6], acc[184:185], v[104:105], %[v_acc_6] \n" +_UK_MFMA_ " %[v_acc_6], acc[186:187], v[106:107], %[v_acc_6] \n" +" buffer_load_dwordx4 acc[52:55], %[v_os_b3], s[20:23], 0 offen offset:1024 \n" +_UK_MFMA_ " %[v_acc_6], acc[188:189], v[108:109], %[v_acc_6] \n" +_UK_MFMA_ " %[v_acc_6], acc[190:191], v[110:111], %[v_acc_6] \n" +" ds_read_b128 v[84:87], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_5] \n" +_UK_MFMA_ " %[v_acc_7], acc[176:177], v[112:113], %[v_acc_7] \n" +_UK_MFMA_ " %[v_acc_7], acc[178:179], v[114:115], %[v_acc_7] \n" +" buffer_load_dwordx4 acc[56:59], %[v_os_b3], s[20:23], 0 offen offset:2048 \n" +_UK_MFMA_ " %[v_acc_7], acc[180:181], v[116:117], %[v_acc_7] \n" +_UK_MFMA_ " %[v_acc_7], acc[182:183], v[118:119], %[v_acc_7] \n" +" ds_read_b128 v[88:91], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_6] \n" +_UK_MFMA_ " %[v_acc_7], acc[184:185], v[120:121], %[v_acc_7] \n" +_UK_MFMA_ " %[v_acc_7], acc[186:187], v[122:123], %[v_acc_7] \n" +" buffer_load_dwordx4 acc[60:63], %[v_os_b3], s[20:23], 0 offen offset:3072 \n" +_UK_MFMA_ " %[v_acc_7], acc[188:189], v[124:125], %[v_acc_7] \n" +_UK_MFMA_ " %[v_acc_7], acc[190:191], v[126:127], %[v_acc_7] \n" +" ds_read_b128 v[92:95], %[v_os_slda] offset:0*%[smem_sz] + 
%[sld_os_7] \n" +" s_waitcnt vmcnt(32) \n" +_UK_MFMA_ " %[v_acc_8], acc[192:193], v[96:97], %[v_acc_8] \n" +_UK_MFMA_ " %[v_acc_8], acc[194:195], v[98:99], %[v_acc_8] \n" +" buffer_load_dwordx4 acc[64:67], %[v_os_b4], s[20:23], 0 offen \n" +_UK_MFMA_ " %[v_acc_8], acc[196:197], v[100:101], %[v_acc_8] \n" +_UK_MFMA_ " %[v_acc_8], acc[198:199], v[102:103], %[v_acc_8] \n" +_UK_MFMA_ " %[v_acc_8], acc[200:201], v[104:105], %[v_acc_8] \n" +_UK_MFMA_ " %[v_acc_8], acc[202:203], v[106:107], %[v_acc_8] \n" +" buffer_load_dwordx4 acc[68:71], %[v_os_b4], s[20:23], 0 offen offset:1024 \n" +_UK_MFMA_ " %[v_acc_8], acc[204:205], v[108:109], %[v_acc_8] \n" +_UK_MFMA_ " %[v_acc_8], acc[206:207], v[110:111], %[v_acc_8] \n" +_UK_MFMA_ " %[v_acc_9], acc[192:193], v[112:113], %[v_acc_9] \n" +_UK_MFMA_ " %[v_acc_9], acc[194:195], v[114:115], %[v_acc_9] \n" +" buffer_load_dwordx4 acc[72:75], %[v_os_b4], s[20:23], 0 offen offset:2048 \n" +_UK_MFMA_ " %[v_acc_9], acc[196:197], v[116:117], %[v_acc_9] \n" +_UK_MFMA_ " %[v_acc_9], acc[198:199], v[118:119], %[v_acc_9] \n" +_UK_MFMA_ " %[v_acc_9], acc[200:201], v[120:121], %[v_acc_9] \n" +_UK_MFMA_ " %[v_acc_9], acc[202:203], v[122:123], %[v_acc_9] \n" +" buffer_load_dwordx4 acc[76:79], %[v_os_b4], s[20:23], 0 offen offset:3072 \n" +_UK_MFMA_ " %[v_acc_9], acc[204:205], v[124:125], %[v_acc_9] \n" +_UK_MFMA_ " %[v_acc_9], acc[206:207], v[126:127], %[v_acc_9] \n" +_UK_MFMA_ " %[v_acc_10], acc[208:209], v[96:97], %[v_acc_10] \n" +_UK_MFMA_ " %[v_acc_10], acc[210:211], v[98:99], %[v_acc_10] \n" +" buffer_load_dwordx4 acc[80:83], %[v_os_b5], s[20:23], 0 offen \n" +_UK_MFMA_ " %[v_acc_10], acc[212:213], v[100:101], %[v_acc_10] \n" +_UK_MFMA_ " %[v_acc_10], acc[214:215], v[102:103], %[v_acc_10] \n" +_UK_MFMA_ " %[v_acc_10], acc[216:217], v[104:105], %[v_acc_10] \n" +_UK_MFMA_ " %[v_acc_10], acc[218:219], v[106:107], %[v_acc_10] \n" +" buffer_load_dwordx4 acc[84:87], %[v_os_b5], s[20:23], 0 offen offset:1024 \n" +_UK_MFMA_ " %[v_acc_10], 
acc[220:221], v[108:109], %[v_acc_10] \n" +_UK_MFMA_ " %[v_acc_10], acc[222:223], v[110:111], %[v_acc_10] \n" +_UK_MFMA_ " %[v_acc_11], acc[208:209], v[112:113], %[v_acc_11] \n" +_UK_MFMA_ " %[v_acc_11], acc[210:211], v[114:115], %[v_acc_11] \n" +" buffer_load_dwordx4 acc[88:91], %[v_os_b5], s[20:23], 0 offen offset:2048 \n" +_UK_MFMA_ " %[v_acc_11], acc[212:213], v[116:117], %[v_acc_11] \n" +_UK_MFMA_ " %[v_acc_11], acc[214:215], v[118:119], %[v_acc_11] \n" +_UK_MFMA_ " %[v_acc_11], acc[216:217], v[120:121], %[v_acc_11] \n" +_UK_MFMA_ " %[v_acc_11], acc[218:219], v[122:123], %[v_acc_11] \n" +" buffer_load_dwordx4 acc[92:95], %[v_os_b5], s[20:23], 0 offen offset:3072 \n" +_UK_MFMA_ " %[v_acc_11], acc[220:221], v[124:125], %[v_acc_11] \n" +_UK_MFMA_ " %[v_acc_11], acc[222:223], v[126:127], %[v_acc_11] \n" +" s_waitcnt vmcnt(32) \n" +_UK_MFMA_ " %[v_acc_12], acc[224:225], v[96:97], %[v_acc_12] \n" +_UK_MFMA_ " %[v_acc_12], acc[226:227], v[98:99], %[v_acc_12] \n" +" buffer_load_dwordx4 acc[96:99], %[v_os_b6], s[20:23], 0 offen \n" +_UK_MFMA_ " %[v_acc_12], acc[228:229], v[100:101], %[v_acc_12] \n" +_UK_MFMA_ " %[v_acc_12], acc[230:231], v[102:103], %[v_acc_12] \n" +_UK_MFMA_ " %[v_acc_12], acc[232:233], v[104:105], %[v_acc_12] \n" +_UK_MFMA_ " %[v_acc_12], acc[234:235], v[106:107], %[v_acc_12] \n" +" buffer_load_dwordx4 acc[100:103], %[v_os_b6], s[20:23], 0 offen offset:1024 \n" +_UK_MFMA_ " %[v_acc_12], acc[236:237], v[108:109], %[v_acc_12] \n" +_UK_MFMA_ " %[v_acc_12], acc[238:239], v[110:111], %[v_acc_12] \n" +_UK_MFMA_ " %[v_acc_13], acc[224:225], v[112:113], %[v_acc_13] \n" +_UK_MFMA_ " %[v_acc_13], acc[226:227], v[114:115], %[v_acc_13] \n" +" buffer_load_dwordx4 acc[104:107], %[v_os_b6], s[20:23], 0 offen offset:2048 \n" +_UK_MFMA_ " %[v_acc_13], acc[228:229], v[116:117], %[v_acc_13] \n" +_UK_MFMA_ " %[v_acc_13], acc[230:231], v[118:119], %[v_acc_13] \n" +_UK_MFMA_ " %[v_acc_13], acc[232:233], v[120:121], %[v_acc_13] \n" +_UK_MFMA_ " %[v_acc_13], acc[234:235], 
v[122:123], %[v_acc_13] \n" +" buffer_load_dwordx4 acc[108:111], %[v_os_b6], s[20:23], 0 offen offset:3072 \n" +_UK_MFMA_ " %[v_acc_13], acc[236:237], v[124:125], %[v_acc_13] \n" +_UK_MFMA_ " %[v_acc_13], acc[238:239], v[126:127], %[v_acc_13] \n" +_UK_MFMA_ " %[v_acc_14], acc[240:241], v[96:97], %[v_acc_14] \n" +_UK_MFMA_ " %[v_acc_14], acc[242:243], v[98:99], %[v_acc_14] \n" +" buffer_load_dwordx4 acc[112:115], %[v_os_b7], s[20:23], 0 offen \n" +_UK_MFMA_ " %[v_acc_14], acc[244:245], v[100:101], %[v_acc_14] \n" +_UK_MFMA_ " %[v_acc_14], acc[246:247], v[102:103], %[v_acc_14] \n" +_UK_MFMA_ " %[v_acc_14], acc[248:249], v[104:105], %[v_acc_14] \n" +_UK_MFMA_ " %[v_acc_14], acc[250:251], v[106:107], %[v_acc_14] \n" +" buffer_load_dwordx4 acc[116:119], %[v_os_b7], s[20:23], 0 offen offset:1024 \n" +_UK_MFMA_ " %[v_acc_14], acc[252:253], v[108:109], %[v_acc_14] \n" +_UK_MFMA_ " %[v_acc_14], acc[254:255], v[110:111], %[v_acc_14] \n" +_UK_MFMA_ " %[v_acc_15], acc[240:241], v[112:113], %[v_acc_15] \n" +_UK_MFMA_ " %[v_acc_15], acc[242:243], v[114:115], %[v_acc_15] \n" +" buffer_load_dwordx4 acc[120:123], %[v_os_b7], s[20:23], 0 offen offset:2048 \n" +_UK_MFMA_ " %[v_acc_15], acc[244:245], v[116:117], %[v_acc_15] \n" +_UK_MFMA_ " %[v_acc_15], acc[246:247], v[118:119], %[v_acc_15] \n" +_UK_MFMA_ " %[v_acc_15], acc[248:249], v[120:121], %[v_acc_15] \n" +_UK_MFMA_ " %[v_acc_15], acc[250:251], v[122:123], %[v_acc_15] \n" +" buffer_load_dwordx4 acc[124:127], %[v_os_b7], s[20:23], 0 offen offset:3072 \n" +_UK_MFMA_ " %[v_acc_15], acc[252:253], v[124:125], %[v_acc_15] \n" +_UK_MFMA_ " %[v_acc_15], acc[254:255], v[126:127], %[v_acc_15] \n" +" s_sub_i32 %[s_loop_cnt], %[s_loop_cnt], 1 \n" +" s_cmp_gt_i32 %[s_loop_cnt] 0 \n" +" s_cbranch_scc0 L_end%= \n" +" s_cmp_gt_i32 %[s_loop_cnt] 2 ; move a with cond \n" +" s_cselect_b32 s86, %[s_tile_os_a], 0 \n" +" s_add_u32 s16, s86, s16 \n" +" s_addc_u32 s17, 0, s17 \n" +" s_cmp_gt_i32 %[s_loop_cnt] 1 ; move b with cond \n" +" s_cselect_b32 
s86, %[s_tile_os_b], 0 \n" +" s_add_u32 s20, s86, s20 \n" +" s_addc_u32 s21, 0, s21 \n" +" s_branch L_start%= \n" +"L_end%=: \n" +" s_nop 2 \n" + +#undef _UK_MFMA_ diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_async.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_async.hpp index 10bb01168..173887513 100644 --- a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_async.hpp +++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_async.hpp @@ -331,7 +331,8 @@ struct BlockFmhaPipelineQRKSVSAsync Policy::template MakeVDramTileDistribution()); // prefetch K tile - async_load_tile_raw(k_lds_store(LdsSeq.at(number<0>{})), k_dram_window, k_oob_ck, k_pre_np); + async_load_tile_raw( + k_lds_store(LdsSeq.at(number<0>{})), k_dram_window, number<-1>{}, k_oob_ck, k_pre_np); move_tile_window(k_dram_window, {0, kK0}); __builtin_amdgcn_sched_barrier(0); @@ -355,6 +356,7 @@ struct BlockFmhaPipelineQRKSVSAsync static_for<0, k0_loops - 1, 1>{}([&](auto i_k0) { async_load_tile_raw(k_lds_store(number{})>{}), k_dram_window, + number<-1>{}, k_oob_ck, k_pre_np); if constexpr(i_k0 < k0_loops - 1) @@ -386,7 +388,7 @@ struct BlockFmhaPipelineQRKSVSAsync __builtin_amdgcn_s_barrier(); const auto bias_tile = load_tile(bias_dram_window); // load bias tile - auto v_buf = load_tile(v_dram_window, bool_constant{}); + auto v_buf = load_tile(v_dram_window, number<-1>{}, bool_constant{}); __builtin_amdgcn_sched_barrier(0); { // tail gemm_0(s_acc, @@ -514,7 +516,8 @@ struct BlockFmhaPipelineQRKSVSAsync move_tile_window( v_dram_window, {0, kK1}); // will have scratch if move this right after load_tile(v_dram)... 
- v_buf = load_tile(v_dram_window, bool_constant{}); // load next v_buf + v_buf = load_tile( + v_dram_window, number<-1>{}, bool_constant{}); // load next v_buf } __builtin_amdgcn_sched_barrier(0); @@ -618,7 +621,8 @@ struct BlockFmhaPipelineQRKSVSAsync static_for<0, k1_loops - 1, 1>{}([&](auto i_k1) { if constexpr(i_k1 != 0 && i_k1 < k1_loops - 1) { - v_buf = load_tile(v_dram_window, bool_constant{}); // load next v_buf + v_buf = load_tile( + v_dram_window, number<-1>{}, bool_constant{}); // load next v_buf } block_sync_lds(); gemm_1(o_acc, @@ -665,8 +669,11 @@ struct BlockFmhaPipelineQRKSVSAsync if constexpr(k1_loops >= 2 && LdsSeq.at(number<0>{}) == LdsSeq.at(number{})) __builtin_amdgcn_s_barrier(); - async_load_tile_raw( - k_lds_store(LdsSeq.at(number<0>{})), k_dram_window, k_oob_ck, k_pre_np); + async_load_tile_raw(k_lds_store(LdsSeq.at(number<0>{})), + k_dram_window, + number<-1>{}, + k_oob_ck, + k_pre_np); move_tile_window(k_dram_window, {0, kK0}); } // tail diff --git a/include/ck_tile/ops/fused_moe.hpp b/include/ck_tile/ops/fused_moe.hpp index b74607f06..d23af0af8 100644 --- a/include/ck_tile/ops/fused_moe.hpp +++ b/include/ck_tile/ops/fused_moe.hpp @@ -3,7 +3,15 @@ #pragma once +#include "ck_tile/ops/fused_moe/kernel/fused_moegemm_kernel.hpp" +#include "ck_tile/ops/fused_moe/kernel/fused_moegemm_shape.hpp" +#include "ck_tile/ops/fused_moe/kernel/fused_moegemm_tile_partitioner.hpp" #include "ck_tile/ops/fused_moe/kernel/moe_sorting_kernel.hpp" +#include "ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_ex.hpp" +#include "ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_policy.hpp" +#include "ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_uk.hpp" +#include "ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_problem.hpp" +#include "ck_tile/ops/fused_moe/pipeline/fused_moegemm_traits.hpp" #include "ck_tile/ops/fused_moe/pipeline/moe_sorting_pipeline.hpp" #include "ck_tile/ops/fused_moe/pipeline/moe_sorting_policy.hpp" 
#include "ck_tile/ops/fused_moe/pipeline/moe_sorting_problem.hpp" diff --git a/include/ck_tile/ops/fused_moe/kernel/fused_moegemm_kernel.hpp b/include/ck_tile/ops/fused_moe/kernel/fused_moegemm_kernel.hpp new file mode 100644 index 000000000..2d25d44f3 --- /dev/null +++ b/include/ck_tile/ops/fused_moe/kernel/fused_moegemm_kernel.hpp @@ -0,0 +1,421 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck_tile/core.hpp" +#include "ck_tile/ops/common.hpp" +#include "ck_tile/ops/elementwise.hpp" +#include +#include + +// clang-format off +// [indexing implementation-1] +// using M_a as constexpr block_size to partition all tokens into different slices +// each slice maps to one expert, and one expert can have multiple slices +// e.g. num_experts = 6, topk=3, M_a = 4, input_tokens = 5 +// before sort, topk_ids is : [[0, 3, 5], [2, 3, 5], [1, 3, 5], [1, 2, 3], [1, 3, 5]] +// tok-0 tok-1 tok-2 tok-3 tok-4 +// topk_weight is : [[a, b, c], [d, e, f], [g, h, i], [j, k, l], [m, n, o]] (some float number) +// +// token_id_per_expert is : [[0], [2, 3, 4], [1, 3], [0, 1, 2, 3, 4], [], [0, 1, 2, 4]] +// (only for reference) exp-0 exp-1 exp-2 exp-3 exp-4 exp-5 +// weight_id_per_expert is: [[a], [g, j, m], [d, k], [b, e, h, l, n], [], [c, f, i, o]] +// +// max_num_tokens_padded : topk * input_tokens + num_experts * (M_a - 1) +// * this could be larger than actual, since actual tokens are on GPU +// +// sorted_token_ids_ptr : [0, 6, 6, 6, 2, 3, 4, 6, 1, 3, 6, 6, 0, 1, 2, 3, 4, 6, 6, 6, 6, 6, 6, 6, 0, 1, 2, 4] +// |- exp-0 -|- exp-1 -|- exp-2 -|- exp-3 -|- exp-4 -|- exp-5 -| +// sorted_weight_ptr : [a, *, *, *, g, j, m, *, d, k, *, *, b, e, h, l, n, *, *, *, *, *, *, *, c, f, i, o] +// +// * length is max_num_tokens_padded, actual size is num_tokens_post_padded_ptr +// +// * Note on token_id_per_expert/sorted_token_ids_ptr data: +// currently we do not have topk information from the data of 
token_id_per_expert/sorted_token_ids_ptr. +// In some cases(like smooth-quant), we need topk information to index into tokens quant from +// different expert smooth quant. So we modify the number stored inside token_id_per_expert/sorted_token_ids_ptr +// +// 32bit 0........23 24.....31 bit +// (data) -> (token_id | topk_id) +// low 24 bits are for token id, top 8 bits are for topk id +// +// the input after smooth-quant is [token, topk, hidden_dim], originally it is [token, hidden_dim] +// the input scale for token is [topk, token, 1], the smooth-quant scale for first gemm is [expert, interm_dim] +// +// sorted_expert_ids_ptr : [0, 1, 2, 3, 3, 4, 5] +// * length is (max_num_tokens_padded + block_size - 1) / block_size +// +// num_tokens_post_padded_ptr : [28] +// num_sorted_tiles_ptr : [7] +// +// * different from vLLM +// 1) token_id stored in sorted_token_ids_ptr is actual token_id, not token_id*top_K expanded id +// 2) need sorted_weight_ptr +// 3) use num_sorted_tiles_ptr, already divided by M_a +// +// * below used for indexing +// 1) sorted_token_ids_ptr [max_num_tokens_padded] +// 2) sorted_weight_ptr +// 3) sorted_expert_ids_ptr +// 4) num_tokens_post_padded_ptr/num_sorted_tiles_ptr (select one) +// +// max_num_tokens_padded: topk_ids.numel() + num_experts * (block_size - 1) +// +// [indexing implementation-2] +// before sort, topk_ids is : [[0, 3, 5], [2, 3, 5], [1, 3, 5], [1, 2, 3], [1, 3, 5]] +// tok-0 tok-1 tok-2 tok-3 tok-4 +// topk_weight is : [[a, b, c], [d, e, f], [g, h, i], [j, k, l], [m, n, o]] (some float number) +// +// we generate original row/col id as +// topk_rc_ids : [[0, 5, A], [1, 6, B], [2, 7, C], [3, 8, D], [4, 9, E]] +// let x be one element of above, we can get: +// topk_row_id(token_id) = x % num_tokens(5) +// topk_col_id(expert_Id) = x / num_tokens +// topk_row_id/col_id can be used to access original topk_ids/topk_weight +// +// token_id_per_expert is : [[0], [2, 3, 4], [1, 3], [0, 1, 2, 3, 4], [], [0, 1, 2, 4]] +// (only for 
reference) exp-0 exp-1 exp-2 exp-3 exp-4 exp-5 +// weight_id_per_expert is: [[a], [g, j, m], [d, k], [b, e, h, l, n], [], [c, f, i, o]] +// +// we can get permuted_rc_ids: +// [[0], [2, 3, 4], [1, 8], [5, 6, 7, D, 9], [], [A, B, C, E]] +// +// +// clang-format on +// +namespace ck_tile { + +// m: num_tokens (or token*input-batch) +// k: hidden_size +// n: intermediate_size used between 2 FC (TP slice this) +// e: num_experts +// if doing pre-shuffle +// nr : n / Block_Nr +// kr : k / Block_Kr +// w : flattened 1d wave buffer +struct FusedMoeGemmHostArgs // host args; MakeKargs() bit_casts them to Kargs, keep layouts in sync +{ + const void* a_ptr; // [m, k], input token + const void* a_scale_ptr; // [m, 1], token scale + const void* g_ptr; // [e, n, k]/[e, 2*n, k], pre-shuffle([e, nr, kr, w]) + const void* d_ptr; // [e, n, k], pre-shuffle([e, nr, kr, w]) + const void* g_scale_ptr; // [e, 1, n], gate(up) scale + const void* d_scale_ptr; // [e, 1, k], down scale + const void* y_smooth_scale_ptr; // [e, 1, n], smooth-quant-scale for 2nd gemm input + void* o_ptr; // [m, k], output token + + const void* sorted_token_ids_ptr; // [max_num_tokens_padded] + const void* sorted_weight_ptr; // [max_num_tokens_padded] + const void* sorted_expert_ids_ptr; // [(max_num_tokens_padded + block_size - 1) / block_size] + const void* num_sorted_tiles_ptr; // [1] + + index_t hidden_size; // k + index_t intermediate_size; // n / TP, for Gate. if Gate+Up, Down need divide by 2 + index_t num_tokens; // input number of tokens for current iteration + index_t num_experts; // number of groups + index_t topk; // need this? 
+ + index_t stride_token; // for input/output, stride for each row, should be >= hidden_size +}; + +// This is scatter/gather b2b group-gemm +template +struct FusedMoeGemmKernel +{ + using Partitioner = remove_cvref_t; + using Pipeline = remove_cvref_t; + using Epilogue = remove_cvref_t; // TODO: not used + // static constexpr index_t kBlockPerCu = Pipeline::kBlockPerCu; + // static_assert(kBlockPerCu > 0); + + using BlockShape = typename Pipeline::BlockShape; // this is FusedMoeGemmShape + static constexpr index_t BlockSize_ = BlockShape::BlockSize; + + using ADataType = typename Pipeline::Problem::ADataType; + using GDataType = typename Pipeline::Problem::GDataType; + using DDataType = typename Pipeline::Problem::DDataType; + using AccDataType = typename Pipeline::Problem::AccDataType; + using ODataType = typename Pipeline::Problem::ODataType; + using AScaleDataType = typename Pipeline::Problem::AScaleDataType; + using GScaleDataType = typename Pipeline::Problem::GScaleDataType; + using DScaleDataType = typename Pipeline::Problem::DScaleDataType; + using YSmoothScaleDataType = typename Pipeline::Problem::YSmoothScaleDataType; + using TopkWeightDataType = typename Pipeline::Problem::TopkWeightDataType; + using IndexDataType = typename Pipeline::Problem::IndexDataType; + using YDataType = typename Pipeline::Problem::YDataType; + + using Traits = typename Pipeline::Problem::Traits; + static constexpr bool UseUK = true; + + static constexpr bool IsGateOnly = Traits::IsGateOnly; + static constexpr bool UseSmoothQuant = Traits::UseSmoothQuant; + static constexpr bool PadHiddenSize = Traits::PadHiddenSize; + static constexpr bool PadIntermediateSize = Traits::PadIntermediateSize; + + // clang-format off + template struct t2s; + template <> struct t2s { static constexpr const char * name = "fp32"; }; + template <> struct t2s { static constexpr const char * name = "fp16"; }; + template <> struct t2s { static constexpr const char * name = "bf16"; }; + template <> struct t2s { 
static constexpr const char * name = "fp8"; }; + template <> struct t2s { static constexpr const char * name = "bf8"; }; + template <> struct t2s { static constexpr const char * name = "int8"; }; + // clang-format on + + CK_TILE_HOST static std::string GetName() + { +#define _SS_ std::string +#define _TS_ std::to_string + // clang-format off + using S_ = BlockShape; + + auto prec_str = [&] () { + std::string base_str = _SS_(t2s::name); + if (!std::is_same_v) { + base_str += _SS_("_") + _SS_(t2s::name); + } + return base_str; + }(); + + return _SS_("fused_moe_") + _SS_(prec_str) + "_" + + _TS_(S_::Block_M0) + "x" + _TS_(S_::Block_N0) + "x" + _TS_(S_::Block_K0) + "x" + _TS_(S_::Block_N1) + "_" + + _TS_(S_::WarpPerBlock_M0) + "x" + _TS_(S_::WarpPerBlock_N0) + "x" + _TS_(S_::WarpPerBlock_K0) + "_" + + _TS_(S_::Warp_M0) + "x" + _TS_(S_::Warp_N0) + "x" + _TS_(S_::Warp_K0) + "_" + _SS_(Pipeline::name); +#undef _SS_ +#undef _TS_ + // clang-format on + } + + struct FusedMoeGemmKargs + { + const void* a_ptr; // [m, k], input token + const void* a_scale_ptr; // [m, 1], token scale + const void* g_ptr; // [e, n, k]/[e, 2*n, k], pre-shuffle([e, nr, kr, w]) + const void* d_ptr; // [e, n, k], pre-shuffle([e, nr, kr, w]) + const void* g_scale_ptr; // [e, 1, n], gate(up) scale + const void* d_scale_ptr; // [e, 1, k], down scale + const void* y_smooth_scale_ptr; // [e, 1, n], smooth-quant-scale for 2nd gemm input + void* o_ptr; // [m, k], output token + + const void* sorted_token_ids_ptr; + const void* sorted_weight_ptr; + const void* sorted_expert_ids_ptr; + const void* num_sorted_tiles_ptr; + + index_t hidden_size; // k + index_t intermediate_size; // n / TP, for Gate. if Gate+Up, Down need divide by 2 + index_t num_tokens; // input number of tokens for current iteration + index_t num_experts; // number of groups + index_t topk; // need this? 
+ + index_t stride_token; // for input/output, stride for each row, should >= hidden_size + }; + + // TODO: switch karg based on + using Kargs = FusedMoeGemmKargs; + using Hargs = FusedMoeGemmHostArgs; + + CK_TILE_HOST static constexpr Kargs MakeKargs(const Hargs& hargs) + { + // TODO: hargs/kargs not guranteed to be the same + return bit_cast(hargs); + } + + CK_TILE_HOST static constexpr auto GridSize(const Hargs& hargs) + { + constexpr index_t block_m = BlockShape::Block_M0; + int max_num_tokens_padded = + hargs.topk * hargs.num_tokens + hargs.num_experts * block_m - hargs.topk; + // printf("xxx max_num_tokens_padded:%d\n", max_num_tokens_padded); + return Partitioner::GridSize(max_num_tokens_padded, hargs.intermediate_size); + } + + CK_TILE_HOST static constexpr auto BlockSize() { return dim3(BlockSize_); } + + CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize() { return Pipeline::GetSmemSize(); } + + CK_TILE_DEVICE void operator()(Kargs kargs) const + { + if constexpr(UseUK) + { + __shared__ CK_TILE_LDS_ADDR ADataType smem[GetSmemSize()]; + IndexDataType num_sorted_tiles = __builtin_amdgcn_readfirstlane( + *reinterpret_cast(kargs.num_sorted_tiles_ptr)); + + num_sorted_tiles = num_sorted_tiles / BlockShape::Block_M0; + + const auto [sorted_tile_id, intermediate_tile_id] = + Partitioner{}(num_sorted_tiles, kargs.intermediate_size); + // if(threadIdx.x == 0) + // printf("bid:%d,%d, num_sorted_tiles:%d, sorted_tile_id:%d(%d), + // intermediate_tile_id:%d\n", static_cast(blockIdx.x), + // static_cast(blockIdx.y), num_sorted_tiles, sorted_tile_id, sorted_tile_id >= + // num_sorted_tiles? 
1 : 0, intermediate_tile_id); + if(sorted_tile_id >= num_sorted_tiles) + return; + + Pipeline{}(kargs, smem, sorted_tile_id, intermediate_tile_id); + } + else + { + // allocate LDS + // __shared__ char smem_ptr[GetSmemSize()]; + IndexDataType num_sorted_tiles = __builtin_amdgcn_readfirstlane( + *reinterpret_cast(kargs.num_sorted_tiles_ptr)); + constexpr index_t hidden_radio_0 = IsGateOnly ? 1 : 2; + + index_t nr_0 = kargs.intermediate_size / BlockShape::Block_Nr0; + index_t kr_0 = kargs.hidden_size / BlockShape::Block_Kr0; + index_t nr_1 = kargs.hidden_size / BlockShape::Block_Nr1; // should be same as kr_0 + index_t kr_1 = + kargs.intermediate_size / BlockShape::Block_Kr1; // should be same as nr_0 + + index_t expert_stride_0 = kargs.intermediate_size * hidden_radio_0 * kargs.hidden_size; + index_t expert_stride_1 = kargs.intermediate_size * kargs.hidden_size; + + __shared__ CK_TILE_LDS_ADDR ADataType smem[GetSmemSize()]; + + // note this is in unit of tile, need multiple tile size to get the index + const auto [sorted_tile_id, intermediate_tile_id] = + Partitioner{}(num_sorted_tiles, kargs.intermediate_size); + if(sorted_tile_id >= num_sorted_tiles) + return; + + const IndexDataType expert_id = + __builtin_amdgcn_readfirstlane(reinterpret_cast( + kargs.sorted_expert_ids_ptr)[sorted_tile_id]); + + // index along intermediate_size + // index_t hidden_idx = __builtin_amdgcn_readfirstlane(intermediate_tile_id * + // BlockShape::Block_N0); + index_t interm_idx_nr = + __builtin_amdgcn_readfirstlane(intermediate_tile_id * BlockShape::Block_Nr0); + + const auto a_coord = Pipeline::GetACoord(); // 2d thread offset, [i_row, i_col] + const auto sorted_token_id = + a_coord[number<0>{}] + sorted_tile_id * BlockShape::Block_M0; + + index_t token_id = + reinterpret_cast(kargs.sorted_token_ids_ptr)[sorted_token_id]; + auto topk_weight = reinterpret_cast( + kargs.sorted_weight_ptr)[sorted_token_id]; + + const auto a_window = [&]() { + // A is already pre-padded in previous kernel 
+ const ADataType* a_ptr = reinterpret_cast(kargs.a_ptr); + const auto a_view_ = make_naive_tensor_view( + a_ptr, + make_tuple(kargs.num_tokens, kargs.hidden_size), + make_tuple(kargs.stride_token, 1), + number{}, + number<1>{}); + + // gather is here use indexing transform + const auto a_gather_view_ = transform_tensor_view( + a_view_, + make_tuple(make_indexing_transform(kargs.num_tokens, token_id), + make_pass_through_transform(kargs.hidden_size)), + make_tuple(sequence<0>{}, sequence<1>{}), + make_tuple(sequence<0>{}, sequence<1>{})); + + const auto a_window_ = make_tile_window( + a_gather_view_, + make_tuple(number{}, number{}), + {0, 0}); + return a_window_; + }(); + + // TODO: gtile using NSub to have less register pressure + const auto g_window = [&]() { + const GDataType* g_ptr = reinterpret_cast(kargs.g_ptr) + + static_cast(expert_id) * expert_stride_0 + + interm_idx_nr * kr_0 * BlockShape::Block_W0; + const auto g_view_ = make_naive_tensor_view( + g_ptr, + make_tuple(nr_0, kr_0, number{}), + make_tuple(kr_0 * BlockShape::Block_W0, number{}, 1), + number{}, + number<1>{}); + const auto g_view_1_ = + pad_tensor_view(g_view_, + make_tuple(number{}, + number{}, + number{}), + sequence{}); + + const auto g_window_ = make_tile_window(g_view_1_, + make_tuple(number{}, + number{}, + number{}), + {0, 0, 0}); + return g_window_; + }(); + + const auto d_window = [&]() { + const DDataType* d_ptr = reinterpret_cast(kargs.d_ptr) + + static_cast(expert_id) * expert_stride_1 + + interm_idx_nr * BlockShape::Block_W1; + // note interm_idx_nr is along the gemm-k dim of 2nd gemm + + const auto d_view_ = make_naive_tensor_view( + d_ptr, + make_tuple(nr_1, kr_1, BlockShape::Block_W1), + make_tuple(kr_1 * BlockShape::Block_W1, BlockShape::Block_W1, 1), + number{}, + number<1>{}); + const auto d_view_1_ = + pad_tensor_view(d_view_, + make_tuple(number{}, + number{}, + number{}), + sequence{}); + + const auto d_window_ = make_tile_window(d_view_1_, + make_tuple(number{}, + 
number{}, + number{}), + {0, 0, 0}); + return d_window_; + }(); + + auto o_window = [&]() { + ODataType* o_ptr = reinterpret_cast(kargs.o_ptr); + auto o_view_ = make_naive_tensor_view( + o_ptr, + make_tuple(kargs.num_tokens, kargs.hidden_size), + make_tuple(kargs.stride_token, 1), + number{}, + number<1>{}); + + // gather is here + auto o_scatter_view_ = transform_tensor_view( + o_view_, + make_tuple(make_indexing_transform(kargs.num_tokens, token_id), + make_pass_through_transform(kargs.hidden_size)), + make_tuple(sequence<0>{}, sequence<1>{}), + make_tuple(sequence<0>{}, sequence<1>{})); + + auto o_window_ = make_tile_window( + o_scatter_view_, + make_tuple(number{}, number{}), + {0, 0}); + return o_window_; + }(); + + // do compute yeah + Pipeline{}(a_window, + g_window, + d_window, + o_window, + topk_weight, + smem, + kargs.hidden_size, + kargs.intermediate_size, + kargs.stride_token); + } + } +}; + +} // namespace ck_tile diff --git a/include/ck_tile/ops/fused_moe/kernel/fused_moegemm_shape.hpp b/include/ck_tile/ops/fused_moe/kernel/fused_moegemm_shape.hpp new file mode 100644 index 000000000..4f3f8bb7d --- /dev/null +++ b/include/ck_tile/ops/fused_moe/kernel/fused_moegemm_shape.hpp @@ -0,0 +1,125 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck_tile/core.hpp" + +namespace ck_tile { + +/* +tensors: +1. act (A): input feature map +2. gate (G): B matrix for first gemm, output will do activation(Silu) +3. up (U): B matrix for first gemm +4. 
down (D): B matrix for second gemm + N1 + / \ + +----------+ | + | Down | | + x----------x | + hidden hidden K1 | | | + N0 N0 x----------x | + | +------x-----x------+------x-----x------+ | | | + dim | | Gate | | | Up | | | | | | + contiguous | | | | | | | | | | | + | | | | | | | | | | | + v +------x-----x------+------x-----x------+ +----------+ V + K0 | | | | | contiguous + / \ v v v v | + +---------+ +------x-----x------+------x-----x------+ | +M0 | A | | | | | | | | | + +---------+ +------x-----x------+------x-----x------+ | + ----------> | | | + contiguous | V V + | x-----x +----------+ + +------------> M1 | Y | ---------> | Out(O) | + ACT x-----x +----------+ + K1 = N0 dim + +* Note: Act could be Gelu/Silu/... +* Note: some model does not have Up +*/ +template +struct FusedMoeGemmShape +{ + using BlockTile_0 = remove_cvref_t; + using WarpPerBlock_0 = remove_cvref_t; + using WarpTile_0 = remove_cvref_t; + using BlockTile_1 = remove_cvref_t; + using WarpPerBlock_1 = remove_cvref_t; + using WarpTile_1 = remove_cvref_t; + + static constexpr index_t NumWarps = + reduce_on_sequence(WarpPerBlock_0{}, multiplies{}, number<1>{}); + + // TODO: we don't support half warps aound to 1 warp here + static_assert(NumWarps == reduce_on_sequence(WarpPerBlock_1{}, multiplies{}, number<1>{})); + + static constexpr index_t Block_M0 = BlockTile_0::at(number<0>{}); + static constexpr index_t Block_N0 = BlockTile_0::at(number<1>{}); + static constexpr index_t Block_K0 = BlockTile_0::at(number<2>{}); + static constexpr index_t WarpPerBlock_M0 = WarpPerBlock_0::at(number<0>{}); + static constexpr index_t WarpPerBlock_N0 = WarpPerBlock_0::at(number<1>{}); + static constexpr index_t WarpPerBlock_K0 = WarpPerBlock_0::at(number<2>{}); + static constexpr index_t Warp_M0 = WarpTile_0::at(number<0>{}); + static constexpr index_t Warp_N0 = WarpTile_0::at(number<1>{}); + static constexpr index_t Warp_K0 = WarpTile_0::at(number<2>{}); + + static constexpr index_t ThreadPerBlock_M0 = Warp_M0 * 
WarpPerBlock_M0; + static constexpr index_t ThreadPerBlock_N0 = Warp_N0 * WarpPerBlock_N0; + static constexpr index_t ThreadPerBlock_K0 = Warp_K0 * WarpPerBlock_K0; + static_assert(Block_M0 % ThreadPerBlock_M0 == 0); + static_assert(Block_N0 % ThreadPerBlock_N0 == 0); + static_assert(Block_K0 % ThreadPerBlock_K0 == 0); + static constexpr index_t Repeat_M0 = Block_M0 / ThreadPerBlock_M0; + static constexpr index_t Repeat_N0 = Block_N0 / ThreadPerBlock_N0; + static constexpr index_t Repeat_K0 = Block_K0 / ThreadPerBlock_K0; + + static constexpr index_t Block_M1 = BlockTile_1::at(number<0>{}); + static constexpr index_t Block_N1 = BlockTile_1::at(number<1>{}); + static constexpr index_t Block_K1 = BlockTile_1::at(number<2>{}); + static constexpr index_t WarpPerBlock_M1 = WarpPerBlock_1::at(number<0>{}); + static constexpr index_t WarpPerBlock_N1 = WarpPerBlock_1::at(number<1>{}); + static constexpr index_t WarpPerBlock_K1 = WarpPerBlock_1::at(number<2>{}); + static constexpr index_t Warp_M1 = WarpTile_1::at(number<0>{}); + static constexpr index_t Warp_N1 = WarpTile_1::at(number<1>{}); + static constexpr index_t Warp_K1 = WarpTile_1::at(number<2>{}); + + static constexpr index_t ThreadPerBlock_M1 = Warp_M1 * WarpPerBlock_M1; + static constexpr index_t ThreadPerBlock_N1 = Warp_N1 * WarpPerBlock_N1; + static constexpr index_t ThreadPerBlock_K1 = Warp_K1 * WarpPerBlock_K1; + static_assert(Block_M1 % ThreadPerBlock_M1 == 0); + static_assert(Block_N1 % ThreadPerBlock_N1 == 0); + static_assert(Block_K1 % ThreadPerBlock_K1 == 0); + static constexpr index_t Repeat_M1 = Block_M1 / ThreadPerBlock_M1; + static constexpr index_t Repeat_N1 = Block_N1 / ThreadPerBlock_N1; + static constexpr index_t Repeat_K1 = Block_K1 / ThreadPerBlock_K1; + + static constexpr index_t BlockSize = warpSize * NumWarps; + + // some assert + static_assert(Block_M0 == Block_M1); + static_assert(Block_N0 == Block_K1 || (Block_N0 / 2) == Block_K1); // Gate Only or Gate+Up + + // pre-shuffle tile size 
compute (assume only for B matrix) + // we flatten the each wave tile to a 1d linear tensor(at model loading time) + // e.g. originally we have Block_N*Block_K tile size, after pre-shuffle + // we can have Block_Nr*Block_Kr*Block_W, where Block_W is Warp_N*Warp_K, + // and Block_Nr=Block_N/Warp_N, Block_Kr=Block_K/Warp_K + static constexpr index_t Block_W0 = Warp_N0 * Warp_K0; + static constexpr index_t Block_Nr0 = Block_N0 / Warp_N0; + static constexpr index_t Block_Kr0 = Block_K0 / Warp_K0; + static constexpr index_t Block_W1 = Warp_N1 * Warp_K1; + static constexpr index_t Block_Nr1 = Block_N1 / Warp_N1; + static constexpr index_t Block_Kr1 = Block_K1 / Warp_K1; + + static_assert(Block_W0 == Block_W1); + // static_assert(Block_Nr0 == Block_Kr1); +}; +} // namespace ck_tile diff --git a/include/ck_tile/ops/fused_moe/kernel/fused_moegemm_tile_partitioner.hpp b/include/ck_tile/ops/fused_moe/kernel/fused_moegemm_tile_partitioner.hpp new file mode 100644 index 000000000..381edb650 --- /dev/null +++ b/include/ck_tile/ops/fused_moe/kernel/fused_moegemm_tile_partitioner.hpp @@ -0,0 +1,33 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +namespace ck_tile { + +template +struct FusedMoeGemmTilePartitioner_Linear +{ + // FusedMoeGemmShape + using BlockShape = ck_tile::remove_cvref_t; + + static constexpr const char* name = "lin"; + + CK_TILE_DEVICE auto operator()(ck_tile::index_t /*num_sorted_tiles*/, + ck_tile::index_t /*intermediate_size*/) + { + index_t i_n = blockIdx.x; + index_t i_m = blockIdx.y; + + return ck_tile::make_tuple(i_m, i_n); + } + + CK_TILE_HOST static constexpr auto GridSize(index_t max_tokens, index_t intermediate_size) + { + // TODO: this may need tuning + index_t ms = ck_tile::integer_divide_ceil(max_tokens, BlockShape::Block_M0); + index_t ns = ck_tile::integer_divide_ceil(intermediate_size, BlockShape::Block_N0); + return dim3(ns, ms, 1); + } +}; +} // namespace ck_tile diff --git a/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_ex.hpp b/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_ex.hpp new file mode 100644 index 000000000..e9577e230 --- /dev/null +++ b/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_ex.hpp @@ -0,0 +1,651 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck_tile/core.hpp" +#include "ck_tile/ops/common/tensor_layout.hpp" +#include "ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_policy.hpp" + +namespace ck_tile { + +/* +This pipeline deal with a gemm(actually 2 gemm) with one very small(token), one very big(weight) +we need to design the pipeline such that all waves along gemm-N dim (gemm-m only 1 wave) + + <----- gemm-N ------> + +----+----+----+----+ + | w0 | w1 | w2 | w3 | gemm-m + +----+----+----+----+ +*/ +template +struct FusedMoeGemmPipeline_FlatmmEx +{ + using Problem = remove_cvref_t; + using Policy = remove_cvref_t; + + using BlockShape = typename Problem::BlockShape; // this is FusedMoeGemmShape + + using ADataType = typename Problem::ADataType; + using GDataType = typename Problem::GDataType; + using DDataType = typename Problem::DDataType; + using AccDataType = typename Problem::AccDataType; + using ODataType = typename Problem::ODataType; + using AScaleDataType = typename Problem::AScaleDataType; + using GScaleDataType = typename Problem::GScaleDataType; + using DScaleDataType = typename Problem::DScaleDataType; + using YSmoothScaleDataType = typename Problem::YSmoothScaleDataType; + using TopkWeightDataType = typename Problem::TopkWeightDataType; + using IndexDataType = typename Problem::IndexDataType; + using YDataType = typename Problem::YDataType; + + using Traits = typename Problem::Traits; + + static constexpr bool IsGateOnly = Traits::IsGateOnly; + static constexpr bool UseSmoothQuant = Traits::UseSmoothQuant; + static constexpr bool PadHiddenSize = Traits::PadHiddenSize; + static constexpr bool PadIntermediateSize = Traits::PadIntermediateSize; + + static constexpr index_t kAlignmentA = Policy::template GetAlignment_A(); + static constexpr index_t kAlignmentG = Policy::template GetAlignment_G(); + static constexpr index_t kAlignmentD = Policy::template GetAlignment_D(); + static constexpr index_t kAlignmentO = Policy::template GetAlignment_O(); + + 
static constexpr index_t SLD_A = static_cast(FusedMoeGemmPipelineSequencerEnum::SLD_A); + static constexpr index_t GLD_A = static_cast(FusedMoeGemmPipelineSequencerEnum::GLD_A); + static constexpr index_t GLD_B = static_cast(FusedMoeGemmPipelineSequencerEnum::GLD_B); + static constexpr index_t GST_O = static_cast(FusedMoeGemmPipelineSequencerEnum::GST_O); + + static constexpr index_t kBlockPerCu = []() { + if constexpr(Problem::kBlockPerCu != -1) + return Problem::kBlockPerCu; + else + { + // minimize occupancy + return 2; + } + }(); + + static constexpr const char* name = "fused_moe_flatmm"; + + // TODO: there are multiple buffers + CK_TILE_HOST_DEVICE static constexpr ck_tile::index_t GetSmemSize_A() + { + return Policy::template GetSmemSize_A(); + } + + CK_TILE_HOST_DEVICE static constexpr ck_tile::index_t GetSmemSize() + { + return Policy::template GetSmemSize(); + } + + // this is the thread-offset along row/col + CK_TILE_HOST_DEVICE static auto GetACoord() + { + constexpr auto a_dist = Policy::template MakeGlobalTileDistribution_A(); + const auto a_coord = a_dist.calculate_index(); + return a_coord; + } + + // this is the thread-offset along row/col + CK_TILE_HOST_DEVICE static auto GetOCoord() + { + constexpr auto o_dist = Policy::template MakeOGlobalTileDistribution(); + const auto o_coord = o_dist.calculate_index(); + return o_coord; + } + + template + CK_TILE_DEVICE auto operator()(const AWindow& a_window_, + const GWindow& g_window_, + const DWindow& d_window_, + OWindow& o_window_, + TopkWeightDataType /*topk_weight*/, + CK_TILE_LDS_ADDR void* smem, + index_t hidden_size, + index_t intermediate_size) + { + _Pragma("clang diagnostic push") _Pragma("clang diagnostic ignored \"-Wc++20-extensions\""); + constexpr auto NEG1 = number<-1>{}; + constexpr auto I0 = number<0>{}; + constexpr auto I1 = number<1>{}; + constexpr auto TRUE = bool_constant{}; + constexpr auto FALSE = bool_constant{}; + + CK_TILE_LDS_ADDR ADataType* smem_0 = reinterpret_cast(smem); + 
CK_TILE_LDS_ADDR ADataType* smem_1 = reinterpret_cast( + reinterpret_cast(smem) + + Policy::template GetSmemSize_A()); + + auto g_view = g_window_.get_bottom_tensor_view(); + + auto u_view = [&]() { + if constexpr(IsGateOnly) + { + return g_view; + } + else + { + index_t nr_0 = intermediate_size / BlockShape::Block_Nr0; + index_t kr_0 = hidden_size / BlockShape::Block_Kr0; + + const GDataType* g_ptr = + g_window_.get_bottom_tensor_view().get_buffer_view().p_data_; + const GDataType* u_ptr = g_ptr + (nr_0 / 2) * kr_0 * number{}; + + const auto u_view_ = make_naive_tensor_view( + u_ptr, + make_tuple(nr_0, kr_0, number{}), + make_tuple(kr_0 * BlockShape::Block_W0, number{}, 1), + number{}, + number<1>{}); + const auto u_view_1_ = + pad_tensor_view(u_view_, + make_tuple(number{}, + number{}, + number{}), + sequence{}); + return u_view_1_; + } + }(); + + auto a_win = make_tile_window_linear( + a_window_, Policy::template MakeGlobalTileDistribution_A()); + auto g_win = + make_tile_window_linear(g_window_, + Policy::template MakeGlobalTileDistribution_G(), + sequence<0, 1, 1>{}); + auto d_win = + make_tile_window_linear(d_window_, + Policy::template MakeGlobalTileDistribution_D(), + sequence<0, 1, 1>{}); + auto o_win = make_tile_window_linear( + o_window_, Policy::template MakeGlobalTileDistribution_O()); + + using g_thread_type = decltype(load_tile(g_win)); + using d_thread_type = decltype(load_tile(d_win)); + + using WarpGemm0 = decltype(Policy::template GetWarpGemm0()); + using WarpGemm1 = decltype(Policy::template GetWarpGemm1()); + auto warp_gemm_0 = WarpGemm0{}; + auto warp_gemm_1 = WarpGemm1{}; + + // issues_warps_lanes + auto a_sst_win0 = + make_tile_window(make_tensor_view( + smem_0, Policy::template MakeLdsStoreDesc_A()), + Policy::template MakeLdsStoreDesc_A().get_lengths(), + {0, 0, 0}); + + auto a_sst_win1 = + make_tile_window(make_tensor_view( + smem_1, Policy::template MakeLdsStoreDesc_A()), + Policy::template MakeLdsStoreDesc_A().get_lengths(), + {0, 0, 
0}); + // m*k + auto a_sld_win0 = [&]() { + using WG = WarpGemm0; + constexpr auto a_outer_dstr_enc = tile_distribution_encoding< + sequence<>, + tuple, + sequence>, + tuple>, + tuple>, + sequence<1, 2>, + sequence<0, 0>>{}; + constexpr auto a_block_dstr_encode = detail::make_embed_tile_distribution_encoding( + a_outer_dstr_enc, typename WG::AWarpDstrEncoding{}); + return make_tile_window_linear( + make_tensor_view( + smem_0, Policy::template MakeLdsLoadDesc_A()), + Policy::template MakeLdsLoadDesc_A().get_lengths(), + {0, 0}, + make_static_tile_distribution(a_block_dstr_encode)); + }(); + + // m*k + auto a_sld_win1 = [&]() { + using WG = WarpGemm0; + constexpr auto a_outer_dstr_enc = tile_distribution_encoding< + sequence<>, + tuple, + sequence>, + tuple>, + tuple>, + sequence<1, 2>, + sequence<0, 0>>{}; + constexpr auto a_block_dstr_encode = detail::make_embed_tile_distribution_encoding( + a_outer_dstr_enc, typename WG::AWarpDstrEncoding{}); + return make_tile_window_linear( + make_tensor_view( + smem_1, Policy::template MakeLdsLoadDesc_A()), + Policy::template MakeLdsLoadDesc_A().get_lengths(), + {0, 0}, + make_static_tile_distribution(a_block_dstr_encode)); + }(); + + auto bridge_sst_win = [&]() { + return make_tile_window( + make_tensor_view( + reinterpret_cast(smem), + Policy::template MakeBridgeLdsStoreDesc()), + Policy::template MakeBridgeLdsStoreDesc().get_lengths(), + {0, 0}); + }(); + + auto bridge_sld_win = [&]() { + return make_tile_window_linear( + make_tensor_view( + reinterpret_cast(smem), + Policy::template MakeBridgeLdsLoadDesc()), + Policy::template MakeBridgeLdsLoadDesc().get_lengths(), + {0, 0}, + Policy::template MakeYTileDistribution()); + }(); + + // also OK with C array, 2 register buffer + statically_indexed_array gs; + + constexpr auto issues_a = number{}; + constexpr auto issues_g = number{}; + // constexpr auto issues_d = number{}; + // constexpr auto issues_o = number{}; + constexpr auto issues_gemm0 = + number{}; + constexpr auto 
issues_gemm1 = + number{}; + // constexpr auto issues_sld_a = number{}; + + const index_t num_blocks_k0 = + (hidden_size + BlockShape::Block_K0 - 1) / BlockShape::Block_K0; + const index_t num_blocks_n1 = + (hidden_size + BlockShape::Block_N1 - 1) / BlockShape::Block_N1; + + using a_thread_type = decltype(load_tile(a_sld_win0)); + statically_indexed_array as; + + auto gld_a = [&]>( + auto& a_store_, auto i_access, PreNop = {}) + { + async_load_tile_raw(a_store_, a_win, i_access, PreNop{}); + }; + auto move_a = [&]() { + move_tile_window(a_win, {number<0>{}, number{}}); + }; + auto sld_a = [&](auto& a_, auto& win_, auto i_access) { + load_tile_raw(a_, win_, i_access); + }; + + auto gld_g = [&]>( + auto& g_, auto i_access, PreNop = {}) + { + if constexpr(IsGateOnly) + { + // TODO: hack! + if constexpr(i_access.value == 0) + { + g_win.bottom_tensor_view_ = g_view; + } + else if constexpr(i_access.value == issues_g / 2) + { + g_win.bottom_tensor_view_ = u_view; + } + } + load_tile_raw(g_, g_win, i_access, FALSE, PreNop{}); + }; + auto move_g = [&]() { + move_tile_window(g_win, {number<0>{}, number{}, number<0>{}}); + }; + statically_indexed_array ds; + + auto gld_d = [&]>( + auto& d_, auto i_access, PreNop = {}) + { + load_tile_raw(d_, d_win, i_access, FALSE, PreNop{}); + }; + auto move_d = [&]() { + // d move along gemm-n + move_tile_window(d_win, {number{}, number<0>{}}); + }; + + auto atomic_add_o = [&]>( + auto& o_, auto i_access, PreNop = {}) + { + update_tile_raw(o_win, o_, i_access, TRUE, PreNop{}); + }; + + auto acc_0 = Policy::template MakeCBlockTile_Gemm0(); + auto acc_1s = generate_tuple( + [&](auto) { return Policy::template MakeCBlockTile_Gemm1(); }, number<2>{}); + + // clang-format off + auto gemm_0 = [&]> + (auto& t_c, auto& t_a, auto& t_b, auto i_access, PostNop = {}) { + using WarpGemm = remove_cvref_t; + + constexpr auto repeat_sub = WarpGemm::get_num_of_access(); + constexpr auto repeat_m = BlockShape::Repeat_M0; + // constexpr auto repeat_n = 
BlockShape::Repeat_N0; + constexpr auto repeat_k = BlockShape::Repeat_K0; + // loop order n->m->k + constexpr auto i_sub = i_access % repeat_sub; + constexpr auto i_k = (i_access / repeat_sub) % repeat_k; + constexpr auto i_m = (i_access / (repeat_sub * repeat_k )) % repeat_m; + constexpr auto i_n = (i_access / (repeat_sub * repeat_k )) / repeat_m; + + using AWarpTensor = typename WarpGemm::AWarpTensor; + using BWarpTensor = typename WarpGemm::BWarpTensor; + using CWarpTensor = typename WarpGemm::CWarpTensor; + using AWarpDstr = typename WarpGemm::AWarpDstr; + using BWarpDstr = typename WarpGemm::BWarpDstr; + using CWarpDstr = typename WarpGemm::CWarpDstr; + + constexpr auto a_warp_y_index_zeros = uniform_sequence_gen_t{}; + constexpr auto b_warp_y_index_zeros = uniform_sequence_gen_t{}; + constexpr auto c_warp_y_index_zeros = uniform_sequence_gen_t{}; + + constexpr auto a_warp_y_lengths = to_sequence(AWarpDstr{}.get_ys_to_d_descriptor().get_lengths()); + constexpr auto b_warp_y_lengths = to_sequence(BWarpDstr{}.get_ys_to_d_descriptor().get_lengths()); + constexpr auto c_warp_y_lengths = to_sequence(CWarpDstr{}.get_ys_to_d_descriptor().get_lengths()); + + AWarpTensor w_a; + w_a.get_thread_buffer() = t_a.get_y_sliced_thread_data( + merge_sequences(sequence{}, a_warp_y_index_zeros), + merge_sequences(sequence<1, 1>{}, a_warp_y_lengths)); + + BWarpTensor w_b; + w_b.get_thread_buffer() = t_b.get_y_sliced_thread_data( + merge_sequences(sequence{}, b_warp_y_index_zeros), + merge_sequences(sequence<1, 1>{}, b_warp_y_lengths)); + + CWarpTensor w_c; + w_c.get_thread_buffer() = t_c.get_y_sliced_thread_data( + merge_sequences(sequence{}, c_warp_y_index_zeros), + merge_sequences(sequence<1, 1>{}, c_warp_y_lengths)); + + warp_gemm_0(w_c, w_a, w_b, number{}, PostNop{}); + + t_c.set_y_sliced_thread_data( + merge_sequences(sequence{}, c_warp_y_index_zeros), + merge_sequences(sequence<1, 1>{}, c_warp_y_lengths), + w_c.get_thread_buffer()); + }; + // clang-format on + + // 
clang-format off + auto gemm_1 = [&]> + (auto& t_c, auto& t_a, auto& t_b, auto i_access, PostNop = {}) { + using WarpGemm = remove_cvref_t; + + constexpr auto repeat_sub = WarpGemm::get_num_of_access(); + constexpr auto repeat_m = BlockShape::Repeat_M0; + // constexpr auto repeat_n = BlockShape::Repeat_N0; + constexpr auto repeat_k = BlockShape::Repeat_K0; + // loop order n->m->k + constexpr auto i_sub = i_access % repeat_sub; + constexpr auto i_k = (i_access / repeat_sub) % repeat_k; + constexpr auto i_m = (i_access / (repeat_sub * repeat_k )) % repeat_m; + constexpr auto i_n = (i_access / (repeat_sub * repeat_k )) / repeat_m; + + using AWarpTensor = typename WarpGemm::AWarpTensor; + using BWarpTensor = typename WarpGemm::BWarpTensor; + using CWarpTensor = typename WarpGemm::CWarpTensor; + using AWarpDstr = typename WarpGemm::AWarpDstr; + using BWarpDstr = typename WarpGemm::BWarpDstr; + using CWarpDstr = typename WarpGemm::CWarpDstr; + + constexpr auto a_warp_y_index_zeros = uniform_sequence_gen_t{}; + constexpr auto b_warp_y_index_zeros = uniform_sequence_gen_t{}; + constexpr auto c_warp_y_index_zeros = uniform_sequence_gen_t{}; + + constexpr auto a_warp_y_lengths = to_sequence(AWarpDstr{}.get_ys_to_d_descriptor().get_lengths()); + constexpr auto b_warp_y_lengths = to_sequence(BWarpDstr{}.get_ys_to_d_descriptor().get_lengths()); + constexpr auto c_warp_y_lengths = to_sequence(CWarpDstr{}.get_ys_to_d_descriptor().get_lengths()); + + AWarpTensor w_a; + w_a.get_thread_buffer() = t_a.get_y_sliced_thread_data( + merge_sequences(sequence{}, a_warp_y_index_zeros), + merge_sequences(sequence<1, 1>{}, a_warp_y_lengths)); + + BWarpTensor w_b; + w_b.get_thread_buffer() = t_b.get_y_sliced_thread_data( + merge_sequences(sequence{}, b_warp_y_index_zeros), + merge_sequences(sequence<1, 1>{}, b_warp_y_lengths)); + + CWarpTensor w_c; + w_c.get_thread_buffer() = t_c.get_y_sliced_thread_data( + merge_sequences(sequence{}, c_warp_y_index_zeros), + merge_sequences(sequence<1, 1>{}, 
c_warp_y_lengths)); + + warp_gemm_1(w_c, w_a, w_b, number{}, PostNop{}); + + t_c.set_y_sliced_thread_data( + merge_sequences(sequence{}, c_warp_y_index_zeros), + merge_sequences(sequence<1, 1>{}, c_warp_y_lengths), + w_c.get_thread_buffer()); + }; + // clang-format on + _Pragma("clang diagnostic pop"); + + // this gemm pipeline is designed with assumption that issues of buffer-load/ds_read can + // be hide under mfma. In other words, issues of mfma is >= memory this is true if we + // pre-shuffle B matrix, and A matrix is relatively small we prefer use multiple mfma + // paired with 1 buffer-load B matrix, to get max throughput of buffer_load. and by + // preshuffle, we always pack to dwordx4 load, and this will already extend to multiple + // mfma but that is already consumed inside warpgemm-impl. So indeed how many extra + // mfma(that can reuse the B matrix) only affected by M repeat. + auto pipeline_gemm0 = [&]() { + constexpr index_t total_loops = issues_gemm0; + constexpr auto sr = Policy::template GetSequencer_0(); + static_assert(sr.size() == total_loops); + + constexpr auto c_sld_a_0 = MAKE_SC(); + constexpr auto c_gld_a_0 = MAKE_SC(); + constexpr auto c_gld_b_0 = MAKE_SC(); + // compute buffer 1 + static_for<0, total_loops, 1>{}([&](auto i_issue) { + gemm_0(acc_0, as[I0], gs[I0], i_issue); + constexpr index_t slot = sr.at(i_issue); + + if constexpr(slot & SLD_A) + sld_a(as[I1], a_sld_win1, number{}); + if constexpr(slot & GLD_A) + gld_a(a_sst_win0, number{}); + if constexpr(slot & GLD_B) + gld_g(gs[I0], number{}); + }); + move_g(); + move_a(); + block_sync_load_raw(issues_a + issues_g); + lds_load_fence(); + + constexpr auto c_sld_a_1 = MAKE_SC(); + constexpr auto c_gld_a_1 = MAKE_SC(); + constexpr auto c_gld_b_1 = MAKE_SC(); + + // compute buffer 1 + static_for<0, total_loops, 1>{}([&](auto i_issue) { + gemm_0(acc_0, as[I1], gs[I1], i_issue); + constexpr index_t slot = sr.at(i_issue); + + if constexpr(slot & SLD_A) + sld_a(as[I0], a_sld_win0, number{}); 
+ if constexpr(slot & GLD_A) + gld_a(a_sst_win1, number{}); + if constexpr(slot & GLD_B) + gld_g(gs[I1], number{}); + }); + move_g(); + move_a(); + block_sync_load_raw(issues_a + issues_g); + lds_load_fence(); + }; + + auto pipeline_gemm0_tail = [&]() { + constexpr index_t total_loops = issues_gemm0; + constexpr auto sr = Policy::template GetSequencer_0(); + static_assert(sr.size() == total_loops); + + constexpr auto c_gld_b_0 = MAKE_SC(); + + // compute buffer 0 + static_for<0, total_loops, 1>{}([&](auto i_issue) { + gemm_0(acc_0, as[I0], gs[I0], i_issue); + constexpr index_t slot = sr.at(i_issue); + + if constexpr(slot & GLD_B) + gld_g(gs[I1], number{}); + }); + + block_sync_load_raw(issues_g); + sld_a(as[I1], a_sld_win1, NEG1); + + // compute buffer 1 + static_for<0, total_loops, 1>{}([&](auto i_issue) { + constexpr auto last_nop = [&]() { + if constexpr(i_issue == (total_loops - 1)) + return TRUE; + else + return FALSE; + }(); + gemm_0(acc_0, as[I1], gs[I1], i_issue, last_nop); // last gemm has nop + }); + }; + + auto y = Policy::template MakeYBlockTile(); + + auto pipeline_bridge = [&]() { + // cast to Y data + auto y_pre = cast_tile(acc_0); + store_tile(bridge_sst_win, y_pre); + clear_tile(acc_1s(I0)); + // wave_barrier(); + load_tile(y, bridge_sld_win); + clear_tile(acc_1s(I1)); + }; + + // note, gemm-1 start from idx-1 to N-2 (0, 1, 2....N-1) + auto pipeline_gemm1 = [&]() { + constexpr index_t total_loops = issues_gemm1; + constexpr auto sr = Policy::template GetSequencer_1(); + static_assert(sr.size() == total_loops); + + constexpr auto c_gld_b_0 = MAKE_SC(); + constexpr auto c_gst_o_0 = MAKE_SC(); + constexpr auto c_gld_b_1 = MAKE_SC(); + constexpr auto c_gst_o_1 = MAKE_SC(); + + // compute buffer 0 + static_for<0, total_loops, 1>{}([&](auto i_issue) { + gemm_1(acc_1s[I1], y, ds[I1], i_issue); + constexpr index_t slot = sr.at(i_issue); + if constexpr(slot & GLD_B) + gld_d(ds[I0], number{}); + + if constexpr(slot & GST_O) + { + auto out = 
cast_tile(acc_1s[I0]); + atomic_add_o(out, number{}); + } + }); + move_d(); + // move_o(); + + // compute buffer 1 + static_for<0, total_loops, 1>{}([&](auto i_issue) { + gemm_1(acc_1s[I0], y, ds[I0], i_issue); + constexpr index_t slot = sr.at(i_issue); + if constexpr(slot & GLD_B) + gld_d(ds[I1], number{}); + + if constexpr(slot & GST_O) + { + auto out = cast_tile(acc_1s[I1]); + atomic_add_o(out, number{}); + } + }); + move_d(); + }; + + auto pipeline_gemm1_head = [&]() { + constexpr index_t total_loops = issues_gemm1; + constexpr auto sr = Policy::template GetSequencer_1(); + static_assert(sr.size() == total_loops); + + constexpr auto c_gld_b_0 = MAKE_SC(); + + // compute buffer 0 + static_for<0, total_loops, 1>{}([&](auto i_issue) { + gemm_1(acc_1s[I0], y, ds[I0], i_issue); + constexpr index_t slot = sr.at(i_issue); + if constexpr(slot & GLD_B) + gld_d(ds[I1], number{}); + }); + move_d(); + }; + auto pipeline_gemm1_tail = [&]() { + constexpr index_t total_loops = issues_gemm1; + constexpr auto sr = Policy::template GetSequencer_1(); + static_assert(sr.size() == total_loops); + + constexpr auto c_gst_o_0 = MAKE_SC(); + + // compute buffer 1 + static_for<0, total_loops, 1>{}([&](auto i_issue) { + gemm_1(acc_1s[I1], y, ds[I1], i_issue); + + constexpr index_t slot = sr.at(i_issue); + if constexpr(slot & GST_O) + { + auto out = cast_tile(acc_1s[I0]); + atomic_add_o(out, number{}); + } + }); + { + auto out = cast_tile(acc_1s[I1]); + atomic_add_o(out, NEG1); + } + }; + + // start of pipeline + // clang-format off + gld_a(a_sst_win0, NEG1, TRUE); + gld_g(gs[I0], NEG1, TRUE); + move_a(); + move_g(); + clear_tile(acc_0); + + // preload for next round + gld_a(a_sst_win1, NEG1); + gld_g(gs[I1], NEG1); + + // make sure a,g loaded + block_sync_load_raw(issues_a + issues_g); + lds_load_fence(); + + // we manually unroll double buffer inside hot loop + const index_t iters_0 = (num_blocks_k0 - 2) / 2; + index_t i_0 = 0; // (void)i_0; (void)iters_0; (void)pipeline_gemm0; + 
while(i_0++ < iters_0) + { + pipeline_gemm0(); + } + pipeline_gemm0_tail(); + + pipeline_bridge(); + + const index_t iters_1 = (num_blocks_n1 - 2) / 2; + index_t i_1 = 0; // (void) i_1; (void)iters_1; (void)pipeline_gemm1; + pipeline_gemm1_head(); + while(i_1++ < iters_1) + { + pipeline_gemm1(); + } + pipeline_gemm1_tail(); + // clang-format on + } +}; + +} // namespace ck_tile diff --git a/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_policy.hpp b/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_policy.hpp new file mode 100644 index 000000000..fea30f029 --- /dev/null +++ b/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_policy.hpp @@ -0,0 +1,831 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck_tile/core.hpp" +#include "ck_tile/ops/fused_moe/pipeline/fused_moegemm_traits.hpp" +#include "ck_tile/ops/flatmm.hpp" +#include "ck_tile/ops/gemm/warp/warp_gemm.hpp" +#include "ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp" + +namespace ck_tile { + +struct FusedMoeGemmPipelineFlatmmPolicy +{ + CK_TILE_HOST_DEVICE static constexpr index_t GetAsyncCopyDwords() + { + // TODO: always 1 dword + return 1; + } + + template + CK_TILE_HOST_DEVICE static constexpr auto GetAlignment_A() + { + // using async + constexpr index_t copy_bytes = 4 * GetAsyncCopyDwords(); + constexpr index_t data_bytes = sizeof(typename Problem::ADataType); + static_assert(copy_bytes % data_bytes == 0); + return copy_bytes / data_bytes; + } + + template + CK_TILE_HOST_DEVICE static constexpr auto GetAlignment_G() + { + constexpr index_t copy_bytes = [&]() { return 16; }(); + constexpr index_t data_bytes = sizeof(typename Problem::GDataType); + static_assert(copy_bytes % data_bytes == 0); + return copy_bytes / data_bytes; + } + + template + CK_TILE_HOST_DEVICE static constexpr auto GetAlignment_D() + { + constexpr index_t copy_bytes = [&]() { return 
16; }(); + constexpr index_t data_bytes = sizeof(typename Problem::DDataType); + static_assert(copy_bytes % data_bytes == 0); + return copy_bytes / data_bytes; + } + + template + CK_TILE_HOST_DEVICE static constexpr auto GetAlignment_O() + { + if constexpr(Problem::Traits::OAtomic == 1) + { + // pack fp16/bf16 atomic + static_assert(sizeof(typename Problem::ODataType) == 2); + return 2; + } + else if constexpr(Problem::Traits::OAtomic == 2) + { + // fp32 atomic + return 1; + } + else + { + return 16 / sizeof(typename Problem::ODataType); + } + } + + template + CK_TILE_HOST_DEVICE static constexpr auto GetSmemKPack() + { + // TODO: this is for 3d layout + return 16 / sizeof(remove_cvref_t); + } + + template + CK_TILE_HOST_DEVICE static constexpr auto GetSmemKPack_A() + { + return GetSmemKPack(); + } + + // used for bridge LDS shuffle + template + CK_TILE_HOST_DEVICE static constexpr auto GetSmemKPack_Y() + { + // TODO: this should match mfma layout + return 16 / sizeof(typename Problem::YDataType); + } + + template + CK_TILE_HOST_DEVICE static constexpr ck_tile::index_t GetSmemSize_A() + { + constexpr auto a_sld_desc = MakeLdsLoadDesc_A(); + constexpr auto a_sst_desc = MakeLdsStoreDesc_A(); + static_assert(a_sld_desc.get_element_space_size() == a_sst_desc.get_element_space_size()); + return a_sld_desc.get_element_space_size(); + } + + template + CK_TILE_HOST_DEVICE static constexpr ck_tile::index_t GetSmemSize_Bridge() + { + constexpr auto bridge_sld_desc = MakeBridgeLdsLoadDesc(); + constexpr auto bridge_sst_desc = MakeBridgeLdsStoreDesc(); + static_assert(bridge_sld_desc.get_element_space_size() == + bridge_sst_desc.get_element_space_size()); + return bridge_sld_desc.get_element_space_size(); + } + + template + CK_TILE_HOST_DEVICE static constexpr ck_tile::index_t GetSmemSize() + { + constexpr index_t a_lds = GetSmemSize_A(); + constexpr index_t bridge_lds = GetSmemSize_Bridge(); + return max(a_lds, bridge_lds); + } + + template + CK_TILE_HOST_DEVICE static 
constexpr auto MakeGlobalTileDistribution_SimpleMxK() + { + constexpr index_t K_vec = Alignment; + constexpr index_t K_rem = KPerBlock / K_vec; + + if constexpr(get_warp_size() < K_rem) + { + static_assert(K_rem % get_warp_size() == 0); + constexpr index_t K_lan = get_warp_size(); // lane within same wave is along gemm-k + constexpr index_t K_wav = K_rem / get_warp_size(); + static_assert(K_wav <= NumWarps, "do not support thread has repeat along K yet"); + constexpr index_t M_wav = NumWarps / K_wav; + static_assert(MPerBlock % M_wav == 0, "this tile size is too small please check"); + constexpr index_t M_rep = MPerBlock / M_wav; + + return make_static_tile_distribution( + tile_distribution_encoding< + sequence<1>, + tuple, sequence>, + tuple, sequence<2>>, + tuple, sequence<1>>, + sequence<1, 2>, + sequence<0, 2>>{}); + } + else + { + constexpr index_t K_lan = K_rem; + constexpr index_t M_lan = get_warp_size() / K_lan; + constexpr index_t M_wav = NumWarps; + static_assert(MPerBlock % (M_lan * M_wav) == 0, + "this tile size is too small please check"); + constexpr index_t M_rep = MPerBlock / (M_lan * M_wav); + return make_static_tile_distribution( + tile_distribution_encoding< + sequence<1>, + tuple, sequence>, + tuple, sequence<1, 2>>, + tuple, sequence<2, 0>>, + sequence<1, 2>, + sequence<0, 1>>{}); + } + } + + // optimized version for async, not same as simple MXK dist(pay attention!!) 
+ template + CK_TILE_HOST_DEVICE static constexpr auto MakeGlobalTileDistribution_SimpleMxK_Async() + { + constexpr index_t K_vec = Alignment; + constexpr index_t K_rem = KPerBlock / K_vec; + + if constexpr(get_warp_size() <= K_rem) + { + static_assert(K_rem % get_warp_size() == 0); + constexpr index_t K_lan = get_warp_size(); // lane within same wave is along gemm-k + constexpr index_t K_wav = K_rem / get_warp_size(); + static_assert(K_wav <= NumWarps, "do not support thread has repeat along K yet"); + constexpr index_t M_wav = NumWarps / K_wav; + static_assert(MPerBlock % M_wav == 0, "this tile size is too small please check"); + constexpr index_t M_rep = MPerBlock / M_wav; + // NOTE: no swap, but hard to avoid LDS bank conflict + return make_static_tile_distribution( + tile_distribution_encoding< + sequence<1>, + tuple, sequence>, + tuple, sequence<2>>, + tuple, sequence<1>>, + sequence<1, 2>, + sequence<0, 2>>{}); + } + else + { + constexpr index_t K_lan = K_rem; + constexpr index_t M_lan = get_warp_size() / K_lan; + constexpr index_t M_wav = NumWarps; + static_assert(MPerBlock % (M_lan * M_wav) == 0, + "this tile size is too small please check"); + constexpr index_t M_rep = MPerBlock / (M_lan * M_wav); + // NOTE: swapped for LDS load bank conflict free + return make_static_tile_distribution( + tile_distribution_encoding< + sequence<1>, + // Note M_wave(num waves) is the fastest dim, different from simple 2d + // distribution + tuple, sequence>, + tuple, sequence<1, 2>>, + tuple, sequence<1, 0>>, + sequence<1, 2>, + sequence<0, 1>>{}); + } + } + + template + CK_TILE_HOST_DEVICE static constexpr auto MakeGlobalTileDistribution_Nr_Kr_W() + { + return make_static_tile_distribution( + tile_distribution_encoding, + tuple, + sequence, + sequence>, + tuple, sequence<3>>, + tuple, sequence<0>>, + sequence<1, 2, 3>, + sequence<0, 0, 1>>{}); + } + + template + CK_TILE_HOST_DEVICE static constexpr auto MakeGlobalTileDistribution_A() + { + constexpr index_t Block_M_ = 
Problem::BlockShape::Block_M0; + constexpr index_t Block_K_ = Problem::BlockShape::Block_K0; + constexpr index_t NumWarps_ = Problem::BlockShape::NumWarps; + constexpr index_t Alignment_ = GetAlignment_A(); + return MakeGlobalTileDistribution_SimpleMxK_Async(); + } + + template + CK_TILE_HOST_DEVICE static constexpr auto MakeGlobalTileDistribution_G() + { + constexpr auto PermuteEnum = Problem::Traits::PermuteEnum; + // constexpr index_t hidden_radio_0 = Problem::Traits::IsGateOnly ? 1 : 2; + using S_ = typename Problem::BlockShape; + if constexpr(PermuteEnum == FusedMoeGemmWeightPermuteEnum::b_nr_kr_waveflatten) + { + // number{}.rrr(); + // number{}.eee(); + return MakeGlobalTileDistribution_Nr_Kr_W()>(); + } + } + + template + CK_TILE_HOST_DEVICE static constexpr auto MakeGlobalTileDistribution_D() + { + constexpr auto PermuteEnum = Problem::Traits::PermuteEnum; + using S_ = typename Problem::BlockShape; + if constexpr(PermuteEnum == FusedMoeGemmWeightPermuteEnum::b_nr_kr_waveflatten) + { + return MakeGlobalTileDistribution_Nr_Kr_W()>(); + } + } + + template + CK_TILE_HOST_DEVICE static constexpr auto MakeGlobalTileDistribution_O() + { + using S_ = remove_cvref_t; + using WarpGemm = remove_cvref_t())>; + // using CDataType = typename WarpGemm::CDataType; + + constexpr auto c_block_outer_dstr_encoding = + tile_distribution_encoding, + tuple, + sequence>, + tuple>, + tuple>, + sequence<1, 2>, + sequence<0, 0>>{}; + + constexpr auto c_block_dstr_encode = detail::make_embed_tile_distribution_encoding( + c_block_outer_dstr_encoding, typename WarpGemm::CWarpDstrEncoding{}); + constexpr auto c_block_dstr = make_static_tile_distribution(c_block_dstr_encode); + return c_block_dstr; + } + + template + CK_TILE_HOST_DEVICE static constexpr auto MakeLdsStoreDesc_A() + { + // A async->LDS + constexpr index_t Block_M = Problem::BlockShape::Block_M0; + constexpr index_t Block_K = Problem::BlockShape::Block_K0; + // constexpr index_t BlockSize = Problem::BlockShape::BlockSize; + 
constexpr index_t warpSize = ck_tile::get_warp_size(); + constexpr index_t NumWarps = Problem::BlockShape::NumWarps; + + constexpr index_t KPack = GetSmemKPack_A(); // LDS + constexpr index_t KVector = GetAlignment_A(); // async copy 1 dword + constexpr index_t KPad = KPack; // pad between warps + + static_assert(Block_K % KVector == 0); + constexpr index_t LanesPerK = Block_K / KVector; // how many thread loading K + if constexpr(LanesPerK >= warpSize) + { + // need multiple waves to load K + static_assert(LanesPerK % warpSize == 0); + constexpr index_t wavesPerK = LanesPerK / warpSize; + if constexpr(wavesPerK > NumWarps) + { + // TODO: need multiple issues along K to load all data + } + else + { + constexpr index_t wavesPerM = NumWarps / wavesPerK; + constexpr index_t NumIssues = Block_M / wavesPerM; + constexpr auto lds_block_desc_0 = make_naive_tensor_descriptor( + make_tuple(number{}, // m0 + number{}, // m1 + number{}, // k0 + number{}, // k1 + number{}), // k2 + make_tuple(number{}, // m0 + number{}, // m1 + number{}, // k0 + number{}, // k1 + number<1>{}), // k2 + number{}, // lds store vector(actually no explicit store) + number<1>{}); + + constexpr auto lds_block_desc_issues_warps_lanes = transform_tensor_descriptor( + lds_block_desc_0, + make_tuple( + make_pass_through_transform(number{}), + make_merge_transform(make_tuple(number{}, number{})), + make_merge_transform(make_tuple(number{}, number{}))), + make_tuple(sequence<0>{}, sequence<1, 2>{}, sequence<3, 4>{}), + make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{})); + + return lds_block_desc_issues_warps_lanes; + } + } + else + { + // lanes within a wave load different M but same K + static_assert(warpSize % LanesPerK == 0); + constexpr index_t LaneGroups = warpSize / LanesPerK; // along m + constexpr index_t NumIssues = Block_M / (LaneGroups * NumWarps); + + constexpr auto lds_block_desc_0 = make_naive_tensor_descriptor( + make_tuple(number{}, // m0 + number{}, // m1 + number{}, // m2 + 
number{}, // k0 + number{}), // k1 + make_tuple(number{}, // m0 + number{}, // m1 + number{}, // m2 + number{}, // k0 + number<1>{}), // k1 + number{}, // lds store vector(actually no explicit store) + number<1>{}); + + constexpr auto lds_block_desc_issues_warps_lanes = transform_tensor_descriptor( + lds_block_desc_0, + make_tuple(make_pass_through_transform(number{}), + make_pass_through_transform(number{}), + make_merge_transform(make_tuple( + number{}, number{}, number{}))), + make_tuple(sequence<0>{}, sequence<2>{}, sequence<1, 3, 4>{}), + make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{})); + + return lds_block_desc_issues_warps_lanes; + } + } + + template + CK_TILE_HOST_DEVICE static constexpr auto MakeLdsLoadDesc_A() + { + // A async->LDS + // Note that, this descriptor is only to construct the layout inside LDS + // in real Gemm pipeline, ds_read may not follow this pattern + // (may follow that in tile_distribution) + // below code is almost the same as SmemStore dist, with difference: + // 1). modify the GuaranteedLastDimensionVectorLength of naive tensor desc + // 2). 
return descriptor is in MxK 2d layout + constexpr index_t Block_M = Problem::BlockShape::Block_M0; + constexpr index_t Block_K = Problem::BlockShape::Block_K0; + // constexpr index_t BlockSize = Problem::BlockShape::BlockSize; + constexpr index_t warpSize = ck_tile::get_warp_size(); + constexpr index_t NumWarps = Problem::BlockShape::NumWarps; + + constexpr index_t KPack = GetSmemKPack_A(); // LDS + constexpr index_t KVector = GetAlignment_A(); // async copy 1 dword + constexpr index_t KPad = KPack; // pad between warps + + static_assert(Block_K % KVector == 0); + constexpr index_t LanesPerK = Block_K / KVector; // how many thread loading K + if constexpr(LanesPerK >= warpSize) + { + // need multiple waves to load K + static_assert(LanesPerK % warpSize == 0); + constexpr index_t wavesPerK = LanesPerK / warpSize; + if constexpr(wavesPerK >= NumWarps) + { + // TODO: need multiple issues along K to load all data + } + else + { + constexpr index_t wavesPerM = NumWarps / wavesPerK; + constexpr index_t NumIssues = Block_M / wavesPerM; + constexpr auto lds_block_desc_0 = make_naive_tensor_descriptor( + make_tuple(number{}, // m0 + number{}, // m1 + number{}, // k0 + number{}, // k1 + number{}), // k2 + make_tuple(number{}, // m0 + number{}, // m1 + number{}, // k0 + number{}, // k1 + number<1>{}), // k2 + number{}, // lds load vector + number<1>{}); + + constexpr auto lds_desc_m_k = transform_tensor_descriptor( + lds_block_desc_0, + make_tuple( + make_merge_transform(make_tuple(number{}, number{})), + make_merge_transform(make_tuple( + number{}, number{}, number{}))), + make_tuple(sequence<0, 1>{}, sequence<2, 3, 4>{}), + make_tuple(sequence<0>{}, sequence<1>{})); + + return lds_desc_m_k; + } + } + else + { + // lanes within a wave load different M but same K + static_assert(warpSize % LanesPerK == 0); + constexpr index_t LaneGroups = warpSize / LanesPerK; // along m + constexpr index_t NumIssues = Block_M / (LaneGroups * NumWarps); + + constexpr auto lds_block_desc_0 = 
make_naive_tensor_descriptor( + make_tuple(number{}, // m0 + number{}, // m1 + number{}, // m2 + number{}, // k0 + number{}), // k1 + make_tuple(number{}, // m0 + number{}, // m1 + number{}, // m2 + number{}, // k0 + number<1>{}), // k1 + number{}, // lds load vector + number<1>{}); + + constexpr auto lds_desc_m_k = transform_tensor_descriptor( + lds_block_desc_0, + make_tuple( + make_merge_transform( + make_tuple(number{}, number{}, number{})), + make_merge_transform(make_tuple(number{}, number{}))), + make_tuple(sequence<0, 1, 2>{}, sequence<3, 4>{}), + make_tuple(sequence<0>{}, sequence<1>{})); + + return lds_desc_m_k; + } + } + + template + CK_TILE_HOST_DEVICE static constexpr auto MakeBridgeLdsLoadDesc() + { + constexpr index_t Block_M = Problem::BlockShape::Block_M0; + constexpr index_t Block_N = Problem::BlockShape::Block_N0; + + constexpr index_t KVector = GetSmemKPack_Y(); // 16 bytes worth of YDataType + constexpr index_t KPad = 0; // pad between warps + + constexpr auto desc = + make_naive_tensor_descriptor(make_tuple(number{}, number{}), + make_tuple(number{}, number<1>{}), + number{}, + number<1>{}); + return desc; + } + + template + CK_TILE_HOST_DEVICE static constexpr auto MakeBridgeLdsStoreDesc() + { + constexpr index_t Block_M = Problem::BlockShape::Block_M0; + constexpr index_t Block_N = Problem::BlockShape::Block_N0; + + constexpr index_t KVector = GetSmemKPack_Y(); // 16 bytes worth of YDataType + constexpr index_t KPad = 0; // KVector; // pad between warps + + constexpr auto desc = + make_naive_tensor_descriptor(make_tuple(number{}, number{}), + make_tuple(number{}, number<1>{}), + number{}, + number<1>{}); + return desc; + } + + template + CK_TILE_HOST_DEVICE static constexpr auto MakeBridgeLdsStoreForUKDesc() + { + constexpr index_t WarpPerBlock_N = Problem::BlockShape::WarpPerBlock_N0; + constexpr index_t Repeat_N = Problem::BlockShape::Repeat_N0; + constexpr index_t Repeat_M = Problem::BlockShape::Repeat_M0; + + constexpr index_t kAMLane = 16; + constexpr 
index_t kABKLane = 4; + constexpr index_t kABKPerLane = 4; + + constexpr index_t KPack = kABKPerLane; + + constexpr auto lds_block_desc_0 = make_naive_tensor_descriptor( + make_tuple(number{}, // m + number{}, // n + number{}, // n + number{}, // n + number{}, // m + number{}), // n + make_tuple(number{}, // m + number{}, // n + number{}, // n + number{}, // n + number{}, // m + number<1>{}), // n + number{}, // lds store vector(actually no explicit store) + number<1>{}); + + constexpr auto desc = transform_tensor_descriptor( + lds_block_desc_0, + make_tuple(make_merge_transform(make_tuple(number{}, number{})), + make_merge_transform(make_tuple(number{}, + number{}, + number{}, + number{}))), + make_tuple(sequence<0, 4>{}, sequence<1, 2, 3, 5>{}), + make_tuple(sequence<0>{}, sequence<1>{})); + + return desc; + } + + template + CK_TILE_HOST_DEVICE static constexpr auto GetWarpGemm0() + { + using S_ = typename Problem::BlockShape; + // A is vgpr, B is agpr. But since we transposed, so also need swap this + // TODO: this is ugly + constexpr auto wg_ctrl = WGAttrCtlEnum::Raw_avv; + // TODO: ugly + if constexpr(std::is_same_v && + std::is_same_v && + S_::Warp_M0 == 32 && S_::Warp_N0 == 32 && S_::Warp_K0 == 16) + { + return WarpGemmImpl, + 2>>{}; + } + else if constexpr(std::is_same_v && + std::is_same_v && + S_::Warp_M0 == 32 && S_::Warp_N0 == 32 && S_::Warp_K0 == 32) + { + return WarpGemmImpl, + 2>>{}; + } + } + + template + CK_TILE_HOST_DEVICE static constexpr auto GetSequencer_0() + { + // this function return seq<...> used to identify gld/sld/valu... 
inside mfma sequence + // the purpose is to hide those instructions under mfma + // every value inside seq<...> is a mask, indicating a specific operation + using S_ = typename Problem::BlockShape; + constexpr index_t SLD_A = static_cast(FusedMoeGemmPipelineSequencerEnum::SLD_A); + constexpr index_t GLD_A = static_cast(FusedMoeGemmPipelineSequencerEnum::GLD_A); + constexpr index_t GLD_B = static_cast(FusedMoeGemmPipelineSequencerEnum::GLD_B); + if constexpr(std::is_same_v && + std::is_same_v && + S_::Warp_M0 == 32 && S_::Warp_N0 == 32 && S_::Warp_K0 == 16 && + S_::Block_M0 == 32 && S_::Block_N0 == 512 && S_::Block_K0 == 128 && + S_::Block_N1 == 128) + { + // Total 64 instructions, 32 buffer-load-dwordx4 gld_b, 8x buffer-load-dwordx1-async + // gld_a 8x ds_read_b128 sld_a total 64 slot :) + // clang-format off + constexpr auto seq_all = + // 0 1 2 3 4 5 6 7 + sequence{}; // 7 + return seq_all; + // clang-format on + } + else if constexpr(std::is_same_v && + std::is_same_v && + S_::Warp_M0 == 32 && S_::Warp_N0 == 32 && S_::Warp_K0 == 16 && + S_::Block_M0 == 32 && S_::Block_N0 == 256 && S_::Block_K0 == 128 && + S_::Block_N1 == 128) + { + // Total 32 instructions, 16 buffer-load-dwordx4 gld_b, 8x buffer-load-dwordx1-async + // gld_a 8x ds_read_b128 sld_a total 64 slot :) + // clang-format off + constexpr auto seq_all = + // 0 1 2 3 4 5 6 7 + sequence{}; // 3 + return seq_all; + // clang-format on + } + } + + template + CK_TILE_HOST_DEVICE static constexpr auto GetSequencer_1() + { + // this function return seq<...> used to identify gld/sld/valu... 
inside mfma sequence + // the purpose is to hide those instructions under mfma + // every value inside seq<...> is a mask, indicating a specific operation + using S_ = typename Problem::BlockShape; + constexpr index_t GLD_B = static_cast(FusedMoeGemmPipelineSequencerEnum::GLD_B); + constexpr index_t GST_O = static_cast(FusedMoeGemmPipelineSequencerEnum::GST_O); + if constexpr(std::is_same_v && + std::is_same_v && + S_::Warp_M1 == 32 && S_::Warp_N1 == 32 && S_::Warp_K1 == 16 && + S_::Block_M0 == 32 && S_::Block_N0 == 512 && S_::Block_K0 == 128 && + S_::Block_N1 == 128) + { + // Total 64 instructions, 32 buffer-load-dwordx4 gld_b, 8x buffer-load-dwordx1-async + // gld_a 8x ds_read_b128 sld_a total 64 slot :) + // clang-format off + constexpr auto seq_all = + // 0 1 2 3 4 5 6 7 + sequence{}; // 7 + return seq_all; + // clang-format on + } + else if constexpr(std::is_same_v && + std::is_same_v && + S_::Warp_M1 == 32 && S_::Warp_N1 == 32 && S_::Warp_K1 == 16 && + S_::Block_M0 == 32 && S_::Block_N0 == 256 && S_::Block_K0 == 128 && + S_::Block_N1 == 128) + { + // Total 64 instructions, 32 buffer-load-dwordx4 gld_b, 8x buffer-load-dwordx1-async + // gld_a 8x ds_read_b128 sld_a total 64 slot :) + // clang-format off + constexpr auto seq_all = + // 0 1 2 3 4 5 6 7 + sequence{}; // 3 + return seq_all; + // clang-format on + } + } + + template + CK_TILE_HOST_DEVICE static constexpr auto GetWarpGemm1() + { + using S_ = typename Problem::BlockShape; + constexpr auto wg_ctrl = WGAttrCtlEnum::Raw_avv; + // TODO: ugly. NOTE(review): checks Warp_*0 while GetSequencer_1 checks Warp_*1 - confirm + if constexpr(std::is_same_v && + std::is_same_v && + S_::Warp_M0 == 32 && S_::Warp_N0 == 32 && S_::Warp_K0 == 16) + { + return WarpGemmImpl, + 2>>{}; + } + else if constexpr(std::is_same_v && + std::is_same_v && + S_::Warp_M0 == 32 && S_::Warp_N0 == 32 && S_::Warp_K0 == 32) + { + return WarpGemmImpl, + 2>>{}; + } + } + + template + CK_TILE_HOST_DEVICE static constexpr auto MakeCBlockTile_Gemm0() + { + using S_ = remove_cvref_t; + using WarpGemm = remove_cvref_t())>; 
+ using CDataType = typename WarpGemm::CDataType; + + constexpr auto c_block_outer_dstr_encoding = + tile_distribution_encoding, + tuple, + sequence>, + tuple>, + tuple>, + sequence<1, 2>, + sequence<0, 0>>{}; + + constexpr auto c_block_dstr_encode = detail::make_embed_tile_distribution_encoding( + c_block_outer_dstr_encoding, typename WarpGemm::CWarpDstrEncoding{}); + constexpr auto c_block_dstr = make_static_tile_distribution(c_block_dstr_encode); + auto c_block_tensor = make_static_distributed_tensor(c_block_dstr); + return c_block_tensor; + } + + template + CK_TILE_HOST_DEVICE static constexpr auto MakeCBlockTile_Gemm1() + { + using S_ = remove_cvref_t; + using WarpGemm = remove_cvref_t())>; + using CDataType = typename WarpGemm::CDataType; + + constexpr auto c_block_outer_dstr_encoding = + tile_distribution_encoding, + tuple, + sequence>, + tuple>, + tuple>, + sequence<1, 2>, + sequence<0, 0>>{}; + + constexpr auto c_block_dstr_encode = detail::make_embed_tile_distribution_encoding( + c_block_outer_dstr_encoding, typename WarpGemm::CWarpDstrEncoding{}); + constexpr auto c_block_dstr = make_static_tile_distribution(c_block_dstr_encode); + auto c_block_tensor = make_static_distributed_tensor(c_block_dstr); + return c_block_tensor; + } + + // this is used as A matrix for 2nd gemm + template + CK_TILE_HOST_DEVICE static constexpr auto MakeYTileDistribution() + { + using S_ = remove_cvref_t; + using WarpGemm = remove_cvref_t())>; + + // TODO: all waves a along different N, but same M + constexpr auto y_outer_dstr_enc = + tile_distribution_encoding, + tuple, sequence>, + tuple>, + tuple>, + sequence<1, 2>, + sequence<0, 0>>{}; + + constexpr auto y_block_dstr_encode = detail::make_embed_tile_distribution_encoding( + y_outer_dstr_enc, typename WarpGemm::AWarpDstrEncoding{}); + constexpr auto y_block_dstr = make_static_tile_distribution(y_block_dstr_encode); + return y_block_dstr; + } + + template + CK_TILE_HOST_DEVICE static constexpr auto MakeYBlockTile() + { + 
constexpr auto y_block_dstr = MakeYTileDistribution(); + auto y_block_tensor = + make_static_distributed_tensor(y_block_dstr); + return y_block_tensor; + } + + template + CK_TILE_HOST_DEVICE static constexpr auto GetUK_0() + { + using S_ = typename Problem::BlockShape; + if constexpr(std::is_same_v && + std::is_same_v && + S_::Block_M0 == 32 && S_::Block_N0 == 512 && S_::Block_K0 == 128 && + S_::Warp_M0 == 16 && S_::Warp_N0 == 16 && S_::Warp_K0 == 32) + { + return Flatmm_32x512x128_1x4x1_16x16x32_BF16{}; + } + else if constexpr(std::is_same_v && + std::is_same_v && + S_::Block_M0 == 32 && S_::Block_N0 == 512 && S_::Block_K0 == 128 && + S_::Warp_M0 == 16 && S_::Warp_N0 == 16 && S_::Warp_K0 == 32) + { + return Flatmm_32x512x128_1x4x1_16x16x32_FP16{}; + } + } + + template + CK_TILE_HOST_DEVICE static constexpr auto GetUK_1() + { + using S_ = typename Problem::BlockShape; + if constexpr(std::is_same_v && + std::is_same_v && + std::is_same_v && + S_::Block_M1 == 32 && S_::Block_N1 == 128 && S_::Block_K1 == 512 && + S_::Warp_M0 == 16 && S_::Warp_N0 == 16 && S_::Warp_K0 == 32) + { + return FlatmmSn_32x128x512_1x4x1_16x16x32_BF16{}; + } + else if constexpr(std::is_same_v && + std::is_same_v && + std::is_same_v && + S_::Block_M1 == 32 && S_::Block_N1 == 128 && S_::Block_K1 == 512 && + S_::Warp_M0 == 16 && S_::Warp_N0 == 16 && S_::Warp_K0 == 32) + { + return FlatmmSn_32x128x512_1x4x1_16x16x32_FP16{}; + } + } +}; +} // namespace ck_tile diff --git a/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_uk.hpp b/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_uk.hpp new file mode 100644 index 000000000..a6f71eafa --- /dev/null +++ b/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_uk.hpp @@ -0,0 +1,354 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck_tile/core.hpp" +#include "ck_tile/ops/common/tensor_layout.hpp" +#include "ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_policy.hpp" + +namespace ck_tile { + +/* +This pipeline deal with a gemm(actually 2 gemm) with one very small(token), one very big(weight) +we need to design the pipeline such that all waves along gemm-N dim (gemm-m only 1 wave) + + <----- gemm-N ------> + +----+----+----+----+ + | w0 | w1 | w2 | w3 | gemm-m + +----+----+----+----+ +*/ +template +struct FusedMoeGemmPipeline_FlatmmUk +{ + using Problem = remove_cvref_t; + using Policy = remove_cvref_t; + + using BlockShape = typename Problem::BlockShape; // this is FusedMoeGemmShape + + using ADataType = typename Problem::ADataType; + using GDataType = typename Problem::GDataType; + using DDataType = typename Problem::DDataType; + using AccDataType = typename Problem::AccDataType; + using ODataType = typename Problem::ODataType; + using AScaleDataType = typename Problem::AScaleDataType; + using GScaleDataType = typename Problem::GScaleDataType; + using DScaleDataType = typename Problem::DScaleDataType; + using YSmoothScaleDataType = typename Problem::YSmoothScaleDataType; + using TopkWeightDataType = typename Problem::TopkWeightDataType; + using IndexDataType = typename Problem::IndexDataType; + using YDataType = typename Problem::YDataType; + + using Traits = typename Problem::Traits; + + static constexpr bool IsGateOnly = Traits::IsGateOnly; + static constexpr bool UseSmoothQuant = Traits::UseSmoothQuant; + static constexpr bool PadHiddenSize = Traits::PadHiddenSize; + static constexpr bool PadIntermediateSize = Traits::PadIntermediateSize; + + static constexpr index_t kAlignmentA = Policy::template GetAlignment_A(); + static constexpr index_t kAlignmentG = Policy::template GetAlignment_G(); + static constexpr index_t kAlignmentD = Policy::template GetAlignment_D(); + static constexpr index_t kAlignmentO = Policy::template GetAlignment_O(); + + 
static constexpr index_t SLD_A = static_cast(FusedMoeGemmPipelineSequencerEnum::SLD_A); + static constexpr index_t GLD_A = static_cast(FusedMoeGemmPipelineSequencerEnum::GLD_A); + static constexpr index_t GLD_B = static_cast(FusedMoeGemmPipelineSequencerEnum::GLD_B); + static constexpr index_t GST_O = static_cast(FusedMoeGemmPipelineSequencerEnum::GST_O); + + static constexpr index_t kBlockPerCu = []() { + if constexpr(Problem::kBlockPerCu != -1) + return Problem::kBlockPerCu; + else + { + // minimize occupancy + return 2; + } + }(); + + static constexpr const char* name = "flatmm_uk"; + + CK_TILE_HOST_DEVICE static constexpr ck_tile::index_t GetSmemSize() + { + constexpr index_t smem_0 = Policy::template GetUK_0().GetSmemSize(); + constexpr index_t smem_1 = Policy::template GetUK_1().GetSmemSize(); + constexpr index_t smem_bridge = + BlockShape::Block_M0 * BlockShape::Block_N0 * sizeof(YDataType); + return max(smem_0, max(smem_1, smem_bridge)); + } + + // this is the thread-offset along row/col + CK_TILE_HOST_DEVICE static auto GetACoord() + { + constexpr auto a_dist = Policy::template MakeGlobalTileDistribution_A(); + const auto a_coord = a_dist.calculate_index(); + return a_coord; + } + + // this is the thread-offset along row/col + CK_TILE_HOST_DEVICE static auto GetOCoord() + { + constexpr auto o_dist = Policy::template MakeOGlobalTileDistribution(); + const auto o_coord = o_dist.calculate_index(); + return o_coord; + } + + CK_TILE_DEVICE constexpr auto GetNumRowCoords_A() + { + constexpr index_t KLans = BlockShape::Block_K0 / kAlignmentA; + constexpr index_t MLans = BlockShape::BlockSize / KLans; + constexpr index_t MRepeat = BlockShape::Block_M0 / MLans; + + return MRepeat; + } + + // TODO: properlly support scatter/gather + CK_TILE_DEVICE auto GetRowCoords_A(index_t base_offset) + { + constexpr index_t KLans = BlockShape::Block_K0 / kAlignmentA; + constexpr index_t MLans = BlockShape::BlockSize / KLans; + constexpr index_t MRepeat = BlockShape::Block_M0 / 
MLans; + + auto base_coord = threadIdx.x / KLans + base_offset; + + array coords; + static_for<0, MRepeat, 1>{}([&](auto i) { coords.at(i) = base_coord + i * MLans; }); + + return coords; + } + + template + CK_TILE_DEVICE auto GetRowID(const ROW_COORDS coords, const IndexDataType* sorted_token_ids_ptr) + { + constexpr index_t n_size = coords.size(); + + array row_ids; + static_for<0, n_size, 1>{}([&](auto i) { + row_ids.at(i) = sorted_token_ids_ptr[coords[i]]; // base_coord + i * MLans; + }); + + return row_ids; + } + + template + CK_TILE_DEVICE auto GetWeightScale(const ROW_COORDS coords, + const TopkWeightDataType* sorted_weight_ptr) + { + constexpr index_t n_size = coords.size(); + + array w; + static_for<0, n_size, 1>{}([&](auto i) { + w.at(i) = sorted_weight_ptr[coords[i]]; // base_coord + i * MLans; + }); + + return w; + } + + // TODO: this row id is before shuffle atomic, need use acc distribution + CK_TILE_DEVICE auto GetRowCoords_O(index_t base_offset) + { + constexpr index_t MLanes = BlockShape::Warp_M1; + constexpr index_t Repeat_M = BlockShape::Repeat_M1; + + auto base_coord = threadIdx.x % MLanes + base_offset; + + array coords; + static_for<0, Repeat_M, 1>{}([&](auto i) { coords.at(i) = base_coord + i * MLanes; }); + + return coords; + } + + template + CK_TILE_DEVICE auto operator()(const Karg& kargs, + CK_TILE_LDS_ADDR void* smem, + index_t sorted_tile_id, + index_t intermediate_tile_id) + { + constexpr index_t hidden_radio_0 = IsGateOnly ? 
1 : 2; + ck_tile::index_t shared_intermediate_size_0 = kargs.intermediate_size; + ck_tile::index_t shared_intermediate_size_1 = kargs.intermediate_size / hidden_radio_0; + + index_t nr_0 = shared_intermediate_size_0 / BlockShape::Warp_N0; // divide N in W + index_t kr_0 = kargs.hidden_size / BlockShape::Warp_K0; // divide K in W + index_t nr_1 = kargs.hidden_size / BlockShape::Warp_N1; + index_t kr_1 = shared_intermediate_size_1 / BlockShape::Warp_K1; + + const IndexDataType expert_id = __builtin_amdgcn_readfirstlane( + reinterpret_cast(kargs.sorted_expert_ids_ptr)[sorted_tile_id]); + index_t expert_stride_0 = shared_intermediate_size_0 * kargs.hidden_size; + index_t expert_stride_1 = shared_intermediate_size_1 * kargs.hidden_size; + + // nr*kr*w + index_t interm_idx_nr0 = __builtin_amdgcn_readfirstlane( + intermediate_tile_id * + BlockShape::Block_Nr0); // intermediate_tile_id * Block_N / (N in W) + + index_t interm_idx_kr1 = __builtin_amdgcn_readfirstlane( + intermediate_tile_id * + BlockShape::Block_Kr1); // intermediate_tile_id * Block_N / (N in W) + + auto row_coords_a = GetRowCoords_A(sorted_tile_id * BlockShape::Block_M0); + auto row_ids_a = GetRowID( + row_coords_a, reinterpret_cast(kargs.sorted_token_ids_ptr)); + auto a_coords = generate_tuple( + [&](auto i) { + return row_ids_a[i] * kargs.stride_token + + threadIdx.x % (BlockShape::Block_K0 / kAlignmentA) * kAlignmentA; + }, + number{}); + auto a_res = + make_wave_buffer_resource(reinterpret_cast(kargs.a_ptr), + kargs.num_tokens * kargs.stride_token * sizeof(ADataType)); + + auto g_win = [&]() { + const GDataType* g_ptr = reinterpret_cast(kargs.g_ptr) + + static_cast(expert_id) * expert_stride_0 + + interm_idx_nr0 * kr_0 * BlockShape::Block_W0; + auto g_view_ = make_naive_tensor_view( + g_ptr, + make_tuple(nr_0, kr_0, number{}), + make_tuple(kr_0 * BlockShape::Block_W0, number{}, 1), + number{}, + number<1>{}); + + auto g_window_ = make_tile_window_linear_raw( + g_view_, + make_tuple(number{}, + number{}, 
+ number{}), + {0, 0, 0}, + Policy::template MakeGlobalTileDistribution_G(), + sequence<0, 1, 1>{}); + return g_window_; + }(); + + auto g_res = g_win.get_bottom_tensor_view().get_buffer_view().cached_buf_res_; + auto g_coords = generate_tuple([&](auto i) { return g_win.cached_coords_[i].get_offset(); }, + number{}); + + const auto d_win = [&]() { + const DDataType* d_ptr = reinterpret_cast(kargs.d_ptr) + + static_cast(expert_id) * expert_stride_1 + + interm_idx_kr1 * BlockShape::Block_W1; + // note interm_idx_nr0 is along the gemm-k dim of 2nd gemm + + const auto d_view_ = make_naive_tensor_view( + d_ptr, + make_tuple(nr_1, kr_1, BlockShape::Block_W1), + make_tuple(kr_1 * BlockShape::Block_W1, BlockShape::Block_W1, 1), + number{}, + number<1>{}); + + const auto d_window_ = make_tile_window_linear_raw( + d_view_, + make_tuple(number{}, + number{}, + number{}), + {0, 0, 0}, + Policy::template MakeGlobalTileDistribution_D(), + sequence<0, 1, 1>{}); + return d_window_; + }(); + auto d_res = d_win.get_bottom_tensor_view().get_buffer_view().cached_buf_res_; + + // TODO: load D order is N0.K0...127, N64.K0...127, N0.K128...255, N64.K128...255 + // block-k=512, block-n=128 + // wg |<----- W_ ----->| + // Nr(2)*Nw(4)* Kr *Kr0(4)*Kr1(4) * [Kl(4)*Nl(16)*Kv(8)]->one issue + // y p y y p p y + // 1 2 0(imm) + auto d_coords = [&]() { + constexpr index_t Nr_ = 2; + constexpr index_t Nw_ = 4; + constexpr index_t Kr0_ = 4; + constexpr index_t Kr1_ = 4; + constexpr index_t Kl_ = 4; + constexpr index_t Nl_ = 16; + constexpr index_t Kv_ = 8; + constexpr index_t W_ = Kl_ * Nl_ * Kv_; + constexpr index_t num_offsets_ = Nr_ * Kr0_; + index_t base_os_ = (threadIdx.x % 64) * Kv_ + (threadIdx.x / 64) * + shared_intermediate_size_1 * + Nl_; // Kr0_ * Kr1_ * W_; + return generate_tuple( + [&](auto i) { + constexpr auto i_nr_ = number{}; + constexpr auto i_kr0_ = number{}; + + return i_nr_ * shared_intermediate_size_1 * Nw_ * Nl_ + i_kr0_ * Kr1_ * W_ + + base_os_; + }, + number{}); + }(); + + 
auto o_coords = generate_tuple( + [&](auto i) { + return row_ids_a[i] * kargs.stride_token + + threadIdx.x % (BlockShape::Block_N1 / kAlignmentO) * kAlignmentO; + }, + number{}); + + auto o_flags = + generate_tuple([&](auto i) { return cmp_lt_to_exec(row_ids_a[i], kargs.num_tokens); }, + number{}); + + auto bridge_sst_win = [&]() { + constexpr auto desc_ = Policy::template MakeBridgeLdsStoreForUKDesc(); + constexpr auto dist_ = Policy::template GetUK_0().MakeCBlockDist(); + return make_tile_window_linear(make_tensor_view( + reinterpret_cast(smem), desc_), + desc_.get_lengths(), + {0, 0}, + dist_); + }(); + auto o_res = + make_wave_buffer_resource(reinterpret_cast(kargs.o_ptr), + kargs.num_tokens * kargs.stride_token * sizeof(ODataType)); + + auto row_coords_o = GetRowCoords_O(sorted_tile_id * BlockShape::Block_M0); + auto w_scale = GetWeightScale( + row_coords_o, reinterpret_cast(kargs.sorted_weight_ptr)); + + auto uk_0 = Policy::template GetUK_0(); + auto acc_0 = uk_0(a_res, + a_coords, + g_res, + g_coords, + smem, + kargs.hidden_size, + BlockShape::Block_K0, // tile offset for B matrix each unroll + BlockShape::Block_Kr0 * + BlockShape::Block_W0); // tile offset for B matrix each unroll + + sweep_tile( + acc_0, + [&](auto idx0, auto idx1) { + fp32x2_t v_{acc_0(idx0), acc_0(idx1)}; + typename Problem::GateActivation{}(v_, v_); + acc_0(idx0) = v_.x; + acc_0(idx1) = v_.y; + }, + sequence<1, 2>{}); + + auto y_pre = cast_tile(acc_0); + + block_sync_lds(); + + store_tile(bridge_sst_win, y_pre); + block_sync_lds(); + + auto uk_1 = Policy::template GetUK_1(); + uk_1(d_res, + d_coords, + o_res, + o_coords, + o_flags, + smem, + kargs.hidden_size, // total n number + w_scale, + BlockShape::Block_Nr1 * kr_1 * BlockShape::Block_W1, // along N + BlockShape::Block_N1); // along N + } +}; + +} // namespace ck_tile diff --git a/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_problem.hpp b/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_problem.hpp new 
file mode 100644 index 000000000..6089c2558 --- /dev/null +++ b/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_problem.hpp @@ -0,0 +1,46 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck_tile/core.hpp" + +namespace ck_tile { + +// TODO: alow 2 gemm have different type +template +struct FusedMoeGemmPipelineProblem +{ + using ADataType = remove_cvref_t; + using GDataType = remove_cvref_t; + using DDataType = remove_cvref_t; + using AccDataType = remove_cvref_t; + using ODataType = remove_cvref_t; + using AScaleDataType = remove_cvref_t; + using GScaleDataType = remove_cvref_t; + using DScaleDataType = remove_cvref_t; + using YSmoothScaleDataType = remove_cvref_t; + using TopkWeightDataType = remove_cvref_t; + using IndexDataType = remove_cvref_t; + + // the input for next gemm should have same time as + using YDataType = ADataType; + + using GateActivation = remove_cvref_t; + using BlockShape = remove_cvref_t; + using Traits = remove_cvref_t; +}; +} // namespace ck_tile diff --git a/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_traits.hpp b/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_traits.hpp new file mode 100644 index 000000000..d7127b098 --- /dev/null +++ b/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_traits.hpp @@ -0,0 +1,48 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck_tile/core.hpp" + +namespace ck_tile { + +enum class FusedMoeGemmWeightPermuteEnum +{ + // permute_b_n0_k0_n1_k1_n2_k2 = 0, // 0,1,4,2,5,3,6 + // permute_b_n0_n1_k0_k1_n2_k2 = 1, // 0,1,2,4,5,3,6 + no_permute = 0, + b_nr_kr_kw_nw_kv = 1, // 0,1,3,4,2,5 + b_nr_kr_waveflatten = b_nr_kr_kw_nw_kv, +}; + +template +struct FusedMoeGemmTraits +{ + // Gate+Up or Gate only + static constexpr bool IsGateOnly = IsGateOnly_; + static constexpr bool UseSmoothQuant = UseSmoothQuant_; + static constexpr index_t OAtomic = OAtomic_; + static constexpr FusedMoeGemmWeightPermuteEnum PermuteEnum = PermuteEnum_; + static constexpr bool PadHiddenSize = PadHiddenSize_; + static constexpr bool PadIntermediateSize = PadIntermediateSize_; +}; + +// Note: this need to be a bit mask +enum class FusedMoeGemmPipelineSequencerEnum +{ + SLD_A = 1 << 0, // shared load a + SLD_B = 1 << 1, + GLD_A = 1 << 2, // global load a + GLD_B = 1 << 3, + SST_A = 1 << 4, // shared store a + SST_B = 1 << 5, + GST_O = 1 << 6, // global store out +}; +} // namespace ck_tile diff --git a/include/ck_tile/ops/gemm/warp/warp_gemm.hpp b/include/ck_tile/ops/gemm/warp/warp_gemm.hpp index 7ca4a697a..89ea82c5b 100644 --- a/include/ck_tile/ops/gemm/warp/warp_gemm.hpp +++ b/include/ck_tile/ops/gemm/warp/warp_gemm.hpp @@ -10,114 +10,134 @@ namespace ck_tile { // fp16 -using WarpGemmMfmaF16F16F32M32N32K8 = - WarpGemmImpl>; -using WarpGemmMfmaF16F16F32M16N16K16 = - WarpGemmImpl>; +using WarpGemmMfmaF16F16F32M32N32K8 = WarpGemmImpl< + WarpGemmAtrributeMfma>>; -using WarpGemmMfmaF16F16F32M32N32K16 = - WarpGemmImpl>; +using WarpGemmMfmaF16F16F32M16N16K16 = WarpGemmImpl< + WarpGemmAtrributeMfma>>; -using WarpGemmMfmaF16F16F32M16N16K32 = - WarpGemmImpl>; +using WarpGemmMfmaF16F16F32M32N32K16 = WarpGemmImpl, + 2>>; -using WarpGemmMfmaF16F16F32M32N32K8SwizzleA = WarpGemmImpl< - WarpGemmAtrributeMfmaIterateK_SwizzleA>; +using WarpGemmMfmaF16F16F32M16N16K32 = WarpGemmImpl, + 2>>; -using 
WarpGemmMfmaF16F16F32M32N32K16SwizzleA = WarpGemmImpl< - WarpGemmAtrributeMfmaIterateK_SwizzleA>; +using WarpGemmMfmaF16F16F32M32N32K8SwizzleA = WarpGemmImpl, + 1>>; -using WarpGemmMfmaF16F16F32M32N32K8TransposedCDistribution = WarpGemmImpl< - WarpGemmAtrributeMfmaTransposedCDistribution>; +using WarpGemmMfmaF16F16F32M32N32K16SwizzleA = WarpGemmImpl, + 2>>; -using WarpGemmMfmaF16F16F32M16N16K16TransposedCDistribution = WarpGemmImpl< - WarpGemmAtrributeMfmaTransposedCDistribution>; +using WarpGemmMfmaF16F16F32M32N32K8TransposedCDistribution = + WarpGemmImpl>>; + +using WarpGemmMfmaF16F16F32M16N16K16TransposedCDistribution = + WarpGemmImpl>>; using WarpGemmMfmaF16F16F32M32N32K16TransposedCDistribution = WarpGemmImpl, 2>>; using WarpGemmMfmaF16F16F32M16N16K32TransposedCDistribution = WarpGemmImpl, 2>>; using WarpGemmMfmaF16F16F32M32N32K16SwizzleBTransposedCDistribution = WarpGemmImpl, 2>>; // bf16 -using WarpGemmMfmaBf16Bf16F32M32N32K8 = - WarpGemmImpl>; -using WarpGemmMfmaBf16Bf16F32M16N16K16 = - WarpGemmImpl>; +using WarpGemmMfmaBf16Bf16F32M32N32K8 = WarpGemmImpl< + WarpGemmAtrributeMfma>>; + +using WarpGemmMfmaBf16Bf16F32M16N16K16 = WarpGemmImpl< + WarpGemmAtrributeMfma>>; -using WarpGemmMfmaBf16Bf16F32M32N32K16 = - WarpGemmImpl>; +using WarpGemmMfmaBf16Bf16F32M32N32K16 = WarpGemmImpl, + 2>>; -using WarpGemmMfmaBf16Bf16F32M16N16K32 = - WarpGemmImpl>; +using WarpGemmMfmaBf16Bf16F32M16N16K32 = WarpGemmImpl, + 2>>; -using WarpGemmMfmaBf16Bf16F32M32N32K8SwizzleA = WarpGemmImpl< - WarpGemmAtrributeMfmaIterateK_SwizzleA>; +using WarpGemmMfmaBf16Bf16F32M32N32K8SwizzleA = WarpGemmImpl, + 1>>; -using WarpGemmMfmaBf16Bf16F32M32N32K16SwizzleA = WarpGemmImpl< - WarpGemmAtrributeMfmaIterateK_SwizzleA>; +using WarpGemmMfmaBf16Bf16F32M32N32K16SwizzleA = + WarpGemmImpl, + 2>>; -using WarpGemmMfmaBf16Bf16F32M32N32K8TransposedCDistribution = WarpGemmImpl< - WarpGemmAtrributeMfmaTransposedCDistribution>; +using WarpGemmMfmaBf16Bf16F32M32N32K8TransposedCDistribution = + 
WarpGemmImpl>>; -using WarpGemmMfmaBf16Bf16F32M16N16K16TransposedCDistribution = WarpGemmImpl< - WarpGemmAtrributeMfmaTransposedCDistribution>; +using WarpGemmMfmaBf16Bf16F32M16N16K16TransposedCDistribution = + WarpGemmImpl>>; using WarpGemmMfmaBf16Bf16F32M32N32K16TransposedCDistribution = WarpGemmImpl, 2>>; using WarpGemmMfmaBf16Bf16F32M16N16K32TransposedCDistribution = WarpGemmImpl, 2>>; using WarpGemmMfmaBf16Bf16F32M32N32K16SwizzleBTransposedCDistribution = WarpGemmImpl, 2>>; // fp8 -using WarpGemmMfma_f32_32x32x16_fp8_fp8 = - WarpGemmImpl>; -using WarpGemmMfma_f32_32x32x16_fp8_bf8 = - WarpGemmImpl>; +using WarpGemmMfma_f32_32x32x16_fp8_fp8 = WarpGemmImpl< + WarpGemmAtrributeMfma>>; + +using WarpGemmMfma_f32_32x32x16_fp8_bf8 = WarpGemmImpl< + WarpGemmAtrributeMfma>>; -using WarpGemmMfma_f32_32x32x16_bf8_fp8 = - WarpGemmImpl>; +using WarpGemmMfma_f32_32x32x16_bf8_fp8 = WarpGemmImpl< + WarpGemmAtrributeMfma>>; -using WarpGemmMfma_f32_32x32x16_bf8_bf8 = - WarpGemmImpl>; +using WarpGemmMfma_f32_32x32x16_bf8_bf8 = WarpGemmImpl< + WarpGemmAtrributeMfma>>; -using WarpGemmMfma_f32_32x32x16_fp8_fp8_CTransposed = WarpGemmImpl< - WarpGemmAtrributeMfmaTransposedCDistribution>; +using WarpGemmMfma_f32_32x32x16_fp8_fp8_CTransposed = + WarpGemmImpl>>; -using WarpGemmMfma_f32_32x32x16_fp8_bf8_CTransposed = WarpGemmImpl< - WarpGemmAtrributeMfmaTransposedCDistribution>; +using WarpGemmMfma_f32_32x32x16_fp8_bf8_CTransposed = + WarpGemmImpl>>; -using WarpGemmMfma_f32_32x32x16_bf8_fp8_CTransposed = WarpGemmImpl< - WarpGemmAtrributeMfmaTransposedCDistribution>; +using WarpGemmMfma_f32_32x32x16_bf8_fp8_CTransposed = + WarpGemmImpl>>; -using WarpGemmMfma_f32_32x32x16_bf8_bf8_CTransposed = WarpGemmImpl< - WarpGemmAtrributeMfmaTransposedCDistribution>; +using WarpGemmMfma_f32_32x32x16_bf8_bf8_CTransposed = + WarpGemmImpl>>; template using WarpGemmMfmaFp8Fp8F32M32N32K16SwizzleBTransposedCDistribution = WarpGemmImpl, + WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base, 2, swizzle_factor>>; 
diff --git a/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma.hpp b/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma.hpp index d80e5198e..0a8d2dfbe 100644 --- a/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma.hpp +++ b/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma.hpp @@ -25,6 +25,8 @@ struct WarpGemmAtrributeMfma static constexpr index_t kN = Impl::kN; static constexpr index_t kK = Impl::kK; + CK_TILE_HOST_DEVICE static constexpr auto get_num_of_access() { return 1; } + using AWarpDstrEncoding = tile_distribution_encoding< sequence<>, tuple, sequence>, @@ -51,10 +53,13 @@ struct WarpGemmAtrributeMfma sequence<0, 2>>; // c_vec += a_vec * b_vec - CK_TILE_DEVICE void - operator()(CVecType& c_vec, const AVecType& a_vec, const BVecType& b_vec) const + template + CK_TILE_DEVICE void operator()(CVecType& c_vec, + const AVecType& a_vec, + const BVecType& b_vec, + bool_constant = {}) const { - Impl{}(c_vec, a_vec, b_vec); + Impl{}(c_vec, a_vec, b_vec, bool_constant{}); } // c_vec = a_vec * b_vec @@ -85,6 +90,8 @@ struct WarpGemmAtrributeMfmaIterateK static constexpr index_t kN = Impl::kN; static constexpr index_t kK = Impl::kK * kKIter; + CK_TILE_HOST_DEVICE static constexpr auto get_num_of_access() { return kKIter; } + using AWarpDstrEncoding = tile_distribution_encoding< sequence<>, tuple, sequence>, @@ -111,8 +118,11 @@ struct WarpGemmAtrributeMfmaIterateK sequence<0, 2>>; // c_vec += a_vec * b_vec - CK_TILE_DEVICE void - operator()(CVecType& c_vec, const AVecType& a_vec, const BVecType& b_vec) const + template + CK_TILE_DEVICE void operator()(CVecType& c_vec, + const AVecType& a_vec, + const BVecType& b_vec, + bool_constant = {}) const { using buf_a = thread_buffer; using buf_b = thread_buffer; @@ -122,10 +132,33 @@ struct WarpGemmAtrributeMfmaIterateK reinterpret_cast(a_vec) .template get_as()[iKIter], reinterpret_cast(b_vec) - .template get_as()[iKIter]); + .template get_as()[iKIter], + bool_constant{}); }); } + template + CK_TILE_DEVICE 
void operator()(CVecType& c_vec, + const AVecType& a_vec, + const BVecType& b_vec, + number, + bool_constant = {}) const + { + using buf_a = thread_buffer; + using buf_b = thread_buffer; + + static_assert(iKIter < kKIter); + + // static_for<0, kKIter, 1>{}([&](auto iKIter) { + Impl{}(c_vec, + reinterpret_cast(a_vec) + .template get_as()[iKIter], + reinterpret_cast(b_vec) + .template get_as()[iKIter], + bool_constant{}); + //}); + } + // c_vec = a_vec * b_vec CK_TILE_DEVICE CVecType operator()(const AVecType& a_vec, const BVecType& b_vec) const { @@ -168,6 +201,8 @@ struct WarpGemmAtrributeMfmaTransposedCDistribution static constexpr index_t kN = Impl::kM; static constexpr index_t kK = Impl::kK; + CK_TILE_HOST_DEVICE static constexpr auto get_num_of_access() { return 1; } + using AWarpDstrEncoding = tile_distribution_encoding< sequence<>, tuple, sequence>, @@ -194,11 +229,14 @@ struct WarpGemmAtrributeMfmaTransposedCDistribution sequence<0, 2>>; // c_vec += a_vec * b_vec - CK_TILE_DEVICE void - operator()(CVecType& c_vec, const AVecType& a_vec, const BVecType& b_vec) const + template + CK_TILE_DEVICE void operator()(CVecType& c_vec, + const AVecType& a_vec, + const BVecType& b_vec, + bool_constant = {}) const { // swap A and B - Impl{}(c_vec, b_vec, a_vec); + Impl{}(c_vec, b_vec, a_vec, bool_constant{}); } // c_vec = a_vec * b_vec @@ -226,6 +264,8 @@ struct WarpGemmAtrributeMfmaTransposedCDistribution_SwizzleB static constexpr index_t kN = Impl::kM; static constexpr index_t kK = Impl::kK; + CK_TILE_HOST_DEVICE static constexpr auto get_num_of_access() { return 1; } + using AWarpDstrEncoding = tile_distribution_encoding< sequence<>, tuple, sequence>, @@ -255,12 +295,15 @@ struct WarpGemmAtrributeMfmaTransposedCDistribution_SwizzleB sequence<2, 2>, sequence<0, 2>>; + template // c_vec += a_vec * b_vec - CK_TILE_DEVICE void - operator()(CVecType& c_vec, const AVecType& a_vec, const BVecType& b_vec) const + CK_TILE_DEVICE void operator()(CVecType& c_vec, + const 
AVecType& a_vec, + const BVecType& b_vec, + bool_constant = {}) const { // swap A and B - Impl{}(c_vec, b_vec, a_vec); + Impl{}(c_vec, b_vec, a_vec, bool_constant{}); } // c_vec = a_vec * b_vec @@ -291,6 +334,8 @@ struct WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution static constexpr index_t kN = Impl::kM; static constexpr index_t kK = Impl::kK * kKIter; + CK_TILE_HOST_DEVICE static constexpr auto get_num_of_access() { return kKIter; } + using AWarpDstrEncoding = tile_distribution_encoding< sequence<>, tuple, sequence>, @@ -316,9 +361,12 @@ struct WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution sequence<2, 2>, sequence<0, 2>>; + template // c_vec += a_vec * b_vec - CK_TILE_DEVICE void - operator()(CVecType& c_vec, const AVecType& a_vec, const BVecType& b_vec) const + CK_TILE_DEVICE void operator()(CVecType& c_vec, + const AVecType& a_vec, + const BVecType& b_vec, + bool_constant = {}) const { using buf_a = thread_buffer; using buf_b = thread_buffer; @@ -328,10 +376,34 @@ struct WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution reinterpret_cast(b_vec) .template get_as()[iKIter], reinterpret_cast(a_vec) - .template get_as()[iKIter]); + .template get_as()[iKIter], + bool_constant{}); }); } + template + // c_vec += a_vec * b_vec + CK_TILE_DEVICE void operator()(CVecType& c_vec, + const AVecType& a_vec, + const BVecType& b_vec, + number, + bool_constant = {}) const + { + using buf_a = thread_buffer; + using buf_b = thread_buffer; + + static_assert(iKIter < kKIter); + // swap A and B, value and type + // static_for<0, kKIter, 1>{}([&](auto iKIter) { + Impl{}(c_vec, + reinterpret_cast(b_vec) + .template get_as()[iKIter], + reinterpret_cast(a_vec) + .template get_as()[iKIter], + bool_constant{}); + //}); + } + // c_vec = a_vec * b_vec CK_TILE_DEVICE CVecType operator()(const AVecType& a_vec, const BVecType& b_vec) const { @@ -377,6 +449,8 @@ struct WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution_SwizzleB static constexpr index_t kK = Impl::kK 
* kKIter; static constexpr index_t SFactor = SFactor_; // group how many CM1 together + CK_TILE_HOST_DEVICE static constexpr auto get_num_of_access() { return kKIter; } + using AWarpDstrEncoding = tile_distribution_encoding< sequence<>, tuple, sequence>, @@ -429,8 +503,11 @@ struct WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution_SwizzleB sequence<0, 2>>; #endif // c_vec += a_vec * b_vec - CK_TILE_DEVICE void - operator()(CVecType& c_vec, const AVecType& a_vec, const BVecType& b_vec) const + template + CK_TILE_DEVICE void operator()(CVecType& c_vec, + const AVecType& a_vec, + const BVecType& b_vec, + bool_constant = {}) const { using buf_a = thread_buffer; using buf_b = thread_buffer; @@ -440,10 +517,33 @@ struct WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution_SwizzleB reinterpret_cast(b_vec) .template get_as()[iKIter], reinterpret_cast(a_vec) - .template get_as()[iKIter]); + .template get_as()[iKIter], + bool_constant{}); }); } + template + CK_TILE_DEVICE void operator()(CVecType& c_vec, + const AVecType& a_vec, + const BVecType& b_vec, + number, + bool_constant = {}) const + { + using buf_a = thread_buffer; + using buf_b = thread_buffer; + + static_assert(iKIter < kKIter); + // swap A and B, value and type + // static_for<0, kKIter, 1>{}([&](auto iKIter) { + Impl{}(c_vec, + reinterpret_cast(b_vec) + .template get_as()[iKIter], + reinterpret_cast(a_vec) + .template get_as()[iKIter], + bool_constant{}); + //}); + } + // c_vec = a_vec * b_vec CK_TILE_DEVICE CVecType operator()(const AVecType& a_vec, const BVecType& b_vec) const { @@ -488,6 +588,8 @@ struct WarpGemmAtrributeMfmaIterateK_SwizzleA static constexpr index_t kK = Impl::kK * kKIter; static constexpr index_t SFactor = SFactor_; // group how many CM1 together + CK_TILE_HOST_DEVICE static constexpr auto get_num_of_access() { return kKIter; } + using AWarpDstrEncoding = tile_distribution_encoding< sequence<>, tuple>; // c_vec += a_vec * b_vec - CK_TILE_DEVICE void - operator()(CVecType& c_vec, 
const AVecType& a_vec, const BVecType& b_vec) const + template + CK_TILE_DEVICE void operator()(CVecType& c_vec, + const AVecType& a_vec, + const BVecType& b_vec, + bool_constant = {}) const { using buf_a = thread_buffer; using buf_b = thread_buffer; @@ -529,10 +634,33 @@ struct WarpGemmAtrributeMfmaIterateK_SwizzleA reinterpret_cast(a_vec) .template get_as()[iKIter], reinterpret_cast(b_vec) - .template get_as()[iKIter]); + .template get_as()[iKIter], + bool_constant{}); }); } + template + CK_TILE_DEVICE void operator()(CVecType& c_vec, + const AVecType& a_vec, + const BVecType& b_vec, + number, + bool_constant = {}) const + { + using buf_a = thread_buffer; + using buf_b = thread_buffer; + + static_assert(iKIter < kKIter); + + // static_for<0, kKIter, 1>{}([&](auto iKIter) { + Impl{}(c_vec, + reinterpret_cast(a_vec) + .template get_as()[iKIter], + reinterpret_cast(b_vec) + .template get_as()[iKIter], + bool_constant{}); + //}); + } + // c_vec = a_vec * b_vec CK_TILE_DEVICE CVecType operator()(const AVecType& a_vec, const BVecType& b_vec) const { diff --git a/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma_impl.hpp b/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma_impl.hpp index bb59a7298..0aba1f535 100644 --- a/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma_impl.hpp +++ b/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma_impl.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -7,12 +7,68 @@ namespace ck_tile { +// TODO: refactor warp-gemm +// currently there is a discrepency for vav/vva if we need transpose C/D +// e.g. 
if we want A:agpr, B:vgpr, we have to use vva in WGAttrEnum +// because we swap the A/B pointer in _impl code (but not known this info here) +enum class WGAttrCtlEnum +{ + Default_ = 0, + Raw_vvv = 1, // c-vgpr, a-vgpr, b-vgpr + Raw_vaa = 2, // c-vgpr, a-agpr, b-agpr + Raw_vav = 3, // c-vgpr, a-agpr, b-vgpr + Raw_vva = 4, // c-vgpr, a-vgpr, b-agpr + Raw_avv = 5, // c-agpr, a-vgpr, b-vgpr + // raw_a_a_a = 3, // c-agpr, a-agpr, b-agpr +}; + +#define DISPATCH_MFMA_(mfma_, dmod_, amod_, bmod_, cmod_) \ + if constexpr(post_nop_) \ + { \ + asm volatile(mfma_ " %0, %1, %2, %3 ; yyy\n" \ + "s_nop 3" \ + : dmod_(c_vec) \ + : amod_(a_vec), bmod_(b_vec), cmod_(c_vec) \ + :); \ + } \ + else \ + { \ + asm volatile(mfma_ " %0, %1, %2, %3\n" \ + : dmod_(c_vec) \ + : amod_(a_vec), bmod_(b_vec), cmod_(c_vec) \ + :); \ + } + +#define DISPATCH_MFMA_CTRL_(mfma_, ctrl_) \ + if constexpr(ctrl_ == WGAttrCtlEnum::Raw_vvv) \ + { \ + DISPATCH_MFMA_(mfma_, "+v", "v", "v", "v") \ + } \ + else if constexpr(ctrl_ == WGAttrCtlEnum::Raw_vaa) \ + { \ + DISPATCH_MFMA_(mfma_, "+v", "a", "a", "v") \ + } \ + else if constexpr(ctrl_ == WGAttrCtlEnum::Raw_vav) \ + { \ + DISPATCH_MFMA_(mfma_, "+v", "a", "v", "v") \ + } \ + else if constexpr(ctrl_ == WGAttrCtlEnum::Raw_vva) \ + { \ + DISPATCH_MFMA_(mfma_, "+v", "v", "a", "v") \ + } \ + else if constexpr(ctrl_ == WGAttrCtlEnum::Raw_avv) \ + { \ + DISPATCH_MFMA_(mfma_, "+a", "v", "v", "a") \ + } + // FP16 +template struct WarpGemmAttributeMfmaImplF16F16F32M32N32K8 { - using ADataType = fp16_t; - using BDataType = fp16_t; - using CDataType = float; + static constexpr WGAttrCtlEnum Ctrl = Ctrl_; + using ADataType = fp16_t; + using BDataType = fp16_t; + using CDataType = float; using AVecType = ext_vector_t; using BVecType = ext_vector_t; @@ -33,16 +89,23 @@ struct WarpGemmAttributeMfmaImplF16F16F32M32N32K8 static constexpr index_t kCM1PerLane = 4; // c_vec += a_vec * b_vec - CK_TILE_DEVICE void - operator()(CVecType& c_vec, const AVecType& a_vec, const 
BVecType& b_vec) const + template + CK_TILE_DEVICE void operator()(CVecType& c_vec, + const AVecType& a_vec, + const BVecType& b_vec, + bool_constant = {}) const { + DISPATCH_MFMA_CTRL_("v_mfma_f32_32x32x8f16", Ctrl) + else + { #if defined(__gfx9__) - c_vec = __builtin_amdgcn_mfma_f32_32x32x8f16(a_vec, b_vec, c_vec, 0, 0, 0); + c_vec = __builtin_amdgcn_mfma_f32_32x32x8f16(a_vec, b_vec, c_vec, 0, 0, 0); #else - ignore = c_vec; - ignore = a_vec; - ignore = b_vec; + ck_tile::ignore = c_vec; + ck_tile::ignore = a_vec; + ck_tile::ignore = b_vec; #endif + } } // c_vec = a_vec * b_vec @@ -52,18 +115,20 @@ struct WarpGemmAttributeMfmaImplF16F16F32M32N32K8 return bit_cast( __builtin_amdgcn_mfma_f32_32x32x8f16(a_vec, b_vec, fp32x16_t{0.f}, 0, 0, 0)); #else - ignore = a_vec; - ignore = b_vec; + ck_tile::ignore = a_vec; + ck_tile::ignore = b_vec; return CVecType{0.f}; #endif } }; +template struct WarpGemmAttributeMfmaImplF16F16F32M16N16K16 { - using ADataType = fp16_t; - using BDataType = fp16_t; - using CDataType = float; + static constexpr WGAttrCtlEnum Ctrl = Ctrl_; + using ADataType = fp16_t; + using BDataType = fp16_t; + using CDataType = float; using AVecType = ext_vector_t; using BVecType = ext_vector_t; @@ -84,16 +149,23 @@ struct WarpGemmAttributeMfmaImplF16F16F32M16N16K16 static constexpr index_t kCM1PerLane = 4; // c_vec += a_vec * b_vec - CK_TILE_DEVICE void - operator()(CVecType& c_vec, const AVecType& a_vec, const BVecType& b_vec) const + template + CK_TILE_DEVICE void operator()(CVecType& c_vec, + const AVecType& a_vec, + const BVecType& b_vec, + bool_constant = {}) const { + DISPATCH_MFMA_CTRL_("v_mfma_f32_16x16x16f16", Ctrl) + else + { #if defined(__gfx9__) - c_vec = __builtin_amdgcn_mfma_f32_16x16x16f16(a_vec, b_vec, c_vec, 0, 0, 0); + c_vec = __builtin_amdgcn_mfma_f32_16x16x16f16(a_vec, b_vec, c_vec, 0, 0, 0); #else - ignore = c_vec; - ignore = a_vec; - ignore = b_vec; + ck_tile::ignore = c_vec; + ck_tile::ignore = a_vec; + ck_tile::ignore = b_vec; #endif + 
} } // c_vec = a_vec * b_vec @@ -103,19 +175,21 @@ struct WarpGemmAttributeMfmaImplF16F16F32M16N16K16 return bit_cast( __builtin_amdgcn_mfma_f32_16x16x16f16(a_vec, b_vec, fp32x4_t{0.f}, 0, 0, 0)); #else - ignore = a_vec; - ignore = b_vec; + ck_tile::ignore = a_vec; + ck_tile::ignore = b_vec; return CVecType{0.f}; #endif } }; // Bf16 +template struct WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8 { - using ADataType = bf16_t; - using BDataType = bf16_t; - using CDataType = float; + static constexpr WGAttrCtlEnum Ctrl = Ctrl_; + using ADataType = bf16_t; + using BDataType = bf16_t; + using CDataType = float; using AVecType = ext_vector_t; using BVecType = ext_vector_t; @@ -136,28 +210,35 @@ struct WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8 static constexpr index_t kCM1PerLane = 4; // c_vec += a_vec * b_vec - CK_TILE_DEVICE void - operator()(CVecType& c_vec, const AVecType& a_vec, const BVecType& b_vec) const + template + CK_TILE_DEVICE void operator()(CVecType& c_vec, + const AVecType& a_vec, + const BVecType& b_vec, + bool_constant = {}) const { + DISPATCH_MFMA_CTRL_("v_mfma_f32_32x32x8bf16_1k", Ctrl) + else + { #if defined(__gfx90a__) || defined(__gfx94__) - c_vec = __builtin_amdgcn_mfma_f32_32x32x8bf16_1k(a_vec, b_vec, c_vec, 0, 0, 0); + c_vec = __builtin_amdgcn_mfma_f32_32x32x8bf16_1k(a_vec, b_vec, c_vec, 0, 0, 0); #elif defined(__gfx908__) - static_for<0, 2, 1>{}([&](auto k) { - c_vec = __builtin_amdgcn_mfma_f32_32x32x4bf16( - reinterpret_cast&>(a_vec) - .template get_as>()[number{}], - reinterpret_cast&>(b_vec) - .template get_as>()[number{}], - c_vec, - 0, - 0, - 0); - }); + static_for<0, 2, 1>{}([&](auto k) { + c_vec = __builtin_amdgcn_mfma_f32_32x32x4bf16( + reinterpret_cast&>(a_vec) + .template get_as>()[number{}], + reinterpret_cast&>(b_vec) + .template get_as>()[number{}], + c_vec, + 0, + 0, + 0); + }); #else - ignore = c_vec; - ignore = a_vec; - ignore = b_vec; + ck_tile::ignore = c_vec; + ck_tile::ignore = a_vec; + ck_tile::ignore = b_vec; #endif + } } 
// c_vec = a_vec * b_vec @@ -181,18 +262,20 @@ struct WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8 }); return c_vec; #else - ignore = a_vec; - ignore = b_vec; + ck_tile::ignore = a_vec; + ck_tile::ignore = b_vec; return CVecType{0.f}; #endif } }; +template struct WarpGemmAttributeMfmaImplBf16Bf16F32M16N16K16 { - using ADataType = bf16_t; - using BDataType = bf16_t; - using CDataType = float; + static constexpr WGAttrCtlEnum Ctrl = Ctrl_; + using ADataType = bf16_t; + using BDataType = bf16_t; + using CDataType = float; using AVecType = ext_vector_t; using BVecType = ext_vector_t; @@ -213,28 +296,34 @@ struct WarpGemmAttributeMfmaImplBf16Bf16F32M16N16K16 static constexpr index_t kCM1PerLane = 4; // c_vec += a_vec * b_vec - CK_TILE_DEVICE void - operator()(CVecType& c_vec, const AVecType& a_vec, const BVecType& b_vec) const + template + CK_TILE_DEVICE void operator()(CVecType& c_vec, + const AVecType& a_vec, + const BVecType& b_vec, + bool_constant = {}) const { + DISPATCH_MFMA_CTRL_("v_mfma_f32_16x16x16bf16_1k", Ctrl) + { #if defined(__gfx90a__) || defined(__gfx94__) - c_vec = __builtin_amdgcn_mfma_f32_16x16x16bf16_1k(a_vec, b_vec, c_vec, 0, 0, 0); + c_vec = __builtin_amdgcn_mfma_f32_16x16x16bf16_1k(a_vec, b_vec, c_vec, 0, 0, 0); #elif defined(__gfx908__) - static_for<0, 2, 1>{}([&](auto k) { - c_vec = __builtin_amdgcn_mfma_f32_16x16x8bf16( - reinterpret_cast&>(a_vec) - .template get_as>()[number{}], - reinterpret_cast&>(b_vec) - .template get_as>()[number{}], - c_vec, - 0, - 0, - 0); - }); + static_for<0, 2, 1>{}([&](auto k) { + c_vec = __builtin_amdgcn_mfma_f32_16x16x8bf16( + reinterpret_cast&>(a_vec) + .template get_as>()[number{}], + reinterpret_cast&>(b_vec) + .template get_as>()[number{}], + c_vec, + 0, + 0, + 0); + }); #else - ignore = c_vec; - ignore = a_vec; - ignore = b_vec; + ck_tile::ignore = c_vec; + ck_tile::ignore = a_vec; + ck_tile::ignore = b_vec; #endif + } } // c_vec = a_vec * b_vec @@ -258,20 +347,21 @@ struct 
WarpGemmAttributeMfmaImplBf16Bf16F32M16N16K16 }); return c_vec; #else - ignore = a_vec; - ignore = b_vec; + ck_tile::ignore = a_vec; + ck_tile::ignore = b_vec; return CVecType{0.f}; #endif } }; // FP8 -template +template struct WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base { - using ADataType = AType_; - using BDataType = BType_; - using CDataType = float; + static constexpr WGAttrCtlEnum Ctrl = Ctrl_; + using ADataType = AType_; + using BDataType = BType_; + using CDataType = float; using AVecType = ext_vector_t; using BVecType = ext_vector_t; @@ -292,38 +382,120 @@ struct WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base static constexpr index_t kCM1PerLane = 4; // c_vec += a_vec * b_vec - CK_TILE_DEVICE void - operator()(CVecType& c_vec, const AVecType& a_vec, const BVecType& b_vec) const + template + CK_TILE_DEVICE void operator()(CVecType& c_vec, + const AVecType& a_vec, + const BVecType& b_vec, + bool_constant = {}) const { + if constexpr(Ctrl == WGAttrCtlEnum::Raw_vvv) + { + if constexpr(std::is_same_v && std::is_same_v) + { + DISPATCH_MFMA_("mfma_f32_32x32x16_fp8_fp8", "+v", "v", "v", "v") + } + else if constexpr(std::is_same_v && std::is_same_v) + { + DISPATCH_MFMA_("mfma_f32_32x32x16_fp8_bf8", "+v", "v", "v", "v") + } + else if constexpr(std::is_same_v && std::is_same_v) + { + DISPATCH_MFMA_("mfma_f32_32x32x16_bf8_fp8", "+v", "v", "v", "v") + } + else if constexpr(std::is_same_v && std::is_same_v) + { + DISPATCH_MFMA_("mfma_f32_32x32x16_bf8_bf8", "+v", "v", "v", "v") + } + } + else if constexpr(Ctrl == WGAttrCtlEnum::Raw_vaa) + { + if constexpr(std::is_same_v && std::is_same_v) + { + DISPATCH_MFMA_("mfma_f32_32x32x16_fp8_fp8", "+v", "a", "a", "v") + } + else if constexpr(std::is_same_v && std::is_same_v) + { + DISPATCH_MFMA_("mfma_f32_32x32x16_fp8_bf8", "+v", "a", "a", "v") + } + else if constexpr(std::is_same_v && std::is_same_v) + { + DISPATCH_MFMA_("mfma_f32_32x32x16_bf8_fp8", "+v", "a", "a", "v") + } + else if constexpr(std::is_same_v && std::is_same_v) 
+ { + DISPATCH_MFMA_("mfma_f32_32x32x16_bf8_bf8", "+v", "a", "a", "v") + } + } + else if constexpr(Ctrl == WGAttrCtlEnum::Raw_vav) + { + if constexpr(std::is_same_v && std::is_same_v) + { + DISPATCH_MFMA_("mfma_f32_32x32x16_fp8_fp8", "+v", "a", "v", "v") + } + else if constexpr(std::is_same_v && std::is_same_v) + { + DISPATCH_MFMA_("mfma_f32_32x32x16_fp8_bf8", "+v", "a", "v", "v") + } + else if constexpr(std::is_same_v && std::is_same_v) + { + DISPATCH_MFMA_("mfma_f32_32x32x16_bf8_fp8", "+v", "a", "v", "v") + } + else if constexpr(std::is_same_v && std::is_same_v) + { + DISPATCH_MFMA_("mfma_f32_32x32x16_bf8_bf8", "+v", "a", "v", "v") + } + } + else if constexpr(Ctrl == WGAttrCtlEnum::Raw_vva) + { + if constexpr(std::is_same_v && std::is_same_v) + { + DISPATCH_MFMA_("mfma_f32_32x32x16_fp8_fp8", "+v", "v", "a", "v") + } + else if constexpr(std::is_same_v && std::is_same_v) + { + DISPATCH_MFMA_("mfma_f32_32x32x16_fp8_bf8", "+v", "v", "a", "v") + } + else if constexpr(std::is_same_v && std::is_same_v) + { + DISPATCH_MFMA_("mfma_f32_32x32x16_bf8_fp8", "+v", "v", "a", "v") + } + else if constexpr(std::is_same_v && std::is_same_v) + { + DISPATCH_MFMA_("mfma_f32_32x32x16_bf8_bf8", "+v", "v", "a", "v") + } + } + else + { #if defined(__gfx94__) - if constexpr(std::is_same_v && std::is_same_v) - c_vec = __builtin_amdgcn_mfma_f32_32x32x16_fp8_fp8( - bit_cast(a_vec), bit_cast(b_vec), c_vec, 0, 0, 0); - else if constexpr(std::is_same_v && std::is_same_v) - c_vec = __builtin_amdgcn_mfma_f32_32x32x16_fp8_bf8( - bit_cast(a_vec), bit_cast(b_vec), c_vec, 0, 0, 0); - else if constexpr(std::is_same_v && std::is_same_v) - c_vec = __builtin_amdgcn_mfma_f32_32x32x16_bf8_fp8( - bit_cast(a_vec), bit_cast(b_vec), c_vec, 0, 0, 0); - else if constexpr(std::is_same_v && std::is_same_v) - c_vec = __builtin_amdgcn_mfma_f32_32x32x16_bf8_bf8( - bit_cast(a_vec), bit_cast(b_vec), c_vec, 0, 0, 0); + if constexpr(std::is_same_v && std::is_same_v) + c_vec = __builtin_amdgcn_mfma_f32_32x32x16_fp8_fp8( + 
bit_cast(a_vec), bit_cast(b_vec), c_vec, 0, 0, 0); + else if constexpr(std::is_same_v && std::is_same_v) + c_vec = __builtin_amdgcn_mfma_f32_32x32x16_fp8_bf8( + bit_cast(a_vec), bit_cast(b_vec), c_vec, 0, 0, 0); + else if constexpr(std::is_same_v && std::is_same_v) + c_vec = __builtin_amdgcn_mfma_f32_32x32x16_bf8_fp8( + bit_cast(a_vec), bit_cast(b_vec), c_vec, 0, 0, 0); + else if constexpr(std::is_same_v && std::is_same_v) + c_vec = __builtin_amdgcn_mfma_f32_32x32x16_bf8_bf8( + bit_cast(a_vec), bit_cast(b_vec), c_vec, 0, 0, 0); #elif defined(__gfx908__) || defined(__gfx90a__) - static_for<0, 8, 1>{}([&](auto k) { - float a_f32 = - type_convert(reinterpret_cast&>(a_vec) - .template get_as()[number{}]); - float b_f32 = - type_convert(reinterpret_cast&>(b_vec) - .template get_as()[number{}]); - - c_vec = __builtin_amdgcn_mfma_f32_32x32x2f32(a_f32, b_f32, c_vec, 0, 0, 0); - }); + static_for<0, 8, 1>{}([&](auto k) { + float a_f32 = + type_convert(reinterpret_cast&>(a_vec) + .template get_as()[number{}]); + float b_f32 = + type_convert(reinterpret_cast&>(b_vec) + .template get_as()[number{}]); + + c_vec = __builtin_amdgcn_mfma_f32_32x32x2f32(a_f32, b_f32, c_vec, 0, 0, 0); + }); #else - ignore = c_vec; - ignore = a_vec; - ignore = b_vec; + ck_tile::ignore = c_vec; + ck_tile::ignore = a_vec; + ck_tile::ignore = b_vec; #endif + } } // c_vec = a_vec * b_vec @@ -356,20 +528,97 @@ struct WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base }); return c_vec; #else - ignore = a_vec; - ignore = b_vec; + ck_tile::ignore = a_vec; + ck_tile::ignore = b_vec; return CVecType{0.f}; #endif } }; +template using WarpGemmAttributeMfmaImpl_f32_32x32x16_fp8_fp8 = - WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base; + WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base; + +template using WarpGemmAttributeMfmaImpl_f32_32x32x16_fp8_bf8 = - WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base; + WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base; + +template using WarpGemmAttributeMfmaImpl_f32_32x32x16_bf8_fp8 = - 
WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base; + WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base; + +template using WarpGemmAttributeMfmaImpl_f32_32x32x16_bf8_bf8 = - WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base; + WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base; + +// int8 +template +struct WarpGemmAttributeMfmaImpl_i32_32x32x16_i8 +{ + static constexpr WGAttrCtlEnum Ctrl = Ctrl_; + using ADataType = int8_t; + using BDataType = int8_t; + using CDataType = int32_t; + + using AVecType = ext_vector_t; + using BVecType = ext_vector_t; + using CVecType = ext_vector_t; + + static constexpr index_t kM = 32; + static constexpr index_t kN = 32; + static constexpr index_t kK = 16; + + static constexpr index_t kAMLane = 32; + static constexpr index_t kBNLane = 32; + static constexpr index_t kABKLane = 2; + static constexpr index_t kABKPerLane = 8; + + static constexpr index_t kCMLane = 2; + static constexpr index_t kCNLane = 32; + static constexpr index_t kCM0PerLane = 4; + static constexpr index_t kCM1PerLane = 4; + + // c_vec += a_vec * b_vec + template + CK_TILE_DEVICE void operator()(CVecType& c_vec, + const AVecType& a_vec, + const BVecType& b_vec, + bool_constant = {}) const + { + DISPATCH_MFMA_CTRL_("v_mfma_i32_32x32x16_i8", Ctrl) + else + { +#if defined(__gfx94__) + c_vec = __builtin_amdgcn_mfma_i32_32x32x8i8( + bit_cast(a_vec), bit_cast(b_vec), c_vec, 0, 0, 0); +#elif defined(__gfx908__) || defined(__gfx90a__) + static_for<0, 8, 1>{}([&](auto k) { + float a_f32 = + type_convert(reinterpret_cast&>(a_vec) + .template get_as()[number{}]); + float b_f32 = + type_convert(reinterpret_cast&>(b_vec) + .template get_as()[number{}]); + + c_vec = __builtin_amdgcn_mfma_f32_32x32x2f32(a_f32, b_f32, c_vec, 0, 0, 0); + }); +#else + ck_tile::ignore = c_vec; + ck_tile::ignore = a_vec; + ck_tile::ignore = b_vec; +#endif + } + } + + // c_vec = a_vec * b_vec + CK_TILE_DEVICE CVecType operator()(const AVecType& a_vec, const BVecType& b_vec) const + { + CVecType c_vec{0}; + 
operator()(c_vec, a_vec, b_vec); + return c_vec; + } +}; + +#undef DISPATCH_MFMA_ } // namespace ck_tile diff --git a/include/ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp b/include/ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp index 4183d9cb9..99cd5d787 100644 --- a/include/ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp +++ b/include/ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -21,40 +21,40 @@ struct WarpGemmMfmaDispatcher; // clang-format off // fp16 -template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaF16F16F32M32N32K8; }; -template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaF16F16F32M32N32K8TransposedCDistribution; }; -template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaF16F16F32M32N32K16; }; -template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaF16F16F32M32N32K16TransposedCDistribution; }; -template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaF16F16F32M16N16K16; }; -template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaF16F16F32M16N16K16TransposedCDistribution; }; -template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaF16F16F32M16N16K32; }; -template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaF16F16F32M16N16K32TransposedCDistribution; }; +template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaF16F16F32M32N32K8; }; +template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaF16F16F32M32N32K8TransposedCDistribution; }; +template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaF16F16F32M32N32K16; }; +template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaF16F16F32M32N32K16TransposedCDistribution; }; +template<> struct WarpGemmMfmaDispatcher { using Type = 
WarpGemmMfmaF16F16F32M16N16K16; }; +template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaF16F16F32M16N16K16TransposedCDistribution; }; +template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaF16F16F32M16N16K32; }; +template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaF16F16F32M16N16K32TransposedCDistribution; }; -template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaF16F16F32M32N32K8SwizzleA; }; -template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaF16F16F32M32N32K16SwizzleA; }; +template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaF16F16F32M32N32K8SwizzleA; }; +template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaF16F16F32M32N32K16SwizzleA; }; // bf16 -template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaBf16Bf16F32M32N32K8; }; -template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaBf16Bf16F32M32N32K8TransposedCDistribution; }; -template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaBf16Bf16F32M32N32K16; }; -template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaBf16Bf16F32M32N32K16TransposedCDistribution; }; -template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaBf16Bf16F32M16N16K16; }; -template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaBf16Bf16F32M16N16K16TransposedCDistribution; }; -template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaBf16Bf16F32M16N16K32; }; -template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaBf16Bf16F32M16N16K32TransposedCDistribution; }; +template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaBf16Bf16F32M32N32K8; }; +template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaBf16Bf16F32M32N32K8TransposedCDistribution; }; +template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaBf16Bf16F32M32N32K16; }; +template<> struct WarpGemmMfmaDispatcher { using Type = 
WarpGemmMfmaBf16Bf16F32M32N32K16TransposedCDistribution; }; +template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaBf16Bf16F32M16N16K16; }; +template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaBf16Bf16F32M16N16K16TransposedCDistribution; }; +template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaBf16Bf16F32M16N16K32; }; +template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaBf16Bf16F32M16N16K32TransposedCDistribution; }; -template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaBf16Bf16F32M32N32K8SwizzleA; }; -template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaBf16Bf16F32M32N32K16SwizzleA; }; +template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaBf16Bf16F32M32N32K8SwizzleA; }; +template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaBf16Bf16F32M32N32K16SwizzleA; }; // fp8 -template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfma_f32_32x32x16_fp8_fp8; }; -template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfma_f32_32x32x16_fp8_fp8_CTransposed; }; -template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfma_f32_32x32x16_fp8_bf8; }; -template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfma_f32_32x32x16_fp8_bf8_CTransposed; }; -template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfma_f32_32x32x16_bf8_fp8; }; -template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfma_f32_32x32x16_bf8_fp8_CTransposed; }; -template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfma_f32_32x32x16_bf8_bf8; }; -template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfma_f32_32x32x16_bf8_bf8_CTransposed; }; +template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfma_f32_32x32x16_fp8_fp8; }; +template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfma_f32_32x32x16_fp8_fp8_CTransposed; }; +template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfma_f32_32x32x16_fp8_bf8; }; 
+template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfma_f32_32x32x16_fp8_bf8_CTransposed; }; +template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfma_f32_32x32x16_bf8_fp8; }; +template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfma_f32_32x32x16_bf8_fp8_CTransposed; }; +template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfma_f32_32x32x16_bf8_bf8; }; +template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfma_f32_32x32x16_bf8_bf8_CTransposed; }; // clang-format on } // namespace impl diff --git a/include/ck_tile/ops/gemm/warp/warp_gemm_impl.hpp b/include/ck_tile/ops/gemm/warp/warp_gemm_impl.hpp index eb9dbf127..182d023a0 100644 --- a/include/ck_tile/ops/gemm/warp/warp_gemm_impl.hpp +++ b/include/ck_tile/ops/gemm/warp/warp_gemm_impl.hpp @@ -31,11 +31,21 @@ struct WarpGemmImpl using BWarpTensor = static_distributed_tensor; using CWarpTensor = static_distributed_tensor; - CK_TILE_DEVICE void operator()(CWarpTensor& c, const AWarpTensor& a, const BWarpTensor& b) const + CK_TILE_HOST_DEVICE static constexpr auto get_num_of_access() { - using AVec = ext_vector_t; - using BVec = ext_vector_t; - using CVec = ext_vector_t; + return WarpGemmAttribute_::get_num_of_access(); + } + + template + CK_TILE_DEVICE void + operator()(CTensor& c, const ATensor& a, const BTensor& b, bool_constant = {}) const + { + static_assert(detail::is_similiar_distributed_tensor_v && + detail::is_similiar_distributed_tensor_v && + detail::is_similiar_distributed_tensor_v); + using AVec = ext_vector_t; + using BVec = ext_vector_t; + using CVec = ext_vector_t; constexpr auto I0 = number<0>{}; @@ -44,18 +54,49 @@ struct WarpGemmImpl auto c_vec = c.get_thread_buffer().template get_as()[I0]; // c_vec += a_vec * b_vec - WarpGemmAttribute{}(c_vec, a_vec, b_vec); + WarpGemmAttribute{}(c_vec, a_vec, b_vec, bool_constant{}); c.get_thread_buffer().template set_as(I0, c_vec); } - CK_TILE_DEVICE auto operator()(const AWarpTensor& a, const BWarpTensor& 
b) const + template + CK_TILE_DEVICE void operator()(CTensor& c, + const ATensor& a, + const BTensor& b, + number, + bool_constant = {}) const { - CWarpTensor c; + using AVec = ext_vector_t; + using BVec = ext_vector_t; + using CVec = ext_vector_t; + + constexpr auto I0 = number<0>{}; - using AVec = ext_vector_t; - using BVec = ext_vector_t; - using CVec = ext_vector_t; + const auto a_vec = a.get_thread_buffer().template get_as()[I0]; + const auto b_vec = b.get_thread_buffer().template get_as()[I0]; + auto c_vec = c.get_thread_buffer().template get_as()[I0]; + + // c_vec += a_vec * b_vec + WarpGemmAttribute{}(c_vec, a_vec, b_vec, number{}, bool_constant{}); + + c.get_thread_buffer().template set_as(I0, c_vec); + } + + template + CK_TILE_DEVICE auto operator()(const ATensor& a, const BTensor& b) const + { + using CTensor = CWarpTensor; + static_assert(detail::is_similiar_distributed_tensor_v && + detail::is_similiar_distributed_tensor_v); + CTensor c; + + using AVec = ext_vector_t; + using BVec = ext_vector_t; + using CVec = ext_vector_t; constexpr auto I0 = number<0>{}; diff --git a/include/ck_tile/ops/moe_sorting.hpp b/include/ck_tile/ops/moe_sorting.hpp deleted file mode 100644 index b74607f06..000000000 --- a/include/ck_tile/ops/moe_sorting.hpp +++ /dev/null @@ -1,11 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -#pragma once - -#include "ck_tile/ops/fused_moe/kernel/moe_sorting_kernel.hpp" -#include "ck_tile/ops/fused_moe/pipeline/moe_sorting_pipeline.hpp" -#include "ck_tile/ops/fused_moe/pipeline/moe_sorting_policy.hpp" -#include "ck_tile/ops/fused_moe/pipeline/moe_sorting_problem.hpp" -#include "ck_tile/ops/common/generic_2d_block_shape.hpp" -#include "ck_tile/ops/common/tensor_layout.hpp" -- GitLab From b6bcd76d881421af2f04246b1e4bbac45b7ce3b9 Mon Sep 17 00:00:00 2001 From: Adam Osewski <19374865+aosewski@users.noreply.github.com> Date: Tue, 26 Nov 2024 08:45:14 +0100 Subject: [PATCH 079/153] CK-Tile first draft of universal block gemm with interwave & intrawave scheduler (#1676) * Block universal gemm. * Universal block gemm with interwave scheduler - draft. * Refactoring * Move a/b_warp_tiles into BlockGemmImpl * set BlockGemmImpl as a class member * Change tile size for more suitable to memory bound cases. * Introduce kKPerThread to WarpGemm * Add documentation comment. * Fix Interwave scheduler block gemm. * Add compute/memory friendly tile configuration. * Clean * New tile configurations in gemm mem example. * Add more static checks and fix loop order in block gemm. * Add more static checks and use warp gemm mfma dispatcher. * Add default scheduler block gemm. * Remove logging in example. 
--- example/01_gemm/run_gemm_example_v2.inc | 2 +- example/ck_tile/03_gemm/gemm_mem_pipeline.cpp | 33 +- example/ck_tile/03_gemm/run_gemm_example.inc | 22 +- include/ck_tile/ops/gemm.hpp | 1 + .../block/block_universal_gemm_as_bs_cr.hpp | 661 ++++++++++++++++++ .../pipeline/gemm_pipeline_ag_bg_cr_mem.hpp | 12 +- .../gemm_pipeline_ag_bg_cr_scheduler.hpp | 2 + ...ine_agmem_bgmem_creg_v1_default_policy.hpp | 40 +- .../gemm/pipeline/gemm_pipeline_problem.hpp | 2 + .../gemm/warp/warp_gemm_attribute_mfma.hpp | 55 +- .../ck_tile/ops/gemm/warp/warp_gemm_impl.hpp | 7 +- 11 files changed, 780 insertions(+), 57 deletions(-) create mode 100644 include/ck_tile/ops/gemm/block/block_universal_gemm_as_bs_cr.hpp diff --git a/example/01_gemm/run_gemm_example_v2.inc b/example/01_gemm/run_gemm_example_v2.inc index 71524fdec..5b6969f1d 100644 --- a/example/01_gemm/run_gemm_example_v2.inc +++ b/example/01_gemm/run_gemm_example_v2.inc @@ -261,7 +261,7 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config) if(config.time_kernel) { ave_time = - invoker.Run(argument, StreamConfig{nullptr, config.time_kernel, 0, 5, 10, true, 4}); + invoker.Run(argument, StreamConfig{nullptr, config.time_kernel, 0, 50, 100, true, 4}); std::size_t flop = 2_uz * M * N * K; std::size_t num_btype = diff --git a/example/ck_tile/03_gemm/gemm_mem_pipeline.cpp b/example/ck_tile/03_gemm/gemm_mem_pipeline.cpp index ff9d8bad3..97d150412 100644 --- a/example/ck_tile/03_gemm/gemm_mem_pipeline.cpp +++ b/example/ck_tile/03_gemm/gemm_mem_pipeline.cpp @@ -17,9 +17,24 @@ template float gemm_calc(const gemm_basic_args& args, const ck_tile::stream_config& s) { - // ToDo: This will be modified by the codegen code later. 
+#if 1 + // Memory friendly for Interwave scheduler constexpr ck_tile::index_t M_Tile = 128; - constexpr ck_tile::index_t N_Tile = 128; + constexpr ck_tile::index_t N_Tile = 32; + constexpr ck_tile::index_t K_Tile = 64; + + constexpr ck_tile::index_t M_Warp = 4; + constexpr ck_tile::index_t N_Warp = 1; + constexpr ck_tile::index_t K_Warp = 1; + + constexpr ck_tile::index_t M_Warp_Tile = 32; + constexpr ck_tile::index_t N_Warp_Tile = 32; + constexpr ck_tile::index_t K_Warp_Tile = 8; + +#else + // Compute friendly for Intrawave scheduler + constexpr ck_tile::index_t M_Tile = 256; + constexpr ck_tile::index_t N_Tile = 256; constexpr ck_tile::index_t K_Tile = 32; constexpr ck_tile::index_t M_Warp = 2; @@ -28,12 +43,12 @@ float gemm_calc(const gemm_basic_args& args, const ck_tile::stream_config& s) constexpr ck_tile::index_t M_Warp_Tile = 32; constexpr ck_tile::index_t N_Warp_Tile = 32; - constexpr ck_tile::index_t K_Warp_Tile = 8; + constexpr ck_tile::index_t K_Warp_Tile = 16; +#endif - // The kPadA, kPadB, kPadC & kBlockPerCu should also come from the Codegen part. - constexpr bool kPadM = true; - constexpr bool kPadN = true; - constexpr bool kPadK = true; + constexpr bool kPadM = false; + constexpr bool kPadN = false; + constexpr bool kPadK = false; constexpr int kBlockPerCu = 1; @@ -174,8 +189,8 @@ float gemm_calc(const gemm_basic_args& args, const ck_tile::stream_config& s) { std::ostringstream err; err << "When there's no hot loop, this tail number \"" << tail_num - << "\" is not supported! " << __FILE__ << ":" << __LINE__ - << ", in function: " << __func__; + << "\" is not supported! 
PrefetchStages: " << BaseGemmPipeline::PrefetchStages + << "\n File: " << __FILE__ << ":" << __LINE__ << ", in function: " << __func__; throw std::runtime_error(err.str()); } } diff --git a/example/ck_tile/03_gemm/run_gemm_example.inc b/example/ck_tile/03_gemm/run_gemm_example.inc index 8db131738..5199c1e3e 100644 --- a/example/ck_tile/03_gemm/run_gemm_example.inc +++ b/example/ck_tile/03_gemm/run_gemm_example.inc @@ -31,15 +31,13 @@ float invoke_gemm(ck_tile::DeviceMem& a_m_k_dev_buf, float ave_time = gemm_calc( args, ck_tile::stream_config{nullptr, true, 1, n_warmup, n_repeat}); - std::string op_name{"Gemm{MemBoundPipeline}"}; - std::size_t flop = std::size_t(2) * M * N * K; std::size_t num_byte = sizeof(ADataType) * M * K + sizeof(BDataType) * N * K + sizeof(CDataType) * M * N; float tflops = static_cast(flop) / 1.E9 / ave_time; float gb_per_sec = num_byte / 1.E6 / ave_time; - std::cout << "Run " << op_name << "kernel with M =" << M << " N =" << N << " K =" << K + std::cout << "Run Gemm kernel with M =" << M << " N =" << N << " K =" << K << " StrideA =" << stride_A << " StrideB =" << stride_B << " StrideC =" << stride_C << " : " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, " << std::endl; @@ -114,7 +112,6 @@ int run_gemm_example_with_layouts(int argc, f_host_tensor_descriptor(M, N, stride_C, CLayout{})); // TODO: add different init types - ck_tile::FillUniformDistribution{-5.f, 5.f}(a_m_k); ck_tile::FillUniformDistribution{-5.f, 5.f}(b_k_n); @@ -202,14 +199,15 @@ int run_gemm_example(int argc, char* argv[]) { return run_gemm_example_with_layouts(argc, argv, Row{}, Col{}, Row{}); } - else if(a_layout == "C" && b_layout == "C") - { - return run_gemm_example_with_layouts(argc, argv, Col{}, Col{}, Row{}); - } - else if(a_layout == "C" && b_layout == "R") - { - return run_gemm_example_with_layouts(argc, argv, Col{}, Row{}, Row{}); - } + // TODO: Fixme: with latest changes to GemmPipelineAGmemBGmemCRegV1DefaultPolicy below do not + // work. 
else if(a_layout == "C" && b_layout == "C") + // { + // return run_gemm_example_with_layouts(argc, argv, Col{}, Col{}, Row{}); + // } + // else if(a_layout == "C" && b_layout == "R") + // { + // return run_gemm_example_with_layouts(argc, argv, Col{}, Row{}, Row{}); + // } else { throw std::runtime_error("Unsupported data layout configuration for A,B and C tensors!"); diff --git a/include/ck_tile/ops/gemm.hpp b/include/ck_tile/ops/gemm.hpp index ac74782a3..9a033ee2d 100644 --- a/include/ck_tile/ops/gemm.hpp +++ b/include/ck_tile/ops/gemm.hpp @@ -22,6 +22,7 @@ #include "ck_tile/ops/gemm/block/block_gemm_asmem_bsmem_creg_v1_custom_policy.hpp" #include "ck_tile/ops/gemm/block/block_gemm_asmem_bsmem_creg_v1_default_policy.hpp" #include "ck_tile/ops/gemm/block/block_gemm_problem.hpp" +#include "ck_tile/ops/gemm/block/block_universal_gemm_as_bs_cr.hpp" #include "ck_tile/ops/gemm/kernel/gemm_kernel.hpp" #include "ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp" #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp" diff --git a/include/ck_tile/ops/gemm/block/block_universal_gemm_as_bs_cr.hpp b/include/ck_tile/ops/gemm/block/block_universal_gemm_as_bs_cr.hpp new file mode 100644 index 000000000..5f98a7a0b --- /dev/null +++ b/include/ck_tile/ops/gemm/block/block_universal_gemm_as_bs_cr.hpp @@ -0,0 +1,661 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck_tile/core.hpp" +#include "ck_tile/ops/gemm/block/block_gemm_asmem_bsmem_creg_v1_default_policy.hpp" +#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_scheduler.hpp" + +namespace ck_tile { + +// A is block window on shared memory +// B is block window on shared memory +// C is block distributed tensor +template +struct BlockUniversalGemmAsBsCr +{ + private: + // TODO: This should be in Policy - UniversalGemmPolicyBase ? 
+ template + struct GemmTraits_ + { + using Problem = remove_cvref_t; + using Policy = remove_cvref_t; + using ADataType = remove_cvref_t; + using BDataType = remove_cvref_t; + using CDataType = remove_cvref_t; + using BlockGemmShape = remove_cvref_t; + + static constexpr index_t kBlockSize = Problem::kBlockSize; + static constexpr auto Scheduler = Problem::Scheduler; + + static constexpr index_t MPerBlock = BlockGemmShape::kM; + static constexpr index_t NPerBlock = BlockGemmShape::kN; + static constexpr index_t KPerBlock = BlockGemmShape::kK; + + static constexpr auto config = Policy::template GetWarpGemmMWarpNWarp(); + + using WarpGemm = remove_cvref_t())>; + + static constexpr index_t MWarp = config.template at<1>(); + static constexpr index_t NWarp = config.template at<2>(); + + static_assert(MWarp == BlockGemmShape::BlockWarps::at(number<0>{}), + "Error! WarpGemm's MWarp is not consisten with BlockGemmShape!"); + static_assert(NWarp == BlockGemmShape::BlockWarps::at(number<1>{}), + "Error! WarpGemm's NWarp is not consisten with BlockGemmShape!"); + static_assert(WarpGemm::kM == BlockGemmShape::WarpTile::at(number<0>{}), + "Error! WarpGemm's M is not consisten with BlockGemmShape!"); + static_assert(WarpGemm::kN == BlockGemmShape::WarpTile::at(number<1>{}), + "Error! WarpGemm's N is not consisten with BlockGemmShape!"); + + static constexpr index_t MIterPerWarp = MPerBlock / (MWarp * WarpGemm::kM); + static constexpr index_t NIterPerWarp = NPerBlock / (NWarp * WarpGemm::kN); + static constexpr index_t KIterPerWarp = KPerBlock / WarpGemm::kK; + + static_assert(MIterPerWarp * MWarp * WarpGemm::kM == MPerBlock, + "Error! Warps should cover all Block tile!"); + static_assert(NIterPerWarp * NWarp * WarpGemm::kN == NPerBlock, + "Error! 
Warps should cover all Block tile!"); + + static constexpr index_t MPerBlockPerIter = MWarp * WarpGemm::kM; + static constexpr index_t NPerBlockPerIter = NWarp * WarpGemm::kN; + static constexpr index_t KPerBlockPerIter = WarpGemm::kK; + + using AWarpTileDistr = remove_cvref_t; + using BWarpTileDistr = remove_cvref_t; + + using AWarpTile = + remove_cvref_t(AWarpTileDistr{}))>; + using BWarpTile = + remove_cvref_t(BWarpTileDistr{}))>; + + // TODO: Should we have two policies? Interwave & Intrawave ?? + static constexpr index_t InterWaveSchedulingMacClusters = 1; + + static constexpr index_t KPack = WarpGemm::kKPerThread; + static constexpr index_t KPerThread = KPerBlock / WarpGemm::kK * KPack; + static constexpr index_t KRepeat = KPerThread / KPack; + }; + + public: + using Traits = GemmTraits_; + + using ADataType = remove_cvref_t; + using BDataType = remove_cvref_t; + using CDataType = remove_cvref_t; + + using WarpGemm = remove_cvref_t; + + static constexpr index_t KIterPerWarp = Traits::KIterPerWarp; + static constexpr index_t MIterPerWarp = Traits::MIterPerWarp; + static constexpr index_t NIterPerWarp = Traits::NIterPerWarp; + + static constexpr index_t MWarp = Traits::MWarp; + static constexpr index_t NWarp = Traits::NWarp; + + static constexpr auto Scheduler = Traits::Scheduler; + + private: + template + struct BlockGemmImpl + { + }; + + template + struct BlockGemmImpl + { + // C += A * B + template + CK_TILE_DEVICE void operator()(CBlockTensor& c_block_tensor, + const ASmemBlockWindow& a_block_window, + const BSmemBlockWindow& b_block_window) + { + static_assert( + std::is_same_v, + "The CDataType as defined in traits should be the same as correspoinding " + "C block tensor data type!"); + static_assert(std::is_same_v && + std::is_same_v, + "The ADataType and BDataType as defined in " + "traits should be the same as correspoinding block window data type!"); + + static_assert( + GemmTraits::MPerBlock == ASmemBlockWindow{}.get_window_lengths()[number<0>{}] && 
+ GemmTraits::NPerBlock == BSmemBlockWindow{}.get_window_lengths()[number<0>{}] && + GemmTraits::KPerBlock == ASmemBlockWindow{}.get_window_lengths()[number<1>{}], + "MPerBlock, NPerBlock, KPerBlock defined in " + " BlockGemmShape are different from A/B block smem windows apropriate dims!"); + + const index_t iMWarp = get_warp_id() / GemmTraits::NWarp; + const index_t iNWarp = get_warp_id() - (iMWarp * GemmTraits::NWarp); + + // TODO: refactor warp_window tile type to class member as it should be + // compile-time known information. + auto a_warp_window_tmp = make_tile_window( + a_block_window.get_bottom_tensor_view(), + make_tuple(number{}, number{}), + a_block_window.get_window_origin() + + multi_index<2>{iMWarp * GemmTraits::WarpGemm::kM, 0}, + make_static_tile_distribution(typename GemmTraits::WarpGemm::AWarpDstrEncoding{})); + + using AWarpWindow = remove_cvref_t; + + static_assert(GemmTraits::AWarpTile::get_num_of_dimension() == + AWarpWindow::get_num_of_dimension(), + "AWarpWindow number of dimensions must be equal to " + "AWarpTile number of dimensions!"); + static_assert(GemmTraits::AWarpTile::get_lengths() == + AWarpWindow{}.get_window_lengths(), + "AWarpWindow lengths must be equal to AWarpTile lengths!"); + + statically_indexed_array< + statically_indexed_array, + GemmTraits::MIterPerWarp> + a_warp_windows; + + // construct B-warp-window + auto b_warp_window_tmp = make_tile_window( + b_block_window.get_bottom_tensor_view(), + make_tuple(number{}, number{}), + b_block_window.get_window_origin() + + multi_index<2>{iNWarp * GemmTraits::WarpGemm::kN, 0}, + make_static_tile_distribution(typename GemmTraits::WarpGemm::BWarpDstrEncoding{})); + + using BWarpWindow = remove_cvref_t; + + static_assert(GemmTraits::BWarpTile::get_num_of_dimension() == + BWarpWindow::get_num_of_dimension(), + "BWarpWindow number of dimensions must be equal to " + "BWarpTile number of dimensions!"); + static_assert(GemmTraits::BWarpTile::get_lengths() == + 
BWarpWindow{}.get_window_lengths(), + "BWarpWindow lengths must be equal to BWarpTile lengths!"); + + statically_indexed_array< + statically_indexed_array, + GemmTraits::NIterPerWarp> + b_warp_windows; + + static_for<0, GemmTraits::MIterPerWarp, 1>{}([&](auto mIter) { + static_for<0, GemmTraits::KIterPerWarp, 1>{}([&](auto kIter) { + a_warp_windows(mIter)(kIter) = a_warp_window_tmp; + + // TODO: I don't have to move 0,0 window! + move_tile_window(a_warp_windows(mIter)(kIter), + {mIter * GemmTraits::MPerBlockPerIter, + kIter * GemmTraits::KPerBlockPerIter}); + }); + }); + + static_for<0, GemmTraits::NIterPerWarp, 1>{}([&](auto nIter) { + static_for<0, GemmTraits::KIterPerWarp, 1>{}([&](auto kIter) { + b_warp_windows(nIter)(kIter) = b_warp_window_tmp; + + move_tile_window(b_warp_windows(nIter)(kIter), + {nIter * GemmTraits::NPerBlockPerIter, + kIter * GemmTraits::KPerBlockPerIter}); + }); + }); + + using CWarpDstr = typename GemmTraits::WarpGemm::CWarpDstr; + using CWarpTensor = typename GemmTraits::WarpGemm::CWarpTensor; + + constexpr auto c_warp_y_lengths = + to_sequence(CWarpDstr{}.get_ys_to_d_descriptor().get_lengths()); + constexpr auto c_warp_y_index_zeros = uniform_sequence_gen_t{}; + + // hot loop: + static_for<0, GemmTraits::KIterPerWarp, 1>{}([&](auto kIter) { + static_for<0, GemmTraits::MIterPerWarp, 1>{}([&](auto mIter) { + const auto a_warp_tile = load_tile(a_warp_windows(mIter)(kIter)); + + static_for<0, GemmTraits::NIterPerWarp, 1>{}([&](auto nIter) { + const auto b_warp_tile = load_tile(b_warp_windows(nIter)(kIter)); + + // read C warp tensor from C block tensor- + CWarpTensor c_warp_tensor; + + c_warp_tensor.get_thread_buffer() = c_block_tensor.get_y_sliced_thread_data( + merge_sequences(sequence{}, c_warp_y_index_zeros), + merge_sequences(sequence<1, 1>{}, c_warp_y_lengths)); + + // warp GEMM + typename GemmTraits::WarpGemm{}(c_warp_tensor, a_warp_tile, b_warp_tile); + + // write C warp tensor into C block tensor + 
c_block_tensor.set_y_sliced_thread_data( + merge_sequences(sequence{}, c_warp_y_index_zeros), + merge_sequences(sequence<1, 1>{}, c_warp_y_lengths), + c_warp_tensor.get_thread_buffer()); + }); + }); + }); + } + }; + + template + struct BlockGemmImpl + { + statically_indexed_array< + statically_indexed_array, + GemmTraits::MIterPerWarp> + a_warp_tiles_; + + statically_indexed_array< + statically_indexed_array, + GemmTraits::NIterPerWarp> + b_warp_tiles_; + + template + CK_TILE_DEVICE void LocalPrefetch(const ASmemBlockWindow& a_block_window, + const BSmemBlockWindow& b_block_window) + { + static_assert( + GemmTraits::MPerBlock == ASmemBlockWindow{}.get_window_lengths()[number<0>{}] && + GemmTraits::NPerBlock == BSmemBlockWindow{}.get_window_lengths()[number<0>{}] && + GemmTraits::KPerBlock == ASmemBlockWindow{}.get_window_lengths()[number<1>{}], + "MPerBlock, NPerBlock, KPerBlock defined in " + " BlockGemmShape are different from A/B block smem windows apropriate dims!"); + + static_assert(std::is_same_v && + std::is_same_v, + "The ADataType and BDataType as defined in " + "traits should be the same as correspoinding block window data type!"); + + const index_t iMWarp = get_warp_id() / GemmTraits::NWarp; + const index_t iNWarp = get_warp_id() - (iMWarp * GemmTraits::NWarp); + + // TODO: refactor warp_window tile type to class member as it should be + // compile-time known information. 
+ auto a_warp_window_tmp = make_tile_window( + a_block_window.get_bottom_tensor_view(), + make_tuple(number{}, number{}), + a_block_window.get_window_origin() + + multi_index<2>{iMWarp * GemmTraits::WarpGemm::kM, 0}, + make_static_tile_distribution(typename GemmTraits::WarpGemm::AWarpDstrEncoding{})); + + using AWarpWindow = remove_cvref_t; + + static_assert(GemmTraits::AWarpTile::get_num_of_dimension() == + AWarpWindow::get_num_of_dimension(), + "AWarpWindow number of dimensions must be equal to " + "AWarpTile number of dimensions!"); + static_assert(GemmTraits::AWarpTile::get_lengths() == + AWarpWindow{}.get_window_lengths(), + "AWarpWindow lengths must be equal to AWarpTile lengths!"); + + statically_indexed_array< + statically_indexed_array, + GemmTraits::MIterPerWarp> + a_warp_windows; + + // construct B-warp-window + auto b_warp_window_tmp = make_tile_window( + b_block_window.get_bottom_tensor_view(), + make_tuple(number{}, number{}), + b_block_window.get_window_origin() + + multi_index<2>{iNWarp * GemmTraits::WarpGemm::kN, 0}, + make_static_tile_distribution(typename GemmTraits::WarpGemm::BWarpDstrEncoding{})); + + using BWarpWindow = remove_cvref_t; + + static_assert(GemmTraits::BWarpTile::get_num_of_dimension() == + BWarpWindow::get_num_of_dimension(), + "BWarpWindow number of dimensions must be equal to " + "BWarpTile number of dimensions!"); + static_assert(GemmTraits::BWarpTile::get_lengths() == + BWarpWindow{}.get_window_lengths(), + "BWarpWindow lengths must be equal to BWarpTile lengths!"); + + statically_indexed_array< + statically_indexed_array, + GemmTraits::NIterPerWarp> + b_warp_windows; + + static_for<0, GemmTraits::MIterPerWarp, 1>{}([&](auto mIter) { + static_for<0, GemmTraits::KIterPerWarp, 1>{}([&](auto kIter) { + a_warp_windows(mIter)(kIter) = a_warp_window_tmp; + + // TODO: I don't have to move 0,0 window! 
+ move_tile_window(a_warp_windows(mIter)(kIter), + {mIter * GemmTraits::MPerBlockPerIter, + kIter * GemmTraits::KPerBlockPerIter}); + }); + }); + + static_for<0, GemmTraits::NIterPerWarp, 1>{}([&](auto nIter) { + static_for<0, GemmTraits::KIterPerWarp, 1>{}([&](auto kIter) { + b_warp_windows(nIter)(kIter) = b_warp_window_tmp; + + move_tile_window(b_warp_windows(nIter)(kIter), + {nIter * GemmTraits::NPerBlockPerIter, + kIter * GemmTraits::KPerBlockPerIter}); + }); + }); + + static_for<0, GemmTraits::KIterPerWarp, 1>{}([&](auto kIter) { + static_for<0, GemmTraits::MIterPerWarp, 1>{}([&](auto mIter) { + // read A warp tensor from A block window + load_tile(a_warp_tiles_(mIter)(kIter), a_warp_windows(mIter)(kIter)); + }); + static_for<0, GemmTraits::NIterPerWarp, 1>{}([&](auto nIter) { + // read B warp tensor from B Block window + load_tile(b_warp_tiles_(nIter)(kIter), b_warp_windows(nIter)(kIter)); + }); + }); + } + + // C += A * B + template + CK_TILE_DEVICE void operator()(CBlockTensor& c_block_tensor, + [[maybe_unused]] const ASmemBlockWindow& a_block_window, + [[maybe_unused]] const BSmemBlockWindow& b_block_window) + { + static_assert( + std::is_same_v, + "The CDataType as defined in traits should be the same as correspoinding " + "C block tensor data type!"); + + using CWarpDstr = typename GemmTraits::WarpGemm::CWarpDstr; + using CWarpTensor = typename GemmTraits::WarpGemm::CWarpTensor; + + constexpr auto c_warp_y_lengths = + to_sequence(CWarpDstr{}.get_ys_to_d_descriptor().get_lengths()); + constexpr auto c_warp_y_index_zeros = uniform_sequence_gen_t{}; + + // hot loop: + static_for<0, GemmTraits::KIterPerWarp, 1>{}([&](auto kIter) { + static_for<0, GemmTraits::MIterPerWarp, 1>{}([&](auto mIter) { + static_for<0, GemmTraits::NIterPerWarp, 1>{}([&](auto nIter) { + // read C warp tensor from C block tensor- + CWarpTensor c_warp_tensor; + + c_warp_tensor.get_thread_buffer() = c_block_tensor.get_y_sliced_thread_data( + merge_sequences(sequence{}, 
c_warp_y_index_zeros), + merge_sequences(sequence<1, 1>{}, c_warp_y_lengths)); + + // warp GEMM + typename GemmTraits::WarpGemm{}(c_warp_tensor, + a_warp_tiles_[mIter][kIter], + b_warp_tiles_[nIter][kIter]); + + // write C warp tensor into C block tensor + c_block_tensor.set_y_sliced_thread_data( + merge_sequences(sequence{}, c_warp_y_index_zeros), + merge_sequences(sequence<1, 1>{}, c_warp_y_lengths), + c_warp_tensor.get_thread_buffer()); + }); + }); + }); + } + }; + + template + struct BlockGemmImpl + { + static constexpr index_t KPerThread = GemmTraits::KPerThread; + static constexpr index_t NumMacClusters = GemmTraits::InterWaveSchedulingMacClusters; + static constexpr index_t KPerInnerLoop = + ck_tile::max(KPerThread / NumMacClusters, GemmTraits::KPack); + // TODO: do we really need this?? Are there any cases when this would be >=1 ?? + // Would we need InterWaveSchedulingMacClusters > 1 ??? + static constexpr index_t KRepeat = KPerThread / KPerInnerLoop; + static constexpr index_t KInnerLoopIter = KPerInnerLoop / GemmTraits::KPack; + + statically_indexed_array< + statically_indexed_array, + GemmTraits::MIterPerWarp> + a_warp_tiles_; + + statically_indexed_array< + statically_indexed_array, + GemmTraits::NIterPerWarp> + b_warp_tiles_; + + template + CK_TILE_DEVICE void LocalPrefetch(const ASmemBlockWindow& a_block_window, + const BSmemBlockWindow& b_block_window) + { + static_assert( + GemmTraits::MPerBlock == ASmemBlockWindow{}.get_window_lengths()[number<0>{}] && + GemmTraits::NPerBlock == BSmemBlockWindow{}.get_window_lengths()[number<0>{}] && + GemmTraits::KPerBlock == ASmemBlockWindow{}.get_window_lengths()[number<1>{}], + "MPerBlock, NPerBlock, KPerBlock defined in " + " BlockGemmShape are different from A/B block smem windows apropriate dims!"); + + static_assert(std::is_same_v && + std::is_same_v, + "The ADataType and BDataType as defined in " + "traits should be the same as correspoinding block window data type!"); + + const index_t iMWarp = 
get_warp_id() / GemmTraits::NWarp; + const index_t iNWarp = get_warp_id() - (iMWarp * GemmTraits::NWarp); + + // TODO: refactor warp_window tile type to class member as it should be + // compile-time known information. + auto a_warp_window_tmp = make_tile_window( + a_block_window.get_bottom_tensor_view(), + make_tuple(number{}, number{}), + a_block_window.get_window_origin() + + multi_index<2>{iMWarp * GemmTraits::WarpGemm::kM, KIdx * KPerInnerLoop}, + make_static_tile_distribution(typename GemmTraits::WarpGemm::AWarpDstrEncoding{})); + + using AWarpWindow = remove_cvref_t; + + static_assert(GemmTraits::AWarpTile::get_num_of_dimension() == + AWarpWindow::get_num_of_dimension(), + "AWarpWindow number of dimensions must be equal to " + "AWarpTile number of dimensions!"); + static_assert(GemmTraits::AWarpTile::get_lengths() == + AWarpWindow{}.get_window_lengths(), + "AWarpWindow lengths must be equal to AWarpTile lengths!"); + + statically_indexed_array, + GemmTraits::MIterPerWarp> + a_warp_windows; + + // construct B-warp-window + auto b_warp_window_tmp = make_tile_window( + b_block_window.get_bottom_tensor_view(), + make_tuple(number{}, number{}), + b_block_window.get_window_origin() + + multi_index<2>{iNWarp * GemmTraits::WarpGemm::kN, KIdx * KPerInnerLoop}, + make_static_tile_distribution(typename GemmTraits::WarpGemm::BWarpDstrEncoding{})); + + using BWarpWindow = remove_cvref_t; + + static_assert(GemmTraits::BWarpTile::get_num_of_dimension() == + BWarpWindow::get_num_of_dimension(), + "BWarpWindow number of dimensions must be equal to " + "BWarpTile number of dimensions!"); + static_assert(GemmTraits::BWarpTile::get_lengths() == + BWarpWindow{}.get_window_lengths(), + "BWarpWindow lengths must be equal to BWarpTile lengths!"); + + statically_indexed_array, + GemmTraits::NIterPerWarp> + b_warp_windows; + + static_for<0, GemmTraits::MIterPerWarp, 1>{}([&](auto mIter) { + static_for<0, KInnerLoopIter, 1>{}([&](auto kIter) { + a_warp_windows(mIter)(kIter) = 
a_warp_window_tmp; + + move_tile_window(a_warp_windows(mIter)(kIter), + {mIter * GemmTraits::MPerBlockPerIter, + kIter * GemmTraits::KPerBlockPerIter}); + }); + }); + + static_for<0, GemmTraits::NIterPerWarp, 1>{}([&](auto nIter) { + static_for<0, KInnerLoopIter, 1>{}([&](auto kIter) { + b_warp_windows(nIter)(kIter) = b_warp_window_tmp; + + move_tile_window(b_warp_windows(nIter)(kIter), + {nIter * GemmTraits::NPerBlockPerIter, + kIter * GemmTraits::KPerBlockPerIter}); + }); + }); + + // TODO check if a_warp_tiles has same desc as a_warp_window + static_for<0, KInnerLoopIter, 1>{}([&](auto kIter) { + static_for<0, GemmTraits::MIterPerWarp, 1>{}([&](auto mIter) { + // read A warp tensor from A block window + load_tile(a_warp_tiles_(mIter)(kIter), a_warp_windows(mIter)(kIter)); + }); + static_for<0, GemmTraits::NIterPerWarp, 1>{}([&](auto nIter) { + // read B warp tensor from B Block window + load_tile(b_warp_tiles_(nIter)(kIter), b_warp_windows(nIter)(kIter)); + }); + }); + } + + // C += A * B + template + CK_TILE_DEVICE void operator()(CBlockTensor& c_block_tensor, + const ASmemBlockWindow& a_block_window, + const BSmemBlockWindow& b_block_window) + { + static_assert( + std::is_same_v, + "The CDataType as defined in traits should be the same as correspoinding " + "C block tensor data type!"); + + using CWarpDstr = typename GemmTraits::WarpGemm::CWarpDstr; + using CWarpTensor = typename GemmTraits::WarpGemm::CWarpTensor; + + constexpr auto c_warp_y_lengths = + to_sequence(CWarpDstr{}.get_ys_to_d_descriptor().get_lengths()); + constexpr auto c_warp_y_index_zeros = uniform_sequence_gen_t{}; + + // hot loop: + static_for<0, KRepeat, 1>{}([&](auto kIter) { + LocalPrefetch(a_block_window, b_block_window); + __builtin_amdgcn_sched_barrier(0); + // NOTE: Synchronize threads in a workgroup at the start of each MAC + // cluster, but except the first, as we can shorten non-MAC cluster a bit + // and there's no observable negative impact. 
The desired effect is waves in + // a workgroup executing MAC in sync. This avoids some out-of-sync waves + // hijacking MAC resource from other workgroups and reducing the chance of + // latency hiding by waiting for the rest of the workgroup at the eventual + // sync point. + if constexpr(kIter.value != 0 || KRepeat == 1) + { + __builtin_amdgcn_s_barrier(); + __builtin_amdgcn_sched_barrier(0); + } + + static_for<0, KInnerLoopIter, 1>{}([&](auto kInnerIter) { + static_for<0, GemmTraits::MIterPerWarp, 1>{}([&](auto mIter) { + static_for<0, GemmTraits::NIterPerWarp, 1>{}([&](auto nIter) { + // read C warp tensor from C block tensor- + CWarpTensor c_warp_tensor; + + c_warp_tensor.get_thread_buffer() = + c_block_tensor.get_y_sliced_thread_data( + merge_sequences(sequence{}, c_warp_y_index_zeros), + merge_sequences(sequence<1, 1>{}, c_warp_y_lengths)); + + // The block_sync_lds() here performs double duty: + // A) safeguard against data hazard because barrier from + // blockwise_gemm is moved here B) reduce VMEM FIFO congestion + // by applying small delays to different wavefronts It is + // performed near the end of MAC cluster to minimize lgkmcnt + // penalty + if constexpr(kIter.value == KRepeat - 1 && + kInnerIter.value == KInnerLoopIter - 1 && + mIter.value == GemmTraits::MIterPerWarp - 1 && + nIter.value == GemmTraits::NIterPerWarp - 1) + { + __builtin_amdgcn_sched_barrier(0); + block_sync_lds(); + __builtin_amdgcn_sched_barrier(0); + } + // warp GEMM + typename GemmTraits::WarpGemm{}(c_warp_tensor, + a_warp_tiles_[mIter][kInnerIter], + b_warp_tiles_[nIter][kInnerIter]); + + // write C warp tensor into C block tensor + c_block_tensor.set_y_sliced_thread_data( + merge_sequences(sequence{}, c_warp_y_index_zeros), + merge_sequences(sequence<1, 1>{}, c_warp_y_lengths), + c_warp_tensor.get_thread_buffer()); + + if constexpr(kInnerIter.value == 0 && mIter.value == 0 && + nIter.value == 0) + { + __builtin_amdgcn_sched_barrier(0); + __builtin_amdgcn_s_setprio(1); + 
__builtin_amdgcn_sched_barrier(0); + } + }); + }); + }); + + __builtin_amdgcn_sched_barrier(0); + __builtin_amdgcn_s_setprio(0); + __builtin_amdgcn_sched_barrier(0); + }); + } + }; + + public: + CK_TILE_DEVICE static constexpr auto MakeCBlockTile() + { + constexpr auto c_block_outer_dstr_encoding = tile_distribution_encoding< + sequence<>, + tuple, sequence>, + tuple>, + tuple>, + sequence<1, 2>, + sequence<0, 0>>{}; + + constexpr auto c_block_dstr_encode = detail::make_embed_tile_distribution_encoding( + c_block_outer_dstr_encoding, typename WarpGemm::CWarpDstrEncoding{}); + constexpr auto c_block_dstr = make_static_tile_distribution(c_block_dstr_encode); + auto c_block_tensor = make_static_distributed_tensor(c_block_dstr); + + return c_block_tensor; + } + + template + CK_TILE_DEVICE void LocalPrefetch(const ASmemBlockWindow& a_block_window, + const BSmemBlockWindow& b_block_window) + { + block_gemm_impl_.template LocalPrefetch(a_block_window, b_block_window); + } + + // C += A * B + template + CK_TILE_DEVICE void operator()(CBlockTensor& c_block_tensor, + const ASmemBlockWindow& a_block_window, + const BSmemBlockWindow& b_block_window) + { + block_gemm_impl_.template operator()(c_block_tensor, a_block_window, b_block_window); + } + + // C = A * B + template + CK_TILE_DEVICE auto operator()(const ASmemBlockWindow& a_block_window, + const BSmemBlockWindow& b_block_window) + { + auto c_block_tensor = MakeCBlockTile(); + block_gemm_impl_.template operator()(c_block_tensor, a_block_window, b_block_window); + return c_block_tensor; + } + + private: + BlockGemmImpl block_gemm_impl_{}; +}; + +} // namespace ck_tile diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp index 85c5c5805..4634e9dcb 100644 --- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp +++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp @@ -247,8 +247,8 @@ struct 
GemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem b_lds_block, make_tuple(number{}, number{}), {0, 0}); // Block GEMM - constexpr auto block_gemm = BlockGemm(); - auto c_block_tile = block_gemm.MakeCBlockTile(); + auto block_gemm = BlockGemm(); + auto c_block_tile = block_gemm.MakeCBlockTile(); using ABlockTileDistr = decltype(a_copy_dram_window.get_tile_distribution()); using BBlockTileDistr = decltype(b_copy_dram_window.get_tile_distribution()); @@ -290,7 +290,7 @@ struct GemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem { static_for<0, PrefetchStages, 1>{}([&](auto prefetch_idx) { block_sync_lds(); - // block_gemm.LocalPrefetch(); + block_gemm.LocalPrefetch(a_lds_gemm_window, b_lds_gemm_window); block_gemm(c_block_tile, a_lds_gemm_window, b_lds_gemm_window); block_sync_lds(); @@ -318,7 +318,7 @@ struct GemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem static_for<1, tail_num, 1>{}([&](auto prefetch_idx) { block_sync_lds(); - // block_gemm.LocalPrefetch(); + block_gemm.LocalPrefetch(a_lds_gemm_window, b_lds_gemm_window); block_gemm(c_block_tile, a_lds_gemm_window, b_lds_gemm_window); block_sync_lds(); @@ -331,14 +331,14 @@ struct GemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem }); block_sync_lds(); - // block_gemm.LocalPrefetch(); + block_gemm.LocalPrefetch(a_lds_gemm_window, b_lds_gemm_window); block_gemm(c_block_tile, a_lds_gemm_window, b_lds_gemm_window); }; if constexpr(TailNum == TailNumber::One) { block_sync_lds(); - // block_gemm.LocalPrefetch(); + block_gemm.LocalPrefetch(a_lds_gemm_window, b_lds_gemm_window); block_gemm(c_block_tile, a_lds_gemm_window, b_lds_gemm_window); } else if constexpr(TailNum == TailNumber::Two) diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_scheduler.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_scheduler.hpp index 5e93ca21c..6f51e6b8a 100644 --- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_scheduler.hpp +++ 
b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_scheduler.hpp @@ -11,6 +11,7 @@ namespace ck_tile { enum struct GemmPipelineScheduler { + Default, Intrawave, Interwave, }; @@ -43,6 +44,7 @@ inline std::ostream& operator<<(std::ostream& os, const ck_tile::GemmPipelineSch { switch(s) { + case ck_tile::GemmPipelineScheduler::Default: os << "Default"; break; case ck_tile::GemmPipelineScheduler::Intrawave: os << "Intrawave"; break; case ck_tile::GemmPipelineScheduler::Interwave: os << "Interwave"; break; default: os << ""; diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1_default_policy.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1_default_policy.hpp index c765b3ce9..b475ebb7b 100644 --- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1_default_policy.hpp +++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1_default_policy.hpp @@ -4,6 +4,7 @@ #pragma once #include "ck_tile/core.hpp" +#include "ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp" namespace ck_tile { @@ -52,6 +53,7 @@ struct GemmPipelineAGmemBGmemCRegV1DefaultPolicy constexpr index_t kMPerBlock = Problem::BlockGemmShape::kM; constexpr index_t kKPerBlock = Problem::BlockGemmShape::kK; + // TODO: this 8 is AK1! should be a policy parameter! constexpr auto a_lds_block_desc_0 = make_naive_tensor_descriptor( make_tuple(number{}, number{}, number<8>{}), make_tuple(number<(kMPerBlock + 1) * 8>{}, number<8>{}, number<1>{}), @@ -264,6 +266,9 @@ struct GemmPipelineAGmemBGmemCRegV1DefaultPolicy static_assert(M2 != 0, "M2 is zero, which will lead to a division by zero error."); static_assert(M1 != 0, "M1 is zero, which will lead to a division by zero error."); constexpr index_t M0 = MPerBlock / (M2 * M1); + static_assert(M0 * M1 * M2 == MPerBlock, + "Incorrect M0, M2, M1 configuration! 
" + "M0, M1, M2 must cover whole MPerBlock!"); return make_static_tile_distribution( tile_distribution_encoding, @@ -277,6 +282,9 @@ struct GemmPipelineAGmemBGmemCRegV1DefaultPolicy { constexpr index_t M0 = BlockSize / get_warp_size(); constexpr index_t M1 = MPerBlock / (M2 * M0); + static_assert(M0 * M1 * M2 == MPerBlock, + "Incorrect M0, M1, M2 configuration! " + "M0, M1, M2 must cover whole MPerBlock!"); return make_static_tile_distribution( tile_distribution_encoding, tuple, sequence>, @@ -350,6 +358,9 @@ struct GemmPipelineAGmemBGmemCRegV1DefaultPolicy static_assert(N2 != 0, "N2 is zero, which will lead to a division by zero error."); static_assert(N1 != 0, "N1 is zero, which will lead to a division by zero error."); constexpr index_t N0 = NPerBlock / (N2 * N1); + static_assert(N0 * N1 * N2 == NPerBlock, + "Incorrect N0, N1, N2 configuration! " + "N0, N1, N2 must cover whole NPerBlock!"); return make_static_tile_distribution( tile_distribution_encoding, @@ -364,7 +375,9 @@ struct GemmPipelineAGmemBGmemCRegV1DefaultPolicy { constexpr index_t N0 = BlockSize / get_warp_size(); constexpr index_t N1 = NPerBlock / (N2 * N0); - + static_assert(N0 * N1 * N2 == NPerBlock, + "Incorrect N0, N1, N2 configuration! 
" + "N0, N1, N2 must cover whole NPerBlock!"); return make_static_tile_distribution( tile_distribution_encoding, tuple, sequence>, @@ -475,9 +488,28 @@ struct GemmPipelineAGmemBGmemCRegV1DefaultPolicy template CK_TILE_HOST_DEVICE static constexpr auto GetBlockGemm() { - using BlockGemmPolicy = BlockGemmASmemBSmemCRegV1DefaultPolicy; - - return BlockGemmASmemBSmemCRegV1{}; + constexpr bool TransposeC = false; + constexpr auto I0 = number<0>{}; + constexpr auto I1 = number<1>{}; + constexpr auto I2 = number<2>{}; + + using AccDataType = float; + using BlockWarps = typename Problem::BlockGemmShape::BlockWarps; + using WarpTile = typename Problem::BlockGemmShape::WarpTile; + using WarpGemm = WarpGemmMfmaDispatcher; + using BlockGemmPolicy = BlockGemmASmemBSmemCRegV1CustomPolicy; + + return BlockUniversalGemmAsBsCr{}; } }; diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_problem.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_problem.hpp index 3c43790bd..bf51577ae 100644 --- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_problem.hpp +++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_problem.hpp @@ -33,6 +33,8 @@ struct GemmPipelineProblemBase static constexpr bool kPadN = GemmTraits::kPadN; static constexpr bool kPadK = GemmTraits::kPadK; + static constexpr auto Scheduler = GemmPipelineScheduler::Default; + CK_TILE_HOST_DEVICE static constexpr auto GetAlignmentA() { if constexpr(std::is_same_v) diff --git a/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma.hpp b/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma.hpp index 0a8d2dfbe..a9e466a79 100644 --- a/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma.hpp +++ b/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
#pragma once @@ -21,9 +21,10 @@ struct WarpGemmAtrributeMfma using BVecType = typename Impl::BVecType; using CVecType = typename Impl::CVecType; - static constexpr index_t kM = Impl::kM; - static constexpr index_t kN = Impl::kN; - static constexpr index_t kK = Impl::kK; + static constexpr index_t kM = Impl::kM; + static constexpr index_t kN = Impl::kN; + static constexpr index_t kK = Impl::kK; + static constexpr index_t kKPerThread = Impl::kABKPerLane; CK_TILE_HOST_DEVICE static constexpr auto get_num_of_access() { return 1; } @@ -86,9 +87,10 @@ struct WarpGemmAtrributeMfmaIterateK ext_vector_t::vector_size * kKIter>; using CVecType = typename Impl::CVecType; - static constexpr index_t kM = Impl::kM; - static constexpr index_t kN = Impl::kN; - static constexpr index_t kK = Impl::kK * kKIter; + static constexpr index_t kM = Impl::kM; + static constexpr index_t kN = Impl::kN; + static constexpr index_t kK = Impl::kK * kKIter; + static constexpr index_t kKPerThread = Impl::kABKPerLane * kKIter; CK_TILE_HOST_DEVICE static constexpr auto get_num_of_access() { return kKIter; } @@ -197,9 +199,10 @@ struct WarpGemmAtrributeMfmaTransposedCDistribution using BVecType = typename Impl::AVecType; using CVecType = typename Impl::CVecType; - static constexpr index_t kM = Impl::kN; - static constexpr index_t kN = Impl::kM; - static constexpr index_t kK = Impl::kK; + static constexpr index_t kM = Impl::kN; + static constexpr index_t kN = Impl::kM; + static constexpr index_t kK = Impl::kK; + static constexpr index_t kKPerThread = Impl::kABKPerLane; CK_TILE_HOST_DEVICE static constexpr auto get_num_of_access() { return 1; } @@ -260,9 +263,10 @@ struct WarpGemmAtrributeMfmaTransposedCDistribution_SwizzleB using BVecType = typename Impl::AVecType; using CVecType = typename Impl::CVecType; - static constexpr index_t kM = Impl::kN; - static constexpr index_t kN = Impl::kM; - static constexpr index_t kK = Impl::kK; + static constexpr index_t kM = Impl::kN; + static constexpr index_t kN = 
Impl::kM; + static constexpr index_t kK = Impl::kK; + static constexpr index_t kKPerThread = Impl::kABKPerLane; CK_TILE_HOST_DEVICE static constexpr auto get_num_of_access() { return 1; } @@ -330,9 +334,10 @@ struct WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution ext_vector_t::vector_size * kKIter>; using CVecType = typename Impl::CVecType; - static constexpr index_t kM = Impl::kN; - static constexpr index_t kN = Impl::kM; - static constexpr index_t kK = Impl::kK * kKIter; + static constexpr index_t kM = Impl::kN; + static constexpr index_t kN = Impl::kM; + static constexpr index_t kK = Impl::kK * kKIter; + static constexpr index_t kKPerThread = Impl::kABKPerLane * kKIter; CK_TILE_HOST_DEVICE static constexpr auto get_num_of_access() { return kKIter; } @@ -444,10 +449,11 @@ struct WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution_SwizzleB ext_vector_t::vector_size * kKIter>; using CVecType = typename Impl::CVecType; - static constexpr index_t kM = Impl::kN; - static constexpr index_t kN = Impl::kM; - static constexpr index_t kK = Impl::kK * kKIter; - static constexpr index_t SFactor = SFactor_; // group how many CM1 together + static constexpr index_t kM = Impl::kN; + static constexpr index_t kN = Impl::kM; + static constexpr index_t kK = Impl::kK * kKIter; + static constexpr index_t kKPerThread = Impl::kABKPerLane * kKIter; + static constexpr index_t SFactor = SFactor_; // group how many CM1 together CK_TILE_HOST_DEVICE static constexpr auto get_num_of_access() { return kKIter; } @@ -583,10 +589,11 @@ struct WarpGemmAtrributeMfmaIterateK_SwizzleA ext_vector_t::vector_size * kKIter>; using CVecType = typename Impl::CVecType; - static constexpr index_t kM = Impl::kM; - static constexpr index_t kN = Impl::kN; - static constexpr index_t kK = Impl::kK * kKIter; - static constexpr index_t SFactor = SFactor_; // group how many CM1 together + static constexpr index_t kM = Impl::kM; + static constexpr index_t kN = Impl::kN; + static constexpr index_t kK = 
Impl::kK * kKIter; + static constexpr index_t kKPerThread = Impl::kABKPerLane * kKIter; + static constexpr index_t SFactor = SFactor_; // group how many CM1 together CK_TILE_HOST_DEVICE static constexpr auto get_num_of_access() { return kKIter; } diff --git a/include/ck_tile/ops/gemm/warp/warp_gemm_impl.hpp b/include/ck_tile/ops/gemm/warp/warp_gemm_impl.hpp index 182d023a0..f9d50ed35 100644 --- a/include/ck_tile/ops/gemm/warp/warp_gemm_impl.hpp +++ b/include/ck_tile/ops/gemm/warp/warp_gemm_impl.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -14,6 +14,11 @@ struct WarpGemmImpl static constexpr index_t kM = WarpGemmAttribute::kM; static constexpr index_t kN = WarpGemmAttribute::kN; static constexpr index_t kK = WarpGemmAttribute::kK; + /// @brief The number of elements in K dimension processed by single thread in wavefront. + /// + /// @note Note that WarpGemm may run MFMA instruction multiple times (on different K). + /// In such situation this value reflects this fact. 
+ static constexpr index_t kKPerThread = WarpGemmAttribute::kKPerThread; using ADataType = typename WarpGemmAttribute::ADataType; using BDataType = typename WarpGemmAttribute::BDataType; -- GitLab From cf2d635ea27c074e7025896514c4b94034d370cc Mon Sep 17 00:00:00 2001 From: Po Yen Chen Date: Tue, 26 Nov 2024 20:37:54 +0800 Subject: [PATCH 080/153] [CK_TILE] Fix incorrect computation of group mode PagedAttention (#1688) * Allow getting batch size from splitkv tile partitioner * Fix wrong paged-kvcache impl for group mode * Fix wrong example code for page-kvcache * Undo changes in fmha_fwd.cpp * Always use 2D block table * Add is_gappy kernel argument for paged-kvcache The is_gappy argument is used for differentiating seqstart_k_ptr usage in flash-attention & xformers * Remove out-of-date comments * Remove no-longer used method * Fix wrong # page-block calculation * Fix wrong comment --------- Co-authored-by: Qianfeng --- example/ck_tile/01_fmha/fmha_fwd.cpp | 1 + example/ck_tile/01_fmha/fmha_fwd.hpp | 12 +++ .../fmha/kernel/fmha_fwd_splitkv_kernel.hpp | 56 +++++++----- .../fmha_fwd_splitkv_tile_partitioner.hpp | 10 +-- ...ock_fmha_fwd_splitkv_pipeline_qr_ks_vs.hpp | 90 +++++++++++-------- 5 files changed, 105 insertions(+), 64 deletions(-) diff --git a/example/ck_tile/01_fmha/fmha_fwd.cpp b/example/ck_tile/01_fmha/fmha_fwd.cpp index 00e0a1653..1f0d73d95 100644 --- a/example/ck_tile/01_fmha/fmha_fwd.cpp +++ b/example/ck_tile/01_fmha/fmha_fwd.cpp @@ -1046,6 +1046,7 @@ bool run(const ck_tile::ArgParser& arg_parser) (0 < page_block_size ? block_table_buf.GetDeviceBuffer() : nullptr); args.batch_stride_block_table = batch_stride_block_table; args.page_block_size = page_block_size; + args.is_gappy = false; // use 'false' for flash-attention integration args.cache_batch_idx = (use_cache_batch_idx ? 
cache_batch_idx_buf.GetDeviceBuffer() : nullptr); diff --git a/example/ck_tile/01_fmha/fmha_fwd.hpp b/example/ck_tile/01_fmha/fmha_fwd.hpp index 704453baa..8a821b917 100644 --- a/example/ck_tile/01_fmha/fmha_fwd.hpp +++ b/example/ck_tile/01_fmha/fmha_fwd.hpp @@ -165,6 +165,8 @@ struct fmha_fwd_splitkv_args void* block_table_ptr; ck_tile::index_t batch_stride_block_table; // only used if 'block_table_ptr' is not nullptr ck_tile::index_t page_block_size; // only used if 'block_table_ptr' is not nullptr + bool is_gappy; // differentiate seqstart_k_ptr usage. only used if 'block_table_ptr' is not + // nullptr. const void* cache_batch_idx; @@ -173,12 +175,21 @@ struct fmha_fwd_splitkv_args // seqlen_k = kargs.seqlen_k // group mode: seqlen_q = kargs.seqstart_q_ptr[b + 1] - kargs.seqstart_q_ptr[b] // seqlen_k = kargs.seqstart_k_ptr[b + 1] - kargs.seqstart_k_ptr[b] + // or kargs.seqlen_k_ptr[b] + // // batch mode (kvcache): // seqlen_q = kargs.seqlen_q // seqlen_k = kargs.seqlen_k_ptr[b] // group mode (kvcache): // seqlen_q = kargs.seqstart_q_ptr[b + 1] - kargs.seqstart_q_ptr[b] + // + // when is_gappy=true: + // seqlen_k = kargs.seqlen_k_ptr[b] + // seqstart_k_ptr[b] now store local offset of each batch + // + // when is_gappy=false: // seqlen_k = kargs.seqstart_k_ptr[b + 1] - kargs.seqstart_k_ptr[b] + // or kargs.seqlen_k_ptr[b] const void* seqstart_q_ptr; const void* seqstart_k_ptr; const void* seqlen_k_ptr; @@ -395,6 +406,7 @@ auto fmha_fwd_splitkv_create_kargs_and_grids(fmha_fwd_splitkv_args args) args.block_table_ptr, args.batch_stride_block_table, args.page_block_size, + args.is_gappy, args.scale_s, args.scale_p, args.stride_q, diff --git a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp index 3c4e02d08..dcb671d81 100644 --- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp +++ b/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp @@ -172,13 +172,18 @@ struct 
FmhaFwdSplitKVKernel float scale_p; }; - struct PageBlockTableKargs + struct CommonPageBlockTableKargs { const int32_t* block_table_ptr; ck_tile::index_t batch_stride_block_table; ck_tile::index_t page_block_size; }; + struct GroupModePageBlockTableKargs : CommonPageBlockTableKargs + { + bool is_gappy = false; + }; + struct CacheBatchIdxKargs { const int32_t* cache_batch_idx; @@ -193,7 +198,7 @@ struct FmhaFwdSplitKVKernel EmptyKargs<0>>>, std::conditional_t>, std::conditional_t>, - std::conditional_t + std::conditional_t { const int32_t* seqlen_k_ptr; @@ -215,7 +220,7 @@ struct FmhaFwdSplitKVKernel EmptyKargs<0>>>, std::conditional_t>, std::conditional_t>, - std::conditional_t> + std::conditional_t> { const int32_t* seqstart_q_ptr; const int32_t* seqstart_k_ptr; @@ -375,6 +380,7 @@ struct FmhaFwdSplitKVKernel const void* block_table_ptr, ck_tile::index_t batch_stride_block_table, ck_tile::index_t page_block_size, + bool is_gappy, float scale_s, float scale_p, ck_tile::index_t stride_q, @@ -461,6 +467,7 @@ struct FmhaFwdSplitKVKernel kargs.block_table_ptr = reinterpret_cast(block_table_ptr); kargs.batch_stride_block_table = batch_stride_block_table; kargs.page_block_size = page_block_size; + kargs.is_gappy = is_gappy; } return kargs; @@ -495,11 +502,13 @@ struct FmhaFwdSplitKVKernel const index_t i_n1 = __builtin_amdgcn_readfirstlane(i_tile_n * FmhaPipeline::kN1); long_index_t batch_offset_q = 0; - long_index_t batch_offset_k = 0; - long_index_t batch_offset_v = 0; + long_index_t batch_offset_k = 0; // unused for paged-kvcache + long_index_t batch_offset_v = 0; // unused for paged-kvcache long_index_t batch_offset_bias = 0; long_index_t batch_offset_lse_acc = 0; long_index_t batch_offset_o_acc = 0; + index_t kv_l2p_offset = + 0; // logical-to-physical offset of seqlen_k coordinate. 
only used for paged-kvcache if constexpr(kIsGroupMode) { @@ -508,22 +517,14 @@ struct FmhaFwdSplitKVKernel const long_index_t key_start = kargs.seqstart_k_ptr[i_batch]; batch_offset_q = query_start * kargs.stride_q; - if constexpr(kIsPagedKV) + batch_offset_k = key_start * kargs.stride_k; + if constexpr(std::is_same_v) { - batch_offset_k = static_cast(i_batch) * kargs.batch_stride_k; - batch_offset_v = static_cast(i_batch) * kargs.batch_stride_v; + batch_offset_v = key_start * kargs.stride_v; } else { - batch_offset_k = key_start * kargs.stride_k; - if constexpr(std::is_same_v) - { - batch_offset_v = key_start * kargs.stride_v; - } - else - { - batch_offset_v = key_start; - } + batch_offset_v = key_start; } if constexpr(BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS) { @@ -551,6 +552,15 @@ struct FmhaFwdSplitKVKernel { kargs.seqlen_k = kargs.seqstart_k_ptr[i_batch + 1] - kargs.seqstart_k_ptr[i_batch]; } + + if constexpr(kIsPagedKV) + { + if(kargs.is_gappy) + { + // seqstart_k_ptr has different meaning in this case + kv_l2p_offset = kargs.seqstart_k_ptr[i_batch]; + } + } } else { @@ -703,7 +713,7 @@ struct FmhaFwdSplitKVKernel reinterpret_cast(kargs.block_table_ptr) + i_batch_ * kargs.batch_stride_block_table; const index_t num_blocks = - integer_divide_ceil(kargs.seqlen_k, kargs.page_block_size); + integer_divide_ceil(kv_l2p_offset + kargs.seqlen_k, kargs.page_block_size); const long_index_t fixed_offset = static_cast(i_nhead_ / kargs.nhead_ratio_qk) * @@ -718,7 +728,8 @@ struct FmhaFwdSplitKVKernel kargs.page_block_size, k_dram, make_k_dram(nullptr, - kargs.seqlen_k - (num_blocks - 1) * kargs.page_block_size)); + (kv_l2p_offset + kargs.seqlen_k) - + (num_blocks - 1) * kargs.page_block_size)); } else { @@ -733,7 +744,7 @@ struct FmhaFwdSplitKVKernel reinterpret_cast(kargs.block_table_ptr) + i_batch_ * kargs.batch_stride_block_table; const index_t num_blocks = - integer_divide_ceil(kargs.seqlen_k, kargs.page_block_size); + integer_divide_ceil(kv_l2p_offset + 
kargs.seqlen_k, kargs.page_block_size); const long_index_t fixed_offset = static_cast(i_nhead_ / kargs.nhead_ratio_qk) * @@ -748,7 +759,8 @@ struct FmhaFwdSplitKVKernel kargs.page_block_size, v_dram, make_v_dram(nullptr, - kargs.seqlen_k - (num_blocks - 1) * kargs.page_block_size)); + (kv_l2p_offset + kargs.seqlen_k) - + (num_blocks - 1) * kargs.page_block_size)); } else { @@ -896,6 +908,7 @@ struct FmhaFwdSplitKVKernel mask, position_encoding, kargs.scale_s, + kv_l2p_offset, smem_ptr); } else @@ -912,6 +925,7 @@ struct FmhaFwdSplitKVKernel mask, position_encoding, kargs.scale_s, + kv_l2p_offset, smem_ptr); } }(); diff --git a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_tile_partitioner.hpp b/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_tile_partitioner.hpp index 675a31019..5a52fa0f6 100644 --- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_tile_partitioner.hpp +++ b/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_tile_partitioner.hpp @@ -18,11 +18,11 @@ struct FmhaFwdSplitKVTilePartitioner static constexpr ck_tile::index_t kN1 = BlockFmhaShape::kN1; static constexpr ck_tile::index_t kK1 = BlockFmhaShape::kK1; - __host__ static constexpr auto GridSize(ck_tile::index_t batch_size, - ck_tile::index_t nhead, - ck_tile::index_t max_seqlen_q, - ck_tile::index_t hdim_v, - ck_tile::index_t num_splits) + CK_TILE_HOST static constexpr auto GridSize(ck_tile::index_t batch_size, + ck_tile::index_t nhead, + ck_tile::index_t max_seqlen_q, + ck_tile::index_t hdim_v, + ck_tile::index_t num_splits) { // TODO: this may need tuning return dim3(ck_tile::integer_divide_ceil(max_seqlen_q, kM0) * diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_qr_ks_vs.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_qr_ks_vs.hpp index 4e8d8694d..04aa85644 100644 --- a/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_qr_ks_vs.hpp +++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_qr_ks_vs.hpp @@ 
-143,6 +143,7 @@ struct BlockFmhaFwdSplitKVPipelineQRKSVS FmhaMask mask, PositionEncoding position_encoding, float scale_s, + index_t kv_l2p_offset, // logical-to-physical offset of seqlen_k coordinate void* smem_ptr) const { static_assert( @@ -211,16 +212,16 @@ struct BlockFmhaFwdSplitKVPipelineQRKSVS set_tile(m, -numeric::infinity()); clear_tile(l); - const auto q_origin = q_dram_window.get_window_origin(); - const auto [seqlen_k_start, seqlen_k_end] = mask.GetTileRangeAlongX( + const auto q_origin = q_dram_window.get_window_origin(); + const auto [logical_seqlen_k_start, logical_seqlen_k_end] = mask.GetTileRangeAlongX( q_origin.at(number<0>{}), number{}, number{}, num_splits, i_split); // check early exit if no work to do if constexpr(FmhaMask::IsMasking || kPadSeqLenK || kHasUnevenSplits) { - const index_t original_num_total_loop = - integer_divide_ceil(seqlen_k_end - seqlen_k_start, kN0); - if(original_num_total_loop <= 0) + const index_t logical_num_total_loop = + integer_divide_ceil(logical_seqlen_k_end - logical_seqlen_k_start, kN0); + if(logical_num_total_loop <= 0) { if constexpr(kStoreLSE) { @@ -239,33 +240,41 @@ struct BlockFmhaFwdSplitKVPipelineQRKSVS } } - // make sure the first tile is completely located in page-block - const index_t adjusted_seqlen_k_start = [&, seqlen_k_start_ = seqlen_k_start] { - if constexpr(kIsPagedKV) - { - return kN0 * integer_divide_floor(seqlen_k_start_, kN0); - } - else - { - return seqlen_k_start_; - } - }(); + const index_t physical_seqlen_k_start = logical_seqlen_k_start + kv_l2p_offset; + const index_t physical_seqlen_k_end = logical_seqlen_k_end + kv_l2p_offset; + // make sure the first tile is completely located in page-block (page-block size should be + // divisible by kN0) + // relationship between each *_start variables: aligned_physical_seqlen_k_start <= + // physical_seqlen_k_start, logical_seqlen_k_start <= physical_seqlen_k_start + const index_t aligned_physical_seqlen_k_start = + [&, physical_seqlen_k_start_ 
= physical_seqlen_k_start] { + if constexpr(kIsPagedKV) + { + return kN0 * integer_divide_floor(physical_seqlen_k_start_, kN0); + } + else + { + return physical_seqlen_k_start_; + } + }(); const index_t num_total_loop = - integer_divide_ceil(seqlen_k_end - adjusted_seqlen_k_start, kN0); + integer_divide_ceil(physical_seqlen_k_end - aligned_physical_seqlen_k_start, kN0); auto [i_page_block_k, k_dram_block_window] = k_page_block_navigator.make_tile_window( - k_dram_block_window_lengths, {adjusted_seqlen_k_start, 0}); + k_dram_block_window_lengths, {aligned_physical_seqlen_k_start, 0}); const auto bias_origin = bias_dram_block_window_tmp.get_window_origin(); auto bias_dram_window = make_tile_window(bias_dram_block_window_tmp.get_bottom_tensor_view(), bias_dram_block_window_tmp.get_window_lengths(), - {bias_origin.at(number<0>{}), adjusted_seqlen_k_start}, // M/N + {bias_origin.at(number<0>{}), + logical_seqlen_k_start - (physical_seqlen_k_start - + aligned_physical_seqlen_k_start)}, // M/N Policy::template MakeBiasDramTileDistribution()); auto [i_page_block_v, v_dram_window] = v_page_block_navigator.make_tile_window( v_dram_block_window_lengths, - {0, adjusted_seqlen_k_start}, // TODO: hdim split? + {0, aligned_physical_seqlen_k_start}, // TODO: hdim split? 
Policy::template MakeVDramTileDistribution()); auto q_tile = tile_elementwise_in(q_element_func, q); @@ -379,7 +388,8 @@ struct BlockFmhaFwdSplitKVPipelineQRKSVS constexpr auto i_j_idx = make_tuple(idx0, idx1); s_acc(i_j_idx) *= scale_s; - position_encoding.update(s_acc(i_j_idx), row, col); + // position_encoding accept only logical coordinates, do conversion here + position_encoding.update(s_acc(i_j_idx), row, col - kv_l2p_offset); }); }); } @@ -397,29 +407,31 @@ struct BlockFmhaFwdSplitKVPipelineQRKSVS { const auto k_origin = k_page_block_navigator.to_global_window_origin( i_page_block_k, k_dram_block_window.get_window_origin()); - set_tile_if(s_acc, - -numeric::infinity(), - [&, seqlen_k_start_ = seqlen_k_start, seqlen_k_end_ = seqlen_k_end]( - auto tile_idx) { - const auto col = - k_origin.at(number<0>{}) + tile_idx.at(number<1>{}); - if constexpr(kIsPagedKV) - { - return col < seqlen_k_start_ || seqlen_k_end_ <= col; - } - else - { - return seqlen_k_end_ <= col; - } - }); + set_tile_if( + s_acc, + -numeric::infinity(), + [&, + physical_seqlen_k_start_ = physical_seqlen_k_start, + physical_seqlen_k_end_ = physical_seqlen_k_end](auto tile_idx) { + const auto col = k_origin.at(number<0>{}) + tile_idx.at(number<1>{}); + if constexpr(kIsPagedKV) + { + return col < physical_seqlen_k_start_ || physical_seqlen_k_end_ <= col; + } + else + { + return physical_seqlen_k_end_ <= col; + } + }); } if constexpr(kPadSeqLenK || FmhaMask::IsMasking) { const auto k_origin = k_page_block_navigator.to_global_window_origin( i_page_block_k, k_dram_block_window.get_window_origin()); + // mask accept only logical coordinates, do conversion here bool need_perpixel_check = mask.IsEdgeTile(q_origin.at(number<0>{}), - k_origin.at(number<0>{}), + k_origin.at(number<0>{}) - kv_l2p_offset, number{}, number{}); if(need_perpixel_check) @@ -428,7 +440,7 @@ struct BlockFmhaFwdSplitKVPipelineQRKSVS s_acc, -numeric::infinity(), [&](auto tile_idx) { const auto row = q_origin.at(number<0>{}) + 
tile_idx.at(number<0>{}); const auto col = k_origin.at(number<0>{}) + tile_idx.at(number<1>{}); - return mask.IsOutOfBound(row, col); + return mask.IsOutOfBound(row, col - kv_l2p_offset); }); } } @@ -659,6 +671,7 @@ struct BlockFmhaFwdSplitKVPipelineQRKSVS FmhaMask mask, PositionEncoding position_encoding, float scale_s, + index_t kv_l2p_offset, // logical-to-physical offset of seqlen_k coordinate void* smem_ptr) const { return operator()(q_dram_block_window_tmp, @@ -681,6 +694,7 @@ struct BlockFmhaFwdSplitKVPipelineQRKSVS mask, position_encoding, scale_s, + kv_l2p_offset, smem_ptr); } }; -- GitLab From b70f367f8051e0c66071a25ab95a77e076762808 Mon Sep 17 00:00:00 2001 From: jakpiase Date: Tue, 26 Nov 2024 13:56:32 +0100 Subject: [PATCH 081/153] Add check for bf16 splitk support for grouped gemm splitk (#1673) * add check for bf16 splitk support for grouped gemm splitk * Update if condition --------- Co-authored-by: Adam Osewski <19374865+aosewski@users.noreply.github.com> --- .../device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp index 6d9d1459c..cb0afbb08 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp @@ -538,6 +538,11 @@ struct DeviceGroupedGemmXdlSplitKCShuffle : public DeviceGroupedGemmSplitK && arg.K_BATCH > 1 && !is_bf16_atomic_supported()) + { + return false; + } + bool supported = true; for(std::size_t i = 0; i < arg.gemm_kernel_args_.size(); ++i) { -- GitLab From bfe983a1518935ef8d81066b540b8aea51b8e883 Mon Sep 17 00:00:00 2001 From: Adam Osewski <19374865+aosewski@users.noreply.github.com> Date: Tue, 26 Nov 2024 17:36:53 +0100 Subject: [PATCH 082/153] Change block gemm 
pipeline local prefill loop order. (#1692) * Fix loop order. * Fix loop order in pipeline v4 --- .../blockwise_gemm_pipeline_xdlops_v2.hpp | 130 +++++++++--------- .../blockwise_gemm_pipeline_xdlops_v4.hpp | 65 +++++---- 2 files changed, 96 insertions(+), 99 deletions(-) diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v2.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v2.hpp index 711c47854..54edf0c35 100644 --- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v2.hpp +++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v2.hpp @@ -269,15 +269,14 @@ struct BlockwiseGemmXdlops_pipeline_v2{}([&](auto n0) { - b_thread_copy_.Run( - b_block_desc_n0_n1_n2_k, - make_tuple(n0, I0, I0, Number{}), - b_block_buf, - b_thread_desc_, - make_tuple(n0, I0, k, I0), - b_thread_buf); - }); + }); + static_for<0, NRepeat, 1>{}([&](auto n0) { + b_thread_copy_.Run(b_block_desc_n0_n1_n2_k, + make_tuple(n0, I0, I0, Number{}), + b_block_buf, + b_thread_desc_, + make_tuple(n0, I0, k, I0), + b_thread_buf); }); }); @@ -341,14 +340,14 @@ struct BlockwiseGemmXdlops_pipeline_v2{}([&](auto n0) { - b_thread_copy_.Run(b_block_desc_n0_n1_n2_k, - make_tuple(n0, I0, I0, Number{}), - b_block_buf, - b_thread_desc_, - make_tuple(n0, I0, k, I0), - b_thread_buf); - }); + }); + static_for<0, NRepeat, 1>{}([&](auto n0) { + b_thread_copy_.Run(b_block_desc_n0_n1_n2_k, + make_tuple(n0, I0, I0, Number{}), + b_block_buf, + b_thread_desc_, + make_tuple(n0, I0, k, I0), + b_thread_buf); }); }); @@ -396,14 +395,14 @@ struct BlockwiseGemmXdlops_pipeline_v2{}([&](auto n0) { - b_thread_copy_.Run(b_block_desc_n0_n1_n2_k, - make_tuple(n0, I0, I0, Number{}), - b_block_buf, - b_thread_desc_, - make_tuple(n0, I0, k, I0), - b_thread_buf); - }); + }); + static_for<0, NRepeat, 1>{}([&](auto n0) { + b_thread_copy_.Run(b_block_desc_n0_n1_n2_k, + make_tuple(n0, I0, I0, Number{}), + b_block_buf, + b_thread_desc_, + make_tuple(n0, 
I0, k, I0), + b_thread_buf); }); }); @@ -447,14 +446,14 @@ struct BlockwiseGemmXdlops_pipeline_v2{}([&](auto n0) { - b_thread_copy_.Run(b_block_desc_n0_n1_n2_k, - make_tuple(n0, I0, I0, Number{}), - b_block_buf, - b_thread_desc_, - make_tuple(n0, I0, k, I0), - b_thread_buf); - }); + }); + static_for<0, NRepeat, 1>{}([&](auto n0) { + b_thread_copy_.Run(b_block_desc_n0_n1_n2_k, + make_tuple(n0, I0, I0, Number{}), + b_block_buf, + b_thread_desc_, + make_tuple(n0, I0, k, I0), + b_thread_buf); }); }); @@ -760,15 +759,14 @@ struct BlockwiseGemmXdlops_pipeline_v2{}([&](auto n0) { - b_thread_copy_.Run( - b_block_desc_n0_n1_n2_k, - make_tuple(n0, I0, I0, Number{}), - b_block_buf, - b_thread_desc_, - make_tuple(n0, I0, k0, I0), - b_thread_buf); - }); + }); + static_for<0, NRepeat, 1>{}([&](auto n0) { + b_thread_copy_.Run(b_block_desc_n0_n1_n2_k, + make_tuple(n0, I0, I0, Number{}), + b_block_buf, + b_thread_desc_, + make_tuple(n0, I0, k0, I0), + b_thread_buf); }); __builtin_amdgcn_sched_barrier(0); // NOTE: Synchronize threads in a workgroup at the start of each MAC @@ -866,14 +864,14 @@ struct BlockwiseGemmXdlops_pipeline_v2{}([&](auto n0) { - b_thread_copy_.Run(b_block_desc_n0_n1_n2_k, - make_tuple(n0, I0, I0, Number{}), - b_block_buf, - b_thread_desc_, - make_tuple(n0, I0, k0, I0), - b_thread_buf); - }); + }); + static_for<0, NRepeat, 1>{}([&](auto n0) { + b_thread_copy_.Run(b_block_desc_n0_n1_n2_k, + make_tuple(n0, I0, I0, Number{}), + b_block_buf, + b_thread_desc_, + make_tuple(n0, I0, k0, I0), + b_thread_buf); }); __builtin_amdgcn_sched_barrier(0); @@ -942,14 +940,14 @@ struct BlockwiseGemmXdlops_pipeline_v2{}([&](auto n0) { - b_thread_copy_.Run(b_block_desc_n0_n1_n2_k, - make_tuple(n0, I0, I0, Number{}), - b_block_buf, - b_thread_desc_, - make_tuple(n0, I0, k0, I0), - b_thread_buf); - }); + }); + static_for<0, NRepeat, 1>{}([&](auto n0) { + b_thread_copy_.Run(b_block_desc_n0_n1_n2_k, + make_tuple(n0, I0, I0, Number{}), + b_block_buf, + b_thread_desc_, + make_tuple(n0, 
I0, k0, I0), + b_thread_buf); }); __builtin_amdgcn_sched_barrier(0); @@ -1018,14 +1016,14 @@ struct BlockwiseGemmXdlops_pipeline_v2{}([&](auto n0) { - b_thread_copy_.Run(b_block_desc_n0_n1_n2_k, - make_tuple(n0, I0, I0, Number{}), - b_block_buf, - b_thread_desc_, - make_tuple(n0, I0, k0, I0), - b_thread_buf); - }); + }); + static_for<0, NRepeat, 1>{}([&](auto n0) { + b_thread_copy_.Run(b_block_desc_n0_n1_n2_k, + make_tuple(n0, I0, I0, Number{}), + b_block_buf, + b_thread_desc_, + make_tuple(n0, I0, k0, I0), + b_thread_buf); }); __builtin_amdgcn_sched_barrier(0); diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v4.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v4.hpp index bd5a1bedf..e8d105111 100644 --- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v4.hpp +++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v4.hpp @@ -305,14 +305,14 @@ struct BlockwiseGemmXdlops_pipeline_v4{}([&](auto n0) { - b_thread_copy_.Run(b_block_desc_n0_n1_n2_k, - make_tuple(n0, I0, I0, Number{}), - b_block_buf.At(I0), - b_thread_desc_, - make_tuple(n0, I0, k, I0), - b_thread_bufs(I0)); - }); + }); + static_for<0, NRepeat, 1>{}([&](auto n0) { + b_thread_copy_.Run(b_block_desc_n0_n1_n2_k, + make_tuple(n0, I0, I0, Number{}), + b_block_buf.At(I0), + b_thread_desc_, + make_tuple(n0, I0, k, I0), + b_thread_bufs(I0)); }); }); @@ -356,15 +356,14 @@ struct BlockwiseGemmXdlops_pipeline_v4{}([&](auto n0) { - b_thread_copy_.Run( - b_block_desc_n0_n1_n2_k, - make_tuple(n0, I0, I0, Number{}), - b_block_buf.At(lds_read_buf), - b_thread_desc_, - make_tuple(n0, I0, k, I0), - b_thread_bufs(lds_read_reg_buf)); - }); + }); + static_for<0, NRepeat, 1>{}([&](auto n0) { + b_thread_copy_.Run(b_block_desc_n0_n1_n2_k, + make_tuple(n0, I0, I0, Number{}), + b_block_buf.At(lds_read_buf), + b_thread_desc_, + make_tuple(n0, I0, k, I0), + b_thread_bufs(lds_read_reg_buf)); }); }); @@ -437,14 +436,14 @@ struct 
BlockwiseGemmXdlops_pipeline_v4{}([&](auto n0) { - b_thread_copy_.Run(b_block_desc_n0_n1_n2_k, - make_tuple(n0, I0, I0, Number{}), - b_block_buf.At(lds_read_buf), - b_thread_desc_, - make_tuple(n0, I0, k, I0), - b_thread_bufs(lds_read_reg_buf)); - }); + }); + static_for<0, NRepeat, 1>{}([&](auto n0) { + b_thread_copy_.Run(b_block_desc_n0_n1_n2_k, + make_tuple(n0, I0, I0, Number{}), + b_block_buf.At(lds_read_buf), + b_thread_desc_, + make_tuple(n0, I0, k, I0), + b_thread_bufs(lds_read_reg_buf)); }); }); @@ -496,14 +495,14 @@ struct BlockwiseGemmXdlops_pipeline_v4{}([&](auto n0) { - b_thread_copy_.Run(b_block_desc_n0_n1_n2_k, - make_tuple(n0, I0, I0, Number{}), - b_block_buf.At(lds_read_buf), - b_thread_desc_, - make_tuple(n0, I0, k, I0), - b_thread_bufs(lds_read_reg_buf)); - }); + }); + static_for<0, NRepeat, 1>{}([&](auto n0) { + b_thread_copy_.Run(b_block_desc_n0_n1_n2_k, + make_tuple(n0, I0, I0, Number{}), + b_block_buf.At(lds_read_buf), + b_thread_desc_, + make_tuple(n0, I0, k, I0), + b_thread_bufs(lds_read_reg_buf)); }); }); -- GitLab From abae2afc721d9b335ef07d7227e0f9e55b1c575a Mon Sep 17 00:00:00 2001 From: rocking Date: Wed, 27 Nov 2024 05:01:15 +0800 Subject: [PATCH 083/153] support max3 in smoothquant and add+ rmsnorm + rdquant (#1654) * Fix cmake example build * Support max3 in smoothquant one pass * support max3 in two pass * support max3 in add_rmsnorm_rdquant --- example/ck_tile/12_smoothquant/CMakeLists.txt | 4 +- ...msnorm2d_rdquant_fwd_pipeline_one_pass.hpp | 37 +++++++++++++++---- ...norm2d_rdquant_fwd_pipeline_three_pass.hpp | 26 ++++++++++--- .../smoothquant_pipeline_one_pass.hpp | 30 +++++++++++++-- .../smoothquant_pipeline_two_pass.hpp | 16 +++++++- 5 files changed, 94 insertions(+), 19 deletions(-) diff --git a/example/ck_tile/12_smoothquant/CMakeLists.txt b/example/ck_tile/12_smoothquant/CMakeLists.txt index 09a56c6da..3849833ac 100644 --- a/example/ck_tile/12_smoothquant/CMakeLists.txt +++ b/example/ck_tile/12_smoothquant/CMakeLists.txt @@ 
-18,7 +18,7 @@ function (add_smoothquant_example TARGET_NAME MAIN_SRC) target_compile_options(${TARGET_NAME} PRIVATE ${COMPILE_OPTIONS}) endfunction(add_smoothquant_example TARGET_NAME MAIN_SRC) -file(GLOB INSTANCE_SRCS instances/*.cpp) -add_smoothquant_example(tile_smoothquant smoothquant.cpp ${INSTANCE_SRCS}) add_smoothquant_example(tile_example_smoothquant example_smoothquant.cpp) +file(GLOB INSTANCE_SRCS instances/*.cpp) +add_smoothquant_example(tile_smoothquant smoothquant.cpp ${INSTANCE_SRCS}) diff --git a/include/ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_one_pass.hpp b/include/ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_one_pass.hpp index 12a15938a..24f35d363 100644 --- a/include/ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_one_pass.hpp +++ b/include/ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_one_pass.hpp @@ -28,8 +28,9 @@ struct AddRmsnorm2dRdquantFwdPipelineOnePass static constexpr bool kSaveX = Problem::kSaveX; static constexpr bool kNeedCrossWarpSync = Problem::kNeedCrossWarpSync; - static constexpr bool kPadM = false; // TODO - BlockAddRmsnorm2dRdquantFwdProblem::kPadM - static constexpr bool kPadN = Problem::kPadN; + static constexpr bool kPadM = false; // TODO - BlockAddRmsnorm2dRdquantFwdProblem::kPadM + static constexpr bool kPadN = Problem::kPadN; + static constexpr bool UseMax3 = true; // TODO - Move to trait static constexpr const char* name = []() { if constexpr(kNeedCrossWarpSync) @@ -69,9 +70,16 @@ struct AddRmsnorm2dRdquantFwdPipelineOnePass auto reduce_square_sum_func = ReduceOp::SquareAdd{}; auto reduce_sum_func = ReduceOp::Add{}; auto reduce_absmax_func = ReduceOp::AbsMax{}; - auto reduce_max_func = ReduceOp::Max{}; - auto block_reduce2d = Policy::template GetBlockReduce2d(); - auto block_reduce2d_sync = Policy::template GetBlockReduce2dSync(); + auto reduce_absmax3_func = [](auto acc_, auto v_0_, auto 
v_1_) { + float rtn; + asm volatile("v_max3_f32 %0, %1, abs(%2), abs(%3)" + : "=v"(rtn) + : "v"(acc_), "v"(v_0_), "v"(v_1_)); + return rtn; + }; + auto reduce_max_func = ReduceOp::Max{}; + auto block_reduce2d = Policy::template GetBlockReduce2d(); + auto block_reduce2d_sync = Policy::template GetBlockReduce2dSync(); auto block_reduce2d_cross_warp_sync = Policy::template GetBlockReduce2dCrossWarpSync(); @@ -116,8 +124,23 @@ struct AddRmsnorm2dRdquantFwdPipelineOnePass }); // compute absmax, each-thread->cross-lane->cross-warp - auto absmax = block_reduce2d( - y, reduce_absmax_func.GetIdentityValue(), reduce_absmax_func); + auto absmax = [&]() { + constexpr auto x_size_per_row = + x.get_tile_distribution().get_ys_to_d_descriptor().get_lengths().at(number<1>{}); + if constexpr(UseMax3 && std::is_same_v && + x_size_per_row % 2 == 0) + { + return block_reduce2d(y, + reduce_absmax_func.GetIdentityValue(), + reduce_absmax3_func, + sequence<1, 2>{}); + } + else + { + return block_reduce2d( + y, reduce_absmax_func.GetIdentityValue(), reduce_absmax_func); + } + }(); block_reduce2d_sync(absmax, reduce_max_func); block_reduce2d_cross_warp_sync(absmax, smem, reduce_max_func); diff --git a/include/ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_three_pass.hpp b/include/ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_three_pass.hpp index 0dbb20645..aec7368e2 100644 --- a/include/ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_three_pass.hpp +++ b/include/ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_three_pass.hpp @@ -28,8 +28,9 @@ struct AddRmsnorm2dRdquantFwdPipelineThreePass static constexpr bool kSaveX = Problem::kSaveX; static constexpr bool kNeedCrossWarpSync = Problem::kNeedCrossWarpSync; - static constexpr bool kPadM = false; // TODO - BlockAddRmsnorm2dRdquantFwdProblem::kPadM - static constexpr bool kPadN = Problem::kPadN; + static constexpr bool 
kPadM = false; // TODO - BlockAddRmsnorm2dRdquantFwdProblem::kPadM + static constexpr bool kPadN = Problem::kPadN; + static constexpr bool UseMax3 = true; // TODO - Move to trait static constexpr const char* name = []() { if constexpr(kNeedCrossWarpSync) @@ -76,9 +77,16 @@ struct AddRmsnorm2dRdquantFwdPipelineThreePass auto reduce_square_sum_func = ReduceOp::SquareAdd{}; auto reduce_sum_func = ReduceOp::Add{}; auto reduce_absmax_func = ReduceOp::AbsMax{}; - auto reduce_max_func = ReduceOp::Max{}; - auto block_reduce2d = Policy::template GetBlockReduce2d(); - auto block_reduce2d_sync = Policy::template GetBlockReduce2dSync(); + auto reduce_absmax3_func = [](auto acc_, auto v_0_, auto v_1_) { + float rtn; + asm volatile("v_max3_f32 %0, %1, abs(%2), abs(%3)" + : "=v"(rtn) + : "v"(acc_), "v"(v_0_), "v"(v_1_)); + return rtn; + }; + auto reduce_max_func = ReduceOp::Max{}; + auto block_reduce2d = Policy::template GetBlockReduce2d(); + auto block_reduce2d_sync = Policy::template GetBlockReduce2dSync(); auto block_reduce2d_cross_warp_sync = Policy::template GetBlockReduce2dCrossWarpSync(); @@ -177,7 +185,13 @@ struct AddRmsnorm2dRdquantFwdPipelineThreePass y(idx) = type_convert(y_); }); - block_reduce2d(y, absmax, reduce_absmax_func); + constexpr auto x_size_per_row = + x.get_tile_distribution().get_ys_to_d_descriptor().get_lengths().at(number<1>{}); + if constexpr(UseMax3 && std::is_same_v && + x_size_per_row % 2 == 0) + block_reduce2d(y, absmax, reduce_absmax3_func, sequence<1, 2>{}); + else + block_reduce2d(y, absmax, reduce_absmax_func); if constexpr(kSaveX) move_tile_window(x_window, {0, -Block_N}); diff --git a/include/ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_one_pass.hpp b/include/ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_one_pass.hpp index d5b3780de..b2fc240c1 100644 --- a/include/ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_one_pass.hpp +++ b/include/ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_one_pass.hpp @@ -25,6 +25,7 
@@ struct SmoothquantPipelineOnePass static constexpr bool kNeedCrossWarpSync = Problem::kNeedCrossWarpSync; static constexpr bool kPadM = false; // TODO - BlockSmoothquantProblem::kPadM static constexpr bool kPadN = Problem::kPadN; + static constexpr bool UseMax3 = true; // TODO - Move to trait static constexpr const char* name = []() { if constexpr(kNeedCrossWarpSync) @@ -52,7 +53,15 @@ struct SmoothquantPipelineOnePass xscale_window_, Policy::template MakeXScaleBlockTileDistribution()); auto reduce_absmax_func = ReduceOp::AbsMax{}; - auto reduce_max_func = ReduceOp::Max{}; + auto reduce_absmax3_func = [](auto acc_, auto v_0_, auto v_1_) { + float rtn; + asm volatile("v_max3_f32 %0, %1, abs(%2), abs(%3)" + : "=v"(rtn) + : "v"(acc_), "v"(v_0_), "v"(v_1_)); + return rtn; + }; + auto reduce_max_func = ReduceOp::Max{}; + auto block_reduce2d = Policy::template GetBlockReduce2d(); auto block_reduce2d_sync = Policy::template GetBlockReduce2dSync(); auto block_reduce2d_cross_warp_sync = @@ -68,8 +77,23 @@ struct SmoothquantPipelineOnePass xscale); // compute absmax, cross-lane->cross-warp - auto absmax = block_reduce2d( - y, reduce_absmax_func.GetIdentityValue(), reduce_absmax_func); + auto absmax = [&]() { + constexpr auto x_size_per_row = + x.get_tile_distribution().get_ys_to_d_descriptor().get_lengths().at(number<1>{}); + if constexpr(UseMax3 && std::is_same_v && + x_size_per_row % 2 == 0) + { + return block_reduce2d(y, + reduce_absmax_func.GetIdentityValue(), + reduce_absmax3_func, + sequence<1, 2>{}); + } + else + { + return block_reduce2d( + y, reduce_absmax_func.GetIdentityValue(), reduce_absmax_func); + } + }(); block_reduce2d_sync(absmax, reduce_max_func); block_reduce2d_cross_warp_sync(absmax, smem, reduce_max_func); diff --git a/include/ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_two_pass.hpp b/include/ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_two_pass.hpp index 7878ef1d3..9e9df663b 100644 --- 
a/include/ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_two_pass.hpp +++ b/include/ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_two_pass.hpp @@ -25,6 +25,7 @@ struct SmoothquantPipelineTwoPass static constexpr bool kNeedCrossWarpSync = Problem::kNeedCrossWarpSync; static constexpr bool kPadM = false; // TODO - BlockSmoothquantProblem::kPadM static constexpr bool kPadN = Problem::kPadN; + static constexpr bool UseMax3 = true; // TODO - Move to trait static constexpr const char* name = []() { if constexpr(kNeedCrossWarpSync) @@ -56,6 +57,13 @@ struct SmoothquantPipelineTwoPass __builtin_amdgcn_readfirstlane(integer_divide_ceil(row_size, Block_N)); auto reduce_absmax_func = ReduceOp::AbsMax{}; + auto reduce_absmax3_func = [](auto acc_, auto v_0_, auto v_1_) { + float rtn; + asm volatile("v_max3_f32 %0, %1, abs(%2), abs(%3)" + : "=v"(rtn) + : "v"(acc_), "v"(v_0_), "v"(v_1_)); + return rtn; + }; auto reduce_max_func = ReduceOp::Max{}; auto block_reduce2d = Policy::template GetBlockReduce2d(); auto block_reduce2d_sync = Policy::template GetBlockReduce2dSync(); @@ -77,7 +85,13 @@ struct SmoothquantPipelineTwoPass x, xscale); - block_reduce2d(y, absmax, reduce_absmax_func); + constexpr auto x_size_per_row = + x.get_tile_distribution().get_ys_to_d_descriptor().get_lengths().at(number<1>{}); + if constexpr(UseMax3 && std::is_same_v && + x_size_per_row % 2 == 0) + block_reduce2d(y, absmax, reduce_absmax3_func, sequence<1, 2>{}); + else + block_reduce2d(y, absmax, reduce_absmax_func); move_tile_window(x_window, {0, Block_N}); move_tile_window(xscale_window, {Block_N}); -- GitLab From cb8c7f42d6123f548306cbd679c3d18349f10b6d Mon Sep 17 00:00:00 2001 From: Illia Silin <98187287+illsilin@users.noreply.github.com> Date: Tue, 26 Nov 2024 14:58:35 -0800 Subject: [PATCH 084/153] update mainline compiler branch name (#1696) --- Dockerfile | 4 ++-- Jenkinsfile | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/Dockerfile b/Dockerfile index 
76e6f0ebe..38a563ce3 100644 --- a/Dockerfile +++ b/Dockerfile @@ -116,7 +116,7 @@ ENV compiler_commit=$compiler_commit RUN sh -c "echo compiler version = '$compiler_version'" && \ sh -c "echo compiler commit = '$compiler_commit'" -RUN if ( [ "$compiler_version" = "amd-staging" ] || [ "$compiler_version" = "amd-mainline-open" ] ) && [ "$compiler_commit" = "" ]; then \ +RUN if ( [ "$compiler_version" = "amd-staging" ] || [ "$compiler_version" = "amd-mainline" ] ) && [ "$compiler_commit" = "" ]; then \ git clone -b "$compiler_version" https://github.com/ROCm/llvm-project.git && \ cd llvm-project && mkdir build && cd build && \ cmake -DCMAKE_INSTALL_PREFIX=/opt/rocm/llvm -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=1 -DLLVM_TARGETS_TO_BUILD="AMDGPU;X86" -DLLVM_ENABLE_PROJECTS="clang;lld" -DLLVM_ENABLE_RUNTIMES="compiler-rt" ../llvm && \ @@ -124,7 +124,7 @@ RUN if ( [ "$compiler_version" = "amd-staging" ] || [ "$compiler_version" = "amd else echo "using the release compiler"; \ fi -RUN if ( [ "$compiler_version" = "amd-staging" ] || [ "$compiler_version" = "amd-mainline-open" ] ) && [ "$compiler_commit" != "" ]; then \ +RUN if ( [ "$compiler_version" = "amd-staging" ] || [ "$compiler_version" = "amd-mainline" ] ) && [ "$compiler_commit" != "" ]; then \ git clone -b "$compiler_version" https://github.com/ROCm/llvm-project.git && \ cd llvm-project && git checkout "$compiler_commit" && echo "checking out commit $compiler_commit" && mkdir build && cd build && \ cmake -DCMAKE_INSTALL_PREFIX=/opt/rocm/llvm -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=1 -DLLVM_TARGETS_TO_BUILD="AMDGPU;X86" -DLLVM_ENABLE_PROJECTS="clang;lld" -DLLVM_ENABLE_RUNTIMES="compiler-rt" ../llvm && \ diff --git a/Jenkinsfile b/Jenkinsfile index 2f790d8e5..b448a5130 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -133,7 +133,7 @@ def buildDocker(install_prefix){ def image_name = getDockerImageName() echo "Building Docker for ${image_name}" def dockerArgs = "--squash --build-arg 
BUILDKIT_INLINE_CACHE=1 --build-arg PREFIX=${install_prefix} --build-arg CK_SCCACHE='${env.CK_SCCACHE}' --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' --build-arg DISABLE_CACHE='git rev-parse ${params.COMPILER_VERSION}' " - if(params.COMPILER_VERSION == "amd-staging" || params.COMPILER_VERSION == "amd-mainline-open" || params.COMPILER_COMMIT != ""){ + if(params.COMPILER_VERSION == "amd-staging" || params.COMPILER_VERSION == "amd-mainline" || params.COMPILER_COMMIT != ""){ dockerArgs = dockerArgs + " --no-cache " } echo "Build Args: ${dockerArgs}" @@ -358,7 +358,7 @@ def buildHipClangJob(Map conf=[:]){ dockerOpts = dockerOpts + " --env HSA_XNACK=1 " } def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg CK_SCCACHE='${env.CK_SCCACHE}' --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' " - if (params.COMPILER_VERSION == "amd-staging" || params.COMPILER_VERSION == "amd-mainline-open" || params.COMPILER_COMMIT != ""){ + if (params.COMPILER_VERSION == "amd-staging" || params.COMPILER_VERSION == "amd-mainline" || params.COMPILER_COMMIT != ""){ dockerOpts = dockerOpts + " --env HIP_CLANG_PATH='/llvm-project/build/bin' " } def video_id = sh(returnStdout: true, script: 'getent group video | cut -d: -f3') @@ -549,7 +549,7 @@ def Build_CK(Map conf=[:]){ dockerOpts = dockerOpts + " --env HSA_XNACK=1 " } def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' " - if (params.COMPILER_VERSION == "amd-staging" || params.COMPILER_VERSION == "amd-mainline-open" || params.COMPILER_COMMIT != ""){ + if (params.COMPILER_VERSION == "amd-staging" || params.COMPILER_VERSION == "amd-mainline" || 
params.COMPILER_COMMIT != ""){ dockerOpts = dockerOpts + " --env HIP_CLANG_PATH='/llvm-project/build/bin' " } if(params.BUILD_LEGACY_OS){ @@ -737,7 +737,7 @@ def process_results(Map conf=[:]){ CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;ROCMVERSION=6.2;RUN_CK_TILE_FMHA_TESTS=true;RUN_CK_TILE_GEMM_TESTS=true 0 21 * * * % ROCMVERSION=6.2;hipTensor_test=true;RUN_CODEGEN_TESTS=true 0 19 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-staging;BUILD_COMPILER=/llvm-project/build/bin/clang++;BUILD_GFX12=true;USE_SCCACHE=false;NINJA_BUILD_TRACE=true - 0 17 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-mainline-open;BUILD_COMPILER=/llvm-project/build/bin/clang++;BUILD_GFX12=true;USE_SCCACHE=false;NINJA_BUILD_TRACE=true + 0 17 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-mainline;BUILD_COMPILER=/llvm-project/build/bin/clang++;BUILD_GFX12=true;USE_SCCACHE=false;NINJA_BUILD_TRACE=true 0 15 * * * % BUILD_INSTANCES_ONLY=true;RUN_PERFORMANCE_TESTS=false;USE_SCCACHE=false 0 13 * * * % BUILD_LEGACY_OS=true''' : "" @@ -765,7 +765,7 @@ pipeline { string( name: 'COMPILER_VERSION', defaultValue: '', - description: 'Specify which version of compiler to use: release, amd-staging, amd-mainline-open, or leave blank (default).') + description: 'Specify which version of compiler to use: release, amd-staging, amd-mainline, or leave blank (default).') string( name: 'COMPILER_COMMIT', defaultValue: '', -- GitLab From 061ac0649c75deb315a418466d00dea2c49e65f3 Mon Sep 17 00:00:00 2001 From: Adam Osewski <19374865+aosewski@users.noreply.github.com> Date: Wed, 27 Nov 2024 13:02:44 +0100 Subject: [PATCH 085/153] Polished Grouped GEMM APIs and new BF16 instances (#1600) * Few small fixes. * New GroupedGemm instances (BF16) * Unify and refactor GroupedGEMM device API. * Adapt changes to new API. * Adapt grouped gemm profiler. * Accept multiple kbatches for grouped gemm profiler. 
- delete obsolete two stage as it is now covered by grouped gemm * Update unit test for grouped gemm. * Fix thresholds for BF16 and F8. Unblock tests. * Fix few instances. * Multiple small fixes. * Adapt to new API, check dynamic casting. * Uncomment few data types in grouped gemm profiler. * Fix call to SetDeviceArgs. * Fix profile grouped gemm multiply tile loop. * Fix grouped gemm tile loop kernel args in client examples. * Review comments. --- ...emm_multiply_bias_fastgelu_xdl_bf16_i8.cpp | 2 +- .../grouped_gemm_multiply_xdl_bf16_i8.cpp | 2 +- ...rouped_gemm_multiple_d_splitk_xdl_fp16.cpp | 4 +- .../grouped_gemm_multiple_d_xdl_fp16.cpp | 2 +- .../grouped_gemm_xdl_fixed_nk_bias_fp16.cpp | 4 +- .../grouped_gemm_xdl_fixed_nk_fp16.cpp | 4 +- .../grouped_gemm_xdl_fixed_nk_fp16_fp8.cpp | 4 +- .../run_grouped_gemm_example.inc | 18 +- .../gpu/device/device_grouped_gemm.hpp | 132 ++++++- .../device/device_grouped_gemm_fixed_nk.hpp | 50 +-- .../device_grouped_gemm_multiple_d_splitk.hpp | 136 ------- .../gpu/device/device_grouped_gemm_splitk.hpp | 20 +- .../device/device_grouped_gemm_tile_loop.hpp | 92 +---- ...ltiple_d_splitk_xdl_cshuffle_two_stage.hpp | 93 +++-- ...gemm_multiple_d_xdl_cshuffle_tile_loop.hpp | 24 +- .../device/impl/device_grouped_gemm_xdl.hpp | 21 +- .../impl/device_grouped_gemm_xdl_fixed_nk.hpp | 72 +++- ...evice_grouped_gemm_xdl_splitk_cshuffle.hpp | 35 +- include/ck/utility/loop_scheduler.hpp | 1 - .../gpu/grouped_gemm.hpp | 185 ++++++++- ...evice_grouped_gemm_xdl_splitk_instance.hpp | 138 +++++++ .../gpu/grouped_gemm/CMakeLists.txt | 22 +- ..._bf16_bf16_bf16_km_kn_mn_irregular_pv1.cpp | 32 ++ ...bf16_bf16_km_kn_mn_irregular_pv1_inter.cpp | 36 ++ ..._bf16_bf16_bf16_km_kn_mn_irregular_pv2.cpp | 33 ++ ..._bf16_bf16_bf16_mk_kn_mn_irregular_pv1.cpp | 32 ++ ...bf16_bf16_mk_kn_mn_irregular_pv1_inter.cpp | 36 ++ ..._bf16_bf16_bf16_mk_kn_mn_irregular_pv2.cpp | 38 ++ ..._bf16_bf16_bf16_mk_nk_mn_irregular_pv1.cpp | 32 ++ 
...bf16_bf16_mk_nk_mn_irregular_pv1_inter.cpp | 36 ++ ..._bf16_bf16_bf16_mk_nk_mn_irregular_pv2.cpp | 33 ++ ...l_splitk_f16_f16_f16_mk_kn_mn_instance.cpp | 47 +-- ...16_f16_f16_mk_kn_mn_irregular_instance.cpp | 123 ------ ...itk_f16_f16_f16_mk_kn_mn_irregular_pv1.cpp | 32 ++ ...6_f16_f16_mk_kn_mn_irregular_pv1_inter.cpp | 36 ++ ...itk_f16_f16_f16_mk_kn_mn_irregular_pv2.cpp | 33 ++ ...l_splitk_f16_f16_f16_mk_nk_mn_instance.cpp | 51 +-- ...16_f16_f16_mk_nk_mn_irregular_instance.cpp | 55 +-- ...ultiply_bf16_i8_bf16_mk_kn_mn_instance.cpp | 234 ----------- .../profiler/profile_grouped_gemm_impl.hpp | 121 +++--- ...e_grouped_gemm_multiply_tile_loop_impl.hpp | 3 +- .../profile_grouped_gemm_tile_loop_impl.hpp | 2 +- .../profile_grouped_gemm_two_stage_impl.hpp | 367 ------------------ profiler/src/CMakeLists.txt | 1 - profiler/src/profile_grouped_gemm.cpp | 89 ++++- .../src/profile_grouped_gemm_fixed_nk.cpp | 8 +- .../src/profile_grouped_gemm_two_stage.cpp | 228 ----------- test/grouped_gemm/CMakeLists.txt | 6 - .../test_grouped_gemm_splitk_xdl.cpp | 46 ++- .../test_grouped_gemm_ut_cases.inc | 131 +------ test/grouped_gemm/test_grouped_gemm_util.hpp | 139 +++---- 51 files changed, 1399 insertions(+), 1722 deletions(-) delete mode 100644 include/ck/tensor_operation/gpu/device/device_grouped_gemm_multiple_d_splitk.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_instance.hpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_km_kn_mn_irregular_pv1.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_km_kn_mn_irregular_pv1_inter.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_km_kn_mn_irregular_pv2.cpp create mode 100644 
library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_kn_mn_irregular_pv1.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_kn_mn_irregular_pv1_inter.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_kn_mn_irregular_pv2.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_nk_mn_irregular_pv1.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_nk_mn_irregular_pv1_inter.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_nk_mn_irregular_pv2.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_pv1.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_pv1_inter.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_pv2.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/grouped_gemm_tile_loop/device_grouped_gemm_xdl_tile_loop_multiply_bf16_i8_bf16_mk_kn_mn_instance.cpp delete mode 100644 profiler/include/profiler/profile_grouped_gemm_two_stage_impl.hpp delete mode 100644 profiler/src/profile_grouped_gemm_two_stage.cpp diff --git a/client_example/31_grouped_gemm_bf16Aint8B/grouped_gemm_multiply_bias_fastgelu_xdl_bf16_i8.cpp b/client_example/31_grouped_gemm_bf16Aint8B/grouped_gemm_multiply_bias_fastgelu_xdl_bf16_i8.cpp index 
4b284c74d..47d3e0abf 100644 --- a/client_example/31_grouped_gemm_bf16Aint8B/grouped_gemm_multiply_bias_fastgelu_xdl_bf16_i8.cpp +++ b/client_example/31_grouped_gemm_bf16Aint8B/grouped_gemm_multiply_bias_fastgelu_xdl_bf16_i8.cpp @@ -121,7 +121,7 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co constexpr ck::index_t NumDTensor = 2; using GroupedGemmKernelArgument = - ck::tensor_operation::device::GroupedGemmTileLoopKernelArguments; + ck::tensor_operation::device::GroupedGemmKernelArgument; std::vector grouped_gemm_kernel_args_; grouped_gemm_kernel_args_.reserve(group_count); diff --git a/client_example/31_grouped_gemm_bf16Aint8B/grouped_gemm_multiply_xdl_bf16_i8.cpp b/client_example/31_grouped_gemm_bf16Aint8B/grouped_gemm_multiply_xdl_bf16_i8.cpp index 6cc83e06f..8c705d3bc 100644 --- a/client_example/31_grouped_gemm_bf16Aint8B/grouped_gemm_multiply_xdl_bf16_i8.cpp +++ b/client_example/31_grouped_gemm_bf16Aint8B/grouped_gemm_multiply_xdl_bf16_i8.cpp @@ -120,7 +120,7 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co constexpr ck::index_t NumDTensor = 1; using GroupedGemmKernelArgument = - ck::tensor_operation::device::GroupedGemmTileLoopKernelArguments; + ck::tensor_operation::device::GroupedGemmKernelArgument; std::vector grouped_gemm_kernel_args_; grouped_gemm_kernel_args_.reserve(group_count); diff --git a/example/15_grouped_gemm/grouped_gemm_multiple_d_splitk_xdl_fp16.cpp b/example/15_grouped_gemm/grouped_gemm_multiple_d_splitk_xdl_fp16.cpp index ecff7b471..8bbf8e629 100644 --- a/example/15_grouped_gemm/grouped_gemm_multiple_d_splitk_xdl_fp16.cpp +++ b/example/15_grouped_gemm/grouped_gemm_multiple_d_splitk_xdl_fp16.cpp @@ -246,7 +246,7 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co // do GEMM auto argument = gemm.MakeArgument( p_As, p_Bs, p_Ds, p_Cs, gemm_descs, a_element_op, b_element_op, cde_element_op); - gemm.SetKBatchSize(argument, config.k_batch); + 
gemm.SetKBatchSize(&argument, config.k_batch); if(!gemm.IsSupportedArgument(argument)) { throw std::runtime_error( @@ -257,7 +257,7 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co gemm.SetWorkSpacePointer(&argument, gemm_workspace_dev.GetDeviceBuffer()); DeviceMem gemm_arg_dev_mem(gemm.GetDeviceKernelArgSize(&argument)); - gemm.SetDeviceKernelArgs(argument, gemm_arg_dev_mem.GetDeviceBuffer()); + gemm.SetDeviceKernelArgs(&argument, gemm_arg_dev_mem.GetDeviceBuffer()); invoker.Run(argument, StreamConfig{nullptr, false, 1}); diff --git a/example/15_grouped_gemm/grouped_gemm_multiple_d_xdl_fp16.cpp b/example/15_grouped_gemm/grouped_gemm_multiple_d_xdl_fp16.cpp index 965a0e7e3..e7b2ee417 100644 --- a/example/15_grouped_gemm/grouped_gemm_multiple_d_xdl_fp16.cpp +++ b/example/15_grouped_gemm/grouped_gemm_multiple_d_xdl_fp16.cpp @@ -91,7 +91,7 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co { auto group_count = problem_size.group_count; - using KernelArguments = ck::tensor_operation::device::GroupedGemmTileLoopKernelArguments; + using KernelArguments = ck::tensor_operation::device::GroupedGemmKernelArgument; using GemmDesc = ck::tensor_operation::device::GemmDesc; // GEMM shape diff --git a/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_bias_fp16.cpp b/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_bias_fp16.cpp index a193fc39b..3b3ef508c 100644 --- a/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_bias_fp16.cpp +++ b/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_bias_fp16.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
#include #include @@ -254,7 +254,7 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co gemm.GetDeviceKernelArgSize(&argument), hipMemcpyHostToDevice)); - gemm.SetDeviceKernelArgs(argument, gemm_kernel_args_dev.GetDeviceBuffer()); + gemm.SetDeviceKernelArgs(&argument, gemm_kernel_args_dev.GetDeviceBuffer()); gemm.SetKBatch(argument, config.k_batch); invoker.Run(argument, StreamConfig{nullptr, false}); diff --git a/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_fp16.cpp b/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_fp16.cpp index 1a2bcfb33..c1043f419 100644 --- a/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_fp16.cpp +++ b/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_fp16.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #include #include @@ -239,7 +239,7 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co "not support this GEMM problem"); } - gemm.SetDeviceKernelArgs(argument, gemm_arg_dev_mem.GetDeviceBuffer()); + gemm.SetDeviceKernelArgs(&argument, gemm_arg_dev_mem.GetDeviceBuffer()); gemm.SetKBatch(argument, config.k_batch); invoker.Run(argument, StreamConfig{nullptr, false}); diff --git a/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_fp16_fp8.cpp b/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_fp16_fp8.cpp index 0a63a2984..c81874b06 100644 --- a/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_fp16_fp8.cpp +++ b/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_fp16_fp8.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
#include #include @@ -240,7 +240,7 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co "not support this GEMM problem"); } - gemm.SetDeviceKernelArgs(argument, gemm_arg_dev_mem.GetDeviceBuffer()); + gemm.SetDeviceKernelArgs(&argument, gemm_arg_dev_mem.GetDeviceBuffer()); gemm.SetKBatch(argument, config.k_batch); invoker.Run(argument, StreamConfig{nullptr, false}); diff --git a/example/15_grouped_gemm/run_grouped_gemm_example.inc b/example/15_grouped_gemm/run_grouped_gemm_example.inc index 320870e0d..7cb0588b8 100644 --- a/example/15_grouped_gemm/run_grouped_gemm_example.inc +++ b/example/15_grouped_gemm/run_grouped_gemm_example.inc @@ -168,9 +168,23 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co auto argument = gemm.MakeArgument( p_a, p_b, p_Ds, p_c, gemm_descs, a_element_op, b_element_op, c_element_op); - DeviceMem gemm_desc_workspace(gemm.GetWorkSpaceSize(&argument)); + std::size_t workspace_size = gemm.GetWorkSpaceSize(&argument); + std::size_t kargs_size = gemm.GetDeviceKernelArgSize(&argument); - gemm.SetWorkSpacePointer(&argument, gemm_desc_workspace.GetDeviceBuffer()); + DeviceMem gemm_workspace, gemm_kargs; + + // The following is necessary since TwoStage kernel is using additional memory both + // for Workspace and kernel arguments. 
+ if(kargs_size > 0) + { + gemm_kargs.Realloc(kargs_size); + gemm.SetDeviceKernelArgs(&argument, gemm_kargs.GetDeviceBuffer()); + } + if(workspace_size > 0 && workspace_size != kargs_size) + { + gemm_workspace.Realloc(workspace_size); + gemm.SetWorkSpacePointer(&argument, gemm_workspace.GetDeviceBuffer()); + } if(!gemm.IsSupportedArgument(argument)) { diff --git a/include/ck/tensor_operation/gpu/device/device_grouped_gemm.hpp b/include/ck/tensor_operation/gpu/device/device_grouped_gemm.hpp index 1e0340553..267a970ee 100644 --- a/include/ck/tensor_operation/gpu/device/device_grouped_gemm.hpp +++ b/include/ck/tensor_operation/gpu/device/device_grouped_gemm.hpp @@ -1,17 +1,87 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #pragma once +#include #include +#include +#include #include #include "device_base.hpp" +#include "ck/utility/ignore.hpp" namespace ck { namespace tensor_operation { namespace device { +/// +/// @brief Structure representing single GEMM problem arguments. +/// +/// The pointer to the vector of those structures is passed to the GroupedGEMM entry +/// point kernel. +/// +/// @tparam NumDTensor The number of D input tensors. 
+/// +template +struct GroupedGemmKernelArgument +{ + __host__ __device__ GroupedGemmKernelArgument(const void* p_a_grid_, + const void* p_b_grid_, + std::array p_ds_grid_, + void* p_e_grid_, + index_t M_, + index_t N_, + index_t K_, + index_t StrideA_, + index_t StrideB_, + std::array StrideDs_, + index_t StrideE_) + : p_a_grid{p_a_grid_}, + p_b_grid{p_b_grid_}, + p_ds_grid{p_ds_grid_}, + p_e_grid{p_e_grid_}, + M{M_}, + N{N_}, + K{K_}, + StrideA{StrideA_}, + StrideB{StrideB_}, + StrideDs{StrideDs_}, + StrideE{StrideE_} + { + } + + const void* p_a_grid; + const void* p_b_grid; + std::array p_ds_grid; + void* p_e_grid; + index_t M; + index_t N; + index_t K; + index_t StrideA; + index_t StrideB; + std::array StrideDs; + index_t StrideE; + + void Print() const + { + std::stringstream str; + for(auto sd : StrideDs) + str << sd << ","; + + std::cout << "arg {" + << "M:" << M << ", " + << "N:" << N << ", " + << "K:" << K << ", " + << "SA:" << StrideA << ", " + << "SB:" << StrideB << ", " + << "SE:" << StrideE << ", " + << "SDs: {" << str.str() << "}" + << "}" << std::endl; + } +}; + struct GemmDesc { ck::index_t M_, N_, K_; @@ -48,6 +118,66 @@ struct DeviceGroupedGemm : public BaseOperator CElementwiseOperation c_element_op) = 0; virtual std::unique_ptr MakeInvokerPointer() = 0; + + //--------------------------------------------------------------------------------------------- + /// @brief Sets the device kernel arguments pointer and may copy data to device. + /// + /// TODO: Add which kernels are using this (TileLoop * FixedNK ??) + /// + /// @param p_arg The pointer to the Argument we're going to update. + /// @param[in] p_dev_kernel_args The pointer to the device memory which will contain kernel + /// arguments. + /// @param[in] p_host_kernel_args The pointer to the host memory which contains kernel + /// arguments that should be copied to device memory. 
+ /// + virtual void SetDeviceKernelArgs(BaseArgument* p_arg, + void* p_dev_kernel_args, + const void* p_host_kernel_args) const + { + ignore = p_arg; + ignore = p_dev_kernel_args; + ignore = p_host_kernel_args; + + std::ostringstream err; + err << "This function is not implemented by the kernel: " << this->GetTypeString() + << __FILE__ << ":" << __LINE__ << ", in function: " << __func__; + throw std::runtime_error(err.str()); + } + + //---------------------------------------------------------------------------------------------- + /// @brief Sets the device kernel arguments pointer and may copy data to device. + /// + /// @param p_arg The pointer to the Argument we're going to update. + /// @param[in] p_dev_kernel_args The pointer to the device memory which contains kernel + /// arguments. + /// + virtual void SetDeviceKernelArgs(BaseArgument* p_arg, void* p_dev_kernel_args) const + { + ignore = p_arg; + ignore = p_dev_kernel_args; + + std::ostringstream err; + err << "This function is not implemented by the kernel: " << this->GetTypeString() + << __FILE__ << ":" << __LINE__ << ", in function: " << __func__; + throw std::runtime_error(err.str()); + } + + //---------------------------------------------------------------------------------------------- + /// @brief Gets the device kernel argument size. + /// + /// @param[in] p_arg The pointer to the Device op Argument. + /// + /// @return The device kernel argument size. 
+ /// + virtual size_t GetDeviceKernelArgSize(const BaseArgument* p_arg) const + { + ignore = p_arg; + + std::ostringstream err; + err << "This function is not implemented by the kernel: " << this->GetTypeString() + << __FILE__ << ":" << __LINE__ << ", in function: " << __func__; + throw std::runtime_error(err.str()); + } }; } // namespace device diff --git a/include/ck/tensor_operation/gpu/device/device_grouped_gemm_fixed_nk.hpp b/include/ck/tensor_operation/gpu/device/device_grouped_gemm_fixed_nk.hpp index fcb2ba6a4..780a0c30c 100644 --- a/include/ck/tensor_operation/gpu/device/device_grouped_gemm_fixed_nk.hpp +++ b/include/ck/tensor_operation/gpu/device/device_grouped_gemm_fixed_nk.hpp @@ -1,35 +1,14 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #pragma once -#include -#include - -#include "device_grouped_gemm.hpp" +#include "device_grouped_gemm_splitk.hpp" namespace ck { namespace tensor_operation { namespace device { -template -struct GroupedGemmKernelArgument -{ - const void* p_a_grid; - const void* p_b_grid; - std::array p_ds_grid; - void* p_e_grid; - - index_t M; - index_t N; - index_t K; - - index_t StrideA; - index_t StrideB; - std::array StrideDs; - index_t StrideE; -}; - template -struct DeviceGroupedGemmFixedNK : DeviceGroupedGemm +struct DeviceGroupedGemmFixedNK : DeviceGroupedGemmSplitK { - virtual void SetDeviceKernelArgs(BaseArgument* p_arg, const void* kernel_args) const = 0; - virtual size_t GetDeviceKernelArgSize(const BaseArgument* p_arg) const = 0; - virtual void SetKBatch(BaseArgument* p_arg, index_t k_batch) const = 0; }; } // namespace device diff --git a/include/ck/tensor_operation/gpu/device/device_grouped_gemm_multiple_d_splitk.hpp b/include/ck/tensor_operation/gpu/device/device_grouped_gemm_multiple_d_splitk.hpp deleted file mode 100644 index d91eac073..000000000 --- 
a/include/ck/tensor_operation/gpu/device/device_grouped_gemm_multiple_d_splitk.hpp +++ /dev/null @@ -1,136 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved. - -#pragma once - -#include -#include -#include -#include - -#include "device_grouped_gemm.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { - -/// -/// @brief Structure representing single GEMM problem arguments. -/// -/// The pointer to the vector of those structures is passed to the GroupedGEMM entry -/// point kernel. -/// -/// @tparam NumDTensor The number of D input tensors. -/// -template -struct GroupedGemmMultipleDKernelArguments -{ - __host__ __device__ - GroupedGemmMultipleDKernelArguments(const void* p_a_grid_, - const void* p_b_grid_, - std::array p_ds_grid_, - void* p_e_grid_, - index_t M_, - index_t N_, - index_t K_, - index_t StrideA_, - index_t StrideB_, - std::array StrideDs_, - index_t StrideE_) - : p_a_grid{p_a_grid_}, - p_b_grid{p_b_grid_}, - p_ds_grid{p_ds_grid_}, - p_e_grid{p_e_grid_}, - M{M_}, - N{N_}, - K{K_}, - StrideA{StrideA_}, - StrideB{StrideB_}, - StrideDs{StrideDs_}, - StrideE{StrideE_} - { - } - - const void* p_a_grid; - const void* p_b_grid; - std::array p_ds_grid; - void* p_e_grid; - index_t M; - index_t N; - index_t K; - index_t StrideA; - index_t StrideB; - std::array StrideDs; - index_t StrideE; - - void Print() const - { - std::stringstream str; - for(auto sd : StrideDs) - str << sd << ","; - - std::cout << "arg {" - << "M:" << M << ", " - << "N:" << N << ", " - << "K:" << K << ", " - << "SA:" << StrideA << ", " - << "SB:" << StrideB << ", " - << "SE:" << StrideE << ", " - << "SDs: {" << str.str() << "}" - << "}" << std::endl; - } -}; - -template -struct DeviceGroupedGemmMultipleDSplitK : public DeviceGroupedGemm -{ - //---------------------------------------------------------------------------------------------- - /// @brief Sets the k batch size. 
- /// - /// @param p_arg Pointer to the Argument we're going to change. - /// @param[in] kbatch The kbatch value. - /// - virtual void SetKBatchSize(BaseArgument* p_arg, index_t kbatch) const = 0; - - //---------------------------------------------------------------------------------------------- - /// @brief Sets the device kernel arguments pointer. - /// - /// @param p_arg The pointer to the Argument we're going to update. - /// @param[in] p_dev_kernel_args The pointer to the device memory which contains kernel - /// arguments. - /// - virtual void SetDeviceKernelArgs(BaseArgument* p_arg, void* p_dev_kernel_args) const = 0; - - //---------------------------------------------------------------------------------------------- - /// @brief Gets the device kernel argument size. - /// - /// @param[in] p_arg The pointer to the Device op Argument. - /// - /// @return The device kernel argument size. - /// - virtual size_t GetDeviceKernelArgSize(const BaseArgument* p_arg) const = 0; -}; - -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/device_grouped_gemm_splitk.hpp b/include/ck/tensor_operation/gpu/device/device_grouped_gemm_splitk.hpp index 06d180d30..3ea650190 100644 --- a/include/ck/tensor_operation/gpu/device/device_grouped_gemm_splitk.hpp +++ b/include/ck/tensor_operation/gpu/device/device_grouped_gemm_splitk.hpp @@ -1,6 +1,6 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #pragma once -#include -#include #include "device_grouped_gemm.hpp" @@ -31,7 +31,23 @@ struct DeviceGroupedGemmSplitK : public DeviceGroupedGemm { + //---------------------------------------------------------------------------------------------- + /// @brief Sets the k batch size. + /// + /// @param p_arg Pointer to the Argument we're going to change. + /// @param[in] kbatch The kbatch value. 
+ /// virtual void SetKBatchSize(BaseArgument* p_arg, index_t kbatch) const = 0; + //---------------------------------------------------------------------------------------------- + /// @brief Sets the k batch size. + /// + /// @param p_arg Pointer to the Argument we're going to change. + /// @param[in] kbatch The kbatch value. + /// + virtual void SetKBatch(BaseArgument* p_arg, index_t kbatch) const + { + this->SetKBatchSize(p_arg, kbatch); + }; }; } // namespace device diff --git a/include/ck/tensor_operation/gpu/device/device_grouped_gemm_tile_loop.hpp b/include/ck/tensor_operation/gpu/device/device_grouped_gemm_tile_loop.hpp index c1030f31c..712fbfd9e 100644 --- a/include/ck/tensor_operation/gpu/device/device_grouped_gemm_tile_loop.hpp +++ b/include/ck/tensor_operation/gpu/device/device_grouped_gemm_tile_loop.hpp @@ -3,83 +3,20 @@ #pragma once -#include -#include -#include -#include - #include "device_grouped_gemm.hpp" namespace ck { namespace tensor_operation { namespace device { +/// @brief Grouped GEMM kernel using output Tile Looping algorithm /// -/// @brief Structure representing single GEMM problem arguments. -/// -/// The pointer to the vector of those structures is passed to the GroupedGEMM entry -/// point kernel. -/// -/// @tparam NumDTensor The number of D input tensors. +/// @par This kernel does not require any knowledge about input data sizes (GEMM M/N/K) +/// It requires only the number of groups to launch. Other information like +/// data pointers and GEMM sizes, packed into gemm kernel args may be all dynamic +/// (known only at kernel run-time). 
/// -template -struct GroupedGemmTileLoopKernelArguments -{ - __host__ __device__ - GroupedGemmTileLoopKernelArguments(const void* p_a_grid_, - const void* p_b_grid_, - std::array p_ds_grid_, - void* p_e_grid_, - index_t M_, - index_t N_, - index_t K_, - index_t StrideA_, - index_t StrideB_, - std::array StrideDs_, - index_t StrideE_) - : p_a_grid{p_a_grid_}, - p_b_grid{p_b_grid_}, - p_ds_grid{p_ds_grid_}, - p_e_grid{p_e_grid_}, - M{M_}, - N{N_}, - K{K_}, - StrideA{StrideA_}, - StrideB{StrideB_}, - StrideDs{StrideDs_}, - StrideE{StrideE_} - { - } - - const void* p_a_grid; - const void* p_b_grid; - std::array p_ds_grid; - void* p_e_grid; - index_t M; - index_t N; - index_t K; - index_t StrideA; - index_t StrideB; - std::array StrideDs; - index_t StrideE; - - void Print() const - { - std::stringstream str; - for(auto sd : StrideDs) - str << sd << ","; - - std::cout << "arg {" - << "M:" << M << ", " - << "N:" << N << ", " - << "K:" << K << ", " - << "SA:" << StrideA << ", " - << "SB:" << StrideB << ", " - << "SE:" << StrideE << ", " - << "SDs: {" << str.str() << "}" - << "}" << std::endl; - } -}; +/// @note This kernel does not support SplitK. template { - //---------------------------------------------------------------------------------------------- - /// @brief Sets the device kernel arguments pointer. - /// - /// @param p_arg The pointer to the Argument we're going to update. - /// @param[in] p_dev_kernel_args The pointer to the device memory which contains kernel - /// arguments. - /// - virtual void SetDeviceKernelArgs(BaseArgument* p_arg, void* p_dev_kernel_args) const = 0; - - //---------------------------------------------------------------------------------------------- - /// @brief Gets the device kernel argument size. - /// - /// @param[in] p_arg The pointer to the Device op Argument. - /// - /// @return The device kernel argument size. 
- /// - virtual size_t GetDeviceKernelArgSize(const BaseArgument* p_arg) const = 0; }; } // namespace device diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_splitk_xdl_cshuffle_two_stage.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_splitk_xdl_cshuffle_two_stage.hpp index 68c6dcc0f..0535c8032 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_splitk_xdl_cshuffle_two_stage.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_splitk_xdl_cshuffle_two_stage.hpp @@ -18,7 +18,6 @@ #include "ck/tensor_description/tensor_descriptor.hpp" #include "ck/tensor_description/tensor_descriptor_helper.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" -#include "ck/tensor_operation/gpu/device/device_grouped_gemm_multiple_d_splitk.hpp" #include "ck/tensor_operation/gpu/grid/gridwise_elementwise_2d.hpp" #include "ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" @@ -78,17 +77,17 @@ template = false> struct DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage - : public DeviceGroupedGemmMultipleDSplitK + : public DeviceGroupedGemmSplitK { using DeviceOp = DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage; @@ -530,7 +529,7 @@ struct DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage index_t skipped_group_count_; index_t grid_size_; // Pointer to device memory with GEMM kernel arguments. - const void* p_dev_gemm_args_; + void* p_dev_gemm_kargs_; AElementwiseOperation a_element_op_; BElementwiseOperation b_element_op_; @@ -566,7 +565,7 @@ struct DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage /// @return The average kernel execution time (if time measurement is enabled.) 
/// float Run(const Argument& arg, - const void* dev_gemm_args, + void* dev_gemm_args, void* dev_gemm_workspace, const StreamConfig& stream_config = StreamConfig{}) { @@ -621,7 +620,7 @@ struct DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage /// float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) { - if(arg.p_dev_gemm_args_ == nullptr) + if(arg.p_dev_gemm_kargs_ == nullptr) { std::ostringstream err; err << "The gemm arguments device buffer is not allocated!" @@ -637,7 +636,7 @@ struct DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage throw std::runtime_error(err.str()); } - return Run(arg, arg.p_dev_gemm_args_, arg.p_workspace_, stream_config); + return Run(arg, arg.p_dev_gemm_kargs_, arg.p_workspace_, stream_config); } float Run(const BaseArgument* p_arg, @@ -723,7 +722,7 @@ struct DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage template float DispatchKernel(const Argument& arg, - const void* dev_gemm_args, + void* dev_gemm_kargs, void* dev_gemm_workspace, const StreamConfig& stream_config) const { @@ -746,7 +745,7 @@ struct DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage return LaunchKernel(gemm_kernel, elementwise_kernel, arg, - dev_gemm_args, + dev_gemm_kargs, dev_gemm_workspace, stream_config); } @@ -755,12 +754,19 @@ struct DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage float LaunchKernel(const KernelFunction& gemm_kernel, const KernelFunction2& elementwise_kernel, const Argument& arg, - const void* dev_gemm_args, + void* dev_gemm_kargs, [[maybe_unused]] void* dev_gemm_workspace, const StreamConfig& stream_config) const { float time{0.f}; + hip_check_error( + hipMemcpyWithStream(dev_gemm_kargs, + arg.gemm_kernel_args_.data(), + arg.gemm_kernel_args_.size() * sizeof(GemmTransKernelArg), + hipMemcpyHostToDevice, + stream_config.stream_id_)); + auto preprocess = [&]() { hip_check_error(hipMemsetAsync( dev_gemm_workspace, 0, arg.GetWorkspaceSizeBytes(), stream_config.stream_id_)); @@ -774,7 +780,7 @@ struct 
DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage dim3(arg.grid_size_), dim3(BlockSize), 0, - cast_pointer_to_constant_address_space(dev_gemm_args), + cast_pointer_to_constant_address_space(dev_gemm_kargs), arg.gemm_kernel_args_.size(), arg.a_element_op_, arg.b_element_op_, @@ -930,18 +936,30 @@ struct DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage return str.str(); } - void SetDeviceKernelArgs(Argument& arg, void* p_dev_kernel_args) const + void SetDeviceKernelArgs(BaseArgument* p_arg, void* p_dev_kernel_args) const override { - arg.p_dev_gemm_args_ = p_dev_kernel_args; - hip_check_error(hipMemcpy(p_dev_kernel_args, - arg.gemm_kernel_args_.data(), - GetDeviceKernelArgSize(&arg), - hipMemcpyHostToDevice)); + auto arg_ptr = dynamic_cast(p_arg); + if(arg_ptr) + { + arg_ptr->p_dev_gemm_kargs_ = p_dev_kernel_args; + } + else + throw std::runtime_error( + "The argument pointer is not an object of " + "DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage::Argument structure!"); } - void SetDeviceKernelArgs(BaseArgument* p_arg, void* p_dev_kernel_args) const override + size_t GetDeviceKernelArgSize(const BaseArgument* p_arg) const override { - return SetDeviceKernelArgs(*dynamic_cast(p_arg), p_dev_kernel_args); + auto arg = dynamic_cast(p_arg); + if(arg) + { + return arg->gemm_kernel_args_.size() * sizeof(GemmTransKernelArg); + } + else + throw std::runtime_error( + "The argument pointer is not an object of " + "DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage::Argument structure!"); } size_t GetWorkSpaceSize(const BaseArgument* p_arg) const override @@ -974,17 +992,22 @@ struct DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage "DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage::Argument structure!"); } - static void SetKBatchSize(Argument& arg, index_t kbatch) { arg.UpdateKBatch(kbatch); } - - void SetKBatchSize(BaseArgument* p_arg, index_t kbatch) const override + [[deprecated]] static void SetKBatchSize(Argument& arg, index_t kbatch) { - return 
SetKBatchSize(*dynamic_cast(p_arg), kbatch); + arg.UpdateKBatch(kbatch); } - size_t GetDeviceKernelArgSize(const BaseArgument* p_arg) const override + void SetKBatchSize(BaseArgument* p_arg, index_t kbatch) const override { - return dynamic_cast(p_arg)->gemm_kernel_args_.size() * - sizeof(GemmTransKernelArg); + auto p_arg_ = dynamic_cast(p_arg); + if(p_arg_) + { + p_arg_->UpdateKBatch(kbatch); + } + else + throw std::runtime_error( + "The argument pointer is not an object of " + "DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage::Argument structure!"); } }; diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_xdl_cshuffle_tile_loop.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_xdl_cshuffle_tile_loop.hpp index 2884e558c..f673713f3 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_xdl_cshuffle_tile_loop.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_xdl_cshuffle_tile_loop.hpp @@ -20,7 +20,6 @@ #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include #include "ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d.hpp" // stare wywalic -#include "ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp" #include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp" namespace ck { @@ -522,7 +521,7 @@ struct DeviceGroupedGemmMultipleDXdlCShuffleTileLoop ComputeTypeA, ComputeTypeB>; - using KernelArguments = GroupedGemmTileLoopKernelArguments; + using KernelArguments = GroupedGemmKernelArgument; using Block2ETileMap = BlockToCTileMap_Grouped_M00_N0_M01Adapt<8, MPerBlock, NPerBlock>; using OffsettedLocalBlock2ETileMap = OffsettedBlockToCTileMap2; @@ -936,12 +935,31 @@ struct DeviceGroupedGemmMultipleDXdlCShuffleTileLoop return str.str(); } + void SetDeviceKernelArgs(Argument& arg, + void* p_dev_kernel_args, + const void* p_host_kernel_args) const + { + 
arg.p_dev_gemm_args_ = p_dev_kernel_args; + hip_check_error(hipMemcpy(p_dev_kernel_args, + p_host_kernel_args, + GetDeviceKernelArgSize(&arg), + hipMemcpyHostToDevice)); + } + + virtual void SetDeviceKernelArgs(BaseArgument* p_arg, + void* p_dev_kernel_args, + const void* p_host_kernel_args) const override + { + return SetDeviceKernelArgs( + *dynamic_cast(p_arg), p_dev_kernel_args, p_host_kernel_args); + } + void SetDeviceKernelArgs(Argument& arg, void* p_dev_kernel_args) const { arg.p_dev_gemm_args_ = p_dev_kernel_args; } - void SetDeviceKernelArgs(BaseArgument* p_arg, void* p_dev_kernel_args) const override + virtual void SetDeviceKernelArgs(BaseArgument* p_arg, void* p_dev_kernel_args) const override { return SetDeviceKernelArgs(*dynamic_cast(p_arg), p_dev_kernel_args); } diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl.hpp index 658f32351..86cf1da15 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl.hpp @@ -1,6 +1,6 @@ #pragma once // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
#pragma once @@ -717,7 +717,24 @@ struct DeviceGroupedGemm_Xdl : public DeviceGroupedGemm(p_arg)->group_count_ * sizeof(GemmBiasTransKernelArg); + auto p_arg_ = dynamic_cast(p_arg); + if(p_arg_) + { + return p_arg_->group_count_ * sizeof(GemmBiasTransKernelArg); + } + else + throw std::runtime_error("The argument pointer is not an object of " + "DeviceGroupedGemmMultipleDXdlCShuffle::Argument structure!"); + } + + size_t GetDeviceKernelArgSize(const BaseArgument* p_arg) const override + { + return GetWorkSpaceSize(p_arg); + } + + void SetDeviceKernelArgs(BaseArgument* p_arg, void* p_dev_kernel_args) const override + { + return this->SetWorkSpacePointer(p_arg, p_dev_kernel_args); } }; diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_fixed_nk.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_fixed_nk.hpp index ac05a0703..1fee02bad 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_fixed_nk.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_fixed_nk.hpp @@ -445,6 +445,7 @@ struct DeviceGroupedGemm_Xdl_Fixed_NK : public DeviceGroupedGemmFixedNK; using GroupedGemmBlock2ETileMap = OffsettedBlockToCTileMapMLoops; + // TODO: replace with GroupedGemmKernelArgument struct GemmBiasTransKernelArg { // pointers @@ -900,40 +901,58 @@ struct DeviceGroupedGemm_Xdl_Fixed_NK : public DeviceGroupedGemmFixedNK(p_arg), kernel_args); + auto arg_ptr = dynamic_cast(p_arg); + if(arg_ptr) + { + arg_ptr->grouped_gemm_kernel_args_dev = kernel_args; + } + else + throw std::runtime_error("The argument pointer is not an object of " + "DeviceGroupedGemm_Xdl_Fixed_NK::Argument structure!"); } size_t GetWorkSpaceSize(const BaseArgument* p_arg) const override { - auto arg = *dynamic_cast(p_arg); - - return arg.group_count_ * arg.barrier_size_grp_ * sizeof(uint32_t); + auto arg_ptr = dynamic_cast(p_arg); + if(arg_ptr) + { + return arg_ptr->group_count_ * arg_ptr->barrier_size_grp_ * 
sizeof(uint32_t); + } + else + throw std::runtime_error("The argument pointer is not an object of " + "DeviceGroupedGemm_Xdl_Fixed_NK::Argument structure!"); } size_t GetDeviceKernelArgSize(const BaseArgument* p_arg) const override { - auto arg = *dynamic_cast(p_arg); - - return arg.group_count_ * sizeof(GroupedGemmKernelArgument); + auto arg_ptr = dynamic_cast(p_arg); + if(arg_ptr) + { + return arg_ptr->group_count_ * sizeof(GroupedGemmKernelArgument); + } + else + throw std::runtime_error("The argument pointer is not an object of " + "DeviceGroupedGemm_Xdl_Fixed_NK::Argument structure!"); } void SetWorkSpacePointer(BaseArgument* p_arg, void* p_workspace, const StreamConfig& stream_config = StreamConfig{}) const override { - auto p_arg_ = dynamic_cast(p_arg); - p_arg_->p_workspace_ = p_workspace; + auto arg_ptr = dynamic_cast(p_arg); + if(arg_ptr) + { + arg_ptr->p_workspace_ = p_workspace; + } + else + throw std::runtime_error("The argument pointer is not an object of " + "DeviceGroupedGemm_Xdl_Fixed_NK::Argument structure!"); hip_check_error( - hipMemsetAsync(p_workspace, 0, GetWorkSpaceSize(p_arg), stream_config.stream_id_)); + hipMemsetAsync(p_workspace, 0, GetWorkSpaceSize(arg_ptr), stream_config.stream_id_)); } static void SetKBatch(Argument& arg, index_t k_batch) { arg.UpdateKBatch(k_batch); } @@ -941,7 +960,26 @@ struct DeviceGroupedGemm_Xdl_Fixed_NK : public DeviceGroupedGemmFixedNK(p_arg), k_batch); + auto arg_ptr = dynamic_cast(p_arg); + if(arg_ptr) + { + arg_ptr->UpdateKBatch(k_batch); + } + else + throw std::runtime_error("The argument pointer is not an object of " + "DeviceGroupedGemm_Xdl_Fixed_NK::Argument structure!"); + } + + void SetKBatchSize(BaseArgument* p_arg, index_t kbatch) const override + { + auto arg_ptr = dynamic_cast(p_arg); + if(arg_ptr) + { + arg_ptr->UpdateKBatch(kbatch); + } + else + throw std::runtime_error("The argument pointer is not an object of " + "DeviceGroupedGemm_Xdl_Fixed_NK::Argument structure!"); } }; diff --git 
a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp index cb0afbb08..626ffbe97 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp @@ -546,7 +546,8 @@ struct DeviceGroupedGemmXdlSplitKCShuffle : public DeviceGroupedGemmSplitK(p_arg)->gemm_kernel_args_.size() * - sizeof(GemmTransKernelArg); + auto p_arg_ = dynamic_cast(p_arg); + if(p_arg_) + { + return p_arg_->gemm_kernel_args_.size() * sizeof(GemmTransKernelArg); + } + else + throw std::runtime_error( + "The argument pointer is not an object of " + "DeviceGroupedGemmMultipleDSplitKXdlCShuffle::Argument structure!"); + } + + size_t GetDeviceKernelArgSize(const BaseArgument* p_arg) const override + { + return GetWorkSpaceSize(p_arg); } + // TODO: deperecation notice. static void SetKBatchSize(Argument& arg, index_t kbatch) { arg.UpdateKBatch(kbatch); } // polymorphic void SetKBatchSize(BaseArgument* p_arg, index_t kbatch) const override { - return SetKBatchSize(*dynamic_cast(p_arg), kbatch); + auto p_arg_ = dynamic_cast(p_arg); + if(p_arg_) + { + p_arg_->UpdateKBatch(kbatch); + } + else + throw std::runtime_error( + "The argument pointer is not an object of " + "DeviceGroupedGemmMultipleDSplitKXdlCShuffle::Argument structure!"); + } + + void SetDeviceKernelArgs(BaseArgument* p_arg, void* p_dev_kernel_args) const override + { + return this->SetWorkSpacePointer(p_arg, p_dev_kernel_args); } }; diff --git a/include/ck/utility/loop_scheduler.hpp b/include/ck/utility/loop_scheduler.hpp index 0c4d85bed..a88109249 100644 --- a/include/ck/utility/loop_scheduler.hpp +++ b/include/ck/utility/loop_scheduler.hpp @@ -5,7 +5,6 @@ #pragma once #include "ck/utility/common_header.hpp" -#include "ck/tensor_description/tensor_adaptor.hpp" namespace ck { diff --git 
a/library/include/ck/library/tensor_operation_instance/gpu/grouped_gemm.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_gemm.hpp index 87426fd52..a999f9e3a 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_gemm.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_gemm.hpp @@ -95,6 +95,45 @@ void add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instances( PassThrough, PassThrough>>>& instances); +void add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_pv1_inter_instances( + std::vector>>& instances); + +void add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_pv1_instances( + std::vector>>& instances); + +void add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_pv2_instances( + std::vector>>& instances); + void add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_irregular_instances( std::vector>>& instances); + +void add_device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_kn_mn_irregular_pv1_inter_instances( + std::vector>>& instances); + +void add_device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_kn_mn_irregular_pv1_instances( + std::vector>>& instances); + +void add_device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_kn_mn_irregular_pv2_instances( + std::vector>>& instances); + +void add_device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_nk_mn_irregular_pv1_inter_instances( + std::vector>>& instances); + +void add_device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_nk_mn_irregular_pv1_instances( + std::vector>>& instances); + +void add_device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_nk_mn_irregular_pv2_instances( + std::vector>>& instances); + +void add_device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_km_kn_mn_irregular_pv1_inter_instances( + std::vector>>& instances); + +void add_device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_km_kn_mn_irregular_pv1_instances( + std::vector>>& instances); + +void 
add_device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_km_kn_mn_irregular_pv2_instances( + std::vector>>& instances); + #endif #if defined(CK_ENABLE_BF16) && defined(CK_ENABLE_INT8) @@ -262,7 +419,11 @@ struct DeviceOperationInstanceFactory && is_same_v && is_same_v) { add_device_grouped_gemm_multiple_d_xdl_two_stage_bf16_bf16_bf16_mk_nk_mn_instances( op_ptrs); + add_device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_nk_mn_irregular_pv1_inter_instances( + op_ptrs); + add_device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_nk_mn_irregular_pv1_instances( + op_ptrs); + add_device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_nk_mn_irregular_pv2_instances( + op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_km_kn_mn_irregular_pv1_inter_instances( + op_ptrs); + add_device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_km_kn_mn_irregular_pv1_instances( + op_ptrs); + add_device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_km_kn_mn_irregular_pv2_instances( + op_ptrs); } } #endif diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_instance.hpp new file mode 100644 index 000000000..7721e42c3 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_instance.hpp @@ -0,0 +1,138 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp" +#include "ck/utility/loop_scheduler.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F16 = ck::half_t; +using BF16 = ck::bhalf_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using Empty_Tuple = ck::Tuple<>; +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto PipelineV1 = ck::PipelineVersion::v1; +static constexpr auto PipelineV2 = ck::PipelineVersion::v2; +static constexpr auto DefaultScheduler = ck::LoopScheduler::Default; +static constexpr auto InterwaveScheduler = ck::LoopScheduler::Interwave; +static constexpr auto GemmMNKPadding = device::GemmSpecialization::MNKPadding; +static constexpr auto GemmDefault = device::GemmSpecialization::Default; + +template = false> +using device_grouped_gemm_xdl_splitk_2Bt_rrr_instances = std::tuple< + // clang-format off + //################################| A| B| Ds| E| AData| BData| AccData| CShuffle| DsData| EData| A| B| C| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| Pipeline | Loop | + //################################| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| 
Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| Version | Scheduler | + //################################| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| | | + //################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, T, T, F32, T, Empty_Tuple, T, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, Pipeline, Scheduler>, + DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, T, T, F32, T, Empty_Tuple, T, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 4, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, Pipeline, Scheduler>, + DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, T, T, F32, T, Empty_Tuple, T, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 256, 192, 64, 32, 8, 8, 32, 32, 3, 1, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, Pipeline, Scheduler>, + DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, T, T, F32, T, Empty_Tuple, T, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 256, 64, 192, 32, 8, 8, 32, 32, 1, 3, 
S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 48, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, Pipeline, Scheduler>, + DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, T, T, F32, T, Empty_Tuple, T, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, Pipeline, Scheduler>, + DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, T, T, F32, T, Empty_Tuple, T, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, Pipeline, Scheduler>, + DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, T, T, F32, T, Empty_Tuple, T, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, Pipeline, Scheduler>, + DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, T, T, F32, T, Empty_Tuple, T, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 4, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, Pipeline, Scheduler>, + DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, T, T, F32, T, Empty_Tuple, T, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, Pipeline, Scheduler>, + DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, T, T, F32, T, 
Empty_Tuple, T, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 4, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, Pipeline, Scheduler>, + DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, T, T, F32, T, Empty_Tuple, T, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 128, 32, 192, 32, 8, 8, 32, 32, 1, 3, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 24, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, Pipeline, Scheduler>, + DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, T, T, F32, T, Empty_Tuple, T, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 128, 192, 32, 32, 8, 8, 32, 32, 3, 1, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, Pipeline, Scheduler>, + DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, T, T, F32, T, Empty_Tuple, T, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 128, 32, 64, 32, 8, 8, 32, 32, 1, 1, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, Pipeline, Scheduler>, + DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, T, T, F32, T, Empty_Tuple, T, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 128, 64, 32, 32, 8, 8, 32, 32, 1, 1, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, Pipeline, Scheduler>, + DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, T, T, F32, T, Empty_Tuple, T, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 4, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, Pipeline, 
Scheduler>, + DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, T, T, F32, T, Empty_Tuple, T, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, Pipeline, Scheduler>, + DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, T, T, F32, T, Empty_Tuple, T, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 128, 64, 64, 32, 8, 8, 32, 32, 2, 1, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, Pipeline, Scheduler>, + DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, T, T, F32, T, Empty_Tuple, T, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 64, 64, 64, 32, 8, 8, 32, 32, 2, 2, S<1, 4, 16, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 16, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, Pipeline, Scheduler>, + DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, T, T, F32, T, Empty_Tuple, T, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<1, 4, 16, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 16, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, Pipeline, Scheduler>, + DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, T, T, F32, T, Empty_Tuple, T, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S<1, 4, 16, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 16, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, Pipeline, Scheduler> + // clang-format on + >; + +template = false> +using device_grouped_gemm_xdl_splitk_2Bt_rcr_instances = std::tuple< + // clang-format off + //################################| A| B| Ds| E| AData| BData| AccData| CShuffle| DsData| EData| A| B| C| 
GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| Pipeline | Loop | + //################################| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| Version | Scheduler | + //################################| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| | | + //################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGroupedGemmXdlSplitKCShuffle< Row, Col, Empty_Tuple, Row, T, T, F32, T, Empty_Tuple, T, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, Pipeline, Scheduler>, + DeviceGroupedGemmXdlSplitKCShuffle< Row, Col, Empty_Tuple, Row, T, T, F32, T, Empty_Tuple, T, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, 
Pipeline, Scheduler>, + DeviceGroupedGemmXdlSplitKCShuffle< Row, Col, Empty_Tuple, Row, T, T, F32, T, Empty_Tuple, T, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, Pipeline, Scheduler>, + DeviceGroupedGemmXdlSplitKCShuffle< Row, Col, Empty_Tuple, Row, T, T, F32, T, Empty_Tuple, T, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, Pipeline, Scheduler>, + DeviceGroupedGemmXdlSplitKCShuffle< Row, Col, Empty_Tuple, Row, T, T, F32, T, Empty_Tuple, T, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, Pipeline, Scheduler>, + DeviceGroupedGemmXdlSplitKCShuffle< Row, Col, Empty_Tuple, Row, T, T, F32, T, Empty_Tuple, T, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, Pipeline, Scheduler>, + DeviceGroupedGemmXdlSplitKCShuffle< Row, Col, Empty_Tuple, Row, T, T, F32, T, Empty_Tuple, T, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 64, 64, 64, 32, 8, 8, 32, 32, 2, 2, S<1, 4, 16, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 16, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, Pipeline, Scheduler>, + DeviceGroupedGemmXdlSplitKCShuffle< Row, Col, Empty_Tuple, Row, T, T, F32, T, Empty_Tuple, T, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 
3, 8, 8, 1, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, Pipeline, Scheduler>, + DeviceGroupedGemmXdlSplitKCShuffle< Row, Col, Empty_Tuple, Row, T, T, F32, T, Empty_Tuple, T, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, Pipeline, Scheduler>, + DeviceGroupedGemmXdlSplitKCShuffle< Row, Col, Empty_Tuple, Row, T, T, F32, T, Empty_Tuple, T, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, Pipeline, Scheduler>, + DeviceGroupedGemmXdlSplitKCShuffle< Row, Col, Empty_Tuple, Row, T, T, F32, T, Empty_Tuple, T, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, Pipeline, Scheduler>, + DeviceGroupedGemmXdlSplitKCShuffle< Row, Col, Empty_Tuple, Row, T, T, F32, T, Empty_Tuple, T, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<1, 4, 16, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 16, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, Pipeline, Scheduler>, + DeviceGroupedGemmXdlSplitKCShuffle< Row, Col, Empty_Tuple, Row, T, T, F32, T, Empty_Tuple, T, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S<1, 4, 16, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 16, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, Pipeline, Scheduler> + // clang-format on + >; + +template = false> +using device_grouped_gemm_xdl_splitk_2Bt_crr_instances = std::tuple< + // clang-format off + 
//################################| A| B| Ds| E| AData| BData| AccData| CShuffle| DsData| EData| A| B| C| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| Pipeline | Loop | + //################################| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| Version | Scheduler | + //################################| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| | | + //################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGroupedGemmXdlSplitKCShuffle< Col, Row, Empty_Tuple, Row, T, T, F32, T, Empty_Tuple, T, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 4, 8, 1, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, Pipeline, Scheduler>, + DeviceGroupedGemmXdlSplitKCShuffle< Col, Row, Empty_Tuple, Row, T, T, F32, T, Empty_Tuple, T, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 
3, 2>, 2, 2, 8, 1, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 4, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, Pipeline, Scheduler>, + DeviceGroupedGemmXdlSplitKCShuffle< Col, Row, Empty_Tuple, Row, T, T, F32, T, Empty_Tuple, T, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 256, 192, 64, 32, 8, 8, 32, 32, 3, 1, S<1, 4, 48, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 4, 8, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, Pipeline, Scheduler>, + DeviceGroupedGemmXdlSplitKCShuffle< Col, Row, Empty_Tuple, Row, T, T, F32, T, Empty_Tuple, T, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 256, 64, 192, 32, 8, 8, 32, 32, 1, 3, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 1, 8, 1, S<1, 4, 48, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, Pipeline, Scheduler>, + DeviceGroupedGemmXdlSplitKCShuffle< Col, Row, Empty_Tuple, Row, T, T, F32, T, Empty_Tuple, T, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 4, 8, 1, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, Pipeline, Scheduler>, + DeviceGroupedGemmXdlSplitKCShuffle< Col, Row, Empty_Tuple, Row, T, T, F32, T, Empty_Tuple, T, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 4, 8, 1, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, Pipeline, Scheduler>, + DeviceGroupedGemmXdlSplitKCShuffle< Col, Row, Empty_Tuple, Row, T, T, F32, T, Empty_Tuple, T, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 1, 8, 1, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, Pipeline, Scheduler>, + DeviceGroupedGemmXdlSplitKCShuffle< Col, Row, Empty_Tuple, Row, T, T, F32, T, Empty_Tuple, T, PassThrough, PassThrough, 
PassThrough, GemmSpec, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 4, 8, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 4, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, Pipeline, Scheduler>, + DeviceGroupedGemmXdlSplitKCShuffle< Col, Row, Empty_Tuple, Row, T, T, F32, T, Empty_Tuple, T, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 4, 8, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, Pipeline, Scheduler>, + DeviceGroupedGemmXdlSplitKCShuffle< Col, Row, Empty_Tuple, Row, T, T, F32, T, Empty_Tuple, T, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 4, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, Pipeline, Scheduler>, + DeviceGroupedGemmXdlSplitKCShuffle< Col, Row, Empty_Tuple, Row, T, T, F32, T, Empty_Tuple, T, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 128, 32, 192, 32, 8, 8, 32, 32, 1, 3, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 1, 8, 1, S<1, 4, 24, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, Pipeline, Scheduler>, + DeviceGroupedGemmXdlSplitKCShuffle< Col, Row, Empty_Tuple, Row, T, T, F32, T, Empty_Tuple, T, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 128, 192, 32, 32, 8, 8, 32, 32, 3, 1, S<1, 2, 48, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 4, 8, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, Pipeline, Scheduler>, + DeviceGroupedGemmXdlSplitKCShuffle< Col, Row, Empty_Tuple, Row, T, T, F32, T, Empty_Tuple, T, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 128, 32, 64, 32, 8, 8, 32, 32, 1, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 1, 8, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, Pipeline, Scheduler>, + 
DeviceGroupedGemmXdlSplitKCShuffle< Col, Row, Empty_Tuple, Row, T, T, F32, T, Empty_Tuple, T, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 128, 64, 32, 32, 8, 8, 32, 32, 1, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, Pipeline, Scheduler>, + DeviceGroupedGemmXdlSplitKCShuffle< Col, Row, Empty_Tuple, Row, T, T, F32, T, Empty_Tuple, T, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 1, 8, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 4, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, Pipeline, Scheduler>, + DeviceGroupedGemmXdlSplitKCShuffle< Col, Row, Empty_Tuple, Row, T, T, F32, T, Empty_Tuple, T, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 4, 8, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, Pipeline, Scheduler>, + DeviceGroupedGemmXdlSplitKCShuffle< Col, Row, Empty_Tuple, Row, T, T, F32, T, Empty_Tuple, T, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 128, 64, 64, 32, 8, 8, 32, 32, 2, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, Pipeline, Scheduler>, + DeviceGroupedGemmXdlSplitKCShuffle< Col, Row, Empty_Tuple, Row, T, T, F32, T, Empty_Tuple, T, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 64, 64, 64, 32, 8, 8, 32, 32, 2, 2, S<1, 4, 16, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 4, 8, 1, S<1, 4, 16, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, Pipeline, Scheduler>, + DeviceGroupedGemmXdlSplitKCShuffle< Col, Row, Empty_Tuple, Row, T, T, F32, T, Empty_Tuple, T, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<1, 4, 16, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 4, 8, 1, S<1, 4, 16, 1>, 
S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, Pipeline, Scheduler>, + DeviceGroupedGemmXdlSplitKCShuffle< Col, Row, Empty_Tuple, Row, T, T, F32, T, Empty_Tuple, T, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S<1, 4, 16, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, S<1, 4, 16, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, Pipeline, Scheduler> + // clang-format on + >; + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/grouped_gemm/CMakeLists.txt index de2032194..4a3e1a4ad 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_gemm/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/grouped_gemm/CMakeLists.txt @@ -4,12 +4,30 @@ add_instance_library(device_grouped_gemm_instance device_grouped_gemm_xdl_f16_f16_f16_mk_nk_mn_instance.cpp device_grouped_gemm_xdl_f16_f16_f16_km_kn_mn_instance.cpp device_grouped_gemm_xdl_f16_f16_f16_km_nk_mn_instance.cpp - device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instance.cpp + device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instance.cpp - device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_instance.cpp device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_irregular_instance.cpp + + device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instance.cpp + device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_pv1_inter.cpp + device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_pv1.cpp + device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_pv2.cpp + + device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_km_kn_mn_irregular_pv1_inter.cpp + device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_km_kn_mn_irregular_pv1.cpp + device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_km_kn_mn_irregular_pv2.cpp + + 
device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_kn_mn_irregular_pv1_inter.cpp + device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_kn_mn_irregular_pv1.cpp + device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_kn_mn_irregular_pv2.cpp + + device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_nk_mn_irregular_pv1_inter.cpp + device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_nk_mn_irregular_pv1.cpp + device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_nk_mn_irregular_pv2.cpp + device_grouped_gemm_xdl_splitk_f16_f8_f16_mk_kn_mn_irregular_instance.cpp device_grouped_gemm_xdl_splitk_f8_f16_f16_mk_kn_mn_irregular_instance.cpp + device_grouped_gemm_multiple_d_splitk_xdl_two_stage_f16_f16_f16_mk_kn_mn_instance.cpp device_grouped_gemm_multiple_d_splitk_xdl_two_stage_bf16_bf16_bf16_mk_kn_mn_instance.cpp device_grouped_gemm_multiple_d_splitk_xdl_two_stage_bf16_bf16_bf16_mk_nk_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_km_kn_mn_irregular_pv1.cpp b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_km_kn_mn_irregular_pv1.cpp new file mode 100644 index 000000000..b8a03871c --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_km_kn_mn_irregular_pv1.cpp @@ -0,0 +1,32 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +/* Registration entry point: forwards the caller's `instances` vector, together with a default-constructed device_grouped_gemm_xdl_splitk_2Bt_crr_instances object, to add_device_operation_instances(). Presumably that helper appends one device operation per tuple element; confirm in add_device_operation_instance.hpp. NOTE(review): the angle-bracket template argument lists are missing from this copy (`std::vector>>&`, bare alias name), so the vector element type and whatever alias arguments distinguish this `pv1` file from its `pv1_inter` and `pv2` siblings cannot be read here; restore them from the upstream commit before compiling. */ void add_device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_km_kn_mn_irregular_pv1_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, device_grouped_gemm_xdl_splitk_2Bt_crr_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_km_kn_mn_irregular_pv1_inter.cpp b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_km_kn_mn_irregular_pv1_inter.cpp new file mode 100644 index 000000000..10141165c --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_km_kn_mn_irregular_pv1_inter.cpp @@ -0,0 +1,36 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+ +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +/* Registration entry point: forwards the caller's `instances` vector, together with a default-constructed device_grouped_gemm_xdl_splitk_2Bt_crr_instances object, to add_device_operation_instances(). Presumably that helper appends one device operation per tuple element; confirm in add_device_operation_instance.hpp. NOTE(review): the angle-bracket template argument lists are missing from this copy (`std::vector>>&`, bare alias name), so the vector element type and whatever alias arguments distinguish this `pv1_inter` file from its `pv1` and `pv2` siblings cannot be read here; restore them from the upstream commit before compiling. */ void add_device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_km_kn_mn_irregular_pv1_inter_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_grouped_gemm_xdl_splitk_2Bt_crr_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_km_kn_mn_irregular_pv2.cpp b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_km_kn_mn_irregular_pv2.cpp new file mode 100644 index 000000000..b96f5983c --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_km_kn_mn_irregular_pv2.cpp @@ -0,0 +1,33 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+ +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +/* Registration entry point: forwards the caller's `instances` vector, together with a default-constructed device_grouped_gemm_xdl_splitk_2Bt_crr_instances object, to add_device_operation_instances(). Presumably that helper appends one device operation per tuple element; confirm in add_device_operation_instance.hpp. NOTE(review): the angle-bracket template argument lists are missing from this copy (`std::vector>>&`, bare alias name), so the vector element type and whatever alias arguments distinguish this `pv2` file from its `pv1` and `pv1_inter` siblings cannot be read here; restore them from the upstream commit before compiling. */ void add_device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_km_kn_mn_irregular_pv2_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_grouped_gemm_xdl_splitk_2Bt_crr_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_kn_mn_irregular_pv1.cpp b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_kn_mn_irregular_pv1.cpp new file mode 100644 index 000000000..8fad42316 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_kn_mn_irregular_pv1.cpp @@ -0,0 +1,32 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+ +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +/* Registration entry point: forwards the caller's `instances` vector, together with a default-constructed device_grouped_gemm_xdl_splitk_2Bt_rrr_instances object, to add_device_operation_instances(). Presumably that helper appends one device operation per tuple element; confirm in add_device_operation_instance.hpp. NOTE(review): the angle-bracket template argument lists are missing from this copy (`std::vector>>&`, bare alias name), so the vector element type and whatever alias arguments distinguish this `pv1` file from its `pv1_inter` and `pv2` siblings cannot be read here; restore them from the upstream commit before compiling. */ void add_device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_kn_mn_irregular_pv1_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, device_grouped_gemm_xdl_splitk_2Bt_rrr_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_kn_mn_irregular_pv1_inter.cpp b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_kn_mn_irregular_pv1_inter.cpp new file mode 100644 index 000000000..7845136ca --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_kn_mn_irregular_pv1_inter.cpp @@ -0,0 +1,36 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+ +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +/* Registration entry point: forwards the caller's `instances` vector, together with a default-constructed device_grouped_gemm_xdl_splitk_2Bt_rrr_instances object, to add_device_operation_instances(). Presumably that helper appends one device operation per tuple element; confirm in add_device_operation_instance.hpp. NOTE(review): the angle-bracket template argument lists are missing from this copy (`std::vector>>&`, bare alias name), so the vector element type and whatever alias arguments distinguish this `pv1_inter` file from its `pv1` and `pv2` siblings cannot be read here; restore them from the upstream commit before compiling. */ void add_device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_kn_mn_irregular_pv1_inter_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_grouped_gemm_xdl_splitk_2Bt_rrr_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_kn_mn_irregular_pv2.cpp b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_kn_mn_irregular_pv2.cpp new file mode 100644 index 000000000..a2d79edf6 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_kn_mn_irregular_pv2.cpp @@ -0,0 +1,38 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+ +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_instance.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +/* Registration entry point: forwards the caller's `instances` vector, together with a default-constructed device_grouped_gemm_xdl_splitk_2Bt_rrr_instances object, to add_device_operation_instances(). Presumably that helper appends one device operation per tuple element; confirm in add_device_operation_instance.hpp. NOTE(review): the angle-bracket contents are missing from this copy (the first `#include` has no header name, `std::vector>>&`, bare alias name), so the vector element type and whatever alias arguments distinguish this `pv2` file from its `pv1` and `pv1_inter` siblings cannot be read here; restore them from the upstream commit before compiling. */ void add_device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_kn_mn_irregular_pv2_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_grouped_gemm_xdl_splitk_2Bt_rrr_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_nk_mn_irregular_pv1.cpp b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_nk_mn_irregular_pv1.cpp new file mode 100644 index 000000000..033a2929f --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_nk_mn_irregular_pv1.cpp @@ -0,0 +1,32 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+ +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +/* Registration entry point: forwards the caller's `instances` vector, together with a default-constructed device_grouped_gemm_xdl_splitk_2Bt_rcr_instances object, to add_device_operation_instances(). Presumably that helper appends one device operation per tuple element; confirm in add_device_operation_instance.hpp. NOTE(review): the angle-bracket template argument lists are missing from this copy (`std::vector>>&`, bare alias name), so the vector element type and whatever alias arguments distinguish this `pv1` file from its `pv1_inter` and `pv2` siblings cannot be read here; restore them from the upstream commit before compiling. */ void add_device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_nk_mn_irregular_pv1_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, device_grouped_gemm_xdl_splitk_2Bt_rcr_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_nk_mn_irregular_pv1_inter.cpp b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_nk_mn_irregular_pv1_inter.cpp new file mode 100644 index 000000000..cf8c94bf4 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_nk_mn_irregular_pv1_inter.cpp @@ -0,0 +1,36 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+ +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +/* Registration entry point: forwards the caller's `instances` vector, together with a default-constructed device_grouped_gemm_xdl_splitk_2Bt_rcr_instances object, to add_device_operation_instances(). Presumably that helper appends one device operation per tuple element; confirm in add_device_operation_instance.hpp. NOTE(review): the angle-bracket template argument lists are missing from this copy (`std::vector>>&`, bare alias name), so the vector element type and whatever alias arguments distinguish this `pv1_inter` file from its `pv1` and `pv2` siblings cannot be read here; restore them from the upstream commit before compiling. */ void add_device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_nk_mn_irregular_pv1_inter_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_grouped_gemm_xdl_splitk_2Bt_rcr_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_nk_mn_irregular_pv2.cpp b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_nk_mn_irregular_pv2.cpp new file mode 100644 index 000000000..70c0d703e --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_nk_mn_irregular_pv2.cpp @@ -0,0 +1,33 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+ +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +/* Registration entry point: forwards the caller's `instances` vector, together with a default-constructed device_grouped_gemm_xdl_splitk_2Bt_rcr_instances object, to add_device_operation_instances(). Presumably that helper appends one device operation per tuple element; confirm in add_device_operation_instance.hpp. NOTE(review): the angle-bracket template argument lists are missing from this copy (`std::vector>>&`, bare alias name), so the vector element type and whatever alias arguments distinguish this `pv2` file from its `pv1` and `pv1_inter` siblings cannot be read here; restore them from the upstream commit before compiling. */ void add_device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_nk_mn_irregular_pv2_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_grouped_gemm_xdl_splitk_2Bt_rcr_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instance.cpp index 98e476f8b..077a8a18c 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instance.cpp @@ -1,53 +1,14 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" -#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" -#include "ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp" +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_instance.hpp" namespace ck { namespace tensor_operation { namespace device { namespace instance { -using F16 = ck::half_t; -using F32 = float; - -using Row = ck::tensor_layout::gemm::RowMajor; -using Col = ck::tensor_layout::gemm::ColumnMajor; - -template -using S = ck::Sequence; - -using Empty_Tuple = ck::Tuple<>; - -using PassThrough = ck::tensor_operation::element_wise::PassThrough; - -static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; - -// a[m, k] * b[k, n] = e[m, n] -using device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instances = std::tuple< - // clang-format off - //################################| A| B| Ds| E| AData| BData| AccData| CShuffle| DsData| EData| A| B| C| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //################################| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //################################| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | 
PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 4, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 4, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, 
PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 4, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8> - // clang-format on - >; - void add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instances( std::vector>>& instances) { - add_device_operation_instances(instances, - device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instances{}); + add_device_operation_instances( + instances, device_grouped_gemm_xdl_splitk_2Bt_rrr_instances{}); } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_instance.cpp deleted file mode 100644 index ed0a8c7b7..000000000 --- a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_instance.cpp +++ /dev/null @@ -1,123 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. 
- -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" -#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" -#include "ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp" - -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -using F16 = ck::half_t; -using F32 = float; - -using Row = ck::tensor_layout::gemm::RowMajor; -using Col = ck::tensor_layout::gemm::ColumnMajor; - -template -using S = ck::Sequence; - -using Empty_Tuple = ck::Tuple<>; - -using PassThrough = ck::tensor_operation::element_wise::PassThrough; -static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - -using device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_tile_instances = std::tuple< - // clang-format off - //################################| A| B| Ds| E| AData| BData| AccData| CShuffle| DsData| EData| A| B| C| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //################################| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //################################| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | 
| | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, PipelineVersion::v1>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 4, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, PipelineVersion::v1>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 256, 192, 64, 32, 8, 8, 32, 32, 3, 1, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, PipelineVersion::v1>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 256, 64, 192, 32, 8, 8, 32, 32, 1, 3, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 48, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, PipelineVersion::v1>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<1, 4, 64, 1>, 
S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, PipelineVersion::v1>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, PipelineVersion::v1>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, PipelineVersion::v1>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 4, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, PipelineVersion::v1>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, PipelineVersion::v1>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 4, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, PipelineVersion::v1>, - 
DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 32, 192, 32, 8, 8, 32, 32, 1, 3, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 24, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, PipelineVersion::v1>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 192, 32, 32, 8, 8, 32, 32, 3, 1, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, PipelineVersion::v1>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 32, 64, 32, 8, 8, 32, 32, 1, 1, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, PipelineVersion::v1>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 64, 32, 32, 8, 8, 32, 32, 1, 1, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, PipelineVersion::v1>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 4, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, PipelineVersion::v1>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 128, 32, 32, 8, 8, 
32, 32, 2, 1, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, PipelineVersion::v1>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 64, 64, 32, 8, 8, 32, 32, 2, 1, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, PipelineVersion::v1>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 64, 64, 64, 32, 8, 8, 32, 32, 2, 2, S<1, 4, 16, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 16, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, PipelineVersion::v1>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<1, 4, 16, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 16, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, PipelineVersion::v1>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S<1, 4, 16, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 16, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, PipelineVersion::v1>, - - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, PipelineVersion::v1, 
LoopScheduler::Interwave>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 4, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, PipelineVersion::v1, LoopScheduler::Interwave>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 256, 192, 64, 32, 8, 8, 32, 32, 3, 1, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, PipelineVersion::v1, LoopScheduler::Interwave>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 256, 64, 192, 32, 8, 8, 32, 32, 1, 3, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 48, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, PipelineVersion::v1, LoopScheduler::Interwave>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, PipelineVersion::v1, LoopScheduler::Interwave>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, PipelineVersion::v1, LoopScheduler::Interwave>, - 
DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, PipelineVersion::v1, LoopScheduler::Interwave>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 4, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, PipelineVersion::v1, LoopScheduler::Interwave>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, PipelineVersion::v1, LoopScheduler::Interwave>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 4, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, PipelineVersion::v1, LoopScheduler::Interwave>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 32, 192, 32, 8, 8, 32, 32, 1, 3, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 24, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, PipelineVersion::v1, LoopScheduler::Interwave>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, 
Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 192, 32, 32, 8, 8, 32, 32, 3, 1, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, PipelineVersion::v1, LoopScheduler::Interwave>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 32, 64, 32, 8, 8, 32, 32, 1, 1, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, PipelineVersion::v1, LoopScheduler::Interwave>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 64, 32, 32, 8, 8, 32, 32, 1, 1, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, PipelineVersion::v1, LoopScheduler::Interwave>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 4, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, PipelineVersion::v1, LoopScheduler::Interwave>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, PipelineVersion::v1, LoopScheduler::Interwave>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, 
PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 64, 64, 32, 8, 8, 32, 32, 2, 1, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, PipelineVersion::v1, LoopScheduler::Interwave>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 64, 64, 64, 32, 8, 8, 32, 32, 2, 2, S<1, 4, 16, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 16, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, PipelineVersion::v1, LoopScheduler::Interwave>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<1, 4, 16, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 16, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, PipelineVersion::v1, LoopScheduler::Interwave>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S<1, 4, 16, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 16, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, PipelineVersion::v1, LoopScheduler::Interwave>, - - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, PipelineVersion::v2>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 256, 128, 256, 32, 8, 8, 
32, 32, 2, 4, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 4, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, PipelineVersion::v2>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 256, 192, 64, 32, 8, 8, 32, 32, 3, 1, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, PipelineVersion::v2>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 256, 64, 192, 32, 8, 8, 32, 32, 1, 3, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 48, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, PipelineVersion::v2>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, PipelineVersion::v2>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, PipelineVersion::v2>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, PipelineVersion::v2>, - 
DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 4, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, PipelineVersion::v2>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, PipelineVersion::v2>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 4, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, PipelineVersion::v2>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 32, 192, 32, 8, 8, 32, 32, 1, 3, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 24, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, PipelineVersion::v2>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 192, 32, 32, 8, 8, 32, 32, 3, 1, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, PipelineVersion::v2>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 32, 64, 32, 8, 8, 
32, 32, 1, 1, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, PipelineVersion::v2>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 64, 32, 32, 8, 8, 32, 32, 1, 1, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, PipelineVersion::v2>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 4, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, PipelineVersion::v2>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, PipelineVersion::v2>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 64, 64, 32, 8, 8, 32, 32, 2, 1, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, PipelineVersion::v2>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 64, 64, 64, 32, 8, 8, 32, 32, 2, 2, S<1, 4, 16, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 16, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, PipelineVersion::v2>, - 
DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<1, 4, 16, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 16, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, PipelineVersion::v2>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S<1, 4, 16, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 16, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, PipelineVersion::v2> - // clang-format on - >; - -void add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_instances( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_tile_instances{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_pv1.cpp b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_pv1.cpp new file mode 100644 index 000000000..8ad4736ac --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_pv1.cpp @@ -0,0 +1,32 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_pv1_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, device_grouped_gemm_xdl_splitk_2Bt_rrr_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_pv1_inter.cpp b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_pv1_inter.cpp new file mode 100644 index 000000000..1d968c821 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_pv1_inter.cpp @@ -0,0 +1,36 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_pv1_inter_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_grouped_gemm_xdl_splitk_2Bt_rrr_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_pv2.cpp b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_pv2.cpp new file mode 100644 index 000000000..ee3d7d73b --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_pv2.cpp @@ -0,0 +1,33 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_pv2_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_grouped_gemm_xdl_splitk_2Bt_rrr_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instance.cpp index aa6365cd9..085e74f0c 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instance.cpp @@ -1,57 +1,14 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" -#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" -#include "ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp" +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_instance.hpp" namespace ck { namespace tensor_operation { namespace device { namespace instance { -using F16 = ck::half_t; -using F32 = float; - -using Row = ck::tensor_layout::gemm::RowMajor; -using Col = ck::tensor_layout::gemm::ColumnMajor; - -template -using S = ck::Sequence; - -using Empty_Tuple = ck::Tuple<>; - -using PassThrough = ck::tensor_operation::element_wise::PassThrough; -static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; - -// a[m, k] * b[n, k] = e[m, n] -using device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instances = std::tuple< - // clang-format off - //################################| A| B| Ds| E| AData| BData| AccData| CShuffle| DsData| EData| A| B| C| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //################################| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //################################| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | 
PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceGroupedGemmXdlSplitKCShuffle< Row, Col, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Col, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Col, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Col, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Col, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Col, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, 
PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Col, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 64, 64, 64, 32, 8, 8, 32, 32, 2, 2, S<1, 4, 16, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 16, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Col, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Col, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Col, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Col, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Col, Empty_Tuple, Row, F16, 
F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<1, 4, 16, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 16, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Col, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S<1, 4, 16, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 16, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8> - // clang-format on - >; - void add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instances( std::vector>>& instances) { - add_device_operation_instances(instances, - device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instances{}); + add_device_operation_instances( + instances, device_grouped_gemm_xdl_splitk_2Bt_rcr_instances{}); } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_irregular_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_irregular_instance.cpp index f4460b360..320bb933b 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_irregular_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_irregular_instance.cpp @@ -1,63 +1,14 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" -#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" -#include "ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp" +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. 
All rights reserved. #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_instance.hpp" namespace ck { namespace tensor_operation { namespace device { namespace instance { -using F16 = ck::half_t; -using F32 = float; - -using Row = ck::tensor_layout::gemm::RowMajor; -using Col = ck::tensor_layout::gemm::ColumnMajor; - -template -using S = ck::Sequence; - -using Empty_Tuple = ck::Tuple<>; - -using PassThrough = ck::tensor_operation::element_wise::PassThrough; -static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - -using device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_irregular_tile_instances = std::tuple< - // clang-format off - //################################| A| B| Ds| E| AData| BData| AccData| CShuffle| DsData| EData| A| B| C| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //################################| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //################################| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| 
PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceGroupedGemmXdlSplitKCShuffle< Row, Col, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Col, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 256, 192, 64, 32, 8, 8, 32, 32, 3, 1, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Col, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 256, 64, 192, 32, 8, 8, 32, 32, 1, 3, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 48, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 4>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Col, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Col, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Col, Empty_Tuple, Row, F16, F16, F32, F16, 
Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Col, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Col, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Col, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Col, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 192, 32, 32, 8, 8, 32, 32, 3, 1, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Col, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 32, 192, 32, 8, 8, 32, 32, 1, 3, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, - 
DeviceGroupedGemmXdlSplitKCShuffle< Row, Col, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Col, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Col, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 32, 256, 32, 8, 8, 32, 32, 1, 4, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Col, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 32, 64, 32, 8, 8, 32, 32, 1, 1, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Col, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 64, 32, 32, 8, 8, 32, 32, 1, 1, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Col, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 64, 64, 32, 8, 8, 32, 32, 2, 1, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 
2, 1, 3>, 3, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Col, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 64, 64, 64, 32, 8, 8, 32, 32, 2, 2, S<1, 4, 16, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 16, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Col, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<1, 4, 16, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 16, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Col, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S<1, 4, 16, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 16, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8> - // clang-format on - >; - void add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_irregular_instances( std::vector>>& instances) { add_device_operation_instances( - instances, device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_irregular_tile_instances{}); + instances, device_grouped_gemm_xdl_splitk_2Bt_rcr_instances{}); } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm_tile_loop/device_grouped_gemm_xdl_tile_loop_multiply_bf16_i8_bf16_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_gemm_tile_loop/device_grouped_gemm_xdl_tile_loop_multiply_bf16_i8_bf16_mk_kn_mn_instance.cpp deleted file mode 100644 index c98328e52..000000000 --- a/library/src/tensor_operation_instance/gpu/grouped_gemm_tile_loop/device_grouped_gemm_xdl_tile_loop_multiply_bf16_i8_bf16_mk_kn_mn_instance.cpp +++ /dev/null @@ -1,234 +0,0 @@ -// SPDX-License-Identifier: MIT -// 
Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" -#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" -#include "ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_xdl_cshuffle_tile_loop.hpp" - -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -using BF16 = ck::bhalf_t; -using I8 = int8_t; -using F32 = float; - -using Row = ck::tensor_layout::gemm::RowMajor; -using Col = ck::tensor_layout::gemm::ColumnMajor; - -template -using S = ck::Sequence; - -using PassThrough = ck::tensor_operation::element_wise::PassThrough; -using Multiply = ck::tensor_operation::element_wise::Multiply; -using MultiplyAddFastGelu = ck::tensor_operation::element_wise::MultiplyAddFastGelu; -using MultiplyFastGelu = ck::tensor_operation::element_wise::MultiplyFastGelu; -using MultiplyAdd = ck::tensor_operation::element_wise::MultiplyAdd; - -static constexpr auto GemmDefault = GemmSpecialization::Default; -static constexpr auto GemmKPadding = GemmSpecialization::KPadding; -static constexpr auto GemmMNPadding = GemmSpecialization::MNPadding; -static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding; - -static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave; -static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave; - -template -using device_grouped_gemm_xdl_tile_loop_bf16_i8_bf16_mk_kn_mn_comp_instances = std::tuple< - // clang-format off - //###########################################| A| B| Ds| E| AData| BData| AccData| CShuffle| DsData| EData| A| B| C| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| 
BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //###########################################| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //###########################################| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //###########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | S, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 32, 1, 8>, S<8,8,1>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>, - // DeviceGroupedGemmMultipleDXdlCShuffleTileLoop< Row, Row, DsLayout, Row, BF16, I8, F32, F32, DsDataType, BF16, PassThrough, PassThrough, CDEElementwiseOp, GemmSpec, 1, 256, 128, 128, 64, 8, 4, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 32, 1, 8>, S<8,8,1>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>, - // DeviceGroupedGemmMultipleDXdlCShuffleTileLoop< Row, Row, DsLayout, Row, BF16, I8, F32, F32, DsDataType, BF16, PassThrough, PassThrough, CDEElementwiseOp, GemmSpec, 1, 256, 256, 256, 32, 8, 4, 32, 32, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 32, 1, 
8>, S<8,8,1>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>, - // DeviceGroupedGemmMultipleDXdlCShuffleTileLoop< Row, Row, DsLayout, Row, BF16, I8, F32, F32, DsDataType, BF16, PassThrough, PassThrough, CDEElementwiseOp, GemmSpec, 1, 256, 256, 256, 32, 8, 4, 32, 32, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 32, 1, 8>, S<8,8,1>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, - // DeviceGroupedGemmMultipleDXdlCShuffleTileLoop< Row, Row, DsLayout, Row, BF16, I8, F32, F32, DsDataType, BF16, PassThrough, PassThrough, CDEElementwiseOp, GemmSpec, 1, 256, 224, 256, 64, 8, 4, 16, 16, 7, 8, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 16, 4, 0, 1, 2, S<1, 32, 1, 8>, S<8,8,1>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, - // DeviceGroupedGemmMultipleDXdlCShuffleTileLoop< Row, Row, DsLayout, Row, BF16, I8, F32, F32, DsDataType, BF16, PassThrough, PassThrough, CDEElementwiseOp, GemmSpec, 1, 256, 128, 128, 64, 8, 4, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 32, 1, 8>, S<8,8,1>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, - // DeviceGroupedGemmMultipleDXdlCShuffleTileLoop< Row, Row, DsLayout, Row, BF16, I8, F32, F32, DsDataType, BF16, PassThrough, PassThrough, CDEElementwiseOp, GemmSpec, 1, 256, 128, 256, 32, 8, 4, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 32, 1, 8>, S<8,8,1>, BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>, - DeviceGroupedGemmMultipleDXdlCShuffleTileLoop< Row, Row, DsLayout, Row, BF16, I8, F32, F32, DsDataType, BF16, PassThrough, PassThrough, CDEElementwiseOp, GemmSpec, 1, 256, 128, 128, 64, 8, 4, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 16, 
1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 32, 1, 8>, S<8,8,1>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1> - - // clang-format on - >; - -template -using device_grouped_gemm_xdl_tile_loop_bf16_i8_bf16_mk_kn_mn_mem_instances = - std::tuple< - // clang-format off - //###########################################| A| B| Ds| E| AData| BData| AccData| CShuffle| DsData| EData| A| B| C| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //###########################################| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //###########################################| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //###########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | S, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<64, 1, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 16, 4, 0, 1, 1, S<1, 16, 1, 4>, S<4,4,1>, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, - // DeviceGroupedGemmMultipleDXdlCShuffleTileLoop< Row, Row, DsLayout, Row, BF16, I8, F32, F32, DsDataType, BF16, 
PassThrough, PassThrough, CDEElementwiseOp, GemmSpec, 1, 128, 16, 32, 256, 8, 4, 16, 16, 1, 1, S<32, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<64, 2, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 16, 4, 0, 1, 1, S<1, 16, 1, 8>, S<4,4,1>, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, - // Memory friendly - // DeviceGroupedGemmMultipleDXdlCShuffleTileLoop< Row, Row, DsLayout, Row, BF16, I8, F32, F32, DsDataType, BF16, PassThrough, PassThrough, CDEElementwiseOp, GemmSpec, 1, 64, 16, 16, 256, 8, 4, 16, 16, 1, 1, S<32, 2, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<64, 1, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 16, 4, 0, 1, 1, S<1, 16, 1, 4>, S<4,4,1>, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - // DeviceGroupedGemmMultipleDXdlCShuffleTileLoop< Row, Row, DsLayout, Row, BF16, I8, F32, F32, DsDataType, BF16, PassThrough, PassThrough, CDEElementwiseOp, GemmSpec, 1, 128, 16, 32, 256, 8, 4, 16, 16, 1, 1, S<32, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<64, 2, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 16, 4, 0, 1, 1, S<1, 16, 1, 8>, S<4,4,1>, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGroupedGemmMultipleDXdlCShuffleTileLoop< Row, Row, DsLayout, Row, BF16, I8, F32, F32, DsDataType, BF16, PassThrough, PassThrough, CDEElementwiseOp, GemmSpec, 1, 128, 16, 64, 128, 8, 4, 16, 16, 1, 2, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<32, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 16, 4, 0, 1, 1, S<1, 16, 1, 8>, S<4,4,1>, BlkGemmPipeSched, BlockGemmPipelineVersion::v2> - // DeviceGroupedGemmMultipleDXdlCShuffleTileLoop< Row, Row, DsLayout, Row, BF16, I8, F32, F32, DsDataType, BF16, PassThrough, PassThrough, CDEElementwiseOp, GemmSpec, 1, 128, 32, 64, 128, 8, 4, 32, 32, 1, 1, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<32, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 16, 4, 0, 1, 1, S<1, 16, 1, 8>, S<8,8,1>, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - // DeviceGroupedGemmMultipleDXdlCShuffleTileLoop< Row, Row, DsLayout, Row, BF16, I8, F32, F32, DsDataType, BF16, PassThrough, PassThrough, 
CDEElementwiseOp, GemmSpec, 1, 128, 16, 128, 64, 8, 4, 16, 16, 1, 4, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 16, 4, 0, 1, 1, S<1, 16, 1, 8>, S<4,4,1>, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - // DeviceGroupedGemmMultipleDXdlCShuffleTileLoop< Row, Row, DsLayout, Row, BF16, I8, F32, F32, DsDataType, BF16, PassThrough, PassThrough, CDEElementwiseOp, GemmSpec, 1, 128, 32, 128, 64, 8, 4, 32, 32, 1, 2, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 16, 4, 0, 1, 1, S<1, 16, 1, 8>, S<8,8,1>, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - // DeviceGroupedGemmMultipleDXdlCShuffleTileLoop< Row, Row, DsLayout, Row, BF16, I8, F32, F32, DsDataType, BF16, PassThrough, PassThrough, CDEElementwiseOp, GemmSpec, 1, 256, 16, 256, 64, 8, 4, 16, 16, 1, 4, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 16, 4, 0, 1, 1, S<1, 16, 1, 16>, S<4,4,1>, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - // DeviceGroupedGemmMultipleDXdlCShuffleTileLoop< Row, Row, DsLayout, Row, BF16, I8, F32, F32, DsDataType, BF16, PassThrough, PassThrough, CDEElementwiseOp, GemmSpec, 1, 256, 32, 256, 64, 8, 4, 32, 32, 1, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 16, 4, 0, 1, 1, S<1, 16, 1, 16>, S<8,8,1>, BlkGemmPipeSched, BlockGemmPipelineVersion::v2> - // clang-format on - >; - -void add_device_grouped_gemm_xdl_tile_loop_multiply_bf16_i8_bf16_mk_kn_mn_instances( - std::vector, - Row, - BF16, - I8, - ck::Tuple, - BF16, - PassThrough, - PassThrough, - Multiply>>>& instances) -{ - // comp - add_device_operation_instances( - instances, - device_grouped_gemm_xdl_tile_loop_bf16_i8_bf16_mk_kn_mn_comp_instances, - ck::Tuple, - Multiply, - GemmDefault>{}); - add_device_operation_instances( - instances, - device_grouped_gemm_xdl_tile_loop_bf16_i8_bf16_mk_kn_mn_comp_instances, - ck::Tuple, - Multiply, - GemmMNKPadding>{}); - 
add_device_operation_instances( - instances, - device_grouped_gemm_xdl_tile_loop_bf16_i8_bf16_mk_kn_mn_comp_instances, - ck::Tuple, - Multiply, - GemmMNPadding>{}); - add_device_operation_instances( - instances, - device_grouped_gemm_xdl_tile_loop_bf16_i8_bf16_mk_kn_mn_comp_instances, - ck::Tuple, - Multiply, - GemmKPadding>{}); - // mem - add_device_operation_instances( - instances, - device_grouped_gemm_xdl_tile_loop_bf16_i8_bf16_mk_kn_mn_mem_instances, - ck::Tuple, - Multiply, - GemmDefault, - Intrawave>{}); - add_device_operation_instances( - instances, - device_grouped_gemm_xdl_tile_loop_bf16_i8_bf16_mk_kn_mn_mem_instances, - ck::Tuple, - Multiply, - GemmMNKPadding, - Intrawave>{}); - add_device_operation_instances( - instances, - device_grouped_gemm_xdl_tile_loop_bf16_i8_bf16_mk_kn_mn_mem_instances, - ck::Tuple, - Multiply, - GemmMNPadding, - Intrawave>{}); - add_device_operation_instances( - instances, - device_grouped_gemm_xdl_tile_loop_bf16_i8_bf16_mk_kn_mn_mem_instances, - ck::Tuple, - Multiply, - GemmKPadding, - Intrawave>{}); - - add_device_operation_instances( - instances, - device_grouped_gemm_xdl_tile_loop_bf16_i8_bf16_mk_kn_mn_mem_instances, - ck::Tuple, - Multiply, - GemmDefault, - Interwave>{}); - add_device_operation_instances( - instances, - device_grouped_gemm_xdl_tile_loop_bf16_i8_bf16_mk_kn_mn_mem_instances, - ck::Tuple, - Multiply, - GemmMNKPadding, - Interwave>{}); - add_device_operation_instances( - instances, - device_grouped_gemm_xdl_tile_loop_bf16_i8_bf16_mk_kn_mn_mem_instances, - ck::Tuple, - Multiply, - GemmMNPadding, - Interwave>{}); - add_device_operation_instances( - instances, - device_grouped_gemm_xdl_tile_loop_bf16_i8_bf16_mk_kn_mn_mem_instances, - ck::Tuple, - Multiply, - GemmKPadding, - Interwave>{}); -} - -void add_device_grouped_gemm_xdl_tile_loop_multiply_bias_fastgelu_bf16_i8_bf16_mk_kn_mn_instances( - std::vector, - Row, - BF16, - I8, - ck::Tuple, - BF16, - PassThrough, - PassThrough, - MultiplyAddFastGelu>>>& instances) 
-{ - add_device_operation_instances( - instances, - device_grouped_gemm_xdl_tile_loop_bf16_i8_bf16_mk_kn_mn_irregular_tile_instances< - ck::Tuple, - ck::Tuple, - MultiplyAddFastGelu>{}); -} - -void add_device_grouped_gemm_xdl_tile_loop_multiply_fastgelu_bf16_i8_bf16_mk_kn_mn_instances( - std::vector, - Row, - BF16, - I8, - ck::Tuple, - BF16, - PassThrough, - PassThrough, - MultiplyFastGelu>>>& instances) -{ - add_device_operation_instances( - instances, - device_grouped_gemm_xdl_tile_loop_bf16_i8_bf16_mk_kn_mn_irregular_tile_instances< - ck::Tuple, - ck::Tuple, - MultiplyFastGelu>{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/profiler/include/profiler/profile_grouped_gemm_impl.hpp b/profiler/include/profiler/profile_grouped_gemm_impl.hpp index 0b73e4fcd..c10cd0ea9 100644 --- a/profiler/include/profiler/profile_grouped_gemm_impl.hpp +++ b/profiler/include/profiler/profile_grouped_gemm_impl.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -17,7 +17,6 @@ #include "ck/library/utility/convolution_parameter.hpp" #include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/host_tensor.hpp" -#include "ck/library/utility/host_tensor_generator.hpp" #include "ck/library/utility/literals.hpp" #include "ck/library/utility/fill.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" @@ -42,11 +41,14 @@ bool profile_grouped_gemm_impl(int do_verification, const std::vector& StrideAs, const std::vector& StrideBs, const std::vector& StrideCs, - int kbatch = 1, - int n_warmup = 1, - int n_iter = 10) + const std::vector& kbatches = {}, + int n_warmup = 1, + int n_iter = 10) { bool pass = true; + // TODO: Fixme - we do not pass compute data type here but need it + // to compute error thresholds. 
+ using ComputeDataType = ADataType; auto f_host_tensor_descriptor = [](std::size_t row, std::size_t col, std::size_t stride, auto layout) { @@ -75,6 +77,7 @@ bool profile_grouped_gemm_impl(int do_verification, std::vector> c_m_n_host_results; std::vector> c_m_n_device_results; + ComputeDataType max_abs_in_val = 0.f; for(std::size_t i = 0; i < group_count; i++) { a_m_k.push_back( @@ -93,17 +96,18 @@ bool profile_grouped_gemm_impl(int do_verification, << i << "]:" << b_k_n[i].mDesc << ", c_m_n_device_results[" << i << "]:" << c_m_n_device_results[i].mDesc << std::endl; } - std::size_t num_thread = 1; switch(init_method) { case 0: break; case 1: - a_m_k[i].GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); - b_k_n[i].GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); + ck::utils::FillUniformDistributionIntegerValue{-2.f, 2.f}(a_m_k[i]); + ck::utils::FillUniformDistributionIntegerValue{-2.f, 2.f}(b_k_n[i]); + max_abs_in_val = 2.f; break; default: - a_m_k[i].GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}, num_thread); - b_k_n[i].GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}, num_thread); + ck::utils::FillUniformDistribution{-0.5f, 0.5f}(a_m_k[i]); + ck::utils::FillUniformDistribution{-0.5f, 0.5f}(b_k_n[i]); + max_abs_in_val = 0.5f; } } @@ -164,7 +168,20 @@ bool profile_grouped_gemm_impl(int do_verification, BElementOp, CElementOp>; - const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + // If kbatch would be bigger than 1, then we will use SplitK version. 
+ using DeviceOpSplitK = ck::tensor_operation::device::DeviceGroupedGemmSplitK, + CLayout, + ADataType, + BDataType, + ck::Tuple<>, + CDataType, + AElementOp, + BElementOp, + CElementOp>; + + auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< DeviceOp>::GetInstances(); if(op_ptrs.size() <= 0) @@ -205,7 +222,6 @@ bool profile_grouped_gemm_impl(int do_verification, ref_invoker.Run(ref_argument); } } - // profile device GEMM instances for(auto& gemm_ptr : op_ptrs) { @@ -221,43 +237,44 @@ bool profile_grouped_gemm_impl(int do_verification, auto invoker_ptr = gemm_ptr->MakeInvokerPointer(); - DeviceMem gemm_desc_workspace(gemm_ptr->GetWorkSpaceSize(argument_ptr.get())); + std::size_t workspace_size = gemm_ptr->GetWorkSpaceSize(argument_ptr.get()); + std::size_t kargs_size = gemm_ptr->GetDeviceKernelArgSize(argument_ptr.get()); - gemm_ptr->SetWorkSpacePointer(argument_ptr.get(), gemm_desc_workspace.GetDeviceBuffer()); - std::string gemm_name = gemm_ptr->GetTypeString(); + DeviceMem gemm_workspace, gemm_kargs; - using DeviceOpSplitK = ck::tensor_operation::device::DeviceGroupedGemmSplitK, - CLayout, - ADataType, - BDataType, - ck::Tuple<>, - CDataType, - AElementOp, - BElementOp, - CElementOp>; - - // skip non-splitk grouped_gemm - if(dynamic_cast(gemm_ptr.get()) == nullptr) + // The following is necessary since TwoStage kernel is using additional memory both + // for Workspace and kernel arguments. 
+ if(kargs_size > 0) { - continue; + gemm_kargs.Realloc(kargs_size); + gemm_ptr->SetDeviceKernelArgs(argument_ptr.get(), gemm_kargs.GetDeviceBuffer()); + } + if(workspace_size > 0 && workspace_size != kargs_size) + { + gemm_workspace.Realloc(workspace_size); + gemm_ptr->SetWorkSpacePointer(argument_ptr.get(), gemm_workspace.GetDeviceBuffer()); } + std::string gemm_name = gemm_ptr->GetTypeString(); + std::vector kbatch_list = {1, 2, 4, 8, 12, 16, 20, 24, 32, 48, 64}; - if(kbatch > 0) + // If the user will provide not empty kbatches list, then we test predefined set of kbatch + // values. + if(!kbatches.empty()) { - kbatch_list = {kbatch}; + kbatch_list = kbatches; } for(std::size_t j = 0; j < kbatch_list.size(); j++) { - auto kbatch_curr = kbatch_list[j]; - dynamic_cast(gemm_ptr.get()) - ->SetKBatchSize(argument_ptr.get(), kbatch_curr); + if(kbatch_curr > 1 && dynamic_cast(gemm_ptr.get()) != nullptr) + { + dynamic_cast(gemm_ptr.get()) + ->SetKBatchSize(argument_ptr.get(), kbatch_curr); + } if(gemm_ptr->IsSupportedArgument(argument_ptr.get())) { @@ -272,23 +289,18 @@ bool profile_grouped_gemm_impl(int do_verification, bool instance_pass = true; for(std::size_t i = 0; i < gemm_descs.size(); i++) { - c_device_buf[i]->FromDevice(c_m_n_device_results[i].mData.data()); - - if(std::is_same_v && kbatch_curr > 1) - { - instance_pass = - instance_pass && ck::utils::check_err(c_m_n_device_results[i], - c_m_n_host_results[i], - "Error: Incorrect results!", - 0.06); - } - else - { - instance_pass = - instance_pass && ck::utils::check_err(c_m_n_device_results[i], - c_m_n_host_results[i]); - } + auto atol = ck::utils::get_absolute_threshold( + max_abs_in_val, gemm_descs[i].K_); + auto rtol = ck::utils::get_relative_threshold( + gemm_descs[i].K_); + + instance_pass = + instance_pass && ck::utils::check_err(c_m_n_device_results[i], + c_m_n_host_results[i], + "Error: Incorrect results!", + rtol, + atol); if(do_log) { @@ -311,11 +323,12 @@ bool profile_grouped_gemm_impl(int 
do_verification, pass = pass && instance_pass; } - float ave_time = invoker_ptr->Run( - argument_ptr.get(), StreamConfig{nullptr, time_kernel, 0, n_warmup, n_iter}); - if(time_kernel) { + float ave_time = + invoker_ptr->Run(argument_ptr.get(), + StreamConfig{nullptr, time_kernel, 0, n_warmup, n_iter}); + std::size_t flop = 0, num_btype = 0; for(std::size_t i = 0; i < gemm_descs.size(); i++) { diff --git a/profiler/include/profiler/profile_grouped_gemm_multiply_tile_loop_impl.hpp b/profiler/include/profiler/profile_grouped_gemm_multiply_tile_loop_impl.hpp index f66564416..94ee2a37e 100644 --- a/profiler/include/profiler/profile_grouped_gemm_multiply_tile_loop_impl.hpp +++ b/profiler/include/profiler/profile_grouped_gemm_multiply_tile_loop_impl.hpp @@ -143,8 +143,7 @@ bool profile_grouped_gemm_multiply_tile_loop_impl(int do_verification, p_ds.reserve(group_count); p_e.reserve(group_count); - using KernelArguments = - ck::tensor_operation::device::GroupedGemmTileLoopKernelArguments; + using KernelArguments = ck::tensor_operation::device::GroupedGemmKernelArgument; std::vector gemm_descs; std::vector gemm_kargs; diff --git a/profiler/include/profiler/profile_grouped_gemm_tile_loop_impl.hpp b/profiler/include/profiler/profile_grouped_gemm_tile_loop_impl.hpp index 74faf15be..3a4ca24dd 100644 --- a/profiler/include/profiler/profile_grouped_gemm_tile_loop_impl.hpp +++ b/profiler/include/profiler/profile_grouped_gemm_tile_loop_impl.hpp @@ -127,7 +127,7 @@ bool profile_grouped_gemm_tile_loop_impl(int do_verification, p_b.reserve(group_count); p_c.reserve(group_count); - using KernelArguments = ck::tensor_operation::device::GroupedGemmTileLoopKernelArguments<>; + using KernelArguments = ck::tensor_operation::device::GroupedGemmKernelArgument<>; std::vector gemm_descs; std::vector gemm_kargs; diff --git a/profiler/include/profiler/profile_grouped_gemm_two_stage_impl.hpp b/profiler/include/profiler/profile_grouped_gemm_two_stage_impl.hpp deleted file mode 100644 index 
14df96d50..000000000 --- a/profiler/include/profiler/profile_grouped_gemm_two_stage_impl.hpp +++ /dev/null @@ -1,367 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -#pragma once - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" -#include "ck/tensor_operation/gpu/device/device_grouped_gemm.hpp" -#include "ck/tensor_operation/gpu/device/device_grouped_gemm_splitk.hpp" -#include "ck/tensor_operation/gpu/device/device_grouped_gemm_multiple_d_splitk.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" - -#include "ck/library/tensor_operation_instance/gpu/grouped_gemm.hpp" - -#include "ck/library/utility/check_err.hpp" -#include "ck/library/utility/convolution_parameter.hpp" -#include "ck/library/utility/device_memory.hpp" -#include "ck/library/utility/host_tensor.hpp" -#include "ck/library/utility/host_tensor_generator.hpp" -#include "ck/library/utility/literals.hpp" -#include "ck/library/utility/fill.hpp" -#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" - -namespace ck { -namespace profiler { - -template -bool profile_grouped_gemm_two_stage_impl(int do_verification, - int init_method, - bool do_log, - bool time_kernel, - const std::vector& Ms, - const std::vector& Ns, - const std::vector& Ks, - const std::vector& StrideAs, - const std::vector& StrideBs, - const std::vector& StrideCs, - int kbatch = 1, - int n_warmup = 1, - int n_iter = 10) -{ - bool pass = true; - - auto f_host_tensor_descriptor = - [](std::size_t row, std::size_t col, std::size_t stride, auto layout) { - using namespace ck::literals; - - if(is_same::value) - { - return HostTensorDescriptor({row, col}, {stride, 1_uz}); - } - else - { - return HostTensorDescriptor({row, col}, {1_uz, stride}); - } - }; - - std::size_t group_count = Ms.size(); - - if(!(group_count == Ns.size() && group_count == Ks.size() && group_count == StrideAs.size() 
&& - group_count == StrideBs.size() && group_count == StrideCs.size())) - { - throw std::runtime_error("wrong! inconsistent M/N/Ks, StrideA/B/Cs size\n"); - } - - std::vector> a_m_k; - std::vector> b_k_n; - std::vector> c_m_n_host_results; - std::vector> c_m_n_device_results; - - for(std::size_t i = 0; i < group_count; i++) - { - a_m_k.push_back( - Tensor(f_host_tensor_descriptor(Ms[i], Ks[i], StrideAs[i], ALayout{}))); - b_k_n.push_back( - Tensor(f_host_tensor_descriptor(Ks[i], Ns[i], StrideBs[i], BLayout{}))); - - c_m_n_device_results.push_back( - Tensor(f_host_tensor_descriptor(Ms[i], Ns[i], StrideCs[i], CLayout{}))); - - c_m_n_host_results.push_back( - Tensor(f_host_tensor_descriptor(Ms[i], Ns[i], StrideCs[i], CLayout{}))); - if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING))) - { - std::cout << "group: " << i << " a_m_k[" << i << "]:" << a_m_k[i].mDesc << ", b_k_n[" - << i << "]:" << b_k_n[i].mDesc << ", c_m_n_device_results[" << i - << "]:" << c_m_n_device_results[i].mDesc << std::endl; - } - std::size_t num_thread = 1; - switch(init_method) - { - case 0: break; - case 1: - a_m_k[i].GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); - b_k_n[i].GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); - break; - default: - a_m_k[i].GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}, num_thread); - b_k_n[i].GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}, num_thread); - } - } - - using AElementOp = ck::tensor_operation::element_wise::PassThrough; - using BElementOp = ck::tensor_operation::element_wise::PassThrough; - using CElementOp = ck::tensor_operation::element_wise::PassThrough; - - const auto a_element_op = AElementOp{}; - const auto b_element_op = BElementOp{}; - const auto c_element_op = CElementOp{}; - - using DeviceMemPtr = std::unique_ptr; - std::vector a_device_buf, b_device_buf, c_device_buf; - - a_device_buf.reserve(group_count); - b_device_buf.reserve(group_count); - c_device_buf.reserve(group_count); - - std::vector p_a, p_b; - std::vector p_c; - - 
p_a.reserve(group_count); - p_b.reserve(group_count); - p_c.reserve(group_count); - - std::vector gemm_descs; - - gemm_descs.reserve(group_count); - - for(std::size_t i = 0; i < group_count; i++) - { - a_device_buf.emplace_back( - std::make_unique(sizeof(ADataType) * a_m_k[i].mDesc.GetElementSpaceSize())); - b_device_buf.emplace_back( - std::make_unique(sizeof(BDataType) * b_k_n[i].mDesc.GetElementSpaceSize())); - c_device_buf.emplace_back(std::make_unique( - sizeof(CDataType) * c_m_n_device_results[i].mDesc.GetElementSpaceSize())); - - a_device_buf[i]->ToDevice(a_m_k[i].mData.data()); - b_device_buf[i]->ToDevice(b_k_n[i].mData.data()); - - gemm_descs.push_back({Ms[i], Ns[i], Ks[i], StrideAs[i], StrideBs[i], StrideCs[i], {}}); - - p_a.push_back(a_device_buf[i]->GetDeviceBuffer()); - p_b.push_back(b_device_buf[i]->GetDeviceBuffer()); - p_c.push_back(c_device_buf[i]->GetDeviceBuffer()); - } - - using DeviceOp = ck::tensor_operation::device::DeviceGroupedGemm, - CLayout, - ADataType, - BDataType, - ck::Tuple<>, - CDataType, - AElementOp, - BElementOp, - CElementOp>; - - const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< - DeviceOp>::GetInstances(); - - if(op_ptrs.size() <= 0) - { - throw std::runtime_error("wrong! 
no device GEMM instance found"); - } - - std::string best_gemm_name; - float best_ave_time = 0; - float best_tflops = 0; - float best_gb_per_sec = 0; - float best_kbatch = 0; - - auto p_ds = std::vector>{}; - - if(do_verification) - { - for(std::size_t i = 0; i < gemm_descs.size(); i++) - { - using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm; - - auto ref_gemm = ReferenceGemmInstance{}; - auto ref_invoker = ref_gemm.MakeInvoker(); - - auto ref_argument = ref_gemm.MakeArgument(a_m_k[i], - b_k_n[i], - c_m_n_host_results[i], - a_element_op, - b_element_op, - c_element_op); - - ref_invoker.Run(ref_argument); - } - } - - // profile device GEMM instances - for(auto& gemm_ptr : op_ptrs) - { - auto argument_ptr = - gemm_ptr->MakeArgumentPointer(p_a, - p_b, - p_ds, - p_c, - gemm_descs, - ck::tensor_operation::element_wise::PassThrough{}, - ck::tensor_operation::element_wise::PassThrough{}, - ck::tensor_operation::element_wise::PassThrough{}); - - auto invoker_ptr = gemm_ptr->MakeInvokerPointer(); - - DeviceMem gemm_desc_workspace(gemm_ptr->GetWorkSpaceSize(argument_ptr.get())); - gemm_ptr->SetWorkSpacePointer(argument_ptr.get(), gemm_desc_workspace.GetDeviceBuffer()); - - std::string gemm_name = gemm_ptr->GetTypeString(); - - using DeviceOpSplitK = - ck::tensor_operation::device::DeviceGroupedGemmMultipleDSplitK, - CLayout, - ADataType, - BDataType, - ck::Tuple<>, - CDataType, - AElementOp, - BElementOp, - CElementOp>; - - // skip non-splitk grouped_gemm - if(dynamic_cast(gemm_ptr.get()) == nullptr) - { - continue; - } - - std::vector kbatch_list = {1, 2, 4, 8, 12, 16, 20, 24, 32, 48, 64}; - - if(kbatch > 0) - { - kbatch_list = {kbatch}; - } - - for(std::size_t j = 0; j < kbatch_list.size(); j++) - { - - auto kbatch_curr = kbatch_list[j]; - dynamic_cast(gemm_ptr.get()) - ->SetKBatchSize(argument_ptr.get(), kbatch_curr); - - DeviceMem gemm_arg_dev_mem(dynamic_cast(gemm_ptr.get()) - ->GetDeviceKernelArgSize(argument_ptr.get())); - 
dynamic_cast(gemm_ptr.get()) - ->SetDeviceKernelArgs(argument_ptr.get(), gemm_arg_dev_mem.GetDeviceBuffer()); - - if(gemm_ptr->IsSupportedArgument(argument_ptr.get())) - { - gemm_desc_workspace.SetZero(); - for(std::size_t i = 0; i < gemm_descs.size(); i++) - c_device_buf[i]->SetZero(); - - invoker_ptr->Run(argument_ptr.get(), - StreamConfig{nullptr, false, 0, n_warmup, n_iter}); - if(do_verification) - { - bool instance_pass = true; - for(std::size_t i = 0; i < gemm_descs.size(); i++) - { - c_device_buf[i]->FromDevice(c_m_n_device_results[i].mData.data()); - if(std::is_same_v && kbatch_curr > 1) - { - instance_pass = - instance_pass && ck::utils::check_err(c_m_n_device_results[i], - c_m_n_host_results[i], - "Error: Incorrect results!", - 0.06); - } - else - { - instance_pass = - instance_pass && ck::utils::check_err(c_m_n_device_results[i], - c_m_n_host_results[i]); - } - - if(do_log) - { - LogRangeAsType(std::cout << "a : ", a_m_k[i].mData, ",") - << std::endl; - LogRangeAsType(std::cout << "b: ", b_k_n[i].mData, ",") - << std::endl; - LogRangeAsType( - std::cout << "c_device: ", c_m_n_device_results[i].mData, ",") - << std::endl; - LogRangeAsType( - std::cout << "c_host : ", c_m_n_host_results[i].mData, ",") - << std::endl; - } - } - - std::cout << "Instance: " << gemm_name << " verification " - << (instance_pass ? 
"SUCCEED" : "FAILED") << std::endl; - - pass = pass && instance_pass; - } - float ave_time = invoker_ptr->Run( - argument_ptr.get(), StreamConfig{nullptr, time_kernel, 0, n_warmup, n_iter}); - if(time_kernel) - { - std::size_t flop = 0, num_btype = 0; - for(std::size_t i = 0; i < gemm_descs.size(); i++) - { - flop += std::size_t(2) * Ms[i] * Ns[i] * Ks[i]; - - num_btype += sizeof(ADataType) * Ms[i] * Ks[i] + - sizeof(BDataType) * Ks[i] * Ns[i] + - sizeof(CDataType) * Ms[i] * Ns[i]; - } - - float tflops = static_cast(flop) / 1.E9 / ave_time; - - float gb_per_sec = num_btype / 1.E6 / ave_time; - std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops - << " TFlops, " << gb_per_sec << " GB/s, " << gemm_name << ", KBatch " - << kbatch_curr << std::endl; - - if(tflops > best_tflops) - { - best_gemm_name = gemm_name; - best_tflops = tflops; - best_ave_time = ave_time; - best_gb_per_sec = gb_per_sec; - best_kbatch = kbatch_curr; - } - } - } - else - { - std::cout << "Instance: " << gemm_name << ", does not support this GEMM problem" - << std::endl; - } - } - } - - if(time_kernel) - { - std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, " - << best_gb_per_sec << " GB/s, " << best_gemm_name << ", KBatch = " << best_kbatch - << std::endl; - } - - return pass; -} - -} // namespace profiler -} // namespace ck diff --git a/profiler/src/CMakeLists.txt b/profiler/src/CMakeLists.txt index f079d554b..35e91f817 100644 --- a/profiler/src/CMakeLists.txt +++ b/profiler/src/CMakeLists.txt @@ -43,7 +43,6 @@ if(SUPPORTED_GPU_TARGETS MATCHES "gfx9") list(APPEND PROFILER_SOURCES profile_gemm_add_silu.cpp) list(APPEND PROFILER_SOURCES profile_gemm_add_relu_add_layernorm.cpp) list(APPEND PROFILER_SOURCES profile_grouped_gemm_fixed_nk.cpp) - list(APPEND PROFILER_SOURCES profile_grouped_gemm_two_stage.cpp) list(APPEND PROFILER_SOURCES profile_grouped_gemm_fastgelu.cpp) list(APPEND PROFILER_SOURCES profile_grouped_gemm_tile_loop.cpp) list(APPEND 
PROFILER_SOURCES profile_grouped_gemm_multiply_tile_loop.cpp) diff --git a/profiler/src/profile_grouped_gemm.cpp b/profiler/src/profile_grouped_gemm.cpp index fbf44d720..2adcd6483 100644 --- a/profiler/src/profile_grouped_gemm.cpp +++ b/profiler/src/profile_grouped_gemm.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #include #include @@ -39,16 +39,13 @@ namespace { std::vector argToIntArray(char* input) { std::vector out; - std::istringstream in(input); - std::string item; while(std::getline(in, item, ',')) { out.push_back(std::stoi(item)); } - return out; } @@ -69,7 +66,7 @@ int profile_grouped_gemm(int argc, char* argv[]) << "arg7: time kernel (0=n0, 1=yes)\n" << "arg8 to 13: Ms, Ns, Ks, StrideAs, StrideBs, StrideCs (e.g., 256,256 128,128 64,64 " "64,64 64,64 128,128)\n" - << "arg15: kbatch value (default 1)\n" + << "arg15: kbatch values (default 1)\n" << "optional:\n" << "arg16: number of warm-up cycles (default 1)\n" << "arg17: number of iterations (default 10)\n" @@ -92,7 +89,7 @@ int profile_grouped_gemm(int argc, char* argv[]) const auto StrideAs = argToIntArray(argv[11]); const auto StrideBs = argToIntArray(argv[12]); const auto StrideCs = argToIntArray(argv[13]); - const int kbatch = argc == 15 ? std::stoi(argv[14]) : 1; + const auto kbatches = argc >= 15 ? 
argToIntArray(argv[14]) : std::vector{}; int n_warmup = 1; int n_iter = 10; @@ -102,7 +99,6 @@ int profile_grouped_gemm(int argc, char* argv[]) n_iter = std::stoi(argv[16]); } -#ifdef CK_ENABLE_FP16 if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_KN_MN) { ck::profiler::profile_grouped_gemm_impl(do_verification, + init_method, + do_log, + time_kernel, + Ms, + Ns, + Ks, + StrideAs, + StrideBs, + StrideCs, + kbatches, + n_warmup, + n_iter); + } + else if(data_type == GemmDataType::BF16_BF16_BF16 && layout == GemmMatrixLayout::MK_NK_MN) + { + ck::profiler::profile_grouped_gemm_impl(do_verification, + init_method, + do_log, + time_kernel, + Ms, + Ns, + Ks, + StrideAs, + StrideBs, + StrideCs, + kbatches, + n_warmup, + n_iter); + } + else if(data_type == GemmDataType::BF16_BF16_BF16 && layout == GemmMatrixLayout::KM_KN_MN) + { + ck::profiler::profile_grouped_gemm_impl(do_verification, + init_method, + do_log, + time_kernel, + Ms, + Ns, + Ks, + StrideAs, + StrideBs, + StrideCs, + kbatches, n_warmup, n_iter); } @@ -239,7 +301,6 @@ int profile_grouped_gemm(int argc, char* argv[]) { throw std::runtime_error("wrong! this GEMM data_type & layout is not implemented"); } -#endif return 0; } diff --git a/profiler/src/profile_grouped_gemm_fixed_nk.cpp b/profiler/src/profile_grouped_gemm_fixed_nk.cpp index de90a33ef..e33d79850 100644 --- a/profiler/src/profile_grouped_gemm_fixed_nk.cpp +++ b/profiler/src/profile_grouped_gemm_fixed_nk.cpp @@ -32,9 +32,7 @@ namespace { std::vector argToIntArray(char* input) { std::vector out; - std::istringstream in(input); - std::string item; while(std::getline(in, item, ',')) @@ -83,7 +81,7 @@ int profile_grouped_gemm_fixed_nk(int argc, char* argv[]) const auto StrideAs = argToIntArray(argv[11]); const auto StrideBs = argToIntArray(argv[12]); const auto StrideCs = argToIntArray(argv[13]); - const int kbatch = argc == 15 ? std::stoi(argv[14]) : 1; + const int kbatch = argc >= 15 ? 
std::stoi(argv[14]) : 1; using F32 = float; using F16 = ck::half_t; @@ -97,8 +95,8 @@ int profile_grouped_gemm_fixed_nk(int argc, char* argv[]) int n_iter = 10; if(argc == 17) { - n_warmup = std::stoi(argv[16]); - n_iter = std::stoi(argv[17]); + n_warmup = std::stoi(argv[15]); + n_iter = std::stoi(argv[16]); } #if defined(CK_ENABLE_BF16) && defined(CK_ENABLE_INT8) diff --git a/profiler/src/profile_grouped_gemm_two_stage.cpp b/profiler/src/profile_grouped_gemm_two_stage.cpp deleted file mode 100644 index db37a0b76..000000000 --- a/profiler/src/profile_grouped_gemm_two_stage.cpp +++ /dev/null @@ -1,228 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. - -#include -#include -#include -#include - -#include "profiler/profile_grouped_gemm_two_stage_impl.hpp" -#include "profiler_operation_registry.hpp" - -enum struct GemmMatrixLayout -{ - MK_KN_MN, // 0 - MK_NK_MN, // 1 -}; - -enum struct GemmDataType -{ - F16_F16_F16, // 0 - BF16_INT8_BF16, // 1 - BF16_BF16_BF16 // 2 -}; - -#define OP_NAME "grouped_gemm_two_stage" -#define OP_DESC "Grouped GEMM TwoStage" - -namespace { - -std::vector argToIntArray(char* input) -{ - std::vector out; - - std::istringstream in(input); - - std::string item; - - while(std::getline(in, item, ',')) - { - out.push_back(std::stoi(item)); - } - - return out; -} - -int profile_grouped_gemm_two_stage(int argc, char* argv[]) -{ - if(argc < 14) - { - std::cout - << "arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n" - << "arg2: data type (0: fp16; 1: bf16@int8; 2: bf16)\n" - << "arg3: matrix layout (0: A[m, k] * B[k, n] = C[m, n]);\n" - << "arg4: verification (0: no; 1: yes)\n" - << "arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n" - << "arg6: print tensor value (0: no; 1: yes)\n" - << "arg7: time kernel (0=n0, 1=yes)\n" - << "arg8 to 13: Ms, Ns, Ks, StrideAs, StrideBs, StrideCs (e.g., 256,256 128,128 64,64 " - "64,64 64,64 128,128)\n" - << "arg15: kbatch value 
(default 1)\n" - << "optional:\n" - << "arg16: number of warm-up cycles (default 1)\n" - << "arg17: number of iterations (default 10)\n" - << std::endl; - - exit(1); - } - - const auto data_type = static_cast(std::stoi(argv[2])); - const auto layout = static_cast(std::stoi(argv[3])); - const bool do_verification = std::stoi(argv[4]); - const int init_method = std::stoi(argv[5]); - const bool do_log = std::stoi(argv[6]); - const bool time_kernel = std::stoi(argv[7]); - - const auto Ms = argToIntArray(argv[8]); - const auto Ns = argToIntArray(argv[9]); - const auto Ks = argToIntArray(argv[10]); - - auto StrideAs = argToIntArray(argv[11]); - auto StrideBs = argToIntArray(argv[12]); - auto StrideCs = argToIntArray(argv[13]); - const int kbatch = argc == 15 ? std::stoi(argv[14]) : 1; - - const int DefaultStrideA = Ks[0]; - const int DefaultStrideB = Ns[0]; - const int DefaultStrideC = Ns[0]; - - for(size_t i = 0; i < Ms.size(); ++i) - { - StrideAs[i] = StrideAs[i] == -1 ? DefaultStrideA : StrideAs[i]; - StrideBs[i] = StrideBs[i] == -1 ? DefaultStrideB : StrideBs[i]; - StrideCs[i] = StrideCs[i] == -1 ? 
DefaultStrideC : StrideCs[i]; - } - - int n_warmup = 1; - int n_iter = 10; - if(argc == 17) - { - n_warmup = std::stoi(argv[16]); - n_iter = std::stoi(argv[17]); - } - - if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_KN_MN) - { - ck::profiler::profile_grouped_gemm_two_stage_impl( - do_verification, - init_method, - do_log, - time_kernel, - Ms, - Ns, - Ks, - StrideAs, - StrideBs, - StrideCs, - kbatch, - n_warmup, - n_iter); - } - else if(data_type == GemmDataType::BF16_INT8_BF16 && layout == GemmMatrixLayout::MK_KN_MN) - { - ck::profiler::profile_grouped_gemm_two_stage_impl( - do_verification, - init_method, - do_log, - time_kernel, - Ms, - Ns, - Ks, - StrideAs, - StrideBs, - StrideCs, - kbatch, - n_warmup, - n_iter); - } - else if(data_type == GemmDataType::BF16_INT8_BF16 && layout == GemmMatrixLayout::MK_NK_MN) - { - ck::profiler::profile_grouped_gemm_two_stage_impl( - do_verification, - init_method, - do_log, - time_kernel, - Ms, - Ns, - Ks, - StrideAs, - StrideBs, - StrideCs, - kbatch, - n_warmup, - n_iter); - } - else if(data_type == GemmDataType::BF16_BF16_BF16 && layout == GemmMatrixLayout::MK_KN_MN) - { - ck::profiler::profile_grouped_gemm_two_stage_impl( - do_verification, - init_method, - do_log, - time_kernel, - Ms, - Ns, - Ks, - StrideAs, - StrideBs, - StrideCs, - kbatch, - n_warmup, - n_iter); - } - else if(data_type == GemmDataType::BF16_BF16_BF16 && layout == GemmMatrixLayout::MK_NK_MN) - { - ck::profiler::profile_grouped_gemm_two_stage_impl( - do_verification, - init_method, - do_log, - time_kernel, - Ms, - Ns, - Ks, - StrideAs, - StrideBs, - StrideCs, - kbatch, - n_warmup, - n_iter); - } - else - { - throw std::runtime_error("wrong! 
this GEMM data_type & layout is not implemented"); - } - return 0; -} - -} // anonymous namespace - -REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_grouped_gemm_two_stage); diff --git a/test/grouped_gemm/CMakeLists.txt b/test/grouped_gemm/CMakeLists.txt index 55cb20977..f47685cf9 100644 --- a/test/grouped_gemm/CMakeLists.txt +++ b/test/grouped_gemm/CMakeLists.txt @@ -6,12 +6,6 @@ if(result EQUAL 0) add_dependencies(test_grouped_gemm test_grouped_gemm_splitk) endif() -add_gtest_executable(test_grouped_gemm_two_stage_splitk test_grouped_gemm_two_stage_multiple_d_splitk_xdl.cpp) -if(result EQUAL 0) - target_link_libraries(test_grouped_gemm_two_stage_splitk PRIVATE utility device_grouped_gemm_instance) - add_dependencies(test_grouped_gemm test_grouped_gemm_two_stage_splitk) -endif() - add_gtest_executable(test_grouped_gemm_interface test_grouped_gemm_interface_xdl.cpp) if(result EQUAL 0) target_link_libraries(test_grouped_gemm_interface PRIVATE utility device_grouped_gemm_instance) diff --git a/test/grouped_gemm/test_grouped_gemm_splitk_xdl.cpp b/test/grouped_gemm/test_grouped_gemm_splitk_xdl.cpp index d9282fa92..74d49eb57 100644 --- a/test/grouped_gemm/test_grouped_gemm_splitk_xdl.cpp +++ b/test/grouped_gemm/test_grouped_gemm_splitk_xdl.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
#include #include @@ -10,25 +10,35 @@ #include "gtest/gtest.h" #include "test_grouped_gemm_util.hpp" -using F16 = ck::half_t; +using F16 = ck::half_t; +using BF16 = ck::bhalf_t; +using F8 = ck::f8_t; +using I8 = int8_t; + using Row = ck::tensor_layout::gemm::RowMajor; using Col = ck::tensor_layout::gemm::ColumnMajor; -using RRR_F16_F16_F16 = ck::test::TestGroupedGemm>; -using RCR_F16_F16_F16 = ck::test::TestGroupedGemm>; - -using RRR_F16_F16_F16_LargeK = ck::test::TestGroupedGemm>; -using RCR_F16_F16_F16_LargeK = ck::test::TestGroupedGemm>; - -const std::vector KBATCH{1, 2, 3, 5, 8}; - -INSTANTIATE_TEST_SUITE_P(TestGroupedGemm_splitk_MK_KN, RRR_F16_F16_F16, testing::ValuesIn(KBATCH)); -INSTANTIATE_TEST_SUITE_P(TestGroupedGemm_splitk_MK_NK, RCR_F16_F16_F16, testing::ValuesIn(KBATCH)); -INSTANTIATE_TEST_SUITE_P(TestGroupedGemm_splitk_LargeK_MK_KN, - RRR_F16_F16_F16_LargeK, - testing::Values(32, 64)); -INSTANTIATE_TEST_SUITE_P(TestGroupedGemm_splitk_LargeK_MK_NK, - RCR_F16_F16_F16_LargeK, - testing::Values(32, 64)); +template +class TestGroupedGemm : public ck::test::TestGroupedGemm +{ +}; + +// clang-format off +using KernelTypes = ::testing::Types< + std::tuple< Row, Row, Row, F16, F16, F16>, + std::tuple< Row, Col, Row, F16, F16, F16>, + std::tuple< Col, Row, Row, F16, F16, F16>, + std::tuple< Col, Col, Row, F16, F16, F16>, + std::tuple< Row, Row, Row, BF16, BF16, BF16>, + std::tuple< Row, Col, Row, BF16, BF16, BF16>, + std::tuple< Col, Row, Row, BF16, BF16, BF16>, + std::tuple< Row, Row, Row, BF16, I8, BF16>, + std::tuple< Row, Col, Row, BF16, I8, BF16>, + std::tuple< Row, Row, Row, F16, F8, F16>, + std::tuple< Row, Row, Row, F8, F16, F16> + >; +// clang-format on + +TYPED_TEST_SUITE(TestGroupedGemm, KernelTypes); #include "test_grouped_gemm_ut_cases.inc" diff --git a/test/grouped_gemm/test_grouped_gemm_ut_cases.inc b/test/grouped_gemm/test_grouped_gemm_ut_cases.inc index d94d140d9..f4011cf99 100644 --- a/test/grouped_gemm/test_grouped_gemm_ut_cases.inc +++ 
b/test/grouped_gemm/test_grouped_gemm_ut_cases.inc @@ -1,6 +1,6 @@ #pragma once -TEST_P(RRR_F16_F16_F16, TinyCases) +TYPED_TEST(TestGroupedGemm, TinyCases) { const std::vector Ms{0, 1}; constexpr int N = 768; @@ -8,14 +8,11 @@ TEST_P(RRR_F16_F16_F16, TinyCases) const std::vector Ns(Ms.size(), N); const std::vector Ks(Ms.size(), K); - const std::vector StrideAs(Ms.size(), K); - const std::vector StrideBs(Ms.size(), N); - const std::vector StrideCs(Ms.size(), N); - this->Run(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs, this->GetParam()); + this->Run(Ms, Ns, Ks); } -TEST_P(RRR_F16_F16_F16, SmallCases) +TYPED_TEST(TestGroupedGemm, SmallCases) { const std::vector Ms{2, 1, 3, 4, 5, 0}; constexpr int N = 768; @@ -23,14 +20,11 @@ TEST_P(RRR_F16_F16_F16, SmallCases) const std::vector Ns(Ms.size(), N); const std::vector Ks(Ms.size(), K); - const std::vector StrideAs(Ms.size(), K); - const std::vector StrideBs(Ms.size(), N); - const std::vector StrideCs(Ms.size(), N); - this->Run(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs, this->GetParam()); + this->Run(Ms, Ns, Ks); } -TEST_P(RRR_F16_F16_F16, MidCases) +TYPED_TEST(TestGroupedGemm, MidCases) { const std::vector Ms{167, 183, 177, 153, 139, 204}; constexpr int N = 768; @@ -38,14 +32,11 @@ TEST_P(RRR_F16_F16_F16, MidCases) const std::vector Ns(Ms.size(), N); const std::vector Ks(Ms.size(), K); - const std::vector StrideAs(Ms.size(), K); - const std::vector StrideBs(Ms.size(), N); - const std::vector StrideCs(Ms.size(), N); - this->Run(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs, this->GetParam()); + this->Run(Ms, Ns, Ks); } -TEST_P(RRR_F16_F16_F16, Regular) +TYPED_TEST(TestGroupedGemm, Regular) { const std::vector Ms{64, 128, 256}; constexpr int N = 768; @@ -53,14 +44,11 @@ TEST_P(RRR_F16_F16_F16, Regular) const std::vector Ns(Ms.size(), N); const std::vector Ks(Ms.size(), K); - const std::vector StrideAs(Ms.size(), K); - const std::vector StrideBs(Ms.size(), N); - const std::vector StrideCs(Ms.size(), N); - this->Run(Ms, Ns, Ks, 
StrideAs, StrideBs, StrideCs, this->GetParam()); + this->Run(Ms, Ns, Ks); } -TEST_P(RRR_F16_F16_F16, MNKPadded) +TYPED_TEST(TestGroupedGemm, MNKPadded) { const std::vector Ms{127, 150, 188, 210}; constexpr int N = 136; @@ -68,88 +56,11 @@ TEST_P(RRR_F16_F16_F16, MNKPadded) const std::vector Ns(Ms.size(), N); const std::vector Ks(Ms.size(), K); - const std::vector StrideAs(Ms.size(), K); - const std::vector StrideBs(Ms.size(), N); - const std::vector StrideCs(Ms.size(), N); - this->Run(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs, this->GetParam()); + this->Run(Ms, Ns, Ks); } -TEST_P(RCR_F16_F16_F16, TinyCases) -{ - const std::vector Ms{0, 1}; - constexpr int N = 768; - constexpr int K = 544; - - const std::vector Ns(Ms.size(), N); - const std::vector Ks(Ms.size(), K); - const std::vector StrideAs(Ms.size(), K); - const std::vector StrideBs(Ms.size(), K); - const std::vector StrideCs(Ms.size(), N); - this->Run(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs, this->GetParam()); -} - -TEST_P(RCR_F16_F16_F16, SmallCases) -{ - const std::vector Ms{2, 1, 3, 4, 5, 0}; - constexpr int N = 768; - constexpr int K = 544; - - const std::vector Ns(Ms.size(), N); - const std::vector Ks(Ms.size(), K); - const std::vector StrideAs(Ms.size(), K); - const std::vector StrideBs(Ms.size(), K); - const std::vector StrideCs(Ms.size(), N); - - this->Run(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs, this->GetParam()); -} - -TEST_P(RCR_F16_F16_F16, MidCases) -{ - const std::vector Ms{167, 183, 177, 153, 139, 204}; - constexpr int N = 768; - constexpr int K = 544; - - const std::vector Ns(Ms.size(), N); - const std::vector Ks(Ms.size(), K); - const std::vector StrideAs(Ms.size(), K); - const std::vector StrideBs(Ms.size(), K); - const std::vector StrideCs(Ms.size(), N); - - this->Run(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs, this->GetParam()); -} - -TEST_P(RCR_F16_F16_F16, Regular) -{ - const std::vector Ms{32, 64, 128, 256}; - constexpr int N = 768; - constexpr int K = 320; - - const std::vector 
Ns(Ms.size(), N); - const std::vector Ks(Ms.size(), K); - const std::vector StrideAs(Ms.size(), K); - const std::vector StrideBs(Ms.size(), K); - const std::vector StrideCs(Ms.size(), N); - - this->Run(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs, this->GetParam()); -} - -TEST_P(RCR_F16_F16_F16, MNKPadded) -{ - const std::vector Ms{127, 150, 188, 210}; - constexpr int N = 136; - constexpr int K = 280; - - const std::vector Ns(Ms.size(), N); - const std::vector Ks(Ms.size(), K); - const std::vector StrideAs(Ms.size(), K); - const std::vector StrideBs(Ms.size(), K); - const std::vector StrideCs(Ms.size(), N); - - this->Run(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs, this->GetParam()); -} - -TEST_P(RRR_F16_F16_F16_LargeK, TestLargeKBatch) +TYPED_TEST(TestGroupedGemm, TestLargeKBatch) { const std::vector Ms{188, 210}; constexpr int N = 768; @@ -157,24 +68,8 @@ TEST_P(RRR_F16_F16_F16_LargeK, TestLargeKBatch) const std::vector Ns(Ms.size(), N); const std::vector Ks(Ms.size(), K); - const std::vector StrideAs(Ms.size(), K); - const std::vector StrideBs(Ms.size(), N); - const std::vector StrideCs(Ms.size(), N); - - this->Run(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs, this->GetParam()); -} -TEST_P(RCR_F16_F16_F16_LargeK, TestLargeKBatch) -{ - const std::vector Ms{188, 210}; - constexpr int N = 768; - constexpr int K = 4096; - - const std::vector Ns(Ms.size(), N); - const std::vector Ks(Ms.size(), K); - const std::vector StrideAs(Ms.size(), K); - const std::vector StrideBs(Ms.size(), K); - const std::vector StrideCs(Ms.size(), N); + this->k_batches_ = {32, 64}; - this->Run(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs, this->GetParam()); + this->Run(Ms, Ns, Ks); } diff --git a/test/grouped_gemm/test_grouped_gemm_util.hpp b/test/grouped_gemm/test_grouped_gemm_util.hpp index 9e1395b9f..a3ab0e087 100644 --- a/test/grouped_gemm/test_grouped_gemm_util.hpp +++ b/test/grouped_gemm/test_grouped_gemm_util.hpp @@ -22,7 +22,6 @@ #include "ck/utility/tuple.hpp" #include "ck/utility/number.hpp" 
#include "profiler/profile_grouped_gemm_impl.hpp" -#include "profiler/profile_grouped_gemm_two_stage_impl.hpp" namespace ck { namespace test { @@ -40,7 +39,7 @@ std::string serialize_range(const Range& range) } template -class TestGroupedGemm : public testing::TestWithParam +class TestGroupedGemm : public testing::Test { protected: using ALayout = std::tuple_element_t<0, Tuple>; @@ -50,23 +49,77 @@ class TestGroupedGemm : public testing::TestWithParam using BDataType = std::tuple_element_t<4, Tuple>; using EDataType = std::tuple_element_t<5, Tuple>; + using Row = ck::tensor_layout::gemm::RowMajor; + using Col = ck::tensor_layout::gemm::ColumnMajor; + public: static constexpr bool verify_ = true; - static constexpr int init_method_ = 1; // decimal value initialization + static constexpr int init_method_ = 1; // integer value initialization static constexpr bool log_ = false; static constexpr bool bench_ = false; // measure kernel performance + static constexpr int n_warmup_ = 0; + static constexpr int n_iter_ = 1; + std::vector k_batches_; - void SetUp() override {} + void SetUp() override { k_batches_ = {1, 2, 3, 5, 8}; } + private: + template + void SetStrides(std::vector& strides, + const std::vector& rows, + const std::vector& cols) const + { + if(std::is_same_v) + { + for(const auto c : cols) + { + strides.emplace_back(c); + } + } + else if(std::is_same_v) + { + for(const auto r : rows) + { + strides.emplace_back(r); + } + } + } + + public: void Run(const std::vector& Ms, const std::vector& Ns, const std::vector& Ks, - const std::vector& StrideAs, - const std::vector& StrideBs, - const std::vector& StrideCs, - int kbatch = 1, - int n_warmup = 1, - int n_iter = 10) + const std::vector& StrideAs = {}, + const std::vector& StrideBs = {}, + const std::vector& StrideCs = {}) + { + std::vector stride_as = StrideAs; + std::vector stride_bs = StrideBs; + std::vector stride_cs = StrideCs; + + if(stride_as.empty()) + { + SetStrides(stride_as, Ms, Ks); + } + 
if(stride_bs.empty()) + { + SetStrides(stride_bs, Ks, Ns); + } + if(stride_cs.empty()) + { + SetStrides(stride_cs, Ms, Ns); + } + + RunSingle(Ms, Ns, Ks, stride_as, stride_bs, stride_cs, k_batches_); + } + + void RunSingle(const std::vector& Ms, + const std::vector& Ns, + const std::vector& Ks, + const std::vector& StrideAs, + const std::vector& StrideBs, + const std::vector& StrideCs, + const std::vector& kbatches) { bool pass = ck::profiler::profile_grouped_gemm_impl StrideAs, StrideBs, StrideCs, - kbatch, - n_warmup, - n_iter); - EXPECT_TRUE(pass); - } -}; - -template -class TestGroupedGemmTwoStage : public testing::TestWithParam -{ - protected: - using ALayout = std::tuple_element_t<0, Tuple>; - using BLayout = std::tuple_element_t<1, Tuple>; - using ELayout = std::tuple_element_t<2, Tuple>; - using ADataType = std::tuple_element_t<3, Tuple>; - using BDataType = std::tuple_element_t<4, Tuple>; - using EDataType = std::tuple_element_t<5, Tuple>; - - public: - static constexpr bool verify_ = true; - static constexpr int init_method_ = 1; // decimal value initialization - static constexpr bool log_ = false; - static constexpr bool bench_ = false; // measure kernel performance - - void SetUp() override {} - - void Run(const std::vector& Ms, - const std::vector& Ns, - const std::vector& Ks, - const std::vector& StrideAs, - const std::vector& StrideBs, - const std::vector& StrideCs, - int kbatch = 1, - int n_warmup = 1, - int n_iter = 10) - { - bool pass = ck::profiler::profile_grouped_gemm_two_stage_impl(verify_, - init_method_, - log_, - bench_, - Ms, - Ns, - Ks, - StrideAs, - StrideBs, - StrideCs, - kbatch, - n_warmup, - n_iter); + kbatches, + n_warmup_, + n_iter_); EXPECT_TRUE(pass); } }; @@ -263,7 +264,7 @@ struct DeviceGroupedGemmSplitkInstanceWrapper p_As, p_Bs, p_Ds, p_Cs, gemm_descs, PassThrough{}, PassThrough{}, PassThrough{}); if(kbatch > 1) { - ggemm_instance.SetKBatchSize(argument, kbatch); + ggemm_instance.SetKBatchSize(&argument, kbatch); } return 
ggemm_instance.IsSupportedArgument(argument); @@ -300,13 +301,13 @@ struct DeviceGroupedGemmSplitkInstanceWrapper p_As, p_Bs, p_Ds, p_Cs, gemm_descs, PassThrough{}, PassThrough{}, PassThrough{}); if(kbatch > 1) { - ggemm_instance.SetKBatchSize(argument, kbatch); + ggemm_instance.SetKBatchSize(&argument, kbatch); } EXPECT_TRUE(ggemm_instance.IsSupportedArgument(argument)); auto invoker = ggemm_instance.MakeInvoker(); - DeviceMem gemm_desc_workspace(ggemm_instance.GetWorkSpaceSize(&argument)); - ggemm_instance.SetWorkSpacePointer(&argument, gemm_desc_workspace.GetDeviceBuffer()); + DeviceMem dev_gemm_kargs(ggemm_instance.GetDeviceKernelArgSize(&argument)); + ggemm_instance.SetDeviceKernelArgs(&argument, dev_gemm_kargs.GetDeviceBuffer()); return invoker.Run(argument, StreamConfig{nullptr, false}); } }; -- GitLab From fe6b185b97e9f9875ef470884e9f9fba17be02d5 Mon Sep 17 00:00:00 2001 From: Illia Silin <98187287+illsilin@users.noreply.github.com> Date: Wed, 27 Nov 2024 06:12:56 -0800 Subject: [PATCH 086/153] move utility headers from library/include to include path (#1697) --- codegen/CMakeLists.txt | 1 + {library/include => include}/ck/library/utility/algorithm.hpp | 0 {library/include => include}/ck/library/utility/check_err.hpp | 0 {library/include => include}/ck/library/utility/conv_common.hpp | 0 .../utility/convolution_host_tensor_descriptor_helper.hpp | 0 .../ck/library/utility/convolution_parameter.hpp | 0 .../include => include}/ck/library/utility/device_memory.hpp | 0 {library/include => include}/ck/library/utility/fill.hpp | 0 .../include => include}/ck/library/utility/host_common_util.hpp | 0 {library/include => include}/ck/library/utility/host_gemm.hpp | 0 {library/include => include}/ck/library/utility/host_tensor.hpp | 0 .../ck/library/utility/host_tensor_generator.hpp | 0 {library/include => include}/ck/library/utility/iterator.hpp | 0 {library/include => include}/ck/library/utility/literals.hpp | 0 {library/include => 
include}/ck/library/utility/numeric.hpp | 0 {library/include => include}/ck/library/utility/ranges.hpp | 0 16 files changed, 1 insertion(+) rename {library/include => include}/ck/library/utility/algorithm.hpp (100%) rename {library/include => include}/ck/library/utility/check_err.hpp (100%) rename {library/include => include}/ck/library/utility/conv_common.hpp (100%) rename {library/include => include}/ck/library/utility/convolution_host_tensor_descriptor_helper.hpp (100%) rename {library/include => include}/ck/library/utility/convolution_parameter.hpp (100%) rename {library/include => include}/ck/library/utility/device_memory.hpp (100%) rename {library/include => include}/ck/library/utility/fill.hpp (100%) rename {library/include => include}/ck/library/utility/host_common_util.hpp (100%) rename {library/include => include}/ck/library/utility/host_gemm.hpp (100%) rename {library/include => include}/ck/library/utility/host_tensor.hpp (100%) rename {library/include => include}/ck/library/utility/host_tensor_generator.hpp (100%) rename {library/include => include}/ck/library/utility/iterator.hpp (100%) rename {library/include => include}/ck/library/utility/literals.hpp (100%) rename {library/include => include}/ck/library/utility/numeric.hpp (100%) rename {library/include => include}/ck/library/utility/ranges.hpp (100%) diff --git a/codegen/CMakeLists.txt b/codegen/CMakeLists.txt index 1ca0d1282..45c47672b 100644 --- a/codegen/CMakeLists.txt +++ b/codegen/CMakeLists.txt @@ -7,6 +7,7 @@ set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) set(CK_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/..) 
+configure_file(${CK_ROOT}/include/ck/config.h.in ${CK_ROOT}/include/ck/config.h) find_package(ROCM) include(ROCMInstallTargets) diff --git a/library/include/ck/library/utility/algorithm.hpp b/include/ck/library/utility/algorithm.hpp similarity index 100% rename from library/include/ck/library/utility/algorithm.hpp rename to include/ck/library/utility/algorithm.hpp diff --git a/library/include/ck/library/utility/check_err.hpp b/include/ck/library/utility/check_err.hpp similarity index 100% rename from library/include/ck/library/utility/check_err.hpp rename to include/ck/library/utility/check_err.hpp diff --git a/library/include/ck/library/utility/conv_common.hpp b/include/ck/library/utility/conv_common.hpp similarity index 100% rename from library/include/ck/library/utility/conv_common.hpp rename to include/ck/library/utility/conv_common.hpp diff --git a/library/include/ck/library/utility/convolution_host_tensor_descriptor_helper.hpp b/include/ck/library/utility/convolution_host_tensor_descriptor_helper.hpp similarity index 100% rename from library/include/ck/library/utility/convolution_host_tensor_descriptor_helper.hpp rename to include/ck/library/utility/convolution_host_tensor_descriptor_helper.hpp diff --git a/library/include/ck/library/utility/convolution_parameter.hpp b/include/ck/library/utility/convolution_parameter.hpp similarity index 100% rename from library/include/ck/library/utility/convolution_parameter.hpp rename to include/ck/library/utility/convolution_parameter.hpp diff --git a/library/include/ck/library/utility/device_memory.hpp b/include/ck/library/utility/device_memory.hpp similarity index 100% rename from library/include/ck/library/utility/device_memory.hpp rename to include/ck/library/utility/device_memory.hpp diff --git a/library/include/ck/library/utility/fill.hpp b/include/ck/library/utility/fill.hpp similarity index 100% rename from library/include/ck/library/utility/fill.hpp rename to include/ck/library/utility/fill.hpp diff --git 
a/library/include/ck/library/utility/host_common_util.hpp b/include/ck/library/utility/host_common_util.hpp similarity index 100% rename from library/include/ck/library/utility/host_common_util.hpp rename to include/ck/library/utility/host_common_util.hpp diff --git a/library/include/ck/library/utility/host_gemm.hpp b/include/ck/library/utility/host_gemm.hpp similarity index 100% rename from library/include/ck/library/utility/host_gemm.hpp rename to include/ck/library/utility/host_gemm.hpp diff --git a/library/include/ck/library/utility/host_tensor.hpp b/include/ck/library/utility/host_tensor.hpp similarity index 100% rename from library/include/ck/library/utility/host_tensor.hpp rename to include/ck/library/utility/host_tensor.hpp diff --git a/library/include/ck/library/utility/host_tensor_generator.hpp b/include/ck/library/utility/host_tensor_generator.hpp similarity index 100% rename from library/include/ck/library/utility/host_tensor_generator.hpp rename to include/ck/library/utility/host_tensor_generator.hpp diff --git a/library/include/ck/library/utility/iterator.hpp b/include/ck/library/utility/iterator.hpp similarity index 100% rename from library/include/ck/library/utility/iterator.hpp rename to include/ck/library/utility/iterator.hpp diff --git a/library/include/ck/library/utility/literals.hpp b/include/ck/library/utility/literals.hpp similarity index 100% rename from library/include/ck/library/utility/literals.hpp rename to include/ck/library/utility/literals.hpp diff --git a/library/include/ck/library/utility/numeric.hpp b/include/ck/library/utility/numeric.hpp similarity index 100% rename from library/include/ck/library/utility/numeric.hpp rename to include/ck/library/utility/numeric.hpp diff --git a/library/include/ck/library/utility/ranges.hpp b/include/ck/library/utility/ranges.hpp similarity index 100% rename from library/include/ck/library/utility/ranges.hpp rename to include/ck/library/utility/ranges.hpp -- GitLab From 
e7b6286441aae59d3a87db67f42369d3cc2636a4 Mon Sep 17 00:00:00 2001 From: jakpiase Date: Wed, 27 Nov 2024 18:25:07 +0100 Subject: [PATCH 087/153] Add interwave scheduler for gemm mem pipeline (#1647) * add interwave scheduler for gemm mem pipeline * Fix merge artifacts. * Refactor unit tests. * Switch to interwave scheduler for mem example --------- Co-authored-by: Adam Osewski <19374865+aosewski@users.noreply.github.com> Co-authored-by: Adam Osewski --- example/ck_tile/03_gemm/gemm_mem_pipeline.cpp | 3 +- example/ck_tile/03_gemm/run_gemm_example.inc | 3 +- .../pipeline/gemm_pipeline_ag_bg_cr_mem.hpp | 224 ++++++++++++++++++ test/ck_tile/gemm/test_gemm_mem_pipeline.cpp | 19 +- .../gemm/test_gemm_mem_pipeline_ut_cases.inc | 59 ++++- .../gemm/test_gemm_mem_pipeline_util.hpp | 25 +- 6 files changed, 311 insertions(+), 22 deletions(-) diff --git a/example/ck_tile/03_gemm/gemm_mem_pipeline.cpp b/example/ck_tile/03_gemm/gemm_mem_pipeline.cpp index 97d150412..cd9d9d96b 100644 --- a/example/ck_tile/03_gemm/gemm_mem_pipeline.cpp +++ b/example/ck_tile/03_gemm/gemm_mem_pipeline.cpp @@ -30,7 +30,6 @@ float gemm_calc(const gemm_basic_args& args, const ck_tile::stream_config& s) constexpr ck_tile::index_t M_Warp_Tile = 32; constexpr ck_tile::index_t N_Warp_Tile = 32; constexpr ck_tile::index_t K_Warp_Tile = 8; - #else // Compute friendly for Intrawave scheduler constexpr ck_tile::index_t M_Tile = 256; @@ -84,7 +83,7 @@ float gemm_calc(const gemm_basic_args& args, const ck_tile::stream_config& s) AccDataType, GemmShape, Traits, - ck_tile::GemmPipelineScheduler::Intrawave, + ck_tile::GemmPipelineScheduler::Interwave, has_hot_loop_v, tail_number_v>>; using Kernel = ck_tile::GemmKernel; diff --git a/example/ck_tile/03_gemm/run_gemm_example.inc b/example/ck_tile/03_gemm/run_gemm_example.inc index 5199c1e3e..a1fc15577 100644 --- a/example/ck_tile/03_gemm/run_gemm_example.inc +++ b/example/ck_tile/03_gemm/run_gemm_example.inc @@ -200,7 +200,8 @@ int run_gemm_example(int argc, char* 
argv[]) return run_gemm_example_with_layouts(argc, argv, Row{}, Col{}, Row{}); } // TODO: Fixme: with latest changes to GemmPipelineAGmemBGmemCRegV1DefaultPolicy below do not - // work. else if(a_layout == "C" && b_layout == "C") + // work. + // else if(a_layout == "C" && b_layout == "C") // { // return run_gemm_example_with_layouts(argc, argv, Col{}, Col{}, Row{}); // } diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp index 4634e9dcb..847c5b187 100644 --- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp +++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp @@ -322,6 +322,7 @@ struct GemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem block_gemm(c_block_tile, a_lds_gemm_window, b_lds_gemm_window); block_sync_lds(); + LocalPrefill(a_copy_lds_window, a_block_tiles.get(number{}), a_element_func); @@ -374,6 +375,229 @@ struct GemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem } }; + template <> + struct PipelineImpl + { + template + CK_TILE_DEVICE void GlobalPrefetch(DstBlockTile& dst_block_tile, + SrcTileWindow& dram_tile_window) const + { + load_tile(dst_block_tile, dram_tile_window); + move_tile_window(dram_tile_window, {0, KPerBlock}); + } + + template + CK_TILE_DEVICE void LocalPrefill(DstTileWindow& lds_tile_window, + const SrcBlockTile& src_block_tile, + const ElementFunction& element_func) const + { + const auto block_tile_tmp = tile_elementwise_in(element_func, src_block_tile); + store_tile(lds_tile_window, block_tile_tmp); + } + + template + CK_TILE_DEVICE auto operator()(const ADramBlockWindowTmp& a_dram_block_window_tmp, + const AElementFunction& a_element_func, + const BDramBlockWindowTmp& b_dram_block_window_tmp, + const BElementFunction& b_element_func, + index_t num_loop, + void* p_smem) const + { + static_assert( + std::is_same_v> && + std::is_same_v>, + "A/B Dram block window should have the same data type as 
appropriate " + "([A|B]DataType) defined in Problem definition!"); + + static_assert(MPerBlock == ADramBlockWindowTmp{}.get_window_lengths()[number<0>{}] && + NPerBlock == + BDramBlockWindowTmp{}.get_window_lengths()[number<0>{}] && + KPerBlock == ADramBlockWindowTmp{}.get_window_lengths()[number<1>{}], + "A/B block window appropriate sizes must be equal to MPerBlock/NPerblock" + " or KPerBlock!"); + + // ------------------------------------------------------------------------------------ + // Definitions of all needed tiles + + // A tile in LDS + ADataType* p_a_lds = static_cast(p_smem); + constexpr auto a_lds_block_desc = Policy::template MakeALdsBlockDescriptor(); + auto a_lds_block = make_tensor_view(p_a_lds, a_lds_block_desc); + + // TODO: LDS alignment should come from Policy! + constexpr index_t a_lds_block_space_size_aligned = + integer_divide_ceil(sizeof(ADataType) * a_lds_block_desc.get_element_space_size(), + 16) * + 16; + + // B tile in LDS + BDataType* p_b_lds = static_cast( + static_cast(static_cast(p_smem) + a_lds_block_space_size_aligned)); + constexpr auto b_lds_block_desc = Policy::template MakeBLdsBlockDescriptor(); + auto b_lds_block = make_tensor_view(p_b_lds, b_lds_block_desc); + + // A DRAM tile window for load + auto a_copy_dram_window = + make_tile_window(a_dram_block_window_tmp.get_bottom_tensor_view(), + make_tuple(number{}, number{}), + a_dram_block_window_tmp.get_window_origin(), + Policy::template MakeADramTileDistribution()); + + // A LDS tile window for store + auto a_copy_lds_window = + make_tile_window(a_lds_block, + make_tuple(number{}, number{}), + {0, 0}, + a_copy_dram_window.get_tile_distribution()); + // B DRAM tile window for load + auto b_copy_dram_window = + make_tile_window(b_dram_block_window_tmp.get_bottom_tensor_view(), + make_tuple(number{}, number{}), + b_dram_block_window_tmp.get_window_origin(), + Policy::template MakeBDramTileDistribution()); + + // B LDS tile window for store + auto b_copy_lds_window = + 
make_tile_window(b_lds_block, + make_tuple(number{}, number{}), + {0, 0}, + b_copy_dram_window.get_tile_distribution()); + + // A LDS tile for block GEMM + auto a_lds_gemm_window = make_tile_window( + a_lds_block, make_tuple(number{}, number{}), {0, 0}); + // B LDS tile for block GEMM + auto b_lds_gemm_window = make_tile_window( + b_lds_block, make_tuple(number{}, number{}), {0, 0}); + + // Block GEMM + auto block_gemm = BlockGemm(); + auto c_block_tile = block_gemm.MakeCBlockTile(); + + using ABlockTileDistr = decltype(a_copy_dram_window.get_tile_distribution()); + using BBlockTileDistr = decltype(b_copy_dram_window.get_tile_distribution()); + + using ABlockTile = + decltype(make_static_distributed_tensor(ABlockTileDistr{})); + using BBlockTile = + decltype(make_static_distributed_tensor(BBlockTileDistr{})); + + tuple_array a_block_tiles; + tuple_array b_block_tiles; + + // ----------------------------------------------------------------------------------------- + // Gemm pipeline start + + // prefetch + // global read 0 + GlobalPrefetch(a_block_tiles.get(I0{}), a_copy_dram_window); + GlobalPrefetch(b_block_tiles.get(I0{}), b_copy_dram_window); + + // initialize C + tile_elementwise_inout([](auto& c) { c = 0; }, c_block_tile); + + // LDS write 0 + LocalPrefill(a_copy_lds_window, a_block_tiles.get(I0{}), a_element_func); + LocalPrefill(b_copy_lds_window, b_block_tiles.get(I0{}), b_element_func); + + // Global prefetch [1, PrefetchStages] + static_for<1, PrefetchStages, 1>{}([&](auto prefetch_idx) { + GlobalPrefetch(a_block_tiles.get(number{}), a_copy_dram_window); + GlobalPrefetch(b_block_tiles.get(number{}), b_copy_dram_window); + }); + + // main body + if constexpr(HasHotLoop) + { + index_t i = 0; + do + { + static_for<0, PrefetchStages, 1>{}([&](auto prefetch_idx) { + block_sync_lds(); + block_gemm(c_block_tile, a_lds_gemm_window, b_lds_gemm_window); + // no second block_sync_lds because it's interwave + + LocalPrefill( + a_copy_lds_window, + 
a_block_tiles.get(number<(prefetch_idx + 1) % PrefetchStages>{}), + a_element_func); + LocalPrefill( + b_copy_lds_window, + b_block_tiles.get(number<(prefetch_idx + 1) % PrefetchStages>{}), + b_element_func); + + GlobalPrefetch(a_block_tiles.get(number{}), + a_copy_dram_window); + GlobalPrefetch(b_block_tiles.get(number{}), + b_copy_dram_window); + }); + + i += PrefetchStages; + } while(i < (num_loop - PrefetchStages)); + } + + auto HotLoopTail = [&](auto tail_num) { + static_for<1, tail_num, 1>{}([&](auto prefetch_idx) { + block_sync_lds(); + block_gemm(c_block_tile, a_lds_gemm_window, b_lds_gemm_window); + // no second block_sync_lds because it's interwave + + LocalPrefill(a_copy_lds_window, + a_block_tiles.get(number{}), + a_element_func); + LocalPrefill(b_copy_lds_window, + b_block_tiles.get(number{}), + b_element_func); + }); + + block_sync_lds(); + block_gemm(c_block_tile, a_lds_gemm_window, b_lds_gemm_window); + }; + + if constexpr(TailNum == TailNumber::One) + { + block_sync_lds(); + block_gemm(c_block_tile, a_lds_gemm_window, b_lds_gemm_window); + } + else if constexpr(TailNum == TailNumber::Two) + { + HotLoopTail(number<2>{}); + } + else if constexpr(TailNum == TailNumber::Three) + { + HotLoopTail(number<3>{}); + } + else if constexpr(TailNum == TailNumber::Four) + { + HotLoopTail(number<4>{}); + } + else if constexpr(TailNum == TailNumber::Five) + { + HotLoopTail(number<5>{}); + } + else if constexpr(TailNum == TailNumber::Six) + { + HotLoopTail(number<6>{}); + } + else if constexpr(TailNum == TailNumber::Seven) + { + HotLoopTail(number<7>{}); + } + else if constexpr(TailNum == TailNumber::Full) + { + HotLoopTail(number{}); + } + + return c_block_tile; + } + }; + template +class TestCkTileGemmMemPipelineIntrawave : public TestCkTileGemmMemPipeline +{ +}; + +template +class TestCkTileGemmMemPipelineInterwave : public TestCkTileGemmMemPipeline +{ +}; // clang-format off using KernelTypes = ::testing::Types< @@ -24,6 +36,7 @@ using KernelTypes = 
::testing::Types< >; // clang-format on -TYPED_TEST_SUITE(TestCkTileGemmMemPipeline, KernelTypes); +TYPED_TEST_SUITE(TestCkTileGemmMemPipelineIntrawave, KernelTypes); +TYPED_TEST_SUITE(TestCkTileGemmMemPipelineInterwave, KernelTypes); #include "test_gemm_mem_pipeline_ut_cases.inc" diff --git a/test/ck_tile/gemm/test_gemm_mem_pipeline_ut_cases.inc b/test/ck_tile/gemm/test_gemm_mem_pipeline_ut_cases.inc index b26114f39..6b914e797 100644 --- a/test/ck_tile/gemm/test_gemm_mem_pipeline_ut_cases.inc +++ b/test/ck_tile/gemm/test_gemm_mem_pipeline_ut_cases.inc @@ -1,6 +1,57 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + #pragma once -TYPED_TEST(TestCkTileGemmMemPipeline, SmallM) +//------------------------------------------------------------------------------------------------ +// INTERWAVE SCHEDULER +//------------------------------------------------------------------------------------------------ + +TYPED_TEST(TestCkTileGemmMemPipelineInterwave, SmallM) +{ + std::vector Ms{1, 2, 3, 4, 5, 6}; + constexpr int N = 1024; + constexpr int K = 320; + + for(int M : Ms) + this->Run(M, N, K); +} + +TYPED_TEST(TestCkTileGemmMemPipelineInterwave, MidLargeM) +{ + std::vector Ms{127, 255, 312, 799, 1573}; + constexpr int N = 1024; + constexpr int K = 320; + + for(int M : Ms) + this->Run(M, N, K); +} + +TYPED_TEST(TestCkTileGemmMemPipelineInterwave, PaddK) +{ + std::vector Ms{127}; + constexpr int N = 1024; + constexpr int K = 432; + + for(int M : Ms) + this->Run(M, N, K); +} + +TYPED_TEST(TestCkTileGemmMemPipelineInterwave, Regular) +{ + std::vector Ms{512}; + constexpr int N = 1024; + constexpr int K = 512; + + for(int M : Ms) + this->Run(M, N, K); +} + +//------------------------------------------------------------------------------------------------ +// INTRAWAVE SCHEDULER +//------------------------------------------------------------------------------------------------ + 
+TYPED_TEST(TestCkTileGemmMemPipelineIntrawave, SmallM) { std::vector Ms{1, 2, 3, 4, 5, 6}; constexpr int N = 1024; @@ -10,7 +61,7 @@ TYPED_TEST(TestCkTileGemmMemPipeline, SmallM) this->Run(M, N, K); } -TYPED_TEST(TestCkTileGemmMemPipeline, MidLargeM) +TYPED_TEST(TestCkTileGemmMemPipelineIntrawave, MidLargeM) { std::vector Ms{127, 255, 312, 799, 1573}; constexpr int N = 1024; @@ -20,7 +71,7 @@ TYPED_TEST(TestCkTileGemmMemPipeline, MidLargeM) this->Run(M, N, K); } -TYPED_TEST(TestCkTileGemmMemPipeline, PaddK) +TYPED_TEST(TestCkTileGemmMemPipelineIntrawave, PaddK) { std::vector Ms{127}; constexpr int N = 1024; @@ -30,7 +81,7 @@ TYPED_TEST(TestCkTileGemmMemPipeline, PaddK) this->Run(M, N, K); } -TYPED_TEST(TestCkTileGemmMemPipeline, Regular) +TYPED_TEST(TestCkTileGemmMemPipelineIntrawave, Regular) { std::vector Ms{512}; constexpr int N = 1024; diff --git a/test/ck_tile/gemm/test_gemm_mem_pipeline_util.hpp b/test/ck_tile/gemm/test_gemm_mem_pipeline_util.hpp index 6b4789833..15f9f516e 100644 --- a/test/ck_tile/gemm/test_gemm_mem_pipeline_util.hpp +++ b/test/ck_tile/gemm/test_gemm_mem_pipeline_util.hpp @@ -11,20 +11,21 @@ #include "ck_tile/ops/epilogue.hpp" #include "ck_tile/ops/gemm.hpp" -template +template class TestCkTileGemmMemPipeline : public ::testing::Test { protected: - using ALayout = std::tuple_element_t<0, Tuple>; - using BLayout = std::tuple_element_t<1, Tuple>; - using CLayout = std::tuple_element_t<2, Tuple>; - using ADataType = std::tuple_element_t<3, Tuple>; - using BDataType = std::tuple_element_t<4, Tuple>; - using AccDataType = std::tuple_element_t<5, Tuple>; - using CDataType = std::tuple_element_t<6, Tuple>; + using ALayout = std::tuple_element_t<0, Tuple>; + using BLayout = std::tuple_element_t<1, Tuple>; + using CLayout = std::tuple_element_t<2, Tuple>; + using ADataType = std::tuple_element_t<3, Tuple>; + using BDataType = std::tuple_element_t<4, Tuple>; + using AccDataType = std::tuple_element_t<5, Tuple>; + using CDataType = 
std::tuple_element_t<6, Tuple>; + static constexpr auto Scheduler = Scheduler_; // TODO: expose tile size through test t-param ? - struct gemm_basic_args + struct gemm_args { const void* p_a; const void* p_b; @@ -38,7 +39,7 @@ class TestCkTileGemmMemPipeline : public ::testing::Test ck_tile::index_t stride_C; }; - void invoke_gemm(const gemm_basic_args& args, const ck_tile::stream_config& s) + void invoke_gemm(const gemm_args& args, const ck_tile::stream_config& s) { // TODO: This should be parameterized in tests constexpr ck_tile::index_t M_Tile = 128; @@ -89,7 +90,7 @@ class TestCkTileGemmMemPipeline : public ::testing::Test AccDataType, GemmShape, Traits, - ck_tile::GemmPipelineScheduler::Intrawave, + Scheduler, has_hot_loop_v, tail_number_v>>; using Kernel = ck_tile::GemmKernel; @@ -288,7 +289,7 @@ class TestCkTileGemmMemPipeline : public ::testing::Test c_m_n_dev_buf.SetZero(); c_m_n_dev_result.SetZero(); - gemm_basic_args args; + gemm_args args; args.p_a = a_m_k_dev_buf.GetDeviceBuffer(); args.p_b = b_k_n_dev_buf.GetDeviceBuffer(); args.p_c = c_m_n_dev_buf.GetDeviceBuffer(); -- GitLab From f49b595dc02f3a40b61455c6914e8456b5f42f41 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bart=C5=82omiej=20Kocot?= Date: Thu, 28 Nov 2024 17:51:49 +0100 Subject: [PATCH 088/153] [CK TILE] Add gemm compute pipeline v3 (#1661) * [CK TILE] Add gemm compute pipeline v3 * Enable universal gemm compute pipeline. * Rename example and add compute pipeline. * Introduce ag bg cr pipeline impl base. * Refactor to reuse code. * Cleaning * Formatting. 
--------- Co-authored-by: Adam Osewski <19374865+aosewski@users.noreply.github.com> Co-authored-by: Adam Osewski --- example/ck_tile/03_gemm/CMakeLists.txt | 2 +- ...mm_mem_pipeline.cpp => universal_gemm.cpp} | 25 +- include/ck_tile/ops/gemm.hpp | 2 + .../block/block_universal_gemm_as_bs_cr.hpp | 223 +++++----- .../pipeline/gemm_pipeline_ag_bg_cr_base.hpp | 111 +++++ .../gemm_pipeline_ag_bg_cr_comp_v3.hpp | 383 ++++++++++++++++++ .../pipeline/gemm_pipeline_ag_bg_cr_mem.hpp | 266 ++++-------- 7 files changed, 712 insertions(+), 300 deletions(-) rename example/ck_tile/03_gemm/{gemm_mem_pipeline.cpp => universal_gemm.cpp} (89%) create mode 100644 include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_base.hpp create mode 100644 include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v3.hpp diff --git a/example/ck_tile/03_gemm/CMakeLists.txt b/example/ck_tile/03_gemm/CMakeLists.txt index 8ae46cadc..d166eed45 100644 --- a/example/ck_tile/03_gemm/CMakeLists.txt +++ b/example/ck_tile/03_gemm/CMakeLists.txt @@ -1,2 +1,2 @@ add_executable(tile_example_gemm_basic EXCLUDE_FROM_ALL gemm_basic.cpp) -add_executable(tile_example_gemm_mem_pipeline EXCLUDE_FROM_ALL gemm_mem_pipeline.cpp) +add_executable(tile_example_universal_gemm EXCLUDE_FROM_ALL universal_gemm.cpp) diff --git a/example/ck_tile/03_gemm/gemm_mem_pipeline.cpp b/example/ck_tile/03_gemm/universal_gemm.cpp similarity index 89% rename from example/ck_tile/03_gemm/gemm_mem_pipeline.cpp rename to example/ck_tile/03_gemm/universal_gemm.cpp index cd9d9d96b..eaafc13b9 100644 --- a/example/ck_tile/03_gemm/gemm_mem_pipeline.cpp +++ b/example/ck_tile/03_gemm/universal_gemm.cpp @@ -14,10 +14,17 @@ #include "ck_tile/host.hpp" #include "gemm_basic.hpp" +#define CK_TILE_PIPELINE_COMPUTE 1 +#define CK_TILE_PIPELINE_MEMORY 2 + +#ifndef CK_TILE_PIPELINE_DEFAULT +#define CK_TILE_PIPELINE_DEFAULT CK_TILE_PIPELINE_COMPUTE +#endif + template float gemm_calc(const gemm_basic_args& args, const ck_tile::stream_config& s) { -#if 1 
+#if(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_MEMORY) // Memory friendly for Interwave scheduler constexpr ck_tile::index_t M_Tile = 128; constexpr ck_tile::index_t N_Tile = 32; @@ -30,7 +37,8 @@ float gemm_calc(const gemm_basic_args& args, const ck_tile::stream_config& s) constexpr ck_tile::index_t M_Warp_Tile = 32; constexpr ck_tile::index_t N_Warp_Tile = 32; constexpr ck_tile::index_t K_Warp_Tile = 8; -#else + +#elif(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_COMPUTE) // Compute friendly for Intrawave scheduler constexpr ck_tile::index_t M_Tile = 256; constexpr ck_tile::index_t N_Tile = 256; @@ -63,8 +71,11 @@ float gemm_calc(const gemm_basic_args& args, const ck_tile::stream_config& s) ck_tile::Default2DEpilogueProblem>; using Traits = ck_tile::TileGemmTraits; - +#if(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_MEMORY) using BaseGemmPipeline = ck_tile::BaseGemmPipelineAgBgCrMem< +#elif(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_COMPUTE) + using BaseGemmPipeline = ck_tile::BaseGemmPipelineAgBgCrCompV3< +#endif ck_tile::GemmPipelineProblem>; const ck_tile::index_t num_loop = TilePartitioner::GetLoopNum(args.K); @@ -77,13 +88,21 @@ float gemm_calc(const gemm_basic_args& args, const ck_tile::stream_config& s) constexpr bool has_hot_loop_v = has_hot_loop_.value; constexpr auto tail_number_v = tail_number_.value; +#if(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_MEMORY) using GemmPipeline = ck_tile::GemmPipelineAgBgCrMem< +#elif(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_COMPUTE) + using GemmPipeline = ck_tile::GemmPipelineAgBgCrCompV3< +#endif ck_tile::UniversalGemmPipelineProblem>; using Kernel = ck_tile::GemmKernel; diff --git a/include/ck_tile/ops/gemm.hpp b/include/ck_tile/ops/gemm.hpp index 9a033ee2d..1340fb204 100644 --- a/include/ck_tile/ops/gemm.hpp +++ b/include/ck_tile/ops/gemm.hpp @@ -25,6 +25,8 @@ #include "ck_tile/ops/gemm/block/block_universal_gemm_as_bs_cr.hpp" #include "ck_tile/ops/gemm/kernel/gemm_kernel.hpp" #include 
"ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp" +#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_base.hpp" +#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v3.hpp" #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp" #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_scheduler.hpp" #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1.hpp" diff --git a/include/ck_tile/ops/gemm/block/block_universal_gemm_as_bs_cr.hpp b/include/ck_tile/ops/gemm/block/block_universal_gemm_as_bs_cr.hpp index 5f98a7a0b..c9e648f43 100644 --- a/include/ck_tile/ops/gemm/block/block_universal_gemm_as_bs_cr.hpp +++ b/include/ck_tile/ops/gemm/block/block_universal_gemm_as_bs_cr.hpp @@ -41,13 +41,16 @@ struct BlockUniversalGemmAsBsCr static constexpr index_t MWarp = config.template at<1>(); static constexpr index_t NWarp = config.template at<2>(); - static_assert(MWarp == BlockGemmShape::BlockWarps::at(number<0>{}), + using I0 = number<0>; + using I1 = number<1>; + + static_assert(MWarp == BlockGemmShape::BlockWarps::at(I0{}), "Error! WarpGemm's MWarp is not consisten with BlockGemmShape!"); - static_assert(NWarp == BlockGemmShape::BlockWarps::at(number<1>{}), + static_assert(NWarp == BlockGemmShape::BlockWarps::at(I1{}), "Error! WarpGemm's NWarp is not consisten with BlockGemmShape!"); - static_assert(WarpGemm::kM == BlockGemmShape::WarpTile::at(number<0>{}), + static_assert(WarpGemm::kM == BlockGemmShape::WarpTile::at(I0{}), "Error! WarpGemm's M is not consisten with BlockGemmShape!"); - static_assert(WarpGemm::kN == BlockGemmShape::WarpTile::at(number<1>{}), + static_assert(WarpGemm::kN == BlockGemmShape::WarpTile::at(I1{}), "Error! 
WarpGemm's N is not consisten with BlockGemmShape!"); static constexpr index_t MIterPerWarp = MPerBlock / (MWarp * WarpGemm::kM); @@ -99,6 +102,9 @@ struct BlockUniversalGemmAsBsCr static constexpr auto Scheduler = Traits::Scheduler; + using I0 = number<0>; + using I1 = number<1>; + private: template struct BlockGemmImpl @@ -114,35 +120,31 @@ struct BlockUniversalGemmAsBsCr const ASmemBlockWindow& a_block_window, const BSmemBlockWindow& b_block_window) { - static_assert( - std::is_same_v, - "The CDataType as defined in traits should be the same as correspoinding " - "C block tensor data type!"); - static_assert(std::is_same_v && - std::is_same_v, + static_assert(std::is_same_v, + "The CDataType as defined in traits should be the same as correspoinding " + "C block tensor data type!"); + static_assert(std::is_same_v && + std::is_same_v, "The ADataType and BDataType as defined in " "traits should be the same as correspoinding block window data type!"); static_assert( - GemmTraits::MPerBlock == ASmemBlockWindow{}.get_window_lengths()[number<0>{}] && - GemmTraits::NPerBlock == BSmemBlockWindow{}.get_window_lengths()[number<0>{}] && - GemmTraits::KPerBlock == ASmemBlockWindow{}.get_window_lengths()[number<1>{}], + GemmTraits::MPerBlock == ASmemBlockWindow{}.get_window_lengths()[I0{}] && + GemmTraits::NPerBlock == BSmemBlockWindow{}.get_window_lengths()[I0{}] && + GemmTraits::KPerBlock == ASmemBlockWindow{}.get_window_lengths()[I1{}], "MPerBlock, NPerBlock, KPerBlock defined in " " BlockGemmShape are different from A/B block smem windows apropriate dims!"); - const index_t iMWarp = get_warp_id() / GemmTraits::NWarp; - const index_t iNWarp = get_warp_id() - (iMWarp * GemmTraits::NWarp); + const index_t iMWarp = get_warp_id() / NWarp; + const index_t iNWarp = get_warp_id() - (iMWarp * NWarp); // TODO: refactor warp_window tile type to class member as it should be // compile-time known information. 
auto a_warp_window_tmp = make_tile_window( a_block_window.get_bottom_tensor_view(), - make_tuple(number{}, number{}), - a_block_window.get_window_origin() + - multi_index<2>{iMWarp * GemmTraits::WarpGemm::kM, 0}, - make_static_tile_distribution(typename GemmTraits::WarpGemm::AWarpDstrEncoding{})); + make_tuple(number{}, number{}), + a_block_window.get_window_origin() + multi_index<2>{iMWarp * WarpGemm::kM, 0}, + make_static_tile_distribution(typename WarpGemm::AWarpDstrEncoding{})); using AWarpWindow = remove_cvref_t; @@ -156,16 +158,15 @@ struct BlockUniversalGemmAsBsCr statically_indexed_array< statically_indexed_array, - GemmTraits::MIterPerWarp> + MIterPerWarp> a_warp_windows; // construct B-warp-window auto b_warp_window_tmp = make_tile_window( b_block_window.get_bottom_tensor_view(), - make_tuple(number{}, number{}), - b_block_window.get_window_origin() + - multi_index<2>{iNWarp * GemmTraits::WarpGemm::kN, 0}, - make_static_tile_distribution(typename GemmTraits::WarpGemm::BWarpDstrEncoding{})); + make_tuple(number{}, number{}), + b_block_window.get_window_origin() + multi_index<2>{iNWarp * WarpGemm::kN, 0}, + make_static_tile_distribution(typename WarpGemm::BWarpDstrEncoding{})); using BWarpWindow = remove_cvref_t; @@ -179,10 +180,10 @@ struct BlockUniversalGemmAsBsCr statically_indexed_array< statically_indexed_array, - GemmTraits::NIterPerWarp> + NIterPerWarp> b_warp_windows; - static_for<0, GemmTraits::MIterPerWarp, 1>{}([&](auto mIter) { + static_for<0, MIterPerWarp, 1>{}([&](auto mIter) { static_for<0, GemmTraits::KIterPerWarp, 1>{}([&](auto kIter) { a_warp_windows(mIter)(kIter) = a_warp_window_tmp; @@ -193,7 +194,7 @@ struct BlockUniversalGemmAsBsCr }); }); - static_for<0, GemmTraits::NIterPerWarp, 1>{}([&](auto nIter) { + static_for<0, NIterPerWarp, 1>{}([&](auto nIter) { static_for<0, GemmTraits::KIterPerWarp, 1>{}([&](auto kIter) { b_warp_windows(nIter)(kIter) = b_warp_window_tmp; @@ -203,8 +204,8 @@ struct BlockUniversalGemmAsBsCr }); }); - using 
CWarpDstr = typename GemmTraits::WarpGemm::CWarpDstr; - using CWarpTensor = typename GemmTraits::WarpGemm::CWarpTensor; + using CWarpDstr = typename WarpGemm::CWarpDstr; + using CWarpTensor = typename WarpGemm::CWarpTensor; constexpr auto c_warp_y_lengths = to_sequence(CWarpDstr{}.get_ys_to_d_descriptor().get_lengths()); @@ -212,10 +213,10 @@ struct BlockUniversalGemmAsBsCr // hot loop: static_for<0, GemmTraits::KIterPerWarp, 1>{}([&](auto kIter) { - static_for<0, GemmTraits::MIterPerWarp, 1>{}([&](auto mIter) { + static_for<0, MIterPerWarp, 1>{}([&](auto mIter) { const auto a_warp_tile = load_tile(a_warp_windows(mIter)(kIter)); - static_for<0, GemmTraits::NIterPerWarp, 1>{}([&](auto nIter) { + static_for<0, NIterPerWarp, 1>{}([&](auto nIter) { const auto b_warp_tile = load_tile(b_warp_windows(nIter)(kIter)); // read C warp tensor from C block tensor- @@ -226,7 +227,7 @@ struct BlockUniversalGemmAsBsCr merge_sequences(sequence<1, 1>{}, c_warp_y_lengths)); // warp GEMM - typename GemmTraits::WarpGemm{}(c_warp_tensor, a_warp_tile, b_warp_tile); + WarpGemm{}(c_warp_tensor, a_warp_tile, b_warp_tile); // write C warp tensor into C block tensor c_block_tensor.set_y_sliced_thread_data( @@ -243,13 +244,13 @@ struct BlockUniversalGemmAsBsCr struct BlockGemmImpl { statically_indexed_array< - statically_indexed_array, - GemmTraits::MIterPerWarp> + statically_indexed_array, + MIterPerWarp> a_warp_tiles_; statically_indexed_array< - statically_indexed_array, - GemmTraits::NIterPerWarp> + statically_indexed_array, + NIterPerWarp> b_warp_tiles_; template @@ -257,30 +258,27 @@ struct BlockUniversalGemmAsBsCr const BSmemBlockWindow& b_block_window) { static_assert( - GemmTraits::MPerBlock == ASmemBlockWindow{}.get_window_lengths()[number<0>{}] && - GemmTraits::NPerBlock == BSmemBlockWindow{}.get_window_lengths()[number<0>{}] && - GemmTraits::KPerBlock == ASmemBlockWindow{}.get_window_lengths()[number<1>{}], + GemmTraits::MPerBlock == ASmemBlockWindow{}.get_window_lengths()[I0{}] && 
+ GemmTraits::NPerBlock == BSmemBlockWindow{}.get_window_lengths()[I0{}] && + GemmTraits::KPerBlock == ASmemBlockWindow{}.get_window_lengths()[I1{}], "MPerBlock, NPerBlock, KPerBlock defined in " " BlockGemmShape are different from A/B block smem windows apropriate dims!"); - static_assert(std::is_same_v && - std::is_same_v, + static_assert(std::is_same_v && + std::is_same_v, "The ADataType and BDataType as defined in " "traits should be the same as correspoinding block window data type!"); - const index_t iMWarp = get_warp_id() / GemmTraits::NWarp; - const index_t iNWarp = get_warp_id() - (iMWarp * GemmTraits::NWarp); + const index_t iMWarp = get_warp_id() / NWarp; + const index_t iNWarp = get_warp_id() - (iMWarp * NWarp); // TODO: refactor warp_window tile type to class member as it should be // compile-time known information. auto a_warp_window_tmp = make_tile_window( a_block_window.get_bottom_tensor_view(), - make_tuple(number{}, number{}), - a_block_window.get_window_origin() + - multi_index<2>{iMWarp * GemmTraits::WarpGemm::kM, 0}, - make_static_tile_distribution(typename GemmTraits::WarpGemm::AWarpDstrEncoding{})); + make_tuple(number{}, number{}), + a_block_window.get_window_origin() + multi_index<2>{iMWarp * WarpGemm::kM, 0}, + make_static_tile_distribution(typename WarpGemm::AWarpDstrEncoding{})); using AWarpWindow = remove_cvref_t; @@ -292,18 +290,16 @@ struct BlockUniversalGemmAsBsCr AWarpWindow{}.get_window_lengths(), "AWarpWindow lengths must be equal to AWarpTile lengths!"); - statically_indexed_array< - statically_indexed_array, - GemmTraits::MIterPerWarp> + statically_indexed_array, + MIterPerWarp> a_warp_windows; // construct B-warp-window auto b_warp_window_tmp = make_tile_window( b_block_window.get_bottom_tensor_view(), - make_tuple(number{}, number{}), - b_block_window.get_window_origin() + - multi_index<2>{iNWarp * GemmTraits::WarpGemm::kN, 0}, - make_static_tile_distribution(typename GemmTraits::WarpGemm::BWarpDstrEncoding{})); + 
make_tuple(number{}, number{}), + b_block_window.get_window_origin() + multi_index<2>{iNWarp * WarpGemm::kN, 0}, + make_static_tile_distribution(typename WarpGemm::BWarpDstrEncoding{})); using BWarpWindow = remove_cvref_t; @@ -315,13 +311,12 @@ struct BlockUniversalGemmAsBsCr BWarpWindow{}.get_window_lengths(), "BWarpWindow lengths must be equal to BWarpTile lengths!"); - statically_indexed_array< - statically_indexed_array, - GemmTraits::NIterPerWarp> + statically_indexed_array, + NIterPerWarp> b_warp_windows; - static_for<0, GemmTraits::MIterPerWarp, 1>{}([&](auto mIter) { - static_for<0, GemmTraits::KIterPerWarp, 1>{}([&](auto kIter) { + static_for<0, MIterPerWarp, 1>{}([&](auto mIter) { + static_for<0, KIterPerWarp, 1>{}([&](auto kIter) { a_warp_windows(mIter)(kIter) = a_warp_window_tmp; // TODO: I don't have to move 0,0 window! @@ -331,8 +326,8 @@ struct BlockUniversalGemmAsBsCr }); }); - static_for<0, GemmTraits::NIterPerWarp, 1>{}([&](auto nIter) { - static_for<0, GemmTraits::KIterPerWarp, 1>{}([&](auto kIter) { + static_for<0, NIterPerWarp, 1>{}([&](auto nIter) { + static_for<0, KIterPerWarp, 1>{}([&](auto kIter) { b_warp_windows(nIter)(kIter) = b_warp_window_tmp; move_tile_window(b_warp_windows(nIter)(kIter), @@ -341,12 +336,12 @@ struct BlockUniversalGemmAsBsCr }); }); - static_for<0, GemmTraits::KIterPerWarp, 1>{}([&](auto kIter) { - static_for<0, GemmTraits::MIterPerWarp, 1>{}([&](auto mIter) { + static_for<0, KIterPerWarp, 1>{}([&](auto kIter) { + static_for<0, MIterPerWarp, 1>{}([&](auto mIter) { // read A warp tensor from A block window load_tile(a_warp_tiles_(mIter)(kIter), a_warp_windows(mIter)(kIter)); }); - static_for<0, GemmTraits::NIterPerWarp, 1>{}([&](auto nIter) { + static_for<0, NIterPerWarp, 1>{}([&](auto nIter) { // read B warp tensor from B Block window load_tile(b_warp_tiles_(nIter)(kIter), b_warp_windows(nIter)(kIter)); }); @@ -359,22 +354,21 @@ struct BlockUniversalGemmAsBsCr [[maybe_unused]] const ASmemBlockWindow& a_block_window, 
[[maybe_unused]] const BSmemBlockWindow& b_block_window) { - static_assert( - std::is_same_v, - "The CDataType as defined in traits should be the same as correspoinding " - "C block tensor data type!"); + static_assert(std::is_same_v, + "The CDataType as defined in traits should be the same as correspoinding " + "C block tensor data type!"); - using CWarpDstr = typename GemmTraits::WarpGemm::CWarpDstr; - using CWarpTensor = typename GemmTraits::WarpGemm::CWarpTensor; + using CWarpDstr = typename WarpGemm::CWarpDstr; + using CWarpTensor = typename WarpGemm::CWarpTensor; constexpr auto c_warp_y_lengths = to_sequence(CWarpDstr{}.get_ys_to_d_descriptor().get_lengths()); constexpr auto c_warp_y_index_zeros = uniform_sequence_gen_t{}; // hot loop: - static_for<0, GemmTraits::KIterPerWarp, 1>{}([&](auto kIter) { - static_for<0, GemmTraits::MIterPerWarp, 1>{}([&](auto mIter) { - static_for<0, GemmTraits::NIterPerWarp, 1>{}([&](auto nIter) { + static_for<0, KIterPerWarp, 1>{}([&](auto kIter) { + static_for<0, MIterPerWarp, 1>{}([&](auto mIter) { + static_for<0, NIterPerWarp, 1>{}([&](auto nIter) { // read C warp tensor from C block tensor- CWarpTensor c_warp_tensor; @@ -383,9 +377,9 @@ struct BlockUniversalGemmAsBsCr merge_sequences(sequence<1, 1>{}, c_warp_y_lengths)); // warp GEMM - typename GemmTraits::WarpGemm{}(c_warp_tensor, - a_warp_tiles_[mIter][kIter], - b_warp_tiles_[nIter][kIter]); + WarpGemm{}(c_warp_tensor, + a_warp_tiles_[mIter][kIter], + b_warp_tiles_[nIter][kIter]); // write C warp tensor into C block tensor c_block_tensor.set_y_sliced_thread_data( @@ -412,12 +406,12 @@ struct BlockUniversalGemmAsBsCr statically_indexed_array< statically_indexed_array, - GemmTraits::MIterPerWarp> + MIterPerWarp> a_warp_tiles_; statically_indexed_array< statically_indexed_array, - GemmTraits::NIterPerWarp> + NIterPerWarp> b_warp_tiles_; template @@ -425,30 +419,28 @@ struct BlockUniversalGemmAsBsCr const BSmemBlockWindow& b_block_window) { static_assert( - 
GemmTraits::MPerBlock == ASmemBlockWindow{}.get_window_lengths()[number<0>{}] && - GemmTraits::NPerBlock == BSmemBlockWindow{}.get_window_lengths()[number<0>{}] && - GemmTraits::KPerBlock == ASmemBlockWindow{}.get_window_lengths()[number<1>{}], + GemmTraits::MPerBlock == ASmemBlockWindow{}.get_window_lengths()[I0{}] && + GemmTraits::NPerBlock == BSmemBlockWindow{}.get_window_lengths()[I0{}] && + GemmTraits::KPerBlock == ASmemBlockWindow{}.get_window_lengths()[I1{}], "MPerBlock, NPerBlock, KPerBlock defined in " " BlockGemmShape are different from A/B block smem windows apropriate dims!"); - static_assert(std::is_same_v && - std::is_same_v, + static_assert(std::is_same_v && + std::is_same_v, "The ADataType and BDataType as defined in " "traits should be the same as correspoinding block window data type!"); - const index_t iMWarp = get_warp_id() / GemmTraits::NWarp; - const index_t iNWarp = get_warp_id() - (iMWarp * GemmTraits::NWarp); + const index_t iMWarp = get_warp_id() / NWarp; + const index_t iNWarp = get_warp_id() - (iMWarp * NWarp); // TODO: refactor warp_window tile type to class member as it should be // compile-time known information. 
auto a_warp_window_tmp = make_tile_window( a_block_window.get_bottom_tensor_view(), - make_tuple(number{}, number{}), + make_tuple(number{}, number{}), a_block_window.get_window_origin() + - multi_index<2>{iMWarp * GemmTraits::WarpGemm::kM, KIdx * KPerInnerLoop}, - make_static_tile_distribution(typename GemmTraits::WarpGemm::AWarpDstrEncoding{})); + multi_index<2>{iMWarp * WarpGemm::kM, KIdx * KPerInnerLoop}, + make_static_tile_distribution(typename WarpGemm::AWarpDstrEncoding{})); using AWarpWindow = remove_cvref_t; @@ -461,16 +453,16 @@ struct BlockUniversalGemmAsBsCr "AWarpWindow lengths must be equal to AWarpTile lengths!"); statically_indexed_array, - GemmTraits::MIterPerWarp> + MIterPerWarp> a_warp_windows; // construct B-warp-window auto b_warp_window_tmp = make_tile_window( b_block_window.get_bottom_tensor_view(), - make_tuple(number{}, number{}), + make_tuple(number{}, number{}), b_block_window.get_window_origin() + - multi_index<2>{iNWarp * GemmTraits::WarpGemm::kN, KIdx * KPerInnerLoop}, - make_static_tile_distribution(typename GemmTraits::WarpGemm::BWarpDstrEncoding{})); + multi_index<2>{iNWarp * WarpGemm::kN, KIdx * KPerInnerLoop}, + make_static_tile_distribution(typename WarpGemm::BWarpDstrEncoding{})); using BWarpWindow = remove_cvref_t; @@ -483,10 +475,10 @@ struct BlockUniversalGemmAsBsCr "BWarpWindow lengths must be equal to BWarpTile lengths!"); statically_indexed_array, - GemmTraits::NIterPerWarp> + NIterPerWarp> b_warp_windows; - static_for<0, GemmTraits::MIterPerWarp, 1>{}([&](auto mIter) { + static_for<0, MIterPerWarp, 1>{}([&](auto mIter) { static_for<0, KInnerLoopIter, 1>{}([&](auto kIter) { a_warp_windows(mIter)(kIter) = a_warp_window_tmp; @@ -496,7 +488,7 @@ struct BlockUniversalGemmAsBsCr }); }); - static_for<0, GemmTraits::NIterPerWarp, 1>{}([&](auto nIter) { + static_for<0, NIterPerWarp, 1>{}([&](auto nIter) { static_for<0, KInnerLoopIter, 1>{}([&](auto kIter) { b_warp_windows(nIter)(kIter) = b_warp_window_tmp; @@ -508,11 +500,11 @@ 
struct BlockUniversalGemmAsBsCr // TODO check if a_warp_tiles has same desc as a_warp_window static_for<0, KInnerLoopIter, 1>{}([&](auto kIter) { - static_for<0, GemmTraits::MIterPerWarp, 1>{}([&](auto mIter) { + static_for<0, MIterPerWarp, 1>{}([&](auto mIter) { // read A warp tensor from A block window load_tile(a_warp_tiles_(mIter)(kIter), a_warp_windows(mIter)(kIter)); }); - static_for<0, GemmTraits::NIterPerWarp, 1>{}([&](auto nIter) { + static_for<0, NIterPerWarp, 1>{}([&](auto nIter) { // read B warp tensor from B Block window load_tile(b_warp_tiles_(nIter)(kIter), b_warp_windows(nIter)(kIter)); }); @@ -525,13 +517,12 @@ struct BlockUniversalGemmAsBsCr const ASmemBlockWindow& a_block_window, const BSmemBlockWindow& b_block_window) { - static_assert( - std::is_same_v, - "The CDataType as defined in traits should be the same as correspoinding " - "C block tensor data type!"); + static_assert(std::is_same_v, + "The CDataType as defined in traits should be the same as correspoinding " + "C block tensor data type!"); - using CWarpDstr = typename GemmTraits::WarpGemm::CWarpDstr; - using CWarpTensor = typename GemmTraits::WarpGemm::CWarpTensor; + using CWarpDstr = typename WarpGemm::CWarpDstr; + using CWarpTensor = typename WarpGemm::CWarpTensor; constexpr auto c_warp_y_lengths = to_sequence(CWarpDstr{}.get_ys_to_d_descriptor().get_lengths()); @@ -555,8 +546,8 @@ struct BlockUniversalGemmAsBsCr } static_for<0, KInnerLoopIter, 1>{}([&](auto kInnerIter) { - static_for<0, GemmTraits::MIterPerWarp, 1>{}([&](auto mIter) { - static_for<0, GemmTraits::NIterPerWarp, 1>{}([&](auto nIter) { + static_for<0, MIterPerWarp, 1>{}([&](auto mIter) { + static_for<0, NIterPerWarp, 1>{}([&](auto nIter) { // read C warp tensor from C block tensor- CWarpTensor c_warp_tensor; @@ -573,17 +564,17 @@ struct BlockUniversalGemmAsBsCr // penalty if constexpr(kIter.value == KRepeat - 1 && kInnerIter.value == KInnerLoopIter - 1 && - mIter.value == GemmTraits::MIterPerWarp - 1 && - nIter.value == 
GemmTraits::NIterPerWarp - 1) + mIter.value == MIterPerWarp - 1 && + nIter.value == NIterPerWarp - 1) { __builtin_amdgcn_sched_barrier(0); block_sync_lds(); __builtin_amdgcn_sched_barrier(0); } // warp GEMM - typename GemmTraits::WarpGemm{}(c_warp_tensor, - a_warp_tiles_[mIter][kInnerIter], - b_warp_tiles_[nIter][kInnerIter]); + WarpGemm{}(c_warp_tensor, + a_warp_tiles_[mIter][kInnerIter], + b_warp_tiles_[nIter][kInnerIter]); // write C warp tensor into C block tensor c_block_tensor.set_y_sliced_thread_data( diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_base.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_base.hpp new file mode 100644 index 000000000..431534af1 --- /dev/null +++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_base.hpp @@ -0,0 +1,111 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck_tile/core.hpp" + +namespace ck_tile { + +template +struct GemmPipelineAgBgCrImplBase +{ + using ADataType = remove_cvref_t; + using BDataType = remove_cvref_t; + using BlockGemmShape = remove_cvref_t; + + static constexpr index_t MPerBlock = BlockGemmShape::kM; + static constexpr index_t NPerBlock = BlockGemmShape::kN; + static constexpr index_t KPerBlock = BlockGemmShape::kK; + + template + CK_TILE_DEVICE void GlobalPrefetch(DstBlockTile& dst_block_tile, + SrcTileWindow& dram_tile_window) const + { + load_tile(dst_block_tile, dram_tile_window); + move_tile_window(dram_tile_window, {0, KPerBlock}); + } + + template + CK_TILE_DEVICE void LocalPrefill(DstTileWindow& lds_tile_window, + const SrcBlockTile& src_block_tile, + const ElementFunction& element_func) const + { + const auto block_tile_tmp = tile_elementwise_in(element_func, src_block_tile); + store_tile(lds_tile_window, block_tile_tmp); + } + + CK_TILE_DEVICE auto GetABLdsTensorViews(void* p_smem) const + { + // A tile in LDS + ADataType* p_a_lds = static_cast(p_smem); + 
constexpr auto a_lds_block_desc = Policy::template MakeALdsBlockDescriptor(); + auto a_lds_block = make_tensor_view(p_a_lds, a_lds_block_desc); + + // TODO: LDS alignment should come from Policy! + constexpr index_t a_lds_block_space_size_aligned = + integer_divide_ceil(sizeof(ADataType) * a_lds_block_desc.get_element_space_size(), 16) * + 16; + + // B tile in LDS + BDataType* p_b_lds = static_cast( + static_cast(static_cast(p_smem) + a_lds_block_space_size_aligned)); + constexpr auto b_lds_block_desc = Policy::template MakeBLdsBlockDescriptor(); + auto b_lds_block = make_tensor_view(p_b_lds, b_lds_block_desc); + + return make_tuple(std::move(a_lds_block), std::move(b_lds_block)); + } + + template + CK_TILE_DEVICE auto GetAWindows(const ADramBlockWindowTmp& a_dram_block_window_tmp, + const ALdsTensorView& a_lds_block_view) const + { + // A DRAM tile window for load + auto a_copy_dram_window = + make_tile_window(a_dram_block_window_tmp.get_bottom_tensor_view(), + make_tuple(number{}, number{}), + a_dram_block_window_tmp.get_window_origin(), + Policy::template MakeADramTileDistribution()); + + // A LDS tile window for store + auto a_copy_lds_window = + make_tile_window(a_lds_block_view, + make_tuple(number{}, number{}), + {0, 0}, + a_copy_dram_window.get_tile_distribution()); + + auto a_lds_gemm_window = make_tile_window( + a_lds_block_view, make_tuple(number{}, number{}), {0, 0}); + + return make_tuple(std::move(a_copy_dram_window), + std::move(a_copy_lds_window), + std::move(a_lds_gemm_window)); + } + + template + CK_TILE_DEVICE auto GetBWindows(const BDramBlockWindowTmp& b_dram_block_window_tmp, + const BLdsTensorView& b_lds_block_view) const + { + auto b_copy_dram_window = + make_tile_window(b_dram_block_window_tmp.get_bottom_tensor_view(), + make_tuple(number{}, number{}), + b_dram_block_window_tmp.get_window_origin(), + Policy::template MakeBDramTileDistribution()); + + // B LDS tile window for store + auto b_copy_lds_window = + 
make_tile_window(b_lds_block_view, + make_tuple(number{}, number{}), + {0, 0}, + b_copy_dram_window.get_tile_distribution()); + + auto b_lds_gemm_window = make_tile_window( + b_lds_block_view, make_tuple(number{}, number{}), {0, 0}); + + return make_tuple(std::move(b_copy_dram_window), + std::move(b_copy_lds_window), + std::move(b_lds_gemm_window)); + } +}; + +} // namespace ck_tile diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v3.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v3.hpp new file mode 100644 index 000000000..a72728b4a --- /dev/null +++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v3.hpp @@ -0,0 +1,383 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck_tile/core.hpp" +#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1_default_policy.hpp" +#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_scheduler.hpp" +#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_base.hpp" + +namespace ck_tile { + +// A Tile Window: global memory +// B Tile Window: global memory +// C Distributed tensor: register +template +struct BaseGemmPipelineAgBgCrCompV3 +{ + static constexpr index_t PrefetchStages = 2; + static constexpr index_t PrefillStages = 1; + static constexpr index_t GlobalBufferNum = 1; + + CK_TILE_HOST static constexpr bool BlockHasHotloop(index_t num_loop) + { + return num_loop > PrefetchStages; + } + + CK_TILE_HOST static constexpr TailNumber GetBlockLoopTailNum(index_t num_loop) + { + ignore = num_loop; + return TailNumber::Full; + } +}; + +// Compute optimized pipeline +// GlobalPrefetchStages: 2 +// LocalPreFillStages: 1 +// LocalPreFetchStages: 1 +// LocalSharedMemoryBuffer: 1 +template +struct GemmPipelineAgBgCrCompV3 : public BaseGemmPipelineAgBgCrCompV3 +{ + using Base = BaseGemmPipelineAgBgCrCompV3; + using PipelineImplBase = GemmPipelineAgBgCrImplBase; + + using 
ADataType = remove_cvref_t; + using BDataType = remove_cvref_t; + using CDataType = remove_cvref_t; + using BlockGemmShape = remove_cvref_t; + + using ALayout = remove_cvref_t; + using BLayout = remove_cvref_t; + using CLayout = remove_cvref_t; + + using BlockGemm = remove_cvref_t())>; + using I0 = number<0>; + using I1 = number<1>; + using I2 = number<2>; + + static constexpr index_t BlockSize = Problem::kBlockSize; + static constexpr index_t MPerBlock = BlockGemmShape::kM; + static constexpr index_t NPerBlock = BlockGemmShape::kN; + static constexpr index_t KPerBlock = BlockGemmShape::kK; + + static constexpr index_t VectorSizeA = Problem::VectorSizeA; + static constexpr index_t VectorSizeB = Problem::VectorSizeB; + static constexpr index_t VectorSizeC = Problem::VectorSizeC; + + static constexpr bool kPadM = Problem::kPadM; + static constexpr bool kPadN = Problem::kPadN; + static constexpr bool kPadK = Problem::kPadK; + + // Where is the right place for HasHotLoop and TailNum ??? + static constexpr bool HasHotLoop = Problem::HasHotLoop; + static constexpr auto TailNum = Problem::TailNum; + static constexpr auto Scheduler = Problem::Scheduler; + + using Base::PrefetchStages; + + CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize() + { + return Policy::template GetSmemSize(); + } + + template + struct PipelineImpl : public PipelineImplBase + { + }; + + template <> + struct PipelineImpl : public PipelineImplBase + { + using Base = PipelineImplBase; + + CK_TILE_DEVICE static constexpr auto HotLoopScheduler() + { + constexpr index_t MPerXDL = BlockGemmShape::WarpTile::at(I0{}); + constexpr index_t NPerXDL = BlockGemmShape::WarpTile::at(I1{}); + constexpr index_t KPerXDL = BlockGemmShape::WarpTile::at(I2{}); + + constexpr index_t WaveSize = 64; + constexpr index_t WaveNumM = BlockGemmShape::BlockWarps::at(I0{}); + constexpr index_t WaveNumN = BlockGemmShape::BlockWarps::at(I1{}); + + constexpr index_t A_LDS_Read_Width = KPerXDL; + constexpr index_t 
B_LDS_Read_Width = KPerXDL; + + constexpr index_t A_Buffer_Load_Inst_Num = + MPerBlock * KPerBlock / (BlockSize * VectorSizeA); + constexpr index_t B_Buffer_Load_Inst_Num = + NPerBlock * KPerBlock / (BlockSize * VectorSizeB); + + constexpr index_t A_LDS_Write_Inst_Num = MPerBlock * KPerBlock / (BlockSize * KPerXDL); + constexpr index_t B_LDS_Write_Inst_Num = NPerBlock * KPerBlock / (BlockSize * KPerXDL); + + constexpr index_t A_LDS_Read_Inst_Num = + WaveNumN * MPerBlock * KPerBlock / (BlockSize * KPerXDL); + constexpr index_t B_LDS_Read_Inst_Num = + WaveNumM * MPerBlock * KPerBlock / (BlockSize * KPerXDL); + + constexpr index_t C_MFMA_Inst_Num = MPerBlock * NPerBlock * KPerBlock / + (BlockSize / WaveSize) / + (MPerXDL * NPerXDL * KPerXDL); + + // A/B split schedule + // compiler is likely to use ds_read2 when instruction width smaller than 16bytes + constexpr auto num_ds_read_inst_a = A_LDS_Read_Width * sizeof(ADataType) == 16 + ? A_LDS_Read_Inst_Num + : A_LDS_Read_Inst_Num / 2; + constexpr auto num_ds_read_inst_b = B_LDS_Read_Width * sizeof(BDataType) == 16 + ? B_LDS_Read_Inst_Num + : B_LDS_Read_Inst_Num / 2; + + constexpr auto num_ds_write_inst_a = A_LDS_Write_Inst_Num; + constexpr auto num_ds_write_inst_b = B_LDS_Write_Inst_Num; + + constexpr auto num_buffer_load_inst_a = A_Buffer_Load_Inst_Num; + constexpr auto num_buffer_load_inst_b = B_Buffer_Load_Inst_Num; + + constexpr auto num_mfma_inst = C_MFMA_Inst_Num; + + constexpr auto mfma_cycle = NPerXDL == 16 ? 16 : 32; + constexpr auto ds_read_a_issue_cycle = + A_LDS_Read_Width * sizeof(ADataType) == 16 ? 8 : 4; + constexpr auto ds_read_b_issue_cycle = + B_LDS_Read_Width * sizeof(BDataType) == 16 ? 
8 : 4; + constexpr auto ds_read_a_mfma_rate = + (mfma_cycle - 4 + 2 * ds_read_a_issue_cycle - 1) / (2 * ds_read_a_issue_cycle); + constexpr auto ds_read_b_mfma_rate = + (mfma_cycle - 4 + 2 * ds_read_b_issue_cycle - 1) / (2 * ds_read_b_issue_cycle); + + constexpr auto num_dsread_a_mfma = + (num_ds_read_inst_a + ds_read_a_mfma_rate - 1) / ds_read_a_mfma_rate; + constexpr auto num_dsread_b_mfma = + (num_ds_read_inst_b + ds_read_b_mfma_rate - 1) / ds_read_b_mfma_rate; + + // stage 1 + // Separate this part? + // constexpr auto num_mfma_per_ds_read = sizeof(ComputeDataType) / sizeof(ADataType) > + // sizeof(ComputeDataType) / + // sizeof(BDataType) + // ? sizeof(ComputeDataType) / + // sizeof(ADataType) : sizeof(ComputeDataType) + // / sizeof(BDataType); + constexpr auto num_mfma_stage1 = + num_mfma_inst - (num_dsread_a_mfma + num_dsread_b_mfma); + constexpr auto num_mfma_per_issue = + num_mfma_stage1 / (num_buffer_load_inst_a + num_buffer_load_inst_b); + constexpr auto num_dswrite_per_issue_a = num_ds_write_inst_a / num_buffer_load_inst_a; + constexpr auto num_dswrite_per_issue_b = num_ds_write_inst_b / num_buffer_load_inst_b; + + static_for<0, num_buffer_load_inst_a, 1>{}([&](auto i) { + ignore = i; + static_for<0, num_dswrite_per_issue_a, 1>{}([&](auto idswrite) { + ignore = idswrite; + __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS write + __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA + }); + __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read + __builtin_amdgcn_sched_group_barrier( + 0x008, num_mfma_per_issue - num_dswrite_per_issue_a, 0); // MFMA + }); + static_for<0, num_buffer_load_inst_b, 1>{}([&](auto i) { + ignore = i; + static_for<0, num_dswrite_per_issue_b, 1>{}([&](auto idswrite) { + ignore = idswrite; + __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS write + __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA + }); + __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read + 
__builtin_amdgcn_sched_group_barrier( + 0x008, num_mfma_per_issue - num_dswrite_per_issue_b, 0); // MFMA + }); + + // stage 2 + static_for<0, num_dsread_a_mfma, 1>{}([&](auto i) { + if constexpr((num_ds_read_inst_a - (i + 1) * ds_read_a_mfma_rate) >= + ds_read_a_mfma_rate) + { + __builtin_amdgcn_sched_group_barrier(0x100, ds_read_a_mfma_rate, 0); // DS read + } + else + { + __builtin_amdgcn_sched_group_barrier( + 0x100, + num_ds_read_inst_a - (num_dsread_a_mfma - 1) * ds_read_a_mfma_rate, + 0); // DS read + } + __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA + }); + + static_for<0, num_dsread_b_mfma, 1>{}([&](auto i) { + if constexpr((num_ds_read_inst_b - (i + 1) * ds_read_b_mfma_rate) >= + ds_read_b_mfma_rate) + { + __builtin_amdgcn_sched_group_barrier(0x100, ds_read_b_mfma_rate, 0); // DS read + } + else + { + __builtin_amdgcn_sched_group_barrier( + 0x100, + num_ds_read_inst_b - (num_dsread_b_mfma - 1) * ds_read_b_mfma_rate, + 0); // DS read + } + __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA + }); + } + + template + CK_TILE_DEVICE auto operator()(const ADramBlockWindowTmp& a_dram_block_window_tmp, + const AElementFunction& a_element_func, + const BDramBlockWindowTmp& b_dram_block_window_tmp, + const BElementFunction& b_element_func, + index_t num_loop, + void* p_smem) const + { + static_assert( + std::is_same_v> && + std::is_same_v>, + "A/B Dram block window should have the same data type as appropriate " + "([A|B]DataType) defined in Problem definition!"); + + static_assert(MPerBlock == ADramBlockWindowTmp{}.get_window_lengths()[I0{}] && + NPerBlock == BDramBlockWindowTmp{}.get_window_lengths()[I0{}] && + KPerBlock == ADramBlockWindowTmp{}.get_window_lengths()[I1{}], + "A/B block window appropriate sizes must be equal to MPerBlock/NPerblock" + " or KPerBlock!"); + + // ------------------------------------------------------------------------------------ + // Definitions of all needed tiles + + // A/B tiles in LDS + auto&& [a_lds_block, 
b_lds_block] = Base::GetABLdsTensorViews(p_smem); + + // A DRAM tile window for load + // A LDS tile window for store + // A LDS tile for block GEMM + auto&& [a_copy_dram_window, a_copy_lds_window, a_lds_gemm_window] = + Base::GetAWindows(a_dram_block_window_tmp, a_lds_block); + + // B DRAM tile window for load + // B LDS tile window for store + // B LDS tile for block GEMM + auto&& [b_copy_dram_window, b_copy_lds_window, b_lds_gemm_window] = + Base::GetBWindows(b_dram_block_window_tmp, b_lds_block); + + // Block GEMM + auto block_gemm = BlockGemm(); + auto c_block_tile = block_gemm.MakeCBlockTile(); + + using ABlockTileDistr = decltype(a_copy_dram_window.get_tile_distribution()); + using BBlockTileDistr = decltype(b_copy_dram_window.get_tile_distribution()); + + using ABlockTile = + decltype(make_static_distributed_tensor(ABlockTileDistr{})); + using BBlockTile = + decltype(make_static_distributed_tensor(BBlockTileDistr{})); + + ABlockTile a_block_tile; + BBlockTile b_block_tile; + + // ----------------------------------------------------------------------------------------- + // Gemm pipeline start + + // prefetch + // global read 0 + Base::GlobalPrefetch(a_block_tile, a_copy_dram_window); + Base::GlobalPrefetch(b_block_tile, b_copy_dram_window); + + // initialize C + tile_elementwise_inout([](auto& c) { c = 0; }, c_block_tile); + + // LDS write 0 + Base::LocalPrefill(a_copy_lds_window, a_block_tile, a_element_func); + Base::LocalPrefill(b_copy_lds_window, b_block_tile, b_element_func); + + Base::GlobalPrefetch(a_block_tile, a_copy_dram_window); + Base::GlobalPrefetch(b_block_tile, b_copy_dram_window); + + block_sync_lds(); + block_gemm.LocalPrefetch(a_lds_gemm_window, b_lds_gemm_window); + + __builtin_amdgcn_sched_barrier(0); + + // main body + if constexpr(HasHotLoop) + { + index_t i = 0; + do + { + block_sync_lds(); + + Base::LocalPrefill(a_copy_lds_window, a_block_tile, a_element_func); + Base::LocalPrefill(b_copy_lds_window, b_block_tile, b_element_func); + 
+ Base::GlobalPrefetch(a_block_tile, a_copy_dram_window); + Base::GlobalPrefetch(b_block_tile, b_copy_dram_window); + + block_gemm(c_block_tile, a_lds_gemm_window, b_lds_gemm_window); + + block_sync_lds(); + block_gemm.LocalPrefetch(a_lds_gemm_window, b_lds_gemm_window); + HotLoopScheduler(); + __builtin_amdgcn_sched_barrier(0); + + i += 1; + } while(i < (num_loop - 1)); + } + // tail + if constexpr(TailNum == TailNumber::Full) + { + block_gemm(c_block_tile, a_lds_gemm_window, b_lds_gemm_window); + } + // Let's leak last MFMA block to epilogue region, cover the potential lds-shuffle + // latency + // __builtin_amdgcn_sched_barrier(0); + return c_block_tile; + } + }; + + template + CK_TILE_DEVICE auto operator()(const ADramBlockWindowTmp& a_dram_block_window_tmp, + const AElementFunction& a_element_func, + const BDramBlockWindowTmp& b_dram_block_window_tmp, + const BElementFunction& b_element_func, + index_t num_loop, + void* p_smem) const + { + return PipelineImpl{}.template operator()( + a_dram_block_window_tmp, + a_element_func, + b_dram_block_window_tmp, + b_element_func, + num_loop, + p_smem); + } + + template + CK_TILE_DEVICE auto operator()(const ADramBlockWindowTmp& a_dram_block_window_tmp, + const BDramBlockWindowTmp& b_dram_block_window_tmp, + index_t num_loop, + void* p_smem) const + { + return PipelineImpl{}.template operator()( + a_dram_block_window_tmp, + [](const ADataType& a) { return a; }, + b_dram_block_window_tmp, + [](const BDataType& b) { return b; }, + num_loop, + p_smem); + } +}; + +} // namespace ck_tile diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp index 847c5b187..e2e94cf92 100644 --- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp +++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp @@ -6,6 +6,7 @@ #include "ck_tile/core.hpp" #include 
"ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1_default_policy.hpp" #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_scheduler.hpp" +#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_base.hpp" namespace ck_tile { @@ -90,7 +91,8 @@ struct BaseGemmPipelineAgBgCrMem template struct GemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem { - using Base = BaseGemmPipelineAgBgCrMem; + using Base = BaseGemmPipelineAgBgCrMem; + using PipelineImplBase = GemmPipelineAgBgCrImplBase; using ADataType = remove_cvref_t; using BDataType = remove_cvref_t; @@ -103,8 +105,9 @@ struct GemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem using BlockGemm = remove_cvref_t())>; using I0 = number<0>; + using I1 = number<1>; + using I2 = number<2>; - static constexpr index_t BlockSize = Problem::kBlockSize; static constexpr index_t MPerBlock = BlockGemmShape::kM; static constexpr index_t NPerBlock = BlockGemmShape::kN; static constexpr index_t KPerBlock = BlockGemmShape::kK; @@ -124,46 +127,20 @@ struct GemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem using Base::PrefetchStages; - CK_TILE_HOST_DEVICE constexpr index_t GetStaticLdsSize() - { - return integer_divide_ceil( - sizeof(ADataType) * - Policy::template MakeALdsBlockDescriptor().get_element_space_size(), - 16) * - 16 + - sizeof(BDataType) * - Policy::template MakeBLdsBlockDescriptor().get_element_space_size(); - } - CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize() { return Policy::template GetSmemSize(); } template - struct PipelineImpl + struct PipelineImpl : public PipelineImplBase { }; template <> - struct PipelineImpl + struct PipelineImpl : public PipelineImplBase { - template - CK_TILE_DEVICE void GlobalPrefetch(DstBlockTile& dst_block_tile, - SrcTileWindow& dram_tile_window) const - { - load_tile(dst_block_tile, dram_tile_window); - move_tile_window(dram_tile_window, {0, KPerBlock}); - } - - template - CK_TILE_DEVICE void LocalPrefill(DstTileWindow& lds_tile_window, - 
const SrcBlockTile& src_block_tile, - const ElementFunction& element_func) const - { - const auto block_tile_tmp = tile_elementwise_in(element_func, src_block_tile); - store_tile(lds_tile_window, block_tile_tmp); - } + using Base = PipelineImplBase; template "A/B Dram block window should have the same data type as appropriate " "([A|B]DataType) defined in Problem definition!"); - static_assert(MPerBlock == ADramBlockWindowTmp{}.get_window_lengths()[number<0>{}] && - NPerBlock == - BDramBlockWindowTmp{}.get_window_lengths()[number<0>{}] && - KPerBlock == ADramBlockWindowTmp{}.get_window_lengths()[number<1>{}], + static_assert(MPerBlock == ADramBlockWindowTmp{}.get_window_lengths()[I0{}] && + NPerBlock == BDramBlockWindowTmp{}.get_window_lengths()[I0{}] && + KPerBlock == ADramBlockWindowTmp{}.get_window_lengths()[I1{}], "A/B block window appropriate sizes must be equal to MPerBlock/NPerblock" " or KPerBlock!"); // ------------------------------------------------------------------------------------ // Definitions of all needed tiles - // A tile in LDS - ADataType* p_a_lds = static_cast(p_smem); - constexpr auto a_lds_block_desc = Policy::template MakeALdsBlockDescriptor(); - auto a_lds_block = make_tensor_view(p_a_lds, a_lds_block_desc); - - // TODO: LDS alignment should come from Policy! - constexpr index_t a_lds_block_space_size_aligned = - integer_divide_ceil(sizeof(ADataType) * a_lds_block_desc.get_element_space_size(), - 16) * - 16; - - // B tile in LDS - BDataType* p_b_lds = static_cast( - static_cast(static_cast(p_smem) + a_lds_block_space_size_aligned)); - constexpr auto b_lds_block_desc = Policy::template MakeBLdsBlockDescriptor(); - auto b_lds_block = make_tensor_view(p_b_lds, b_lds_block_desc); + // A/B tiles in LDS + // With c++20 could simplify to below line. 
+ // Currently get error: captured structured bindings are a C++20 extension + // auto&& [a_lds_block, b_lds_block] = Base::GetABLdsTensorViews(p_smem); + auto ab_lds_blocks = Base::GetABLdsTensorViews(p_smem); + auto& a_lds_block = ab_lds_blocks.at(I0{}); + auto& b_lds_block = ab_lds_blocks.at(I1{}); // A DRAM tile window for load - auto a_copy_dram_window = - make_tile_window(a_dram_block_window_tmp.get_bottom_tensor_view(), - make_tuple(number{}, number{}), - a_dram_block_window_tmp.get_window_origin(), - Policy::template MakeADramTileDistribution()); - // A LDS tile window for store - auto a_copy_lds_window = - make_tile_window(a_lds_block, - make_tuple(number{}, number{}), - {0, 0}, - a_copy_dram_window.get_tile_distribution()); - // B DRAM tile window for load - auto b_copy_dram_window = - make_tile_window(b_dram_block_window_tmp.get_bottom_tensor_view(), - make_tuple(number{}, number{}), - b_dram_block_window_tmp.get_window_origin(), - Policy::template MakeBDramTileDistribution()); + // A LDS tile for block GEMM + auto a_windows = Base::GetAWindows(a_dram_block_window_tmp, a_lds_block); + auto& a_copy_dram_window = a_windows.at(I0{}); + auto& a_copy_lds_window = a_windows.at(I1{}); + auto& a_lds_gemm_window = a_windows.at(I2{}); + // B DRAM tile window for load // B LDS tile window for store - auto b_copy_lds_window = - make_tile_window(b_lds_block, - make_tuple(number{}, number{}), - {0, 0}, - b_copy_dram_window.get_tile_distribution()); - - // A LDS tile for block GEMM - auto a_lds_gemm_window = make_tile_window( - a_lds_block, make_tuple(number{}, number{}), {0, 0}); // B LDS tile for block GEMM - auto b_lds_gemm_window = make_tile_window( - b_lds_block, make_tuple(number{}, number{}), {0, 0}); + auto b_windows = Base::GetBWindows(b_dram_block_window_tmp, b_lds_block); + auto& b_copy_dram_window = b_windows.at(I0{}); + auto& b_copy_lds_window = b_windows.at(I1{}); + auto& b_lds_gemm_window = b_windows.at(I2{}); // Block GEMM auto block_gemm = BlockGemm(); 
@@ -266,20 +215,20 @@ struct GemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem // prefetch // global read 0 - GlobalPrefetch(a_block_tiles.get(I0{}), a_copy_dram_window); - GlobalPrefetch(b_block_tiles.get(I0{}), b_copy_dram_window); + Base::GlobalPrefetch(a_block_tiles.get(I0{}), a_copy_dram_window); + Base::GlobalPrefetch(b_block_tiles.get(I0{}), b_copy_dram_window); // initialize C tile_elementwise_inout([](auto& c) { c = 0; }, c_block_tile); // LDS write 0 - LocalPrefill(a_copy_lds_window, a_block_tiles.get(I0{}), a_element_func); - LocalPrefill(b_copy_lds_window, b_block_tiles.get(I0{}), b_element_func); + Base::LocalPrefill(a_copy_lds_window, a_block_tiles.get(I0{}), a_element_func); + Base::LocalPrefill(b_copy_lds_window, b_block_tiles.get(I0{}), b_element_func); // Global prefetch [1, PrefetchStages] static_for<1, PrefetchStages, 1>{}([&](auto prefetch_idx) { - GlobalPrefetch(a_block_tiles.get(number{}), a_copy_dram_window); - GlobalPrefetch(b_block_tiles.get(number{}), b_copy_dram_window); + Base::GlobalPrefetch(a_block_tiles.get(number{}), a_copy_dram_window); + Base::GlobalPrefetch(b_block_tiles.get(number{}), b_copy_dram_window); }); // main body @@ -295,19 +244,19 @@ struct GemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem block_sync_lds(); - LocalPrefill( + Base::LocalPrefill( a_copy_lds_window, a_block_tiles.get(number<(prefetch_idx + 1) % PrefetchStages>{}), a_element_func); - LocalPrefill( + Base::LocalPrefill( b_copy_lds_window, b_block_tiles.get(number<(prefetch_idx + 1) % PrefetchStages>{}), b_element_func); - GlobalPrefetch(a_block_tiles.get(number{}), - a_copy_dram_window); - GlobalPrefetch(b_block_tiles.get(number{}), - b_copy_dram_window); + Base::GlobalPrefetch(a_block_tiles.get(number{}), + a_copy_dram_window); + Base::GlobalPrefetch(b_block_tiles.get(number{}), + b_copy_dram_window); }); i += PrefetchStages; @@ -323,12 +272,12 @@ struct GemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem block_sync_lds(); - 
LocalPrefill(a_copy_lds_window, - a_block_tiles.get(number{}), - a_element_func); - LocalPrefill(b_copy_lds_window, - b_block_tiles.get(number{}), - b_element_func); + Base::LocalPrefill(a_copy_lds_window, + a_block_tiles.get(number{}), + a_element_func); + Base::LocalPrefill(b_copy_lds_window, + b_block_tiles.get(number{}), + b_element_func); }); block_sync_lds(); @@ -376,24 +325,9 @@ struct GemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem }; template <> - struct PipelineImpl + struct PipelineImpl : public PipelineImplBase { - template - CK_TILE_DEVICE void GlobalPrefetch(DstBlockTile& dst_block_tile, - SrcTileWindow& dram_tile_window) const - { - load_tile(dst_block_tile, dram_tile_window); - move_tile_window(dram_tile_window, {0, KPerBlock}); - } - - template - CK_TILE_DEVICE void LocalPrefill(DstTileWindow& lds_tile_window, - const SrcBlockTile& src_block_tile, - const ElementFunction& element_func) const - { - const auto block_tile_tmp = tile_elementwise_in(element_func, src_block_tile); - store_tile(lds_tile_window, block_tile_tmp); - } + using Base = PipelineImplBase; template "A/B Dram block window should have the same data type as appropriate " "([A|B]DataType) defined in Problem definition!"); - static_assert(MPerBlock == ADramBlockWindowTmp{}.get_window_lengths()[number<0>{}] && - NPerBlock == - BDramBlockWindowTmp{}.get_window_lengths()[number<0>{}] && - KPerBlock == ADramBlockWindowTmp{}.get_window_lengths()[number<1>{}], + static_assert(MPerBlock == ADramBlockWindowTmp{}.get_window_lengths()[I0{}] && + NPerBlock == BDramBlockWindowTmp{}.get_window_lengths()[I0{}] && + KPerBlock == ADramBlockWindowTmp{}.get_window_lengths()[I1{}], "A/B block window appropriate sizes must be equal to MPerBlock/NPerblock" " or KPerBlock!"); // ------------------------------------------------------------------------------------ // Definitions of all needed tiles - // A tile in LDS - ADataType* p_a_lds = static_cast(p_smem); - constexpr auto a_lds_block_desc = 
Policy::template MakeALdsBlockDescriptor(); - auto a_lds_block = make_tensor_view(p_a_lds, a_lds_block_desc); - - // TODO: LDS alignment should come from Policy! - constexpr index_t a_lds_block_space_size_aligned = - integer_divide_ceil(sizeof(ADataType) * a_lds_block_desc.get_element_space_size(), - 16) * - 16; - - // B tile in LDS - BDataType* p_b_lds = static_cast( - static_cast(static_cast(p_smem) + a_lds_block_space_size_aligned)); - constexpr auto b_lds_block_desc = Policy::template MakeBLdsBlockDescriptor(); - auto b_lds_block = make_tensor_view(p_b_lds, b_lds_block_desc); + // A/B tiles in LDS + // With c++20 could simplify to below line. + // Currently get error: captured structured bindings are a C++20 extension + // auto&& [a_lds_block, b_lds_block] = Base::GetABLdsTensorViews(p_smem); + auto ab_lds_blocks = Base::GetABLdsTensorViews(p_smem); + auto& a_lds_block = ab_lds_blocks.at(I0{}); + auto& b_lds_block = ab_lds_blocks.at(I1{}); // A DRAM tile window for load - auto a_copy_dram_window = - make_tile_window(a_dram_block_window_tmp.get_bottom_tensor_view(), - make_tuple(number{}, number{}), - a_dram_block_window_tmp.get_window_origin(), - Policy::template MakeADramTileDistribution()); - // A LDS tile window for store - auto a_copy_lds_window = - make_tile_window(a_lds_block, - make_tuple(number{}, number{}), - {0, 0}, - a_copy_dram_window.get_tile_distribution()); - // B DRAM tile window for load - auto b_copy_dram_window = - make_tile_window(b_dram_block_window_tmp.get_bottom_tensor_view(), - make_tuple(number{}, number{}), - b_dram_block_window_tmp.get_window_origin(), - Policy::template MakeBDramTileDistribution()); + // A LDS tile for block GEMM + auto a_windows = Base::GetAWindows(a_dram_block_window_tmp, a_lds_block); + auto& a_copy_dram_window = a_windows.at(I0{}); + auto& a_copy_lds_window = a_windows.at(I1{}); + auto& a_lds_gemm_window = a_windows.at(I2{}); + // B DRAM tile window for load // B LDS tile window for store - auto b_copy_lds_window 
= - make_tile_window(b_lds_block, - make_tuple(number{}, number{}), - {0, 0}, - b_copy_dram_window.get_tile_distribution()); - - // A LDS tile for block GEMM - auto a_lds_gemm_window = make_tile_window( - a_lds_block, make_tuple(number{}, number{}), {0, 0}); // B LDS tile for block GEMM - auto b_lds_gemm_window = make_tile_window( - b_lds_block, make_tuple(number{}, number{}), {0, 0}); + auto b_windows = Base::GetBWindows(b_dram_block_window_tmp, b_lds_block); + auto& b_copy_dram_window = b_windows.at(I0{}); + auto& b_copy_lds_window = b_windows.at(I1{}); + auto& b_lds_gemm_window = b_windows.at(I2{}); // Block GEMM auto block_gemm = BlockGemm(); @@ -496,20 +402,20 @@ struct GemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem // prefetch // global read 0 - GlobalPrefetch(a_block_tiles.get(I0{}), a_copy_dram_window); - GlobalPrefetch(b_block_tiles.get(I0{}), b_copy_dram_window); + Base::GlobalPrefetch(a_block_tiles.get(I0{}), a_copy_dram_window); + Base::GlobalPrefetch(b_block_tiles.get(I0{}), b_copy_dram_window); // initialize C tile_elementwise_inout([](auto& c) { c = 0; }, c_block_tile); // LDS write 0 - LocalPrefill(a_copy_lds_window, a_block_tiles.get(I0{}), a_element_func); - LocalPrefill(b_copy_lds_window, b_block_tiles.get(I0{}), b_element_func); + Base::LocalPrefill(a_copy_lds_window, a_block_tiles.get(I0{}), a_element_func); + Base::LocalPrefill(b_copy_lds_window, b_block_tiles.get(I0{}), b_element_func); // Global prefetch [1, PrefetchStages] static_for<1, PrefetchStages, 1>{}([&](auto prefetch_idx) { - GlobalPrefetch(a_block_tiles.get(number{}), a_copy_dram_window); - GlobalPrefetch(b_block_tiles.get(number{}), b_copy_dram_window); + Base::GlobalPrefetch(a_block_tiles.get(number{}), a_copy_dram_window); + Base::GlobalPrefetch(b_block_tiles.get(number{}), b_copy_dram_window); }); // main body @@ -523,19 +429,19 @@ struct GemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem block_gemm(c_block_tile, a_lds_gemm_window, b_lds_gemm_window); // no 
second block_sync_lds because it's interwave - LocalPrefill( + Base::LocalPrefill( a_copy_lds_window, a_block_tiles.get(number<(prefetch_idx + 1) % PrefetchStages>{}), a_element_func); - LocalPrefill( + Base::LocalPrefill( b_copy_lds_window, b_block_tiles.get(number<(prefetch_idx + 1) % PrefetchStages>{}), b_element_func); - GlobalPrefetch(a_block_tiles.get(number{}), - a_copy_dram_window); - GlobalPrefetch(b_block_tiles.get(number{}), - b_copy_dram_window); + Base::GlobalPrefetch(a_block_tiles.get(number{}), + a_copy_dram_window); + Base::GlobalPrefetch(b_block_tiles.get(number{}), + b_copy_dram_window); }); i += PrefetchStages; @@ -548,12 +454,12 @@ struct GemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem block_gemm(c_block_tile, a_lds_gemm_window, b_lds_gemm_window); // no second block_sync_lds because it's interwave - LocalPrefill(a_copy_lds_window, - a_block_tiles.get(number{}), - a_element_func); - LocalPrefill(b_copy_lds_window, - b_block_tiles.get(number{}), - b_element_func); + Base::LocalPrefill(a_copy_lds_window, + a_block_tiles.get(number{}), + a_element_func); + Base::LocalPrefill(b_copy_lds_window, + b_block_tiles.get(number{}), + b_element_func); }); block_sync_lds(); -- GitLab From aa6e2087f550be335e7b14893ee615303eec3faa Mon Sep 17 00:00:00 2001 From: Illia Silin <98187287+illsilin@users.noreply.github.com> Date: Thu, 28 Nov 2024 10:42:19 -0800 Subject: [PATCH 089/153] Reduce docker size and build time in CI. 
(#1699) * refactor docker build in CI * add Dockerfile.compiler * add input args to Dockerfile.compiler * rearrange the docker args --- Dockerfile | 4 ---- Dockerfile.compiler | 26 ++++++++++++++++++++++++++ Jenkinsfile | 45 +++++++++++++++++++++++++-------------------- 3 files changed, 51 insertions(+), 24 deletions(-) create mode 100644 Dockerfile.compiler diff --git a/Dockerfile b/Dockerfile index 38a563ce3..f9b7d76e3 100644 --- a/Dockerfile +++ b/Dockerfile @@ -77,10 +77,6 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow- # Remove unnecessary rocm components that take a lot of space apt-get remove -y rocblas rocfft rocsparse composablekernel-dev -# hipTensor requires rocm-llvm-dev for rocm versions > 6.0.1 -RUN if [ "$ROCMVERSION" = "6.1" ]; then \ - sh -c "apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated rocm-llvm-dev"; \ - fi # Update the cmake to version 3.27.5 RUN pip install --upgrade cmake==3.27.5 && \ #Install latest ccache diff --git a/Dockerfile.compiler b/Dockerfile.compiler new file mode 100644 index 000000000..354b71f69 --- /dev/null +++ b/Dockerfile.compiler @@ -0,0 +1,26 @@ +ARG BASE_DOCKER="rocm/composable_kernel:ck_ub20.04_rocm6.2" +FROM $BASE_DOCKER +ARG compiler_version="" +ARG compiler_commit="" + +# Add alternative compilers, if necessary +ENV compiler_version=$compiler_version +ENV compiler_commit=$compiler_commit +RUN sh -c "echo compiler version = '$compiler_version'" && \ + sh -c "echo compiler commit = '$compiler_commit'" + +RUN if ( [ "$compiler_version" = "amd-staging" ] || [ "$compiler_version" = "amd-mainline" ] ) && [ "$compiler_commit" = "" ]; then \ + git clone -b "$compiler_version" https://github.com/ROCm/llvm-project.git && \ + cd llvm-project && mkdir build && cd build && \ + cmake -DCMAKE_INSTALL_PREFIX=/opt/rocm/llvm -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=1 -DLLVM_TARGETS_TO_BUILD="AMDGPU;X86" -DLLVM_ENABLE_PROJECTS="clang;lld" 
-DLLVM_ENABLE_RUNTIMES="compiler-rt" ../llvm && \ + make -j 16 ; \ + else echo "using the release compiler"; \ + fi + +RUN if ( [ "$compiler_version" = "amd-staging" ] || [ "$compiler_version" = "amd-mainline" ] ) && [ "$compiler_commit" != "" ]; then \ + git clone -b "$compiler_version" https://github.com/ROCm/llvm-project.git && \ + cd llvm-project && git checkout "$compiler_commit" && echo "checking out commit $compiler_commit" && mkdir build && cd build && \ + cmake -DCMAKE_INSTALL_PREFIX=/opt/rocm/llvm -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=1 -DLLVM_TARGETS_TO_BUILD="AMDGPU;X86" -DLLVM_ENABLE_PROJECTS="clang;lld" -DLLVM_ENABLE_RUNTIMES="compiler-rt" ../llvm && \ + make -j 16 ; \ + else echo "using the release compiler"; \ + fi diff --git a/Jenkinsfile b/Jenkinsfile index b448a5130..f8493fa2f 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -32,41 +32,42 @@ def runShell(String command){ return (output != "") } -def getDockerImageName(){ +def getBaseDockerImageName(){ def img if (params.USE_CUSTOM_DOCKER != ""){ img = "${params.USE_CUSTOM_DOCKER}" } else{ if (params.ROCMVERSION != "6.3"){ - if (params.COMPILER_VERSION == "") { - img = "${env.CK_DOCKERHUB}:ck_ub20.04_rocm${params.ROCMVERSION}" - } - else{ - if (params.COMPILER_COMMIT == ""){ - img = "${env.CK_DOCKERHUB}:ck_ub20.04_rocm${params.ROCMVERSION}_${params.COMPILER_VERSION}" - } - else{ - def commit = "${params.COMPILER_COMMIT}"[0..6] - img = "${env.CK_DOCKERHUB}:ck_ub20.04_rocm${params.ROCMVERSION}_${params.COMPILER_VERSION}_${commit}" - } - } + img = "${env.CK_DOCKERHUB}:ck_ub20.04_rocm${params.ROCMVERSION}" + } + else{ + img = "${env.CK_DOCKERHUB_PRIVATE}:ck_ub20.04_rocm${params.ROCMVERSION}" + } + } + return img +} + +def getDockerImageName(){ + def img + def base_name = getBaseDockerImageName() + if (params.USE_CUSTOM_DOCKER != ""){ + img = "${params.USE_CUSTOM_DOCKER}" } else{ if (params.COMPILER_VERSION == "") { - img = "${env.CK_DOCKERHUB_PRIVATE}:ck_ub20.04_rocm${params.ROCMVERSION}" + 
img = "${base_name}" } else{ if (params.COMPILER_COMMIT == ""){ - img = "${env.CK_DOCKERHUB_PRIVATE}:ck_ub20.04_rocm${params.ROCMVERSION}_${params.COMPILER_VERSION}" + img = "${base_name}_${params.COMPILER_VERSION}" } else{ def commit = "${params.COMPILER_COMMIT}"[0..6] - img = "${env.CK_DOCKERHUB_PRIVATE}:ck_ub20.04_rocm${params.ROCMVERSION}_${params.COMPILER_VERSION}_${commit}" + img = "${base_name}_${params.COMPILER_VERSION}_${commit}" } } } - } return img } @@ -131,17 +132,21 @@ def buildDocker(install_prefix){ env.DOCKER_BUILDKIT=1 checkout scm def image_name = getDockerImageName() + def base_image_name = getBaseDockerImageName() echo "Building Docker for ${image_name}" - def dockerArgs = "--squash --build-arg BUILDKIT_INLINE_CACHE=1 --build-arg PREFIX=${install_prefix} --build-arg CK_SCCACHE='${env.CK_SCCACHE}' --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' --build-arg DISABLE_CACHE='git rev-parse ${params.COMPILER_VERSION}' " + def dockerArgs = "--build-arg PREFIX=${install_prefix} --build-arg CK_SCCACHE='${env.CK_SCCACHE}' --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' " if(params.COMPILER_VERSION == "amd-staging" || params.COMPILER_VERSION == "amd-mainline" || params.COMPILER_COMMIT != ""){ - dockerArgs = dockerArgs + " --no-cache " + dockerArgs = dockerArgs + " --no-cache --build-arg BASE_DOCKER='${base_image_name}' -f Dockerfile.compiler . " + } + else{ + dockerArgs = dockerArgs + " -f Dockerfile . 
" } echo "Build Args: ${dockerArgs}" try{ if(params.BUILD_DOCKER){ //force building the new docker if that parameter is true echo "Building image: ${image_name}" - retimage = docker.build("${image_name}", dockerArgs + ' .') + retimage = docker.build("${image_name}", dockerArgs) withDockerRegistry([ credentialsId: "docker_test_cred", url: "" ]) { retimage.push() } -- GitLab From bb652696e765fe178404bd38a071d6d6b829bccb Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 28 Nov 2024 10:43:36 -0800 Subject: [PATCH 090/153] Bump rocm-docs-core from 1.9.0 to 1.9.1 in /docs/sphinx (#1701) Bumps [rocm-docs-core](https://github.com/ROCm/rocm-docs-core) from 1.9.0 to 1.9.1. - [Release notes](https://github.com/ROCm/rocm-docs-core/releases) - [Changelog](https://github.com/ROCm/rocm-docs-core/blob/develop/CHANGELOG.md) - [Commits](https://github.com/ROCm/rocm-docs-core/compare/v1.9.0...v1.9.1) --- updated-dependencies: - dependency-name: rocm-docs-core dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- docs/sphinx/requirements.in | 2 +- docs/sphinx/requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/sphinx/requirements.in b/docs/sphinx/requirements.in index 5bec504a0..79c74cd7f 100644 --- a/docs/sphinx/requirements.in +++ b/docs/sphinx/requirements.in @@ -1,2 +1,2 @@ -rocm-docs-core==1.9.0 +rocm-docs-core==1.9.1 sphinxcontrib-bibtex==2.6.3 diff --git a/docs/sphinx/requirements.txt b/docs/sphinx/requirements.txt index 8881c0e74..426073037 100644 --- a/docs/sphinx/requirements.txt +++ b/docs/sphinx/requirements.txt @@ -103,7 +103,7 @@ requests==2.32.3 # via # pygithub # sphinx -rocm-docs-core==1.9.0 +rocm-docs-core==1.9.1 # via -r requirements.in six==1.16.0 # via pybtex -- GitLab From 78f0fea08eafa7e3da49cbb3d77c962cecb3ae0b Mon Sep 17 00:00:00 2001 From: aledudek Date: Fri, 29 Nov 2024 11:52:18 +0100 Subject: [PATCH 091/153] Ck tile batched gemm example (#1615) * [CK Tile] Batched GEMM Example * [CK Tile] Batched GEMM Example - minor refactor * [CK Tile] Batched GEMM Example - README update * [CK Tile] Batched Gemm Example - review changes - Added tensor data layours as input parameters - Changed structure of Host and Kernel args - Removed bug with invalid vector read on non-contiguous memory * [CK Tile] Batched Gemm Example - remove comment * [CK Tile] Batched Gemm Example - Add GTests part1 * [CK Tile] Batched Gemm Example - GTests part2 + review changes * [CK TILE] Batched GEMM post merge fixes * [CK Tile] Batched GEMM Example - fix pad views --- .../ck_tile/16_batched_gemm/CMakeLists.txt | 1 + example/ck_tile/16_batched_gemm/README.md | 37 +++ .../ck_tile/16_batched_gemm/batched_gemm.cpp | 103 +++++++ .../ck_tile/16_batched_gemm/batched_gemm.hpp | 63 +++++ .../run_batched_gemm_example.inc | 253 +++++++++++++++++ example/ck_tile/CMakeLists.txt | 2 +- .../ck_tile/host/reference/reference_gemm.hpp | 112 
++++++++ include/ck_tile/ops/gemm.hpp | 1 + .../ops/gemm/kernel/batched_gemm_kernel.hpp | 258 ++++++++++++++++++ .../gemm_pipeline_agmem_bgmem_creg_v1.hpp | 2 +- test/ck_tile/CMakeLists.txt | 1 + test/ck_tile/batched_gemm/CMakeLists.txt | 4 + .../batched_gemm/test_batched_gemm.cpp | 29 ++ .../test_batched_gemm_ut_cases.inc | 9 + .../batched_gemm/test_batched_gemm_util.hpp | 225 +++++++++++++++ 15 files changed, 1098 insertions(+), 2 deletions(-) create mode 100644 example/ck_tile/16_batched_gemm/CMakeLists.txt create mode 100644 example/ck_tile/16_batched_gemm/README.md create mode 100644 example/ck_tile/16_batched_gemm/batched_gemm.cpp create mode 100644 example/ck_tile/16_batched_gemm/batched_gemm.hpp create mode 100644 example/ck_tile/16_batched_gemm/run_batched_gemm_example.inc create mode 100644 include/ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp create mode 100644 test/ck_tile/batched_gemm/CMakeLists.txt create mode 100644 test/ck_tile/batched_gemm/test_batched_gemm.cpp create mode 100644 test/ck_tile/batched_gemm/test_batched_gemm_ut_cases.inc create mode 100644 test/ck_tile/batched_gemm/test_batched_gemm_util.hpp diff --git a/example/ck_tile/16_batched_gemm/CMakeLists.txt b/example/ck_tile/16_batched_gemm/CMakeLists.txt new file mode 100644 index 000000000..78e78c6b0 --- /dev/null +++ b/example/ck_tile/16_batched_gemm/CMakeLists.txt @@ -0,0 +1 @@ +add_executable(tile_example_batched_gemm EXCLUDE_FROM_ALL batched_gemm.cpp) diff --git a/example/ck_tile/16_batched_gemm/README.md b/example/ck_tile/16_batched_gemm/README.md new file mode 100644 index 000000000..34b56db52 --- /dev/null +++ b/example/ck_tile/16_batched_gemm/README.md @@ -0,0 +1,37 @@ +# Batched GEMM + +This folder contains example for batched GEMM using ck_tile tile-programming implementation. 
+ +## build +``` +# in the root of ck_tile +mkdir build && cd build +# you can replace with the appropriate architecture (for example gfx90a or gfx942) or leave it blank +sh ../script/cmake-ck-dev.sh ../ +make tile_example_batched_gemm -j +``` +This will result in an executable `build/bin/tile_example_batched_gemm` + +## example +``` +args: + -m m dimension (default:256) + -n n dimension (default:128) + -k k dimension (default:128) + -a_layout A tensor data layout (default:R) (R for Row, C for Col) + -b_layout B tensor data layout (default:R) (R for Row, C for Col) + -c_layout C tensor data layout (default:R) (R for Row, C for Col) + -stride_a Tensor A stride (default:128) + -stride_b Tensor B stride (default:128) + -stride_c Tensor C stride (default:128) + -batch_stride_a Batch A stride (default:32768) + -batch_stride_b Batch B stride (default:16384) + -batch_stride_c Batch C stride (default:32768) + -batch_count Batch count (default:16) + -v 0. No validation, 1. Validation on CPU, 2. Validation on GPU (default:2) + -e Absolute error tolerance (default:1e-5) + -prec data type. fp16/bf16/fp8/bf8 (default:fp16) + -warmup number of iterations before benchmark the kernel (default:10) + -repeat number of iterations to benchmark the kernel (default:100) + -timer gpu:gpu timer, cpu:cpu timer (default:gpu) +``` \ No newline at end of file diff --git a/example/ck_tile/16_batched_gemm/batched_gemm.cpp b/example/ck_tile/16_batched_gemm/batched_gemm.cpp new file mode 100644 index 000000000..bfdd74126 --- /dev/null +++ b/example/ck_tile/16_batched_gemm/batched_gemm.cpp @@ -0,0 +1,103 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include + +#include +#include +#include +#include +#include + +#include "ck_tile/core.hpp" +#include "ck_tile/ops/epilogue.hpp" +#include "ck_tile/ops/gemm.hpp" +#include "ck_tile/host.hpp" +#include "batched_gemm.hpp" + +template +float batched_gemm(const batched_gemm_kargs& args, const ck_tile::stream_config& s) +{ + // The kPadM, kPadN, kPadK & kBlockPerCu should also come from the Codegen part. + constexpr bool kPadM = false; + constexpr bool kPadN = false; + constexpr bool kPadK = false; + constexpr bool kTilePermute = false; + // The rank and permutation will also be generate out by the CodeGen part. + constexpr ck_tile::index_t kOutputRank = 2; + + constexpr int kBlockPerCu = 1; + + // This part comes from the Codegen + constexpr ck_tile::index_t M_Tile = 128; + constexpr ck_tile::index_t N_Tile = 128; + constexpr ck_tile::index_t K_Tile = 32; + + constexpr ck_tile::index_t M_Warp = 2; + constexpr ck_tile::index_t N_Warp = 2; + constexpr ck_tile::index_t K_Warp = 1; + + constexpr ck_tile::index_t M_Warp_Tile = 32; + constexpr ck_tile::index_t N_Warp_Tile = 32; + constexpr ck_tile::index_t K_Warp_Tile = 8; + + // Whether doing the CShuffle (transpose before the global memory), depending on the output + // layout. + constexpr bool CShuffleEpilogue = + std::is_same_v; + + using CodegenGemmShape = + ck_tile::TileGemmShape, + ck_tile::sequence, + ck_tile::sequence>; + + using TilePartitioner = ck_tile::GemmTilePartitioner; + + using GemmEpilogue = std::conditional_t< + CShuffleEpilogue, + ck_tile::CShuffleEpilogue>, + ck_tile::Default2DEpilogue< + ck_tile::Default2DEpilogueProblem>>; + + using CodegenGemmTraits = + ck_tile::TileGemmTraits; + + using CodegenPipelineProblem = ck_tile:: + GemmPipelineProblem; + + using CodegenGemmPipeline = ck_tile::GemmPipelineAGmemBGmemCRegV1; + // ToDo: Will add the codegen part to test different pipeline policies in GEMM. + // Now we only use the BlockGemmASmemBSmemCRegV1DefaultPolicy. 
+ using Kernel = ck_tile::BatchedGemmKernel; + + auto kargs = Kernel::MakeKargs(args); + + const dim3 grids = Kernel::GridSize(args); + constexpr dim3 blocks = Kernel::BlockSize(); + + if(s.log_level_ > 0) + { + std::cout << "Launching kernel with args:" + << " grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}" + << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z << "}" + << std::endl; + } + + float ave_time = ck_tile::launch_kernel( + s, ck_tile::make_kernel(Kernel{}, grids, blocks, 0, kargs)); + + return ave_time; +} + +#include "run_batched_gemm_example.inc" + +int main(int argc, char* argv[]) { return !run_batched_gemm_example(argc, argv); } diff --git a/example/ck_tile/16_batched_gemm/batched_gemm.hpp b/example/ck_tile/16_batched_gemm/batched_gemm.hpp new file mode 100644 index 000000000..e252c0f67 --- /dev/null +++ b/example/ck_tile/16_batched_gemm/batched_gemm.hpp @@ -0,0 +1,63 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include + +#include "ck_tile/core.hpp" +#include "ck_tile/host/kernel_launch.hpp" +#include "ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp" + +template +struct BatchedGemmTypeConfig; + +template <> +struct BatchedGemmTypeConfig +{ + using ADataType = ck_tile::half_t; + using BDataType = ck_tile::half_t; + using AccDataType = float; + using CDataType = ck_tile::half_t; +}; + +using Types = BatchedGemmTypeConfig; + +// Specific type aliases for easy access +using ADataType = Types::ADataType; +using BDataType = Types::BDataType; +using AccDataType = Types::AccDataType; +using CDataType = Types::CDataType; + +struct batched_gemm_kargs : public ck_tile::BatchedGemmHostArgs +{ +}; + +auto create_args(int argc, char* argv[]) +{ + ck_tile::ArgParser arg_parser; + arg_parser.insert("m", "256", "m dimension") + .insert("n", "128", "n dimension") + .insert("k", "128", "k dimension") + .insert("stride_a", "0", "Tensor A stride") + .insert("stride_b", "0", "Tensor B stride") + .insert("stride_c", "0", "Tensor C stride") + .insert("a_layout", "R", "A tensor data layout - Row by default") + .insert("b_layout", "R", "B tensor data layout - Row by default") + .insert("c_layout", "R", "C tensor data layout - Row by default") + .insert("batch_stride_a", "32768", "Batch A stride") + .insert("batch_stride_b", "16384", "Batch B stride") + .insert("batch_stride_c", "32768", "Batch C stride") + .insert("batch_count", "16", "Batch count") + .insert("v", "2", "0. No validation, 1. Validation on CPU, 2. Validation on GPU") + .insert("prec", "fp16", "data type. 
fp16/bf16/fp8/bf8") + .insert("warmup", "50", "number of iterations before benchmark the kernel") + .insert("repeat", "100", "number of iterations to benchmark the kernel") + .insert("timer", "gpu", "gpu:gpu timer, cpu:cpu timer"); + + bool result = arg_parser.parse(argc, argv); + return std::make_tuple(result, arg_parser); +} + +// host API +float batched_gemm(batched_gemm_kargs args, const ck_tile::stream_config& s); diff --git a/example/ck_tile/16_batched_gemm/run_batched_gemm_example.inc b/example/ck_tile/16_batched_gemm/run_batched_gemm_example.inc new file mode 100644 index 000000000..dacca2042 --- /dev/null +++ b/example/ck_tile/16_batched_gemm/run_batched_gemm_example.inc @@ -0,0 +1,253 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +template +float invoke_batched_gemm(ck_tile::DeviceMem& a_m_k_dev_buf, + ck_tile::DeviceMem& b_k_n_dev_buf, + ck_tile::DeviceMem& c_m_n_dev_buf, + ck_tile::index_t M, + ck_tile::index_t N, + ck_tile::index_t K, + ck_tile::index_t stride_A, + ck_tile::index_t stride_B, + ck_tile::index_t stride_C, + ck_tile::index_t batch_stride_A, + ck_tile::index_t batch_stride_B, + ck_tile::index_t batch_stride_C, + ck_tile::index_t batch_count, + int n_warmup, + int n_repeat) +{ + batched_gemm_kargs args; + args.a_ptr = a_m_k_dev_buf.GetDeviceBuffer(); + args.b_ptr = b_k_n_dev_buf.GetDeviceBuffer(); + args.c_ptr = c_m_n_dev_buf.GetDeviceBuffer(); + args.M = M; + args.N = N; + args.K = K; + args.stride_A = stride_A; + args.stride_B = stride_B; + args.stride_C = stride_C; + args.batch_stride_A = batch_stride_A; + args.batch_stride_B = batch_stride_B; + args.batch_stride_C = batch_stride_C; + args.batch_count = batch_count; + + float ave_time = batched_gemm( + args, ck_tile::stream_config{nullptr, true, 1, n_warmup, n_repeat}); + + std::string op_name{"Batched Gemm"}; + std::size_t flop = std::size_t(2) * batch_count * M * N * K; + std::size_t num_byte = 
sizeof(ADataType) * batch_count * M * K + + sizeof(BDataType) * batch_count * N * K + + sizeof(CDataType) * batch_count * M * N; + float tflops = static_cast(flop) / 1.E9 / ave_time; + float gb_per_sec = num_byte / 1.E6 / ave_time; + + std::cout << "Run " << op_name << "kernel with M =" << M << " N =" << N << " K =" << K + << " StrideA =" << stride_A << " StrideB =" << stride_B << " StrideC =" << stride_C + << " batch_stride_A =" << batch_stride_A << " batch_stride_B =" << batch_stride_B + << " batch_stride_C =" << batch_stride_C << " batch_count =" << batch_count << " : " + << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, " + << std::endl; + + return ave_time; +} + +template +int run_batched_gemm_example_with_layouts(int argc, + char* argv[], + const ALayout a_layout = ALayout{}, + const BLayout b_layout = BLayout{}, + [[maybe_unused]] const CLayout c_layout = CLayout{}) +{ + auto [result, arg_parser] = create_args(argc, argv); + if(!result) + return -1; + + ck_tile::index_t M = arg_parser.get_int("m"); + ck_tile::index_t N = arg_parser.get_int("n"); + ck_tile::index_t K = arg_parser.get_int("k"); + + ck_tile::index_t stride_A = arg_parser.get_int("stride_a"); + ck_tile::index_t stride_B = arg_parser.get_int("stride_b"); + ck_tile::index_t stride_C = arg_parser.get_int("stride_c"); + + ck_tile::index_t batch_stride_A = arg_parser.get_int("batch_stride_a"); + ck_tile::index_t batch_stride_B = arg_parser.get_int("batch_stride_b"); + ck_tile::index_t batch_stride_C = arg_parser.get_int("batch_stride_c"); + ck_tile::index_t batch_count = arg_parser.get_int("batch_count"); + + int n_warmup = arg_parser.get_int("warmup"); + int n_repeat = arg_parser.get_int("repeat"); + + using namespace ck_tile::literals; + + auto f_host_tensor_descriptor = [](std::size_t batch_count_, + std::size_t row, + std::size_t col, + std::size_t stride, + std::size_t batch_stride, + auto layout) { + if constexpr(std::is_same_v) + { + return 
ck_tile::HostTensorDescriptor({batch_count_, row, col}, + {batch_stride, stride, 1_uz}); + } + else + { + return ck_tile::HostTensorDescriptor({batch_count_, row, col}, + {batch_stride, 1_uz, stride}); + } + }; + + auto f_get_default_stride = [](std::size_t row, + std::size_t col, + std::size_t stride, + auto layout) { + if(stride == 0) + { + // give a chance if stride is zero, return a default packed stride + if constexpr(std::is_same_v) + { + return col; + } + else + { + return row; + } + } + else + return stride; + }; + + stride_A = f_get_default_stride(M, K, stride_A, a_layout); + stride_B = f_get_default_stride(K, N, stride_B, b_layout); + stride_C = f_get_default_stride(M, N, stride_C, c_layout); + + ck_tile::HostTensor a_m_k( + f_host_tensor_descriptor(batch_count, M, K, stride_A, batch_stride_A, a_layout)); + ck_tile::HostTensor b_k_n( + f_host_tensor_descriptor(batch_count, K, N, stride_B, batch_stride_B, b_layout)); + ck_tile::HostTensor c_m_n_dev_result( + f_host_tensor_descriptor(batch_count, M, N, stride_C, batch_stride_C, c_layout)); + + ck_tile::FillUniformDistribution{-5.f, 5.f}(a_m_k); + ck_tile::FillUniformDistribution{-5.f, 5.f}(b_k_n); + + ck_tile::DeviceMem a_m_k_dev_buf(a_m_k.get_element_space_size_in_bytes()); + ck_tile::DeviceMem b_k_n_dev_buf(b_k_n.get_element_space_size_in_bytes()); + ck_tile::DeviceMem c_m_n_dev_buf(c_m_n_dev_result.get_element_space_size_in_bytes()); + + a_m_k_dev_buf.ToDevice(a_m_k.data()); + b_k_n_dev_buf.ToDevice(b_k_n.data()); + c_m_n_dev_buf.SetZero(); + c_m_n_dev_result.SetZero(); + + invoke_batched_gemm(a_m_k_dev_buf, + b_k_n_dev_buf, + c_m_n_dev_buf, + M, + N, + K, + stride_A, + stride_B, + stride_C, + batch_stride_A, + batch_stride_B, + batch_stride_C, + batch_count, + n_warmup, + n_repeat); + + c_m_n_dev_buf.FromDevice(c_m_n_dev_result.data()); + bool pass = true; + + if(arg_parser.get_int("v") == 1) + { + ck_tile::HostTensor c_m_n_host_ref( + f_host_tensor_descriptor(batch_count, M, N, stride_C, 
batch_stride_C, CLayout{})); + c_m_n_host_ref.SetZero(); + + const auto b_n_k = b_k_n.transpose({0, 2, 1}); + + ck_tile::reference_batched_gemm( + a_m_k, b_n_k, c_m_n_host_ref); + + pass = ck_tile::check_err(c_m_n_dev_result, c_m_n_host_ref); + + std::cout << "The CPU veification result is:" << (pass ? "correct" : "fail") << std::endl; + } + else if(arg_parser.get_int("v") == 2) + { + ck_tile::HostTensor c_m_n_gpu_ref( + f_host_tensor_descriptor(batch_count, M, N, stride_C, batch_stride_C, CLayout{})); + ck_tile::DeviceMem c_m_n_gpu_buf_ref(c_m_n_gpu_ref.get_element_space_size_in_bytes()); + c_m_n_gpu_ref.SetZero(); + c_m_n_gpu_buf_ref.SetZero(); + + ck_tile::reference_batched_gemm_gpu(a_m_k_dev_buf, + b_k_n_dev_buf, + c_m_n_gpu_buf_ref, + M, + N, + K, + stride_A, + stride_B, + stride_C, + batch_stride_A, + batch_stride_B, + batch_stride_C, + batch_count); + + c_m_n_gpu_buf_ref.FromDevice(c_m_n_gpu_ref.data()); + pass = ck_tile::check_err(c_m_n_dev_result, c_m_n_gpu_ref); + + std::cout << "The GPU verification result is: " << (pass ? 
"correct" : "fail") << std::endl; + } + + return pass; +} + +int run_batched_gemm_example(int argc, char* argv[]) +{ + auto [result, arg_parser] = create_args(argc, argv); + if(!result) + return -1; + + using Row = ck_tile::tensor_layout::gemm::RowMajor; + using Col = ck_tile::tensor_layout::gemm::ColumnMajor; + + std::string a_layout = arg_parser.get_str("a_layout"); + std::string b_layout = arg_parser.get_str("b_layout"); + + if(a_layout == "R" && b_layout == "R") + { + return run_batched_gemm_example_with_layouts(argc, argv, Row{}, Row{}, Row{}); + } + else if(a_layout == "R" && b_layout == "C") + { + return run_batched_gemm_example_with_layouts(argc, argv, Row{}, Col{}, Row{}); + } + // TODO: Fixme: with latest changes to GemmPipelineAGmemBGmemCRegV1DefaultPolicy below do not + // work else if(a_layout == "C" && b_layout == "C") + // { + // return run_batched_gemm_example_with_layouts(argc, argv, Col{}, Col{}, Row{}); + // } + // else if(a_layout == "C" && b_layout == "R") + // { + // return run_batched_gemm_example_with_layouts(argc, argv, Col{}, Row{}, Row{}); + // } + else + { + throw std::runtime_error("Unsupported data layout configuration for A,B and C tensors!"); + } +} diff --git a/example/ck_tile/CMakeLists.txt b/example/ck_tile/CMakeLists.txt index 29305405b..51ebb5bf0 100644 --- a/example/ck_tile/CMakeLists.txt +++ b/example/ck_tile/CMakeLists.txt @@ -15,4 +15,4 @@ add_subdirectory(12_smoothquant) add_subdirectory(13_moe_sorting) add_subdirectory(14_moe_smoothquant) add_subdirectory(15_fused_moe) - +add_subdirectory(16_batched_gemm) diff --git a/include/ck_tile/host/reference/reference_gemm.hpp b/include/ck_tile/host/reference/reference_gemm.hpp index dbdef0e9c..8bd1f5b04 100644 --- a/include/ck_tile/host/reference/reference_gemm.hpp +++ b/include/ck_tile/host/reference/reference_gemm.hpp @@ -183,4 +183,116 @@ void reference_gemm_gpu(DeviceMem& a_device, return; } + +template +void reference_batched_gemm_gpu(DeviceMem& a_device, + DeviceMem& 
b_device, + DeviceMem& c_device, + index_t M, + index_t N, + index_t K, + index_t stride_a, + index_t stride_b, + index_t stride_c, + index_t batch_stride_A, + index_t batch_stride_B, + index_t batch_stride_C, + index_t batch_count) +{ + + ADataType* d_A; + BDataType* d_B; + CDataType* d_C; + + hipError_t errA = hipMalloc(&d_A, batch_count * M * K * sizeof(ADataType)); + hipError_t errB = hipMalloc(&d_B, batch_count * N * K * sizeof(BDataType)); + hipError_t errC = hipMalloc(&d_C, batch_count * M * N * sizeof(CDataType)); + if(errA != hipSuccess) + { + std::cerr << "Error allocating device memory for A: " << hipGetErrorString(errA) + << std::endl; + return; // Early exit on error + } + + if(errB != hipSuccess) + { + std::cerr << "Error allocating device memory for B: " << hipGetErrorString(errB) + << std::endl; + return; // Early exit on error + } + + if(errC != hipSuccess) + { + std::cerr << "Error allocating device memory for C: " << hipGetErrorString(errC) + << std::endl; + return; // Early exit on error + } + + errA = hipMemcpy(d_A, + a_device.GetDeviceBuffer(), + batch_count * M * K * sizeof(ADataType), + hipMemcpyHostToDevice); + if(errA != hipSuccess) + { + std::cerr << "Error copying A to device: " << hipGetErrorString(errA) << std::endl; + } + + errB = hipMemcpy(d_B, + b_device.GetDeviceBuffer(), + batch_count * N * K * sizeof(BDataType), + hipMemcpyHostToDevice); + if(errB != hipSuccess) + { + std::cerr << "Error copying B to device: " << hipGetErrorString(errB) << std::endl; + } + + int totalElements = M * N; + int numThreadsPerBlock = 256; // Common choice for threads per block + int numBlocks = (totalElements + numThreadsPerBlock - 1) / numThreadsPerBlock; + + for(index_t batch_id = 0; batch_id < batch_count; ++batch_id) + { + ADataType* d_ATemp = d_A + batch_id * batch_stride_A; + BDataType* d_BTemp = d_B + batch_id * batch_stride_B; + CDataType* d_CTemp = d_C + batch_id * batch_stride_C; + naive_gemm_kernel + <<>>( + d_ATemp, d_BTemp, d_CTemp, M, N, 
K, stride_a, stride_b, stride_c); + } + + errC = hipMemcpy(c_device.GetDeviceBuffer(), + d_C, + batch_count * M * N * sizeof(CDataType), + hipMemcpyDeviceToHost); + if(errC != hipSuccess) + { + std::cerr << "Error copying C to device: " << hipGetErrorString(errC) << std::endl; + } + + errA = hipFree(d_A); + if(errA != hipSuccess) + { + std::cerr << "Error free the A memory: " << hipGetErrorString(errA) << std::endl; + } + + errB = hipFree(d_B); + if(errB != hipSuccess) + { + std::cerr << "Error free the B memory: " << hipGetErrorString(errB) << std::endl; + } + + errC = hipFree(d_C); + if(errC != hipSuccess) + { + std::cerr << "Error free the C memory: " << hipGetErrorString(errC) << std::endl; + } + + return; +} } // namespace ck_tile diff --git a/include/ck_tile/ops/gemm.hpp b/include/ck_tile/ops/gemm.hpp index 1340fb204..b9eb24858 100644 --- a/include/ck_tile/ops/gemm.hpp +++ b/include/ck_tile/ops/gemm.hpp @@ -25,6 +25,7 @@ #include "ck_tile/ops/gemm/block/block_universal_gemm_as_bs_cr.hpp" #include "ck_tile/ops/gemm/kernel/gemm_kernel.hpp" #include "ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp" +#include "ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp" #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_base.hpp" #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v3.hpp" #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp" diff --git a/include/ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp b/include/ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp new file mode 100644 index 000000000..07b4af573 --- /dev/null +++ b/include/ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp @@ -0,0 +1,258 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include +#include + +#include "ck_tile/core.hpp" +#include "ck_tile/ops/common.hpp" + +namespace ck_tile { + +struct BatchedGemmHostArgs +{ + const void* a_ptr; + const void* b_ptr; + void* c_ptr; + index_t M; + index_t N; + index_t K; + index_t stride_A; + index_t stride_B; + index_t stride_C; + index_t batch_stride_A; + index_t batch_stride_B; + index_t batch_stride_C; + index_t batch_count; +}; + +template +struct BatchedGemmKernel +{ + using TilePartitioner = remove_cvref_t; + using GemmPipeline = remove_cvref_t; + using EpiloguePipeline = remove_cvref_t; + using ALayout = remove_cvref_t; + using BLayout = remove_cvref_t; + using CLayout = remove_cvref_t; + static constexpr index_t KernelBlockSize = GemmPipeline::BlockSize; + + using ADataType = remove_cvref_t; + using BDataType = remove_cvref_t; + using CDataType = remove_cvref_t; + + struct BatchedGemmKargs + { + const void* a_ptr; + const void* b_ptr; + void* c_ptr; + index_t M; + index_t N; + index_t K; + index_t stride_A; + index_t stride_B; + index_t stride_C; + index_t batch_stride_A; + index_t batch_stride_B; + index_t batch_stride_C; + index_t batch_count; + }; + + using Kargs = BatchedGemmKargs; + using Hargs = BatchedGemmHostArgs; + + __host__ static constexpr auto GridSize(const Hargs& h) + { + return TilePartitioner::GridSize(h.M, h.N, h.batch_count); + } + + __host__ static constexpr auto BlockSize() { return dim3(KernelBlockSize); } + + CK_TILE_HOST static constexpr BatchedGemmKargs MakeKargs(const Hargs& h) + { + Kargs k; + k.a_ptr = h.a_ptr; + k.b_ptr = h.b_ptr; + k.c_ptr = h.c_ptr; + k.M = h.M; + k.N = h.N; + k.K = h.K; + k.stride_A = h.stride_A; + k.stride_B = h.stride_B; + k.stride_C = h.stride_C; + k.batch_stride_A = h.batch_stride_A; + k.batch_stride_B = h.batch_stride_B; + k.batch_stride_C = h.batch_stride_C; + k.batch_count = h.batch_count; + return k; + } + + CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize() + { + return max(GemmPipeline::GetSmemSize(), 
EpiloguePipeline::GetSmemSize()); + } + + CK_TILE_DEVICE void operator()(Kargs kargs) const + { + const auto [i_m, i_n] = TilePartitioner{}(); + const auto i_batch = __builtin_amdgcn_readfirstlane(blockIdx.z); + + // options + const auto batch_stride_A = __builtin_amdgcn_readfirstlane(kargs.batch_stride_A); + const auto batch_offset_A = __builtin_amdgcn_readfirstlane(i_batch * batch_stride_A); + const ADataType* a_start = static_cast(kargs.a_ptr); + + const auto batch_stride_B = __builtin_amdgcn_readfirstlane(kargs.batch_stride_B); + const auto batch_offset_B = __builtin_amdgcn_readfirstlane(i_batch * batch_stride_B); + const BDataType* b_start = static_cast(kargs.b_ptr); + + // Convert pointers to tensor views + auto a_tensor_view = [&]() { + if constexpr(std::is_same_v) + { + return make_naive_tensor_view( + a_start + batch_offset_A, + make_tuple(kargs.M, kargs.K), + make_tuple(kargs.stride_A, 1), + number{}, + number<1>{}); + } + else + { + return make_naive_tensor_view( + a_start + batch_offset_A, + make_tuple(kargs.M, kargs.K), + make_tuple(1, kargs.stride_A), + number<1>{}, + number<1>{}); + } + }(); + + auto b_tensor_view = [&]() { + if constexpr(std::is_same_v) + { + return make_naive_tensor_view( + b_start + batch_offset_B, + make_tuple(kargs.N, kargs.K), + make_tuple(1, kargs.stride_B), + number<1>{}, + number<1>{}); + } + else + { + return make_naive_tensor_view( + b_start + batch_offset_B, + make_tuple(kargs.N, kargs.K), + make_tuple(kargs.stride_B, 1), + number{}, + number<1>{}); + } + }(); + + auto a_pad_view = [&]() { + if constexpr(std::is_same_v) + { + return pad_tensor_view( + a_tensor_view, + make_tuple(number{}, number{}), + sequence{}); + } + else + { + return pad_tensor_view( + a_tensor_view, + make_tuple(number{}, number{}), + sequence{}); + } + }(); + // clang-format on + + auto a_block_window = make_tile_window( + a_pad_view, + make_tuple(number{}, number{}), + {i_m, 0}); + + auto b_pad_view = [&]() { + if constexpr(std::is_same_v) + { + 
return pad_tensor_view( + b_tensor_view, + make_tuple(number{}, number{}), + sequence{}); + } + else + { + return pad_tensor_view( + b_tensor_view, + make_tuple(number{}, number{}), + sequence{}); + } + }(); + // clang-format on + + auto b_block_window = make_tile_window( + b_pad_view, + make_tuple(number{}, number{}), + {i_n, 0}); + + // allocate LDS + __shared__ char smem_ptr[GetSmemSize()]; + + const index_t num_loop = TilePartitioner::GetLoopNum(kargs.K); + + // Run GEMM cooperatively by whole wokrgroup. + auto c_block_tile = + GemmPipeline{}.template operator()(a_block_window, b_block_window, num_loop, smem_ptr); + + const auto batch_stride_C = __builtin_amdgcn_readfirstlane(kargs.batch_stride_C); + const auto batch_offset_C = __builtin_amdgcn_readfirstlane(i_batch * batch_stride_C); + CDataType* c_start = static_cast(kargs.c_ptr); + auto c_tensor_view = [&]() { + if constexpr(std::is_same_v) + { + return make_naive_tensor_view( + c_start + batch_offset_C, + make_tuple(kargs.M, kargs.N), + make_tuple(kargs.stride_C, 1), + number{}, + number<1>{}); + } + else + { + return make_naive_tensor_view( + c_start + batch_offset_C, + make_tuple(kargs.M, kargs.N), + make_tuple(1, kargs.stride_C), + number<1>{}, + number<1>{}); + } + }(); + + auto c_pad_view = [&]() { + if constexpr(std::is_same_v) + { + return pad_tensor_view( + c_tensor_view, + make_tuple(number{}, number{}), + sequence{}); + } + else + { + return pad_tensor_view( + c_tensor_view, + make_tuple(number{}, number{}), + sequence{}); + } + }(); + auto c_block_window = make_tile_window( + c_pad_view, + make_tuple(number{}, number{}), + {i_m, i_n}); + + EpiloguePipeline{}(c_block_window, c_block_tile); + } +}; + +} // namespace ck_tile diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1.hpp index c0817e736..822748c69 100644 --- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1.hpp +++ 
b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1.hpp @@ -124,7 +124,7 @@ struct GemmPipelineAGmemBGmemCRegV1 b_lds_block, make_tuple(number{}, number{}), {0, 0}); // Block GEMM - constexpr auto block_gemm = Policy::template GetBlockGemm(); + auto block_gemm = Policy::template GetBlockGemm(); // Acc register tile auto c_block_tile = decltype(block_gemm(a_lds_gemm_window, b_lds_gemm_window)){}; diff --git a/test/ck_tile/CMakeLists.txt b/test/ck_tile/CMakeLists.txt index ac9c4311d..fd0de0f9c 100644 --- a/test/ck_tile/CMakeLists.txt +++ b/test/ck_tile/CMakeLists.txt @@ -1,2 +1,3 @@ add_subdirectory(image_to_column) add_subdirectory(gemm) +add_subdirectory(batched_gemm) diff --git a/test/ck_tile/batched_gemm/CMakeLists.txt b/test/ck_tile/batched_gemm/CMakeLists.txt new file mode 100644 index 000000000..532ead112 --- /dev/null +++ b/test/ck_tile/batched_gemm/CMakeLists.txt @@ -0,0 +1,4 @@ +# Currently ck_tile is only built on gfx9 +if(GPU_TARGETS MATCHES "gfx9") + add_gtest_executable(test_ck_tile_batched_gemm test_batched_gemm.cpp) +endif() diff --git a/test/ck_tile/batched_gemm/test_batched_gemm.cpp b/test/ck_tile/batched_gemm/test_batched_gemm.cpp new file mode 100644 index 000000000..29bed8d2f --- /dev/null +++ b/test/ck_tile/batched_gemm/test_batched_gemm.cpp @@ -0,0 +1,29 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include + +#include "gtest/gtest.h" + +#include "ck_tile/host.hpp" +#include "test_batched_gemm_util.hpp" + +using F16 = ck_tile::half_t; +using F32 = float; + +using Row = ck_tile::tensor_layout::gemm::RowMajor; +using Col = ck_tile::tensor_layout::gemm::ColumnMajor; + +// clang-format off +using KernelTypes = ::testing::Types< + // ALayout, BLayout, CLayout, ADataType, BDataType, AccDataType, CDataType + std::tuple< Row, Row, Row, F16, F16, F32, F16>, + //std::tuple< Col, Row, Row, F16, F16, F32, F16>, + std::tuple< Row, Col, Row, F16, F16, F32, F16>//, + //std::tuple< Col, Col, Row, F16, F16, F32, F16> + >; +// clang-format on + +TYPED_TEST_SUITE(TestCkTileBatchedGemm, KernelTypes); + +#include "test_batched_gemm_ut_cases.inc" diff --git a/test/ck_tile/batched_gemm/test_batched_gemm_ut_cases.inc b/test/ck_tile/batched_gemm/test_batched_gemm_ut_cases.inc new file mode 100644 index 000000000..f261164d6 --- /dev/null +++ b/test/ck_tile/batched_gemm/test_batched_gemm_ut_cases.inc @@ -0,0 +1,9 @@ +#pragma once + +TYPED_TEST(TestCkTileBatchedGemm, Basic) +{ + constexpr int M = 256; + constexpr int N = 128; + constexpr int K = 128; + this->Run(M, N, K); +} diff --git a/test/ck_tile/batched_gemm/test_batched_gemm_util.hpp b/test/ck_tile/batched_gemm/test_batched_gemm_util.hpp new file mode 100644 index 000000000..88145b987 --- /dev/null +++ b/test/ck_tile/batched_gemm/test_batched_gemm_util.hpp @@ -0,0 +1,225 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+#pragma once + +#include +#include + +#include "ck_tile/core.hpp" +#include "ck_tile/host.hpp" +#include "ck_tile/host/kernel_launch.hpp" +#include "ck_tile/ops/epilogue.hpp" +#include "ck_tile/ops/gemm.hpp" +#include "ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp" + +template +class TestCkTileBatchedGemm : public ::testing::Test +{ + protected: + using ALayout = std::tuple_element_t<0, Tuple>; + using BLayout = std::tuple_element_t<1, Tuple>; + using CLayout = std::tuple_element_t<2, Tuple>; + using ADataType = std::tuple_element_t<3, Tuple>; + using BDataType = std::tuple_element_t<4, Tuple>; + using AccDataType = std::tuple_element_t<5, Tuple>; + using CDataType = std::tuple_element_t<6, Tuple>; + + struct batched_gemm_kargs : public ck_tile::BatchedGemmHostArgs + { + }; + + template + void invoke_batched_gemm(const batched_gemm_kargs& args, const ck_tile::stream_config& s) + { + // The kPadM, kPadN, kPadK & kBlockPerCu should also come from the Codegen part. + constexpr bool kPadM = false; + constexpr bool kPadN = false; + constexpr bool kPadK = false; + constexpr bool kTilePermute = false; + // The rank and permutation will also be generate out by the CodeGen part. + constexpr ck_tile::index_t kOutputRank = 2; + + constexpr int kBlockPerCu = 1; + + // This part comes from the Codegen + constexpr ck_tile::index_t M_Tile = 128; + constexpr ck_tile::index_t N_Tile = 128; + constexpr ck_tile::index_t K_Tile = 32; + + constexpr ck_tile::index_t M_Warp = 2; + constexpr ck_tile::index_t N_Warp = 2; + constexpr ck_tile::index_t K_Warp = 1; + + constexpr ck_tile::index_t M_Warp_Tile = 32; + constexpr ck_tile::index_t N_Warp_Tile = 32; + constexpr ck_tile::index_t K_Warp_Tile = 8; + + // Whether doing the CShuffle (transpose before the global memory), depending on the output + // layout. 
+ constexpr bool CShuffleEpilogue = + std::is_same_v; + + using CodegenGemmShape = + ck_tile::TileGemmShape, + ck_tile::sequence, + ck_tile::sequence>; + + using TilePartitioner = ck_tile::GemmTilePartitioner; + + using GemmEpilogue = std::conditional_t< + CShuffleEpilogue, + ck_tile::CShuffleEpilogue>, + ck_tile::Default2DEpilogue< + ck_tile::Default2DEpilogueProblem>>; + + using CodegenGemmTraits = + ck_tile::TileGemmTraits; + + using CodegenPipelineProblem = ck_tile::GemmPipelineProblem; + + using CodegenGemmPipeline = ck_tile::GemmPipelineAGmemBGmemCRegV1; + using Kernel = + ck_tile::BatchedGemmKernel; + + auto kargs = Kernel::MakeKargs(args); + + const dim3 grids = Kernel::GridSize(args); + constexpr dim3 blocks = Kernel::BlockSize(); + + if(s.log_level_ > 0) + { + std::cout << "Launching kernel with args:" + << " grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}" + << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z << "}" + << std::endl; + } + + ck_tile::launch_kernel( + s, ck_tile::make_kernel(Kernel{}, grids, blocks, 0, kargs)); + } + + public: + void Run(const int M, + const int N, + const int K, + int StrideA = 128, + int StrideB = 128, + int StrideC = 128, + const int BatchStrideA = 32768, + const int BatchStrideB = 16384, + const int BatchStrideC = 32768, + const int BatchCount = 16) + { + using namespace ck_tile::literals; + + auto f_host_tensor_descriptor = [](std::size_t batch_count_, + std::size_t row, + std::size_t col, + std::size_t stride, + std::size_t batch_stride, + auto layout) { + if constexpr(std::is_same_v) + { + return ck_tile::HostTensorDescriptor({batch_count_, row, col}, + {batch_stride, stride, 1_uz}); + } + else + { + return ck_tile::HostTensorDescriptor({batch_count_, row, col}, + {batch_stride, 1_uz, stride}); + } + }; + + auto f_get_default_stride = + [](std::size_t row, std::size_t col, std::size_t stride, auto layout) { + if(stride == 0) + { + // give a chance if stride is zero, return a default 
packed stride + if constexpr(std::is_same_v) + { + return col; + } + else + { + return row; + } + } + else + return stride; + }; + + StrideA = f_get_default_stride(M, K, StrideA, ALayout{}); + StrideB = f_get_default_stride(K, N, StrideB, BLayout{}); + StrideC = f_get_default_stride(M, N, StrideC, CLayout{}); + + ck_tile::HostTensor a_m_k( + f_host_tensor_descriptor(BatchCount, M, K, StrideA, BatchStrideA, ALayout{})); + ck_tile::HostTensor b_k_n( + f_host_tensor_descriptor(BatchCount, K, N, StrideB, BatchStrideB, BLayout{})); + ck_tile::HostTensor c_m_n_dev_result( + f_host_tensor_descriptor(BatchCount, M, N, StrideC, BatchStrideC, CLayout{})); + + ck_tile::FillUniformDistribution{-5.f, 5.f}(a_m_k); + ck_tile::FillUniformDistribution{-5.f, 5.f}(b_k_n); + + ck_tile::DeviceMem a_m_k_dev_buf(a_m_k.get_element_space_size_in_bytes()); + ck_tile::DeviceMem b_k_n_dev_buf(b_k_n.get_element_space_size_in_bytes()); + ck_tile::DeviceMem c_m_n_dev_buf(c_m_n_dev_result.get_element_space_size_in_bytes()); + + a_m_k_dev_buf.ToDevice(a_m_k.data()); + b_k_n_dev_buf.ToDevice(b_k_n.data()); + c_m_n_dev_buf.SetZero(); + c_m_n_dev_result.SetZero(); + + batched_gemm_kargs kargs{a_m_k_dev_buf.GetDeviceBuffer(), + b_k_n_dev_buf.GetDeviceBuffer(), + c_m_n_dev_buf.GetDeviceBuffer(), + M, + N, + K, + StrideA, + StrideB, + StrideC, + BatchStrideA, + BatchStrideB, + BatchStrideC, + BatchCount}; + + invoke_batched_gemm(kargs, + ck_tile::stream_config{nullptr, false}); + + std::cout << "Run kernel with M =" << M << " N =" << N << " K =" << K + << " StrideA =" << StrideA << " StrideB =" << StrideB << " StrideC =" << StrideC + << " BatchStrideA =" << BatchStrideA << " BatchStrideB =" << BatchStrideB + << " BatchStrideC =" << BatchStrideC << " BatchCount =" << BatchCount + << std::endl; + + c_m_n_dev_buf.FromDevice(c_m_n_dev_result.data()); + bool pass = true; + + ck_tile::HostTensor c_m_n_host_ref( + f_host_tensor_descriptor(BatchCount, M, N, StrideC, BatchStrideC, CLayout{})); + 
c_m_n_host_ref.SetZero(); + + const auto b_n_k = b_k_n.transpose({0, 2, 1}); + ck_tile::reference_batched_gemm( + a_m_k, b_n_k, c_m_n_host_ref); + + pass = ck_tile::check_err(c_m_n_dev_result, c_m_n_host_ref); + EXPECT_TRUE(pass); + } +}; -- GitLab From 28e02cf5243107a8b2ea65e0a8ef0e1c4bba3964 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 29 Nov 2024 07:18:43 -0800 Subject: [PATCH 092/153] Bump rocm-docs-core from 1.9.1 to 1.9.2 in /docs/sphinx (#1702) Bumps [rocm-docs-core](https://github.com/ROCm/rocm-docs-core) from 1.9.1 to 1.9.2. - [Release notes](https://github.com/ROCm/rocm-docs-core/releases) - [Changelog](https://github.com/ROCm/rocm-docs-core/blob/develop/CHANGELOG.md) - [Commits](https://github.com/ROCm/rocm-docs-core/compare/v1.9.1...v1.9.2) --- updated-dependencies: - dependency-name: rocm-docs-core dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- docs/sphinx/requirements.in | 2 +- docs/sphinx/requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/sphinx/requirements.in b/docs/sphinx/requirements.in index 79c74cd7f..995dfaf02 100644 --- a/docs/sphinx/requirements.in +++ b/docs/sphinx/requirements.in @@ -1,2 +1,2 @@ -rocm-docs-core==1.9.1 +rocm-docs-core==1.9.2 sphinxcontrib-bibtex==2.6.3 diff --git a/docs/sphinx/requirements.txt b/docs/sphinx/requirements.txt index 426073037..d8f7c3846 100644 --- a/docs/sphinx/requirements.txt +++ b/docs/sphinx/requirements.txt @@ -103,7 +103,7 @@ requests==2.32.3 # via # pygithub # sphinx -rocm-docs-core==1.9.1 +rocm-docs-core==1.9.2 # via -r requirements.in six==1.16.0 # via pybtex -- GitLab From cff7fab798a867c9507fafe7beccd76afd0d16d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bart=C5=82omiej=20Kocot?= Date: Sat, 30 Nov 2024 05:51:09 +0100 Subject: [PATCH 093/153] [CK 
TILE] Fix universal gemm template keywords (#1704) --- .../ops/gemm/block/block_universal_gemm_as_bs_cr.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/ck_tile/ops/gemm/block/block_universal_gemm_as_bs_cr.hpp b/include/ck_tile/ops/gemm/block/block_universal_gemm_as_bs_cr.hpp index c9e648f43..0fe0a9f40 100644 --- a/include/ck_tile/ops/gemm/block/block_universal_gemm_as_bs_cr.hpp +++ b/include/ck_tile/ops/gemm/block/block_universal_gemm_as_bs_cr.hpp @@ -623,7 +623,7 @@ struct BlockUniversalGemmAsBsCr CK_TILE_DEVICE void LocalPrefetch(const ASmemBlockWindow& a_block_window, const BSmemBlockWindow& b_block_window) { - block_gemm_impl_.template LocalPrefetch(a_block_window, b_block_window); + block_gemm_impl_.LocalPrefetch(a_block_window, b_block_window); } // C += A * B @@ -632,7 +632,7 @@ struct BlockUniversalGemmAsBsCr const ASmemBlockWindow& a_block_window, const BSmemBlockWindow& b_block_window) { - block_gemm_impl_.template operator()(c_block_tensor, a_block_window, b_block_window); + block_gemm_impl_(c_block_tensor, a_block_window, b_block_window); } // C = A * B @@ -641,7 +641,7 @@ struct BlockUniversalGemmAsBsCr const BSmemBlockWindow& b_block_window) { auto c_block_tensor = MakeCBlockTile(); - block_gemm_impl_.template operator()(c_block_tensor, a_block_window, b_block_window); + block_gemm_impl_(c_block_tensor, a_block_window, b_block_window); return c_block_tensor; } -- GitLab From 44828b7c0f0d2d4cba5b40c8f2706f542a436aa9 Mon Sep 17 00:00:00 2001 From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com> Date: Sat, 30 Nov 2024 08:11:42 -0800 Subject: [PATCH 094/153] [Python] Add batched gemm instances parsing (#1684) * add op * do not insert ds parameters as they are already parsed * reset ds parameters * apply ruff --- .../batched_universal_gemm/gen_instances.py | 149 ++++++++++++++++++ .../ck4inductor/batched_universal_gemm/op.py | 99 ++++++++++++ .../grouped_conv_fwd/gen_instances.py | 4 +- 3 files changed, 249 
insertions(+), 3 deletions(-) create mode 100644 python/ck4inductor/batched_universal_gemm/gen_instances.py create mode 100644 python/ck4inductor/batched_universal_gemm/op.py diff --git a/python/ck4inductor/batched_universal_gemm/gen_instances.py b/python/ck4inductor/batched_universal_gemm/gen_instances.py new file mode 100644 index 000000000..8879fb93d --- /dev/null +++ b/python/ck4inductor/batched_universal_gemm/gen_instances.py @@ -0,0 +1,149 @@ +# SPDX-License-Identifier: MIT +# Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +import logging +import os +import subprocess +from dataclasses import replace +from functools import lru_cache +from typing import List + +from ..util import library_path + +from .op import CKBatchedGemmOperation + +log = logging.getLogger(__name__) + + +def _ck_library_dir(): + gemm_instances_path = os.path.join( + library_path(), + "src", + "tensor_operation_instance", + "gpu", + "gemm_universal_batched", + ) + if not os.path.exists(gemm_instances_path): + log.error("CK library path %s does not exist", gemm_instances_path) + return None + return gemm_instances_path + + +def parse_instances(str_instances: List[str]) -> List[CKBatchedGemmOperation]: + """ + Parse the lines containing Universal Gemm template instances into `CKBatchedGemmOperation` instances + """ + + def maybe_int(s): + try: + return int(s) + except ValueError: + return s + + op_instances = [] + for line in str_instances: + s_template_args = line.split("DeviceBatchedGemmMultiD_Xdl_CShuffle_V3")[ + -1 + ].strip("<>, ") + template_args = [] + i_current = 0 + while i_current < len(s_template_args): + if s_template_args[i_current] == " ": + # skip whitespace + i_current += 1 + continue + elif s_template_args[i_current : i_current + 2] == "S<": + # parse template S + i_next = s_template_args.find(">", i_current) + template_args.append( + tuple(map(int, s_template_args[i_current + 2 : i_next].split(","))) + ) + i_current = i_next + 2 + else: + # all 
string attributes must be either type aliases or global constants in C++ + i_next = s_template_args.find(",", i_current) + template_args.append( + maybe_int( + s_template_args[i_current : i_next if i_next != -1 else None] + ) + ) + if i_next != -1: + i_current = i_next + 1 + if i_next == -1: + break + + # ds layout and dtype are parsed as placeholder; reset value + template_args[2] = tuple() # ds layout + template_args[6] = tuple() # ds dtype + + new_instance = CKBatchedGemmOperation( + *template_args, # type: ignore[arg-type] + ) + + op_instances.append(new_instance) + return op_instances + + +@lru_cache(None) +def gen_ops_library() -> List[CKBatchedGemmOperation]: + """ + Parse the Universal Gemm instances defined in the composable kernel library folder. + """ + ck_library_dir = _ck_library_dir() + if not ck_library_dir: + return [] + + grep_result = subprocess.run( + [ + "grep", + "-inR", + "DeviceBatchedGemmMultiD_Xdl_CShuffle_V3", + _ck_library_dir(), + ], + capture_output=True, + text=True, + ) + + op_instances = parse_instances(grep_result.stdout.strip().split("\n")) + + log.debug("ck instances from library: %d", len(op_instances)) + + schedulers = [ + "BlockGemmPipelineScheduler::Intrawave", + "BlockGemmPipelineScheduler::Interwave", + ] + gemm_specs = [ + "GemmSpecialization::Default", + "GemmSpecialization::MPadding", + "GemmSpecialization::NPadding", + "GemmSpecialization::KPadding", + "GemmSpecialization::MNPadding", + "GemmSpecialization::MKPadding", + "GemmSpecialization::NKPadding", + "GemmSpecialization::MNKPadding", + ] + + # substitute templated args by looping through their domains + substitute_instances = [] + for instance in op_instances: + sub_scheduler = instance.block_gemm_pipeline_scheduler == "BlkGemmPipeSched" + sub_spec = instance.gemm_specialization == "GemmSpec" + schedulers_range = ( + schedulers if sub_scheduler else [instance.block_gemm_pipeline_scheduler] + ) + spec_range = gemm_specs if sub_spec else [instance.gemm_specialization] 
+ for scheduler in schedulers_range: + for spec in spec_range: + substitute_instances.append( + replace( + instance, + block_gemm_pipeline_scheduler=scheduler, + gemm_specialization=spec, + ) + ) + + return substitute_instances + + +if __name__ == "__main__": + print(gen_ops_library()) diff --git a/python/ck4inductor/batched_universal_gemm/op.py b/python/ck4inductor/batched_universal_gemm/op.py new file mode 100644 index 000000000..96978ac8d --- /dev/null +++ b/python/ck4inductor/batched_universal_gemm/op.py @@ -0,0 +1,99 @@ +# SPDX-License-Identifier: MIT +# Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +from dataclasses import asdict, dataclass +from typing import Optional, Tuple + + +@dataclass +class CKBatchedGemmOperation: + """ + A python dataclass storing the template parameters of a CK Universal Gemm template instance + """ + + a_layout: str + b_layout: str + ds_layouts: Tuple[str] # addmm specific + c_layout: str + + a_element_dtype: str + b_element_dtype: str + ds_element_dtypes: Tuple[str] # addmm specific + c_element_dtype: str + + acc_dtype: str + c_shuffle_dtype: str + + a_elementwise_op: str + b_elementwise_op: str + c_elementwise_op: str + + gemm_specialization: str + + block_size: int + + m_per_block: int + n_per_block: int + k_per_block: int + + a_k1: int + b_k1: int + + m_per_xdl: int + n_per_xdl: int + + m_xdl_per_wave: int + n_xdl_per_wave: int + + a_block_transfer_thread_cluster_lengths_ak0_m_ak1: Tuple[int, int, int] + a_block_transfer_thread_cluster_arrange_order: Tuple[int, int, int] + a_block_transfer_src_access_order: Tuple[int, int, int] + a_block_transfer_src_vector_dim: int + a_block_transfer_src_scalar_per_vector: int + a_block_transfer_dst_scalar_per_vector_ak1: int + a_block_lds_extra_m: bool + + b_block_transfer_thread_cluster_lengths_bk0_n_bk1: Tuple[int, int, int] + b_block_transfer_thread_cluster_arrange_order: Tuple[int, int, int] + b_block_transfer_src_access_order: Tuple[int, int, int] + + 
b_block_transfer_src_vector_dim: int + b_block_transfer_src_scalar_per_vector: int + b_block_transfer_dst_scalar_per_vector_bk1: int + b_block_lds_extra_n: bool + + c_shuffle_m_xdl_per_wave_per_shuffle: int + c_shuffle_n_xdl_per_wave_per_shuffle: int + + c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block: ( + Tuple[int, int, int, int] + ) + c_shuffle_block_transfer_scalar_per_vector_n_per_block: Tuple[int] + block_gemm_pipeline_scheduler: str + block_gemm_pipeline_version: str + + a_compute_dtype: Optional[str] = None + b_compute_dtype: Optional[str] = None + + def name(self): + # cpp alias for template instance + return f"ck_device_batched_gemm_multi_d_xdl_c_shuffle_v3_{self.key_name()}" + + def key_name(self): + # TBD; must be unique per instance. Intended to use as dict key + return "_".join( + [ + "K" + + field_name.replace("_", "").lower() + + "V" + + ( + "x".join(map(str, iter(field_value))) + if isinstance(field_value, tuple) + else str(field_value).replace(":", "") + ) + for field_name, field_value in self.dict_items() + ] + ) + + def dict_items(self): + return asdict(self).items() diff --git a/python/ck4inductor/grouped_conv_fwd/gen_instances.py b/python/ck4inductor/grouped_conv_fwd/gen_instances.py index ffbea6bdc..feca20a3b 100644 --- a/python/ck4inductor/grouped_conv_fwd/gen_instances.py +++ b/python/ck4inductor/grouped_conv_fwd/gen_instances.py @@ -130,9 +130,7 @@ def gen_conv_ops_library() -> List[CKGroupedConvFwdOp]: # substitute templated args by looping through their domains substitute_instances = [] for instance in op_instances: - sub_scheduler = ( - instance.block_gemm_pipeline_scheduler == "BlkGemmPipeSched" - ) + sub_scheduler = instance.block_gemm_pipeline_scheduler == "BlkGemmPipeSched" sub_spec = instance.conv_forward_specialization == "ConvSpec" schedulers_range = ( schedulers if sub_scheduler else [instance.block_gemm_pipeline_scheduler] -- GitLab From 9488f1c981cda8515b45952a14e539621150c1f6 Mon Sep 17 
00:00:00 2001 From: rtmadduri Date: Mon, 2 Dec 2024 00:13:56 -0800 Subject: [PATCH 095/153] LWPCK-2429: Device grouped GEMM uses Async Memcpy (#1695) * LWPCK-2429: Device grouped GEMM uses Async Memcpy Resolving merge conflicts * reverting changes to profile_grouped_gemm * revert date change --------- Co-authored-by: Illia Silin <98187287+illsilin@users.noreply.github.com> --- .../impl/device_grouped_gemm_multiple_d_dl.hpp | 12 ++++++------ ...gemm_multiple_d_splitk_xdl_cshuffle_two_stage.hpp | 10 +++++----- ...rouped_gemm_multiple_d_xdl_cshuffle_tile_loop.hpp | 8 ++++---- .../gpu/device/impl/device_grouped_gemm_xdl.hpp | 12 ++++++------ .../impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp | 10 +++++----- 5 files changed, 26 insertions(+), 26 deletions(-) diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_dl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_dl.hpp index 060a16d1e..959fc890b 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_dl.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_dl.hpp @@ -1,6 +1,6 @@ #pragma once // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -603,11 +603,11 @@ struct DeviceGroupedGemmMultipleD_Dl : public DeviceGroupedGemm Date: Mon, 2 Dec 2024 07:18:35 -0800 Subject: [PATCH 096/153] Bump rocm-docs-core from 1.9.2 to 1.10.0 in /docs/sphinx (#1706) Bumps [rocm-docs-core](https://github.com/ROCm/rocm-docs-core) from 1.9.2 to 1.10.0. 
- [Release notes](https://github.com/ROCm/rocm-docs-core/releases) - [Changelog](https://github.com/ROCm/rocm-docs-core/blob/develop/CHANGELOG.md) - [Commits](https://github.com/ROCm/rocm-docs-core/compare/v1.9.2...v1.10.0) --- updated-dependencies: - dependency-name: rocm-docs-core dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- docs/sphinx/requirements.in | 2 +- docs/sphinx/requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/sphinx/requirements.in b/docs/sphinx/requirements.in index 995dfaf02..9969824d2 100644 --- a/docs/sphinx/requirements.in +++ b/docs/sphinx/requirements.in @@ -1,2 +1,2 @@ -rocm-docs-core==1.9.2 +rocm-docs-core==1.10.0 sphinxcontrib-bibtex==2.6.3 diff --git a/docs/sphinx/requirements.txt b/docs/sphinx/requirements.txt index d8f7c3846..bb731db2d 100644 --- a/docs/sphinx/requirements.txt +++ b/docs/sphinx/requirements.txt @@ -103,7 +103,7 @@ requests==2.32.3 # via # pygithub # sphinx -rocm-docs-core==1.9.2 +rocm-docs-core==1.10.0 # via -r requirements.in six==1.16.0 # via pybtex -- GitLab From 08d5c02c37253bf2a6852ad25f2db209f81c0fe7 Mon Sep 17 00:00:00 2001 From: Illia Silin <98187287+illsilin@users.noreply.github.com> Date: Tue, 3 Dec 2024 08:42:55 -0800 Subject: [PATCH 097/153] OCP FP8 support for gfx12. (#1710) * (2/5) bilinear gemm pass, perf bug: skip a lds has lower performance than skip b lds * (3/5) batched gemm pass, perf bug: skip a lds has lower performance than skip b lds * (4/5) grouped conv pass * (5/5) attention pass, todo: debug lds perf bug * AIT Attention API refactor (#8) * sanity pass * sanity pass 2 * confirm significant performance regression. 
* turn on all instances * turn off instance format * Fix bug & tunning & format * DML meta, self_attn+cross_attn * sanity pass * remove useless flag * update tile and problem size used in AIT attention * bug fix in grouped conv supporting check * deprecate inline asm wmma * Bug fix: double lds skip * clang-format * Fix errors in 1. example, fmha 2. gridwise pipeline 3. deviceop, fmha, change some containers from vector to array * part2 of previous commit * clang format * API fix of gridwisegemmpipeline * separate array base and vector base attention tensor transformation * fix gemm * clang format * add gemm fp16 instances * Temp save * fpAintB kernel compile pass * Sanity pass. * Temp save * debug code enabled * Fp16AInt8B_GEMM sanity * MQA implementation * GQA-4 example * tempsave * Compile pass * New implementation of fp16Aint8B Gemm, Acheieve similar math throughput with native fp16 Gemm * Bump rocm-docs-core from 0.24.0 to 0.29.0 in /docs/sphinx Bumps [rocm-docs-core](https://github.com/RadeonOpenCompute/rocm-docs-core) from 0.24.0 to 0.29.0. - [Release notes](https://github.com/RadeonOpenCompute/rocm-docs-core/releases) - [Changelog](https://github.com/RadeonOpenCompute/rocm-docs-core/blob/develop/CHANGELOG.md) - [Commits](https://github.com/RadeonOpenCompute/rocm-docs-core/compare/v0.24.0...v0.29.0) --- updated-dependencies: - dependency-name: rocm-docs-core dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] * initial enablement of gfx950 * fix clang format * disable examples 31 and 41 int8 on gfx950 * initial navi4x enablement * remove extra endif * enabled dl_gemm * update s_barrier and s_waitcnt for gfx12 * fix the gfx12 assembly syntax * fixed block_sync_lds * add support for more dl kernels on navi4 * add wmma * format * Todo: fix gemm_bilinear_wmma instances compilation bug * Solve a bug when K1=16 * remove unnecessary changes * Remove tensor layout limitation to LDS usage in tesnor contraction * fixed block_sync_lds * merge navi3_ref * update self-attention and cross-attention * fix a typo of name * fixed layout * debugging * Add arch limiter for fp8 gemm * fixed wmma * enable fp8 gemm_xdl for all gfx9 targets * temporarily disable gemm_xdl_fp16_fp8 on MI100/200 * fix the cmake logic for gemm_xdl_fp16_fp8 * fixed c_output * re-enable the gemm_xdl_fp16_fp8 on MI100/200 * fixed gfx12 * fixed * fixed * seperate gfx12 blockwise_gemm * fixed * enable fwd conv on navi4x * enable gridwise * enabled gemm * fixed merge * remove empty example fold * fixed conflicts * some small changes * Update cmake-ck-dev.sh * Update cmake-ck-dev.sh * enabled other types * fixed register loads * test fa * enable gfx12 * clean up * enable some instances on gfx12 * add gfx1201 macro in amd_wmma header * fix clang format * enable batched_gemm_softmax_gemm_perm_wmma for gfx12 * disable instances with blocksize=256 in attention examples * debuggging * debug * fixed lds_enabled * debugging * Fix and add limit to skiplds feature * Enable skipLds feature and fix compilation bugs * add ck_tile definitions for gfx12 * fix clang format and test/wmma_op * updage instances cmake for gfx12 * disable the test_wmma_op on gfx12 * fix the builds for gfx950 * add gfx12 and gfx950 to default target list * clean-up cmake file * Initial introduction of OFP8 data types. * Renamed FP8 and BF8 tests into FP8_FNUZ and BF8_FNUZ. 
* Implementation of ConvertFP32Nearest in test_fp8_ocp. * Remove dependence on possibly undeclared alias. * Implement FP8OCP test for stochastic rounding mode. * Implement FP8OCP tests for half_t type conversions. * enable bf16 atomic add on gfx950 * Implement ConvertFP32Nearest test. * Implement ConvertFP32Stochastic test. * Implement ConvertFP16Nearest and ConvertFP16Stochastic tests. * Refactoring. Move FP8 definitions into a separate header file. * Enable easy switching between architectures. * Fix compilation error for gfx942 architecture. * only builf gfx950 branch for gfx950 target by default * Enable OCP build of example_gemm_xdl_fp8. * Fix formatting. * fix the build logic for gfx950 * Improve GEMM example verbosity. * Add constexpr where applicable. * fix the logic of enabling XDL and WMMA instances * Improve GEMM example verbosity. * Enable build of example_gemm_xdl_fp8_bf8 test. * Fix tests for gfx1101 architecture. * Build DPP examples only on gfx103 and gfx11 architectures. * Optionaly run either CPU or GPU verifications with GEMM examples. * Extend GeneratorTensor_Sequential to produce values of prescribed data types. * Add missing constructor. * Improve infrastructure for OFP8 data type support. * BUGFIX. Should not use FP8 as Compute/Accum data type. * Add custom target for grouped_convnd_bwd_weight tests. * Can build `tests` target on gfx950. * Bugfixes on gfx1101 architecture. * Fix dependencies. * Provide single point of truth for FP8 INF and NAN checks * Prevent instantiation of operators that are not supported by FP8 data types * Add FP8 type selection into client_axample CMakeLists.txt * Prevent sccache server from shutting down during build * Fix test success reporting logic * Change default verification method to CPU. GPU verification takes too much time to complete on the emulator. 
* Make sure all tests and examples are built for gfx950 * Facilitate testing of FP8 data types on the emulator * Introduce two new tensor generators * Enable instances built for gfx94 to be built on gfx950 * Verify 35_splitk_gemm on floating point numbers. splitk gemm appears to be losing precision VS reference implementation when FP numbers are involved. * Verify 04_gemm_add_add_fastgelu on floating point numbers * Verify 20_grouped_conv_bwd_weight on floating point numbers * Verify 38_grouped_conv_bwd_data_multiple_d on floating point numbers * Verify more tests on floating point data * Fix data types and improve testing verbocity. * Upgrade to NPI 573 build docker. * Skip on gemm_universal tests. The tests take too long to complete on the emulator. Need to see if it is possible to reduce the scope of the testing to just FP8 data types. * Fix gfx1101 build * Document test availability * Re-enable fp8 gemms for gfx94/95 * Cherry-pick GEMM Universal tests for FP8 data types * Cleanup * CK_USE_GFX94 has already been set on this branch * Address formatting issues and leftovers * Make fail/pass logic consistent within 01_gemm folder Removed multiple negations in fail/pass logic to propagate `true` as the success indicator. * Fix GPU verification reporting logic. * Update year in copyright notice. * Cleanup * Use `enum class` instead of `enum` * Remove set_property for FP8 tests * Narrowing the scope of PR to OCP FP8 enablement only * Add tests for OCP FP8 vector_type storage * Enable gemm kernel on all gfx9 architectures (#227) * clean-up * Implement `non_native_vector_base` with `ext_vector_type` array. (#232) * Enable support of 1, 2, 4, and 8-byte custom types in CK. 
* Fix pool tests for OCP FP8 data type * fix jenkins file * restore cron trigger --------- Signed-off-by: dependabot[bot] Co-authored-by: aska-0096 Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Jing Zhang Co-authored-by: zjing14 Co-authored-by: Jun Liu Co-authored-by: Andriy Roshchenko Co-authored-by: Andriy Roshchenko <107577548+andriy-ca@users.noreply.github.com> --- CMakeLists.txt | 11 +- client_example/CMakeLists.txt | 8 + example/01_gemm/common.hpp | 2 +- example/01_gemm/run_gemm_example.inc | 4 +- ...rouped_gemm_multiple_d_splitk_xdl_fp16.cpp | 8 +- .../grouped_gemm_multiple_d_xdl_fp16.cpp | 8 +- .../grouped_gemm_xdl_fixed_nk_bias_fp16.cpp | 6 +- .../grouped_gemm_xdl_fixed_nk_fp16.cpp | 4 +- .../grouped_gemm_xdl_fixed_nk_fp16_fp8.cpp | 4 +- .../run_grouped_gemm_example.inc | 7 +- ...xdl_layernorm_naive_single_kernel_fp16.cpp | 6 +- .../run_batched_gemm_gemm_example.inc | 4 +- .../run_batched_gemm_scale_softmax_gemm.inc | 4 +- ...atched_gemm_scale_softmax_gemm_permute.inc | 4 +- ...d_gemm_scale_softmax_gemm_permute_wmma.inc | 4 +- .../run_cross_attention_wmma.inc | 4 +- ...rouped_gemm_scale_softmax_gemm_permute.inc | 4 +- ...n_grouped_query_attention_forward_wmma.inc | 4 +- ...run_multi_query_attention_forward_wmma.inc | 4 +- .../run_self_attention_wmma.inc | 4 +- .../run_splitK_gemm_example.inc | 7 +- ...ed_gemm_add_add_relu_gemm_add_xdl_fp16.cpp | 2 +- .../common.hpp | 4 +- .../gemm_bias_softmax_gemm_permute_xdl.cpp | 4 +- ...mm_multi_abd_xdl_fixed_nk_bias_bf16_i8.cpp | 8 +- ..._gemm_multi_abd_xdl_fixed_nk_bias_fp16.cpp | 6 +- ...emm_multiply_multiply_xdl_fp8_ab_scale.cpp | 3 - example/CMakeLists.txt | 7 + include/ck/library/utility/host_tensor.hpp | 2 +- .../library/utility/host_tensor_generator.hpp | 31 +- ...conv_bwd_weight_two_stage_xdl_cshuffle.hpp | 3 +- ..._gemm_xdl_cshuffle_v3_multi_d_ab_scale.hpp | 3 +- include/ck/utility/amd_buffer_addressing.hpp | 14 +- include/ck/utility/amd_ck_fp8.hpp | 988 
++++++++++++++++++ include/ck/utility/amd_xdlops.hpp | 2 +- include/ck/utility/data_type.hpp | 443 ++++++-- include/ck/utility/math_v2.hpp | 4 +- include/ck/utility/random_gen.hpp | 13 +- include/ck/utility/type_convert.hpp | 204 ++-- .../cpu/reference_gemm.hpp | 10 +- .../gpu/CMakeLists.txt | 4 +- ...evice_max_pool3d_fwd_ndhwc_f8_instance.cpp | 4 +- ...ed_gemm_bias_softmax_gemm_permute_impl.hpp | 4 +- .../profile_batched_gemm_gemm_impl.hpp | 4 +- ...profile_batched_gemm_softmax_gemm_impl.hpp | 4 +- ...batched_gemm_softmax_gemm_permute_impl.hpp | 4 +- .../include/profiler/profile_gemm_impl.hpp | 6 +- test/data_type/CMakeLists.txt | 37 +- .../{test_bf8.cpp => test_bf8_fnuz.cpp} | 135 +-- test/data_type/test_bf8_ocp.cpp | 268 +++++ test/data_type/test_custom_type.cpp | 158 +++ .../{test_fp8.cpp => test_fp8_fnuz.cpp} | 149 +-- test/data_type/test_fp8_ocp.cpp | 250 +++++ test/pool/test_avg_pool2d_fwd.cpp | 2 +- test/pool/test_max_pool2d_fwd.cpp | 2 +- 55 files changed, 2509 insertions(+), 384 deletions(-) create mode 100644 include/ck/utility/amd_ck_fp8.hpp rename test/data_type/{test_bf8.cpp => test_bf8_fnuz.cpp} (52%) create mode 100644 test/data_type/test_bf8_ocp.cpp rename test/data_type/{test_fp8.cpp => test_fp8_fnuz.cpp} (52%) create mode 100644 test/data_type/test_fp8_ocp.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index b28a6d912..2c8698756 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -185,13 +185,22 @@ if (SUPPORTED_GPU_TARGETS MATCHES "gfx9") add_definitions(-DCK_USE_XDL) endif() if (SUPPORTED_GPU_TARGETS MATCHES "gfx94") - message("Enabling FP8 gemms in ckProfiler") + message("Enabling FP8 gemms on native architectures") add_definitions(-DCK_USE_GFX94) endif() if (SUPPORTED_GPU_TARGETS MATCHES "gfx11" OR SUPPORTED_GPU_TARGETS MATCHES "gfx12") message("Enabling WMMA instances") add_definitions(-DCK_USE_WMMA) endif() +if (SUPPORTED_GPU_TARGETS MATCHES "gfx12") + add_definitions(-DCK_USE_OCP_FP8) + set(CK_USE_OCP_FP8 "ON") +endif() +if 
(SUPPORTED_GPU_TARGETS MATCHES "gfx90a" OR SUPPORTED_GPU_TARGETS MATCHES "gfx94") + add_definitions(-DCK_USE_FNUZ_FP8) + set(CK_USE_FNUZ_FP8 "ON") +endif() + option(CK_USE_FP8_ON_UNSUPPORTED_ARCH "Enable FP8 GEMM instances on older architectures" OFF) if(CK_USE_FP8_ON_UNSUPPORTED_ARCH AND (SUPPORTED_GPU_TARGETS MATCHES "gfx90a" OR SUPPORTED_GPU_TARGETS MATCHES "gfx908")) add_definitions(-DCK_USE_FP8_ON_UNSUPPORTED_ARCH) diff --git a/client_example/CMakeLists.txt b/client_example/CMakeLists.txt index c393972b4..ce5834d1e 100644 --- a/client_example/CMakeLists.txt +++ b/client_example/CMakeLists.txt @@ -56,6 +56,14 @@ if (GPU_TARGETS) add_definitions(-DCK_USE_WMMA) set(CK_USE_WMMA "ON") endif() + if (GPU_TARGETS MATCHES "gfx12") + add_definitions(-DCK_USE_OCP_FP8) + set(CK_USE_OCP_FP8 "ON") + endif() + if (GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx94") + add_definitions(-DCK_USE_FNUZ_FP8) + set(CK_USE_FNUZ_FP8 "ON") + endif() else() add_definitions(-DCK_USE_WMMA -DCK_USE_XDL) set(CK_USE_XDL "ON") diff --git a/example/01_gemm/common.hpp b/example/01_gemm/common.hpp index 67bf92bbb..a3a62d4cf 100644 --- a/example/01_gemm/common.hpp +++ b/example/01_gemm/common.hpp @@ -76,7 +76,7 @@ struct ProblemSizeSplitK final struct ExecutionConfig final { // 0 - no verification, 1 - CPU, 2 - GPU, 3 - CPU + GPU - int do_verification = 3; + int do_verification = 1; int init_method = 2; bool time_kernel = false; }; diff --git a/example/01_gemm/run_gemm_example.inc b/example/01_gemm/run_gemm_example.inc index bafec3f35..3ee6e2685 100644 --- a/example/01_gemm/run_gemm_example.inc +++ b/example/01_gemm/run_gemm_example.inc @@ -143,8 +143,8 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config) switch(config.init_method) { case 0: - ck::utils::FillConstant{static_cast(1.f)}(a_m_k); - ck::utils::FillConstant{static_cast(1.f)}(b_k_n); + ck::utils::FillConstant{ck::type_convert(1.f)}(a_m_k); + ck::utils::FillConstant{ck::type_convert(1.f)}(b_k_n); 
break; case 1: ck::utils::FillUniformDistributionIntegerValue{-5.f, 5.f}(a_m_k); diff --git a/example/15_grouped_gemm/grouped_gemm_multiple_d_splitk_xdl_fp16.cpp b/example/15_grouped_gemm/grouped_gemm_multiple_d_splitk_xdl_fp16.cpp index 8bbf8e629..117a18e3b 100644 --- a/example/15_grouped_gemm/grouped_gemm_multiple_d_splitk_xdl_fp16.cpp +++ b/example/15_grouped_gemm/grouped_gemm_multiple_d_splitk_xdl_fp16.cpp @@ -186,15 +186,15 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co b_tensors[i].GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); for(int j = 0; j < NumDMatrices; ++j) { - d_tensors[i][j].GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + d_tensors[i][j].GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); } break; default: - a_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<0>{}); - b_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<1>{}); + a_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential{}); + b_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential{}); for(int j = 0; j < NumDMatrices; ++j) { - d_tensors[i][j].GenerateTensorValue(GeneratorTensor_Sequential<0>{}); + d_tensors[i][j].GenerateTensorValue(GeneratorTensor_Sequential{}); } } } diff --git a/example/15_grouped_gemm/grouped_gemm_multiple_d_xdl_fp16.cpp b/example/15_grouped_gemm/grouped_gemm_multiple_d_xdl_fp16.cpp index e7b2ee417..db162fe44 100644 --- a/example/15_grouped_gemm/grouped_gemm_multiple_d_xdl_fp16.cpp +++ b/example/15_grouped_gemm/grouped_gemm_multiple_d_xdl_fp16.cpp @@ -190,15 +190,15 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co b_tensors[i].GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); for(int j = 0; j < NumDs; ++j) { - d_tensors[i][j].GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + d_tensors[i][j].GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); } break; default: - a_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<0>{}); - 
b_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<1>{}); + a_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential{}); + b_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential{}); for(int j = 0; j < NumDs; ++j) { - d_tensors[i][j].GenerateTensorValue(GeneratorTensor_Sequential<0>{}); + d_tensors[i][j].GenerateTensorValue(GeneratorTensor_Sequential{}); } } } diff --git a/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_bias_fp16.cpp b/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_bias_fp16.cpp index 3b3ef508c..5bdc99319 100644 --- a/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_bias_fp16.cpp +++ b/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_bias_fp16.cpp @@ -167,11 +167,11 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co b_tensors[i].GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); break; default: - a_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<0>{}); - b_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<1>{}); + a_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential{}); + b_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential{}); } - d0_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<1>{}); + d0_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential{}); } using GroupedGemmKernelArgument = ck::tensor_operation::device::GroupedGemmKernelArgument<1>; diff --git a/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_fp16.cpp b/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_fp16.cpp index c1043f419..6806bd188 100644 --- a/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_fp16.cpp +++ b/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_fp16.cpp @@ -157,8 +157,8 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co b_tensors[i].GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); break; default: - a_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<0>{}); - b_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<1>{}); 
+ a_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential{}); + b_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential{}); } } diff --git a/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_fp16_fp8.cpp b/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_fp16_fp8.cpp index c81874b06..8418c10f5 100644 --- a/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_fp16_fp8.cpp +++ b/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_fp16_fp8.cpp @@ -158,8 +158,8 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co b_tensors[i].GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); break; default: - a_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<0>{}); - b_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<1>{}); + a_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential{}); + b_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential{}); } } diff --git a/example/15_grouped_gemm/run_grouped_gemm_example.inc b/example/15_grouped_gemm/run_grouped_gemm_example.inc index 7cb0588b8..64125cd1d 100644 --- a/example/15_grouped_gemm/run_grouped_gemm_example.inc +++ b/example/15_grouped_gemm/run_grouped_gemm_example.inc @@ -1,3 +1,6 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+ #pragma once struct ProblemSize final @@ -124,8 +127,8 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co b_tensors[i].GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); break; default: - a_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<0>{}); - b_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<1>{}); + a_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential{}); + b_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential{}); } } diff --git a/example/21_gemm_layernorm/gemm_xdl_layernorm_naive_single_kernel_fp16.cpp b/example/21_gemm_layernorm/gemm_xdl_layernorm_naive_single_kernel_fp16.cpp index 90d80f9f0..277fea027 100644 --- a/example/21_gemm_layernorm/gemm_xdl_layernorm_naive_single_kernel_fp16.cpp +++ b/example/21_gemm_layernorm/gemm_xdl_layernorm_naive_single_kernel_fp16.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #include #include @@ -175,8 +175,8 @@ int main(int argc, char* argv[]) b_k_n.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); break; default: - a_m_k.GenerateTensorValue(GeneratorTensor_Sequential<0>{}); - b_k_n.GenerateTensorValue(GeneratorTensor_Sequential<1>{}); + a_m_k.GenerateTensorValue(GeneratorTensor_Sequential{}); + b_k_n.GenerateTensorValue(GeneratorTensor_Sequential{}); } c0_n_bias.GenerateTensorValue(GeneratorTensor_2{-5, 5}); diff --git a/example/31_batched_gemm_gemm/run_batched_gemm_gemm_example.inc b/example/31_batched_gemm_gemm/run_batched_gemm_gemm_example.inc index f32914672..d54550868 100644 --- a/example/31_batched_gemm_gemm/run_batched_gemm_gemm_example.inc +++ b/example/31_batched_gemm_gemm/run_batched_gemm_gemm_example.inc @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. 
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -150,7 +150,7 @@ bool run_batched_gemm_gemm_example(int argc, char* argv[]) break; default: a_g_m_k.GenerateTensorValue(GeneratorTensor_1{1}); - b0_g_k_n.GenerateTensorValue(GeneratorTensor_Sequential<1>{}); + b0_g_k_n.GenerateTensorValue(GeneratorTensor_Sequential{}); b1_g_n_o.GenerateTensorValue(GeneratorTensor_Diagonal{}); } diff --git a/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm.inc b/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm.inc index 27602e231..1514fc48b 100644 --- a/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm.inc +++ b/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm.inc @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. int run(int argc, char* argv[]) { @@ -157,7 +157,7 @@ int run(int argc, char* argv[]) break; default: a_g_m_k.GenerateTensorValue(GeneratorTensor_1{1}); - b0_g_k_n.GenerateTensorValue(GeneratorTensor_Sequential<1>{}); + b0_g_k_n.GenerateTensorValue(GeneratorTensor_Sequential{}); b1_g_n_o.GenerateTensorValue(GeneratorTensor_Diagonal{}); } diff --git a/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm_permute.inc b/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm_permute.inc index fa76faea8..2b02069e6 100644 --- a/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm_permute.inc +++ b/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm_permute.inc @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
int run(int argc, char* argv[]) { @@ -118,7 +118,7 @@ int run(int argc, char* argv[]) b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_Diagonal{}); break; default: - a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_Sequential<2>{}); + a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_Sequential{}); b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_Diagonal{}); b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_Diagonal{}); } diff --git a/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm_permute_wmma.inc b/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm_permute_wmma.inc index 2e77479bc..e0ccb6dad 100644 --- a/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm_permute_wmma.inc +++ b/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm_permute_wmma.inc @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. int run(int argc, char* argv[]) { @@ -153,7 +153,7 @@ int run(int argc, char* argv[]) b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_2{-2, 2}); break; default: - a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_Sequential<2>{}); + a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_Sequential{}); b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_Diagonal{}); b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_Diagonal{}); } diff --git a/example/32_batched_gemm_scale_softmax_gemm/run_cross_attention_wmma.inc b/example/32_batched_gemm_scale_softmax_gemm/run_cross_attention_wmma.inc index 9ff4c56e0..0ad031cc7 100644 --- a/example/32_batched_gemm_scale_softmax_gemm/run_cross_attention_wmma.inc +++ b/example/32_batched_gemm_scale_softmax_gemm/run_cross_attention_wmma.inc @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. int run(int argc, char* argv[]) { @@ -178,7 +178,7 @@ int run(int argc, char* argv[]) b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_2{-2, 2}); break; default: - a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_Sequential<2>{}); + a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_Sequential{}); b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_Diagonal{}); b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_Diagonal{}); } diff --git a/example/32_batched_gemm_scale_softmax_gemm/run_grouped_gemm_scale_softmax_gemm_permute.inc b/example/32_batched_gemm_scale_softmax_gemm/run_grouped_gemm_scale_softmax_gemm_permute.inc index ea1e2734a..cdfd86dff 100644 --- a/example/32_batched_gemm_scale_softmax_gemm/run_grouped_gemm_scale_softmax_gemm_permute.inc +++ b/example/32_batched_gemm_scale_softmax_gemm/run_grouped_gemm_scale_softmax_gemm_permute.inc @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. int run(int argc, char* argv[]) { @@ -152,7 +152,7 @@ int run(int argc, char* argv[]) break; default: a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_1{1}); - b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_Sequential<1>{}); + b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_Sequential{}); b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_Diagonal{}); } diff --git a/example/32_batched_gemm_scale_softmax_gemm/run_grouped_query_attention_forward_wmma.inc b/example/32_batched_gemm_scale_softmax_gemm/run_grouped_query_attention_forward_wmma.inc index 609d08529..7ac29f33c 100644 --- a/example/32_batched_gemm_scale_softmax_gemm/run_grouped_query_attention_forward_wmma.inc +++ b/example/32_batched_gemm_scale_softmax_gemm/run_grouped_query_attention_forward_wmma.inc @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. 
All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. int run(int argc, char* argv[]) { @@ -156,7 +156,7 @@ int run(int argc, char* argv[]) b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_2{-2, 2}); break; default: - a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_Sequential<2>{}); + a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_Sequential{}); b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_Diagonal{}); b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_Diagonal{}); } diff --git a/example/32_batched_gemm_scale_softmax_gemm/run_multi_query_attention_forward_wmma.inc b/example/32_batched_gemm_scale_softmax_gemm/run_multi_query_attention_forward_wmma.inc index b05915c07..fb9b1b0bd 100644 --- a/example/32_batched_gemm_scale_softmax_gemm/run_multi_query_attention_forward_wmma.inc +++ b/example/32_batched_gemm_scale_softmax_gemm/run_multi_query_attention_forward_wmma.inc @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. int run(int argc, char* argv[]) { @@ -156,7 +156,7 @@ int run(int argc, char* argv[]) b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_2{-2, 2}); break; default: - a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_Sequential<2>{}); + a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_Sequential{}); b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_Diagonal{}); b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_Diagonal{}); } diff --git a/example/32_batched_gemm_scale_softmax_gemm/run_self_attention_wmma.inc b/example/32_batched_gemm_scale_softmax_gemm/run_self_attention_wmma.inc index 3fdaaebb0..2cb69380e 100644 --- a/example/32_batched_gemm_scale_softmax_gemm/run_self_attention_wmma.inc +++ b/example/32_batched_gemm_scale_softmax_gemm/run_self_attention_wmma.inc @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. 
All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. int run(int argc, char* argv[]) { @@ -173,7 +173,7 @@ int run(int argc, char* argv[]) b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_2{-2, 2}); break; default: - a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_Sequential<2>{}); + a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_Sequential{}); b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_Diagonal{}); b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_Diagonal{}); } diff --git a/example/35_splitK_gemm/run_splitK_gemm_example.inc b/example/35_splitK_gemm/run_splitK_gemm_example.inc index e3690984a..cb1d3410c 100644 --- a/example/35_splitK_gemm/run_splitK_gemm_example.inc +++ b/example/35_splitK_gemm/run_splitK_gemm_example.inc @@ -1,3 +1,6 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + #pragma once struct ProblemSize final @@ -66,8 +69,8 @@ bool run_splitK_gemm(const ProblemSize& problem_size, const ExecutionConfig& con b_k_n.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); break; default: - a_m_k.GenerateTensorValue(GeneratorTensor_Sequential<0>{}); - b_k_n.GenerateTensorValue(GeneratorTensor_Sequential<1>{}); + a_m_k.GenerateTensorValue(GeneratorTensor_Sequential{}); + b_k_n.GenerateTensorValue(GeneratorTensor_Sequential{}); } DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize()); diff --git a/example/37_batched_gemm_add_add_relu_gemm_add/batched_gemm_add_add_relu_gemm_add_xdl_fp16.cpp b/example/37_batched_gemm_add_add_relu_gemm_add/batched_gemm_add_add_relu_gemm_add_xdl_fp16.cpp index ff1282f3c..f27dc6054 100644 --- a/example/37_batched_gemm_add_add_relu_gemm_add/batched_gemm_add_add_relu_gemm_add_xdl_fp16.cpp +++ b/example/37_batched_gemm_add_add_relu_gemm_add/batched_gemm_add_add_relu_gemm_add_xdl_fp16.cpp @@ -377,7 +377,7 @@ int main(int argc, char* argv[]) break; default: 
a0_g_m_k.GenerateTensorValue(GeneratorTensor_1{1}); - b0_g_k_n.GenerateTensorValue(GeneratorTensor_Sequential<1>{}); + b0_g_k_n.GenerateTensorValue(GeneratorTensor_Sequential{}); d00_g_m_n.GenerateTensorValue(GeneratorTensor_1{1}); d01_g_m_n.GenerateTensorValue(GeneratorTensor_1{1}); b1_g_n_o.GenerateTensorValue(GeneratorTensor_Diagonal{}); diff --git a/example/38_grouped_conv_bwd_data_multiple_d/common.hpp b/example/38_grouped_conv_bwd_data_multiple_d/common.hpp index 8a0474156..6af8ac648 100644 --- a/example/38_grouped_conv_bwd_data_multiple_d/common.hpp +++ b/example/38_grouped_conv_bwd_data_multiple_d/common.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -41,7 +41,7 @@ struct ExecutionConfig final { bool do_verification = true; int init_method = 1; - bool time_kernel = true; + bool time_kernel = false; }; #define DefaultConvParams \ diff --git a/example/47_gemm_bias_softmax_gemm_permute/gemm_bias_softmax_gemm_permute_xdl.cpp b/example/47_gemm_bias_softmax_gemm_permute/gemm_bias_softmax_gemm_permute_xdl.cpp index a90a6340a..392cb155c 100644 --- a/example/47_gemm_bias_softmax_gemm_permute/gemm_bias_softmax_gemm_permute_xdl.cpp +++ b/example/47_gemm_bias_softmax_gemm_permute/gemm_bias_softmax_gemm_permute_xdl.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
#include #include @@ -248,7 +248,7 @@ int main(int argc, char* argv[]) d0_gs_ms_ns.GenerateTensorValue(GeneratorTensor_1{1}); break; default: - a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_Sequential<2>{}); + a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_Sequential{}); b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_Diagonal{}); b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_Diagonal{}); d0_gs_ms_ns.GenerateTensorValue(GeneratorTensor_1{1}); diff --git a/example/59_grouped_gemm_multi_ABD/grouped_gemm_multi_abd_xdl_fixed_nk_bias_bf16_i8.cpp b/example/59_grouped_gemm_multi_ABD/grouped_gemm_multi_abd_xdl_fixed_nk_bias_bf16_i8.cpp index 742fd5547..055d25304 100644 --- a/example/59_grouped_gemm_multi_ABD/grouped_gemm_multi_abd_xdl_fixed_nk_bias_bf16_i8.cpp +++ b/example/59_grouped_gemm_multi_ABD/grouped_gemm_multi_abd_xdl_fixed_nk_bias_bf16_i8.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved. 
#include #include @@ -194,9 +194,9 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co b1_tensors[i].GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); break; default: - a0_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<0>{}); - b0_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<1>{}); - b1_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<1>{}); + a0_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential{}); + b0_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential{}); + b1_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential{}); } d0_tensors[i].GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); diff --git a/example/59_grouped_gemm_multi_ABD/grouped_gemm_multi_abd_xdl_fixed_nk_bias_fp16.cpp b/example/59_grouped_gemm_multi_ABD/grouped_gemm_multi_abd_xdl_fixed_nk_bias_fp16.cpp index 809c1a956..1ba8133ea 100644 --- a/example/59_grouped_gemm_multi_ABD/grouped_gemm_multi_abd_xdl_fixed_nk_bias_fp16.cpp +++ b/example/59_grouped_gemm_multi_ABD/grouped_gemm_multi_abd_xdl_fixed_nk_bias_fp16.cpp @@ -184,9 +184,9 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co b_tensors[i].GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); break; default: - a0_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<0>{}); - a1_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<0>{}); - b_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<1>{}); + a0_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential{}); + a1_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential{}); + b_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential{}); } d0_tensors[i].GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); diff --git a/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp8_ab_scale.cpp b/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp8_ab_scale.cpp index 256875464..9b7849a65 100644 --- 
a/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp8_ab_scale.cpp +++ b/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp8_ab_scale.cpp @@ -205,7 +205,6 @@ int main(int argc, char* argv[]) a1_device_buf.ToDevice(a1_m_k.mData.data()); b0_device_buf.ToDevice(b0_k_n.mData.data()); b1_device_buf.ToDevice(b1_k_n.mData.data()); - e_device_buf.ToDevice(e_m_n_device_result.mData.data()); auto a_element_op = AElementOp{}; auto b_element_op = BElementOp{}; @@ -253,8 +252,6 @@ int main(int argc, char* argv[]) std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s" << std::endl; - e_device_buf.FromDevice(e_m_n_device_result.mData.data()); - if(do_verification) { Tensor c_m_n({M, N}); diff --git a/example/CMakeLists.txt b/example/CMakeLists.txt index ea739c707..72759916a 100644 --- a/example/CMakeLists.txt +++ b/example/CMakeLists.txt @@ -54,6 +54,13 @@ function(add_example_executable EXAMPLE_NAME FILE_NAME) list(REMOVE_ITEM FILE_NAME "${source}") endif() endforeach() + #Do not build any DPP examples if DL_KERNELS not set + foreach(source IN LISTS FILE_NAME) + if(NOT DEFINED DL_KERNELS AND source MATCHES "_dpp") + message("removing dpp example ${source} ") + list(REMOVE_ITEM FILE_NAME "${source}") + endif() + endforeach() #Do not build any XDL examples if gfx9 targets are not on the list foreach(source IN LISTS FILE_NAME) if(NOT EX_TARGETS MATCHES "gfx9" AND source MATCHES "_xdl") diff --git a/include/ck/library/utility/host_tensor.hpp b/include/ck/library/utility/host_tensor.hpp index a58acaf11..18e1db462 100644 --- a/include/ck/library/utility/host_tensor.hpp +++ b/include/ck/library/utility/host_tensor.hpp @@ -326,7 +326,7 @@ struct Tensor std::size_t GetElementSpaceSizeInBytes() const { return sizeof(T) * GetElementSpaceSize(); } - void SetZero() { ck::ranges::fill(mData, 0); } + void SetZero() { ck::ranges::fill(mData, T{0}); } template void ForEach_impl(F&& f, std::vector& idx, size_t rank) diff --git 
a/include/ck/library/utility/host_tensor_generator.hpp b/include/ck/library/utility/host_tensor_generator.hpp index e87811b76..ab9f01b53 100644 --- a/include/ck/library/utility/host_tensor_generator.hpp +++ b/include/ck/library/utility/host_tensor_generator.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -37,7 +37,7 @@ struct GeneratorTensor_1 float value = 1.0; template - ck::bhalf_t operator()(Is...) + ck::half_t operator()(Is...) { return ck::type_convert(value); } @@ -62,7 +62,7 @@ struct GeneratorTensor_1 float value = 1.0; template - ck::bhalf_t operator()(Is...) + ck::f8_t operator()(Is...) { return ck::type_convert(value); } @@ -256,14 +256,33 @@ struct GeneratorTensor_Checkboard } }; -template +/** + * @brief Is used to generate sequential values based on the specified dimension. + * + * @tparam T The type of the tensor values. + * @tparam Dim The specific dimension used for generation. + * + * GeneratorTensor_Sequential<1>{} will generate the following values for a 3x3 tensor: + * + * 0 1 2 + * 0 1 2 + * 0 1 2 + * + * Essentially, the values generated are logical coordinates of the generated element that + * correspond to dimension Dim. E.g. for 2-dimensional tensor and Dim=1, the values are the column + * indices. + * + */ +template struct GeneratorTensor_Sequential { template - float operator()(Ts... Xs) const + T operator()(Ts... 
Xs) const { std::array dims = {{static_cast(Xs)...}}; - return dims[Dim]; + + float tmp = dims[Dim]; + return ck::type_convert(tmp); } }; diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_two_stage_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_two_stage_xdl_cshuffle.hpp index c1f58ccda..a7f129b2b 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_two_stage_xdl_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_two_stage_xdl_cshuffle.hpp @@ -111,8 +111,7 @@ __global__ void [[maybe_unused]] const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch, [[maybe_unused]] const index_t num_k_per_block) { -#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \ - defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)) +#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx9__)) // offset base pointer for each work-group const index_t g_idx = __builtin_amdgcn_readfirstlane(blockIdx.z * NumGroupsToMerge); const index_t k_idx = __builtin_amdgcn_readfirstlane(blockIdx.y * num_k_per_block); diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d_ab_scale.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d_ab_scale.hpp index da6b1b304..813acfa65 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d_ab_scale.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d_ab_scale.hpp @@ -38,8 +38,7 @@ __global__ void // __attribute__((amdgpu_waves_per_eu(1, 1))) kernel_gemm_xdl_cshuffle_v3(typename GridwiseGemm::Argument karg) { -#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \ - defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)) +#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx9__)) __shared__ char 
p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()]; GridwiseGemm::template Run( diff --git a/include/ck/utility/amd_buffer_addressing.hpp b/include/ck/utility/amd_buffer_addressing.hpp index d4ee5c886..5367c3d72 100644 --- a/include/ck/utility/amd_buffer_addressing.hpp +++ b/include/ck/utility/amd_buffer_addressing.hpp @@ -549,8 +549,10 @@ __device__ void amd_buffer_store_impl(const typename vector_type::type src (is_same::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) || (is_same::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) || (is_same::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) || - (is_same::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) || - (is_same::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) || + (is_same::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) || + (is_same::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) || + (is_same::value && + (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) || (is_same::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)), "wrong! not implemented"); @@ -843,8 +845,8 @@ amd_buffer_load_invalid_element_return_zero(const T* p_src_wave, #else - vector_t tmp = amd_buffer_load_impl( - src_wave_buffer_resource, src_thread_addr_offset, 0); + vector_t tmp{amd_buffer_load_impl( + src_wave_buffer_resource, src_thread_addr_offset, 0)}; return src_thread_element_valid ? tmp : vector_t(0); #endif } @@ -873,8 +875,8 @@ amd_buffer_load_invalid_element_return_customized_value(const T* p_src_wave, constexpr index_t vector_size = scalar_type::vector_size; - vector_t tmp = amd_buffer_load_impl( - src_wave_buffer_resource, src_thread_addr_offset, 0); + vector_t tmp{amd_buffer_load_impl( + src_wave_buffer_resource, src_thread_addr_offset, 0)}; return src_thread_element_valid ? 
tmp : vector_t(customized_value); } diff --git a/include/ck/utility/amd_ck_fp8.hpp b/include/ck/utility/amd_ck_fp8.hpp new file mode 100644 index 000000000..7b21ad646 --- /dev/null +++ b/include/ck/utility/amd_ck_fp8.hpp @@ -0,0 +1,988 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/random_gen.hpp" +#include "ck/utility/type.hpp" + +#ifdef CK_USE_FNUZ_FP8 +#define CK_USE_FNUZ_FP8 1 +#else +#define CK_USE_FNUZ_FP8 0 +#endif + +#ifdef CK_USE_OCP_FP8 +#define CK_USE_OCP_FP8 1 +#else +#define CK_USE_OCP_FP8 0 +#endif + +namespace ck { + +using f8_fnuz_t = _BitInt(8); +using bf8_fnuz_t = unsigned _BitInt(8); + +#if(defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx1200__) || \ + defined(__gfx1201__)) && \ + __HIP_DEVICE_COMPILE__ +#define CK_FP8_CVT_FAST_PATH 1 +#else +#define CK_FP8_CVT_FAST_PATH 0 +#endif + +#if(defined(__gfx1200__) || defined(__gfx1201__)) && __HIP_DEVICE_COMPILE__ +#define CK_OCP_FP8_CVT_FAST_PATH 1 +#else +#define CK_OCP_FP8_CVT_FAST_PATH 0 +#endif + +typedef unsigned char fp8_storage_t; + +/** + * \brief Describes FP8 interpretation + */ +enum class ck_fp8_interpretation_t +{ + CK_E4M3_OCP = 0, // OCP E4M3 + CK_E5M2_OCP = 1, // OCP E5M2 + CK_E4M3_FNUZ = 2, // FP8 + CK_E5M2_FNUZ = 3, // BF8 +}; + +/** + * \brief Describes saturation behavior + */ +enum class ck_saturation_t +{ + CK_NOSAT = 0, // No saturation - replace with NaN or Inf + CK_SATFINITE = 1, // Saturate to finite +}; + +namespace fp8_impl { + +typedef fp8_storage_t fp8x2_storage_t __attribute__((ext_vector_type(2))); +typedef float float2_t __attribute__((ext_vector_type(2))); + +__host__ __device__ static inline constexpr bool fnuz_f8_is_nan(f8_fnuz_t a) +{ + return static_cast(a) == 0x80; +} +__host__ __device__ static inline constexpr bool fnuz_bf8_is_nan(bf8_fnuz_t a) +{ + return static_cast(a) == 0x80; +} + +__host__ __device__ static inline 
constexpr bool ocp_f8_is_nan(fp8_storage_t a) +{ + return (a & 0x7f) == 0x7f; +} +__host__ __device__ static inline constexpr bool ocp_bf8_is_nan(fp8_storage_t a) +{ + return (a & 0x7f) > 0x7c; +} + +// The conversion function is from rocblas +// https://github.com/ROCm/rocBLAS/blob/9b7f692abe3c54b88d1e77e045a7db7f1f188b69/library/include/internal/rocblas_hip_f8_impl.h#L220 +// This has been modified to handle double types as well +template +__host__ __device__ static inline T cast_from_f8(fp8_storage_t x) +{ + constexpr bool is_half = __hip_internal::is_same::value; + constexpr bool is_float = __hip_internal::is_same::value; + constexpr bool is_double = __hip_internal::is_same::value; + static_assert(is_half || is_float || is_double, "only half, float and double are supported"); + + constexpr int weo = is_half ? 5 : (is_float ? 8 : 11); + constexpr int wmo = is_half ? 10 : (is_float ? 23 : 52); + + T fInf, fNegInf, fNaN, fNeg0, fmax, fmin; + if constexpr(is_half) + { + const unsigned short int ihInf = 0x7C00; + const unsigned short int ihNegInf = 0xFC00; + const unsigned short int ihNaN = 0x7C01; + const unsigned short int ihNeg0 = 0x8000; + /* Max number in e5m2 57344*/ + const unsigned short int ifmax = 0x7B00; + const unsigned short int ifmin = 0xFB00; + + fInf = bit_cast<_Float16>(ihInf); + fNegInf = bit_cast<_Float16>(ihNegInf); + fNaN = bit_cast<_Float16>(ihNaN); + fNeg0 = bit_cast<_Float16>(ihNeg0); + fmax = bit_cast<_Float16>(ifmax); + fmin = bit_cast<_Float16>(ifmin); + } + else if constexpr(is_float) + { + const unsigned int ifInf = 0x7F800000; + const unsigned int ifNegInf = 0xFF800000; + const unsigned int ifNaN = 0x7F800001; + const unsigned int ifNeg0 = 0x80000000; + /* Max number in e5m2 57344*/ + const unsigned int ifmax = 0x47600000; + const unsigned int ifmin = 0xC7600000; + + fInf = bit_cast(ifInf); + fNegInf = bit_cast(ifNegInf); + fNaN = bit_cast(ifNaN); + fNeg0 = bit_cast(ifNeg0); + fmax = bit_cast(ifmax); + fmin = bit_cast(ifmin); + } + else 
if constexpr(is_double) + { + const unsigned long long ifInf = 0x7FF0000000000000ull; + const unsigned long long ifNegInf = 0xFFF0000000000000ull; + const unsigned long long ifNaN = 0x7FF0000000000001ull; + const unsigned long long ifNeg0 = 0x8000000000000000ull; + /* Max number in e5m2 57344*/ + const unsigned long long ifmax = 0x40EC000000000000ull; + const unsigned long long ifmin = 0xC0EC000000000000ull; + + fInf = bit_cast(ifInf); + fNegInf = bit_cast(ifNegInf); + fNaN = bit_cast(ifNaN); + fNeg0 = bit_cast(ifNeg0); + fmax = bit_cast(ifmax); + fmin = bit_cast(ifmin); + } + + if(x == 0) + { + return 0; + } + + unsigned long long sign = x >> 7; + unsigned long long mantissa = x & ((1 << wm) - 1); + int exponent = (x & 0x7F) >> wm; + if constexpr(is_fnuz) + { + if(x == 0x80) + { + return fNaN; + } + } + else + { + if(x == 0x80) + { + return fNeg0; + } + if constexpr(we == 4) + { // e4m3 + if((x & 0x7F) == 0x7F) + { + return fNaN; + } + } + else if((x & 0x7C) == 0x7C) + { // e5m2 + if((x & 0x3) == 0) + { + if constexpr(clip) + { + return sign ? fmin : fmax; + } + return sign ? fNegInf : fInf; + } + return fNaN; + } + } + + typename __hip_internal::conditional< + sizeof(T) == 2, + unsigned short int, + typename __hip_internal::conditional:: + type>::type retval; + + if constexpr(we == 5 && is_half && !is_fnuz) + { + retval = x << 8; + return bit_cast(retval); + } + + const int exp_low_cutoff = (1 << (weo - 1)) - (1 << (we - 1)) + 1 - (is_fnuz ? 
1 : 0); + + // subnormal input + if(exponent == 0) + { +#if defined(__HIP_DEVICE_COMPILE__) && __HIP_DEVICE_COMPILE__ + // guaranteed mantissa!=0 since cases 0x0 and 0x80 are handled above + int sh = 1 + __clz(mantissa) - (32 - wm); +#else + int sh = 1 + __builtin_clz(mantissa) - (32 - wm); +#endif + mantissa <<= sh; + exponent += 1 - sh; + mantissa &= ((1ull << wm) - 1); + } + exponent += exp_low_cutoff - 1; + mantissa <<= wmo - wm; + + // subnormal output (occurs when T=half, we=5, negative_zero_nan=true) + if(exponent <= 0) + { + mantissa |= 1 << wmo; + mantissa >>= 1 - exponent; + exponent = 0; + } + + if constexpr(sizeof(T) == 2) + retval = (sign << 15) | (exponent << 10) | mantissa; + else if constexpr(sizeof(T) == 4) + retval = (sign << 31) | (exponent << 23) | mantissa; + else + retval = (sign << 63) | (static_cast(exponent) << 52) | mantissa; + + return bit_cast(retval); +} + +#if CK_FP8_CVT_FAST_PATH +template +static __device__ float cast_to_f32_from_f8(fp8_storage_t v) +{ + union + { + unsigned int i32val; + unsigned char i8val[4]; + } val; + val.i8val[0] = v; + + static_assert(interpret == ck_fp8_interpretation_t::CK_E4M3_FNUZ || + interpret == ck_fp8_interpretation_t::CK_E4M3_OCP || + interpret == ck_fp8_interpretation_t::CK_E5M2_FNUZ || + interpret == ck_fp8_interpretation_t::CK_E5M2_OCP, + "Only FNUZ and OCP interpretations are supported"); + + if constexpr((interpret == ck_fp8_interpretation_t::CK_E4M3_FNUZ) || + (interpret == ck_fp8_interpretation_t::CK_E4M3_OCP)) + { + return __builtin_amdgcn_cvt_f32_fp8(val.i32val, 0); + } + else + { + return __builtin_amdgcn_cvt_f32_bf8(val.i32val, 0); + } +} + +template +static __device__ float2_t cast_to_f32x2_from_f8x2(fp8x2_storage_t v) +{ + const auto i16val = bit_cast(v); + + static_assert(interpret == ck_fp8_interpretation_t::CK_E4M3_FNUZ || + interpret == ck_fp8_interpretation_t::CK_E4M3_OCP || + interpret == ck_fp8_interpretation_t::CK_E5M2_FNUZ || + interpret == ck_fp8_interpretation_t::CK_E5M2_OCP, + 
"Only FNUZ and OCP interpretations are supported"); + + if constexpr((interpret == ck_fp8_interpretation_t::CK_E4M3_FNUZ) || + (interpret == ck_fp8_interpretation_t::CK_E4M3_OCP)) + { + return __builtin_amdgcn_cvt_pk_f32_fp8(i16val, false); + } + else + { + return __builtin_amdgcn_cvt_pk_f32_bf8(i16val, false); + } +} + +#endif + +} // namespace fp8_impl + +struct f8_ocp_t +{ + using data_type = fp8_storage_t; + data_type data; + + static constexpr ck_saturation_t default_saturation = ck_saturation_t::CK_SATFINITE; + static constexpr ck_fp8_interpretation_t default_interpret = + ck_fp8_interpretation_t::CK_E4M3_OCP; + + static constexpr unsigned int we = 4; // exponent width + static constexpr unsigned int wm = 3; // mantissa width + + __host__ __device__ constexpr bool operator==(const f8_ocp_t& other) const + { + return (data == other.data) && (fp8_impl::ocp_f8_is_nan(data) == false); // NaN != NaN + } + +#if CK_USE_OCP_FP8 + __host__ __device__ explicit operator float() const +#else + __host__ explicit operator float() const +#endif + { +#if CK_OCP_FP8_CVT_FAST_PATH + return fp8_impl::cast_to_f32_from_f8(this->data); +#else + return fp8_impl::cast_from_f8( + this->data); // XXX: clip==false must be consistent with operator _Float16 +#endif + } + +#if CK_USE_OCP_FP8 + __host__ __device__ explicit operator _Float16() const +#else + __host__ explicit operator _Float16() const +#endif + { +#if CK_OCP_FP8_CVT_FAST_PATH + return static_cast<_Float16>(fp8_impl::cast_to_f32_from_f8(this->data)); +#else + return fp8_impl::cast_from_f8<_Float16, wm, we, false>( + this->data); // XXX: clip==false must be consistent with operator float +#endif + } +}; + +struct bf8_ocp_t +{ + using data_type = fp8_storage_t; + data_type data; + + static constexpr ck_saturation_t default_saturation = ck_saturation_t::CK_SATFINITE; + static constexpr ck_fp8_interpretation_t default_interpret = + ck_fp8_interpretation_t::CK_E5M2_OCP; + + static constexpr unsigned int we = 5; // exponent width 
+ static constexpr unsigned int wm = 2; // mantissa width + + __host__ __device__ constexpr bool operator==(const bf8_ocp_t& other) const + { + return (data == other.data) && (fp8_impl::ocp_bf8_is_nan(data) == false); // NaN != NaN + } + +#if CK_USE_OCP_FP8 + __host__ __device__ explicit operator float() const + +#else + __host__ explicit operator float() const +#endif + { +#if defined(__gfx1200__) || defined(__gfx1201__) + return fp8_impl::cast_to_f32_from_f8(this->data); +#else + return fp8_impl::cast_from_f8( + this->data); // XXX: clip==false must be consistent with operator _Float16 +#endif + } + +#if CK_USE_OCP_FP8 + __host__ __device__ explicit operator _Float16() const +#else + __host__ explicit operator _Float16() const +#endif + { +#if defined(__gfx1200__) || defined(__gfx1201__) + return static_cast<_Float16>(fp8_impl::cast_to_f32_from_f8(this->data)); +#else + return fp8_impl::cast_from_f8<_Float16, wm, we, false>( + this->data); // XXX: clip==false must be consistent with operator float +#endif + } +}; + +template +__host__ __device__ static inline constexpr bool fp8_is_nan(T); + +template <> +__host__ __device__ inline constexpr bool fp8_is_nan(f8_ocp_t a) +{ + return fp8_impl::ocp_f8_is_nan(a.data); +} +template <> +__host__ __device__ inline constexpr bool fp8_is_nan(bf8_ocp_t a) +{ + return fp8_impl::ocp_bf8_is_nan(a.data); +} +template <> +__host__ __device__ inline constexpr bool fp8_is_nan(f8_fnuz_t a) +{ + return fp8_impl::fnuz_f8_is_nan(a); +} +template <> +__host__ __device__ inline constexpr bool fp8_is_nan(bf8_fnuz_t a) +{ + return fp8_impl::fnuz_bf8_is_nan(a); +} + +template || std::is_same_v || + std::is_same_v || std::is_same_v, + bool> = true> +__host__ __device__ static inline constexpr bool fp8_is_inf(T) +{ + return false; +} +template <> +__host__ __device__ inline constexpr bool fp8_is_inf(bf8_ocp_t a) +{ + return (a.data & 0x7f) == 0x7c; +} + +namespace fp8_impl { + +// Assertions to check for supported conversion types +#define 
__assert_ocp_support(interp) \ + { \ + if(interp != ck_fp8_interpretation_t::CK_E4M3_OCP && \ + interp != ck_fp8_interpretation_t::CK_E5M2_OCP) \ + { \ + __hip_assert(false && "type is unsupported by current target device"); \ + } \ + } +#define __assert_fnuz_support(interp) \ + { \ + if(interp != ck_fp8_interpretation_t::CK_E4M3_FNUZ && \ + interp != ck_fp8_interpretation_t::CK_E5M2_FNUZ) \ + { \ + __hip_assert(false && "type is unsupported by current target device"); \ + } \ + } + +__host__ __device__ static inline void +__is_interpret_supported([[maybe_unused]] ck_fp8_interpretation_t interp) +{ +#if defined(__HIP_DEVICE_COMPILE__) && __HIP_DEVICE_COMPILE__ +#if CK_USE_OCP_FP8 + __assert_ocp_support(interp); +#endif +#if CK_USE_FNUZ_FP8 + __assert_fnuz_support(interp); +#endif +#endif +} + +#if CK_FP8_CVT_FAST_PATH +// The conversion function is from rocblas +// https://github.com/ROCm/rocBLAS/blob/9b7f692abe3c54b88d1e77e045a7db7f1f188b69/library/include/internal/rocblas_float8.h#L79 +template +static __device__ fp8_storage_t cast_to_f8_from_f32(float v, unsigned int rng = 0) +{ + fp8_storage_t i8data; + union + { + float fval; + unsigned int i32val; + unsigned char i8val[4]; // NOTE: not endian independent + } val; + + unsigned int ival = 0; + val.fval = v; + + if constexpr(saturate) + { + if constexpr(interpret == ck_fp8_interpretation_t::CK_E4M3_FNUZ) + { + if((val.i32val & 0x7F800000) != 0x7F800000) + { /// propagate NAN/INF, no clipping + val.fval = __builtin_amdgcn_fmed3f(val.fval, 240.0, -240.0); + } + } + else if constexpr(interpret == ck_fp8_interpretation_t::CK_E4M3_OCP) + { // OCP type + if((val.i32val & 0x7F800000) != 0x7F800000) + { /// propagate NAN/INF, no clipping + val.fval = __builtin_amdgcn_fmed3f(val.fval, 448.0, -448.0); + } + } + else + { + if((val.i32val & 0x7F800000) != 0x7F800000) + { /// propagate NAN/INF, no clipping + val.fval = __builtin_amdgcn_fmed3f(val.fval, 57344.0, -57344.0); + } + } + } + + if constexpr(stochastic_rounding) + { 
+ ival = (interpret == ck_fp8_interpretation_t::CK_E4M3_FNUZ) || + (interpret == ck_fp8_interpretation_t::CK_E4M3_OCP) + ? __builtin_amdgcn_cvt_sr_fp8_f32(val.fval, rng, ival, 0) + : __builtin_amdgcn_cvt_sr_bf8_f32(val.fval, rng, ival, 0); // 0 pos + val.i32val = ival; + i8data = val.i8val[0]; // little endian + } + else + { // RNE CVT + ival = (interpret == ck_fp8_interpretation_t::CK_E4M3_FNUZ) || + (interpret == ck_fp8_interpretation_t::CK_E4M3_OCP) + ? __builtin_amdgcn_cvt_pk_fp8_f32(val.fval, val.fval, ival, false) + : __builtin_amdgcn_cvt_pk_bf8_f32(val.fval, + val.fval, + ival, + false); // false -> WORD0 + val.i32val = ival; + i8data = val.i8val[0]; + } + return i8data; +} +#endif // CK_FP8_CVT_FAST_PATH + +// The conversion function is from rocblas +// https://github.com/ROCm/rocBLAS/blob/9b7f692abe3c54b88d1e77e045a7db7f1f188b69/library/include/internal/rocblas_hip_f8_impl.h#L39 +// This has been modified to add double types conversion as well +template +__host__ __device__ static inline fp8_storage_t cast_to_f8(T _x, unsigned int rng = 0) +{ + constexpr bool is_half = __hip_internal::is_same::value; + constexpr bool is_float = __hip_internal::is_same::value; + constexpr bool is_double = __hip_internal::is_same::value; + static_assert(is_half || is_float || is_double, + "Only half, float and double can be cast to f8"); + + constexpr int mfmt = (sizeof(T) == 8) ? 52 : ((sizeof(T) == 4) ? 
23 : 10); + + using T_bitwise = typename __hip_internal::conditional< + sizeof(T) == 2, + unsigned short int, + typename __hip_internal::conditional:: + type>::type; + T_bitwise x_bitwise = bit_cast(_x); + + unsigned long long x{x_bitwise}; + + unsigned long long head, mantissa; + int exponent, bias; + unsigned int sign; + unsigned long long fInf, mask; + + if constexpr(sizeof(T) == 8) + { + head = x & 0xFFF0000000000000ull; + mantissa = x & 0xFFFFFFFFFFFFFull; + exponent = (head >> 52) & 0x7FF; + sign = head >> 63; + bias = 1023; + fInf = 0x7FF0000000000000ull; + mask = 0x7FFFFFFFFFFFFFFFull; + } + else if constexpr(sizeof(T) == 4) + { + head = x & 0xFF800000; + mantissa = x & 0x7FFFFF; + exponent = (head >> 23) & 0xFF; + sign = head >> 31; + bias = 127; + fInf = 0x7F800000; + mask = 0x7FFFFFFF; + } + else + { + head = x & 0xFC00; + mantissa = x & 0x3FF; + exponent = (head >> 10) & 0x1F; + sign = head >> 15; + bias = 15; + fInf = 0x7C00; + mask = 0x7FFF; + } + unsigned int signed_inf = 0; + unsigned int nan = 0; + if constexpr(is_fnuz) + { + signed_inf = clip ? ((sign << 7) + 0x7f) : 0x80; + nan = 0x80; + } + else + { + if constexpr(we == 4) + { // e4m3 + signed_inf = (sign << 7) + (clip ? 0x7e : 0x7f); + } + else + { // e5m2 + signed_inf = (sign << 7) + (clip ? 
0x7b : 0x7c); + } + nan = (sign << 7) + 0x7f; + } + // Max values + unsigned long long ifmax = 0; + if constexpr(sizeof(T) == 8) + { + if constexpr(we == 5) + { // 57344 + ifmax = 0x40EC000000000000ull; + } + else + { + if constexpr(is_fnuz) + { // 240 + ifmax = 0x406E000000000000ull; + } + else + { // 448 + ifmax = 0x407C000000000000ull; + } + } + } + else if(sizeof(T) == 4) + { + if constexpr(we == 5) + { + ifmax = 0x47600000; + } + else + { + if constexpr(is_fnuz) + { + ifmax = 0x43700000; + } + else + { + ifmax = 0x43E00000; + } + } + } + else + { + if constexpr(we == 5) + { + ifmax = 0x7B00; + } + else + { + if constexpr(is_fnuz) + { + ifmax = 0x5B80; + } + else + { + ifmax = 0x5F00; + } + } + } + // Deal with inf and NaNs + if((x & fInf) == fInf) + { + if constexpr(is_fnuz) + return signed_inf; + + return mantissa != 0 ? nan : signed_inf; + } + + if((x & mask) > ifmax) + { + return signed_inf; + } + + if(x == 0) + { + return 0; + } + + // First need to check if it is normal or denorm as there is a difference of + // implicit 1 Then need to adjust the exponent to align with the F8 exponent, + // in the meanwhile, shift The mantissa. Then for stochastic rounding, add rng + // to mantissa and truncate. And for RNE, no need to add rng. Then probably + // need to check whether there is carry and adjust exponent and mantissa again + + // For IEEE bias mode, the bias is 2^(k-1) -1 where k is the width of exponent + // bits + const int f8_bias = (1 << (we - 1)) - 1 + (is_fnuz ? 1 : 0); + const int f8_denormal_act_exponent = 1 - f8_bias; // actual exponent of f8 denormal + // act_exponent is the actual exponent of fp32/fp16 (after subtracting bias) + // f8_exponent is the converted f8 exponent with bias encoding + // exponent_diff is the diff between fp32/fp16 exponent and f8 exponent, + // the difference needs to be adjusted and mantissa shifted + int act_exponent, f8_exponent, exponent_diff; + + if(exponent == 0) + { // fp32/fp16 is in denormal. 
+ /* fp32 denormal is below 2^-127 so it is usually not a concern here, we + mostly concern fp16 here. In this case, f8 is usually in denormal. But there + could be exceptions. fp16 denormal has exponent bias 15 while bf8 with NANOO has + exponent bias 16. It means that there are some numbers in fp16 denormal but they + are bf8 (NANOO) normals - smallest bf8 (NANOO) normal is 2^-15. fp16 numbers + where exponent==0 (actual exponent -14) and highest bit of mantissa is 1 are bf8 + (NANOO) normal. In this case, the fp16 mantissa should be shift left by 1 */ + act_exponent = exponent - bias + 1; + exponent_diff = f8_denormal_act_exponent - + act_exponent; // actual exponent is exponent-bias+1 as it is denormal + } + else + { // fp32/fp16 is normal with implicit 1 + act_exponent = exponent - bias; + if(act_exponent <= f8_denormal_act_exponent) + { + /* This is the case where fp32/fp16 is normal but it is in f8 denormal + range. For example fp8 nanoo mode, denormal exponent is -7, but if the fp32/fp16 + actual exponent is -7, it is actually larger due to the implicit 1, + Therefore it needs to be adjust to -6 and mantissa shift right by 1. + So for fp32/fp16, exponent -8 is the cut point to convert to fp8 nanoo */ + exponent_diff = f8_denormal_act_exponent - act_exponent; + } + else + { // both fp32/fp16 and f8 are in normal range + exponent_diff = 0; // exponent_diff=0 does not mean there is no difference + // for this case, act_exponent could be larger. Just + // that it does not need shift mantissa + } + mantissa += (1ull << mfmt); // Add the implicit 1 into mantissa + } + + bool midpoint = (mantissa & ((1ull << (mfmt - wm + exponent_diff)) - 1)) == + (1ull << (mfmt - wm + exponent_diff - 1)); + /* This part is a bit tricky. The judgment of whether it is a tie needs to be + done before we shift right as shift right could rip off some residual part and + make something not midpoint look like midpoint. 
For example, the fp16 number + 0x1002 (0 00100 0000000010), it is larger than midpoint, but after shift right + by 4 bits, it would look like midpoint. + */ + + if(exponent_diff > 0) + mantissa >>= exponent_diff; + else if(exponent_diff == -1) + mantissa <<= -exponent_diff; + bool implicit_one = mantissa & (1ull << mfmt); + // if there is no implicit 1, it means the f8 is denormal and need to adjust + // to denorm exponent + f8_exponent = + (act_exponent + exponent_diff) /*actual f8 exponent*/ + f8_bias - (implicit_one ? 0 : 1); + + // Now we have the exponent and mantissa adjusted + unsigned long long drop_mask = (1ull << (mfmt - wm)) - 1; + bool odd = + mantissa & (1ull << (mfmt - wm)); // if the least significant bit that is not truncated is 1 + mantissa += + (stoch ? rng : (midpoint ? (odd ? mantissa : mantissa - 1ull) : mantissa)) & drop_mask; + + // Now we deal with overflow + if(f8_exponent == 0) + { + if((1ull << mfmt) & mantissa) + { + f8_exponent = 1; // denormal overflow to become normal, promote exponent + } + } + else + { + if((1ull << (mfmt + 1)) & mantissa) + { + mantissa >>= 1; + f8_exponent++; + } + } + + mantissa >>= (mfmt - wm); + + // above range: quantize to maximum possible float of the same sign + const int max_exp = (1 << we) - 1; + if(f8_exponent > max_exp) + { + if constexpr(clip) + { + mantissa = (1 << wm) - 1; + f8_exponent = max_exp; + } + else + { + return signed_inf; + } + } + + if(f8_exponent == 0 && mantissa == 0) + return is_fnuz ? 
0 : (sign << 7); + mantissa &= (1 << wm) - 1; + return (sign << 7) | (f8_exponent << wm) | mantissa; +} + +/** + * \brief convert float to @p fp8_storage_t + * + * \tparam interp interpretation of fp8 + * \tparam sat saturation of fp8 + * \param f float number + * \return fp8_storage_t + */ +template +#if CK_FP8_CVT_FAST_PATH +__host__ __device__ static inline fp8_storage_t cvt_float_to_fp8(const float f) +{ + __is_interpret_supported(interp); + uint32_t rng = 0; + if constexpr(stochastic_rounding) + { + constexpr int seed = 1254739; + rng = prand_generator(reinterpret_cast(&f), f); + } + return cast_to_f8_from_f32( + f, rng); +#else +#if CK_USE_OCP_FP8 +__host__ __device__ static inline fp8_storage_t cvt_float_to_fp8(const float f) +{ +#else +__host__ static inline fp8_storage_t cvt_float_to_fp8(const float f) +{ +#endif + uint32_t rng = 0; + if constexpr(stochastic_rounding) + { + constexpr int seed = 1254739; + rng = prand_generator(reinterpret_cast(&f), f); + } + + if constexpr(interp == ck_fp8_interpretation_t::CK_E4M3_FNUZ) + { + return cast_to_f8(f, rng); + } + else if constexpr(interp == ck_fp8_interpretation_t::CK_E5M2_FNUZ) + { + return cast_to_f8(f, rng); + } + else if constexpr(interp == ck_fp8_interpretation_t::CK_E4M3_OCP) + { + return cast_to_f8(f, rng); + } + else if constexpr(interp == ck_fp8_interpretation_t::CK_E5M2_OCP) + { + return cast_to_f8(f, rng); + } + else + { + __hip_assert(false && "FP8 type is not supported by current target device"); + return 0; + } +#endif // CK_FP8_CVT_FAST_PATH +} + +/** + * \brief convert _Float16 to @p fp8_storage_t + * + * \tparam sat saturation of fp8 + * \tparam interp interpretation of fp8 + * \tparam stochastic_rounding switch between RNE and SR + * \param x _Float16 value + * \return fp8_storage_t + */ +template +#if CK_FP8_CVT_FAST_PATH || CK_USE_OCP_FP8 +__host__ __device__ static inline fp8_storage_t cvt_half_t_to_fp8(const _Float16 x) +#else +__host__ static inline fp8_storage_t cvt_half_t_to_fp8(const 
_Float16 x) +#endif +{ + return cvt_float_to_fp8(static_cast(x)); +} + +} // namespace fp8_impl + +// Declare a template function for fp8 conversion using RNE +template +__host__ __device__ constexpr Y f8_convert_rne(X x); + +// convert fp32 to fp8 with rounding to nearest even +template <> +inline __host__ __device__ f8_ocp_t f8_convert_rne(float x) +{ + return f8_ocp_t{ + fp8_impl::cvt_float_to_fp8(x)}; +} + +// convert fp32 to bf8 with rounding to nearest even +template <> +inline __host__ __device__ bf8_ocp_t f8_convert_rne(float x) +{ + return bf8_ocp_t{ + fp8_impl::cvt_float_to_fp8(x)}; +} + +// convert _Float16 to fp8 with rounding to nearest even +template <> +inline __host__ __device__ f8_ocp_t f8_convert_rne(_Float16 x) +{ + return f8_ocp_t{ + fp8_impl::cvt_half_t_to_fp8(x)}; +} + +template <> +inline __host__ __device__ bf8_ocp_t f8_convert_rne(_Float16 x) +{ + return bf8_ocp_t{ + fp8_impl::cvt_half_t_to_fp8( + x)}; +} + +// Declare a template function for fp8 conversion using stochastic rounding +template +__host__ __device__ constexpr Y f8_convert_sr(X x); + +// convert fp32 to fp8 with stochastic rounding +template <> +inline __host__ __device__ f8_ocp_t f8_convert_sr(float x) +{ + return f8_ocp_t{ + fp8_impl::cvt_float_to_fp8( + x)}; +} + +// convert fp32 to bf8 with stochastic rounding +template <> +inline __host__ __device__ bf8_ocp_t f8_convert_sr(float x) +{ + return bf8_ocp_t{fp8_impl::cvt_float_to_fp8(x)}; +} + +// convert _Float16 to fp8 with stochastic rounding +template <> +inline __host__ __device__ f8_ocp_t f8_convert_sr(_Float16 x) +{ + return f8_ocp_t{fp8_impl::cvt_half_t_to_fp8(x)}; +} + +// convert _Float16 to bf8 with stochastic rounding +template <> +inline __host__ __device__ bf8_ocp_t f8_convert_sr(_Float16 x) +{ + return bf8_ocp_t{fp8_impl::cvt_half_t_to_fp8(x)}; +} + +#if CK_USE_OCP_FP8 +using f8_t = f8_ocp_t; +using bf8_t = bf8_ocp_t; +#define CK_FP8_TYPE_FNUZ 0 +#define CK_FP8_TYPE_OCP 1 +#else +using f8_t = f8_fnuz_t; +using bf8_t = 
bf8_fnuz_t; +#define CK_FP8_TYPE_FNUZ 1 +#define CK_FP8_TYPE_OCP 0 +#endif + +} // namespace ck diff --git a/include/ck/utility/amd_xdlops.hpp b/include/ck/utility/amd_xdlops.hpp index a955279bc..5a7030cca 100644 --- a/include/ck/utility/amd_xdlops.hpp +++ b/include/ck/utility/amd_xdlops.hpp @@ -4,7 +4,7 @@ #pragma once namespace ck { -// Define the common macro for gfx94x models +// Define the common macro for MI300 models #if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) #define __gfx94__ #endif diff --git a/include/ck/utility/data_type.hpp b/include/ck/utility/data_type.hpp index 39f532e0e..a7dc071bc 100644 --- a/include/ck/utility/data_type.hpp +++ b/include/ck/utility/data_type.hpp @@ -3,6 +3,7 @@ #pragma once +#include "ck/utility/amd_ck_fp8.hpp" #include "ck/utility/statically_indexed_array.hpp" namespace ck { @@ -10,8 +11,6 @@ namespace ck { using bhalf_t = ushort; using half_t = _Float16; using int4_t = _BitInt(4); -using f8_t = _BitInt(8); -using bf8_t = unsigned _BitInt(8); inline constexpr auto next_pow2(uint32_t x) { @@ -19,14 +18,15 @@ inline constexpr auto next_pow2(uint32_t x) return x > 1u ? 
(1u << (32u - __builtin_clz(x - 1u))) : x; } -// native types: double, float, _Float16, ushort, int32_t, int8_t, uint8_t, f8_t, bf8_t, bool +// native types: double, float, _Float16, ushort, int32_t, int8_t, uint8_t, f8_fnuz_t, bf8_fnuz_t, +// native types: bool template inline constexpr bool is_native_type() { return is_same::value || is_same::value || is_same::value || is_same::value || is_same::value || is_same::value || - is_same::value || is_same::value || is_same::value || - is_same::value; + is_same::value || is_same::value || + is_same::value || is_same::value; } // vector_type @@ -166,16 +166,30 @@ struct scalar_type #endif template <> -struct scalar_type +struct scalar_type { - using type = f8_t; + using type = f8_fnuz_t; static constexpr index_t vector_size = 1; }; template <> -struct scalar_type +struct scalar_type { - using type = bf8_t; + using type = bf8_fnuz_t; + static constexpr index_t vector_size = 1; +}; + +template <> +struct scalar_type +{ + using type = f8_ocp_t::data_type; + static constexpr index_t vector_size = 1; +}; + +template <> +struct scalar_type +{ + using type = bf8_ocp_t::data_type; static constexpr index_t vector_size = 1; }; @@ -1010,60 +1024,203 @@ struct vector_type()>> } }; +template +struct non_native_vector_base; + +template +struct nnvb_data_t_selector +{ + using type = unsigned _BitInt(8 * sizeof(T)); +}; + +template <> +struct nnvb_data_t_selector +{ + using type = f8_ocp_t::data_type; +}; +template <> +struct nnvb_data_t_selector +{ + using type = bf8_ocp_t::data_type; +}; + +template +struct non_native_vector_base< + T, + N, + std::enable_if_t> +{ + using data_t = typename nnvb_data_t_selector::type; // select data_t based on the size of T + static_assert(sizeof(T) == sizeof(data_t), "non_native_vector_base storage size mismatch"); + using data_v = data_t __attribute__((ext_vector_type(N))); + using type = non_native_vector_base; + + union alignas(next_pow2(N * sizeof(T))) + { + data_v dN; // storage vector; + 
StaticallyIndexedArray dxN; + StaticallyIndexedArray dTxN; + StaticallyIndexedArray dNx1; + } data_; + + __host__ __device__ constexpr non_native_vector_base(data_t a) : data_{data_v(a)} {} + __host__ __device__ constexpr non_native_vector_base(T f) + : non_native_vector_base(bit_cast(f)) + { + } + __host__ __device__ constexpr non_native_vector_base() : non_native_vector_base(T{}){}; + __host__ __device__ constexpr non_native_vector_base(data_v v) : data_{v} {} + + __host__ __device__ constexpr operator data_v() const { return data_.dN; } + __host__ __device__ constexpr operator data_t() const + { + if constexpr(N == 1) + { + return data_.dxN[Number<0>{}]; + } + else + { + return data_.dxN; // XXX this should cause an error + } + } + __host__ __device__ constexpr operator T() const + { + if constexpr(N == 1) + { + return data_.dTxN[Number<0>{}]; + } + else + { + return data_.dTxN; // XXX this should cause an error + } + } + + template + __host__ __device__ constexpr const auto& AsType() const + { + static_assert(is_same_v || is_same_v || is_same_v, + "Something went wrong, please check src and dst types."); + + if constexpr(is_same_v) + { + return data_.dxN; + } + else if constexpr(is_same_v) + { + return data_.dTxN; + } + else if constexpr(is_same_v) + { + return data_.dNx1; + } + else + { + return err; + } + } + + template + __host__ __device__ constexpr auto& AsType() + { + static_assert(is_same_v || is_same_v || is_same_v, + "Something went wrong, please check src and dst types."); + + if constexpr(is_same_v) + { + return data_.dxN; + } + else if constexpr(is_same_v) + { + return data_.dTxN; + } + else if constexpr(is_same_v) + { + return data_.dNx1; + } + else + { + return err; + } + } +}; + template -struct non_native_vector_base +struct scalar_type>; + +template +struct scalar_type> { - using type = non_native_vector_base; + using type = typename non_native_vector_base::data_t; + + static constexpr index_t vector_size = N; +}; - __host__ __device__ 
non_native_vector_base() = default; - __host__ __device__ non_native_vector_base(const type&) = default; - __host__ __device__ non_native_vector_base(type&&) = default; - __host__ __device__ ~non_native_vector_base() = default; +template +struct scalar_type> +{ + using type = typename non_native_vector_base::data_t; - T d[N]; + static constexpr index_t vector_size = N; }; // non-native vector_type implementation template struct vector_type()>> { - using d1_t = T; - using type = d1_t; + using d1_t = T; + using d1_nnv_t = non_native_vector_base; + using type = d1_nnv_t; union alignas(next_pow2(1 * sizeof(T))) { d1_t d1_; StaticallyIndexedArray d1x1_; + d1_nnv_t d1_nnv_; } data_; - __host__ __device__ constexpr vector_type() : data_{type{}} {} + __host__ __device__ constexpr vector_type() : data_{d1_t{}} {} __host__ __device__ constexpr vector_type(type v) : data_{v} {} template __host__ __device__ constexpr const auto& AsType() const { - static_assert(is_same::value, + static_assert(is_same::value || is_same::value, "Something went wrong, please check src and dst types."); - return data_.d1x1_; + if constexpr(is_same::value || is_same::value) + { + return data_.d1x1_; + } + else + { + return err; + } } template __host__ __device__ constexpr auto& AsType() { - static_assert(is_same::value, + static_assert(is_same::value || is_same::value, "Something went wrong, please check src and dst types."); - return data_.d1x1_; + if constexpr(is_same::value || is_same::value) + { + return data_.d1x1_; + } + else + { + return err; + } } }; template struct vector_type()>> { - using d1_t = T; - using d2_t = non_native_vector_base; + using d1_t = T; + using d1_nnv_t = non_native_vector_base; + using d2_t = non_native_vector_base; using type = d2_t; @@ -1081,10 +1238,11 @@ struct vector_type()>> template __host__ __device__ constexpr const auto& AsType() const { - static_assert(is_same::value || is_same::value, + static_assert(is_same::value || is_same::value || + is_same::value, 
"Something went wrong, please check src and dst types."); - if constexpr(is_same::value) + if constexpr(is_same::value || is_same::value) { return data_.d1x2_; } @@ -1101,10 +1259,11 @@ struct vector_type()>> template __host__ __device__ constexpr auto& AsType() { - static_assert(is_same::value || is_same::value, + static_assert(is_same::value || is_same::value || + is_same::value, "Something went wrong, please check src and dst types."); - if constexpr(is_same::value) + if constexpr(is_same::value || is_same::value) { return data_.d1x2_; } @@ -1122,9 +1281,10 @@ struct vector_type()>> template struct vector_type()>> { - using d1_t = T; - using d2_t = non_native_vector_base; - using d4_t = non_native_vector_base; + using d1_t = T; + using d1_nnv_t = non_native_vector_base; + using d2_t = non_native_vector_base; + using d4_t = non_native_vector_base; using type = d4_t; @@ -1143,10 +1303,11 @@ struct vector_type()>> template __host__ __device__ constexpr const auto& AsType() const { - static_assert(is_same::value || is_same::value || is_same::value, + static_assert(is_same::value || is_same::value || + is_same::value || is_same::value, "Something went wrong, please check src and dst types."); - if constexpr(is_same::value) + if constexpr(is_same::value || is_same::value) { return data_.d1x4_; } @@ -1167,10 +1328,11 @@ struct vector_type()>> template __host__ __device__ constexpr auto& AsType() { - static_assert(is_same::value || is_same::value || is_same::value, + static_assert(is_same::value || is_same::value || + is_same::value || is_same::value, "Something went wrong, please check src and dst types."); - if constexpr(is_same::value) + if constexpr(is_same::value || is_same::value) { return data_.d1x4_; } @@ -1192,10 +1354,11 @@ struct vector_type()>> template struct vector_type()>> { - using d1_t = T; - using d2_t = non_native_vector_base; - using d4_t = non_native_vector_base; - using d8_t = non_native_vector_base; + using d1_t = T; + using d1_nnv_t = 
non_native_vector_base; + using d2_t = non_native_vector_base; + using d4_t = non_native_vector_base; + using d8_t = non_native_vector_base; using type = d8_t; @@ -1215,11 +1378,12 @@ struct vector_type()>> template __host__ __device__ constexpr const auto& AsType() const { - static_assert(is_same::value || is_same::value || - is_same::value || is_same::value, + static_assert(is_same::value || is_same::value || + is_same::value || is_same::value || + is_same::value, "Something went wrong, please check src and dst types."); - if constexpr(is_same::value) + if constexpr(is_same::value || is_same::value) { return data_.d1x8_; } @@ -1244,11 +1408,12 @@ struct vector_type()>> template __host__ __device__ constexpr auto& AsType() { - static_assert(is_same::value || is_same::value || - is_same::value || is_same::value, + static_assert(is_same::value || is_same::value || + is_same::value || is_same::value || + is_same::value, "Something went wrong, please check src and dst types."); - if constexpr(is_same::value) + if constexpr(is_same::value || is_same::value) { return data_.d1x8_; } @@ -1274,11 +1439,12 @@ struct vector_type()>> template struct vector_type()>> { - using d1_t = T; - using d2_t = non_native_vector_base; - using d4_t = non_native_vector_base; - using d8_t = non_native_vector_base; - using d16_t = non_native_vector_base; + using d1_t = T; + using d1_nnv_t = non_native_vector_base; + using d2_t = non_native_vector_base; + using d4_t = non_native_vector_base; + using d8_t = non_native_vector_base; + using d16_t = non_native_vector_base; using type = d16_t; @@ -1299,12 +1465,12 @@ struct vector_type()>> template __host__ __device__ constexpr const auto& AsType() const { - static_assert(is_same::value || is_same::value || - is_same::value || is_same::value || - is_same::value, + static_assert(is_same::value || is_same::value || + is_same::value || is_same::value || + is_same::value || is_same::value, "Something went wrong, please check src and dst types."); - if 
constexpr(is_same::value) + if constexpr(is_same::value || is_same::value) { return data_.d1x16_; } @@ -1333,12 +1499,12 @@ struct vector_type()>> template __host__ __device__ constexpr auto& AsType() { - static_assert(is_same::value || is_same::value || - is_same::value || is_same::value || - is_same::value, + static_assert(is_same::value || is_same::value || + is_same::value || is_same::value || + is_same::value || is_same::value, "Something went wrong, please check src and dst types."); - if constexpr(is_same::value) + if constexpr(is_same::value || is_same::value) { return data_.d1x16_; } @@ -1632,20 +1798,70 @@ using int8x32_t = typename vector_type::type; using int8x64_t = typename vector_type::type; // f8 -using f8x2_t = typename vector_type::type; -using f8x4_t = typename vector_type::type; -using f8x8_t = typename vector_type::type; -using f8x16_t = typename vector_type::type; -using f8x32_t = typename vector_type::type; -using f8x64_t = typename vector_type::type; +using f8x2_fnuz_t = typename vector_type::type; +using f8x4_fnuz_t = typename vector_type::type; +using f8x8_fnuz_t = typename vector_type::type; +using f8x16_fnuz_t = typename vector_type::type; +using f8x32_fnuz_t = typename vector_type::type; +using f8x64_fnuz_t = typename vector_type::type; // bf8 -using bf8x2_t = typename vector_type::type; -using bf8x4_t = typename vector_type::type; -using bf8x8_t = typename vector_type::type; -using bf8x16_t = typename vector_type::type; -using bf8x32_t = typename vector_type::type; -using bf8x64_t = typename vector_type::type; +using bf8x2_fnuz_t = typename vector_type::type; +using bf8x4_fnuz_t = typename vector_type::type; +using bf8x8_fnuz_t = typename vector_type::type; +using bf8x16_fnuz_t = typename vector_type::type; +using bf8x32_fnuz_t = typename vector_type::type; +using bf8x64_fnuz_t = typename vector_type::type; + +// f8 +using f8x2_ocp_t = typename vector_type::type; +using f8x4_ocp_t = typename vector_type::type; +using f8x8_ocp_t = 
typename vector_type::type; +using f8x16_ocp_t = typename vector_type::type; +using f8x32_ocp_t = typename vector_type::type; +using f8x64_ocp_t = typename vector_type::type; + +// bf8 +using bf8x2_ocp_t = typename vector_type::type; +using bf8x4_ocp_t = typename vector_type::type; +using bf8x8_ocp_t = typename vector_type::type; +using bf8x16_ocp_t = typename vector_type::type; +using bf8x32_ocp_t = typename vector_type::type; +using bf8x64_ocp_t = typename vector_type::type; + +#if CK_FP8_TYPE_OCP +// f8 +using f8x2_t = f8x2_ocp_t; +using f8x4_t = f8x4_ocp_t; +using f8x8_t = f8x8_ocp_t; +using f8x16_t = f8x16_ocp_t; +using f8x32_t = f8x32_ocp_t; +using f8x64_t = f8x64_ocp_t; + +// bf8 +using bf8x2_t = bf8x2_ocp_t; +using bf8x4_t = bf8x4_ocp_t; +using bf8x8_t = bf8x8_ocp_t; +using bf8x16_t = bf8x16_ocp_t; +using bf8x32_t = bf8x32_ocp_t; +using bf8x64_t = bf8x64_ocp_t; +#elif CK_FP8_TYPE_FNUZ +// f8 +using f8x2_t = f8x2_fnuz_t; +using f8x4_t = f8x4_fnuz_t; +using f8x8_t = f8x8_fnuz_t; +using f8x16_t = f8x16_fnuz_t; +using f8x32_t = f8x32_fnuz_t; +using f8x64_t = f8x64_fnuz_t; + +// bf8 +using bf8x2_t = bf8x2_fnuz_t; +using bf8x4_t = bf8x4_fnuz_t; +using bf8x8_t = bf8x8_fnuz_t; +using bf8x16_t = bf8x16_fnuz_t; +using bf8x32_t = bf8x32_fnuz_t; +using bf8x64_t = bf8x64_fnuz_t; +#endif // u8 using uint8x2_t = typename vector_type::type; @@ -1702,7 +1918,7 @@ struct NumericLimits #endif // CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 template <> -struct NumericLimits +struct NumericLimits { // negative zero nan mode with exp bias = 8 static constexpr uint8_t binary_min = 0x08; // 0b00001000 @@ -1715,17 +1931,17 @@ struct NumericLimits // static constexpr uint8_t binary_lowest = 0xF7; // 0b11110111 // static constexpr uint8_t binary_qnan = 0x79; // any sign, exp=1111, mant!=0 - __host__ __device__ static constexpr f8_t Min() { return f8_t(binary_min); } + __host__ __device__ static constexpr f8_fnuz_t Min() { return f8_fnuz_t(binary_min); } - __host__ __device__ static 
constexpr f8_t Max() { return f8_t(binary_max); } + __host__ __device__ static constexpr f8_fnuz_t Max() { return f8_fnuz_t(binary_max); } - __host__ __device__ static constexpr f8_t Lowest() { return f8_t(binary_lowest); } + __host__ __device__ static constexpr f8_fnuz_t Lowest() { return f8_fnuz_t(binary_lowest); } - __host__ __device__ static constexpr f8_t QuietNaN() { return f8_t(binary_qnan); } + __host__ __device__ static constexpr f8_fnuz_t QuietNaN() { return f8_fnuz_t(binary_qnan); } }; template <> -struct NumericLimits +struct NumericLimits { // negative zero nan mode with exp bias = 16 static constexpr uint8_t binary_min = 0x04; // 0b00000100 @@ -1738,13 +1954,59 @@ struct NumericLimits // static constexpr uint8_t binary_lowest = 0xFB; // 0b11111011 // static constexpr uint8_t binary_qnan = 0x79; // any sign, exp=1111, mant!= - __host__ __device__ static constexpr bf8_t Min() { return bf8_t(binary_min); } + __host__ __device__ static constexpr bf8_fnuz_t Min() { return bf8_fnuz_t(binary_min); } - __host__ __device__ static constexpr bf8_t Max() { return bf8_t(binary_max); } + __host__ __device__ static constexpr bf8_fnuz_t Max() { return bf8_fnuz_t(binary_max); } - __host__ __device__ static constexpr bf8_t Lowest() { return bf8_t(binary_lowest); } + __host__ __device__ static constexpr bf8_fnuz_t Lowest() { return bf8_fnuz_t(binary_lowest); } - __host__ __device__ static constexpr bf8_t QuietNaN() { return bf8_t(binary_qnan); } + __host__ __device__ static constexpr bf8_fnuz_t QuietNaN() { return bf8_fnuz_t(binary_qnan); } +}; + +template <> +struct NumericLimits +{ + static constexpr uint8_t binary_min = 0x08; // 0b00001000 = 2^-6 + static constexpr uint8_t binary_max = 0x7E; // 0b01111110 = 448 + static constexpr uint8_t binary_lowest = 0xFE; // 0b11111110 = -448 + static constexpr uint8_t binary_qnan = 0x7F; // 0b01111111 + + __host__ __device__ static constexpr f8_ocp_t Min() { return bit_cast(binary_min); } + + __host__ __device__ static constexpr 
f8_ocp_t Max() { return bit_cast(binary_max); } + + __host__ __device__ static constexpr f8_ocp_t Lowest() + { + return bit_cast(binary_lowest); + } + + __host__ __device__ static constexpr f8_ocp_t QuietNaN() + { + return bit_cast(binary_qnan); + } +}; + +template <> +struct NumericLimits +{ + static constexpr uint8_t binary_min = 0x04; // 0b00000100 = 2^-14 + static constexpr uint8_t binary_max = 0x7B; // 0b01111011 = 57344 + static constexpr uint8_t binary_lowest = 0xFB; // 0b11111011 = -57344 + static constexpr uint8_t binary_qnan = 0x7D; // 0b01111101 + + __host__ __device__ static constexpr bf8_ocp_t Min() { return bit_cast(binary_min); } + + __host__ __device__ static constexpr bf8_ocp_t Max() { return bit_cast(binary_max); } + + __host__ __device__ static constexpr bf8_ocp_t Lowest() + { + return bit_cast(binary_lowest); + } + + __host__ __device__ static constexpr bf8_ocp_t QuietNaN() + { + return bit_cast(binary_qnan); + } }; template @@ -1787,7 +2049,7 @@ struct NumericUtils }; template <> -struct NumericUtils +struct NumericUtils { static constexpr int exp = 4; static constexpr int mant = 3; @@ -1796,13 +2058,28 @@ struct NumericUtils }; template <> -struct NumericUtils +struct NumericUtils { static constexpr int exp = 5; static constexpr int mant = 2; static constexpr int bias = 16; // negative zero nan mode // static constexpr int bias = 15; // ieee mode }; +template <> +struct NumericUtils +{ + static constexpr int exp = 4; + static constexpr int mant = 3; + static constexpr int bias = 7; +}; + +template <> +struct NumericUtils +{ + static constexpr int exp = 5; + static constexpr int mant = 2; + static constexpr int bias = 15; +}; template <> struct NumericUtils diff --git a/include/ck/utility/math_v2.hpp b/include/ck/utility/math_v2.hpp index b374c4ad5..a6c3540d8 100644 --- a/include/ck/utility/math_v2.hpp +++ b/include/ck/utility/math_v2.hpp @@ -80,7 +80,7 @@ static inline __host__ bool isnan(half_t x) return (xx & 0x7FFF) > 0x7C00; }; -static 
inline __host__ bool isnan(f8_t x) { return (x & 0x80); }; +static inline __host__ bool isnan(f8_t x) { return ck::fp8_is_nan(x); }; #ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 static inline __host__ bool isnan(int4_t x) @@ -531,7 +531,7 @@ static inline __device__ bool isnan(half_t x) return (xx & 0x7FFF) > 0x7C00; }; -static inline __device__ bool isnan(f8_t x) { return (x & 0x80); }; +static inline __device__ bool isnan(f8_t x) { return ck::fp8_is_nan(x); }; static inline __device__ half_t sqrt(half_t x) { diff --git a/include/ck/utility/random_gen.hpp b/include/ck/utility/random_gen.hpp index b7edf2650..4ea52f7eb 100644 --- a/include/ck/utility/random_gen.hpp +++ b/include/ck/utility/random_gen.hpp @@ -1,8 +1,10 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #pragma once +#include "ck/ck.hpp" + namespace ck { // Pseudo random number generator @@ -23,7 +25,7 @@ __host__ __device__ uint32_t prand_generator(index_t id, T val, uint32_t seed = } // version for fp16 -template {}, bool> = false> +template {}, bool> = false> __host__ __device__ uint32_t prand_generator(index_t id, T val, uint32_t seed = seed_t) { uint16_t x = *(reinterpret_cast(&val)); @@ -38,9 +40,10 @@ __host__ __device__ uint32_t prand_generator(index_t id, T val, uint32_t seed = } // return 0 if data is not fp16 or fp32 -template {} || std::is_same{}), bool> = false> +template < + typename T, + uint32_t seed_t, + std::enable_if_t{} || std::is_same<_Float16, T>{}), bool> = false> __host__ __device__ uint32_t prand_generator(int id, T val, uint32_t seed = seed_t) { std::ignore = id; diff --git a/include/ck/utility/type_convert.hpp b/include/ck/utility/type_convert.hpp index 87fa9aa38..f372756e6 100644 --- a/include/ck/utility/type_convert.hpp +++ b/include/ck/utility/type_convert.hpp @@ -9,7 +9,7 @@ #include "ck/utility/array.hpp" namespace ck { -// 
Define the common macro for gfx94x models +// Define the common macro for MI300 models #if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) #define __gfx94__ #endif @@ -100,6 +100,18 @@ inline __host__ __device__ constexpr bhalf_t type_convert(int8_ return type_convert(x_fp32); } +template <> +inline __host__ __device__ constexpr f8_ocp_t type_convert(int x) +{ + return f8_ocp_t{type_convert(x)}; +} + +template <> +inline __host__ __device__ constexpr bf8_ocp_t type_convert(int x) +{ + return bf8_ocp_t{type_convert(x)}; +} + // Convert X to Y template __host__ __device__ constexpr Y type_convert_sp(X x) @@ -163,7 +175,7 @@ __host__ __device__ constexpr Y f8_convert_sr(X x); // convert fp32 to fp8 with stochastic rounding template <> -inline __host__ __device__ f8_t f8_convert_sr(float x) +inline __host__ __device__ f8_fnuz_t f8_convert_sr(float x) { constexpr int seed = 1254739; uint32_t rng = prand_generator(reinterpret_cast(&x), x); @@ -189,33 +201,35 @@ inline __host__ __device__ f8_t f8_convert_sr(float x) constexpr bool clip = true; constexpr f8_rounding_mode rm = f8_rounding_mode::stochastic; return utils:: - cast_to_f8(x, - rng); + cast_to_f8( + x, rng); #endif } // convert fp16 to fp8 with stochastic rounding template <> -inline __host__ __device__ f8_t f8_convert_sr(half_t x) +inline __host__ __device__ f8_fnuz_t f8_convert_sr(half_t x) { #if defined(__gfx94__) // convert to float and use native converion - return f8_convert_sr(type_convert(x)); + return f8_convert_sr(type_convert(x)); #else constexpr bool negative_zero_nan = true; constexpr bool clip = true; constexpr f8_rounding_mode rm = f8_rounding_mode::stochastic; constexpr int seed = 1254739; uint32_t rng = prand_generator(reinterpret_cast(&x), x); - return utils:: - cast_to_f8( - x, rng); + return utils::cast_to_f8(x, rng); #endif } // convert fp32 to bf8 with stochastic rounding template <> -inline __host__ __device__ bf8_t f8_convert_sr(float x) +inline __host__ __device__ 
bf8_fnuz_t f8_convert_sr(float x) { constexpr int seed = 1254739; uint32_t rng = prand_generator(reinterpret_cast(&x), x); @@ -240,28 +254,32 @@ inline __host__ __device__ bf8_t f8_convert_sr(float x) constexpr bool negative_zero_nan = true; constexpr bool clip = true; constexpr f8_rounding_mode rm = f8_rounding_mode::stochastic; - return utils:: - cast_to_f8( - x, rng); + return utils::cast_to_f8(x, rng); #endif } // convert fp16 to bf8 with stochastic rounding template <> -inline __host__ __device__ bf8_t f8_convert_sr(half_t x) +inline __host__ __device__ bf8_fnuz_t f8_convert_sr(half_t x) { #if defined(__gfx94__) // convert to float and use native converion - return f8_convert_sr(type_convert(x)); + return f8_convert_sr(type_convert(x)); #else constexpr bool negative_zero_nan = true; constexpr bool clip = true; constexpr f8_rounding_mode rm = f8_rounding_mode::stochastic; constexpr int seed = 1254739; uint32_t rng = prand_generator(reinterpret_cast(&x), x); - return utils:: - cast_to_f8( - x, rng); + return utils::cast_to_f8(x, rng); #endif } @@ -271,7 +289,7 @@ __host__ __device__ constexpr Y f8_convert_rne(X x); // convert fp32 to fp8 with rounding to nearest even template <> -inline __host__ __device__ f8_t f8_convert_rne(float x) +inline __host__ __device__ f8_fnuz_t f8_convert_rne(float x) { #if defined(__gfx94__) union @@ -296,32 +314,34 @@ inline __host__ __device__ f8_t f8_convert_rne(float x) constexpr f8_rounding_mode rm = f8_rounding_mode::standard; constexpr uint32_t rng = 0; return utils:: - cast_to_f8(x, - rng); + cast_to_f8( + x, rng); #endif } // convert fp16 to fp8 with rounding to nearest even template <> -inline __host__ __device__ f8_t f8_convert_rne(half_t x) +inline __host__ __device__ f8_fnuz_t f8_convert_rne(half_t x) { #if defined(__gfx94__) // convert to float and use native converion - return f8_convert_rne(type_convert(x)); + return f8_convert_rne(type_convert(x)); #else constexpr bool negative_zero_nan = true; constexpr bool clip = 
true; constexpr f8_rounding_mode rm = f8_rounding_mode::standard; constexpr uint32_t rng = 0; - return utils:: - cast_to_f8( - x, rng); + return utils::cast_to_f8(x, rng); #endif } // convert fp32 to bf8 with rounding to nearest even template <> -inline __host__ __device__ bf8_t f8_convert_rne(float x) +inline __host__ __device__ bf8_fnuz_t f8_convert_rne(float x) { #if defined(__gfx94__) union @@ -345,44 +365,59 @@ inline __host__ __device__ bf8_t f8_convert_rne(float x) constexpr bool clip = true; constexpr f8_rounding_mode rm = f8_rounding_mode::standard; constexpr uint32_t rng = 0; - return utils:: - cast_to_f8( - x, rng); + return utils::cast_to_f8(x, rng); #endif } // convert fp16 to bf8 with rounding to nearest even template <> -inline __host__ __device__ bf8_t f8_convert_rne(half_t x) +inline __host__ __device__ bf8_fnuz_t f8_convert_rne(half_t x) { #if defined(__gfx94__) // convert to float and use native converion - return f8_convert_rne(type_convert(x)); + return f8_convert_rne(type_convert(x)); #else constexpr bool negative_zero_nan = true; constexpr bool clip = true; constexpr f8_rounding_mode rm = f8_rounding_mode::standard; constexpr uint32_t rng = 0; - return utils:: - cast_to_f8( - x, rng); + return utils::cast_to_f8(x, rng); +#endif +} + +// convert fp32 to fp8 +template <> +inline __host__ __device__ f8_fnuz_t type_convert(float x) +{ +#if CK_USE_SR_F8_CONVERSION + return f8_convert_sr(x); +#else + return f8_convert_rne(x); #endif } // convert fp32 to fp8 template <> -inline __host__ __device__ f8_t type_convert(float x) +inline __host__ __device__ f8_ocp_t type_convert(float x) { #if CK_USE_SR_F8_CONVERSION - return f8_convert_sr(x); + return f8_convert_sr(x); #else - return f8_convert_rne(x); + return f8_convert_rne(x); #endif } // convert fp8 to fp32 template <> -inline __host__ __device__ float type_convert(f8_t x) +inline __host__ __device__ float type_convert(f8_fnuz_t x) { #if defined(__gfx94__) float fval; @@ -392,30 +427,44 @@ inline 
__host__ __device__ float type_convert(f8_t x) return fval; #else constexpr bool negative_zero_nan = true; - return utils::cast_from_f8(x); + return utils::cast_from_f8(x); #endif } template <> -inline __host__ __device__ float2_t type_convert(f8x2_t x) +inline __host__ __device__ float2_t type_convert(f8x2_fnuz_t x) { #if defined(__gfx94__) const auto i16val = bit_cast(x); return __builtin_amdgcn_cvt_pk_f32_fp8(i16val, 0); #else constexpr bool negative_zero_nan = true; - const auto f8x2_v = vector_type(x); + const auto f8x2_v = vector_type(x); vector_type f32x2_v; f32x2_v.template AsType()(Number<0>{}) = - utils::cast_from_f8( - f8x2_v.template AsType()[Number<0>{}]); + utils::cast_from_f8( + f8x2_v.template AsType()[Number<0>{}]); f32x2_v.template AsType()(Number<1>{}) = - utils::cast_from_f8( - f8x2_v.template AsType()[Number<1>{}]); + utils::cast_from_f8( + f8x2_v.template AsType()[Number<1>{}]); return f32x2_v.template AsType()[Number<0>{}]; #endif } +template <> +inline __host__ __device__ float2_t type_convert(f8x2_ocp_t x) +{ +#if CK_OCP_FP8_CVT_FAST_PATH + return fp8_impl::cast_to_f32x2_from_f8x2( + x.AsType()[Number<0>{}]); +#else + return float2_t{fp8_impl::cast_from_f8( + x.AsType()[Number<0>{}]), + fp8_impl::cast_from_f8( + x.AsType()[Number<1>{}])}; +#endif +} + template <> inline __host__ __device__ half2_t type_convert(float2_t x) { @@ -428,42 +477,64 @@ inline __host__ __device__ half2_t type_convert(float2_t x) // convert fp16 to fp8 template <> -inline __host__ __device__ f8_t type_convert(half_t x) +inline __host__ __device__ f8_fnuz_t type_convert(half_t x) { #if CK_USE_SR_F8_CONVERSION - return f8_convert_sr(x); + return f8_convert_sr(x); #else - return f8_convert_rne(x); + return f8_convert_rne(x); +#endif +} + +// convert fp16 to fp8 +template <> +inline __host__ __device__ f8_ocp_t type_convert(half_t x) +{ +#if CK_USE_SR_F8_CONVERSION + return f8_convert_sr(x); +#else + return f8_convert_rne(x); #endif } // convert fp8 to fp16 template <> 
-inline __host__ __device__ half_t type_convert(f8_t x) +inline __host__ __device__ half_t type_convert(f8_fnuz_t x) { #if defined(__gfx94__) // use native conversion to float and convert to fp16 return type_convert(type_convert(x)); #else constexpr bool negative_zero_nan = true; - return utils::cast_from_f8(x); + return utils::cast_from_f8(x); +#endif +} + +// convert fp32 to bf8 +template <> +inline __host__ __device__ bf8_fnuz_t type_convert(float x) +{ +#if CK_USE_SR_F8_CONVERSION + return f8_convert_sr(x); +#else + return f8_convert_rne(x); #endif } // convert fp32 to bf8 template <> -inline __host__ __device__ bf8_t type_convert(float x) +inline __host__ __device__ bf8_ocp_t type_convert(float x) { #if CK_USE_SR_F8_CONVERSION - return f8_convert_sr(x); + return f8_convert_sr(x); #else - return f8_convert_rne(x); + return f8_convert_rne(x); #endif } // convert bf8 to fp32 template <> -inline __host__ __device__ float type_convert(bf8_t x) +inline __host__ __device__ float type_convert(bf8_fnuz_t x) { #if defined(__gfx94__) float fval; @@ -473,31 +544,42 @@ inline __host__ __device__ float type_convert(bf8_t x) return fval; #else constexpr bool negative_zero_nan = true; - return utils::cast_from_f8(x); + return utils::cast_from_f8(x); +#endif +} + +// convert fp16 to bf8 +template <> +inline __host__ __device__ bf8_fnuz_t type_convert(half_t x) +{ +#if CK_USE_SR_F8_CONVERSION + return f8_convert_sr(x); +#else + return f8_convert_rne(x); #endif } // convert fp16 to bf8 template <> -inline __host__ __device__ bf8_t type_convert(half_t x) +inline __host__ __device__ bf8_ocp_t type_convert(half_t x) { #if CK_USE_SR_F8_CONVERSION - return f8_convert_sr(x); + return f8_convert_sr(x); #else - return f8_convert_rne(x); + return f8_convert_rne(x); #endif } // convert bf8 to fp16 template <> -inline __host__ __device__ half_t type_convert(bf8_t x) +inline __host__ __device__ half_t type_convert(bf8_fnuz_t x) { #if defined(__gfx94__) // use native conversion to float and 
convert to fp16 return type_convert(type_convert(x)); #else constexpr bool negative_zero_nan = true; - return utils::cast_from_f8(x); + return utils::cast_from_f8(x); #endif } diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp index e1edc4fae..1ae11fe9d 100644 --- a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -62,9 +62,9 @@ struct ReferenceGemm : public device::BaseOperator auto f_mk_kn_mn = [&](auto m, auto n) { const int K = arg.a_m_k_.mDesc.GetLengths()[1]; - AccDataType v_acc = 0; - ComputeTypeA v_a = 0; - ComputeTypeB v_b = 0; + AccDataType v_acc{0}; + ComputeTypeA v_a{0}; + ComputeTypeB v_b{0}; for(int k = 0; k < K; ++k) { @@ -93,7 +93,7 @@ struct ReferenceGemm : public device::BaseOperator ck::type_convert(v_a) * ck::type_convert(v_b); } - CDataType v_c = 0; + CDataType v_c{0}; arg.c_element_op_(v_c, v_acc); diff --git a/library/src/tensor_operation_instance/gpu/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/CMakeLists.txt index 2c0b6c7b7..dd023e6b5 100644 --- a/library/src/tensor_operation_instance/gpu/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/CMakeLists.txt @@ -62,7 +62,7 @@ function(add_instance_library INSTANCE_NAME) endforeach() # Do not build mha instances if gfx94 or gfx90a targets are not on the target list foreach(source IN LISTS ARGN) - if(NOT INST_TARGETS MATCHES "gfx94" AND NOT INST_TARGETS MATCHES "gfx90a" AND source MATCHES "mha") + if(NOT INST_TARGETS MATCHES "gfx94" AND NOT INST_TARGETS MATCHES "gfx90a" AND source MATCHES "mha") message("removing mha instance 
${source} ") list(REMOVE_ITEM ARGN "${source}") endif() @@ -346,7 +346,7 @@ if(CK_DEVICE_CONV_INSTANCES) endif() if(CK_DEVICE_MHA_INSTANCES) set(gpu_list ${INST_TARGETS}) - if(gpu_list MATCHES "gfx94" OR gpu_list MATCHES "gfx90a") + if(gpu_list MATCHES "gfx94" OR gpu_list MATCHES "gfx90a") add_library(device_mha_operations STATIC ${CK_DEVICE_MHA_INSTANCES}) add_library(composablekernels::device_mha_operations ALIAS device_mha_operations) target_compile_features(device_mha_operations PUBLIC) diff --git a/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_max_pool3d_fwd_ndhwc_f8_instance.cpp b/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_max_pool3d_fwd_ndhwc_f8_instance.cpp index af31cf8a8..e31433cc8 100644 --- a/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_max_pool3d_fwd_ndhwc_f8_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_max_pool3d_fwd_ndhwc_f8_instance.cpp @@ -15,7 +15,7 @@ void add_device_pool3d_fwd_ndhwc_f8_instances( instances) { add_device_operation_instances( - instances, device_pool3d_fwd_ndhwc_instances{}); + instances, device_pool3d_fwd_ndhwc_instances{}); } void add_device_pool3d_fwd_ndhwc_index_f8_instances( @@ -23,7 +23,7 @@ void add_device_pool3d_fwd_ndhwc_index_f8_instances( instances) { add_device_operation_instances( - instances, device_pool3d_fwd_ndhwc_instances{}); + instances, device_pool3d_fwd_ndhwc_instances{}); } } // namespace instance diff --git a/profiler/include/profiler/profile_batched_gemm_bias_softmax_gemm_permute_impl.hpp b/profiler/include/profiler/profile_batched_gemm_bias_softmax_gemm_permute_impl.hpp index 5bee67c1c..be69b67b5 100644 --- a/profiler/include/profiler/profile_batched_gemm_bias_softmax_gemm_permute_impl.hpp +++ b/profiler/include/profiler/profile_batched_gemm_bias_softmax_gemm_permute_impl.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. 
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -150,7 +150,7 @@ bool profile_batched_gemm_bias_softmax_gemm_permute_impl(bool do_verification, break; default: a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_1{1}); - b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_Sequential<1>{}); + b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_Sequential{}); b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_Diagonal{}); d0_gs_ms_ns.GenerateTensorValue(GeneratorTensor_1{1}); } diff --git a/profiler/include/profiler/profile_batched_gemm_gemm_impl.hpp b/profiler/include/profiler/profile_batched_gemm_gemm_impl.hpp index f3d2c5561..b585b7d56 100644 --- a/profiler/include/profiler/profile_batched_gemm_gemm_impl.hpp +++ b/profiler/include/profiler/profile_batched_gemm_gemm_impl.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -157,7 +157,7 @@ bool profile_batched_gemm_gemm_impl(bool do_verification, break; default: a_g_m_k.GenerateTensorValue(GeneratorTensor_1{1}); - b0_g_k_n.GenerateTensorValue(GeneratorTensor_Sequential<1>{}); + b0_g_k_n.GenerateTensorValue(GeneratorTensor_Sequential{}); b1_g_n_o.GenerateTensorValue(GeneratorTensor_Diagonal{}); } diff --git a/profiler/include/profiler/profile_batched_gemm_softmax_gemm_impl.hpp b/profiler/include/profiler/profile_batched_gemm_softmax_gemm_impl.hpp index 15a21206c..700ada73a 100644 --- a/profiler/include/profiler/profile_batched_gemm_softmax_gemm_impl.hpp +++ b/profiler/include/profiler/profile_batched_gemm_softmax_gemm_impl.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
#pragma once @@ -174,7 +174,7 @@ bool profile_batched_gemm_softmax_gemm_impl(bool do_verification, break; default: a_g_m_k.GenerateTensorValue(GeneratorTensor_1{1}); - b0_g_k_n.GenerateTensorValue(GeneratorTensor_Sequential<1>{}); + b0_g_k_n.GenerateTensorValue(GeneratorTensor_Sequential{}); b1_g_n_o.GenerateTensorValue(GeneratorTensor_Diagonal{}); } diff --git a/profiler/include/profiler/profile_batched_gemm_softmax_gemm_permute_impl.hpp b/profiler/include/profiler/profile_batched_gemm_softmax_gemm_permute_impl.hpp index f2fcb0b13..e3c462e21 100644 --- a/profiler/include/profiler/profile_batched_gemm_softmax_gemm_permute_impl.hpp +++ b/profiler/include/profiler/profile_batched_gemm_softmax_gemm_permute_impl.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -140,7 +140,7 @@ bool profile_batched_gemm_softmax_gemm_permute_impl(bool do_verification, break; default: a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_1{1}); - b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_Sequential<1>{}); + b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_Sequential{}); b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_Diagonal{}); } diff --git a/profiler/include/profiler/profile_gemm_impl.hpp b/profiler/include/profiler/profile_gemm_impl.hpp index 0419ccd8e..1373dbc49 100644 --- a/profiler/include/profiler/profile_gemm_impl.hpp +++ b/profiler/include/profiler/profile_gemm_impl.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
#pragma once @@ -74,8 +74,8 @@ int profile_gemm_impl(int do_verification, switch(init_method) { case 0: - ck::utils::FillConstant{static_cast(1.f)}(a_m_k); - ck::utils::FillConstant{static_cast(1.f)}(b_k_n); + ck::utils::FillConstant{type_convert(1.f)}(a_m_k); + ck::utils::FillConstant{type_convert(1.f)}(b_k_n); break; case 1: ck::utils::FillUniformDistributionIntegerValue{-5.f, 5.f}(a_m_k); diff --git a/test/data_type/CMakeLists.txt b/test/data_type/CMakeLists.txt index a783be7bb..a9d3dad7f 100644 --- a/test/data_type/CMakeLists.txt +++ b/test/data_type/CMakeLists.txt @@ -9,13 +9,38 @@ if (USE_BITINT_EXTENSION_INT4) endif() endif() -add_gtest_executable(test_fp8 test_fp8.cpp) -if(result EQUAL 0) - target_link_libraries(test_fp8 PRIVATE utility) + + +add_custom_target(test_fp8) + +if (CK_USE_OCP_FP8) + add_gtest_executable(test_fp8_ocp test_fp8_ocp.cpp) + if(result EQUAL 0) + target_link_libraries(test_fp8_ocp PRIVATE utility) + endif() + + add_gtest_executable(test_bf8_ocp test_bf8_ocp.cpp) + if(result EQUAL 0) + target_link_libraries(test_bf8_ocp PRIVATE utility) + endif() + + add_dependencies(test_fp8 test_fp8_ocp) + add_dependencies(test_fp8 test_bf8_ocp) endif() -add_gtest_executable(test_bf8 test_bf8.cpp) -if(result EQUAL 0) - target_link_libraries(test_bf8 PRIVATE utility) + +if (CK_USE_FNUZ_FP8) + add_gtest_executable(test_fp8_fnuz test_fp8_fnuz.cpp) + if(result EQUAL 0) + target_link_libraries(test_fp8_fnuz PRIVATE utility) + endif() + + add_gtest_executable(test_bf8_fnuz test_bf8_fnuz.cpp) + if(result EQUAL 0) + target_link_libraries(test_bf8_fnuz PRIVATE utility) + endif() + + add_dependencies(test_fp8 test_fp8_fnuz) + add_dependencies(test_fp8 test_bf8_fnuz) endif() add_gtest_executable(test_custom_type test_custom_type.cpp) diff --git a/test/data_type/test_bf8.cpp b/test/data_type/test_bf8_fnuz.cpp similarity index 52% rename from test/data_type/test_bf8.cpp rename to test/data_type/test_bf8_fnuz.cpp index 6f50db68c..4ff796a61 100644 --- 
a/test/data_type/test_bf8.cpp +++ b/test/data_type/test_bf8_fnuz.cpp @@ -5,158 +5,169 @@ #include "ck/utility/data_type.hpp" #include "ck/utility/type_convert.hpp" -using ck::bf8_t; +using ck::bf8_fnuz_t; using ck::f8_convert_rne; using ck::f8_convert_sr; using ck::half_t; using ck::type_convert; -TEST(BF8, NumericLimits) +TEST(BF8FNUZ, NumericLimits) { // constants given for negative zero nan mode - EXPECT_EQ(ck::NumericLimits::Min(), type_convert(0x04)); - EXPECT_EQ(ck::NumericLimits::Max(), type_convert(0x7F)); - EXPECT_EQ(ck::NumericLimits::Lowest(), type_convert(0xFF)); - EXPECT_EQ(ck::NumericLimits::QuietNaN(), type_convert(0x80)); + EXPECT_EQ(ck::NumericLimits::Min(), type_convert(0x04)); + EXPECT_EQ(ck::NumericLimits::Max(), type_convert(0x7F)); + EXPECT_EQ(ck::NumericLimits::Lowest(), type_convert(0xFF)); + EXPECT_EQ(ck::NumericLimits::QuietNaN(), type_convert(0x80)); } -TEST(BF8, ConvertFP32Nearest) +TEST(BF8FNUZ, ConvertFP32Nearest) { // fix the tolerance value float abs_tol = 1e-6; // convert 0 float to bf8 and back, check if holds - ASSERT_NEAR(0.0f, type_convert(f8_convert_rne(0.0f)), abs_tol); + ASSERT_NEAR(0.0f, type_convert(f8_convert_rne(0.0f)), abs_tol); // don't run the next test on gfx11 devices #ifndef CK_SKIP_FLAKY_F8_TEST // convert minimal float to bf8 and back, check if holds ASSERT_NEAR(std::numeric_limits::min(), - type_convert(f8_convert_rne(std::numeric_limits::min())), + type_convert(f8_convert_rne(std::numeric_limits::min())), abs_tol); #endif - // convert maximal bf8_t to float and check if equal to 57344.0 - ASSERT_NEAR(57344.0f, type_convert(f8_convert_rne(57344.0f)), abs_tol); + + const auto max_bf8_t_float = type_convert(ck::NumericLimits::Max()); + // convert maximal bf8_fnuz_t to float and check if equal to 57344.0 + ASSERT_NEAR( + max_bf8_t_float, type_convert(f8_convert_rne(max_bf8_t_float)), abs_tol); // convert maximal float to bf8 and back, check if clipped to 57344.0 - ASSERT_NEAR(57344.0f, - 
type_convert(f8_convert_rne(std::numeric_limits::max())), + ASSERT_NEAR(max_bf8_t_float, + type_convert(f8_convert_rne(std::numeric_limits::max())), abs_tol); - // convert inf float to bf8_t and check if it is qNan - ASSERT_NEAR(type_convert(0x80), - f8_convert_rne(std::numeric_limits::infinity()), + // convert inf float to bf8_fnuz_t and check if it is qNan + ASSERT_NEAR(ck::NumericLimits::QuietNaN(), + f8_convert_rne(std::numeric_limits::infinity()), abs_tol); // positive norm float value to bf8 and back, check if holds float pos_float = 0.0000762939f; - ASSERT_NEAR(pos_float, type_convert(f8_convert_rne(pos_float)), abs_tol); + ASSERT_NEAR(pos_float, type_convert(f8_convert_rne(pos_float)), abs_tol); // negative norm float value to bf8 and back, check if holds float neg_float = -0.0000610351f; - ASSERT_NEAR(neg_float, type_convert(f8_convert_rne(neg_float)), abs_tol); + ASSERT_NEAR(neg_float, type_convert(f8_convert_rne(neg_float)), abs_tol); // positive subnorm float value to bf8 and back, check if holds pos_float = 0.0000305175f; - ASSERT_NEAR(pos_float, type_convert(f8_convert_rne(pos_float)), abs_tol); + ASSERT_NEAR(pos_float, type_convert(f8_convert_rne(pos_float)), abs_tol); // negative subnorm float value to bf8 and back, check if holds neg_float = -0.0000152587f; - ASSERT_NEAR(neg_float, type_convert(f8_convert_rne(neg_float)), abs_tol); + ASSERT_NEAR(neg_float, type_convert(f8_convert_rne(neg_float)), abs_tol); } -TEST(BF8, ConvertFP32Stochastic) +TEST(BF8FNUZ, ConvertFP32Stochastic) { // fix the tolerance value float abs_tol = 1e-6; // convert 0 float to bf8 and back, check if holds - ASSERT_NEAR(0.0f, type_convert(f8_convert_sr(0.0f)), abs_tol); + ASSERT_NEAR(0.0f, type_convert(f8_convert_sr(0.0f)), abs_tol); // convert minimal float to bf8 and back, check if holds ASSERT_NEAR(std::numeric_limits::min(), - type_convert(f8_convert_sr(std::numeric_limits::min())), + type_convert(f8_convert_sr(std::numeric_limits::min())), abs_tol); - // convert maximal 
bf8_t to float and check if equal to 57344.0 - ASSERT_NEAR(57344.0f, type_convert(f8_convert_sr(57344.0f)), abs_tol); + + const auto max_bf8_t_float = type_convert(ck::NumericLimits::Max()); + // convert maximal bf8_fnuz_t to float and check if equal to 57344.0 + ASSERT_NEAR( + max_bf8_t_float, type_convert(f8_convert_sr(max_bf8_t_float)), abs_tol); // convert maximal float to bf8 and back, check if clipped to 57344.0 - ASSERT_NEAR(57344.0f, - type_convert(f8_convert_sr(std::numeric_limits::max())), + ASSERT_NEAR(max_bf8_t_float, + type_convert(f8_convert_sr(std::numeric_limits::max())), abs_tol); - // convert inf float to bf8_t and check if it is qNan - ASSERT_NEAR(type_convert(0x80), - f8_convert_sr(std::numeric_limits::infinity()), + // convert inf float to bf8_fnuz_t and check if it is qNan + ASSERT_NEAR(ck::NumericLimits::QuietNaN(), + f8_convert_sr(std::numeric_limits::infinity()), abs_tol); // positive norm float value to bf8 and back, check if holds float pos_float = 0.0000762939f; - ASSERT_NEAR(pos_float, type_convert(f8_convert_sr(pos_float)), abs_tol); + ASSERT_NEAR(pos_float, type_convert(f8_convert_sr(pos_float)), abs_tol); // negative norm float value to bf8 and back, check if holds float neg_float = -0.0000610351f; - ASSERT_NEAR(neg_float, type_convert(f8_convert_sr(neg_float)), abs_tol); + ASSERT_NEAR(neg_float, type_convert(f8_convert_sr(neg_float)), abs_tol); // positive subnorm float value to bf8 and back, check if holds pos_float = 0.0000305175f; - ASSERT_NEAR(pos_float, type_convert(f8_convert_sr(pos_float)), abs_tol); + ASSERT_NEAR(pos_float, type_convert(f8_convert_sr(pos_float)), abs_tol); // negative subnorm float value to bf8 and back, check if holds neg_float = -0.0000152587f; - ASSERT_NEAR(neg_float, type_convert(f8_convert_sr(neg_float)), abs_tol); + ASSERT_NEAR(neg_float, type_convert(f8_convert_sr(neg_float)), abs_tol); } -TEST(BF8, ConvertFP16Nearest) +TEST(BF8FNUZ, ConvertFP16Nearest) { // fix the tolerance value float abs_tol = 
1e-3; // convert 0 fp16 to bf8 and back, check if holds - ASSERT_NEAR(half_t{0.0}, type_convert(f8_convert_rne(half_t{0.0})), abs_tol); + ASSERT_NEAR( + half_t{0.0}, type_convert(f8_convert_rne(half_t{0.0})), abs_tol); // convert minimal fp16 to bf8 and back, check if holds ASSERT_NEAR(ck::NumericLimits::Min(), - type_convert(f8_convert_rne(ck::NumericLimits::Min())), + type_convert(f8_convert_rne(ck::NumericLimits::Min())), abs_tol); - // convert maximal bf8_t to fp16 and check if equal to 57344.0 + + const auto max_bf8_t_half = type_convert(ck::NumericLimits::Max()); + // convert maximal bf8_fnuz_t to fp16 and check if equal to 57344.0 ASSERT_NEAR( - half_t{57344.0}, type_convert(f8_convert_rne(half_t{57344.0})), abs_tol); + max_bf8_t_half, type_convert(f8_convert_rne(max_bf8_t_half)), abs_tol); // convert maximal fp16 to bf8 and back, check if clipped to 57344.0 - ASSERT_NEAR(half_t{57344.0}, - type_convert(f8_convert_rne(ck::NumericLimits::Max())), + ASSERT_NEAR(max_bf8_t_half, + type_convert(f8_convert_rne(ck::NumericLimits::Max())), abs_tol); - // convert QuietNaN fp16 to bf8_t and check if it is QuietNaN - ASSERT_NEAR(type_convert(0x80), - f8_convert_rne(ck::NumericLimits::QuietNaN()), + // convert QuietNaN fp16 to bf8_fnuz_t and check if it is QuietNaN + ASSERT_NEAR(ck::NumericLimits::QuietNaN(), + f8_convert_rne(ck::NumericLimits::QuietNaN()), abs_tol); // positive norm fp16 value to bf8 and back, check if holds half_t pos_half = half_t{0.0000762939}; - ASSERT_NEAR(pos_half, type_convert(f8_convert_rne(pos_half)), abs_tol); + ASSERT_NEAR(pos_half, type_convert(f8_convert_rne(pos_half)), abs_tol); // negative norm fp16 value to bf8 and back, check if holds half_t neg_half = half_t{-0.0000610351}; - ASSERT_NEAR(neg_half, type_convert(f8_convert_rne(neg_half)), abs_tol); + ASSERT_NEAR(neg_half, type_convert(f8_convert_rne(neg_half)), abs_tol); // positive subnorm fp16 value to bf8 and back, check if holds pos_half = half_t{0.0000305175}; - 
ASSERT_NEAR(pos_half, type_convert(f8_convert_rne(pos_half)), abs_tol); + ASSERT_NEAR(pos_half, type_convert(f8_convert_rne(pos_half)), abs_tol); // negative subnorm fp16 value to bf8 and back, check if holds neg_half = half_t{-0.0000152587}; - ASSERT_NEAR(neg_half, type_convert(f8_convert_rne(neg_half)), abs_tol); + ASSERT_NEAR(neg_half, type_convert(f8_convert_rne(neg_half)), abs_tol); } -TEST(BF8, ConvertFP16Stochastic) +TEST(BF8FNUZ, ConvertFP16Stochastic) { // fix the tolerance value float abs_tol = 1e-3; // convert 0 fp16 to bf8 and back, check if holds - ASSERT_NEAR(half_t{0.0}, type_convert(f8_convert_sr(half_t{0.0})), abs_tol); + ASSERT_NEAR(half_t{0.0}, type_convert(f8_convert_sr(half_t{0.0})), abs_tol); // convert minimal fp16 to bf8 and back, check if holds ASSERT_NEAR(ck::NumericLimits::Min(), - type_convert(f8_convert_sr(ck::NumericLimits::Min())), + type_convert(f8_convert_sr(ck::NumericLimits::Min())), abs_tol); - // convert maximal bf8_t to fp16 and check if equal to 57344.0 + + const auto max_bf8_t_half = type_convert(ck::NumericLimits::Max()); + // convert maximal bf8_fnuz_t to fp16 and check if equal to 57344.0 ASSERT_NEAR( - half_t{57344.0}, type_convert(f8_convert_sr(half_t{57344.0})), abs_tol); + max_bf8_t_half, type_convert(f8_convert_sr(max_bf8_t_half)), abs_tol); // convert maximal fp16 to bf8 and back, check if clipped to 57344.0 - ASSERT_NEAR(half_t{57344.0}, - type_convert(f8_convert_sr(ck::NumericLimits::Max())), + ASSERT_NEAR(max_bf8_t_half, + type_convert(f8_convert_sr(ck::NumericLimits::Max())), abs_tol); - // convert QuietNaN fp16 to bf8_t and check if it is QuietNaN - ASSERT_NEAR(type_convert(0x80), - f8_convert_sr(ck::NumericLimits::QuietNaN()), + // convert QuietNaN fp16 to bf8_fnuz_t and check if it is QuietNaN + ASSERT_NEAR(ck::NumericLimits::QuietNaN(), + f8_convert_sr(ck::NumericLimits::QuietNaN()), abs_tol); // positive norm fp16 value to bf8 and back, check if holds half_t pos_half = half_t{0.0000762939}; - 
ASSERT_NEAR(pos_half, type_convert(f8_convert_sr(pos_half)), abs_tol); + ASSERT_NEAR(pos_half, type_convert(f8_convert_sr(pos_half)), abs_tol); // negative norm fp16 value to bf8 and back, check if holds half_t neg_half = half_t{-0.0000610351}; - ASSERT_NEAR(neg_half, type_convert(f8_convert_sr(neg_half)), abs_tol); + ASSERT_NEAR(neg_half, type_convert(f8_convert_sr(neg_half)), abs_tol); // positive subnorm fp16 value to bf8 and back, check if holds pos_half = half_t{0.0000305175}; - ASSERT_NEAR(pos_half, type_convert(f8_convert_sr(pos_half)), abs_tol); + ASSERT_NEAR(pos_half, type_convert(f8_convert_sr(pos_half)), abs_tol); // negative subnorm fp16 value to bf8 and back, check if holds neg_half = half_t{-0.0000152587}; - ASSERT_NEAR(neg_half, type_convert(f8_convert_sr(neg_half)), abs_tol); + ASSERT_NEAR(neg_half, type_convert(f8_convert_sr(neg_half)), abs_tol); } diff --git a/test/data_type/test_bf8_ocp.cpp b/test/data_type/test_bf8_ocp.cpp new file mode 100644 index 000000000..9d4ee38b1 --- /dev/null +++ b/test/data_type/test_bf8_ocp.cpp @@ -0,0 +1,268 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "gtest/gtest.h" +#include "ck/utility/data_type.hpp" +#include "ck/utility/type_convert.hpp" + +using ck::bf8_ocp_t; +using ck::f8_convert_rne; +using ck::f8_convert_sr; +using ck::half_t; +using ck::type_convert; + +TEST(BF8OCP, NumericLimits) +{ // constants given for OCP FP8 + EXPECT_EQ(ck::NumericLimits::Min(), + type_convert(0x04)); // 0b00000100 = 2^-14 + EXPECT_EQ(ck::NumericLimits::Max(), + type_convert(0x7B)); // 0b01111011 = 57344 + EXPECT_EQ(ck::NumericLimits::Lowest(), + type_convert(0xFB)); // 0b11111011 = -57344 + EXPECT_EQ(ck::NumericLimits::QuietNaN().data, + type_convert(0x7D).data); // 0b01111101 + EXPECT_FALSE(ck::NumericLimits::QuietNaN() == + ck::NumericLimits::QuietNaN()); + EXPECT_TRUE(ck::fp8_is_inf(type_convert(0xFC)) && + ck::fp8_is_inf(type_convert(0x7C))); +} + +TEST(BF8OCP, ConvertFP32Nearest) +{ + // fix the tolerance value + float abs_tol = 1e-6; + + // convert 0 float to bfp8 and back, check if holds + ASSERT_NEAR(0.0f, type_convert(f8_convert_rne(0.0f)), 0.0f); + + // convert minimal float to bf8 and back, check if holds + ASSERT_NEAR(std::numeric_limits::min(), + type_convert(f8_convert_rne(std::numeric_limits::min())), + abs_tol); + + const auto max_bf8_t_float = type_convert(ck::NumericLimits::Max()); + + // convert maximal bf8_ocp_t to float and check if equal to bf8 max + ASSERT_NEAR( + max_bf8_t_float, type_convert(f8_convert_rne(max_bf8_t_float)), 0.0f); + + // convert maximal float to bf8 and back, check if clipped to bf8 max (saturation to finite) + ASSERT_NEAR(max_bf8_t_float, + type_convert(f8_convert_rne(std::numeric_limits::max())), + 0.0f); + + // convert float infinity to bf8_ocp_t and check if it is max value (saturation to finite) + ASSERT_EQ(ck::NumericLimits::Max(), + f8_convert_rne(std::numeric_limits::infinity())); + + // positive normal float value to bf8 and back, check if holds + float pos_float = 0.0000762939f; // 10*2^-17 + ASSERT_NEAR(pos_float, type_convert(f8_convert_rne(pos_float)), 
abs_tol); + + // negative smallest normal bf8 value to bf8 and back, check if holds + constexpr auto neg_min_bf8 = -0.00006103515625f; //-2^-14 + ASSERT_NEAR(neg_min_bf8, type_convert(f8_convert_rne(neg_min_bf8)), 0.0f); + + // positive subnorm float value to bf8 and back, check if holds + constexpr auto pos_subnorm_bf8 = 0.000030517578125f; // 2^-15 + ASSERT_NEAR( + pos_subnorm_bf8, type_convert(f8_convert_rne(pos_subnorm_bf8)), 0.0f); + + // min subnorm bf8 value to bf8 and back, check if holds + constexpr auto min_subnorm_bf8 = -0.0000152587890625f; //-2^-16 + ASSERT_NEAR( + min_subnorm_bf8, type_convert(f8_convert_rne(min_subnorm_bf8)), 0.0f); + + // smaller than min subnorm bf8 value to bf8 must be zero + constexpr auto less_than_min_subnorm = 0.00000762939453125f; // 2^-17 + ASSERT_EQ(0.0f, type_convert(f8_convert_rne(less_than_min_subnorm))); + + // convert quiet NaN to bf8_ocp_t and check if it is quiet NaN + const auto bf8_nan = f8_convert_rne(std::numeric_limits::quiet_NaN()); + ASSERT_TRUE(ck::fp8_impl::ocp_bf8_is_nan(bf8_nan.data)); +} + +TEST(BF8OCP, ConvertFP32Stochastic) +{ + // fix the tolerance value + float abs_tol = 1e-6; + + // convert 0 float to bfp8 and back, check if holds + ASSERT_NEAR(0.0f, type_convert(f8_convert_sr(0.0f)), 0.0f); + + // convert minimal float to bf8 and back, check if holds + ASSERT_NEAR(std::numeric_limits::min(), + type_convert(f8_convert_sr(std::numeric_limits::min())), + abs_tol); + + const auto max_bf8_t_float = type_convert(ck::NumericLimits::Max()); + + // convert maximal bf8_ocp_t to float and check if equal to bf8 max + ASSERT_NEAR( + max_bf8_t_float, type_convert(f8_convert_sr(max_bf8_t_float)), 0.0f); + + // convert maximal float to bf8 and back, check if clipped to bf8 max (saturation to finite) + ASSERT_NEAR(max_bf8_t_float, + type_convert(f8_convert_sr(std::numeric_limits::max())), + 0.0f); + + // convert float infinity to bf8_ocp_t and check if it is max value (saturation to finite) + 
ASSERT_EQ(ck::NumericLimits::Max(), + f8_convert_sr(std::numeric_limits::infinity())); + + // positive normal float value to bf8 and back, check if holds + float pos_float = 0.0000762939f; // 10*2^-17 + ASSERT_NEAR(pos_float, type_convert(f8_convert_sr(pos_float)), abs_tol); + + // negative smallest normal bf8 value to bf8 and back, check if holds + constexpr auto neg_min_bf8 = -0.00006103515625f; //-2^-14 + ASSERT_NEAR(neg_min_bf8, type_convert(f8_convert_sr(neg_min_bf8)), 0.0f); + + // positive subnorm float value to bf8 and back, check if holds + constexpr auto pos_subnorm_bf8 = 0.000030517578125f; // 2^-15 + ASSERT_NEAR( + pos_subnorm_bf8, type_convert(f8_convert_sr(pos_subnorm_bf8)), 0.0f); + + // min subnorm bf8 value to bf8 and back, check if holds + constexpr auto min_subnorm_bf8 = -0.0000152587890625f; //-2^-16 + ASSERT_NEAR( + min_subnorm_bf8, type_convert(f8_convert_sr(min_subnorm_bf8)), 0.0f); + + // smaller than min subnorm bf8 value to bf8 alternates between 0 and 2^-16 + constexpr auto less_than_min_subnorm = 0.00000762939453125f; // 2^-17 + ASSERT_NEAR(0.0f, + type_convert(f8_convert_sr(less_than_min_subnorm)), + 0.0000152587890625f); + + // convert quiet NaN to bf8_ocp_t and check if it is quiet NaN + const auto bf8_nan = f8_convert_sr(std::numeric_limits::quiet_NaN()); + ASSERT_TRUE(ck::fp8_impl::ocp_bf8_is_nan(bf8_nan.data)); +} + +TEST(BF8OCP, ConvertFP16Nearest) +{ + // fix the tolerance value + constexpr half_t half_t_tol = 1e-3; + constexpr half_t half_t_zero = 0.0; + + // convert 0 half_t to bfp8 and back, check if holds + ASSERT_NEAR( + half_t_zero, type_convert(f8_convert_rne(half_t_zero)), half_t_zero); + + // convert minimal half_t to bf8 and back, check if holds + ASSERT_NEAR(ck::NumericLimits::Min(), + type_convert(f8_convert_rne(ck::NumericLimits::Min())), + half_t_tol); + + const auto max_bf8_t_half_t = type_convert(ck::NumericLimits::Max()); + + // convert maximal bf8_ocp_t to half_t and check if equal to bf8 max + 
ASSERT_NEAR(max_bf8_t_half_t, + type_convert(f8_convert_rne(max_bf8_t_half_t)), + half_t_zero); + + // convert maximal half_t to bf8 and back, check if clipped to bf8 max (saturation to finite) + ASSERT_NEAR(max_bf8_t_half_t, + type_convert(f8_convert_rne(ck::NumericLimits::Max())), + half_t_zero); + + // convert half_t infinity to bf8_ocp_t and check if it is max value (saturation to finite) + ASSERT_EQ( + ck::NumericLimits::Max(), + f8_convert_rne(type_convert(std::numeric_limits::infinity()))); + + // positive normal bf8 value to bf8 and back, check if holds + constexpr half_t pos_norm_bf8{0.0000762939f}; // 10*2^-17 + ASSERT_NEAR( + pos_norm_bf8, type_convert(f8_convert_rne(pos_norm_bf8)), half_t_tol); + + // negative smallest normal bf8 value to bf8 and back, check if holds + constexpr half_t neg_min_bf8{-0.00006103515625f}; //-2^-14 + ASSERT_NEAR( + neg_min_bf8, type_convert(f8_convert_rne(neg_min_bf8)), half_t_zero); + + // positive subnorm bf8 value to bf8 and back, check if holds + constexpr half_t pos_subnorm_bf8{0.000030517578125f}; // 2^-15 + ASSERT_NEAR(pos_subnorm_bf8, + type_convert(f8_convert_rne(pos_subnorm_bf8)), + half_t_zero); + + // min subnorm bf8 value to bf8 and back, check if holds + constexpr half_t min_subnorm_bf8{-0.0000152587890625f}; //-2^-16 + ASSERT_NEAR(min_subnorm_bf8, + type_convert(f8_convert_rne(min_subnorm_bf8)), + half_t_zero); + + // smaller than min subnorm bf8 value to bf8 must be zero + constexpr half_t less_than_min_subnorm{0.00000762939453125f}; // 2^-17 + ASSERT_EQ(half_t_zero, type_convert(f8_convert_rne(less_than_min_subnorm))); + + // convert quiet NaN to bf8_ocp_t and check if it is quiet NaN + const auto bf8_nan = f8_convert_rne(ck::NumericLimits::QuietNaN()); + ASSERT_TRUE(ck::fp8_impl::ocp_bf8_is_nan(bf8_nan.data)); +} + +TEST(BF8OCP, ConvertFP16Stochastic) +{ + // fix the tolerance value + constexpr half_t half_t_tol = 1e-3; + constexpr half_t half_t_zero = 0.0; + constexpr auto min_subnorm_bf8 = 
0.0000152587890625f; // 2^-16 + + // convert 0 half_t to bfp8 and back, check if holds + ASSERT_NEAR( + half_t_zero, type_convert(f8_convert_sr(half_t_zero)), half_t_zero); + + // convert minimal half_t (6.103515625e-05) to fp8 and back + ASSERT_NEAR(ck::NumericLimits::Min(), + type_convert(f8_convert_sr(ck::NumericLimits::Min())), + half_t_zero); + + const auto max_bf8_t_half_t = type_convert(ck::NumericLimits::Max()); + + // convert maximal bf8_ocp_t to half_t and check if equal to bf8 max + ASSERT_NEAR(max_bf8_t_half_t, + type_convert(f8_convert_sr(max_bf8_t_half_t)), + half_t_zero); + + // convert maximal half_t to bf8 and back, check if clipped to bf8 max (saturation to finite) + ASSERT_NEAR(max_bf8_t_half_t, + type_convert(f8_convert_sr(ck::NumericLimits::Max())), + half_t_zero); + + // convert half_t infinity to bf8_ocp_t and check if it is max value (saturation to finite) + ASSERT_EQ( + ck::NumericLimits::Max(), + f8_convert_sr(type_convert(std::numeric_limits::infinity()))); + + // positive normal bf8 value to bf8 and back, check if holds + constexpr half_t pos_norm_bf8{0.0000762939f}; // 10*2^-17 + ASSERT_NEAR( + pos_norm_bf8, type_convert(f8_convert_sr(pos_norm_bf8)), half_t_tol); + + // negative smallest normal bf8 value to bf8 and back, check if holds + constexpr half_t neg_min_bf8{-0.00006103515625f}; //-2^-14 + ASSERT_NEAR( + neg_min_bf8, type_convert(f8_convert_sr(neg_min_bf8)), half_t_zero); + + // positive subnorm bf8 value to bf8 and back, check if holds + constexpr half_t pos_subnorm_bf8{0.000030517578125f}; // 2^-15 + ASSERT_NEAR(pos_subnorm_bf8, + type_convert(f8_convert_sr(pos_subnorm_bf8)), + half_t_zero); + + // min subnorm bf8 value to bf8 and back, check if holds + ASSERT_NEAR(half_t{-min_subnorm_bf8}, + type_convert(f8_convert_sr(half_t{-min_subnorm_bf8})), + half_t_zero); + + // smaller than min subnorm bf8 value to bf8 alternates between 0 and 2^-16 + constexpr half_t less_than_min_subnorm{0.00000762939453125f}; // 2^-17 + 
ASSERT_NEAR(half_t_zero, + type_convert(f8_convert_sr(less_than_min_subnorm)), + half_t{min_subnorm_bf8}); + + // convert quiet NaN to bf8_ocp_t and check if it is quiet NaN + const auto bf8_nan = f8_convert_sr(ck::NumericLimits::QuietNaN()); + ASSERT_TRUE(ck::fp8_impl::ocp_bf8_is_nan(bf8_nan.data)); +} diff --git a/test/data_type/test_custom_type.cpp b/test/data_type/test_custom_type.cpp index 101681254..a8fa9ba4a 100644 --- a/test/data_type/test_custom_type.cpp +++ b/test/data_type/test_custom_type.cpp @@ -872,3 +872,161 @@ TEST(Complex_half, TestAsTypeReshape) test_vec.at(num_elem * i + 1)); }); } + +#if CK_USE_OCP_FP8 + +TEST(FP8OCP, TestSize) +{ + static_assert(std::is_same_v, "OCP FP8 is not enabled"); + ASSERT_EQ(sizeof(f8_t), sizeof(ck::fp8_storage_t)); + ASSERT_EQ(sizeof(vector_type), sizeof(vector_type)); + ASSERT_EQ(sizeof(vector_type), sizeof(vector_type)); + ASSERT_EQ(sizeof(vector_type), sizeof(vector_type)); + ASSERT_EQ(sizeof(vector_type), sizeof(vector_type)); + ASSERT_EQ(sizeof(vector_type), sizeof(vector_type)); + ASSERT_EQ(sizeof(vector_type), sizeof(vector_type)); +} + +TEST(FP8OCP, TestAsType) +{ + static_assert(std::is_same_v, "OCP FP8 is not enabled"); + + // test size + std::array test_vec = {-4, -2, -0.5, -0.25, 1.0 / 8.0, 1, 1.5, 16}; + constexpr int size = test_vec.size(); + + // reference vector + vector_type right_vec; + + // check default CTOR + ck::static_for<0, size, 1>{}( + [&](auto i) { ASSERT_EQ(right_vec.template AsType()(Number{}), f8_t{0}); }); + + // assign test values to the vector + ck::static_for<0, size, 1>{}([&](auto i) { + right_vec.template AsType()(Number{}) = ck::type_convert(test_vec.at(i)); + }); + + // copy the vector + vector_type left_vec{right_vec}; + + // check if values were copied correctly + ck::static_for<0, size, 1>{}([&](auto i) { + ASSERT_EQ(left_vec.template AsType()(Number{}), + ck::type_convert(test_vec.at(i))); + }); + + ck::non_native_vector_base nnvb_f8x2(ck::type_convert(-10.0f)); + 
ASSERT_EQ(nnvb_f8x2.template AsType()(Number<0>{}), ck::type_convert(-10.0f)); + ASSERT_EQ(nnvb_f8x2.template AsType()(Number<1>{}), ck::type_convert(-10.0f)); +} + +TEST(FP8OCP, TestAsTypeReshape) +{ + static_assert(std::is_same_v, "OCP FP8 is not enabled"); + + // test size + std::array test_vec = {-8, -0.5, -0.25, 1.0 / 8.0, 1 / 256, 1, 1.5, 16}; + constexpr int size = test_vec.size(); + + // reference vector + vector_type right_vec; + + // check default CTOR + ck::static_for<0, size, 1>{}( + [&](auto i) { ASSERT_EQ(right_vec.template AsType()(Number{}), f8_t{0}); }); + + // assign test values to the vector + ck::static_for<0, size, 1>{}([&](auto i) { + right_vec.template AsType()(Number{}) = ck::type_convert(test_vec.at(i)); + }); + + // copy the first half of a vector + vector_type left_vec{ + right_vec.template AsType::type>()(Number<0>{})}; + + // check if values were copied correctly + ck::static_for<0, size / 2, 1>{}([&](auto i) { + ASSERT_EQ(left_vec.template AsType()(Number{}), + ck::type_convert(test_vec.at(i))); + }); +} + +TEST(BF8OCP, TestSize) +{ + static_assert(std::is_same_v, "OCP BF8 is not enabled"); + ASSERT_EQ(sizeof(bf8_t), sizeof(ck::fp8_storage_t)); + ASSERT_EQ(sizeof(vector_type), sizeof(vector_type)); + ASSERT_EQ(sizeof(vector_type), sizeof(vector_type)); + ASSERT_EQ(sizeof(vector_type), sizeof(vector_type)); + ASSERT_EQ(sizeof(vector_type), sizeof(vector_type)); + ASSERT_EQ(sizeof(vector_type), sizeof(vector_type)); + ASSERT_EQ(sizeof(vector_type), sizeof(vector_type)); +} + +TEST(BF8OCP, TestAsType) +{ + static_assert(std::is_same_v, "OCP BF8 is not enabled"); + + // test size + std::array test_vec = {-4, -2, -0.5, -0.25, 1.0 / 8.0, 1, 1.5, 16}; + constexpr int size = test_vec.size(); + + // reference vector + vector_type right_vec; + + // check default CTOR + ck::static_for<0, size, 1>{}( + [&](auto i) { ASSERT_EQ(right_vec.template AsType()(Number{}), bf8_t{0}); }); + + // assign test values to the vector + ck::static_for<0, size, 
1>{}([&](auto i) { + right_vec.template AsType()(Number{}) = ck::type_convert(test_vec.at(i)); + }); + + // copy the vector + vector_type left_vec{right_vec}; + + // check if values were copied correctly + ck::static_for<0, size, 1>{}([&](auto i) { + ASSERT_EQ(left_vec.template AsType()(Number{}), + ck::type_convert(test_vec.at(i))); + }); + + ck::non_native_vector_base nnvb_bf8x2(ck::type_convert(-10.0f)); + ASSERT_EQ(nnvb_bf8x2.template AsType()(Number<0>{}), ck::type_convert(-10.0f)); + ASSERT_EQ(nnvb_bf8x2.template AsType()(Number<1>{}), ck::type_convert(-10.0f)); +} + +TEST(BF8OCP, TestAsTypeReshape) +{ + static_assert(std::is_same_v, "OCP BF8 is not enabled"); + + // test size + std::array test_vec = {-8, -0.5, -0.25, 1.0 / 8.0, 1 / 256, 1, 1.5, 16}; + constexpr int size = test_vec.size(); + + // reference vector + vector_type right_vec; + + // check default CTOR + ck::static_for<0, size, 1>{}( + [&](auto i) { ASSERT_EQ(right_vec.template AsType()(Number{}), bf8_t{0}); }); + + // assign test values to the vector + ck::static_for<0, size, 1>{}([&](auto i) { + right_vec.template AsType()(Number{}) = ck::type_convert(test_vec.at(i)); + }); + + // copy the first half of a vector + vector_type left_vec{ + right_vec.template AsType::type>()(Number<0>{})}; + + // check if values were copied correctly + ck::static_for<0, size / 2, 1>{}([&](auto i) { + ASSERT_EQ(left_vec.template AsType()(Number{}), + ck::type_convert(test_vec.at(i))); + }); +} + +#endif diff --git a/test/data_type/test_fp8.cpp b/test/data_type/test_fp8_fnuz.cpp similarity index 52% rename from test/data_type/test_fp8.cpp rename to test/data_type/test_fp8_fnuz.cpp index 25d9d9d2f..c2ec6dad9 100644 --- a/test/data_type/test_fp8.cpp +++ b/test/data_type/test_fp8_fnuz.cpp @@ -7,154 +7,171 @@ using ck::f8_convert_rne; using ck::f8_convert_sr; -using ck::f8_t; +using ck::f8_fnuz_t; using ck::half_t; using ck::type_convert; -TEST(FP8, NumericLimits) +TEST(FP8FNUZ, NumericLimits) { // constants given for 
negative zero nan mode - EXPECT_EQ(ck::NumericLimits::Min(), type_convert(0x08)); - EXPECT_EQ(ck::NumericLimits::Max(), type_convert(0x7F)); - EXPECT_EQ(ck::NumericLimits::Lowest(), type_convert(0xFF)); - EXPECT_EQ(ck::NumericLimits::QuietNaN(), type_convert(0x80)); + EXPECT_EQ(ck::NumericLimits::Min(), type_convert(0x08)); + EXPECT_EQ(ck::NumericLimits::Max(), type_convert(0x7F)); + EXPECT_EQ(ck::NumericLimits::Lowest(), type_convert(0xFF)); + EXPECT_EQ(ck::NumericLimits::QuietNaN(), type_convert(0x80)); } -TEST(FP8, ConvertFP32Nearest) +TEST(FP8FNUZ, ConvertFP32Nearest) { // fix the tolerance value float abs_tol = 1e-6; // convert 0 float to fp8 and back, check if holds - ASSERT_NEAR(0.0f, type_convert(f8_convert_rne(0.0f)), abs_tol); + ASSERT_NEAR(0.0f, type_convert(f8_convert_rne(0.0f)), abs_tol); // don't run the next test on gfx11 devices #ifndef CK_SKIP_FLAKY_F8_TEST // convert minimal float to fp8 and back, check if holds ASSERT_NEAR(std::numeric_limits::min(), - type_convert(f8_convert_rne(std::numeric_limits::min())), + type_convert(f8_convert_rne(std::numeric_limits::min())), abs_tol); #endif - // convert maximal f8_t to float and check if equal to 240.0 - ASSERT_NEAR(240.0f, type_convert(f8_convert_rne(240.0f)), abs_tol); - // convert maximal float to fp8 and back, check if clipped to 240.0 - ASSERT_NEAR(240.0f, - type_convert(f8_convert_rne(std::numeric_limits::max())), + + const auto max_f8_t_float = type_convert(ck::NumericLimits::Max()); + // convert maximal f8_fnuz_t to float and check if equal to fp8 max + ASSERT_NEAR( + max_f8_t_float, type_convert(f8_convert_rne(max_f8_t_float)), abs_tol); + + // XXX: FNUZ f8_convert_rne behavior is inconsistent. + // Clipping large values to fp8 max (saturation to finite) contradicts converting inf float to + // fp8 qNAN (no saturation). 
+ + // convert maximal float to fp8 and back, check if clipped to fp8 max + ASSERT_NEAR(max_f8_t_float, + type_convert(f8_convert_rne(std::numeric_limits::max())), abs_tol); - // convert inf float to f8_t and check if it is qNan - ASSERT_NEAR(type_convert(0x80), - f8_convert_rne(std::numeric_limits::infinity()), + // convert inf float to f8_fnuz_t and check if it is qNan + ASSERT_NEAR(ck::NumericLimits::QuietNaN(), + f8_convert_rne(std::numeric_limits::infinity()), abs_tol); // positive norm float value to fp8 and back, check if holds float pos_float = 0.017578125f; - ASSERT_NEAR(pos_float, type_convert(f8_convert_rne(pos_float)), abs_tol); + ASSERT_NEAR(pos_float, type_convert(f8_convert_rne(pos_float)), abs_tol); // negative norm float value to fp8 and back, check if holds float neg_float = -0.015625f; - ASSERT_NEAR(neg_float, type_convert(f8_convert_rne(neg_float)), abs_tol); + ASSERT_NEAR(neg_float, type_convert(f8_convert_rne(neg_float)), abs_tol); // positive subnorm float value to fp8 and back, check if holds pos_float = 0.00390625f; - ASSERT_NEAR(pos_float, type_convert(f8_convert_rne(pos_float)), abs_tol); + ASSERT_NEAR(pos_float, type_convert(f8_convert_rne(pos_float)), abs_tol); // negative subnorm float value to fp8 and back, check if holds neg_float = -0.001953125f; - ASSERT_NEAR(neg_float, type_convert(f8_convert_rne(neg_float)), abs_tol); + ASSERT_NEAR(neg_float, type_convert(f8_convert_rne(neg_float)), abs_tol); } -TEST(FP8, ConvertFP32Stochastic) +TEST(FP8FNUZ, ConvertFP32Stochastic) { // fix the tolerance value float abs_tol = 1e-6; // convert 0 float to fp8 and back, check if holds - ASSERT_NEAR(0.0f, type_convert(f8_convert_sr(0.0f)), abs_tol); + ASSERT_NEAR(0.0f, type_convert(f8_convert_sr(0.0f)), abs_tol); // convert minimal float to fp8 and back, check if holds ASSERT_NEAR(std::numeric_limits::min(), - type_convert(f8_convert_sr(std::numeric_limits::min())), + type_convert(f8_convert_sr(std::numeric_limits::min())), abs_tol); - // convert 
maximal f8_t to float and check if equal to 240.0 - ASSERT_NEAR(240.0f, type_convert(f8_convert_sr(240.0f)), abs_tol); - // convert maximal float to fp8 and back, check if clipped to 240.0 - ASSERT_NEAR(240.0f, - type_convert(f8_convert_sr(std::numeric_limits::max())), + + const auto max_f8_t_float = type_convert(ck::NumericLimits::Max()); + // convert maximal f8_fnuz_t to float and check if equal to fp8 max + ASSERT_NEAR( + max_f8_t_float, type_convert(f8_convert_sr(max_f8_t_float)), abs_tol); + // convert maximal float to fp8 and back, check if clipped to fp8 max + ASSERT_NEAR(max_f8_t_float, + type_convert(f8_convert_sr(std::numeric_limits::max())), abs_tol); - // convert inf float to f8_t and check if it is qNan - ASSERT_NEAR(type_convert(0x80), - f8_convert_sr(std::numeric_limits::infinity()), + // convert inf float to f8_fnuz_t and check if it is qNan + ASSERT_NEAR(ck::NumericLimits::QuietNaN(), + f8_convert_sr(std::numeric_limits::infinity()), abs_tol); // positive norm float value to fp8 and back, check if holds float pos_float = 0.017578125f; - ASSERT_NEAR(pos_float, type_convert(f8_convert_sr(pos_float)), abs_tol); + ASSERT_NEAR(pos_float, type_convert(f8_convert_sr(pos_float)), abs_tol); // negative norm float value to fp8 and back, check if holds float neg_float = -0.015625f; - ASSERT_NEAR(neg_float, type_convert(f8_convert_sr(neg_float)), abs_tol); + ASSERT_NEAR(neg_float, type_convert(f8_convert_sr(neg_float)), abs_tol); // positive subnorm float value to fp8 and back, check if holds pos_float = 0.00390625f; - ASSERT_NEAR(pos_float, type_convert(f8_convert_sr(pos_float)), abs_tol); + ASSERT_NEAR(pos_float, type_convert(f8_convert_sr(pos_float)), abs_tol); // negative subnorm float value to fp8 and back, check if holds neg_float = -0.001953125f; - ASSERT_NEAR(neg_float, type_convert(f8_convert_sr(neg_float)), abs_tol); + ASSERT_NEAR(neg_float, type_convert(f8_convert_sr(neg_float)), abs_tol); } -TEST(FP8, ConvertFP16Nearest) +TEST(FP8FNUZ, 
ConvertFP16Nearest) { // fix the tolerance value float abs_tol = 1e-3; // convert 0 fp16 to fp8 and back, check if holds - ASSERT_NEAR(half_t{0.0}, type_convert(f8_convert_rne(half_t{0.0})), abs_tol); + ASSERT_NEAR(half_t{0.0}, type_convert(f8_convert_rne(half_t{0.0})), abs_tol); // convert minimal fp16 to fp8 and back, check if holds ASSERT_NEAR(ck::NumericLimits::Min(), - type_convert(f8_convert_rne(ck::NumericLimits::Min())), + type_convert(f8_convert_rne(ck::NumericLimits::Min())), abs_tol); - // convert maximal f8_t to fp16 and check if equal to 240.0 - ASSERT_NEAR(half_t{240.0}, type_convert(f8_convert_rne(half_t{240.0})), abs_tol); - // convert maximal fp16 to fp8 and back, check if clipped to 240.0 - ASSERT_NEAR(half_t{240.0}, - type_convert(f8_convert_rne(ck::NumericLimits::Max())), + + const auto max_f8_t_half = type_convert(ck::NumericLimits::Max()); + // convert maximal f8_fnuz_t to fp16 and check if equal to fp8 max + ASSERT_NEAR( + max_f8_t_half, type_convert(f8_convert_rne(max_f8_t_half)), abs_tol); + // convert maximal fp16 to fp8 and back, check if clipped to fp8 max + ASSERT_NEAR(max_f8_t_half, + type_convert(f8_convert_rne(ck::NumericLimits::Max())), abs_tol); - // convert QuietNaN fp16 to f8_t and check if it is QuietNaN - ASSERT_NEAR(type_convert(0x80), - f8_convert_rne(ck::NumericLimits::QuietNaN()), + // convert QuietNaN fp16 to f8_fnuz_t and check if it is QuietNaN + ASSERT_NEAR(ck::NumericLimits::QuietNaN(), + f8_convert_rne(ck::NumericLimits::QuietNaN()), abs_tol); // positive norm fp16 value to fp8 and back, check if holds half_t pos_half = half_t{0.017578125}; - ASSERT_NEAR(pos_half, type_convert(f8_convert_rne(pos_half)), abs_tol); + ASSERT_NEAR(pos_half, type_convert(f8_convert_rne(pos_half)), abs_tol); // negative norm fp16 value to fp8 and back, check if holds half_t neg_half = half_t{-0.015625}; - ASSERT_NEAR(neg_half, type_convert(f8_convert_rne(neg_half)), abs_tol); + ASSERT_NEAR(neg_half, type_convert(f8_convert_rne(neg_half)), 
abs_tol); // positive subnorm fp16 value to fp8 and back, check if holds pos_half = half_t{0.00390625}; - ASSERT_NEAR(pos_half, type_convert(f8_convert_rne(pos_half)), abs_tol); + ASSERT_NEAR(pos_half, type_convert(f8_convert_rne(pos_half)), abs_tol); // negative subnorm fp16 value to fp8 and back, check if holds neg_half = half_t{-0.001953125}; - ASSERT_NEAR(neg_half, type_convert(f8_convert_rne(neg_half)), abs_tol); + ASSERT_NEAR(neg_half, type_convert(f8_convert_rne(neg_half)), abs_tol); } -TEST(FP8, ConvertFP16Stochastic) +TEST(FP8FNUZ, ConvertFP16Stochastic) { // fix the tolerance value float abs_tol = 1e-3; // convert 0 fp16 to fp8 and back, check if holds - ASSERT_NEAR(half_t{0.0}, type_convert(f8_convert_sr(half_t{0.0})), abs_tol); + ASSERT_NEAR(half_t{0.0}, type_convert(f8_convert_sr(half_t{0.0})), abs_tol); // convert minimal fp16 to fp8 and back, check if holds ASSERT_NEAR(ck::NumericLimits::Min(), - type_convert(f8_convert_sr(ck::NumericLimits::Min())), + type_convert(f8_convert_sr(ck::NumericLimits::Min())), abs_tol); - // convert maximal f8_t to fp16 and check if equal to 240.0 - ASSERT_NEAR(half_t{240.0}, type_convert(f8_convert_sr(half_t{240.0})), abs_tol); - // convert maximal fp16 to fp8 and back, check if clipped to 240.0 - ASSERT_NEAR(half_t{240.0}, - type_convert(f8_convert_sr(ck::NumericLimits::Max())), + + const auto max_f8_t_half = type_convert(ck::NumericLimits::Max()); + // convert maximal f8_fnuz_t to fp16 and check if equal to fp8 max + ASSERT_NEAR( + max_f8_t_half, type_convert(f8_convert_sr(max_f8_t_half)), abs_tol); + // convert maximal fp16 to fp8 and back, check if clipped to fp8 max + ASSERT_NEAR(max_f8_t_half, + type_convert(f8_convert_sr(ck::NumericLimits::Max())), abs_tol); - // convert QuietNaN fp16 to f8_t and check if it is QuietNaN - ASSERT_NEAR(type_convert(0x80), - f8_convert_sr(ck::NumericLimits::QuietNaN()), + // convert QuietNaN fp16 to f8_fnuz_t and check if it is QuietNaN + ASSERT_NEAR(ck::NumericLimits::QuietNaN(), + 
f8_convert_sr(ck::NumericLimits::QuietNaN()), abs_tol); // positive norm fp16 value to fp8 and back, check if holds half_t pos_half = half_t{0.017578125}; - ASSERT_NEAR(pos_half, type_convert(f8_convert_sr(pos_half)), abs_tol); + ASSERT_NEAR(pos_half, type_convert(f8_convert_sr(pos_half)), abs_tol); // negative norm fp16 value to fp8 and back, check if holds half_t neg_half = half_t{-0.015625}; - ASSERT_NEAR(neg_half, type_convert(f8_convert_sr(neg_half)), abs_tol); + ASSERT_NEAR(neg_half, type_convert(f8_convert_sr(neg_half)), abs_tol); // positive subnorm fp16 value to fp8 and back, check if holds pos_half = half_t{0.00390625}; - ASSERT_NEAR(pos_half, type_convert(f8_convert_sr(pos_half)), abs_tol); + ASSERT_NEAR(pos_half, type_convert(f8_convert_sr(pos_half)), abs_tol); // negative subnorm fp16 value to fp8 and back, check if holds neg_half = half_t{-0.001953125}; - ASSERT_NEAR(neg_half, type_convert(f8_convert_sr(neg_half)), abs_tol); + ASSERT_NEAR(neg_half, type_convert(f8_convert_sr(neg_half)), abs_tol); } diff --git a/test/data_type/test_fp8_ocp.cpp b/test/data_type/test_fp8_ocp.cpp new file mode 100644 index 000000000..a8077f1bd --- /dev/null +++ b/test/data_type/test_fp8_ocp.cpp @@ -0,0 +1,250 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "gtest/gtest.h" +#include "ck/utility/data_type.hpp" +#include "ck/utility/type_convert.hpp" + +using ck::f8_convert_rne; +using ck::f8_convert_sr; +using ck::f8_ocp_t; +using ck::half_t; +using ck::type_convert; + +TEST(FP8OCP, NumericLimits) +{ + // constants given for OCP FP8 + EXPECT_EQ(ck::NumericLimits::Min(), + type_convert(0x08)); // 0b00001000 = 2^-6 + EXPECT_EQ(ck::NumericLimits::Max(), type_convert(0x7E)); // 0b01111110 = 448 + EXPECT_EQ(ck::NumericLimits::Lowest(), + type_convert(0xFE)); // 0b11111110 = -448 + EXPECT_EQ(ck::NumericLimits::QuietNaN().data, + type_convert(0x7F).data); // 0b01111111 + EXPECT_FALSE(ck::NumericLimits::QuietNaN() == + ck::NumericLimits::QuietNaN()); +} + +TEST(FP8OCP, ConvertFP32Nearest) +{ + // fix the tolerance value + float abs_tol = 1e-6; + // convert 0 float to fp8 and back, check if holds + ASSERT_NEAR(0.0f, type_convert(f8_convert_rne(0.0f)), 0.0f); + + // convert minimal float to fp8 and back, check if holds + ASSERT_NEAR(std::numeric_limits::min(), + type_convert(f8_convert_rne(std::numeric_limits::min())), + abs_tol); + + const auto max_f8_t_float = type_convert(ck::NumericLimits::Max()); + + // convert maximal f8_ocp_t to float and check if equal to fp8 max + ASSERT_NEAR( + max_f8_t_float, type_convert(f8_convert_rne(max_f8_t_float)), 0.0f); + + // convert maximal float to fp8 and back, check if clipped to fp8 max (saturation to finite) + ASSERT_NEAR(max_f8_t_float, + type_convert(f8_convert_rne(std::numeric_limits::max())), + 0.0f); + + // convert float infinity to f8_ocp_t and check if it is max value (saturation to finite) + ASSERT_EQ(ck::NumericLimits::Max(), + f8_convert_rne(std::numeric_limits::infinity())); + + // positive norm float value to fp8 and back, check if holds + float pos_float = 0.017578125f; + ASSERT_NEAR(pos_float, type_convert(f8_convert_rne(pos_float)), abs_tol); + + // smallest normal fp8 value to fp8 and back, check if holds + float neg_float = -0.015625f; //-2^-6 + 
ASSERT_NEAR(neg_float, type_convert(f8_convert_rne(neg_float)), 0.0f); + + // positive subnorm float value to fp8 and back, check if holds + pos_float = 0.00390625f; + ASSERT_NEAR(pos_float, type_convert(f8_convert_rne(pos_float)), abs_tol); + + // min subnorm fp8 value to fp8 and back, check if holds + neg_float = -0.001953125f; //-2^-9 + ASSERT_NEAR(neg_float, type_convert(f8_convert_rne(neg_float)), 0.0f); + + // smaller than min subnorm fp8 value to fp8 must be zero + auto less_than_min_subnorm = 0.0009765625f; // 2^-10 + ASSERT_EQ(0.0f, type_convert(f8_convert_rne(less_than_min_subnorm))); + + // convert quiet NaN to f8_ocp_t and check if it is quiet NaN + auto f8_nan = f8_convert_rne(std::numeric_limits::quiet_NaN()); + ASSERT_TRUE((f8_nan.data & 0x7f) == 0x7f); +} + +TEST(FP8OCP, ConvertFP32Stochastic) +{ + // fix the tolerance value + float abs_tol = 1e-6; + // convert 0 float to fp8 and back, check if holds + ASSERT_NEAR(0.0f, type_convert(f8_convert_sr(0.0f)), 0.0f); + + // convert minimal float to fp8 and back, check if holds + ASSERT_NEAR(std::numeric_limits::min(), + type_convert(f8_convert_sr(std::numeric_limits::min())), + abs_tol); + + const auto max_f8_t_float = type_convert(ck::NumericLimits::Max()); + + // convert maximal f8_ocp_t to float and check if equal to fp8 max + ASSERT_NEAR(max_f8_t_float, type_convert(f8_convert_sr(max_f8_t_float)), 0.0f); + + // convert maximal float to fp8 and back, check if clipped to fp8 max (saturation to finite) + ASSERT_NEAR(max_f8_t_float, + type_convert(f8_convert_sr(std::numeric_limits::max())), + 0.0f); + + // convert float infinity to f8_ocp_t and check if it is max value (saturation to finite) + ASSERT_EQ(ck::NumericLimits::Max(), + f8_convert_sr(std::numeric_limits::infinity())); + + // positive norm float value to fp8 and back, check if holds + float pos_float = 0.017578125f; + ASSERT_NEAR(pos_float, type_convert(f8_convert_sr(pos_float)), abs_tol); + + // smallest normal fp8 value to fp8 and back, check 
if holds + float neg_float = -0.015625f; //-2^-6 + ASSERT_NEAR(neg_float, type_convert(f8_convert_sr(neg_float)), 0.0f); + + // positive subnorm float value to fp8 and back, check if holds + pos_float = 0.00390625f; + ASSERT_NEAR(pos_float, type_convert(f8_convert_sr(pos_float)), abs_tol); + + // min subnorm fp8 value to fp8 and back, check if holds + constexpr auto min_subnorm_fp8 = -0.001953125f; //-2^-9 + ASSERT_NEAR( + min_subnorm_fp8, type_convert(f8_convert_sr(min_subnorm_fp8)), 0.0f); + + // smaller than min subnorm fp8 value to fp8 alternates between 0 and 2^-9 + auto less_than_min_subnorm = 0.0009765625f; // 2^-10 + ASSERT_NEAR( + 0.0f, type_convert(f8_convert_sr(less_than_min_subnorm)), 0.001953125f); + + // convert quiet NaN to f8_ocp_t and check if it is quiet NaN + auto f8_nan = f8_convert_sr(std::numeric_limits::quiet_NaN()); + ASSERT_TRUE((f8_nan.data & 0x7f) == 0x7f); +} + +TEST(FP8OCP, ConvertFP16Nearest) +{ + // fix the tolerance value + constexpr half_t half_t_tol = 1e-3; + constexpr half_t half_t_zero = 0.0; + // convert 0 half_t to fp8 and back, check if holds + ASSERT_NEAR( + half_t_zero, type_convert(f8_convert_rne(half_t_zero)), half_t_zero); + + // convert minimal half_t to fp8 and back, check if holds + ASSERT_NEAR(ck::NumericLimits::Min(), + type_convert(f8_convert_rne(ck::NumericLimits::Min())), + half_t_tol); + const auto max_f8_t_half_t = type_convert(ck::NumericLimits::Max()); + + // convert maximal f8_ocp_t to half_t and check if equal to fp8 max + ASSERT_NEAR(max_f8_t_half_t, + type_convert(f8_convert_rne(max_f8_t_half_t)), + half_t_zero); + + // convert maximal half_t to fp8 and back, check if clipped to fp8 max (saturation to finite) + ASSERT_NEAR(max_f8_t_half_t, + type_convert(f8_convert_rne(ck::NumericLimits::Max())), + half_t_zero); + + // convert half_t infinity to f8_ocp_t and check if it is max value (saturation to finite) + ASSERT_EQ( + ck::NumericLimits::Max(), + 
f8_convert_rne(type_convert(std::numeric_limits::infinity()))); + + // positive norm half_t value to fp8 and back, check if holds + half_t pos_half_t{0.017578125f}; + ASSERT_NEAR(pos_half_t, type_convert(f8_convert_rne(pos_half_t)), half_t_tol); + + // smallest normal fp8 value to fp8 and back, check if holds + half_t neg_half_t{-0.015625f}; //-2^-6 + ASSERT_NEAR( + neg_half_t, type_convert(f8_convert_rne(neg_half_t)), half_t_zero); + + // positive subnorm half_t value to fp8 and back, check if holds + pos_half_t = half_t{0.00390625f}; + ASSERT_NEAR(pos_half_t, type_convert(f8_convert_rne(pos_half_t)), half_t_tol); + + // min subnorm fp8 value to fp8 and back, check if holds + neg_half_t = half_t{-0.001953125f}; //-2^-9 + ASSERT_NEAR( + neg_half_t, type_convert(f8_convert_rne(neg_half_t)), half_t_zero); + + // smaller than min subnorm fp8 value to fp8 must be zero + auto less_than_min_subnorm = half_t{0.0009765625f}; // 2^-10 + ASSERT_EQ(half_t_zero, type_convert(f8_convert_rne(less_than_min_subnorm))); + + // convert quiet NaN to f8_ocp_t and check if it is quiet NaN + auto f8_nan = f8_convert_rne(ck::NumericLimits::QuietNaN()); + ASSERT_TRUE(ck::fp8_impl::ocp_f8_is_nan(f8_nan.data)); +} + +TEST(FP8OCP, ConvertFP16Stochastic) +{ + // fix the tolerance value + constexpr half_t half_t_tol = 1e-3; + constexpr half_t half_t_zero = 0.0; + constexpr auto min_subnorm_fp8 = 0.001953125f; // 2^-9 + + // convert 0 half_t to fp8 and back, check if holds + ASSERT_NEAR( + half_t_zero, type_convert(f8_convert_sr(half_t_zero)), half_t_zero); + + // convert minimal half_t (6.103515625e-05) to fp8 and back + // alternates between 0 and 2^-9 (0.001953125) + ASSERT_NEAR(ck::NumericLimits::Min(), + type_convert(f8_convert_sr(ck::NumericLimits::Min())), + type_convert(min_subnorm_fp8)); + + const auto max_f8_t_half_t = type_convert(ck::NumericLimits::Max()); + + // convert maximal f8_ocp_t to half_t and check if equal to fp8 max + ASSERT_NEAR(max_f8_t_half_t, + 
type_convert(f8_convert_sr(max_f8_t_half_t)), + half_t_zero); + + // convert maximal half_t to fp8 and back, check if clipped to fp8 max (saturation to finite) + ASSERT_NEAR(max_f8_t_half_t, + type_convert(f8_convert_sr(ck::NumericLimits::Max())), + half_t_zero); + + // convert half_t infinity to f8_ocp_t and check if it is max value (saturation to finite) + ASSERT_EQ( + ck::NumericLimits::Max(), + f8_convert_sr(type_convert(std::numeric_limits::infinity()))); + + // positive norm half_t value to fp8 and back, check if holds + half_t pos_half_t{0.017578125f}; + ASSERT_NEAR(pos_half_t, type_convert(f8_convert_sr(pos_half_t)), half_t_tol); + + // smallest normal fp8 value to fp8 and back, check if holds + half_t neg_half_t{-0.015625f}; //-2^-6 + ASSERT_NEAR(neg_half_t, type_convert(f8_convert_sr(neg_half_t)), half_t_zero); + + // positive subnorm half_t value to fp8 and back, check if holds + pos_half_t = half_t{0.00390625f}; + ASSERT_NEAR(pos_half_t, type_convert(f8_convert_sr(pos_half_t)), half_t_tol); + + // min subnorm fp8 value to fp8 and back, check if holds + neg_half_t = half_t{-min_subnorm_fp8}; //-2^-9 + ASSERT_NEAR(neg_half_t, type_convert(f8_convert_sr(neg_half_t)), half_t_zero); + + // smaller than min subnorm fp8 value to fp8 alternates between 0 and 2^-9 + auto less_than_min_subnorm = half_t{0.0009765625f}; // 2^-10 + ASSERT_NEAR( + type_convert(half_t_zero), + type_convert(type_convert(f8_convert_sr(less_than_min_subnorm))), + min_subnorm_fp8); + + // convert quiet NaN to f8_ocp_t and check if it is quiet NaN + auto f8_nan = f8_convert_sr(ck::NumericLimits::QuietNaN()); + ASSERT_TRUE(ck::fp8_impl::ocp_f8_is_nan(f8_nan.data)); +} diff --git a/test/pool/test_avg_pool2d_fwd.cpp b/test/pool/test_avg_pool2d_fwd.cpp index 8dbb37b84..b5e733419 100644 --- a/test/pool/test_avg_pool2d_fwd.cpp +++ b/test/pool/test_avg_pool2d_fwd.cpp @@ -138,7 +138,7 @@ TYPED_TEST_SUITE(AvgPool2D_BF16, AvgPool2D_BF16_Types); TYPED_TEST_SUITE(AvgPool2D_I8, AvgPool2D_I8_Types); 
TYPED_TEST_SUITE(AvgPool2D_F8, AvgPool2D_F8_Types); -TYPED_TEST(AvgPool2D_F32, AvgPool2D_I8_Test) { this->Run(); } +TYPED_TEST(AvgPool2D_F32, AvgPool2D_F32_Test) { this->Run(); } TYPED_TEST(AvgPool2D_F16, AvgPool2D_F16_Test) { this->Run(); } TYPED_TEST(AvgPool2D_BF16, AvgPool2D_BF16_Test) { this->Run(); } TYPED_TEST(AvgPool2D_I8, AvgPool2D_I8_Test) { this->Run(); } diff --git a/test/pool/test_max_pool2d_fwd.cpp b/test/pool/test_max_pool2d_fwd.cpp index 80ca47407..217924275 100644 --- a/test/pool/test_max_pool2d_fwd.cpp +++ b/test/pool/test_max_pool2d_fwd.cpp @@ -143,7 +143,7 @@ TYPED_TEST_SUITE(MaxPool2D_BF16, MaxPool2D_BF16_Types); TYPED_TEST_SUITE(MaxPool2D_I8, MaxPool2D_I8_Types); TYPED_TEST_SUITE(MaxPool2D_F8, MaxPool2D_F8_Types); -TYPED_TEST(MaxPool2D_F32, MaxPool2D_I8_Test) { this->Run(); } +TYPED_TEST(MaxPool2D_F32, MaxPool2D_F32_Test) { this->Run(); } TYPED_TEST(MaxPool2D_F16, MaxPool2D_F16_Test) { this->Run(); } TYPED_TEST(MaxPool2D_BF16, MaxPool2D_BF16_Test) { this->Run(); } TYPED_TEST(MaxPool2D_I8, MaxPool2D_I8_Test) { this->Run(); } -- GitLab From 5affda819de5624e83d8d90f883c0a87f80b7ee2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bart=C5=82omiej=20Kocot?= Date: Wed, 4 Dec 2024 00:46:47 +0100 Subject: [PATCH 098/153] Add basic documentation structure (#1715) * Add basic documentation structure * Add terminology placeholder * Add codegen placeholder * Create template for each page --- CONTRIBUTORS.md | 1 + README.md | 34 ++++++++++++++--------------- TERMINOLOGY.md | 2 ++ client_example/25_wrapper/README.md | 11 +++------- client_example/README.md | 2 ++ codegen/README.md | 2 ++ example/README.md | 2 ++ include/ck/README.md | 19 ++++++++++++++++ include/ck_tile/README.md | 3 ++- profiler/README.md | 12 ++++++++++ 10 files changed, 62 insertions(+), 26 deletions(-) create mode 100644 TERMINOLOGY.md create mode 100644 codegen/README.md create mode 100644 example/README.md create mode 100644 include/ck/README.md diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md 
index cdce5a463..8ef5c2b72 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -1,3 +1,4 @@ +[Back to the main page](./README.md) # Composable Kernel Developers and Contributors This is the list of developers and contributors to Composable Kernel library diff --git a/README.md b/README.md index d8eb152ee..c0872aa56 100644 --- a/README.md +++ b/README.md @@ -26,23 +26,15 @@ The current CK library is structured into four layers: ## General information -To build our documentation locally, use the following code: - -``` bash -cd docs -pip3 install -r sphinx/requirements.txt -python3 -m sphinx -T -E -b html -d _build/doctrees -D language=en . _build/html -``` - -You can find a list of our developers and contributors on our [Contributors](/CONTRIBUTORS.md) page. - -```note -If you use CK, cite us as follows: - -* [Realizing Tensor Operators Using Coordinate Transformations and Tile Based Programming](???): - This paper will be available on arXiv soon. -* [CITATION.cff](/CITATION.cff) -``` +* [CK supported operations](include/ck/README.md) +* [CK Tile supported operations](include/ck_tile/README.md) +* [CK wrapper](client_example/25_wrapper/README.md) +* [CK codegen](codegen/README.md) +* [CK profiler](profiler/README.md) +* [Examples (Custom use of CK supported operations)](example/README.md) +* [Client examples (Use of CK supported operations with instance factory)](client_example/README.md) +* [Terminology](/TERMINOLOGY.md) +* [Contributors](/CONTRIBUTORS.md) CK is released under the **[MIT license](/LICENSE)**. @@ -137,6 +129,14 @@ Docker images are available on [DockerHub](https://hub.docker.com/r/rocm/composa You can find instructions for running ckProfiler in [profiler](/profiler). +* Build our documentation locally: + + ``` bash + cd docs + pip3 install -r sphinx/requirements.txt + python3 -m sphinx -T -E -b html -d _build/doctrees -D language=en . 
_build/html + ``` + Note the `-j` option for building with multiple threads in parallel, which speeds up the build significantly. However, `-j` launches unlimited number of threads, which can cause the build to run out of memory and crash. On average, you should expect each thread to use ~2Gb of RAM. diff --git a/TERMINOLOGY.md b/TERMINOLOGY.md new file mode 100644 index 000000000..e8833efb8 --- /dev/null +++ b/TERMINOLOGY.md @@ -0,0 +1,2 @@ +[Back to the main page](./README.md) +# Composable Kernel terminology \ No newline at end of file diff --git a/client_example/25_wrapper/README.md b/client_example/25_wrapper/README.md index eba3de017..3db9a9af4 100644 --- a/client_example/25_wrapper/README.md +++ b/client_example/25_wrapper/README.md @@ -1,14 +1,9 @@ +[Back to the main page](../../README.md) # Composable Kernel wrapper GEMM tutorial -This tutorial demonstrates how to implement matrix multiplication using Composable Kernel (CK) -wrapper. We present the base version of GEMM without most of the available optimizations; however, -it's worth noting that CK has kernels with different optimizations. +This tutorial demonstrates how to implement matrix multiplication using Composable Kernel (CK) wrapper. We present the base version of GEMM without most of the available optimizations; however, it's worth noting that CK has kernels with different optimizations. -To implement these optimizations, you can use the CK wrapper or directly use available instances in -CK. You can also refer to the -[optimized GEMM example](https://github.com/ROCm/composable_kernel/blob/develop/client_example/25_wrapper/wrapper_optimized_gemm.cpp), -that uses CK wrapper based on the -[`gridwise_gemm_xdlops_v2r3`](https://github.com/ROCm/composable_kernel/blob/develop/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp) implementation. +To implement these optimizations, you can use the CK wrapper or directly use available instances in CK. 
You can also refer to the [optimized GEMM example](https://github.com/ROCm/composable_kernel/blob/develop/client_example/25_wrapper/wrapper_optimized_gemm.cpp), that uses CK wrapper based on the [`gridwise_gemm_xdlops_v2r3`](https://github.com/ROCm/composable_kernel/blob/develop/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp) implementation. The kernel definition should look similar to: diff --git a/client_example/README.md b/client_example/README.md index 64a7130d5..d9f793434 100644 --- a/client_example/README.md +++ b/client_example/README.md @@ -1,3 +1,5 @@ +[Back to the main page](../README.md) +# Composable Kernel client examples ## Client application links to CK library, and therefore CK library needs to be installed before building client applications. diff --git a/codegen/README.md b/codegen/README.md new file mode 100644 index 000000000..deadf3221 --- /dev/null +++ b/codegen/README.md @@ -0,0 +1,2 @@ +[Back to the main page](../README.md) +# Composable Kernel codegen \ No newline at end of file diff --git a/example/README.md b/example/README.md new file mode 100644 index 000000000..43b3419f8 --- /dev/null +++ b/example/README.md @@ -0,0 +1,2 @@ +[Back to the main page](../README.md) +# Composable Kernel examples \ No newline at end of file diff --git a/include/ck/README.md b/include/ck/README.md new file mode 100644 index 000000000..bff689f6b --- /dev/null +++ b/include/ck/README.md @@ -0,0 +1,19 @@ +[Back to the main page](../../README.md) +# Composable Kernel supported operations +## Supported device operations +* [Average pooling]() +* [Batched contraction]() +* [Batched gemm]() +* [Batchnorm]() +* [CGEMM]() +* [Contraction]() +* [Convolution]() +* [Image to Column and Column to Image]() +* [Elementwise]() +* [GEMM]() +* [Max pooling]() +* [Reduce]() +* [Normalization]() +* [Permute]() +* [Put]() +* [Softmax]() diff --git a/include/ck_tile/README.md b/include/ck_tile/README.md index 572e9c7e4..9f88af1ca 100644 --- 
a/include/ck_tile/README.md +++ b/include/ck_tile/README.md @@ -1,4 +1,5 @@ -# ck_tile +[Back to the main page](../../README.md) +# Composable Kernel Tile ## concept `ck_tile` provides a programming model with templated abstractions to enable users to implement performance-critical kernels for machine learning workloads. introduces following basic concepts to help users building your own operator - tensor coordinate transformation, this is the core concept of layout/index transform abstraction in both compiler time and run time. diff --git a/profiler/README.md b/profiler/README.md index 10febcabd..3f4837aad 100644 --- a/profiler/README.md +++ b/profiler/README.md @@ -1,3 +1,5 @@ +[Back to the main page](../README.md) +# Composable Kernel profiler ## Profile GEMM kernels ```bash #arg1: tensor operation (gemm=GEMM) @@ -180,3 +182,13 @@ Note: Column to image kernel adds to the output memory, this will cause output b ################ op datatype verify init log time dim0 dim1 dim2 in_stride0 in_stride1 in_stride2 out_stride0 out_stride1 out_stride2 ./bin/ckProfiler permute_scale 0 1 1 0 1 64 64 64 4096 64 1 1 64 4096 ``` + +## Convert MIOpen driver command to CKProfiler + +```bash +python3 ../script/convert_miopen_driver_to_profiler.py +/opt/rocm/bin/MIOpenDriver conv -n 32 -c 64 -H 28 -W 28 -k 64 -y 3 -x 3 +-p 1 -q 1 -u 2 -v 2 -l 1 -j 1 -m conv -g 32 -F 1 -t 1 +``` + +Only convolution driver is supported. 
-- GitLab From 126ce85aa10347007fb5ca2068bcad378cb17d74 Mon Sep 17 00:00:00 2001 From: Po Yen Chen Date: Wed, 4 Dec 2024 15:59:58 +0800 Subject: [PATCH 099/153] [CK_TILE] Use 'false' for highest dimension padding flags (#1716) * Use 'false' for highest dimension padding flags * Update padding flag of bias --- .../ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp | 15 +++++++-------- .../kernel/fmha_fwd_splitkv_combine_kernel.hpp | 2 +- .../ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp | 15 +++++++-------- 3 files changed, 15 insertions(+), 17 deletions(-) diff --git a/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp index 3de433d6a..3a66b78a5 100644 --- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp +++ b/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp @@ -998,14 +998,14 @@ struct FmhaFwdKernel return pad_tensor_view( q_dram_naive, make_tuple(number{}, number{}), - sequence{}); + sequence{}); } else { return pad_tensor_view( q_dram_naive, make_tuple(number{}, number{}), - sequence{}); + sequence{}); } }(); const auto k_dram = [&]() { @@ -1019,7 +1019,7 @@ struct FmhaFwdKernel return pad_tensor_view( k_dram_naive, make_tuple(number{}, number{}), - sequence{}); + sequence{}); }(); const auto v_dram = [&]() { if constexpr(std::is_same_v) @@ -1041,7 +1041,7 @@ struct FmhaFwdKernel return pad_tensor_view( v_dram_transposed, make_tuple(number{}, number{}), - sequence{}); + sequence{}); } else { @@ -1055,7 +1055,7 @@ struct FmhaFwdKernel return pad_tensor_view( v_dram_naive, make_tuple(number{}, number{}), - sequence{}); + sequence{}); } }(); @@ -1097,9 +1097,8 @@ struct FmhaFwdKernel number{}, number<1>{}); - return pad_tensor_view(bias_dram_naive, - bias_dram_window_lengths, - sequence{}); + return pad_tensor_view( + bias_dram_naive, bias_dram_window_lengths, sequence{}); }(); return make_tile_window(bias_dram, bias_dram_window_lengths, {i_m0, 0}); diff --git 
a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_combine_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_combine_kernel.hpp index ca9da91a5..0bccabdd2 100644 --- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_combine_kernel.hpp +++ b/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_combine_kernel.hpp @@ -339,7 +339,7 @@ struct FmhaFwdSplitKVCombineKernel number{}, number<1>{}); - auto o_acc_dram_view = pad_tensor_view( + const auto o_acc_dram_view = pad_tensor_view( o_acc_dram_naive, make_tuple(number<1>{}, number{}, number{}), sequence{}); diff --git a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp index dcb671d81..f37e676da 100644 --- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp +++ b/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp @@ -623,14 +623,14 @@ struct FmhaFwdSplitKVKernel return pad_tensor_view( q_dram_naive, make_tuple(number{}, number{}), - sequence{}); + sequence{}); } else { return pad_tensor_view( q_dram_naive, make_tuple(number{}, number{}), - sequence{}); + sequence{}); } }(); @@ -645,7 +645,7 @@ struct FmhaFwdSplitKVKernel return pad_tensor_view( k_dram_naive, make_tuple(number{}, number{}), - sequence{}); + sequence{}); }; const auto k_dram = [&]() { if constexpr(kIsPagedKV) @@ -678,7 +678,7 @@ struct FmhaFwdSplitKVKernel return pad_tensor_view( v_dram_transposed, make_tuple(number{}, number{}), - sequence{}); + sequence{}); } else { @@ -692,7 +692,7 @@ struct FmhaFwdSplitKVKernel return pad_tensor_view( v_dram_naive, make_tuple(number{}, number{}), - sequence{}); + sequence{}); } }; const auto v_dram = [&]() { @@ -804,9 +804,8 @@ struct FmhaFwdSplitKVKernel number{}, number<1>{}); - return pad_tensor_view(bias_dram_naive, - bias_dram_window_lengths, - sequence{}); + return pad_tensor_view( + bias_dram_naive, bias_dram_window_lengths, sequence{}); }(); return make_tile_window(bias_dram, bias_dram_window_lengths, {i_m0, 0}); 
-- GitLab From 4cb3d7d7eac162af2c6e1a1d9c3367cb7633347c Mon Sep 17 00:00:00 2001 From: Mateusz Ozga <110818320+mozga-amd@users.noreply.github.com> Date: Wed, 4 Dec 2024 21:40:01 +0100 Subject: [PATCH 100/153] Ck tile grouped GEMM example (#1713) * Ck-tile, impl. grouped gemm * Workspace is allocated by user, and is passed to the function * Prepare test to new api design * Unify GemTransKernelArgs, removing N0 param * Add 1 to dim3 in paritioner * Typo: gem - > gemm --------- Co-authored-by: Adam Osewski <19374865+aosewski@users.noreply.github.com> --- .../ck_tile/17_grouped_gemm/CMakeLists.txt | 2 + example/ck_tile/17_grouped_gemm/README.md | 25 ++ .../ck_tile/17_grouped_gemm/grouped_gemm.cpp | 151 +++++++++ .../ck_tile/17_grouped_gemm/grouped_gemm.hpp | 53 +++ .../run_grouped_gemm_example.inc | 191 +++++++++++ example/ck_tile/17_grouped_gemm/utils.hpp | 38 +++ example/ck_tile/CMakeLists.txt | 1 + .../core/utility/amd_address_space.hpp | 37 +++ include/ck_tile/ops/gemm.hpp | 1 + .../ops/gemm/kernel/gemm_tile_partitioner.hpp | 36 ++ .../ops/gemm/kernel/grouped_gemm_kernel.hpp | 310 ++++++++++++++++++ test/ck_tile/CMakeLists.txt | 1 + test/ck_tile/grouped_gemm/CMakeLists.txt | 4 + .../grouped_gemm/test_grouped_gemm.cpp | 29 ++ .../test_grouped_gemm_ut_cases.inc | 25 ++ .../grouped_gemm/test_grouped_gemm_util.hpp | 282 ++++++++++++++++ 16 files changed, 1186 insertions(+) create mode 100644 example/ck_tile/17_grouped_gemm/CMakeLists.txt create mode 100644 example/ck_tile/17_grouped_gemm/README.md create mode 100644 example/ck_tile/17_grouped_gemm/grouped_gemm.cpp create mode 100644 example/ck_tile/17_grouped_gemm/grouped_gemm.hpp create mode 100644 example/ck_tile/17_grouped_gemm/run_grouped_gemm_example.inc create mode 100644 example/ck_tile/17_grouped_gemm/utils.hpp create mode 100644 include/ck_tile/core/utility/amd_address_space.hpp create mode 100644 include/ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp create mode 100644 
test/ck_tile/grouped_gemm/CMakeLists.txt create mode 100644 test/ck_tile/grouped_gemm/test_grouped_gemm.cpp create mode 100644 test/ck_tile/grouped_gemm/test_grouped_gemm_ut_cases.inc create mode 100644 test/ck_tile/grouped_gemm/test_grouped_gemm_util.hpp diff --git a/example/ck_tile/17_grouped_gemm/CMakeLists.txt b/example/ck_tile/17_grouped_gemm/CMakeLists.txt new file mode 100644 index 000000000..d34013dd6 --- /dev/null +++ b/example/ck_tile/17_grouped_gemm/CMakeLists.txt @@ -0,0 +1,2 @@ +add_executable(tile_example_grouped_gemm EXCLUDE_FROM_ALL grouped_gemm.cpp) + diff --git a/example/ck_tile/17_grouped_gemm/README.md b/example/ck_tile/17_grouped_gemm/README.md new file mode 100644 index 000000000..d1a0458ed --- /dev/null +++ b/example/ck_tile/17_grouped_gemm/README.md @@ -0,0 +1,25 @@ +# Grouped CShuffle GEMM + +This folder contains example for Grouped GEMM using ck_tile tile-programming implementation. Currently, it only supports the basic feature of the CK Tile GEMM, but creates the placeholders for the future support on different GEMM pipeline and different GEMM modules. In the near future, we will gradually migrate all the GEMM features from old CK to CK Tile. + +## build +``` +# in the root of ck_tile +mkdir build && cd build +# you can replace with the appropriate architecture (for example gfx90a or gfx942) or leave it blank +sh ../script/cmake-ck-dev.sh ../ +# The basic pipeline method on the gemm calculation +make tile_example_grouped_gemm -j +``` +This will result in an executable `build/bin/tile_example_grouped_gemm` + +## example +``` +args: + -a_layout Tensor A layout (default:R) + -b_layout Tensor B layout (default:R) + -c_layout Tensor C layout (default:R) + -v 0. No validation, 1. 
Validation on CPU + -warmup number of iterations before benchmark the kernel (default:10) + -repeat number of iterations to benchmark the kernel (default:100) +``` diff --git a/example/ck_tile/17_grouped_gemm/grouped_gemm.cpp b/example/ck_tile/17_grouped_gemm/grouped_gemm.cpp new file mode 100644 index 000000000..14f3b4a5b --- /dev/null +++ b/example/ck_tile/17_grouped_gemm/grouped_gemm.cpp @@ -0,0 +1,151 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include +#include +#include +#include +#include +#include + +#include "ck_tile/core.hpp" +#include "ck_tile/ops/epilogue.hpp" +#include "ck_tile/ops/gemm.hpp" +#include "ck_tile/host.hpp" +#include "grouped_gemm.hpp" +#include "utils.hpp" + +namespace { + +struct GroupedGemmKernelParam +{ + static const bool kPadM = false; + static const bool kPadN = false; + static const bool kPadK = false; + static const bool kTilePermute = false; + + static const ck_tile::index_t kOutputRank = 2; + + static const int kBlockPerCu = 1; + static const ck_tile::index_t M_Tile = 128; + static const ck_tile::index_t N_Tile = 128; + static const ck_tile::index_t K_Tile = 32; + + static const ck_tile::index_t M_Warp = 2; + static const ck_tile::index_t N_Warp = 2; + static const ck_tile::index_t K_Warp = 1; + + static const ck_tile::index_t M_Warp_Tile = 32; + static const ck_tile::index_t N_Warp_Tile = 32; + static const ck_tile::index_t K_Warp_Tile = 8; +}; + +using CodegenGemmShape = + ck_tile::TileGemmShape, + ck_tile::sequence, + ck_tile::sequence>; + +using TilePartitioner = ck_tile::GemmTile1DPartitioner; + +template +using GemmEpilogue = std::conditional_t< + std::is_same_v, + ck_tile::CShuffleEpilogue>, + ck_tile::Default2DEpilogue>>; + +template +using CodegenGemmTraits = ck_tile::TileGemmTraits; + +template +using CodegenPipelineProblem = + ck_tile::GemmPipelineProblem>; + +using CodegenGemmPolicy = ck_tile::UniversalGemmPipelineAgBgCrPolicy; + 
+template +using CodegenGemmPipeline = + ck_tile::GemmPipelineAGmemBGmemCRegV1, + CodegenGemmPolicy>; + +template +using Kernel = ck_tile::GroupedGemmKernel, + GemmEpilogue>; +}; // namespace + +std::size_t GetWorkspaceSize(const std::vector& gemm_descs) +{ + return ::Kernel::GetWorkSpaceSize(gemm_descs); +} + +template +float grouped_gemm(const std::vector& gemm_descs, + const ck_tile::stream_config& s, + void* p_workspace_) +{ + using GroupedGemmKernel = ::Kernel; + + auto arguments = GroupedGemmKernel::MakeKargs(gemm_descs); + + const dim3 grids = GroupedGemmKernel::GridSize(gemm_descs); + constexpr dim3 blocks = GroupedGemmKernel::BlockSize(); + + ck_tile::hip_check_error(hipMemcpyWithStream( + p_workspace_, + arguments.data(), + arguments.size() * sizeof(typename GroupedGemmKernel::GemmTransKernelArg), + hipMemcpyHostToDevice, + s.stream_id_)); + + if(s.log_level_ > 0) + { + std::cout << "Launching kernel with args:" + << " grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}" + << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z << "}" + << std::endl; + } + + float ave_time = + ck_tile::launch_kernel(s, + ck_tile::make_kernel( + GroupedGemmKernel{}, + grids, + blocks, + 0, + ck_tile::cast_pointer_to_constant_address_space(p_workspace_), + gemm_descs.size())); + return ave_time; +} + +#include "run_grouped_gemm_example.inc" + +int main(int argc, char* argv[]) { return !run_grouped_gemm_example(argc, argv); } diff --git a/example/ck_tile/17_grouped_gemm/grouped_gemm.hpp b/example/ck_tile/17_grouped_gemm/grouped_gemm.hpp new file mode 100644 index 000000000..94af4711d --- /dev/null +++ b/example/ck_tile/17_grouped_gemm/grouped_gemm.hpp @@ -0,0 +1,53 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include + +#include "ck_tile/core.hpp" +#include "ck_tile/host/kernel_launch.hpp" +#include "ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp" + +template +struct GemmBasicTypeConfig; + +template <> +struct GemmBasicTypeConfig +{ + using ADataType = ck_tile::half_t; + using BDataType = ck_tile::half_t; + using CDataType = ck_tile::half_t; + using AccDataType = float; +}; + +using Types = GemmBasicTypeConfig; + +// Specific type aliases for easy access +using ADataType = Types::ADataType; +using BDataType = Types::BDataType; +using AccDataType = Types::AccDataType; +using CDataType = Types::CDataType; + +using grouped_gemm_kargs = ck_tile::GroupedGemmHostArgs; + +auto create_args(int argc, char* argv[]) +{ + ck_tile::ArgParser arg_parser; + arg_parser.insert("a_layout", "R", "A tensor data layout - Row by default") + .insert("b_layout", "R", "B tensor data layout - Row by default") + .insert("c_layout", "R", "C tensor data layout - Row by default") + .insert("validate", "1", "0. No validation, 1. Validation on CPU") + .insert("warmup", "10", "number of iterations before benchmark the kernel") + .insert("repeat", "100", "number of iterations to benchmark the kernel") + .insert("group_count", "16", "group count"); + + bool result = arg_parser.parse(argc, argv); + return std::make_tuple(result, arg_parser); +} + +std::size_t GetWorkspaceSize(const std::vector& gemm_descs); + +float grouped_gemm_calc(const std::vector& gemm_descs, + const ck_tile::stream_config& s, + void* p_workspace_); diff --git a/example/ck_tile/17_grouped_gemm/run_grouped_gemm_example.inc b/example/ck_tile/17_grouped_gemm/run_grouped_gemm_example.inc new file mode 100644 index 000000000..cd5b1c286 --- /dev/null +++ b/example/ck_tile/17_grouped_gemm/run_grouped_gemm_example.inc @@ -0,0 +1,191 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +template +float invoke_gemm(int n_warmup, + int n_repeat, + int group_count, + const std::vector& args) +{ + + ck_tile::DeviceMem gemm_workspace; + gemm_workspace.Realloc(GetWorkspaceSize(args)); + + float ave_time = grouped_gemm( + args, + ck_tile::stream_config{nullptr, true, 1, n_warmup, n_repeat}, + gemm_workspace.GetDeviceBuffer()); + + std::string op_name{"Grouped Gemm"}; + + std::size_t flop = 0, num_btype = 0; + for(int j = 0; j < group_count; ++j) + { + flop += std::size_t(2) * args[j].M * args[j].N * args[j].K; + + num_btype += sizeof(ADataType) * args[j].M * args[j].K + + sizeof(BDataType) * args[j].K * args[j].N + + sizeof(CDataType) * args[j].M * args[j].N; + } + + float tflops = static_cast(flop) / 1.E9 / ave_time; + float gb_per_sec = num_btype / 1.E6 / ave_time; + + std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, " + << gb_per_sec << " GB/s, " << op_name << std::endl; + + return ave_time; +} + +template +int run_grouped_gemm_example_with_layouts(int argc, + char* argv[], + const ALayout a_layout = ALayout{}, + const BLayout b_layout = BLayout{}, + [[maybe_unused]] const CLayout c_layout = CLayout{}) +{ + auto [result, arg_parser] = create_args(argc, argv); + + if(!result) + { + return -1; + }; + + const int group_count = arg_parser.get_int("group_count"); + const int repeat = arg_parser.get_int("repeat"); + const int warmup = arg_parser.get_int("warmup"); + + std::vector Ms; + std::vector Ns; + std::vector Ks; + std::vector stride_As; + std::vector stride_Bs; + std::vector stride_Cs; + + for(int i = 0; i < group_count; i++) + { + Ms.push_back(256 + 256 * i); + Ns.push_back(128 + 128 * i); + Ks.push_back(128 + 64 * i); + + stride_As.push_back(Ks[i]); + stride_Bs.push_back(Ks[i]); + stride_Cs.push_back(Ns[i]); + } + + std::vector> a_m_k_tensors; + std::vector> b_k_n_tensors; + std::vector> c_m_n_tensors; + + a_m_k_tensors.reserve(group_count); + b_k_n_tensors.reserve(group_count); + 
c_m_n_tensors.reserve(group_count); + + std::vector> a_m_k_dev_buf; + std::vector> b_k_n_dev_buf; + std::vector> c_m_n_dev_buf; + + a_m_k_dev_buf.reserve(group_count); + b_k_n_dev_buf.reserve(group_count); + c_m_n_dev_buf.reserve(group_count); + + std::vector gemm_descs; + gemm_descs.reserve(group_count); + + for(int i = 0; i < group_count; ++i) + { + const ck_tile::index_t M = Ms[i]; + const ck_tile::index_t N = Ns[i]; + const ck_tile::index_t K = Ks[i]; + + stride_As[i] = f_get_default_stride(M, N, stride_As[i], a_layout); + stride_Bs[i] = f_get_default_stride(K, N, stride_Bs[i], b_layout); + stride_Cs[i] = f_get_default_stride(M, N, stride_Cs[i], CLayout{}); + + a_m_k_tensors.push_back( + ck_tile::HostTensor(f_host_tensor_descriptor(M, K, stride_As[i], a_layout))); + b_k_n_tensors.push_back( + ck_tile::HostTensor(f_host_tensor_descriptor(K, N, stride_Bs[i], b_layout))); + c_m_n_tensors.push_back(ck_tile::HostTensor( + f_host_tensor_descriptor(M, N, stride_Cs[i], CLayout{}))); + + std::cout << "gemm[" << i << "]" + << " a_m_k: " << a_m_k_tensors[i].mDesc << " b_k_n: " << b_k_n_tensors[i].mDesc + << " c_m_n: " << c_m_n_tensors[i].mDesc << std::endl; + + ck_tile::FillUniformDistribution{-5.f, 5.f}(a_m_k_tensors[i]); + ck_tile::FillUniformDistribution{-5.f, 5.f}(b_k_n_tensors[i]); + + a_m_k_dev_buf.push_back(std::make_unique( + a_m_k_tensors[i].get_element_space_size_in_bytes())); + b_k_n_dev_buf.push_back(std::make_unique( + b_k_n_tensors[i].get_element_space_size_in_bytes())); + c_m_n_dev_buf.push_back(std::make_unique( + c_m_n_tensors[i].get_element_space_size_in_bytes())); + + a_m_k_dev_buf[i]->ToDevice(a_m_k_tensors[i].data()); + b_k_n_dev_buf[i]->ToDevice(b_k_n_tensors[i].data()); + c_m_n_dev_buf[i]->SetZero(); + c_m_n_tensors[i].SetZero(); + + const void* p_a = a_m_k_dev_buf[i]->GetDeviceBuffer(); + const void* p_b = b_k_n_dev_buf[i]->GetDeviceBuffer(); + void* p_c = c_m_n_dev_buf[i]->GetDeviceBuffer(); + + gemm_descs.push_back({p_a, p_b, p_c, M, N, K, 
stride_As[i], stride_Bs[i], stride_Cs[i]}); + } + + invoke_gemm(warmup, repeat, group_count, gemm_descs); + + for(int i = 0; i < group_count; i++) + { + c_m_n_dev_buf[i]->FromDevice(c_m_n_tensors[i].data()); + } + + bool pass{true}; + if(arg_parser.get_int("validate")) + { + for(int i = 0; i < group_count; ++i) + { + ck_tile::HostTensor c_m_n_host_ref( + f_host_tensor_descriptor(Ms[i], Ns[i], stride_Cs[i], CLayout{})); + c_m_n_host_ref.SetZero(); + ck_tile::reference_gemm( + a_m_k_tensors[i], b_k_n_tensors[i], c_m_n_host_ref); + pass &= ck_tile::check_err(c_m_n_tensors[i], c_m_n_host_ref); + } + std::cout << "The CPU veification result is:" << (pass ? "correct" : "fail") << std::endl; + } + + return pass; +} + +int run_grouped_gemm_example(int argc, char* argv[]) +{ + auto [result, arg_parser] = create_args(argc, argv); + if(!result) + { + return -1; + } + + const std::string a_layout = arg_parser.get_str("a_layout"); + const std::string b_layout = arg_parser.get_str("b_layout"); + + using Row = ck_tile::tensor_layout::gemm::RowMajor; + using Col = ck_tile::tensor_layout::gemm::ColumnMajor; + + if(a_layout == "R" && b_layout == "C") + { + return run_grouped_gemm_example_with_layouts(argc, argv, Row{}, Col{}, Row{}); + } + else if(a_layout == "R" && b_layout == "R") + { + return run_grouped_gemm_example_with_layouts(argc, argv, Row{}, Row{}, Row{}); + } + else + { + throw std::runtime_error("Unsupported data layout configuration for A,B and C tensors!"); + } +} diff --git a/example/ck_tile/17_grouped_gemm/utils.hpp b/example/ck_tile/17_grouped_gemm/utils.hpp new file mode 100644 index 000000000..bb3cdf9fd --- /dev/null +++ b/example/ck_tile/17_grouped_gemm/utils.hpp @@ -0,0 +1,38 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +template +constexpr auto +f_host_tensor_descriptor(std::size_t row, std::size_t col, std::size_t stride, TLayout layout) +{ + using namespace ck_tile::literals; + + if constexpr(std::is_same_v) + { + return ck_tile::HostTensorDescriptor({row, col}, {stride, 1_uz}); + } + else + { + return ck_tile::HostTensorDescriptor({row, col}, {1_uz, stride}); + } +} +template +constexpr auto +f_get_default_stride(std::size_t row, std::size_t col, std::size_t stride, TLayout layout) +{ + if(stride == 0) + { + if constexpr(std::is_same_v) + { + return col; + } + else + { + return row; + } + } + else + return stride; +} diff --git a/example/ck_tile/CMakeLists.txt b/example/ck_tile/CMakeLists.txt index 51ebb5bf0..296eb1ece 100644 --- a/example/ck_tile/CMakeLists.txt +++ b/example/ck_tile/CMakeLists.txt @@ -16,3 +16,4 @@ add_subdirectory(13_moe_sorting) add_subdirectory(14_moe_smoothquant) add_subdirectory(15_fused_moe) add_subdirectory(16_batched_gemm) +add_subdirectory(17_grouped_gemm) diff --git a/include/ck_tile/core/utility/amd_address_space.hpp b/include/ck_tile/core/utility/amd_address_space.hpp new file mode 100644 index 000000000..cb242bf0d --- /dev/null +++ b/include/ck_tile/core/utility/amd_address_space.hpp @@ -0,0 +1,37 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck_tile/core/config.hpp" + +// Address Space for AMDGCN +// https://llvm.org/docs/AMDGPUUsage.html#address-space + +namespace ck_tile { + +#define CK_CONSTANT_ADDRESS_SPACE __attribute__((address_space(4))) + +template +__device__ T* cast_pointer_to_generic_address_space(T CK_CONSTANT_ADDRESS_SPACE* p) +{ + // cast a pointer in "Constant" address space (4) to "Generic" address space (0) + // only c-style pointer cast seems be able to be compiled +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wold-style-cast" + return (T*)p; // NOLINT(old-style-cast) +#pragma clang diagnostic pop +} + +template +__host__ __device__ T CK_CONSTANT_ADDRESS_SPACE* cast_pointer_to_constant_address_space(T* p) +{ + // cast a pointer in "Generic" address space (0) to "Constant" address space (4) + // only c-style pointer cast seems be able to be compiled +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wold-style-cast" + return (T CK_CONSTANT_ADDRESS_SPACE*)p; // NOLINT(old-style-cast) +#pragma clang diagnostic pop +} + +} // namespace ck_tile diff --git a/include/ck_tile/ops/gemm.hpp b/include/ck_tile/ops/gemm.hpp index b9eb24858..82d35b9c5 100644 --- a/include/ck_tile/ops/gemm.hpp +++ b/include/ck_tile/ops/gemm.hpp @@ -25,6 +25,7 @@ #include "ck_tile/ops/gemm/block/block_universal_gemm_as_bs_cr.hpp" #include "ck_tile/ops/gemm/kernel/gemm_kernel.hpp" #include "ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp" +#include "ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp" #include "ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp" #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_base.hpp" #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v3.hpp" diff --git a/include/ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp b/include/ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp index 6387233c0..8ffe681f9 100644 --- a/include/ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp +++ 
b/include/ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp @@ -35,4 +35,40 @@ struct GemmTilePartitioner return make_tuple(iM, iN); } }; + +template +struct GemmTile1DPartitioner +{ + using BlockGemmShape = remove_cvref_t; + + static constexpr index_t MPerBlock = BlockGemmShape::kM; + static constexpr index_t NPerBlock = BlockGemmShape::kN; + static constexpr index_t KPerBlock = BlockGemmShape::kK; + + CK_TILE_HOST static constexpr auto GridSize(index_t M, index_t N) + { + index_t GridDimX = (M + MPerBlock - 1) / MPerBlock; + index_t GridDimY = (N + NPerBlock - 1) / NPerBlock; + return dim3(GridDimX * GridDimY, 1, 1); + } + + CK_TILE_HOST_DEVICE static constexpr auto GetNBlock(index_t N) + { + return integer_divide_ceil(N, NPerBlock); + } + + CK_TILE_HOST_DEVICE static constexpr auto GetLoopNum(index_t K) + { + return integer_divide_ceil(K, KPerBlock); + } + + CK_TILE_DEVICE auto operator()(index_t blockOffset, index_t NBlockSize) + { + index_t iM = __builtin_amdgcn_readfirstlane((blockIdx.x - blockOffset) / + GetNBlock(NBlockSize) * MPerBlock); + index_t iN = __builtin_amdgcn_readfirstlane((blockIdx.x - blockOffset) % + GetNBlock(NBlockSize) * NPerBlock); + return make_tuple(iM, iN); + } +}; } // namespace ck_tile diff --git a/include/ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp b/include/ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp new file mode 100644 index 000000000..f24fc47af --- /dev/null +++ b/include/ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp @@ -0,0 +1,310 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include +#include + +#include "ck_tile/core/numeric/math.hpp" +#include "ck_tile/core/utility/literals.hpp" +#include "ck_tile/core/utility/amd_address_space.hpp" +#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_scheduler.hpp" +#include "ck_tile/core.hpp" +#include "ck_tile/ops/common.hpp" +#include "ck_tile/host.hpp" + +namespace ck_tile { + +struct GroupedGemmHostArgs +{ + const void* a_ptr; + const void* b_ptr; + void* c_ptr; + index_t M; + index_t N; + index_t K; + index_t stride_A; + index_t stride_B; + index_t stride_C; +}; + +template +struct GroupedGemmKernel +{ + using TilePartitioner = remove_cvref_t; + using GemmPipeline = remove_cvref_t; + using EpiloguePipeline = remove_cvref_t; + using ALayout = remove_cvref_t; + using BLayout = remove_cvref_t; + using CLayout = remove_cvref_t; + static constexpr index_t KernelBlockSize = GemmPipeline::BlockSize; + + using ADataType = remove_cvref_t; + using BDataType = remove_cvref_t; + using CDataType = remove_cvref_t; + + struct GemmTransKernelArg + { + GroupedGemmHostArgs group_karg; + ck_tile::index_t block_start; + ck_tile::index_t block_end; + + GemmTransKernelArg() = default; + GemmTransKernelArg(GroupedGemmHostArgs&& karg, index_t bl_start, index_t bl_end) + : group_karg{karg}, block_start{bl_start}, block_end{bl_end} + { + } + }; + + __host__ static size_t GetWorkSpaceSize(const std::vector& gemm_descs) + { + return gemm_descs.size() * sizeof(GemmTransKernelArg); + } + + __host__ static constexpr auto BlockSize() { return dim3(KernelBlockSize); } + + using Hargs = GroupedGemmHostArgs; + // Total 1D grid: the summed tile counts of exactly those groups that MakeKargs schedules. 
+ __host__ static constexpr auto GridSize(const std::vector& gemm_descs) + { + index_t grid_size = 0; + for(const auto& it_desc : gemm_descs) + { + const auto dim3 = TilePartitioner::GridSize(it_desc.M, it_desc.N); // zero blocks when M or N is 0 + grid_size += (it_desc.K == 0) ? 0 : dim3.x * dim3.y * 1; // MakeKargs also drops K == 0 groups; give them no blocks + } + return dim3(grid_size, 1, 1); + } + + CK_TILE_HOST static auto MakeKargs(const std::vector& gemm_descs) + { + std::vector gemm_kernel_args_; 
+ index_t group_count = ck_tile::type_convert(gemm_descs.size()); + index_t grid_size = 0; + gemm_kernel_args_.reserve(group_count); + + for(std::size_t i = 0; i < gemm_descs.size(); ++i) + { + const index_t M = gemm_descs[i].M; + const index_t N = gemm_descs[i].N; + const index_t K = gemm_descs[i].K; + + if(M == 0 || N == 0 || K == 0) + { + continue; + } + + const index_t stride_a = gemm_descs[i].stride_A; + const index_t stride_b = gemm_descs[i].stride_B; + const index_t stride_c = gemm_descs[i].stride_C; + + const auto dim3 = TilePartitioner::GridSize(M, N); + const index_t grid_size_grp = dim3.x * 1 * 1; + + const index_t block_start = grid_size; + const index_t block_end = grid_size + grid_size_grp; + + grid_size += grid_size_grp; + + auto karg = GroupedGemmHostArgs{type_convert(gemm_descs[i].a_ptr), + type_convert(gemm_descs[i].b_ptr), + type_convert(gemm_descs[i].c_ptr), + M, + N, + K, + stride_a, + stride_b, + stride_c}; + + gemm_kernel_args_.emplace_back(std::move(karg), block_start, block_end); + } + + return gemm_kernel_args_; + } + + CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize() + { + return max(GemmPipeline::GetSmemSize(), EpiloguePipeline::GetSmemSize()); + } + + CK_TILE_DEVICE void Run(const Hargs& kargs, const index_t block_start) const + { + const auto [i_m, i_n] = TilePartitioner{}(block_start, kargs.N); + // options + const ADataType* a_start = static_cast(kargs.a_ptr); + const BDataType* b_start = static_cast(kargs.b_ptr); + // Convert pointers to tensor views + auto a_tensor_view = [&]() { + if constexpr(std::is_same_v) + { + return make_naive_tensor_view( + a_start, + make_tuple(kargs.M, kargs.K), + make_tuple(kargs.stride_A, 1), + number{}, + number<1>{}); + } + else + { + return make_naive_tensor_view( + a_start, + make_tuple(kargs.M, kargs.K), + make_tuple(1, kargs.stride_A), + number<1>{}, + number<1>{}); + } + }(); + + auto b_tensor_view = [&]() { + if constexpr(std::is_same_v) + { + return make_naive_tensor_view( + b_start, 
+ make_tuple(kargs.N, kargs.K), + make_tuple(1, kargs.stride_B), + number<1>{}, + number<1>{}); + } + else + { + return make_naive_tensor_view( + b_start, + make_tuple(kargs.N, kargs.K), + make_tuple(kargs.stride_B, 1), + number{}, + number<1>{}); + } + }(); + + auto a_pad_view = [&]() { + if constexpr(std::is_same_v) + { + return pad_tensor_view(a_tensor_view, + make_tuple(number{}, + number{}), + sequence{}); + } + else + { + return pad_tensor_view(a_tensor_view, + make_tuple(number{}, + number{}), + sequence{}); + } + }(); + // clang-format on + + auto a_block_window = make_tile_window( + a_pad_view, + make_tuple(number{}, number{}), + {i_m, 0}); + + auto b_pad_view = [&]() { + if constexpr(std::is_same_v) + { + return pad_tensor_view(b_tensor_view, + make_tuple(number{}, + number{}), + sequence{}); + } + else + { + return pad_tensor_view(b_tensor_view, + make_tuple(number{}, + number{}), + sequence{}); + } + }(); + + auto b_block_window = make_tile_window( + b_pad_view, + make_tuple(number{}, number{}), + {i_n, 0}); + + // allocate LDS + __shared__ char smem_ptr[GetSmemSize()]; + + const index_t num_loop = TilePartitioner::GetLoopNum(kargs.K); + + // Run GEMM cooperatively by whole workgroup. 
+ auto c_block_tile = + GemmPipeline{}.template operator()(a_block_window, b_block_window, num_loop, smem_ptr); + + CDataType* c_start = static_cast(kargs.c_ptr); + auto c_tensor_view = [&]() { + if constexpr(std::is_same_v) + { + return make_naive_tensor_view( + c_start, + make_tuple(kargs.M, kargs.N), + make_tuple(kargs.stride_C, 1), + number{}, + number<1>{}); + } + else + { + return make_naive_tensor_view( + c_start, + make_tuple(kargs.M, kargs.N), + make_tuple(1, kargs.stride_C), + number<1>{}, + number<1>{}); + } + }(); + + auto c_pad_view = [&]() { + if constexpr(std::is_same_v) + { + return pad_tensor_view(c_tensor_view, + make_tuple(number{}, + number{}), + sequence{}); + } + else + { + return pad_tensor_view(c_tensor_view, + make_tuple(number{}, + number{}), + sequence{}); + } + }(); + auto CBlockWindow_pad = make_tile_window( + c_pad_view, + make_tuple(number{}, number{}), + {i_m, i_n}); + + EpiloguePipeline{}(CBlockWindow_pad, c_block_tile); + } + + CK_TILE_DEVICE void operator()(const void CK_CONSTANT_ADDRESS_SPACE* gemm_descs_const, + int group_count) const + { + const index_t block_id = ck_tile::get_block_1d_id(); + const auto gemm_desc_ptr = reinterpret_cast( + cast_pointer_to_generic_address_space(gemm_descs_const)); + + index_t left = 0; + index_t right = group_count; + index_t group_id = index_t((left + right) / 2); + + while((!(block_id >= gemm_desc_ptr[group_id].block_start && + block_id < gemm_desc_ptr[group_id].block_end)) && + left <= right) + { + if(block_id < gemm_desc_ptr[group_id].block_start) + { + right = group_id; + } + else + { + left = group_id; + } + group_id = index_t((left + right) / 2); + } + + Run(gemm_desc_ptr[group_id].group_karg, gemm_desc_ptr[group_id].block_start); + } +}; + +} // namespace ck_tile diff --git a/test/ck_tile/CMakeLists.txt b/test/ck_tile/CMakeLists.txt index fd0de0f9c..77cf35f66 100644 --- a/test/ck_tile/CMakeLists.txt +++ b/test/ck_tile/CMakeLists.txt @@ -1,3 +1,4 @@ add_subdirectory(image_to_column) 
add_subdirectory(gemm) add_subdirectory(batched_gemm) +add_subdirectory(grouped_gemm) diff --git a/test/ck_tile/grouped_gemm/CMakeLists.txt b/test/ck_tile/grouped_gemm/CMakeLists.txt new file mode 100644 index 000000000..f4845847f --- /dev/null +++ b/test/ck_tile/grouped_gemm/CMakeLists.txt @@ -0,0 +1,4 @@ +# Currently ck_tile is only built on gfx9 +if(GPU_TARGETS MATCHES "gfx9") + add_gtest_executable(test_ck_tile_grouped_gemm test_grouped_gemm.cpp) +endif() diff --git a/test/ck_tile/grouped_gemm/test_grouped_gemm.cpp b/test/ck_tile/grouped_gemm/test_grouped_gemm.cpp new file mode 100644 index 000000000..1bce0f8aa --- /dev/null +++ b/test/ck_tile/grouped_gemm/test_grouped_gemm.cpp @@ -0,0 +1,29 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "gtest/gtest.h" + +#include "ck_tile/host.hpp" +#include "test_grouped_gemm_util.hpp" + +using F16 = ck_tile::half_t; +using F32 = float; + +using Row = ck_tile::tensor_layout::gemm::RowMajor; +using Col = ck_tile::tensor_layout::gemm::ColumnMajor; + +// clang-format off +using KernelTypes = ::testing::Types< + // ALayout, BLayout, CLayout, ADataType, BDataType, AccDataType, CDataType + std::tuple< Row, Row, Row, F16, F16, F32, F16>, + //std::tuple< Col, Row, Row, F16, F16, F32, F16>, + std::tuple< Row, Col, Row, F16, F16, F32, F16>//, + //std::tuple< Col, Col, Row, F16, F16, F32, F16> + >; +// clang-format on + +TYPED_TEST_SUITE(TestCkTileGroupedGemm, KernelTypes); + +#include "test_grouped_gemm_ut_cases.inc" diff --git a/test/ck_tile/grouped_gemm/test_grouped_gemm_ut_cases.inc b/test/ck_tile/grouped_gemm/test_grouped_gemm_ut_cases.inc new file mode 100644 index 000000000..68c4693bb --- /dev/null +++ b/test/ck_tile/grouped_gemm/test_grouped_gemm_ut_cases.inc @@ -0,0 +1,25 @@ +#pragma once + +TYPED_TEST(TestCkTileGroupedGemm, Basic) +{ + const int group_count = 16; + std::vector Ms; + std::vector Ns; + std::vector Ks; + std::vector stride_As; 
+ std::vector stride_Bs; + std::vector stride_Cs; + + for(int i = 0; i < group_count; i++) + { + Ms.push_back(256 + 256 * i); + Ns.push_back(128 + 128 * i); + Ks.push_back(128 + 64 * i); + + stride_As.push_back(Ks[i]); + stride_Bs.push_back(Ks[i]); + stride_Cs.push_back(Ns[i]); + } + + this->Run(Ms, Ns, Ks, stride_As, stride_Bs, stride_Cs, group_count); +} diff --git a/test/ck_tile/grouped_gemm/test_grouped_gemm_util.hpp b/test/ck_tile/grouped_gemm/test_grouped_gemm_util.hpp new file mode 100644 index 000000000..f532de21d --- /dev/null +++ b/test/ck_tile/grouped_gemm/test_grouped_gemm_util.hpp @@ -0,0 +1,282 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. +#pragma once + +#include +#include + +#include "ck_tile/core.hpp" +#include "ck_tile/host.hpp" +#include "ck_tile/host/kernel_launch.hpp" +#include "ck_tile/ops/epilogue.hpp" +#include "ck_tile/ops/gemm.hpp" +#include "ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp" + +template +class TestCkTileGroupedGemm : public ::testing::Test +{ + protected: + using ALayout = std::tuple_element_t<0, Tuple>; + using BLayout = std::tuple_element_t<1, Tuple>; + using CLayout = std::tuple_element_t<2, Tuple>; + using ADataType = std::tuple_element_t<3, Tuple>; + using BDataType = std::tuple_element_t<4, Tuple>; + using AccDataType = std::tuple_element_t<5, Tuple>; + using CDataType = std::tuple_element_t<6, Tuple>; + + struct GroupedGemKernelParam + { + static const bool kPadM = false; + static const bool kPadN = false; + static const bool kPadK = false; + static const bool kTilePermute = false; + + static const ck_tile::index_t kOutputRank = 2; + + static const int kBlockPerCu = 1; + static const ck_tile::index_t M_Tile = 128; + static const ck_tile::index_t N_Tile = 128; + static const ck_tile::index_t K_Tile = 32; + + static const ck_tile::index_t M_Warp = 2; + static const ck_tile::index_t N_Warp = 2; + static const ck_tile::index_t K_Warp = 1; + + static const 
ck_tile::index_t M_Warp_Tile = 32; + static const ck_tile::index_t N_Warp_Tile = 32; + static const ck_tile::index_t K_Warp_Tile = 8; + }; + + using CodegenGemmShape = + ck_tile::TileGemmShape, + ck_tile::sequence, + ck_tile::sequence>; + + using TilePartitioner = ck_tile::GemmTile1DPartitioner; + + template + using GemmEpilogue = + std::conditional_t, + ck_tile::CShuffleEpilogue< + ck_tile::CShuffleEpilogueProblem>, + ck_tile::Default2DEpilogue< + ck_tile::Default2DEpilogueProblem>>; + + template + using CodegenGemmTraits = ck_tile::TileGemmTraits; + + template + using CodegenPipelineProblem = + ck_tile::GemmPipelineProblem>; + + using CodegenGemmPolicy = ck_tile::UniversalGemmPipelineAgBgCrPolicy; + + template + using CodegenGemmPipeline = + ck_tile::GemmPipelineAGmemBGmemCRegV1, + CodegenGemmPolicy>; + + template + using Kernel = ck_tile::GroupedGemmKernel, + GemmEpilogue>; + + using grouped_gemm_kargs = ck_tile::GroupedGemmHostArgs; + std::size_t GetWorkspaceSize(const std::vector& gemm_descs) + { + return Kernel::GetWorkSpaceSize(gemm_descs); + } + + template + void invoke_grouped_gemm(const std::vector& gemm_descs, + const ck_tile::stream_config& s, + void* p_workspace_) + { + using GroupedGemmKernel = Kernel; + + auto arguments = GroupedGemmKernel::MakeKargs(gemm_descs); + + const dim3 grids = GroupedGemmKernel::GridSize(gemm_descs); + constexpr dim3 blocks = GroupedGemmKernel::BlockSize(); + + ck_tile::hip_check_error(hipMemcpyWithStream( + p_workspace_, + arguments.data(), + arguments.size() * sizeof(typename GroupedGemmKernel::GemmTransKernelArg), + hipMemcpyHostToDevice, + s.stream_id_)); + + if(s.log_level_ > 0) + { + std::cout << "Launching kernel with args:" + << " grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}" + << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z << "}" + << std::endl; + } + ck_tile::launch_kernel(s, + ck_tile::make_kernel( + GroupedGemmKernel{}, + grids, + blocks, + 0, + 
ck_tile::cast_pointer_to_constant_address_space(p_workspace_), + gemm_descs.size())); + } + + public: + void Run(const std::vector& Ms, + const std::vector& Ns, + const std::vector& Ks, + std::vector& stride_As, + std::vector& stride_Bs, + std::vector& stride_Cs, + const int group_count = 16) + { + using namespace ck_tile::literals; + auto f_host_tensor_descriptor = [](std::size_t row, + std::size_t col, + std::size_t stride, + auto layout) { + if constexpr(std::is_same_v) + { + return ck_tile::HostTensorDescriptor({row, col}, {stride, 1_uz}); + } + else + { + return ck_tile::HostTensorDescriptor({row, col}, {1_uz, stride}); + } + }; + + auto f_get_default_stride = + [](std::size_t row, std::size_t col, std::size_t stride, auto layout) { + if(stride == 0) + { + if constexpr(std::is_same_v) + { + return col; + } + else + { + return row; + } + } + else + return stride; + }; + + std::vector> a_m_k_tensors; + std::vector> b_k_n_tensors; + std::vector> c_m_n_tensors; + + a_m_k_tensors.reserve(group_count); + b_k_n_tensors.reserve(group_count); + c_m_n_tensors.reserve(group_count); + + std::vector> a_m_k_dev_buf; + std::vector> b_k_n_dev_buf; + std::vector> c_m_n_dev_buf; + + a_m_k_dev_buf.reserve(group_count); + b_k_n_dev_buf.reserve(group_count); + c_m_n_dev_buf.reserve(group_count); + + std::vector gemm_descs; + gemm_descs.reserve(group_count); + + for(int i = 0; i < group_count; ++i) + { + const ck_tile::index_t M = Ms[i]; + const ck_tile::index_t N = Ns[i]; + const ck_tile::index_t K = Ks[i]; + + stride_As[i] = f_get_default_stride(M, N, stride_As[i], ALayout{}); + stride_Bs[i] = f_get_default_stride(K, N, stride_Bs[i], BLayout{}); + stride_Cs[i] = f_get_default_stride(M, N, stride_Cs[i], CLayout{}); + + a_m_k_tensors.push_back(ck_tile::HostTensor( + f_host_tensor_descriptor(M, K, stride_As[i], ALayout{}))); + b_k_n_tensors.push_back(ck_tile::HostTensor( + f_host_tensor_descriptor(K, N, stride_Bs[i], BLayout{}))); + c_m_n_tensors.push_back(ck_tile::HostTensor( + 
f_host_tensor_descriptor(M, N, stride_Cs[i], CLayout{}))); + + std::cout << "gemm[" << i << "]" + << " a_m_k: " << a_m_k_tensors[i].mDesc + << " b_k_n: " << b_k_n_tensors[i].mDesc + << " c_m_n: " << c_m_n_tensors[i].mDesc << std::endl; + + ck_tile::FillUniformDistribution{-5.f, 5.f}(a_m_k_tensors[i]); + ck_tile::FillUniformDistribution{-5.f, 5.f}(b_k_n_tensors[i]); + + a_m_k_dev_buf.push_back(std::make_unique( + a_m_k_tensors[i].get_element_space_size_in_bytes())); + b_k_n_dev_buf.push_back(std::make_unique( + b_k_n_tensors[i].get_element_space_size_in_bytes())); + c_m_n_dev_buf.push_back(std::make_unique( + c_m_n_tensors[i].get_element_space_size_in_bytes())); + + a_m_k_dev_buf[i]->ToDevice(a_m_k_tensors[i].data()); + b_k_n_dev_buf[i]->ToDevice(b_k_n_tensors[i].data()); + c_m_n_dev_buf[i]->SetZero(); + c_m_n_tensors[i].SetZero(); + + const void* p_a = a_m_k_dev_buf[i]->GetDeviceBuffer(); + const void* p_b = b_k_n_dev_buf[i]->GetDeviceBuffer(); + void* p_c = c_m_n_dev_buf[i]->GetDeviceBuffer(); + + gemm_descs.push_back( + {p_a, p_b, p_c, M, N, K, stride_As[i], stride_Bs[i], stride_Cs[i]}); + } + + ck_tile::DeviceMem gemm_workspace; + gemm_workspace.Realloc(GetWorkspaceSize(gemm_descs)); + + invoke_grouped_gemm( + gemm_descs, ck_tile::stream_config{nullptr, false}, gemm_workspace.GetDeviceBuffer()); + + for(int i = 0; i < group_count; i++) + { + c_m_n_dev_buf[i]->FromDevice(c_m_n_tensors[i].data()); + } + + bool pass{true}; + for(int i = 0; i < group_count; ++i) + { + ck_tile::HostTensor c_m_n_host_ref( + f_host_tensor_descriptor(Ms[i], Ns[i], stride_Cs[i], CLayout{})); + c_m_n_host_ref.SetZero(); + ck_tile::reference_gemm( + a_m_k_tensors[i], b_k_n_tensors[i], c_m_n_host_ref); + pass &= ck_tile::check_err(c_m_n_tensors[i], c_m_n_host_ref); + } + EXPECT_TRUE(pass); + } +}; -- GitLab From d2d1d177ffe04f0ff25fed0aedcb3ede0e07c51b Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 4 Dec 2024 22:05:47 -0800 
Subject: [PATCH 101/153] Bump rocm-docs-core from 1.10.0 to 1.11.0 in /docs/sphinx (#1720) Bumps [rocm-docs-core](https://github.com/ROCm/rocm-docs-core) from 1.10.0 to 1.11.0. - [Release notes](https://github.com/ROCm/rocm-docs-core/releases) - [Changelog](https://github.com/ROCm/rocm-docs-core/blob/develop/CHANGELOG.md) - [Commits](https://github.com/ROCm/rocm-docs-core/compare/v1.10.0...v1.11.0) --- updated-dependencies: - dependency-name: rocm-docs-core dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- docs/sphinx/requirements.in | 2 +- docs/sphinx/requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/sphinx/requirements.in b/docs/sphinx/requirements.in index 9969824d2..d1b3465b9 100644 --- a/docs/sphinx/requirements.in +++ b/docs/sphinx/requirements.in @@ -1,2 +1,2 @@ -rocm-docs-core==1.10.0 +rocm-docs-core==1.11.0 sphinxcontrib-bibtex==2.6.3 diff --git a/docs/sphinx/requirements.txt b/docs/sphinx/requirements.txt index bb731db2d..26d0aa244 100644 --- a/docs/sphinx/requirements.txt +++ b/docs/sphinx/requirements.txt @@ -103,7 +103,7 @@ requests==2.32.3 # via # pygithub # sphinx -rocm-docs-core==1.10.0 +rocm-docs-core==1.11.0 # via -r requirements.in six==1.16.0 # via pybtex -- GitLab From feb9a2bd9b50da9d449e5931e936d527a0db89fe Mon Sep 17 00:00:00 2001 From: jakpiase Date: Thu, 5 Dec 2024 09:02:13 +0100 Subject: [PATCH 102/153] Add IsSupportedArgument() to gemm_kernel (#1698) * add IsSupportedArgument to gemm_kernel * add ut and do some refactoring * switched to ck_tile's integral_constant --- example/ck_tile/03_gemm/gemm_basic.cpp | 5 ++ example/ck_tile/03_gemm/universal_gemm.cpp | 5 ++ .../ck_tile/ops/gemm/kernel/gemm_kernel.hpp | 73 +++++++++++++++++++ test/ck_tile/gemm/test_gemm_mem_pipeline.cpp | 42 +++++------ .../gemm/test_gemm_mem_pipeline_ut_cases.inc | 59 
+++------------ .../gemm/test_gemm_mem_pipeline_util.hpp | 22 ++++-- 6 files changed, 128 insertions(+), 78 deletions(-) diff --git a/example/ck_tile/03_gemm/gemm_basic.cpp b/example/ck_tile/03_gemm/gemm_basic.cpp index b7d869344..f5260c306 100644 --- a/example/ck_tile/03_gemm/gemm_basic.cpp +++ b/example/ck_tile/03_gemm/gemm_basic.cpp @@ -92,6 +92,11 @@ float gemm_calc(const gemm_basic_args& args, const ck_tile::stream_config& s) const dim3 grids = Kernel::GridSize(args.M, args.N, args.kbatch); constexpr dim3 blocks = Kernel::BlockSize(); + if(!Kernel::IsSupportedArgument(kargs)) + { + throw std::runtime_error("Wrong! Arguments not supported! Skipping gemm!\n"); + } + if(s.log_level_ > 0) { std::cout << "Launching kernel with args:" diff --git a/example/ck_tile/03_gemm/universal_gemm.cpp b/example/ck_tile/03_gemm/universal_gemm.cpp index eaafc13b9..6c87ca008 100644 --- a/example/ck_tile/03_gemm/universal_gemm.cpp +++ b/example/ck_tile/03_gemm/universal_gemm.cpp @@ -119,6 +119,11 @@ float gemm_calc(const gemm_basic_args& args, const ck_tile::stream_config& s) const dim3 grids = Kernel::GridSize(args.M, args.N, args.kbatch); constexpr dim3 blocks = Kernel::BlockSize(); + if(!Kernel::IsSupportedArgument(kargs)) + { + throw std::runtime_error("Wrong! Arguments not supported! 
Skipping gemm!\n"); + } + if(s.log_level_ > 0) { std::cout << "Launching kernel with args:" diff --git a/include/ck_tile/ops/gemm/kernel/gemm_kernel.hpp b/include/ck_tile/ops/gemm/kernel/gemm_kernel.hpp index 96af6e826..763d8cad9 100644 --- a/include/ck_tile/ops/gemm/kernel/gemm_kernel.hpp +++ b/include/ck_tile/ops/gemm/kernel/gemm_kernel.hpp @@ -66,6 +66,79 @@ struct GemmKernel return max(GemmPipeline::GetSmemSize(), EpiloguePipeline::GetSmemSize()); } + CK_TILE_HOST static bool IsSupportedArgument(const GemmCommonKargs& kargs) + { + if constexpr(std::is_same_v) + { + if(kargs.K % TilePartitioner::kK != 0 && GemmPipeline::kPadK == false) + { + return false; + } + if(kargs.K % GemmPipeline::VectorSizeA != 0) + { + return false; + } + } + else + { + if(kargs.M % TilePartitioner::kM != 0 && GemmPipeline::kPadM == false) + { + return false; + } + if(kargs.M % GemmPipeline::VectorSizeA != 0) + { + return false; + } + } + + if constexpr(std::is_same_v) + { + if(kargs.N % TilePartitioner::kN != 0 && GemmPipeline::kPadN == false) + { + return false; + } + if(kargs.N % GemmPipeline::VectorSizeB != 0) + { + return false; + } + } + else + { + if(kargs.K % TilePartitioner::kK != 0 && GemmPipeline::kPadK == false) + { + return false; + } + if(kargs.K % GemmPipeline::VectorSizeB != 0) + { + return false; + } + } + + if constexpr(std::is_same_v) + { + if(kargs.N % TilePartitioner::kN != 0 && GemmPipeline::kPadN == false) + { + return false; + } + if(kargs.N % GemmPipeline::VectorSizeC != 0) + { + return false; + } + } + else + { + if(kargs.M % TilePartitioner::kM != 0 && GemmPipeline::kPadM == false) + { + return false; + } + if(kargs.M % GemmPipeline::VectorSizeC != 0) + { + return false; + } + } + return true; + } + CK_TILE_DEVICE void operator()(GemmCommonKargs kargs) const { const auto [i_m, i_n] = TilePartitioner{}(); diff --git a/test/ck_tile/gemm/test_gemm_mem_pipeline.cpp b/test/ck_tile/gemm/test_gemm_mem_pipeline.cpp index a1c80fee4..aeb383c87 100644 --- 
a/test/ck_tile/gemm/test_gemm_mem_pipeline.cpp +++ b/test/ck_tile/gemm/test_gemm_mem_pipeline.cpp @@ -8,35 +8,29 @@ #include "ck_tile/host.hpp" #include "test_gemm_mem_pipeline_util.hpp" -using F16 = ck_tile::half_t; -using F32 = float; - -using Row = ck_tile::tensor_layout::gemm::RowMajor; -using Col = ck_tile::tensor_layout::gemm::ColumnMajor; -static constexpr auto Intrawave = ck_tile::GemmPipelineScheduler::Intrawave; -static constexpr auto Interwave = ck_tile::GemmPipelineScheduler::Interwave; - -template -class TestCkTileGemmMemPipelineIntrawave : public TestCkTileGemmMemPipeline -{ -}; - -template -class TestCkTileGemmMemPipelineInterwave : public TestCkTileGemmMemPipeline -{ -}; +using F16 = ck_tile::half_t; +using F32 = float; +using Row = ck_tile::tensor_layout::gemm::RowMajor; +using Col = ck_tile::tensor_layout::gemm::ColumnMajor; +using Intrawave = ck_tile::integral_constant; +using Interwave = ck_tile::integral_constant; // clang-format off using KernelTypes = ::testing::Types< - // ALayout, BLayout, CLayout, ADataType, BDataType, AccDataType, CDataType - std::tuple< Row, Col, Row, F16, F16, F32, F16>, - std::tuple< Col, Row, Row, F16, F16, F32, F16>, - std::tuple< Row, Row, Row, F16, F16, F32, F16>, - std::tuple< Col, Col, Row, F16, F16, F32, F16> + // ALayout, BLayout, CLayout, ADataType, BDataType, AccDataType, CDataType, GemmPipelineScheduler + std::tuple< Row, Row, Row, F16, F16, F32, F16, Intrawave>, + std::tuple< Row, Row, Row, F16, F16, F32, F16, Interwave>, + std::tuple< Row, Col, Row, F16, F16, F32, F16, Intrawave>, + std::tuple< Row, Col, Row, F16, F16, F32, F16, Interwave>, + std::tuple< Col, Row, Row, F16, F16, F32, F16, Intrawave>, + std::tuple< Col, Row, Row, F16, F16, F32, F16, Interwave>, + std::tuple< Col, Col, Row, F16, F16, F32, F16, Intrawave>, + std::tuple< Col, Col, Row, F16, F16, F32, F16, Interwave> >; // clang-format on -TYPED_TEST_SUITE(TestCkTileGemmMemPipelineIntrawave, KernelTypes); 
-TYPED_TEST_SUITE(TestCkTileGemmMemPipelineInterwave, KernelTypes); +TYPED_TEST_SUITE(TestCkTileGemmMemPipeline, KernelTypes); #include "test_gemm_mem_pipeline_ut_cases.inc" diff --git a/test/ck_tile/gemm/test_gemm_mem_pipeline_ut_cases.inc b/test/ck_tile/gemm/test_gemm_mem_pipeline_ut_cases.inc index 6b914e797..af94d68f2 100644 --- a/test/ck_tile/gemm/test_gemm_mem_pipeline_ut_cases.inc +++ b/test/ck_tile/gemm/test_gemm_mem_pipeline_ut_cases.inc @@ -3,11 +3,7 @@ #pragma once -//------------------------------------------------------------------------------------------------ -// INTERWAVE SCHEDULER -//------------------------------------------------------------------------------------------------ - -TYPED_TEST(TestCkTileGemmMemPipelineInterwave, SmallM) +TYPED_TEST(TestCkTileGemmMemPipeline, SmallM) { std::vector Ms{1, 2, 3, 4, 5, 6}; constexpr int N = 1024; @@ -17,7 +13,7 @@ TYPED_TEST(TestCkTileGemmMemPipelineInterwave, SmallM) this->Run(M, N, K); } -TYPED_TEST(TestCkTileGemmMemPipelineInterwave, MidLargeM) +TYPED_TEST(TestCkTileGemmMemPipeline, MidLargeM) { std::vector Ms{127, 255, 312, 799, 1573}; constexpr int N = 1024; @@ -27,7 +23,7 @@ TYPED_TEST(TestCkTileGemmMemPipelineInterwave, MidLargeM) this->Run(M, N, K); } -TYPED_TEST(TestCkTileGemmMemPipelineInterwave, PaddK) +TYPED_TEST(TestCkTileGemmMemPipeline, PaddK) { std::vector Ms{127}; constexpr int N = 1024; @@ -37,7 +33,7 @@ TYPED_TEST(TestCkTileGemmMemPipelineInterwave, PaddK) this->Run(M, N, K); } -TYPED_TEST(TestCkTileGemmMemPipelineInterwave, Regular) +TYPED_TEST(TestCkTileGemmMemPipeline, Regular) { std::vector Ms{512}; constexpr int N = 1024; @@ -47,46 +43,15 @@ TYPED_TEST(TestCkTileGemmMemPipelineInterwave, Regular) this->Run(M, N, K); } -//------------------------------------------------------------------------------------------------ -// INTRAWAVE SCHEDULER -//------------------------------------------------------------------------------------------------ - 
-TYPED_TEST(TestCkTileGemmMemPipelineIntrawave, SmallM) +TYPED_TEST(TestCkTileGemmMemPipeline, NotSupportedArgument) { - std::vector Ms{1, 2, 3, 4, 5, 6}; - constexpr int N = 1024; - constexpr int K = 320; - - for(int M : Ms) - this->Run(M, N, K); -} + constexpr int M = 512; + constexpr int N = 1025; + constexpr int K = 513; -TYPED_TEST(TestCkTileGemmMemPipelineIntrawave, MidLargeM) -{ - std::vector Ms{127, 255, 312, 799, 1573}; - constexpr int N = 1024; - constexpr int K = 320; - - for(int M : Ms) - this->Run(M, N, K); -} + constexpr bool PadM = false; + constexpr bool PadN = false; + constexpr bool PadK = false; -TYPED_TEST(TestCkTileGemmMemPipelineIntrawave, PaddK) -{ - std::vector Ms{127}; - constexpr int N = 1024; - constexpr int K = 432; - - for(int M : Ms) - this->Run(M, N, K); -} - -TYPED_TEST(TestCkTileGemmMemPipelineIntrawave, Regular) -{ - std::vector Ms{512}; - constexpr int N = 1024; - constexpr int K = 512; - - for(int M : Ms) - this->Run(M, N, K); + EXPECT_THROW((this->template Run(M, N, K)), std::runtime_error); } diff --git a/test/ck_tile/gemm/test_gemm_mem_pipeline_util.hpp b/test/ck_tile/gemm/test_gemm_mem_pipeline_util.hpp index 15f9f516e..6941a7596 100644 --- a/test/ck_tile/gemm/test_gemm_mem_pipeline_util.hpp +++ b/test/ck_tile/gemm/test_gemm_mem_pipeline_util.hpp @@ -11,7 +11,7 @@ #include "ck_tile/ops/epilogue.hpp" #include "ck_tile/ops/gemm.hpp" -template +template class TestCkTileGemmMemPipeline : public ::testing::Test { protected: @@ -22,7 +22,7 @@ class TestCkTileGemmMemPipeline : public ::testing::Test using BDataType = std::tuple_element_t<4, Tuple>; using AccDataType = std::tuple_element_t<5, Tuple>; using CDataType = std::tuple_element_t<6, Tuple>; - static constexpr auto Scheduler = Scheduler_; + static constexpr auto Scheduler = std::tuple_element_t<7, Tuple>::value; // TODO: expose tile size through test t-param ? 
struct gemm_args @@ -39,6 +39,7 @@ class TestCkTileGemmMemPipeline : public ::testing::Test ck_tile::index_t stride_C; }; + template void invoke_gemm(const gemm_args& args, const ck_tile::stream_config& s) { // TODO: This should be parameterized in tests @@ -54,9 +55,9 @@ class TestCkTileGemmMemPipeline : public ::testing::Test constexpr ck_tile::index_t N_Warp_Tile = 32; constexpr ck_tile::index_t K_Warp_Tile = 8; - constexpr bool kPadM = true; - constexpr bool kPadN = true; - constexpr bool kPadK = true; + constexpr bool kPadM = PadM; + constexpr bool kPadN = PadN; + constexpr bool kPadK = PadK; constexpr int kBlockPerCu = 1; @@ -107,6 +108,11 @@ class TestCkTileGemmMemPipeline : public ::testing::Test const dim3 grids = Kernel::GridSize(args.M, args.N, args.kbatch); constexpr dim3 blocks = Kernel::BlockSize(); + if(!Kernel::IsSupportedArgument(kargs)) + { + throw std::runtime_error("Wrong! Arguments not supported! Skipping gemm!\n"); + } + if(s.log_level_ > 0) { std::cout << "Launching kernel with args:" @@ -212,6 +218,7 @@ class TestCkTileGemmMemPipeline : public ::testing::Test void SetUp() override { k_batches_ = {1}; } + template void Run(const int M, const int N, const int K, @@ -221,10 +228,11 @@ class TestCkTileGemmMemPipeline : public ::testing::Test { for(auto kb : k_batches_) { - RunSingle(M, N, K, StrideA, StrideB, StrideC, kb); + RunSingle(M, N, K, StrideA, StrideB, StrideC, kb); } } + template void RunSingle(const int M, const int N, const int K, @@ -301,7 +309,7 @@ class TestCkTileGemmMemPipeline : public ::testing::Test args.stride_B = stride_B; args.stride_C = stride_C; - invoke_gemm(args, ck_tile::stream_config{nullptr, false}); + invoke_gemm(args, ck_tile::stream_config{nullptr, false}); c_m_n_dev_buf.FromDevice(c_m_n_dev_result.data()); bool pass = true; -- GitLab From 86990558e39a99d3e2dd909e45f5d38c3b13d956 Mon Sep 17 00:00:00 2001 From: Illia Silin <98187287+illsilin@users.noreply.github.com> Date: Thu, 5 Dec 2024 17:29:12 -0800 Subject: 
[PATCH 103/153] Upgrade default compiler to ROCm6.3 (#1723) * upgrade to rocm6.3 compiler * Proposed solution to convnd test failures in ROCm 6.3 --------- Co-authored-by: Andriy Roshchenko --- Dockerfile | 13 ++++-------- Dockerfile.compiler | 2 +- Jenkinsfile | 21 ++++++++++--------- .../convscale/convnd_fwd_convscale_common.hpp | 9 ++++---- 4 files changed, 21 insertions(+), 24 deletions(-) diff --git a/Dockerfile b/Dockerfile index f9b7d76e3..6689ae08f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,6 +1,6 @@ FROM ubuntu:20.04 ARG DEBIAN_FRONTEND=noninteractive -ARG ROCMVERSION=6.2 +ARG ROCMVERSION=6.3 ARG compiler_version="" ARG compiler_commit="" ARG CK_SCCACHE="" @@ -13,17 +13,12 @@ RUN set -xe && \ apt-get update && apt-get install -y --allow-unauthenticated apt-utils wget gnupg2 curl && \ curl -fsSL https://repo.radeon.com/rocm/rocm.gpg.key | gpg --dearmor -o /etc/apt/trusted.gpg.d/rocm-keyring.gpg -RUN if [ "$ROCMVERSION" != "6.3" ]; then \ - sh -c "wget https://repo.radeon.com/amdgpu-install/$ROCMVERSION/ubuntu/focal/amdgpu-install_6.2.60200-1_all.deb --no-check-certificate" && \ - apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated ./amdgpu-install_6.2.60200-1_all.deb && \ +RUN if [ "$ROCMVERSION" != "6.4" ]; then \ + sh -c "wget https://repo.radeon.com/amdgpu-install/$ROCMVERSION/ubuntu/focal/amdgpu-install_6.3.60300-1_all.deb --no-check-certificate" && \ + apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated ./amdgpu-install_6.3.60300-1_all.deb && \ wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - && \ sh -c "echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] $DEB_ROCM_REPO focal main > /etc/apt/sources.list.d/rocm.list" && \ sh -c 'echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] https://repo.radeon.com/amdgpu/$ROCMVERSION/ubuntu focal main > /etc/apt/sources.list.d/amdgpu.list'; \ - elif [ "$ROCMVERSION" = 
"6.3" ] && [ "$compiler_version" = "rc1" ]; then \ - sh -c "wget http://artifactory-cdn.amd.com/artifactory/list/amdgpu-deb/amdgpu-install-internal_6.3-20.04-1_all.deb --no-check-certificate" && \ - apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install dialog libpopt0 rsync && DEBIAN_FRONTEND=noninteractive apt-get install ./amdgpu-install-internal_6.3-20.04-1_all.deb && \ - sh -c 'echo deb [arch=amd64 trusted=yes] http://compute-artifactory.amd.com/artifactory/list/rocm-release-archive-20.04-deb/ 6.3 rel-20 > /etc/apt/sources.list.d/rocm-build.list' && \ - amdgpu-repo --amdgpu-build=2074281; \ fi RUN sh -c "echo deb http://mirrors.kernel.org/ubuntu focal main universe | tee -a /etc/apt/sources.list" && \ diff --git a/Dockerfile.compiler b/Dockerfile.compiler index 354b71f69..3f3329092 100644 --- a/Dockerfile.compiler +++ b/Dockerfile.compiler @@ -1,4 +1,4 @@ -ARG BASE_DOCKER="rocm/composable_kernel:ck_ub20.04_rocm6.2" +ARG BASE_DOCKER="rocm/composable_kernel:ck_ub20.04_rocm6.3" FROM $BASE_DOCKER ARG compiler_version="" ARG compiler_commit="" diff --git a/Jenkinsfile b/Jenkinsfile index f8493fa2f..58cd72c8c 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -38,13 +38,14 @@ def getBaseDockerImageName(){ img = "${params.USE_CUSTOM_DOCKER}" } else{ - if (params.ROCMVERSION != "6.3"){ - img = "${env.CK_DOCKERHUB}:ck_ub20.04_rocm${params.ROCMVERSION}" - } - else{ - img = "${env.CK_DOCKERHUB_PRIVATE}:ck_ub20.04_rocm${params.ROCMVERSION}" + def ROCM_numeric = "${params.ROCMVERSION}" as float + if ( ROCM_numeric < 6.4 ){ + img = "${env.CK_DOCKERHUB}:ck_ub20.04_rocm${params.ROCMVERSION}" + } + else{ + img = "${env.CK_DOCKERHUB_PRIVATE}:ck_ub20.04_rocm${params.ROCMVERSION}" + } } - } return img } @@ -739,8 +740,8 @@ def process_results(Map conf=[:]){ } //launch develop branch daily at 23:00 UT in FULL_QA mode and at 19:00 UT with latest staging compiler version -CRON_SETTINGS = BRANCH_NAME == "develop" ? 
'''0 23 * * * % RUN_FULL_QA=true;ROCMVERSION=6.2;RUN_CK_TILE_FMHA_TESTS=true;RUN_CK_TILE_GEMM_TESTS=true - 0 21 * * * % ROCMVERSION=6.2;hipTensor_test=true;RUN_CODEGEN_TESTS=true +CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;ROCMVERSION=6.3;RUN_CK_TILE_FMHA_TESTS=true;RUN_CK_TILE_GEMM_TESTS=true + 0 21 * * * % ROCMVERSION=6.3;hipTensor_test=true;RUN_CODEGEN_TESTS=true 0 19 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-staging;BUILD_COMPILER=/llvm-project/build/bin/clang++;BUILD_GFX12=true;USE_SCCACHE=false;NINJA_BUILD_TRACE=true 0 17 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-mainline;BUILD_COMPILER=/llvm-project/build/bin/clang++;BUILD_GFX12=true;USE_SCCACHE=false;NINJA_BUILD_TRACE=true 0 15 * * * % BUILD_INSTANCES_ONLY=true;RUN_PERFORMANCE_TESTS=false;USE_SCCACHE=false @@ -765,8 +766,8 @@ pipeline { description: 'If you want to use a custom docker image, please specify it here (default: leave blank).') string( name: 'ROCMVERSION', - defaultValue: '6.2', - description: 'Specify which ROCM version to use: 6.2 (default).') + defaultValue: '6.3', + description: 'Specify which ROCM version to use: 6.3 (default).') string( name: 'COMPILER_VERSION', defaultValue: '', diff --git a/example/62_convnd_activ/convscale/convnd_fwd_convscale_common.hpp b/example/62_convnd_activ/convscale/convnd_fwd_convscale_common.hpp index 978221f8e..bf560f8a4 100644 --- a/example/62_convnd_activ/convscale/convnd_fwd_convscale_common.hpp +++ b/example/62_convnd_activ/convscale/convnd_fwd_convscale_common.hpp @@ -172,12 +172,13 @@ bool run_grouped_conv_fwd(bool do_verification, { case 0: break; case 1: - in.GenerateTensorValue(GeneratorTensor_2{-5, 5}); - wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + // values generated: -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5 + in.GenerateTensorValue(GeneratorTensor_2{-5, 6}); + wei.GenerateTensorValue(GeneratorTensor_3{-1.0, 1.0}); break; default: - 
in.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); - wei.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + in.GenerateTensorValue(GeneratorTensor_3{-5.0, 5.0}); + wei.GenerateTensorValue(GeneratorTensor_3{-1.0, 1.0}); } DeviceMem in_device_buf(sizeof(InDataType) * in.mDesc.GetElementSpaceSize()); -- GitLab From 58e7f37fc892c1e7aeca338f96ec694712e6e412 Mon Sep 17 00:00:00 2001 From: Po Yen Chen Date: Fri, 6 Dec 2024 12:59:58 +0800 Subject: [PATCH 104/153] Undo padding-flag changes in fmha_fwd_kernel.hpp (#1725) --- .../ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp index 3a66b78a5..3de433d6a 100644 --- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp +++ b/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp @@ -998,14 +998,14 @@ struct FmhaFwdKernel return pad_tensor_view( q_dram_naive, make_tuple(number{}, number{}), - sequence{}); + sequence{}); } else { return pad_tensor_view( q_dram_naive, make_tuple(number{}, number{}), - sequence{}); + sequence{}); } }(); const auto k_dram = [&]() { @@ -1019,7 +1019,7 @@ struct FmhaFwdKernel return pad_tensor_view( k_dram_naive, make_tuple(number{}, number{}), - sequence{}); + sequence{}); }(); const auto v_dram = [&]() { if constexpr(std::is_same_v) @@ -1041,7 +1041,7 @@ struct FmhaFwdKernel return pad_tensor_view( v_dram_transposed, make_tuple(number{}, number{}), - sequence{}); + sequence{}); } else { @@ -1055,7 +1055,7 @@ struct FmhaFwdKernel return pad_tensor_view( v_dram_naive, make_tuple(number{}, number{}), - sequence{}); + sequence{}); } }(); @@ -1097,8 +1097,9 @@ struct FmhaFwdKernel number{}, number<1>{}); - return pad_tensor_view( - bias_dram_naive, bias_dram_window_lengths, sequence{}); + return pad_tensor_view(bias_dram_naive, + bias_dram_window_lengths, + sequence{}); }(); return make_tile_window(bias_dram, 
bias_dram_window_lengths, {i_m0, 0}); -- GitLab From 261f1759de15fd319ba03985ebe7123fae12a722 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bart=C5=82omiej=20Kocot?= Date: Fri, 6 Dec 2024 10:55:23 +0100 Subject: [PATCH 105/153] Support large batch tensors in grouped conv bwd data (#1711) * Support large batch tensors in grouped conv bwd data * Fix multiD * fixes * fixes * fixes --- ...conv_bwd_data_multiple_d_wmma_cshuffle.hpp | 186 +-- ...nv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp | 294 ++-- .../transform_conv_bwd_data_to_gemm_v1.hpp | 1275 ++++++++++------- test/grouped_convnd_bwd_data/CMakeLists.txt | 8 +- .../test_grouped_convnd_bwd_data_wmma.cpp | 108 ++ ...p => test_grouped_convnd_bwd_data_xdl.cpp} | 39 +- 6 files changed, 1067 insertions(+), 843 deletions(-) create mode 100644 test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_wmma.cpp rename test/grouped_convnd_bwd_data/{test_grouped_convnd_bwd_data_xdl_wmma.cpp => test_grouped_convnd_bwd_data_xdl.cpp} (78%) diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_wmma_cshuffle.hpp index 3fb047f20..359711e5c 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_wmma_cshuffle.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved. 
#pragma once @@ -106,89 +106,35 @@ struct DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle static constexpr auto I3 = Number<3>{}; static constexpr index_t KPerBlock = K0PerBlock * K1; - static constexpr auto transform_conv_to_gemm = - TransformConvBwdDataToGemm_v1{}; - - static auto GetDummyABDsEGridDescriptor() - { - const std::array dummy_tensor_lengths = {1}; - const std::array dummy_tensor_strides = {1}; - const std::array dummy_spatial_lengths = {1}; - - const auto a_grid_desc_ak0_m_ak1 = - transform_conv_to_gemm.template MakeADescriptor_AK0_M_AK1( - dummy_tensor_lengths, - dummy_tensor_strides, - dummy_tensor_lengths, - dummy_tensor_strides, - dummy_tensor_lengths, - dummy_tensor_strides, - dummy_spatial_lengths, - dummy_spatial_lengths, - dummy_spatial_lengths, - dummy_spatial_lengths, - dummy_spatial_lengths); - - const auto b_grid_desc_bk0_n_bk1 = - transform_conv_to_gemm.template MakeBDescriptor_BK0_N_BK1( - dummy_tensor_lengths, - dummy_tensor_strides, - dummy_tensor_lengths, - dummy_tensor_strides, - dummy_tensor_lengths, - dummy_tensor_strides, - dummy_spatial_lengths, - dummy_spatial_lengths, - dummy_spatial_lengths, - dummy_spatial_lengths, - dummy_spatial_lengths); - - const auto ds_grid_desc_m_n = generate_tuple( - [&](auto i) { - using DLayout = remove_cvref_t>; - - return transform_conv_to_gemm.template MakeCDescriptor_M_N( - dummy_tensor_lengths, - dummy_tensor_strides, - dummy_tensor_lengths, - dummy_tensor_strides, - dummy_tensor_lengths, - dummy_tensor_strides, - dummy_spatial_lengths, - dummy_spatial_lengths, - dummy_spatial_lengths, - dummy_spatial_lengths, - dummy_spatial_lengths); - }, - Number{}); - - const auto e_grid_desc_m_n = - transform_conv_to_gemm.template MakeCDescriptor_M_N(dummy_tensor_lengths, - dummy_tensor_strides, - dummy_tensor_lengths, - dummy_tensor_strides, - dummy_tensor_lengths, - dummy_tensor_strides, - dummy_spatial_lengths, - dummy_spatial_lengths, - dummy_spatial_lengths, - dummy_spatial_lengths, - 
dummy_spatial_lengths); + using ConvToGemmBwdDataTransform = TransformConvBwdDataToGemm_v1; + static auto + GetDummyABDsEGridDescriptor(const ConvToGemmBwdDataTransform& conv_to_gemm_transform) + { + const auto a_grid_desc_ak0_m_ak1 = conv_to_gemm_transform.MakeADescriptor_AK0_M_AK1(); + const auto b_grid_desc_bk0_n_bk1 = conv_to_gemm_transform.MakeBDescriptor_BK0_N_BK1(); + const auto ds_grid_desc_m_n = + generate_tuple([&](auto) { return conv_to_gemm_transform.MakeCDescriptor_M_N(); }, + Number{}); + const auto e_grid_desc_m_n = conv_to_gemm_transform.MakeCDescriptor_M_N(); return make_tuple( a_grid_desc_ak0_m_ak1, b_grid_desc_bk0_n_bk1, ds_grid_desc_m_n, e_grid_desc_m_n); } // desc - using ABDsEGridDesc = decltype(GetDummyABDsEGridDescriptor()); + constexpr static ConvToGemmBwdDataTransform dummy_conv_to_gemm_transform; + using ABDsEGridDesc = decltype(GetDummyABDsEGridDescriptor(dummy_conv_to_gemm_transform)); using AGridDesc_AK0_M_AK1 = remove_cvref_t>; using BGridDesc_BK0_N_BK1 = remove_cvref_t>; @@ -270,7 +216,7 @@ struct DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle const std::array& b_g_k_c_xs_lengths, const std::array& b_g_k_c_xs_strides, const std::array, NumDTensor>& - ds_g_n_c_wis_lengths, + /*ds_g_n_c_wis_lengths*/, const std::array, NumDTensor>& ds_g_n_c_wis_strides, const std::array& e_g_n_c_wis_lengths, @@ -291,15 +237,8 @@ struct DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle b_element_op_{b_element_op}, cde_element_op_{cde_element_op}, a_g_n_k_wos_lengths_{a_g_n_k_wos_lengths}, - a_g_n_k_wos_strides_{a_g_n_k_wos_strides}, b_g_k_c_xs_lengths_{b_g_k_c_xs_lengths}, - b_g_k_c_xs_strides_{b_g_k_c_xs_strides}, - ds_g_n_c_wis_lengths_{ds_g_n_c_wis_lengths}, - ds_g_n_c_wis_strides_{ds_g_n_c_wis_strides}, - e_g_n_c_wis_lengths_{e_g_n_c_wis_lengths}, - e_g_n_c_wis_strides_{e_g_n_c_wis_strides}, conv_filter_strides_{conv_filter_strides}, - conv_filter_dilations_{conv_filter_dilations}, input_left_pads_{input_left_pads}, 
input_right_pads_{input_right_pads} { @@ -382,68 +321,47 @@ struct DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle tildes = {i_ztilde, i_ytilde, i_xtilde}; } + ConvToGemmBwdDataTransform conv_to_gemm_transform_{a_g_n_k_wos_lengths, + a_g_n_k_wos_strides, + b_g_k_c_xs_lengths, + b_g_k_c_xs_strides, + e_g_n_c_wis_lengths, + e_g_n_c_wis_strides, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads, + tildes}; + const auto a_grid_desc_ak0_m_ak1 = - transform_conv_to_gemm.template MakeADescriptor_AK0_M_AK1( - a_g_n_k_wos_lengths, - a_g_n_k_wos_strides, - b_g_k_c_xs_lengths, - b_g_k_c_xs_strides, - e_g_n_c_wis_lengths, - e_g_n_c_wis_strides, - conv_filter_strides, - conv_filter_dilations, - input_left_pads, - input_right_pads, - tildes); + conv_to_gemm_transform_.MakeADescriptor_AK0_M_AK1(); const auto b_grid_desc_bk0_n_bk1 = - transform_conv_to_gemm.template MakeBDescriptor_BK0_N_BK1( - a_g_n_k_wos_lengths, - a_g_n_k_wos_strides, - b_g_k_c_xs_lengths, - b_g_k_c_xs_strides, - e_g_n_c_wis_lengths, - e_g_n_c_wis_strides, - conv_filter_strides, - conv_filter_dilations, - input_left_pads, - input_right_pads, - tildes); + conv_to_gemm_transform_.MakeBDescriptor_BK0_N_BK1(); DsGridDesc_M_N ds_grid_desc_m_n; // populate Ds desc static_for<0, NumDTensor, 1>{}([&](auto i) { using DLayout = remove_cvref_t>; - - ds_grid_desc_m_n(i) = - transform_conv_to_gemm.template MakeCDescriptor_M_N( - a_g_n_k_wos_lengths, - a_g_n_k_wos_strides, - b_g_k_c_xs_lengths, - b_g_k_c_xs_strides, - ds_g_n_c_wis_lengths[i], - ds_g_n_c_wis_strides[i], - conv_filter_strides, - conv_filter_dilations, - input_left_pads, - input_right_pads, - tildes); - }); - - const auto e_grid_desc_m_n = - transform_conv_to_gemm.template MakeCDescriptor_M_N( + static_assert(is_same_v); + ConvToGemmBwdDataTransform conv_to_gemm_transform_d{ a_g_n_k_wos_lengths, a_g_n_k_wos_strides, b_g_k_c_xs_lengths, b_g_k_c_xs_strides, e_g_n_c_wis_lengths, - e_g_n_c_wis_strides, + ds_g_n_c_wis_strides[i], 
conv_filter_strides, conv_filter_dilations, input_left_pads, input_right_pads, - tildes); + tildes}; + + ds_grid_desc_m_n(i) = conv_to_gemm_transform_d.MakeCDescriptor_M_N(); + }); + + const auto e_grid_desc_m_n = conv_to_gemm_transform_.MakeCDescriptor_M_N(); // for check validity ds_grid_desc_m_n_container_.push_back(ds_grid_desc_m_n); @@ -522,17 +440,9 @@ struct DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle BElementwiseOp b_element_op_; CDEElementwiseOp cde_element_op_; - // for checking IsSupportedArgument() std::array a_g_n_k_wos_lengths_; - std::array a_g_n_k_wos_strides_; std::array b_g_k_c_xs_lengths_; - std::array b_g_k_c_xs_strides_; - std::array, NumDTensor> ds_g_n_c_wis_lengths_; - std::array, NumDTensor> ds_g_n_c_wis_strides_; - std::array e_g_n_c_wis_lengths_; - std::array e_g_n_c_wis_strides_; std::array conv_filter_strides_; - std::array conv_filter_dilations_; std::array input_left_pads_; std::array input_right_pads_; }; diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp index b544c925e..c8c58d5d8 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp @@ -54,15 +54,16 @@ template __global__ void #if CK_USE_LAUNCH_BOUNDS @@ -73,10 +74,9 @@ __global__ void const ABDataType* __restrict__ p_b_grid, DsPointer p_ds_grid, EDataType* __restrict__ p_e_grid, - const AElementwiseOperation a_element_op, - const BElementwiseOperation b_element_op, - const CDEElementwiseOperation cde_element_op, - const index_t batch_count, + const AElementwiseOp a_element_op, + const BElementwiseOp b_element_op, + const CDEElementwiseOp cde_element_op, const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1, const BGridDesc_BK0_N_BK1 
b_grid_desc_bk0_n_bk1, const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock @@ -84,24 +84,29 @@ __global__ void const EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock e_grid_desc_mblock_mperblock_nblock_nperblock_, const Block2ETileMap block_2_ctile_map, - const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch) + const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch, + const ComputePtrOffsetOfN compute_ptr_offset_of_n) { #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \ defined(__gfx94__)) // offset base pointer for each work-group - const index_t num_blocks_per_batch = - __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count); - const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch); + const index_t n_idx = __builtin_amdgcn_readfirstlane(blockIdx.z); + const index_t g_idx = __builtin_amdgcn_readfirstlane(blockIdx.y); - const long_index_t a_batch_offset = amd_wave_read_first_lane( - static_cast(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx))); - const long_index_t b_batch_offset = amd_wave_read_first_lane( - static_cast(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx))); - const long_index_t e_batch_offset = amd_wave_read_first_lane( - static_cast(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx))); + const long_index_t a_batch_offset = + amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx)); + const long_index_t b_batch_offset = + amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx)); + const long_index_t e_batch_offset = + amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx)); const auto ds_batch_offset = compute_ptr_offset_of_batch.GetDsPtrOffset(g_idx); + const long_index_t a_n_offset = + amd_wave_read_first_lane(compute_ptr_offset_of_n.GetAPtrOffset(n_idx)); + const long_index_t e_n_offset = + amd_wave_read_first_lane(compute_ptr_offset_of_n.GetEPtrOffset(n_idx)); + __shared__ char 
p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()]; DsPointer p_ds_grid_grp; @@ -112,10 +117,10 @@ __global__ void static_for<0, NumDTensor, 1>{}( [&](auto i) { p_ds_grid_grp(i) = p_ds_grid[i] + ds_batch_offset[i]; }); - GridwiseGemm::template Run(p_a_grid + a_batch_offset, + GridwiseGemm::template Run(p_a_grid + a_batch_offset + a_n_offset, p_b_grid + b_batch_offset, p_ds_grid_grp, - p_e_grid + e_batch_offset, + p_e_grid + e_batch_offset + e_n_offset, p_shared, a_element_op, b_element_op, @@ -130,7 +135,6 @@ __global__ void ignore = p_b_grid; ignore = p_ds_grid; ignore = p_e_grid; - ignore = batch_count; ignore = a_grid_desc_ak0_m_ak1; ignore = b_grid_desc_bk0_n_bk1; ignore = ds_grid_desc_mblock_mperblock_nblock_nperblock; @@ -139,6 +143,7 @@ __global__ void ignore = b_element_op; ignore = cde_element_op; ignore = compute_ptr_offset_of_batch; + ignore = compute_ptr_offset_of_n; ignore = block_2_ctile_map; #endif } @@ -233,82 +238,54 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1 static constexpr auto I2 = Number<2>{}; static constexpr auto I3 = Number<3>{}; - static constexpr auto transform_conv_to_gemm = - TransformConvBwdDataToGemm_v1{}; - - static auto GetDummyABDsEGridDescriptor() + using ConvToGemmBwdDataTransform = TransformConvBwdDataToGemm_v1; + + static auto + GetDummyABDsEGridDescriptor(const ConvToGemmBwdDataTransform& conv_to_gemm_transform) { - const std::array dummy_tensor_lengths = {1}; - const std::array dummy_tensor_strides = {1}; - const std::array dummy_spatial_lengths = {1}; - - const auto a_grid_desc_ak0_m_ak1 = - transform_conv_to_gemm.template MakeADescriptor_AK0_M_AK1( - dummy_tensor_lengths, - dummy_tensor_strides, - dummy_tensor_lengths, - dummy_tensor_strides, - dummy_tensor_lengths, - dummy_tensor_strides, - dummy_spatial_lengths, - dummy_spatial_lengths, - dummy_spatial_lengths, - dummy_spatial_lengths, - dummy_spatial_lengths); - - const auto b_grid_desc_bk0_n_bk1 = - transform_conv_to_gemm.template 
MakeBDescriptor_BK0_N_BK1( - dummy_tensor_lengths, - dummy_tensor_strides, - dummy_tensor_lengths, - dummy_tensor_strides, - dummy_tensor_lengths, - dummy_tensor_strides, - dummy_spatial_lengths, - dummy_spatial_lengths, - dummy_spatial_lengths, - dummy_spatial_lengths, - dummy_spatial_lengths); + const auto a_grid_desc_ak0_m_ak1 = conv_to_gemm_transform.MakeADescriptor_AK0_M_AK1(); + + const auto b_grid_desc_bk0_n_bk1 = conv_to_gemm_transform.MakeBDescriptor_BK0_N_BK1(); const auto ds_grid_desc_m_n = generate_tuple( [&](auto i) { - using DLayout = remove_cvref_t>; - - return transform_conv_to_gemm.template MakeCDescriptor_M_N( - dummy_tensor_lengths, - dummy_tensor_strides, - dummy_tensor_lengths, - dummy_tensor_strides, - dummy_tensor_lengths, - dummy_tensor_strides, - dummy_spatial_lengths, - dummy_spatial_lengths, - dummy_spatial_lengths, - dummy_spatial_lengths, - dummy_spatial_lengths); + using DLayout = remove_cvref_t>; + using DDataType = remove_cvref_t>; + using ConvToGemmBwdDataTransformD = + TransformConvBwdDataToGemm_v1; + return ConvToGemmBwdDataTransformD{}.MakeCDescriptor_M_N(); }, Number{}); - const auto e_grid_desc_m_n = - transform_conv_to_gemm.template MakeCDescriptor_M_N(dummy_tensor_lengths, - dummy_tensor_strides, - dummy_tensor_lengths, - dummy_tensor_strides, - dummy_tensor_lengths, - dummy_tensor_strides, - dummy_spatial_lengths, - dummy_spatial_lengths, - dummy_spatial_lengths, - dummy_spatial_lengths, - dummy_spatial_lengths); + const auto e_grid_desc_m_n = conv_to_gemm_transform.MakeCDescriptor_M_N(); return make_tuple( a_grid_desc_ak0_m_ak1, b_grid_desc_bk0_n_bk1, ds_grid_desc_m_n, e_grid_desc_m_n); @@ -377,7 +354,8 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1 } // desc - using ABDsEGridDesc = decltype(GetDummyABDsEGridDescriptor()); + constexpr static ConvToGemmBwdDataTransform dummy_conv_to_gemm_transform; + using ABDsEGridDesc = decltype(GetDummyABDsEGridDescriptor(dummy_conv_to_gemm_transform)); using 
AGridDesc_AK0_M_AK1 = remove_cvref_t>; using BGridDesc_BK0_N_BK1 = remove_cvref_t>; @@ -431,15 +409,8 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1 b_element_op_{b_element_op}, cde_element_op_{cde_element_op}, a_g_n_k_wos_lengths_{a_g_n_k_wos_lengths}, - a_g_n_k_wos_strides_{a_g_n_k_wos_strides}, b_g_k_c_xs_lengths_{b_g_k_c_xs_lengths}, - b_g_k_c_xs_strides_{b_g_k_c_xs_strides}, - ds_g_n_c_wis_lengths_{ds_g_n_c_wis_lengths}, - ds_g_n_c_wis_strides_{ds_g_n_c_wis_strides}, - e_g_n_c_wis_lengths_{e_g_n_c_wis_lengths}, - e_g_n_c_wis_strides_{e_g_n_c_wis_strides}, conv_filter_strides_{conv_filter_strides}, - conv_filter_dilations_{conv_filter_dilations}, input_left_pads_{input_left_pads}, input_right_pads_{input_right_pads} { @@ -450,11 +421,6 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1 p_ds_grid_(i) = static_cast(p_ds[i]); }); - // A/B/Ds/E Batch Stride - compute_ptr_offset_of_batch_.BatchStrideA_ = a_g_n_k_wos_strides[0]; - compute_ptr_offset_of_batch_.BatchStrideB_ = b_g_k_c_xs_strides[0]; - compute_ptr_offset_of_batch_.BatchStrideE_ = e_g_n_c_wis_strides[0]; - static_for<0, NumDTensor, 1>{}([&](auto i) { compute_ptr_offset_of_batch_.BatchStrideDs_(i) = ds_g_n_c_wis_strides[i][0]; }); @@ -526,68 +492,65 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1 throw std::runtime_error("wrong! 
only implemented for 2D and 3D now"); } + ConvToGemmBwdDataTransform conv_to_gemm_transform_{a_g_n_k_wos_lengths, + a_g_n_k_wos_strides, + b_g_k_c_xs_lengths, + b_g_k_c_xs_strides, + e_g_n_c_wis_lengths, + e_g_n_c_wis_strides, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads, + tildes}; + + conv_N_per_block_ = conv_to_gemm_transform_.N_; + const auto a_grid_desc_ak0_m_ak1 = - transform_conv_to_gemm.template MakeADescriptor_AK0_M_AK1( - a_g_n_k_wos_lengths, - a_g_n_k_wos_strides, - b_g_k_c_xs_lengths, - b_g_k_c_xs_strides, - e_g_n_c_wis_lengths, - e_g_n_c_wis_strides, - conv_filter_strides, - conv_filter_dilations, - input_left_pads, - input_right_pads, - tildes); + conv_to_gemm_transform_.MakeADescriptor_AK0_M_AK1(); const auto b_grid_desc_bk0_n_bk1 = - transform_conv_to_gemm.template MakeBDescriptor_BK0_N_BK1( - a_g_n_k_wos_lengths, - a_g_n_k_wos_strides, - b_g_k_c_xs_lengths, - b_g_k_c_xs_strides, - e_g_n_c_wis_lengths, - e_g_n_c_wis_strides, - conv_filter_strides, - conv_filter_dilations, - input_left_pads, - input_right_pads, - tildes); + conv_to_gemm_transform_.MakeBDescriptor_BK0_N_BK1(); DsGridDesc_M_N ds_grid_desc_m_n; // populate Ds desc static_for<0, NumDTensor, 1>{}([&](auto i) { - using DLayout = remove_cvref_t>; - - ds_grid_desc_m_n(i) = - transform_conv_to_gemm.template MakeCDescriptor_M_N( - a_g_n_k_wos_lengths, - a_g_n_k_wos_strides, - b_g_k_c_xs_lengths, - b_g_k_c_xs_strides, - ds_g_n_c_wis_lengths[i], - ds_g_n_c_wis_strides[i], - conv_filter_strides, - conv_filter_dilations, - input_left_pads, - input_right_pads, - tildes); - }); - - const auto e_grid_desc_m_n = - transform_conv_to_gemm.template MakeCDescriptor_M_N( + using DLayout = remove_cvref_t>; + using DDataType = remove_cvref_t>; + using ConvToGemmBwdDataTransformD = + TransformConvBwdDataToGemm_v1; + ConvToGemmBwdDataTransformD conv_to_gemm_transform_d{ a_g_n_k_wos_lengths, a_g_n_k_wos_strides, b_g_k_c_xs_lengths, b_g_k_c_xs_strides, - 
e_g_n_c_wis_lengths, - e_g_n_c_wis_strides, + ds_g_n_c_wis_lengths[i], + ds_g_n_c_wis_strides[i], conv_filter_strides, conv_filter_dilations, input_left_pads, input_right_pads, - tildes); + tildes}; + + ds_grid_desc_m_n(i) = conv_to_gemm_transform_d.MakeCDescriptor_M_N(); + }); + + const auto e_grid_desc_m_n = conv_to_gemm_transform_.MakeCDescriptor_M_N(); // desc for problem definition const auto a_grid_desc_m_k = @@ -628,6 +591,13 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1 } } } + // A/B/Ds/E Batch Stride + compute_ptr_offset_of_batch_.BatchStrideA_ = a_g_n_k_wos_strides[0]; + compute_ptr_offset_of_batch_.BatchStrideB_ = b_g_k_c_xs_strides[0]; + compute_ptr_offset_of_batch_.BatchStrideE_ = e_g_n_c_wis_strides[0]; + + compute_ptr_offset_of_n_.BatchStrideA_ = a_g_n_k_wos_strides[1] * conv_N_per_block_; + compute_ptr_offset_of_n_.BatchStrideE_ = e_g_n_c_wis_strides[1] * conv_N_per_block_; } void Print() const @@ -660,6 +630,7 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1 // tensor descriptor for problem definition index_t num_group_; + index_t conv_N_per_block_; std::vector a_grid_desc_m_k_container_; std::vector b_grid_desc_n_k_container_; std::vector ds_grid_desc_m_n_container_; @@ -678,23 +649,16 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1 // for computing batch offset ComputePtrOffsetOfStridedBatch compute_ptr_offset_of_batch_; + ComputePtrOffsetOfStridedBatch compute_ptr_offset_of_n_; // element-wise op AElementwiseOp a_element_op_; BElementwiseOp b_element_op_; CDEElementwiseOp cde_element_op_; - // for checking IsSupportedArgument() std::array a_g_n_k_wos_lengths_; - std::array a_g_n_k_wos_strides_; std::array b_g_k_c_xs_lengths_; - std::array b_g_k_c_xs_strides_; - std::array, NumDTensor> ds_g_n_c_wis_lengths_; - std::array, NumDTensor> ds_g_n_c_wis_strides_; - std::array e_g_n_c_wis_lengths_; - std::array e_g_n_c_wis_strides_; std::array conv_filter_strides_; - std::array conv_filter_dilations_; std::array 
input_left_pads_; std::array input_right_pads_; }; @@ -711,8 +675,12 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1 arg.Print(); } - float ave_time = 0; + const index_t gdy = arg.num_group_; + const index_t num_workgroups_per_Conv_N = + arg.a_g_n_k_wos_lengths_[I1] / arg.conv_N_per_block_; + const index_t gdz = num_workgroups_per_Conv_N; + float ave_time = 0; for(std::size_t i = 0; i < arg.a_grid_desc_ak0_m_ak1_container_.size(); i++) { if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_m_k_container_[i], @@ -724,9 +692,8 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1 throw std::runtime_error("wrong! device_op has invalid setting"); } - const index_t grid_size = arg.block_2_etile_map_container_[i].CalculateGridSize( - arg.e_grid_desc_m_n_container_[i]) * - arg.num_group_; + const index_t gdx = arg.block_2_etile_map_container_[i].CalculateGridSize( + arg.e_grid_desc_m_n_container_[i]); const auto GemmK = arg.a_grid_desc_m_k_container_[i].GetLength(I1); @@ -747,12 +714,13 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1 DeviceOp::EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock, Block2ETileMap, ComputePtrOffsetOfStridedBatch, + ComputePtrOffsetOfStridedBatch, has_main_loop>; return launch_and_time_kernel( stream_config, kernel, - dim3(grid_size), + dim3(gdx, gdy, gdz), dim3(BlockSize), 0, arg.p_a_grid_, @@ -762,13 +730,13 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1 arg.a_element_op_, arg.b_element_op_, arg.cde_element_op_, - arg.a_g_n_k_wos_lengths_[0], // Group count arg.a_grid_desc_ak0_m_ak1_container_[i], arg.b_grid_desc_bk0_n_bk1_container_[i], arg.ds_grid_desc_mblock_mperblock_nblock_nperblock_container_[i], arg.e_grid_desc_mblock_mperblock_nblock_nperblock_container_[i], arg.block_2_etile_map_container_[i], - arg.compute_ptr_offset_of_batch_); + arg.compute_ptr_offset_of_batch_, + arg.compute_ptr_offset_of_n_); }; if(GridwiseGemm::CalculateHasMainKBlockLoop(GemmK)) diff --git 
a/include/ck/tensor_operation/operator_transform/transform_conv_bwd_data_to_gemm_v1.hpp b/include/ck/tensor_operation/operator_transform/transform_conv_bwd_data_to_gemm_v1.hpp index 2be0b6681..8df0d885b 100644 --- a/include/ck/tensor_operation/operator_transform/transform_conv_bwd_data_to_gemm_v1.hpp +++ b/include/ck/tensor_operation/operator_transform/transform_conv_bwd_data_to_gemm_v1.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -13,245 +13,614 @@ namespace ck { namespace tensor_operation { -namespace { template < index_t NDimSpatial, + ck::tensor_operation::device::ConvolutionBackwardDataSpecialization ConvBwdDataSpecialization, + index_t AK1, + index_t BK1, + index_t GemmMPerBlock, + index_t GemmNPerBlock, + index_t GemmKPerBlock, + bool DoPadGemmM, + bool DoPadGemmN, typename ALayout, - ck::tensor_operation::device::ConvolutionBackwardDataSpecialization ConvBwdDataSpecialization> -constexpr auto make_out_grid_desc(const index_t N, - const index_t Do, - const index_t Ho, - const index_t Wo, - const index_t K, - const std::array& out_g_n_k_wos_strides) + typename BLayout, + typename CLayout, + bool SplitN = false, + typename ADataType = float, + typename CDataType = float, + index_t NumGroupsToMerge = 1, + typename IndexType = index_t> +struct TransformConvBwdDataToGemm_v1 { - const auto KStride = Number<1>{}; + private: + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + static constexpr auto I3 = Number<3>{}; - if constexpr(is_same_v) - { - const index_t NStride = out_g_n_k_wos_strides[1]; - const index_t HiStride = out_g_n_k_wos_strides[3]; - const index_t WiStride = out_g_n_k_wos_strides[4]; - if constexpr(ConvBwdDataSpecialization == - ck::tensor_operation::device::ConvolutionBackwardDataSpecialization:: 
- Filter1x1Stride1Pad0) - { + static constexpr auto NonSpatialDimsNum = Number<3>{}; - return make_naive_tensor_descriptor(make_tuple(N * Ho * Wo, K), - make_tuple(WiStride, KStride)); - } - else + static constexpr auto DIdx = NonSpatialDimsNum; + static constexpr auto HIdx = + NDimSpatial == 2 ? NonSpatialDimsNum : Number{}; + static constexpr auto WIdx = + NDimSpatial == 2 ? Number{} : Number{}; + + static constexpr auto ZIdx = NonSpatialDimsNum; + static constexpr auto YIdx = + NDimSpatial == 2 ? NonSpatialDimsNum : Number{}; + static constexpr auto XIdx = + NDimSpatial == 2 ? Number{} : Number{}; + + template + static long_index_t calculate_element_space_size_impl(const ConvDimsType& lengths, + const ConvDimsType& strides, + index_t i) + { + long_index_t acc = 1; + for(; i < (NDimSpatial + 3); i++) { - return make_naive_tensor_descriptor(make_tuple(N, Ho, Wo, K), - make_tuple(NStride, HiStride, WiStride, KStride)); + acc += + static_cast(lengths[i] - I1) * static_cast(strides[i]); } + + return acc; } - else if constexpr(is_same_v) + + template + static IndexType GetSplitedNSize(const ConvDimsType& a_g_n_k_wos_lengths, + const ConvDimsType& a_g_n_k_wos_strides, + const ConvDimsType& c_g_n_c_wis_lengths, + const ConvDimsType& c_g_n_c_wis_strides) { - const index_t NStride = out_g_n_k_wos_strides[1]; - const index_t DoStride = out_g_n_k_wos_strides[3]; - const index_t HoStride = out_g_n_k_wos_strides[4]; - const index_t WoStride = out_g_n_k_wos_strides[5]; - if constexpr(ConvBwdDataSpecialization == - ck::tensor_operation::device::ConvolutionBackwardDataSpecialization:: - Filter1x1Stride1Pad0) + const long_index_t a_element_space_size = + calculate_element_space_size_impl(a_g_n_k_wos_lengths, a_g_n_k_wos_strides, I1); + const long_index_t c_element_space_size = + calculate_element_space_size_impl(c_g_n_c_wis_lengths, c_g_n_c_wis_strides, I1); + const long_index_t element_space_size = math::max(a_element_space_size * sizeof(ADataType), + c_element_space_size * 
sizeof(CDataType)); + constexpr long_index_t TwoGB = (long_index_t{1} << 31); + + const IndexType N = a_g_n_k_wos_lengths[I1]; + + if(element_space_size > TwoGB) { + // Minimum divisor of N to not exceed 2GB + const auto divisor = math::integer_divide_ceil(element_space_size, TwoGB); - return make_naive_tensor_descriptor(make_tuple(N * Do * Ho * Wo, K), - make_tuple(WoStride, KStride)); + if(divisor <= static_cast(N)) + { + // Find least divisor of N larger than element_space_size / TwoGB + // Iterate up to sqrt(N). There are no divisors above this value. + for(IndexType least_divisor = divisor; least_divisor * least_divisor <= N; + least_divisor++) + { + if(N % least_divisor == 0) + { + return N / least_divisor; + } + } + // Not found, process one Convolution N per block + return 1; + } + else + { + // Not possible to support even after split N. + // Too large tensor. + return N; + } } else { - return make_naive_tensor_descriptor( - make_tuple(N, Do, Ho, Wo, K), - make_tuple(NStride, DoStride, HoStride, WoStride, KStride)); + // Split N is not needed. 
+ return N; } } - else if constexpr(is_same_v) + + public: + __host__ __device__ constexpr TransformConvBwdDataToGemm_v1() {} + + template + __host__ __device__ TransformConvBwdDataToGemm_v1( + const TransformConvBwdDataToGemm_v1Base& transform_conv_bwd_data_to_gemm_base) + : N_{static_cast(transform_conv_bwd_data_to_gemm_base.N_)}, + Di_{static_cast(transform_conv_bwd_data_to_gemm_base.Di_)}, + Hi_{static_cast(transform_conv_bwd_data_to_gemm_base.Hi_)}, + Wi_{static_cast(transform_conv_bwd_data_to_gemm_base.Wi_)}, + Do_{static_cast(transform_conv_bwd_data_to_gemm_base.Do_)}, + Ho_{static_cast(transform_conv_bwd_data_to_gemm_base.Ho_)}, + Wo_{static_cast(transform_conv_bwd_data_to_gemm_base.Wo_)}, + Z_{static_cast(transform_conv_bwd_data_to_gemm_base.Z_)}, + Y_{static_cast(transform_conv_bwd_data_to_gemm_base.Y_)}, + X_{static_cast(transform_conv_bwd_data_to_gemm_base.X_)}, + K_{static_cast(transform_conv_bwd_data_to_gemm_base.K_)}, + C_{static_cast(transform_conv_bwd_data_to_gemm_base.C_)}, + DiStride_{static_cast(transform_conv_bwd_data_to_gemm_base.DiStride_)}, + HiStride_{static_cast(transform_conv_bwd_data_to_gemm_base.HiStride_)}, + WiStride_{static_cast(transform_conv_bwd_data_to_gemm_base.WiStride_)}, + DoStride_{static_cast(transform_conv_bwd_data_to_gemm_base.DoStride_)}, + HoStride_{static_cast(transform_conv_bwd_data_to_gemm_base.HoStride_)}, + WoStride_{static_cast(transform_conv_bwd_data_to_gemm_base.WoStride_)}, + CStrideTensorB_{ + static_cast(transform_conv_bwd_data_to_gemm_base.CStrideTensorB_)}, + CStrideTensorC_{ + static_cast(transform_conv_bwd_data_to_gemm_base.CStrideTensorC_)}, + KStrideTensorA_{ + static_cast(transform_conv_bwd_data_to_gemm_base.KStrideTensorA_)}, + KStrideTensorB_{ + static_cast(transform_conv_bwd_data_to_gemm_base.KStrideTensorB_)}, + NStrideTensorA_{ + static_cast(transform_conv_bwd_data_to_gemm_base.NStrideTensorA_)}, + NStrideTensorC_{ + static_cast(transform_conv_bwd_data_to_gemm_base.NStrideTensorC_)}, + 
ConvStrideD_{static_cast(transform_conv_bwd_data_to_gemm_base.ConvStrideD_)}, + ConvStrideH_{static_cast(transform_conv_bwd_data_to_gemm_base.ConvStrideH_)}, + ConvStrideW_{static_cast(transform_conv_bwd_data_to_gemm_base.ConvStrideW_)}, + ConvDilationD_{ + static_cast(transform_conv_bwd_data_to_gemm_base.ConvDilationD_)}, + ConvDilationH_{ + static_cast(transform_conv_bwd_data_to_gemm_base.ConvDilationH_)}, + ConvDilationW_{ + static_cast(transform_conv_bwd_data_to_gemm_base.ConvDilationW_)}, + InLeftPadD_{static_cast(transform_conv_bwd_data_to_gemm_base.InLeftPadD_)}, + InLeftPadH_{static_cast(transform_conv_bwd_data_to_gemm_base.InLeftPadH_)}, + InLeftPadW_{static_cast(transform_conv_bwd_data_to_gemm_base.InLeftPadW_)}, + InRightPadD_{static_cast(transform_conv_bwd_data_to_gemm_base.InRightPadD_)}, + InRightPadH_{static_cast(transform_conv_bwd_data_to_gemm_base.InRightPadH_)}, + InRightPadW_{static_cast(transform_conv_bwd_data_to_gemm_base.InRightPadW_)}, + IdxZTilde_{static_cast(transform_conv_bwd_data_to_gemm_base.IdxZTilde_)}, + IdxYTilde_{static_cast(transform_conv_bwd_data_to_gemm_base.IdxYTilde_)}, + IdxXTilde_{static_cast(transform_conv_bwd_data_to_gemm_base.IdxXTilde_)}, + GcdStrideDilationD_{ + static_cast(transform_conv_bwd_data_to_gemm_base.GcdStrideDilationD_)}, + GcdStrideDilationH_{ + static_cast(transform_conv_bwd_data_to_gemm_base.GcdStrideDilationH_)}, + GcdStrideDilationW_{ + static_cast(transform_conv_bwd_data_to_gemm_base.GcdStrideDilationW_)}, + ZTilde_{static_cast(transform_conv_bwd_data_to_gemm_base.ZTilde_)}, + YTilde_{static_cast(transform_conv_bwd_data_to_gemm_base.YTilde_)}, + XTilde_{static_cast(transform_conv_bwd_data_to_gemm_base.XTilde_)}, + DTilde_{static_cast(transform_conv_bwd_data_to_gemm_base.DTilde_)}, + HTilde_{static_cast(transform_conv_bwd_data_to_gemm_base.HTilde_)}, + WTilde_{static_cast(transform_conv_bwd_data_to_gemm_base.WTilde_)}, + ZDot_{static_cast(transform_conv_bwd_data_to_gemm_base.ZDot_)}, + 
YDot_{static_cast(transform_conv_bwd_data_to_gemm_base.YDot_)}, + XDot_{static_cast(transform_conv_bwd_data_to_gemm_base.XDot_)} { - // assume packed - if constexpr(ConvBwdDataSpecialization == - ck::tensor_operation::device::ConvolutionBackwardDataSpecialization:: - Filter1x1Stride1Pad0) + } + + template + __host__ __device__ + TransformConvBwdDataToGemm_v1(const ConvDimsType& a_g_n_k_wos_lengths, + const ConvDimsType& a_g_n_k_wos_strides, + const ConvDimsType& b_g_k_c_xs_lengths, + const ConvDimsType& b_g_k_c_xs_strides, + const ConvDimsType& c_g_n_c_wis_lengths, + const ConvDimsType& c_g_n_c_wis_strides, + const ConvSpatialDimsType& conv_filter_strides, + const ConvSpatialDimsType& conv_filter_dilations, + const ConvSpatialDimsType& input_left_pads, + const ConvSpatialDimsType& input_right_pads, + const ConvSpatialDimsType& tildes) + : Hi_{c_g_n_c_wis_lengths[HIdx]}, + Wi_{c_g_n_c_wis_lengths[WIdx]}, + Ho_{a_g_n_k_wos_lengths[HIdx]}, + Wo_{a_g_n_k_wos_lengths[WIdx]}, + Y_{b_g_k_c_xs_lengths[YIdx]}, + X_{b_g_k_c_xs_lengths[XIdx]}, + K_{a_g_n_k_wos_lengths[I2]}, + C_{b_g_k_c_xs_lengths[I2]}, + HiStride_{c_g_n_c_wis_strides[HIdx]}, + WiStride_{c_g_n_c_wis_strides[WIdx]}, + HoStride_{a_g_n_k_wos_strides[HIdx]}, + WoStride_{a_g_n_k_wos_strides[WIdx]}, + CStrideTensorB_{b_g_k_c_xs_strides[I2]}, + CStrideTensorC_{c_g_n_c_wis_strides[I2]}, + KStrideTensorA_{a_g_n_k_wos_strides[I2]}, + KStrideTensorB_{b_g_k_c_xs_strides[I1]}, + NStrideTensorA_{a_g_n_k_wos_strides[I1]}, + NStrideTensorC_{c_g_n_c_wis_strides[I1]}, + ConvStrideH_{conv_filter_strides[HIdx - NonSpatialDimsNum]}, + ConvStrideW_{conv_filter_strides[WIdx - NonSpatialDimsNum]}, + ConvDilationH_{conv_filter_dilations[HIdx - NonSpatialDimsNum]}, + ConvDilationW_{conv_filter_dilations[WIdx - NonSpatialDimsNum]}, + InLeftPadH_{input_left_pads[HIdx - NonSpatialDimsNum]}, + InLeftPadW_{input_left_pads[WIdx - NonSpatialDimsNum]}, + InRightPadH_{input_right_pads[HIdx - NonSpatialDimsNum]}, + 
InRightPadW_{input_right_pads[WIdx - NonSpatialDimsNum]}, + IdxYTilde_{tildes[YIdx - NonSpatialDimsNum]}, + IdxXTilde_{tildes[XIdx - NonSpatialDimsNum]} + { + static_assert(is_same_v> || + is_same_v>); + static_assert(is_same_v> || + is_same_v>); + + if constexpr(SplitN) { - return make_naive_tensor_descriptor_packed(make_tuple(N * Ho * Wo, K)); + N_ = GetSplitedNSize( + a_g_n_k_wos_lengths, a_g_n_k_wos_strides, c_g_n_c_wis_lengths, c_g_n_c_wis_strides); } else { - return make_naive_tensor_descriptor_packed(make_tuple(N, Ho, Wo, K)); + N_ = c_g_n_c_wis_lengths[I1]; } - } - else if constexpr(is_same_v) - { - // assume packed - if constexpr(ConvBwdDataSpecialization == - ck::tensor_operation::device::ConvolutionBackwardDataSpecialization:: - Filter1x1Stride1Pad0) + if constexpr(NDimSpatial == 3) { - return make_naive_tensor_descriptor_packed(make_tuple(N * Do * Ho * Wo, K)); + Di_ = c_g_n_c_wis_lengths[DIdx]; + Do_ = a_g_n_k_wos_lengths[DIdx]; + Z_ = b_g_k_c_xs_lengths[ZIdx]; + DiStride_ = c_g_n_c_wis_strides[DIdx]; + DoStride_ = a_g_n_k_wos_strides[DIdx]; + ConvStrideD_ = conv_filter_strides[DIdx - NonSpatialDimsNum]; + ConvDilationD_ = conv_filter_dilations[DIdx - NonSpatialDimsNum]; + InLeftPadD_ = input_left_pads[DIdx - NonSpatialDimsNum]; + InRightPadD_ = input_right_pads[DIdx - NonSpatialDimsNum]; + IdxZTilde_ = tildes[ZIdx - NonSpatialDimsNum]; + GcdStrideDilationD_ = math::gcd(ConvStrideD_, ConvDilationD_); + ZTilde_ = ConvStrideD_ / GcdStrideDilationD_; + DTilde_ = Do_ + math::integer_divide_ceil(ConvDilationD_ * (Z_ - I1), ConvStrideD_); + ZDot_ = math::integer_divide_ceil(Z_, ZTilde_); } else { - return make_naive_tensor_descriptor_packed(make_tuple(N, Do, Ho, Wo, K)); + Di_ = Do_ = Z_ = ZTilde_ = ConvStrideD_ = DTilde_ = ZDot_ = 1; + InLeftPadD_ = InRightPadD_ = DiStride_ = DoStride_ = IdxZTilde_ = 0; } - } - else - { - throw std::runtime_error("wrong! 
unsupported layout: " + ALayout::name()); - } -} -template -constexpr auto make_wei_grid_desc( - const index_t K, const index_t Z, const index_t Y, const index_t X, const index_t C) -{ + GcdStrideDilationH_ = math::gcd(ConvStrideH_, ConvDilationH_); + GcdStrideDilationW_ = math::gcd(ConvStrideW_, ConvDilationW_); - if constexpr(is_same_v) - { - return make_naive_tensor_descriptor_packed(make_tuple(K, Y, X, C)); - } - else if constexpr(is_same_v) - { - return make_naive_tensor_descriptor_packed(make_tuple(K, Z, Y, X, C)); - } - else - { - throw std::runtime_error("wrong! unsupported layout: " + BLayout::name()); - } -} - -template -constexpr auto make_in_grid_desc(const index_t N, - const index_t Di, - const index_t Hi, - const index_t Wi, - const index_t C, - const std::array& in_g_n_c_wis_strides) -{ + YTilde_ = ConvStrideH_ / GcdStrideDilationH_; + XTilde_ = ConvStrideW_ / GcdStrideDilationW_; - if constexpr(is_same_v || - is_same_v || - is_same_v) - { - return make_naive_tensor_descriptor(make_tuple(N, Hi, Wi, C), - make_tuple(in_g_n_c_wis_strides[1], - in_g_n_c_wis_strides[3], - in_g_n_c_wis_strides[4], - in_g_n_c_wis_strides[2])); + HTilde_ = Ho_ + math::integer_divide_ceil(ConvDilationH_ * (Y_ - I1), ConvStrideH_); + WTilde_ = Wo_ + math::integer_divide_ceil(ConvDilationW_ * (X_ - I1), ConvStrideW_); + + YDot_ = math::integer_divide_ceil(Y_, YTilde_); + XDot_ = math::integer_divide_ceil(X_, XTilde_); } - else if constexpr(is_same_v || - is_same_v) + +#if 0 // At now not supported to split tensor + __host__ bool AreDescriptorsSmallerThan2GB() const { - return make_naive_tensor_descriptor(make_tuple(N, Di, Hi, Wi, C), - make_tuple(in_g_n_c_wis_strides[1], - in_g_n_c_wis_strides[3], - in_g_n_c_wis_strides[4], - in_g_n_c_wis_strides[5], - in_g_n_c_wis_strides[2])); + constexpr long_index_t TwoGB = (long_index_t{1} << 31); + + const long_index_t in_desc_space_size = + I1 + (N_ - I1) * NStrideTensorC_ + (Di_ - I1) * DiStride_ + (Hi_ - I1) * HiStride_ + + (Wi_ - I1) 
* WiStride_ + (C_ - I1) * CStrideTensorC_; + const long_index_t out_desc_space_size = + I1 + (N_ - I1) * NStrideTensorA_ + (Do_ - I1) * DoStride_ + (Ho_ - I1) * HoStride_ + + (Wo_ - I1) * WoStride_ + (K_ - I1) * KStrideTensorA_; + + bool is_a_descriptor_smaller_than_2GB = (out_desc_space_size * sizeof(ADataType)) <= TwoGB; + bool is_c_descriptor_smaller_than_2GB = (in_desc_space_size * sizeof(CDataType)) <= TwoGB; + + return is_a_descriptor_smaller_than_2GB && is_c_descriptor_smaller_than_2GB; } - else + + __host__ auto SplitConvProblem(const ADataType* a_grid_ptr_base, + CDataType* c_grid_ptr_base) const { - throw std::runtime_error("wrong! unsupported layout: " + CLayout::name()); - } -} + // Create copies + auto conv_to_gemm_transformer_left = *this; + auto conv_to_gemm_transformer_right = *this; + IndexType a_right_offset = 0; + IndexType c_right_offset = 0; + // Calculate real filter size + const IndexType z_eff = (Z_ - 1) * ConvDilationD_ + 1; + const IndexType y_eff = (Y_ - 1) * ConvDilationH_ + 1; + const IndexType x_eff = (X_ - 1) * ConvDilationW_ + 1; + // Calculate start position in input for right tensor + const IndexType di_right_transformer_start_idx = (Do_ / 2) * ConvStrideD_; + const IndexType hi_right_transformer_start_idx = (Ho_ / 2) * ConvStrideH_; + const IndexType wi_right_transformer_start_idx = (Wo_ / 2) * ConvStrideW_; + // Calculate last position in input for left tensor + const IndexType di_left_transformer_end_idx = (Do_ / 2 - 1) * ConvStrideD_ + z_eff; + const IndexType hi_left_transformer_end_idx = (Ho_ / 2 - 1) * ConvStrideH_ + y_eff; + const IndexType wi_left_transformer_end_idx = (Wo_ / 2 - 1) * ConvStrideW_ + x_eff; + // Allow to split if whole left padding will be in left tensor and right padding in right + // tensor + const bool is_possible_to_split_d = Do_ != 1 && + di_right_transformer_start_idx > InLeftPadD_ && + di_left_transformer_end_idx <= (InLeftPadD_ + Di_); + const bool is_possible_to_split_h = Ho_ != 1 && + 
hi_right_transformer_start_idx > InLeftPadH_ && + hi_left_transformer_end_idx <= (InLeftPadH_ + Hi_); + const bool is_possible_to_split_w = Wo_ != 1 && + wi_right_transformer_start_idx > InLeftPadW_ && + wi_left_transformer_end_idx <= (InLeftPadW_ + Wi_); + + if(is_possible_to_split_d) + { + // Apply new sizes + // Split output on half + conv_to_gemm_transformer_left.Do_ = Do_ / 2; + conv_to_gemm_transformer_right.Do_ = Do_ - Do_ / 2; + // Assign left padding to left convolution + conv_to_gemm_transformer_left.InLeftPadD_ = InLeftPadD_; + conv_to_gemm_transformer_right.InLeftPadD_ = 0; + // Assign right padding to right convolution + conv_to_gemm_transformer_left.InRightPadD_ = 0; + conv_to_gemm_transformer_right.InRightPadD_ = InRightPadD_; + // Calculate new input size + conv_to_gemm_transformer_left.Di_ = di_left_transformer_end_idx - InLeftPadD_; + conv_to_gemm_transformer_right.Di_ = + math::min(Di_ - (di_right_transformer_start_idx - InLeftPadD_), + (conv_to_gemm_transformer_right.Do_ - 1) * ConvStrideD_ + z_eff); + ; + // Calcualte offsets + a_right_offset = (Do_ / 2) * DoStride_; + c_right_offset = ((Do_ / 2) * ConvStrideD_ - InLeftPadD_) * DiStride_; + } + else if(is_possible_to_split_h) + { + conv_to_gemm_transformer_left.Ho_ = Ho_ / 2; + conv_to_gemm_transformer_right.Ho_ = Ho_ - Ho_ / 2; -} // namespace + conv_to_gemm_transformer_left.InLeftPadH_ = InLeftPadH_; + conv_to_gemm_transformer_right.InLeftPadH_ = 0; -template < - index_t NDimSpatial, - ck::tensor_operation::device::ConvolutionBackwardDataSpecialization ConvBwdDataSpecialization, - index_t AK1, - index_t BK1, - index_t GemmMPerBlock, - index_t GemmNPerBlock, - index_t GemmKPerBlock, - bool DoPadGemmM, - bool DoPadGemmN> -struct TransformConvBwdDataToGemm_v1 -{ - static constexpr auto I0 = Number<0>{}; - static constexpr auto I1 = Number<1>{}; + conv_to_gemm_transformer_left.InRightPadH_ = 0; + conv_to_gemm_transformer_right.InRightPadH_ = InRightPadH_; - static constexpr auto NonSpatialDimsNum 
= Number<3>{}; + conv_to_gemm_transformer_left.Hi_ = hi_left_transformer_end_idx - InLeftPadH_; + conv_to_gemm_transformer_right.Hi_ = + math::min(Hi_ - (hi_right_transformer_start_idx - InLeftPadH_), + (conv_to_gemm_transformer_right.Ho_ - 1) * ConvStrideH_ + y_eff); + a_right_offset = (Ho_ / 2) * HoStride_; + c_right_offset = ((Ho_ / 2) * ConvStrideH_ - InLeftPadH_) * HiStride_; + } + else if(is_possible_to_split_w) + { + conv_to_gemm_transformer_left.Wo_ = Wo_ / 2; + conv_to_gemm_transformer_right.Wo_ = Wo_ - Wo_ / 2; - static constexpr auto DIdx = Number{}; - static constexpr auto HIdx = - NDimSpatial == 2 ? Number{} : Number{}; - static constexpr auto WIdx = - NDimSpatial == 2 ? Number{} : Number{}; + conv_to_gemm_transformer_left.InLeftPadW_ = InLeftPadW_; + conv_to_gemm_transformer_right.InLeftPadW_ = 0; - static constexpr auto ZIdx = Number{}; - static constexpr auto YIdx = - NDimSpatial == 2 ? Number{} : Number{}; - static constexpr auto XIdx = - NDimSpatial == 2 ? Number{} : Number{}; + conv_to_gemm_transformer_left.InRightPadW_ = 0; + conv_to_gemm_transformer_right.InRightPadW_ = InRightPadW_; - template || - is_same_v || - is_same_v || - is_same_v), - bool>::type = false> - static auto MakeADescriptor_AK0_M_AK1( - const std::array& out_g_n_k_wos_lengths, - const std::array& out_g_n_k_wos_strides, - const std::array& wei_g_k_c_xs_lengths, - const std::array& /* wei_g_k_c_xs_strides */, - const std::array& in_g_n_c_wis_lengths, - const std::array& /* in_g_n_c_wis_strides */, - const std::array& conv_filter_strides, - const std::array& conv_filter_dilations, - const std::array& input_left_pads, - const std::array& /* input_right_pads */, - const std::array& tildes) + conv_to_gemm_transformer_left.Wi_ = wi_left_transformer_end_idx - InLeftPadW_; + conv_to_gemm_transformer_right.Wi_ = + math::min(Wi_ - (wi_right_transformer_start_idx - InLeftPadW_), + (conv_to_gemm_transformer_right.Wo_ - 1) * ConvStrideW_ + x_eff); + + a_right_offset = (Wo_ / 2) * 
WoStride_; + c_right_offset = ((Wo_ / 2) * ConvStrideW_ - InLeftPadW_) * WiStride_; + } + // Return left transform, right transformer, right offset to Input and right offset to + // Output + return ck::make_tuple(conv_to_gemm_transformer_left, + conv_to_gemm_transformer_right, + a_grid_ptr_base + a_right_offset, + c_grid_ptr_base + c_right_offset); + } + + __host__ auto SplitConvProblem(const ADataType* a_grid_ptr_base, + CDataType* c_grid_ptr_base) const { - index_t i_ztilde = tildes[ZIdx - NonSpatialDimsNum]; - index_t i_ytilde = tildes[YIdx - NonSpatialDimsNum]; - index_t i_xtilde = tildes[XIdx - NonSpatialDimsNum]; + // Create copies + auto conv_to_gemm_transformer_left = *this; + auto conv_to_gemm_transformer_right = *this; + IndexType a_right_offset = 0; + IndexType c_right_offset = 0; + + // Calculate start position in input for right tensor + const IndexType do_right_transformer_start_idx = math::integer_divide_ceil((Di_ / 2) + InLeftPadD_ - ((Z_ - 1) * ConvDilationD_), ConvStrideD_); + const IndexType ho_right_transformer_start_idx = math::integer_divide_ceil((Hi_ / 2) + InLeftPadH_ - ((Y_ - 1) * ConvDilationH_), ConvStrideH_); + const IndexType wo_right_transformer_start_idx = math::integer_divide_ceil((Wi_ / 2) + InLeftPadW_ - ((X_ - 1) * ConvDilationW_), ConvStrideW_); + // Calculate last position in input for left tensor + const IndexType do_left_transformer_end_idx = math::integer_divide_ceil((Di_ / 2 - 1) + InLeftPadD_, ConvStrideD_); + const IndexType ho_left_transformer_end_idx = math::integer_divide_ceil((Hi_ / 2 - 1) + InLeftPadH_, ConvStrideH_); + const IndexType wo_left_transformer_end_idx = math::integer_divide_ceil((Wi_ / 2 - 1) + InLeftPadW_, ConvStrideW_); + + + if(Di_!=1) + { + // Apply new sizes + // Split output on half + conv_to_gemm_transformer_left.Di_ = Di_ / 2; + conv_to_gemm_transformer_right.Di_ = Di_ - Di_ / 2; + // Assign left padding to left convolution + conv_to_gemm_transformer_left.InLeftPadD_ = InLeftPadD_; + 
conv_to_gemm_transformer_right.InLeftPadD_ = 0; + // // Assign right padding to right convolution + conv_to_gemm_transformer_left.InRightPadD_ = 0; + conv_to_gemm_transformer_right.InRightPadD_ = InRightPadD_; + // Calculate new input size + conv_to_gemm_transformer_left.Do_ = do_left_transformer_end_idx; + conv_to_gemm_transformer_right.Do_ = Do_ - do_right_transformer_start_idx; + ; + // Calcualte offsets + a_right_offset = do_right_transformer_start_idx * DoStride_; + c_right_offset = (Di_ / 2) * DiStride_; + } + else if(Hi_!=1) + { + // Apply new sizes + // Split output on half + conv_to_gemm_transformer_left.Hi_ = Hi_ / 2; + conv_to_gemm_transformer_right.Hi_ = Hi_ - Hi_ / 2; + // Assign left padding to left convolution + conv_to_gemm_transformer_left.InLeftPadH_ = InLeftPadH_; + conv_to_gemm_transformer_right.InLeftPadH_ = 0; + // // Assign right padding to right convolution + conv_to_gemm_transformer_left.InRightPadH_ = 0; + conv_to_gemm_transformer_right.InRightPadH_ = InRightPadH_; + // Calculate new input size + conv_to_gemm_transformer_left.Ho_ = ho_left_transformer_end_idx ; + conv_to_gemm_transformer_right.Ho_ = Ho_ - ho_right_transformer_start_idx ; + ; + // Calcualte offsets + a_right_offset = ho_right_transformer_start_idx * HoStride_; + c_right_offset = (Hi_ / 2) * HiStride_; + } + else if(Wi_!=1) + { + // Apply new sizes + // Split output on half + conv_to_gemm_transformer_left.Wi_ = Wi_ / 2; + conv_to_gemm_transformer_right.Wi_ = Wi_ - Wi_ / 2; + // Assign left padding to left convolution + conv_to_gemm_transformer_left.InLeftPadW_ = InLeftPadW_; + conv_to_gemm_transformer_right.InLeftPadW_ = 0; + // Assign right padding to right convolution + conv_to_gemm_transformer_left.InRightPadW_ = 0; + conv_to_gemm_transformer_right.InRightPadW_ = InRightPadW_; + // Calculate new input size + conv_to_gemm_transformer_left.Wo_ = wo_left_transformer_end_idx; + conv_to_gemm_transformer_right.Wo_ = Wo_ - wo_right_transformer_start_idx; + ; + // Calcualte 
offsets + a_right_offset = wo_right_transformer_start_idx * WoStride_; + c_right_offset = (Wi_ / 2) * WiStride_; + } + // Return left transform, right transformer, right offset to Input and right offset to + // Output + return ck::make_tuple(conv_to_gemm_transformer_left, + conv_to_gemm_transformer_right, + a_grid_ptr_base + a_right_offset, + c_grid_ptr_base + c_right_offset); + } +#endif - const index_t N = in_g_n_c_wis_lengths[1]; - const index_t K = wei_g_k_c_xs_lengths[1]; + __host__ __device__ auto MakeOutGridDesc() const + { + if constexpr(is_same_v) + { + if constexpr(ConvBwdDataSpecialization == + ck::tensor_operation::device::ConvolutionBackwardDataSpecialization:: + Filter1x1Stride1Pad0) + { - const index_t Di = NDimSpatial == 3 ? in_g_n_c_wis_lengths[DIdx] : 1; - const index_t Hi = in_g_n_c_wis_lengths[HIdx]; - const index_t Wi = in_g_n_c_wis_lengths[WIdx]; + return make_naive_tensor_descriptor(make_tuple(N_ * Ho_ * Wo_, K_), + make_tuple(WoStride_, KStrideTensorA_)); + } + else + { + return make_naive_tensor_descriptor( + make_tuple(N_, Ho_, Wo_, K_), + make_tuple(NStrideTensorA_, HoStride_, WoStride_, KStrideTensorA_)); + } + } + else if constexpr(is_same_v) + { + if constexpr(ConvBwdDataSpecialization == + ck::tensor_operation::device::ConvolutionBackwardDataSpecialization:: + Filter1x1Stride1Pad0) + { - const index_t Do = NDimSpatial == 3 ? 
out_g_n_k_wos_lengths[DIdx] : 1; - const index_t Ho = out_g_n_k_wos_lengths[HIdx]; - const index_t Wo = out_g_n_k_wos_lengths[WIdx]; + return make_naive_tensor_descriptor(make_tuple(N_ * Do_ * Ho_ * Wo_, K_), + make_tuple(WoStride_, KStrideTensorA_)); + } + else + { + return make_naive_tensor_descriptor( + make_tuple(N_, Do_, Ho_, Wo_, K_), + make_tuple(NStrideTensorA_, DoStride_, HoStride_, WoStride_, KStrideTensorA_)); + } + } + else if constexpr(is_same_v) + { + // assume packed + if constexpr(ConvBwdDataSpecialization == + ck::tensor_operation::device::ConvolutionBackwardDataSpecialization:: + Filter1x1Stride1Pad0) + { + return make_naive_tensor_descriptor_packed(make_tuple(N_ * Ho_ * Wo_, K_)); + } + else + { + return make_naive_tensor_descriptor_packed(make_tuple(N_, Ho_, Wo_, K_)); + } + } + else if constexpr(is_same_v) + { + // assume packed + if constexpr(ConvBwdDataSpecialization == + ck::tensor_operation::device::ConvolutionBackwardDataSpecialization:: + Filter1x1Stride1Pad0) + { + return make_naive_tensor_descriptor_packed(make_tuple(N_ * Do_ * Ho_ * Wo_, K_)); + } + else + { + return make_naive_tensor_descriptor_packed(make_tuple(N_, Do_, Ho_, Wo_, K_)); + } + } + else + { + throw std::runtime_error("wrong! unsupported layout: " + ALayout::name()); + } + } - const index_t Z = NDimSpatial == 3 ? wei_g_k_c_xs_lengths[ZIdx] : 1; - const index_t Y = wei_g_k_c_xs_lengths[YIdx]; - const index_t X = wei_g_k_c_xs_lengths[XIdx]; + __host__ __device__ auto MakeWeiGridDesc() const + { - const index_t InLeftPadD = input_left_pads[DIdx - NonSpatialDimsNum]; - const index_t InLeftPadH = input_left_pads[HIdx - NonSpatialDimsNum]; - const index_t InLeftPadW = input_left_pads[WIdx - NonSpatialDimsNum]; + if constexpr(is_same_v) + { + return make_naive_tensor_descriptor_packed(make_tuple(K_, Y_, X_, C_)); + } + else if constexpr(is_same_v) + { + return make_naive_tensor_descriptor_packed(make_tuple(K_, Z_, Y_, X_, C_)); + } + else + { + throw std::runtime_error("wrong! 
unsupported layout: " + BLayout::name()); + } + } - const index_t ConvStrideD = conv_filter_strides[DIdx - NonSpatialDimsNum]; - const index_t ConvStrideH = conv_filter_strides[HIdx - NonSpatialDimsNum]; - const index_t ConvStrideW = conv_filter_strides[WIdx - NonSpatialDimsNum]; + __host__ __device__ auto MakeInGridDesc() const + { - const index_t ConvDilationD = conv_filter_dilations[DIdx - NonSpatialDimsNum]; - const index_t ConvDilationH = conv_filter_dilations[HIdx - NonSpatialDimsNum]; - const index_t ConvDilationW = conv_filter_dilations[WIdx - NonSpatialDimsNum]; + if constexpr(is_same_v || + is_same_v || + is_same_v) + { + return make_naive_tensor_descriptor( + make_tuple(N_, Hi_, Wi_, C_), + make_tuple(NStrideTensorC_, HiStride_, WiStride_, CStrideTensorC_)); + } + else if constexpr(is_same_v || + is_same_v) + { + return make_naive_tensor_descriptor( + make_tuple(N_, Di_, Hi_, Wi_, C_), + make_tuple(NStrideTensorC_, DiStride_, HiStride_, WiStride_, CStrideTensorC_)); + } + else + { + throw std::runtime_error("wrong! 
unsupported layout: " + CLayout::name()); + } + } + template < + typename ALayout_ = ALayout, + typename std::enable_if<(NDimSpatial == 2 || NDimSpatial == 3) && + (is_same_v || + is_same_v || + is_same_v || + is_same_v), + bool>::type = false> + __host__ __device__ auto MakeADescriptor_AK0_M_AK1() const + { // n_do_ho_wo_k for 3d or n_ho_wo_k for 2d - const auto out_grid_desc = - make_out_grid_desc( - N, Do, Ho, Wo, K, out_g_n_k_wos_strides); + const auto out_grid_desc = MakeOutGridDesc(); if constexpr(ConvBwdDataSpecialization == ck::tensor_operation::device::ConvolutionBackwardDataSpecialization:: Filter1x1Stride1Pad0) { - const index_t AK0 = math::integer_divide_ceil(K, AK1); + const index_t AK0 = math::integer_divide_ceil(K_, AK1); // A: output tensor const auto out_gemmak0_gemmmraw_gemmak1_grid_desc = transform_tensor_descriptor( out_grid_desc, - make_tuple(make_pass_through_transform(N * Do * Ho * Wo), + make_tuple(make_pass_through_transform(N_ * Do_ * Ho_ * Wo_), make_unmerge_transform(make_tuple(AK0, AK1))), make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<1>{}, Sequence<0, 2>{})); @@ -266,82 +635,63 @@ struct TransformConvBwdDataToGemm_v1 } else { - const auto GcdStrideDilationD = math::gcd(ConvStrideD, ConvDilationD); - const auto GcdStrideDilationH = math::gcd(ConvStrideH, ConvDilationH); - const auto GcdStrideDilationW = math::gcd(ConvStrideW, ConvDilationW); - - const auto ZTilde = ConvStrideD / GcdStrideDilationD; - const auto YTilde = ConvStrideH / GcdStrideDilationH; - const auto XTilde = ConvStrideW / GcdStrideDilationW; - - const auto ZDot = math::integer_divide_ceil(Z, ZTilde); - const auto YDot = math::integer_divide_ceil(Y, YTilde); - const auto XDot = math::integer_divide_ceil(X, XTilde); - - const auto DTilde = - Do + math::integer_divide_ceil(ConvDilationD * (Z - I1), ConvStrideD); - const auto HTilde = - Ho + math::integer_divide_ceil(ConvDilationH * (Y - I1), ConvStrideH); - const auto WTilde = - Wo + 
math::integer_divide_ceil(ConvDilationW * (X - I1), ConvStrideW); - // only work on HTilde and WTilde that contribute to non-padding area of input tensor const auto IDTildeSliceBegin = math::integer_divide_floor( - math::max(I0, InLeftPadD - ConvDilationD * (ZTilde - I1)), ConvStrideD); + math::max(I0, InLeftPadD_ - ConvDilationD_ * (ZTilde_ - I1)), ConvStrideD_); const auto IHTildeSliceBegin = math::integer_divide_floor( - math::max(I0, InLeftPadH - ConvDilationH * (YTilde - I1)), ConvStrideH); + math::max(I0, InLeftPadH_ - ConvDilationH_ * (YTilde_ - I1)), ConvStrideH_); const auto IWTildeSliceBegin = math::integer_divide_floor( - math::max(I0, InLeftPadW - ConvDilationW * (XTilde - I1)), ConvStrideW); + math::max(I0, InLeftPadW_ - ConvDilationW_ * (XTilde_ - I1)), ConvStrideW_); const auto IDTildeSliceEnd = math::min( - DTilde, math::integer_divide_ceil(InLeftPadD + Di - I1, ConvStrideD) + I1); + DTilde_, math::integer_divide_ceil(InLeftPadD_ + Di_ - I1, ConvStrideD_) + I1); const auto IHTildeSliceEnd = math::min( - HTilde, math::integer_divide_ceil(InLeftPadH + Hi - I1, ConvStrideH) + I1); + HTilde_, math::integer_divide_ceil(InLeftPadH_ + Hi_ - I1, ConvStrideH_) + I1); const auto IWTildeSliceEnd = math::min( - WTilde, math::integer_divide_ceil(InLeftPadW + Wi - I1, ConvStrideW) + I1); + WTilde_, math::integer_divide_ceil(InLeftPadW_ + Wi_ - I1, ConvStrideW_) + I1); const auto DTildeSlice = IDTildeSliceEnd - IDTildeSliceBegin; const auto HTildeSlice = IHTildeSliceEnd - IHTildeSliceBegin; const auto WTildeSlice = IWTildeSliceEnd - IWTildeSliceBegin; // GemmK is different for each GEMM - const auto ZDotSlice = math::integer_divide_ceil(Z - i_ztilde, ZTilde); - const auto YDotSlice = math::integer_divide_ceil(Y - i_ytilde, YTilde); - const auto XDotSlice = math::integer_divide_ceil(X - i_xtilde, XTilde); + const auto ZDotSlice = math::integer_divide_ceil(Z_ - IdxZTilde_, ZTilde_); + const auto YDotSlice = math::integer_divide_ceil(Y_ - IdxYTilde_, YTilde_); + 
const auto XDotSlice = math::integer_divide_ceil(X_ - IdxXTilde_, XTilde_); if constexpr(NDimSpatial == 2) { // A: output tensor const auto out_n_hop_wop_k_grid_desc = transform_tensor_descriptor( out_grid_desc, - make_tuple(make_pass_through_transform(N), - make_pad_transform(Ho, I0, I0), - make_pad_transform(Wo, I0, I0), - make_pass_through_transform(K)), + make_tuple(make_pass_through_transform(N_), + make_pad_transform(Ho_, I0, I0), + make_pad_transform(Wo_, I0, I0), + make_pass_through_transform(K_)), make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); const auto out_n_ydot_htilde_xdot_wtilde_k_grid_desc = transform_tensor_descriptor( out_n_hop_wop_k_grid_desc, make_tuple( - make_pass_through_transform(N), - make_embed_transform(make_tuple(YDot, HTilde), - make_tuple(-ConvDilationH / GcdStrideDilationH, I1)), - make_embed_transform(make_tuple(XDot, WTilde), - make_tuple(-ConvDilationW / GcdStrideDilationW, I1)), - make_pass_through_transform(K)), + make_pass_through_transform(N_), + make_embed_transform(make_tuple(YDot_, HTilde_), + make_tuple(-ConvDilationH_ / GcdStrideDilationH_, I1)), + make_embed_transform(make_tuple(XDot_, WTilde_), + make_tuple(-ConvDilationW_ / GcdStrideDilationW_, I1)), + make_pass_through_transform(K_)), make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{})); const auto out_n_ydotslice_htildeslice_xdotslice_wtildeslice_k_grid_desc = transform_tensor_descriptor( out_n_ydot_htilde_xdot_wtilde_k_grid_desc, - make_tuple(make_pass_through_transform(N), - make_slice_transform(YDot, I0, YDotSlice), - make_slice_transform(HTilde, IHTildeSliceBegin, HTildeSlice), - make_slice_transform(XDot, I0, XDotSlice), - make_slice_transform(WTilde, IWTildeSliceBegin, WTildeSlice), - make_pass_through_transform(K)), + make_tuple(make_pass_through_transform(N_), + 
make_slice_transform(YDot_, I0, YDotSlice), + make_slice_transform(HTilde_, IHTildeSliceBegin, HTildeSlice), + make_slice_transform(XDot_, I0, XDotSlice), + make_slice_transform(WTilde_, IWTildeSliceBegin, WTildeSlice), + make_pass_through_transform(K_)), make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, @@ -357,8 +707,8 @@ struct TransformConvBwdDataToGemm_v1 const auto out_gemmk_gemmmraw_grid_desc = transform_tensor_descriptor( out_n_ydotslice_htildeslice_xdotslice_wtildeslice_k_grid_desc, - make_tuple(make_merge_transform(make_tuple(YDotSlice, XDotSlice, K)), - make_merge_transform(make_tuple(N, HTildeSlice, WTildeSlice))), + make_tuple(make_merge_transform(make_tuple(YDotSlice, XDotSlice, K_)), + make_merge_transform(make_tuple(N_, HTildeSlice, WTildeSlice))), make_tuple(Sequence<1, 3, 5>{}, Sequence<0, 2, 4>{}), make_tuple(Sequence<0>{}, Sequence<1>{})); @@ -385,11 +735,11 @@ struct TransformConvBwdDataToGemm_v1 // A: output tensor const auto out_n_hop_wop_k_grid_desc = transform_tensor_descriptor( out_grid_desc, - make_tuple(make_pass_through_transform(N), - make_pad_transform(Do, I0, I0), - make_pad_transform(Ho, I0, I0), - make_pad_transform(Wo, I0, I0), - make_pass_through_transform(K)), + make_tuple(make_pass_through_transform(N_), + make_pad_transform(Do_, I0, I0), + make_pad_transform(Ho_, I0, I0), + make_pad_transform(Wo_, I0, I0), + make_pass_through_transform(K_)), make_tuple( Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}), make_tuple( @@ -398,17 +748,17 @@ struct TransformConvBwdDataToGemm_v1 const auto out_n_zdot_dtilde_ydot_htilde_xdot_wtilde_k_grid_desc = transform_tensor_descriptor( out_n_hop_wop_k_grid_desc, - make_tuple(make_pass_through_transform(N), + make_tuple(make_pass_through_transform(N_), make_embed_transform( - make_tuple(ZDot, DTilde), - make_tuple(-ConvDilationD / GcdStrideDilationD, I1)), + make_tuple(ZDot_, DTilde_), + make_tuple(-ConvDilationD_ / GcdStrideDilationD_, I1)), make_embed_transform( - 
make_tuple(YDot, HTilde), - make_tuple(-ConvDilationH / GcdStrideDilationH, I1)), + make_tuple(YDot_, HTilde_), + make_tuple(-ConvDilationH_ / GcdStrideDilationH_, I1)), make_embed_transform( - make_tuple(XDot, WTilde), - make_tuple(-ConvDilationW / GcdStrideDilationW, I1)), - make_pass_through_transform(K)), + make_tuple(XDot_, WTilde_), + make_tuple(-ConvDilationW_ / GcdStrideDilationW_, I1)), + make_pass_through_transform(K_)), make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, @@ -424,14 +774,15 @@ struct TransformConvBwdDataToGemm_v1 out_n_zdotslice_dtildeslice_ydotslice_htildeslice_xdotslice_wtildeslice_k_grid_desc = transform_tensor_descriptor( out_n_zdot_dtilde_ydot_htilde_xdot_wtilde_k_grid_desc, - make_tuple(make_pass_through_transform(N), - make_slice_transform(ZDot, I0, ZDotSlice), - make_slice_transform(DTilde, IDTildeSliceBegin, DTildeSlice), - make_slice_transform(YDot, I0, YDotSlice), - make_slice_transform(HTilde, IHTildeSliceBegin, HTildeSlice), - make_slice_transform(XDot, I0, XDotSlice), - make_slice_transform(WTilde, IWTildeSliceBegin, WTildeSlice), - make_pass_through_transform(K)), + make_tuple( + make_pass_through_transform(N_), + make_slice_transform(ZDot_, I0, ZDotSlice), + make_slice_transform(DTilde_, IDTildeSliceBegin, DTildeSlice), + make_slice_transform(YDot_, I0, YDotSlice), + make_slice_transform(HTilde_, IHTildeSliceBegin, HTildeSlice), + make_slice_transform(XDot_, I0, XDotSlice), + make_slice_transform(WTilde_, IWTildeSliceBegin, WTildeSlice), + make_pass_through_transform(K_)), make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, @@ -452,8 +803,9 @@ struct TransformConvBwdDataToGemm_v1 const auto out_gemmk_gemmmraw_grid_desc = transform_tensor_descriptor( out_n_zdotslice_dtildeslice_ydotslice_htildeslice_xdotslice_wtildeslice_k_grid_desc, make_tuple( - make_merge_transform(make_tuple(ZDotSlice, YDotSlice, XDotSlice, K)), - make_merge_transform(make_tuple(N, DTildeSlice, HTildeSlice, WTildeSlice))), + 
make_merge_transform(make_tuple(ZDotSlice, YDotSlice, XDotSlice, K_)), + make_merge_transform( + make_tuple(N_, DTildeSlice, HTildeSlice, WTildeSlice))), make_tuple(Sequence<1, 3, 5, 7>{}, Sequence<0, 2, 4, 6>{}), make_tuple(Sequence<0>{}, Sequence<1>{})); @@ -482,66 +834,31 @@ struct TransformConvBwdDataToGemm_v1 } } - template || - is_same_v), + (is_same_v || + is_same_v), bool>::type = false> - static auto MakeBDescriptor_BK0_N_BK1( - const std::array& out_g_n_k_wos_lengths, - const std::array& /* out_g_n_k_wos_strides */, - const std::array& wei_g_k_c_xs_lengths, - const std::array& /* wei_g_k_c_xs_strides */, - const std::array& in_g_n_c_wis_lengths, - const std::array& /* in_g_n_c_wis_strides */, - const std::array& conv_filter_strides, - const std::array& conv_filter_dilations, - const std::array& /* input_left_pads */, - const std::array& /* input_right_pads */, - const std::array& tildes) + __host__ __device__ auto MakeBDescriptor_BK0_N_BK1() const { - index_t i_ztilde = tildes[ZIdx - NonSpatialDimsNum]; - index_t i_ytilde = tildes[YIdx - NonSpatialDimsNum]; - index_t i_xtilde = tildes[XIdx - NonSpatialDimsNum]; - - const index_t N = in_g_n_c_wis_lengths[1]; - const index_t K = wei_g_k_c_xs_lengths[1]; - const index_t C = wei_g_k_c_xs_lengths[2]; - - const index_t Do = NDimSpatial == 3 ? out_g_n_k_wos_lengths[DIdx] : 1; - const index_t Ho = out_g_n_k_wos_lengths[HIdx]; - const index_t Wo = out_g_n_k_wos_lengths[WIdx]; - - const index_t Z = NDimSpatial == 3 ? 
wei_g_k_c_xs_lengths[ZIdx] : 1; - const index_t Y = wei_g_k_c_xs_lengths[YIdx]; - const index_t X = wei_g_k_c_xs_lengths[XIdx]; - - const index_t ConvStrideD = conv_filter_strides[DIdx - NonSpatialDimsNum]; - const index_t ConvStrideH = conv_filter_strides[HIdx - NonSpatialDimsNum]; - const index_t ConvStrideW = conv_filter_strides[WIdx - NonSpatialDimsNum]; - - const index_t ConvDilationD = conv_filter_dilations[DIdx - NonSpatialDimsNum]; - const index_t ConvDilationH = conv_filter_dilations[HIdx - NonSpatialDimsNum]; - const index_t ConvDilationW = conv_filter_dilations[WIdx - NonSpatialDimsNum]; - // assume packed // k_y_x_c for 2d or k_z_y_x_c for 3d - const auto wei_grid_desc = make_wei_grid_desc(K, Z, Y, X, C); + const auto wei_grid_desc = MakeWeiGridDesc(); if constexpr(ConvBwdDataSpecialization == ck::tensor_operation::device::ConvolutionBackwardDataSpecialization:: Filter1x1Stride1Pad0) { - const index_t BK0 = math::integer_divide_ceil(K, BK1); + const index_t BK0 = math::integer_divide_ceil(K_, BK1); // B: weight tensor const auto wei_gemmbk0_gemmnraw_gemmbk1_grid_desc = - transform_tensor_descriptor(make_naive_tensor_descriptor_packed(make_tuple(K, C)), + transform_tensor_descriptor(make_naive_tensor_descriptor_packed(make_tuple(K_, C_)), make_tuple(make_unmerge_transform(make_tuple(BK0, BK1)), - make_pass_through_transform(C)), + make_pass_through_transform(C_)), make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<0, 2>{}, Sequence<1>{})); - make_naive_tensor_descriptor(make_tuple(N * Do * Ho * Wo, C), make_tuple(I0, I1)); + make_naive_tensor_descriptor(make_tuple(N_ * Do_ * Ho_ * Wo_, C_), make_tuple(I0, I1)); const auto wei_gemmbk0_gemmn_gemmbk1_grid_desc = ck::tensor_operation::device::PadTensorDescriptor( @@ -553,22 +870,10 @@ struct TransformConvBwdDataToGemm_v1 } else { - const auto GcdStrideDilationD = math::gcd(ConvStrideD, ConvDilationD); - const auto GcdStrideDilationH = math::gcd(ConvStrideH, ConvDilationH); - const auto 
GcdStrideDilationW = math::gcd(ConvStrideW, ConvDilationW); - - const auto ZTilde = ConvStrideD / GcdStrideDilationD; - const auto YTilde = ConvStrideH / GcdStrideDilationH; - const auto XTilde = ConvStrideW / GcdStrideDilationW; - - const auto ZDot = math::integer_divide_ceil(Z, ZTilde); - const auto YDot = math::integer_divide_ceil(Y, YTilde); - const auto XDot = math::integer_divide_ceil(X, XTilde); - // GemmK is different for each GEMM - const auto ZDotSlice = math::integer_divide_ceil(Z - i_ztilde, ZTilde); - const auto YDotSlice = math::integer_divide_ceil(Y - i_ytilde, YTilde); - const auto XDotSlice = math::integer_divide_ceil(X - i_xtilde, XTilde); + const auto ZDotSlice = math::integer_divide_ceil(Z_ - IdxZTilde_, ZTilde_); + const auto YDotSlice = math::integer_divide_ceil(Y_ - IdxYTilde_, YTilde_); + const auto XDotSlice = math::integer_divide_ceil(X_ - IdxXTilde_, XTilde_); // B weight tensor if constexpr(NDimSpatial == 2) @@ -576,23 +881,23 @@ struct TransformConvBwdDataToGemm_v1 const auto wei_k_ydot_ytilde_xdot_xtilde_c_grid_desc = transform_tensor_descriptor( wei_grid_desc, make_tuple( - make_pass_through_transform(K), - make_embed_transform(make_tuple(YDot, YTilde), - make_tuple(ConvStrideH / GcdStrideDilationH, I1)), - make_embed_transform(make_tuple(XDot, XTilde), - make_tuple(ConvStrideW / GcdStrideDilationW, I1)), - make_pass_through_transform(C)), + make_pass_through_transform(K_), + make_embed_transform(make_tuple(YDot_, YTilde_), + make_tuple(ConvStrideH_ / GcdStrideDilationH_, I1)), + make_embed_transform(make_tuple(XDot_, XTilde_), + make_tuple(ConvStrideW_ / GcdStrideDilationW_, I1)), + make_pass_through_transform(C_)), make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{})); const auto wei_k_ydotslice_xdotslice_c_grid_desc = transform_tensor_descriptor( wei_k_ydot_ytilde_xdot_xtilde_c_grid_desc, - make_tuple(make_pass_through_transform(K), - 
make_slice_transform(YDot, I0, YDotSlice), - make_slice_transform(XDot, I0, XDotSlice), - make_freeze_transform(i_ytilde), - make_freeze_transform(i_xtilde), - make_pass_through_transform(C)), + make_tuple(make_pass_through_transform(K_), + make_slice_transform(YDot_, I0, YDotSlice), + make_slice_transform(XDot_, I0, XDotSlice), + make_freeze_transform(IdxYTilde_), + make_freeze_transform(IdxXTilde_), + make_pass_through_transform(C_)), make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<3>{}, @@ -608,8 +913,8 @@ struct TransformConvBwdDataToGemm_v1 const auto wei_gemmk_gemmnraw_grid_desc = transform_tensor_descriptor( wei_k_ydotslice_xdotslice_c_grid_desc, - make_tuple(make_merge_transform(make_tuple(YDotSlice, XDotSlice, K)), - make_pass_through_transform(C)), + make_tuple(make_merge_transform(make_tuple(YDotSlice, XDotSlice, K_)), + make_pass_through_transform(C_)), make_tuple(Sequence<1, 2, 0>{}, Sequence<3>{}), make_tuple(Sequence<0>{}, Sequence<1>{})); @@ -636,15 +941,17 @@ struct TransformConvBwdDataToGemm_v1 const auto wei_k_zdot_ztilde_ydot_ytilde_xdot_xtilde_c_grid_desc = transform_tensor_descriptor( wei_grid_desc, - make_tuple( - make_pass_through_transform(K), - make_embed_transform(make_tuple(ZDot, ZTilde), - make_tuple(ConvStrideD / GcdStrideDilationD, I1)), - make_embed_transform(make_tuple(YDot, YTilde), - make_tuple(ConvStrideH / GcdStrideDilationH, I1)), - make_embed_transform(make_tuple(XDot, XTilde), - make_tuple(ConvStrideW / GcdStrideDilationW, I1)), - make_pass_through_transform(C)), + make_tuple(make_pass_through_transform(K_), + make_embed_transform( + make_tuple(ZDot_, ZTilde_), + make_tuple(ConvStrideD_ / GcdStrideDilationD_, I1)), + make_embed_transform( + make_tuple(YDot_, YTilde_), + make_tuple(ConvStrideH_ / GcdStrideDilationH_, I1)), + make_embed_transform( + make_tuple(XDot_, XTilde_), + make_tuple(ConvStrideW_ / GcdStrideDilationW_, I1)), + make_pass_through_transform(C_)), make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, @@ 
-659,14 +966,14 @@ struct TransformConvBwdDataToGemm_v1 const auto wei_gemmk_zdotslice_ydotslice_xdotslice_c_grid_desc = transform_tensor_descriptor( wei_k_zdot_ztilde_ydot_ytilde_xdot_xtilde_c_grid_desc, - make_tuple(make_pass_through_transform(K), - make_slice_transform(ZDot, I0, ZDotSlice), - make_slice_transform(YDot, I0, YDotSlice), - make_slice_transform(XDot, I0, XDotSlice), - make_freeze_transform(i_ztilde), - make_freeze_transform(i_ytilde), - make_freeze_transform(i_xtilde), - make_pass_through_transform(C)), + make_tuple(make_pass_through_transform(K_), + make_slice_transform(ZDot_, I0, ZDotSlice), + make_slice_transform(YDot_, I0, YDotSlice), + make_slice_transform(XDot_, I0, XDotSlice), + make_freeze_transform(IdxZTilde_), + make_freeze_transform(IdxYTilde_), + make_freeze_transform(IdxXTilde_), + make_pass_through_transform(C_)), make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<3>{}, @@ -686,8 +993,9 @@ struct TransformConvBwdDataToGemm_v1 const auto wei_gemmk_gemmnraw_grid_desc = transform_tensor_descriptor( wei_gemmk_zdotslice_ydotslice_xdotslice_c_grid_desc, - make_tuple(make_merge_transform(make_tuple(ZDotSlice, YDotSlice, XDotSlice, K)), - make_pass_through_transform(C)), + make_tuple( + make_merge_transform(make_tuple(ZDotSlice, YDotSlice, XDotSlice, K_)), + make_pass_through_transform(C_)), make_tuple(Sequence<1, 2, 3, 0>{}, Sequence<4>{}), make_tuple(Sequence<0>{}, Sequence<1>{})); @@ -716,66 +1024,20 @@ struct TransformConvBwdDataToGemm_v1 } } - template || - is_same_v || - is_same_v || - is_same_v || - is_same_v), - bool>::type = false> - static auto - MakeCDescriptor_M_N(const std::array& out_g_n_k_wos_lengths, - const std::array& /* out_g_n_k_wos_strides */, - const std::array& wei_g_k_c_xs_lengths, - const std::array& /* wei_g_k_c_xs_strides */, - const std::array& in_g_n_c_wis_lengths, - const std::array& in_g_n_c_wis_strides, - const std::array& conv_filter_strides, - const std::array& conv_filter_dilations, - const std::array& 
input_left_pads, - const std::array& input_right_pads, - const std::array& tildes) + template < + typename CLayout_ = CLayout, + typename std::enable_if<(NDimSpatial == 2 || NDimSpatial == 3) && + (is_same_v || + is_same_v || + is_same_v || + is_same_v || + is_same_v), + bool>::type = false> + __host__ __device__ auto MakeCDescriptor_M_N() const { - index_t i_ztilde = tildes[ZIdx - NonSpatialDimsNum]; - index_t i_ytilde = tildes[YIdx - NonSpatialDimsNum]; - index_t i_xtilde = tildes[XIdx - NonSpatialDimsNum]; - - const index_t N = in_g_n_c_wis_lengths[1]; - const index_t C = wei_g_k_c_xs_lengths[2]; - - const index_t Di = NDimSpatial == 3 ? in_g_n_c_wis_lengths[DIdx] : 1; - const index_t Hi = in_g_n_c_wis_lengths[HIdx]; - const index_t Wi = in_g_n_c_wis_lengths[WIdx]; - - const index_t Do = NDimSpatial == 3 ? out_g_n_k_wos_lengths[DIdx] : 1; - const index_t Ho = out_g_n_k_wos_lengths[HIdx]; - const index_t Wo = out_g_n_k_wos_lengths[WIdx]; - - const index_t Z = NDimSpatial == 3 ? wei_g_k_c_xs_lengths[ZIdx] : 1; - const index_t Y = wei_g_k_c_xs_lengths[YIdx]; - const index_t X = wei_g_k_c_xs_lengths[XIdx]; - - const index_t InLeftPadD = input_left_pads[DIdx - NonSpatialDimsNum]; - const index_t InLeftPadH = input_left_pads[HIdx - NonSpatialDimsNum]; - const index_t InLeftPadW = input_left_pads[WIdx - NonSpatialDimsNum]; - - const index_t InRightPadD = input_right_pads[DIdx - NonSpatialDimsNum]; - const index_t InRightPadH = input_right_pads[HIdx - NonSpatialDimsNum]; - const index_t InRightPadW = input_right_pads[WIdx - NonSpatialDimsNum]; - - const index_t ConvStrideD = conv_filter_strides[DIdx - NonSpatialDimsNum]; - const index_t ConvStrideH = conv_filter_strides[HIdx - NonSpatialDimsNum]; - const index_t ConvStrideW = conv_filter_strides[WIdx - NonSpatialDimsNum]; - - const index_t ConvDilationD = conv_filter_dilations[DIdx - NonSpatialDimsNum]; - const index_t ConvDilationH = conv_filter_dilations[HIdx - NonSpatialDimsNum]; - const index_t ConvDilationW = 
conv_filter_dilations[WIdx - NonSpatialDimsNum]; - // assume strided // n_hi_wi_c for 2d n_di_hi_wi_c for 3d - const auto in_grid_desc = - make_in_grid_desc(N, Di, Hi, Wi, C, in_g_n_c_wis_strides); + const auto in_grid_desc = MakeInGridDesc(); if constexpr(ConvBwdDataSpecialization == ck::tensor_operation::device::ConvolutionBackwardDataSpecialization:: @@ -787,10 +1049,10 @@ struct TransformConvBwdDataToGemm_v1 const auto in_n_y_ho_x_wo_c_grid_desc = transform_tensor_descriptor( in_grid_desc, make_tuple( - make_pass_through_transform(N), - make_embed_transform(make_tuple(I1, Ho), make_tuple(I1, ConvStrideH)), - make_embed_transform(make_tuple(I1, Wo), make_tuple(I1, ConvStrideW)), - make_pass_through_transform(C)), + make_pass_through_transform(N_), + make_embed_transform(make_tuple(I1, Ho_), make_tuple(I1, ConvStrideH_)), + make_embed_transform(make_tuple(I1, Wo_), make_tuple(I1, ConvStrideW_)), + make_pass_through_transform(C_)), make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{})); @@ -798,8 +1060,8 @@ struct TransformConvBwdDataToGemm_v1 in_n_y_ho_x_wo_c_grid_desc, make_tuple(make_freeze_transform(I0), make_freeze_transform(I0), - make_merge_transform(make_tuple(N, Ho, Wo)), - make_pass_through_transform(C)), + make_merge_transform(make_tuple(N_, Ho_, Wo_)), + make_pass_through_transform(C_)), make_tuple(Sequence<1>{}, Sequence<3>{}, Sequence<0, 2, 4>{}, Sequence<5>{}), make_tuple(Sequence<>{}, Sequence<>{}, Sequence<0>{}, Sequence<1>{})); @@ -818,11 +1080,11 @@ struct TransformConvBwdDataToGemm_v1 const auto in_n_x_do_y_ho_x_wo_c_grid_desc = transform_tensor_descriptor( in_grid_desc, make_tuple( - make_pass_through_transform(N), - make_embed_transform(make_tuple(I1, Do), make_tuple(I1, ConvStrideD)), - make_embed_transform(make_tuple(I1, Ho), make_tuple(I1, ConvStrideH)), - make_embed_transform(make_tuple(I1, Wo), make_tuple(I1, ConvStrideW)), - 
make_pass_through_transform(C)), + make_pass_through_transform(N_), + make_embed_transform(make_tuple(I1, Do_), make_tuple(I1, ConvStrideD_)), + make_embed_transform(make_tuple(I1, Ho_), make_tuple(I1, ConvStrideH_)), + make_embed_transform(make_tuple(I1, Wo_), make_tuple(I1, ConvStrideW_)), + make_pass_through_transform(C_)), make_tuple( Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}), make_tuple(Sequence<0>{}, @@ -836,8 +1098,8 @@ struct TransformConvBwdDataToGemm_v1 make_tuple(make_freeze_transform(I0), make_freeze_transform(I0), make_freeze_transform(I0), - make_merge_transform(make_tuple(N, Do, Ho, Wo)), - make_pass_through_transform(C)), + make_merge_transform(make_tuple(N_, Do_, Ho_, Wo_)), + make_pass_through_transform(C_)), make_tuple(Sequence<1>{}, Sequence<3>{}, Sequence<5>{}, @@ -861,36 +1123,21 @@ struct TransformConvBwdDataToGemm_v1 } else { - const auto GcdStrideDilationD = math::gcd(ConvStrideD, ConvDilationD); - const auto GcdStrideDilationH = math::gcd(ConvStrideH, ConvDilationH); - const auto GcdStrideDilationW = math::gcd(ConvStrideW, ConvDilationW); - - const auto ZTilde = ConvStrideD / GcdStrideDilationD; - const auto YTilde = ConvStrideH / GcdStrideDilationH; - const auto XTilde = ConvStrideW / GcdStrideDilationW; - - const auto DTilde = - Do + math::integer_divide_ceil(ConvDilationD * (Z - I1), ConvStrideD); - const auto HTilde = - Ho + math::integer_divide_ceil(ConvDilationH * (Y - I1), ConvStrideH); - const auto WTilde = - Wo + math::integer_divide_ceil(ConvDilationW * (X - I1), ConvStrideW); - // only work on DTilde, HTilde and WTilde that contribute to // non-padding area of input tensor const auto IDTildeSliceBegin = math::integer_divide_floor( - math::max(I0, InLeftPadD - ConvDilationD * (ZTilde - I1)), ConvStrideD); + math::max(I0, InLeftPadD_ - ConvDilationD_ * (ZTilde_ - I1)), ConvStrideD_); const auto IHTildeSliceBegin = math::integer_divide_floor( - math::max(I0, InLeftPadH - ConvDilationH * (YTilde - 
I1)), ConvStrideH); + math::max(I0, InLeftPadH_ - ConvDilationH_ * (YTilde_ - I1)), ConvStrideH_); const auto IWTildeSliceBegin = math::integer_divide_floor( - math::max(I0, InLeftPadW - ConvDilationW * (XTilde - I1)), ConvStrideW); + math::max(I0, InLeftPadW_ - ConvDilationW_ * (XTilde_ - I1)), ConvStrideW_); const auto IDTildeSliceEnd = math::min( - DTilde, math::integer_divide_ceil(InLeftPadD + Di - I1, ConvStrideD) + I1); + DTilde_, math::integer_divide_ceil(InLeftPadD_ + Di_ - I1, ConvStrideD_) + I1); const auto IHTildeSliceEnd = math::min( - HTilde, math::integer_divide_ceil(InLeftPadH + Hi - I1, ConvStrideH) + I1); + HTilde_, math::integer_divide_ceil(InLeftPadH_ + Hi_ - I1, ConvStrideH_) + I1); const auto IWTildeSliceEnd = math::min( - WTilde, math::integer_divide_ceil(InLeftPadW + Wi - I1, ConvStrideW) + I1); + WTilde_, math::integer_divide_ceil(InLeftPadW_ + Wi_ - I1, ConvStrideW_) + I1); const auto DTildeSlice = IDTildeSliceEnd - IDTildeSliceBegin; const auto HTildeSlice = IHTildeSliceEnd - IHTildeSliceBegin; @@ -901,34 +1148,34 @@ struct TransformConvBwdDataToGemm_v1 { const auto in_n_hip_wip_c_grid_desc = transform_tensor_descriptor( in_grid_desc, - make_tuple(make_pass_through_transform(N), - make_pad_transform(Hi, InLeftPadH, InRightPadH), - make_pad_transform(Wi, InLeftPadW, InRightPadW), - make_pass_through_transform(C)), + make_tuple(make_pass_through_transform(N_), + make_pad_transform(Hi_, InLeftPadH_, InRightPadH_), + make_pad_transform(Wi_, InLeftPadW_, InRightPadW_), + make_pass_through_transform(C_)), make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); const auto in_n_ytilde_htilde_xtilde_wtilde_c_grid_desc = transform_tensor_descriptor( in_n_hip_wip_c_grid_desc, - make_tuple(make_pass_through_transform(N), - make_embed_transform(make_tuple(YTilde, HTilde), - make_tuple(ConvDilationH, ConvStrideH)), - make_embed_transform(make_tuple(XTilde, WTilde), 
- make_tuple(ConvDilationW, ConvStrideW)), - make_pass_through_transform(C)), + make_tuple(make_pass_through_transform(N_), + make_embed_transform(make_tuple(YTilde_, HTilde_), + make_tuple(ConvDilationH_, ConvStrideH_)), + make_embed_transform(make_tuple(XTilde_, WTilde_), + make_tuple(ConvDilationW_, ConvStrideW_)), + make_pass_through_transform(C_)), make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), make_tuple( Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{})); const auto in_n_htildeslice_wtildeslice_c_grid_desc = transform_tensor_descriptor( in_n_ytilde_htilde_xtilde_wtilde_c_grid_desc, - make_tuple(make_pass_through_transform(N), - make_freeze_transform(i_ytilde), - make_slice_transform(HTilde, IHTildeSliceBegin, HTildeSlice), - make_freeze_transform(i_xtilde), - make_slice_transform(WTilde, IWTildeSliceBegin, WTildeSlice), - make_pass_through_transform(C)), + make_tuple(make_pass_through_transform(N_), + make_freeze_transform(IdxYTilde_), + make_slice_transform(HTilde_, IHTildeSliceBegin, HTildeSlice), + make_freeze_transform(IdxXTilde_), + make_slice_transform(WTilde_, IWTildeSliceBegin, WTildeSlice), + make_pass_through_transform(C_)), make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, @@ -944,8 +1191,8 @@ struct TransformConvBwdDataToGemm_v1 const auto in_gemmmraw_gemmnraw_grid_desc = transform_tensor_descriptor( in_n_htildeslice_wtildeslice_c_grid_desc, - make_tuple(make_merge_transform(make_tuple(N, HTildeSlice, WTildeSlice)), - make_pass_through_transform(C)), + make_tuple(make_merge_transform(make_tuple(N_, HTildeSlice, WTildeSlice)), + make_pass_through_transform(C_)), make_tuple(Sequence<0, 1, 2>{}, Sequence<3>{}), make_tuple(Sequence<0>{}, Sequence<1>{})); @@ -961,11 +1208,11 @@ struct TransformConvBwdDataToGemm_v1 { const auto in_n_dip_hip_wip_c_grid_desc = transform_tensor_descriptor( in_grid_desc, - make_tuple(make_pass_through_transform(N), - make_pad_transform(Di, InLeftPadD, InRightPadD), - 
make_pad_transform(Hi, InLeftPadH, InRightPadH), - make_pad_transform(Wi, InLeftPadW, InRightPadW), - make_pass_through_transform(C)), + make_tuple(make_pass_through_transform(N_), + make_pad_transform(Di_, InLeftPadD_, InRightPadD_), + make_pad_transform(Hi_, InLeftPadH_, InRightPadH_), + make_pad_transform(Wi_, InLeftPadW_, InRightPadW_), + make_pass_through_transform(C_)), make_tuple( Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}), make_tuple( @@ -974,14 +1221,14 @@ struct TransformConvBwdDataToGemm_v1 const auto in_n_ztilde_dtilde_ytilde_htilde_xtilde_wtilde_c_grid_desc = transform_tensor_descriptor( in_n_dip_hip_wip_c_grid_desc, - make_tuple(make_pass_through_transform(N), - make_embed_transform(make_tuple(ZTilde, DTilde), - make_tuple(ConvDilationD, ConvStrideD)), - make_embed_transform(make_tuple(YTilde, HTilde), - make_tuple(ConvDilationH, ConvStrideH)), - make_embed_transform(make_tuple(XTilde, WTilde), - make_tuple(ConvDilationW, ConvStrideW)), - make_pass_through_transform(C)), + make_tuple(make_pass_through_transform(N_), + make_embed_transform(make_tuple(ZTilde_, DTilde_), + make_tuple(ConvDilationD_, ConvStrideD_)), + make_embed_transform(make_tuple(YTilde_, HTilde_), + make_tuple(ConvDilationH_, ConvStrideH_)), + make_embed_transform(make_tuple(XTilde_, WTilde_), + make_tuple(ConvDilationW_, ConvStrideW_)), + make_pass_through_transform(C_)), make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, @@ -996,14 +1243,14 @@ struct TransformConvBwdDataToGemm_v1 const auto in_n_dtildeslice_htildeslice_wtildeslice_c_grid_desc = transform_tensor_descriptor( in_n_ztilde_dtilde_ytilde_htilde_xtilde_wtilde_c_grid_desc, - make_tuple(make_pass_through_transform(N), - make_freeze_transform(i_ztilde), - make_slice_transform(DTilde, IDTildeSliceBegin, DTildeSlice), - make_freeze_transform(i_ytilde), - make_slice_transform(HTilde, IHTildeSliceBegin, HTildeSlice), - make_freeze_transform(i_xtilde), - make_slice_transform(WTilde, 
IWTildeSliceBegin, WTildeSlice), - make_pass_through_transform(C)), + make_tuple(make_pass_through_transform(N_), + make_freeze_transform(IdxZTilde_), + make_slice_transform(DTilde_, IDTildeSliceBegin, DTildeSlice), + make_freeze_transform(IdxYTilde_), + make_slice_transform(HTilde_, IHTildeSliceBegin, HTildeSlice), + make_freeze_transform(IdxXTilde_), + make_slice_transform(WTilde_, IWTildeSliceBegin, WTildeSlice), + make_pass_through_transform(C_)), make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, @@ -1024,8 +1271,8 @@ struct TransformConvBwdDataToGemm_v1 const auto in_gemmmraw_gemmnraw_grid_desc = transform_tensor_descriptor( in_n_dtildeslice_htildeslice_wtildeslice_c_grid_desc, make_tuple( - make_merge_transform(make_tuple(N, DTildeSlice, HTildeSlice, WTildeSlice)), - make_pass_through_transform(C)), + make_merge_transform(make_tuple(N_, DTildeSlice, HTildeSlice, WTildeSlice)), + make_pass_through_transform(C_)), make_tuple(Sequence<0, 1, 2, 3>{}, Sequence<4>{}), make_tuple(Sequence<0>{}, Sequence<1>{})); @@ -1044,84 +1291,41 @@ struct TransformConvBwdDataToGemm_v1 } // for input bias - template || - is_same_v), + (is_same_v || + is_same_v), bool>::type = false> - static auto - MakeCDescriptor_M_N(const std::array& out_g_n_k_wos_lengths, - const std::array& /* out_g_n_k_wos_strides */, - const std::array& wei_g_k_c_xs_lengths, - const std::array& /* wei_g_k_c_xs_strides */, - const std::array& in_g_n_c_wis_lengths, - const std::array& /* in_g_n_c_wis_strides */, - const std::array& conv_filter_strides, - const std::array& conv_filter_dilations, - const std::array& input_left_pads, - const std::array& /* input_right_pads */, - const std::array& /* tildes */) + __host__ __device__ auto MakeCDescriptor_M_N() const { - const index_t N = in_g_n_c_wis_lengths[1]; - const index_t C = wei_g_k_c_xs_lengths[2]; - - const index_t Hi = in_g_n_c_wis_lengths[3]; - const index_t Wi = in_g_n_c_wis_lengths[4]; - - const index_t Ho = out_g_n_k_wos_lengths[3]; - const 
index_t Wo = out_g_n_k_wos_lengths[4]; - - const index_t Y = wei_g_k_c_xs_lengths[3]; - const index_t X = wei_g_k_c_xs_lengths[4]; - - const index_t InLeftPadH = input_left_pads[0]; - const index_t InLeftPadW = input_left_pads[1]; - - const index_t ConvStrideH = conv_filter_strides[0]; - const index_t ConvStrideW = conv_filter_strides[1]; - - const index_t ConvDilationH = conv_filter_dilations[0]; - const index_t ConvDilationW = conv_filter_dilations[1]; - if constexpr(ConvBwdDataSpecialization == ck::tensor_operation::device::ConvolutionBackwardDataSpecialization:: Filter1x1Stride1Pad0) { const auto in_gemmm_gemmn_grid_desc = - make_naive_tensor_descriptor(make_tuple(N * Ho * Wo, C), make_tuple(I0, I1)); + make_naive_tensor_descriptor(make_tuple(N_ * Ho_ * Wo_, C_), make_tuple(I0, I1)); return in_gemmm_gemmn_grid_desc; } else { - const auto GcdStrideDilationH = math::gcd(ConvStrideH, ConvDilationH); - const auto GcdStrideDilationW = math::gcd(ConvStrideW, ConvDilationW); - - const auto YTilde = ConvStrideH / GcdStrideDilationH; - const auto XTilde = ConvStrideW / GcdStrideDilationW; - - const auto HTilde = - Ho + math::integer_divide_ceil(ConvDilationH * (Y - I1), ConvStrideH); - const auto WTilde = - Wo + math::integer_divide_ceil(ConvDilationW * (X - I1), ConvStrideW); - // only work on HTilde and WTilde that contribute to non-padding area of input tensor const auto IHTildeSliceBegin = math::integer_divide_floor( - math::max(I0, InLeftPadH - ConvDilationH * (YTilde - I1)), ConvStrideH); + math::max(I0, InLeftPadH_ - ConvDilationH_ * (YTilde_ - I1)), ConvStrideH_); const auto IWTildeSliceBegin = math::integer_divide_floor( - math::max(I0, InLeftPadW - ConvDilationW * (XTilde - I1)), ConvStrideW); + math::max(I0, InLeftPadW_ - ConvDilationW_ * (XTilde_ - I1)), ConvStrideW_); const auto IHTildeSliceEnd = math::min( - HTilde, math::integer_divide_ceil(InLeftPadH + Hi - I1, ConvStrideH) + I1); + HTilde_, math::integer_divide_ceil(InLeftPadH_ + Hi_ - I1, ConvStrideH_) 
+ I1); const auto IWTildeSliceEnd = math::min( - WTilde, math::integer_divide_ceil(InLeftPadW + Wi - I1, ConvStrideW) + I1); + WTilde_, math::integer_divide_ceil(InLeftPadW_ + Wi_ - I1, ConvStrideW_) + I1); const auto HTildeSlice = IHTildeSliceEnd - IHTildeSliceBegin; const auto WTildeSlice = IWTildeSliceEnd - IWTildeSliceBegin; // bias tensor const auto in_gemmmraw_gemmnraw_grid_desc = make_naive_tensor_descriptor( - make_tuple(N * HTildeSlice * WTildeSlice, C), make_tuple(I0, I1)); + make_tuple(N_ * HTildeSlice * WTildeSlice, C_), make_tuple(I0, I1)); const auto in_gemmm_gemmn_grid_desc = ck::tensor_operation::device::PadTensorDescriptor( in_gemmmraw_gemmnraw_grid_desc, @@ -1131,6 +1335,25 @@ struct TransformConvBwdDataToGemm_v1 return in_gemmm_gemmn_grid_desc; } } + + IndexType N_; + IndexType Di_, Hi_, Wi_; + IndexType Do_, Ho_, Wo_; + IndexType Z_, Y_, X_; + IndexType K_, C_; + IndexType DiStride_, HiStride_, WiStride_; + IndexType DoStride_, HoStride_, WoStride_; + IndexType CStrideTensorB_, CStrideTensorC_, KStrideTensorA_, KStrideTensorB_; + IndexType NStrideTensorA_, NStrideTensorC_; + IndexType ConvStrideD_, ConvStrideH_, ConvStrideW_; + IndexType ConvDilationD_, ConvDilationH_, ConvDilationW_; + IndexType InLeftPadD_, InLeftPadH_, InLeftPadW_; + IndexType InRightPadD_, InRightPadH_, InRightPadW_; + IndexType IdxZTilde_, IdxYTilde_, IdxXTilde_; + IndexType GcdStrideDilationD_, GcdStrideDilationH_, GcdStrideDilationW_; + IndexType ZTilde_, YTilde_, XTilde_; + IndexType DTilde_, HTilde_, WTilde_; + IndexType ZDot_, YDot_, XDot_; }; } // namespace tensor_operation diff --git a/test/grouped_convnd_bwd_data/CMakeLists.txt b/test/grouped_convnd_bwd_data/CMakeLists.txt index 8edb71520..6d78da8db 100644 --- a/test/grouped_convnd_bwd_data/CMakeLists.txt +++ b/test/grouped_convnd_bwd_data/CMakeLists.txt @@ -1,6 +1,10 @@ -add_gtest_executable(test_grouped_convnd_bwd_data test_grouped_convnd_bwd_data_xdl_wmma.cpp) 
+add_gtest_executable(test_grouped_convnd_bwd_data_xdl test_grouped_convnd_bwd_data_xdl.cpp) if(result EQUAL 0) - target_link_libraries(test_grouped_convnd_bwd_data PRIVATE utility device_grouped_conv2d_bwd_data_instance device_grouped_conv3d_bwd_data_instance) + target_link_libraries(test_grouped_convnd_bwd_data_xdl PRIVATE utility device_grouped_conv2d_bwd_data_instance device_grouped_conv3d_bwd_data_instance) +endif() +add_gtest_executable(test_grouped_convnd_bwd_data_wmma test_grouped_convnd_bwd_data_wmma.cpp) +if(result EQUAL 0) + target_link_libraries(test_grouped_convnd_bwd_data_wmma PRIVATE utility device_grouped_conv2d_bwd_data_instance device_grouped_conv3d_bwd_data_instance) endif() add_gtest_executable(test_grouped_convnd_bwd_data_interface_xdl test_grouped_convnd_bwd_data_interface_xdl.cpp) if(result EQUAL 0) diff --git a/test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_wmma.cpp b/test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_wmma.cpp new file mode 100644 index 000000000..7ad7b78d6 --- /dev/null +++ b/test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_wmma.cpp @@ -0,0 +1,108 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include +#include +#include +#include +#include + +#include + +#include "profiler/profile_grouped_conv_bwd_data_impl.hpp" + +template +class TestGroupedConvndBwdDataWmma : public ::testing::Test +{ + protected: + using DataType = std::tuple_element_t<0, Tuple>; + using OutLayout = std::tuple_element_t<1, Tuple>; + using WeiLayout = std::tuple_element_t<2, Tuple>; + using InLayout = std::tuple_element_t<3, Tuple>; + + std::vector conv_params; + + template + void Run() + { + EXPECT_FALSE(conv_params.empty()); + bool pass = true; + for(auto& param : conv_params) + { + pass = pass && ck::profiler::profile_grouped_conv_bwd_data_impl( + true, // do_verification + 1, // init_method: integer value + false, // do_log + false, // time_kernel + param); + } + EXPECT_TRUE(pass); + } +}; + +using namespace ck::tensor_layout::convolution; + +using KernelTypes2d = ::testing::Types, + std::tuple, + std::tuple, + std::tuple>; + +using KernelTypes3d = ::testing::Types, + std::tuple, + std::tuple, + std::tuple>; + +template +class TestGroupedConvndBwdDataWmma2d : public TestGroupedConvndBwdDataWmma +{ +}; + +template +class TestGroupedConvndBwdDataWmma3d : public TestGroupedConvndBwdDataWmma +{ +}; + +TYPED_TEST_SUITE(TestGroupedConvndBwdDataWmma2d, KernelTypes2d); +TYPED_TEST_SUITE(TestGroupedConvndBwdDataWmma3d, KernelTypes3d); + +TYPED_TEST(TestGroupedConvndBwdDataWmma2d, Test2D) +{ + this->conv_params.clear(); + + this->conv_params.push_back( + {2, 2, 4, 192, 192, {3, 3}, {28, 28}, {1, 1}, {1, 1}, {1, 1}, {1, 1}}); + this->conv_params.push_back( + {2, 2, 128, 128, 256, {3, 3}, {14, 14}, {1, 1}, {1, 1}, {1, 1}, {1, 1}}); + this->conv_params.push_back( + {2, 2, 128, 128, 256, {1, 1}, {7, 7}, {2, 2}, {1, 1}, {0, 0}, {0, 0}}); + this->conv_params.push_back( + {2, 2, 128, 128, 256, {1, 1}, {3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}}); + this->conv_params.push_back({2, 1, 1, 1, 32, {8, 8}, {32, 32}, {1, 1}, {1, 1}, {1, 1}, {1, 1}}); + this->conv_params.push_back({2, 1, 1, 64, 3, {8, 8}, 
{32, 32}, {1, 1}, {1, 1}, {1, 1}, {1, 1}}); + this->conv_params.push_back({2, 1, 1, 1, 1, {8, 8}, {32, 32}, {1, 1}, {1, 1}, {1, 1}, {1, 1}}); + this->template Run<2>(); +} + +TYPED_TEST(TestGroupedConvndBwdDataWmma3d, Test3D) +{ + this->conv_params.clear(); + this->conv_params.push_back( + {3, 2, 16, 128, 256, {1, 1, 1}, {7, 7, 7}, {2, 2, 2}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}}); + this->conv_params.push_back( + {3, 2, 2, 128, 256, {3, 3, 3}, {14, 14, 3}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}}); + this->conv_params.push_back( + {3, 2, 32, 128, 256, {1, 1, 1}, {3, 3, 3}, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}}); + this->conv_params.push_back( + {3, 1, 1, 1, 32, {3, 3, 3}, {32, 32, 32}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}}); + this->conv_params.push_back( + {3, 1, 1, 64, 3, {3, 3, 3}, {32, 32, 32}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}}); + this->conv_params.push_back( + {3, 1, 1, 1, 1, {3, 3, 3}, {32, 32, 32}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}}); + this->template Run<3>(); +} diff --git a/test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_xdl_wmma.cpp b/test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_xdl.cpp similarity index 78% rename from test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_xdl_wmma.cpp rename to test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_xdl.cpp index 96506b876..fdc8fb64e 100644 --- a/test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_xdl_wmma.cpp +++ b/test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_xdl.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
#include #include @@ -12,7 +12,7 @@ #include "profiler/profile_grouped_conv_bwd_data_impl.hpp" template -class TestGroupedConvndBwdData : public ::testing::Test +class TestGroupedConvndBwdDataXdl : public ::testing::Test { protected: using DataType = std::tuple_element_t<0, Tuple>; @@ -51,35 +51,31 @@ using namespace ck::tensor_layout::convolution; using KernelTypes2d = ::testing::Types, std::tuple, std::tuple, - std::tuple, std::tuple, std::tuple, - std::tuple, - std::tuple>; + std::tuple>; using KernelTypes3d = ::testing::Types, std::tuple, std::tuple, - std::tuple, std::tuple, std::tuple, - std::tuple, - std::tuple>; + std::tuple>; template -class TestGroupedConvndBwdData2d : public TestGroupedConvndBwdData +class TestGroupedConvndBwdDataXdl2d : public TestGroupedConvndBwdDataXdl { }; template -class TestGroupedConvndBwdData3d : public TestGroupedConvndBwdData +class TestGroupedConvndBwdDataXdl3d : public TestGroupedConvndBwdDataXdl { }; -TYPED_TEST_SUITE(TestGroupedConvndBwdData2d, KernelTypes2d); -TYPED_TEST_SUITE(TestGroupedConvndBwdData3d, KernelTypes3d); +TYPED_TEST_SUITE(TestGroupedConvndBwdDataXdl2d, KernelTypes2d); +TYPED_TEST_SUITE(TestGroupedConvndBwdDataXdl3d, KernelTypes3d); -TYPED_TEST(TestGroupedConvndBwdData2d, Test2D) +TYPED_TEST(TestGroupedConvndBwdDataXdl2d, Test2D) { this->conv_params.clear(); @@ -94,10 +90,13 @@ TYPED_TEST(TestGroupedConvndBwdData2d, Test2D) this->conv_params.push_back({2, 1, 1, 1, 32, {8, 8}, {32, 32}, {1, 1}, {1, 1}, {1, 1}, {1, 1}}); this->conv_params.push_back({2, 1, 1, 64, 3, {8, 8}, {32, 32}, {1, 1}, {1, 1}, {1, 1}, {1, 1}}); this->conv_params.push_back({2, 1, 1, 1, 1, {8, 8}, {32, 32}, {1, 1}, {1, 1}, {1, 1}, {1, 1}}); + // SplitN case + this->conv_params.push_back( + {2, 1, 128, 4, 192, {2, 2}, {224, 224}, {224, 224}, {1, 1}, {0, 0}, {0, 0}}); this->template Run<2>(); } -TYPED_TEST(TestGroupedConvndBwdData3d, Test3D) +TYPED_TEST(TestGroupedConvndBwdDataXdl3d, Test3D) { this->conv_params.clear(); 
this->conv_params.push_back( @@ -112,5 +111,17 @@ TYPED_TEST(TestGroupedConvndBwdData3d, Test3D) {3, 1, 1, 64, 3, {3, 3, 3}, {32, 32, 32}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}}); this->conv_params.push_back( {3, 1, 1, 1, 1, {3, 3, 3}, {32, 32, 32}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}}); + // SplitN case + this->conv_params.push_back({3, + 1, + 128, + 4, + 192, + {2, 2, 2}, + {2, 224, 224}, + {1, 224, 224}, + {1, 1, 1}, + {0, 0, 0}, + {0, 0, 0}}); this->template Run<3>(); } -- GitLab From 5e6bd75a725e2c77459bb045b814b7eaded948f9 Mon Sep 17 00:00:00 2001 From: Rostyslav Geyyer <46627076+geyyer@users.noreply.github.com> Date: Fri, 6 Dec 2024 09:56:27 -0600 Subject: [PATCH 106/153] Add copy assignment op test (#1718) * Add copy assignment op test * Add a deep copy testing --- test/data_type/test_custom_type.cpp | 82 +++++++++++++++++++++-------- 1 file changed, 59 insertions(+), 23 deletions(-) diff --git a/test/data_type/test_custom_type.cpp b/test/data_type/test_custom_type.cpp index a8fa9ba4a..b8c0d402a 100644 --- a/test/data_type/test_custom_type.cpp +++ b/test/data_type/test_custom_type.cpp @@ -51,8 +51,11 @@ TEST(Custom_bool, TestAsType) ck::static_for<0, size, 1>{}([&](auto i) { right_vec.template AsType()(Number{}) = custom_bool_t{test_vec.at(i)}; }); - // copy the vector - vector_type left_vec{right_vec}; + vector_type left_vec; + // check copy assignment op + left_vec = right_vec; + // overwrite right_vec with 0s + right_vec = vector_type{}; // check if values were copied correctly ck::static_for<0, size, 1>{}([&](auto i) { ASSERT_EQ(left_vec.template AsType()(Number{}).data, test_vec.at(i)); @@ -129,8 +132,11 @@ TEST(Custom_int8, TestAsType) ck::static_for<0, size, 1>{}([&](auto i) { right_vec.template AsType()(Number{}) = custom_int8_t{test_vec.at(i)}; }); - // copy the vector - vector_type left_vec{right_vec}; + vector_type left_vec; + // check copy assignment op + left_vec = right_vec; + // overwrite right_vec with 0s + right_vec = 
vector_type{}; // check if values were copied correctly ck::static_for<0, size, 1>{}([&](auto i) { ASSERT_EQ(left_vec.template AsType()(Number{}).data, test_vec.at(i)); @@ -207,8 +213,11 @@ TEST(Custom_uint8, TestAsType) ck::static_for<0, size, 1>{}([&](auto i) { right_vec.template AsType()(Number{}) = custom_uint8_t{test_vec.at(i)}; }); - // copy the vector - vector_type left_vec{right_vec}; + vector_type left_vec; + // check copy assignment op + left_vec = right_vec; + // overwrite right_vec with 0s + right_vec = vector_type{}; // check if values were copied correctly ck::static_for<0, size, 1>{}([&](auto i) { ASSERT_EQ(left_vec.template AsType()(Number{}).data, test_vec.at(i)); @@ -287,8 +296,11 @@ TEST(Custom_f8, TestAsType) ck::static_for<0, size, 1>{}([&](auto i) { right_vec.template AsType()(Number{}) = custom_f8_t{test_vec.at(i)}; }); - // copy the vector - vector_type left_vec{right_vec}; + vector_type left_vec; + // check copy assignment op + left_vec = right_vec; + // overwrite right_vec with 0s + right_vec = vector_type{}; // check if values were copied correctly ck::static_for<0, size, 1>{}([&](auto i) { ASSERT_EQ(left_vec.template AsType()(Number{}).data, test_vec.at(i)); @@ -369,8 +381,11 @@ TEST(Custom_bf8, TestAsType) ck::static_for<0, size, 1>{}([&](auto i) { right_vec.template AsType()(Number{}) = custom_bf8_t{test_vec.at(i)}; }); - // copy the vector - vector_type left_vec{right_vec}; + vector_type left_vec; + // check copy assignment op + left_vec = right_vec; + // overwrite right_vec with 0s + right_vec = vector_type{}; // check if values were copied correctly ck::static_for<0, size, 1>{}([&](auto i) { ASSERT_EQ(left_vec.template AsType()(Number{}).data, test_vec.at(i)); @@ -450,8 +465,11 @@ TEST(Custom_half, TestAsType) ck::static_for<0, size, 1>{}([&](auto i) { right_vec.template AsType()(Number{}) = custom_half_t{test_vec.at(i)}; }); - // copy the vector - vector_type left_vec{right_vec}; + vector_type left_vec; + // check copy assignment 
op + left_vec = right_vec; + // overwrite right_vec with 0s + right_vec = vector_type{}; // check if values were copied correctly ck::static_for<0, size, 1>{}([&](auto i) { ASSERT_EQ(left_vec.template AsType()(Number{}).data, test_vec.at(i)); @@ -533,8 +551,11 @@ TEST(Custom_bhalf, TestAsType) ck::static_for<0, size, 1>{}([&](auto i) { right_vec.template AsType()(Number{}) = custom_bhalf_t{test_vec.at(i)}; }); - // copy the vector - vector_type left_vec{right_vec}; + vector_type left_vec; + // check copy assignment op + left_vec = right_vec; + // overwrite right_vec with 0s + right_vec = vector_type{}; // check if values were copied correctly ck::static_for<0, size, 1>{}([&](auto i) { ASSERT_EQ(left_vec.template AsType()(Number{}).data, test_vec.at(i)); @@ -615,8 +636,11 @@ TEST(Custom_float, TestAsType) ck::static_for<0, size, 1>{}([&](auto i) { right_vec.template AsType()(Number{}) = custom_float_t{test_vec.at(i)}; }); - // copy the vector - vector_type left_vec{right_vec}; + vector_type left_vec; + // check copy assignment op + left_vec = right_vec; + // overwrite right_vec with 0s + right_vec = vector_type{}; // check if values were copied correctly ck::static_for<0, size, 1>{}([&](auto i) { ASSERT_EQ(left_vec.template AsType()(Number{}).data, test_vec.at(i)); @@ -693,8 +717,11 @@ TEST(Custom_double, TestAsType) ck::static_for<0, size, 1>{}([&](auto i) { right_vec.template AsType()(Number{}) = custom_double_t{test_vec.at(i)}; }); - // copy the vector - vector_type left_vec{right_vec}; + vector_type left_vec; + // check copy assignment op + left_vec = right_vec; + // overwrite right_vec with 0s + right_vec = vector_type{}; // check if values were copied correctly ck::static_for<0, size, 1>{}([&](auto i) { ASSERT_EQ(left_vec.template AsType()(Number{}).data, test_vec.at(i)); @@ -813,8 +840,11 @@ TEST(Complex_half, TestAsType) right_vec.template AsType()(Number{}) = complex_half_t{test_vec.at(num_elem * i), test_vec.at(num_elem * i + 1)}; }); - // copy the vector 
- vector_type left_vec{right_vec}; + vector_type left_vec; + // check copy assignment op + left_vec = right_vec; + // overwrite right_vec with 0s + right_vec = vector_type{}; // check if values were copied correctly ck::static_for<0, size, 1>{}([&](auto i) { ASSERT_EQ(left_vec.template AsType()(Number{}).real, @@ -907,8 +937,11 @@ TEST(FP8OCP, TestAsType) right_vec.template AsType()(Number{}) = ck::type_convert(test_vec.at(i)); }); - // copy the vector - vector_type left_vec{right_vec}; + vector_type left_vec; + // check copy assignment op + left_vec = right_vec; + // overwrite right_vec with 0s + right_vec = vector_type{}; // check if values were copied correctly ck::static_for<0, size, 1>{}([&](auto i) { @@ -984,8 +1017,11 @@ TEST(BF8OCP, TestAsType) right_vec.template AsType()(Number{}) = ck::type_convert(test_vec.at(i)); }); - // copy the vector vector_type left_vec{right_vec}; + // check copy assignment op + left_vec = right_vec; + // overwrite right_vec with 0s + right_vec = vector_type{}; // check if values were copied correctly ck::static_for<0, size, 1>{}([&](auto i) { -- GitLab From 355893cdd85418f3174a023aeb1ddba008951660 Mon Sep 17 00:00:00 2001 From: Illia Silin <98187287+illsilin@users.noreply.github.com> Date: Fri, 6 Dec 2024 13:04:25 -0800 Subject: [PATCH 107/153] Refactor CI performance tests. 
(#1726) * merge the build and performance tests CI stages together * add gemm performance test on gfx11/gfx12 * add suffices to distinguish gemm performance logs from different archs * use smaller gemm set in CI for gfx10/gfx11/gfx12 * disable performance tests on gfx1030 * fix the shashing logic * fix finding python3 for mha instances --- Jenkinsfile | 286 ++++++------------ .../gpu/mha/CMakeLists.txt | 6 +- script/process_perf_data.py | 4 +- script/process_perf_data.sh | 13 + script/process_qa_data.sh | 12 + script/run_full_performance_tests.sh | 2 +- script/run_gemm_performance_tests.sh | 41 +++ script/run_performance_tests.sh | 21 +- 8 files changed, 176 insertions(+), 209 deletions(-) create mode 100755 script/run_gemm_performance_tests.sh diff --git a/Jenkinsfile b/Jenkinsfile index 58cd72c8c..0a98cc5c6 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -330,10 +330,8 @@ def cmake_build(Map conf=[:]){ try{ archiveArtifacts "perf_fmha_fwd_*.log" archiveArtifacts "perf_fmha_bwd_*.log" - stash name: "perf_fmha_fwd_gfx942.log" - stash name: "perf_fmha_bwd_gfx942.log" - stash name: "perf_fmha_fwd_gfx90a.log" - stash name: "perf_fmha_bwd_gfx90a.log" + stash includes: "perf_fmha_**_gfx942.log", name: "perf_fmha_log_gfx942" + stash includes: "perf_fmha_**_gfx90a.log", name: "perf_fmha_log_gfx90a" } catch(Exception err){ echo "could not locate the requested artifacts: ${err.getMessage()}. will skip the stashing." 
@@ -408,128 +406,6 @@ def buildHipClangJobAndReboot(Map conf=[:]){ } } -def runCKProfiler(Map conf=[:]){ - show_node_info() - - env.HSA_ENABLE_SDMA=0 - checkout scm - - def image = getDockerImageName() - def prefixpath = conf.get("prefixpath", "/opt/rocm") - - // Jenkins is complaining about the render group - def dockerOpts="--device=/dev/kfd --device=/dev/dri --group-add video --group-add render --cap-add=SYS_PTRACE --security-opt seccomp=unconfined" - if (conf.get("enforce_xnack_on", false)) { - dockerOpts = dockerOpts + " --env HSA_XNACK=1 " - } - def video_id = sh(returnStdout: true, script: 'getent group video | cut -d: -f3') - def render_id = sh(returnStdout: true, script: 'getent group render | cut -d: -f3') - dockerOpts = dockerOpts + " --group-add=${video_id} --group-add=${render_id} " - echo "Docker flags: ${dockerOpts}" - - def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' " - - def variant = env.STAGE_NAME - def retimage - - gitStatusWrapper(credentialsId: "${env.ck_git_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCm', repo: 'composable_kernel') { - try { - (retimage, image) = getDockerImage(conf) - withDockerContainer(image: image, args: dockerOpts) { - timeout(time: 5, unit: 'MINUTES'){ - sh 'rocminfo | tee rocminfo.log' - if ( !runShell('grep -n "gfx" rocminfo.log') ){ - throw new Exception ("GPU not found") - } - else{ - echo "GPU is OK" - } - } - } - } - catch (org.jenkinsci.plugins.workflow.steps.FlowInterruptedException e){ - echo "The job was cancelled or aborted" - throw e - } - - withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') { - timeout(time: 24, unit: 'HOURS') - { - sh """ - rm -rf build - mkdir build - """ - dir("build"){ - unstash 'ckProfiler.tar.gz' - sh 'tar -xvf ckProfiler.tar.gz' - } - - dir("script"){ - if 
(params.RUN_FULL_QA){ - sh "./run_full_performance_tests.sh 0 QA_${params.COMPILER_VERSION} ${env.BRANCH_NAME} ${NODE_NAME}" - archiveArtifacts "perf_gemm.log" - archiveArtifacts "perf_resnet50_N256.log" - archiveArtifacts "perf_resnet50_N4.log" - archiveArtifacts "perf_batched_gemm.log" - archiveArtifacts "perf_grouped_gemm.log" - archiveArtifacts "perf_grouped_conv_fwd.log" - archiveArtifacts "perf_grouped_conv_bwd_data.log" - archiveArtifacts "perf_grouped_conv_bwd_weight.log" - archiveArtifacts "perf_gemm_bilinear.log" - archiveArtifacts "perf_reduction.log" - archiveArtifacts "perf_splitK_gemm.log" - archiveArtifacts "perf_onnx_gemm.log" - archiveArtifacts "perf_mixed_gemm.log" - // stash perf files to master - stash name: "perf_gemm.log" - stash name: "perf_resnet50_N256.log" - stash name: "perf_resnet50_N4.log" - stash name: "perf_batched_gemm.log" - stash name: "perf_grouped_gemm.log" - stash name: "perf_grouped_conv_fwd.log" - stash name: "perf_grouped_conv_bwd_data.log" - stash name: "perf_grouped_conv_bwd_weight.log" - stash name: "perf_gemm_bilinear.log" - stash name: "perf_reduction.log" - stash name: "perf_splitK_gemm.log" - stash name: "perf_onnx_gemm.log" - stash name: "perf_mixed_gemm.log" - //we will process results on the master node - } - else{ - sh "./run_performance_tests.sh 0 CI_${params.COMPILER_VERSION} ${env.BRANCH_NAME} ${NODE_NAME}" - archiveArtifacts "perf_gemm.log" - archiveArtifacts "perf_resnet50_N256.log" - archiveArtifacts "perf_resnet50_N4.log" - // stash perf files to master - stash name: "perf_gemm.log" - stash name: "perf_resnet50_N256.log" - stash name: "perf_resnet50_N4.log" - //we will process the results on the master node - } - } - } - } - } - return retimage -} - -def runPerfTest(Map conf=[:]){ - try{ - runCKProfiler(conf) - } - catch(e){ - echo "throwing error exception in performance tests" - echo 'Exception occurred: ' + e.toString() - throw e - } - finally{ - if (!conf.get("no_reboot", false)) { - reboot() - } - } -} 
- def Build_CK(Map conf=[:]){ show_node_info() @@ -589,36 +465,95 @@ def Build_CK(Map conf=[:]){ throw e } withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') { - timeout(time: 24, unit: 'HOURS') + timeout(time: 12, unit: 'HOURS') { //check whether to run performance tests on this node - def do_perf_tests = 0 + def arch_type = 0 sh 'rocminfo | tee rocminfo.log' - if ( runShell('grep -n "gfx1030" rocminfo.log') || runShell('grep -n "gfx1101" rocminfo.log') || runShell('grep -n "gfx1201" rocminfo.log') || runShell('grep -n "gfx942" rocminfo.log') ){ - do_perf_tests = 1 - echo "Stash profiler and run performance tests" + if ( runShell('grep -n "gfx90a" rocminfo.log') ){ + arch_type = 1 + } + else if ( runShell('grep -n "gfx942" rocminfo.log') ) { + arch_type = 2 + } + else if ( runShell('grep -n "gfx1030" rocminfo.log') ) { + arch_type = 3 + } + else if ( runShell('grep -n "gfx1101" rocminfo.log') ) { + arch_type = 4 + } + else if ( runShell('grep -n "gfx1201" rocminfo.log') ) { + arch_type = 5 } cmake_build(conf) dir("build"){ - //run tests and examples - //sh 'make -j check' - if (params.RUN_PERFORMANCE_TESTS && do_perf_tests == 0 ){ - //we only need the ckProfiler to run the performance tests, so we pack and stash it - //do not stash profiler on nodes where we don't need to run performance tests - sh 'tar -zcvf ckProfiler.tar.gz bin/ckProfiler' - stash name: "ckProfiler.tar.gz" - } - if (params.RUN_FULL_QA && do_perf_tests == 0 ){ - // build deb packages for all gfx9 targets and prepare to export + if (params.RUN_FULL_QA && arch_type == 1 ){ + // build deb packages for all gfx9 targets on gfx90a system and prepare to export + echo "Build ckProfiler package" sh 'make -j package' archiveArtifacts artifacts: 'composablekernel-ckprofiler_*.deb' - archiveArtifacts artifacts: 'composablekernel-tests_*.deb' sh 'mv composablekernel-ckprofiler_*.deb ckprofiler_0.2.0_amd64.deb' - stash name: "ckprofiler_0.2.0_amd64.deb" + stash includes: 
"ckprofiler_0.2.0_amd64.deb", name: "ckprofiler_0.2.0_amd64.deb" + } + } + // run performance tests, stash the logs, results will be processed on the master node + dir("script"){ + if (params.RUN_PERFORMANCE_TESTS){ + if (params.RUN_FULL_QA && arch_type == 1){ + // run full tests on gfx90a + echo "Run full performance tests" + sh "./run_full_performance_tests.sh 0 QA_${params.COMPILER_VERSION} ${env.BRANCH_NAME} ${NODE_NAME}" + archiveArtifacts "perf_gemm.log" + archiveArtifacts "perf_resnet50_N256.log" + archiveArtifacts "perf_resnet50_N4.log" + archiveArtifacts "perf_batched_gemm.log" + archiveArtifacts "perf_grouped_gemm.log" + archiveArtifacts "perf_grouped_conv_fwd.log" + archiveArtifacts "perf_grouped_conv_bwd_data.log" + archiveArtifacts "perf_grouped_conv_bwd_weight.log" + archiveArtifacts "perf_gemm_bilinear.log" + archiveArtifacts "perf_reduction.log" + archiveArtifacts "perf_splitK_gemm.log" + archiveArtifacts "perf_onnx_gemm.log" + archiveArtifacts "perf_mixed_gemm.log" + stash includes: "perf_**.log", name: "perf_log" + } + else if ( arch_type == 1 ){ + // run standard tests on gfx90a + echo "Run performance tests" + sh "./run_performance_tests.sh 0 CI_${params.COMPILER_VERSION} ${env.BRANCH_NAME} ${NODE_NAME}" + archiveArtifacts "perf_gemm.log" + archiveArtifacts "perf_onnx_gemm.log" + archiveArtifacts "perf_resnet50_N256.log" + archiveArtifacts "perf_resnet50_N4.log" + stash includes: "perf_**.log", name: "perf_log" + } + // disable performance tests on gfx1030 for now. 
+ //else if ( arch_type == 3){ + // run basic tests on gfx1030 + // echo "Run gemm performance tests" + // sh "./run_gemm_performance_tests.sh 0 CI_${params.COMPILER_VERSION} ${env.BRANCH_NAME} ${NODE_NAME} gfx10" + // archiveArtifacts "perf_onnx_gemm_gfx10.log" + // stash includes: "perf_onnx_gemm_gfx10.log", name: "perf_log_gfx10" + //} + else if ( arch_type == 4){ + // run basic tests on gfx11 + echo "Run gemm performance tests" + sh "./run_gemm_performance_tests.sh 0 CI_${params.COMPILER_VERSION} ${env.BRANCH_NAME} ${NODE_NAME} gfx11" + archiveArtifacts "perf_onnx_gemm_gfx11.log" + stash includes: "perf_onnx_gemm_gfx11.log", name: "perf_log_gfx11" + } + else if ( arch_type == 5 ){ + // run basic tests on gfx12 + echo "Run gemm performance tests" + sh "./run_gemm_performance_tests.sh 0 CI_${params.COMPILER_VERSION} ${env.BRANCH_NAME} ${NODE_NAME} gfx12" + archiveArtifacts "perf_onnx_gemm_gfx12.log" + stash includes: "perf_onnx_gemm_gfx12.log", name: "perf_log_gfx12" + } } } - if (params.hipTensor_test && do_perf_tests == 0 ){ - //build and test hipTensor + if (params.hipTensor_test && arch_type == 1 ){ + // build and test hipTensor on gfx90a node sh """#!/bin/bash rm -rf "${params.hipTensor_branch}".zip rm -rf hipTensor-"${params.hipTensor_branch}" @@ -690,10 +625,8 @@ def process_results(Map conf=[:]){ dir("script"){ if (params.RUN_CK_TILE_FMHA_TESTS){ try{ - unstash "perf_fmha_fwd_gfx942.log" - unstash "perf_fmha_bwd_gfx942.log" - unstash "perf_fmha_fwd_gfx90a.log" - unstash "perf_fmha_bwd_gfx90a.log" + unstash "perf_fmha_log_gfx942" + unstash "perf_fmha_log_gfx90a" } catch(Exception err){ echo "could not locate the FMHA performance logs: ${err.getMessage()}." 
@@ -703,26 +636,26 @@ def process_results(Map conf=[:]){ // unstash perf files to master unstash "ckprofiler_0.2.0_amd64.deb" sh "sshpass -p ${env.ck_deb_pw} scp -o StrictHostKeyChecking=no ckprofiler_0.2.0_amd64.deb ${env.ck_deb_user}@${env.ck_deb_ip}:/var/www/html/composable_kernel/" - unstash "perf_gemm.log" - unstash "perf_resnet50_N256.log" - unstash "perf_resnet50_N4.log" - unstash "perf_batched_gemm.log" - unstash "perf_grouped_gemm.log" - unstash "perf_grouped_conv_fwd.log" - unstash "perf_grouped_conv_bwd_data.log" - unstash "perf_grouped_conv_bwd_weight.log" - unstash "perf_gemm_bilinear.log" - unstash "perf_reduction.log" - unstash "perf_splitK_gemm.log" - unstash "perf_onnx_gemm.log" - unstash "perf_mixed_gemm.log" + unstash "perf_log" + try{ + unstash "perf_log_gfx11" + unstash "perf_log_gfx12" + } + catch(Exception err){ + echo "could not locate the GEMM gfx11/gfx12 performance logs: ${err.getMessage()}." + } sh "./process_qa_data.sh" } else{ // unstash perf files to master - unstash "perf_gemm.log" - unstash "perf_resnet50_N256.log" - unstash "perf_resnet50_N4.log" + unstash "perf_log" + try{ + unstash "perf_log_gfx11" + unstash "perf_log_gfx12" + } + catch(Exception err){ + echo "could not locate the GEMM gfx11/gfx12 performance logs: ${err.getMessage()}." 
+ } sh "./process_perf_data.sh" } } @@ -1241,29 +1174,6 @@ pipeline { } } } - - stage("Performance Tests") - { - parallel - { - stage("Run ckProfiler: gfx90a") - { - when { - beforeAgent true - expression { params.RUN_PERFORMANCE_TESTS.toBoolean() && !params.BUILD_LEGACY_OS.toBoolean() } - } - options { retry(1) } - agent{ label rocmnode("gfx90a")} - environment{ - setup_args = "NO_CK_BUILD" - } - steps{ - runPerfTest(setup_args:setup_args, config_targets: "ckProfiler", no_reboot:true, build_type: 'Release') - cleanWs() - } - } - } - } stage("Process Performance Test Results") { parallel diff --git a/library/src/tensor_operation_instance/gpu/mha/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/mha/CMakeLists.txt index a53fde166..0457588ea 100644 --- a/library/src/tensor_operation_instance/gpu/mha/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/mha/CMakeLists.txt @@ -6,7 +6,7 @@ set(CK_TILE_SRC_FOLDER ${CMAKE_SOURCE_DIR}/include/ck_tile/) # CK Codegen requires dataclass which is added in Python 3.7 # Python version 3.8 is required for general good practice as it is default for Ubuntu 20.04 if(NOT CK_USE_ALTERNATIVE_PYTHON) - find_package(PythonInterp 3 REQUIRED) + find_package(Python3 COMPONENTS Interpreter Development) else() message("Using alternative python version") set(EXTRA_PYTHON_PATH) @@ -33,7 +33,7 @@ set(FMHA_KNOWN_APIS "fwd,fwd_splitkv,fwd_appendkv,bwd") # Note: The receipt 3 arg filters the generated backwards instances to reduce compilation time. # With receipt 3 set, we are generating instances for datatype == {fp16 || bfp16}, bias == {no || alibi}, deterministic == off, and dpad == dvpad. 
execute_process( - COMMAND ${PYTHON_EXECUTABLE} ${FMHA_SRC_FOLDER}/generate.py + COMMAND ${Python3_EXECUTABLE} ${FMHA_SRC_FOLDER}/generate.py --list_blobs ${FMHA_CPP_FOLDER}/blob_list.txt --api ${FMHA_KNOWN_APIS} --receipt 3 @@ -50,7 +50,7 @@ endif() # With receipt 3 set, we are generating instances for datatype == {fp16 || bfp16}, bias == {no || alibi}, deterministic == off, and dpad == dvpad. add_custom_command( OUTPUT ${FMHA_GEN_BLOBS} - COMMAND ${PYTHON_EXECUTABLE} ${FMHA_SRC_FOLDER}/generate.py + COMMAND ${Python3_EXECUTABLE} ${FMHA_SRC_FOLDER}/generate.py --output_dir ${FMHA_CPP_FOLDER} --api ${FMHA_KNOWN_APIS} --receipt 3 diff --git a/script/process_perf_data.py b/script/process_perf_data.py index 3892206e4..fbfec94ee 100644 --- a/script/process_perf_data.py +++ b/script/process_perf_data.py @@ -82,7 +82,7 @@ def parse_logfile(logfile): StrideA=[] StrideB=[] StrideC=[] - if 'perf_gemm.log' in logfile: + if 'perf_gemm' in logfile and 'gemm_bilinear' not in logfile: for line in open(logfile): if 'Best Perf' in line: lst=line.split() @@ -260,7 +260,7 @@ def main(): conn = sqlEngine.connect() #save gemm performance tests: - if 'perf_gemm.log' in filename: + if 'perf_gemm' in filename and 'gemm_bilinear' not in filename: #write the ck_gemm_test_params table only needed once the test set changes #post_test_params(test_list,conn) for i in range(1,len(results)+1): diff --git a/script/process_perf_data.sh b/script/process_perf_data.sh index af1e7e7a0..ae9346320 100755 --- a/script/process_perf_data.sh +++ b/script/process_perf_data.sh @@ -11,9 +11,22 @@ #process results python3 process_perf_data.py perf_gemm.log +python3 process_perf_data.py perf_onnx_gemm.log python3 process_perf_data.py perf_resnet50_N256.log python3 process_perf_data.py perf_resnet50_N4.log +file=./perf_onnx_gemm_gfx10.log +if [ -e "$file" ]; then + python3 process_perf_data.py perf_onnx_gemm_gfx10.log +fi +file=./perf_onnx_gemm_gfx11.log +if [ -e "$file" ]; then + python3 process_perf_data.py 
perf_onnx_gemm_gfx11.log +fi +file=./perf_onnx_gemm_gfx12.log +if [ -e "$file" ]; then + python3 process_perf_data.py perf_onnx_gemm_gfx12.log +fi file=./perf_fmha_fwd_gfx942.log if [ -e "$file" ]; then python3 process_perf_data.py perf_fmha_fwd_gfx942.log diff --git a/script/process_qa_data.sh b/script/process_qa_data.sh index c9a1645f6..fb8fe01c6 100755 --- a/script/process_qa_data.sh +++ b/script/process_qa_data.sh @@ -24,6 +24,18 @@ python3 process_perf_data.py perf_splitK_gemm.log python3 process_perf_data.py perf_onnx_gemm.log python3 process_perf_data.py perf_mixed_gemm.log +file=./perf_onnx_gemm_gfx10.log +if [ -e "$file" ]; then + python3 process_perf_data.py perf_onnx_gemm_gfx10.log +fi +file=./perf_onnx_gemm_gfx11.log +if [ -e "$file" ]; then + python3 process_perf_data.py perf_onnx_gemm_gfx11.log +fi +file=./perf_onnx_gemm_gfx12.log +if [ -e "$file" ]; then + python3 process_perf_data.py perf_onnx_gemm_gfx12.log +fi file=./perf_fmha_fwd_gfx942.log if [ -e "$file" ]; then python3 process_perf_data.py perf_fmha_fwd_gfx942.log diff --git a/script/run_full_performance_tests.sh b/script/run_full_performance_tests.sh index e167ce012..ddc5c270b 100755 --- a/script/run_full_performance_tests.sh +++ b/script/run_full_performance_tests.sh @@ -5,7 +5,7 @@ # post your new test results to the database and compare them to the baseline # please contact Illia.Silin@amd.com for more details # -# run the script as "./run_full_performance_tests.sh < node name> +# run the script as "./run_full_performance_tests.sh # input arguments: # verification = 0 : do not verify result correctness on CPU # = 1 : verifuy correctness on CPU (may take a long time) diff --git a/script/run_gemm_performance_tests.sh b/script/run_gemm_performance_tests.sh new file mode 100755 index 000000000..12adad30f --- /dev/null +++ b/script/run_gemm_performance_tests.sh @@ -0,0 +1,41 @@ +#!/bin/bash +# +# in order to run this script you'd first need to build the ckProfiler executable in ../build/bin/ +# 
run the script as "./run_gemm_performance_tests.sh +# input arguments: +# verification = 0 : do not verify result correctness on CPU +# = 1 : verify correctness on CPU (may take a long time) +# environment tag : a string describing the specifics of your test environment +# branch name : name of the branch in git repo (git status | grep -e 'On branch') +# node name : $hostname +# arch : GPU architecture, e.g. "gfx9" or "gfx1100" + +#get the command line arguments: +export verify=$1 +echo 'Verification: ' $verify +export env_type=$2 +echo 'Environment type: ' $env_type +export branch=$3 +echo 'Branch name: ' $branch +export host_name=$4 +echo 'Host name: ' $host_name +export arch=$5 +echo 'GPU architecture: ' $arch + +function print_log_header(){ + rm -f $1; + echo 'On branch ' $3 &> $1; + echo 'Node name: ' $4 >> $1; + #get GPU_arch and number of compute units from rocminfo + echo -n "GPU_arch: " >> $1; rocminfo | grep "Name:" | grep "gfx" >> $1; + rocminfo | grep "Compute Unit:" >> $1; + hipcc --version | grep -e 'HIP version' >> $1; + echo 'Environment type: ' $2 >> $1; + /opt/rocm/bin/amdclang++ --version | grep -e 'InstalledDir' >> $1; +} + +#run ONNX gemm tests +export onnx_log="perf_onnx_gemm_$arch.log" +print_log_header $onnx_log $env_type $branch $host_name +./profile_onnx_gemm.sh gemm 0 0 $verify 1 0 1 2>&1 | tee -a $onnx_log +./profile_onnx_gemm.sh gemm 1 0 $verify 1 0 1 2>&1 | tee -a $onnx_log diff --git a/script/run_performance_tests.sh b/script/run_performance_tests.sh index 317d27098..c8a281dc0 100755 --- a/script/run_performance_tests.sh +++ b/script/run_performance_tests.sh @@ -1,7 +1,7 @@ #!/bin/bash # # in order to run this script you'd first need to build the ckProfiler executable in ../build/bin/ -# run the script as "./run_performance_tests.sh < node name> +# run the script as "./run_performance_tests.sh # input arguments: # verification = 0 : do not verify result correctness on CPU # = 1 : verify correctness on CPU (may take a long time) @@ 
-51,20 +51,11 @@ print_log_header $gemm_log $env_type $branch $host_name ./profile_gemm.sh gemm 2 3 $verify 1 0 1 | tee -a $gemm_log ./profile_gemm.sh gemm 3 3 $verify 1 0 1 | tee -a $gemm_log -#run grouped_fwd fp16 tests -export grouped_conv_fwd_log="perf_grouped_conv_fwd_fp16.log" -print_log_header $conv_fwd_log $env_type $branch $host_name -./profile_grouped_conv_fwd.sh grouped_conv_fwd 1 1 0 $verify 1 0 1 256 2>&1 | tee -a $grouped_conv_fwd_log - -#run grouped_bwd_data fp16 tests -export grouped_conv_bwd_data_log="perf_grouped_conv_bwd_data_fp16.log" -print_log_header $grouped_conv_bwd_data_log $env_type $branch $host_name -./profile_grouped_conv_bwd_data.sh grouped_conv_bwd_data 1 1 $verify 1 0 1 256 2>&1 | tee -a $grouped_conv_bwd_data_log - -#run grouped_bwd_weight fp16 tests -export grouped_conv_bwd_weight_log="perf_grouped_conv_bwd_weight_fp16.log" -print_log_header $grouped_conv_bwd_weight_log $env_type $branch $host_name -./profile_grouped_conv_bwd_weight.sh grouped_conv_bwd_weight 1 1 $verify 1 0 1 256 1 2>&1 | tee -a $grouped_conv_bwd_weight_log +#run ONNX gemm tests +export onnx_log="perf_onnx_gemm.log" +print_log_header $onnx_log $env_type $branch $host_name +./profile_onnx_gemm.sh gemm 0 0 $verify 1 0 1 2>&1 | tee -a $onnx_log +./profile_onnx_gemm.sh gemm 1 0 $verify 1 0 1 2>&1 | tee -a $onnx_log #run resnet50 tests export resnet256_log="perf_resnet50_N256.log" -- GitLab From c773cc25a235dbc3c044b9cf7fb32910bc8fcae0 Mon Sep 17 00:00:00 2001 From: Illia Silin <98187287+illsilin@users.noreply.github.com> Date: Mon, 9 Dec 2024 08:50:36 -0800 Subject: [PATCH 108/153] remove unnecessary file (#1732) --- modified_files.txt | 10 ---------- 1 file changed, 10 deletions(-) delete mode 100755 modified_files.txt diff --git a/modified_files.txt b/modified_files.txt deleted file mode 100755 index 34a42e3f3..000000000 --- a/modified_files.txt +++ /dev/null @@ -1,10 +0,0 @@ -example/01_gemm/gemm_xdl_fp8_streamk_v3.cpp 
-example/01_gemm/run_gemm_example_streamk_v2.inc -include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_streamk_v3.hpp -include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_streamk_v3.hpp -library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v2_mnkpadding_instance.cpp -library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_mnpadding_instance.cpp -library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v2_mnkpadding_instance.cpp -library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_mnpadding_instance.cpp -profiler/src/profile_gemm_universal_streamk.cpp -modified_files.txt -- GitLab From 2f088b870764d406ec453987198deb298f3e9e3a Mon Sep 17 00:00:00 2001 From: Illia Silin <98187287+illsilin@users.noreply.github.com> Date: Mon, 9 Dec 2024 09:32:14 -0800 Subject: [PATCH 109/153] update CI timeout limits (#1733) --- Jenkinsfile | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 0a98cc5c6..cb344e8a5 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -377,7 +377,7 @@ def buildHipClangJob(Map conf=[:]){ gitStatusWrapper(credentialsId: "${env.ck_git_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCm', repo: 'composable_kernel') { withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') { - timeout(time: 48, unit: 'HOURS') + timeout(time: 20, unit: 'HOURS') { cmake_build(conf) } @@ -449,7 +449,7 @@ def Build_CK(Map conf=[:]){ try { (retimage, image) = getDockerImage(conf) withDockerContainer(image: image, args: 
dockerOpts) { - timeout(time: 5, unit: 'MINUTES'){ + timeout(time: 2, unit: 'MINUTES'){ sh 'rocminfo | tee rocminfo.log' if ( !runShell('grep -n "gfx" rocminfo.log') ){ throw new Exception ("GPU not found") @@ -465,7 +465,7 @@ def Build_CK(Map conf=[:]){ throw e } withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') { - timeout(time: 12, unit: 'HOURS') + timeout(time: 20, unit: 'HOURS') { //check whether to run performance tests on this node def arch_type = 0 @@ -620,7 +620,7 @@ def process_results(Map conf=[:]){ } withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') { - timeout(time: 1, unit: 'HOURS'){ + timeout(time: 15, unit: 'MINUTES'){ try{ dir("script"){ if (params.RUN_CK_TILE_FMHA_TESTS){ -- GitLab From 23cf2026b496140e73a2990199f79e6257b228c7 Mon Sep 17 00:00:00 2001 From: Illia Silin <98187287+illsilin@users.noreply.github.com> Date: Mon, 9 Dec 2024 14:11:20 -0800 Subject: [PATCH 110/153] build CI for gfx12 by default (#1734) --- Jenkinsfile | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index cb344e8a5..f118d4e45 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -675,8 +675,8 @@ def process_results(Map conf=[:]){ //launch develop branch daily at 23:00 UT in FULL_QA mode and at 19:00 UT with latest staging compiler version CRON_SETTINGS = BRANCH_NAME == "develop" ? 
'''0 23 * * * % RUN_FULL_QA=true;ROCMVERSION=6.3;RUN_CK_TILE_FMHA_TESTS=true;RUN_CK_TILE_GEMM_TESTS=true 0 21 * * * % ROCMVERSION=6.3;hipTensor_test=true;RUN_CODEGEN_TESTS=true - 0 19 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-staging;BUILD_COMPILER=/llvm-project/build/bin/clang++;BUILD_GFX12=true;USE_SCCACHE=false;NINJA_BUILD_TRACE=true - 0 17 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-mainline;BUILD_COMPILER=/llvm-project/build/bin/clang++;BUILD_GFX12=true;USE_SCCACHE=false;NINJA_BUILD_TRACE=true + 0 19 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-staging;BUILD_COMPILER=/llvm-project/build/bin/clang++;USE_SCCACHE=false;NINJA_BUILD_TRACE=true + 0 17 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-mainline;BUILD_COMPILER=/llvm-project/build/bin/clang++;USE_SCCACHE=false;NINJA_BUILD_TRACE=true 0 15 * * * % BUILD_INSTANCES_ONLY=true;RUN_PERFORMANCE_TESTS=false;USE_SCCACHE=false 0 13 * * * % BUILD_LEGACY_OS=true''' : "" @@ -763,8 +763,8 @@ pipeline { description: "Test building instances for various architectures simultaneously (default: OFF)") booleanParam( name: "BUILD_GFX12", - defaultValue: false, - description: "Build CK and run tests on gfx12 (default: OFF)") + defaultValue: true, + description: "Build CK and run tests on gfx12 (default: ON)") booleanParam( name: "NINJA_BUILD_TRACE", defaultValue: false, -- GitLab From 94ae7113bd05e3c39364193dba1b391a4c54a2f4 Mon Sep 17 00:00:00 2001 From: rocking Date: Tue, 10 Dec 2024 11:36:18 +0800 Subject: [PATCH 111/153] [CK TILE] Use config name instead of data type in FmhaFwdTypeConfig (#1731) * Add data type config, Prepare to add mix precision in the future * Fix compile error --- .../ck_tile/01_fmha/codegen/cpp_symbol_map.py | 15 ++- .../ck_tile/01_fmha/codegen/ops/fmha_bwd.py | 14 +-- .../ck_tile/01_fmha/codegen/ops/fmha_fwd.py | 11 ++- .../01_fmha/codegen/ops/fmha_fwd_appendkv.py | 9 +- .../01_fmha/codegen/ops/fmha_fwd_splitkv.py | 27 ++--- 
example/ck_tile/01_fmha/fmha_bwd.cpp | 14 +-- example/ck_tile/01_fmha/fmha_bwd.hpp | 12 ++- example/ck_tile/01_fmha/fmha_fwd.cpp | 99 ++++++++++--------- example/ck_tile/01_fmha/fmha_fwd.hpp | 32 +++++- 9 files changed, 142 insertions(+), 91 deletions(-) diff --git a/example/ck_tile/01_fmha/codegen/cpp_symbol_map.py b/example/ck_tile/01_fmha/codegen/cpp_symbol_map.py index 66691356a..f6df44a31 100644 --- a/example/ck_tile/01_fmha/codegen/cpp_symbol_map.py +++ b/example/ck_tile/01_fmha/codegen/cpp_symbol_map.py @@ -2,10 +2,17 @@ # Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. # generate kernel instances to speed up compilation -DTYPE_MAP = { - "fp16": "ck_tile::fp16_t", - "bf16": "ck_tile::bf16_t", - "fp8" : "ck_tile::fp8_t" +FWD_DTYPE_MAP = { + "fp16" : "FmhaFwdFp16", + "bf16" : "FmhaFwdBf16", + "fp8" : "FmhaFwdFp8", + "fp8fp16": "FmhaFwdFp8Fp16", + "fp8bf16": "FmhaFwdFp8Bf16" +} + +BWD_DTYPE_MAP = { + "fp16": "FmhaBwdFp16", + "bf16": "FmhaBwdBf16" } MASK_IMPL = { diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_bwd.py b/example/ck_tile/01_fmha/codegen/ops/fmha_bwd.py index 096394c0c..83a1e82d6 100644 --- a/example/ck_tile/01_fmha/codegen/ops/fmha_bwd.py +++ b/example/ck_tile/01_fmha/codegen/ops/fmha_bwd.py @@ -283,7 +283,7 @@ class FmhaBwdApiPool: inners = inners + FMHA_BWD_API_INNER_DISPATCH.format(F_if=if_k, F_mode=MODE_MAP[trait.mode], F_pipeline_enum=BWD_DQDKDV_PIPELINE_ENUM_MAP[trait.pipeline], F_mask_check=get_mask_check_map(self.mask_impl)[trait.mask], F_mask=get_mask_map(self.mask_impl)[trait.mask], F_bias_check=BIAS_CHECK_MAP[trait.bias], F_bias=BIAS_MAP[trait.bias], F_dbias=BOOL_MAP[trait.dbias], F_dropout_check=DROPOUT_CHECK_MAP[trait.dropout], F_dropout=DROPOUT_MAP[trait.dropout], - F_scheck=trait.scheck(spad1=spad1), F_skcheck=trait.skcheck, F_dcheck=trait.dcheck, F_dvcheck=trait.dvcheck, F_hdim=hdim, F_dtype=DTYPE_MAP[dtype], + F_scheck=trait.scheck(spad1=spad1), F_skcheck=trait.skcheck, F_dcheck=trait.dcheck, 
F_dvcheck=trait.dvcheck, F_hdim=hdim, F_dtype=BWD_DTYPE_MAP[dtype], F_spad0=BOOL_MAP[trait.spad], F_spad1=BOOL_MAP[spad1], F_skpad=BOOL_MAP[trait.skpad], F_dpad=BOOL_MAP[trait.dpad], F_dvpad=BOOL_MAP[trait.dvpad], F_deterministic=BOOL_MAP[trait.deterministic]) @@ -360,7 +360,7 @@ class FmhaBwdDQDKDVKernel: FMHA_BWD_DQ_DK_DV_KERNEL_BODY.format( F_idx = self.F_idx, F_hdim = self.F_hdim, - F_dtype = DTYPE_MAP[self.F_dtype], + F_dtype = BWD_DTYPE_MAP[self.F_dtype], F_bm0 = self.F_tile.F_bm0, F_bn0 = self.F_tile.F_bn0, F_bk0 = self.F_tile.F_bk0, @@ -469,7 +469,7 @@ def get_bwd_dq_dk_dv_blobs(kernel_filter : Optional[str], receipt, mask_impl) -> gen = list() api_pool = FmhaBwdApiPool(mask_impl) - for dtype in DTYPE_MAP.keys(): + for dtype in BWD_DTYPE_MAP.keys(): d = get_fmha_bwd_dq_dk_dv_tile_ppl_dict_from_dtype(dtype) if d == None: continue @@ -585,7 +585,7 @@ class FmhaBwdOGradDotOKernel: FMHA_BWD_DOT_DO_O_KERNEL_BODY.format( F_idx = self.F_idx, F_hdim = self.F_hdim, - F_dtype = DTYPE_MAP[self.F_dtype], + F_dtype = BWD_DTYPE_MAP[self.F_dtype], F_spad = BOOL_MAP[self.F_spad], F_dvpad = BOOL_MAP[self.F_dvpad], F_mode = MODE_MAP[self.F_mode], @@ -616,7 +616,7 @@ def get_bwd_dot_do_o_blobs() -> List[FmhaBwdOGradDotOKernel]: gen = list() - for dtype in DTYPE_MAP.keys(): + for dtype in BWD_DTYPE_MAP.keys(): d = get_fmha_bwd_dq_dk_dv_tile_ppl_dict_from_dtype(dtype) if d == None: continue @@ -716,7 +716,7 @@ class FmhaBwdConvertQGradKernel: FMHA_BWD_CONVERT_DQ_KERNEL_BODY.format( F_idx = self.F_idx, F_hdim = self.F_hdim, - F_dtype = DTYPE_MAP[self.F_dtype], + F_dtype = BWD_DTYPE_MAP[self.F_dtype], F_bm0 = self.F_bm0, F_bn0 = self.F_bn0, F_spad = BOOL_MAP[self.F_spad], @@ -751,7 +751,7 @@ def get_bwd_convert_dq_blobs() -> List[FmhaBwdConvertQGradKernel]: gen = list() - for dtype in DTYPE_MAP.keys(): + for dtype in BWD_DTYPE_MAP.keys(): d = get_fmha_bwd_dq_dk_dv_tile_ppl_dict_from_dtype(dtype) if d == None: continue diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py 
b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py index e5ee1d22e..eca638784 100644 --- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py +++ b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py @@ -282,7 +282,7 @@ class FmhaFwdApiPool: F_squant=BOOL_MAP[trait.squant], F_scheck=trait.scheck, F_skcheck=trait.skcheck, F_dcheck=trait.dcheck, F_dvcheck=trait.dvcheck, F_spad=BOOL_MAP[trait.spad], F_skpad=BOOL_MAP[trait.skpad], F_dpad=BOOL_MAP[trait.dpad], F_dvpad=BOOL_MAP[trait.dvpad], F_bm0=trait.bm0, F_bn0=trait.bn0, F_bk0=trait.bk0, F_bn1=trait.bn1, F_bk1=trait.bk1, F_bk0max=trait.bk0max, - F_hdim=hdim, F_dtype=DTYPE_MAP[dtype]) + F_hdim=hdim, F_dtype=FWD_DTYPE_MAP[dtype]) if_j = 'if' if j == 0 else 'else if' per_hdim_case = per_hdim_case + FMHA_FWD_API_PER_HDIM_CASE.format(F_if=if_j, F_hdim=hdim, F_inner_dispatch=inners) if_i = 'if' if i == 0 else 'else if' @@ -301,7 +301,7 @@ class FmhaFwdTileSize: F_bk1 : int # tile size along kv gemm unroll F_bk0max : int # total length of K0, used for pipeline that need load Q at once (or repeately load Q as a whole tile) F_rm0 : int # number of warps for gemm0 along q seqlen - F_rn0 : int # number of warps for gemm0 along k seqlen + F_rn0 : int # number of warps for gemm0 along k seqlen F_rk0 : int # number of warps for gemm0 along head dim q (not used) F_rm1 : int # number of warps for gemm1 along q seqlen F_rn1 : int # number of warps for gemm1 along head dim v @@ -339,7 +339,7 @@ class FmhaFwdKernel: FMHA_FWD_KERNEL_BODY.format( F_idx = self.F_idx, F_hdim = self.F_hdim, - F_dtype = DTYPE_MAP[self.F_dtype], + F_dtype = FWD_DTYPE_MAP[self.F_dtype], F_bm0 = self.F_tile.F_bm0, F_bn0 = self.F_tile.F_bn0, F_bk0 = self.F_tile.F_bk0, @@ -462,6 +462,9 @@ def get_fwd_blobs(kernel_filter : Optional[str], receipt, mask_impl) -> Tuple[Fm # no need lse/dropout kernels for mask, bias in itertools.product(get_mask_map(mask_impl).keys(), BIAS_MAP.keys()): pipelines.append(FmhaFwdPipeline('qr', 'col', 'f', 'f', 'f', 'f', bias, 'f', 'f', squant, 
mask)) + elif dtype in ['fp8fp16', 'fp8bf16']: + # TODO + None else: assert False return pipelines @@ -469,7 +472,7 @@ def get_fwd_blobs(kernel_filter : Optional[str], receipt, mask_impl) -> Tuple[Fm gen = list() api_pool = FmhaFwdApiPool(mask_impl) - for dtype in DTYPE_MAP.keys(): + for dtype in FWD_DTYPE_MAP.keys(): d = get_fmha_fwd_tile_dict_from_dtype(dtype) if d == None: continue diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_appendkv.py b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_appendkv.py index cfd1d01c9..fb998a33d 100644 --- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_appendkv.py +++ b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_appendkv.py @@ -181,7 +181,7 @@ class FmhaFwdAppendKVApiPool: inners = inners + FMHA_FWD_APPENDKV_API_INNER_DISPATCH.format(F_if=if_k, F_vlayout=LAYOUT_MAP[trait.vlayout], F_scheck=trait.scheck, F_skcheck=trait.skcheck, F_dcheck=trait.dcheck, F_dvcheck=trait.dvcheck, F_rope_check=ROPE_CHECK_MAP[trait.rope], F_pagedkv=BOOL_MAP[trait.pagedkv], F_spad=BOOL_MAP[trait.spad], F_skpad=BOOL_MAP[trait.skpad], F_dpad=BOOL_MAP[trait.dpad], F_dvpad=BOOL_MAP[trait.dvpad], - F_rope=ROPE_MAP[trait.rope], F_bs=trait.bs, F_bsk=trait.bsk, F_bd=trait.bd, F_bdv=trait.bdv, F_hdim=hdim, F_dtype=DTYPE_MAP[dtype]) + F_rope=ROPE_MAP[trait.rope], F_bs=trait.bs, F_bsk=trait.bsk, F_bd=trait.bd, F_bdv=trait.bdv, F_hdim=hdim, F_dtype=FWD_DTYPE_MAP[dtype]) if_j = 'if' if j == 0 else 'else if' per_hdim_case = per_hdim_case + FMHA_FWD_API_PER_HDIM_CASE.format(F_if=if_j, F_hdim=hdim, F_inner_dispatch=inners) if_i = 'if' if i == 0 else 'else if' @@ -216,7 +216,7 @@ class FmhaFwdAppendKVKernel: FMHA_FWD_APPENDKV_KERNEL_BODY.format( F_idx = self.F_idx, F_hdim = self.F_hdim, - F_dtype = DTYPE_MAP[self.F_dtype], + F_dtype = FWD_DTYPE_MAP[self.F_dtype], F_bs = self.F_tile.F_bs, F_bsk = self.F_tile.F_bsk, F_bd = self.F_tile.F_bd, @@ -301,6 +301,9 @@ def get_fwd_appendkv_blobs(kernel_filter : Optional[str], receipt, mask_impl) -> elif dtype in ['fp8', 
'bf8']: # rope/paged-kv is not supported pipelines.append(FmhaFwdAppendKVPipeline('col', 't', 't', 't', 't', 'no', 'f')) + elif dtype in ['fp8fp16', 'fp8bf16']: + # TODO + None else: assert False return pipelines @@ -308,7 +311,7 @@ def get_fwd_appendkv_blobs(kernel_filter : Optional[str], receipt, mask_impl) -> gen = list() api_pool = FmhaFwdAppendKVApiPool(mask_impl) - for dtype in DTYPE_MAP.keys(): + for dtype in FWD_DTYPE_MAP.keys(): d = get_fmha_fwd_appendkv_tile_dict_from_dtype(dtype) if d == None: continue diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py index 1c40cf6f3..e448902cf 100644 --- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py +++ b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py @@ -112,7 +112,7 @@ static void run(const ck_tile::stream_config& s, fmha_fwd_splitkv_args a) }} using trait_{F_idx} = fmha_fwd_splitkv_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0max}, {F_vlayout}, - {F_pipeline_enum}, fmha_mask_{F_idx}, {F_bias}, {F_lse}, {F_squant}, {F_pagedkv}, {F_spad}, {F_skpad}, {F_dpad}, + {F_pipeline_enum}, fmha_mask_{F_idx}, {F_bias}, {F_lse}, {F_squant}, {F_pagedkv}, {F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}>; #include @@ -161,7 +161,7 @@ using fmha_pipeline_problem = ck_tile::BlockFmhaSplitKVCombinePipelineProblem< typename FmhaFwdTypeConfig::OaccDataType, typename FmhaFwdTypeConfig::ODataType, {F_hdim}, - {F_bm0}, + {F_bm0}, {F_bn1}, {F_mode}, fmha_trait>; @@ -231,11 +231,11 @@ float fmha_fwd_splitkv_(const ck_tile::stream_config& s, fmha_fwd_splitkv_args a if(s.log_level_ > 0) std::cout << ", " << fmha_fwd_splitkv_get_name_() - << ", " << fmha_fwd_splitkv_combine_get_name_() + << ", " << fmha_fwd_splitkv_combine_get_name_() << std::flush; return ck_tile::launch_kernel(s, - [=](const ck_tile::stream_config& s_){{ fmha_fwd_splitkv_oneshot_(s_, a); }}, + [=](const ck_tile::stream_config& s_){{ 
fmha_fwd_splitkv_oneshot_(s_, a); }}, [=](const ck_tile::stream_config& s_){{ fmha_fwd_splitkv_combine_oneshot_(s_, a); }} ); }} @@ -431,11 +431,11 @@ class FmhaFwdSplitKVApiPool: inners = inners + FMHA_FWD_SPLITKV_API_INNER_DISPATCH.format(F_if=if_k, F_mode=MODE_MAP[trait.mode], F_vlayout=LAYOUT_MAP[trait.vlayout], F_pipeline_enum=PIPELINE_ENUM_MAP[trait.pipeline_tag], F_mask=get_mask_map(self.mask_impl)[trait.mask], F_mask_check=get_mask_check_map(self.mask_impl)[trait.mask], F_bias_check=BIAS_CHECK_MAP[trait.bias], F_bias=BIAS_MAP[trait.bias], - F_lse=BOOL_MAP[trait.lse], F_squant=BOOL_MAP[trait.squant], F_pagedkv=BOOL_MAP[trait.pagedkv], + F_lse=BOOL_MAP[trait.lse], F_squant=BOOL_MAP[trait.squant], F_pagedkv=BOOL_MAP[trait.pagedkv], F_scheck=trait.scheck, F_skcheck=trait.skcheck, F_dcheck=trait.dcheck, F_dvcheck=trait.dvcheck, F_spad=BOOL_MAP[trait.spad], F_skpad=BOOL_MAP[trait.skpad], F_dpad=BOOL_MAP[trait.dpad], F_dvpad=BOOL_MAP[trait.dvpad], F_bm0=trait.bm0, F_bn0=trait.bn0, F_bk0=trait.bk0, F_bn1=trait.bn1, F_bk1=trait.bk1, F_bk0max=trait.bk0max, - F_hdim=hdim, F_dtype=DTYPE_MAP[dtype]) + F_hdim=hdim, F_dtype=FWD_DTYPE_MAP[dtype]) if_j = 'if' if j == 0 else 'else if' per_hdim_case = per_hdim_case + FMHA_FWD_API_PER_HDIM_CASE.format(F_if=if_j, F_hdim=hdim, F_inner_dispatch=inners) if_i = 'if' if i == 0 else 'else if' @@ -472,7 +472,7 @@ class FmhaFwdSplitKVKernel: FMHA_FWD_SPLITKV_KERNEL_BODY.format( F_idx = self.F_idx, F_hdim = self.F_hdim, - F_dtype = DTYPE_MAP[self.F_dtype], + F_dtype = FWD_DTYPE_MAP[self.F_dtype], F_bm0 = self.F_tile.F_bm0, F_bn0 = self.F_tile.F_bn0, F_bk0 = self.F_tile.F_bk0, @@ -492,7 +492,7 @@ class FmhaFwdSplitKVKernel: F_spad = BOOL_MAP[self.F_pipeline.F_spad], F_skpad = BOOL_MAP[self.F_pipeline.F_skpad], F_dpad = BOOL_MAP[self.F_pipeline.F_dpad], - F_dvpad = BOOL_MAP[self.F_pipeline.F_dvpad], + F_dvpad = BOOL_MAP[self.F_pipeline.F_dvpad], F_bias = BIAS_MAP[self.F_pipeline.F_bias], F_lse = BOOL_MAP[self.F_pipeline.F_lse], F_squant = 
BOOL_MAP[self.F_pipeline.F_squant], @@ -552,7 +552,7 @@ class FmhaFwdSplitKVCombineKernel: FMHA_FWD_SPLITKV_COMBINE_KERNEL_BODY.format( F_idx = self.F_idx, F_hdim = self.F_hdim, - F_dtype = DTYPE_MAP[self.F_dtype], + F_dtype = FWD_DTYPE_MAP[self.F_dtype], F_bm0 = self.F_tile.F_bm0, F_bn1 = self.F_tile.F_bn1, F_spad = BOOL_MAP[self.F_pipeline.F_spad], @@ -625,7 +625,7 @@ def get_fwd_splitkv_blobs(kernel_filter : Optional[str], receipt, mask_impl) -> pipelines = [] if dtype in ['fp16', 'bf16']: for mask, bias, pagedkv in itertools.product(get_mask_map(mask_impl).keys(), BIAS_MAP.keys(), ["t", "f"]): - # TODO: use async pipeline when compiler is more stable + # TODO: use async pipeline when compiler is more stable if hdim == 256 or hdim in [32, 64, 128]: ### [32, 64, 96, 128]: # if True: pipelines.append(Pipeline('qr', 'row', 'f', 't', 'f', 'f', bias, 't', squant, pagedkv, mask)) @@ -644,6 +644,9 @@ def get_fwd_splitkv_blobs(kernel_filter : Optional[str], receipt, mask_impl) -> elif dtype in ['fp8', 'bf8']: for mask, bias in itertools.product(get_mask_map(mask_impl).keys(), BIAS_MAP.keys()): pipelines.append(Pipeline('qr', 'col', 'f', 'f', 'f', 'f', bias, 't', squant, 'f', mask)) + elif dtype in ['fp8fp16', 'fp8bf16']: + # TODO + None else: assert False return pipelines @@ -651,7 +654,7 @@ def get_fwd_splitkv_blobs(kernel_filter : Optional[str], receipt, mask_impl) -> gen = list() api_pool = FmhaFwdSplitKVApiPool(mask_impl) - for dtype in DTYPE_MAP.keys(): + for dtype in FWD_DTYPE_MAP.keys(): d = get_fmha_fwd_tile_dict_from_dtype(dtype) if d == None: continue @@ -711,7 +714,7 @@ def get_fwd_splitkv_combine_blobs(kernel_filter : Optional[str], receipt) -> Lis gen = list() - for dtype in DTYPE_MAP.keys(): + for dtype in FWD_DTYPE_MAP.keys(): d = get_fmha_fwd_splitkv_combine_tile_dict_from_dtype(dtype) if d == None: continue diff --git a/example/ck_tile/01_fmha/fmha_bwd.cpp b/example/ck_tile/01_fmha/fmha_bwd.cpp index 2d76627a7..eaf99529f 100644 --- 
a/example/ck_tile/01_fmha/fmha_bwd.cpp +++ b/example/ck_tile/01_fmha/fmha_bwd.cpp @@ -101,7 +101,7 @@ auto create_args(int argc, char* argv[]) } // different threshold for different dtype -template +template auto get_elimit(ck_tile::index_t /*hdim_q*/, ck_tile::index_t /*hdim_v*/) { double rtol = 1e-2; @@ -110,7 +110,7 @@ auto get_elimit(ck_tile::index_t /*hdim_q*/, ck_tile::index_t /*hdim_v*/) } template <> -auto get_elimit(ck_tile::index_t hdim_q, ck_tile::index_t hdim_v) +auto get_elimit(ck_tile::index_t hdim_q, ck_tile::index_t hdim_v) { double rtol = 1e-2; double atol = 1e-2; @@ -122,7 +122,7 @@ auto get_elimit(ck_tile::index_t hdim_q, ck_tile::index_t hdim_ return ck_tile::make_tuple(rtol, atol); } -template +template bool run(const ck_tile::ArgParser& arg_parser) { std::string data_type = arg_parser.get_str("prec"); @@ -209,7 +209,7 @@ bool run(const ck_tile::ArgParser& arg_parser) const auto seqstart_q_host = generate_seqstarts(mode, batch, seqlen_q); const auto seqstart_k_host = generate_seqstarts(mode, batch, seqlen_k); - using TypeConfig = FmhaBwdTypeConfig; + using TypeConfig = FmhaBwdTypeConfig; using QDataType = typename TypeConfig::QDataType; using KDataType = typename TypeConfig::KDataType; @@ -933,7 +933,7 @@ bool run(const ck_tile::ArgParser& arg_parser) } // clang-format on - auto [rtol, atol] = get_elimit(hdim_q, hdim_v); + auto [rtol, atol] = get_elimit(hdim_q, hdim_v); bool dq_cur_pass = ck_tile::check_err(dq_host_result, dq_host_ref, std::string("Error: QGrad Incorrect results!"), @@ -986,11 +986,11 @@ int main(int argc, char* argv[]) const std::string data_type = arg_parser.get_str("prec"); if(data_type == "fp16") { - return run(arg_parser) ? 0 : -2; + return run(arg_parser) ? 0 : -2; } else if(data_type == "bf16") { - return run(arg_parser) ? 0 : -2; + return run(arg_parser) ? 
0 : -2; } return -3; diff --git a/example/ck_tile/01_fmha/fmha_bwd.hpp b/example/ck_tile/01_fmha/fmha_bwd.hpp index 722ef15a2..6204cbcfa 100644 --- a/example/ck_tile/01_fmha/fmha_bwd.hpp +++ b/example/ck_tile/01_fmha/fmha_bwd.hpp @@ -14,11 +14,19 @@ #include #include +struct FmhaBwdFp16 +{ +}; + +struct FmhaBwdBf16 +{ +}; + template struct FmhaBwdTypeConfig; template <> -struct FmhaBwdTypeConfig +struct FmhaBwdTypeConfig { using QDataType = ck_tile::half_t; using KDataType = ck_tile::half_t; @@ -38,7 +46,7 @@ struct FmhaBwdTypeConfig }; template <> -struct FmhaBwdTypeConfig +struct FmhaBwdTypeConfig { using QDataType = ck_tile::bf16_t; using KDataType = ck_tile::bf16_t; diff --git a/example/ck_tile/01_fmha/fmha_fwd.cpp b/example/ck_tile/01_fmha/fmha_fwd.cpp index 1f0d73d95..ebf2c93a3 100644 --- a/example/ck_tile/01_fmha/fmha_fwd.cpp +++ b/example/ck_tile/01_fmha/fmha_fwd.cpp @@ -142,7 +142,7 @@ auto create_args(int argc, char* argv[]) } // different threshold for different dtype -template +template auto get_elimit(std::string /*init_method*/) { double rtol = 1e-3; @@ -151,7 +151,7 @@ auto get_elimit(std::string /*init_method*/) } template <> -auto get_elimit(std::string /*init_method*/) +auto get_elimit(std::string /*init_method*/) { double rtol = 1e-2; double atol = 1e-2; @@ -159,7 +159,7 @@ auto get_elimit(std::string /*init_method*/) } template <> -auto get_elimit(std::string init_method) +auto get_elimit(std::string init_method) { if(init_method == "ui" || init_method == "ni") { @@ -261,7 +261,7 @@ int override_num_splits_if_necessary( return num_splits; } -template +template bool run(const ck_tile::ArgParser& arg_parser) { std::string data_type = arg_parser.get_str("prec"); @@ -305,8 +305,8 @@ bool run(const ck_tile::ArgParser& arg_parser) } ck_tile::index_t rotary_dim = arg_parser.get_int("rotary_dim"); - if constexpr(!(std::is_same_v || - std::is_same_v)) + if constexpr(!(std::is_same_v || + std::is_same_v)) { if(0 < rotary_dim) { @@ -428,25 +428,6 @@ bool 
run(const ck_tile::ArgParser& arg_parser) return atoi(squant_str.c_str()) != 0 ? true : false; }(); - float range_q = arg_parser.get_float("range_q"); - float range_k = arg_parser.get_float("range_k"); - float range_v = arg_parser.get_float("range_v"); - float range_p = arg_parser.get_float("range_p"); - float range_o = arg_parser.get_float("range_o"); - - float dtype_max = ck_tile::type_convert(ck_tile::numeric::max()); - - float scale_p = 1.f; - float scale_o = 1.f; - - if(squant) - { - scale_s = scale_s * (range_q / dtype_max) * (range_k / dtype_max); - scale_p = dtype_max / range_p; - // scale_p = [max(fp8_t)/range_o] * [range_p/max(fp8_t)] * [range_v/max(fp8_t)] - scale_o = range_p * range_v / range_o / dtype_max; - } - std::string vlayout = arg_parser.get_str("vlayout"); bool lse = arg_parser.get_bool("lse"); @@ -499,7 +480,7 @@ bool run(const ck_tile::ArgParser& arg_parser) const auto seqstart_k_host = to_seqstarts(seqlen_ks); const auto seqstart_k_with_padding_host = to_seqstarts(seqlen_kpads); - using TypeConfig = FmhaFwdTypeConfig; + using TypeConfig = FmhaFwdTypeConfig; using QDataType = typename TypeConfig::QDataType; using KDataType = typename TypeConfig::KDataType; @@ -513,6 +494,28 @@ bool run(const ck_tile::ArgParser& arg_parser) using OaccDataType = typename TypeConfig::OaccDataType; using ODataType = typename TypeConfig::ODataType; + float range_q = arg_parser.get_float("range_q"); + float range_k = arg_parser.get_float("range_k"); + float range_v = arg_parser.get_float("range_v"); + float range_p = arg_parser.get_float("range_p"); + float range_o = arg_parser.get_float("range_o"); + + float q_dtype_max = ck_tile::type_convert(ck_tile::numeric::max()); + float k_dtype_max = ck_tile::type_convert(ck_tile::numeric::max()); + float v_dtype_max = ck_tile::type_convert(ck_tile::numeric::max()); + float p_dtype_max = v_dtype_max; // assume p and v is the same type + float o_dtype_max = ck_tile::type_convert(ck_tile::numeric::max()); + + float scale_p = 
1.f; + float scale_o = 1.f; + + if(squant) + { + scale_s = scale_s * (range_q / q_dtype_max) * (range_k / k_dtype_max); + scale_p = p_dtype_max / range_p; + scale_o = (o_dtype_max / range_o) * (range_p / p_dtype_max) * (range_v / v_dtype_max); + } + // accumulation numbers for performance evaluation std::size_t flop = 0, num_byte = 0; auto max_seqlen_q = @@ -709,14 +712,14 @@ bool run(const ck_tile::ArgParser& arg_parser) else if(init_method == "ufq" || init_method == "uf:q" || init_method == "3") // suitable for fp8 quantization { - ck_tile::FillUniformDistribution{-dtype_max, dtype_max, seed}(q_host); - ck_tile::FillUniformDistribution{-dtype_max, dtype_max, seed}(k_host); - ck_tile::FillUniformDistribution{-dtype_max, dtype_max, seed}(knew_host); - ck_tile::FillUniformDistribution{-dtype_max, dtype_max, seed}(v_host); - ck_tile::FillUniformDistribution{-dtype_max, dtype_max, seed}(vnew_host); + ck_tile::FillUniformDistribution{-q_dtype_max, q_dtype_max, seed}(q_host); + ck_tile::FillUniformDistribution{-k_dtype_max, k_dtype_max, seed}(k_host); + ck_tile::FillUniformDistribution{-k_dtype_max, k_dtype_max, seed}(knew_host); + ck_tile::FillUniformDistribution{-v_dtype_max, v_dtype_max, seed}(v_host); + ck_tile::FillUniformDistribution{-v_dtype_max, v_dtype_max, seed}(vnew_host); // bias_fp8 = qscale_bias * bias_fp32 - float qscale_bias = (dtype_max / range_q) * (dtype_max / range_k); + float qscale_bias = (q_dtype_max / range_q) * (k_dtype_max / range_k); // Assume bias is in [-1.f, 1.f] in original fp32 ck_tile::FillUniformDistribution{-qscale_bias, qscale_bias, seed}(bias_host); } @@ -1129,14 +1132,14 @@ bool run(const ck_tile::ArgParser& arg_parser) randval_buf.FromDevice(randval_host.data()); auto p_compute_element_func = [&]() { - if constexpr(std::is_same_v) + if constexpr(std::is_same_v) return ck_tile::scales{scale_p}; else return ck_tile::identity{}; }(); auto oacc_element_func = [&]() { - if constexpr(std::is_same_v) + if constexpr(std::is_same_v) return 
ck_tile::composes(ck_tile::saturates{}, ck_tile::scales{scale_o}); else @@ -1186,7 +1189,7 @@ bool run(const ck_tile::ArgParser& arg_parser) { decltype(q_host_ref) q_host_ref_ro(q_host_ref.get_lengths()); - auto [rotary_cos_slice, rotary_sin_slice] = + auto [rotary_cos_slice, rotary_sin_slice] = slice_rotary_cos_sin(rotary_cos_host, rotary_sin_host, cache_seqlen_ks[wb], real_seqlen_q); ck_tile::reference_batched_rotary_position_embedding( @@ -1202,13 +1205,13 @@ bool run(const ck_tile::ArgParser& arg_parser) k_host_ref.ForEach([&](auto& self, auto i) { self(i) = k_host(block_table_host(wb, i[1] / page_block_size), i[0] / nr, i[1] % page_block_size, i[2]); }); - } else { + } else { k_host_ref.ForEach([&](auto& self, auto i) { self(i) = k_host(block_table_host(wb, i[1] / page_block_size), i[1] % page_block_size, i[0] / nr, i[2]); }); } } else -#endif +#endif { if(i_perm) k_host_ref.ForEach([&](auto& self, auto i) { self(i) = k_host(cache_b_idx, i[0] / nr, i[1] + key_offset, i[2]); }); else k_host_ref.ForEach([&](auto& self, auto i) { self(i) = k_host(cache_b_idx, i[1] + key_offset, i[0] / nr, i[2]); }); @@ -1229,7 +1232,7 @@ bool run(const ck_tile::ArgParser& arg_parser) { knew_host_ref_ro.emplace(knew_host_ref.get_lengths()); - auto [rotary_cos_slice, rotary_sin_slice] = + auto [rotary_cos_slice, rotary_sin_slice] = slice_rotary_cos_sin(rotary_cos_host, rotary_sin_host, cache_seqlen_ks[wb], seqlen_knew); ck_tile::reference_batched_rotary_position_embedding( @@ -1251,19 +1254,19 @@ bool run(const ck_tile::ArgParser& arg_parser) if(0 < page_block_size) { if(is_v_rowmajor) { if(i_perm) { - v_host_ref.ForEach([&](auto& self, auto i) { - self(i) = v_host(block_table_host(wb, i[2] / page_block_size), i[0] / nr, i[2] % page_block_size, i[1]); + v_host_ref.ForEach([&](auto& self, auto i) { + self(i) = v_host(block_table_host(wb, i[2] / page_block_size), i[0] / nr, i[2] % page_block_size, i[1]); }); } else { - v_host_ref.ForEach([&](auto& self, auto i) { + 
v_host_ref.ForEach([&](auto& self, auto i) { self(i) = v_host(block_table_host(wb, i[2] / page_block_size), i[2] % page_block_size, i[0] / nr, i[1]); }); } } - else + else { - if(i_perm) { - v_host_ref.ForEach([&](auto& self, auto i) { + if(i_perm) { + v_host_ref.ForEach([&](auto& self, auto i) { self(i) = v_host(block_table_host(wb, i[2] / page_block_size), i[0] / nr, i[1], i[2] % page_block_size); }); } else { @@ -1458,7 +1461,7 @@ bool run(const ck_tile::ArgParser& arg_parser) else o_host_result.ForEach([&](auto& self, auto idx) { self(idx) = o_host(b_idx, idx[1] + query_offset, idx[0], idx[2]); }); // clang-format on - auto [rtol, atol] = get_elimit(init_method); + auto [rtol, atol] = get_elimit(init_method); bool cur_pass = ck_tile::check_err( o_host_result, o_host_ref, std::string("OUT Error: Incorrect results!"), rtol, atol); pass &= cur_pass; @@ -1515,15 +1518,15 @@ int main(int argc, char* argv[]) const std::string data_type = arg_parser.get_str("prec"); if(data_type == "fp16") { - return run(arg_parser) ? 0 : -2; + return run(arg_parser) ? 0 : -2; } else if(data_type == "bf16") { - return run(arg_parser) ? 0 : -2; + return run(arg_parser) ? 0 : -2; } else if(data_type == "fp8") { - return run(arg_parser) ? 0 : -2; + return run(arg_parser) ? 
0 : -2; } return -3; diff --git a/example/ck_tile/01_fmha/fmha_fwd.hpp b/example/ck_tile/01_fmha/fmha_fwd.hpp index 8a821b917..aee54b475 100644 --- a/example/ck_tile/01_fmha/fmha_fwd.hpp +++ b/example/ck_tile/01_fmha/fmha_fwd.hpp @@ -16,11 +16,35 @@ #include #include +struct FmhaFwdFp16 +{ +}; + +struct FmhaFwdBf16 +{ +}; + +struct FmhaFwdFp8 +{ +}; + +struct FmhaFwdBf8 +{ +}; + +struct FmhaFwdFp8Fp16 +{ +}; + +struct FmhaFwdFp8Bf16 +{ +}; + template struct FmhaFwdTypeConfig; template <> -struct FmhaFwdTypeConfig +struct FmhaFwdTypeConfig { using QDataType = ck_tile::half_t; using KDataType = ck_tile::half_t; @@ -36,7 +60,7 @@ struct FmhaFwdTypeConfig }; template <> -struct FmhaFwdTypeConfig +struct FmhaFwdTypeConfig { using QDataType = ck_tile::bf16_t; using KDataType = ck_tile::bf16_t; @@ -52,7 +76,7 @@ struct FmhaFwdTypeConfig }; template <> -struct FmhaFwdTypeConfig +struct FmhaFwdTypeConfig { using QDataType = ck_tile::fp8_t; using KDataType = ck_tile::fp8_t; @@ -68,7 +92,7 @@ struct FmhaFwdTypeConfig }; template <> -struct FmhaFwdTypeConfig +struct FmhaFwdTypeConfig { using QDataType = ck_tile::bf8_t; using KDataType = ck_tile::bf8_t; -- GitLab From 67497a044d450fbc0bcb099cfb0aa270cfb0aa6b Mon Sep 17 00:00:00 2001 From: Jatin Chaudhary <51944368+cjatin@users.noreply.github.com> Date: Tue, 10 Dec 2024 16:47:36 +0000 Subject: [PATCH 112/153] Make sure we call __hneg with half to remove ambigios error (#1736) --- include/ck/utility/math_v2.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/ck/utility/math_v2.hpp b/include/ck/utility/math_v2.hpp index a6c3540d8..eaa1c6813 100644 --- a/include/ck/utility/math_v2.hpp +++ b/include/ck/utility/math_v2.hpp @@ -611,7 +611,7 @@ inline __device__ int8_t neg(int8_t x) template <> inline __device__ half_t neg(half_t x) { - return __hneg(x); + return __hneg(static_cast<__half>(x)); }; template -- GitLab From 90d8410d562220ba65e7e75f10e7b3996409200f Mon Sep 17 00:00:00 2001 From: Illia Silin 
<98187287+illsilin@users.noreply.github.com> Date: Tue, 10 Dec 2024 08:48:51 -0800 Subject: [PATCH 113/153] Upgrade to Ubuntu22.04 as default OS. (#1738) * upgrade to ubuntu 22.04 * try adding -u roof docker options for ubuntu 22 --- Dockerfile | 5 +++-- Dockerfile.compiler | 2 +- Jenkinsfile | 8 ++++---- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/Dockerfile b/Dockerfile index 6689ae08f..8ce158a20 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM ubuntu:20.04 +FROM ubuntu:22.04 ARG DEBIAN_FRONTEND=noninteractive ARG ROCMVERSION=6.3 ARG compiler_version="" @@ -48,6 +48,7 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow- libnuma-dev \ libpthread-stubs0-dev \ llvm-amdgpu \ + mpich \ net-tools \ pkg-config \ python \ @@ -70,7 +71,7 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow- rm -rf /var/lib/apt/lists/* && \ rm -rf amdgpu-install* && \ # Remove unnecessary rocm components that take a lot of space - apt-get remove -y rocblas rocfft rocsparse composablekernel-dev + apt-get remove -y rocblas rocfft rocsparse composablekernel-dev hipblaslt # Update the cmake to version 3.27.5 RUN pip install --upgrade cmake==3.27.5 && \ diff --git a/Dockerfile.compiler b/Dockerfile.compiler index 3f3329092..a22103b96 100644 --- a/Dockerfile.compiler +++ b/Dockerfile.compiler @@ -1,4 +1,4 @@ -ARG BASE_DOCKER="rocm/composable_kernel:ck_ub20.04_rocm6.3" +ARG BASE_DOCKER="rocm/composable_kernel:ck_ub22.04_rocm6.3" FROM $BASE_DOCKER ARG compiler_version="" ARG compiler_commit="" diff --git a/Jenkinsfile b/Jenkinsfile index f118d4e45..f82c34afa 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -40,10 +40,10 @@ def getBaseDockerImageName(){ else{ def ROCM_numeric = "${params.ROCMVERSION}" as float if ( ROCM_numeric < 6.4 ){ - img = "${env.CK_DOCKERHUB}:ck_ub20.04_rocm${params.ROCMVERSION}" + img = "${env.CK_DOCKERHUB}:ck_ub22.04_rocm${params.ROCMVERSION}" } else{ - img = 
"${env.CK_DOCKERHUB_PRIVATE}:ck_ub20.04_rocm${params.ROCMVERSION}" + img = "${env.CK_DOCKERHUB_PRIVATE}:ck_ub22.04_rocm${params.ROCMVERSION}" } } return img @@ -357,7 +357,7 @@ def buildHipClangJob(Map conf=[:]){ def prefixpath = conf.get("prefixpath", "/opt/rocm") // Jenkins is complaining about the render group - def dockerOpts="--device=/dev/kfd --device=/dev/dri --group-add video --group-add render --cap-add=SYS_PTRACE --security-opt seccomp=unconfined" + def dockerOpts="-u root --device=/dev/kfd --device=/dev/dri --group-add video --group-add render --cap-add=SYS_PTRACE --security-opt seccomp=unconfined" if (conf.get("enforce_xnack_on", false)) { dockerOpts = dockerOpts + " --env HSA_XNACK=1 " } @@ -426,7 +426,7 @@ def Build_CK(Map conf=[:]){ def prefixpath = conf.get("prefixpath", "/opt/rocm") // Jenkins is complaining about the render group - def dockerOpts="--device=/dev/kfd --device=/dev/dri --group-add video --group-add render --cap-add=SYS_PTRACE --security-opt seccomp=unconfined" + def dockerOpts="-u root --device=/dev/kfd --device=/dev/dri --group-add video --group-add render --cap-add=SYS_PTRACE --security-opt seccomp=unconfined" if (conf.get("enforce_xnack_on", false)) { dockerOpts = dockerOpts + " --env HSA_XNACK=1 " } -- GitLab From 357a0b1c57d2f6b4eb9607d26047ba2e0b679f72 Mon Sep 17 00:00:00 2001 From: Illia Silin <98187287+illsilin@users.noreply.github.com> Date: Tue, 10 Dec 2024 15:16:03 -0800 Subject: [PATCH 114/153] add missing stdexcept header (#1740) --- codegen/test/rtc/include/rtc/hip.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/codegen/test/rtc/include/rtc/hip.hpp b/codegen/test/rtc/include/rtc/hip.hpp index 6b523382d..e962d4cd3 100644 --- a/codegen/test/rtc/include/rtc/hip.hpp +++ b/codegen/test/rtc/include/rtc/hip.hpp @@ -4,6 +4,7 @@ #include #include #include +#include namespace rtc { -- GitLab From 77a38e0211f587775c233fc0afd4de819d51500c Mon Sep 17 00:00:00 2001 From: carlushuang Date: Thu, 12 Dec 2024 03:54:03 +0000 
Subject: [PATCH 115/153] [CK_TILE] naive attn (#1708) * add reference attention fwd * refactor addresser * update * paged, and i8 reflect-quant * lets call it forward-quant * fix error in decode variation * update naive-attn * fix page table * fix build err --- example/ck_tile/01_fmha/fmha_fwd.cpp | 57 +- include/ck_tile/README.md | 3 + include/ck_tile/core.hpp | 1 + include/ck_tile/ops/gemm.hpp | 2 +- include/ck_tile/ref/README.md | 5 + include/ck_tile/ref/naive_attention.hpp | 666 ++++++++++++++++++++++++ include/ck_tile/remod.py | 4 + 7 files changed, 734 insertions(+), 4 deletions(-) create mode 100644 include/ck_tile/ref/README.md create mode 100644 include/ck_tile/ref/naive_attention.hpp diff --git a/example/ck_tile/01_fmha/fmha_fwd.cpp b/example/ck_tile/01_fmha/fmha_fwd.cpp index ebf2c93a3..08d263da9 100644 --- a/example/ck_tile/01_fmha/fmha_fwd.cpp +++ b/example/ck_tile/01_fmha/fmha_fwd.cpp @@ -3,6 +3,7 @@ #include "fmha_fwd.hpp" #include "ck_tile/host.hpp" +#include "ck_tile/ref/naive_attention.hpp" #include "mask.hpp" #include "rotary.hpp" #include "utils.hpp" @@ -41,7 +42,7 @@ std::ostream& operator<<(std::ostream& os, const std::vector& v) auto create_args(int argc, char* argv[]) { ck_tile::ArgParser arg_parser; - arg_parser.insert("v", "1", "weather do CPU validation or not") + arg_parser.insert("v", "1", "0:no validation, 1:cpu validation, 2:gpu validation(experimental)") .insert("mode", "0", "kernel mode. 
0:batch, 1:group") .insert("b", "2", "batch size") .insert("h", "8", "num of head, for q") @@ -447,7 +448,7 @@ bool run(const ck_tile::ArgParser& arg_parser) } bool s_randval = false; - if(p_drop > 0.0f && do_validation) + if(p_drop > 0.0f && do_validation != 0) { s_randval = true; } @@ -1121,11 +1122,61 @@ bool run(const ck_tile::ArgParser& arg_parser) << std::setprecision(2) << tflops << " TFlops, " << std::setprecision(2) << gb_per_sec << " GB/s" << std::flush; - if(!do_validation) + if(do_validation == 0) { std::cout << std::flush << std::endl; return true; } + if(do_validation == 2) + { + // NOTE: use gpu to do validation + ck_tile::naive_attention_fwd_traits naive_t; + naive_t.q_type = data_type; + naive_t.k_type = data_type; + naive_t.v_type = data_type; + naive_t.o_type = data_type; + naive_t.q_layout = i_perm == 1 ? "bhsd" : "bshd"; + naive_t.k_layout = i_perm == 1 ? "bhsd" : "bshd"; + naive_t.v_layout = i_perm == 1 ? "bhsd" : "bshd"; + naive_t.o_layout = o_perm == 1 ? "bhsd" : "bshd"; + naive_t.variation = 0; // TODO? 
+ + ck_tile::DeviceMem o_naive_buf(o_host.get_element_space_size_in_bytes()); + + ck_tile::naive_attention_fwd_args naive_a{}; // zero-init fields not set below + naive_a.q_ptr = q_buf.GetDeviceBuffer(); + naive_a.k_ptr = k_buf.GetDeviceBuffer(); + naive_a.v_ptr = v_buf.GetDeviceBuffer(); + naive_a.o_ptr = o_naive_buf.GetDeviceBuffer(); + naive_a.scale_s = scale_s; + naive_a.context_len_ptr = nullptr; // used when seqlen kv come from a pointer + naive_a.page_table_ptr = + nullptr; // [batch, num_blocks] seqlen_kv is in different block(paged attn) + naive_a.hdim = hdim_q; + naive_a.hdim_v = hdim_v; // could be cross-attn, where V and Q/K hdim are different + naive_a.batch_q = batch; + naive_a.batch_kv = batch; + naive_a.batch_ratio_kv = 1; // batch_q / batch_kv + naive_a.seqlen_q = seqlen_qs[0]; + naive_a.seqlen_kv = seqlen_ks[0]; // if context_len_ptr is not nullptr, ignore this field + naive_a.nhead_q = nhead; + naive_a.nhead_kv = nhead_k; + naive_a.nhead_ratio_kv = naive_a.nhead_q / naive_a.nhead_kv; // nhead_q / nhead_kv + naive_a.page_size = 0; // if paged, the seqlen-kv for each block + + ck_tile::stream_config naive_s{}; + + naive_attention_fwd(naive_t, naive_a, naive_s); + + auto o_naive_ref = o_naive_buf.ToHost(); + o_buf.FromDevice(o_host.data()); // TODO: ugly + + auto [rtol_, atol_] = get_elimit(init_method); + bool pass_ = ck_tile::check_err( + o_host, o_naive_ref, std::string("OUT Error: Incorrect results!"), rtol_, atol_); + std::cout << ", valid:" << (pass_ ? "y" : "n") << std::flush << std::endl; + return pass_; + } o_buf.FromDevice(o_host.data()); lse_buf.FromDevice(lse_host.data()); diff --git a/include/ck_tile/README.md b/include/ck_tile/README.md index 9f88af1ca..9d5e92391 100644 --- a/include/ck_tile/README.md +++ b/include/ck_tile/README.md @@ -45,5 +45,8 @@ our implementation of different device operators. **[ops/epilogue]** epilogue part of our kernel. We may extend this epilogue part to let users to build their own cutomized epilogues. 
+**[ref]** +reference implementation of cpu or gpu. This folder is supposed to include a specific header on demand. + ## examples currently we put all ck_tile related example under [/example/ck_tile](/example/ck_tile/) folder. Please check each example's subfolder. diff --git a/include/ck_tile/core.hpp b/include/ck_tile/core.hpp index 3cf0c2595..41f3383c7 100644 --- a/include/ck_tile/core.hpp +++ b/include/ck_tile/core.hpp @@ -54,6 +54,7 @@ #include "ck_tile/core/tensor/tile_window_linear.hpp" #include "ck_tile/core/tensor/tile_window_utils.hpp" #include "ck_tile/core/tensor/update_tile.hpp" +#include "ck_tile/core/utility/amd_address_space.hpp" #include "ck_tile/core/utility/bit_cast.hpp" #include "ck_tile/core/utility/functional.hpp" #include "ck_tile/core/utility/functional_with_tuple.hpp" diff --git a/include/ck_tile/ops/gemm.hpp b/include/ck_tile/ops/gemm.hpp index 82d35b9c5..2d38ef592 100644 --- a/include/ck_tile/ops/gemm.hpp +++ b/include/ck_tile/ops/gemm.hpp @@ -23,10 +23,10 @@ #include "ck_tile/ops/gemm/block/block_gemm_asmem_bsmem_creg_v1_default_policy.hpp" #include "ck_tile/ops/gemm/block/block_gemm_problem.hpp" #include "ck_tile/ops/gemm/block/block_universal_gemm_as_bs_cr.hpp" +#include "ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp" #include "ck_tile/ops/gemm/kernel/gemm_kernel.hpp" #include "ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp" #include "ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp" -#include "ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp" #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_base.hpp" #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v3.hpp" #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp" diff --git a/include/ck_tile/ref/README.md b/include/ck_tile/ref/README.md new file mode 100644 index 000000000..6efee782f --- /dev/null +++ b/include/ck_tile/ref/README.md @@ -0,0 +1,5 @@ +# reference + +this folder contains reference implementation of a specific op. 
Note that by including a specific header, you are including the implementation (especially the gpu implementation) into your source code and compiling that kernel into the fatbin, which may increase your kernel object code size. Usually a header whose name starts with `reference_` is a cpu reference implementation, while a header whose name starts with `naive_` contains a gpu implementation with a small launcher. + +TODO: move `host/reference` under this folder diff --git a/include/ck_tile/ref/naive_attention.hpp b/include/ck_tile/ref/naive_attention.hpp new file mode 100644 index 000000000..09ded761e --- /dev/null +++ b/include/ck_tile/ref/naive_attention.hpp @@ -0,0 +1,666 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck_tile/core.hpp" +#include "ck_tile/host/host_tensor.hpp" +#include "ck_tile/host/kernel_launch.hpp" +#include +#include + +namespace ck_tile { + +enum class naive_attention_layout_enum +{ + BSHD, // [batch, seqlen, nhead, hdim] + BHSD, // [batch, nhead, seqlen, hdim] + BS3HD, // [batch, nhead, 3, seqlen, hdim], used when qkv are packed + PHSD, // [pages, nhead, page_size, hdim] + // PHSDX, // [pages, nhead, page_size/x, hdim, x], where <# used pages>*page_size = seqlen + PHDSX, // [pages, nhead, hdim/x, page_size, x], where <# used pages>*page_size = seqlen + PHDS, // [pages, nhead, hdim, page_size], where <# used pages>*page_size = seqlen +}; + +// will used to specialize kernel variation +enum class naive_attention_variation_enum +{ + FLASH_BATCHED = 0, // standard flash attention, or xformer/sdpa, used for training + FLASH_GROUPED, + DECODE_PAGED, // decode attn, where kv token from another buffer called kvcache +}; + +// TODO: for simplicity, this will be used as host/device arg +struct naive_attention_fwd_args +{ + void* q_ptr; + void* k_ptr; + void* v_ptr; + void* o_ptr; + void* context_len_ptr; // [batch] used when seqlen kv come from a pointer(each element is a + // number, 
not cumsum) + void* page_table_ptr; // [batch, max_pages_per_seq] seqlen_kv is in different block(paged attn) + void* kvscale_ptr; // [nhead, 2(kv), hdim] used for kvcache dequant + float scale_s; + int hdim; + int hdim_v; // could be cross-attn, where V and Q/K hdim are different + int batch_q; + int batch_kv; + int batch_ratio_kv; // batch_q / batch_kv + int seqlen_q; // in decode case, this should be 1 + int seqlen_kv; // if context_len_ptr is not nullptr, ignore this field + int nhead_q; + int nhead_kv; + int nhead_ratio_kv; // nhead_q / nhead_kv + int page_size; // if paged, the seqlen-kv per each block + int max_pages_per_seq; +}; + +// this is trait for host API +struct naive_attention_fwd_traits +{ + std::string q_type; + std::string k_type; + std::string v_type; + std::string o_type; + std::string q_layout; + std::string k_layout; + std::string v_layout; + std::string o_layout; + int variation; // sync with naive_attention_variation_enum +}; + +// this is trait for kernel template +template +struct naive_attention_fwd_kernel_traits +{ + static constexpr naive_attention_variation_enum variation = variation_; +}; + +// for simplicity, please do not use const-reference type for the template type +template +struct naive_attention_fwd_kernel +{ + static constexpr bool is_kvcache_i8 = + std::is_same_v && std::is_same_v && sizeof(QType) != 1; + + // kvcache-i8 will have per head scale, we apply this scale to Q/P matrix instead of original + // K/V matrix. 
This can speed up conversion since Q/P usually is fp16/bf16/fp32 + static constexpr bool is_kvcache_i8_forward_quant = is_kvcache_i8; + + // TODO: hardcode + using KVScaleType = float; + using SoftmaxType = float; + using PType = VType; // src A of gemm2, same type as V + + using p_vec_type = ext_vector_t; + static constexpr int p_vec_elem = vector_traits::vector_size; + + __host__ __device__ naive_attention_fwd_kernel() {} + + template + struct addresser + { + int b, s, h, d; // batch, seqlen, nhead, hdim + T* base_ptr; + __device__ addresser(int b_, int s_, int h_, int d_, void* base_ptr_) + : b(b_), s(s_), h(h_), d(d_), base_ptr(reinterpret_cast(base_ptr_)) + { + } + + // TODO: all the batch/nhead offset will accumulate to the base pointer + __device__ T* get_base(int i_b, int i_h) + { + if constexpr(Layout == naive_attention_layout_enum::BSHD) + return base_ptr + i_b * s * h * d + i_h * d; + else if constexpr(Layout == naive_attention_layout_enum::BHSD) + return base_ptr + i_b * s * h * d + i_h * s * d; + } + + __device__ int get_offset(int i_s, int i_d) + { + if constexpr(Layout == naive_attention_layout_enum::BSHD) + return i_s * h * d + i_d; + else if constexpr(Layout == naive_attention_layout_enum::BHSD) + return i_s * d + i_d; + } + + // below set of API will directly use pointer inside this struct + __device__ void init(int i_b, int i_h) { base_ptr = get_base(i_b, i_h); } + __device__ T load(int i_s, int i_d) { return base_ptr[get_offset(i_s, i_d)]; } + __device__ void store(T value, int i_s, int i_d) { base_ptr[get_offset(i_s, i_d)] = value; } + }; + + template + struct page_addresser + { + int s, h, d; // page_size, nhead, hdim + static constexpr int x = 16 / sizeof(T); // pack 4 dword + T* base_ptr; + int* page_table_ptr; // TODO: page table always int + int i_h; // store current head + + __device__ page_addresser(int s_, int h_, int d_, void* base_ptr_, void* pptr_) + : s(s_), + h(h_), + d(d_), + base_ptr(reinterpret_cast(base_ptr_)), + 
page_table_ptr(reinterpret_cast(pptr_)) + { + } + + __device__ int64_t get_phy_page_idx(int i_s) + { + // dynamic compute page idx is simple but slow + int page_idx = i_s / s; + int phy = page_table_ptr[page_idx]; + return static_cast(phy); + } + + __device__ int get_phy_page_offset(int i_s) + { + // dynamic compute page idx is simple but slow + return i_s % s; + } + + __device__ int64_t get_offset(int i_s, int i_d) + { + int page_offset = get_phy_page_offset(i_s); + int64_t page_idx = get_phy_page_idx(i_s); + int64_t base_ = page_idx * h * s * d; + if constexpr(Layout == naive_attention_layout_enum::PHSD) + return static_cast(i_h * s * d + page_offset * d + i_d) + base_; + else if constexpr(Layout == naive_attention_layout_enum::PHDSX) + { + int d_r = i_d / x; + int d_x = i_d % x; + return static_cast(i_h * d * s + d_r * s * x + page_offset * x + d_x) + + base_; + } + else if constexpr(Layout == naive_attention_layout_enum::PHDS) + { + return static_cast(i_h * d * s + i_d * s + page_offset) + base_; + } + } + + // below set of API will directly use pointer inside this struct + __device__ void init(int /*i_b*/, int i_h_) { i_h = i_h_; } + __device__ T load(int i_s, int i_d) { return base_ptr[get_offset(i_s, i_d)]; } + __device__ void store(T /*value*/, int /*i_s*/, int /*i_d*/) {} + }; + + template + struct kvscale_addresser + { + int h, d; // nhead, hdim + T* base_ptr; + __device__ kvscale_addresser(int h_, int d_, void* p_) + : h(h_), d(d_), base_ptr(reinterpret_cast(p_)) + { + } + __device__ int get_offset(int i_h, int i_d, int i_kv /*0 or 1*/) + { + // [h, 2, d] + return i_h * 2 * d + i_kv * d + i_d; + } + __device__ T load(int i_h, int i_d, int i_kv) + { + return base_ptr[get_offset(i_h, i_d, i_kv)]; + } + }; + + __device__ __host__ static constexpr int get_block_size() { return 256; } + + // for simpliciy, 1 WG always compute 1 token along q, compute all token along kv + // compute all hdim from q, compute WG_SIZE hdim from v + // 1) in prefill case, seqlen_q 
>= 1, seqlen_kv >= 1, batch_q=batch_kv + // 2) in decode case, seqlen_q = 1, batch_q is input num-tokens, batch_kv is 1 + // 3) in paged-attn case, we still use 1 WG compute all the seqlen-kv for simplicity + // TODO: could support split-kv to validate intermediate logsum + __host__ static dim3 get_grid_size(naive_attention_fwd_args args) + { + constexpr int wg_size = get_block_size(); + auto g = + dim3((args.hdim_v + wg_size - 1) / wg_size, args.seqlen_q, args.batch_q * args.nhead_q); + return g; + } + + // reduce single pixel within a wave + template + __device__ constexpr T wave_reduce(T local, F reduce_f) + { + // constexpr int wave_size = 64; + constexpr int reduce_stage = 6; // 1<<6=64 + T v_local = local; +#pragma unroll + for(int i_stage = 0; i_stage < reduce_stage; i_stage++) + { + int src_lane = __lane_id() ^ (1 << i_stage); + int32_t v_remote_tmp = + __builtin_amdgcn_ds_bpermute(src_lane << 2, bit_cast(v_local)); + T v_remote = bit_cast(v_remote_tmp); + v_local = reduce_f(v_local, v_remote); + } + return v_local; + } + + // Note: this function must be called after wave_reduce + // Note: better not use this under if...else... with thread divergence (syncthreads) + template + __device__ constexpr T cross_wave_reduce(T local, F reduce_f, T* smem) + { + constexpr int waves = 4; + constexpr int wave_size = 64; + int lane_id = threadIdx.x % wave_size; + + __syncthreads(); + smem[threadIdx.x] = local; + __syncthreads(); + + // the data within single wave is the same + // but for simplicity, we still use data from each lane. 
+ T v_local = smem[lane_id]; +#pragma unroll + for(int i_stage = 1; i_stage < waves; i_stage++) + { + T v_remote = smem[i_stage * wave_size + lane_id]; + v_local = reduce_f(v_local, v_remote); + } + return v_local; + } + + // kernel entry point + __device__ void operator()(naive_attention_fwd_args args) + { + constexpr int wg_size = get_block_size(); + __shared__ char smem[wg_size * 4 * sizeof(float)]; // should enough + int i_dv = blockIdx.x * wg_size + threadIdx.x; // index of hdim_v + int i_sq = blockIdx.y; // index of seqlen_q + int i_batch = blockIdx.z; // index of batch_q * nhead_q + int i_bq = i_batch / args.nhead_q; // index of batch_q + int i_hq = i_batch % args.nhead_q; // index of nhead_q + + int i_bk = i_bq / args.batch_ratio_kv; + int i_hk = i_hq / args.nhead_ratio_kv; + + void* page_table_ptr = [&]() { + if constexpr(Traits::variation == naive_attention_variation_enum::DECODE_PAGED) + { + return reinterpret_cast(args.page_table_ptr) + i_bq * args.max_pages_per_seq; + } + else + { + return nullptr; + } + }(); + + auto q_addr = [&]() { + if constexpr(Traits::variation == naive_attention_variation_enum::FLASH_BATCHED) + { + return addresser{ + args.batch_q, args.seqlen_q, args.nhead_q, args.hdim, args.q_ptr}; + } + else if constexpr(Traits::variation == naive_attention_variation_enum::DECODE_PAGED) + { + return addresser{ + args.batch_q, args.seqlen_q, args.nhead_q, args.hdim, args.q_ptr}; + } + }(); + auto k_addr = [&]() { + if constexpr(Traits::variation == naive_attention_variation_enum::FLASH_BATCHED) + { + return addresser{ + args.batch_kv, args.seqlen_kv, args.nhead_kv, args.hdim, args.k_ptr}; + } + else if constexpr(Traits::variation == naive_attention_variation_enum::DECODE_PAGED) + { + return page_addresser{ + args.page_size, args.nhead_kv, args.hdim, args.k_ptr, page_table_ptr}; + } + }(); + auto v_addr = [&]() { + if constexpr(Traits::variation == naive_attention_variation_enum::FLASH_BATCHED) + { + return addresser{ + args.batch_kv, 
args.seqlen_kv, args.nhead_kv, args.hdim_v, args.v_ptr}; + } + else if constexpr(Traits::variation == naive_attention_variation_enum::DECODE_PAGED) + { + return page_addresser{ + args.page_size, args.nhead_kv, args.hdim_v, args.v_ptr, page_table_ptr}; + } + }(); + auto o_addr = [&]() { + if constexpr(Traits::variation == naive_attention_variation_enum::FLASH_BATCHED) + { + return addresser{ + args.batch_q, args.seqlen_q, args.nhead_q, args.hdim_v, args.o_ptr}; + } + else if constexpr(Traits::variation == naive_attention_variation_enum::DECODE_PAGED) + { + return addresser{ + args.batch_q, args.seqlen_q, args.nhead_q, args.hdim_v, args.o_ptr}; + } + }(); + + q_addr.init(i_bq, i_hq); + k_addr.init(i_bk, i_hk); + v_addr.init(i_bk, i_hk); + o_addr.init(i_bq, i_hq); + + auto f_max = [](auto x_, auto y_) { return max(x_, y_); }; + auto f_sum = [](auto x_, auto y_) { return x_ + y_; }; + auto f_absmax_f32 = [](float v_0_, float v_1_) { + float rtn; + asm volatile("v_max_f32 %0, abs(%1), abs(%2)" : "=v"(rtn) : "v"(v_0_), "v"(v_1_)); + return rtn; + }; + + int seqlen_kv = [&]() { + if constexpr(Traits::variation == naive_attention_variation_enum::FLASH_BATCHED) + { + return args.seqlen_kv; + } + else if constexpr(Traits::variation == naive_attention_variation_enum::DECODE_PAGED) + { + return reinterpret_cast(args.context_len_ptr)[i_bq]; + } + }(); + + SoftmaxType row_max = -numeric::infinity(); + SoftmaxType l{0}; + AccType o_acc = {0}; + + int sk_loops = (seqlen_kv + wg_size - 1) / wg_size; + float qf_scale = .0f; + kvscale_addresser kvscale_addr{args.nhead_kv, args.hdim, args.kvscale_ptr}; + + if constexpr(is_kvcache_i8_forward_quant) + { + // AccType is i32 now, seqlen_q = 1, hdim up to 256 + float q = 0; + float k_s = 0; + if(static_cast(threadIdx.x) < args.hdim) + { + q = type_convert(q_addr.load(0, threadIdx.x)); + k_s = type_convert(kvscale_addr.load(i_hk, threadIdx.x, 0)); + } + // 1) we apply the k scale to q + float q_forwarded = q * k_s; + + // 2) apply 
smooth-quant + // find absmax + float qf_max = wave_reduce(q_forwarded, f_absmax_f32); + qf_max = cross_wave_reduce(qf_max, f_absmax_f32, reinterpret_cast(smem)); + + // per-token scale + qf_scale = qf_max / 127.0; + + // devide by scale + q = q / qf_scale; + + // fp32->i8 + int8_t quantized_q = static_cast(q); + __syncthreads(); + reinterpret_cast(smem)[threadIdx.x] = quantized_q; + __syncthreads(); + + // after above process, we have 2 data + // 1) int8 q data stored in smem(no need to reload) + // 2) per-token scale qf_scale, to be mul after 1st gemm + } + + for(int i_loop1 = 0; i_loop1 < sk_loops; i_loop1++) + { + int i_sk = i_loop1 * wg_size + threadIdx.x; + // gemm-1 + SoftmaxType s_softmax = -numeric::infinity(); + if(i_sk < seqlen_kv) + { + AccType s_acc{0}; // clear for every loop + for(auto i_dq = 0; i_dq < args.hdim; i_dq++) + { + if constexpr(is_kvcache_i8_forward_quant) + { + int8_t q = reinterpret_cast(smem)[i_dq]; + auto k = k_addr.load(i_sk, i_dq); + + s_acc += type_convert(q) * type_convert(k); + } + else + { + auto q = q_addr.load(i_sq, i_dq); // q will have duplicate load + auto k = k_addr.load(i_sk, i_dq); + + s_acc += type_convert(q) * type_convert(k); + } + } + // scale + s_softmax = type_convert(s_acc); + s_softmax *= + type_convert(args.scale_s * ck_tile::log2e_v); + if constexpr(is_kvcache_i8_forward_quant) + { + s_softmax *= qf_scale; // post scale the per-token factor + } + } + + // s->p + float pf_scale = 0.; // used for i8 quant + { + // softmax, find max + SoftmaxType old_max = row_max; + SoftmaxType cur_max = wave_reduce(s_softmax, f_max); + + cur_max = cross_wave_reduce(cur_max, f_max, reinterpret_cast(smem)); + row_max = max(old_max, cur_max); // update row_max + // softmax, exp(i_elem - max) + SoftmaxType p_compute = __builtin_amdgcn_exp2f(s_softmax - row_max); + + // compute exp_sum + SoftmaxType row_sum = wave_reduce(p_compute, f_sum); + row_sum = cross_wave_reduce(row_sum, f_sum, reinterpret_cast(smem)); + + // l, pre-scall 
o_acc + SoftmaxType tmp = __builtin_amdgcn_exp2f(old_max - row_max); + l = tmp * l + row_sum; + o_acc = type_convert(type_convert(o_acc) * tmp); + + // prepare the p_compute into smem, to let every thread read same p_compute and do + // 2nd gemm + if constexpr(is_kvcache_i8_forward_quant) + { + float v_s = 0; + if(static_cast(threadIdx.x) < args.hdim_v) + { + v_s = type_convert(kvscale_addr.load(i_hk, threadIdx.x, 1)); + } + + // 1) we apply the v scale to p + float p_forwarded = p_compute * v_s; + + // 2) apply smooth-quant + // find absmax + float pf_max = wave_reduce(p_forwarded, f_absmax_f32); + pf_max = + cross_wave_reduce(pf_max, f_absmax_f32, reinterpret_cast(smem)); + + // per-token scale + pf_scale = pf_max / 127.0; + + // devide by scale + p_compute = p_compute / pf_scale; + + // fp32->i8 + int8_t quantized_p = static_cast(p_compute); + __syncthreads(); + reinterpret_cast(smem)[threadIdx.x] = quantized_p; + __syncthreads(); + // after above process, we have 2 data + // 1) int8 p data stored in smem(no need to reload) + // 2) per-token scale pf_scale, to be mul after 2nd gemm + } + else + { + __syncthreads(); + reinterpret_cast(smem)[threadIdx.x] = type_convert(p_compute); + __syncthreads(); + } + } + + // gemm-2, simple loop over vector by vector + constexpr int gemm_2_loop = wg_size / p_vec_elem; + { + AccType o_acc_local = {0}; + int sk_start = i_loop1 * wg_size; // we start from the first seqlen_kv element + for(int i_loop2 = 0; i_loop2 < gemm_2_loop; i_loop2++) + { + p_vec_type p_vec = reinterpret_cast(smem)[i_loop2]; +#pragma unroll + for(int i_j = 0; i_j < p_vec_elem; i_j++) + { + int sv_offset = i_loop2 * p_vec_elem + i_j; + int i_sv = sk_start + sv_offset; + + VType v = 0.f; + if(i_dv < args.hdim_v && i_sv < seqlen_kv) + { + v = v_addr.load(i_sv, i_dv); + } + + o_acc_local += type_convert(p_vec[i_j]) * type_convert(v); + } + } + if constexpr(is_kvcache_i8_forward_quant) + { + // apply pr scale to local acc + o_acc_local = + 
type_convert(type_convert(o_acc_local) * pf_scale); + } + o_acc += o_acc_local; + } + } + + // post scale o_acc + { + SoftmaxType tmp = l == 0.f ? 0.f : 1.f / l; // in case masking + o_acc = type_convert(type_convert(o_acc) * tmp); + } + + // store O + if(i_dv < args.hdim_v) + o_addr.store(type_convert(o_acc), i_sq, i_dv); + } +}; + +#define CK_TILE_DISPATCH_NAIVE_ATTEN_FWD_INTERNAL_() \ + { \ + using ktraits_ = \ + naive_attention_fwd_kernel_traits( \ + variation_)>; \ + using k_ = naive_attention_fwd_kernel; \ + dim3 grids = k_::get_grid_size(a); \ + r = ck_tile::launch_kernel(s, \ + ck_tile::make_kernel(k_{}, grids, k_::get_block_size(), 0, a)); \ + } + +#define CK_TILE_DISPATCH_NAIVE_ATTEN_FWD_LAOYUT_() \ + if(t.variation == 0 && t.q_layout == "bshd" && t.k_layout == "bshd" && t.v_layout == "bshd" && \ + t.o_layout == "bshd") \ + { \ + constexpr auto q_layout_ = naive_attention_layout_enum::BSHD; \ + constexpr auto k_layout_ = naive_attention_layout_enum::BSHD; \ + constexpr auto v_layout_ = naive_attention_layout_enum::BSHD; \ + constexpr auto o_layout_ = naive_attention_layout_enum::BSHD; \ + constexpr int variation_ = 0; \ + CK_TILE_DISPATCH_NAIVE_ATTEN_FWD_INTERNAL_(); \ + } \ + else if(t.variation == 0 && t.q_layout == "bhsd" && t.k_layout == "bhsd" && \ + t.v_layout == "bhsd" && t.o_layout == "bhsd") \ + { \ + constexpr auto q_layout_ = naive_attention_layout_enum::BHSD; \ + constexpr auto k_layout_ = naive_attention_layout_enum::BHSD; \ + constexpr auto v_layout_ = naive_attention_layout_enum::BHSD; \ + constexpr auto o_layout_ = naive_attention_layout_enum::BHSD; \ + constexpr int variation_ = 0; \ + CK_TILE_DISPATCH_NAIVE_ATTEN_FWD_INTERNAL_(); \ + } \ + else if(t.variation == 2 && t.q_layout == "bhsd" && t.k_layout == "phdsx" && \ + t.v_layout == "phds" && t.o_layout == "bhsd") \ + { \ + constexpr auto q_layout_ = naive_attention_layout_enum::BHSD; \ + constexpr auto k_layout_ = naive_attention_layout_enum::PHDSX; \ + constexpr auto v_layout_ = 
naive_attention_layout_enum::PHDS; \ + constexpr auto o_layout_ = naive_attention_layout_enum::BHSD; \ + constexpr int variation_ = 2; \ + CK_TILE_DISPATCH_NAIVE_ATTEN_FWD_INTERNAL_(); \ + } + +// +CK_TILE_HOST float naive_attention_fwd(naive_attention_fwd_traits t, + naive_attention_fwd_args a, + ck_tile::stream_config s) +{ + float r = -1; + // TODO: do not explicitly create too much instance! + if(t.q_type == "fp16" && t.k_type == "fp16" && t.v_type == "fp16" && t.o_type == "fp16") + { + using q_type_ = fp16_t; + using k_type_ = fp16_t; + using v_type_ = fp16_t; + using o_type_ = fp16_t; + using acc_type_ = float; + CK_TILE_DISPATCH_NAIVE_ATTEN_FWD_LAOYUT_(); + } + else if(t.q_type == "bf16" && t.k_type == "bf16" && t.v_type == "bf16" && t.o_type == "bf16") + { + using q_type_ = bf16_t; + using k_type_ = bf16_t; + using v_type_ = bf16_t; + using o_type_ = bf16_t; + using acc_type_ = float; + CK_TILE_DISPATCH_NAIVE_ATTEN_FWD_LAOYUT_(); + } + else if(t.q_type == "bf16" && t.k_type == "int8" && t.v_type == "int8" && t.o_type == "bf16") + { + using q_type_ = bf16_t; + using k_type_ = int8_t; + using v_type_ = int8_t; + using o_type_ = bf16_t; + using acc_type_ = int32_t; // NOTE! + CK_TILE_DISPATCH_NAIVE_ATTEN_FWD_LAOYUT_(); + } + else if(t.q_type == "fp16" && t.k_type == "int8" && t.v_type == "int8" && t.o_type == "fp16") + { + using q_type_ = fp16_t; + using k_type_ = int8_t; + using v_type_ = int8_t; + using o_type_ = fp16_t; + using acc_type_ = int32_t; // NOTE! 
+ CK_TILE_DISPATCH_NAIVE_ATTEN_FWD_LAOYUT_(); + } + return r; +} + +#undef CK_TILE_DISPATCH_NAIVE_ATTEN_FWD_LAOYUT_ +#undef CK_TILE_DISPATCH_NAIVE_ATTEN_FWD_INTERNAL_ + +} // namespace ck_tile diff --git a/include/ck_tile/remod.py b/include/ck_tile/remod.py index b0d2c36ef..9f2ef3389 100644 --- a/include/ck_tile/remod.py +++ b/include/ck_tile/remod.py @@ -7,6 +7,7 @@ import copy NS = 'ck_tile' OPS = 'ops' +REF = 'ref' OPS_COMMON = 'common' # common header will be duplicated into ops/* other module HEADER_COMMON = f"""// SPDX-License-Identifier: MIT @@ -29,6 +30,9 @@ class submodule_t: def push(self, f): if len(f.parents) != 1: # ignore ./xxx.hpp mod = get_module(f) + # ref is supposed to include one header on demand + if mod == REF: + return if mod == OPS: if mod not in self.m.keys(): self.m[mod] = dict() -- GitLab From 4e73177684817d425fc583b8827dd09d0c609e94 Mon Sep 17 00:00:00 2001 From: chenjun <46212055+junhaha666@users.noreply.github.com> Date: Fri, 13 Dec 2024 11:53:52 +0800 Subject: [PATCH 116/153] Ck tile/smoothquant out stride (#1742) * add ck_tile/smoothquant out stride parameter * Remove the default stride value --------- Co-authored-by: so --- .../12_smoothquant/example_smoothquant.cpp | 44 +++++++++++-------- .../ck_tile/12_smoothquant/smoothquant.cpp | 44 +++++++++++-------- .../smoothquant/kernel/smoothquant_kernel.hpp | 20 ++++++--- 3 files changed, 66 insertions(+), 42 deletions(-) diff --git a/example/ck_tile/12_smoothquant/example_smoothquant.cpp b/example/ck_tile/12_smoothquant/example_smoothquant.cpp index 3a26eb6a7..aa1d1adfd 100644 --- a/example/ck_tile/12_smoothquant/example_smoothquant.cpp +++ b/example/ck_tile/12_smoothquant/example_smoothquant.cpp @@ -35,7 +35,8 @@ auto create_args(int argc, char* argv[]) ck_tile::ArgParser arg_parser; arg_parser.insert("m", "3328", "m dimension") .insert("n", "4096", "n dimension") - .insert("stride", "-1", "stride per row, if -1 then equal to n") + .insert("x_stride", "-1", "input stride per row, if -1 
then equal to n") + .insert("y_stride", "-1", "output stride per row, if -1 then equal to n") .insert("e", "1e-5", "epsilon") .insert("v", "1", "cpu validation or not") .insert("prec", "fp16", "precision") @@ -49,11 +50,14 @@ auto create_args(int argc, char* argv[]) template bool run(const ck_tile::ArgParser& arg_parser) { - ck_tile::index_t m = arg_parser.get_int("m"); - ck_tile::index_t n = arg_parser.get_int("n"); - ck_tile::index_t stride = arg_parser.get_int("stride"); - if(stride < 0) - stride = n; + ck_tile::index_t m = arg_parser.get_int("m"); + ck_tile::index_t n = arg_parser.get_int("n"); + ck_tile::index_t x_stride = arg_parser.get_int("x_stride"); + if(x_stride < 0) + x_stride = n; + ck_tile::index_t y_stride = arg_parser.get_int("y_stride"); + if(y_stride < 0) + y_stride = n; std::string data_type = arg_parser.get_str("prec"); int do_validation = arg_parser.get_int("v"); int warmup = arg_parser.get_int("warmup"); @@ -68,14 +72,14 @@ bool run(const ck_tile::ArgParser& arg_parser) using ComputeDataType = float; // host verify - ck_tile::HostTensor x_host({m, n}, {stride, 1}); + ck_tile::HostTensor x_host({m, n}, {x_stride, 1}); ck_tile::HostTensor xscale_host({n}); ck_tile::HostTensor yscale_host_ref({m}, {1}); ck_tile::HostTensor yscale_host_dev({m}, {1}); - ck_tile::HostTensor qy_host_ref({m, n}, {stride, 1}); - ck_tile::HostTensor qy_host_dev({m, n}, {stride, 1}); + ck_tile::HostTensor qy_host_ref({m, n}, {y_stride, 1}); + ck_tile::HostTensor qy_host_dev({m, n}, {y_stride, 1}); ck_tile::FillUniformDistribution{-.5f, .5f}(x_host); ck_tile::FillUniformDistribution{1e-3, .5f}(xscale_host); @@ -116,7 +120,8 @@ bool run(const ck_tile::ArgParser& arg_parser) qy_buf.GetDeviceBuffer(), m, n, - stride}; + x_stride, + y_stride}; auto kargs = Kernel::MakeKargs(args); @@ -133,7 +138,7 @@ bool run(const ck_tile::ArgParser& arg_parser) if(do_validation) { using YDataType = ComputeDataType; - ck_tile::HostTensor y_host({m, n}, {stride, 1}); + ck_tile::HostTensor 
y_host({m, n}, {y_stride, 1}); // smooth outlier { auto f = [&](auto n_) { @@ -183,7 +188,7 @@ bool run(const ck_tile::ArgParser& arg_parser) qy_buf.FromDevice(qy_host_dev.data()); auto [rtol, atol] = get_elimit(); - if(stride == n) + if(y_stride == n) { pass = ck_tile::check_err(qy_host_dev, qy_host_ref, @@ -195,10 +200,12 @@ bool run(const ck_tile::ArgParser& arg_parser) { for(int i_r = 0; i_r < m; i_r++) { - std::vector qy_host_dev_row(qy_host_dev.begin() + i_r * stride, - qy_host_dev.begin() + i_r * stride + n); - std::vector qy_host_ref_row(qy_host_ref.begin() + i_r * stride, - qy_host_ref.begin() + i_r * stride + n); + std::vector qy_host_dev_row(qy_host_dev.begin() + i_r * y_stride, + qy_host_dev.begin() + i_r * y_stride + + n); + std::vector qy_host_ref_row(qy_host_ref.begin() + i_r * y_stride, + qy_host_ref.begin() + i_r * y_stride + + n); pass &= ck_tile::check_err(qy_host_dev_row, qy_host_ref_row, std::string("qy[") + std::to_string(i_r) + @@ -210,8 +217,9 @@ bool run(const ck_tile::ArgParser& arg_parser) } std::cout << "[" << data_type << "]" - << " m:" << m << ", n:" << n << ", stride:" << stride - << ", valid:" << (pass ? "y" : "n") << std::flush << std::endl; + << " m:" << m << ", n:" << n << ", x_stride:" << x_stride + << ", y_stride:" << y_stride << ", valid:" << (pass ? 
"y" : "n") << std::flush + << std::endl; } return pass; diff --git a/example/ck_tile/12_smoothquant/smoothquant.cpp b/example/ck_tile/12_smoothquant/smoothquant.cpp index ed01d654f..fd1c4ec7b 100644 --- a/example/ck_tile/12_smoothquant/smoothquant.cpp +++ b/example/ck_tile/12_smoothquant/smoothquant.cpp @@ -33,7 +33,8 @@ auto create_args(int argc, char* argv[]) ck_tile::ArgParser arg_parser; arg_parser.insert("m", "3328", "m dimension") .insert("n", "4096", "n dimension") - .insert("stride", "-1", "stride per row, if -1 then equal to n") + .insert("x_stride", "-1", "input stride per row, if -1 then equal to n") + .insert("y_stride", "-1", "output stride per row, if -1 then equal to n") .insert("v", "1", "cpu validation or not") .insert("kname", "1", "print kernel name or not") .insert("prec", "fp16", "precision") @@ -47,18 +48,21 @@ auto create_args(int argc, char* argv[]) template bool run(const ck_tile::ArgParser& arg_parser) { - ck_tile::index_t m = arg_parser.get_int("m"); - ck_tile::index_t n = arg_parser.get_int("n"); - ck_tile::index_t stride = arg_parser.get_int("stride"); - if(stride < 0) - stride = n; + ck_tile::index_t m = arg_parser.get_int("m"); + ck_tile::index_t n = arg_parser.get_int("n"); + ck_tile::index_t x_stride = arg_parser.get_int("x_stride"); + if(x_stride < 0) + x_stride = n; + ck_tile::index_t y_stride = arg_parser.get_int("y_stride"); + if(y_stride < 0) + y_stride = n; std::string data_type = arg_parser.get_str("prec"); int kname = arg_parser.get_int("kname"); int do_validation = arg_parser.get_int("v"); int warmup = arg_parser.get_int("warmup"); int repeat = arg_parser.get_int("repeat"); - assert(stride >= n); + assert(x_stride >= n); using TypeConfig = SmoothquantTypeConfig; @@ -69,14 +73,14 @@ bool run(const ck_tile::ArgParser& arg_parser) using ComputeDataType = typename TypeConfig::ComputeDataType; // host verify - ck_tile::HostTensor x_host({m, n}, {stride, 1}); + ck_tile::HostTensor x_host({m, n}, {x_stride, 1}); 
ck_tile::HostTensor xscale_host({n}); ck_tile::HostTensor yscale_host_ref({m}, {1}); ck_tile::HostTensor yscale_host_dev({m}, {1}); - ck_tile::HostTensor qy_host_ref({m, n}, {stride, 1}); - ck_tile::HostTensor qy_host_dev({m, n}, {stride, 1}); + ck_tile::HostTensor qy_host_ref({m, n}, {y_stride, 1}); + ck_tile::HostTensor qy_host_dev({m, n}, {y_stride, 1}); ck_tile::FillUniformDistribution{-.5f, .5f}(x_host); ck_tile::FillUniformDistribution{1e-3, .5f}(xscale_host); @@ -90,7 +94,8 @@ bool run(const ck_tile::ArgParser& arg_parser) xscale_buf.ToDevice(xscale_host.data()); std::cout << "[" << data_type << "]" - << " m:" << m << ", n:" << n << ", stride:" << stride << std::flush; + << " m:" << m << ", n:" << n << ", x_stride:" << x_stride << ", y_stride:" << y_stride + << std::flush; smoothquant_traits traits{data_type}; @@ -100,7 +105,8 @@ bool run(const ck_tile::ArgParser& arg_parser) qy_buf.GetDeviceBuffer(), m, n, - stride}; + x_stride, + y_stride}; float ave_time = smoothquant( traits, args, ck_tile::stream_config{nullptr, true, kname ? 
1 : 0, warmup, repeat}); @@ -116,7 +122,7 @@ bool run(const ck_tile::ArgParser& arg_parser) if(do_validation) { using YDataType = ComputeDataType; - ck_tile::HostTensor y_host({m, n}, {stride, 1}); + ck_tile::HostTensor y_host({m, n}, {y_stride, 1}); // smooth outlier { auto f = [&](auto n_) { @@ -166,7 +172,7 @@ bool run(const ck_tile::ArgParser& arg_parser) qy_buf.FromDevice(qy_host_dev.data()); auto [rtol, atol] = get_elimit(); - if(stride == n) + if(y_stride == n) { pass = ck_tile::check_err(qy_host_dev, qy_host_ref, @@ -178,10 +184,12 @@ bool run(const ck_tile::ArgParser& arg_parser) { for(int i_r = 0; i_r < m; i_r++) { - std::vector qy_host_dev_row(qy_host_dev.begin() + i_r * stride, - qy_host_dev.begin() + i_r * stride + n); - std::vector qy_host_ref_row(qy_host_ref.begin() + i_r * stride, - qy_host_ref.begin() + i_r * stride + n); + std::vector qy_host_dev_row(qy_host_dev.begin() + i_r * y_stride, + qy_host_dev.begin() + i_r * y_stride + + n); + std::vector qy_host_ref_row(qy_host_ref.begin() + i_r * y_stride, + qy_host_ref.begin() + i_r * y_stride + + n); pass &= ck_tile::check_err(qy_host_dev_row, qy_host_ref_row, std::string("qy[") + std::to_string(i_r) + diff --git a/include/ck_tile/ops/smoothquant/kernel/smoothquant_kernel.hpp b/include/ck_tile/ops/smoothquant/kernel/smoothquant_kernel.hpp index 6ec333516..0b3d9d6ca 100644 --- a/include/ck_tile/ops/smoothquant/kernel/smoothquant_kernel.hpp +++ b/include/ck_tile/ops/smoothquant/kernel/smoothquant_kernel.hpp @@ -19,7 +19,8 @@ struct SmoothquantHostArgs index_t m; index_t n; - index_t stride; // row_stride + index_t x_stride; // input row_stride + index_t y_stride; // output row_stride }; // TODO: Extract some type to wrapper class @@ -58,14 +59,21 @@ struct Smoothquant index_t m; index_t n; - index_t stride; // row_stride + index_t x_stride; // input row_stride + index_t y_stride; // out row_stride }; using Hargs = SmoothquantHostArgs; CK_TILE_HOST static constexpr Kargs MakeKargs(const Hargs& hargs) { - 
return Kargs{ - hargs.p_x, hargs.p_xscale, hargs.p_yscale, hargs.p_qy, hargs.m, hargs.n, hargs.stride}; + return Kargs{hargs.p_x, + hargs.p_xscale, + hargs.p_yscale, + hargs.p_qy, + hargs.m, + hargs.n, + hargs.x_stride, + hargs.y_stride}; } CK_TILE_HOST static constexpr auto GridSize(const Hargs& hargs) @@ -116,7 +124,7 @@ struct Smoothquant const auto tmp_ = make_naive_tensor_view( static_cast(kargs.p_x), make_tuple(kargs.m, kargs.n), - make_tuple(kargs.stride, 1), + make_tuple(kargs.x_stride, 1), number{}, number<1>{}); @@ -157,7 +165,7 @@ struct Smoothquant auto tmp_ = make_naive_tensor_view( static_cast(kargs.p_qy), make_tuple(kargs.m, kargs.n), - make_tuple(kargs.stride, 1), + make_tuple(kargs.y_stride, 1), number{}, number<1>{}); -- GitLab From 4d8fce33dddfc003432ae06848f6416a9d5d5e2f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bart=C5=82omiej=20Kocot?= Date: Fri, 13 Dec 2024 21:08:35 +0100 Subject: [PATCH 117/153] Add SplitK support into Batched GEMM V3 (#1729) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * add bmm api * add bf16 multi_d * add ckProfiler for bf16 * add ckProfiler files * add more instance; fixed 64bit index issue * fixed naming * enabled batched Ds * use long_index for ds offsets * clean * add bmm fp8 ckProfiler * Update example/24_batched_gemm/batched_gemm_xdl_bf16_v3.cpp Co-authored-by: Bartłomiej Kocot * Update example/24_batched_gemm/batched_gemm_xdl_fp8_rowwise_v3.cpp Co-authored-by: Bartłomiej Kocot * Update example/24_batched_gemm/run_batched_gemm_example_rowwise.inc Co-authored-by: Bartłomiej Kocot * Update library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_bf16_bf16_bf16/device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn.hpp Co-authored-by: Bartłomiej Kocot * Update 
library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_bf16_bf16_bf16/device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_v1_default_instance.cpp Co-authored-by: Bartłomiej Kocot * Update library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_bf16_bf16_bf16/device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_v2_default_instance.cpp Co-authored-by: Bartłomiej Kocot * Update profiler/src/profile_gemm_universal_batched.cpp Co-authored-by: Bartłomiej Kocot * Update profiler/include/profiler/profile_gemm_universal_batched_impl.hpp Co-authored-by: Bartłomiej Kocot * clean * Update include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_xdl_cshuffle_v3.hpp * Update include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_xdl_cshuffle_v3.hpp * Update library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_bf16_bf16_bf16/device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_comp_default_instance.cpp * Update include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_xdl_cshuffle_v3.hpp * Update include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_xdl_cshuffle_v3.hpp * Update include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_xdl_cshuffle_v3.hpp * refactor batch offset func * add splitk suppport into bmm_v3 * clean * clean * format * fixed * fix --------- Co-authored-by: Jing Zhang Co-authored-by: zjing14 --- .../batched_gemm_xdl_bf16_v3.cpp | 4 +- .../device/device_batched_gemm_multi_d.hpp | 3 +- ...atched_gemm_multiple_d_xdl_cshuffle_v3.hpp | 45 ++++-- .../gridwise_gemm_xdl_cshuffle_v3_multi_d.hpp | 16 +- ..._xdl_universal_bf16_bf16_bf16_mk_nk_mn.hpp | 3 + ...gemm_xdl_universal_f8_f8_bf16_mk_nk_mn.hpp | 2 + .../profile_gemm_universal_batched_impl.hpp | 148 ++++++++++-------- .../src/profile_gemm_universal_batched.cpp | 20 
+-- 8 files changed, 137 insertions(+), 104 deletions(-) diff --git a/example/24_batched_gemm/batched_gemm_xdl_bf16_v3.cpp b/example/24_batched_gemm/batched_gemm_xdl_bf16_v3.cpp index fa8b75218..548500518 100644 --- a/example/24_batched_gemm/batched_gemm_xdl_bf16_v3.cpp +++ b/example/24_batched_gemm/batched_gemm_xdl_bf16_v3.cpp @@ -78,14 +78,14 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceBatchedGemmMultiD 2, // ABlockTransferSrcVectorDim 8, // ABlockTransferSrcScalarPerVector 8, // ABlockTransferDstScalarPerVector_AK1 - 1, // ABlockLdsExtraM + 0, // ABlockLdsExtraM S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1 S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder S<1, 0, 2>, // BBlockTransferSrcAccessOrder 2, // BBlockTransferSrcVectorDim 8, // BBlockTransferSrcScalarPerVector 8, // BBlockTransferDstScalarPerVector_BK1 - 1, // BBlockLdsExtraN + 0, // BBlockLdsExtraN 1, // CShuffleMXdlPerWavePerShuffle 1, // CShuffleNXdlPerWavePerShuffle S<1, 32, 1, 8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock diff --git a/include/ck/tensor_operation/gpu/device/device_batched_gemm_multi_d.hpp b/include/ck/tensor_operation/gpu/device/device_batched_gemm_multi_d.hpp index 58c0288e8..8fb4a71f5 100644 --- a/include/ck/tensor_operation/gpu/device/device_batched_gemm_multi_d.hpp +++ b/include/ck/tensor_operation/gpu/device/device_batched_gemm_multi_d.hpp @@ -89,7 +89,8 @@ struct DeviceBatchedGemmV2MultiD : public BaseOperator index_t BatchStrideE, AElementwiseOperation a_element_op, BElementwiseOperation b_element_op, - CDEElementwiseOperation cde_element_op) = 0; + CDEElementwiseOperation cde_element_op, + index_t KBatch) = 0; virtual std::unique_ptr MakeInvokerPointer() = 0; }; diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_xdl_cshuffle_v3.hpp index 314ecdf76..5f5bea4f8 100644 --- 
a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_xdl_cshuffle_v3.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_xdl_cshuffle_v3.hpp @@ -41,12 +41,15 @@ __global__ void __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()]; const index_t g_idx = blockIdx.z % karg.Batch; + const index_t k_idx = blockIdx.z / karg.Batch; const auto a_batch_offset = karg.compute_ptr_offset_of_batch.GetAPtrOffset(g_idx); const auto b_batch_offset = karg.compute_ptr_offset_of_batch.GetBPtrOffset(g_idx); const auto ds_batch_offset = karg.compute_ptr_offset_of_batch.GetDsPtrOffset(g_idx); const auto c_batch_offset = karg.compute_ptr_offset_of_batch.GetCPtrOffset(g_idx); + auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg, k_idx); + // populate pointer, desc for Ds static_for<0, GridwiseGemm::NumDTensor, 1>{}([&](auto i) { // D pointer @@ -54,8 +57,8 @@ __global__ void }); GridwiseGemm::template Run( - karg.p_a_grid + a_batch_offset, - karg.p_b_grid + b_batch_offset, + karg.p_a_grid + a_batch_offset + splitk_batch_offset.a_k_split_offset, + karg.p_b_grid + b_batch_offset + splitk_batch_offset.b_k_split_offset, karg.p_ds_grid, karg.p_c_grid + c_batch_offset, p_shared, @@ -87,12 +90,15 @@ __global__ void __shared__ char p_shared_1[GridwiseGemm::GetSharedMemoryNumberOfByte()]; const index_t g_idx = blockIdx.z % karg.Batch; + const index_t k_idx = blockIdx.z / karg.Batch; const auto a_batch_offset = karg.compute_ptr_offset_of_batch.GetAPtrOffset(g_idx); const auto b_batch_offset = karg.compute_ptr_offset_of_batch.GetBPtrOffset(g_idx); const auto ds_batch_offset = karg.compute_ptr_offset_of_batch.GetDsPtrOffset(g_idx); const auto c_batch_offset = karg.compute_ptr_offset_of_batch.GetCPtrOffset(g_idx); + auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg, k_idx); + // populate pointer, desc for Ds static_for<0, GridwiseGemm::NumDTensor, 1>{}([&](auto i) { // D pointer @@ 
-100,8 +106,8 @@ __global__ void }); GridwiseGemm::template Run_2Lds( - karg.p_a_grid + a_batch_offset, - karg.p_b_grid + b_batch_offset, + karg.p_a_grid + a_batch_offset + splitk_batch_offset.a_k_split_offset, + karg.p_b_grid + b_batch_offset + splitk_batch_offset.b_k_split_offset, karg.p_ds_grid, karg.p_c_grid + c_batch_offset, p_shared_0, @@ -303,7 +309,8 @@ struct DeviceBatchedGemmMultiD_Xdl_CShuffle_V3 index_t Batch_, AElementwiseOperation a_element_op_, BElementwiseOperation b_element_op_, - CElementwiseOperation c_element_op_) + CElementwiseOperation c_element_op_, + index_t KBatch_) : GridwiseGemm::Argument{p_a_grid_, p_b_grid_, p_ds_grid_, @@ -315,7 +322,7 @@ struct DeviceBatchedGemmMultiD_Xdl_CShuffle_V3 StrideB_, StrideDs_, StrideE_, - 1, + KBatch_, a_element_op_, b_element_op_, c_element_op_}, @@ -336,13 +343,14 @@ struct DeviceBatchedGemmMultiD_Xdl_CShuffle_V3 arg.Print(); } - if(!GridwiseGemm::CheckValidity(arg) || arg.KBatch > 1) + if(!GridwiseGemm::CheckValidity(arg)) { throw std::runtime_error("wrong! 
GridwiseGemm has invalid setting"); } index_t gdx, gdy, gdz; - std::tie(gdx, gdy, gdz) = GridwiseGemm::CalculateGridSize(arg.M, arg.N, arg.Batch); + std::tie(gdx, gdy, gdz) = + GridwiseGemm::CalculateGridSize(arg.M, arg.N, arg.Batch * arg.KBatch); float ave_time = 0; @@ -387,10 +395,11 @@ struct DeviceBatchedGemmMultiD_Xdl_CShuffle_V3 rotating_mem.Next(); // clear c mem if(arg_.KBatch > 1) - hipGetErrorString(hipMemsetAsync(arg_.p_c_grid, - 0, - arg_.M * arg_.N * sizeof(CDataType), - stream_config.stream_id_)); + hipGetErrorString( + hipMemsetAsync(arg_.p_c_grid, + 0, + arg.Batch * arg_.M * arg_.N * sizeof(CDataType), + stream_config.stream_id_)); }; ave_time = ck::utility::launch_and_time_kernel_with_preprocess( @@ -889,7 +898,8 @@ struct DeviceBatchedGemmMultiD_Xdl_CShuffle_V3 index_t BatchStrideE, AElementwiseOperation a_element_op, BElementwiseOperation b_element_op, - CElementwiseOperation c_element_op) + CElementwiseOperation c_element_op, + index_t KBatch = 1) { return Argument{static_cast(p_a), static_cast(p_b), @@ -909,7 +919,8 @@ struct DeviceBatchedGemmMultiD_Xdl_CShuffle_V3 Batch, a_element_op, b_element_op, - c_element_op}; + c_element_op, + KBatch}; } static auto MakeInvoker() { return Invoker{}; } @@ -934,7 +945,8 @@ struct DeviceBatchedGemmMultiD_Xdl_CShuffle_V3 index_t BatchStrideE, AElementwiseOperation a_element_op, BElementwiseOperation b_element_op, - CElementwiseOperation c_element_op) override + CElementwiseOperation c_element_op, + index_t KBatch = 1) override { return std::make_unique(static_cast(p_a), static_cast(p_b), @@ -954,7 +966,8 @@ struct DeviceBatchedGemmMultiD_Xdl_CShuffle_V3 Batch, a_element_op, b_element_op, - c_element_op); + c_element_op, + KBatch); } // polymorphic diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d.hpp index c7038ed4f..e5a31f8d1 100644 --- 
a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d.hpp @@ -41,7 +41,7 @@ __global__ void #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx9__)) __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()]; - auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg); + auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg, blockIdx.z); GridwiseGemm::template Run( karg.p_a_grid + splitk_batch_offset.a_k_split_offset, @@ -76,7 +76,7 @@ __global__ void __shared__ char p_shared_0[GridwiseGemm::GetSharedMemoryNumberOfByte()]; __shared__ char p_shared_1[GridwiseGemm::GetSharedMemoryNumberOfByte()]; - auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg); + auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg, blockIdx.z); GridwiseGemm::template Run_2Lds( karg.p_a_grid + splitk_batch_offset.a_k_split_offset, @@ -639,27 +639,27 @@ struct GridwiseGemmMultiD_xdl_cshuffle_v3 struct SplitKBatchOffset { - __device__ SplitKBatchOffset(Argument& karg) + __device__ SplitKBatchOffset(Argument& karg, index_t k_id) { if constexpr(is_same_v) { - a_k_split_offset = blockIdx.z * karg.KRead; + a_k_split_offset = k_id * karg.KRead; } else if constexpr(is_same_v) { - a_k_split_offset = blockIdx.z * karg.KRead * karg.StrideA; + a_k_split_offset = k_id * karg.KRead * karg.StrideA; } if constexpr(is_same_v) { - b_k_split_offset = blockIdx.z * karg.KRead * karg.StrideB; + b_k_split_offset = k_id * karg.KRead * karg.StrideB; } else if constexpr(is_same_v) { - b_k_split_offset = blockIdx.z * karg.KRead; + b_k_split_offset = k_id * karg.KRead; } - if(blockIdx.z < static_cast(karg.KBatch - 1)) + if(k_id < karg.KBatch - 1) { karg.K = karg.KRead; } diff --git 
a/library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_bf16_bf16_bf16/device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_bf16_bf16_bf16/device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn.hpp index 5db041de0..21cef335c 100644 --- a/library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_bf16_bf16_bf16/device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn.hpp +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_bf16_bf16_bf16/device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn.hpp @@ -52,6 +52,9 @@ using device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_comp_instances = DeviceBatchedGemmMultiD_Xdl_CShuffle_V3< Row, Col, DsLayout, Row, BF16, BF16, DsDataType, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 8, 8, 32, 32, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 16>, S<4>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>, DeviceBatchedGemmMultiD_Xdl_CShuffle_V3< Row, Col, DsLayout, Row, BF16, BF16, DsDataType, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 224, 256, 64, 8, 8, 16, 16, 7, 8, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 2, S<1, 16, 1, 16>, S<4>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, DeviceBatchedGemmMultiD_Xdl_CShuffle_V3< Row, Col, DsLayout, Row, BF16, BF16, DsDataType, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 224, 64, 8, 8, 16, 16, 8, 7, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 2, 1, S<1, 32, 1, 8>, S<4>, BlockGemmPipelineScheduler::Intrawave, 
BlockGemmPipelineVersion::v3>, + DeviceBatchedGemmMultiD_Xdl_CShuffle_V3< Row, Col, DsLayout, Row, BF16, BF16, DsDataType, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 160, 64, 8, 8, 16, 16, 8, 5, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 2, 1, S<1, 32, 1, 8>, S<4>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, + DeviceBatchedGemmMultiD_Xdl_CShuffle_V3< Row, Col, DsLayout, Row, BF16, BF16, DsDataType, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 160, 64, 8, 8, 32, 32, 1, 5, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 64, 1, 4>, S<8>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, + DeviceBatchedGemmMultiD_Xdl_CShuffle_V3< Row, Col, DsLayout, Row, BF16, BF16, DsDataType, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 160, 128, 64, 8, 8, 32, 32, 5, 1, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 16>, S<4>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, DeviceBatchedGemmMultiD_Xdl_CShuffle_V3< Row, Col, DsLayout, Row, BF16, BF16, DsDataType, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 8, 8, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 16>, S<4>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, DeviceBatchedGemmMultiD_Xdl_CShuffle_V3< Row, Col, DsLayout, Row, BF16, BF16, DsDataType, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 8, 8, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 16>, S<4>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>, 
DeviceBatchedGemmMultiD_Xdl_CShuffle_V3< Row, Col, DsLayout, Row, BF16, BF16, DsDataType, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 8, 8, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 16>, S<4>, BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1> diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_f8_f8_bf16/device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_f8_f8_bf16/device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn.hpp index 355dc3212..552ac3cd0 100644 --- a/library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_f8_f8_bf16/device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn.hpp +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_f8_f8_bf16/device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn.hpp @@ -42,6 +42,7 @@ using device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_comp_instances = std //##################################| | | | | Type| Type| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise|Specialization| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| Pipeline| Pipeline| //##################################| | | | | | | | | | | Operation| Operation| Operation| | | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| 
_NWaveNPerXdl| Scheduler| Verision| //##################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + #ifdef __gfx94__ // Compute friendly DeviceBatchedGemmMultiD_Xdl_CShuffle_V3< Row, Col, DsLayout, Row, F8, F8, DsDataType, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 64, 16, 16, 32, 32, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, S<8>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4, F8>, @@ -72,6 +73,7 @@ using device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_instances = std: //##################################| | | | | Type| Type| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise|Specialization| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| Pipeline| Pipeline| //##################################| | | | | | | | | | | Operation| Operation| Operation| | | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| Scheduler| Verision| //##################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + #if defined(__gfx94__) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) DeviceBatchedGemmMultiD_Xdl_CShuffle_V3< Row, Col, DsLayout, Row, F8, F8, DsDataType, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 16, 128, 16, 16, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, 
S<1, 16, 1, 8>, S<2>, BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>, DeviceBatchedGemmMultiD_Xdl_CShuffle_V3< Row, Col, DsLayout, Row, F8, F8, DsDataType, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 128, 16, 16, 16, 16, 1, 1, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 4>, S<4>, BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>, diff --git a/profiler/include/profiler/profile_gemm_universal_batched_impl.hpp b/profiler/include/profiler/profile_gemm_universal_batched_impl.hpp index 53f81162a..f4300af8d 100644 --- a/profiler/include/profiler/profile_gemm_universal_batched_impl.hpp +++ b/profiler/include/profiler/profile_gemm_universal_batched_impl.hpp @@ -48,6 +48,7 @@ bool profile_gemm_universal_batched_impl(int do_verification, int StrideB, int StrideC, int BatchCount, + int KBatch, int n_warmup, int n_iter, uint64_t rotating = 0) @@ -147,89 +148,100 @@ bool profile_gemm_universal_batched_impl(int do_verification, float best_ave_time = 0; float best_tflops = 0; float best_gb_per_sec = 0; + float best_kbatch = 0; // profile device op instances for(auto& op_ptr : op_ptrs) { - std::unique_ptr argument_ptr; - // false branch for multi d dl kernel - - argument_ptr = - op_ptr->MakeArgumentPointer(static_cast(a_device_buf.GetDeviceBuffer()), - static_cast(b_device_buf.GetDeviceBuffer()), - {}, - static_cast(c_device_buf.GetDeviceBuffer()), - M, - N, - K, - BatchCount, - StrideA, - StrideB, - {}, - StrideC, - BatchStrideA, - BatchStrideB, - {}, - BatchStrideC, - ck::tensor_operation::element_wise::PassThrough{}, - ck::tensor_operation::element_wise::PassThrough{}, - ck::tensor_operation::element_wise::PassThrough{}); - - auto invoker_ptr = op_ptr->MakeInvokerPointer(); - - if(op_ptr->IsSupportedArgument(argument_ptr.get())) - { - // re-init C to zero before profiling next kernel - c_device_buf.SetZero(); - - std::string op_name = op_ptr->GetTypeString(); + 
std::vector kbatch_list = {1, 2, 4, 8, 16, 19, 32, 38}; - float ave_time = invoker_ptr->Run( - argument_ptr.get(), - StreamConfig{nullptr, time_kernel, 0, n_warmup, n_iter, true, rotating_count}); + if(KBatch > 0) + { + kbatch_list = {KBatch}; + } - std::size_t flop = std::size_t(2) * BatchCount * M * N * K; + for(std::size_t i = 0; i < kbatch_list.size(); i++) + { + auto kbatch_curr = kbatch_list[i]; + + auto argument_ptr = + op_ptr->MakeArgumentPointer(static_cast(a_device_buf.GetDeviceBuffer()), + static_cast(b_device_buf.GetDeviceBuffer()), + {}, + static_cast(c_device_buf.GetDeviceBuffer()), + M, + N, + K, + BatchCount, + StrideA, + StrideB, + {}, + StrideC, + BatchStrideA, + BatchStrideB, + {}, + BatchStrideC, + ck::tensor_operation::element_wise::PassThrough{}, + ck::tensor_operation::element_wise::PassThrough{}, + ck::tensor_operation::element_wise::PassThrough{}, + kbatch_curr); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + std::string op_name = op_ptr->GetTypeString(); - std::size_t num_btype = (sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + - sizeof(CDataType) * M * N) * - BatchCount; + float ave_time = invoker_ptr->Run( + argument_ptr.get(), + StreamConfig{nullptr, time_kernel, 0, n_warmup, n_iter, true, rotating_count}); - float tflops = static_cast(flop) / 1.E9 / ave_time; + std::size_t flop = std::size_t(2) * BatchCount * M * N * K; - float gb_per_sec = num_btype / 1.E6 / ave_time; + std::size_t num_btype = (sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + + sizeof(CDataType) * M * N) * + BatchCount; - std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec - << " GB/s, " << op_name << std::endl; + float tflops = static_cast(flop) / 1.E9 / ave_time; - if(tflops > best_tflops) - { - best_op_name = op_name; - best_tflops = tflops; - best_ave_time = ave_time; - best_gb_per_sec = gb_per_sec; - } + float gb_per_sec = num_btype / 1.E6 / 
ave_time; - if(do_verification) - { - c_device_buf.FromDevice(c_g_m_n_device_result.mData.data()); + std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec + << " GB/s, " << op_name << ", KBatch " << kbatch_curr << std::endl; - pass = pass & ck::utils::check_err(c_g_m_n_device_result, c_g_m_n_host_result); + if(tflops > best_tflops) + { + best_op_name = op_name; + best_tflops = tflops; + best_ave_time = ave_time; + best_gb_per_sec = gb_per_sec; + best_kbatch = kbatch_curr; + } - if(do_log) + if(do_verification) { - LogRangeAsType(std::cout << "a : ", a_g_m_k.mData, ",") << std::endl; - LogRangeAsType(std::cout << "b: ", b_g_k_n.mData, ",") << std::endl; - LogRangeAsType(std::cout << "c_host: ", c_g_m_n_host_result.mData, ",") - << std::endl; - LogRangeAsType( - std::cout << "c_device: ", c_g_m_n_device_result.mData, ",") - << std::endl; + c_device_buf.FromDevice(c_g_m_n_device_result.mData.data()); + + pass = pass & ck::utils::check_err(c_g_m_n_device_result, c_g_m_n_host_result); + + if(do_log) + { + LogRangeAsType(std::cout << "a : ", a_g_m_k.mData, ",") << std::endl; + LogRangeAsType(std::cout << "b: ", b_g_k_n.mData, ",") << std::endl; + LogRangeAsType( + std::cout << "c_host: ", c_g_m_n_host_result.mData, ",") + << std::endl; + LogRangeAsType( + std::cout << "c_device: ", c_g_m_n_device_result.mData, ",") + << std::endl; + } } } - } - else - { - std::cout << op_ptr->GetTypeString() << " does not support this problem" << std::endl; + else + { + std::cout << op_ptr->GetTypeString() << " does not support this problem" + << std::endl; + } } } @@ -270,8 +282,8 @@ bool profile_gemm_universal_batched_impl(int do_verification, std::cout << " B = " << BatchCount << " M = " << M << " N = " << N << " K = " << K << " StrideA = " << StrideA << " StrideB = " << StrideB << " StrideC = " << StrideC - << ": " << best_ave_time << " ms, " << best_tflops << " TFlops, " << best_gb_per_sec - << " GB/s, " << best_op_name << std::endl; + << " KBatch = " 
<< best_kbatch << ": " << best_ave_time << " ms, " << best_tflops + << " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl; return pass; } diff --git a/profiler/src/profile_gemm_universal_batched.cpp b/profiler/src/profile_gemm_universal_batched.cpp index 4afef8e55..d57511fbf 100644 --- a/profiler/src/profile_gemm_universal_batched.cpp +++ b/profiler/src/profile_gemm_universal_batched.cpp @@ -31,7 +31,7 @@ enum struct GemmDataType int profile_batched_gemm_universal(int argc, char* argv[]) { - if(argc != 18 && argc != 21) + if(argc != 19 && argc != 22) { // clang-format off printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n"); @@ -44,11 +44,11 @@ int profile_batched_gemm_universal(int argc, char* argv[]) printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n"); printf("arg6: print tensor value (0: no; 1: yes)\n"); printf("arg7: time kernel (0=n0, 1=yes)\n"); - printf("arg8 to 17: M, N, K, StrideA, StrideB, StrideC, BatchStrideA, BatchStrideB, BatchStrideC, BatchCount\n"); + printf("arg8 to 18: M, N, K, StrideA, StrideB, StrideC, BatchStrideA, BatchStrideB, BatchStrideC, BatchCount, KBatch\n"); printf("optional:\n"); - printf("arg18: number of warm-up cycles (default 1)\n"); - printf("arg19: number of iterations (default 10)\n"); - printf("arg20: memory for rotating buffer (default 0, size in MB)\n"); + printf("arg19: number of warm-up cycles (default 1)\n"); + printf("arg20: number of iterations (default 10)\n"); + printf("arg21: memory for rotating buffer (default 0, size in MB)\n"); // clang-format on exit(1); } @@ -56,11 +56,11 @@ int profile_batched_gemm_universal(int argc, char* argv[]) int n_warmup = 1; int n_iter = 10; uint64_t rotating = 0; - if(argc == 21) + if(argc == 22) { - n_warmup = std::stoi(argv[18]); - n_iter = std::stoi(argv[19]); - rotating = std::stoull(argv[20]) * 1024 * 1024; + n_warmup = std::stoi(argv[19]); + n_iter = std::stoi(argv[20]); + rotating = std::stoull(argv[21]) * 1024 * 1024; 
} const auto data_type = static_cast(std::stoi(argv[2])); @@ -83,6 +83,7 @@ int profile_batched_gemm_universal(int argc, char* argv[]) const int BatchStrideC = std::stoi(argv[16]); const int BatchCount = std::stoi(argv[17]); + const int KBatch = std::stoi(argv[18]); #if defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || defined(CK_USE_GFX94) using F8 = ck::f8_t; @@ -159,6 +160,7 @@ int profile_batched_gemm_universal(int argc, char* argv[]) StrideB_, StrideC_, BatchCount, + KBatch, n_warmup, n_iter, rotating); -- GitLab From 41ebf117a5927654a504803c19d18749babdeddd Mon Sep 17 00:00:00 2001 From: Illia Silin <98187287+illsilin@users.noreply.github.com> Date: Fri, 13 Dec 2024 16:30:22 -0800 Subject: [PATCH 118/153] Add zstd lib for building hipTensor. (#1745) * add zstd library to CI docker * fix the libzstd name --- Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/Dockerfile b/Dockerfile index 8ce158a20..4329c54c1 100644 --- a/Dockerfile +++ b/Dockerfile @@ -64,6 +64,7 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow- nano \ zlib1g-dev \ zip \ + libzstd-dev \ openssh-server \ clang-format-12 \ kmod && \ -- GitLab From d68974a5c68bd25bb8433302886213d7f5ff0d88 Mon Sep 17 00:00:00 2001 From: Illia Silin <98187287+illsilin@users.noreply.github.com> Date: Fri, 13 Dec 2024 16:30:39 -0800 Subject: [PATCH 119/153] upgrade pandas package (#1746) --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 4329c54c1..83edbfb8e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -94,7 +94,7 @@ RUN pip install --upgrade cmake==3.27.5 && \ dpkg -i dumb-init_*.deb && rm dumb-init_*.deb && \ # Install packages for processing the performance results pip3 install --upgrade pip && \ - pip3 install sqlalchemy==1.4.46 pymysql pandas==2.0.3 setuptools-rust sshtunnel==0.4.0 && \ + pip3 install sqlalchemy==1.4.46 pymysql pandas==2.2.3 setuptools-rust sshtunnel==0.4.0 && \ # Add render group groupadd -f render 
&& \ # Install the new rocm-cmake version -- GitLab From f57d720c67123b43cb6f18f4b8b5aa0c7c9f51ba Mon Sep 17 00:00:00 2001 From: "Xu, Shengnan" <117875955+shengnxu@users.noreply.github.com> Date: Sun, 15 Dec 2024 20:13:10 +0800 Subject: [PATCH 120/153] added moe interleaving pipeline (#1712) * added moe interleaving pipeline * remove redundant code * formater --------- Co-authored-by: root --- include/ck_tile/ops/flatmm.hpp | 1 + ...latmm_sn_32x128x512_1x4x1_16x16x32_itl.hpp | 510 +++++++++++++ ..._uk_gfx9_32x128x512_1x4x1_16x16x16_itl.inc | 708 ++++++++++++++++++ .../fused_moegemm_pipeline_flatmm_policy.hpp | 29 +- .../pipeline/fused_moegemm_traits.hpp | 4 +- 5 files changed, 1249 insertions(+), 3 deletions(-) create mode 100644 include/ck_tile/ops/flatmm/block/flatmm_sn_32x128x512_1x4x1_16x16x32_itl.hpp create mode 100644 include/ck_tile/ops/flatmm/block/uk/flatmm_sn_uk_gfx9_32x128x512_1x4x1_16x16x16_itl.inc diff --git a/include/ck_tile/ops/flatmm.hpp b/include/ck_tile/ops/flatmm.hpp index eee80cda4..ba76e3070 100644 --- a/include/ck_tile/ops/flatmm.hpp +++ b/include/ck_tile/ops/flatmm.hpp @@ -5,6 +5,7 @@ #include "ck_tile/ops/flatmm/block/flatmm_32x512x128_1x4x1_16x16x32.hpp" #include "ck_tile/ops/flatmm/block/flatmm_sn_32x128x512_1x4x1_16x16x32.hpp" +#include "ck_tile/ops/flatmm/block/flatmm_sn_32x128x512_1x4x1_16x16x32_itl.hpp" #include "ck_tile/ops/flatmm/block/flatmm_uk_config.hpp" #include "ck_tile/ops/common/generic_2d_block_shape.hpp" #include "ck_tile/ops/common/tensor_layout.hpp" diff --git a/include/ck_tile/ops/flatmm/block/flatmm_sn_32x128x512_1x4x1_16x16x32_itl.hpp b/include/ck_tile/ops/flatmm/block/flatmm_sn_32x128x512_1x4x1_16x16x32_itl.hpp new file mode 100644 index 000000000..681a69603 --- /dev/null +++ b/include/ck_tile/ops/flatmm/block/flatmm_sn_32x128x512_1x4x1_16x16x32_itl.hpp @@ -0,0 +1,510 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck_tile/core.hpp" +#include "ck_tile/ops/gemm/warp/warp_gemm.hpp" +#include "ck_tile/ops/flatmm/block/flatmm_uk_config.hpp" +#include "ck_tile/ops/flatmm/block/flatmm_sn_32x128x512_1x4x1_16x16x32.hpp" + +namespace ck_tile { + +// "S"tream update output along "N" +// A in smem, B load from global +// require 4 wave, occupancy=1c + +struct FlatmmSn_32x128x512_1x4x1_16x16x32_BF16_itl : public FlatmmSn_32x128x512_1x4x1_16x16x32_Base +{ + using BDataType = bf16_t; + using ODataType = bf16_t; + + // TODO: need paired with tile_window_linear! + // TODO: need call init_raw() before call this function! + // template + template + CK_TILE_DEVICE auto + operator()(const BRes& res_b, + const BCoords& cached_coords_b, + const ORes& res_o, + const OCoords& cached_coords_o, + const OFlags& o_flags, // this should be in sgpr + CK_TILE_LDS_ADDR void* smem, + index_t n, // loop along n dim + const ScaleTensor& scale_, + index_t tile_offset_b, // stride b is fixed to blockKr * blockW, but still can adjust + index_t tile_offset_o) + { + static_assert(BCoords::size() == 8); // 8 + static_assert(OCoords::size() == 8); + + const index_t tile_stride_b_bytes = tile_offset_b * sizeof(BDataType); + const index_t tile_stride_o_bytes = tile_offset_o * sizeof(ODataType); + + static_assert(ScaleTensor::size() == 2); + float s0 = scale_[number<0>{}]; + float s1 = scale_[number<1>{}]; + + // index_t loop_cnt = n / Block_N; + + register float v_c0 asm("v64"); + register float v_c1 asm("v65"); + register float v_c2 asm("v66"); + register float v_c3 asm("v67"); + register float v_c4 asm("v68"); + register float v_c5 asm("v69"); + register float v_c6 asm("v70"); + register float v_c7 asm("v71"); + register float v_c8 asm("v72"); + register float v_c9 asm("v73"); + register float v_c10 asm("v74"); + register float v_c11 asm("v75"); + register float v_c12 asm("v76"); + register float v_c13 asm("v77"); + register float v_c14 asm("v78"); + register float v_c15 asm("v79"); + 
register float v_c16 asm("v80"); + register float v_c17 asm("v81"); + register float v_c18 asm("v82"); + register float v_c19 asm("v83"); + register float v_c20 asm("v84"); + register float v_c21 asm("v85"); + register float v_c22 asm("v86"); + register float v_c23 asm("v87"); + register float v_c24 asm("v88"); + register float v_c25 asm("v89"); + register float v_c26 asm("v90"); + register float v_c27 asm("v91"); + register float v_c28 asm("v92"); + register float v_c29 asm("v93"); + register float v_c30 asm("v94"); + register float v_c31 asm("v95"); + int32_t nan_hi = 0x7fff0000; + int32_t nan_lo = 0x00007fff; + + // in smem, the layout is M0(2)*K0(128)*M1(16)*K1(4) + // every threads need 8xK in contiguous register + // ... and every wave need the same data + int lane_id = threadIdx.x % 64; + int sld_y_os = (lane_id % 16) * 4 + (lane_id / 16) * 128; + sld_y_os *= 2; + + // y y p p p y + // reg before shfl M0(2)*N0(2)*Nl(4)*Nw(4)*Mw(16)*Nv(4) + // but order is N0*M0*Nv + // in LDS we need store as + // M0(2)* N0(2) * Nl(4) * Nw(4) * (Mw(16)*Nv(4) + 4) + // y y wave-id lid/16 lid%16 v + // sst(v3) = (v0/16*34 + v0%16 * 2 + wid*136) * 4 + int sfl_sst = (threadIdx.x % 16 * 4) + (threadIdx.x / 16) * (64 + 4); + sfl_sst *= 2; + + // from LDS we need load as + // M0(2)* N0(2) * Nl(4) * Nw(4) * (Mw(16) * Nv(4) + 4) + // ( 2 issue) (rem 32-lane) (4 wave*4issue) 2lane*1ussue(pk2) + // sld(v4) = v0/2 *34*4 + v0 % 2 *4 + wid*2 *4 + int sfl_sld = (lane_id % 2) * 2 + (lane_id / 2) * (64 + 4) + (threadIdx.x / 64) * 4; + sfl_sld *= 2; + + // B nr->kr + // clang-format off +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Winline-asm" + asm volatile( +#define CK_TILE_FLATMM_UK_MFMA CK_TILE_FLATMM_UK_MFMA_BF16 +#include "uk/flatmm_sn_uk_gfx9_32x128x512_1x4x1_16x16x16_itl.inc" +#undef CK_TILE_FLATMM_UK_MFMA + :[smem_]"+r"(smem), + // [s_loop_cnt]"+s"(loop_cnt), + [s_loop_cnt]"+s"(n), + [c0]"+v" (v_c0), + [c1]"+v" (v_c1), + [c2]"+v" (v_c2), + [c3]"+v" (v_c3), + 
[c4]"+v" (v_c4), + [c5]"+v" (v_c5), + [c6]"+v" (v_c6), + [c7]"+v" (v_c7), + [c8]"+v" (v_c8), + [c9]"+v" (v_c9), + [c10]"+v"(v_c10), + [c11]"+v"(v_c11), + [c12]"+v"(v_c12), + [c13]"+v"(v_c13), + [c14]"+v"(v_c14), + [c15]"+v"(v_c15), + [c16]"+v"(v_c16), + [c17]"+v"(v_c17), + [c18]"+v"(v_c18), + [c19]"+v"(v_c19), + [c20]"+v"(v_c20), + [c21]"+v"(v_c21), + [c22]"+v"(v_c22), + [c23]"+v"(v_c23), + [c24]"+v"(v_c24), + [c25]"+v"(v_c25), + [c26]"+v"(v_c26), + [c27]"+v"(v_c27), + [c28]"+v"(v_c28), + [c29]"+v"(v_c29), + [c30]"+v"(v_c30), + [c31]"+v"(v_c31) + : + [sld_a_base]"n"(0), + [shfl_base]"n"(0), + [v_sld_y_os]"v"(sld_y_os), + [v_sfl_sld]"v"(sfl_sld), + [v_sfl_sst]"v"(sfl_sst), + [s_res_o0]"s"(res_o[0]), + [s_res_o1]"s"(res_o[1]), + //[s_res_o2]"s"(res_o[2]), + //[s_res_o3]"s"(res_o[3]), + [s_res_b0]"s"(res_b[0]), + [s_res_b1]"s"(res_b[1]), + [s_res_b2]"s"(res_b[2]), + [s_res_b3]"s"(res_b[3]), + [v_os_o0]"v"(static_cast(cached_coords_o[number<0>{}] * sizeof(ODataType))), + [v_os_o1]"v"(static_cast(cached_coords_o[number<1>{}] * sizeof(ODataType))), + [v_os_o2]"v"(static_cast(cached_coords_o[number<2>{}] * sizeof(ODataType))), + [v_os_o3]"v"(static_cast(cached_coords_o[number<3>{}] * sizeof(ODataType))), + [v_os_o4]"v"(static_cast(cached_coords_o[number<4>{}] * sizeof(ODataType))), + [v_os_o5]"v"(static_cast(cached_coords_o[number<5>{}] * sizeof(ODataType))), + [v_os_o6]"v"(static_cast(cached_coords_o[number<6>{}] * sizeof(ODataType))), + [v_os_o7]"v"(static_cast(cached_coords_o[number<7>{}] * sizeof(ODataType))), + [v_os_b0]"v"(static_cast(cached_coords_b[number<0>{}] * sizeof(BDataType))), + [v_os_b1]"v"(static_cast(cached_coords_b[number<1>{}] * sizeof(BDataType))), + [v_os_b2]"v"(static_cast(cached_coords_b[number<2>{}] * sizeof(BDataType))), + [v_os_b3]"v"(static_cast(cached_coords_b[number<3>{}] * sizeof(BDataType))), + [v_os_b4]"v"(static_cast(cached_coords_b[number<4>{}] * sizeof(BDataType))), + [v_os_b5]"v"(static_cast(cached_coords_b[number<5>{}] * 
sizeof(BDataType))), + [v_os_b6]"v"(static_cast(cached_coords_b[number<6>{}] * sizeof(BDataType))), + [v_os_b7]"v"(static_cast(cached_coords_b[number<7>{}] * sizeof(BDataType))), + + [s_tile_os_o]"s"(tile_stride_o_bytes), + [s_tile_os_b]"s"(tile_stride_b_bytes), + [scale_0]"v"(s0), + [scale_1]"v"(s1), + [v_nan_lo]"v"(nan_lo), + [v_nan_hi]"v"(nan_hi), + [s_execflag_0]"s"(o_flags[number<0>{}]), + [s_execflag_1]"s"(o_flags[number<1>{}]), + [s_execflag_2]"s"(o_flags[number<2>{}]), + [s_execflag_3]"s"(o_flags[number<3>{}]), + [s_execflag_4]"s"(o_flags[number<4>{}]), + [s_execflag_5]"s"(o_flags[number<5>{}]), + [s_execflag_6]"s"(o_flags[number<6>{}]), + [s_execflag_7]"s"(o_flags[number<7>{}]) + : + "memory", "a0", "a1", "a2", "a3", "a4", "a5", "a6", "a7", "a8", "a9", + "a10", "a11", "a12", "a13", "a14", "a15", "a16", "a17", "a18", "a19", + "a20", "a21", "a22", "a23", "a24", "a25", "a26", "a27", "a28", "a29", + "a30", "a31", "a32", "a33", "a34", "a35", "a36", "a37", "a38", "a39", + "a40", "a41", "a42", "a43", "a44", "a45", "a46", "a47", "a48", "a49", + "a50", "a51", "a52", "a53", "a54", "a55", "a56", "a57", "a58", "a59", + "a60", "a61", "a62", "a63", "a64", "a65", "a66", "a67", "a68", "a69", + "a70", "a71", "a72", "a73", "a74", "a75", "a76", "a77", "a78", "a79", + "a80", "a81", "a82", "a83", "a84", "a85", "a86", "a87", "a88", "a89", + "a90", "a91", "a92", "a93", "a94", "a95", "a96", "a97", "a98", "a99", + "a100", "a101", "a102", "a103", "a104", "a105", "a106", "a107", + "a108", "a109", "a110", "a111", "a112", "a113", "a114", "a115", + "a116", "a117", "a118", "a119", "a120", "a121", "a122", "a123", + "a124", "a125", "a126", "a127", "a128", "a129", "a130", "a131", + "a132", "a133", "a134", "a135", "a136", "a137", "a138", "a139", + "a140", "a141", "a142", "a143", "a144", "a145", "a146", "a147", + "a148", "a149", "a150", "a151", "a152", "a153", "a154", "a155", + "a156", "a157", "a158", "a159", "a160", "a161", "a162", "a163", + "a164", "a165", "a166", "a167", "a168", "a169", 
"a170", "a171", + "a172", "a173", "a174", "a175", "a176", "a177", "a178", "a179", + "a180", "a181", "a182", "a183", "a184", "a185", "a186", "a187", + "a188", "a189", "a190", "a191", "a192", "a193", "a194", "a195", + "a196", "a197", "a198", "a199", "a200", "a201", "a202", "a203", + "a204", "a205", "a206", "a207", "a208", "a209", "a210", "a211", + "a212", "a213", "a214", "a215", "a216", "a217", "a218", "a219", + "a220", "a221", "a222", "a223", "a224", "a225", "a226", "a227", + "a228", "a229", "a230", "a231", "a232", "a233", "a234", "a235", + "a236", "a237", "a238", "a239", "a240", "a241", "a242", "a243", + "a244", "a245", "a246", "a247", "a248", "a249", "a250", "a251", + "a252", "a253", "a254", "a255", + "s8", "s9", "s12", "s13", "s14", "s15", "s38", "s39", "s52", "s86", + "s36", "s37","s59","s80", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", + "v50", "v54", "v55", + "v64","v65","v66","v67","v68","v69","v70","v71", + "v72","v73","v74","v75","v76","v77","v78","v79", + "v80","v81","v82","v83","v84","v85","v86","v87", + "v88","v89","v90","v91","v92","v93","v94","v95", + "v128", "v129", "v130", "v131", + "v132", "v133", "v134", "v135", "v136", "v137", "v138", "v139", + "v140", "v141", "v142", "v143", "v144", "v145", "v146", "v147", + "v148", "v149", "v150", "v151", "v152", "v153", "v154", "v155", + "v156", "v157", "v158", "v159", "v160", "v161", "v162", "v163", + "v164", "v165", "v166", "v167", "v168", "v169", "v170", "v171", + "v172", "v173", "v174", "v175", "v176", "v177", "v178", "v179", + "v180", "v181", "v182", "v183", "v184", "v185", "v186", "v187", + "v188", "v189", "v190", "v191", "v192", "v193", "v194", "v195", + "v196", "v197", "v198", "v199", "v200", "v201", "v202", "v203", + "v204", "v205", "v206", "v207", "v208", "v209", "v210", "v211", + "v212", "v213", "v214", "v215", "v216", "v217", "v218", "v219", + "v220", "v221", "v222", "v223", "v224", "v225", "v226", "v227", + "v228", "v229", "v230", "v231", "v232", "v233", "v234", "v235", + "v236", 
"v237", "v238", "v239", "v240", "v241", "v242", "v243", + "v244", "v245", "v246", "v247", "v248", "v249", "v250", "v251", + "v252", "v253", "v254", "v255" + ); +#pragma clang diagnostic pop + // clang-format on + } +}; + +struct FlatmmSn_32x128x512_1x4x1_16x16x32_FP16_itl : public FlatmmSn_32x128x512_1x4x1_16x16x32_Base +{ + using BDataType = bf16_t; + using ODataType = bf16_t; + + // TODO: need paired with tile_window_linear! + // TODO: need call init_raw() before call this function! + // template + template + CK_TILE_DEVICE auto + operator()(const BRes& res_b, + const BCoords& cached_coords_b, + const ORes& res_o, + const OCoords& cached_coords_o, + const OFlags& o_flags, // this should be in sgpr + CK_TILE_LDS_ADDR void* smem, + index_t n, // loop along n dim + const ScaleTensor& scale_, + index_t tile_offset_b, // stride b is fixed to blockKr * blockW, but still can adjust + index_t tile_offset_o) + { + static_assert(BCoords::size() == 8); // 8 + static_assert(OCoords::size() == 8); + + const index_t tile_stride_b_bytes = tile_offset_b * sizeof(BDataType); + const index_t tile_stride_o_bytes = tile_offset_o * sizeof(ODataType); + + static_assert(ScaleTensor::size() == 2); + float s0 = scale_[number<0>{}]; + float s1 = scale_[number<1>{}]; + + // index_t loop_cnt = n / Block_N; + + register float v_c0 asm("v64"); + register float v_c1 asm("v65"); + register float v_c2 asm("v66"); + register float v_c3 asm("v67"); + register float v_c4 asm("v68"); + register float v_c5 asm("v69"); + register float v_c6 asm("v70"); + register float v_c7 asm("v71"); + register float v_c8 asm("v72"); + register float v_c9 asm("v73"); + register float v_c10 asm("v74"); + register float v_c11 asm("v75"); + register float v_c12 asm("v76"); + register float v_c13 asm("v77"); + register float v_c14 asm("v78"); + register float v_c15 asm("v79"); + register float v_c16 asm("v80"); + register float v_c17 asm("v81"); + register float v_c18 asm("v82"); + register float v_c19 asm("v83"); + 
register float v_c20 asm("v84"); + register float v_c21 asm("v85"); + register float v_c22 asm("v86"); + register float v_c23 asm("v87"); + register float v_c24 asm("v88"); + register float v_c25 asm("v89"); + register float v_c26 asm("v90"); + register float v_c27 asm("v91"); + register float v_c28 asm("v92"); + register float v_c29 asm("v93"); + register float v_c30 asm("v94"); + register float v_c31 asm("v95"); + int32_t nan_hi = 0x7fff0000; + int32_t nan_lo = 0x00007fff; + + // in smem, the layout is M0(2)*K0(128)*M1(16)*K1(4) + // every threads need 8xK in contiguous register + // ... and every wave need the same data + int lane_id = threadIdx.x % 64; + int sld_y_os = (lane_id % 16) * 4 + (lane_id / 16) * 128; + sld_y_os *= 2; + + // y y p p p y + // reg before shfl M0(2)*N0(2)*Nl(4)*Nw(4)*Mw(16)*Nv(4) + // but order is N0*M0*Nv + // in LDS we need store as + // M0(2)* N0(2) * Nl(4) * Nw(4) * (Mw(16)*Nv(4) + 4) + // y y wave-id lid/16 lid%16 v + // sst(v3) = (v0/16*34 + v0%16 * 2 + wid*136) * 4 + int sfl_sst = (threadIdx.x % 16 * 4) + (threadIdx.x / 16) * (64 + 4); + sfl_sst *= 2; + + // from LDS we need load as + // M0(2)* N0(2) * Nl(4) * Nw(4) * (Mw(16) * Nv(4) + 4) + // ( 2 issue) (rem 32-lane) (4 wave*4issue) 2lane*1ussue(pk2) + // sld(v4) = v0/2 *34*4 + v0 % 2 *4 + wid*2 *4 + int sfl_sld = (lane_id % 2) * 2 + (lane_id / 2) * (64 + 4) + (threadIdx.x / 64) * 4; + sfl_sld *= 2; + + // B nr->kr + // clang-format off +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Winline-asm" + asm volatile( +#define CK_TILE_FLATMM_UK_MFMA CK_TILE_FLATMM_UK_MFMA_FP16 +#include "uk/flatmm_sn_uk_gfx9_32x128x512_1x4x1_16x16x16_itl.inc" +#undef CK_TILE_FLATMM_UK_MFMA + :[smem_]"+r"(smem), + [s_loop_cnt]"+s"(n), + [c0]"+v" (v_c0), + [c1]"+v" (v_c1), + [c2]"+v" (v_c2), + [c3]"+v" (v_c3), + [c4]"+v" (v_c4), + [c5]"+v" (v_c5), + [c6]"+v" (v_c6), + [c7]"+v" (v_c7), + [c8]"+v" (v_c8), + [c9]"+v" (v_c9), + [c10]"+v"(v_c10), + [c11]"+v"(v_c11), + [c12]"+v"(v_c12), + 
[c13]"+v"(v_c13), + [c14]"+v"(v_c14), + [c15]"+v"(v_c15), + [c16]"+v"(v_c16), + [c17]"+v"(v_c17), + [c18]"+v"(v_c18), + [c19]"+v"(v_c19), + [c20]"+v"(v_c20), + [c21]"+v"(v_c21), + [c22]"+v"(v_c22), + [c23]"+v"(v_c23), + [c24]"+v"(v_c24), + [c25]"+v"(v_c25), + [c26]"+v"(v_c26), + [c27]"+v"(v_c27), + [c28]"+v"(v_c28), + [c29]"+v"(v_c29), + [c30]"+v"(v_c30), + [c31]"+v"(v_c31) + : + [sld_a_base]"n"(0), + [shfl_base]"n"(0), + [v_sld_y_os]"v"(sld_y_os), + [v_sfl_sld]"v"(sfl_sld), + [v_sfl_sst]"v"(sfl_sst), + [s_res_o0]"s"(res_o[0]), + [s_res_o1]"s"(res_o[1]), + //[s_res_o2]"s"(res_o[2]), + //[s_res_o3]"s"(res_o[3]), + [s_res_b0]"s"(res_b[0]), + [s_res_b1]"s"(res_b[1]), + [s_res_b2]"s"(res_b[2]), + [s_res_b3]"s"(res_b[3]), + [v_os_o0]"v"(static_cast(cached_coords_o[number<0>{}] * sizeof(ODataType))), + [v_os_o1]"v"(static_cast(cached_coords_o[number<1>{}] * sizeof(ODataType))), + [v_os_o2]"v"(static_cast(cached_coords_o[number<2>{}] * sizeof(ODataType))), + [v_os_o3]"v"(static_cast(cached_coords_o[number<3>{}] * sizeof(ODataType))), + [v_os_o4]"v"(static_cast(cached_coords_o[number<4>{}] * sizeof(ODataType))), + [v_os_o5]"v"(static_cast(cached_coords_o[number<5>{}] * sizeof(ODataType))), + [v_os_o6]"v"(static_cast(cached_coords_o[number<6>{}] * sizeof(ODataType))), + [v_os_o7]"v"(static_cast(cached_coords_o[number<7>{}] * sizeof(ODataType))), + [v_os_b0]"v"(static_cast(cached_coords_b[number<0>{}] * sizeof(BDataType))), + [v_os_b1]"v"(static_cast(cached_coords_b[number<1>{}] * sizeof(BDataType))), + [v_os_b2]"v"(static_cast(cached_coords_b[number<2>{}] * sizeof(BDataType))), + [v_os_b3]"v"(static_cast(cached_coords_b[number<3>{}] * sizeof(BDataType))), + [v_os_b4]"v"(static_cast(cached_coords_b[number<4>{}] * sizeof(BDataType))), + [v_os_b5]"v"(static_cast(cached_coords_b[number<5>{}] * sizeof(BDataType))), + [v_os_b6]"v"(static_cast(cached_coords_b[number<6>{}] * sizeof(BDataType))), + [v_os_b7]"v"(static_cast(cached_coords_b[number<7>{}] * sizeof(BDataType))), + + 
[s_tile_os_o]"s"(tile_stride_o_bytes), + [s_tile_os_b]"s"(tile_stride_b_bytes), + [scale_0]"v"(s0), + [scale_1]"v"(s1), + [v_nan_lo]"v"(nan_lo), + [v_nan_hi]"v"(nan_hi), + [s_execflag_0]"s"(o_flags[number<0>{}]), + [s_execflag_1]"s"(o_flags[number<1>{}]), + [s_execflag_2]"s"(o_flags[number<2>{}]), + [s_execflag_3]"s"(o_flags[number<3>{}]), + [s_execflag_4]"s"(o_flags[number<4>{}]), + [s_execflag_5]"s"(o_flags[number<5>{}]), + [s_execflag_6]"s"(o_flags[number<6>{}]), + [s_execflag_7]"s"(o_flags[number<7>{}]) + : + "memory", "a0", "a1", "a2", "a3", "a4", "a5", "a6", "a7", "a8", "a9", + "a10", "a11", "a12", "a13", "a14", "a15", "a16", "a17", "a18", "a19", + "a20", "a21", "a22", "a23", "a24", "a25", "a26", "a27", "a28", "a29", + "a30", "a31", "a32", "a33", "a34", "a35", "a36", "a37", "a38", "a39", + "a40", "a41", "a42", "a43", "a44", "a45", "a46", "a47", "a48", "a49", + "a50", "a51", "a52", "a53", "a54", "a55", "a56", "a57", "a58", "a59", + "a60", "a61", "a62", "a63", "a64", "a65", "a66", "a67", "a68", "a69", + "a70", "a71", "a72", "a73", "a74", "a75", "a76", "a77", "a78", "a79", + "a80", "a81", "a82", "a83", "a84", "a85", "a86", "a87", "a88", "a89", + "a90", "a91", "a92", "a93", "a94", "a95", "a96", "a97", "a98", "a99", + "a100", "a101", "a102", "a103", "a104", "a105", "a106", "a107", + "a108", "a109", "a110", "a111", "a112", "a113", "a114", "a115", + "a116", "a117", "a118", "a119", "a120", "a121", "a122", "a123", + "a124", "a125", "a126", "a127", "a128", "a129", "a130", "a131", + "a132", "a133", "a134", "a135", "a136", "a137", "a138", "a139", + "a140", "a141", "a142", "a143", "a144", "a145", "a146", "a147", + "a148", "a149", "a150", "a151", "a152", "a153", "a154", "a155", + "a156", "a157", "a158", "a159", "a160", "a161", "a162", "a163", + "a164", "a165", "a166", "a167", "a168", "a169", "a170", "a171", + "a172", "a173", "a174", "a175", "a176", "a177", "a178", "a179", + "a180", "a181", "a182", "a183", "a184", "a185", "a186", "a187", + "a188", "a189", "a190", "a191", 
"a192", "a193", "a194", "a195", + "a196", "a197", "a198", "a199", "a200", "a201", "a202", "a203", + "a204", "a205", "a206", "a207", "a208", "a209", "a210", "a211", + "a212", "a213", "a214", "a215", "a216", "a217", "a218", "a219", + "a220", "a221", "a222", "a223", "a224", "a225", "a226", "a227", + "a228", "a229", "a230", "a231", "a232", "a233", "a234", "a235", + "a236", "a237", "a238", "a239", "a240", "a241", "a242", "a243", + "a244", "a245", "a246", "a247", "a248", "a249", "a250", "a251", + "a252", "a253", "a254", "a255", + "s8", "s9", "s12", "s13", "s14", "s15", "s38", "s39", "s52", "s86", + "s36", "s37","s59","s80", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", + "v50", "v54", "v55", + "v64","v65","v66","v67","v68","v69","v70","v71", + "v72","v73","v74","v75","v76","v77","v78","v79", + "v80","v81","v82","v83","v84","v85","v86","v87", + "v88","v89","v90","v91","v92","v93","v94","v95", + "v128", "v129", "v130", "v131", + "v132", "v133", "v134", "v135", "v136", "v137", "v138", "v139", + "v140", "v141", "v142", "v143", "v144", "v145", "v146", "v147", + "v148", "v149", "v150", "v151", "v152", "v153", "v154", "v155", + "v156", "v157", "v158", "v159", "v160", "v161", "v162", "v163", + "v164", "v165", "v166", "v167", "v168", "v169", "v170", "v171", + "v172", "v173", "v174", "v175", "v176", "v177", "v178", "v179", + "v180", "v181", "v182", "v183", "v184", "v185", "v186", "v187", + "v188", "v189", "v190", "v191", "v192", "v193", "v194", "v195", + "v196", "v197", "v198", "v199", "v200", "v201", "v202", "v203", + "v204", "v205", "v206", "v207", "v208", "v209", "v210", "v211", + "v212", "v213", "v214", "v215", "v216", "v217", "v218", "v219", + "v220", "v221", "v222", "v223", "v224", "v225", "v226", "v227", + "v228", "v229", "v230", "v231", "v232", "v233", "v234", "v235", + "v236", "v237", "v238", "v239", "v240", "v241", "v242", "v243", + "v244", "v245", "v246", "v247", "v248", "v249", "v250", "v251", + "v252", "v253", "v254", "v255" + ); +#pragma clang diagnostic 
pop + // clang-format on + } +}; + +} // namespace ck_tile diff --git a/include/ck_tile/ops/flatmm/block/uk/flatmm_sn_uk_gfx9_32x128x512_1x4x1_16x16x16_itl.inc b/include/ck_tile/ops/flatmm/block/uk/flatmm_sn_uk_gfx9_32x128x512_1x4x1_16x16x16_itl.inc new file mode 100644 index 000000000..b8c6d2002 --- /dev/null +++ b/include/ck_tile/ops/flatmm/block/uk/flatmm_sn_uk_gfx9_32x128x512_1x4x1_16x16x16_itl.inc @@ -0,0 +1,708 @@ +#ifndef CK_TILE_FLATMM_UK_MFMA +#define CK_TILE_FLATMM_UK_MFMA CK_TILE_FLATMM_UK_MFMA_BF16 +#endif + +#if CK_TILE_FLATMM_UK_MFMA == CK_TILE_FLATMM_UK_MFMA_BF16 +# define _UK_MFMA_ "v_mfma_f32_16x16x16_bf16" + +# define _UK_PK_CVT_(x0_, x1_, y_) \ + " v_cmp_u_f32 s[36:37], " x0_ ", " x0_ " \n" \ + " v_add3_u32 v50, " x0_ ", %[v_nan_lo], 1 \n" \ + " v_cndmask_b32 v54, v50, %[v_nan_hi], s[36:37] \n" \ + " v_cmp_u_f32 s[36:37], " x1_ ", " x1_ " \n" \ + " v_add3_u32 v50, " x1_ ", %[v_nan_lo], 1 \n" \ + " v_cndmask_b32 v55, v50, %[v_nan_hi], s[36:37] \n" \ + " v_perm_b32 " y_ ", v55, v54, s52 \n" + +# define _UK_ATOMIC_ADD_ "global_atomic_pk_add_bf16" + +#elif CK_TILE_FLATMM_UK_MFMA == CK_TILE_FLATMM_UK_MFMA_FP16 +#define _UK_MFMA_ "v_mfma_f32_16x16x16_f16" + +# define _UK_PK_CVT_(x0_, x1_, y_) \ + " v_cvt_f16_f32 v54, " x0_ " \n" \ + " v_cvt_f16_f32 v55, " x1_ " \n" \ + " v_pack_b32_f16 " y_ ", v54, v55 \n" + +# define _UK_ATOMIC_ADD_ "global_atomic_pk_add_f16" + +#endif + + +";-------------------------------------------------------------\n" +" s_mov_b32 s52, 0x07060302 ; v_perm\n" +" s_mov_b64 s[38:39], exec ; save current exec\n" +" s_mov_b32 s8, %[s_res_o0] \n" +" s_mov_b32 s9, %[s_res_o1] \n" +" s_mov_b32 s12, %[s_res_b0] \n" +" s_mov_b32 s13, %[s_res_b1] \n" +" s_mov_b32 s14, %[s_res_b2] \n" +" s_mov_b32 s15, %[s_res_b3] \n" +" s_mov_b32 s59, 0 \n" +" ds_read_b64 v[128:129], %[v_sld_y_os] offset:0 + %[sld_a_base] \n" +" ds_read_b64 v[130:131], %[v_sld_y_os] offset:128 + %[sld_a_base] \n" +" ds_read_b64 v[132:133], %[v_sld_y_os] offset:1024 + 
%[sld_a_base] \n" +" ds_read_b64 v[134:135], %[v_sld_y_os] offset:1152 + %[sld_a_base] \n" +" ds_read_b64 v[136:137], %[v_sld_y_os] offset:2048 + %[sld_a_base] \n" +" ds_read_b64 v[138:139], %[v_sld_y_os] offset:2176 + %[sld_a_base] \n" +" ds_read_b64 v[140:141], %[v_sld_y_os] offset:3072 + %[sld_a_base] \n" +" ds_read_b64 v[142:143], %[v_sld_y_os] offset:3200 + %[sld_a_base] \n" +" ds_read_b64 v[144:145], %[v_sld_y_os] offset:4096 + %[sld_a_base] \n" +" ds_read_b64 v[146:147], %[v_sld_y_os] offset:4224 + %[sld_a_base] \n" +" ds_read_b64 v[148:149], %[v_sld_y_os] offset:5120 + %[sld_a_base] \n" +" ds_read_b64 v[150:151], %[v_sld_y_os] offset:5248 + %[sld_a_base] \n" +" ds_read_b64 v[152:153], %[v_sld_y_os] offset:6144 + %[sld_a_base] \n" +" ds_read_b64 v[154:155], %[v_sld_y_os] offset:6272 + %[sld_a_base] \n" +" ds_read_b64 v[156:157], %[v_sld_y_os] offset:7168 + %[sld_a_base] \n" +" ds_read_b64 v[158:159], %[v_sld_y_os] offset:7296 + %[sld_a_base] \n" +" ds_read_b64 v[160:161], %[v_sld_y_os] offset:8192 + %[sld_a_base] \n" +" ds_read_b64 v[162:163], %[v_sld_y_os] offset:8320 + %[sld_a_base] \n" +" ds_read_b64 v[164:165], %[v_sld_y_os] offset:9216 + %[sld_a_base] \n" +" ds_read_b64 v[166:167], %[v_sld_y_os] offset:9344 + %[sld_a_base] \n" +" ds_read_b64 v[168:169], %[v_sld_y_os] offset:10240 + %[sld_a_base] \n" +" ds_read_b64 v[170:171], %[v_sld_y_os] offset:10368 + %[sld_a_base] \n" +" ds_read_b64 v[172:173], %[v_sld_y_os] offset:11264 + %[sld_a_base] \n" +" ds_read_b64 v[174:175], %[v_sld_y_os] offset:11392 + %[sld_a_base] \n" +" ds_read_b64 v[176:177], %[v_sld_y_os] offset:12288 + %[sld_a_base] \n" +" ds_read_b64 v[178:179], %[v_sld_y_os] offset:12416 + %[sld_a_base] \n" +" ds_read_b64 v[180:181], %[v_sld_y_os] offset:13312 + %[sld_a_base] \n" +" ds_read_b64 v[182:183], %[v_sld_y_os] offset:13440 + %[sld_a_base] \n" +" ds_read_b64 v[184:185], %[v_sld_y_os] offset:14336 + %[sld_a_base] \n" +" ds_read_b64 v[186:187], %[v_sld_y_os] offset:14464 + %[sld_a_base] \n" 
+" ds_read_b64 v[188:189], %[v_sld_y_os] offset:15360 + %[sld_a_base] \n" +" ds_read_b64 v[190:191], %[v_sld_y_os] offset:15488 + %[sld_a_base] \n" +" ds_read_b64 v[192:193], %[v_sld_y_os] offset:16384 + %[sld_a_base] \n" +" ds_read_b64 v[194:195], %[v_sld_y_os] offset:16512 + %[sld_a_base] \n" +" ds_read_b64 v[196:197], %[v_sld_y_os] offset:17408 + %[sld_a_base] \n" +" ds_read_b64 v[198:199], %[v_sld_y_os] offset:17536 + %[sld_a_base] \n" +" ds_read_b64 v[200:201], %[v_sld_y_os] offset:18432 + %[sld_a_base] \n" +" ds_read_b64 v[202:203], %[v_sld_y_os] offset:18560 + %[sld_a_base] \n" +" ds_read_b64 v[204:205], %[v_sld_y_os] offset:19456 + %[sld_a_base] \n" +" ds_read_b64 v[206:207], %[v_sld_y_os] offset:19584 + %[sld_a_base] \n" +" ds_read_b64 v[208:209], %[v_sld_y_os] offset:20480 + %[sld_a_base] \n" +" ds_read_b64 v[210:211], %[v_sld_y_os] offset:20608 + %[sld_a_base] \n" +" ds_read_b64 v[212:213], %[v_sld_y_os] offset:21504 + %[sld_a_base] \n" +" ds_read_b64 v[214:215], %[v_sld_y_os] offset:21632 + %[sld_a_base] \n" +" ds_read_b64 v[216:217], %[v_sld_y_os] offset:22528 + %[sld_a_base] \n" +" ds_read_b64 v[218:219], %[v_sld_y_os] offset:22656 + %[sld_a_base] \n" +" ds_read_b64 v[220:221], %[v_sld_y_os] offset:23552 + %[sld_a_base] \n" +" ds_read_b64 v[222:223], %[v_sld_y_os] offset:23680 + %[sld_a_base] \n" +" ds_read_b64 v[224:225], %[v_sld_y_os] offset:24576 + %[sld_a_base] \n" +" ds_read_b64 v[226:227], %[v_sld_y_os] offset:24704 + %[sld_a_base] \n" +" ds_read_b64 v[228:229], %[v_sld_y_os] offset:25600 + %[sld_a_base] \n" +" ds_read_b64 v[230:231], %[v_sld_y_os] offset:25728 + %[sld_a_base] \n" +" ds_read_b64 v[232:233], %[v_sld_y_os] offset:26624 + %[sld_a_base] \n" +" ds_read_b64 v[234:235], %[v_sld_y_os] offset:26752 + %[sld_a_base] \n" +" ds_read_b64 v[236:237], %[v_sld_y_os] offset:27648 + %[sld_a_base] \n" +" ds_read_b64 v[238:239], %[v_sld_y_os] offset:27776 + %[sld_a_base] \n" +" ds_read_b64 v[240:241], %[v_sld_y_os] offset:28672 + %[sld_a_base] \n" 
+" ds_read_b64 v[242:243], %[v_sld_y_os] offset:28800 + %[sld_a_base] \n" +" ds_read_b64 v[244:245], %[v_sld_y_os] offset:29696 + %[sld_a_base] \n" +" ds_read_b64 v[246:247], %[v_sld_y_os] offset:29824 + %[sld_a_base] \n" +" ds_read_b64 v[248:249], %[v_sld_y_os] offset:30720 + %[sld_a_base] \n" +" ds_read_b64 v[250:251], %[v_sld_y_os] offset:30848 + %[sld_a_base] \n" +" ds_read_b64 v[252:253], %[v_sld_y_os] offset:31744 + %[sld_a_base] \n" +" ds_read_b64 v[254:255], %[v_sld_y_os] offset:31872 + %[sld_a_base] \n" +" s_waitcnt 0 \n" +" buffer_load_dwordx4 acc[0:3], %[v_os_b0], s[12:15], 0 offen \n" +" buffer_load_dwordx4 acc[4:7], %[v_os_b0], s[12:15], 0 offen offset:1024 \n" +" buffer_load_dwordx4 acc[8:11], %[v_os_b0], s[12:15], 0 offen offset:2048 \n" +" buffer_load_dwordx4 acc[12:15], %[v_os_b0], s[12:15], 0 offen offset:3072 \n" +" buffer_load_dwordx4 acc[16:19], %[v_os_b1], s[12:15], 0 offen \n" +" buffer_load_dwordx4 acc[20:23], %[v_os_b1], s[12:15], 0 offen offset:1024 \n" +" buffer_load_dwordx4 acc[24:27], %[v_os_b1], s[12:15], 0 offen offset:2048 \n" +" buffer_load_dwordx4 acc[28:31], %[v_os_b1], s[12:15], 0 offen offset:3072 \n" +" buffer_load_dwordx4 acc[32:35], %[v_os_b2], s[12:15], 0 offen \n" +" buffer_load_dwordx4 acc[36:39], %[v_os_b2], s[12:15], 0 offen offset:1024 \n" +" buffer_load_dwordx4 acc[40:43], %[v_os_b2], s[12:15], 0 offen offset:2048 \n" +" buffer_load_dwordx4 acc[44:47], %[v_os_b2], s[12:15], 0 offen offset:3072 \n" +" buffer_load_dwordx4 acc[48:51], %[v_os_b3], s[12:15], 0 offen \n" +" buffer_load_dwordx4 acc[52:55], %[v_os_b3], s[12:15], 0 offen offset:1024 \n" +" buffer_load_dwordx4 acc[56:59], %[v_os_b3], s[12:15], 0 offen offset:2048 \n" +" buffer_load_dwordx4 acc[60:63], %[v_os_b3], s[12:15], 0 offen offset:3072 \n" +" buffer_load_dwordx4 acc[64:67], %[v_os_b4], s[12:15], 0 offen \n" +" buffer_load_dwordx4 acc[68:71], %[v_os_b4], s[12:15], 0 offen offset:1024 \n" +" buffer_load_dwordx4 acc[72:75], %[v_os_b4], s[12:15], 0 offen 
offset:2048 \n" +" buffer_load_dwordx4 acc[76:79], %[v_os_b4], s[12:15], 0 offen offset:3072 \n" +" buffer_load_dwordx4 acc[80:83], %[v_os_b5], s[12:15], 0 offen \n" +" buffer_load_dwordx4 acc[84:87], %[v_os_b5], s[12:15], 0 offen offset:1024 \n" +" buffer_load_dwordx4 acc[88:91], %[v_os_b5], s[12:15], 0 offen offset:2048 \n" +" buffer_load_dwordx4 acc[92:95], %[v_os_b5], s[12:15], 0 offen offset:3072 \n" +" buffer_load_dwordx4 acc[96:99], %[v_os_b6], s[12:15], 0 offen \n" +" buffer_load_dwordx4 acc[100:103], %[v_os_b6], s[12:15], 0 offen offset:1024 \n" +" buffer_load_dwordx4 acc[104:107], %[v_os_b6], s[12:15], 0 offen offset:2048 \n" +" buffer_load_dwordx4 acc[108:111], %[v_os_b6], s[12:15], 0 offen offset:3072 \n" +" buffer_load_dwordx4 acc[112:115], %[v_os_b7], s[12:15], 0 offen \n" +" buffer_load_dwordx4 acc[116:119], %[v_os_b7], s[12:15], 0 offen offset:1024 \n" +" buffer_load_dwordx4 acc[120:123], %[v_os_b7], s[12:15], 0 offen offset:2048 \n" +" buffer_load_dwordx4 acc[124:127], %[v_os_b7], s[12:15], 0 offen offset:3072 \n" +" s_add_u32 s12, %[s_tile_os_b], s12 \n" +" s_addc_u32 s13, 0, s13 \n" +" v_mov_b32 v64, 0 \n" +" v_mov_b32 v80, 0 \n" +" v_mov_b32 v65, 0 \n" +" v_mov_b32 v81, 0 \n" +" v_mov_b32 v66, 0 \n" +" v_mov_b32 v82, 0 \n" +" v_mov_b32 v67, 0 \n" +" v_mov_b32 v83, 0 \n" +" v_mov_b32 v68, 0 \n" +" v_mov_b32 v84, 0 \n" +" v_mov_b32 v69, 0 \n" +" v_mov_b32 v85, 0 \n" +" v_mov_b32 v70, 0 \n" +" v_mov_b32 v86, 0 \n" +" v_mov_b32 v71, 0 \n" +" v_mov_b32 v87, 0 \n" +" ds_write_b64 %[v_sfl_sst], [%[c0],%[c1]] offset:16640 \n" +" ds_write_b64 %[v_sfl_sst], [%[c2],%[c3]] offset:20992 \n" +" ds_write_b64 %[v_sfl_sst], [%[c4],%[c5]] offset:18816 \n" +" ds_write_b64 %[v_sfl_sst], [%[c6],%[c7]] offset:23168 \n" +" s_mov_b32 s80, 0 \n" +" s_waitcnt vmcnt(24) \n" +"label_0AA6: \n" +" s_waitcnt vmcnt(30) & lgkmcnt(0) \n" +" s_barrier \n" + _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[0:1], v[128:129], 0 \n" +" ds_read_b32 v10, %[v_sfl_sld] offset:16640 \n" +" 
ds_read_b32 v11, %[v_sfl_sld] offset:16672 \n" +" ds_write_b64 %[v_sfl_sst], [%[c16],%[c17]] offset:25344 \n" + _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[2:3], v[130:131], v[64:67] \n" + " buffer_load_dwordx4 acc[128:131], %[v_os_b0], s[12:15], 0 offen \n" +" ds_write_b64 %[v_sfl_sst], [%[c18],%[c19]] offset:29696 \n" + _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[4:5], v[132:133], v[64:67] \n" +" ds_read_b32 v12, %[v_sfl_sld] offset:16704 \n" +" ds_read_b32 v13, %[v_sfl_sld] offset:16736 \n" +" ds_write_b64 %[v_sfl_sst], [%[c20],%[c21]] offset:27520 \n" + _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[6:7], v[134:135], v[64:67] \n" +" ds_write_b64 %[v_sfl_sst], [%[c22],%[c23]] offset:31872 \n" + _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[8:9], v[136:137], v[64:67] \n" +" ds_read_b32 v14, %[v_sfl_sld] offset:20992 \n" +" ds_read_b32 v15, %[v_sfl_sld] offset:21024 \n" + _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[10:11], v[138:139], v[64:67] \n" + " buffer_load_dwordx4 acc[132:135], %[v_os_b0], s[12:15], 0 offen offset:1024 \n" + _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[12:13], v[140:141], v[64:67] \n" +" ds_read_b32 v16, %[v_sfl_sld] offset:21056 \n" +" ds_read_b32 v17, %[v_sfl_sld] offset:21088 \n" + _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[14:15], v[142:143], v[64:67] \n" + _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[0:1], v[192:193], 0 \n" + _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[2:3], v[194:195], v[68:71] \n" + " buffer_load_dwordx4 acc[136:139], %[v_os_b0], s[12:15], 0 offen offset:2048 \n" + _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[4:5], v[196:197], v[68:71] \n" + _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[6:7], v[198:199], v[68:71] \n" + _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[8:9], v[200:201], v[68:71] \n" + _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[10:11], v[202:203], v[68:71] \n" + " buffer_load_dwordx4 acc[140:143], %[v_os_b0], s[12:15], 0 offen offset:3072 \n" + _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], 
acc[12:13], v[204:205], v[68:71] \n" + _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[14:15], v[206:207], v[68:71] \n" + " s_waitcnt lgkmcnt(0) \n" + " s_mov_b64 exec, %[s_execflag_0] \n" +_UK_ATOMIC_ADD_ " %[v_os_o0], v10, s[8:9] \n" +" s_mov_b64 exec, s[38:39] \n" + _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[16:17], v[128:129], 0 \n" + _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[18:19], v[130:131], v[72:75] \n" + " buffer_load_dwordx4 acc[144:147], %[v_os_b1], s[12:15], 0 offen \n" + _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[20:21], v[132:133], v[72:75] \n" + _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[22:23], v[134:135], v[72:75] \n" + _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[24:25], v[136:137], v[72:75] \n" + _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[26:27], v[138:139], v[72:75] \n" + " buffer_load_dwordx4 acc[148:151], %[v_os_b1], s[12:15], 0 offen offset:1024 \n" + _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[28:29], v[140:141], v[72:75] \n" + _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[30:31], v[142:143], v[72:75] \n" + _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[16:17], v[192:193], 0 \n" + _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[18:19], v[194:195], v[76:79] \n" + " buffer_load_dwordx4 acc[152:155], %[v_os_b1], s[12:15], 0 offen offset:2048 \n" + _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[20:21], v[196:197], v[76:79] \n" + _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[22:23], v[198:199], v[76:79] \n" + _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[24:25], v[200:201], v[76:79] \n" + _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[26:27], v[202:203], v[76:79] \n" + " buffer_load_dwordx4 acc[156:159], %[v_os_b1], s[12:15], 0 offen offset:3072 \n" + _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[28:29], v[204:205], v[76:79] \n" + _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[30:31], v[206:207], v[76:79] \n" + " s_mov_b64 exec, %[s_execflag_1] \n" +_UK_ATOMIC_ADD_ " %[v_os_o1], v11, 
s[8:9] \n" +" s_mov_b64 exec, s[38:39] \n" +" s_waitcnt vmcnt(30) \n" + _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[32:33], v[144:145], v[64:67] \n" + _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[34:35], v[146:147], v[64:67] \n" + " buffer_load_dwordx4 acc[160:163], %[v_os_b2], s[12:15], 0 offen \n" + _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[36:37], v[148:149], v[64:67] \n" + _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[38:39], v[150:151], v[64:67] \n" + _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[40:41], v[152:153], v[64:67] \n" + _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[42:43], v[154:155], v[64:67] \n" + " buffer_load_dwordx4 acc[164:167], %[v_os_b2], s[12:15], 0 offen offset:1024 \n" + _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[44:45], v[156:157], v[64:67] \n" + _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[46:47], v[158:159], v[64:67] \n" + _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[32:33], v[208:209], v[68:71] \n" + _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[34:35], v[210:211], v[68:71] \n" + " buffer_load_dwordx4 acc[168:171], %[v_os_b2], s[12:15], 0 offen offset:2048 \n" + _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[36:37], v[212:213], v[68:71] \n" + _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[38:39], v[214:215], v[68:71] \n" + _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[40:41], v[216:217], v[68:71] \n" + _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[42:43], v[218:219], v[68:71] \n" + " buffer_load_dwordx4 acc[172:175], %[v_os_b2], s[12:15], 0 offen offset:3072 \n" + _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[44:45], v[220:221], v[68:71] \n" + _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[46:47], v[222:223], v[68:71] \n" + " s_mov_b64 exec, %[s_execflag_2] \n" +_UK_ATOMIC_ADD_ " %[v_os_o2], v12, s[8:9] \n" +" s_mov_b64 exec, s[38:39] \n" + _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[48:49], v[144:145], v[72:75] \n" + _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[50:51], v[146:147], v[72:75] \n" + " buffer_load_dwordx4 
acc[176:179], %[v_os_b3], s[12:15], 0 offen \n" + _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[52:53], v[148:149], v[72:75] \n" + _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[54:55], v[150:151], v[72:75] \n" + _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[56:57], v[152:153], v[72:75] \n" + _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[58:59], v[154:155], v[72:75] \n" + " buffer_load_dwordx4 acc[180:183], %[v_os_b3], s[12:15], 0 offen offset:1024 \n" + _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[60:61], v[156:157], v[72:75] \n" + _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[62:63], v[158:159], v[72:75] \n" + _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[48:49], v[208:209], v[76:79] \n" + _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[50:51], v[210:211], v[76:79] \n" + " buffer_load_dwordx4 acc[184:187], %[v_os_b3], s[12:15], 0 offen offset:2048 \n" + _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[52:53], v[212:213], v[76:79] \n" + _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[54:55], v[214:215], v[76:79] \n" + _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[56:57], v[216:217], v[76:79] \n" + _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[58:59], v[218:219], v[76:79] \n" + " buffer_load_dwordx4 acc[188:191], %[v_os_b3], s[12:15], 0 offen offset:3072 \n" + _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[60:61], v[220:221], v[76:79] \n" + _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[62:63], v[222:223], v[76:79] \n" + " s_mov_b64 exec, %[s_execflag_3] \n" +_UK_ATOMIC_ADD_ " %[v_os_o3], v13, s[8:9] \n" +" s_mov_b64 exec, s[38:39] \n" +" s_waitcnt vmcnt(30) \n" + _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[64:65], v[160:161], v[64:67] \n" + _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[66:67], v[162:163], v[64:67] \n" + " buffer_load_dwordx4 acc[192:195], %[v_os_b4], s[12:15], 0 offen \n" + _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[68:69], v[164:165], v[64:67] \n" + _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[70:71], 
v[166:167], v[64:67] \n" + _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[72:73], v[168:169], v[64:67] \n" + _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[74:75], v[170:171], v[64:67] \n" + " buffer_load_dwordx4 acc[196:199], %[v_os_b4], s[12:15], 0 offen offset:1024 \n" + _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[76:77], v[172:173], v[64:67] \n" + _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[78:79], v[174:175], v[64:67] \n" + _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[64:65], v[224:225], v[68:71] \n" + _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[66:67], v[226:227], v[68:71] \n" + " buffer_load_dwordx4 acc[200:203], %[v_os_b4], s[12:15], 0 offen offset:2048 \n" + _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[68:69], v[228:229], v[68:71] \n" + _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[70:71], v[230:231], v[68:71] \n" + _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[72:73], v[232:233], v[68:71] \n" + _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[74:75], v[234:235], v[68:71] \n" + " buffer_load_dwordx4 acc[204:207], %[v_os_b4], s[12:15], 0 offen offset:3072 \n" + _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[76:77], v[236:237], v[68:71] \n" + _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[78:79], v[238:239], v[68:71] \n" + " s_mov_b64 exec, %[s_execflag_4] \n" +_UK_ATOMIC_ADD_ " %[v_os_o4], v14, s[8:9] \n" +" s_mov_b64 exec, s[38:39] \n" + _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[80:81], v[160:161], v[72:75] \n" + _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[82:83], v[162:163], v[72:75] \n" + " buffer_load_dwordx4 acc[208:211], %[v_os_b5], s[12:15], 0 offen \n" + _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[84:85], v[164:165], v[72:75] \n" + _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[86:87], v[166:167], v[72:75] \n" + _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[88:89], v[168:169], v[72:75] \n" + _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[90:91], v[170:171], v[72:75] \n" + " buffer_load_dwordx4 acc[212:215], %[v_os_b5], s[12:15], 0 
offen offset:1024 \n" + _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[92:93], v[172:173], v[72:75] \n" + _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[94:95], v[174:175], v[72:75] \n" + _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[80:81], v[224:225], v[76:79] \n" + _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[82:83], v[226:227], v[76:79] \n" + " buffer_load_dwordx4 acc[216:219], %[v_os_b5], s[12:15], 0 offen offset:2048 \n" + _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[84:85], v[228:229], v[76:79] \n" + _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[86:87], v[230:231], v[76:79] \n" + _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[88:89], v[232:233], v[76:79] \n" + _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[90:91], v[234:235], v[76:79] \n" + " buffer_load_dwordx4 acc[220:223], %[v_os_b5], s[12:15], 0 offen offset:3072 \n" + _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[92:93], v[236:237], v[76:79] \n" + _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[94:95], v[238:239], v[76:79] \n" + " s_mov_b64 exec, %[s_execflag_5] \n" +_UK_ATOMIC_ADD_ " %[v_os_o5], v15, s[8:9] \n" +" s_mov_b64 exec, s[38:39] \n" +" s_waitcnt vmcnt(30) \n" + _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[96:97], v[176:177], v[64:67] \n" + _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[98:99], v[178:179], v[64:67] \n" + " buffer_load_dwordx4 acc[224:227], %[v_os_b6], s[12:15], 0 offen \n" + _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[100:101], v[180:181], v[64:67] \n" + _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[102:103], v[182:183], v[64:67] \n" + _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[104:105], v[184:185], v[64:67] \n" + _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[106:107], v[186:187], v[64:67] \n" + " buffer_load_dwordx4 acc[228:231], %[v_os_b6], s[12:15], 0 offen offset:1024 \n" + _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[108:109], v[188:189], v[64:67] \n" + _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[110:111], v[190:191], v[64:67] \n" + 
_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[96:97], v[240:241], v[68:71] \n" + _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[98:99], v[242:243], v[68:71] \n" + " buffer_load_dwordx4 acc[232:235], %[v_os_b6], s[12:15], 0 offen offset:2048 \n" + _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[100:101], v[244:245], v[68:71] \n" + _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[102:103], v[246:247], v[68:71] \n" + _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[104:105], v[248:249], v[68:71] \n" + _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[106:107], v[250:251], v[68:71] \n" + " buffer_load_dwordx4 acc[236:239], %[v_os_b6], s[12:15], 0 offen offset:3072 \n" + _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[108:109], v[252:253], v[68:71] \n" + _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[110:111], v[254:255], v[68:71] \n" + " s_mov_b64 exec, %[s_execflag_6] \n" +_UK_ATOMIC_ADD_ " %[v_os_o6], v16, s[8:9] \n" +" s_mov_b64 exec, s[38:39] \n" + _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[112:113], v[176:177], v[72:75] \n" + _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[114:115], v[178:179], v[72:75] \n" + " buffer_load_dwordx4 acc[240:243], %[v_os_b7], s[12:15], 0 offen \n" + _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[116:117], v[180:181], v[72:75] \n" + _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[118:119], v[182:183], v[72:75] \n" + _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[120:121], v[184:185], v[72:75] \n" + _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[122:123], v[186:187], v[72:75] \n" + " buffer_load_dwordx4 acc[244:247], %[v_os_b7], s[12:15], 0 offen offset:1024 \n" + _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[124:125], v[188:189], v[72:75] \n" + _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[126:127], v[190:191], v[72:75] \n" + _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[112:113], v[240:241], v[76:79] \n" + _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[114:115], v[242:243], v[76:79] \n" + " buffer_load_dwordx4 acc[248:251], 
%[v_os_b7], s[12:15], 0 offen offset:2048 \n" + _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[116:117], v[244:245], v[76:79] \n" + _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[118:119], v[246:247], v[76:79] \n" + _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[120:121], v[248:249], v[76:79] \n" + _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[122:123], v[250:251], v[76:79] \n" + " buffer_load_dwordx4 acc[252:255], %[v_os_b7], s[12:15], 0 offen offset:3072 \n" + _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[124:125], v[252:253], v[76:79] \n" + _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[126:127], v[254:255], v[76:79] \n" + " s_mov_b64 exec, %[s_execflag_7] \n" +_UK_ATOMIC_ADD_ " %[v_os_o7], v17, s[8:9] \n" +" s_mov_b64 exec, s[38:39] \n" +" s_add_u32 s60, 0x00000100, s80 \n" +" s_cmp_lt_u32 s60, %[s_loop_cnt] \n" +" s_cselect_b32 s56, %[s_tile_os_b], 0 \n" +" s_add_u32 s12, s56, s12 \n" +" s_addc_u32 s13, 0, s13 \n" +" s_cmp_ge_u32 s80, 0x00000100 \n" +" s_cselect_b32 s59, %[s_tile_os_o], s59 \n" +" s_add_u32 s8, s59, s8 \n" +" s_addc_u32 s9, 0, s9 \n" +" v_mul_f32 %[c0], %[scale_0], %[c0] \n" +" v_mul_f32 %[c1], %[scale_0], %[c1] \n" +" v_mul_f32 %[c2], %[scale_0], %[c2] \n" +" v_mul_f32 %[c3], %[scale_0], %[c3] \n" +" v_mul_f32 %[c4], %[scale_1], %[c4] \n" +" v_mul_f32 %[c5], %[scale_1], %[c5] \n" +" v_mul_f32 %[c6], %[scale_1], %[c6] \n" +" v_mul_f32 %[c7], %[scale_1], %[c7] \n" +" v_mul_f32 %[c8], %[scale_0], %[c8] \n" +" v_mul_f32 %[c9], %[scale_0], %[c9] \n" +" v_mul_f32 %[c10], %[scale_0], %[c10] \n" +" v_mul_f32 %[c11], %[scale_0], %[c11] \n" +" v_mul_f32 %[c12], %[scale_1], %[c12] \n" +" v_mul_f32 %[c13], %[scale_1], %[c13] \n" +" v_mul_f32 %[c14], %[scale_1], %[c14] \n" +" v_mul_f32 %[c15], %[scale_1], %[c15] \n" +_UK_PK_CVT_("%[c0]", "%[c1]", "%[c0]") +_UK_PK_CVT_("%[c2]", "%[c3]", "%[c1]") +_UK_PK_CVT_("%[c4]", "%[c5]", "%[c2]") +_UK_PK_CVT_("%[c6]", "%[c7]", "%[c3]") +_UK_PK_CVT_("%[c8]", "%[c9]", "%[c4]") +_UK_PK_CVT_("%[c10]", 
"%[c11]", "%[c5]") +_UK_PK_CVT_("%[c12]", "%[c13]", "%[c6]") +_UK_PK_CVT_("%[c14]", "%[c15]", "%[c7]") +" s_addk_i32 s80, 0x0080 \n" +" s_cmp_lt_i32 s80, %[s_loop_cnt] \n" +" s_cbranch_scc0 label_0EC1 \n" +" s_waitcnt vmcnt(30) & lgkmcnt(0) \n" +" s_barrier \n" + _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[128:129], v[128:129], 0 \n" +" ds_read_b32 v10, %[v_sfl_sld] offset:25344 \n" +" ds_read_b32 v11, %[v_sfl_sld] offset:25376 \n" +" ds_write_b64 v3, v[64:65] offset:16640 \n" + _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[130:131], v[130:131], v[80:83] \n" + " buffer_load_dwordx4 acc[0:3], %[v_os_b0], s[12:15], 0 offen \n" +" ds_write_b64 v3, v[66:67] offset:20992 \n" + _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[132:133], v[132:133], v[80:83] \n" +" ds_read_b32 v12, %[v_sfl_sld] offset:25408 \n" +" ds_read_b32 v13, %[v_sfl_sld] offset:25440 \n" +" ds_write_b64 v3, v[68:69] offset:18816 \n" + _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[134:135], v[134:135], v[80:83] \n" +" ds_write_b64 v3, v[70:71] offset:23168 \n" + _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[136:137], v[136:137], v[80:83] \n" +" ds_read_b32 v14, %[v_sfl_sld] offset:29696 \n" +" ds_read_b32 v15, %[v_sfl_sld] offset:29728 \n" + _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[138:139], v[138:139], v[80:83] \n" + " buffer_load_dwordx4 acc[4:7], %[v_os_b0], s[12:15], 0 offen offset:1024 \n" + _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[140:141], v[140:141], v[80:83] \n" +" ds_read_b32 v16, %[v_sfl_sld] offset:29760 \n" +" ds_read_b32 v17, %[v_sfl_sld] offset:29792 \n" + _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[142:143], v[142:143], v[80:83] \n" + _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[128:129], v[192:193], 0 \n" + _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[130:131], v[194:195], v[84:87] \n" + " buffer_load_dwordx4 acc[8:11], %[v_os_b0], s[12:15], 0 offen offset:2048 \n" + _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[132:133], 
v[196:197], v[84:87] \n" + _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[134:135], v[198:199], v[84:87] \n" + _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[136:137], v[200:201], v[84:87] \n" + _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[138:139], v[202:203], v[84:87] \n" + " buffer_load_dwordx4 acc[12:15], %[v_os_b0], s[12:15], 0 offen offset:3072 \n" + _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[140:141], v[204:205], v[84:87] \n" + _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[142:143], v[206:207], v[84:87] \n" + " s_waitcnt lgkmcnt(0) \n" + " s_mov_b64 exec, %[s_execflag_0] \n" +_UK_ATOMIC_ADD_ " %[v_os_o0], v10, s[8:9] \n" +" s_mov_b64 exec, s[38:39] \n" + _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[144:145], v[128:129], 0 \n" + _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[146:147], v[130:131], v[88:91] \n" + " buffer_load_dwordx4 acc[16:19], %[v_os_b1], s[12:15], 0 offen \n" + _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[148:149], v[132:133], v[88:91] \n" + _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[150:151], v[134:135], v[88:91] \n" + _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[152:153], v[136:137], v[88:91] \n" + _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[154:155], v[138:139], v[88:91] \n" + " buffer_load_dwordx4 acc[20:23], %[v_os_b1], s[12:15], 0 offen offset:1024 \n" + _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[156:157], v[140:141], v[88:91] \n" + _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[158:159], v[142:143], v[88:91] \n" + _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[144:145], v[192:193], 0 \n" + _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[146:147], v[194:195], v[92:95] \n" + " buffer_load_dwordx4 acc[24:27], %[v_os_b1], s[12:15], 0 offen offset:2048 \n" + _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[148:149], v[196:197], v[92:95] \n" + _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[150:151], v[198:199], v[92:95] \n" + _UK_MFMA_ " [%[c28], %[c29], %[c30], 
%[c31]], acc[152:153], v[200:201], v[92:95] \n" + _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[154:155], v[202:203], v[92:95] \n" + " buffer_load_dwordx4 acc[28:31], %[v_os_b1], s[12:15], 0 offen offset:3072 \n" + _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[156:157], v[204:205], v[92:95] \n" + _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[158:159], v[206:207], v[92:95] \n" + " s_mov_b64 exec, %[s_execflag_1] \n" +_UK_ATOMIC_ADD_ " %[v_os_o1], v11, s[8:9] \n" +" s_mov_b64 exec, s[38:39] \n" +" s_waitcnt vmcnt(30) \n" + _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[160:161], v[144:145], v[80:83] \n" + _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[162:163], v[146:147], v[80:83] \n" + " buffer_load_dwordx4 acc[32:35], %[v_os_b2], s[12:15], 0 offen \n" + _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[164:165], v[148:149], v[80:83] \n" + _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[166:167], v[150:151], v[80:83] \n" + _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[168:169], v[152:153], v[80:83] \n" + _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[170:171], v[154:155], v[80:83] \n" + " buffer_load_dwordx4 acc[36:39], %[v_os_b2], s[12:15], 0 offen offset:1024 \n" + _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[172:173], v[156:157], v[80:83] \n" + _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[174:175], v[158:159], v[80:83] \n" + _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[160:161], v[208:209], v[84:87] \n" + _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[162:163], v[210:211], v[84:87] \n" + " buffer_load_dwordx4 acc[40:43], %[v_os_b2], s[12:15], 0 offen offset:2048 \n" + _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[164:165], v[212:213], v[84:87] \n" + _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[166:167], v[214:215], v[84:87] \n" + _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[168:169], v[216:217], v[84:87] \n" + _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[170:171], v[218:219], v[84:87] \n" + " 
buffer_load_dwordx4 acc[44:47], %[v_os_b2], s[12:15], 0 offen offset:3072 \n" + _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[172:173], v[220:221], v[84:87] \n" + _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[174:175], v[222:223], v[84:87] \n" + " s_mov_b64 exec, %[s_execflag_2] \n" +_UK_ATOMIC_ADD_ " %[v_os_o2], v12, s[8:9] \n" +" s_mov_b64 exec, s[38:39] \n" + _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[176:177], v[144:145], v[88:91] \n" + _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[178:179], v[146:147], v[88:91] \n" + " buffer_load_dwordx4 acc[48:51], %[v_os_b3], s[12:15], 0 offen \n" + _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[180:181], v[148:149], v[88:91] \n" + _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[182:183], v[150:151], v[88:91] \n" + _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[184:185], v[152:153], v[88:91] \n" + _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[186:187], v[154:155], v[88:91] \n" + " buffer_load_dwordx4 acc[52:55], %[v_os_b3], s[12:15], 0 offen offset:1024 \n" + _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[188:189], v[156:157], v[88:91] \n" + _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[190:191], v[158:159], v[88:91] \n" + _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[176:177], v[208:209], v[92:95] \n" + _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[178:179], v[210:211], v[92:95] \n" + " buffer_load_dwordx4 acc[56:59], %[v_os_b3], s[12:15], 0 offen offset:2048 \n" + _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[180:181], v[212:213], v[92:95] \n" + _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[182:183], v[214:215], v[92:95] \n" + _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[184:185], v[216:217], v[92:95] \n" + _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[186:187], v[218:219], v[92:95] \n" + " buffer_load_dwordx4 acc[60:63], %[v_os_b3], s[12:15], 0 offen offset:3072 \n" + _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[188:189], v[220:221], v[92:95] \n" + 
_UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[190:191], v[222:223], v[92:95] \n" + " s_mov_b64 exec, %[s_execflag_3] \n" +_UK_ATOMIC_ADD_ " %[v_os_o3], v13, s[8:9] \n" +" s_mov_b64 exec, s[38:39] \n" +" s_waitcnt vmcnt(30) \n" + _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[192:193], v[160:161], v[80:83] \n" + _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[194:195], v[162:163], v[80:83] \n" + " buffer_load_dwordx4 acc[64:67], %[v_os_b4], s[12:15], 0 offen \n" + _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[196:197], v[164:165], v[80:83] \n" + _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[198:199], v[166:167], v[80:83] \n" + _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[200:201], v[168:169], v[80:83] \n" + _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[202:203], v[170:171], v[80:83] \n" + " buffer_load_dwordx4 acc[68:71], %[v_os_b4], s[12:15], 0 offen offset:1024 \n" + _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[204:205], v[172:173], v[80:83] \n" + _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[206:207], v[174:175], v[80:83] \n" + _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[192:193], v[224:225], v[84:87] \n" + _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[194:195], v[226:227], v[84:87] \n" + " buffer_load_dwordx4 acc[72:75], %[v_os_b4], s[12:15], 0 offen offset:2048 \n" + _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[196:197], v[228:229], v[84:87] \n" + _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[198:199], v[230:231], v[84:87] \n" + _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[200:201], v[232:233], v[84:87] \n" + _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[202:203], v[234:235], v[84:87] \n" + " buffer_load_dwordx4 acc[76:79], %[v_os_b4], s[12:15], 0 offen offset:3072 \n" + _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[204:205], v[236:237], v[84:87] \n" + _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[206:207], v[238:239], v[84:87] \n" + " s_mov_b64 exec, %[s_execflag_4] \n" +_UK_ATOMIC_ADD_ " 
%[v_os_o4], v14, s[8:9] \n" +" s_mov_b64 exec, s[38:39] \n" + _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[208:209], v[160:161], v[88:91] \n" + _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[210:211], v[162:163], v[88:91] \n" + " buffer_load_dwordx4 acc[80:83], %[v_os_b5], s[12:15], 0 offen \n" + _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[212:213], v[164:165], v[88:91] \n" + _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[214:215], v[166:167], v[88:91] \n" + _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[216:217], v[168:169], v[88:91] \n" + _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[218:219], v[170:171], v[88:91] \n" + " buffer_load_dwordx4 acc[84:87], %[v_os_b5], s[12:15], 0 offen offset:1024 \n" + _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[220:221], v[172:173], v[88:91] \n" + _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[222:223], v[174:175], v[88:91] \n" + _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[208:209], v[224:225], v[92:95] \n" + _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[210:211], v[226:227], v[92:95] \n" + " buffer_load_dwordx4 acc[88:91], %[v_os_b5], s[12:15], 0 offen offset:2048 \n" + _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[212:213], v[228:229], v[92:95] \n" + _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[214:215], v[230:231], v[92:95] \n" + _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[216:217], v[232:233], v[92:95] \n" + _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[218:219], v[234:235], v[92:95] \n" + " buffer_load_dwordx4 acc[92:95], %[v_os_b5], s[12:15], 0 offen offset:3072 \n" + _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[220:221], v[236:237], v[92:95] \n" + _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[222:223], v[238:239], v[92:95] \n" + " s_mov_b64 exec, %[s_execflag_5] \n" +_UK_ATOMIC_ADD_ " %[v_os_o5], v15, s[8:9] \n" +" s_mov_b64 exec, s[38:39] \n" +" s_waitcnt vmcnt(30) \n" + _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[224:225], v[176:177], v[80:83] 
\n" + _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[226:227], v[178:179], v[80:83] \n" + " buffer_load_dwordx4 acc[96:99], %[v_os_b6], s[12:15], 0 offen \n" + _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[228:229], v[180:181], v[80:83] \n" + _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[230:231], v[182:183], v[80:83] \n" + _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[232:233], v[184:185], v[80:83] \n" + _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[234:235], v[186:187], v[80:83] \n" + " buffer_load_dwordx4 acc[100:103], %[v_os_b6], s[12:15], 0 offen offset:1024 \n" + _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[236:237], v[188:189], v[80:83] \n" + _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[238:239], v[190:191], v[80:83] \n" + _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[224:225], v[240:241], v[84:87] \n" + _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[226:227], v[242:243], v[84:87] \n" + " buffer_load_dwordx4 acc[104:107], %[v_os_b6], s[12:15], 0 offen offset:2048 \n" + _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[228:229], v[244:245], v[84:87] \n" + _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[230:231], v[246:247], v[84:87] \n" + _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[232:233], v[248:249], v[84:87] \n" + _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[234:235], v[250:251], v[84:87] \n" + " buffer_load_dwordx4 acc[108:111], %[v_os_b6], s[12:15], 0 offen offset:3072 \n" + _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[236:237], v[252:253], v[84:87] \n" + _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[238:239], v[254:255], v[84:87] \n" + " s_mov_b64 exec, %[s_execflag_6] \n" +_UK_ATOMIC_ADD_ " %[v_os_o6], v16, s[8:9] \n" +" s_mov_b64 exec, s[38:39] \n" + _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[240:241], v[176:177], v[88:91] \n" + _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[242:243], v[178:179], v[88:91] \n" + " buffer_load_dwordx4 acc[112:115], %[v_os_b7], s[12:15], 0 offen \n" + 
_UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[244:245], v[180:181], v[88:91] \n" + _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[246:247], v[182:183], v[88:91] \n" + _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[248:249], v[184:185], v[88:91] \n" + _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[250:251], v[186:187], v[88:91] \n" + " buffer_load_dwordx4 acc[116:119], %[v_os_b7], s[12:15], 0 offen offset:1024 \n" + _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[252:253], v[188:189], v[88:91] \n" + _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[254:255], v[190:191], v[88:91] \n" + _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[240:241], v[240:241], v[92:95] \n" + _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[242:243], v[242:243], v[92:95] \n" + " buffer_load_dwordx4 acc[120:123], %[v_os_b7], s[12:15], 0 offen offset:2048 \n" + _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[244:245], v[244:245], v[92:95] \n" + _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[246:247], v[246:247], v[92:95] \n" + _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[248:249], v[248:249], v[92:95] \n" + _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[250:251], v[250:251], v[92:95] \n" + " buffer_load_dwordx4 acc[124:127], %[v_os_b7], s[12:15], 0 offen offset:3072 \n" + _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[252:253], v[252:253], v[92:95] \n" + _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[254:255], v[254:255], v[92:95] \n" + " s_mov_b64 exec, %[s_execflag_7] \n" +_UK_ATOMIC_ADD_ " %[v_os_o7], v17, s[8:9] \n" +" s_mov_b64 exec, s[38:39] \n" +" s_add_u32 s60, 0x00000100, s80 \n" +" s_cmp_lt_u32 s60, %[s_loop_cnt] \n" +" s_cselect_b32 s56, s56, 0 \n" +" s_add_u32 s12, s56, s12 \n" +" s_addc_u32 s13, 0, s13 \n" +" s_cmp_ge_u32 s80, 0x00000100 \n" +" s_cselect_b32 s59, 0x00000100, s59 \n" +" s_add_u32 s8, s59, s8 \n" +" s_addc_u32 s9, 0, s9 \n" +" v_mul_f32 %[c16], %[scale_0], %[c16] \n" +" v_mul_f32 %[c17], %[scale_0], %[c17] \n" +" v_mul_f32 
%[c18], %[scale_0], %[c18] \n" +" v_mul_f32 %[c19], %[scale_0], %[c19] \n" +" v_mul_f32 %[c20], %[scale_1], %[c20] \n" +" v_mul_f32 %[c21], %[scale_1], %[c21] \n" +" v_mul_f32 %[c22], %[scale_1], %[c22] \n" +" v_mul_f32 %[c23], %[scale_1], %[c23] \n" +" v_mul_f32 %[c24], %[scale_0], %[c24] \n" +" v_mul_f32 %[c25], %[scale_0], %[c25] \n" +" v_mul_f32 %[c26], %[scale_0], %[c26] \n" +" v_mul_f32 %[c27], %[scale_0], %[c27] \n" +" v_mul_f32 %[c28], %[scale_1], %[c28] \n" +" v_mul_f32 %[c29], %[scale_1], %[c29] \n" +" v_mul_f32 %[c30], %[scale_1], %[c30] \n" +" v_mul_f32 %[c31], %[scale_1], %[c31] \n" +_UK_PK_CVT_("%[c16]", "%[c17]", "%[c16]") +_UK_PK_CVT_("%[c18]", "%[c19]", "%[c17]") +_UK_PK_CVT_("%[c20]", "%[c21]", "%[c18]") +_UK_PK_CVT_("%[c22]", "%[c23]", "%[c19]") +_UK_PK_CVT_("%[c24]", "%[c25]", "%[c20]") +_UK_PK_CVT_("%[c26]", "%[c27]", "%[c21]") +_UK_PK_CVT_("%[c28]", "%[c29]", "%[c22]") +_UK_PK_CVT_("%[c30]", "%[c31]", "%[c23]") +" s_addk_i32 s80, 0x0080 \n" +" s_cmp_lt_i32 s80, %[s_loop_cnt] \n" +" s_cbranch_scc0 label_0EC1 \n" +" s_branch label_0AA6 \n" +" label_0EC1: \n" +" s_waitcnt lgkmcnt(0) \n" +" s_barrier \n" +" ds_read_b32 v10, %[v_sfl_sld] offset:16640 \n" +" ds_read_b32 v11, %[v_sfl_sld] offset:16672 \n" +" ds_read_b32 v12, %[v_sfl_sld] offset:16704 \n" +" ds_read_b32 v13, %[v_sfl_sld] offset:16736 \n" +" ds_read_b32 v14, %[v_sfl_sld] offset:20992 \n" +" ds_read_b32 v15, %[v_sfl_sld] offset:21024 \n" +" ds_read_b32 v16, %[v_sfl_sld] offset:21056 \n" +" ds_read_b32 v17, %[v_sfl_sld] offset:21088 \n" +" s_waitcnt lgkmcnt(0) \n" + " s_mov_b64 exec, %[s_execflag_0] \n" +_UK_ATOMIC_ADD_ " %[v_os_o0], v10, s[8:9] \n" + " s_mov_b64 exec, %[s_execflag_1] \n" +_UK_ATOMIC_ADD_ " %[v_os_o1], v11, s[8:9] \n" + " s_mov_b64 exec, %[s_execflag_2] \n" +_UK_ATOMIC_ADD_ " %[v_os_o2], v12, s[8:9] \n" + " s_mov_b64 exec, %[s_execflag_3] \n" +_UK_ATOMIC_ADD_ " %[v_os_o3], v13, s[8:9] \n" + " s_mov_b64 exec, %[s_execflag_4] \n" +_UK_ATOMIC_ADD_ " %[v_os_o4], v14, s[8:9] 
\n" + " s_mov_b64 exec, %[s_execflag_5] \n" +_UK_ATOMIC_ADD_ " %[v_os_o5], v15, s[8:9] \n" + " s_mov_b64 exec, %[s_execflag_6] \n" +_UK_ATOMIC_ADD_ " %[v_os_o6], v16, s[8:9] \n" + " s_mov_b64 exec, %[s_execflag_7] \n" +_UK_ATOMIC_ADD_ " %[v_os_o7], v17, s[8:9] \n" +" s_mov_b64 exec, s[38:39] \n" +" s_add_u32 s8, s59, s8 \n" +" s_addc_u32 s9, 0, s9 \n" +" ds_write_b64 %[v_sfl_sst], [%[c16],%[c17]] offset:25344 \n" +" ds_write_b64 %[v_sfl_sst], [%[c18],%[c19]] offset:29696 \n" +" ds_write_b64 %[v_sfl_sst], [%[c20],%[c21]] offset:27520 \n" +" ds_write_b64 %[v_sfl_sst], [%[c22],%[c23]] offset:31872 \n" +" s_waitcnt lgkmcnt(0) \n" +" s_barrier \n" +" ds_read_b32 v10, %[v_sfl_sld] offset:25344 \n" +" ds_read_b32 v11, %[v_sfl_sld] offset:25376 \n" +" ds_read_b32 v12, %[v_sfl_sld] offset:25408 \n" +" ds_read_b32 v13, %[v_sfl_sld] offset:25440 \n" +" ds_read_b32 v14, %[v_sfl_sld] offset:29696 \n" +" ds_read_b32 v15, %[v_sfl_sld] offset:29728 \n" +" ds_read_b32 v16, %[v_sfl_sld] offset:29760 \n" +" ds_read_b32 v17, %[v_sfl_sld] offset:29792 \n" +" s_waitcnt lgkmcnt(0) \n" +" s_mov_b64 exec, %[s_execflag_0] \n" +_UK_ATOMIC_ADD_ " %[v_os_o0], v10, s[8:9] \n" + " s_mov_b64 exec, %[s_execflag_1] \n" +_UK_ATOMIC_ADD_ " %[v_os_o1], v11, s[8:9] \n" + " s_mov_b64 exec, %[s_execflag_2] \n" +_UK_ATOMIC_ADD_ " %[v_os_o2], v12, s[8:9] \n" + " s_mov_b64 exec, %[s_execflag_3] \n" +_UK_ATOMIC_ADD_ " %[v_os_o3], v13, s[8:9] \n" + " s_mov_b64 exec, %[s_execflag_4] \n" +_UK_ATOMIC_ADD_ " %[v_os_o4], v14, s[8:9] \n" + " s_mov_b64 exec, %[s_execflag_5] \n" +_UK_ATOMIC_ADD_ " %[v_os_o5], v15, s[8:9] \n" + " s_mov_b64 exec, %[s_execflag_6] \n" +_UK_ATOMIC_ADD_ " %[v_os_o6], v16, s[8:9] \n" + " s_mov_b64 exec, %[s_execflag_7] \n" +_UK_ATOMIC_ADD_ " %[v_os_o7], v17, s[8:9] \n" +" s_mov_b64 exec, s[38:39] \n" + +#undef _UK_MFMA_ +#undef _UK_PK_CVT_ +#undef _UK_ATOMIC_ADD_ + diff --git a/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_policy.hpp 
b/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_policy.hpp index fea30f029..629f0ee8f 100644 --- a/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_policy.hpp +++ b/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_policy.hpp @@ -810,21 +810,46 @@ struct FusedMoeGemmPipelineFlatmmPolicy CK_TILE_HOST_DEVICE static constexpr auto GetUK_1() { using S_ = typename Problem::BlockShape; + using T_ = typename Problem::Traits; if constexpr(std::is_same_v && std::is_same_v && std::is_same_v && S_::Block_M1 == 32 && S_::Block_N1 == 128 && S_::Block_K1 == 512 && - S_::Warp_M0 == 16 && S_::Warp_N0 == 16 && S_::Warp_K0 == 32) + S_::Warp_M0 == 16 && S_::Warp_N0 == 16 && S_::Warp_K0 == 32 && + T_::PipeInterleave == false) { return FlatmmSn_32x128x512_1x4x1_16x16x32_BF16{}; + // return FlatmmSn_32x128x512_1x4x1_16x16x32_BF16_itl{}; } else if constexpr(std::is_same_v && std::is_same_v && std::is_same_v && S_::Block_M1 == 32 && S_::Block_N1 == 128 && S_::Block_K1 == 512 && - S_::Warp_M0 == 16 && S_::Warp_N0 == 16 && S_::Warp_K0 == 32) + S_::Warp_M0 == 16 && S_::Warp_N0 == 16 && S_::Warp_K0 == 32 && + T_::PipeInterleave == false) { return FlatmmSn_32x128x512_1x4x1_16x16x32_FP16{}; + // return FlatmmSn_32x128x512_1x4x1_16x16x32_FP16_itl{}; + } + else if constexpr(std::is_same_v && + std::is_same_v && + std::is_same_v && + S_::Block_M1 == 32 && S_::Block_N1 == 128 && S_::Block_K1 == 512 && + S_::Warp_M0 == 16 && S_::Warp_N0 == 16 && S_::Warp_K0 == 32 && + T_::PipeInterleave == true) + { + // return FlatmmSn_32x128x512_1x4x1_16x16x32_FP16{}; + return FlatmmSn_32x128x512_1x4x1_16x16x32_BF16_itl{}; + } + else if constexpr(std::is_same_v && + std::is_same_v && + std::is_same_v && + S_::Block_M1 == 32 && S_::Block_N1 == 128 && S_::Block_K1 == 512 && + S_::Warp_M0 == 16 && S_::Warp_N0 == 16 && S_::Warp_K0 == 32 && + T_::PipeInterleave == true) + { + // return FlatmmSn_32x128x512_1x4x1_16x16x32_FP16{}; + return 
FlatmmSn_32x128x512_1x4x1_16x16x32_FP16_itl{}; } } }; diff --git a/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_traits.hpp b/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_traits.hpp index d7127b098..3fb82bc09 100644 --- a/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_traits.hpp +++ b/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_traits.hpp @@ -22,7 +22,8 @@ template + bool PadIntermediateSize_ = false, + bool PipeInterleave_ = true> struct FusedMoeGemmTraits { // Gate+Up or Gate only @@ -32,6 +33,7 @@ struct FusedMoeGemmTraits static constexpr FusedMoeGemmWeightPermuteEnum PermuteEnum = PermuteEnum_; static constexpr bool PadHiddenSize = PadHiddenSize_; static constexpr bool PadIntermediateSize = PadIntermediateSize_; + static constexpr bool PipeInterleave = PipeInterleave_; }; // Note: this need to be a bit mask -- GitLab From fdfe2102304f62ec62194706a5f67766ae824dc6 Mon Sep 17 00:00:00 2001 From: Illia Silin <98187287+illsilin@users.noreply.github.com> Date: Sun, 15 Dec 2024 16:25:21 -0800 Subject: [PATCH 121/153] upgrade sqlalchemy version (#1748) * upgrade sqlalchemy version * replace the connection with engine in to_sql call * change the hipTensor ctest syntax --- Dockerfile | 2 +- Jenkinsfile | 4 +--- script/process_perf_data.py | 2 +- 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/Dockerfile b/Dockerfile index 83edbfb8e..a3bf3866b 100644 --- a/Dockerfile +++ b/Dockerfile @@ -94,7 +94,7 @@ RUN pip install --upgrade cmake==3.27.5 && \ dpkg -i dumb-init_*.deb && rm dumb-init_*.deb && \ # Install packages for processing the performance results pip3 install --upgrade pip && \ - pip3 install sqlalchemy==1.4.46 pymysql pandas==2.2.3 setuptools-rust sshtunnel==0.4.0 && \ + pip3 install sqlalchemy==2.0.36 pymysql pandas==2.2.3 setuptools-rust sshtunnel==0.4.0 && \ # Add render group groupadd -f render && \ # Install the new rocm-cmake version diff --git a/Jenkinsfile b/Jenkinsfile index f82c34afa..87c9457fc 100644 --- 
a/Jenkinsfile +++ b/Jenkinsfile @@ -566,11 +566,9 @@ def Build_CK(Map conf=[:]){ ls -ltr CC=hipcc CXX=hipcc cmake -Bbuild . -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/install" cmake --build build -- -j + ctest --test-dir build """ } - dir("hipTensor-${params.hipTensor_branch}/build"){ - sh 'ctest' - } } } } diff --git a/script/process_perf_data.py b/script/process_perf_data.py index fbfec94ee..32e2e15d7 100644 --- a/script/process_perf_data.py +++ b/script/process_perf_data.py @@ -332,7 +332,7 @@ def main(): table_name="ck_fmha_bwd_tflops" tflops_base = get_baseline(table_name,conn) - store_new_test_result(table_name, results, testlist, branch_name, node_id, gpu_arch, compute_units, rocm_vers, hip_vers, environment, conn) + store_new_test_result(table_name, results, testlist, branch_name, node_id, gpu_arch, compute_units, rocm_vers, hip_vers, environment, sqlEngine) conn.close() #compare the results to the baseline if baseline exists -- GitLab From a8ad7fcce912c8e462ca69d5ca680d99b2ef56dd Mon Sep 17 00:00:00 2001 From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com> Date: Tue, 10 Dec 2024 18:14:52 +0000 Subject: [PATCH 122/153] add template placeholders --- .github/CONTRIBUTING.md | 0 .github/ISSUE_TEMPLATE.md | 14 ++++++++++++++ .github/PULL_REQUEST_TEMPLATE.md | 0 3 files changed, 14 insertions(+) create mode 100644 .github/CONTRIBUTING.md create mode 100644 .github/ISSUE_TEMPLATE.md create mode 100644 .github/PULL_REQUEST_TEMPLATE.md diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md new file mode 100644 index 000000000..e69de29bb diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md new file mode 100644 index 000000000..263cc3480 --- /dev/null +++ b/.github/ISSUE_TEMPLATE.md @@ -0,0 +1,14 @@ +When creating an issue, please check if a similar issue already exists. 
+ +### When reporting a bug, please include: +- [ ] A descriptive title +- [ ] An isolated way to reproduce the behavior (preferably a docker container with a repro) +- [ ] ROCm version, clang version, Composable Kernel commit pin +- [ ] Environment variables +- [ ] The behavior you expect to see, and the behavior you actually see + +### When requesting a feature, please include: +- [ ] A descriptive title +- [ ] A detailed description of the problem you are trying to solve +- [ ] An overview of the suggested solution +- [ ] Explanation why the solution is an improvement \ No newline at end of file diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md new file mode 100644 index 000000000..e69de29bb -- GitLab From 30a37cac0e76298ef184597b1f7d3ef0d3f4bb60 Mon Sep 17 00:00:00 2001 From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com> Date: Tue, 10 Dec 2024 18:50:27 +0000 Subject: [PATCH 123/153] add pull request template placeholder --- .github/PULL_REQUEST_TEMPLATE.md | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index e69de29bb..c5161f7f8 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -0,0 +1,19 @@ +## Proposed changes + +Please describe the motivation behind the pull request, whether it enables a new feature or fixes a bug. If there are associated pull requests or issues, please link them to the pull request. + +## Checklist + +Please put an `x` into the boxes that apply. You can also fill these out after creating the PR. If you're not sure, please don't hesitate to ask. 
+ +- [ ] I have added tests relevant to the introduced functionality, and the unit tests are passing locally +- [ ] I have added inline documentation which enables the maintainers with understanding the motivation +- [ ] I have removed the stale documentation which is no longer relevant after this pull request +- [ ] I have added release notes which provide the end users with a brief summary of the improvement from this pull request +- [ ] I have run `clang-format` on all changed files +- [ ] Any dependent changes have been merged + +## Discussion + +If this is a relatively large or complex change, feel free to start a discussion by explaining why you chose the solution you did and what alternatives you considered + -- GitLab From 1b75c77da41afdfa8cff30a40bbe0fc4bd1d643f Mon Sep 17 00:00:00 2001 From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com> Date: Tue, 10 Dec 2024 19:14:37 +0000 Subject: [PATCH 124/153] add contributing placeholder --- .github/CONTRIBUTING.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md index e69de29bb..56f2acee7 100644 --- a/.github/CONTRIBUTING.md +++ b/.github/CONTRIBUTING.md @@ -0,0 +1,10 @@ +We'd love for you to contribute to our source code! 
+ +Some helpful links: + +- [Code of Conduct guidelines](https://www.contributor-covenant.org/version/2/1/code_of_conduct/code_of_conduct.txt) +- [New issue guidelines](https://github.com/rocm/composable_kernel/blob/develop/.github/ISSUE_TEMPLATE.md) +- [Submitting a pull request guidelines](https://github.com/rocm/composable_kernel/blob/develop/.github/PULL_REQUEST_TEMPLATE.md) +- [Maintainers](https://github.com/rocm/composable_kernel/blob/develop/CONTRIBUTORS.md) +- [General information](https://github.com/rocm/composable_kernel/blob/develop/README.md) +- [ROCm documentation](https://rocm.docs.amd.com/en/latest/how-to/llm-fine-tuning-optimization/optimizing-with-composable-kernel.html) \ No newline at end of file -- GitLab From 0fd6978d2a3c5973d9c0486616b2a71ea7aa5f86 Mon Sep 17 00:00:00 2001 From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com> Date: Tue, 10 Dec 2024 20:29:49 +0000 Subject: [PATCH 125/153] clarify release notes bullet point --- .github/PULL_REQUEST_TEMPLATE.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index c5161f7f8..b3fcabec3 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -9,7 +9,7 @@ Please put an `x` into the boxes that apply. 
You can also fill these out after c - [ ] I have added tests relevant to the introduced functionality, and the unit tests are passing locally - [ ] I have added inline documentation which enables the maintainers with understanding the motivation - [ ] I have removed the stale documentation which is no longer relevant after this pull request -- [ ] I have added release notes which provide the end users with a brief summary of the improvement from this pull request +- [ ] (If this change is user-facing) I have added release notes which provide the end users with a brief summary of the improvement from this pull request - [ ] I have run `clang-format` on all changed files - [ ] Any dependent changes have been merged -- GitLab From d46196f291a33539a089d7d09bcbc4d2270733c2 Mon Sep 17 00:00:00 2001 From: Adam Osewski <19374865+aosewski@users.noreply.github.com> Date: Tue, 17 Dec 2024 09:19:44 +0100 Subject: [PATCH 126/153] Enhance printing functionality (#1751) * Added object print with all template parameters * fix clang format --------- Co-authored-by: ravil-mobile Co-authored-by: illsilin --- .../gpu/device/device_base.hpp | 34 + .../impl/device_gemm_xdl_cshuffle_v3.hpp | 1 + ...m_sn_uk_gfx9_32x128x512_1x4x1_16x16x16.inc | 1383 +++++++++------- ..._uk_gfx9_32x128x512_1x4x1_16x16x16_itl.inc | 1439 +++++++++-------- ...atmm_uk_gfx9_32x512x128_1x1x1_16x16x16.inc | 1007 ++++++------ .../profiler/profile_gemm_universal_impl.hpp | 18 +- 6 files changed, 2095 insertions(+), 1787 deletions(-) diff --git a/include/ck/tensor_operation/gpu/device/device_base.hpp b/include/ck/tensor_operation/gpu/device/device_base.hpp index 908ada016..736e241fd 100644 --- a/include/ck/tensor_operation/gpu/device/device_base.hpp +++ b/include/ck/tensor_operation/gpu/device/device_base.hpp @@ -5,6 +5,8 @@ #include #include +#include +#include #include "ck/stream_config.hpp" @@ -12,6 +14,34 @@ namespace ck { namespace tensor_operation { namespace device { +#define GET_OBJECT_NAME_IMLP \ + 
std::optional GetObjectName() const override \ + { \ + std::string str = __PRETTY_FUNCTION__; \ + static std::regex obj_name_expr{" (.*)::GetObjectName"}; \ + std::smatch match; \ + if(!std::regex_search(str, match, obj_name_expr)) \ + { \ + return str; \ + } \ + return std::string(match[1]) + ';'; \ + } + +#define GET_TEMPLATE_INFO_IMPL \ + std::optional GetTemplateInfo() const override \ + { \ + std::string str = __PRETTY_FUNCTION__; \ + static std::regex template_expr{"\\[(.*)\\]"}; \ + std::smatch match; \ + if(!std::regex_search(str, match, template_expr)) \ + { \ + return std::nullopt; \ + } \ + return std::string(match[1]); \ + } + +#define REGISTER_EXTRA_PRINTING_METHODS GET_OBJECT_NAME_IMLP GET_TEMPLATE_INFO_IMPL + struct BaseArgument { BaseArgument() = default; @@ -48,6 +78,10 @@ struct BaseOperator virtual std::string GetTypeIdName() const { return typeid(*this).name(); } + virtual std::optional GetObjectName() const { return std::nullopt; } + + virtual std::optional GetTemplateInfo() const { return std::nullopt; } + virtual std::string GetTypeIdHashCode() const { std::ostringstream oss; diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp index 4489b2e5c..ad6aa1e7c 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp @@ -729,6 +729,7 @@ struct DeviceGemm_Xdl_CShuffleV3 : public DeviceGemmV2 best_op_object_name; float best_ave_time = 0; float best_tflops = 0; float best_gb_per_sec = 0; @@ -225,7 +226,8 @@ bool profile_gemm_universal_impl(int do_verification, } } - std::string op_name = op_ptr->GetTypeString(); + std::string op_name = op_ptr->GetTypeString(); + std::optional op_obj_name = op_ptr->GetObjectName(); float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, @@ -251,11 +253,12 @@ bool 
profile_gemm_universal_impl(int do_verification, if(tflops > best_tflops && ave_time > 1e-10) { - best_op_name = op_name; - best_tflops = tflops; - best_ave_time = ave_time; - best_gb_per_sec = gb_per_sec; - best_kbatch = kbatch_curr; + best_op_name = op_name; + best_op_object_name = op_obj_name; + best_tflops = tflops; + best_ave_time = ave_time; + best_gb_per_sec = gb_per_sec; + best_kbatch = kbatch_curr; } } else @@ -306,6 +309,9 @@ bool profile_gemm_universal_impl(int do_verification, << " : " << best_ave_time << " ms, " << best_tflops << " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl; + if(best_op_object_name) + std::cout << best_op_object_name.value() << std::endl; + return pass; } -- GitLab From 627a27bda3f38b3d904f844ec0b4d988e50cc262 Mon Sep 17 00:00:00 2001 From: jakpiase Date: Tue, 17 Dec 2024 14:25:22 +0100 Subject: [PATCH 127/153] Added unit tests for CK Tile compute bound gemm pipeline (#1728) --- test/ck_tile/gemm/CMakeLists.txt | 2 +- test/ck_tile/gemm/test_gemm_mem_pipeline.cpp | 36 ----------- test/ck_tile/gemm/test_gemm_pipeline.cpp | 42 +++++++++++++ ...es.inc => test_gemm_pipeline_ut_cases.inc} | 10 +-- ...e_util.hpp => test_gemm_pipeline_util.hpp} | 62 +++++++++++++------ 5 files changed, 90 insertions(+), 62 deletions(-) delete mode 100644 test/ck_tile/gemm/test_gemm_mem_pipeline.cpp create mode 100644 test/ck_tile/gemm/test_gemm_pipeline.cpp rename test/ck_tile/gemm/{test_gemm_mem_pipeline_ut_cases.inc => test_gemm_pipeline_ut_cases.inc} (79%) rename test/ck_tile/gemm/{test_gemm_mem_pipeline_util.hpp => test_gemm_pipeline_util.hpp} (80%) diff --git a/test/ck_tile/gemm/CMakeLists.txt b/test/ck_tile/gemm/CMakeLists.txt index f96ad9c6e..ecfbd4e55 100644 --- a/test/ck_tile/gemm/CMakeLists.txt +++ b/test/ck_tile/gemm/CMakeLists.txt @@ -1,4 +1,4 @@ # Currently ck_tile is only built on gfx9 if(GPU_TARGETS MATCHES "gfx9") - add_gtest_executable(test_ck_tile_gemm_mem_pipeline test_gemm_mem_pipeline.cpp) + 
add_gtest_executable(test_ck_tile_gemm_pipeline test_gemm_pipeline.cpp) endif() diff --git a/test/ck_tile/gemm/test_gemm_mem_pipeline.cpp b/test/ck_tile/gemm/test_gemm_mem_pipeline.cpp deleted file mode 100644 index aeb383c87..000000000 --- a/test/ck_tile/gemm/test_gemm_mem_pipeline.cpp +++ /dev/null @@ -1,36 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. - -#include - -#include "gtest/gtest.h" - -#include "ck_tile/host.hpp" -#include "test_gemm_mem_pipeline_util.hpp" - -using F16 = ck_tile::half_t; -using F32 = float; -using Row = ck_tile::tensor_layout::gemm::RowMajor; -using Col = ck_tile::tensor_layout::gemm::ColumnMajor; -using Intrawave = ck_tile::integral_constant; -using Interwave = ck_tile::integral_constant; - -// clang-format off -using KernelTypes = ::testing::Types< - // ALayout, BLayout, CLayout, ADataType, BDataType, AccDataType, CDataType, GemmPipelineScheduler - std::tuple< Row, Row, Row, F16, F16, F32, F16, Intrawave>, - std::tuple< Row, Row, Row, F16, F16, F32, F16, Interwave>, - std::tuple< Row, Col, Row, F16, F16, F32, F16, Intrawave>, - std::tuple< Row, Col, Row, F16, F16, F32, F16, Interwave>, - std::tuple< Col, Row, Row, F16, F16, F32, F16, Intrawave>, - std::tuple< Col, Row, Row, F16, F16, F32, F16, Interwave>, - std::tuple< Col, Col, Row, F16, F16, F32, F16, Intrawave>, - std::tuple< Col, Col, Row, F16, F16, F32, F16, Interwave> - >; -// clang-format on - -TYPED_TEST_SUITE(TestCkTileGemmMemPipeline, KernelTypes); - -#include "test_gemm_mem_pipeline_ut_cases.inc" diff --git a/test/ck_tile/gemm/test_gemm_pipeline.cpp b/test/ck_tile/gemm/test_gemm_pipeline.cpp new file mode 100644 index 000000000..48a2b86a6 --- /dev/null +++ b/test/ck_tile/gemm/test_gemm_pipeline.cpp @@ -0,0 +1,42 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include + +#include "gtest/gtest.h" + +#include "ck_tile/host.hpp" +#include "test_gemm_pipeline_util.hpp" + +using F16 = ck_tile::half_t; +using F32 = float; +using Row = ck_tile::tensor_layout::gemm::RowMajor; +using Col = ck_tile::tensor_layout::gemm::ColumnMajor; +using Intrawave = ck_tile::integral_constant; +using Interwave = ck_tile::integral_constant; +using Mem = ck_tile::integral_constant; +using Comp = ck_tile::integral_constant; + +// clang-format off +using KernelTypes = ::testing::Types< + // ALayout, BLayout, CLayout, ADataType, BDataType, AccDataType, CDataType, GemmPipelineScheduler, PipelineType + std::tuple< Row, Row, Row, F16, F16, F32, F16, Intrawave, Mem>, + std::tuple< Row, Row, Row, F16, F16, F32, F16, Intrawave, Comp>, + std::tuple< Row, Row, Row, F16, F16, F32, F16, Interwave, Mem>, + std::tuple< Row, Col, Row, F16, F16, F32, F16, Intrawave, Mem>, + std::tuple< Row, Col, Row, F16, F16, F32, F16, Intrawave, Comp>, + std::tuple< Row, Col, Row, F16, F16, F32, F16, Interwave, Mem>, + std::tuple< Col, Row, Row, F16, F16, F32, F16, Intrawave, Mem>, + std::tuple< Col, Row, Row, F16, F16, F32, F16, Intrawave, Comp>, + std::tuple< Col, Row, Row, F16, F16, F32, F16, Interwave, Mem>, + std::tuple< Col, Col, Row, F16, F16, F32, F16, Intrawave, Mem>, + std::tuple< Col, Col, Row, F16, F16, F32, F16, Intrawave, Comp>, + std::tuple< Col, Col, Row, F16, F16, F32, F16, Interwave, Mem> + >; +// clang-format on + +TYPED_TEST_SUITE(TestCkTileGemmPipeline, KernelTypes); + +#include "test_gemm_pipeline_ut_cases.inc" diff --git a/test/ck_tile/gemm/test_gemm_mem_pipeline_ut_cases.inc b/test/ck_tile/gemm/test_gemm_pipeline_ut_cases.inc similarity index 79% rename from test/ck_tile/gemm/test_gemm_mem_pipeline_ut_cases.inc rename to test/ck_tile/gemm/test_gemm_pipeline_ut_cases.inc index af94d68f2..c78d69601 100644 --- a/test/ck_tile/gemm/test_gemm_mem_pipeline_ut_cases.inc +++ b/test/ck_tile/gemm/test_gemm_pipeline_ut_cases.inc @@ -3,7 +3,7 @@ #pragma once 
-TYPED_TEST(TestCkTileGemmMemPipeline, SmallM) +TYPED_TEST(TestCkTileGemmPipeline, SmallM) { std::vector Ms{1, 2, 3, 4, 5, 6}; constexpr int N = 1024; @@ -13,7 +13,7 @@ TYPED_TEST(TestCkTileGemmMemPipeline, SmallM) this->Run(M, N, K); } -TYPED_TEST(TestCkTileGemmMemPipeline, MidLargeM) +TYPED_TEST(TestCkTileGemmPipeline, MidLargeM) { std::vector Ms{127, 255, 312, 799, 1573}; constexpr int N = 1024; @@ -23,7 +23,7 @@ TYPED_TEST(TestCkTileGemmMemPipeline, MidLargeM) this->Run(M, N, K); } -TYPED_TEST(TestCkTileGemmMemPipeline, PaddK) +TYPED_TEST(TestCkTileGemmPipeline, PaddK) { std::vector Ms{127}; constexpr int N = 1024; @@ -33,7 +33,7 @@ TYPED_TEST(TestCkTileGemmMemPipeline, PaddK) this->Run(M, N, K); } -TYPED_TEST(TestCkTileGemmMemPipeline, Regular) +TYPED_TEST(TestCkTileGemmPipeline, Regular) { std::vector Ms{512}; constexpr int N = 1024; @@ -43,7 +43,7 @@ TYPED_TEST(TestCkTileGemmMemPipeline, Regular) this->Run(M, N, K); } -TYPED_TEST(TestCkTileGemmMemPipeline, NotSupportedArgument) +TYPED_TEST(TestCkTileGemmPipeline, NotSupportedArgument) { constexpr int M = 512; constexpr int N = 1025; diff --git a/test/ck_tile/gemm/test_gemm_mem_pipeline_util.hpp b/test/ck_tile/gemm/test_gemm_pipeline_util.hpp similarity index 80% rename from test/ck_tile/gemm/test_gemm_mem_pipeline_util.hpp rename to test/ck_tile/gemm/test_gemm_pipeline_util.hpp index 6941a7596..a51498602 100644 --- a/test/ck_tile/gemm/test_gemm_mem_pipeline_util.hpp +++ b/test/ck_tile/gemm/test_gemm_pipeline_util.hpp @@ -11,18 +11,24 @@ #include "ck_tile/ops/epilogue.hpp" #include "ck_tile/ops/gemm.hpp" +enum struct GemmPipelineType +{ + Mem, + Comp +}; template -class TestCkTileGemmMemPipeline : public ::testing::Test +class TestCkTileGemmPipeline : public ::testing::Test { protected: - using ALayout = std::tuple_element_t<0, Tuple>; - using BLayout = std::tuple_element_t<1, Tuple>; - using CLayout = std::tuple_element_t<2, Tuple>; - using ADataType = std::tuple_element_t<3, Tuple>; - using BDataType = 
std::tuple_element_t<4, Tuple>; - using AccDataType = std::tuple_element_t<5, Tuple>; - using CDataType = std::tuple_element_t<6, Tuple>; - static constexpr auto Scheduler = std::tuple_element_t<7, Tuple>::value; + using ALayout = std::tuple_element_t<0, Tuple>; + using BLayout = std::tuple_element_t<1, Tuple>; + using CLayout = std::tuple_element_t<2, Tuple>; + using ADataType = std::tuple_element_t<3, Tuple>; + using BDataType = std::tuple_element_t<4, Tuple>; + using AccDataType = std::tuple_element_t<5, Tuple>; + using CDataType = std::tuple_element_t<6, Tuple>; + static constexpr auto Scheduler = std::tuple_element_t<7, Tuple>::value; + static constexpr auto PipelineType = std::tuple_element_t<8, Tuple>::value; // TODO: expose tile size through test t-param ? struct gemm_args @@ -74,8 +80,13 @@ class TestCkTileGemmMemPipeline : public ::testing::Test using Traits = ck_tile::TileGemmTraits; - using BaseGemmPipeline = ck_tile::BaseGemmPipelineAgBgCrMem< - ck_tile::GemmPipelineProblem>; + using BaseGemmPipeline = std::conditional_t< + PipelineType == GemmPipelineType::Mem, + ck_tile::BaseGemmPipelineAgBgCrMem< + ck_tile::GemmPipelineProblem>, + ck_tile::BaseGemmPipelineAgBgCrCompV3< + ck_tile:: + GemmPipelineProblem>>; const ck_tile::index_t num_loop = TilePartitioner::GetLoopNum(args.K); const bool has_hot_loop = BaseGemmPipeline::BlockHasHotloop(num_loop); @@ -85,15 +96,26 @@ class TestCkTileGemmMemPipeline : public ::testing::Test constexpr bool has_hot_loop_v = has_hot_loop_.value; constexpr auto tail_number_v = tail_number_.value; - using GemmPipeline = ck_tile::GemmPipelineAgBgCrMem< - ck_tile::UniversalGemmPipelineProblem>; + using GemmPipeline = + std::conditional_t>, + ck_tile::GemmPipelineAgBgCrCompV3< + ck_tile::UniversalGemmPipelineProblem>>; using Kernel = ck_tile::GemmKernel; auto kargs = Kernel::MakeKargs(args.p_a, args.p_b, -- GitLab From 0e54d7ae5a638c9c1cbdc478dd12159354cd7e97 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" 
<49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 17 Dec 2024 06:57:55 -0800 Subject: [PATCH 128/153] Bump rocm-docs-core from 1.11.0 to 1.12.0 in /docs/sphinx (#1753) Bumps [rocm-docs-core](https://github.com/ROCm/rocm-docs-core) from 1.11.0 to 1.12.0. - [Release notes](https://github.com/ROCm/rocm-docs-core/releases) - [Changelog](https://github.com/ROCm/rocm-docs-core/blob/develop/CHANGELOG.md) - [Commits](https://github.com/ROCm/rocm-docs-core/compare/v1.11.0...v1.12.0) --- updated-dependencies: - dependency-name: rocm-docs-core dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- docs/sphinx/requirements.in | 2 +- docs/sphinx/requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/sphinx/requirements.in b/docs/sphinx/requirements.in index d1b3465b9..46a61a87f 100644 --- a/docs/sphinx/requirements.in +++ b/docs/sphinx/requirements.in @@ -1,2 +1,2 @@ -rocm-docs-core==1.11.0 +rocm-docs-core==1.12.0 sphinxcontrib-bibtex==2.6.3 diff --git a/docs/sphinx/requirements.txt b/docs/sphinx/requirements.txt index 26d0aa244..c2e74baae 100644 --- a/docs/sphinx/requirements.txt +++ b/docs/sphinx/requirements.txt @@ -103,7 +103,7 @@ requests==2.32.3 # via # pygithub # sphinx -rocm-docs-core==1.11.0 +rocm-docs-core==1.12.0 # via -r requirements.in six==1.16.0 # via pybtex -- GitLab From 6ef8d3c295686b872d7e7a86621b68f765d98572 Mon Sep 17 00:00:00 2001 From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com> Date: Thu, 12 Dec 2024 19:47:57 +0000 Subject: [PATCH 129/153] refactor conditional usage; fix build on rocm6.1 where the reference didn't exist --- include/ck/utility/amd_ck_fp8.hpp | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/include/ck/utility/amd_ck_fp8.hpp b/include/ck/utility/amd_ck_fp8.hpp index 7b21ad646..1bdb1d078 100644 --- 
a/include/ck/utility/amd_ck_fp8.hpp +++ b/include/ck/utility/amd_ck_fp8.hpp @@ -18,6 +18,12 @@ #define CK_USE_OCP_FP8 0 #endif +namespace { +// https://en.cppreference.com/w/cpp/types/conditional +template struct conditional { using type = T; }; +template struct conditional { using type = F; }; +} + namespace ck { using f8_fnuz_t = _BitInt(8); @@ -191,10 +197,10 @@ __host__ __device__ static inline T cast_from_f8(fp8_storage_t x) } } - typename __hip_internal::conditional< + typename conditional< sizeof(T) == 2, unsigned short int, - typename __hip_internal::conditional:: + typename conditional:: type>::type retval; if constexpr(we == 5 && is_half && !is_fnuz) @@ -538,10 +544,10 @@ __host__ __device__ static inline fp8_storage_t cast_to_f8(T _x, unsigned int rn constexpr int mfmt = (sizeof(T) == 8) ? 52 : ((sizeof(T) == 4) ? 23 : 10); - using T_bitwise = typename __hip_internal::conditional< + using T_bitwise = typename conditional< sizeof(T) == 2, unsigned short int, - typename __hip_internal::conditional:: + typename conditional:: type>::type; T_bitwise x_bitwise = bit_cast(_x); -- GitLab From 689a5ae45be802f51fc947a9f92208dcfb143f77 Mon Sep 17 00:00:00 2001 From: Illia Silin <98187287+illsilin@users.noreply.github.com> Date: Tue, 17 Dec 2024 10:17:29 -0800 Subject: [PATCH 130/153] Pass build flags to config.h (#1760) * pass the build flags to config.h * fix clang format --- CMakeLists.txt | 4 ++++ include/ck/config.h.in | 16 ++++++++++++++++ include/ck/utility/amd_ck_fp8.hpp | 20 +++++++++++++------- 3 files changed, 33 insertions(+), 7 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2c8698756..be4efd3df 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -183,14 +183,17 @@ message("Building CK for the following targets: ${SUPPORTED_GPU_TARGETS}") if (SUPPORTED_GPU_TARGETS MATCHES "gfx9") message("Enabling XDL instances") add_definitions(-DCK_USE_XDL) + set(CK_USE_XDL "ON") endif() if (SUPPORTED_GPU_TARGETS MATCHES "gfx94") message("Enabling FP8 
gemms on native architectures") add_definitions(-DCK_USE_GFX94) + set(CK_USE_GFX94 "ON") endif() if (SUPPORTED_GPU_TARGETS MATCHES "gfx11" OR SUPPORTED_GPU_TARGETS MATCHES "gfx12") message("Enabling WMMA instances") add_definitions(-DCK_USE_WMMA) + set(CK_USE_WMMA "ON") endif() if (SUPPORTED_GPU_TARGETS MATCHES "gfx12") add_definitions(-DCK_USE_OCP_FP8) @@ -204,6 +207,7 @@ endif() option(CK_USE_FP8_ON_UNSUPPORTED_ARCH "Enable FP8 GEMM instances on older architectures" OFF) if(CK_USE_FP8_ON_UNSUPPORTED_ARCH AND (SUPPORTED_GPU_TARGETS MATCHES "gfx90a" OR SUPPORTED_GPU_TARGETS MATCHES "gfx908")) add_definitions(-DCK_USE_FP8_ON_UNSUPPORTED_ARCH) + set(CK_USE_FP8_ON_UNSUPPORTED_ARCH "ON") endif() # CK config file to record supported datatypes, etc. diff --git a/include/ck/config.h.in b/include/ck/config.h.in index 0f0b7bd60..55a498073 100644 --- a/include/ck/config.h.in +++ b/include/ck/config.h.in @@ -111,6 +111,22 @@ #cmakedefine CK_USE_WMMA @CK_USE_WMMA@ #endif +#ifndef CK_USE_GFX94 +#cmakedefine CK_USE_GFX94 @CK_USE_GFX94@ +#endif + +#ifndef DCK_USE_OCP_FP8 +#cmakedefine DCK_USE_OCP_FP8 @DCK_USE_OCP_FP8@ +#endif + +#ifndef CK_USE_FNUZ_FP8 +#cmakedefine CK_USE_FNUZ_FP8 @CK_USE_FNUZ_FP8@ +#endif + +#ifndef CK_USE_FP8_ON_UNSUPPORTED_ARCH +#cmakedefine CK_USE_FP8_ON_UNSUPPORTED_ARCH @CK_USE_FP8_ON_UNSUPPORTED_ARCH@ +#endif + // clang-format on #endif // CK_CONFIG_H_IN diff --git a/include/ck/utility/amd_ck_fp8.hpp b/include/ck/utility/amd_ck_fp8.hpp index 1bdb1d078..e9174904c 100644 --- a/include/ck/utility/amd_ck_fp8.hpp +++ b/include/ck/utility/amd_ck_fp8.hpp @@ -20,9 +20,17 @@ namespace { // https://en.cppreference.com/w/cpp/types/conditional -template struct conditional { using type = T; }; -template struct conditional { using type = F; }; -} +template +struct conditional +{ + using type = T; +}; +template +struct conditional +{ + using type = F; +}; +} // namespace namespace ck { @@ -200,8 +208,7 @@ __host__ __device__ static inline T cast_from_f8(fp8_storage_t x) 
typename conditional< sizeof(T) == 2, unsigned short int, - typename conditional:: - type>::type retval; + typename conditional::type>::type retval; if constexpr(we == 5 && is_half && !is_fnuz) { @@ -547,8 +554,7 @@ __host__ __device__ static inline fp8_storage_t cast_to_f8(T _x, unsigned int rn using T_bitwise = typename conditional< sizeof(T) == 2, unsigned short int, - typename conditional:: - type>::type; + typename conditional::type>::type; T_bitwise x_bitwise = bit_cast(_x); unsigned long long x{x_bitwise}; -- GitLab From d9e37c6874402023f5fe033f6821bde6869c5da5 Mon Sep 17 00:00:00 2001 From: Harisankar Sadasivan <135730918+hsadasiv@users.noreply.github.com> Date: Tue, 17 Dec 2024 10:31:21 -0800 Subject: [PATCH 131/153] updated fp16 instances to be on parity with universal gemm instances (#1754) * updated fp16 instances to be on parity with universal gemm instances * corrected instance name to streamk instance --- ...universal_streamk_f16_f16_f16_mk_kn_mn.hpp | 18 ++++++++++-- ...universal_streamk_f16_f16_f16_mk_nk_mn.hpp | 29 +++++++++++++++---- 2 files changed, 39 insertions(+), 8 deletions(-) mode change 100644 => 100755 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn.hpp mode change 100644 => 100755 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn.hpp diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn.hpp old mode 100644 new mode 100755 index 6e8d5c798..5460f7f85 --- 
a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn.hpp +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn.hpp @@ -41,6 +41,8 @@ using device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_comp_instances = st //#########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 8, 4, 32, 32, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 4, 4, 32, 32, 4, 4, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 2, 2, 32, 32, 4, 4, S<16,16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, 0, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>, DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 8, 4, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>, 
DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 8, 4, 32, 32, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>, DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 8, 4, 32, 32, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, @@ -49,7 +51,9 @@ using device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_comp_instances = st DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 8, 4, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>, DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 256, 32, 8, 4, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>, DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 128, 32, 8, 4, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, 
PassThrough, GemmSpec, 256, 128, 128, 64, 8, 4, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1> + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 8, 4, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 4, 4, 32, 32, 2, 2, S<16,16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 2, 2, 32, 32, 2, 2, S<32, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, 0, S<32, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1> // clang-format on >; @@ -61,14 +65,21 @@ using device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_instances = std //#########################| | | | | | | | | Operation| Operation| Operation| | | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| Scheduler| Verision| //#########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - // Latency friendly + // Latency friendly DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F16, 
F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 16, 64, 8, 4, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 128, 8, 4, 16, 16, 1, 1, S<16, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 16, 64, 4, 4, 16, 16, 1, 1, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, S<16, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 16, 64, 2, 2, 16, 16, 1, 1, S<32, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, 0, S<32, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 128, 8, 4, 16, 16, 1, 1, S<16, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<32, 2, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 128, 4, 4, 16, 16, 1, 1, S<32, 2, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, S<32, 2, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, 
Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 64, 8, 4, 16, 16, 1, 1, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 64, 4, 4, 16, 16, 1, 1, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, S<16, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 64, 2, 2, 16, 16, 1, 1, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, 0, S<16, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 32, 64, 8, 4, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, // Memory friendly DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 32, 64, 8, 2, 32, 32, 2, 1, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<32, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 32, 64, 2, 2, 32, 32, 2, 1, S<32, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, 0, S<32, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, 0, 1, 1, S<1, 32, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, 
DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 16, 64, 8, 2, 16, 16, 4, 1, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<32, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, 0, 1, 1, S<1, 32, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 16, 64, 2, 2, 16, 16, 4, 1, S<32, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, 0, S<32, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, 0, 1, 1, S<1, 32, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 128, 32, 64, 8, 4, 32, 32, 2, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 128, 16, 64, 8, 4, 16, 16, 4, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 64, 32, 64, 8, 4, 32, 32, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, @@ -82,6 +93,7 @@ using device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_instances = std DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 128, 64, 8, 4, 16, 16, 1, 4, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, 
S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 128, 64, 8, 4, 32, 32, 1, 2, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 16, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 16, 256, 64, 8, 4, 16, 16, 1, 4, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 16, 1, 16>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 16, 256, 64, 4, 4, 16, 16, 1, 4, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 16, 1, 16>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 32, 256, 64, 8, 4, 32, 32, 1, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 16, 1, 16>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v2> // clang-format on >; diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn.hpp old mode 100644 new mode 100755 index e00c1733e..e716b3e85 --- 
a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn.hpp +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn.hpp @@ -42,14 +42,21 @@ using device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_instances = st // Compute friendly DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 8, 8, 32, 32, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 4, 4, 32, 32, 4, 4, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>, DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 8, 8, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>, DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, 
PassThrough, PassThrough, GemmSpec, 256, 128, 128, 32, 4, 4, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 32, 2, 2, 32, 32, 2, 2, S<16,16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, 0, S<16,16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>, DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 8, 8, 32, 32, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 4, 4, 32, 32, 4, 4, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 2, 2, 32, 32, 4, 4, S<16,16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, 0, S<16,16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 8, 8, 32, 32, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, 
BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 4, 4, 32, 32, 4, 4, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 2, 2, 32, 32, 4, 4, S<16,16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, 0, S<16,16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>, DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 8, 8, 16, 16, 8, 8, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 2, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, - // AGPR Spill - // DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 8, 8, 16, 16, 8, 8, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 2, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>, - // AGPR Spill when use permuted lds layout. so, use padding for these two. 
+ DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 4, 4, 16, 16, 8, 8, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, 1, 2, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 2, 2, 16, 16, 8, 8, S<16,16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, 0, S<16,16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, 0, 1, 2, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 224, 256, 64, 8, 8, 16, 16, 7, 8, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 2, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 224, 64, 8, 8, 16, 16, 8, 7, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 2, 1, S<1, 64, 1, 4>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 8, 8, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, @@ -68,15 +75,23 @@ using device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_instances = std //#########################| | | | | | | | | Operation| Operation| Operation| | | | | | | | | | Wave| Wave| 
Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| Scheduler| Verision| //#########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - // Latency friendly + // Latency friendly DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 16, 64, 8, 8, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 16, 64, 4, 4, 16, 16, 1, 1, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 16, 64, 2, 2, 16, 16, 1, 1, S<32, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, 0, S<32, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 128, 8, 8, 16, 16, 1, 1, S<16, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 64, 8, 8, 16, 16, 1, 1, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 4>, 4, 
BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 64, 4, 4, 16, 16, 1, 1, S<16, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, S<16, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 64, 2, 2, 16, 16, 1, 1, S<32, 2, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, 0, S<32, 2, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 32, 64, 8, 8, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, // Memory friendly DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 32, 64, 8, 8, 32, 32, 2, 1, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 32, 64, 4, 4, 32, 32, 2, 1, S<16,16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, S<16,16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, 1, 1, S<1, 32, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 32, 64, 2, 2, 32, 32, 2, 1, S<32, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, 0, S<32, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 
2, 2, 0, 1, 1, S<1, 32, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 16, 64, 8, 8, 16, 16, 4, 1, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 128, 32, 64, 8, 8, 32, 32, 2, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 128, 32, 64, 4, 4, 32, 32, 2, 1, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 128, 32, 64, 2, 2, 32, 32, 2, 1, S<32, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, 0, S<32, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 128, 16, 64, 8, 8, 16, 16, 4, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 64, 32, 64, 8, 8, 32, 32, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, 
S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 64, 16, 64, 8, 8, 16, 16, 2, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, @@ -84,12 +99,16 @@ using device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_instances = std DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 128, 8, 8, 16, 16, 1, 1, S<16, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 64, 8, 8, 16, 16, 1, 1, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 32, 64, 8, 8, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 32, 64, 4, 4, 16, 16, 1, 1, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 
16, 32, 64, 2, 2, 16, 16, 1, 1, S<32, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, 0, S<32, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 64, 64, 8, 8, 16, 16, 1, 2, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 64, 64, 8, 8, 32, 32, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 128, 64, 8, 8, 16, 16, 1, 4, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 128, 64, 8, 8, 32, 32, 1, 2, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 16, 256, 64, 8, 8, 16, 16, 1, 4, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 16>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, 
GemmSpec, 256, 32, 256, 64, 8, 8, 32, 32, 1, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 16>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v2> + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 32, 256, 64, 8, 8, 32, 32, 1, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 16>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 32, 256, 64, 4, 4, 32, 32, 1, 2, S<16,16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, S<16,16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, 1, 1, S<1, 16, 1, 16>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 32, 256, 64, 2, 2, 32, 32, 1, 2, S<32, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, 0, S<32, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, 0, 1, 1, S<1, 16, 1, 16>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v2> // clang-format on >; } // namespace instance -- GitLab From f6c4d614e35b7424774160a23d8e8bef3b15faad Mon Sep 17 00:00:00 2001 From: aledudek Date: Wed, 18 Dec 2024 09:45:58 +0100 Subject: [PATCH 132/153] [CK_TILE] Move hipmalloc/memcpy calls out of gpu reference gemm (#1743) * [CK_TILE] Move hipmalloc/memcpy calls out of gpu reference gemm * [CK_TILE] Move hipmalloc/memcpy calls out of gpu reference gemm - review changes * [CK_TILE] Move hipmalloc/memcpy calls out of gpu reference gemm - review fix --- example/ck_tile/03_gemm/run_gemm_example.inc | 29 +++- .../run_batched_gemm_example.inc | 33 +++- .../ck_tile/host/reference/reference_gemm.hpp | 162 ++---------------- 3 files changed, 68 insertions(+), 156 deletions(-) diff --git 
a/example/ck_tile/03_gemm/run_gemm_example.inc b/example/ck_tile/03_gemm/run_gemm_example.inc index a1fc15577..2b7a967ba 100644 --- a/example/ck_tile/03_gemm/run_gemm_example.inc +++ b/example/ck_tile/03_gemm/run_gemm_example.inc @@ -161,14 +161,39 @@ int run_gemm_example_with_layouts(int argc, c_m_n_gpu_ref.SetZero(); c_m_n_gpu_buf_ref.SetZero(); + ADataType* d_A; + BDataType* d_B; + CDataType* d_C; + + ck_tile::hip_check_error(hipMalloc(&d_A, M * K * sizeof(ADataType))); + ck_tile::hip_check_error(hipMalloc(&d_B, N * K * sizeof(BDataType))); + ck_tile::hip_check_error(hipMalloc(&d_C, M * N * sizeof(CDataType))); + + ck_tile::hip_check_error(hipMemcpy(d_A, + a_m_k_dev_buf.GetDeviceBuffer(), + M * K * sizeof(ADataType), + hipMemcpyHostToDevice)); + ck_tile::hip_check_error(hipMemcpy(d_B, + b_k_n_dev_buf.GetDeviceBuffer(), + N * K * sizeof(BDataType), + hipMemcpyHostToDevice)); + ck_tile::reference_gemm_gpu( - a_m_k_dev_buf, b_k_n_dev_buf, c_m_n_gpu_buf_ref, M, N, K, stride_A, stride_B, stride_C); + CLayout>(d_A, d_B, d_C, M, N, K, stride_A, stride_B, stride_C); + + ck_tile::hip_check_error(hipMemcpy(c_m_n_gpu_buf_ref.GetDeviceBuffer(), + d_C, + M * N * sizeof(CDataType), + hipMemcpyDeviceToHost)); + + ck_tile::hip_check_error(hipFree(d_A)); + ck_tile::hip_check_error(hipFree(d_B)); + ck_tile::hip_check_error(hipFree(d_C)); c_m_n_gpu_buf_ref.FromDevice(c_m_n_gpu_ref.data()); pass = ck_tile::check_err(c_m_n_dev_result, c_m_n_gpu_ref); diff --git a/example/ck_tile/16_batched_gemm/run_batched_gemm_example.inc b/example/ck_tile/16_batched_gemm/run_batched_gemm_example.inc index dacca2042..8345eef95 100644 --- a/example/ck_tile/16_batched_gemm/run_batched_gemm_example.inc +++ b/example/ck_tile/16_batched_gemm/run_batched_gemm_example.inc @@ -188,15 +188,33 @@ int run_batched_gemm_example_with_layouts(int argc, c_m_n_gpu_ref.SetZero(); c_m_n_gpu_buf_ref.SetZero(); + ADataType* d_A; + BDataType* d_B; + CDataType* d_C; + + ck_tile::hip_check_error(hipMalloc(&d_A, 
batch_count * M * K * sizeof(ADataType))); + ck_tile::hip_check_error(hipMalloc(&d_B, batch_count * N * K * sizeof(BDataType))); + ck_tile::hip_check_error(hipMalloc(&d_C, batch_count * M * N * sizeof(CDataType))); + + ck_tile::hip_check_error(hipMemcpy(d_A, + a_m_k_dev_buf.GetDeviceBuffer(), + batch_count * M * K * sizeof(ADataType), + hipMemcpyHostToDevice)); + + ck_tile::hip_check_error(hipMemcpy(d_B, + b_k_n_dev_buf.GetDeviceBuffer(), + batch_count * N * K * sizeof(BDataType), + hipMemcpyHostToDevice)); + ck_tile::reference_batched_gemm_gpu(a_m_k_dev_buf, - b_k_n_dev_buf, - c_m_n_gpu_buf_ref, + CLayout>(d_A, + d_B, + d_C, M, N, K, @@ -208,6 +226,15 @@ int run_batched_gemm_example_with_layouts(int argc, batch_stride_C, batch_count); + ck_tile::hip_check_error(hipMemcpy(c_m_n_gpu_buf_ref.GetDeviceBuffer(), + d_C, + batch_count * M * N * sizeof(CDataType), + hipMemcpyDeviceToHost)); + + ck_tile::hip_check_error(hipFree(d_A)); + ck_tile::hip_check_error(hipFree(d_B)); + ck_tile::hip_check_error(hipFree(d_C)); + c_m_n_gpu_buf_ref.FromDevice(c_m_n_gpu_ref.data()); pass = ck_tile::check_err(c_m_n_dev_result, c_m_n_gpu_ref); diff --git a/include/ck_tile/host/reference/reference_gemm.hpp b/include/ck_tile/host/reference/reference_gemm.hpp index 8bd1f5b04..fc412e883 100644 --- a/include/ck_tile/host/reference/reference_gemm.hpp +++ b/include/ck_tile/host/reference/reference_gemm.hpp @@ -97,9 +97,9 @@ template -void reference_gemm_gpu(DeviceMem& a_device, - DeviceMem& b_device, - DeviceMem& c_device, +void reference_gemm_gpu(ADataType* a_ptr, + BDataType* b_ptr, + CDataType* c_ptr, index_t M, index_t N, index_t K, @@ -107,79 +107,13 @@ void reference_gemm_gpu(DeviceMem& a_device, index_t stride_b, index_t stride_c) { - - ADataType* d_A; - BDataType* d_B; - CDataType* d_C; - - hipError_t errA = hipMalloc(&d_A, M * K * sizeof(ADataType)); - hipError_t errB = hipMalloc(&d_B, N * K * sizeof(BDataType)); - hipError_t errC = hipMalloc(&d_C, M * N * sizeof(CDataType)); - if(errA 
!= hipSuccess) - { - std::cerr << "Error allocating device memory for A: " << hipGetErrorString(errA) - << std::endl; - return; // Early exit on error - } - - if(errB != hipSuccess) - { - std::cerr << "Error allocating device memory for B: " << hipGetErrorString(errB) - << std::endl; - return; // Early exit on error - } - - if(errC != hipSuccess) - { - std::cerr << "Error allocating device memory for C: " << hipGetErrorString(errC) - << std::endl; - return; // Early exit on error - } - - errA = hipMemcpy( - d_A, a_device.GetDeviceBuffer(), M * K * sizeof(ADataType), hipMemcpyHostToDevice); - if(errA != hipSuccess) - { - std::cerr << "Error copying A to device: " << hipGetErrorString(errA) << std::endl; - } - - errB = hipMemcpy( - d_B, b_device.GetDeviceBuffer(), N * K * sizeof(BDataType), hipMemcpyHostToDevice); - if(errB != hipSuccess) - { - std::cerr << "Error copying B to device: " << hipGetErrorString(errB) << std::endl; - } - int totalElements = M * N; int numThreadsPerBlock = 256; // Common choice for threads per block int numBlocks = (totalElements + numThreadsPerBlock - 1) / numThreadsPerBlock; naive_gemm_kernel - <<>>(d_A, d_B, d_C, M, N, K, stride_a, stride_b, stride_c); - errC = hipMemcpy( - c_device.GetDeviceBuffer(), d_C, M * N * sizeof(CDataType), hipMemcpyDeviceToHost); - if(errC != hipSuccess) - { - std::cerr << "Error copying C to device: " << hipGetErrorString(errC) << std::endl; - } - - errA = hipFree(d_A); - if(errA != hipSuccess) - { - std::cerr << "Error free the A memory: " << hipGetErrorString(errA) << std::endl; - } - - errB = hipFree(d_B); - if(errB != hipSuccess) - { - std::cerr << "Error free the B memory: " << hipGetErrorString(errB) << std::endl; - } - - errC = hipFree(d_C); - if(errC != hipSuccess) - { - std::cerr << "Error free the C memory: " << hipGetErrorString(errC) << std::endl; - } + <<>>( + a_ptr, b_ptr, c_ptr, M, N, K, stride_a, stride_b, stride_c); return; } @@ -191,9 +125,9 @@ template -void 
reference_batched_gemm_gpu(DeviceMem& a_device, - DeviceMem& b_device, - DeviceMem& c_device, +void reference_batched_gemm_gpu(ADataType* a_ptr, + BDataType* b_ptr, + CDataType* c_ptr, index_t M, index_t N, index_t K, @@ -205,94 +139,20 @@ void reference_batched_gemm_gpu(DeviceMem& a_device, index_t batch_stride_C, index_t batch_count) { - - ADataType* d_A; - BDataType* d_B; - CDataType* d_C; - - hipError_t errA = hipMalloc(&d_A, batch_count * M * K * sizeof(ADataType)); - hipError_t errB = hipMalloc(&d_B, batch_count * N * K * sizeof(BDataType)); - hipError_t errC = hipMalloc(&d_C, batch_count * M * N * sizeof(CDataType)); - if(errA != hipSuccess) - { - std::cerr << "Error allocating device memory for A: " << hipGetErrorString(errA) - << std::endl; - return; // Early exit on error - } - - if(errB != hipSuccess) - { - std::cerr << "Error allocating device memory for B: " << hipGetErrorString(errB) - << std::endl; - return; // Early exit on error - } - - if(errC != hipSuccess) - { - std::cerr << "Error allocating device memory for C: " << hipGetErrorString(errC) - << std::endl; - return; // Early exit on error - } - - errA = hipMemcpy(d_A, - a_device.GetDeviceBuffer(), - batch_count * M * K * sizeof(ADataType), - hipMemcpyHostToDevice); - if(errA != hipSuccess) - { - std::cerr << "Error copying A to device: " << hipGetErrorString(errA) << std::endl; - } - - errB = hipMemcpy(d_B, - b_device.GetDeviceBuffer(), - batch_count * N * K * sizeof(BDataType), - hipMemcpyHostToDevice); - if(errB != hipSuccess) - { - std::cerr << "Error copying B to device: " << hipGetErrorString(errB) << std::endl; - } - int totalElements = M * N; int numThreadsPerBlock = 256; // Common choice for threads per block int numBlocks = (totalElements + numThreadsPerBlock - 1) / numThreadsPerBlock; for(index_t batch_id = 0; batch_id < batch_count; ++batch_id) { - ADataType* d_ATemp = d_A + batch_id * batch_stride_A; - BDataType* d_BTemp = d_B + batch_id * batch_stride_B; - CDataType* d_CTemp = d_C 
+ batch_id * batch_stride_C; + ADataType* d_ATemp = a_ptr + batch_id * batch_stride_A; + BDataType* d_BTemp = b_ptr + batch_id * batch_stride_B; + CDataType* d_CTemp = c_ptr + batch_id * batch_stride_C; naive_gemm_kernel <<>>( d_ATemp, d_BTemp, d_CTemp, M, N, K, stride_a, stride_b, stride_c); } - errC = hipMemcpy(c_device.GetDeviceBuffer(), - d_C, - batch_count * M * N * sizeof(CDataType), - hipMemcpyDeviceToHost); - if(errC != hipSuccess) - { - std::cerr << "Error copying C to device: " << hipGetErrorString(errC) << std::endl; - } - - errA = hipFree(d_A); - if(errA != hipSuccess) - { - std::cerr << "Error free the A memory: " << hipGetErrorString(errA) << std::endl; - } - - errB = hipFree(d_B); - if(errB != hipSuccess) - { - std::cerr << "Error free the B memory: " << hipGetErrorString(errB) << std::endl; - } - - errC = hipFree(d_C); - if(errC != hipSuccess) - { - std::cerr << "Error free the C memory: " << hipGetErrorString(errC) << std::endl; - } - return; } } // namespace ck_tile -- GitLab From 1c1b336371e2367fece6b33644b36ab30d92b2d3 Mon Sep 17 00:00:00 2001 From: Xiaodong Wang Date: Wed, 18 Dec 2024 02:32:38 -0800 Subject: [PATCH 133/153] Disambiguate bit_cast (#1749) Adding namespace to disambiguate with std::bit_cast Co-authored-by: Po Yen Chen --- include/ck_tile/core/container/meta_data_buffer.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/ck_tile/core/container/meta_data_buffer.hpp b/include/ck_tile/core/container/meta_data_buffer.hpp index 7493b93d8..eba60fac7 100644 --- a/include/ck_tile/core/container/meta_data_buffer.hpp +++ b/include/ck_tile/core/container/meta_data_buffer.hpp @@ -30,7 +30,7 @@ struct meta_data_buffer { constexpr index_t size = sizeof(T); - auto tmp = bit_cast>(data); + auto tmp = ck_tile::bit_cast>(data); for(int i = 0; i < size; i++) { @@ -66,7 +66,7 @@ struct meta_data_buffer pos++; } - data = bit_cast(tmp); + data = ck_tile::bit_cast(tmp); } return data; @@ -86,7 +86,7 @@ struct 
meta_data_buffer pos++; } - auto data = bit_cast(tmp); + auto data = ck_tile::bit_cast(tmp); return data; } -- GitLab From 453ca373479e1c3510bff66c03a773a29f1caada Mon Sep 17 00:00:00 2001 From: aledudek Date: Wed, 18 Dec 2024 17:52:46 +0100 Subject: [PATCH 134/153] [CK TILE] Refactor GemmKernel to be reused by other GEMM related operators (#1730) * Gemm Kernel Refactor part1 * Gemm Kernel Refactor common gemm pipeline part2 * [CK TILE] Refactor batched gemm to reuse GemmKernel * [CK TILE] Refactor GemmKernel - review changes part1 * [CK TILE] Refactor GemmKernel - references fix * [CK TILE] Refactor GemmKernel - naming changes, add problem * [CK_TILE] Refactor GemmKernel - update tests * [CK_TILE] Refactor GemmKernel - review changes * [CK_TILE] Refactor GemmKernel - update test * [CK_TILE] Refactor GemmKernel - constness fixes * [CK_TILE] Refactor GemmKernel - update tests --- example/ck_tile/03_gemm/gemm_basic.cpp | 16 +- example/ck_tile/03_gemm/gemm_basic.hpp | 16 +- example/ck_tile/03_gemm/run_gemm_example.inc | 10 +- .../ck_tile/16_batched_gemm/batched_gemm.cpp | 6 +- .../ck_tile/16_batched_gemm/batched_gemm.hpp | 6 +- .../run_batched_gemm_example.inc | 2 +- .../ops/gemm/kernel/batched_gemm_kernel.hpp | 274 +++++------------- .../ck_tile/ops/gemm/kernel/gemm_kernel.hpp | 259 ++++++++++++----- .../batched_gemm/test_batched_gemm_util.hpp | 42 ++- test/ck_tile/gemm/test_gemm_pipeline_util.hpp | 40 +-- 10 files changed, 300 insertions(+), 371 deletions(-) diff --git a/example/ck_tile/03_gemm/gemm_basic.cpp b/example/ck_tile/03_gemm/gemm_basic.cpp index f5260c306..4c630375f 100644 --- a/example/ck_tile/03_gemm/gemm_basic.cpp +++ b/example/ck_tile/03_gemm/gemm_basic.cpp @@ -15,7 +15,7 @@ #include "gemm_basic.hpp" template -float gemm_calc(const gemm_basic_args& args, const ck_tile::stream_config& s) +float gemm_calc(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& s) { // The kPadM, kPadN, kPadK & kBlockPerCu should also come from the Codegen part. 
constexpr bool kPadM = false; @@ -79,17 +79,9 @@ float gemm_calc(const gemm_basic_args& args, const ck_tile::stream_config& s) // Now we only use the BlockGemmASmemBSmemCRegV1DefaultPolicy. using Kernel = ck_tile::GemmKernel; - auto kargs = Kernel::MakeKargs(args.p_a, - args.p_b, - args.p_c, - args.M, - args.N, - args.K, - args.stride_A, - args.stride_B, - args.stride_C); - - const dim3 grids = Kernel::GridSize(args.M, args.N, args.kbatch); + auto kargs = Kernel::MakeKernelArgs(args); + + const dim3 grids = Kernel::GridSize(args.M, args.N, args.k_batch); constexpr dim3 blocks = Kernel::BlockSize(); if(!Kernel::IsSupportedArgument(kargs)) diff --git a/example/ck_tile/03_gemm/gemm_basic.hpp b/example/ck_tile/03_gemm/gemm_basic.hpp index 23e99bc2a..58cdaea7d 100644 --- a/example/ck_tile/03_gemm/gemm_basic.hpp +++ b/example/ck_tile/03_gemm/gemm_basic.hpp @@ -51,20 +51,6 @@ using BDataType = Types::BDataType; using AccDataType = Types::AccDataType; using CDataType = Types::CDataType; -struct gemm_basic_args -{ - const void* p_a; - const void* p_b; - void* p_c; - ck_tile::index_t kbatch; - ck_tile::index_t M; - ck_tile::index_t N; - ck_tile::index_t K; - ck_tile::index_t stride_A; - ck_tile::index_t stride_B; - ck_tile::index_t stride_C; -}; - auto create_args(int argc, char* argv[]) { ck_tile::ArgParser arg_parser; @@ -89,4 +75,4 @@ auto create_args(int argc, char* argv[]) } // host API -float gemm_calc(gemm_basic_args args, const ck_tile::stream_config& s); +float gemm_calc(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& s); diff --git a/example/ck_tile/03_gemm/run_gemm_example.inc b/example/ck_tile/03_gemm/run_gemm_example.inc index 2b7a967ba..68df389bf 100644 --- a/example/ck_tile/03_gemm/run_gemm_example.inc +++ b/example/ck_tile/03_gemm/run_gemm_example.inc @@ -16,11 +16,11 @@ float invoke_gemm(ck_tile::DeviceMem& a_m_k_dev_buf, int n_warmup, int n_repeat) { - gemm_basic_args args; - args.p_a = a_m_k_dev_buf.GetDeviceBuffer(); - args.p_b = 
b_k_n_dev_buf.GetDeviceBuffer(); - args.p_c = c_m_n_dev_buf.GetDeviceBuffer(); - args.kbatch = kbatch; + ck_tile::GemmHostArgs args; + args.a_ptr = a_m_k_dev_buf.GetDeviceBuffer(); + args.b_ptr = b_k_n_dev_buf.GetDeviceBuffer(); + args.c_ptr = c_m_n_dev_buf.GetDeviceBuffer(); + args.k_batch = kbatch; args.M = M; args.N = N; args.K = K; diff --git a/example/ck_tile/16_batched_gemm/batched_gemm.cpp b/example/ck_tile/16_batched_gemm/batched_gemm.cpp index bfdd74126..9b4ed9a9e 100644 --- a/example/ck_tile/16_batched_gemm/batched_gemm.cpp +++ b/example/ck_tile/16_batched_gemm/batched_gemm.cpp @@ -16,7 +16,7 @@ #include "batched_gemm.hpp" template -float batched_gemm(const batched_gemm_kargs& args, const ck_tile::stream_config& s) +float batched_gemm(const ck_tile::BatchedGemmHostArgs& args, const ck_tile::stream_config& s) { // The kPadM, kPadN, kPadK & kBlockPerCu should also come from the Codegen part. constexpr bool kPadM = false; @@ -79,9 +79,9 @@ float batched_gemm(const batched_gemm_kargs& args, const ck_tile::stream_config& // Now we only use the BlockGemmASmemBSmemCRegV1DefaultPolicy. 
using Kernel = ck_tile::BatchedGemmKernel; - auto kargs = Kernel::MakeKargs(args); + auto kargs = Kernel::MakeKernelArgs(args); - const dim3 grids = Kernel::GridSize(args); + const dim3 grids = Kernel::GridSize(args.M, args.N, args.batch_count); constexpr dim3 blocks = Kernel::BlockSize(); if(s.log_level_ > 0) diff --git a/example/ck_tile/16_batched_gemm/batched_gemm.hpp b/example/ck_tile/16_batched_gemm/batched_gemm.hpp index e252c0f67..f0c0c9efb 100644 --- a/example/ck_tile/16_batched_gemm/batched_gemm.hpp +++ b/example/ck_tile/16_batched_gemm/batched_gemm.hpp @@ -29,10 +29,6 @@ using BDataType = Types::BDataType; using AccDataType = Types::AccDataType; using CDataType = Types::CDataType; -struct batched_gemm_kargs : public ck_tile::BatchedGemmHostArgs -{ -}; - auto create_args(int argc, char* argv[]) { ck_tile::ArgParser arg_parser; @@ -60,4 +56,4 @@ auto create_args(int argc, char* argv[]) } // host API -float batched_gemm(batched_gemm_kargs args, const ck_tile::stream_config& s); +float batched_gemm(const ck_tile::BatchedGemmHostArgs& args, const ck_tile::stream_config& s); diff --git a/example/ck_tile/16_batched_gemm/run_batched_gemm_example.inc b/example/ck_tile/16_batched_gemm/run_batched_gemm_example.inc index 8345eef95..4e7218b5b 100644 --- a/example/ck_tile/16_batched_gemm/run_batched_gemm_example.inc +++ b/example/ck_tile/16_batched_gemm/run_batched_gemm_example.inc @@ -20,7 +20,7 @@ float invoke_batched_gemm(ck_tile::DeviceMem& a_m_k_dev_buf, int n_warmup, int n_repeat) { - batched_gemm_kargs args; + ck_tile::BatchedGemmHostArgs args; args.a_ptr = a_m_k_dev_buf.GetDeviceBuffer(); args.b_ptr = b_k_n_dev_buf.GetDeviceBuffer(); args.c_ptr = c_m_n_dev_buf.GetDeviceBuffer(); diff --git a/include/ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp b/include/ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp index 07b4af573..07a4cf8fb 100644 --- a/include/ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp +++ b/include/ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp @@ 
-3,90 +3,93 @@ #pragma once -#include -#include - -#include "ck_tile/core.hpp" -#include "ck_tile/ops/common.hpp" +#include "ck_tile/ops/gemm/kernel/gemm_kernel.hpp" namespace ck_tile { -struct BatchedGemmHostArgs +struct BatchedGemmHostArgs : public ck_tile::GemmHostArgs { - const void* a_ptr; - const void* b_ptr; - void* c_ptr; - index_t M; - index_t N; - index_t K; - index_t stride_A; - index_t stride_B; - index_t stride_C; - index_t batch_stride_A; - index_t batch_stride_B; - index_t batch_stride_C; - index_t batch_count; + CK_TILE_HOST BatchedGemmHostArgs() = default; + CK_TILE_HOST BatchedGemmHostArgs(const void* a_ptr_, + const void* b_ptr_, + void* c_ptr_, + ck_tile::index_t k_batch_, + ck_tile::index_t M_, + ck_tile::index_t N_, + ck_tile::index_t K_, + ck_tile::index_t stride_A_, + ck_tile::index_t stride_B_, + ck_tile::index_t stride_C_, + ck_tile::index_t batch_stride_A_, + ck_tile::index_t batch_stride_B_, + ck_tile::index_t batch_stride_C_, + ck_tile::index_t batch_count_) + : GemmHostArgs( + a_ptr_, b_ptr_, c_ptr_, k_batch_, M_, N_, K_, stride_A_, stride_B_, stride_C_), + batch_stride_A(batch_stride_A_), + batch_stride_B(batch_stride_B_), + batch_stride_C(batch_stride_C_), + batch_count(batch_count_) + { + } + + ck_tile::index_t batch_stride_A; + ck_tile::index_t batch_stride_B; + ck_tile::index_t batch_stride_C; + ck_tile::index_t batch_count; }; template -struct BatchedGemmKernel +struct BatchedGemmKernel : public GemmKernel { - using TilePartitioner = remove_cvref_t; - using GemmPipeline = remove_cvref_t; - using EpiloguePipeline = remove_cvref_t; - using ALayout = remove_cvref_t; - using BLayout = remove_cvref_t; - using CLayout = remove_cvref_t; - static constexpr index_t KernelBlockSize = GemmPipeline::BlockSize; + using Base = GemmKernel; - using ADataType = remove_cvref_t; - using BDataType = remove_cvref_t; - using CDataType = remove_cvref_t; + using GemmKernelArgs = typename Base::GemmKernelArgs; - struct BatchedGemmKargs + using ADataType 
= typename Base::ADataType; + using BDataType = typename Base::BDataType; + using CDataType = typename Base::CDataType; + + using TilePartitioner = typename Base::TilePartitioner; + using GemmPipeline = typename Base::GemmPipeline; + using EpiloguePipeline = typename Base::EpiloguePipeline; + using ALayout = typename Base::ALayout; + using BLayout = typename Base::BLayout; + using CLayout = typename Base::CLayout; + + struct BatchedGemmKernelArgs : GemmKernelArgs { - const void* a_ptr; - const void* b_ptr; - void* c_ptr; - index_t M; - index_t N; - index_t K; - index_t stride_A; - index_t stride_B; - index_t stride_C; index_t batch_stride_A; index_t batch_stride_B; index_t batch_stride_C; index_t batch_count; }; - using Kargs = BatchedGemmKargs; - using Hargs = BatchedGemmHostArgs; + using KernelArgs = BatchedGemmKernelArgs; - __host__ static constexpr auto GridSize(const Hargs& h) + __host__ static constexpr auto GridSize(index_t M, index_t N, index_t batch_count) { - return TilePartitioner::GridSize(h.M, h.N, h.batch_count); + return TilePartitioner::GridSize(M, N, batch_count); } - __host__ static constexpr auto BlockSize() { return dim3(KernelBlockSize); } + __host__ static constexpr auto BlockSize() { return dim3(Base::KernelBlockSize); } - CK_TILE_HOST static constexpr BatchedGemmKargs MakeKargs(const Hargs& h) + CK_TILE_HOST static constexpr BatchedGemmKernelArgs + MakeKernelArgs(const BatchedGemmHostArgs& hostArgs) { - Kargs k; - k.a_ptr = h.a_ptr; - k.b_ptr = h.b_ptr; - k.c_ptr = h.c_ptr; - k.M = h.M; - k.N = h.N; - k.K = h.K; - k.stride_A = h.stride_A; - k.stride_B = h.stride_B; - k.stride_C = h.stride_C; - k.batch_stride_A = h.batch_stride_A; - k.batch_stride_B = h.batch_stride_B; - k.batch_stride_C = h.batch_stride_C; - k.batch_count = h.batch_count; - return k; + return BatchedGemmKernelArgs{{hostArgs.a_ptr, + hostArgs.b_ptr, + hostArgs.c_ptr, + hostArgs.M, + hostArgs.N, + hostArgs.K, + hostArgs.stride_A, + hostArgs.stride_B, + hostArgs.stride_C}, + 
hostArgs.batch_stride_A, + hostArgs.batch_stride_B, + hostArgs.batch_stride_C, + hostArgs.batch_count}; } CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize() @@ -94,7 +97,7 @@ struct BatchedGemmKernel return max(GemmPipeline::GetSmemSize(), EpiloguePipeline::GetSmemSize()); } - CK_TILE_DEVICE void operator()(Kargs kargs) const + CK_TILE_DEVICE void operator()(BatchedGemmKernelArgs kargs) const { const auto [i_m, i_n] = TilePartitioner{}(); const auto i_batch = __builtin_amdgcn_readfirstlane(blockIdx.z); @@ -102,156 +105,17 @@ struct BatchedGemmKernel // options const auto batch_stride_A = __builtin_amdgcn_readfirstlane(kargs.batch_stride_A); const auto batch_offset_A = __builtin_amdgcn_readfirstlane(i_batch * batch_stride_A); - const ADataType* a_start = static_cast(kargs.a_ptr); + const ADataType* a_ptr = static_cast(kargs.a_ptr) + batch_offset_A; const auto batch_stride_B = __builtin_amdgcn_readfirstlane(kargs.batch_stride_B); const auto batch_offset_B = __builtin_amdgcn_readfirstlane(i_batch * batch_stride_B); - const BDataType* b_start = static_cast(kargs.b_ptr); - - // Convert pointers to tensor views - auto a_tensor_view = [&]() { - if constexpr(std::is_same_v) - { - return make_naive_tensor_view( - a_start + batch_offset_A, - make_tuple(kargs.M, kargs.K), - make_tuple(kargs.stride_A, 1), - number{}, - number<1>{}); - } - else - { - return make_naive_tensor_view( - a_start + batch_offset_A, - make_tuple(kargs.M, kargs.K), - make_tuple(1, kargs.stride_A), - number<1>{}, - number<1>{}); - } - }(); - - auto b_tensor_view = [&]() { - if constexpr(std::is_same_v) - { - return make_naive_tensor_view( - b_start + batch_offset_B, - make_tuple(kargs.N, kargs.K), - make_tuple(1, kargs.stride_B), - number<1>{}, - number<1>{}); - } - else - { - return make_naive_tensor_view( - b_start + batch_offset_B, - make_tuple(kargs.N, kargs.K), - make_tuple(kargs.stride_B, 1), - number{}, - number<1>{}); - } - }(); - - auto a_pad_view = [&]() { - if constexpr(std::is_same_v) 
- { - return pad_tensor_view( - a_tensor_view, - make_tuple(number{}, number{}), - sequence{}); - } - else - { - return pad_tensor_view( - a_tensor_view, - make_tuple(number{}, number{}), - sequence{}); - } - }(); - // clang-format on - - auto a_block_window = make_tile_window( - a_pad_view, - make_tuple(number{}, number{}), - {i_m, 0}); - - auto b_pad_view = [&]() { - if constexpr(std::is_same_v) - { - return pad_tensor_view( - b_tensor_view, - make_tuple(number{}, number{}), - sequence{}); - } - else - { - return pad_tensor_view( - b_tensor_view, - make_tuple(number{}, number{}), - sequence{}); - } - }(); - // clang-format on - - auto b_block_window = make_tile_window( - b_pad_view, - make_tuple(number{}, number{}), - {i_n, 0}); - - // allocate LDS - __shared__ char smem_ptr[GetSmemSize()]; - - const index_t num_loop = TilePartitioner::GetLoopNum(kargs.K); - - // Run GEMM cooperatively by whole wokrgroup. - auto c_block_tile = - GemmPipeline{}.template operator()(a_block_window, b_block_window, num_loop, smem_ptr); + const BDataType* b_ptr = static_cast(kargs.b_ptr) + batch_offset_B; const auto batch_stride_C = __builtin_amdgcn_readfirstlane(kargs.batch_stride_C); const auto batch_offset_C = __builtin_amdgcn_readfirstlane(i_batch * batch_stride_C); - CDataType* c_start = static_cast(kargs.c_ptr); - auto c_tensor_view = [&]() { - if constexpr(std::is_same_v) - { - return make_naive_tensor_view( - c_start + batch_offset_C, - make_tuple(kargs.M, kargs.N), - make_tuple(kargs.stride_C, 1), - number{}, - number<1>{}); - } - else - { - return make_naive_tensor_view( - c_start + batch_offset_C, - make_tuple(kargs.M, kargs.N), - make_tuple(1, kargs.stride_C), - number<1>{}, - number<1>{}); - } - }(); - - auto c_pad_view = [&]() { - if constexpr(std::is_same_v) - { - return pad_tensor_view( - c_tensor_view, - make_tuple(number{}, number{}), - sequence{}); - } - else - { - return pad_tensor_view( - c_tensor_view, - make_tuple(number{}, number{}), - sequence{}); - } - }(); - 
auto c_block_window = make_tile_window( - c_pad_view, - make_tuple(number{}, number{}), - {i_m, i_n}); + CDataType* c_ptr = static_cast(kargs.c_ptr) + batch_offset_C; - EpiloguePipeline{}(c_block_window, c_block_tile); + this->RunGemm(a_ptr, b_ptr, c_ptr, kargs, i_m, i_n); } }; diff --git a/include/ck_tile/ops/gemm/kernel/gemm_kernel.hpp b/include/ck_tile/ops/gemm/kernel/gemm_kernel.hpp index 763d8cad9..925648a88 100644 --- a/include/ck_tile/ops/gemm/kernel/gemm_kernel.hpp +++ b/include/ck_tile/ops/gemm/kernel/gemm_kernel.hpp @@ -12,6 +12,50 @@ namespace ck_tile { +struct GemmProblem +{ + CK_TILE_HOST GemmProblem() = default; + CK_TILE_HOST GemmProblem( + index_t M_, index_t N_, index_t K_, index_t stride_A_, index_t stride_B_, index_t stride_C_) + : M(M_), N(N_), K(K_), stride_A(stride_A_), stride_B(stride_B_), stride_C(stride_C_) + { + } + + index_t M; + index_t N; + index_t K; + index_t stride_A; + index_t stride_B; + index_t stride_C; +}; + +struct GemmHostArgs : public GemmProblem +{ + CK_TILE_HOST GemmHostArgs() = default; + CK_TILE_HOST GemmHostArgs(const void* a_ptr_, + const void* b_ptr_, + void* c_ptr_, + index_t k_batch_, + index_t M_, + index_t N_, + index_t K_, + index_t stride_A_, + index_t stride_B_, + index_t stride_C_) + : GemmProblem(M_, N_, K_, stride_A_, stride_B_, stride_C_), + a_ptr(a_ptr_), + b_ptr(b_ptr_), + c_ptr(c_ptr_), + k_batch(k_batch_) + { + } + + const void* a_ptr; + const void* b_ptr; + void* c_ptr; + index_t k_batch; +}; + template struct GemmKernel { @@ -25,9 +69,12 @@ struct GemmKernel using ADataType = remove_cvref_t; using BDataType = remove_cvref_t; - // using CAccDataType = remove_cvref_t; using CDataType = remove_cvref_t; + static constexpr auto I0 = number<0>(); + static constexpr auto I1 = number<1>(); + static constexpr auto I2 = number<2>(); + __host__ static constexpr auto GridSize(index_t M, index_t N, index_t KBatch) { return TilePartitioner::GridSize(M, N, KBatch); @@ -35,7 +82,7 @@ struct GemmKernel __host__ static 
constexpr auto BlockSize() { return dim3(KernelBlockSize); } - struct GemmCommonKargs + struct GemmKernelArgs { const void* a_ptr; const void* b_ptr; @@ -48,25 +95,37 @@ struct GemmKernel index_t stride_C; }; - CK_TILE_HOST static constexpr GemmCommonKargs MakeKargs(const void* a_ptr, - const void* b_ptr, - void* c_ptr, - index_t M, - index_t N, - index_t K, - index_t stride_A, - index_t stride_B, - index_t stride_C) + CK_TILE_HOST static constexpr GemmKernelArgs MakeKernelArgs(const GemmHostArgs& hostArgs) { - return GemmCommonKargs{a_ptr, b_ptr, c_ptr, M, N, K, stride_A, stride_B, stride_C}; + return GemmKernelArgs{hostArgs.a_ptr, + hostArgs.b_ptr, + hostArgs.c_ptr, + hostArgs.M, + hostArgs.N, + hostArgs.K, + hostArgs.stride_A, + hostArgs.stride_B, + hostArgs.stride_C}; } + // CK_TILE_HOST static constexpr GemmKernelArgs MakeKernelArgs(const void* a_ptr, + // const void* b_ptr, + // void* c_ptr, + // index_t M, + // index_t N, + // index_t K, + // index_t stride_A, + // index_t stride_B, + // index_t stride_C) + // { + // return GemmKernelArgs{a_ptr, b_ptr, c_ptr, M, N, K, stride_A, stride_B, stride_C}; + // } CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize() { return max(GemmPipeline::GetSmemSize(), EpiloguePipeline::GetSmemSize()); } - CK_TILE_HOST static bool IsSupportedArgument(const GemmCommonKargs& kargs) + CK_TILE_HOST static bool IsSupportedArgument(const GemmKernelArgs& kargs) { if constexpr(std::is_same_v) { @@ -139,18 +198,16 @@ struct GemmKernel return true; } - CK_TILE_DEVICE void operator()(GemmCommonKargs kargs) const + CK_TILE_DEVICE auto MakeGemmTensorViews(const ADataType* a_ptr, + const BDataType* b_ptr, + CDataType* c_ptr, + const GemmKernelArgs& kargs) const { - const auto [i_m, i_n] = TilePartitioner{}(); - // options - const ADataType* a_start = static_cast(kargs.a_ptr); - const BDataType* b_start = static_cast(kargs.b_ptr); - // Convert pointers to tensor views - auto a_tensor_view = [&]() { + const auto& a_tensor_view = [&]() { 
if constexpr(std::is_same_v) { return make_naive_tensor_view( - a_start, + a_ptr, make_tuple(kargs.M, kargs.K), make_tuple(kargs.stride_A, 1), number{}, @@ -159,7 +216,7 @@ struct GemmKernel else { return make_naive_tensor_view( - a_start, + a_ptr, make_tuple(kargs.M, kargs.K), make_tuple(1, kargs.stride_A), number<1>{}, @@ -167,11 +224,11 @@ struct GemmKernel } }(); - auto b_tensor_view = [&]() { + const auto& b_tensor_view = [&]() { if constexpr(std::is_same_v) { return make_naive_tensor_view( - b_start, + b_ptr, make_tuple(kargs.N, kargs.K), make_tuple(1, kargs.stride_B), number<1>{}, @@ -180,7 +237,7 @@ struct GemmKernel else { return make_naive_tensor_view( - b_start, + b_ptr, make_tuple(kargs.N, kargs.K), make_tuple(kargs.stride_B, 1), number{}, @@ -188,7 +245,35 @@ struct GemmKernel } }(); - auto a_pad_view = [&]() { + const auto& c_tensor_view = [&]() { + if constexpr(std::is_same_v) + { + return make_naive_tensor_view( + c_ptr, + make_tuple(kargs.M, kargs.N), + make_tuple(kargs.stride_C, 1), + number{}, + number<1>{}); + } + else + { + return make_naive_tensor_view( + c_ptr, + make_tuple(kargs.M, kargs.N), + make_tuple(1, kargs.stride_C), + number<1>{}, + number<1>{}); + } + }(); + + return make_tuple(a_tensor_view, b_tensor_view, c_tensor_view); + } + + template + CK_TILE_DEVICE auto MakeGemmPadViews(const TensorView& views) const + { + const auto& a_pad_view = [&]() { + const auto& a_tensor_view = views.at(I0); if constexpr(std::is_same_v) { return pad_tensor_view( @@ -204,14 +289,9 @@ struct GemmKernel sequence{}); } }(); - // clang-format on - - auto a_block_window = make_tile_window( - a_pad_view, - make_tuple(number{}, number{}), - {i_m, 0}); - auto b_pad_view = [&]() { + const auto& b_pad_view = [&]() { + const auto& b_tensor_view = views.at(I1); if constexpr(std::is_same_v) { return pad_tensor_view( @@ -228,43 +308,8 @@ struct GemmKernel } }(); - auto b_block_window = make_tile_window( - b_pad_view, - make_tuple(number{}, number{}), - {i_n, 0}); - 
- // allocate LDS - __shared__ char smem_ptr[GetSmemSize()]; - - const index_t num_loop = TilePartitioner::GetLoopNum(kargs.K); - - // Run GEMM cooperatively by whole wokrgroup. - auto c_block_tile = - GemmPipeline{}.template operator()(a_block_window, b_block_window, num_loop, smem_ptr); - - CDataType* c_start = static_cast(kargs.c_ptr); - auto c_tensor_view = [&]() { - if constexpr(std::is_same_v) - { - return make_naive_tensor_view( - c_start, - make_tuple(kargs.M, kargs.N), - make_tuple(kargs.stride_C, 1), - number{}, - number<1>{}); - } - else - { - return make_naive_tensor_view( - c_start, - make_tuple(kargs.M, kargs.N), - make_tuple(1, kargs.stride_C), - number<1>{}, - number<1>{}); - } - }(); - - auto c_pad_view = [&]() { + const auto& c_pad_view = [&]() { + const auto& c_tensor_view = views.at(I2); if constexpr(std::is_same_v) { return pad_tensor_view( @@ -280,12 +325,82 @@ struct GemmKernel sequence{}); } }(); - auto CBlockWindow_pad = make_tile_window( + + return make_tuple(a_pad_view, b_pad_view, c_pad_view); + } + + template + CK_TILE_DEVICE auto + MakeGemmTileWindows(const PadView& views, const index_t i_m, const index_t i_n) const + { + const auto& a_pad_view = views.at(I0); + const auto& a_block_window = make_tile_window( + a_pad_view, + make_tuple(number{}, number{}), + {i_m, 0}); + + const auto& b_pad_view = views.at(I1); + const auto& b_block_window = make_tile_window( + b_pad_view, + make_tuple(number{}, number{}), + {i_n, 0}); + + const auto& c_pad_view = views.at(I2); + auto c_block_window = make_tile_window( c_pad_view, make_tuple(number{}, number{}), {i_m, i_n}); - EpiloguePipeline{}(CBlockWindow_pad, c_block_tile); + return make_tuple(a_block_window, b_block_window, c_block_window); + } + + /** + * @brief Runs single GEMM problem cooperatively by whole workgroup. 
+ * + * @param a_ptr input A pointer + * @param b_ptr input B pointer + * @param c_ptr output C pointer + * @param kargs GEMM kernel arguments + * @param block_idx_m The GEMM's output M dimension tile index processed by this workgroup. + * @param block_idx_n The GEMM's output N dimension tile index processed by this workgroup. + */ + CK_TILE_DEVICE void RunGemm(const ADataType* a_ptr, + const BDataType* b_ptr, + CDataType* c_ptr, + const GemmKernelArgs& kargs, + const index_t block_idx_m, + const index_t block_idx_n) const + { + // Create Gemm tensor views, pad views and tile windows + const auto& gemm_tensor_views_tuple = MakeGemmTensorViews(a_ptr, b_ptr, c_ptr, kargs); + const auto& gemm_pad_views = MakeGemmPadViews(gemm_tensor_views_tuple); + auto gemm_tile_windows = MakeGemmTileWindows(gemm_pad_views, block_idx_m, block_idx_n); + + // allocate LDS + __shared__ char smem_ptr[GetSmemSize()]; + + const index_t num_loop = TilePartitioner::GetLoopNum(kargs.K); + + // Run GEMM cooperatively by whole workgroup. 
+ const auto& a_block_window = gemm_tile_windows.at(I0); + const auto& b_block_window = gemm_tile_windows.at(I1); + const auto& c_block_tile = + GemmPipeline{}.template operator()(a_block_window, b_block_window, num_loop, smem_ptr); + + // Run Epilogue Pipeline + auto& c_block_window = gemm_tile_windows.at(I2); + EpiloguePipeline{}(c_block_window, c_block_tile); + } + + CK_TILE_DEVICE void operator()(GemmKernelArgs kargs) const + { + const auto [i_m, i_n] = TilePartitioner{}(); + // options + const ADataType* a_ptr = static_cast(kargs.a_ptr); + const BDataType* b_ptr = static_cast(kargs.b_ptr); + CDataType* c_ptr = static_cast(kargs.c_ptr); + + RunGemm(a_ptr, b_ptr, c_ptr, kargs, i_m, i_n); } }; diff --git a/test/ck_tile/batched_gemm/test_batched_gemm_util.hpp b/test/ck_tile/batched_gemm/test_batched_gemm_util.hpp index 88145b987..d3f307787 100644 --- a/test/ck_tile/batched_gemm/test_batched_gemm_util.hpp +++ b/test/ck_tile/batched_gemm/test_batched_gemm_util.hpp @@ -24,12 +24,9 @@ class TestCkTileBatchedGemm : public ::testing::Test using AccDataType = std::tuple_element_t<5, Tuple>; using CDataType = std::tuple_element_t<6, Tuple>; - struct batched_gemm_kargs : public ck_tile::BatchedGemmHostArgs - { - }; - template - void invoke_batched_gemm(const batched_gemm_kargs& args, const ck_tile::stream_config& s) + void invoke_batched_gemm(const ck_tile::BatchedGemmHostArgs& args, + const ck_tile::stream_config& s) { // The kPadM, kPadN, kPadK & kBlockPerCu should also come from the Codegen part. 
constexpr bool kPadM = false; @@ -94,9 +91,9 @@ class TestCkTileBatchedGemm : public ::testing::Test using Kernel = ck_tile::BatchedGemmKernel; - auto kargs = Kernel::MakeKargs(args); + auto kargs = Kernel::MakeKernelArgs(args); - const dim3 grids = Kernel::GridSize(args); + const dim3 grids = Kernel::GridSize(args.M, args.N, args.batch_count); constexpr dim3 blocks = Kernel::BlockSize(); if(s.log_level_ > 0) @@ -185,21 +182,22 @@ class TestCkTileBatchedGemm : public ::testing::Test c_m_n_dev_buf.SetZero(); c_m_n_dev_result.SetZero(); - batched_gemm_kargs kargs{a_m_k_dev_buf.GetDeviceBuffer(), - b_k_n_dev_buf.GetDeviceBuffer(), - c_m_n_dev_buf.GetDeviceBuffer(), - M, - N, - K, - StrideA, - StrideB, - StrideC, - BatchStrideA, - BatchStrideB, - BatchStrideC, - BatchCount}; - - invoke_batched_gemm(kargs, + ck_tile::BatchedGemmHostArgs args; + args.a_ptr = a_m_k_dev_buf.GetDeviceBuffer(); + args.b_ptr = b_k_n_dev_buf.GetDeviceBuffer(); + args.c_ptr = c_m_n_dev_buf.GetDeviceBuffer(); + args.M = M; + args.N = N; + args.K = K; + args.stride_A = StrideA; + args.stride_B = StrideB; + args.stride_C = StrideC; + args.batch_stride_A = BatchStrideA; + args.batch_stride_B = BatchStrideB; + args.batch_stride_C = BatchStrideC; + args.batch_count = BatchCount; + + invoke_batched_gemm(args, ck_tile::stream_config{nullptr, false}); std::cout << "Run kernel with M =" << M << " N =" << N << " K =" << K diff --git a/test/ck_tile/gemm/test_gemm_pipeline_util.hpp b/test/ck_tile/gemm/test_gemm_pipeline_util.hpp index a51498602..53ead4d8d 100644 --- a/test/ck_tile/gemm/test_gemm_pipeline_util.hpp +++ b/test/ck_tile/gemm/test_gemm_pipeline_util.hpp @@ -31,22 +31,8 @@ class TestCkTileGemmPipeline : public ::testing::Test static constexpr auto PipelineType = std::tuple_element_t<8, Tuple>::value; // TODO: expose tile size through test t-param ? 
- struct gemm_args - { - const void* p_a; - const void* p_b; - void* p_c; - ck_tile::index_t kbatch; - ck_tile::index_t M; - ck_tile::index_t N; - ck_tile::index_t K; - ck_tile::index_t stride_A; - ck_tile::index_t stride_B; - ck_tile::index_t stride_C; - }; - template - void invoke_gemm(const gemm_args& args, const ck_tile::stream_config& s) + void invoke_gemm(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& s) { // TODO: This should be parameterized in tests constexpr ck_tile::index_t M_Tile = 128; @@ -117,17 +103,9 @@ class TestCkTileGemmPipeline : public ::testing::Test has_hot_loop_v, tail_number_v>>>; using Kernel = ck_tile::GemmKernel; - auto kargs = Kernel::MakeKargs(args.p_a, - args.p_b, - args.p_c, - args.M, - args.N, - args.K, - args.stride_A, - args.stride_B, - args.stride_C); - - const dim3 grids = Kernel::GridSize(args.M, args.N, args.kbatch); + auto kargs = Kernel::MakeKernelArgs(args); + + const dim3 grids = Kernel::GridSize(args.M, args.N, args.k_batch); constexpr dim3 blocks = Kernel::BlockSize(); if(!Kernel::IsSupportedArgument(kargs)) @@ -319,11 +297,11 @@ class TestCkTileGemmPipeline : public ::testing::Test c_m_n_dev_buf.SetZero(); c_m_n_dev_result.SetZero(); - gemm_args args; - args.p_a = a_m_k_dev_buf.GetDeviceBuffer(); - args.p_b = b_k_n_dev_buf.GetDeviceBuffer(); - args.p_c = c_m_n_dev_buf.GetDeviceBuffer(); - args.kbatch = kbatch; + ck_tile::GemmHostArgs args; + args.a_ptr = a_m_k_dev_buf.GetDeviceBuffer(); + args.b_ptr = b_k_n_dev_buf.GetDeviceBuffer(); + args.c_ptr = c_m_n_dev_buf.GetDeviceBuffer(); + args.k_batch = kbatch; args.M = M; args.N = N; args.K = K; -- GitLab From e758d006a55dd45ee9aae009b5ab554d42736dfb Mon Sep 17 00:00:00 2001 From: Mateusz Ozga <110818320+mozga-amd@users.noreply.github.com> Date: Thu, 19 Dec 2024 17:55:35 +0100 Subject: [PATCH 135/153] Apply Ck-tile argument parser for vectors [I/O] (#1758) * Parser for a vector was added. 
Additionally, we validate the correctness of numbers * Remove unnecessary comments * Review part 1 * Review part 2 * Add const to variadic lambda * Rename C->K --- .../ck_tile/17_grouped_gemm/grouped_gemm.hpp | 20 +++++--- .../run_grouped_gemm_example.inc | 34 ++++++++------ include/ck_tile/host/arg_parser.hpp | 46 ++++++++++++++++++- 3 files changed, 78 insertions(+), 22 deletions(-) diff --git a/example/ck_tile/17_grouped_gemm/grouped_gemm.hpp b/example/ck_tile/17_grouped_gemm/grouped_gemm.hpp index 94af4711d..20ba74088 100644 --- a/example/ck_tile/17_grouped_gemm/grouped_gemm.hpp +++ b/example/ck_tile/17_grouped_gemm/grouped_gemm.hpp @@ -34,13 +34,19 @@ using grouped_gemm_kargs = ck_tile::GroupedGemmHostArgs; auto create_args(int argc, char* argv[]) { ck_tile::ArgParser arg_parser; - arg_parser.insert("a_layout", "R", "A tensor data layout - Row by default") - .insert("b_layout", "R", "B tensor data layout - Row by default") - .insert("c_layout", "R", "C tensor data layout - Row by default") - .insert("validate", "1", "0. No validation, 1. Validation on CPU") - .insert("warmup", "10", "number of iterations before benchmark the kernel") - .insert("repeat", "100", "number of iterations to benchmark the kernel") - .insert("group_count", "16", "group count"); + arg_parser.insert("Ms", "", "M dimensions - empty by default.") + .insert("Ns", "", "N dimensions - empty by default.") + .insert("Ks", "", "K dimensions - empty by default.") + .insert("stride_As", "", "Tensor A strides - it is empty by default.") + .insert("stride_Bs", "", "Tensor B strides - it is empty by default.") + .insert("stride_Cs", "", "Tensor C strides - it is empty by default.") + .insert("a_layout", "R", "A tensor data layout - Row by default.") + .insert("b_layout", "R", "B tensor data layout - Row by default.") + .insert("c_layout", "R", "C tensor data layout - Row by default.") + .insert("validate", "1", "0. No validation, 1.
Validation on CPU.") + .insert("warmup", "10", "number of iterations before benchmark the kernel.") + .insert("repeat", "100", "number of iterations to benchmark the kernel.") + .insert("group_count", "16", "group count."); bool result = arg_parser.parse(argc, argv); return std::make_tuple(result, arg_parser); diff --git a/example/ck_tile/17_grouped_gemm/run_grouped_gemm_example.inc b/example/ck_tile/17_grouped_gemm/run_grouped_gemm_example.inc index cd5b1c286..11faa6642 100644 --- a/example/ck_tile/17_grouped_gemm/run_grouped_gemm_example.inc +++ b/example/ck_tile/17_grouped_gemm/run_grouped_gemm_example.inc @@ -53,26 +53,34 @@ int run_grouped_gemm_example_with_layouts(int argc, return -1; }; + auto valid_input_data = [&](int group_count, const auto&... args) { + return !(args.empty() || ...) && group_count == (args.size() == ...); + }; + const int group_count = arg_parser.get_int("group_count"); const int repeat = arg_parser.get_int("repeat"); const int warmup = arg_parser.get_int("warmup"); - std::vector Ms; - std::vector Ns; - std::vector Ks; - std::vector stride_As; - std::vector stride_Bs; - std::vector stride_Cs; + std::vector Ms = arg_parser.get_int_vec("Ms"); + std::vector Ns = arg_parser.get_int_vec("Ns"); + std::vector Ks = arg_parser.get_int_vec("Ks"); + std::vector stride_As = arg_parser.get_int_vec("stride_As"); + std::vector stride_Bs = arg_parser.get_int_vec("stride_Bs"); + std::vector stride_Cs = arg_parser.get_int_vec("stride_Cs"); - for(int i = 0; i < group_count; i++) + if(!valid_input_data(group_count, Ms, Ns, Ks, stride_As, stride_Bs, stride_Cs)) { - Ms.push_back(256 + 256 * i); - Ns.push_back(128 + 128 * i); - Ks.push_back(128 + 64 * i); + std::cout << "Please check the input data. Default values will be used." 
<< std::endl; + for(int i = 0; i < group_count; i++) + { + Ms.push_back(256 + 256 * i); + Ns.push_back(128 + 128 * i); + Ks.push_back(128 + 64 * i); - stride_As.push_back(Ks[i]); - stride_Bs.push_back(Ks[i]); - stride_Cs.push_back(Ns[i]); + stride_As.push_back(Ks[i]); + stride_Bs.push_back(Ks[i]); + stride_Cs.push_back(Ns[i]); + } } std::vector> a_m_k_tensors; diff --git a/include/ck_tile/host/arg_parser.hpp b/include/ck_tile/host/arg_parser.hpp index 3765156df..df309f312 100644 --- a/include/ck_tile/host/arg_parser.hpp +++ b/include/ck_tile/host/arg_parser.hpp @@ -15,11 +15,14 @@ namespace ck_tile { /* - * a host side utility, arg parser for - * -[key0]=[value0] -[key1]=[value1] ... + * a host side utility, arg parser for, either + * -[key0] = [value0, value1, value2] + * or + * -[key0]=[value0] -[key1]=[value1] ... */ class ArgParser { + public: class Arg { @@ -187,6 +190,45 @@ class ArgParser return value; } + std::vector get_string_vec(const std::string& name, + const std::string& delimiter = ",") const + { + if(get_str(name).empty()) + { + return {}; + } + std::string s = get_str(name); + std::vector tokens; + size_t pos = 0; + std::string token; + while((pos = s.find(delimiter)) != std::string::npos) + { + token = s.substr(0, pos); + tokens.push_back(token); + s.erase(0, pos + delimiter.length()); + } + tokens.push_back(s); + + return tokens; + } + + std::vector get_int_vec(const std::string& name, const std::string& delimiter = ",") const + { + if(get_str(name).empty()) + { + return {}; + } + const std::vector args = get_string_vec(name, delimiter); + std::vector tokens; + tokens.reserve(static_cast(args.size())); + for(const std::string& token : args) + { + int value = atoi(token.c_str()); + tokens.push_back(value); + } + return tokens; + } + private: std::unordered_map input_map; std::vector keys; -- GitLab From 2944c508941055a0cf36d5a96092d6c739f53c36 Mon Sep 17 00:00:00 2001 From: Illia Silin <98187287+illsilin@users.noreply.github.com> Date: Thu, 19 Dec 
2024 17:24:05 -0800 Subject: [PATCH 136/153] fix profiler_grouped_gemm (#1766) --- profiler/include/profiler/profile_grouped_gemm_impl.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/profiler/include/profiler/profile_grouped_gemm_impl.hpp b/profiler/include/profiler/profile_grouped_gemm_impl.hpp index c10cd0ea9..367e94de1 100644 --- a/profiler/include/profiler/profile_grouped_gemm_impl.hpp +++ b/profiler/include/profiler/profile_grouped_gemm_impl.hpp @@ -77,7 +77,7 @@ bool profile_grouped_gemm_impl(int do_verification, std::vector> c_m_n_host_results; std::vector> c_m_n_device_results; - ComputeDataType max_abs_in_val = 0.f; + double max_abs_in_val = 0.f; for(std::size_t i = 0; i < group_count; i++) { a_m_k.push_back( -- GitLab From 37cdbf4f0ec88ba5064f46c3370633b5950bc7ae Mon Sep 17 00:00:00 2001 From: Po Yen Chen Date: Fri, 20 Dec 2024 14:41:01 +0800 Subject: [PATCH 137/153] [CK_TILE] Add fmha fwd N-Warp S-Shuffle pipeline (fmha fwd splitkv pipeline variant) (#1705) * Add check for zero values * Add static assertions * Remove invalid option '-e' in smoke_test.sh * Use correct path of smoke_test.sh * Avoid zero-sized shared memory array * Add warning comment * Replace expr by integer_divide_ceil() call * Use more readable constant names * Write down assumption as static assertion * Add more diagnostic error messages * Fix wrong BlockWarps when using default pipeline policy * Add more static assertions for A LDS desc * Allow using vector size < 8 for data type fp16/bf16 * Align vector size between DRAM dist & LDS desc * Remove no-longer used func decl * Fix wrong displayed pipeline name * Undo policy template changes for tile_example_gemm_basic * Add missing space and make error message stand out * Unify print precision * Add missing include directive * Replace constant 64 by get_warp_size() call * Replace constant 128 by named variable: BankLength * Add kAMBlock/kBNBlock attributes * Allow using different A/B warp dist for multiple blocks *
Add helper function to get warp dist encodings * Add 4x64x4 fp16 warp gemm attribute impl * Complete the A/B warp dist encoding logic * Fix wrong thread mapping for C matrix * Use smaller vector size for small tile * Add static assert to block unsupported warp gemm impl * Extract common code out as helper method * Add 4x64x16 fp16 warp gemm type alias * Add comment to warn developers * Undo WarpGemmAtrributeMfma<> changes * Use more clear static assertion error message * Add trivial wrapper to get warp dstr encodings * Only transpose warp gemm result if it's square * Fix compilation error * Support multi-block warp gemm (on N direction) * Remove duplicated code * Fix output encoding of warp gemm * Fix wrong shape of WarpGemmAtrributeMfmaIterateK<> * Remove unused code * Fix wrong shape of WarpGemmAttributeMfmaImplF16F16F32M4N64K4 * Add type config for bf16_t * Add 4x64x16 bf16 warp gemm * Update WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution * Add 64x4x4 fp16/bf16 warp gemm impl * Add 64x4x16 fp16/bf16 warp gemm * Add static assertion for better error diagnostic * Get Q dram dstr directly from block gemm * Add missing header: fused_moe.hpp * Allow specifying different warp-gemm for gemm0 & gemm1 * Store P matrix into LDS before gemm1 * Fix inconsistent kernel name * Remove constraint on gemm0 & gemm1 block warps * Remove unsupported vector size from checking list * Allow using 4x64x16 warp gemm for gemm0 * Finish policy customization * Finish pipeline modification F# * Use block warps in codegen * Fix wrong rank of m_lds_window origin * Use better distributed tensor * Make P-store earlier * Remove duplicated expressions * Remove unnecessary tile window * Create new files for new splitkv pipeline * Separate old/new pipeline codegen logic * Sync changes from develop * Undo gemm kernel/pipeline changes * Undo gemm example changes * Remove blank lines * Fix typo * Use new warp gemm interface * Fix link error * Fix wrong pipeline tag * Fix more link error *
Avoid unnecessary padding * Always use vector load for K * Padding on fastest dimension when necessary * Force padding Q on hdim_q * Set high dimension padding flag to false * Re-format headers * Use warps=<1, 4, 1> for both gemm0 & gemm1 * Fix compilation errors * Remove m/l shuffle logics * Ignore duplicate data when write lse_acc * Use gemm0 block warps as lds tile width * Remove hard-coded numbers * Fix wrong distribution width * Remove unnecessary code * Add s_barrier before writing to LDS * Store Q into LDS before gemm0 * Fix wrong Q tile size * Use simple Q lds descriptor for debugging * Use more realistic Q lds descriptor * Add comment & use better variable name * Make Q lds space not overlapped with others * Remove unnecessary block_tile_reduce_sync() call * Move Q load statements * Move block_sync_lds() right before use * Re-order instructions * Remove necessary lambda expression * Use 8 threads on kMaxSplits direction while doing reduction * Tiny correction for using 8 threads on kMaxSplits direction for combine kernel * Padding num_split direction of o_acc tile window to 4x * Update splitkv combine pipeline design * Add kN1 back to splitkv combine pipeline problem * Fix compilation errors * Add missing template parameter * Fix wrong splitkv combine kernel name * Fix wrong origin * Fix wrong LDS descriptor shape * Fix sync & reduction logics * Remove unnecessary static assertions * Extract tile size computation logics * Make sure we can reuse padding flags in combine kernels * Rename variables * Use OaccDataType in BlockFmhaSplitKVCombinePipelineTileSizes<> * Remove unnecessary static assertion * Fix function name typo * Add constraint on kN1 template parameter * Hide K tile loading latency in earlier iteration * Fix wrong splitkv kernel name * Use s_shuffling to replace p_shuffling which removes the need for cross-warp reduction * Rename pipeline * Fix wrong pipeline name attribute * Add GetAlignmentQ() for NWarpSShuffle pipeline * Separate Q tile into
dram tile & register tile concepts * Remove non-squre warp gemm transpose c type alias * Fallback tile size changes for fmha fwd splitkv * Remove redundant change * Refine naming for the S tile * Use better naming of the S tile dstr (read from lds) * Share Q lds with K lds * Tiny change * Fix with using static_for for passing CI checking --------- Co-authored-by: Qianfeng Zhang --- .../ck_tile/01_fmha/codegen/cpp_symbol_map.py | 1 + .../ck_tile/01_fmha/codegen/ops/fmha_fwd.py | 42 +- .../01_fmha/codegen/ops/fmha_fwd_splitkv.py | 85 +- example/ck_tile/01_fmha/fmha_fwd.hpp | 2 - .../core/arch/amd_buffer_addressing.hpp | 4 +- .../core/tensor/static_distributed_tensor.hpp | 1 + include/ck_tile/ops/fmha.hpp | 2 + .../ops/fmha/kernel/fmha_fwd_kernel.hpp | 6 +- .../fmha_fwd_splitkv_combine_kernel.hpp | 56 +- .../fmha/kernel/fmha_fwd_splitkv_kernel.hpp | 9 +- ...lock_fmha_fwd_splitkv_combine_pipeline.hpp | 83 +- ...plitkv_combine_pipeline_default_policy.hpp | 173 ++-- ...litkv_pipeline_nwarp_sshuffle_qr_ks_vs.hpp | 794 ++++++++++++++++++ ...nwarp_sshuffle_qr_ks_vs_default_policy.hpp | 226 +++++ .../pipeline/block_fmha_pipeline_problem.hpp | 36 +- ...k_fmha_pipeline_qx_ks_vs_custom_policy.hpp | 55 +- .../ops/fmha/pipeline/tile_fmha_shape.hpp | 2 - ...block_gemm_areg_bsmem_creg_one_warp_v1.hpp | 44 +- .../block/block_gemm_areg_bsmem_creg_v2.hpp | 44 +- include/ck_tile/ops/gemm/warp/warp_gemm.hpp | 16 + .../gemm/warp/warp_gemm_attribute_mfma.hpp | 303 ++++++- .../warp/warp_gemm_attribute_mfma_impl.hpp | 271 ++++++ .../ops/gemm/warp/warp_gemm_dispatcher.hpp | 4 + 23 files changed, 1987 insertions(+), 272 deletions(-) create mode 100644 include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_nwarp_sshuffle_qr_ks_vs.hpp create mode 100644 include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_nwarp_sshuffle_qr_ks_vs_default_policy.hpp diff --git a/example/ck_tile/01_fmha/codegen/cpp_symbol_map.py b/example/ck_tile/01_fmha/codegen/cpp_symbol_map.py index 
f6df44a31..332707eaf 100644 --- a/example/ck_tile/01_fmha/codegen/cpp_symbol_map.py +++ b/example/ck_tile/01_fmha/codegen/cpp_symbol_map.py @@ -119,6 +119,7 @@ PIPELINE_MAP = { PIPELINE_ENUM_MAP = { "qr" : "ck_tile::BlockFmhaPipelineEnum::QRKSVS", "qr_async" : "ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC", + "qr_nwarp_sshuffle" : "ck_tile::BlockFmhaPipelineEnum::QRKSVS", } BOOL_MAP = { diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py index eca638784..66814f5a1 100644 --- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py +++ b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py @@ -44,13 +44,12 @@ FMHA_FWD_KERNEL_BODY=""" using fmha_dtype_{F_idx} = {F_dtype}; using fmha_block_tile_{F_idx} = ck_tile::sequence<{F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0max}>; -using fmha_warp_tile_{F_idx} = ck_tile::sequence<{F_wm}, {F_wn}, {F_wk}>; using fmha_shape_{F_idx} = ck_tile::TileFmhaShape, - fmha_warp_tile_{F_idx}, + ck_tile::sequence<{F_wm0}, {F_wn0}, {F_wk0}>, ck_tile::sequence<{F_rm1}, {F_rn1}, {F_rk1}>, - fmha_warp_tile_{F_idx}, + ck_tile::sequence<{F_wm1}, {F_wn1}, {F_wk1}>, {F_vlayout}>; using fmha_trait_{F_idx} = ck_tile::TileFmhaTraits<{F_spad}, @@ -306,15 +305,19 @@ class FmhaFwdTileSize: F_rm1 : int # number of warps for gemm1 along q seqlen F_rn1 : int # number of warps for gemm1 along head dim v F_rk1 : int # number of warps for gemm1 along k seqlen (not used) - F_wm : int # warp size along m (warp size) - F_wn : int # warp size along n - F_wk : int # warp size along k + F_wm0 : int # gemm0 warp size along m + F_wn0 : int # gemm0 warp size along n + F_wk0 : int # gemm0 warp size along k + F_wm1 : int # gemm1 warp size along m + F_wn1 : int # gemm1 warp size along n + F_wk1 : int # gemm1 warp size along k F_occupancy : int # occupancy, -1 will let pipeline decide the occupancy, other value will overwrite occupancy @property def name(self) -> str: return 
f"b{self.F_bm0}x{self.F_bn0}x{self.F_bk0}x{self.F_bn1}x{self.F_bk1}x{self.F_bk0max}" +\ f"_r{self.F_rm0}x{self.F_rn0}x{self.F_rk0}_r{self.F_rm1}x{self.F_rn1}x{self.F_rk1}" +\ - f"_w{self.F_wm}x{self.F_wn}x{self.F_wk}" + ("" if self.F_occupancy == -1 else f"_o{self.F_occupancy}") + f"_w{self.F_wm0}x{self.F_wn0}x{self.F_wk0}_w{self.F_wm1}x{self.F_wn1}x{self.F_wk1}" +\ + ("" if self.F_occupancy == -1 else f"_o{self.F_occupancy}") @dataclass class FmhaFwdKernel: @@ -352,9 +355,12 @@ class FmhaFwdKernel: F_rm1 = self.F_tile.F_rm1, F_rn1 = self.F_tile.F_rn1, F_rk1 = self.F_tile.F_rk1, - F_wm = self.F_tile.F_wm, - F_wn = self.F_tile.F_wn, - F_wk = self.F_tile.F_wk, + F_wm0 = self.F_tile.F_wm0, + F_wn0 = self.F_tile.F_wn0, + F_wk0 = self.F_tile.F_wk0, + F_wm1 = self.F_tile.F_wm1, + F_wn1 = self.F_tile.F_wn1, + F_wk1 = self.F_tile.F_wk1, F_vlayout = LAYOUT_MAP[self.F_pipeline.F_vlayout], F_spad = BOOL_MAP[self.F_pipeline.F_spad], F_skpad = BOOL_MAP[self.F_pipeline.F_skpad], @@ -409,17 +415,17 @@ class FmhaFwdKernel: def get_fmha_fwd_tile_dict_from_dtype(dtype : str) -> Optional[dict]: if dtype == 'fp16' or dtype == 'bf16': return { - '32' : FmhaFwdTileSize(128, 64, 16, 32, 32, 32, 2, 1, 1, 2, 1, 1, 32, 32, 16, -1), - '64' : FmhaFwdTileSize(128, 64, 32, 64, 32, 64, 4, 1, 1, 4, 1, 1, 32, 32, 16, -1), - ## '96' : FmhaFwdTileSize(128, 128, 32, 128, 32, 96, 4, 1, 1, 4, 1, 1, 32, 32, 16, -1), - '128' : FmhaFwdTileSize(128, 128, 32, 128, 32, 128, 4, 1, 1, 4, 1, 1, 32, 32, 16, -1), - '256' : FmhaFwdTileSize(128, 128, 32, 256, 32, 256, 4, 1, 1, 4, 1, 1, 32, 32, 16, -1), + '32' : FmhaFwdTileSize(128, 64, 16, 32, 32, 32, 2, 1, 1, 2, 1, 1, 32, 32, 16, 32, 32, 16, -1), + '64' : FmhaFwdTileSize(128, 64, 32, 64, 32, 64, 4, 1, 1, 4, 1, 1, 32, 32, 16, 32, 32, 16, -1), + ### '96' : FmhaFwdTileSize(128, 128, 32, 128, 32, 96, 4, 1, 1, 4, 1, 1, 32, 32, 16, 32, 32, 16, -1), + '128' : FmhaFwdTileSize(128, 128, 32, 128, 32, 128, 4, 1, 1, 4, 1, 1, 32, 32, 16, 32, 32, 16, -1), + '256' : 
FmhaFwdTileSize(128, 128, 32, 256, 32, 256, 4, 1, 1, 4, 1, 1, 32, 32, 16, 32, 32, 16, -1), } elif dtype == 'fp8' or dtype == 'bf8': return { - '64' : FmhaFwdTileSize(128, 64, 32, 64, 32, 64, 2, 1, 1, 2, 1, 1, 32, 32, 32, -1), - '128' : FmhaFwdTileSize(128, 128, 32, 128, 32, 128, 4, 1, 1, 4, 1, 1, 32, 32, 32, -1), - '256' : FmhaFwdTileSize(128, 128, 32, 256, 32, 256, 4, 1, 1, 4, 1, 1, 32, 32, 32, -1) + '64' : FmhaFwdTileSize(128, 64, 32, 64, 32, 64, 2, 1, 1, 2, 1, 1, 32, 32, 32, 32, 32, 32, -1), + '128' : FmhaFwdTileSize(128, 128, 32, 128, 32, 128, 4, 1, 1, 4, 1, 1, 32, 32, 32, 32, 32, 32, -1), + '256' : FmhaFwdTileSize(128, 128, 32, 256, 32, 256, 4, 1, 1, 4, 1, 1, 32, 32, 32, 32, 32, 32, -1), } else: return None diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py index e448902cf..df5b9cecc 100644 --- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py +++ b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py @@ -39,6 +39,7 @@ K0_MAX_SUBMAX_MAP = { FMHA_FWD_SPLITKV_PIPELINE_MAP = { "qr" : "ck_tile::BlockFmhaFwdSplitKVPipelineQRKSVS", + "qr_nwarp_sshuffle" : "ck_tile::BlockFmhaFwdSplitKVPipelineNWarpSShuffleQRKSVS", "qr_async" : "ck_tile::BlockFmhaFwdSplitKVPipelineQRKSVSAsync", } @@ -50,13 +51,12 @@ namespace {{ template struct kernel_runner {{ using fmha_block_tile = ck_tile::sequence<{F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0max}>; -using fmha_warp_tile = ck_tile::sequence<{F_wm}, {F_wn}, {F_wk}>; using fmha_shape = ck_tile::TileFmhaShape, - fmha_warp_tile, + ck_tile::sequence<{F_wm0}, {F_wn0}, {F_wk0}>, ck_tile::sequence<{F_rm1}, {F_rn1}, {F_rk1}>, - fmha_warp_tile, + ck_tile::sequence<{F_wm1}, {F_wn1}, {F_wk1}>, {F_vlayout}>; using fmha_trait = ck_tile::TileFmhaFwdSplitKVTraits<{F_spad}, @@ -161,9 +161,8 @@ using fmha_pipeline_problem = ck_tile::BlockFmhaSplitKVCombinePipelineProblem< typename FmhaFwdTypeConfig::OaccDataType, typename FmhaFwdTypeConfig::ODataType, 
{F_hdim}, - {F_bm0}, - {F_bn1}, {F_mode}, + {F_bn1}, fmha_trait>; using fmha_pipeline = ck_tile::BlockFmhaFwdSplitKVCombinePipeline< @@ -177,9 +176,11 @@ using fmha_epilogue = false, false>>; using fmha_kernel = - ck_tile::FmhaFwdSplitKVCombineKernel, - fmha_pipeline, - fmha_epilogue>; + ck_tile::FmhaFwdSplitKVCombineKernel< + ck_tile::FmhaFwdSplitKVCombineTilePartitioner< + fmha_pipeline_problem::kM0, fmha_pipeline_problem::kN1>, + fmha_pipeline, + fmha_epilogue>; static void run(const ck_tile::stream_config& s, fmha_fwd_splitkv_args a) {{ @@ -192,7 +193,7 @@ static void run(const ck_tile::stream_config& s, fmha_fwd_splitkv_args a) }}; }} -using trait_{F_idx} = fmha_fwd_splitkv_combine_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_bm0}, {F_bn1}, +using trait_{F_idx} = fmha_fwd_splitkv_combine_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_bn1}, {F_lse}, {F_squant}, {F_spad}, {F_dvpad}>; #include @@ -250,16 +251,25 @@ float fmha_fwd_splitkv(fmha_fwd_splitkv_traits t, fmha_fwd_splitkv_args a, const FMHA_FWD_SPLITKV_API_INNER_DISPATCH=""" {F_if}((t.is_group_mode == {F_mode}) && (t.is_v_rowmajor == {F_vlayout}) && ({F_mask_check}) && (t.bias_type == {F_bias_check}) && (t.do_fp8_static_quant == {F_squant}) && ((a.block_table_ptr != nullptr) == {F_pagedkv}) && ({F_scheck}) && ({F_skcheck}) && ({F_dcheck}) && ({F_dvcheck})) {{ using traits_ = fmha_fwd_splitkv_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0max}, {F_vlayout}, {F_pipeline_enum}, {F_mask}, {F_bias}, true, {F_squant}, {F_pagedkv}, {F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}>; + + // get combine kernel tile sizes + using OaccDataType = typename FmhaFwdTypeConfig<{F_dtype}>::OaccDataType; + constexpr ck_tile::index_t kM0 = ck_tile::BlockFmhaSplitKVCombinePipelineTileSizes::kM0; + + // make sure we can reuse the padding flags in combine kernels + static_assert({F_bm0} % kM0 == 0); + static_assert({F_bn1} % 32 == 0); + if (t.has_lse) {{ if constexpr (std::is_same_v<{F_dtype}, 
ck_tile::fp8_t>) {{ return -1; }} else {{ - using traits2_ = fmha_fwd_splitkv_combine_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_bm0}/2, {F_bn1}/2, true, {F_squant}, {F_spad}, {F_dvpad}>; + using traits2_ = fmha_fwd_splitkv_combine_traits_<{F_hdim}, {F_dtype}, {F_mode}, /*F_bn1=*/32, true, {F_squant}, {F_spad}, {F_dvpad}>; return fmha_fwd_splitkv_(s, a); }} }} else {{ - using traits2_ = fmha_fwd_splitkv_combine_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_bm0}/2, {F_bn1}/2, false, {F_squant}, {F_spad}, {F_dvpad}>; + using traits2_ = fmha_fwd_splitkv_combine_traits_<{F_hdim}, {F_dtype}, {F_mode}, /*F_bn1=*/32, false, {F_squant}, {F_spad}, {F_dvpad}>; return fmha_fwd_splitkv_(s, a); }} @@ -302,7 +312,7 @@ class FmhaFwdSplitKVApiTrait: if self.pipeline_tag == 'qr_async': if self.spad == 't' : return 'true' # always support else : return 'true' - elif self.pipeline_tag in ['qr']: + elif self.pipeline_tag in ['qr', 'qr_nwarp_sshuffle']: if self.spad == 't' : return f'true /*a.seqlen_q % {self.bm0} != 0*/' # TODO: order of get_pipelines() matters! (ugly) else : return f'a.seqlen_q % {self.bm0} == 0' else: assert False @@ -313,7 +323,7 @@ class FmhaFwdSplitKVApiTrait: if self.pipeline_tag == 'qr_async': if self.skpad == 't' : return f'a.seqlen_k == 0 || a.seqlen_k % {self.bn0} != 0' else : return f'a.seqlen_k != 0 && a.seqlen_k % {self.bn0} == 0' - elif self.pipeline_tag in ['qr', 'qr_fp8']: + elif self.pipeline_tag in ['qr', 'qr_nwarp_sshuffle']: if self.skpad == 't' : return f'true /*a.seqlen_k % {self.bn0} != 0*/' # TODO: order of get_pipelines() matters! 
(ugly) else : return f'a.seqlen_k % {self.bn0} == 0' else: assert False @@ -324,7 +334,7 @@ class FmhaFwdSplitKVApiTrait: vec = int((32 * 4) / DTYPE_BITS[self.dtype]) if self.dpad == 't': return f'a.hdim_q % {vec} == 0' else : assert False - elif self.pipeline_tag in ['qr']: + elif self.pipeline_tag in ['qr', 'qr_nwarp_sshuffle']: bk0submax = K0_MAX_SUBMAX_MAP[self.bk0max] if self.dpad == 't': return f'true /*a.hdim_q % {bk0submax} != 0*/' # TODO: order of get_pipelines() matters! (ugly) else : return f'a.hdim_q % {bk0submax} == 0' @@ -336,7 +346,7 @@ class FmhaFwdSplitKVApiTrait: vec = int((32 * 4) / DTYPE_BITS[self.dtype]) if self.dvpad == 't': return f'a.hdim_v % {vec} == 0' else : assert False - elif self.pipeline_tag in ['qr']: + elif self.pipeline_tag in ['qr', 'qr_nwarp_sshuffle']: bk0submax = K0_MAX_SUBMAX_MAP[self.bk0max] if self.dvpad == 't': return f'true /*a.hdim_v % {bk0submax} != 0*/' # TODO: order of get_pipelines() matters! (ugly) else : return f'a.hdim_v % {bk0submax} == 0' @@ -447,12 +457,11 @@ class FmhaFwdSplitKVApiPool: @dataclass class FmhaFwdSplitKVCombineTileSize: - F_bm0 : int # tile size along q seqlen F_bn1 : int # tile size along v head_dim F_occupancy : int # occupancy, -1 will let pipeline decide the occupancy, other value will overwrite occupancy @property def name(self) -> str: - return f"b{self.F_bm0}x{self.F_bn1}" +\ + return f"b{self.F_bn1}" +\ ("" if self.F_occupancy == -1 else f"_o{self.F_occupancy}") @dataclass @@ -485,9 +494,12 @@ class FmhaFwdSplitKVKernel: F_rm1 = self.F_tile.F_rm1, F_rn1 = self.F_tile.F_rn1, F_rk1 = self.F_tile.F_rk1, - F_wm = self.F_tile.F_wm, - F_wn = self.F_tile.F_wn, - F_wk = self.F_tile.F_wk, + F_wm0 = self.F_tile.F_wm0, + F_wn0 = self.F_tile.F_wn0, + F_wk0 = self.F_tile.F_wk0, + F_wm1 = self.F_tile.F_wm1, + F_wn1 = self.F_tile.F_wn1, + F_wk1 = self.F_tile.F_wk1, F_vlayout = LAYOUT_MAP[self.F_pipeline.F_vlayout], F_spad = BOOL_MAP[self.F_pipeline.F_spad], F_skpad = BOOL_MAP[self.F_pipeline.F_skpad], @@ 
-553,7 +565,6 @@ class FmhaFwdSplitKVCombineKernel: F_idx = self.F_idx, F_hdim = self.F_hdim, F_dtype = FWD_DTYPE_MAP[self.F_dtype], - F_bm0 = self.F_tile.F_bm0, F_bn1 = self.F_tile.F_bn1, F_spad = BOOL_MAP[self.F_pipeline.F_spad], F_dvpad = BOOL_MAP[self.F_pipeline.F_dvpad], @@ -577,17 +588,17 @@ class FmhaFwdSplitKVCombineKernel: def get_fmha_fwd_tile_dict_from_dtype(dtype : str) -> Optional[dict]: if dtype == 'fp16' or dtype == 'bf16': return { - '32' : FmhaFwdTileSize(32, 64, 16, 32, 32, 32, 2, 1, 1, 2, 1, 1, 16, 16, 16, -1), - '64' : FmhaFwdTileSize(64, 64, 32, 64, 32, 64, 4, 1, 1, 4, 1, 1, 16, 16, 16, -1), - ## '96' : FmhaFwdTileSize(64, 128, 32, 128, 32, 96, 4, 1, 1, 4, 1, 1, 16, 16, 16, -1), - '128' : FmhaFwdTileSize(64, 128, 32, 128, 32, 128, 4, 1, 1, 4, 1, 1, 16, 16, 16, -1), - '256' : FmhaFwdTileSize(64, 128, 32, 256, 32, 256, 4, 1, 1, 4, 1, 1, 16, 16, 16, -1), + '32' : FmhaFwdTileSize(32, 64, 16, 32, 32, 32, 2, 1, 1, 2, 1, 1, 16, 16, 16, 16, 16, 16, -1), + '64' : FmhaFwdTileSize(64, 64, 32, 64, 32, 64, 4, 1, 1, 4, 1, 1, 16, 16, 16, 16, 16, 16, -1), + ### '96' : FmhaFwdTileSize(64, 128, 32, 128, 32, 96, 4, 1, 1, 4, 1, 1, 16, 16, 16, 16, 16, 16, -1), + '128' : FmhaFwdTileSize(64, 128, 32, 128, 32, 128, 4, 1, 1, 4, 1, 1, 16, 16, 16, 16, 16, 16, -1), + '256' : FmhaFwdTileSize(64, 128, 32, 256, 32, 256, 4, 1, 1, 4, 1, 1, 16, 16, 16, 16, 16, 16, -1), } elif dtype == 'fp8' or dtype == 'bf8': return { - '64' : FmhaFwdTileSize(128, 64, 32, 64, 32, 64, 2, 1, 1, 2, 1, 1, 32, 32, 32, -1), - '128' : FmhaFwdTileSize(128, 128, 32, 128, 32, 128, 4, 1, 1, 4, 1, 1, 32, 32, 32, -1), - '256' : FmhaFwdTileSize(128, 128, 32, 256, 32, 256, 4, 1, 1, 4, 1, 1, 32, 32, 32, -1) + '64' : FmhaFwdTileSize(128, 64, 32, 64, 32, 64, 2, 1, 1, 2, 1, 1, 32, 32, 32, 32, 32, 32, -1), + '128' : FmhaFwdTileSize(128, 128, 32, 128, 32, 128, 4, 1, 1, 4, 1, 1, 32, 32, 32, 32, 32, 32, -1), + '256' : FmhaFwdTileSize(128, 128, 32, 256, 32, 256, 4, 1, 1, 4, 1, 1, 32, 32, 32, 32, 32, 32, -1), } else: 
return None @@ -595,17 +606,17 @@ def get_fmha_fwd_tile_dict_from_dtype(dtype : str) -> Optional[dict]: def get_fmha_fwd_splitkv_combine_tile_dict_from_dtype(dtype : str) -> Optional[dict]: if dtype == 'fp16' or dtype == 'bf16': return { - '32' : FmhaFwdSplitKVCombineTileSize(16, 16, -1), - '64' : FmhaFwdSplitKVCombineTileSize(32, 32, -1), - ## '96' : FmhaFwdSplitKVCombineTileSize(32, 64, -1), - '128' : FmhaFwdSplitKVCombineTileSize(32, 64, -1), - '256' : FmhaFwdSplitKVCombineTileSize(32, 128, -1), + '32' : FmhaFwdSplitKVCombineTileSize(32, -1), + '64' : FmhaFwdSplitKVCombineTileSize(32, -1), + ### '96' : FmhaFwdSplitKVCombineTileSize(32, -1), + '128' : FmhaFwdSplitKVCombineTileSize(32, -1), + '256' : FmhaFwdSplitKVCombineTileSize(32, -1), } elif dtype == 'fp8' or dtype == 'bf8': return { - '64' : FmhaFwdSplitKVCombineTileSize(64, 32, -1), - '128' : FmhaFwdSplitKVCombineTileSize(64, 64, -1), - '256' : FmhaFwdSplitKVCombineTileSize(64, 128, -1), + '64' : FmhaFwdSplitKVCombineTileSize(32, -1), + '128' : FmhaFwdSplitKVCombineTileSize(32, -1), + '256' : FmhaFwdSplitKVCombineTileSize(32, -1), } else: return None diff --git a/example/ck_tile/01_fmha/fmha_fwd.hpp b/example/ck_tile/01_fmha/fmha_fwd.hpp index aee54b475..0e821ed5d 100644 --- a/example/ck_tile/01_fmha/fmha_fwd.hpp +++ b/example/ck_tile/01_fmha/fmha_fwd.hpp @@ -709,7 +709,6 @@ std::string fmha_fwd_splitkv_get_name_(); template ; static constexpr bool kIsGroupMode = kIsGroupMode_; - static constexpr ck_tile::index_t kM0 = kM0_; static constexpr ck_tile::index_t kN1 = kN1_; static constexpr bool kStoreLse = kStoreLse_; static constexpr bool kDoFp8StaticQuant = kDoFp8StaticQuant_; diff --git a/include/ck_tile/core/arch/amd_buffer_addressing.hpp b/include/ck_tile/core/arch/amd_buffer_addressing.hpp index bebf035e9..107aae551 100644 --- a/include/ck_tile/core/arch/amd_buffer_addressing.hpp +++ b/include/ck_tile/core/arch/amd_buffer_addressing.hpp @@ -1303,8 +1303,8 @@ CK_TILE_DEVICE thread_buffer 
amd_buffer_load_impl(int32x4_t src_wave_buffe static_assert( (std::is_same::value && (N == 1 || N == 2 || N == 4 || N == 8)) || (std::is_same::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) || - (std::is_same::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) || - (std::is_same::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) || + (std::is_same::value && (N == 1 || N == 2 || N == 4 || N == 8)) || + (std::is_same::value && (N == 1 || N == 2 || N == 4 || N == 8)) || (std::is_same::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) || (std::is_same::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) || diff --git a/include/ck_tile/core/tensor/static_distributed_tensor.hpp b/include/ck_tile/core/tensor/static_distributed_tensor.hpp index 568d618ec..8d2f88af3 100644 --- a/include/ck_tile/core/tensor/static_distributed_tensor.hpp +++ b/include/ck_tile/core/tensor/static_distributed_tensor.hpp @@ -29,6 +29,7 @@ struct static_distributed_tensor remove_cvref_t; static constexpr index_t kThreadElementSpaceSize = ThreadTensorDesc{}.get_element_space_size(); + static_assert(0 < kThreadElementSpaceSize, "Make sure tile distribution is valid"); CK_TILE_HOST_DEVICE static constexpr auto get_num_of_dimension() { diff --git a/include/ck_tile/ops/fmha.hpp b/include/ck_tile/ops/fmha.hpp index e106264ce..7a09e4622 100644 --- a/include/ck_tile/ops/fmha.hpp +++ b/include/ck_tile/ops/fmha.hpp @@ -29,6 +29,8 @@ #include "ck_tile/ops/fmha/pipeline/block_fmha_fwd_appendkv_pipeline_default_policy.hpp" #include "ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_combine_pipeline.hpp" #include "ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_combine_pipeline_default_policy.hpp" +#include "ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_nwarp_sshuffle_qr_ks_vs.hpp" +#include "ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_nwarp_sshuffle_qr_ks_vs_default_policy.hpp" #include 
"ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_qr_ks_vs.hpp" #include "ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_qr_ks_vs_default_policy.hpp" #include "ck_tile/ops/fmha/pipeline/block_fmha_pipeline_enum.hpp" diff --git a/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp index 3de433d6a..90102a6c6 100644 --- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp +++ b/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp @@ -71,7 +71,8 @@ struct FmhaFwdKernel using bfs = typename FmhaPipeline::BlockFmhaShape; using g0br = typename bfs::Gemm0BlockWarps; using g1br = typename bfs::Gemm1BlockWarps; - using gwt = typename bfs::Gemm0WarpTile; + using g0wt = typename bfs::Gemm0WarpTile; + using g1wt = typename bfs::Gemm1WarpTile; #define _SS_ std::string #define _TS_ std::to_string auto pn = [&] () { @@ -88,7 +89,8 @@ struct FmhaFwdKernel _TS_(bfs::kN1) + "x" + _TS_(bfs::kK1) + "x" + _TS_(bfs::kQKHeaddim) + "_" + "r" + _TS_(g0br::at(ck_tile::number<0>{})) + "x" + _TS_(g0br::at(ck_tile::number<1>{})) + "x" + _TS_(g0br::at(ck_tile::number<2>{})) + "_" + "r" + _TS_(g1br::at(ck_tile::number<0>{})) + "x" + _TS_(g1br::at(ck_tile::number<1>{})) + "x" + _TS_(g1br::at(ck_tile::number<2>{})) + "_" + - "w" + _TS_(gwt::at(ck_tile::number<0>{})) + "x" + _TS_(gwt::at(ck_tile::number<1>{})) + "x" + _TS_(gwt::at(ck_tile::number<2>{})) + "_" + + "w" + _TS_(g0wt::at(ck_tile::number<0>{})) + "x" + _TS_(g0wt::at(ck_tile::number<1>{})) + "x" + _TS_(g0wt::at(ck_tile::number<2>{})) + "_" + + "w" + _TS_(g1wt::at(ck_tile::number<0>{})) + "x" + _TS_(g1wt::at(ck_tile::number<1>{})) + "x" + _TS_(g1wt::at(ck_tile::number<2>{})) + "_" + (kBlockPerCuInput == -1 ? "" : ("o" + _TS_(kBlockPerCu) + "_")) + _SS_(FmhaPipeline::name) + "_" + "v" + (std::is_same_v ? "r" : "c") + (pn.empty() ? "" : "_" + pn) + (BiasEnum == BlockAttentionBiasEnum::NO_BIAS ? 
_SS_("") : (_SS_("_") + BlockAttentionBiasEnumToStr::name)) + diff --git a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_combine_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_combine_kernel.hpp index 0bccabdd2..a0adfdc12 100644 --- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_combine_kernel.hpp +++ b/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_combine_kernel.hpp @@ -8,9 +8,11 @@ namespace ck_tile { template struct FmhaFwdSplitKVCombineKernel { - using TilePartitioner = remove_cvref_t; - using FmhaPipeline = remove_cvref_t; - using EpiloguePipeline = remove_cvref_t; + using TilePartitioner = remove_cvref_t; + using FmhaPipeline = remove_cvref_t; + using EpiloguePipeline = remove_cvref_t; + + static constexpr index_t kNumWarps = FmhaPipeline::kNumWarps; static constexpr index_t kBlockSize = FmhaPipeline::kBlockSize; static constexpr index_t kBlockPerCu = FmhaPipeline::kBlockPerCu; static_assert(kBlockPerCu > 0); @@ -50,8 +52,7 @@ struct FmhaFwdSplitKVCombineKernel return _SS_("fmha_fwd_splitkv_combine_d") + _TS_(FmhaPipeline::kHeadDimV) + "_" + _SS_(t2s::name) + "_" + (kIsGroupMode ? "group" : "batch") + "_" - "b" + _TS_(FmhaPipeline::kM0) + "x" + - _TS_(FmhaPipeline::kN1) + "_" + + "b" + _TS_(FmhaPipeline::kN1) + "_" + (kBlockPerCuInput == -1 ? "" : ("o" + _TS_(kBlockPerCu) + "_")) + _SS_(FmhaPipeline::name) + (pn.empty() ? 
"" : "_" + pn) + @@ -339,37 +340,56 @@ struct FmhaFwdSplitKVCombineKernel number{}, number<1>{}); + // read 4 * (kM0, kN1) o_acc tiles simultaneously by 4 warps const auto o_acc_dram_view = pad_tensor_view( o_acc_dram_naive, - make_tuple(number<1>{}, number{}, number{}), - sequence{}); + make_tuple( + number{}, number{}, number{}), + sequence{}); + const index_t padded_num_splits = + o_acc_dram_view.get_tensor_descriptor().get_lengths()[number<0>{}]; const index_t padded_seqlen_q = o_acc_dram_view.get_tensor_descriptor().get_lengths()[number<1>{}]; const index_t padded_hdim_v = o_acc_dram_view.get_tensor_descriptor().get_lengths()[number<2>{}]; - return transform_tensor_view( + const index_t num_m_tiles = integer_divide_floor(padded_seqlen_q, FmhaPipeline::kM0); + + // transform tensor view by following steps, given shape: (padded_num_splits, + // padded_seqlen_q, padded_hdim_v) + // 1. unmerge to (padded_num_splits, num_m_tiles, kM0, padded_hdim_v) + // 2. transpose to (num_m_tiles, padded_num_splits, kM0, padded_hdim_v) + // 3. 
merge to (num_m_tiles * padded_num_splits * kM0, padded_hdim_v) + auto transposed = transform_tensor_view( o_acc_dram_view, - make_tuple(make_merge_transform(make_tuple(kargs.num_splits, padded_seqlen_q)), + make_tuple(make_pass_through_transform(padded_num_splits), + make_unmerge_transform(make_tuple(num_m_tiles, FmhaPipeline::kM0)), make_pass_through_transform(padded_hdim_v)), - make_tuple(sequence<0, 1>{}, sequence<2>{}), + make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{}), + make_tuple(sequence<1>{}, sequence<0, 2>{}, sequence<3>{})); + + return transform_tensor_view( + transposed, + make_tuple(make_merge_transform( + make_tuple(num_m_tiles, padded_num_splits, FmhaPipeline::kM0)), + make_pass_through_transform(padded_hdim_v)), + make_tuple(sequence<0, 1, 2>{}, sequence<3>{}), make_tuple(sequence<0>{}, sequence<1>{})); }(); auto lse_acc_dram_window = make_tile_window( lse_acc_dram, - [&]() { - return make_tuple(number{}, number{}); - }(), + make_tuple(number{}, number{}), {0, i_m0}); + const index_t padded_num_splits = + integer_divide_ceil(kargs.num_splits, kNumWarps) * kNumWarps; + auto o_acc_dram_window = make_tile_window( o_acc_dram, - [&]() { - return make_tuple(number{}, number{}); - }(), - {i_m0, i_n1}); + make_tuple(number{}, number{}), + {i_tile_m * padded_num_splits * FmhaPipeline::kM0, i_n1}); // LSE DRAM window auto lse_dram_window = [&, i_nhead_ = i_nhead]() { @@ -410,7 +430,6 @@ struct FmhaFwdSplitKVCombineKernel identity{}, // lse_element_func composes(saturates{}, scales{kargs.scale_o}), // o_acc_element_func kargs.num_splits, - kargs.seqlen_q, smem_ptr); } else @@ -419,7 +438,6 @@ struct FmhaFwdSplitKVCombineKernel o_acc_dram_window, lse_dram_window, kargs.num_splits, - kargs.seqlen_q, smem_ptr); } }(); diff --git a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp index f37e676da..dc1748726 100644 --- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp +++ 
b/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp @@ -45,6 +45,7 @@ struct FmhaFwdSplitKVKernel static constexpr bool kPadHeadDimQ = FmhaPipeline::kPadHeadDimQ; static constexpr bool kPadHeadDimV = FmhaPipeline::kPadHeadDimV; static constexpr auto BiasEnum = FmhaPipeline::BiasEnum; + static constexpr bool kStoreLSE = FmhaPipeline::kStoreLSE; static constexpr bool kDoFp8StaticQuant = FmhaPipeline::Problem::kDoFp8StaticQuant; static constexpr bool kIsPagedKV = FmhaPipeline::Problem::kIsPagedKV; @@ -67,7 +68,8 @@ struct FmhaFwdSplitKVKernel using bfs = typename FmhaPipeline::BlockFmhaShape; using g0br = typename bfs::Gemm0BlockWarps; using g1br = typename bfs::Gemm1BlockWarps; - using gwt = typename bfs::Gemm0WarpTile; + using g0wt = typename bfs::Gemm0WarpTile; + using g1wt = typename bfs::Gemm1WarpTile; #define _SS_ std::string #define _TS_ std::to_string auto pn = [&] () { @@ -84,11 +86,12 @@ struct FmhaFwdSplitKVKernel _TS_(bfs::kN1) + "x" + _TS_(bfs::kK1) + "x" + _TS_(bfs::kQKHeaddim) + "_" + "r" + _TS_(g0br::at(ck_tile::number<0>{})) + "x" + _TS_(g0br::at(ck_tile::number<1>{})) + "x" + _TS_(g0br::at(ck_tile::number<2>{})) + "_" + "r" + _TS_(g1br::at(ck_tile::number<0>{})) + "x" + _TS_(g1br::at(ck_tile::number<1>{})) + "x" + _TS_(g1br::at(ck_tile::number<2>{})) + "_" + - "w" + _TS_(gwt::at(ck_tile::number<0>{})) + "x" + _TS_(gwt::at(ck_tile::number<1>{})) + "x" + _TS_(gwt::at(ck_tile::number<2>{})) + "_" + + "w" + _TS_(g0wt::at(ck_tile::number<0>{})) + "x" + _TS_(g0wt::at(ck_tile::number<1>{})) + "x" + _TS_(g0wt::at(ck_tile::number<2>{})) + "_" + + "w" + _TS_(g1wt::at(ck_tile::number<0>{})) + "x" + _TS_(g1wt::at(ck_tile::number<1>{})) + "x" + _TS_(g1wt::at(ck_tile::number<2>{})) + "_" + (kBlockPerCuInput == -1 ? "" : ("o" + _TS_(kBlockPerCu) + "_")) + _SS_(FmhaPipeline::name) + "_" + "v" + (std::is_same_v ? "r" : "c") + (pn.empty() ? "" : "_" + pn) + (BiasEnum == BlockAttentionBiasEnum::NO_BIAS ? 
_SS_("") : (_SS_("_") + BlockAttentionBiasEnumToStr::name)) + - (kHasMask ? "_" + _SS_(FmhaMask::name) : "") + (kDoFp8StaticQuant ? "_squant" : "") + (kIsPagedKV ? "_pagedkv" : "" ); + (kHasMask ? "_" + _SS_(FmhaMask::name) : "") + (kStoreLSE ? "_lse" : "" ) + (kDoFp8StaticQuant ? "_squant" : "") + (kIsPagedKV ? "_pagedkv" : "" ); #undef _SS_ #undef _TS_ // clang-format on diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_combine_pipeline.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_combine_pipeline.hpp index 7c49fce99..7ac86e6d1 100644 --- a/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_combine_pipeline.hpp +++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_combine_pipeline.hpp @@ -53,6 +53,7 @@ struct BlockFmhaFwdSplitKVCombinePipeline using OaccDataType = remove_cvref_t; using ODataType = remove_cvref_t; + static constexpr index_t kNumWarps = Problem::kNumWarps; static constexpr index_t kBlockSize = Problem::kBlockSize; static constexpr index_t kHeadDimV = Problem::kHeadDimV; @@ -117,7 +118,6 @@ struct BlockFmhaFwdSplitKVCombinePipeline const LSEElementFunction& lse_element_func, const OaccElementFunction& o_acc_element_func, index_t num_splits, - index_t seqlen_q, void* smem_ptr) const { // lse_acc tile in LDS @@ -143,11 +143,12 @@ struct BlockFmhaFwdSplitKVCombinePipeline // copy lse_acc tile (shape=[kMaxSplits, kM0]) to LDS (shape=[kMaxSplits, kM0]). auto lse_acc_tile = load_tile(lse_acc_dram_window); store_tile(lse_acc_lds_write_window, lse_acc_tile); - block_sync_lds(); auto lse_accum = make_static_distributed_tensor( Policy::template MakeLSEaccRegTileDistribution()); + __builtin_amdgcn_sched_barrier(0); + block_sync_lds(); // copy LDS (shape=[kM0, kMaxSplits]) to lse_accum (shape=[kM0, kMaxSplits]) // and fill up -INF values outside the [kM0, num_splits] region. 
{ @@ -264,46 +265,94 @@ struct BlockFmhaFwdSplitKVCombinePipeline } }); } - block_sync_lds(); if constexpr(kStoreLSE) { store_tile(lse_dram_window_tmp, tile_elementwise_in(lse_element_func, lse_logsum)); } - auto o_acc_dist = Policy::template MakeOaccDramTileDistribution(); - auto o_acc_dram_window = + auto o_acc_4_dist = Policy::template MakeOacc4DramTileDistribution(); + auto o_acc_4_dram_window = make_tile_window(o_acc_dram_block_window_tmp.get_bottom_tensor_view(), o_acc_dram_block_window_tmp.get_window_lengths(), o_acc_dram_block_window_tmp.get_window_origin(), - o_acc_dist); - auto o_acc = make_static_distributed_tensor(o_acc_dist); - clear_tile(o_acc); + o_acc_4_dist); - const index_t padded_seqlen_q = integer_divide_ceil(seqlen_q, kM0) * kM0; + // shape=[4 * KM0, kN1] + auto o_acc_4 = make_static_distributed_tensor(o_acc_4_dist); + clear_tile(o_acc_4); - for(index_t i_split = 0; i_split < num_splits; ++i_split) + const index_t padded_num_splits = integer_divide_ceil(num_splits, kNumWarps) * kNumWarps; + + __builtin_amdgcn_sched_barrier(0); + block_sync_lds(); + // each warp handles a [KM0, kN1] tile + for(index_t split_start = 0; split_start < padded_num_splits; split_start += kNumWarps) { - auto o_tile = load_tile(o_acc_dram_window); + auto o_tile = load_tile(o_acc_4_dram_window); + const index_t i_split = split_start + get_warp_id(); + const index_t row_start = kM0 * get_warp_id(); { - constexpr auto spans = decltype(o_acc)::get_distributed_spans(); + constexpr auto spans = decltype(o_acc_4)::get_distributed_spans(); sweep_tile_span(spans[number<0>{}], [&](auto idx0) { sweep_tile_span(spans[number<1>{}], [&](auto idx1) { constexpr auto i_j_idx = make_tuple(idx0, idx1); const auto x_indices = get_x_indices_from_distributed_indices( - o_acc.get_tile_distribution(), i_j_idx); + o_acc_4.get_tile_distribution(), i_j_idx); const auto row = x_indices.at(number<0>{}); - const LSEDataType lse_scale = lse_acc_lds(row, i_split); - o_acc(i_j_idx) += lse_scale * 
o_tile(i_j_idx); + const LSEDataType lse_scale = lse_acc_lds(row - row_start, i_split); + o_acc_4(i_j_idx) += lse_scale * o_tile(i_j_idx); }); }); } - move_tile_window(o_acc_dram_window, {padded_seqlen_q, 0}); + move_tile_window(o_acc_4_dram_window, {kNumWarps * kM0, 0}); + } + + // 4 o_acc tiles in LDS. shape=[4 * kM0, kN1] + OaccDataType* o_acc_4_lds_ptr = static_cast(static_cast( + static_cast(smem_ptr) + Policy::template GetSmemSizeLSEacc())); + + { + auto o_acc_4_lds_window = [&]() { + auto desc = Policy::template MakeOacc4LdsBlockDescriptor(); + auto view = make_tensor_view(o_acc_4_lds_ptr, desc); + return make_tile_window(view, desc.get_lengths(), {0, 0}); + }(); + store_tile(o_acc_4_lds_window, o_acc_4); } + auto o_acc_dist = Policy::template MakeOaccDramTileDistribution(); + + auto o_acc_4_lds_window = [&]() { + auto desc = Policy::template MakeOacc4LdsBlockDescriptor(); + auto view = make_tensor_view(o_acc_4_lds_ptr, desc); + return make_tile_window(view, desc.get_lengths(), {0, 0}, o_acc_dist); + }(); + + auto o_acc = make_static_distributed_tensor(o_acc_dist); + clear_tile(o_acc); + + __builtin_amdgcn_sched_barrier(0); + block_sync_lds(); + static_for<0, kNumWarps, 1>{}([&](auto) { + auto o_acc_in = load_tile(o_acc_4_lds_window); + + { + constexpr auto spans = decltype(o_acc)::get_distributed_spans(); + sweep_tile_span(spans[number<0>{}], [&](auto idx0) { + sweep_tile_span(spans[number<1>{}], [&](auto idx1) { + constexpr auto i_j_idx = make_tuple(idx0, idx1); + o_acc(i_j_idx) += o_acc_in(i_j_idx); + }); + }); + } + + move_tile_window(o_acc_4_lds_window, {kM0, 0}); + }); + o_acc = tile_elementwise_in(o_acc_element_func, o_acc); return o_acc; @@ -316,7 +365,6 @@ struct BlockFmhaFwdSplitKVCombinePipeline const OaccDramBlockWindow& o_acc_dram_block_window, LSEDramBlockWindow& lse_dram_block_window, index_t num_splits, - index_t seqlen_q, void* smem_ptr) const { return operator()(lse_acc_dram_block_window, @@ -325,7 +373,6 @@ struct 
BlockFmhaFwdSplitKVCombinePipeline identity{}, identity{}, num_splits, - seqlen_q, smem_ptr); } }; diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_combine_pipeline_default_policy.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_combine_pipeline_default_policy.hpp index ebd69c0cf..2d4abb388 100644 --- a/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_combine_pipeline_default_policy.hpp +++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_combine_pipeline_default_policy.hpp @@ -10,23 +10,38 @@ namespace ck_tile { struct BlockFmhaFwdSplitKVCombinePipelineDefaultPolicy { - template + template + CK_TILE_HOST_DEVICE static constexpr auto GetMaxNumWarpsForTile() + { + static_assert(NumWarps == 1 || NumWarps == 2 || NumWarps == 4); + + constexpr index_t ElemPerThread = (M * N) / (NumWarps * get_warp_size()); + if constexpr(0 < ElemPerThread) + { + return NumWarps; + } + else + { // try dividing tile by smaller # of warps + return GetMaxNumWarpsForTile(); + } + } + + template CK_TILE_HOST_DEVICE static constexpr auto GetVectorSizeForTile() { - constexpr index_t PixelsPerThread = (M * N) / BlockSize; - static_assert(0 < PixelsPerThread); + constexpr index_t MaxNumWarps = GetMaxNumWarpsForTile(); - constexpr index_t MaxNPerThread = 16 / sizeof(DataType); - constexpr index_t NPerThread = min(MaxNPerThread, PixelsPerThread); + constexpr index_t ElemPerThread = (M * N) / (MaxNumWarps * get_warp_size()); - return NPerThread; + constexpr index_t MaxNPerThread = 16 / sizeof(DataType); + return min(MaxNPerThread, ElemPerThread); } // alignment for dram lse tile (shape=[kMaxSplits, kM0]) template CK_TILE_HOST_DEVICE static constexpr auto GetAlignmentLSE() { - return GetVectorSizeForTile(); @@ -56,40 +71,54 @@ struct BlockFmhaFwdSplitKVCombinePipelineDefaultPolicy } template - CK_TILE_HOST_DEVICE static constexpr ck_tile::index_t GetSmemSize() + CK_TILE_HOST_DEVICE static constexpr ck_tile::index_t GetSmemSizeLSEacc() { return 
sizeof(typename Problem::LSEDataType) * MakeLSEaccLdsBlockDescriptor().get_element_space_size(); } + template + CK_TILE_HOST_DEVICE static constexpr ck_tile::index_t GetSmemSizeOacc4() + { + return sizeof(typename Problem::OaccDataType) * + MakeOacc4LdsBlockDescriptor().get_element_space_size(); + } + + template + CK_TILE_HOST_DEVICE static constexpr ck_tile::index_t GetSmemSize() + { + return GetSmemSizeLSEacc() + GetSmemSizeOacc4(); + } + // shape=[kMaxSplits, kM0] template CK_TILE_HOST_DEVICE static constexpr auto MakeLSEaccDramTileDistribution() { using LSEDataType = remove_cvref_t; - constexpr index_t kBlockSize = Problem::kBlockSize; - constexpr index_t kNumWarps = Problem::kNumWarps; - - constexpr index_t kNPerBlock = Problem::kM0; constexpr index_t kMPerBlock = Problem::kMaxSplits; + constexpr index_t kNPerBlock = Problem::kM0; + + constexpr index_t MaxNumWarps = + GetMaxNumWarpsForTile(); + constexpr index_t Replicate = Problem::kNumWarps / MaxNumWarps; constexpr index_t NPerThread = - GetVectorSizeForTile(); + GetVectorSizeForTile(); constexpr index_t NThreads = kNPerBlock / NPerThread; constexpr index_t MThreadsPerWarp = get_warp_size() / NThreads; - constexpr index_t MPerThread = kMPerBlock / (kNumWarps * MThreadsPerWarp); + constexpr index_t MPerThread = kMPerBlock / (MaxNumWarps * MThreadsPerWarp); + static_assert(MPerThread * MaxNumWarps * MThreadsPerWarp == kMPerBlock); static_assert(NThreads * NPerThread == kNPerBlock); - static_assert(MPerThread * kNumWarps * MThreadsPerWarp == kMPerBlock); return make_static_tile_distribution( - tile_distribution_encoding, - tuple, + tile_distribution_encoding, + tuple, sequence>, - tuple, sequence<1, 2>>, - tuple, sequence<2, 0>>, + tuple, sequence<1, 2>>, + tuple, sequence<2, 0>>, sequence<1, 2>, sequence<0, 1>>{}); } @@ -100,17 +129,15 @@ struct BlockFmhaFwdSplitKVCombinePipelineDefaultPolicy { using LSEDataType = remove_cvref_t; - constexpr index_t kBlockSize = Problem::kBlockSize; - - constexpr index_t 
kMPerBlock = Problem::kMaxSplits; - constexpr index_t kNPerBlock = Problem::kM0; + constexpr index_t kMPerBlock = Problem::kM0; + constexpr index_t kNPerBlock = Problem::kMaxSplits; constexpr index_t NPack = - GetVectorSizeForTile(); + GetVectorSizeForTile(); constexpr auto lse_acc_lds_block_desc_0 = make_naive_tensor_descriptor( make_tuple(number{}, number{}, number{}), make_tuple(number<(kMPerBlock + 1) * NPack>{}, number{}, number<1>{}), - number<8>{}, + number{}, number<1>{}); constexpr auto lse_acc_lds_block_desc = transform_tensor_descriptor( @@ -129,17 +156,15 @@ struct BlockFmhaFwdSplitKVCombinePipelineDefaultPolicy { using LSEDataType = remove_cvref_t; - constexpr index_t kBlockSize = Problem::kBlockSize; - - constexpr index_t kMPerBlock = Problem::kMaxSplits; - constexpr index_t kNPerBlock = Problem::kM0; + constexpr index_t kMPerBlock = Problem::kM0; + constexpr index_t kNPerBlock = Problem::kMaxSplits; constexpr index_t NPack = - GetVectorSizeForTile(); + GetVectorSizeForTile(); constexpr auto lse_acc_lds_block_desc_0 = make_naive_tensor_descriptor( make_tuple(number{}, number{}, number{}), make_tuple(number<(kMPerBlock + 1) * NPack>{}, number{}, number<1>{}), - number<8>{}, + number{}, number<1>{}); constexpr auto lse_acc_t_lds_block_desc = transform_tensor_descriptor( @@ -152,33 +177,86 @@ struct BlockFmhaFwdSplitKVCombinePipelineDefaultPolicy return lse_acc_t_lds_block_desc; } + // 3d + padding, shape=[4 * kM0, kN1] template - CK_TILE_HOST_DEVICE static constexpr auto MakeLSEaccRegTileDistribution() + CK_TILE_HOST_DEVICE static constexpr auto MakeOacc4LdsBlockDescriptor() { - constexpr index_t kBlockSize = Problem::kBlockSize; + using LSEDataType = remove_cvref_t; - constexpr index_t kNPerBlock = Problem::kMaxSplits; + constexpr index_t kMPerBlock = 4 * Problem::kM0; + constexpr index_t kNPerBlock = Problem::kN1; + constexpr index_t NPack = + GetVectorSizeForTile(); + + constexpr auto o_acc_lds_block_desc_0 = make_naive_tensor_descriptor( + 
make_tuple(number{}, number{}, number{}), + make_tuple(number<(kMPerBlock + 1) * NPack>{}, number{}, number<1>{}), + number<8>{}, + number<1>{}); + + constexpr auto o_acc_t_lds_block_desc = transform_tensor_descriptor( + o_acc_lds_block_desc_0, + make_tuple(make_pass_through_transform(kMPerBlock), + make_merge_transform(make_tuple(kNPerBlock / NPack, NPack))), + make_tuple(sequence<1>{}, sequence<0, 2>{}), + make_tuple(sequence<1>{}, sequence<0>{})); + + return o_acc_t_lds_block_desc; + } + + // shape=[kM0, kMaxSplits] + template + CK_TILE_HOST_DEVICE static constexpr auto MakeLSEaccRegTileDistribution() + { constexpr index_t kMPerBlock = Problem::kM0; + constexpr index_t kNPerBlock = Problem::kMaxSplits; - constexpr index_t NThreads = 4; - constexpr index_t NPerThread = kNPerBlock / NThreads; + constexpr index_t MaxNThreads = 8; + constexpr index_t NThreads = min(kNPerBlock, MaxNThreads); + constexpr index_t NPerThread = kNPerBlock / NThreads; - constexpr index_t MThreads = kBlockSize / NThreads; - constexpr index_t MPerThread = kMPerBlock / MThreads; - constexpr index_t MWarps = kBlockSize / get_warp_size(); + constexpr index_t MPerThread = 1; + constexpr index_t MThreads = kMPerBlock / MPerThread; constexpr index_t MThreadPerWarp = get_warp_size() / NThreads; + constexpr index_t MaxNumWarps = (MThreads * NThreads) / get_warp_size(); + constexpr index_t Replicate = Problem::kNumWarps / MaxNumWarps; + + static_assert(MaxNumWarps * MThreadPerWarp * MPerThread == kMPerBlock); static_assert(NThreads * NPerThread == kNPerBlock); - static_assert(MWarps * MThreadPerWarp * MPerThread == kMPerBlock); return make_static_tile_distribution( - tile_distribution_encoding< - sequence<1>, - tuple, sequence>, - tuple, sequence<2, 1>>, - tuple, sequence<0, 1>>, - sequence<1, 2>, - sequence<2, 1>>{}); + tile_distribution_encoding, + tuple, + sequence>, + tuple, sequence<2, 1>>, + tuple, sequence<0, 1>>, + sequence<1, 2>, + sequence<2, 1>>{}); + } + + // similar to 
MakeOaccDramTileDistribution(), but duplicate same 1-warp encoding 4 times on M + // direction + template + CK_TILE_HOST_DEVICE static constexpr auto MakeOacc4DramTileDistribution() + { + constexpr index_t kMPerBlock = Problem::kM0; // real kMPerBlock we want is (4 * kM0) + constexpr index_t kNPerBlock = Problem::kN1; + static_assert(get_warp_size() <= kMPerBlock * kNPerBlock); + + constexpr index_t M1 = 1; // compose encoding base on 1 warp + constexpr index_t M2 = min(kMPerBlock / M1, get_warp_size()); + constexpr index_t N0 = get_warp_size() / M2; + constexpr index_t N1 = kNPerBlock / N0; + constexpr index_t M0 = kMPerBlock / (M2 * M1); + + return make_static_tile_distribution( + tile_distribution_encoding, + tuple, sequence>, + tuple, sequence<1, 2>>, + tuple, sequence<3, 0>>, + sequence<1, 2>, + sequence<1, 1>>{}); } template @@ -187,6 +265,7 @@ struct BlockFmhaFwdSplitKVCombinePipelineDefaultPolicy constexpr index_t kBlockSize = Problem::kBlockSize; constexpr index_t kMPerBlock = Problem::kM0; constexpr index_t kNPerBlock = Problem::kN1; + static_assert(kBlockSize <= kMPerBlock * kNPerBlock); constexpr index_t M1 = kBlockSize / get_warp_size(); constexpr index_t M2 = min(kMPerBlock / M1, get_warp_size()); diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_nwarp_sshuffle_qr_ks_vs.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_nwarp_sshuffle_qr_ks_vs.hpp new file mode 100644 index 000000000..3726cd433 --- /dev/null +++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_nwarp_sshuffle_qr_ks_vs.hpp @@ -0,0 +1,794 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck_tile/core.hpp" +#include "ck_tile/ops/fmha/block/block_attention_bias_enum.hpp" +#include "ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_nwarp_sshuffle_qr_ks_vs_default_policy.hpp" +#include "ck_tile/ops/reduce/block/block_reduce.hpp" + +namespace ck_tile { + +// This pipeline is qkv all located in LDS +template +struct BlockFmhaFwdSplitKVPipelineNWarpSShuffleQRKSVS +{ + using Problem = remove_cvref_t; + using Policy = remove_cvref_t; + using QDataType = remove_cvref_t; + using KDataType = remove_cvref_t; + using VDataType = remove_cvref_t; + using SaccDataType = remove_cvref_t; + using SMPLComputeDataType = remove_cvref_t; + using BiasDataType = remove_cvref_t; + using LSEDataType = remove_cvref_t; + using PDataType = remove_cvref_t; + using OaccDataType = remove_cvref_t; + using ODataType = remove_cvref_t; + using FmhaMask = remove_cvref_t; + + using BlockFmhaShape = remove_cvref_t; + using VLayout = remove_cvref_t; + static constexpr bool kQLoadOnce = true; // if q_tile load whole block length (hdim) at once + static_assert(kQLoadOnce == Policy::QLoadOnce); + + static constexpr index_t kBlockSize = Problem::kBlockSize; + + static constexpr index_t kM0 = BlockFmhaShape::kM0; + static constexpr index_t kN0 = BlockFmhaShape::kN0; + static constexpr index_t kK0 = BlockFmhaShape::kK0; + static constexpr index_t kN1 = BlockFmhaShape::kN1; + static constexpr index_t kK1 = BlockFmhaShape::kK1; + static constexpr index_t kQKHeaddim = BlockFmhaShape::kQKHeaddim; + static constexpr index_t kSubQKHeaddim = BlockFmhaShape::kSubQKHeaddim; + + static constexpr bool kIsGroupMode = Problem::kIsGroupMode; + static constexpr bool kPadSeqLenQ = Problem::kPadSeqLenQ; + static constexpr bool kPadSeqLenK = Problem::kPadSeqLenK; + static constexpr bool kPadHeadDimQ = Problem::kPadHeadDimQ; + static constexpr bool kPadHeadDimV = Problem::kPadHeadDimV; + static constexpr auto BiasEnum = Problem::BiasEnum; + static constexpr bool kStoreLSE = 
Problem::kStoreLSE; + static constexpr bool kIsPagedKV = Problem::kIsPagedKV; + static constexpr bool kHasUnevenSplits = Problem::kHasUnevenSplits; + + // last dimension vector length used to create tensor view(and decide buffer_load vector length) + // ... together with tensor distribution. tensor dist should able to overwrite this + static constexpr index_t kAlignmentQ = + kPadHeadDimQ ? 1 : Policy::template GetAlignmentQ(); + static constexpr index_t kAlignmentK = + kPadHeadDimQ ? 1 : Policy::template GetAlignmentK(); + static constexpr index_t kAlignmentV = []() { + if constexpr(std::is_same_v) + return kPadHeadDimV ? 1 : Policy::template GetAlignmentV(); + else + return kPadSeqLenK ? 1 : Policy::template GetAlignmentV(); + }(); + + static constexpr index_t kAlignmentOacc = + kPadHeadDimV ? 1 : Policy::template GetAlignmentOacc(); + + static constexpr index_t kAlignmentBias = + kPadSeqLenK ? 1 : Policy::template GetAlignmentBias(); + + static constexpr index_t kBlockPerCu = []() { + if constexpr(Problem::kBlockPerCu != -1) + return Problem::kBlockPerCu; + else + { + if constexpr(kQKHeaddim <= 32) + { + return 2; + } + else if constexpr(kQKHeaddim <= 64) + { + return 3; + } + else if constexpr(kQKHeaddim <= 128) + { + if constexpr(BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS) + return 1; + else + return 2; + } + else if constexpr(kQKHeaddim <= 256) + { + return 1; + } + } + }(); + + static constexpr const char* name = "qr_nwarp_sshuffle"; + + CK_TILE_HOST_DEVICE static constexpr ck_tile::index_t GetSmemSize() + { + return Policy::template GetSmemSize(); + } + + template + CK_TILE_HOST_DEVICE auto + operator()(const QDramBlockWindowTmp& q_dram_block_window_tmp, // M0*K0 tile + const QElementFunction& q_element_func, + const KDramBlockWindowLengths& k_dram_block_window_lengths, // N0*K0 tile + const KPageBlockNavigator& k_page_block_navigator, + const KElementFunction& k_element_func, + const VDramBlockWindowLengths& v_dram_block_window_lengths, // N1*K1 
tile + const VPageBlockNavigator& v_page_block_navigator, + const VElementFunction& v_element_func, + const BiasDramBlockWindowTmp& bias_dram_block_window_tmp, // M0*N0 tile + const BiasElementFunction& bias_element_func, + LSEaccDramBlockWindowTmp& lse_acc_dram_window_tmp, // M0*1 tile + const LSEaccElementFunction& lse_acc_element_func, + const SAccElementFunction& s_acc_element_func, + const PComputeElementFunction& p_compute_element_func, + const OAccElementFunction& o_acc_element_func, + index_t num_splits, + index_t i_split, + FmhaMask mask, + PositionEncoding position_encoding, + float scale_s, + index_t kv_l2p_offset, // logical-to-physical offset of seqlen_k coordinate + void* smem_ptr) const + { + static_assert( + std::is_same_v> && + std::is_same_v> && + std::is_same_v>, + "wrong!"); + + static_assert(kM0 == QDramBlockWindowTmp{}.get_window_lengths()[number<0>{}] && + kSubQKHeaddim == + QDramBlockWindowTmp{}.get_window_lengths()[number<1>{}] && + kN0 == KDramBlockWindowLengths{}[number<0>{}] && + kK0 == KDramBlockWindowLengths{}[number<1>{}] && + kN1 == VDramBlockWindowLengths{}[number<0>{}] && + kK1 == VDramBlockWindowLengths{}[number<1>{}] && + kM0 == BiasDramBlockWindowTmp{}.get_window_lengths()[number<0>{}] && + kN0 == BiasDramBlockWindowTmp{}.get_window_lengths()[number<1>{}], + "wrong!"); + // Q tile in LDS + QDataType* q_lds_ptr = + static_cast(static_cast(static_cast(smem_ptr))); + auto q_lds = make_tensor_view( + q_lds_ptr, Policy::template MakeQLdsBlockDescriptor()); + + // K tile in LDS + KDataType* k_lds_ptr = + static_cast(static_cast(static_cast(smem_ptr))); + auto k_lds = make_tensor_view( + k_lds_ptr, Policy::template MakeKLdsBlockDescriptor()); + auto k_lds_window = + make_tile_window(k_lds, make_tuple(number{}, number{}), {0, 0}); + + // V tile in LDS + auto v_lds = make_tensor_view( + reinterpret_cast(static_cast(smem_ptr) + + max(Policy::template GetSmemSizeQ(), + Policy::template GetSmemSizeK())), + Policy::template 
MakeVLdsBlockDescriptor()); + auto v_lds_window = make_tile_window( + v_lds, Policy::template MakeVLdsBlockDescriptor().get_lengths(), {0, 0}); + + // S tile in LDS + auto s_lds = make_tensor_view( + reinterpret_cast(reinterpret_cast(smem_ptr) + + max(Policy::template GetSmemSizeQ(), + Policy::template GetSmemSizeK())), + Policy::template MakeSLdsBlockDescriptor()); + auto s_write_lds_window = make_tile_window( + s_lds, Policy::template MakeSLdsBlockDescriptor().get_lengths(), {0, 0}); + auto s_read_lds_window = + make_tile_window(s_lds, + Policy::template MakeSLdsBlockDescriptor().get_lengths(), + {0, 0}, + Policy::template MakeSRegTileDistribution()); + + // Block GEMM + constexpr auto gemm_0 = Policy::template GetQKBlockGemm(); + constexpr auto gemm_1 = Policy::template GetKVBlockGemm(); + + auto q_dram_window = + make_tile_window(q_dram_block_window_tmp.get_bottom_tensor_view(), + q_dram_block_window_tmp.get_window_lengths(), + q_dram_block_window_tmp.get_window_origin(), + Policy::template MakeQDramTileDistribution()); + + // load Q here, will store Q into LDS to maximize throughput + auto origin_q = load_tile(q_dram_window); + + using SaccBlockTileType = decltype(gemm_0.MakeCBlockTile()); + auto s_acc = SaccBlockTileType{}; + + // reduction function for softmax + const auto f_max = [](auto e0, auto e1) { return max(e0, e1); }; + const auto f_sum = [](auto e0, auto e1) { return e0 + e1; }; + + using OaccBlockTileType = decltype(gemm_1.MakeCBlockTile()); + + auto o_acc = OaccBlockTileType{}; + + // infer Sacc, S, P, M, L, Oacc type + using SBlockTileType = decltype(cast_tile(o_acc)); + + using MLBlockTileType = decltype(block_tile_reduce( + SBlockTileType{}, sequence<1>{}, f_max, SMPLComputeDataType{0})); + + // init M, L + auto m = MLBlockTileType{}; + auto l = MLBlockTileType{}; + + clear_tile(o_acc); + set_tile(m, -numeric::infinity()); + clear_tile(l); + + const auto q_origin = q_dram_window.get_window_origin(); + const auto [logical_seqlen_k_start, 
logical_seqlen_k_end] = mask.GetTileRangeAlongX( + q_origin.at(number<0>{}), number{}, number{}, num_splits, i_split); + + // check early exit if no work to do + if constexpr(FmhaMask::IsMasking || kPadSeqLenK || kHasUnevenSplits) + { + const index_t logical_num_total_loop = + integer_divide_ceil(logical_seqlen_k_end - logical_seqlen_k_start, kN0); + if(logical_num_total_loop <= 0) + { + if constexpr(kStoreLSE) + { + auto lse_acc = + make_static_distributed_tensor(m.get_tile_distribution()); + + set_tile(lse_acc, -numeric::infinity()); + + if(get_thread_local_1d_id() < kM0) + { + store_tile(lse_acc_dram_window_tmp, + tile_elementwise_in(lse_acc_element_func, lse_acc)); + } + } + + // Note: here occ are all cleard, return it + // Note: q loaded but no fence, ignore it. + return o_acc; + } + } + + const index_t physical_seqlen_k_start = logical_seqlen_k_start + kv_l2p_offset; + const index_t physical_seqlen_k_end = logical_seqlen_k_end + kv_l2p_offset; + // make sure the first tile is completely located in page-block (page-block size should be + // divisible by kN0) + // relationship between each *_start variables: aligned_physical_seqlen_k_start <= + // physical_seqlen_k_start, logical_seqlen_k_start <= physical_seqlen_k_start + const index_t aligned_physical_seqlen_k_start = + [&, physical_seqlen_k_start_ = physical_seqlen_k_start] { + if constexpr(kIsPagedKV) + { + return kN0 * integer_divide_floor(physical_seqlen_k_start_, kN0); + } + else + { + return physical_seqlen_k_start_; + } + }(); + const index_t num_total_loop = + integer_divide_ceil(physical_seqlen_k_end - aligned_physical_seqlen_k_start, kN0); + + auto [i_page_block_k, k_dram_block_window] = k_page_block_navigator.make_tile_window( + k_dram_block_window_lengths, {aligned_physical_seqlen_k_start, 0}); + + const auto bias_origin = bias_dram_block_window_tmp.get_window_origin(); + auto bias_dram_window = + make_tile_window(bias_dram_block_window_tmp.get_bottom_tensor_view(), + 
bias_dram_block_window_tmp.get_window_lengths(), + {bias_origin.at(number<0>{}), + logical_seqlen_k_start - (physical_seqlen_k_start - + aligned_physical_seqlen_k_start)}, // M/N + Policy::template MakeBiasDramTileDistribution()); + + auto [i_page_block_v, v_dram_window] = v_page_block_navigator.make_tile_window( + v_dram_block_window_lengths, + {0, aligned_physical_seqlen_k_start}, // TODO: hdim split? + Policy::template MakeVDramTileDistribution()); + + // store Q into LDS + __builtin_amdgcn_sched_barrier(0); + auto q_lds_window_for_store = make_tile_window( + q_lds, Policy::template MakeQLdsBlockDescriptor().get_lengths(), {0, 0}); + + store_tile(q_lds_window_for_store, origin_q); + __builtin_amdgcn_sched_barrier(0); + + // load Q from LDS + __builtin_amdgcn_sched_barrier(0); + auto q_lds_window_for_load = make_tile_window( + q_lds, + Policy::template MakeQLdsBlockDescriptor().get_lengths(), + {0, 0}, + Policy::template MakeQRegTileDistribution()); + block_sync_lds(); + auto q = load_tile(q_lds_window_for_load); + __builtin_amdgcn_sched_barrier(0); + auto q_tile = tile_elementwise_in(q_element_func, q); + + // prefetch K tile + index_t i_total_loops = 0; + constexpr index_t k0_loops = kQKHeaddim / kK0; + constexpr index_t k1_loops = kN0 / kK1; + + static_assert(2 <= k0_loops); + static_assert(1 <= k1_loops); + + auto k_dram_window = make_tile_window( + k_dram_block_window, + Policy::template MakeKDramTileDistribution()); // K DRAM tile window for + + // load the first tile of the first iteration and store to LDS + auto k_block_tile = load_tile(k_dram_window); + // moving k_dram_window is an in-page-block operation, so there is + // no need to invoke k_page_block_navigator.move_tile_window() here. 
+ move_tile_window(k_dram_window, {0, kK0}); + store_tile(k_lds_window, tile_elementwise_in(k_element_func, k_block_tile)); + + do + { + // STAGE 1, QK gemm + clear_tile(s_acc); // initialize C + + // load the second tile of the first iteration + k_block_tile = load_tile(k_dram_window); + + if constexpr(BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS) + { + __builtin_amdgcn_sched_barrier( + 0); // prevent from messing up the order of global loads + } + const auto bias_tile = load_tile(bias_dram_window); // load bias tile + if constexpr(BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS) + { + __builtin_amdgcn_sched_barrier( + 0); // prevent from messing up the order of global loads + } + + if constexpr(k0_loops > 2) + { + static_for<0, k0_loops - 2, 1>{}([&](auto i_k0) { + block_sync_lds(); + gemm_0(s_acc, + get_slice_tile(q_tile, + sequence<0, i_k0 * kK0>{}, + sequence{}), + k_lds_window); + block_sync_lds(); + move_tile_window(k_dram_window, {0, kK0}); + + store_tile( + k_lds_window, + tile_elementwise_in(k_element_func, k_block_tile)); // LDS write i + 1 + k_block_tile = load_tile(k_dram_window); // global read i + 2 + }); + } + + const auto v_prefetch = load_tile(v_dram_window); // prefetch load v tile + { // tail + block_sync_lds(); + gemm_0(s_acc, + get_slice_tile(q_tile, + sequence<0, (k0_loops - 2) * kK0>{}, + sequence{}), + k_lds_window); + block_sync_lds(); + + store_tile(k_lds_window, tile_elementwise_in(k_element_func, k_block_tile)); + block_sync_lds(); + + gemm_0(s_acc, + get_slice_tile(q_tile, + sequence<0, (k0_loops - 1) * kK0>{}, + sequence{}), + k_lds_window); + } + + // STAGE 2, scale_s, add bias, mask, softmax + if constexpr(BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS) + { + s_acc = tile_elementwise_in(s_acc_element_func, s_acc); + tile_elementwise_inout([&scale_s](auto& x) { x = x * scale_s; }, s_acc); + tile_elementwise_inout( + [&](auto& x, const auto& y) { +#if !CK_TILE_FMHA_FWD_FAST_EXP2 + x += 
type_convert(bias_element_func(y)); +#else + x += log2e_v * + type_convert(bias_element_func(y)); +#endif + }, + s_acc, + bias_tile); + } + else if constexpr(BiasEnum == BlockAttentionBiasEnum::ALIBI) + { + const auto k_origin = k_page_block_navigator.to_global_window_origin( + i_page_block_k, k_dram_block_window.get_window_origin()); + constexpr auto s_spans = decltype(s_acc)::get_distributed_spans(); + s_acc = tile_elementwise_in(s_acc_element_func, s_acc); + sweep_tile_span(s_spans[number<0>{}], [&](auto idx0) { + sweep_tile_span(s_spans[number<1>{}], [&](auto idx1) { + const auto tile_idx = get_x_indices_from_distributed_indices( + s_acc.get_tile_distribution(), make_tuple(idx0, idx1)); + + const auto row = q_origin.at(number<0>{}) + tile_idx.at(number<0>{}); + const auto col = k_origin.at(number<0>{}) + tile_idx.at(number<1>{}); + constexpr auto i_j_idx = make_tuple(idx0, idx1); + + s_acc(i_j_idx) *= scale_s; + // position_encoding accept only logical coordinates, do conversion here + position_encoding.update(s_acc(i_j_idx), row, col - kv_l2p_offset); + }); + }); + } + else + { + s_acc = tile_elementwise_in(s_acc_element_func, s_acc); +#if !CK_TILE_FMHA_FWD_FAST_EXP2 + tile_elementwise_inout([&scale_s](auto& x) { x = x * scale_s; }, s_acc); +#endif + } + move_tile_window(bias_dram_window, {0, kN0}); + + /// TODO: only check in first/last iteration without increasing code size + if constexpr(kHasUnevenSplits) + { + const auto k_origin = k_page_block_navigator.to_global_window_origin( + i_page_block_k, k_dram_block_window.get_window_origin()); + set_tile_if( + s_acc, + -numeric::infinity(), + [&, + physical_seqlen_k_start_ = physical_seqlen_k_start, + physical_seqlen_k_end_ = physical_seqlen_k_end](auto tile_idx) { + const auto col = k_origin.at(number<0>{}) + tile_idx.at(number<1>{}); + if constexpr(kIsPagedKV) + { + return col < physical_seqlen_k_start_ || physical_seqlen_k_end_ <= col; + } + else + { + return physical_seqlen_k_end_ <= col; + } + }); + } + + 
if constexpr(kPadSeqLenK || FmhaMask::IsMasking) + { + const auto k_origin = k_page_block_navigator.to_global_window_origin( + i_page_block_k, k_dram_block_window.get_window_origin()); + // mask accept only logical coordinates, do conversion here + bool need_perpixel_check = mask.IsEdgeTile(q_origin.at(number<0>{}), + k_origin.at(number<0>{}) - kv_l2p_offset, + number{}, + number{}); + if(need_perpixel_check) + { + set_tile_if( + s_acc, -numeric::infinity(), [&](auto tile_idx) { + const auto row = q_origin.at(number<0>{}) + tile_idx.at(number<0>{}); + const auto col = k_origin.at(number<0>{}) + tile_idx.at(number<1>{}); + return mask.IsOutOfBound(row, col - kv_l2p_offset); + }); + } + } + + __builtin_amdgcn_sched_barrier(0); + + // load the first tile for next iteration + if(i_total_loops < num_total_loop - 1) + { + // move K tile windows + i_page_block_k = k_page_block_navigator.move_tile_window( + i_page_block_k, k_dram_block_window, {kN0, 0}); + + k_dram_window = make_tile_window( + k_dram_block_window, + Policy::template MakeKDramTileDistribution()); // K DRAM tile window + + // laod the first tile of the first iteration and store to LDS + k_block_tile = load_tile(k_dram_window); + } + + __builtin_amdgcn_sched_barrier(0); + + const auto s = cast_tile(s_acc); // S{j} + + // shuffle through LDS so that the tile layout is consistent with required by Gemm1 + store_tile(s_write_lds_window, s); + block_sync_lds(); + auto s_new = load_tile(s_read_lds_window); + + auto m_local = block_tile_reduce( + s_new, + sequence<1>{}, + f_max, + -numeric::infinity()); // m_local = rowmax(S{j}) + block_tile_reduce_sync(m_local, f_max, bool_constant{}); + + const auto m_old = m; // m{j-1} + tile_elementwise_inout( + [](auto& e0, auto e1, auto e2) { e0 = max(e1, e2); }, m, m_old, m_local); // m{j} + + auto p_compute = make_static_distributed_tensor( + s_new.get_tile_distribution()); // Pcompute{j} + + static const auto get_validated_m = [](SMPLComputeDataType raw_m) { + /// NOTICE: 
bias might be materialized mask including -inf values, need + /// consideration + if constexpr(BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS || + FmhaMask::IsMasking) + { + return raw_m == -numeric::infinity() + ? type_convert(0.f) + : raw_m; + } + else + { + return raw_m; + } + }; + + constexpr auto p_spans = decltype(p_compute)::get_distributed_spans(); + sweep_tile_span(p_spans[number<0>{}], [&](auto idx0) { + constexpr auto i_idx = make_tuple(idx0); +#if CK_TILE_FMHA_FWD_FAST_EXP2 + auto row_max = scale_s * get_validated_m(m[i_idx]); +#endif + sweep_tile_span(p_spans[number<1>{}], [&](auto idx1) { + constexpr auto i_j_idx = make_tuple(idx0, idx1); +#if CK_TILE_FMHA_FWD_FAST_EXP2 + if constexpr(BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS || + BiasEnum == BlockAttentionBiasEnum::ALIBI) + { + p_compute(i_j_idx) = exp2(s_new[i_j_idx] - get_validated_m(m[i_idx])); + } + else + { + p_compute(i_j_idx) = exp2(scale_s * s_new[i_j_idx] - row_max); + } +#else + p_compute(i_j_idx) = exp(s_new[i_j_idx] - get_validated_m(m[i_idx])); +#endif + }); + }); + + auto rowsum_p = block_tile_reduce( + p_compute, sequence<1>{}, f_sum, SMPLComputeDataType{0}); // rowsum(Pcompute{j}) + + block_tile_reduce_sync(rowsum_p, f_sum, bool_constant{}); + + const auto p = + cast_tile(tile_elementwise_in(p_compute_element_func, p_compute)); + + // l{j}, Oacc{j} + constexpr auto o_spans = decltype(o_acc)::get_distributed_spans(); + sweep_tile_span(o_spans[number<0>{}], [&](auto idx0) { + constexpr auto i_idx = make_tuple(idx0); +#if CK_TILE_FMHA_FWD_FAST_EXP2 + const auto tmp = [&]() { + if constexpr(BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS || + BiasEnum == BlockAttentionBiasEnum::ALIBI) + { + return exp2(m_old[i_idx] - get_validated_m(m[i_idx])); + } + else + { + auto row_max = scale_s * get_validated_m(m[i_idx]); + return exp2(scale_s * m_old[i_idx] - row_max); + } + }(); +#else + const auto tmp = exp(m_old[i_idx] - get_validated_m(m[i_idx])); +#endif + l(i_idx) = tmp 
* l[i_idx] + rowsum_p[i_idx]; + sweep_tile_span(o_spans[number<1>{}], [&](auto idx1) { + constexpr auto i_j_idx = make_tuple(idx0, idx1); + // FIXME: this use different equation from FA v2 paper, + // but produce correc result. + // Is the equation wrong? + o_acc(i_j_idx) *= tmp; + }); + }); + + block_sync_lds(); + if constexpr(std::is_same_v) + { + auto v_shuffle_tmp = make_static_distributed_tensor( + Policy::template MakeShuffledVRegBlockDescriptor()); + shuffle_tile(v_shuffle_tmp, v_prefetch); + store_tile( + v_lds_window, + tile_elementwise_in(v_element_func, v_shuffle_tmp)); // store the prefetch + } + else + { + store_tile(v_lds_window, + tile_elementwise_in(v_element_func, v_prefetch)); // store the prefetch + } + i_page_block_v = + v_page_block_navigator.move_tile_window(i_page_block_v, v_dram_window, {0, kK1}); + + // STAGE 3, KV gemm + if constexpr(k1_loops > 1) + { + static_for<0, k1_loops - 1, 1>{}([&, + &i_page_block_v_ = i_page_block_v, + &v_dram_window_ = v_dram_window](auto i_k1) { + const auto v = load_tile(v_dram_window_); // load next v + block_sync_lds(); + + gemm_1(o_acc, + get_slice_tile( + p, sequence<0, i_k1 * kK1>{}, sequence{}), + v_lds_window); + block_sync_lds(); + + if constexpr(std::is_same_v) + { + auto v_shuffle_tmp = make_static_distributed_tensor( + Policy::template MakeShuffledVRegBlockDescriptor()); + shuffle_tile(v_shuffle_tmp, v); + store_tile(v_lds_window, + tile_elementwise_in(v_element_func, + v_shuffle_tmp)); // store the prefetch + } + else + { + store_tile(v_lds_window, + tile_elementwise_in(v_element_func, v)); // store next v + } + i_page_block_v_ = v_page_block_navigator.move_tile_window( + i_page_block_v_, v_dram_window_, {0, kK1}); + }); + } + + // tail + { + block_sync_lds(); + gemm_1(o_acc, + get_slice_tile( + p, sequence<0, (k1_loops - 1) * kK1>{}, sequence{}), + v_lds_window); + block_sync_lds(); + } + + __builtin_amdgcn_sched_barrier(0); + + // load the first tile for next iteration + if(i_total_loops < 
num_total_loop - 1) + { + // store the first tile for next iteration to LDS + // moving k_dram_window is an in-page-block operation, so there is + // no need to invoke k_page_block_navigator.move_tile_window() here. + move_tile_window(k_dram_window, {0, kK0}); + store_tile(k_lds_window, tile_elementwise_in(k_element_func, k_block_tile)); + } + } while(++i_total_loops < num_total_loop); + + if constexpr(kStoreLSE) + { + // store lse acc + auto lse_acc = make_static_distributed_tensor(m.get_tile_distribution()); + + constexpr auto lse_acc_spans = decltype(lse_acc)::get_distributed_spans(); + sweep_tile_span(lse_acc_spans[number<0>{}], [&, m_ = m, l_ = l](auto idx0) { + constexpr auto i_idx = make_tuple(idx0); +#if CK_TILE_FMHA_FWD_FAST_EXP2 + if constexpr(BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS || + BiasEnum == BlockAttentionBiasEnum::ALIBI) + { + lse_acc(i_idx) = m_[i_idx] / C_LOG2E + log(l_[i_idx]); + } + else + { + lse_acc(i_idx) = m_[i_idx] * scale_s / C_LOG2E + log(l_[i_idx]); + } +#else + lse_acc(i_idx) = m_[i_idx] + log(l_[i_idx]); +#endif + }); + + if(get_thread_local_1d_id() < kM0) + { + store_tile(lse_acc_dram_window_tmp, + tile_elementwise_in(lse_acc_element_func, lse_acc)); + } + } + + // finally, O + constexpr auto o_spans = decltype(o_acc)::get_distributed_spans(); + + sweep_tile_span(o_spans[number<0>{}], [&](auto idx0) { + constexpr auto i_idx = make_tuple(idx0); + const auto tmp = [&]() { + if constexpr(BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS || + FmhaMask::IsMasking) + { + return l[i_idx] == 0.f ? 
0.f : 1 / l[i_idx]; + } + else + return 1 / l[i_idx]; + }(); + sweep_tile_span(o_spans[number<1>{}], [&](auto idx1) { + constexpr auto i_j_idx = make_tuple(idx0, idx1); + o_acc(i_j_idx) *= tmp; + }); + }); + + o_acc = tile_elementwise_in(o_acc_element_func, o_acc); + + return o_acc; + } + + template + CK_TILE_HOST_DEVICE auto + operator()(const QDramBlockWindowTmp& q_dram_block_window_tmp, // M0*K0 tile + const KDramBlockWindowLengths& k_dram_block_window_lengths, // N0*K0 tile + const KPageBlockNavigator& k_page_block_navigator, + const VDramBlockWindowLengths& v_dram_block_window_lengths, // N1*K1 tile + const VPageBlockNavigator& v_page_block_navigator, + const BiasDramBlockWindowTmp& bias_dram_block_window_tmp, // M0*N0 tile + LSEaccDramBlockWindowTmp& lse_acc_dram_block_window_tmp, // M0*1 tile + index_t num_splits, + index_t i_split, + FmhaMask mask, + PositionEncoding position_encoding, + float scale_s, + index_t kv_l2p_offset, // logical-to-physical offset of seqlen_k coordinate + void* smem_ptr) const + { + return operator()(q_dram_block_window_tmp, + identity{}, + k_dram_block_window_lengths, + k_page_block_navigator, + identity{}, + v_dram_block_window_lengths, + v_page_block_navigator, + identity{}, + bias_dram_block_window_tmp, + identity{}, + lse_acc_dram_block_window_tmp, + identity{}, + identity{}, + identity{}, + identity{}, + num_splits, + i_split, + mask, + position_encoding, + scale_s, + kv_l2p_offset, + smem_ptr); + } +}; + +} // namespace ck_tile diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_nwarp_sshuffle_qr_ks_vs_default_policy.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_nwarp_sshuffle_qr_ks_vs_default_policy.hpp new file mode 100644 index 000000000..74d755ef3 --- /dev/null +++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_nwarp_sshuffle_qr_ks_vs_default_policy.hpp @@ -0,0 +1,226 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro 
Devices, Inc. All rights reserved. + +#pragma once + +#include "ck_tile/core.hpp" +#include "ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qx_ks_vs_custom_policy.hpp" +#include "ck_tile/ops/gemm/block/block_gemm_asmem_bsmem_creg_v1_custom_policy.hpp" +#include "ck_tile/ops/gemm/block/block_gemm_asmem_bsmem_creg_v1.hpp" + +namespace ck_tile { + +// This pipeline is qkv all located in LDS +struct BlockFmhaFwdSplitKVPipelineNWarpSShuffleQRKSVSDefaultPolicy + : BlockFmhaPipelineQXKSVSCustomPolicy +{ + using BasePolicy = BlockFmhaPipelineQXKSVSCustomPolicy; + + template + CK_TILE_HOST_DEVICE static constexpr auto GetAlignmentQ() + { + constexpr index_t kBlockSize = Problem::kBlockSize; + constexpr index_t kMPerBlock = Problem::BlockFmhaShape::kM0; + constexpr index_t kKPerBlock = Problem::BlockFmhaShape::kSubQKHeaddim; + + constexpr index_t MaxVectorSize = 16 / sizeof(typename Problem::QDataType); + + // this should align with MakeQDramTileDistribution() + constexpr index_t ElemPerThread = (kMPerBlock * kKPerBlock) / kBlockSize; + static_assert(0 < ElemPerThread); + return min(ElemPerThread, MaxVectorSize); + } + + template + CK_TILE_HOST_DEVICE static constexpr auto GetAlignmentOacc() + { + using OaccDataType = remove_cvref_t; + + return static_cast(16 / sizeof(OaccDataType)); + } + + template + CK_TILE_HOST_DEVICE static constexpr auto MakeQDramTileDistribution() + { + constexpr index_t kBlockSize = Problem::kBlockSize; + constexpr index_t kMPerBlock = Problem::BlockFmhaShape::kM0; + constexpr index_t kKPerBlock = Problem::BlockFmhaShape::kSubQKHeaddim; + + constexpr index_t MaxVectorSize = 16 / sizeof(typename Problem::QDataType); + + constexpr index_t ElemPerThread = (kMPerBlock * kKPerBlock) / kBlockSize; + static_assert(0 < ElemPerThread); + constexpr index_t kMaxVecLoad = min(ElemPerThread, MaxVectorSize); + + constexpr index_t KPerThread = kMaxVecLoad; + constexpr index_t KThreads = kKPerBlock / KPerThread; + constexpr index_t MThreadPerWarp = get_warp_size() / 
KThreads; + constexpr index_t NumWarps = kBlockSize / get_warp_size(); + constexpr index_t MPerThread = kMPerBlock / (MThreadPerWarp * NumWarps); + + return make_static_tile_distribution( + tile_distribution_encoding, + tuple, + sequence>, + tuple, sequence<1, 2>>, + tuple, sequence<2, 0>>, + sequence<1, 2>, + sequence<0, 1>>{}); + } + + template + CK_TILE_HOST_DEVICE static constexpr auto MakeQRegTileDistribution() + { + return BasePolicy::template MakeQDramTileDistribution(); + } + + template + CK_TILE_HOST_DEVICE static constexpr auto GetSmemKPackQ() + { + // TODO: this is for 3d layout + using QDataType = remove_cvref_t; + return static_cast(16 / sizeof(QDataType)); + } + + template + CK_TILE_HOST_DEVICE static constexpr auto MakeQLdsBlockDescriptor() + { + constexpr index_t kBlockSize = Problem::kBlockSize; + constexpr index_t kMPerBlock = Problem::BlockFmhaShape::kM0; + constexpr index_t kKPerBlock = Problem::BlockFmhaShape::kSubQKHeaddim; + + constexpr index_t ElemPerThread = (kMPerBlock * kKPerBlock) / kBlockSize; + static_assert(0 < ElemPerThread); + constexpr index_t kKPack = min(ElemPerThread, GetSmemKPackQ()); + + constexpr auto q_lds_block_desc_0 = make_naive_tensor_descriptor( + make_tuple(number{}, number{}, number{}), + make_tuple(number<(kMPerBlock + 1) * kKPack>{}, number{}, number<1>{}), + number{}, + number<1>{}); + + constexpr auto q_lds_block_desc = transform_tensor_descriptor( + q_lds_block_desc_0, + make_tuple( + make_pass_through_transform(number{}), + make_merge_transform(make_tuple(number{}, number{}))), + make_tuple(sequence<1>{}, sequence<0, 2>{}), + make_tuple(sequence<0>{}, sequence<1>{})); + + return q_lds_block_desc; + } + + template + CK_TILE_HOST_DEVICE static constexpr auto GetSmemNPackS() + { + using SDataType = remove_cvref_t; + return static_cast(16 / sizeof(SDataType)); + } + + template + CK_TILE_HOST_DEVICE static constexpr auto MakeSLdsBlockDescriptor() + { + constexpr index_t kMPerBlock = Problem::BlockFmhaShape::kM0; + 
constexpr index_t kNPerBlock = Problem::BlockFmhaShape::kN0; + constexpr index_t kNPack = GetSmemNPackS(); + + constexpr auto s_lds_block_desc_0 = make_naive_tensor_descriptor( + make_tuple(number{}, number{}, number{}), + make_tuple(number<(kMPerBlock + 1) * kNPack>{}, number{}, number<1>{}), + number{}, + number<1>{}); + + constexpr auto s_lds_block_desc = transform_tensor_descriptor( + s_lds_block_desc_0, + make_tuple( + make_pass_through_transform(number{}), + make_merge_transform(make_tuple(number{}, number{}))), + make_tuple(sequence<1>{}, sequence<0, 2>{}), + make_tuple(sequence<0>{}, sequence<1>{})); + + return s_lds_block_desc; + } + + template + CK_TILE_HOST_DEVICE static constexpr auto MakeSRegTileDistribution() + { + using BlockGemm = remove_cvref_t())>; + + constexpr auto config = BlockGemm::Policy::template GetWarpGemmMWarpNWarp(); + using WG = remove_cvref_t())>; + constexpr index_t MWarp = config.template at<1>(); + constexpr index_t NWarp = config.template at<2>(); + + static_assert(MWarp == 1, "Check failed!"); + + constexpr index_t kMPerBlock = Problem::BlockFmhaShape::kM0; + constexpr index_t kKPerBlock = Problem::BlockFmhaShape::kK1; + constexpr index_t kTileK = Problem::BlockFmhaShape::kN0; + + // K2 is equal to Impl::kABKPerLane * kKIterPerWarpGemm + constexpr index_t K3 = WG::kK / WG::WarpGemmAttribute::Impl::kABKLane; + constexpr index_t K2 = WG::WarpGemmAttribute::Impl::kABKLane; + constexpr index_t K1 = kKPerBlock / (K2 * K3); + constexpr index_t K0 = kTileK / kKPerBlock; + constexpr index_t M2 = WG::WarpGemmAttribute::Impl::kAMLane; + constexpr index_t M1 = MWarp; + constexpr index_t M0 = kMPerBlock / (M2 * M1); + + constexpr auto s2_block_dstr_encoding = + tile_distribution_encoding, + tuple, sequence>, + tuple, sequence<2, 1>>, + tuple, sequence<2, 2>>, + sequence<1, 2, 2, 2>, + sequence<0, 0, 1, 3>>{}; + + constexpr auto s2_block_dstr = make_static_tile_distribution(s2_block_dstr_encoding); + + return s2_block_dstr; + } + + template + 
CK_TILE_HOST_DEVICE static constexpr ck_tile::index_t GetSmemSizeQ() + { + return MakeQLdsBlockDescriptor().get_element_space_size() * + sizeof(typename Problem::QDataType); + } + + template + CK_TILE_HOST_DEVICE static constexpr ck_tile::index_t GetSmemSizeK() + { + return MakeKLdsBlockDescriptor().get_element_space_size() * + sizeof(typename Problem::KDataType); + } + + template + CK_TILE_HOST_DEVICE static constexpr ck_tile::index_t GetSmemSizeV() + { + return MakeVLdsBlockDescriptor().get_element_space_size() * + sizeof(typename Problem::VDataType); + } + + template + CK_TILE_HOST_DEVICE static constexpr ck_tile::index_t GetSmemSizeS() + { + return MakeSLdsBlockDescriptor().get_element_space_size() * + sizeof(typename Problem::SaccDataType); + } + + template + CK_TILE_HOST_DEVICE static constexpr ck_tile::index_t GetSmemSize() + { + return max(GetSmemSizeQ(), GetSmemSizeK()) + + max(GetSmemSizeV(), GetSmemSizeS()); + } +}; + +} // namespace ck_tile diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_problem.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_problem.hpp index d9da2f088..1fe19faaf 100644 --- a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_problem.hpp +++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_problem.hpp @@ -106,28 +106,43 @@ struct BlockFmhaFwdSplitKVPipelineProblem static constexpr index_t kBlockPerCu = Traits::kBlockPerCu; }; +// extract tile size attributes to remove dependency on traits +template +struct BlockFmhaSplitKVCombinePipelineTileSizes +{ + static constexpr index_t MaxVectorSize = 16 / sizeof(OaccDataType_); + + static constexpr index_t kN1 = kN1_; + static constexpr index_t NThreads = kN1 / MaxVectorSize; + static constexpr index_t kM0 = get_warp_size() / NThreads; // MThreadPerWarp +}; + template struct BlockFmhaSplitKVCombinePipelineProblem + : BlockFmhaSplitKVCombinePipelineTileSizes { + using BaseType = BlockFmhaSplitKVCombinePipelineTileSizes; + using LSEDataType = 
remove_cvref_t; using OaccDataType = remove_cvref_t; using ODataType = remove_cvref_t; using Traits = remove_cvref_t; - static constexpr index_t kNumWarps = kM0_ / (get_warp_size() / 4); - static constexpr index_t kBlockSize = kNumWarps * get_warp_size(); - static constexpr bool kIsGroupMode = kIsGroupMode_; + static_assert(std::is_same_v); static constexpr index_t kHeadDimV = HeadDimV_; - static constexpr index_t kM0 = kM0_; - static constexpr index_t kN1 = kN1_; + static constexpr bool kIsGroupMode = kIsGroupMode_; + + using BaseType::kM0; + using BaseType::kN1; + + static_assert(kN1 <= kHeadDimV && kHeadDimV % kN1 == 0); // attributes from traits static constexpr bool kPadSeqLenQ = Traits::kPadSeqLenQ; @@ -136,6 +151,13 @@ struct BlockFmhaSplitKVCombinePipelineProblem static constexpr bool kDoFp8StaticQuant = Traits::kDoFp8StaticQuant; static constexpr index_t kBlockPerCu = Traits::kBlockPerCu; static constexpr index_t kMaxSplits = Traits::kMaxSplits; + static_assert(8 <= kMaxSplits); + + static constexpr index_t kNumWarps = 4; // always use 4 warps for each workgroup + static constexpr index_t kBlockSize = kNumWarps * get_warp_size(); + + static_assert(get_warp_size() <= (kM0 * kMaxSplits) && + (kM0 * kMaxSplits) % get_warp_size() == 0); }; template template CK_TILE_HOST_DEVICE static constexpr auto GetAlignmentQ() { + constexpr index_t MaxVectorSize = 16 / sizeof(typename Problem::QDataType); + using BlockGemm = remove_cvref_t())>; constexpr auto config = BlockGemm::Policy::template GetWarpGemmMWarpNWarp(); using WG = remove_cvref_t())>; - return WG::kK / WG::WarpGemmAttribute::Impl::kABKLane; + + return min(MaxVectorSize, WG::kK / WG::WarpGemmAttribute::Impl::kABKLane); } template CK_TILE_HOST_DEVICE static constexpr auto MakeQDramTileDistribution() { - constexpr auto config = BlockGemm::Policy::template GetWarpGemmMWarpNWarp(); - using WG = remove_cvref_t())>; - constexpr index_t MWarp = config.template at<1>(); - - constexpr index_t kMPerBlock = 
Problem::BlockFmhaShape::kM0; - constexpr index_t kKPerBlock = Problem::BlockFmhaShape::kSubQKHeaddim; - - constexpr index_t K2 = WG::kK / WG::WarpGemmAttribute::Impl::kABKLane; - constexpr index_t K1 = WG::WarpGemmAttribute::Impl::kABKLane; - constexpr index_t K0 = kKPerBlock / (K1 * K2); - - constexpr index_t M2 = WG::WarpGemmAttribute::Impl::kAMLane; - constexpr index_t M1 = MWarp; - constexpr index_t M0 = kMPerBlock / (M2 * M1); - - if constexpr(1 < Problem::kNumGemm0Warps) - { - return make_static_tile_distribution( - tile_distribution_encoding, - tuple, sequence>, - tuple, sequence<2, 1>>, - tuple, sequence<1, 2>>, - sequence<1, 2, 2>, - sequence<0, 0, 2>>{}); - } - else - { - static_assert(MWarp == 1); - - return make_static_tile_distribution( - tile_distribution_encoding, - tuple, sequence>, - tuple>, - tuple>, - sequence<1, 2, 2>, - sequence<0, 0, 2>>{}); - } + return BlockGemm::template MakeABlockTileDistribution< + Problem::BlockFmhaShape::kM0, + Problem::BlockFmhaShape::kSubQKHeaddim>(); } template @@ -105,7 +74,7 @@ struct BlockFmhaPipelineQXCustomPolicy constexpr auto warp_gemm = []() { constexpr index_t WarpGemmM = Problem::BlockFmhaShape::Gemm0WarpTile::at(number<0>{}); - static_assert(WarpGemmM == 16 || WarpGemmM == 32); + static_assert(WarpGemmM == 4 || WarpGemmM == 16 || WarpGemmM == 32); if constexpr(std::is_same_v && std::is_same_v && @@ -113,8 +82,10 @@ struct BlockFmhaPipelineQXCustomPolicy { if constexpr(WarpGemmM == 32) return WarpGemmMfmaF16F16F32M32N32K16SwizzleBTransposedCDistribution{}; - else // WarpGemmM == 16 + else if constexpr(WarpGemmM == 16) return WarpGemmMfmaF16F16F32M16N16K16TransposedCDistribution{}; + else // WarpGemmM == 4 + return WarpGemmMfmaF16F16F32M4N64K16{}; } else if constexpr(std::is_same_v && std::is_same_v && @@ -122,8 +93,10 @@ struct BlockFmhaPipelineQXCustomPolicy { if constexpr(WarpGemmM == 32) return WarpGemmMfmaBf16Bf16F32M32N32K16SwizzleBTransposedCDistribution{}; - else // WarpGemmM == 16 + else if 
constexpr(WarpGemmM == 16) return WarpGemmMfmaBf16Bf16F32M16N16K16TransposedCDistribution{}; + else // WarpGemmM == 4 + return WarpGemmMfmaBf16Bf16F32M4N64K16{}; } else if constexpr(std::is_same_v && std::is_same_v && diff --git a/include/ck_tile/ops/fmha/pipeline/tile_fmha_shape.hpp b/include/ck_tile/ops/fmha/pipeline/tile_fmha_shape.hpp index bb33b5f02..5ce80c2d1 100644 --- a/include/ck_tile/ops/fmha/pipeline/tile_fmha_shape.hpp +++ b/include/ck_tile/ops/fmha/pipeline/tile_fmha_shape.hpp @@ -43,8 +43,6 @@ struct TileFmhaShape static constexpr index_t NumWarps = max(NumGemm0Warps, NumGemm1Warps); - static_assert(std::is_same_v); - static constexpr index_t kM0 = BlockTile::at(number<0>{}); // tile size along q seqlen static constexpr index_t kN0 = BlockTile::at(number<1>{}); // tile size along k seqlen static constexpr index_t kK0 = BlockTile::at(number<2>{}); // tile size along qk gemm unroll diff --git a/include/ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_one_warp_v1.hpp b/include/ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_one_warp_v1.hpp index ff23f6355..b99466b1e 100644 --- a/include/ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_one_warp_v1.hpp +++ b/include/ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_one_warp_v1.hpp @@ -65,14 +65,6 @@ struct BlockGemmARegBSmemCRegOneWarpV1 const index_t iNWarp = 0; - constexpr auto a_block_outer_dstr_encoding = - tile_distribution_encoding, - tuple, sequence>, - tuple>, - tuple>, - sequence<1, 2>, - sequence<0, 0>>{}; - constexpr auto c_block_outer_dstr_encoding = tile_distribution_encoding, tuple, sequence>, @@ -81,19 +73,14 @@ struct BlockGemmARegBSmemCRegOneWarpV1 sequence<1, 2>, sequence<0, 0>>{}; - constexpr auto a_block_dstr_encode = detail::make_embed_tile_distribution_encoding( - a_block_outer_dstr_encoding, typename WG::AWarpDstrEncoding{}); - constexpr auto c_block_dstr_encode = detail::make_embed_tile_distribution_encoding( c_block_outer_dstr_encoding, typename WG::CWarpDstrEncoding{}); - 
constexpr auto a_block_dstr = make_static_tile_distribution(a_block_dstr_encode); - // constrcut from A-block-tensor from A-Block-tensor-tmp // FIXME: need method to check a_block_tensor and a_block_tensor_tmp have equivalent // distribution - auto a_block_tensor = - make_static_distributed_tensor(a_block_dstr); + auto a_block_tensor = make_static_distributed_tensor( + MakeABlockTileDistribution()); a_block_tensor.get_thread_buffer() = a_block_tensor_tmp.get_thread_buffer(); @@ -187,6 +174,33 @@ struct BlockGemmARegBSmemCRegOneWarpV1 }); } + template + CK_TILE_DEVICE static constexpr auto MakeABlockTileDistribution() + { + constexpr auto config = Policy::template GetWarpGemmMWarpNWarp(); + + using WG = remove_cvref_t())>; + + constexpr index_t MWarp = config.template at<1>(); + constexpr index_t NWarp = config.template at<2>(); + + constexpr index_t MIterPerWarp = MPerBlock / (MWarp * WG::kM); + constexpr index_t KIterPerWarp = KPerBlock / WG::kK; + + constexpr auto a_block_outer_dstr_encoding = + tile_distribution_encoding, + tuple, sequence>, + tuple>, + tuple>, + sequence<1, 2>, + sequence<0, 0>>{}; + + constexpr auto a_block_dstr_encode = detail::make_embed_tile_distribution_encoding( + a_block_outer_dstr_encoding, typename WG::AWarpDstrEncoding{}); + + return make_static_tile_distribution(a_block_dstr_encode); + } + CK_TILE_DEVICE static constexpr auto MakeCBlockTile() { constexpr index_t MPerBlock = BlockGemmShape::kM; diff --git a/include/ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_v2.hpp b/include/ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_v2.hpp index 173ef0a02..0181c0eec 100644 --- a/include/ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_v2.hpp +++ b/include/ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_v2.hpp @@ -59,14 +59,6 @@ struct BlockGemmARegBSmemCRegV2 const index_t iNWarp = get_warp_id() % NWarp; - constexpr auto a_block_outer_dstr_encoding = - tile_distribution_encoding, - tuple, sequence>, - tuple>, - tuple>, - sequence<1, 
2>, - sequence<0, 0>>{}; - constexpr auto c_block_outer_dstr_encoding = tile_distribution_encoding< sequence<>, tuple, sequence>, @@ -75,19 +67,14 @@ struct BlockGemmARegBSmemCRegV2 sequence<1, 2>, sequence<0, 0>>{}; - constexpr auto a_block_dstr_encode = detail::make_embed_tile_distribution_encoding( - a_block_outer_dstr_encoding, typename WG::AWarpDstrEncoding{}); - constexpr auto c_block_dstr_encode = detail::make_embed_tile_distribution_encoding( c_block_outer_dstr_encoding, typename WG::CWarpDstrEncoding{}); - constexpr auto a_block_dstr = make_static_tile_distribution(a_block_dstr_encode); - // constrcut from A-block-tensor from A-Block-tensor-tmp // FIXME: need method to check a_block_tensor and a_block_tensor_tmp have equivalent // distribution - auto a_block_tensor = - make_static_distributed_tensor(a_block_dstr); + auto a_block_tensor = make_static_distributed_tensor( + MakeABlockTileDistribution()); a_block_tensor.get_thread_buffer() = a_block_tensor_tmp.get_thread_buffer(); @@ -182,6 +169,33 @@ struct BlockGemmARegBSmemCRegV2 }); } + template + CK_TILE_DEVICE static constexpr auto MakeABlockTileDistribution() + { + constexpr auto config = Policy::template GetWarpGemmMWarpNWarp(); + + using WG = remove_cvref_t())>; + + constexpr index_t MWarp = config.template at<1>(); + constexpr index_t NWarp = config.template at<2>(); + + constexpr index_t MIterPerWarp = MPerBlock / (MWarp * WG::kM); + constexpr index_t KIterPerWarp = KPerBlock / WG::kK; + + constexpr auto a_block_outer_dstr_encoding = + tile_distribution_encoding, + tuple, sequence>, + tuple>, + tuple>, + sequence<1, 2>, + sequence<0, 0>>{}; + + constexpr auto a_block_dstr_encode = detail::make_embed_tile_distribution_encoding( + a_block_outer_dstr_encoding, typename WG::AWarpDstrEncoding{}); + + return make_static_tile_distribution(a_block_dstr_encode); + } + CK_TILE_DEVICE static constexpr auto MakeCBlockTile() { constexpr index_t MPerBlock = BlockGemmShape::kM; diff --git 
a/include/ck_tile/ops/gemm/warp/warp_gemm.hpp b/include/ck_tile/ops/gemm/warp/warp_gemm.hpp index 89ea82c5b..1fd12973f 100644 --- a/include/ck_tile/ops/gemm/warp/warp_gemm.hpp +++ b/include/ck_tile/ops/gemm/warp/warp_gemm.hpp @@ -56,6 +56,14 @@ using WarpGemmMfmaF16F16F32M32N32K16SwizzleBTransposedCDistribution = WarpGemmAttributeMfmaImplF16F16F32M32N32K8, 2>>; +using WarpGemmMfmaF16F16F32M4N64K16 = WarpGemmImpl, + 4>>; + +using WarpGemmMfmaF16F16F32M64N4K16 = WarpGemmImpl, + 4>>; + // bf16 using WarpGemmMfmaBf16Bf16F32M32N32K8 = WarpGemmImpl< @@ -104,6 +112,14 @@ using WarpGemmMfmaBf16Bf16F32M32N32K16SwizzleBTransposedCDistribution = WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8, 2>>; +using WarpGemmMfmaBf16Bf16F32M4N64K16 = WarpGemmImpl, + 4>>; + +using WarpGemmMfmaBf16Bf16F32M64N4K16 = WarpGemmImpl, + 4>>; + // fp8 using WarpGemmMfma_f32_32x32x16_fp8_fp8 = WarpGemmImpl< diff --git a/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma.hpp b/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma.hpp index a9e466a79..e7d4c3796 100644 --- a/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma.hpp +++ b/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma.hpp @@ -28,6 +28,9 @@ struct WarpGemmAtrributeMfma CK_TILE_HOST_DEVICE static constexpr auto get_num_of_access() { return 1; } + static_assert(Impl::kAMBlock == 1 && Impl::kBNBlock == 1, + "Multi-block WarpGemmAttributeMfmaImpl is not supported"); + using AWarpDstrEncoding = tile_distribution_encoding< sequence<>, tuple, sequence>, @@ -94,30 +97,130 @@ struct WarpGemmAtrributeMfmaIterateK CK_TILE_HOST_DEVICE static constexpr auto get_num_of_access() { return kKIter; } - using AWarpDstrEncoding = tile_distribution_encoding< - sequence<>, - tuple, sequence>, - tuple>, - tuple>, - sequence<2>, - sequence<1>>; + static_assert(Impl::kAMBlock == 1 || Impl::kBNBlock == 1, + "Multi-block on both M & N directions is not supported"); - using BWarpDstrEncoding = tile_distribution_encoding< - sequence<>, - tuple, 
sequence>, - tuple>, - tuple>, - sequence<2>, - sequence<1>>; + CK_TILE_DEVICE static constexpr auto get_awarp_dstr_encoding() + { + if constexpr(Impl::kAMBlock == 1 && Impl::kBNBlock == 1) + { + return tile_distribution_encoding< + sequence<>, + tuple, + sequence>, + tuple>, + tuple>, + sequence<2>, + sequence<1>>{}; + } + else if constexpr(Impl::kAMBlock == 1 && 1 < Impl::kBNBlock) + { + // each M blocks share the same data + return tile_distribution_encoding< + sequence, + tuple, + sequence>, + tuple>, + tuple>, + sequence<2>, + sequence<1>>{}; + } + else if constexpr(1 < Impl::kAMBlock && Impl::kBNBlock == 1) + { + // single block to multi-block thread mapping + return tile_distribution_encoding< + sequence<>, + tuple, + sequence>, + tuple>, + tuple>, + sequence<2>, + sequence<1>>{}; + } + } - using CWarpDstrEncoding = tile_distribution_encoding< - sequence<>, - tuple, - sequence>, - tuple>, - tuple>, - sequence<1, 1>, - sequence<0, 2>>; + CK_TILE_DEVICE static constexpr auto get_bwarp_dstr_encoding() + { + if constexpr(Impl::kAMBlock == 1 && Impl::kBNBlock == 1) + { + return tile_distribution_encoding< + sequence<>, + tuple, + sequence>, + tuple>, + tuple>, + sequence<2>, + sequence<1>>{}; + } + else if constexpr(Impl::kAMBlock == 1 && 1 < Impl::kBNBlock) + { + // single block to multi-block thread mapping + return tile_distribution_encoding< + sequence<>, + tuple, + sequence>, + tuple>, + tuple>, + sequence<2>, + sequence<1>>{}; + } + else if constexpr(1 < Impl::kAMBlock && Impl::kBNBlock == 1) + { + // each N blocks share the same data + return tile_distribution_encoding< + sequence, + tuple, + sequence>, + tuple>, + tuple>, + sequence<2>, + sequence<1>>{}; + } + } + + CK_TILE_DEVICE static constexpr auto get_cwarp_dstr_encoding() + { + if constexpr(Impl::kAMBlock == 1 && Impl::kBNBlock == 1) + { + return tile_distribution_encoding< + sequence<>, + tuple, + sequence>, + tuple>, + tuple>, + sequence<1, 1>, + sequence<0, 2>>{}; + } + else if 
constexpr(Impl::kAMBlock == 1 && 1 < Impl::kBNBlock) + { + return tile_distribution_encoding< + sequence<>, + tuple, + sequence>, + tuple>, + tuple>, + sequence<1, 1>, + sequence<0, 2>>{}; + } + else if constexpr(1 < Impl::kAMBlock && Impl::kBNBlock == 1) + { + return tile_distribution_encoding< + sequence<>, + tuple< + sequence, + sequence>, + tuple>, + tuple>, + sequence<1, 1>, + sequence<0, 2>>{}; + } + } + + using AWarpDstrEncoding = decltype(get_awarp_dstr_encoding()); + + using BWarpDstrEncoding = decltype(get_bwarp_dstr_encoding()); + + using CWarpDstrEncoding = decltype(get_cwarp_dstr_encoding()); // c_vec += a_vec * b_vec template @@ -206,6 +309,9 @@ struct WarpGemmAtrributeMfmaTransposedCDistribution CK_TILE_HOST_DEVICE static constexpr auto get_num_of_access() { return 1; } + static_assert(Impl::kAMBlock == 1 && Impl::kBNBlock == 1, + "Multi-block WarpGemmAttributeMfmaImpl is not supported"); + using AWarpDstrEncoding = tile_distribution_encoding< sequence<>, tuple, sequence>, @@ -270,6 +376,9 @@ struct WarpGemmAtrributeMfmaTransposedCDistribution_SwizzleB CK_TILE_HOST_DEVICE static constexpr auto get_num_of_access() { return 1; } + static_assert(Impl::kAMBlock == 1 && Impl::kBNBlock == 1, + "Multi-block WarpGemmAttributeMfmaImpl is not supported"); + using AWarpDstrEncoding = tile_distribution_encoding< sequence<>, tuple, sequence>, @@ -341,30 +450,130 @@ struct WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution CK_TILE_HOST_DEVICE static constexpr auto get_num_of_access() { return kKIter; } - using AWarpDstrEncoding = tile_distribution_encoding< - sequence<>, - tuple, sequence>, - tuple>, - tuple>, - sequence<2>, - sequence<1>>; + static_assert(Impl::kAMBlock == 1 || Impl::kBNBlock == 1, + "Multi-block on both M & N directions is not supported"); - using BWarpDstrEncoding = tile_distribution_encoding< - sequence<>, - tuple, sequence>, - tuple>, - tuple>, - sequence<2>, - sequence<1>>; + CK_TILE_DEVICE static constexpr auto 
get_awarp_dstr_encoding() + { + if constexpr(Impl::kAMBlock == 1 && Impl::kBNBlock == 1) + { + return tile_distribution_encoding< + sequence<>, + tuple, + sequence>, + tuple>, + tuple>, + sequence<2>, + sequence<1>>{}; + } + else if constexpr(Impl::kAMBlock == 1 && 1 < Impl::kBNBlock) + { + // single block to multi-block thread mapping + return tile_distribution_encoding< + sequence<>, + tuple, + sequence>, + tuple>, + tuple>, + sequence<2>, + sequence<1>>{}; + } + else if constexpr(1 < Impl::kAMBlock && Impl::kBNBlock == 1) + { + // each N blocks share the same data + return tile_distribution_encoding< + sequence, + tuple, + sequence>, + tuple>, + tuple>, + sequence<2>, + sequence<1>>{}; + } + } - using CWarpDstrEncoding = tile_distribution_encoding< - sequence<>, - tuple, - sequence>, - tuple>, - tuple>, - sequence<2, 2>, - sequence<0, 2>>; + CK_TILE_DEVICE static constexpr auto get_bwarp_dstr_encoding() + { + if constexpr(Impl::kAMBlock == 1 && Impl::kBNBlock == 1) + { + return tile_distribution_encoding< + sequence<>, + tuple, + sequence>, + tuple>, + tuple>, + sequence<2>, + sequence<1>>{}; + } + else if constexpr(Impl::kAMBlock == 1 && 1 < Impl::kBNBlock) + { + // each M blocks share the same data + return tile_distribution_encoding< + sequence, + tuple, + sequence>, + tuple>, + tuple>, + sequence<2>, + sequence<1>>{}; + } + else if constexpr(1 < Impl::kAMBlock && Impl::kBNBlock == 1) + { + // single block to multi-block thread mapping + return tile_distribution_encoding< + sequence<>, + tuple, + sequence>, + tuple>, + tuple>, + sequence<2>, + sequence<1>>{}; + } + } + + CK_TILE_DEVICE static constexpr auto get_cwarp_dstr_encoding() + { + if constexpr(Impl::kAMBlock == 1 && Impl::kBNBlock == 1) + { + return tile_distribution_encoding< + sequence<>, + tuple, + sequence>, + tuple>, + tuple>, + sequence<2, 2>, + sequence<0, 2>>{}; + } + else if constexpr(Impl::kAMBlock == 1 && 1 < Impl::kBNBlock) + { + return tile_distribution_encoding< + sequence<>, + tuple, + 
sequence>, + tuple>, + tuple>, + sequence<2, 2>, + sequence<0, 2>>{}; + } + else if constexpr(1 < Impl::kAMBlock && Impl::kBNBlock == 1) + { + return tile_distribution_encoding< + sequence<>, + tuple< + sequence, + sequence>, + tuple>, + tuple>, + sequence<2, 2>, + sequence<0, 2>>{}; + } + } + + using AWarpDstrEncoding = decltype(get_awarp_dstr_encoding()); + + using BWarpDstrEncoding = decltype(get_bwarp_dstr_encoding()); + + using CWarpDstrEncoding = decltype(get_cwarp_dstr_encoding()); template // c_vec += a_vec * b_vec @@ -457,6 +666,9 @@ struct WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution_SwizzleB CK_TILE_HOST_DEVICE static constexpr auto get_num_of_access() { return kKIter; } + static_assert(Impl::kAMBlock == 1 && Impl::kBNBlock == 1, + "Multi-block WarpGemmAttributeMfmaImpl is not supported"); + using AWarpDstrEncoding = tile_distribution_encoding< sequence<>, tuple, sequence>, @@ -597,6 +809,9 @@ struct WarpGemmAtrributeMfmaIterateK_SwizzleA CK_TILE_HOST_DEVICE static constexpr auto get_num_of_access() { return kKIter; } + static_assert(Impl::kAMBlock == 1 && Impl::kBNBlock == 1, + "Multi-block WarpGemmAttributeMfmaImpl is not supported"); + using AWarpDstrEncoding = tile_distribution_encoding< sequence<>, tuple +struct WarpGemmAttributeMfmaImplF16F16F32M4N64K4 +{ + static constexpr WGAttrCtlEnum Ctrl = Ctrl_; + using ADataType = fp16_t; + using BDataType = fp16_t; + using CDataType = float; + + using AVecType = ext_vector_t; + using BVecType = ext_vector_t; + using CVecType = ext_vector_t; + + static constexpr index_t kM = 4; + static constexpr index_t kN = 64; + static constexpr index_t kK = 4; + + static constexpr index_t kAMBlock = 1; + static constexpr index_t kBNBlock = 16; + + // we only write down single block (4 threads) thread mapping here + static constexpr index_t kAMLane = 4; + static constexpr index_t kBNLane = 4; + static constexpr index_t kABKLane = 1; + static constexpr index_t kABKPerLane = 4; + + static constexpr index_t 
kCMLane = 1; + static constexpr index_t kCNLane = 4; + static constexpr index_t kCM0PerLane = 1; + static constexpr index_t kCM1PerLane = 4; + + // c_vec += a_vec * b_vec + template + CK_TILE_DEVICE void operator()(CVecType& c_vec, + const AVecType& a_vec, + const BVecType& b_vec, + bool_constant = {}) const + { + DISPATCH_MFMA_CTRL_("v_mfma_f32_4x4x4f16", Ctrl) + else + { +#if defined(__gfx9__) + c_vec = __builtin_amdgcn_mfma_f32_4x4x4f16(a_vec, b_vec, c_vec, 0, 0, 0); +#else + ignore = c_vec; + ignore = a_vec; + ignore = b_vec; +#endif + } + } + + // c_vec = a_vec * b_vec + CK_TILE_DEVICE CVecType operator()(const AVecType& a_vec, const BVecType& b_vec) const + { +#if defined(__gfx9__) + return bit_cast( + __builtin_amdgcn_mfma_f32_4x4x4f16(a_vec, b_vec, fp32x4_t{0.f}, 0, 0, 0)); +#else + ignore = a_vec; + ignore = b_vec; + return CVecType{0.f}; +#endif + } +}; + +template +struct WarpGemmAttributeMfmaImplF16F16F32M64N4K4 +{ + static constexpr WGAttrCtlEnum Ctrl = Ctrl_; + using ADataType = fp16_t; + using BDataType = fp16_t; + using CDataType = float; + + using AVecType = ext_vector_t; + using BVecType = ext_vector_t; + using CVecType = ext_vector_t; + + static constexpr index_t kM = 64; + static constexpr index_t kN = 4; + static constexpr index_t kK = 4; + + static constexpr index_t kAMBlock = 16; + static constexpr index_t kBNBlock = 1; + + // we only write down single block (4 threads) thread mapping here + static constexpr index_t kAMLane = 4; + static constexpr index_t kBNLane = 4; + static constexpr index_t kABKLane = 1; + static constexpr index_t kABKPerLane = 4; + + static constexpr index_t kCMLane = 1; + static constexpr index_t kCNLane = 4; + static constexpr index_t kCM0PerLane = 1; + static constexpr index_t kCM1PerLane = 4; + + // c_vec += a_vec * b_vec + template + CK_TILE_DEVICE void operator()(CVecType& c_vec, + const AVecType& a_vec, + const BVecType& b_vec, + bool_constant = {}) const + { + DISPATCH_MFMA_CTRL_("v_mfma_f32_4x4x4f16", Ctrl) + 
else + { +#if defined(__gfx9__) + c_vec = __builtin_amdgcn_mfma_f32_4x4x4f16(a_vec, b_vec, c_vec, 0, 0, 0); +#else + ignore = c_vec; + ignore = a_vec; + ignore = b_vec; +#endif + } + } + + // c_vec = a_vec * b_vec + CK_TILE_DEVICE CVecType operator()(const AVecType& a_vec, const BVecType& b_vec) const + { +#if defined(__gfx9__) + return bit_cast( + __builtin_amdgcn_mfma_f32_4x4x4f16(a_vec, b_vec, fp32x4_t{0.f}, 0, 0, 0)); +#else + ignore = a_vec; + ignore = b_vec; + return CVecType{0.f}; +#endif + } +}; + // Bf16 template struct WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8 @@ -199,6 +333,9 @@ struct WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8 static constexpr index_t kN = 32; static constexpr index_t kK = 8; + static constexpr index_t kAMBlock = 1; + static constexpr index_t kBNBlock = 1; + static constexpr index_t kAMLane = 32; static constexpr index_t kBNLane = 32; static constexpr index_t kABKLane = 2; @@ -285,6 +422,9 @@ struct WarpGemmAttributeMfmaImplBf16Bf16F32M16N16K16 static constexpr index_t kN = 16; static constexpr index_t kK = 16; + static constexpr index_t kAMBlock = 1; + static constexpr index_t kBNBlock = 1; + static constexpr index_t kAMLane = 16; static constexpr index_t kBNLane = 16; static constexpr index_t kABKLane = 4; @@ -354,6 +494,134 @@ struct WarpGemmAttributeMfmaImplBf16Bf16F32M16N16K16 } }; +template +struct WarpGemmAttributeMfmaImplBf16Bf16F32M4N64K4 +{ + static constexpr WGAttrCtlEnum Ctrl = Ctrl_; + using ADataType = bf16_t; + using BDataType = bf16_t; + using CDataType = float; + + using AVecType = ext_vector_t; + using BVecType = ext_vector_t; + using CVecType = ext_vector_t; + + static constexpr index_t kM = 4; + static constexpr index_t kN = 64; + static constexpr index_t kK = 4; + + static constexpr index_t kAMBlock = 1; + static constexpr index_t kBNBlock = 16; + + // we only write down single block (4 threads) thread mapping here + static constexpr index_t kAMLane = 4; + static constexpr index_t kBNLane = 4; + static 
constexpr index_t kABKLane = 1; + static constexpr index_t kABKPerLane = 4; + + static constexpr index_t kCMLane = 1; + static constexpr index_t kCNLane = 4; + static constexpr index_t kCM0PerLane = 1; + static constexpr index_t kCM1PerLane = 4; + + // c_vec += a_vec * b_vec + template + CK_TILE_DEVICE void operator()(CVecType& c_vec, + const AVecType& a_vec, + const BVecType& b_vec, + bool_constant = {}) const + { + DISPATCH_MFMA_CTRL_("v_mfma_f32_4x4x4bf16_1k", Ctrl) + else + { +#if defined(__gfx9__) + c_vec = __builtin_amdgcn_mfma_f32_4x4x4bf16_1k(a_vec, b_vec, c_vec, 0, 0, 0); +#else + ignore = c_vec; + ignore = a_vec; + ignore = b_vec; +#endif + } + } + + // c_vec = a_vec * b_vec + CK_TILE_DEVICE CVecType operator()(const AVecType& a_vec, const BVecType& b_vec) const + { +#if defined(__gfx9__) + return bit_cast( + __builtin_amdgcn_mfma_f32_4x4x4bf16_1k(a_vec, b_vec, fp32x4_t{0.f}, 0, 0, 0)); +#else + ignore = a_vec; + ignore = b_vec; + return CVecType{0.f}; +#endif + } +}; + +template +struct WarpGemmAttributeMfmaImplBf16Bf16F32M64N4K4 +{ + static constexpr WGAttrCtlEnum Ctrl = Ctrl_; + using ADataType = bf16_t; + using BDataType = bf16_t; + using CDataType = float; + + using AVecType = ext_vector_t; + using BVecType = ext_vector_t; + using CVecType = ext_vector_t; + + static constexpr index_t kM = 64; + static constexpr index_t kN = 4; + static constexpr index_t kK = 4; + + static constexpr index_t kAMBlock = 16; + static constexpr index_t kBNBlock = 1; + + // we only write down single block (4 threads) thread mapping here + static constexpr index_t kAMLane = 4; + static constexpr index_t kBNLane = 4; + static constexpr index_t kABKLane = 1; + static constexpr index_t kABKPerLane = 4; + + static constexpr index_t kCMLane = 1; + static constexpr index_t kCNLane = 4; + static constexpr index_t kCM0PerLane = 1; + static constexpr index_t kCM1PerLane = 4; + + // c_vec += a_vec * b_vec + template + CK_TILE_DEVICE void operator()(CVecType& c_vec, + const AVecType& 
a_vec, + const BVecType& b_vec, + bool_constant = {}) const + { + DISPATCH_MFMA_CTRL_("v_mfma_f32_4x4x4bf16_1k", Ctrl) + else + { +#if defined(__gfx9__) + c_vec = __builtin_amdgcn_mfma_f32_4x4x4bf16_1k(a_vec, b_vec, c_vec, 0, 0, 0); +#else + ignore = c_vec; + ignore = a_vec; + ignore = b_vec; +#endif + } + } + + // c_vec = a_vec * b_vec + CK_TILE_DEVICE CVecType operator()(const AVecType& a_vec, const BVecType& b_vec) const + { +#if defined(__gfx9__) + return bit_cast( + __builtin_amdgcn_mfma_f32_4x4x4bf16_1k(a_vec, b_vec, fp32x4_t{0.f}, 0, 0, 0)); +#else + ignore = a_vec; + ignore = b_vec; + return CVecType{0.f}; +#endif + } +}; + // FP8 template struct WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base @@ -371,6 +639,9 @@ struct WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base static constexpr index_t kN = 32; static constexpr index_t kK = 16; + static constexpr index_t kAMBlock = 1; + static constexpr index_t kBNBlock = 1; + static constexpr index_t kAMLane = 32; static constexpr index_t kBNLane = 32; static constexpr index_t kABKLane = 2; diff --git a/include/ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp b/include/ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp index 99cd5d787..9c319b5e5 100644 --- a/include/ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp +++ b/include/ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp @@ -29,6 +29,8 @@ template<> struct WarpGemmMfmaDispatcher struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaF16F16F32M16N16K16TransposedCDistribution; }; template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaF16F16F32M16N16K32; }; template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaF16F16F32M16N16K32TransposedCDistribution; }; +template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaF16F16F32M4N64K16; }; +template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaF16F16F32M64N4K16; }; template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaF16F16F32M32N32K8SwizzleA; }; template<> struct 
WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaF16F16F32M32N32K16SwizzleA; }; @@ -42,6 +44,8 @@ template<> struct WarpGemmMfmaDispatcher struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaBf16Bf16F32M16N16K16TransposedCDistribution; }; template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaBf16Bf16F32M16N16K32; }; template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaBf16Bf16F32M16N16K32TransposedCDistribution; }; +template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaBf16Bf16F32M4N64K16; }; +template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaBf16Bf16F32M64N4K16; }; template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaBf16Bf16F32M32N32K8SwizzleA; }; template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaBf16Bf16F32M32N32K16SwizzleA; }; -- GitLab From 1c45ca35dd5c215e0c1db1f40f01556f467f52a8 Mon Sep 17 00:00:00 2001 From: carlushuang Date: Fri, 20 Dec 2024 16:40:45 +0800 Subject: [PATCH 138/153] hot-fix (#1768) --- .../ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma_impl.hpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma_impl.hpp b/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma_impl.hpp index fa24711de..21a865e79 100644 --- a/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma_impl.hpp +++ b/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma_impl.hpp @@ -839,6 +839,9 @@ struct WarpGemmAttributeMfmaImpl_i32_32x32x16_i8 static constexpr index_t kN = 32; static constexpr index_t kK = 16; + static constexpr index_t kAMBlock = 1; + static constexpr index_t kBNBlock = 1; + static constexpr index_t kAMLane = 32; static constexpr index_t kBNLane = 32; static constexpr index_t kABKLane = 2; -- GitLab From 07339c738396ebeae57374771ded4dcf11bddf1e Mon Sep 17 00:00:00 2001 From: Illia Silin <98187287+illsilin@users.noreply.github.com> Date: Fri, 20 Dec 2024 07:52:24 -0800 Subject: [PATCH 139/153] fix typo for 
CK_USE_OCP_FP8 (#1769) --- include/ck/config.h.in | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/ck/config.h.in b/include/ck/config.h.in index 55a498073..2c37300e9 100644 --- a/include/ck/config.h.in +++ b/include/ck/config.h.in @@ -115,8 +115,8 @@ #cmakedefine CK_USE_GFX94 @CK_USE_GFX94@ #endif -#ifndef DCK_USE_OCP_FP8 -#cmakedefine DCK_USE_OCP_FP8 @DCK_USE_OCP_FP8@ +#ifndef CK_USE_OCP_FP8 +#cmakedefine CK_USE_OCP_FP8 @CK_USE_OCP_FP8@ #endif #ifndef CK_USE_FNUZ_FP8 -- GitLab From 3d15f364b367b24ac709ea5687fa2d7d39f07cf9 Mon Sep 17 00:00:00 2001 From: carlushuang Date: Mon, 23 Dec 2024 10:59:02 +0800 Subject: [PATCH 140/153] [CK_TILE] optimize moe-sorting kernel (#1771) * opt moe sorting * remove commented code --- .../13_moe_sorting/moe_sorting_api.cpp | 53 ++-- .../13_moe_sorting/script/smoke_test.sh | 3 +- .../instances/fused_moesorting_api.cpp | 53 ++-- .../fused_moe/kernel/moe_sorting_kernel.hpp | 247 +++++++++++++++--- .../pipeline/moe_sorting_problem.hpp | 13 +- 5 files changed, 289 insertions(+), 80 deletions(-) diff --git a/example/ck_tile/13_moe_sorting/moe_sorting_api.cpp b/example/ck_tile/13_moe_sorting/moe_sorting_api.cpp index 25e99c530..723fb3f69 100644 --- a/example/ck_tile/13_moe_sorting/moe_sorting_api.cpp +++ b/example/ck_tile/13_moe_sorting/moe_sorting_api.cpp @@ -3,18 +3,42 @@ #include "moe_sorting_api.hpp" -#define MOE_SORTING_DISPATCH(unroll_num_) \ - constexpr ck_tile::index_t unroll_num = unroll_num_; \ - using ms_problem = ck_tile::MoeSortingProblem; \ - using kernel = ck_tile::MoeSortingKernel; \ - auto kargs = kernel::MakeKargs(a); \ - const dim3 grids = kernel::GridSize(a); \ - const dim3 blocks = kernel::BlockSize(a); \ - const auto lds_bytes = kernel::GetSmemSize(a); \ - float ave_time = ck_tile::launch_kernel( \ - s, ck_tile::make_kernel(kernel{}, grids, blocks, lds_bytes, kargs)); \ +#define MOE_SORTING_DISPATCH_ETILE(unroll_num_, expert_tile_) \ + constexpr ck_tile::index_t unroll_num = unroll_num_; \ 
+ constexpr ck_tile::index_t expert_tile = expert_tile_; \ + using ms_problem = \ + ck_tile::MoeSortingProblem; \ + using kernel = ck_tile::MoeSortingKernel; \ + auto kargs = kernel::MakeKargs(a); \ + const dim3 grids = kernel::GridSize(a); \ + const dim3 blocks = kernel::BlockSize(a); \ + const auto lds_bytes = kernel::GetSmemSize(a); \ + float ave_time = ck_tile::launch_kernel( \ + s, ck_tile::make_kernel(kernel{}, grids, blocks, lds_bytes, kargs)); \ return ave_time; +#define MOE_SORTING_DISPATCH(unroll_num_) \ + if(a.num_experts <= 8) \ + { \ + MOE_SORTING_DISPATCH_ETILE(unroll_num_, 8) \ + } \ + else if(a.num_experts <= 16) \ + { \ + MOE_SORTING_DISPATCH_ETILE(unroll_num_, 16) \ + } \ + else if(a.num_experts <= 32) \ + { \ + MOE_SORTING_DISPATCH_ETILE(unroll_num_, 32) \ + } \ + else if(a.num_experts <= 64) \ + { \ + MOE_SORTING_DISPATCH_ETILE(unroll_num_, 64) \ + } \ + else \ + { \ + MOE_SORTING_DISPATCH_ETILE(unroll_num_, 0) \ + } + float moe_sorting(moe_sorting_trait t, moe_sorting_args a, ck_tile::stream_config s) { if(t.weight_type == "fp32" && t.index_type == "int32") @@ -49,21 +73,12 @@ float moe_sorting(moe_sorting_trait t, moe_sorting_args a, ck_tile::stream_confi case(6): { MOE_SORTING_DISPATCH(6); } - case(7): { - MOE_SORTING_DISPATCH(7); - } case(8): { MOE_SORTING_DISPATCH(8); } - case(9): { - MOE_SORTING_DISPATCH(9); - } case(10): { MOE_SORTING_DISPATCH(10); } - case(11): { - MOE_SORTING_DISPATCH(11); - } default: { MOE_SORTING_DISPATCH(4); } diff --git a/example/ck_tile/13_moe_sorting/script/smoke_test.sh b/example/ck_tile/13_moe_sorting/script/smoke_test.sh index 1fc5eafcb..3ff8a7332 100644 --- a/example/ck_tile/13_moe_sorting/script/smoke_test.sh +++ b/example/ck_tile/13_moe_sorting/script/smoke_test.sh @@ -16,4 +16,5 @@ $EXE -t=127 -e=99 -k=19 $EXE -t=71 -e=11 -k=11 $EXE -t=1 -e=1 -k=1 $EXE -t=99 -e=2 -k=1 -$EXE -t=333 -e=99 -k=13 \ No newline at end of file +$EXE -t=333 -e=99 -k=13 +$EXE -t=128 -e=32 -k=5 -moe_buf_size=262144 diff --git 
a/example/ck_tile/15_fused_moe/instances/fused_moesorting_api.cpp b/example/ck_tile/15_fused_moe/instances/fused_moesorting_api.cpp index 75aaf86b7..7ca24c5c9 100644 --- a/example/ck_tile/15_fused_moe/instances/fused_moesorting_api.cpp +++ b/example/ck_tile/15_fused_moe/instances/fused_moesorting_api.cpp @@ -3,18 +3,42 @@ #include "fused_moesorting.hpp" -#define MOE_SORTING_DISPATCH(unroll_num_) \ - constexpr ck_tile::index_t unroll_num = unroll_num_; \ - using ms_problem = ck_tile::MoeSortingProblem; \ - using kernel = ck_tile::MoeSortingKernel; \ - auto kargs = kernel::MakeKargs(a); \ - const dim3 grids = kernel::GridSize(a); \ - const dim3 blocks = kernel::BlockSize(a); \ - const auto lds_bytes = kernel::GetSmemSize(a); \ - float ave_time = ck_tile::launch_kernel( \ - s, ck_tile::make_kernel(kernel{}, grids, blocks, lds_bytes, kargs)); \ +#define MOE_SORTING_DISPATCH_ETILE(unroll_num_, expert_tile_) \ + constexpr ck_tile::index_t unroll_num = unroll_num_; \ + constexpr ck_tile::index_t expert_tile = expert_tile_; \ + using ms_problem = \ + ck_tile::MoeSortingProblem; \ + using kernel = ck_tile::MoeSortingKernel; \ + auto kargs = kernel::MakeKargs(a); \ + const dim3 grids = kernel::GridSize(a); \ + const dim3 blocks = kernel::BlockSize(a); \ + const auto lds_bytes = kernel::GetSmemSize(a); \ + float ave_time = ck_tile::launch_kernel( \ + s, ck_tile::make_kernel(kernel{}, grids, blocks, lds_bytes, kargs)); \ return ave_time; +#define MOE_SORTING_DISPATCH(unroll_num_) \ + if(a.num_experts <= 8) \ + { \ + MOE_SORTING_DISPATCH_ETILE(unroll_num_, 8) \ + } \ + else if(a.num_experts <= 16) \ + { \ + MOE_SORTING_DISPATCH_ETILE(unroll_num_, 16) \ + } \ + else if(a.num_experts <= 32) \ + { \ + MOE_SORTING_DISPATCH_ETILE(unroll_num_, 32) \ + } \ + else if(a.num_experts <= 64) \ + { \ + MOE_SORTING_DISPATCH_ETILE(unroll_num_, 64) \ + } \ + else \ + { \ + MOE_SORTING_DISPATCH_ETILE(unroll_num_, 0) \ + } + float fused_moesorting(fused_moesorting_trait t, fused_moesorting_args 
a, ck_tile::stream_config s) { if(t.weight_type == "fp32" && t.index_type == "int32") @@ -49,21 +73,12 @@ float fused_moesorting(fused_moesorting_trait t, fused_moesorting_args a, ck_til case(6): { MOE_SORTING_DISPATCH(6); } - case(7): { - MOE_SORTING_DISPATCH(7); - } case(8): { MOE_SORTING_DISPATCH(8); } - case(9): { - MOE_SORTING_DISPATCH(9); - } case(10): { MOE_SORTING_DISPATCH(10); } - case(11): { - MOE_SORTING_DISPATCH(11); - } default: { MOE_SORTING_DISPATCH(4); } diff --git a/include/ck_tile/ops/fused_moe/kernel/moe_sorting_kernel.hpp b/include/ck_tile/ops/fused_moe/kernel/moe_sorting_kernel.hpp index d9e28ceb5..30e68996b 100644 --- a/include/ck_tile/ops/fused_moe/kernel/moe_sorting_kernel.hpp +++ b/include/ck_tile/ops/fused_moe/kernel/moe_sorting_kernel.hpp @@ -130,7 +130,8 @@ struct MoeSortingKernel CK_TILE_HOST static constexpr auto GetSmemSize(const Hargs& h) { const auto blocks = BlockSize(h); - return ((blocks.x + 1) * h.num_experts + (h.num_experts + 1)) * sizeof(index_t); + // usually num_experts is power of 2, we pad 1 dword here for the row-size + return ((blocks.x + 1) * (h.num_experts + 1) + (h.num_experts + 1)) * sizeof(index_t); } CK_TILE_HOST static constexpr auto MakeKargs(const Hargs& h) @@ -154,6 +155,75 @@ struct MoeSortingKernel return k; } + // [a, b, c, d....] -> [a, a+b, a+b+c, a+b+c+d, ....] + template + __device__ inline void wave_cumsum(data_t& thread_data) const + { + // wave_size must be power of 2 + constexpr int row_mask = 0xf; + constexpr int bank_mask = 0xf; + constexpr bool bound_ctrl = true; // ! out-of-bound is zero ! 
+ auto reduce_op = [&](auto x_, auto y_) { return x_ + y_; }; + + if constexpr(wave_size > 1) + { + thread_data = reduce_op( + thread_data, + __builtin_bit_cast(data_t, __builtin_amdgcn_mov_dpp(__builtin_bit_cast(int, thread_data), + 0x111, + row_mask, + bank_mask, + bound_ctrl))); // row_shr:1 + } + + if constexpr(wave_size > 2) + { + thread_data = reduce_op( + thread_data, + __builtin_bit_cast(data_t, __builtin_amdgcn_mov_dpp(__builtin_bit_cast(int, thread_data), + 0x112, + row_mask, + bank_mask, + bound_ctrl))); // row_shr:2 + } + if constexpr(wave_size > 4) + { + thread_data = + reduce_op(thread_data, + __builtin_bit_cast(data_t, __builtin_amdgcn_mov_dpp(__builtin_bit_cast(int, thread_data), + 0x114, + row_mask, + bank_mask, + bound_ctrl))); // row_shr:4 + } + if constexpr(wave_size > 8) + { + thread_data = + reduce_op(thread_data, + __builtin_bit_cast(data_t, __builtin_amdgcn_mov_dpp(__builtin_bit_cast(int, thread_data), + 0x118, + row_mask, + bank_mask, + bound_ctrl))); // row_shr:8 + } + + if constexpr(wave_size > 16) + { + // now row-0, row-0+row-1, row-1+row-2, row-2+row-3 + int v_remote_tmp = __builtin_amdgcn_ds_bpermute(((__lane_id() & 0x30) - 1) << 2, __builtin_bit_cast(int, thread_data)); + v_remote_tmp = __lane_id() >= 16 ? v_remote_tmp : 0; + thread_data = reduce_op(thread_data, __builtin_bit_cast(data_t, v_remote_tmp)); + } + + if constexpr(wave_size > 32) + { + // lane-id 48...63->31 + int v_remote_tmp = __builtin_amdgcn_ds_bpermute(((__lane_id() & 0x30) - 17) << 2, __builtin_bit_cast(int, thread_data)); + v_remote_tmp = __lane_id() >= 32 ? 
v_remote_tmp : 0; + thread_data = reduce_op(thread_data, __builtin_bit_cast(data_t, v_remote_tmp)); + } + } + CK_TILE_DEVICE index_t calc_index(index_t total_col, index_t row, index_t col) const { return row * total_col + col; @@ -187,48 +257,124 @@ struct MoeSortingKernel index_t* shared_mem = reinterpret_cast(smem); index_t* tokens_cnts = shared_mem; // 2d: (blockDim.x + 1, num_experts) - index_t* cumsum = shared_mem + (blockDim.x + 1) * num_experts; // 1: (num_experts + 1) + index_t* cumsum = shared_mem + (blockDim.x + 1) * (num_experts+1); // 1: (num_experts + 1) + for(int i = 0; i < num_experts; ++i) { - tokens_cnts[calc_index(num_experts, tid + 1, i)] = 0; + tokens_cnts[calc_index(num_experts+1, tid + 1, i)] = 0; } + #pragma unroll Problem_::InternalLoadUnroll for(int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) { - ++tokens_cnts[calc_index(num_experts, tid + 1, topk_id[i])]; + ++tokens_cnts[calc_index(num_experts+1, tid + 1, topk_id[i])]; } __syncthreads(); +#if 1 if(tid < num_experts) { - tokens_cnts[calc_index(num_experts, 0, tid)] = 0; - for(int i = 1; i <= static_cast(blockDim.x); ++i) + tokens_cnts[calc_index(num_experts+1, 0, tid)] = 0; + index_t local_c[8]; + index_t prev_c = 0; + // TODO: manually unroll. 
pragma unroll does not work well when we have dependency + for(int i = 1; i <= static_cast(blockDim.x); i+= 8) { - tokens_cnts[calc_index(num_experts, i, tid)] += - tokens_cnts[calc_index(num_experts, i - 1, tid)]; + local_c[0] = tokens_cnts[calc_index(num_experts+1, i + 0, tid)]; + local_c[1] = tokens_cnts[calc_index(num_experts+1, i + 1, tid)]; + local_c[2] = tokens_cnts[calc_index(num_experts+1, i + 2, tid)]; + local_c[3] = tokens_cnts[calc_index(num_experts+1, i + 3, tid)]; + local_c[4] = tokens_cnts[calc_index(num_experts+1, i + 4, tid)]; + local_c[5] = tokens_cnts[calc_index(num_experts+1, i + 5, tid)]; + local_c[6] = tokens_cnts[calc_index(num_experts+1, i + 6, tid)]; + local_c[7] = tokens_cnts[calc_index(num_experts+1, i + 7, tid)]; + + local_c[0] += prev_c; + local_c[1] += local_c[0]; + local_c[2] += local_c[1]; + local_c[3] += local_c[2]; + local_c[4] += local_c[3]; + local_c[5] += local_c[4]; + local_c[6] += local_c[5]; + local_c[7] += local_c[6]; + prev_c = local_c[7]; + + tokens_cnts[calc_index(num_experts+1, i + 0, tid)] = local_c[0]; + tokens_cnts[calc_index(num_experts+1, i + 1, tid)] = local_c[1]; + tokens_cnts[calc_index(num_experts+1, i + 2, tid)] = local_c[2]; + tokens_cnts[calc_index(num_experts+1, i + 3, tid)] = local_c[3]; + tokens_cnts[calc_index(num_experts+1, i + 4, tid)] = local_c[4]; + tokens_cnts[calc_index(num_experts+1, i + 5, tid)] = local_c[5]; + tokens_cnts[calc_index(num_experts+1, i + 6, tid)] = local_c[6]; + tokens_cnts[calc_index(num_experts+1, i + 7, tid)] = local_c[7]; } } - - // __syncthreads(); - if(tid == 0) +#else + // TODO: below code still working, but slow in expert=32/topk=5 case. 
Put here for future heuristic { - cumsum[0] = 0; - for(int i = 1; i <= num_experts; ++i) + if(tid < num_experts) + tokens_cnts[calc_index(num_experts+1, 0, tid)] = 0; + for(int i = 0; i < num_experts; i+=8) { + index_t local_c[8]; + #pragma unroll + for(int j = 0; j < 8; j++) { + local_c[j] = tokens_cnts[calc_index(num_experts+1, tid+1, i+j)]; + } + + #pragma unroll + for(int j = 0; j < 8; j++) { + wave_cumsum(local_c[j]); + } + + #pragma unroll + for(int j = 0; j < 8; j++) { + tokens_cnts[calc_index(num_experts+1, tid+1, i+j)] = local_c[j]; + } + } + } +#endif + + __syncthreads(); + if constexpr (Problem::ExpertTile == 0) { + if(tid == 0) { - auto current_units = [&]() { - index_t x_ = tokens_cnts[calc_index(num_experts, blockDim.x, i - 1)] + - unit_size_mdiv.divisor - 1; - index_t y_ = unit_size_mdiv.div(x_); - return max(y_, 1) * unit_size_mdiv.divisor; - }(); - cumsum[i] = cumsum[i - 1] + current_units; + cumsum[0] = 0; + for(int i = 1; i <= num_experts; ++i) + { + auto current_units = [&]() { + index_t x_ = tokens_cnts[calc_index(num_experts+1, blockDim.x, i - 1)] + + unit_size_mdiv.divisor - 1; + index_t y_ = unit_size_mdiv.div(x_); + return max(y_, 1) * unit_size_mdiv.divisor; + }(); + cumsum[i] = cumsum[i - 1] + current_units; + } + *p_total_tokens_post_pad = cumsum[num_experts]; + } + } else { + // TODO: we have out-of-bound read here. But result is still OK (will ignore tid >= expert) + // for simplicity, not check experts here. 
+ int local_cnt = tokens_cnts[calc_index(num_experts+1, blockDim.x, tid)]; + int blocks_pers_expert = unit_size_mdiv.div(local_cnt + unit_size_mdiv.divisor - 1); + int padded_tokens_per_expert = max(blocks_pers_expert, 1) * unit_size_mdiv.divisor; + int local_cumsum = padded_tokens_per_expert; + wave_cumsum(local_cumsum); + + if(tid == (num_experts - 1)) { + cumsum[0] = 0; + *p_total_tokens_post_pad = local_cumsum; + } + if(tid < num_experts) { + cumsum[tid + 1] = local_cumsum; } - *p_total_tokens_post_pad = cumsum[num_experts]; } + __syncthreads(); if(tid < num_experts) { - for(int i = cumsum[tid]; i < cumsum[tid + 1]; i += unit_size_mdiv.divisor) + int e_start = cumsum[tid]; + int e_end = cumsum[tid + 1]; + for(int i = e_start; i < e_end; i += unit_size_mdiv.divisor) { p_sorted_expert_ids[unit_size_mdiv.div(i)] = tid; } @@ -238,8 +384,8 @@ struct MoeSortingKernel for(int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) { index_t expert_id = topk_id[i]; - index_t rank_post_pad = - tokens_cnts[calc_index(num_experts, tid, expert_id)] + cumsum[expert_id]; + index_t local_cnt = tokens_cnts[calc_index(num_experts+1, tid, expert_id)]; + index_t rank_post_pad = local_cnt + cumsum[expert_id]; #if CK_TILE_REFERENCE_MOE_SORTING_MOCK_ID uint32_t curr_token_id, curr_topk_id; topk_mdiv.divmod(i, curr_token_id, curr_topk_id); @@ -247,27 +393,54 @@ struct MoeSortingKernel #else p_sorted_token_ids[rank_post_pad] = topk_mdiv.div(i); #endif - p_sorted_weights[rank_post_pad] = weights[i]; - ++tokens_cnts[calc_index(num_experts, tid, expert_id)]; + p_sorted_weights[rank_post_pad] = weights[i]; + tokens_cnts[calc_index(num_experts+1, tid, expert_id)] = local_cnt+1; } - const index_t prefill_token = topk_mdiv.div(numel); - if(tid < num_experts) - { - index_t expert_offset = - cumsum[tid] + tokens_cnts[calc_index(num_experts, blockDim.x, tid)]; - while(expert_offset < cumsum[tid + 1]) + if constexpr (Problem::ExpertTile == 0) { + const index_t prefill_token = 
topk_mdiv.div(numel); + if(tid < num_experts) { + index_t expert_offset = + cumsum[tid] + tokens_cnts[calc_index(num_experts+1, blockDim.x, tid)]; + index_t expert_end = cumsum[tid + 1]; + while(expert_offset < expert_end) + { #if CK_TILE_REFERENCE_MOE_SORTING_MOCK_ID - p_sorted_token_ids[expert_offset] = - MOE_SORTING_MOCK_ID(prefill_token, topk_mdiv.divisor); + p_sorted_token_ids[expert_offset] = + MOE_SORTING_MOCK_ID(prefill_token, topk_mdiv.divisor); #else - p_sorted_token_ids[expert_offset] = prefill_token; + p_sorted_token_ids[expert_offset] = prefill_token; #endif - p_sorted_weights[expert_offset] = static_cast(0.0); - expert_offset++; + p_sorted_weights[expert_offset] = static_cast(0.0); + expert_offset++; + } } } + else { + const index_t prefill_token = topk_mdiv.div(numel); + // TODO: only support expert-tile like 8, 16, 32 + static constexpr index_t experts_per_wave = warpSize / Problem::ExpertTile; + { + index_t eid = tid / experts_per_wave; + index_t expert_offset = + cumsum[eid] + tokens_cnts[calc_index(num_experts+1, blockDim.x, eid)] + tid % experts_per_wave; + index_t expert_end = cumsum[eid + 1]; + if(eid < num_experts) { + while(expert_offset < expert_end) + { +#if CK_TILE_REFERENCE_MOE_SORTING_MOCK_ID + p_sorted_token_ids[expert_offset] = + MOE_SORTING_MOCK_ID(prefill_token, topk_mdiv.divisor); +#else + p_sorted_token_ids[expert_offset] = prefill_token; +#endif + p_sorted_weights[expert_offset] = static_cast(0.0); + expert_offset+=experts_per_wave; + } + } + } + } } CK_TILE_DEVICE void operator()(Kargs kargs) const diff --git a/include/ck_tile/ops/fused_moe/pipeline/moe_sorting_problem.hpp b/include/ck_tile/ops/fused_moe/pipeline/moe_sorting_problem.hpp index adde59e35..50005c440 100644 --- a/include/ck_tile/ops/fused_moe/pipeline/moe_sorting_problem.hpp +++ b/include/ck_tile/ops/fused_moe/pipeline/moe_sorting_problem.hpp @@ -9,15 +9,20 @@ namespace ck_tile { -template +template struct MoeSortingProblem { // TODO: this kernel only support warp 
per row using WeightType = remove_cvref_t; using IndexType = remove_cvref_t; - static constexpr index_t WarpSize = get_warp_size(); - static constexpr index_t WarpsPerBlock = 1; - static constexpr index_t InternalLoadUnroll = InternalLoadUnroll_; + static constexpr index_t WarpSize = get_warp_size(); + static constexpr index_t WarpsPerBlock = 1; + static constexpr index_t InternalLoadUnroll = + InternalLoadUnroll_; // TODO: need better design(like tile size) + static constexpr index_t ExpertTile = ExpertTile_; // TODO: only used in store out }; } // namespace ck_tile -- GitLab From 4c2eff023a26821512a100171531dc8757ad0e8f Mon Sep 17 00:00:00 2001 From: Po Yen Chen Date: Wed, 25 Dec 2024 23:57:28 +0800 Subject: [PATCH 141/153] Correct the dtype checking logics (#1775) --- example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py index df5b9cecc..2f7edd547 100644 --- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py +++ b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py @@ -261,7 +261,7 @@ FMHA_FWD_SPLITKV_API_INNER_DISPATCH=""" {F_if}((t.is_group_mode == {F static_assert({F_bn1} % 32 == 0); if (t.has_lse) {{ - if constexpr (std::is_same_v<{F_dtype}, ck_tile::fp8_t>) {{ + if constexpr (std::is_same_v<{F_dtype}, FmhaFwdFp8>) {{ return -1; }} else {{ using traits2_ = fmha_fwd_splitkv_combine_traits_<{F_hdim}, {F_dtype}, {F_mode}, /*F_bn1=*/32, true, {F_squant}, {F_spad}, {F_dvpad}>; @@ -614,7 +614,7 @@ def get_fmha_fwd_splitkv_combine_tile_dict_from_dtype(dtype : str) -> Optional[d } elif dtype == 'fp8' or dtype == 'bf8': return { - '64' : FmhaFwdSplitKVCombineTileSize(32, -1), + '64' : FmhaFwdSplitKVCombineTileSize(32, -1), '128' : FmhaFwdSplitKVCombineTileSize(32, -1), '256' : FmhaFwdSplitKVCombineTileSize(32, -1), } -- GitLab From af66494880fc6256e5e1ced779b6d80446726970 
Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bart=C5=82omiej=20Kocot?= Date: Sat, 28 Dec 2024 14:40:17 +0100 Subject: [PATCH 142/153] [CK TILE] GEMM and Batched GEMM SplitK support (#1724) * [CK TILE] Add split K support in GEMM * Updates * Fixes * rebase * fix * Fix * fixes * support for batched gemm --- example/ck_tile/03_gemm/gemm_basic.hpp | 6 +- example/ck_tile/03_gemm/run_gemm_example.inc | 8 +- example/ck_tile/03_gemm/universal_gemm.cpp | 20 +-- .../ck_tile/16_batched_gemm/batched_gemm.cpp | 13 +- .../ck_tile/16_batched_gemm/batched_gemm.hpp | 3 +- .../run_batched_gemm_example.inc | 4 + .../ops/epilogue/cshuffle_epilogue.hpp | 31 +++- .../ops/epilogue/default_2d_epilogue.hpp | 26 ++- .../ops/gemm/kernel/batched_gemm_kernel.hpp | 32 +++- .../ck_tile/ops/gemm/kernel/gemm_kernel.hpp | 164 +++++++++++++----- .../gemm_pipeline_ag_bg_cr_comp_v3.hpp | 2 + .../pipeline/gemm_pipeline_ag_bg_cr_mem.hpp | 2 + .../gemm_pipeline_agmem_bgmem_creg_v1.hpp | 2 + ...ine_agmem_bgmem_creg_v1_default_policy.hpp | 14 +- .../gemm_pipeline_agmem_bgmem_creg_v2.hpp | 2 + ...emm_universal_pipeline_ag_bg_cr_policy.hpp | 2 + .../batched_gemm/test_batched_gemm_util.hpp | 3 +- test/ck_tile/gemm/test_gemm_pipeline_util.hpp | 4 +- 18 files changed, 246 insertions(+), 92 deletions(-) diff --git a/example/ck_tile/03_gemm/gemm_basic.hpp b/example/ck_tile/03_gemm/gemm_basic.hpp index 58cdaea7d..38c0a279d 100644 --- a/example/ck_tile/03_gemm/gemm_basic.hpp +++ b/example/ck_tile/03_gemm/gemm_basic.hpp @@ -54,8 +54,7 @@ using CDataType = Types::CDataType; auto create_args(int argc, char* argv[]) { ck_tile::ArgParser arg_parser; - arg_parser.insert("b", "1", "batch size") - .insert("m", "3840", "m dimension") + arg_parser.insert("m", "3840", "m dimension") .insert("n", "4096", "n dimension") .insert("k", "2048", "k dimension") .insert("a_layout", "R", "A tensor data layout - Row by default") @@ -68,7 +67,8 @@ auto create_args(int argc, char* argv[]) .insert("prec", "fp16", "data type. 
fp16/bf16/fp8/bf8") .insert("warmup", "50", "number of iterations before benchmark the kernel") .insert("repeat", "100", "number of iterations to benchmark the kernel") - .insert("timer", "gpu", "gpu:gpu timer, cpu:cpu timer"); + .insert("timer", "gpu", "gpu:gpu timer, cpu:cpu timer") + .insert("split_k", "1", "splitK value"); bool result = arg_parser.parse(argc, argv); return std::make_tuple(result, arg_parser); diff --git a/example/ck_tile/03_gemm/run_gemm_example.inc b/example/ck_tile/03_gemm/run_gemm_example.inc index 68df389bf..56d0348bd 100644 --- a/example/ck_tile/03_gemm/run_gemm_example.inc +++ b/example/ck_tile/03_gemm/run_gemm_example.inc @@ -64,9 +64,9 @@ int run_gemm_example_with_layouts(int argc, ck_tile::index_t stride_B = arg_parser.get_int("stride_b"); ck_tile::index_t stride_C = arg_parser.get_int("stride_c"); - ck_tile::index_t batch_size = arg_parser.get_int("b"); - int n_warmup = arg_parser.get_int("warmup"); - int n_repeat = arg_parser.get_int("repeat"); + ck_tile::index_t kbatch = arg_parser.get_int("split_k"); + int n_warmup = arg_parser.get_int("warmup"); + int n_repeat = arg_parser.get_int("repeat"); using namespace ck_tile::literals; @@ -133,7 +133,7 @@ int run_gemm_example_with_layouts(int argc, stride_A, stride_B, stride_C, - batch_size, + kbatch, n_warmup, n_repeat); diff --git a/example/ck_tile/03_gemm/universal_gemm.cpp b/example/ck_tile/03_gemm/universal_gemm.cpp index 6c87ca008..1a9e025a9 100644 --- a/example/ck_tile/03_gemm/universal_gemm.cpp +++ b/example/ck_tile/03_gemm/universal_gemm.cpp @@ -22,7 +22,7 @@ #endif template -float gemm_calc(const gemm_basic_args& args, const ck_tile::stream_config& s) +float gemm_calc(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& s) { #if(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_MEMORY) // Memory friendly for Interwave scheduler @@ -78,7 +78,9 @@ float gemm_calc(const gemm_basic_args& args, const ck_tile::stream_config& s) #endif ck_tile::GemmPipelineProblem>; - const 
ck_tile::index_t num_loop = TilePartitioner::GetLoopNum(args.K); + const ck_tile::index_t k_grain = args.k_batch * K_Tile; + const ck_tile::index_t K_split = (args.K + k_grain - 1) / k_grain * K_Tile; + const ck_tile::index_t num_loop = TilePartitioner::GetLoopNum(K_split); const bool has_hot_loop = BaseGemmPipeline::BlockHasHotloop(num_loop); const ck_tile::TailNumber tail_num = BaseGemmPipeline::GetBlockLoopTailNum(num_loop); @@ -106,17 +108,9 @@ float gemm_calc(const gemm_basic_args& args, const ck_tile::stream_config& s) has_hot_loop_v, tail_number_v>>; using Kernel = ck_tile::GemmKernel; - auto kargs = Kernel::MakeKargs(args.p_a, - args.p_b, - args.p_c, - args.M, - args.N, - args.K, - args.stride_A, - args.stride_B, - args.stride_C); - - const dim3 grids = Kernel::GridSize(args.M, args.N, args.kbatch); + auto kargs = Kernel::MakeKernelArgs(args); + + const dim3 grids = Kernel::GridSize(args.M, args.N, args.k_batch); constexpr dim3 blocks = Kernel::BlockSize(); if(!Kernel::IsSupportedArgument(kargs)) diff --git a/example/ck_tile/16_batched_gemm/batched_gemm.cpp b/example/ck_tile/16_batched_gemm/batched_gemm.cpp index 9b4ed9a9e..b9c9eaa58 100644 --- a/example/ck_tile/16_batched_gemm/batched_gemm.cpp +++ b/example/ck_tile/16_batched_gemm/batched_gemm.cpp @@ -70,20 +70,25 @@ float batched_gemm(const ck_tile::BatchedGemmHostArgs& args, const ck_tile::stre using CodegenGemmTraits = ck_tile::TileGemmTraits; - using CodegenPipelineProblem = ck_tile:: GemmPipelineProblem; - - using CodegenGemmPipeline = ck_tile::GemmPipelineAGmemBGmemCRegV1; + using CodegenGemmPolicy = ck_tile::UniversalGemmPipelineAgBgCrPolicy; + using CodegenGemmPipeline = + ck_tile::GemmPipelineAGmemBGmemCRegV1; // ToDo: Will add the codegen part to test different pipeline policies in GEMM. // Now we only use the BlockGemmASmemBSmemCRegV1DefaultPolicy. 
using Kernel = ck_tile::BatchedGemmKernel; auto kargs = Kernel::MakeKernelArgs(args); - const dim3 grids = Kernel::GridSize(args.M, args.N, args.batch_count); + const dim3 grids = Kernel::GridSize(args.M, args.N, args.k_batch, args.batch_count); constexpr dim3 blocks = Kernel::BlockSize(); + if(!Kernel::IsSupportedArgument(kargs)) + { + throw std::runtime_error("Wrong! Arguments not supported! Skipping gemm!\n"); + } + if(s.log_level_ > 0) { std::cout << "Launching kernel with args:" diff --git a/example/ck_tile/16_batched_gemm/batched_gemm.hpp b/example/ck_tile/16_batched_gemm/batched_gemm.hpp index f0c0c9efb..62f0058fd 100644 --- a/example/ck_tile/16_batched_gemm/batched_gemm.hpp +++ b/example/ck_tile/16_batched_gemm/batched_gemm.hpp @@ -49,7 +49,8 @@ auto create_args(int argc, char* argv[]) .insert("prec", "fp16", "data type. fp16/bf16/fp8/bf8") .insert("warmup", "50", "number of iterations before benchmark the kernel") .insert("repeat", "100", "number of iterations to benchmark the kernel") - .insert("timer", "gpu", "gpu:gpu timer, cpu:cpu timer"); + .insert("timer", "gpu", "gpu:gpu timer, cpu:cpu timer") + .insert("split_k", "1", "splitK value"); bool result = arg_parser.parse(argc, argv); return std::make_tuple(result, arg_parser); diff --git a/example/ck_tile/16_batched_gemm/run_batched_gemm_example.inc b/example/ck_tile/16_batched_gemm/run_batched_gemm_example.inc index 4e7218b5b..c14bb5668 100644 --- a/example/ck_tile/16_batched_gemm/run_batched_gemm_example.inc +++ b/example/ck_tile/16_batched_gemm/run_batched_gemm_example.inc @@ -17,6 +17,7 @@ float invoke_batched_gemm(ck_tile::DeviceMem& a_m_k_dev_buf, ck_tile::index_t batch_stride_B, ck_tile::index_t batch_stride_C, ck_tile::index_t batch_count, + ck_tile::index_t kbatch, int n_warmup, int n_repeat) { @@ -24,6 +25,7 @@ float invoke_batched_gemm(ck_tile::DeviceMem& a_m_k_dev_buf, args.a_ptr = a_m_k_dev_buf.GetDeviceBuffer(); args.b_ptr = b_k_n_dev_buf.GetDeviceBuffer(); args.c_ptr = 
c_m_n_dev_buf.GetDeviceBuffer(); + args.k_batch = kbatch; args.M = M; args.N = N; args.K = K; @@ -79,6 +81,7 @@ int run_batched_gemm_example_with_layouts(int argc, ck_tile::index_t batch_stride_B = arg_parser.get_int("batch_stride_b"); ck_tile::index_t batch_stride_C = arg_parser.get_int("batch_stride_c"); ck_tile::index_t batch_count = arg_parser.get_int("batch_count"); + ck_tile::index_t kbatch = arg_parser.get_int("split_k"); int n_warmup = arg_parser.get_int("warmup"); int n_repeat = arg_parser.get_int("repeat"); @@ -159,6 +162,7 @@ int run_batched_gemm_example_with_layouts(int argc, batch_stride_B, batch_stride_C, batch_count, + kbatch, n_warmup, n_repeat); diff --git a/include/ck_tile/ops/epilogue/cshuffle_epilogue.hpp b/include/ck_tile/ops/epilogue/cshuffle_epilogue.hpp index 9625b137b..01105d2a8 100644 --- a/include/ck_tile/ops/epilogue/cshuffle_epilogue.hpp +++ b/include/ck_tile/ops/epilogue/cshuffle_epilogue.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -56,6 +56,13 @@ struct CShuffleEpilogue // No additional shared memory needed CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize() { return 0; } + CK_TILE_HOST_DEVICE static constexpr bool IsOutputTransposed() + { + // TODO: At now CShuffle doesn't allow to vector store after permute. + // It should be fixed and this function should return true. 
+ return false; + } + template CK_TILE_DEVICE void permute_tile_data(OAccTile& o_acc_tile) { @@ -111,7 +118,9 @@ struct CShuffleEpilogue } } - template + template CK_TILE_DEVICE auto operator()(ODramWindowTmp& o_dram_window_tmp, OAccTile& o_acc_tile) { const auto& current_window_origin = o_dram_window_tmp.get_window_origin(); @@ -158,12 +167,26 @@ struct CShuffleEpilogue // Store the tile data to the permuted location if constexpr(kPadM || kPadN) { - store_tile_raw(o_dram_window_tmp, cast_tile(o_acc_tile)); + if constexpr(out_memory_data_op == memory_operation_enum::set) + { + store_tile_raw(o_dram_window_tmp, cast_tile(o_acc_tile)); + } + else + { + update_tile_raw(o_dram_window_tmp, cast_tile(o_acc_tile)); + } buffer_store_fence(); } else { - store_tile(o_dram_window_tmp, cast_tile(o_acc_tile)); + if constexpr(out_memory_data_op == memory_operation_enum::set) + { + store_tile(o_dram_window_tmp, cast_tile(o_acc_tile)); + } + else + { + update_tile(o_dram_window_tmp, cast_tile(o_acc_tile)); + } } } }; diff --git a/include/ck_tile/ops/epilogue/default_2d_epilogue.hpp b/include/ck_tile/ops/epilogue/default_2d_epilogue.hpp index 7c5d5a6f3..177573de3 100644 --- a/include/ck_tile/ops/epilogue/default_2d_epilogue.hpp +++ b/include/ck_tile/ops/epilogue/default_2d_epilogue.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -35,21 +35,39 @@ struct Default2DEpilogue CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize() { return 0; } + CK_TILE_HOST_DEVICE static constexpr bool IsOutputTransposed() { return false; } + // TODO: this function assume store out vector size is the same as OAccTile last dimension size // how do we fix this ? 
- template + template CK_TILE_DEVICE auto operator()(ODramWindowTmp& o_dram_window_tmp, const OAccTile& o_acc_tile) { // TODO: this is ugly if constexpr(UseRawStore && (kPadM || kPadN)) { - store_tile_raw(o_dram_window_tmp, cast_tile(o_acc_tile)); + if constexpr(out_memory_data_op == memory_operation_enum::set) + { + store_tile_raw(o_dram_window_tmp, cast_tile(o_acc_tile)); + } + else + { + update_tile_raw(o_dram_window_tmp, cast_tile(o_acc_tile)); + } buffer_store_fence(); } else { - store_tile(o_dram_window_tmp, cast_tile(o_acc_tile)); + if constexpr(out_memory_data_op == memory_operation_enum::set) + { + store_tile(o_dram_window_tmp, cast_tile(o_acc_tile)); + } + else + { + update_tile(o_dram_window_tmp, cast_tile(o_acc_tile)); + } } } }; diff --git a/include/ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp b/include/ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp index 07a4cf8fb..eaf66237a 100644 --- a/include/ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp +++ b/include/ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp @@ -67,9 +67,10 @@ struct BatchedGemmKernel : public GemmKernel(kargs.a_ptr) + batch_offset_A; + const ADataType* a_ptr = static_cast(kargs.a_ptr) + batch_offset_A + + splitk_batch_offset.a_k_split_offset; const auto batch_stride_B = __builtin_amdgcn_readfirstlane(kargs.batch_stride_B); const auto batch_offset_B = __builtin_amdgcn_readfirstlane(i_batch * batch_stride_B); - const BDataType* b_ptr = static_cast(kargs.b_ptr) + batch_offset_B; + const BDataType* b_ptr = static_cast(kargs.b_ptr) + batch_offset_B + + splitk_batch_offset.b_k_split_offset; const auto batch_stride_C = __builtin_amdgcn_readfirstlane(kargs.batch_stride_C); const auto batch_offset_C = __builtin_amdgcn_readfirstlane(i_batch * batch_stride_C); CDataType* c_ptr = static_cast(kargs.c_ptr) + batch_offset_C; - this->RunGemm(a_ptr, b_ptr, c_ptr, kargs, i_m, i_n); + // allocate LDS + __shared__ char smem_ptr[GetSmemSize()]; + + if(kargs.KBatch == 1) + { + this->RunGemm(a_ptr, b_ptr, 
c_ptr, smem_ptr, kargs, splitk_batch_offset, i_m, i_n); + } + else + { + this->template RunGemm( + a_ptr, b_ptr, c_ptr, smem_ptr, kargs, splitk_batch_offset, i_m, i_n); + } } }; diff --git a/include/ck_tile/ops/gemm/kernel/gemm_kernel.hpp b/include/ck_tile/ops/gemm/kernel/gemm_kernel.hpp index 925648a88..c81a64f7a 100644 --- a/include/ck_tile/ops/gemm/kernel/gemm_kernel.hpp +++ b/include/ck_tile/ops/gemm/kernel/gemm_kernel.hpp @@ -93,6 +93,7 @@ struct GemmKernel index_t stride_A; index_t stride_B; index_t stride_C; + index_t KBatch; }; CK_TILE_HOST static constexpr GemmKernelArgs MakeKernelArgs(const GemmHostArgs& hostArgs) @@ -105,28 +106,72 @@ struct GemmKernel hostArgs.K, hostArgs.stride_A, hostArgs.stride_B, - hostArgs.stride_C}; + hostArgs.stride_C, + hostArgs.k_batch}; } - // CK_TILE_HOST static constexpr GemmKernelArgs MakeKernelArgs(const void* a_ptr, - // const void* b_ptr, - // void* c_ptr, - // index_t M, - // index_t N, - // index_t K, - // index_t stride_A, - // index_t stride_B, - // index_t stride_C) - // { - // return GemmKernelArgs{a_ptr, b_ptr, c_ptr, M, N, K, stride_A, stride_B, stride_C}; - // } CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize() { return max(GemmPipeline::GetSmemSize(), EpiloguePipeline::GetSmemSize()); } + struct SplitKBatchOffset + { + __device__ SplitKBatchOffset(const GemmKernelArgs& kargs, + const std::size_t k_id = blockIdx.z) + { + constexpr auto K1 = TilePartitioner::BlockGemmShape::WarpTile::at(number<2>{}); + const index_t K_t = kargs.KBatch * K1; + const index_t KRead = (kargs.K + K_t - 1) / K_t * K1; + + if constexpr(std::is_same_v) + { + a_k_split_offset = k_id * KRead; + } + else if constexpr(std::is_same_v) + { + a_k_split_offset = k_id * KRead * kargs.stride_A; + } + + if constexpr(std::is_same_v) + { + b_k_split_offset = k_id * KRead * kargs.stride_B; + } + else if constexpr(std::is_same_v) + { + b_k_split_offset = k_id * KRead; + } + + if(k_id < static_cast(kargs.KBatch - 1)) + { + splitted_k = KRead; + 
} + else + { + splitted_k = kargs.K - KRead * (kargs.KBatch - 1); + } + } + + index_t a_k_split_offset; + index_t b_k_split_offset; + index_t splitted_k; + }; + CK_TILE_HOST static bool IsSupportedArgument(const GemmKernelArgs& kargs) { + constexpr bool is_output_c_reg_transposed = + EpiloguePipeline::IsOutputTransposed() != GemmPipeline::IsTransposeC(); + if constexpr(!((GemmPipeline::VectorSizeC % 2 == 0 && + std::is_same_v && + is_output_c_reg_transposed) || + !(std::is_same_v || std::is_same_v))) + { + if(kargs.KBatch != 1) + { + return false; + } + } + if constexpr(std::is_same_v) { if(kargs.K % TilePartitioner::kK != 0 && GemmPipeline::kPadK == false) @@ -198,17 +243,19 @@ struct GemmKernel return true; } - CK_TILE_DEVICE auto MakeGemmTensorViews(const ADataType* a_ptr, - const BDataType* b_ptr, - CDataType* c_ptr, - const GemmKernelArgs& kargs) const + template + CK_TILE_DEVICE static auto MakeGemmTensorViews(const ADataType* a_ptr, + const BDataType* b_ptr, + CDataType* c_ptr, + const GemmKernelArgs& kargs, + const SplitKBatchOffset& splitk_batch_offset) { const auto& a_tensor_view = [&]() { if constexpr(std::is_same_v) { return make_naive_tensor_view( a_ptr, - make_tuple(kargs.M, kargs.K), + make_tuple(kargs.M, splitk_batch_offset.splitted_k), make_tuple(kargs.stride_A, 1), number{}, number<1>{}); @@ -217,7 +264,7 @@ struct GemmKernel { return make_naive_tensor_view( a_ptr, - make_tuple(kargs.M, kargs.K), + make_tuple(kargs.M, splitk_batch_offset.splitted_k), make_tuple(1, kargs.stride_A), number<1>{}, number<1>{}); @@ -229,7 +276,7 @@ struct GemmKernel { return make_naive_tensor_view( b_ptr, - make_tuple(kargs.N, kargs.K), + make_tuple(kargs.N, splitk_batch_offset.splitted_k), make_tuple(1, kargs.stride_B), number<1>{}, number<1>{}); @@ -238,7 +285,7 @@ struct GemmKernel { return make_naive_tensor_view( b_ptr, - make_tuple(kargs.N, kargs.K), + make_tuple(kargs.N, splitk_batch_offset.splitted_k), make_tuple(kargs.stride_B, 1), number{}, number<1>{}); @@ 
-248,7 +295,7 @@ struct GemmKernel const auto& c_tensor_view = [&]() { if constexpr(std::is_same_v) { - return make_naive_tensor_view( + return make_naive_tensor_view( c_ptr, make_tuple(kargs.M, kargs.N), make_tuple(kargs.stride_C, 1), @@ -257,7 +304,7 @@ struct GemmKernel } else { - return make_naive_tensor_view( + return make_naive_tensor_view( c_ptr, make_tuple(kargs.M, kargs.N), make_tuple(1, kargs.stride_C), @@ -270,7 +317,7 @@ struct GemmKernel } template - CK_TILE_DEVICE auto MakeGemmPadViews(const TensorView& views) const + CK_TILE_DEVICE static auto MakeGemmPadViews(const TensorView& views) { const auto& a_pad_view = [&]() { const auto& a_tensor_view = views.at(I0); @@ -330,8 +377,8 @@ struct GemmKernel } template - CK_TILE_DEVICE auto - MakeGemmTileWindows(const PadView& views, const index_t i_m, const index_t i_n) const + CK_TILE_DEVICE static auto + MakeGemmTileWindows(const PadView& views, const index_t i_m, const index_t i_n) { const auto& a_pad_view = views.at(I0); const auto& a_block_window = make_tile_window( @@ -363,23 +410,27 @@ struct GemmKernel * @param kargs GEMM kernel arguments * @param block_idx_m The GEMM's output M dimension tile index processed by this workgroup. * @param block_idx_n The GEMM's output N dimension tile index processed by this workgroup. + * + * @tparam DstInMemOp Destination memory operation (default: set). 
*/ - CK_TILE_DEVICE void RunGemm(const ADataType* a_ptr, - const BDataType* b_ptr, - CDataType* c_ptr, - const GemmKernelArgs& kargs, - const index_t block_idx_m, - const index_t block_idx_n) const + template + CK_TILE_DEVICE static void RunGemm(const ADataType* a_ptr, + const BDataType* b_ptr, + CDataType* c_ptr, + void* smem_ptr, + const GemmKernelArgs& kargs, + const SplitKBatchOffset& splitk_batch_offset, + const index_t block_idx_m, + const index_t block_idx_n) { // Create Gemm tensor views, pad views and tile windows - const auto& gemm_tensor_views_tuple = MakeGemmTensorViews(a_ptr, b_ptr, c_ptr, kargs); - const auto& gemm_pad_views = MakeGemmPadViews(gemm_tensor_views_tuple); - auto gemm_tile_windows = MakeGemmTileWindows(gemm_pad_views, block_idx_m, block_idx_n); - - // allocate LDS - __shared__ char smem_ptr[GetSmemSize()]; + const auto& gemm_tensor_views_tuple = + MakeGemmTensorViews(a_ptr, b_ptr, c_ptr, kargs, splitk_batch_offset); + ; + const auto& gemm_pad_views = MakeGemmPadViews(gemm_tensor_views_tuple); + auto gemm_tile_windows = MakeGemmTileWindows(gemm_pad_views, block_idx_m, block_idx_n); - const index_t num_loop = TilePartitioner::GetLoopNum(kargs.K); + const index_t num_loop = TilePartitioner::GetLoopNum(splitk_batch_offset.splitted_k); // Run GEMM cooperatively by whole workgroup. 
const auto& a_block_window = gemm_tile_windows.at(I0); @@ -389,18 +440,43 @@ struct GemmKernel // Run Epilogue Pipeline auto& c_block_window = gemm_tile_windows.at(I2); - EpiloguePipeline{}(c_block_window, c_block_tile); + + constexpr bool is_output_c_reg_transposed = + EpiloguePipeline::IsOutputTransposed() != GemmPipeline::IsTransposeC(); + if constexpr((DstInMemOp == memory_operation_enum::set) || (sizeof(CDataType) > 2) || + (GemmPipeline::VectorSizeC % 2 == 0 && + std::is_same_v && + is_output_c_reg_transposed)) + { + EpiloguePipeline{} + .template operator()( + c_block_window, c_block_tile); + } } CK_TILE_DEVICE void operator()(GemmKernelArgs kargs) const { const auto [i_m, i_n] = TilePartitioner{}(); + const SplitKBatchOffset splitk_batch_offset(kargs); // options - const ADataType* a_ptr = static_cast(kargs.a_ptr); - const BDataType* b_ptr = static_cast(kargs.b_ptr); - CDataType* c_ptr = static_cast(kargs.c_ptr); + const ADataType* a_ptr = + static_cast(kargs.a_ptr) + splitk_batch_offset.a_k_split_offset; + const BDataType* b_ptr = + static_cast(kargs.b_ptr) + splitk_batch_offset.b_k_split_offset; + CDataType* c_ptr = static_cast(kargs.c_ptr); + + // allocate LDS + __shared__ char smem_ptr[GetSmemSize()]; - RunGemm(a_ptr, b_ptr, c_ptr, kargs, i_m, i_n); + if(kargs.KBatch == 1) + { + RunGemm(a_ptr, b_ptr, c_ptr, smem_ptr, kargs, splitk_batch_offset, i_m, i_n); + } + else + { + RunGemm( + a_ptr, b_ptr, c_ptr, smem_ptr, kargs, splitk_batch_offset, i_m, i_n); + } } }; diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v3.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v3.hpp index a72728b4a..40628b186 100644 --- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v3.hpp +++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v3.hpp @@ -82,6 +82,8 @@ struct GemmPipelineAgBgCrCompV3 : public BaseGemmPipelineAgBgCrCompV3 return Policy::template GetSmemSize(); } + CK_TILE_HOST_DEVICE static 
constexpr auto IsTransposeC() { return Policy::IsTransposeC(); } + template struct PipelineImpl : public PipelineImplBase { diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp index e2e94cf92..c7a74c81e 100644 --- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp +++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp @@ -132,6 +132,8 @@ struct GemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem return Policy::template GetSmemSize(); } + CK_TILE_HOST_DEVICE static constexpr auto IsTransposeC() { return Policy::IsTransposeC(); } + template struct PipelineImpl : public PipelineImplBase { diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1.hpp index 822748c69..11a18e52c 100644 --- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1.hpp +++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1.hpp @@ -53,6 +53,8 @@ struct GemmPipelineAGmemBGmemCRegV1 return Policy::template GetSmemSize(); } + CK_TILE_HOST_DEVICE static constexpr auto IsTransposeC() { return Policy::IsTransposeC(); } + template @@ -114,8 +116,7 @@ struct GemmPipelineAGmemBGmemCRegV1DefaultPolicy { constexpr index_t smem_size_a = GetSmemSizeA(); constexpr index_t smem_size_b = GetSmemSizeB(); - index_t smem_size = 0; - smem_size += smem_size_a + smem_size_b; + constexpr index_t smem_size = smem_size_a + smem_size_b; return smem_size; } @@ -485,13 +486,14 @@ struct GemmPipelineAGmemBGmemCRegV1DefaultPolicy } } + CK_TILE_HOST_DEVICE static constexpr auto IsTransposeC() { return TransposeC; } + template CK_TILE_HOST_DEVICE static constexpr auto GetBlockGemm() { - constexpr bool TransposeC = false; - constexpr auto I0 = number<0>{}; - constexpr auto I1 = number<1>{}; - constexpr auto I2 = number<2>{}; + constexpr auto I0 = number<0>{}; + 
constexpr auto I1 = number<1>{}; + constexpr auto I2 = number<2>{}; using AccDataType = float; using BlockWarps = typename Problem::BlockGemmShape::BlockWarps; diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v2.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v2.hpp index 96a5a61c8..07d4dc441 100644 --- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v2.hpp +++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v2.hpp @@ -36,6 +36,8 @@ struct GemmPipelineAGmemBGmemCRegV2 Policy::template MakeBLdsBlockDescriptor().get_element_space_size(); } + CK_TILE_HOST_DEVICE static constexpr auto IsTransposeC() { return Policy::IsTransposeC(); } + template CK_TILE_HOST_DEVICE static constexpr auto GetBlockGemm() { diff --git a/test/ck_tile/batched_gemm/test_batched_gemm_util.hpp b/test/ck_tile/batched_gemm/test_batched_gemm_util.hpp index d3f307787..e7e9b3d67 100644 --- a/test/ck_tile/batched_gemm/test_batched_gemm_util.hpp +++ b/test/ck_tile/batched_gemm/test_batched_gemm_util.hpp @@ -93,7 +93,7 @@ class TestCkTileBatchedGemm : public ::testing::Test auto kargs = Kernel::MakeKernelArgs(args); - const dim3 grids = Kernel::GridSize(args.M, args.N, args.batch_count); + const dim3 grids = Kernel::GridSize(args.M, args.N, args.k_batch, args.batch_count); constexpr dim3 blocks = Kernel::BlockSize(); if(s.log_level_ > 0) @@ -186,6 +186,7 @@ class TestCkTileBatchedGemm : public ::testing::Test args.a_ptr = a_m_k_dev_buf.GetDeviceBuffer(); args.b_ptr = b_k_n_dev_buf.GetDeviceBuffer(); args.c_ptr = c_m_n_dev_buf.GetDeviceBuffer(); + args.k_batch = 1; args.M = M; args.N = N; args.K = K; diff --git a/test/ck_tile/gemm/test_gemm_pipeline_util.hpp b/test/ck_tile/gemm/test_gemm_pipeline_util.hpp index 53ead4d8d..4b0e40060 100644 --- a/test/ck_tile/gemm/test_gemm_pipeline_util.hpp +++ b/test/ck_tile/gemm/test_gemm_pipeline_util.hpp @@ -74,7 +74,9 @@ class TestCkTileGemmPipeline : public ::testing::Test 
ck_tile:: GemmPipelineProblem>>; - const ck_tile::index_t num_loop = TilePartitioner::GetLoopNum(args.K); + const ck_tile::index_t k_grain = args.k_batch * K_Tile; + const ck_tile::index_t K_split = (args.K + k_grain - 1) / k_grain * K_Tile; + const ck_tile::index_t num_loop = TilePartitioner::GetLoopNum(K_split); const bool has_hot_loop = BaseGemmPipeline::BlockHasHotloop(num_loop); const ck_tile::TailNumber tail_num = BaseGemmPipeline::GetBlockLoopTailNum(num_loop); -- GitLab From 4e076909b6c1e1404d9ff5dc0e71e3be1c06569e Mon Sep 17 00:00:00 2001 From: Qianfeng Date: Sun, 29 Dec 2024 14:29:56 +0800 Subject: [PATCH 143/153] Remove using partitioner for all fmha kernels (#1778) * Remove using tile partitioner for fmha_fwd_kernel * Remove using tile partitioner for fmha_fwd_splitkv and splitkv-combine kernels * Remove using tile partitioner for fmha_fwd_appendkv kernel * Unify the format of GetTileIndex --- example/ck_tile/01_fmha/README.md | 3 +- .../ck_tile/01_fmha/codegen/ops/fmha_fwd.py | 20 +--- .../01_fmha/codegen/ops/fmha_fwd_appendkv.py | 6 +- .../01_fmha/codegen/ops/fmha_fwd_splitkv.py | 10 +- example/ck_tile/01_fmha/fmha_fwd.hpp | 14 ++- include/ck_tile/ops/fmha.hpp | 3 - .../fmha/kernel/fmha_fwd_appendkv_kernel.hpp | 28 +++-- .../ops/fmha/kernel/fmha_fwd_kernel.hpp | 78 +++++++++++-- .../fmha_fwd_splitkv_combine_kernel.hpp | 39 +++++-- ...a_fwd_splitkv_combine_tile_partitioner.hpp | 48 -------- .../fmha/kernel/fmha_fwd_splitkv_kernel.hpp | 40 +++++-- .../fmha_fwd_splitkv_tile_partitioner.hpp | 54 --------- .../fmha/kernel/fmha_fwd_tile_partitioner.hpp | 105 ------------------ 13 files changed, 171 insertions(+), 277 deletions(-) delete mode 100644 include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_combine_tile_partitioner.hpp delete mode 100644 include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_tile_partitioner.hpp delete mode 100644 include/ck_tile/ops/fmha/kernel/fmha_fwd_tile_partitioner.hpp diff --git a/example/ck_tile/01_fmha/README.md 
b/example/ck_tile/01_fmha/README.md index c7ab296c3..e9806e7a6 100644 --- a/example/ck_tile/01_fmha/README.md +++ b/example/ck_tile/01_fmha/README.md @@ -15,8 +15,7 @@ This will result in an executable `build/bin/tile_example_fmha_fwd` ## kernel The kernel template is `fmha_fwd_kernel.hpp`, this is the grid-wise op in old ck_tile's terminology. We put it here purposely, to demonstrate one can construct a kernel by using various internal component from ck_tile. We may still have an implementation under ck_tile's include path (in the future) for the kernel template. -There are 3 template parameters for this kernel template. -* `TilePartitioner` is used to map the workgroup to corresponding tile, `fmha_fwd_tile_partitioner.hpp` in this folder served as this purpose. +There are 2 template parameters for this kernel template. * `FmhaPipeline` is one of the block_tile_pipeline(under `include/ck_tile/tile_program/block_tile_pipeline`) which is a performance critical component. Indeed, we did a lot of optimization and trials to optimize the pipeline and may still workout more performance pipeline and update into that folder. People only need to replace this pipeline type and would be able to enjoy the benefit of different performant implementations (stay tuned for updated pipeline(s)). * `EpiloguePipeline` will modify and store out the result in the last phase. People usually will do lot of post-fusion at this stage, so we also abstract this concept. Currently we didn't do much thing at the epilogue stage but leave the room for future possible support. 
diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py index 66814f5a1..1c9d743f3 100644 --- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py +++ b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py @@ -29,11 +29,6 @@ K0_MAX_SUBMAX_MAP = { 256: 256 } -TILE_PARTITIONER_MAP = { - "shb" : "ck_tile::FmhaFwdTilePartitioner_SHB", - "hbs" : "ck_tile::FmhaFwdTilePartitioner_HBS", -} - FMHA_FWD_KERNEL_HEADER = """// SPDX-License-Identifier: MIT // Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.\n // auto generated by generate.py @@ -90,9 +85,7 @@ using fmha_epilogue_{F_idx} = {F_spad}, {F_dvpad}>>; using fmha_kernel_{F_idx} = - ck_tile::FmhaFwdKernel<{F_tile_partitioner}, - fmha_pipeline_{F_idx}, - fmha_epilogue_{F_idx}>; + ck_tile::FmhaFwdKernel; using trait_{F_idx} = fmha_fwd_traits_<{F_hdim}, {F_dtype}, {F_mode},{F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0max}, {F_vlayout}, {F_pipeline_enum}, fmha_mask_{F_idx}, {F_bias}, {F_lse}, {F_dropout}, {F_squant}, {F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}>; @@ -329,12 +322,6 @@ class FmhaFwdKernel: F_pipeline : FmhaFwdPipeline mask_impl : str - def get_tp(self) -> str: - if self.F_mode == 'group': - return 'hbs' - else: - return 'shb' - @property def template(self) -> str: kernel_body = str() @@ -374,13 +361,12 @@ class FmhaFwdKernel: F_pipeline_enum = PIPELINE_ENUM_MAP[self.F_pipeline.tag], F_mask = get_mask_map(self.mask_impl)[self.F_pipeline.F_mask], F_mode = MODE_MAP[self.F_mode], - F_pipeline = PIPELINE_MAP[self.F_pipeline.tag], - F_tile_partitioner = TILE_PARTITIONER_MAP[self.get_tp()]) + F_pipeline = PIPELINE_MAP[self.F_pipeline.tag]) @property def name(self) -> str: # TODO: we don't encode idx here - return f"fmha_fwd_d{self.F_hdim}_{self.F_dtype}_{self.F_mode}_{self.get_tp()}_" + \ + return f"fmha_fwd_d{self.F_hdim}_{self.F_dtype}_{self.F_mode}_" + \ self.F_tile.name + '_' + self.F_pipeline.name @property diff --git 
a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_appendkv.py b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_appendkv.py index fb998a33d..2f2081930 100644 --- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_appendkv.py +++ b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_appendkv.py @@ -46,9 +46,7 @@ using fmha_pipeline_problem_{F_idx} = ck_tile::BlockFmhaFwdAppendKVPipelineProbl using fmha_pipeline_{F_idx} = ck_tile::BlockFmhaFwdAppendKVPipeline< fmha_pipeline_problem_{F_idx}>; -using fmha_kernel_{F_idx} = - ck_tile::FmhaFwdAppendKVKernel, - fmha_pipeline_{F_idx}>; +using fmha_kernel_{F_idx} = ck_tile::FmhaFwdAppendKVKernel; using trait_{F_idx} = fmha_fwd_appendkv_traits_<{F_hdim}, {F_dtype}, {F_bs}, {F_bsk}, {F_bd}, {F_bdv}, {F_vlayout}, {F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}, {F_rope}, {F_pagedkv}>; @@ -355,4 +353,4 @@ def list_blobs(file_path : Path, kernel_filter : Optional[str], receipt, mask_im _, kernels = get_fwd_appendkv_blobs(kernel_filter, receipt, mask_impl) for kernel in kernels: f.write(str(file_path.parent / GEN_DIR / kernel.filename) + "\n") - f.write(str(file_path.parent / GEN_DIR / FMHA_FWD_APPENDKV_API_FILENAME) + "\n") \ No newline at end of file + f.write(str(file_path.parent / GEN_DIR / FMHA_FWD_APPENDKV_API_FILENAME) + "\n") diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py index 2f7edd547..fb8a4389f 100644 --- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py +++ b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py @@ -96,9 +96,7 @@ using fmha_epilogue = {F_spad}, {F_dvpad}>>; using fmha_kernel = - ck_tile::FmhaFwdSplitKVKernel, - fmha_pipeline, - fmha_epilogue>; + ck_tile::FmhaFwdSplitKVKernel; static void run(const ck_tile::stream_config& s, fmha_fwd_splitkv_args a) {{ @@ -176,11 +174,7 @@ using fmha_epilogue = false, false>>; using fmha_kernel = - ck_tile::FmhaFwdSplitKVCombineKernel< - ck_tile::FmhaFwdSplitKVCombineTilePartitioner< - 
fmha_pipeline_problem::kM0, fmha_pipeline_problem::kN1>, - fmha_pipeline, - fmha_epilogue>; + ck_tile::FmhaFwdSplitKVCombineKernel; static void run(const ck_tile::stream_config& s, fmha_fwd_splitkv_args a) {{ diff --git a/example/ck_tile/01_fmha/fmha_fwd.hpp b/example/ck_tile/01_fmha/fmha_fwd.hpp index 0e821ed5d..0368de352 100644 --- a/example/ck_tile/01_fmha/fmha_fwd.hpp +++ b/example/ck_tile/01_fmha/fmha_fwd.hpp @@ -400,8 +400,18 @@ auto fmha_fwd_create_kargs_and_grids(fmha_fwd_args args) } }(); - dim3 grids = FmhaKernel::GridSize(args.batch, args.nhead_q, args.max_seqlen_q, args.hdim_v); - return ck_tile::make_tuple(kargs, grids); + if constexpr(FmhaKernel::kIsGroupMode) + { + dim3 grids = FmhaKernel::GridSize( + args.batch, args.nhead_q, args.max_seqlen_q, args.hdim_v, args.seqlen_k_ptr != nullptr); + return ck_tile::make_tuple(kargs, grids); + } + else + { + dim3 grids = + FmhaKernel::GridSize(args.batch, args.nhead_q, args.max_seqlen_q, args.hdim_v, false); + return ck_tile::make_tuple(kargs, grids); + } } template diff --git a/include/ck_tile/ops/fmha.hpp b/include/ck_tile/ops/fmha.hpp index 7a09e4622..d5920f483 100644 --- a/include/ck_tile/ops/fmha.hpp +++ b/include/ck_tile/ops/fmha.hpp @@ -14,10 +14,7 @@ #include "ck_tile/ops/fmha/kernel/fmha_fwd_appendkv_tile_partitioner.hpp" #include "ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp" #include "ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_combine_kernel.hpp" -#include "ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_combine_tile_partitioner.hpp" #include "ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp" -#include "ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_tile_partitioner.hpp" -#include "ck_tile/ops/fmha/kernel/fmha_fwd_tile_partitioner.hpp" #include "ck_tile/ops/fmha/pipeline/block_fmha_bwd_convert_dq.hpp" #include "ck_tile/ops/fmha/pipeline/block_fmha_bwd_dot_do_o.hpp" #include "ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_kr_ktr_vr.hpp" diff --git 
a/include/ck_tile/ops/fmha/kernel/fmha_fwd_appendkv_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_fwd_appendkv_kernel.hpp index d598f9743..9fec9a320 100644 --- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_appendkv_kernel.hpp +++ b/include/ck_tile/ops/fmha/kernel/fmha_fwd_appendkv_kernel.hpp @@ -10,10 +10,9 @@ namespace ck_tile { -template +template struct FmhaFwdAppendKVKernel { - using TilePartitioner = ck_tile::remove_cvref_t; using FmhaPipeline = ck_tile::remove_cvref_t; static constexpr ck_tile::index_t kBlockSize = FmhaPipeline::kBlockSize; static constexpr ck_tile::index_t kBlockPerCu = FmhaPipeline::kBlockPerCu; @@ -234,12 +233,25 @@ struct FmhaFwdAppendKVKernel return kargs; } - __host__ static constexpr auto GridSize(ck_tile::index_t batch_size, - ck_tile::index_t nhead, - ck_tile::index_t seqlen_q, - ck_tile::index_t seqlen_knew) + CK_TILE_HOST static constexpr auto GridSize(ck_tile::index_t batch_size, + ck_tile::index_t nhead, + ck_tile::index_t seqlen_q, + ck_tile::index_t seqlen_knew) { - return TilePartitioner::GridSize(batch_size, nhead, seqlen_q, seqlen_knew); + // TODO: this may need tuning + return dim3(std::max(ck_tile::integer_divide_ceil(seqlen_q, FmhaPipeline::kM0), + ck_tile::integer_divide_ceil(seqlen_knew, FmhaPipeline::kN0)), + nhead, + batch_size); + } + + CK_TILE_DEVICE static constexpr auto GetTileIndex(const Kargs& /* kargs */) + { + const index_t i_tile = blockIdx.x; + const index_t i_nhead = blockIdx.y; + const index_t i_batch = blockIdx.z; + + return ck_tile::make_tuple(i_tile, i_nhead, i_batch); } __host__ static constexpr auto BlockSize() { return dim3(kBlockSize); } @@ -247,7 +259,7 @@ struct FmhaFwdAppendKVKernel CK_TILE_DEVICE void operator()(Kargs kargs) const { // divide problem - const auto [i_tile, i_nhead, i_batch] = TilePartitioner{}(); + const auto [i_tile, i_nhead, i_batch] = GetTileIndex(kargs); const index_t i_m0 = __builtin_amdgcn_readfirstlane(i_tile * FmhaPipeline::kM0); const index_t i_n0 = 
__builtin_amdgcn_readfirstlane(i_tile * FmhaPipeline::kN0); diff --git a/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp index 90102a6c6..f107b10df 100644 --- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp +++ b/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp @@ -20,10 +20,9 @@ namespace ck_tile { -template +template struct FmhaFwdKernel { - using TilePartitioner = ck_tile::remove_cvref_t; using FmhaPipeline = ck_tile::remove_cvref_t; using EpiloguePipeline = ck_tile::remove_cvref_t; static constexpr ck_tile::index_t kBlockSize = FmhaPipeline::kBlockSize; @@ -84,7 +83,7 @@ struct FmhaFwdKernel return n.empty() ? n : std::string("p") + n; }(); return _SS_("fmha_fwd_d") + _TS_(bfs::kQKHeaddim) + "_" + _SS_(t2s::name) + - "_" + (kIsGroupMode ? "group" : "batch") + "_" + _SS_(TilePartitioner::name) + "_" + "_" + (kIsGroupMode ? "group" : "batch") + "_" "b" + _TS_(bfs::kM0) + "x" + _TS_(bfs::kN0) + "x" + _TS_(bfs::kK0) + "x" + _TS_(bfs::kN1) + "x" + _TS_(bfs::kK1) + "x" + _TS_(bfs::kQKHeaddim) + "_" + "r" + _TS_(g0br::at(ck_tile::number<0>{})) + "x" + _TS_(g0br::at(ck_tile::number<1>{})) + "x" + _TS_(g0br::at(ck_tile::number<2>{})) + "_" + @@ -867,9 +866,75 @@ struct FmhaFwdKernel CK_TILE_HOST static constexpr auto GridSize(ck_tile::index_t batch_size_, ck_tile::index_t nhead_, ck_tile::index_t seqlen_q_, - ck_tile::index_t hdim_v_) + ck_tile::index_t hdim_v_, + bool has_padded_seqlen_k = false) { - return TilePartitioner::GridSize(batch_size_, nhead_, seqlen_q_, hdim_v_); + // has_padded_seqlen_k is determined by checking (seqlen_k_ptr != nullptr) + if(has_padded_seqlen_k) + { + // TODO: this may need tuning + return dim3(nhead_, + batch_size_, + ck_tile::integer_divide_ceil(seqlen_q_, FmhaPipeline::kM0) * + ck_tile::integer_divide_ceil(hdim_v_, FmhaPipeline::kN1)); + } + else + { + // TODO: this may need tuning + return dim3(ck_tile::integer_divide_ceil(seqlen_q_, FmhaPipeline::kM0) * + 
ck_tile::integer_divide_ceil(hdim_v_, FmhaPipeline::kN1), + nhead_, + batch_size_); + } + } + + CK_TILE_DEVICE static constexpr auto GetTileIndex(const Kargs& kargs) + { + bool has_padded_seqlen_k = false; + + if constexpr(kIsGroupMode) + has_padded_seqlen_k = (kargs.seqlen_k_ptr != nullptr); + + if(has_padded_seqlen_k) + { + // const index_t num_tile_m0 = seqlen_q / kM0; + const index_t num_tile_n1 = + ck_tile::integer_divide_ceil(kargs.hdim_v, FmhaPipeline::kN1); + + const index_t i_block = blockIdx.z; + const index_t i_nhead = blockIdx.x; + const index_t i_batch = blockIdx.y; + + const auto f = [](index_t dividend, index_t divisor) { + index_t quotient = dividend / divisor; + index_t modulus = dividend - quotient * divisor; + return ck_tile::make_tuple(quotient, modulus); + }; + + const auto [i_tile_m, i_tile_n] = f(i_block, num_tile_n1); + + return ck_tile::make_tuple(i_tile_m, i_tile_n, i_nhead, i_batch); + } + else + { + // const index_t num_tile_m0 = seqlen_q / kM0; + const index_t num_tile_n1 = + ck_tile::integer_divide_ceil(kargs.hdim_v, FmhaPipeline::kN1); + + const index_t i_block = blockIdx.x; + const index_t i_nhead = blockIdx.y; + const index_t i_batch = blockIdx.z; + + const auto f = [](index_t dividend, index_t divisor) { + index_t quotient = dividend / divisor; + index_t modulus = dividend - quotient * divisor; + return ck_tile::make_tuple(quotient, modulus); + }; + + const auto [i_tile_m, i_tile_n] = f(i_block, num_tile_n1); + + return ck_tile::make_tuple(i_tile_m, i_tile_n, i_nhead, i_batch); + } } CK_TILE_HOST static constexpr auto BlockSize() { return dim3(kBlockSize); } @@ -885,8 +950,7 @@ struct FmhaFwdKernel __shared__ char smem_ptr[GetSmemSize()]; // divide problem - const auto [i_tile_m, i_tile_n, i_nhead, i_batch] = - TilePartitioner{}(kargs.seqlen_q, kargs.hdim_v); + const auto [i_tile_m, i_tile_n, i_nhead, i_batch] = GetTileIndex(kargs); const index_t i_m0 = __builtin_amdgcn_readfirstlane(i_tile_m * FmhaPipeline::kM0); const index_t 
i_n1 = __builtin_amdgcn_readfirstlane(i_tile_n * FmhaPipeline::kN1); diff --git a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_combine_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_combine_kernel.hpp index a0adfdc12..a342a91f1 100644 --- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_combine_kernel.hpp +++ b/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_combine_kernel.hpp @@ -5,10 +5,9 @@ namespace ck_tile { -template +template struct FmhaFwdSplitKVCombineKernel { - using TilePartitioner = remove_cvref_t; using FmhaPipeline = remove_cvref_t; using EpiloguePipeline = remove_cvref_t; @@ -235,12 +234,35 @@ struct FmhaFwdSplitKVCombineKernel return kargs; } - __host__ static constexpr auto GridSize(ck_tile::index_t batch_size, - ck_tile::index_t nhead, - ck_tile::index_t max_seqlen_q, - ck_tile::index_t hdim_v) + CK_TILE_HOST static constexpr auto GridSize(ck_tile::index_t batch_size, + ck_tile::index_t nhead, + ck_tile::index_t max_seqlen_q, + ck_tile::index_t hdim_v) { - return TilePartitioner::GridSize(batch_size, nhead, max_seqlen_q, hdim_v); + // TODO: this may need tuning + return dim3(ck_tile::integer_divide_ceil(max_seqlen_q, FmhaPipeline::kM0) * + ck_tile::integer_divide_ceil(hdim_v, FmhaPipeline::kN1), + nhead, + batch_size); + } + + CK_TILE_DEVICE static constexpr auto GetTileIndex(const Kargs& kargs) + { + const index_t num_tile_n1 = ck_tile::integer_divide_ceil(kargs.hdim_v, FmhaPipeline::kN1); + + const index_t i_block = blockIdx.x; + const index_t i_nhead = blockIdx.y; + const index_t i_batch = blockIdx.z; + + const auto f = [](index_t dividend, index_t divisor) { + index_t quotient = dividend / divisor; + index_t modulus = dividend - quotient * divisor; + return ck_tile::make_tuple(quotient, modulus); + }; + + const auto [i_tile_m, i_tile_n] = f(i_block, num_tile_n1); + + return ck_tile::make_tuple(i_tile_m, i_tile_n, i_nhead, i_batch); } __host__ static constexpr auto BlockSize() { return dim3(kBlockSize); } @@ -256,8 +278,7 
@@ struct FmhaFwdSplitKVCombineKernel __shared__ char smem_ptr[GetSmemSize()]; // divide problem - const auto [i_tile_m, i_tile_n, i_nhead, i_batch] = - TilePartitioner{}(kargs.seqlen_q, kargs.hdim_v); + const auto [i_tile_m, i_tile_n, i_nhead, i_batch] = GetTileIndex(kargs); const index_t i_m0 = __builtin_amdgcn_readfirstlane(i_tile_m * FmhaPipeline::kM0); const index_t i_n1 = __builtin_amdgcn_readfirstlane(i_tile_n * FmhaPipeline::kN1); diff --git a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_combine_tile_partitioner.hpp b/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_combine_tile_partitioner.hpp deleted file mode 100644 index 3b7390971..000000000 --- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_combine_tile_partitioner.hpp +++ /dev/null @@ -1,48 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -#pragma once - -#include "ck_tile/core.hpp" - -namespace ck_tile { - -template -struct FmhaFwdSplitKVCombineTilePartitioner -{ - static constexpr ck_tile::index_t kM0 = kM0_; - static constexpr ck_tile::index_t kN1 = kN1_; - - CK_TILE_HOST static constexpr auto GridSize(ck_tile::index_t batch_size, - ck_tile::index_t nhead, - ck_tile::index_t max_seqlen_q, - ck_tile::index_t hdim_v) - { - // TODO: this may need tuning - return dim3(ck_tile::integer_divide_ceil(max_seqlen_q, kM0) * - ck_tile::integer_divide_ceil(hdim_v, kN1), - nhead, - batch_size); - } - - CK_TILE_DEVICE auto operator()(ck_tile::index_t /*seqlen_q*/, ck_tile::index_t hdim_v) - { - const index_t num_tile_n1 = ck_tile::integer_divide_ceil(hdim_v, kN1); - - const index_t i_block = blockIdx.x; - const index_t i_nhead = blockIdx.y; - const index_t i_batch = blockIdx.z; - - const auto f = [](index_t dividend, index_t divisor) { - index_t quotient = dividend / divisor; - index_t modulus = dividend - quotient * divisor; - return ck_tile::make_tuple(quotient, modulus); - }; - - const auto [i_tile_m, i_tile_n] = f(i_block, 
num_tile_n1); - - return ck_tile::make_tuple(i_tile_m, i_tile_n, i_nhead, i_batch); - } -}; - -} // namespace ck_tile diff --git a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp index dc1748726..10ab25119 100644 --- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp +++ b/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp @@ -17,10 +17,9 @@ namespace ck_tile { -template +template struct FmhaFwdSplitKVKernel { - using TilePartitioner = ck_tile::remove_cvref_t; using FmhaPipeline = ck_tile::remove_cvref_t; using EpiloguePipeline = ck_tile::remove_cvref_t; static constexpr ck_tile::index_t kBlockSize = FmhaPipeline::kBlockSize; @@ -476,13 +475,35 @@ struct FmhaFwdSplitKVKernel return kargs; } - __host__ static constexpr auto GridSize(ck_tile::index_t batch_size, - ck_tile::index_t nhead, - ck_tile::index_t max_seqlen_q, - ck_tile::index_t hdim_v, - ck_tile::index_t num_splits) + CK_TILE_HOST static constexpr auto GridSize(ck_tile::index_t batch_size, + ck_tile::index_t nhead, + ck_tile::index_t max_seqlen_q, + ck_tile::index_t hdim_v, + ck_tile::index_t num_splits) { - return TilePartitioner::GridSize(batch_size, nhead, max_seqlen_q, hdim_v, num_splits); + // TODO: this may need tuning + return dim3(ck_tile::integer_divide_ceil(max_seqlen_q, FmhaPipeline::kM0) * + ck_tile::integer_divide_ceil(hdim_v, FmhaPipeline::kN1) * num_splits, + nhead, + batch_size); + } + + CK_TILE_DEVICE static constexpr auto GetTileIndex(const Kargs& kargs) + { + const index_t num_tile_n1 = ck_tile::integer_divide_ceil(kargs.hdim_v, FmhaPipeline::kN1); + + const auto f = [](index_t dividend, index_t divisor) { + index_t quotient = dividend / divisor; + index_t modulus = dividend - quotient * divisor; + return ck_tile::make_tuple(quotient, modulus); + }; + + const auto [mn, i_split] = f(blockIdx.x, kargs.num_splits); + const auto [i_tile_m, i_tile_n] = f(mn, num_tile_n1); + const index_t i_nhead = 
blockIdx.y; + const index_t i_batch = blockIdx.z; + + return ck_tile::make_tuple(i_tile_m, i_tile_n, i_split, i_nhead, i_batch); } __host__ static constexpr auto BlockSize() { return dim3(kBlockSize); } @@ -498,8 +519,7 @@ struct FmhaFwdSplitKVKernel __shared__ char smem_ptr[GetSmemSize()]; // divide problem - const auto [i_tile_m, i_tile_n, i_split, i_nhead, i_batch] = - TilePartitioner{}(kargs.seqlen_q, kargs.hdim_v, kargs.num_splits); + const auto [i_tile_m, i_tile_n, i_split, i_nhead, i_batch] = GetTileIndex(kargs); const index_t i_m0 = __builtin_amdgcn_readfirstlane(i_tile_m * FmhaPipeline::kM0); const index_t i_n1 = __builtin_amdgcn_readfirstlane(i_tile_n * FmhaPipeline::kN1); diff --git a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_tile_partitioner.hpp b/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_tile_partitioner.hpp deleted file mode 100644 index 5a52fa0f6..000000000 --- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_tile_partitioner.hpp +++ /dev/null @@ -1,54 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -#pragma once - -#include "ck_tile/core.hpp" - -namespace ck_tile { - -template -struct FmhaFwdSplitKVTilePartitioner -{ - using BlockFmhaShape = ck_tile::remove_cvref_t; - - static constexpr ck_tile::index_t kM0 = BlockFmhaShape::kM0; - static constexpr ck_tile::index_t kN0 = BlockFmhaShape::kN0; - static constexpr ck_tile::index_t kK0 = BlockFmhaShape::kK0; - static constexpr ck_tile::index_t kN1 = BlockFmhaShape::kN1; - static constexpr ck_tile::index_t kK1 = BlockFmhaShape::kK1; - - CK_TILE_HOST static constexpr auto GridSize(ck_tile::index_t batch_size, - ck_tile::index_t nhead, - ck_tile::index_t max_seqlen_q, - ck_tile::index_t hdim_v, - ck_tile::index_t num_splits) - { - // TODO: this may need tuning - return dim3(ck_tile::integer_divide_ceil(max_seqlen_q, kM0) * - ck_tile::integer_divide_ceil(hdim_v, kN1) * num_splits, - nhead, - batch_size); - } - - CK_TILE_DEVICE auto - operator()(ck_tile::index_t /*seqlen_q*/, ck_tile::index_t hdim_v, ck_tile::index_t num_splits) - { - const index_t num_tile_n1 = ck_tile::integer_divide_ceil(hdim_v, kN1); - - const auto f = [](index_t dividend, index_t divisor) { - index_t quotient = dividend / divisor; - index_t modulus = dividend - quotient * divisor; - return ck_tile::make_tuple(quotient, modulus); - }; - - const auto [mn, i_split] = f(blockIdx.x, num_splits); - const auto [i_tile_m, i_tile_n] = f(mn, num_tile_n1); - const index_t i_nhead = blockIdx.y; - const index_t i_batch = blockIdx.z; - - return ck_tile::make_tuple(i_tile_m, i_tile_n, i_split, i_nhead, i_batch); - } -}; - -} // namespace ck_tile diff --git a/include/ck_tile/ops/fmha/kernel/fmha_fwd_tile_partitioner.hpp b/include/ck_tile/ops/fmha/kernel/fmha_fwd_tile_partitioner.hpp deleted file mode 100644 index 2dca84b78..000000000 --- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_tile_partitioner.hpp +++ /dev/null @@ -1,105 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -#pragma once - -#include "ck_tile/core.hpp" - -namespace ck_tile { - -template -struct FmhaFwdTilePartitioner -{ - using BlockFmhaShape = ck_tile::remove_cvref_t; - - static constexpr ck_tile::index_t kM0 = BlockFmhaShape::kM0; - static constexpr ck_tile::index_t kN0 = BlockFmhaShape::kN0; - static constexpr ck_tile::index_t kK0 = BlockFmhaShape::kK0; - static constexpr ck_tile::index_t kN1 = BlockFmhaShape::kN1; - static constexpr ck_tile::index_t kK1 = BlockFmhaShape::kK1; - - static constexpr const char* name = "shb"; - - CK_TILE_HOST static constexpr auto GridSize(ck_tile::index_t batch_size_, - ck_tile::index_t nhead_, - ck_tile::index_t seqlen_q_, - ck_tile::index_t hdim_v_) - { - // TODO: this may need tuning - return dim3(ck_tile::integer_divide_ceil(seqlen_q_, kM0) * - ck_tile::integer_divide_ceil(hdim_v_, kN1), - nhead_, - batch_size_); - } - - CK_TILE_DEVICE auto operator()(ck_tile::index_t /*seqlen_q*/, ck_tile::index_t hdim_v) - { - // const index_t num_tile_m0 = seqlen_q / kM0; - const index_t num_tile_n1 = ck_tile::integer_divide_ceil(hdim_v, kN1); - - const index_t i_block = blockIdx.x; - const index_t i_nhead = blockIdx.y; - const index_t i_batch = blockIdx.z; - - const auto f = [](index_t dividend, index_t divisor) { - index_t quotient = dividend / divisor; - index_t modulus = dividend - quotient * divisor; - return ck_tile::make_tuple(quotient, modulus); - }; - - const auto [i_tile_m, i_tile_n] = f(i_block, num_tile_n1); - - return ck_tile::make_tuple(i_tile_m, i_tile_n, i_nhead, i_batch); - } -}; - -template -using FmhaFwdTilePartitioner_SHB = FmhaFwdTilePartitioner; - -template -struct FmhaFwdTilePartitioner_HBS -{ - using BlockFmhaShape = ck_tile::remove_cvref_t; - - static constexpr ck_tile::index_t kM0 = BlockFmhaShape::kM0; - static constexpr ck_tile::index_t kN0 = BlockFmhaShape::kN0; - static constexpr ck_tile::index_t kK0 = BlockFmhaShape::kK0; - static constexpr ck_tile::index_t kN1 = BlockFmhaShape::kN1; - static constexpr 
ck_tile::index_t kK1 = BlockFmhaShape::kK1; - - static constexpr const char* name = "hbs"; - - CK_TILE_HOST static constexpr auto GridSize(ck_tile::index_t batch_size_, - ck_tile::index_t nhead_, - ck_tile::index_t seqlen_q_, - ck_tile::index_t hdim_v_) - { - // TODO: this may need tuning - return dim3(nhead_, - batch_size_, - ck_tile::integer_divide_ceil(seqlen_q_, kM0) * - ck_tile::integer_divide_ceil(hdim_v_, kN1)); - } - - CK_TILE_DEVICE auto operator()(ck_tile::index_t /*seqlen_q*/, ck_tile::index_t hdim_v) - { - // const index_t num_tile_m0 = seqlen_q / kM0; - const index_t num_tile_n1 = ck_tile::integer_divide_ceil(hdim_v, kN1); - - const index_t i_block = blockIdx.z; - const index_t i_nhead = blockIdx.x; - const index_t i_batch = blockIdx.y; - - const auto f = [](index_t dividend, index_t divisor) { - index_t quotient = dividend / divisor; - index_t modulus = dividend - quotient * divisor; - return ck_tile::make_tuple(quotient, modulus); - }; - - const auto [i_tile_m, i_tile_n] = f(i_block, num_tile_n1); - - return ck_tile::make_tuple(i_tile_m, i_tile_n, i_nhead, i_batch); - } -}; - -} // namespace ck_tile -- GitLab From 159fa31946191747eed397abfa23a1910a85de67 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bart=C5=82omiej=20Kocot?= Date: Wed, 1 Jan 2025 18:00:06 +0100 Subject: [PATCH 144/153] Add NGCHW bf16 grouped conv fwd instances (#1783) * Add NGCHW bf16 grouped conv fwd instances * add missed cmake --- .../gpu/grouped_convolution_forward.hpp | 18 ++++++- .../grouped_convolution_forward_comp_xdl.inc | 16 +++++++ ...uped_convolution_forward_mem_inter_xdl.inc | 16 +++++++ ...uped_convolution_forward_mem_intra_xdl.inc | 16 +++++++ .../gpu/grouped_convolution_forward_xdl.inc | 16 +++++++ ..._convolution_forward_xdl_merged_groups.inc | 14 ++++++ .../gpu/grouped_conv2d_fwd/CMakeLists.txt | 5 ++ ...l_ngchw_gkyxc_ngkhw_bf16_comp_instance.cpp | 39 +++++++++++++++ ...wd_xdl_ngchw_gkyxc_ngkhw_bf16_instance.cpp | 38 +++++++++++++++ 
...hw_gkyxc_ngkhw_bf16_mem_inter_instance.cpp | 39 +++++++++++++++ ...hw_gkyxc_ngkhw_bf16_mem_intra_instance.cpp | 39 +++++++++++++++ ...groups_ngchw_gkyxc_ngkhw_bf16_instance.cpp | 48 +++++++++++++++++++ .../test_grouped_convnd_fwd.cpp | 1 + 13 files changed, 304 insertions(+), 1 deletion(-) create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_comp_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_mem_inter_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_mem_intra_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkyxc_ngkhw_bf16_instance.cpp diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp index 8090b2449..01415c2dd 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp @@ -304,7 +304,23 @@ struct DeviceOperationInstanceFactory && + is_same_v && + is_same_v && + is_same_v && + is_same_v) + { + add_device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkyxc_ngkhw_bf16_instances( + op_ptrs); + add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_instances(op_ptrs); + add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_comp_instances(op_ptrs); + add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_mem_intra_instances( + 
op_ptrs); + add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_mem_inter_instances( + op_ptrs); + } +#endif #ifdef CK_ENABLE_INT8 if constexpr(is_same_v && is_same_v && is_same_v && is_same_v && diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_comp_xdl.inc b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_comp_xdl.inc index e47a876e1..9a83e36b9 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_comp_xdl.inc +++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_comp_xdl.inc @@ -90,6 +90,22 @@ void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f16_comp_instances( PassThrough>>>& instances); #endif +#ifdef CK_ENABLE_BF16 +void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_comp_instances( + std::vector>>& instances); +#endif + #ifdef CK_ENABLE_FP32 void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f32_comp_instances( std::vector>>& instances); #endif +#ifdef CK_ENABLE_BF16 +void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_mem_inter_instances( + std::vector>>& instances); +#endif + #ifdef CK_ENABLE_FP32 void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f32_mem_inter_instances( std::vector>>& instances); #endif +#ifdef CK_ENABLE_BF16 +void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_mem_intra_instances( + std::vector>>& instances); +#endif + #ifdef CK_ENABLE_FP32 void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f32_mem_intra_instances( std::vector>>& instances); #endif +#ifdef CK_ENABLE_BF16 +void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_instances( + std::vector>>& instances); +#endif + #ifdef CK_ENABLE_FP32 void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f32_instances( std::vector>>& instances); + +void add_device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkyxc_ngkhw_bf16_instances( + std::vector>>& 
instances); #endif #ifdef CK_ENABLE_FP16 diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/CMakeLists.txt index 98bee66a9..146916cfd 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/CMakeLists.txt @@ -11,6 +11,7 @@ add_instance_library(device_grouped_conv2d_fwd_instance xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_instance.cpp # NGCHW, GKYXC, NGKHW + xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_instance.cpp xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f16_instance.cpp xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f32_instance.cpp xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_instance.cpp @@ -27,6 +28,7 @@ add_instance_library(device_grouped_conv2d_fwd_instance xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f32_instance.cpp xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_int8_instance.cpp # NGCHW, GKYXC, NGKHW + xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkyxc_ngkhw_bf16_instance.cpp xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkyxc_ngkhw_f16_instance.cpp xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkyxc_ngkhw_f32_instance.cpp xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkyxc_ngkhw_int8_instance.cpp @@ -42,10 +44,12 @@ add_instance_library(device_grouped_conv2d_fwd_instance xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_mem_inter_instance.cpp xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_inter_instance.cpp # NGCHW, GKYXC, NGKHW + xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_mem_intra_instance.cpp 
xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f16_mem_intra_instance.cpp xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f32_mem_intra_instance.cpp xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_mem_intra_instance.cpp # NGCHW, GKYXC, NGKHW + xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_mem_inter_instance.cpp xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f16_mem_inter_instance.cpp xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f32_mem_inter_instance.cpp xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_mem_inter_instance.cpp @@ -56,6 +60,7 @@ add_instance_library(device_grouped_conv2d_fwd_instance xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_comp_instance.cpp xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_comp_instance.cpp # NGCHW, GKYXC, NGKHW + xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_comp_instance.cpp xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f16_comp_instance.cpp xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f32_comp_instance.cpp xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_comp_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_comp_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_comp_instance.cpp new file mode 100644 index 000000000..65e233ce0 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_comp_instance.cpp @@ -0,0 +1,39 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_comp_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_bf16_comp_instances<2, + NGCHW, + GKYXC, + Empty_Tuple, + NGKHW, + ConvFwdDefault>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_instance.cpp new file mode 100644 index 000000000..6ee6aa1e4 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_instance.cpp @@ -0,0 +1,38 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_instances( + std::vector>>& instances) +{ + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_bf16_instances<2, + NGCHW, + GKYXC, + Empty_Tuple, + NGKHW, + ConvFwdDefault>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_mem_inter_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_mem_inter_instance.cpp new file mode 100644 index 000000000..88b5f30da --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_mem_inter_instance.cpp @@ -0,0 +1,39 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_mem_inter_instances( + std::vector>>& instances) +{ + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_bf16_mem_instances<2, + NGCHW, + GKYXC, + Empty_Tuple, + NGKHW, + ConvFwdDefault, + Interwave>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_mem_intra_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_mem_intra_instance.cpp new file mode 100644 index 000000000..48cca9c3f --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_mem_intra_instance.cpp @@ -0,0 +1,39 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_mem_intra_instances( + std::vector>>& instances) +{ + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_bf16_mem_instances<2, + NGCHW, + GKYXC, + Empty_Tuple, + NGKHW, + ConvFwdDefault, + Intrawave>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkyxc_ngkhw_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkyxc_ngkhw_bf16_instance.cpp new file mode 100644 index 000000000..14f00d8e8 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkyxc_ngkhw_bf16_instance.cpp @@ -0,0 +1,48 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +void add_device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkyxc_ngkhw_bf16_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_merged_groups_bf16_instances<2, + NGCHW, + GKYXC, + Empty_Tuple, + NGKHW, + ConvFwdDefault>{}); + + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_merged_groups_bf16_instances<2, + NGCHW, + GKYXC, + Empty_Tuple, + NGKHW, + ConvFwd3x3>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/test/grouped_convnd_fwd/test_grouped_convnd_fwd.cpp b/test/grouped_convnd_fwd/test_grouped_convnd_fwd.cpp index 1abd4fd9f..25481e0d7 100644 --- a/test/grouped_convnd_fwd/test_grouped_convnd_fwd.cpp +++ b/test/grouped_convnd_fwd/test_grouped_convnd_fwd.cpp @@ -64,6 +64,7 @@ using KernelTypes2d = ::testing::Types, std::tuple, std::tuple, std::tuple, + std::tuple, std::tuple>; using KernelTypes3d = ::testing::Types, -- GitLab From 1d8e4ec2ced2da813947e89654f69f7bf6b5079e Mon Sep 17 00:00:00 2001 From: Adam Osewski <19374865+aosewski@users.noreply.github.com> Date: Thu, 2 Jan 2025 04:48:06 +0100 Subject: [PATCH 145/153] Jing's contribution: prototype of mixed precision gemm FP16/BF16xint4 GEMM (#1762) * add a prototype of int4 * clean * debug * clean * clean * move packed into dynamic_buffer * fixed coord reset * add fast pki4 to half conversion * fix * fixed reference and host_tensor * fixed tensor init * format * debug i4_to_f16_convert * format * fixed splitk * weight permute * add b tile permute * clean * 
weight permute with splitki * format * improve weight layout * add and_or_b32 * fixed splitk crush * add permute switch as a template * recover v3r1 * clean * failure with intrawave v2 * fixed * fixed * add ckProfiler * add bfp16 support * add bf16 example * fixed int4 to bhalf_t conversion * format * fixed int4 to bf16 conversion * clean * add instances for mem * clean * fixed host tensor size * fixed * debug * fixed * add pk_i4_t as a struct * fix * Update example/01_gemm/gemm_xdl_bf16_pk_i4_v3.cpp Co-authored-by: Adam Osewski <19374865+aosewski@users.noreply.github.com> * Update example/01_gemm/gemm_xdl_bf16_pk_i4_v3.cpp Co-authored-by: Adam Osewski <19374865+aosewski@users.noreply.github.com> * Update example/01_gemm/gemm_xdl_bf16_pk_i4_v3.cpp Co-authored-by: Adam Osewski <19374865+aosewski@users.noreply.github.com> * revert * Update example/01_gemm/gemm_xdl_bf16_pk_i4_v3.cpp Co-authored-by: Adam Osewski <19374865+aosewski@users.noreply.github.com> * Update example/01_gemm/gemm_xdl_fp16_pk_i4_v3.cpp Co-authored-by: Adam Osewski <19374865+aosewski@users.noreply.github.com> * Update example/01_gemm/gemm_xdl_fp16_pk_i4_v3.cpp Co-authored-by: Adam Osewski <19374865+aosewski@users.noreply.github.com> * Update example/01_gemm/gemm_xdl_fp16_pk_i4_v3.cpp Co-authored-by: Adam Osewski <19374865+aosewski@users.noreply.github.com> * Update example/01_gemm/gemm_xdl_fp16_pk_i4_v3.cpp Co-authored-by: Adam Osewski <19374865+aosewski@users.noreply.github.com> * fixed comments * revert * clean * revert * revert * fixed * Update CMakeLists.txt * Update script/cmake-ck-dev.sh Co-authored-by: Adam Osewski <19374865+aosewski@users.noreply.github.com> * Update include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp Co-authored-by: Adam Osewski <19374865+aosewski@users.noreply.github.com> * Update CMakeLists.txt Co-authored-by: Adam Osewski <19374865+aosewski@users.noreply.github.com> * fixed * fixed * fixed * revert * revert * add comments * format * fixed assert * 
fixed * Fix I4 define in ckProfiler * Fixed example_gemm_xdl_bf16_pk_i4_v3 test failed issue --------- Co-authored-by: Jing Zhang Co-authored-by: zjing14 Co-authored-by: mtgu0705 --- CMakeLists.txt | 2 +- cmake/EnableCompilerWarnings.cmake | 2 +- example/01_gemm/CMakeLists.txt | 2 + example/01_gemm/common.hpp | 82 +++++ example/01_gemm/gemm_xdl_bf16_pk_i4_v3.cpp | 253 +++++++++++++++ example/01_gemm/gemm_xdl_fp16_fp8_v3.cpp | 16 +- example/01_gemm/gemm_xdl_fp16_pk_i4_v3.cpp | 303 ++++++++++++++++++ example/01_gemm/gemm_xdl_fp16_v3.cpp | 20 +- example/01_gemm/run_gemm_example.inc | 82 ----- .../01_gemm/run_gemm_example_streamk_v2.inc | 82 ----- example/01_gemm/run_gemm_example_v2.inc | 82 ----- include/ck/library/utility/host_tensor.hpp | 65 +++- .../library/utility/host_tensor_generator.hpp | 30 ++ include/ck/tensor/static_tensor.hpp | 4 +- .../gpu/device/device_gemm_v2.hpp | 4 + .../impl/device_gemm_xdl_cshuffle_v3.hpp | 13 +- .../element/unary_element_wise_operation.hpp | 189 +++++++++++ .../grid/gridwise_gemm_xdl_cshuffle_v3.hpp | 104 ++++-- .../threadwise_tensor_slice_transfer.hpp | 50 ++- .../threadwise_tensor_slice_transfer_v3r1.hpp | 74 +++-- include/ck/utility/amd_buffer_addressing.hpp | 3 +- include/ck/utility/amd_inline_asm.hpp | 21 ++ include/ck/utility/data_type.hpp | 35 ++ include/ck/utility/dynamic_buffer.hpp | 6 +- include/ck/utility/static_buffer.hpp | 6 +- .../cpu/reference_gemm.hpp | 22 ++ .../device_operation_instance_factory.hpp | 1 + .../gpu/gemm_universal.hpp | 33 ++ .../gpu/gemm_universal/CMakeLists.txt | 3 + ...mm_xdl_universal_bf16_i4_bf16_mk_nk_mn.hpp | 87 +++++ ..._bf16_mk_nk_mn_mem_v2_default_instance.cpp | 24 ++ ...gemm_xdl_universal_f16_i4_f16_mk_nk_mn.hpp | 86 +++++ ...4_f16_mk_nk_mn_mem_v2_default_instance.cpp | 24 ++ .../profiler/profile_gemm_universal_impl.hpp | 103 +++++- profiler/src/CMakeLists.txt | 1 - profiler/src/profile_gemm_universal.cpp | 17 +- script/cmake-ck-dev.sh | 2 +- 37 files changed, 1583 insertions(+), 350 
deletions(-) create mode 100644 example/01_gemm/gemm_xdl_bf16_pk_i4_v3.cpp create mode 100644 example/01_gemm/gemm_xdl_fp16_pk_i4_v3.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_i4_bf16/device_gemm_xdl_universal_bf16_i4_bf16_mk_nk_mn.hpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_i4_bf16/device_gemm_xdl_universal_bf16_i4_bf16_mk_nk_mn_mem_v2_default_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_i4_f16/device_gemm_xdl_universal_f16_i4_f16_mk_nk_mn.hpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_i4_f16/device_gemm_xdl_universal_f16_i4_f16_mk_nk_mn_mem_v2_default_instance.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index be4efd3df..6d4176735 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -585,7 +585,7 @@ if(NOT GPU_ARCHS AND USER_GPU_TARGETS) ) add_subdirectory(example) if(BUILD_TESTING) - add_subdirectory(test) + add_subdirectory(test) endif() endif() diff --git a/cmake/EnableCompilerWarnings.cmake b/cmake/EnableCompilerWarnings.cmake index 93fd306e9..fb2b38d68 100644 --- a/cmake/EnableCompilerWarnings.cmake +++ b/cmake/EnableCompilerWarnings.cmake @@ -66,7 +66,7 @@ else() -Wunreachable-code -Wunused -Wno-reserved-identifier - -Werror + -Werror -Wno-option-ignored -Wsign-compare -Wno-extra-semi-stmt diff --git a/example/01_gemm/CMakeLists.txt b/example/01_gemm/CMakeLists.txt index 957acce16..df7be0466 100644 --- a/example/01_gemm/CMakeLists.txt +++ b/example/01_gemm/CMakeLists.txt @@ -29,6 +29,8 @@ add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp16_v3) add_example_executable(example_gemm_xdl_fp8_v3 gemm_xdl_fp8_v3.cpp) add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp8_v3) add_example_executable(example_gemm_xdl_fp16_fp8_v3 gemm_xdl_fp16_fp8_v3.cpp) 
+add_example_executable(example_gemm_xdl_fp16_pk_i4_v3 gemm_xdl_fp16_pk_i4_v3.cpp) +add_example_executable(example_gemm_xdl_bf16_pk_i4_v3 gemm_xdl_bf16_pk_i4_v3.cpp) add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp16_fp8_v3) add_example_executable(example_gemm_xdl_bf16_v3 gemm_xdl_bf16_v3.cpp) add_example_dependencies(example_gemm_xdl example_gemm_xdl_bf16_v3) diff --git a/example/01_gemm/common.hpp b/example/01_gemm/common.hpp index a3a62d4cf..9664c50b6 100644 --- a/example/01_gemm/common.hpp +++ b/example/01_gemm/common.hpp @@ -287,3 +287,85 @@ bool parse_cmd_args(int argc, return true; } + +template +inline __host__ __device__ constexpr double get_rtol() +{ + if constexpr(std::is_same_v) + { + return 1e-3; + } + else if constexpr(std::is_same_v) + { + return 1e-6; + } + else if constexpr(std::is_same_v) + { + return 1e-3; + } + else if constexpr(std::is_same_v) + { + return 5e-2; + } + else if constexpr(std::is_same_v) + { + return 1e-1; + } + else if constexpr(std::is_same_v) + { + return 1e-1; + } + else if constexpr(std::is_same_v) + { + return 1e-1; // 240 and 224 are acceptable + } + else if constexpr(std::is_same_v) + { + return 1.5e-1; // 57344 and 49152 are acceptable + } + else + { + return 1e-3; + } +} + +template +inline __host__ __device__ constexpr double get_atol() +{ + if constexpr(std::is_same_v) + { + return 1e-3; + } + else if constexpr(std::is_same_v) + { + return 1e-6; + } + else if constexpr(std::is_same_v) + { + return 1e-3; + } + else if constexpr(std::is_same_v) + { + return 5e-2; + } + else if constexpr(std::is_same_v) + { + return 1e-1; + } + else if constexpr(std::is_same_v) + { + return 1e-1; + } + else if constexpr(std::is_same_v) + { + return 16.1; // 240 and 224 are acceptable + } + else if constexpr(std::is_same_v) + { + return 8192.1; // 57344 and 49152 are acceptable + } + else + { + return 1e-3; + } +} diff --git a/example/01_gemm/gemm_xdl_bf16_pk_i4_v3.cpp b/example/01_gemm/gemm_xdl_bf16_pk_i4_v3.cpp new file mode 
100644 index 000000000..7b491173a --- /dev/null +++ b/example/01_gemm/gemm_xdl_bf16_pk_i4_v3.cpp @@ -0,0 +1,253 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "common.hpp" + +#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp" + +using ADataType = ck::bhalf_t; +using BDataType = ck::pk_i4_t; +using AccDataType = float; +using CShuffleDataType = ck::bhalf_t; +using CDataType = ck::bhalf_t; + +using ALayout = Row; +using BLayout = Col; +using CLayout = Row; + +using AElementOp = PassThrough; +using BElementOp = PassThrough; +using CElementOp = PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; +static constexpr bool PermuteA = false; +static constexpr bool PermuteB = true; +static constexpr ck::index_t KPerBlock = 128; + +// clang-format off +using DeviceGemmV2Instance = + ck::tensor_operation::device::DeviceGemm_Xdl_CShuffleV3< + ALayout, BLayout, CLayout, + ADataType, BDataType, CDataType, AccDataType, CShuffleDataType, + AElementOp, BElementOp, CElementOp, GemmDefault, + 128, + 16, 64, + KPerBlock, 8, 32, + 16, 16, + 1, 2, + S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, + 2, 8, 8, 0, + S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, + 2, 32, 32, 0, + 1, 1, S<1, 16, 1, 8>, 4, + ck::BlockGemmPipelineScheduler::Interwave, ck::BlockGemmPipelineVersion::v2, ADataType, ADataType, PermuteA, PermuteB>; + +// clang-format on + +using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm; +template +bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config) +{ + using namespace ck::literals; + + auto M = problem_size.M; + auto N = problem_size.N; + auto K = problem_size.K; + auto StrideA = problem_size.StrideA; + auto StrideB = problem_size.StrideB; + auto StrideC = problem_size.StrideC; + auto KBatch = problem_size.KBatch; + + auto f_host_tensor_descriptor = + [](std::size_t row, std::size_t col, 
std::size_t stride, auto layout) { + if constexpr(std::is_same_v) + { + return HostTensorDescriptor({row, col}, {stride, 1_uz}); + } + else + { + return HostTensorDescriptor({row, col}, {1_uz, stride}); + } + }; + + auto f_get_default_stride = + [](std::size_t row, std::size_t col, ck::index_t stride, auto layout) { + if(stride == -1) + { + // give a chance if stride is -1, return a default packed stride + if constexpr(std::is_same_v) + { + return static_cast(col); + } + else + { + return static_cast(row); + } + } + else + return static_cast(stride); + }; + + StrideA = f_get_default_stride(M, K, StrideA, ALayout{}); + StrideB = f_get_default_stride(K, N, StrideB, BLayout{}); + StrideC = f_get_default_stride(M, N, StrideC, CLayout{}); + + Tensor a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{})); + Tensor b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{})); + Tensor b_k_n_permute(f_host_tensor_descriptor(K, N, StrideB, BLayout{})); + + switch(config.init_method) + { + case 0: + a_m_k.GenerateTensorValue(GeneratorTensor_1{1}); + b_k_n.GenerateTensorValue(GeneratorTensor_1{1}); + break; + case 1: + a_m_k.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + b_k_n.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + break; + case 2: + a_m_k.GenerateTensorValue(GeneratorTensor_1{1}); + b_k_n.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + break; + case 3: + a_m_k.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + b_k_n.GenerateTensorValue(GeneratorTensor_1{1}); + break; + default: + a_m_k.GenerateTensorValue(GeneratorTensor_3{0, 1.0}); + b_k_n.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + } + + Tensor c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{})); + Tensor c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{})); + + std::cout << "a_m_k: " << a_m_k.mDesc << std::endl; + std::cout << "b_k_n: " << b_k_n.mDesc << std::endl; + std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl; + + DeviceMem 
a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize()); + DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n_permute.mDesc.GetElementSpaceSize()); + DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize()); + + // weight permute + if constexpr(PermuteB) + { + int K1 = KPerBlock; + int K0 = K / KPerBlock; + + // int K0, N, K1 + for(int j = 0; j < K0; j++) + { + for(int i = 0; i < N; i++) + { + for(int jj = 0; jj < K1; jj++) + { + b_k_n_permute(j * N * K1 + i * K1 + jj) = b_k_n(i * K + (j * K1 + jj)); + } + } + } + } + else + { + for(int i = 0; i < N; i++) + { + for(int j = 0; j < K; j++) + { + b_k_n_permute(i * K + j) = b_k_n(i * K + j); + } + } + } + + a_m_k_device_buf.ToDevice(a_m_k.mData.data()); + b_k_n_device_buf.ToDevice(b_k_n_permute.mData.data()); + DeviceMem workspace; + + auto a_element_op = AElementOp{}; + auto b_element_op = BElementOp{}; + auto c_element_op = CElementOp{}; + + // do GEMM + auto gemm = DeviceGemmV2Instance{}; + auto invoker = gemm.MakeInvoker(); + float ave_time = 0; + + auto argument = gemm.MakeArgument(static_cast(a_m_k_device_buf.GetDeviceBuffer()), + static_cast(b_k_n_device_buf.GetDeviceBuffer()), + static_cast(c_m_n_device_buf.GetDeviceBuffer()), + M, + N, + K, + StrideA, + StrideB, + StrideC, + KBatch, + a_element_op, + b_element_op, + c_element_op); + + if(!gemm.IsSupportedArgument(argument)) + { + std::cerr << gemm.GetTypeString() << " does not support this problem" << std::endl; + + return true; + } + + bool pass = true; + if(config.do_verification) + { + auto ref_gemm = ReferenceGemmInstance{}; + auto ref_invoker = ref_gemm.MakeInvoker(); + + auto ref_argument = ref_gemm.MakeArgument( + a_m_k, b_k_n, c_m_n_host_result, PassThrough{}, PassThrough{}, PassThrough{}); + + ref_invoker.Run(ref_argument); + + ave_time = invoker.Run(argument, StreamConfig{nullptr, false, 0}); + c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data()); + + pass &= 
ck::utils::check_err(c_m_n_device_result, + c_m_n_host_result, + "Error: Incorrect results!", + get_rtol(), + get_atol()); + } + + if(config.time_kernel) + { + ave_time = + invoker.Run(argument, StreamConfig{nullptr, config.time_kernel, 0, 20, 50, true, 50}); + + std::size_t flop = 2_uz * M * N * K; + std::size_t num_btype = + sizeof(ADataType) * M * K + + sizeof(BDataType) * K * N / + (ck::is_same_v, ck::pk_i4_t> ? 2 : 1) + + sizeof(CDataType) * M * N; + + float tflops = static_cast(flop) / 1.E9 / ave_time; + + float gb_per_sec = num_btype / 1.E6 / ave_time; + + std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec + << " GB/s, " << gemm.GetTypeString() << std::endl; + } + return pass; +} + +bool run_gemm_splitk_example(int argc, char* argv[]) +{ + ProblemSizeSplitK problem_size; + ExecutionConfig config; + + return parse_cmd_args(argc, argv, problem_size, config) && run_gemm(problem_size, config); +} + +int main(int argc, char* argv[]) { return !run_gemm_splitk_example(argc, argv); } diff --git a/example/01_gemm/gemm_xdl_fp16_fp8_v3.cpp b/example/01_gemm/gemm_xdl_fp16_fp8_v3.cpp index 2e27fc66f..b0e36b394 100644 --- a/example/01_gemm/gemm_xdl_fp16_fp8_v3.cpp +++ b/example/01_gemm/gemm_xdl_fp16_fp8_v3.cpp @@ -1,12 +1,12 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
#include "common.hpp" #include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp" -using ADataType = ck::f8_t; -using BDataType = ck::half_t; +using ADataType = ck::half_t; +using BDataType = ck::f8_t; using AccDataType = float; using CShuffleDataType = ck::half_t; using CDataType = ck::half_t; @@ -29,15 +29,15 @@ using DeviceGemmV2Instance = AElementOp, BElementOp, CElementOp, GemmDefault, 64, 16, 16, - 64, 16, 8, + 256, 8, 16, 16, 16, 1, 1, - S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, - 2, 16, 16, 0, - S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, + S<32, 2, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, + S<16, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, + 2, 16, 16, 0, 1, 1, S<1, 16, 1, 4>, 4, - ck::BlockGemmPipelineScheduler::Intrawave,ck::BlockGemmPipelineVersion::v1>; + ck::BlockGemmPipelineScheduler::Interwave, ck::BlockGemmPipelineVersion::v1>; // clang-format on using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm, S<1, 0, 2>, S<1, 0, 2>, + 2, 8, 8, 0, + S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, + 2, 32, 32, 0, + 1, 1, S<1, 16, 1, 8>, 4, + ck::BlockGemmPipelineScheduler::Interwave, ck::BlockGemmPipelineVersion::v2, ADataType, ADataType, PermuteA, PermuteB>; + +// clang-format on + +using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm; +template +bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config) +{ + using namespace ck::literals; + + auto M = problem_size.M; + auto N = problem_size.N; + auto K = problem_size.K; + auto StrideA = problem_size.StrideA; + auto StrideB = problem_size.StrideB; + auto StrideC = problem_size.StrideC; + auto KBatch = problem_size.KBatch; + + auto f_host_tensor_descriptor = + [](std::size_t row, std::size_t col, std::size_t stride, auto layout) { + if constexpr(std::is_same_v) + { + return HostTensorDescriptor({row, col}, {stride, 1_uz}); + } + else + { + return HostTensorDescriptor({row, col}, {1_uz, stride}); + } + }; + + auto f_get_default_stride = + [](std::size_t row, std::size_t col, 
ck::index_t stride, auto layout) { + if(stride == -1) + { + // give a chance if stride is -1, return a default packed stride + if constexpr(std::is_same_v) + { + return static_cast(col); + } + else + { + return static_cast(row); + } + } + else + return static_cast(stride); + }; + + StrideA = f_get_default_stride(M, K, StrideA, ALayout{}); + StrideB = f_get_default_stride(K, N, StrideB, BLayout{}); + StrideC = f_get_default_stride(M, N, StrideC, CLayout{}); + + Tensor a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{})); + Tensor b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{})); + Tensor b_k_n_permute(f_host_tensor_descriptor(K, N, StrideB, BLayout{})); + + switch(config.init_method) + { + case 0: + a_m_k.GenerateTensorValue(GeneratorTensor_1{1}); + b_k_n.GenerateTensorValue(GeneratorTensor_1{1}); + break; + case 1: + a_m_k.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + b_k_n.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + break; + case 2: + a_m_k.GenerateTensorValue(GeneratorTensor_1{1}); + b_k_n.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + break; + case 3: + a_m_k.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + b_k_n.GenerateTensorValue(GeneratorTensor_1{1}); + break; + default: + a_m_k.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + b_k_n.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + } + + Tensor c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{})); + Tensor c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{})); + + std::cout << "a_m_k: " << a_m_k.mDesc << std::endl; + std::cout << "b_k_n: " << b_k_n.mDesc << std::endl; + std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl; + + DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize()); + DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n_permute.mDesc.GetElementSpaceSize()); + DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize()); + + // weight permute + if 
constexpr(PermuteB) + { + int K1 = KPerBlock; + int K0 = K / KPerBlock; + + // int K0, N, K1 + for(int j = 0; j < K0; j++) + { + for(int i = 0; i < N; i++) + { + for(int jj = 0; jj < K1; jj++) + { + b_k_n_permute(j * N * K1 + i * K1 + jj) = b_k_n(i * K + (j * K1 + jj)); + } + } + } + } + else + { + for(int i = 0; i < N; i++) + { + for(int j = 0; j < K; j++) + { + b_k_n_permute(i * K + j) = b_k_n(i * K + j); + } + } + } + + // vector pk_i4x4 permute + for(int i = 0; i < N; i++) + { + for(int j = 0; j < K; j += 8) + { + int input[8]; + + for(int k = 0; k < 4; k++) + { + int i4x2 = b_k_n_permute(j + k * 2, i).data; + input[k * 2 + 0] = (i4x2 >> 4) & 0xf; + input[k * 2 + 1] = (i4x2 >> 0) & 0xf; + } + + // permute 01234567->20643175 + { + int hi = input[2]; + int lo = input[0]; + int i4x2 = (hi << 4) | lo; + + b_k_n_permute(j + 0, i) = i4x2; + } + + { + int hi = input[6]; + int lo = input[4]; + int i4x2 = (hi << 4) | lo; + + b_k_n_permute(j + 2, i) = i4x2; + } + + { + int hi = input[3]; + int lo = input[1]; + int i4x2 = (hi << 4) | lo; + + b_k_n_permute(j + 4, i) = i4x2; + } + + { + int hi = input[7]; + int lo = input[5]; + int i4x2 = (hi << 4) | lo; + + b_k_n_permute(j + 6, i) = i4x2; + } + } + } + + a_m_k_device_buf.ToDevice(a_m_k.mData.data()); + b_k_n_device_buf.ToDevice(b_k_n_permute.mData.data()); + DeviceMem workspace; + + auto a_element_op = AElementOp{}; + auto b_element_op = BElementOp{}; + auto c_element_op = CElementOp{}; + + // do GEMM + auto gemm = DeviceGemmV2Instance{}; + auto invoker = gemm.MakeInvoker(); + float ave_time = 0; + + auto argument = gemm.MakeArgument(static_cast(a_m_k_device_buf.GetDeviceBuffer()), + static_cast(b_k_n_device_buf.GetDeviceBuffer()), + static_cast(c_m_n_device_buf.GetDeviceBuffer()), + M, + N, + K, + StrideA, + StrideB, + StrideC, + KBatch, + a_element_op, + b_element_op, + c_element_op); + + if(!gemm.IsSupportedArgument(argument)) + { + std::cerr << gemm.GetTypeString() << " does not support this problem" << std::endl; + + 
return true; + } + + bool pass = true; + if(config.do_verification) + { + auto ref_gemm = ReferenceGemmInstance{}; + auto ref_invoker = ref_gemm.MakeInvoker(); + + auto ref_argument = ref_gemm.MakeArgument( + a_m_k, b_k_n, c_m_n_host_result, PassThrough{}, PassThrough{}, PassThrough{}); + + ref_invoker.Run(ref_argument); + + ave_time = invoker.Run(argument, StreamConfig{nullptr, false, 0}); + c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data()); + + pass &= ck::utils::check_err(c_m_n_device_result, + c_m_n_host_result, + "Error: Incorrect results!", + get_rtol(), + get_atol()); + } + + if(config.time_kernel) + { + ave_time = + invoker.Run(argument, StreamConfig{nullptr, config.time_kernel, 0, 20, 50, true, 50}); + + std::size_t flop = 2_uz * M * N * K; + std::size_t num_btype = + sizeof(ADataType) * M * K + + sizeof(BDataType) * K * N / + (ck::is_same_v, ck::pk_i4_t> ? 2 : 1) + + sizeof(CDataType) * M * N; + + float tflops = static_cast(flop) / 1.E9 / ave_time; + + float gb_per_sec = num_btype / 1.E6 / ave_time; + + std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec + << " GB/s, " << gemm.GetTypeString() << std::endl; + } + return pass; +} + +bool run_gemm_splitk_example(int argc, char* argv[]) +{ + ProblemSizeSplitK problem_size; + ExecutionConfig config; + + return parse_cmd_args(argc, argv, problem_size, config) && run_gemm(problem_size, config); +} + +int main(int argc, char* argv[]) { return !run_gemm_splitk_example(argc, argv); } diff --git a/example/01_gemm/gemm_xdl_fp16_v3.cpp b/example/01_gemm/gemm_xdl_fp16_v3.cpp index ad370f570..4a969246c 100644 --- a/example/01_gemm/gemm_xdl_fp16_v3.cpp +++ b/example/01_gemm/gemm_xdl_fp16_v3.cpp @@ -12,7 +12,7 @@ using CShuffleDataType = ck::half_t; using CDataType = ck::half_t; using ALayout = Row; -using BLayout = Row; +using BLayout = Col; using CLayout = Row; using AElementOp = PassThrough; @@ -27,17 +27,17 @@ using DeviceGemmV2Instance = ALayout, BLayout, CLayout, 
ADataType, BDataType, CDataType, AccDataType, CShuffleDataType, PassThrough, PassThrough, PassThrough, GemmDefault, - 256, - 224, 256, - 64, 8, 2, + 64, + 16, 16, + 256, 8, 8, 16, 16, - 7, 8, - S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, + 1, 1, + S<32, 2, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, - S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, - 1, 8, 2, 0, - 1, 2, S<1, 32, 1, 8>, 8, - ck::BlockGemmPipelineScheduler::Intrawave,ck::BlockGemmPipelineVersion::v3>; + S<32, 2, 1>, S<1, 0, 2>, S<1, 0, 2>, + 2, 8, 8, 0, + 1, 1, S<1, 16, 1, 4>, 4, + ck::BlockGemmPipelineScheduler::Interwave, ck::BlockGemmPipelineVersion::v2>; // clang-format on using ReferenceGemmInstance = ck::tensor_operation::host:: diff --git a/example/01_gemm/run_gemm_example.inc b/example/01_gemm/run_gemm_example.inc index 3ee6e2685..4371af624 100644 --- a/example/01_gemm/run_gemm_example.inc +++ b/example/01_gemm/run_gemm_example.inc @@ -5,88 +5,6 @@ #include "ck/tensor_operation/gpu/device/device_gemm_streamk.hpp" -template -inline __host__ __device__ constexpr double get_rtol() -{ - if constexpr(std::is_same_v) - { - return 1e-3; - } - else if constexpr(std::is_same_v) - { - return 1e-6; - } - else if constexpr(std::is_same_v) - { - return 1e-3; - } - else if constexpr(std::is_same_v) - { - return 5e-2; - } - else if constexpr(std::is_same_v) - { - return 1e-1; - } - else if constexpr(std::is_same_v) - { - return 1e-1; - } - else if constexpr(std::is_same_v) - { - return 2e-1; - } - else if constexpr(std::is_same_v) - { - return 2e-1; - } - else - { - return 1e-3; - } -} - -template -inline __host__ __device__ constexpr double get_atol() -{ - if constexpr(std::is_same_v) - { - return 1e-3; - } - else if constexpr(std::is_same_v) - { - return 1e-6; - } - else if constexpr(std::is_same_v) - { - return 1e-3; - } - else if constexpr(std::is_same_v) - { - return 5e-2; - } - else if constexpr(std::is_same_v) - { - return 1e-1; - } - else if constexpr(std::is_same_v) - { - return 1e-1; - } - else if 
constexpr(std::is_same_v) - { - return 2e-1; - } - else if constexpr(std::is_same_v) - { - return 2e-1; - } - else - { - return 1e-3; - } -} - template bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config) { diff --git a/example/01_gemm/run_gemm_example_streamk_v2.inc b/example/01_gemm/run_gemm_example_streamk_v2.inc index 04243b829..9ee380d24 100755 --- a/example/01_gemm/run_gemm_example_streamk_v2.inc +++ b/example/01_gemm/run_gemm_example_streamk_v2.inc @@ -3,88 +3,6 @@ #pragma once -template -inline __host__ __device__ constexpr double get_rtol() -{ - if constexpr(std::is_same_v) - { - return 1e-3; - } - else if constexpr(std::is_same_v) - { - return 1e-6; - } - else if constexpr(std::is_same_v) - { - return 1e-3; - } - else if constexpr(std::is_same_v) - { - return 5e-2; - } - else if constexpr(std::is_same_v) - { - return 1e-1; - } - else if constexpr(std::is_same_v) - { - return 1e-1; - } - else if constexpr(std::is_same_v) - { - return 1e-1; // 240 and 224 are acceptable - } - else if constexpr(std::is_same_v) - { - return 1.5e-1; // 57344 and 49152 are acceptable - } - else - { - return 1e-3; - } -} - -template -inline __host__ __device__ constexpr double get_atol() -{ - if constexpr(std::is_same_v) - { - return 1e-3; - } - else if constexpr(std::is_same_v) - { - return 1e-6; - } - else if constexpr(std::is_same_v) - { - return 1e-3; - } - else if constexpr(std::is_same_v) - { - return 5e-2; - } - else if constexpr(std::is_same_v) - { - return 1e-1; - } - else if constexpr(std::is_same_v) - { - return 1e-1; - } - else if constexpr(std::is_same_v) - { - return 16.1; // 240 and 224 are acceptable - } - else if constexpr(std::is_same_v) - { - return 8192.1; // 57344 and 49152 are acceptable - } - else - { - return 1e-3; - } -} - template bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config) { diff --git a/example/01_gemm/run_gemm_example_v2.inc b/example/01_gemm/run_gemm_example_v2.inc index 5b6969f1d..2b60fa5d2 
100644 --- a/example/01_gemm/run_gemm_example_v2.inc +++ b/example/01_gemm/run_gemm_example_v2.inc @@ -3,88 +3,6 @@ #pragma once -template -inline __host__ __device__ constexpr double get_rtol() -{ - if constexpr(std::is_same_v) - { - return 1e-3; - } - else if constexpr(std::is_same_v) - { - return 1e-6; - } - else if constexpr(std::is_same_v) - { - return 1e-3; - } - else if constexpr(std::is_same_v) - { - return 5e-2; - } - else if constexpr(std::is_same_v) - { - return 1e-1; - } - else if constexpr(std::is_same_v) - { - return 1e-1; - } - else if constexpr(std::is_same_v) - { - return 1e-1; // 240 and 224 are acceptable - } - else if constexpr(std::is_same_v) - { - return 1.5e-1; // 57344 and 49152 are acceptable - } - else - { - return 1e-3; - } -} - -template -inline __host__ __device__ constexpr double get_atol() -{ - if constexpr(std::is_same_v) - { - return 1e-3; - } - else if constexpr(std::is_same_v) - { - return 1e-6; - } - else if constexpr(std::is_same_v) - { - return 1e-3; - } - else if constexpr(std::is_same_v) - { - return 5e-2; - } - else if constexpr(std::is_same_v) - { - return 1e-1; - } - else if constexpr(std::is_same_v) - { - return 1e-1; - } - else if constexpr(std::is_same_v) - { - return 16.1; // 240 and 224 are acceptable - } - else if constexpr(std::is_same_v) - { - return 8192.1; // 57344 and 49152 are acceptable - } - else - { - return 1e-3; - } -} - template bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config) { diff --git a/include/ck/library/utility/host_tensor.hpp b/include/ck/library/utility/host_tensor.hpp index 18e1db462..ef5738be0 100644 --- a/include/ck/library/utility/host_tensor.hpp +++ b/include/ck/library/utility/host_tensor.hpp @@ -266,18 +266,18 @@ struct Tensor using Data = std::vector; template - Tensor(std::initializer_list lens) : mDesc(lens), mData(mDesc.GetElementSpaceSize()) + Tensor(std::initializer_list lens) : mDesc(lens), mData(GetElementSpaceSize()) { } template 
Tensor(std::initializer_list lens, std::initializer_list strides) - : mDesc(lens, strides), mData(mDesc.GetElementSpaceSize()) + : mDesc(lens, strides), mData(GetElementSpaceSize()) { } template - Tensor(const Lengths& lens) : mDesc(lens), mData(mDesc.GetElementSpaceSize()) + Tensor(const Lengths& lens) : mDesc(lens), mData(GetElementSpaceSize()) { } @@ -287,7 +287,7 @@ struct Tensor { } - Tensor(const Descriptor& desc) : mDesc(desc), mData(mDesc.GetElementSpaceSize()) {} + Tensor(const Descriptor& desc) : mDesc(desc), mData(GetElementSpaceSize()) {} template Tensor CopyAsType() const @@ -322,7 +322,17 @@ struct Tensor std::size_t GetElementSize() const { return mDesc.GetElementSize(); } - std::size_t GetElementSpaceSize() const { return mDesc.GetElementSpaceSize(); } + std::size_t GetElementSpaceSize() const + { + if constexpr(ck::is_same_v, ck::pk_i4_t>) + { + return (mDesc.GetElementSpaceSize() + 1) / 2; + } + else + { + return mDesc.GetElementSpaceSize(); + } + } std::size_t GetElementSpaceSizeInBytes() const { return sizeof(T) * GetElementSpaceSize(); } @@ -469,29 +479,64 @@ struct Tensor template std::size_t GetOffsetFromMultiIndex(Is... is) const { - return mDesc.GetOffsetFromMultiIndex(is...); + if constexpr(ck::is_same_v, ck::pk_i4_t>) + { + return mDesc.GetOffsetFromMultiIndex(is...) / 2; + } + else + { + return mDesc.GetOffsetFromMultiIndex(is...); + } } template T& operator()(Is... is) { - return mData[mDesc.GetOffsetFromMultiIndex(is...)]; + if constexpr(ck::is_same_v, ck::pk_i4_t>) + { + return mData[mDesc.GetOffsetFromMultiIndex(is...) / 2]; + } + else + { + return mData[mDesc.GetOffsetFromMultiIndex(is...)]; + } } template const T& operator()(Is... is) const { - return mData[mDesc.GetOffsetFromMultiIndex(is...)]; + if constexpr(ck::is_same_v, ck::pk_i4_t>) + { + return mData[mDesc.GetOffsetFromMultiIndex(is...) 
/ 2]; + } + else + { + return mData[mDesc.GetOffsetFromMultiIndex(is...)]; + } } T& operator()(std::vector idx) { - return mData[mDesc.GetOffsetFromMultiIndex(idx)]; + if constexpr(ck::is_same_v, ck::pk_i4_t>) + { + return mData[mDesc.GetOffsetFromMultiIndex(idx) / 2]; + } + else + { + return mData[mDesc.GetOffsetFromMultiIndex(idx)]; + } } const T& operator()(std::vector idx) const { - return mData[mDesc.GetOffsetFromMultiIndex(idx)]; + if constexpr(ck::is_same_v, ck::pk_i4_t>) + { + return mData[mDesc.GetOffsetFromMultiIndex(idx) / 2]; + } + else + { + return mData[mDesc.GetOffsetFromMultiIndex(idx)]; + } } typename Data::iterator begin() { return mData.begin(); } diff --git a/include/ck/library/utility/host_tensor_generator.hpp b/include/ck/library/utility/host_tensor_generator.hpp index ab9f01b53..6a90523c3 100644 --- a/include/ck/library/utility/host_tensor_generator.hpp +++ b/include/ck/library/utility/host_tensor_generator.hpp @@ -81,6 +81,20 @@ struct GeneratorTensor_1 } }; +template <> +struct GeneratorTensor_1 +{ + int8_t value = 1; + + template + ck::pk_i4_t operator()(Is...) + { + int t = value + 8; + ck::pk_i4_t r = ((t << 4) + t) & 0xff; + return r; + } +}; + template struct GeneratorTensor_2 { @@ -121,6 +135,22 @@ struct GeneratorTensor_2 } }; +template <> +struct GeneratorTensor_2 +{ + int min_value = 0; + int max_value = 1; + + template + ck::pk_i4_t operator()(Is...) + { + int hi = std::rand() % (max_value - min_value) + min_value + 8; + int lo = std::rand() % (max_value - min_value) + min_value + 8; + ck::pk_i4_t r = ((hi << 4) + lo) & 0xff; + return r; + } +}; + #if defined CK_ENABLE_FP8 template <> struct GeneratorTensor_2 diff --git a/include/ck/tensor/static_tensor.hpp b/include/ck/tensor/static_tensor.hpp index d719ef976..ef2bedd65 100644 --- a/include/ck/tensor/static_tensor.hpp +++ b/include/ck/tensor/static_tensor.hpp @@ -167,7 +167,7 @@ struct StaticTensorTupleOfVectorBuffer // Idx is for S, not X. 
Idx should be aligned with X template ::value && + typename enable_if<(has_same_scalar_type::value || !is_native_type()) && is_known_at_compile_time::value && Idx::Size() == ndim_, bool>::type = false> __host__ __device__ constexpr X GetAsType(Idx) const @@ -201,7 +201,7 @@ struct StaticTensorTupleOfVectorBuffer // Idx is for S, not X. Idx should be aligned with X template ::value && + typename enable_if<(has_same_scalar_type::value || !is_native_type()) && is_known_at_compile_time::value && Idx::Size() == ndim_, bool>::type = false> __host__ __device__ constexpr void SetAsType(Idx, X x) diff --git a/include/ck/tensor_operation/gpu/device/device_gemm_v2.hpp b/include/ck/tensor_operation/gpu/device/device_gemm_v2.hpp index b2db35b15..43909f77d 100644 --- a/include/ck/tensor_operation/gpu/device/device_gemm_v2.hpp +++ b/include/ck/tensor_operation/gpu/device/device_gemm_v2.hpp @@ -36,6 +36,10 @@ struct DeviceGemmV2 : public BaseOperator CElementwiseOperation c_element_op) = 0; virtual std::unique_ptr MakeInvokerPointer() = 0; + + virtual bool GetPermuteA() = 0; + virtual bool GetPermuteB() = 0; + virtual ck::index_t GetKPerBlock() = 0; }; template + typename ComputeTypeB = ComputeTypeA, + bool PermuteA = false, + bool PermuteB = false> struct DeviceGemm_Xdl_CShuffleV3 : public DeviceGemmV2; + ComputeTypeB, + PermuteA, + PermuteB>; using Argument = typename GridwiseGemm::Argument; @@ -633,6 +637,11 @@ struct DeviceGemm_Xdl_CShuffleV3 : public DeviceGemmV2(p_arg)); } + index_t GetKPerBlock() override { return KPerBlock; } + + bool GetPermuteA() override { return PermuteA; } + bool GetPermuteB() override { return PermuteB; } + static auto MakeArgument(const ADataType* p_a, const BDataType* p_b, CDataType* p_c, diff --git a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp index 39b81ca57..86a5af41b 100644 --- 
a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp +++ b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp @@ -7,12 +7,177 @@ #include "ck/utility/math.hpp" #include "ck/utility/math_v2.hpp" #include "ck/utility/type_convert.hpp" +#include "ck/utility/amd_inline_asm.hpp" #include namespace ck { + +// Fast int4x4 to half8_t data type conversion based on paper +// [Who Says Elephants Can't Run: Bringing Large Scale MoE Models into Cloud Scale Production] +// (https://arxiv.org/abs/2211.10017) and implementation: +// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h +__host__ __device__ inline half4_t pki4_to_half4(int q) +{ + const int LO = 0x000f000f; + const int HI = 0x00f000f0; + const int EX = 0x64006400; + + // Extract the two int4 at low bit and create two fp16 number. + int lo = amd_assembly_and_or_b32(q, LO, EX); + // Extract the two int4 at hight bit and create two fp16 number. 
+ int hi = amd_assembly_and_or_b32(q, HI, EX); + + const int SUB = 0xE408E408; // half2 {-1032, -1032} + const int MUL = 0x2c002c00; // half2 {1 / 16, 1 / 16} + const int ADD = 0xd480d480; // half2 {-72, -72} + + vector_type res; + + // for two fp16 from lowbit, subtract 1032 to get correct fp16 value + res.template AsType()(Number<0>{}) = + amd_assembly_pk_add_f16(bit_cast(lo), bit_cast(SUB)); + + // for two fp16 from highbit, divide 16 and subtract 72 to get correct fp16 value + res.template AsType()(Number<1>{}) = amd_assembly_pk_fma_f16( + bit_cast(hi), bit_cast(MUL), bit_cast(ADD)); + + return res.template AsType()[Number<0>{}]; +} + +__host__ __device__ inline half2_t pki4_to_half2(pk_i4_t q) +{ +#if 1 + uint8_t x_u8 = ck::bit_cast(q); + uint32_t i4s = ((x_u8 & 0x0f) << 16) | ((x_u8 & 0xf0) >> 4); + + const int EX = 0x64006400; + const int SUB = 0xE408E408; //-8 + + int lo = i4s | EX; + + return amd_assembly_pk_add_f16(bit_cast(lo), bit_cast(SUB)); +#else + uint8_t x_u8 = ck::bit_cast(q); + + vector_type res; + + half_t x_h = (x_u8 & 0x0f) - 8; + half_t x_l = ((x_u8 & 0xf0) >> 4) - 8; + + res.template AsType()(Number<0>{}) = x_l; + res.template AsType()(Number<1>{}) = x_h; + + return res.template AsType()[Number<0>{}]; +#endif +} + +__host__ __device__ inline bhalf4_t pki4_to_bhalf4(int q) +{ + uint32_t i8s = (q & 0xf) | ((q & 0xf0) << 4) | ((q & 0xf00) << 8) | ((q & 0xf000) << 12); + + static constexpr uint32_t fp32_base = 0x4B000000; + + float fp32_intermediates[4]; + + uint32_t* fp32_intermediates_casted = reinterpret_cast(fp32_intermediates); + + fp32_intermediates_casted[0] = __byte_perm(i8s, fp32_base, 0x7650); + fp32_intermediates_casted[1] = __byte_perm(i8s, fp32_base, 0x7651); + fp32_intermediates_casted[2] = __byte_perm(i8s, fp32_base, 0x7652); + fp32_intermediates_casted[3] = __byte_perm(i8s, fp32_base, 0x7653); + + fp32_intermediates[0] -= 8388616.f; + fp32_intermediates[1] -= 8388616.f; + fp32_intermediates[2] -= 8388616.f; + 
fp32_intermediates[3] -= 8388616.f; + + vector_type res; + res.template AsType()(Number<0>{}) = bit_cast( + __byte_perm(fp32_intermediates_casted[1], fp32_intermediates_casted[0], 0x7632)); + res.template AsType()(Number<1>{}) = bit_cast( + __byte_perm(fp32_intermediates_casted[3], fp32_intermediates_casted[2], 0x7632)); + + return res.template AsType()[Number<0>{}]; +} + +__host__ __device__ inline bhalf2_t pki4_to_bhalf2(pk_i4_t q) +{ + uint8_t x_u8 = ck::bit_cast(q); + + float x_h = ((x_u8 & 0x0f) >> 0) - 8.f; + float x_l = ((x_u8 & 0xf0) >> 4) - 8.f; + + vector_type res; + + res.template AsType()(Number<0>{}) = type_convert(x_l); + res.template AsType()(Number<1>{}) = type_convert(x_h); + + return res.template AsType()[Number<0>{}]; +} + namespace tensor_operation { namespace element_wise { +struct PassThroughPack8 +{ + template + __host__ __device__ void operator()(Y& y, const X& x) const; + + __host__ __device__ constexpr void operator()(ck::half8_t& y, const ck::pk_i4x4_t& x) const + { +#if 1 + vector_type result; + + result.template AsType()(Number<0>{}) = pki4_to_half4(bit_cast(x)); + result.template AsType()(Number<1>{}) = pki4_to_half4(bit_cast(x) >> 8); + + y = result.template AsType()[Number<0>{}]; +#else + vector_type dst; + vector_type src{x}; + + dst.template AsType()(Number<0>{}) = + pki4_to_half2(src.template AsType()[Number<0>{}]); + dst.template AsType()(Number<1>{}) = + pki4_to_half2(src.template AsType()[Number<1>{}]); + dst.template AsType()(Number<2>{}) = + pki4_to_half2(src.template AsType()[Number<2>{}]); + dst.template AsType()(Number<3>{}) = + pki4_to_half2(src.template AsType()[Number<3>{}]); + + y = dst.template AsType()[Number<0>{}]; +#endif + } + + __host__ __device__ constexpr void operator()(ck::bhalf8_t& y, const ck::pk_i4x4_t& x) const + { +#if 1 + vector_type result; + + result.template AsType()(Number<0>{}) = pki4_to_bhalf4(bit_cast(x)); + result.template AsType()(Number<1>{}) = pki4_to_bhalf4(bit_cast(x) >> 16); + + y = 
result.template AsType()[Number<0>{}]; +#else + vector_type dst; + vector_type src{x}; + + dst.template AsType()(Number<0>{}) = + pki4_to_bhalf2(src.template AsType()[Number<0>{}]); + dst.template AsType()(Number<1>{}) = + pki4_to_bhalf2(src.template AsType()[Number<1>{}]); + dst.template AsType()(Number<2>{}) = + pki4_to_bhalf2(src.template AsType()[Number<2>{}]); + dst.template AsType()(Number<3>{}) = + pki4_to_bhalf2(src.template AsType()[Number<3>{}]); + + y = dst.template AsType()[Number<0>{}]; +#endif + } + + constexpr const static bool is_pack8_invocable = true; +}; + #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wnon-virtual-dtor" struct UnaryOpBase @@ -49,6 +214,24 @@ struct PassThroughPack2 auto t = type_convert(x); y = type_convert(t); } + + __host__ __device__ constexpr void operator()(ck::half2_t& y, const ck::pk_i4_t& x) const + { +#if 1 + uint8_t x_u8 = ck::bit_cast(x); + uint8_t x_l = (x_u8 & 0x0f) >> 0; + uint8_t x_h = (x_u8 & 0xf0) >> 4; + + auto l_f16 = ck::type_convert(x_l); + auto h_f16 = ck::type_convert(x_h); + + y = {l_f16, h_f16}; +#else + uint32_t t = ck::bit_cast(x); + y = ck::bit_cast(t); +#endif + } + constexpr const static bool is_pack2_invocable = true; }; @@ -76,6 +259,12 @@ struct PassThrough final : public UnaryOpBase template __host__ __device__ void operator()(Y& y, const X& x) const; + template <> + __host__ __device__ void operator()(pk_i4_t& y, const pk_i4_t& x) const + { + y = x; + } + template <> __host__ __device__ void operator()(float& y, const double& x) const { diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp index 36797a906..a43f0f880 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp @@ -127,7 +127,9 @@ template + typename ComputeTypeB = ComputeTypeA, + bool PermuteA = false, + bool 
PermuteB = false> struct GridwiseGemm_xdl_cshuffle_v3 { static constexpr auto I0 = Number<0>{}; @@ -151,6 +153,20 @@ struct GridwiseGemm_xdl_cshuffle_v3 using ThisThreadBlock = ThisThreadBlock; + static constexpr index_t APackedSize = []() { + if constexpr(is_same_v, pk_i4_t>) + return 2; + else + return 1; + }(); + + static constexpr index_t BPackedSize = []() { + if constexpr(is_same_v, pk_i4_t>) + return 2; + else + return 1; + }(); + __host__ static auto CalculateGridSize(index_t M, index_t N, index_t KBatch) { return std::make_tuple(Block2CTileMap::CalculateGridSize(M, N), 1, KBatch); @@ -319,6 +335,10 @@ struct GridwiseGemm_xdl_cshuffle_v3 using GemmSpecialization = tensor_operation::device::GemmSpecialization; + static_assert(!(is_same_v, pk_i4_t> && + GemmSpec != GemmSpecialization::Default), + "pk_i4_t does not support padding"); + if constexpr(GemmSpec == GemmSpecialization::NKPadding || GemmSpec == GemmSpecialization::MNKPadding) { @@ -373,15 +393,39 @@ struct GridwiseGemm_xdl_cshuffle_v3 } else { - // not pad N or K - const auto b_grid_desc_bk0_n_bk1 = transform_tensor_descriptor( - b_grid_desc_nraw_kraw, - make_tuple(make_unmerge_transform(make_tuple(BK0, BK1Value)), - make_pass_through_transform(N)), - make_tuple(Sequence<1>{}, Sequence<0>{}), - make_tuple(Sequence<0, 2>{}, Sequence<1>{})); - - return b_grid_desc_bk0_n_bk1; + if constexpr(!PermuteB) + { + // not pad N or K + const auto b_grid_desc_bk0_n_bk1 = transform_tensor_descriptor( + b_grid_desc_nraw_kraw, + make_tuple(make_unmerge_transform(make_tuple(BK0, BK1Value)), + make_pass_through_transform(N)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + return b_grid_desc_bk0_n_bk1; + } + else + { + // Pre-shuffled Weight + // BGlobal[K / KPerBlock, N, KPerBlock / K1, K1] -> BTile[K / K1, N, K1] + constexpr index_t BK01 = KPerBlock / BK1Value; + const index_t BK0_ = StrideB / BK1Value; + const index_t BK00 = BK0_ / BK01; + + const auto 
b_grid_desc_bk00_n_bk01_bk1_permute = + make_naive_tensor_descriptor_packed(make_tuple(BK00, N, BK01, BK1Value)); + + const auto b_grid_desc_bk0_n_bk1_permute = transform_tensor_descriptor( + b_grid_desc_bk00_n_bk01_bk1_permute, + make_tuple(make_merge_transform(make_tuple(BK00, BK01)), + make_pass_through_transform(make_tuple(N)), + make_pass_through_transform(BK1Value)), + make_tuple(Sequence<0, 2>{}, Sequence<1>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); + + return b_grid_desc_bk0_n_bk1_permute; + } } } @@ -572,7 +616,7 @@ struct GridwiseGemm_xdl_cshuffle_v3 { if constexpr(is_same_v) { - a_k_split_offset = blockIdx.z * karg.KRead; + a_k_split_offset = blockIdx.z * karg.KRead / APackedSize; } else if constexpr(is_same_v) { @@ -585,7 +629,15 @@ struct GridwiseGemm_xdl_cshuffle_v3 } else if constexpr(is_same_v) { - b_k_split_offset = blockIdx.z * karg.KRead; + if constexpr(!PermuteB) + { + b_k_split_offset = blockIdx.z * karg.KRead / BPackedSize; + } + else + { + const int k0_offset = karg.KRead * karg.N; + b_k_split_offset = blockIdx.z * k0_offset / BPackedSize; + } } if(blockIdx.z < static_cast(karg.KBatch - 1)) @@ -625,9 +677,8 @@ struct GridwiseGemm_xdl_cshuffle_v3 // in some cases. else if constexpr(is_same::value) { - constexpr auto MLdsLayer = 32 * 4 / KPerBlock / sizeof(ADataType) < 1 - ? 1 - : 32 * 4 / KPerBlock / sizeof(ADataType); + constexpr index_t LdsSize = 32 * 4 / KPerBlock / sizeof(ADataType) / APackedSize; + constexpr auto MLdsLayer = LdsSize < 1 ? 1 : LdsSize; constexpr auto a_lds_block_desc = make_naive_tensor_descriptor( make_tuple( AK0Number * Number{}, Number{}, AK1Number), @@ -761,10 +812,8 @@ struct GridwiseGemm_xdl_cshuffle_v3 else if constexpr(is_same::value) { // NLdsLayer * K0 as logical Bank - constexpr auto NLdsLayer = 32 * 4 / KPerBlock / sizeof(BDataType) < 1 - ? 
1 - : 32 * 4 / KPerBlock / sizeof(BDataType); - ; + constexpr index_t LdsSize = 32 * 4 / KPerBlock / sizeof(BDataType) / BPackedSize; + constexpr index_t NLdsLayer = LdsSize < 1 ? 1 : LdsSize; constexpr auto b_lds_block_desc = make_naive_tensor_descriptor( make_tuple( BK0Number * Number{}, Number{}, BK1Number), @@ -946,8 +995,8 @@ struct GridwiseGemm_xdl_cshuffle_v3 constexpr auto c_block_size = c_shuffle_block_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize(); - return math::max((a_block_space_size_aligned * sizeof(ADataType) + - b_block_space_size_aligned * sizeof(BDataType)), + return math::max((a_block_space_size_aligned * sizeof(ADataType) / APackedSize + + b_block_space_size_aligned * sizeof(BDataType) / BPackedSize), c_block_size * sizeof(CShuffleDataType)); } @@ -1312,8 +1361,9 @@ struct GridwiseGemm_xdl_cshuffle_v3 static_cast(p_shared), a_block_desc_ak0_m_ak1.GetElementSpaceSize()); auto b_block_buf = make_dynamic_buffer( - static_cast(p_shared) + - a_block_space_size_aligned * sizeof(ADataType) / sizeof(BDataType), + reinterpret_cast(static_cast(p_shared) + a_block_space_size_aligned * + sizeof(ADataType) / + APackedSize), b_block_desc_bk0_n_bk1.GetElementSpaceSize()); constexpr auto a_block_slice_copy_step = make_multi_index(KPerBlock / AK1Number, 0, 0); @@ -1706,16 +1756,16 @@ struct GridwiseGemm_xdl_cshuffle_v3 static_cast(p_shared_0), a_block_desc_ak0_m_ak1.GetElementSpaceSize()); auto b_block_buf_ping = make_dynamic_buffer( - static_cast(p_shared_0) + - a_block_space_size_aligned * sizeof(ADataType) / sizeof(BDataType), + bit_cast(static_cast(p_shared_0) + + a_block_space_size_aligned * sizeof(ADataType)), b_block_desc_bk0_n_bk1.GetElementSpaceSize()); auto a_block_buf_pong = make_dynamic_buffer( static_cast(p_shared_1), a_block_desc_ak0_m_ak1.GetElementSpaceSize()); auto b_block_buf_pong = make_dynamic_buffer( - static_cast(p_shared_1) + - a_block_space_size_aligned * sizeof(ADataType) / sizeof(BDataType), + bit_cast(bit_cast(p_shared_1) 
+ + a_block_space_size_aligned * sizeof(ADataType)), b_block_desc_bk0_n_bk1.GetElementSpaceSize()); auto a_block_bufs = make_tuple(a_block_buf_ping, a_block_buf_pong); diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp index d7a6a3624..758900200 100644 --- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp +++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp @@ -1007,6 +1007,13 @@ struct ThreadwiseTensorSliceTransfer_v4 using SrcCoordStep = decltype(make_tensor_coordinate_step(SrcDesc{}, Index{})); + static constexpr index_t PackedSize = []() { + if constexpr(is_same_v, pk_i4_t>) + return 2; + else + return 1; + }(); + __device__ constexpr ThreadwiseTensorSliceTransfer_v4(const Index& src_ref_idx) : src_ref_coord_(make_tensor_coordinate(SrcDesc{}, src_ref_idx)) { @@ -1015,6 +1022,11 @@ struct ThreadwiseTensorSliceTransfer_v4 static_assert(SliceLengths::At(Number{}) % SrcScalarPerVector == 0, "wrong! 
Not divisible"); + + if constexpr(is_same_v, pk_i4_t>) + { + static_assert(SrcScalarPerVector % PackedSize == 0, "pk data N cannot be 1"); + } } template src_tmp_vector; + vector_type_maker_t src_tmp_vector; using src_vector_t = typename decltype(src_tmp_vector)::type; @@ -1120,7 +1132,8 @@ struct ThreadwiseTensorSliceTransfer_v4 if constexpr(SrcBuffer::IsDynamicBuffer()) { src_tmp_vector.template AsType()(Number<0>{}) = - src_buf.template Get(src_data_coord.GetOffset(), is_src_valid); + src_buf.template Get(src_data_coord.GetOffset() / PackedSize, + is_src_valid); } else if constexpr(SrcBuffer::IsStaticBuffer()) { @@ -1133,9 +1146,36 @@ struct ThreadwiseTensorSliceTransfer_v4 }); } - if constexpr(is_same, f8_t>::value && - is_same, half_t>::value && - SrcScalarPerVector % 2 == 0) + if constexpr(is_same, pk_i4_t>::value) + { + // copy data from src_tmp_vector to dst_tmp_vector (data cast data from SrcData to + // DstData) + vector_type_maker_t dst_tmp_vector; + + constexpr index_t pack_size = 8; + + static_assert(SrcScalarPerVector % pack_size == 0, ""); + + using src_v_t = typename vector_type_maker_t::type; + using dst_v_t = typename vector_type_maker_t::type; + + static_for<0, SrcScalarPerVector / pack_size, 1>{}([&](auto i) { + ck::tensor_operation::element_wise::PassThroughPack8{}( + dst_tmp_vector.template AsType()(i), + src_tmp_vector.template AsType()[i]); + }); + + // copy data from dst_tmp_vector into dst_buf + static_for<0, SrcScalarPerVector, 1>{}([&](auto i) { + constexpr index_t dst_offset = dst_desc.CalculateOffset( + dst_origin_idx + data_to_origin_disp_idx + i * src_scalar_step_in_vector); + + dst_buf(Number{}) = dst_tmp_vector.template AsType()[i]; + }); + } + else if constexpr(is_same, f8_t>::value && + is_same, half_t>::value && + SrcScalarPerVector % 2 == 0) { // copy data from src_tmp_vector to dst_tmp_vector (data cast data from SrcData to // DstData) diff --git 
a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp index 96ea04c8f..8cbe6bd2c 100644 --- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp +++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp @@ -31,8 +31,8 @@ template {}; + static constexpr index_t PackedSize = []() { + if constexpr(is_same_v, pk_i4_t>) + return 2; + else + return 1; + }(); + + static constexpr auto SrcScalarPerVector = Number{}; + static constexpr auto DstScalarPerVector = Number{}; + __device__ constexpr ThreadwiseTensorSliceTransfer_v3r1( const SrcDesc& src_desc, const Index& src_slice_origin, @@ -67,6 +77,17 @@ struct ThreadwiseTensorSliceTransfer_v3r1 src_element_op_(src_element_op), dst_element_op_(dst_element_op) { + if constexpr(is_same_v, pk_i4_t>) + { + static_assert(is_same_v, remove_cvref_t>, + "SrcData != DstData"); + + static_assert( + SrcScalarPerVector_ % PackedSize == 0 && DstScalarPerVector_ % PackedSize == 0, + "SrcScalarPerVector_ and DstScalarPerVector_ cannot be 1 for packed data type"); + + static_assert(SrcVectorDim == DstVectorDim, "pk_i4_t does not support transpose"); + } } __device__ void SetSrcSliceOrigin(const SrcDesc& src_desc, const Index& src_slice_origin_idx) @@ -95,11 +116,11 @@ struct ThreadwiseTensorSliceTransfer_v3r1 // scalar per access on each dim // TODO: don't use lambda_scalar_per_access constexpr auto src_scalar_per_access = generate_sequence( - detail::lambda_scalar_per_access{}, Number{}); + detail::lambda_scalar_per_access{}, Number{}); constexpr auto src_access_lengths = SliceLengths{} / src_scalar_per_access; - static_assert(SliceLengths::At(SrcVectorDim) % SrcScalarPerVector == 0, + static_assert(SliceLengths::At(SrcVectorDim) % (SrcScalarPerVector_) == 0, "SliceLengths[SrcVectorDim] must be divisible by SrcScalarPerVector"); constexpr auto src_dim_access_order = 
SrcDimAccessOrder{}; @@ -180,9 +201,6 @@ struct ThreadwiseTensorSliceTransfer_v3r1 using src_vector_type = vector_type_maker_t; using src_vector_t = typename src_vector_type::type; - auto src_vector_container = - src_vector_type{src_buf.template Get(src_coord_.GetOffset(), true)}; - using dst_vector_type = vector_type_maker_t; using dst_vector_t = typename dst_vector_type::type; dst_vector_type op_r_v; @@ -193,17 +211,22 @@ struct ThreadwiseTensorSliceTransfer_v3r1 if constexpr(decltype(src_element_op_)::is_pack8_invocable) return math::min(8, SrcScalarPerVector); } - if constexpr(is_detected::value) + else if constexpr(is_detected::value) { if constexpr(decltype(src_element_op_)::is_pack4_invocable) return math::min(4, SrcScalarPerVector); } - if constexpr(is_detected::value) + else if constexpr(is_detected::value) { if constexpr(decltype(src_element_op_)::is_pack2_invocable) return math::min(2, SrcScalarPerVector); } - return 1; + else + { + return 1; + } }; constexpr index_t elem_op_vec_len = get_elem_op_vec_len(); @@ -211,6 +234,9 @@ struct ThreadwiseTensorSliceTransfer_v3r1 using src_elem_op_vec_t = typename vector_type::type; using dst_elem_op_vec_t = typename vector_type::type; + auto src_vector_container = src_vector_type{ + src_buf.template Get(src_coord_.GetOffset() / PackedSize, true)}; + static_for<0, SrcScalarPerVector / elem_op_vec_len, 1>{}([&](auto idx) { // apply the src elementwise op and convert to DstData under the hood if needed src_element_op_(op_r_v.template AsType()(idx), @@ -276,10 +302,9 @@ struct ThreadwiseTensorSliceTransfer_v3r1 dst_thread_scratch_(idx) = src_thread_scratch_tuple_[thread_scratch_id][idx]; }); #else - // OOB Check constexpr auto src_scalar_per_access = generate_sequence( - detail::lambda_scalar_per_access{}, Number{}); + detail::lambda_scalar_per_access{}, Number{}); constexpr auto src_access_lengths = SliceLengths{} / src_scalar_per_access; @@ -350,6 +375,8 @@ struct ThreadwiseTensorSliceTransfer_v3r1 (is_same>::value 
&& SrcScalarPerVector % 4 == 0 && DstScalarPerVector % 4 == 0))) { + static_assert(!is_same_v, pk_i4_t>, + "in-register transpose is not supported for pk_i4_t"); // each transpose does // DstScalarPerVector # of src vectors in src_thread_scratch_ // SrcScalarPerVector # of dst vectors in dst_thread_scratch_ @@ -410,7 +437,12 @@ struct ThreadwiseTensorSliceTransfer_v3r1 } else { - static_ford{}([&](auto idx) { + constexpr auto packed_per_access = generate_sequence( + detail::lambda_scalar_per_access{}, Number{}); + + constexpr auto packed_access_lengths = SliceLengths{} / packed_per_access; + + static_ford{}([&](auto idx) { dst_thread_scratch_(idx) = src_thread_scratch_tuple_[thread_scratch_id][idx]; }); } @@ -438,7 +470,7 @@ struct ThreadwiseTensorSliceTransfer_v3r1 // src scalar per access on each dim // TODO: don't use this constexpr auto dst_scalar_per_access = generate_sequence( - detail::lambda_scalar_per_access{}, Number{}); + detail::lambda_scalar_per_access{}, Number{}); constexpr auto dst_access_lengths = SliceLengths{} / dst_scalar_per_access; @@ -526,13 +558,11 @@ struct ThreadwiseTensorSliceTransfer_v3r1 // apply DstElementwiseOperation dst_element_op_(dst_v, dst_vector_container.template AsType()[i]); - - dst_vector_container.template AsType()(i) = dst_v; }); // copy data from dst_vector_container to dst_buf dst_buf.template Set( - dst_coord_.GetOffset(), + dst_coord_.GetOffset() / PackedSize, is_dst_valid, dst_vector_container.template AsType()[I0]); @@ -586,7 +616,7 @@ struct ThreadwiseTensorSliceTransfer_v3r1 // scalar per access on each dim // TODO: don't use lambda_scalar_per_access constexpr auto src_scalar_per_access = generate_sequence( - detail::lambda_scalar_per_access{}, Number{}); + detail::lambda_scalar_per_access{}, Number{}); constexpr auto src_access_lengths = SliceLengths{} / src_scalar_per_access; @@ -644,7 +674,7 @@ struct ThreadwiseTensorSliceTransfer_v3r1 // scalar per access on each dim // TODO: don't use lambda_scalar_per_access 
constexpr auto dst_scalar_per_access = generate_sequence( - detail::lambda_scalar_per_access{}, Number{}); + detail::lambda_scalar_per_access{}, Number{}); constexpr auto dst_access_lengths = SliceLengths{} / dst_scalar_per_access; @@ -730,7 +760,7 @@ struct ThreadwiseTensorSliceTransfer_v3r1 __device__ static constexpr auto GetSrcThreadScratchDescriptor() { constexpr auto src_scalar_per_access = generate_sequence( - detail::lambda_scalar_per_access{}, Number{}); + detail::lambda_scalar_per_access{}, Number{}); constexpr auto src_access_lengths = SliceLengths{} / src_scalar_per_access; @@ -779,7 +809,7 @@ struct ThreadwiseTensorSliceTransfer_v3r1 __device__ static constexpr auto GetSrcOOBThreadScratchDescriptor() { constexpr auto src_scalar_per_access = generate_sequence( - detail::lambda_scalar_per_access{}, Number{}); + detail::lambda_scalar_per_access{}, Number{}); constexpr auto src_access_lengths = SliceLengths{} / src_scalar_per_access; @@ -790,7 +820,7 @@ struct ThreadwiseTensorSliceTransfer_v3r1 { // 1st stage of transforms constexpr auto dst_scalar_per_access = generate_sequence( - detail::lambda_scalar_per_access{}, Number{}); + detail::lambda_scalar_per_access{}, Number{}); constexpr auto dst_access_lengths = SliceLengths{} / dst_scalar_per_access; diff --git a/include/ck/utility/amd_buffer_addressing.hpp b/include/ck/utility/amd_buffer_addressing.hpp index 5367c3d72..ad13c4431 100644 --- a/include/ck/utility/amd_buffer_addressing.hpp +++ b/include/ck/utility/amd_buffer_addressing.hpp @@ -429,7 +429,8 @@ __device__ typename vector_type::type amd_buffer_load_impl(int32x4_t src_w (is_same::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) || (is_same::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) || (is_same::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) || - (is_same::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)), + (is_same::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) || + 
(is_same::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)), "wrong! not implemented"); using r_t = typename vector_type::type; diff --git a/include/ck/utility/amd_inline_asm.hpp b/include/ck/utility/amd_inline_asm.hpp index 5dc67a5ad..6761c08f2 100644 --- a/include/ck/utility/amd_inline_asm.hpp +++ b/include/ck/utility/amd_inline_asm.hpp @@ -11,6 +11,27 @@ namespace ck { +inline __device__ int amd_assembly_and_or_b32(int a, int b, int d) +{ + int c; + asm volatile("v_and_or_b32 %0, %1, %2, %3" : "=v"(c) : "v"(a), "v"(b), "v"(d)); + return c; +} + +inline __device__ half2_t amd_assembly_pk_fma_f16(half2_t a, half2_t b, half2_t c) +{ + half2_t d; + asm volatile("v_pk_fma_f16 %0, %1, %2, %3;\n" : "=v"(d) : "v"(a), "v"(b), "v"(c)); + return d; +} + +inline __device__ half2_t amd_assembly_pk_add_f16(half2_t a, half2_t b) +{ + half2_t c; + asm volatile("v_pk_add_f16 %0, %1, %2;\n" : "=v"(c) : "v"(a), "v"(b)); + return c; +} + // c0 += inner_product(a, b0) // c1 += inner_product(a, b1) __device__ void amd_assembly_outer_product_1x2(float a, float b0, float b1, float& c0, float& c1) diff --git a/include/ck/utility/data_type.hpp b/include/ck/utility/data_type.hpp index a7dc071bc..86bc3c394 100644 --- a/include/ck/utility/data_type.hpp +++ b/include/ck/utility/data_type.hpp @@ -12,6 +12,15 @@ using bhalf_t = ushort; using half_t = _Float16; using int4_t = _BitInt(4); +// custom data type - pack int4 data +struct pk_i4_t +{ + using type = int8_t; + type data; + __host__ __device__ constexpr pk_i4_t() : data{type{}} {} + __host__ __device__ constexpr pk_i4_t(type init) : data{init} {} +}; + inline constexpr auto next_pow2(uint32_t x) { // Precondition: x > 1. 
@@ -165,6 +174,13 @@ struct scalar_type }; #endif +template <> +struct scalar_type +{ + using type = pk_i4_t; + static constexpr index_t vector_size = 1; +}; + template <> struct scalar_type { @@ -1044,6 +1060,12 @@ struct nnvb_data_t_selector using type = bf8_ocp_t::data_type; }; +template <> +struct nnvb_data_t_selector +{ + using type = pk_i4_t::type; +}; + template struct non_native_vector_base< T, @@ -1163,6 +1185,14 @@ struct scalar_type> static constexpr index_t vector_size = N; }; +template +struct scalar_type> +{ + using type = typename non_native_vector_base::data_t; + + static constexpr index_t vector_size = N; +}; + // non-native vector_type implementation template struct vector_type()>> @@ -1871,6 +1901,11 @@ using uint8x16_t = typename vector_type::type; using uint8x32_t = typename vector_type::type; using uint8x64_t = typename vector_type::type; +// pack int4 +using pk_i4x2_t = typename vector_type::type; +using pk_i4x4_t = typename vector_type::type; +using pk_i4x8_t = typename vector_type::type; + template struct NumericLimits { diff --git a/include/ck/utility/dynamic_buffer.hpp b/include/ck/utility/dynamic_buffer.hpp index 0dcc514a2..639aa1efe 100644 --- a/include/ck/utility/dynamic_buffer.hpp +++ b/include/ck/utility/dynamic_buffer.hpp @@ -54,7 +54,8 @@ struct DynamicBuffer template >::type, - typename scalar_type>::type>::value, + typename scalar_type>::type>::value || + !is_native_type(), bool>::type = false> __host__ __device__ constexpr auto Get(index_t i, bool is_valid_element) const { @@ -195,7 +196,8 @@ struct DynamicBuffer template >::type, - typename scalar_type>::type>::value, + typename scalar_type>::type>::value || + !is_native_type(), bool>::type = false> __host__ __device__ void Set(index_t i, bool is_valid_element, const X& x) { diff --git a/include/ck/utility/static_buffer.hpp b/include/ck/utility/static_buffer.hpp index 835f56573..602e76abd 100644 --- a/include/ck/utility/static_buffer.hpp +++ 
b/include/ck/utility/static_buffer.hpp @@ -116,7 +116,8 @@ struct StaticBufferTupleOfVector // i is offset of S, not X. i should be aligned to X template ::value, bool>::type = false> + typename enable_if::value || !is_native_type(), + bool>::type = false> __host__ __device__ constexpr auto GetAsType(Number i) const { constexpr auto s_per_x = Number>::vector_size>{}; @@ -134,7 +135,8 @@ struct StaticBufferTupleOfVector // i is offset of S, not X. i should be aligned to X template ::value, bool>::type = false> + typename enable_if::value || !is_native_type(), + bool>::type = false> __host__ __device__ constexpr void SetAsType(Number i, X x) { constexpr auto s_per_x = Number>::vector_size>{}; diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp index 1ae11fe9d..8dd5d086b 100644 --- a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp @@ -74,6 +74,17 @@ struct ReferenceGemm : public device::BaseOperator { ck::tensor_operation::element_wise::PassThrough{}(v_a, arg.a_m_k_(m, k)); } + else if constexpr(is_same_v) + { + uint8_t i4x2 = arg.a_m_k_(m, k).data; + int8_t i4 = 0; + if(k % 2 == 1) + i4 = (i4x2 >> 0) & 0xf; + else + i4 = (i4x2 >> 4) & 0xf; + i4 = i4 - 8; + v_a = type_convert(i4); + } else { arg.a_element_op_(v_a, arg.a_m_k_(m, k)); @@ -84,6 +95,17 @@ struct ReferenceGemm : public device::BaseOperator { ck::tensor_operation::element_wise::PassThrough{}(v_b, arg.b_k_n_(k, n)); } + else if constexpr(is_same_v) + { + uint8_t i4x2 = arg.b_k_n_(k, n).data; + int8_t i4 = 0; + if(k % 2 == 1) + i4 = (i4x2 >> 0) & 0xf; + else + i4 = (i4x2 >> 4) & 0xf; + i4 = i4 - 8; + v_b = type_convert(i4); + } else { arg.b_element_op_(v_b, arg.b_k_n_(k, n)); diff --git a/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp 
b/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp index 4358953a5..4a44c425a 100644 --- a/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp +++ b/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp @@ -22,6 +22,7 @@ using I8 = int8_t; using I32 = int32_t; using F8 = ck::f8_t; using BF8 = ck::bf8_t; +using I4 = ck::pk_i4_t; using Empty_Tuple = ck::Tuple<>; diff --git a/library/include/ck/library/tensor_operation_instance/gpu/gemm_universal.hpp b/library/include/ck/library/tensor_operation_instance/gpu/gemm_universal.hpp index 52046a107..4218c51ca 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/gemm_universal.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm_universal.hpp @@ -166,11 +166,22 @@ void add_device_gemm_xdl_universal_f16_f8_f16_mk_nk_mn_mem_v1_kpadding_instances std::vector>>& instances); + void add_device_gemm_xdl_universal_f16_f8_f16_mk_nk_mn_mem_v2_default_instances( std::vector>>& instances); +void add_device_gemm_xdl_universal_f16_i4_f16_mk_nk_mn_mem_v2_default_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_bf16_i4_bf16_mk_nk_mn_mem_v2_default_instances( + std::vector>>& + instances); + void add_device_gemm_xdl_universal_f16_f8_f16_mk_nk_mn_mem_v2_kpadding_instances( std::vector>>& @@ -810,6 +821,28 @@ struct DeviceOperationInstanceFactory< } } #endif + + if constexpr(is_same_v && is_same_v && + is_same_v) + { + if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_gemm_xdl_universal_f16_i4_f16_mk_nk_mn_mem_v2_default_instances(op_ptrs); + } + } + + if constexpr(is_same_v && is_same_v && + is_same_v) + { + if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_gemm_xdl_universal_bf16_i4_bf16_mk_nk_mn_mem_v2_default_instances( + op_ptrs); + } + } + return op_ptrs; } }; diff --git 
a/library/src/tensor_operation_instance/gpu/gemm_universal/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/gemm_universal/CMakeLists.txt index 188c9f68e..ade65eacf 100644 --- a/library/src/tensor_operation_instance/gpu/gemm_universal/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/gemm_universal/CMakeLists.txt @@ -97,6 +97,9 @@ list(APPEND GEMM_UNIVERSAL_INSTANCES device_gemm_xdl_universal_f16_f8_f16/device_gemm_xdl_universal_f16_f8_f16_mk_nk_mn_mem_v2_default_instance.cpp device_gemm_xdl_universal_f16_f8_f16/device_gemm_xdl_universal_f16_f8_f16_mk_nk_mn_mem_v2_kpadding_instance.cpp + device_gemm_xdl_universal_f16_i4_f16/device_gemm_xdl_universal_f16_i4_f16_mk_nk_mn_mem_v2_default_instance.cpp + device_gemm_xdl_universal_bf16_i4_bf16/device_gemm_xdl_universal_bf16_i4_bf16_mk_nk_mn_mem_v2_default_instance.cpp + device_gemm_xdl_universal_f8_f16_f16/device_gemm_xdl_universal_f8_f16_f16_mk_kn_mn_comp_default_instance.cpp device_gemm_xdl_universal_f8_f16_f16/device_gemm_xdl_universal_f8_f16_f16_mk_kn_mn_comp_kpadding_instance.cpp device_gemm_xdl_universal_f8_f16_f16/device_gemm_xdl_universal_f8_f16_f16_mk_kn_mn_comp_mnpadding_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_i4_bf16/device_gemm_xdl_universal_bf16_i4_bf16_mk_nk_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_i4_bf16/device_gemm_xdl_universal_bf16_i4_bf16_mk_nk_mn.hpp new file mode 100644 index 000000000..8d109d134 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_i4_bf16/device_gemm_xdl_universal_bf16_i4_bf16_mk_nk_mn.hpp @@ -0,0 +1,87 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using I4 = pk_i4_t; +using BF16 = bhalf_t; +using F32 = float; + +using Row = tensor_layout::gemm::RowMajor; +using Col = tensor_layout::gemm::ColumnMajor; + +template +using S = Sequence; + +using PassThrough = element_wise::PassThrough; + +static constexpr auto GemmDefault = GemmSpecialization::Default; +static constexpr auto GemmKPadding = GemmSpecialization::KPadding; +static constexpr auto GemmMNPadding = GemmSpecialization::MNPadding; +static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding; + +static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave; +static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave; + +#if 0 +template +using device_gemm_xdl_universal_bf16_i4_bf16_mk_nk_mn_comp_instances = std::tuple< + // clang-format off + //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle| A| B| C| GEMM| Block| MPer| NPer| KPer| AK1| BK1|MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| Block-wiseGemm| Block-wiseGemm| + //#########################| | | | Type| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise|Specialization| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| 
SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| Pipeline| Pipeline| + //#########################| | | | | | | | | Operation| Operation| Operation| | | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| Scheduler| Version| + //#########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + + // Compute friendly + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, BF16, I4, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 8, 16, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>, + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, BF16, I4, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 224, 256, 64, 8, 16, 16, 16, 7, 8, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 2, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, BF16, I4, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 8, 16, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, BF16, I4, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 8, 16, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave,
BlockGemmPipelineVersion::v5>, + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, BF16, I4, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 8, 16, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1> + // clang-format on + >; +#endif + +template +using device_gemm_xdl_universal_bf16_i4_bf16_mk_nk_mn_mem_instances = + std::tuple< + // clang-format off + //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle| A| B| C| GEMM| Block| MPer| NPer| KPer| AK1| BK1|MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| Block-wiseGemm| Block-wiseGemm| ACompType| BCompType| APermute| BPermute| + //#########################| | | | Type| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise|Specialization| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| Pipeline| Pipeline| | | | | + //#########################| | | | | | | | | Operation| Operation| Operation| | | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| Scheduler| Version| | | | | + //#########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + +
DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, BF16, I4, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 16, 128, 8, 16, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v1, bhalf_t, bhalf_t, false, true>, + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, BF16, I4, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 128, 8, 16, 16, 16, 1, 1, S<16, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1, bhalf_t, bhalf_t, false, true>, + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, BF16, I4, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 128, 8, 16, 16, 16, 1, 1, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1, bhalf_t, bhalf_t, false, true>, + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, BF16, I4, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 32, 128, 8, 32, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 32, 32, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1, bhalf_t, bhalf_t, false, true>, + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, BF16, I4, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 128, 32, 128, 8, 32, 32, 32, 2, 1, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 32, 32, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2, bhalf_t, bhalf_t, false, true>, + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, BF16, I4, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 128, 16, 128, 8, 16, 16, 16, 4, 1, S<16, 8, 1>, S<1, 0, 2>, 
S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2, bhalf_t, bhalf_t, false, true>, + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, BF16, I4, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 64, 32, 128, 8, 32, 32, 32, 1, 1, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 32, 32, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2, bhalf_t, bhalf_t, false, true>, + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, BF16, I4, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 64, 16, 128, 8, 16, 16, 16, 2, 1, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2, bhalf_t, bhalf_t, false, true>, + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, BF16, I4, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 16, 128, 8, 16, 16, 16, 1, 1, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2, bhalf_t, bhalf_t, false, true>, + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, BF16, I4, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 128, 8, 16, 16, 16, 1, 1, S<16, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2, bhalf_t, bhalf_t, false, true>, + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, BF16, I4, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 128, 8, 16, 16, 16, 1, 1, S<16, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2, bhalf_t, bhalf_t, false, true>, + 
DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, BF16, I4, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 32, 128, 8, 32, 16, 16, 1, 1, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 32, 32, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2, bhalf_t, bhalf_t, false, true>, + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, BF16, I4, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 64, 128, 8, 32, 16, 16, 1, 2, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 32, 32, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2, bhalf_t, bhalf_t, false, true>, + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, BF16, I4, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 64, 128, 8, 32, 32, 32, 1, 1, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 32, 32, 0, 1, 1, S<1, 16, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v2, bhalf_t, bhalf_t, false, true>, + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, BF16, I4, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 128, 128, 8, 32, 16, 16, 1, 4, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 32, 32, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2, bhalf_t, bhalf_t, false, true>, + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, BF16, I4, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 128, 128, 8, 32, 32, 32, 1, 2, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 32, 32, 0, 1, 1, S<1, 16, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v2, bhalf_t, bhalf_t, false, true>, + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, BF16, I4, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 16, 256, 128, 8, 32, 16, 16, 1, 4, S<16, 8, 1>, S<1, 
0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 32, 32, 0, 1, 1, S<1, 16, 1, 16>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2, bhalf_t, bhalf_t, false, true>, + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, BF16, I4, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 32, 256, 128, 8, 32, 32, 32, 1, 2, S<16, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 32, 32, 0, 1, 1, S<1, 16, 1, 16>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v2, bhalf_t, bhalf_t, false, true> + // clang-format on + >; +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_i4_bf16/device_gemm_xdl_universal_bf16_i4_bf16_mk_nk_mn_mem_v2_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_i4_bf16/device_gemm_xdl_universal_bf16_i4_bf16_mk_nk_mn_mem_v2_default_instance.cpp new file mode 100644 index 000000000..b060a92eb --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_i4_bf16/device_gemm_xdl_universal_bf16_i4_bf16_mk_nk_mn_mem_v2_default_instance.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_bf16_i4_bf16_mk_nk_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_bf16_i4_bf16_mk_nk_mn_mem_v2_default_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_bf16_i4_bf16_mk_nk_mn_mem_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_i4_f16/device_gemm_xdl_universal_f16_i4_f16_mk_nk_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_i4_f16/device_gemm_xdl_universal_f16_i4_f16_mk_nk_mn.hpp new file mode 100644 index 000000000..680788d66 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_i4_f16/device_gemm_xdl_universal_f16_i4_f16_mk_nk_mn.hpp @@ -0,0 +1,86 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using I4 = pk_i4_t; +using F16 = half_t; +using F32 = float; + +using Row = tensor_layout::gemm::RowMajor; +using Col = tensor_layout::gemm::ColumnMajor; + +template +using S = Sequence; + +using PassThrough = element_wise::PassThrough; + +static constexpr auto GemmDefault = GemmSpecialization::Default; +static constexpr auto GemmKPadding = GemmSpecialization::KPadding; +static constexpr auto GemmMNPadding = GemmSpecialization::MNPadding; +static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding; + +static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave; +static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave; + +#if 0 +template +using device_gemm_xdl_universal_f16_i4_f16_mk_nk_mn_comp_instances = std::tuple< + // clang-format off + //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle| A| B| C| GEMM| Block| MPer| NPer| KPer| AK1| BK1|MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| Block-wiseGemm| Block-wiseGemm| + //#########################| | | | Type| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise|Specialization| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| 
SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| Pipeline| Pipeline| + //#########################| | | | | | | | | Operation| Operation| Operation| | | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| Scheduler| Version| + //#########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + + // Compute friendly + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, I4, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 8, 16, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>, + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, I4, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 224, 256, 64, 8, 16, 16, 16, 7, 8, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 2, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, I4, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 8, 16, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, I4, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 8, 16, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave,
BlockGemmPipelineVersion::v5>, + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, I4, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 8, 16, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1> + // clang-format on + >; +#endif + +template +using device_gemm_xdl_universal_f16_i4_f16_mk_nk_mn_mem_instances = std::tuple< + // clang-format off + //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle| A| B| C| GEMM| Block| MPer| NPer| KPer| AK1| BK1|MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| Block-wiseGemm| Block-wiseGemm| ACompType| BCompType| APermute| BPermute| + //#########################| | | | Type| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise|Specialization| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| Pipeline| Pipeline| | | | | + //#########################| | | | | | | | | Operation| Operation| Operation| | | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| Scheduler| Version| | | | | + //#########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + +
DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, I4, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 16, 128, 8, 16, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v1, half_t, half_t, false, true>, + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, I4, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 128, 8, 16, 16, 16, 1, 1, S<16, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1, half_t, half_t, false, true>, + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, I4, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 128, 8, 16, 16, 16, 1, 1, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1, half_t, half_t, false, true>, + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, I4, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 32, 128, 8, 32, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 32, 32, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1, half_t, half_t, false, true>, + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, I4, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 128, 32, 128, 8, 32, 32, 32, 2, 1, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 32, 32, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2, half_t, half_t, false, true>, + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, I4, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 128, 16, 128, 8, 16, 16, 16, 4, 1, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 
16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2, half_t, half_t, false, true>, + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, I4, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 64, 32, 128, 8, 32, 32, 32, 1, 1, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 32, 32, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2, half_t, half_t, false, true>, + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, I4, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 64, 16, 128, 8, 16, 16, 16, 2, 1, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2, half_t, half_t, false, true>, + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, I4, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 16, 128, 8, 16, 16, 16, 1, 1, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2, half_t, half_t, false, true>, + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, I4, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 128, 8, 16, 16, 16, 1, 1, S<16, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2, half_t, half_t, false, true>, + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, I4, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 128, 8, 16, 16, 16, 1, 1, S<16, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2, half_t, half_t, false, true>, + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, I4, F16, 
F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 32, 128, 8, 32, 16, 16, 1, 1, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 32, 32, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2, half_t, half_t, false, true>, + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, I4, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 64, 128, 8, 32, 16, 16, 1, 2, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 32, 32, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2, half_t, half_t, false, true>, + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, I4, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 64, 128, 8, 32, 32, 32, 1, 1, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 32, 32, 0, 1, 1, S<1, 16, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v2, half_t, half_t, false, true>, + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, I4, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 128, 128, 8, 32, 16, 16, 1, 4, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 32, 32, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2, half_t, half_t, false, true>, + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, I4, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 128, 128, 8, 32, 32, 32, 1, 2, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 32, 32, 0, 1, 1, S<1, 16, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v2, half_t, half_t, false, true>, + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, I4, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 16, 256, 128, 8, 32, 16, 16, 1, 4, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 32, 32, 0, 1, 
1, S<1, 16, 1, 16>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2, half_t, half_t, false, true>, + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, I4, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 32, 256, 128, 8, 32, 32, 32, 1, 2, S<16, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 32, 32, 0, 1, 1, S<1, 16, 1, 16>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v2, half_t, half_t, false, true> + // clang-format on + >; +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_i4_f16/device_gemm_xdl_universal_f16_i4_f16_mk_nk_mn_mem_v2_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_i4_f16/device_gemm_xdl_universal_f16_i4_f16_mk_nk_mn_mem_v2_default_instance.cpp new file mode 100644 index 000000000..a884a3ec5 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_i4_f16/device_gemm_xdl_universal_f16_i4_f16_mk_nk_mn_mem_v2_default_instance.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_f16_i4_f16_mk_nk_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_f16_i4_f16_mk_nk_mn_mem_v2_default_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_f16_i4_f16_mk_nk_mn_mem_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/profiler/include/profiler/profile_gemm_universal_impl.hpp b/profiler/include/profiler/profile_gemm_universal_impl.hpp index 30f0da212..ed7e86ded 100644 --- a/profiler/include/profiler/profile_gemm_universal_impl.hpp +++ b/profiler/include/profiler/profile_gemm_universal_impl.hpp @@ -65,11 +65,13 @@ bool profile_gemm_universal_impl(int do_verification, Tensor a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{})); Tensor b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{})); + Tensor b_k_n_permute(f_host_tensor_descriptor(K, N, StrideB, BLayout{})); Tensor c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{})); Tensor c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{})); - int total_gemm_needed = a_m_k.GetElementSpaceSizeInBytes() + b_k_n.GetElementSpaceSizeInBytes(); - int rotating_count = std::max( + std::size_t total_gemm_needed = + a_m_k.GetElementSpaceSizeInBytes() + b_k_n.GetElementSpaceSizeInBytes(); + int rotating_count = std::max( 1, std::min(n_iter, static_cast(std::ceil(static_cast(rotating) / total_gemm_needed)))); @@ -86,9 +88,13 @@ bool profile_gemm_universal_impl(int do_verification, a_m_k.GenerateTensorValue(GeneratorTensor_2{-1, 2}); b_k_n.GenerateTensorValue(GeneratorTensor_2{-1, 2}); break; - default: + case 2: a_m_k.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); b_k_n.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + break; + default: + a_m_k.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + 
b_k_n.GenerateTensorValue(GeneratorTensor_2{-2, 2}); } using AElementOp = ck::tensor_operation::element_wise::PassThrough; @@ -100,11 +106,10 @@ bool profile_gemm_universal_impl(int do_verification, const auto c_element_op = CElementOp{}; DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize()); - DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize()); + DeviceMem b_device_buf(sizeof(BDataType) * b_k_n_permute.mDesc.GetElementSpaceSize()); DeviceMem c_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize()); a_device_buf.ToDevice(a_m_k.mData.data()); - b_device_buf.ToDevice(b_k_n.mData.data()); using DeviceOp = ck::tensor_operation::device::DeviceGemmV2GetKPerBlock(); + + if(op_ptr->GetPermuteB()) + { + int K1 = KPerBlock; + int K0 = K / KPerBlock; + + // int K0, N, K1 + for(int j = 0; j < K0; j++) + { + for(int i = 0; i < N; i++) + { + for(int jj = 0; jj < K1; jj++) + { + b_k_n_permute(j * N * K1 + i * K1 + jj) = b_k_n(i * K + (j * K1 + jj)); + } + } + } + + if(is_same_v && is_same_v) + { + // vector pk_i4x4 permute + for(int i = 0; i < N; i++) + { + for(int j = 0; j < K; j += 8) + { + int input[8]; + + for(int k = 0; k < 4; k++) + { + int i4x2 = b_k_n_permute(j + k * 2, i); + input[k * 2 + 0] = (i4x2 >> 4) & 0xf; + input[k * 2 + 1] = (i4x2 >> 0) & 0xf; + } + + // permute 01234567->20643175 + { + int hi = input[2]; + int lo = input[0]; + int i4x2 = (hi << 4) | lo; + + b_k_n_permute(j + 0, i) = i4x2; + } + + { + int hi = input[6]; + int lo = input[4]; + int i4x2 = (hi << 4) | lo; + + b_k_n_permute(j + 2, i) = i4x2; + } + + { + int hi = input[3]; + int lo = input[1]; + int i4x2 = (hi << 4) | lo; + + b_k_n_permute(j + 4, i) = i4x2; + } + + { + int hi = input[7]; + int lo = input[5]; + int i4x2 = (hi << 4) | lo; + + b_k_n_permute(j + 6, i) = i4x2; + } + } + } + } + } + else + { + b_k_n_permute = b_k_n; + } + + b_device_buf.ToDevice(b_k_n_permute.mData.data()); + std::vector kbatch_list = {1, 2, 4, 8, 
16, 19, 32, 38}; if(KBatch > 0) @@ -240,7 +323,15 @@ bool profile_gemm_universal_impl(int do_verification, std::size_t flop = std::size_t(2) * M * N * K; - std::size_t num_btype = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + + static constexpr index_t BPackedSize = []() { + if constexpr(is_same_v, pk_i4_t>) + return 2; + else + return 1; + }(); + + std::size_t num_btype = sizeof(ADataType) * M * K + + sizeof(BDataType) * K * N / BPackedSize + sizeof(CDataType) * M * N; float tflops = static_cast(flop) / 1.E9 / ave_time; diff --git a/profiler/src/CMakeLists.txt b/profiler/src/CMakeLists.txt index 35e91f817..a0978eb6b 100644 --- a/profiler/src/CMakeLists.txt +++ b/profiler/src/CMakeLists.txt @@ -177,5 +177,4 @@ if(DL_KERNELS) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv2d_bwd_weight_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv3d_bwd_weight_instance) endif() - rocm_install(TARGETS ${PROFILER_EXECUTABLE} COMPONENT profiler) diff --git a/profiler/src/profile_gemm_universal.cpp b/profiler/src/profile_gemm_universal.cpp index 990cbd292..a22d983da 100644 --- a/profiler/src/profile_gemm_universal.cpp +++ b/profiler/src/profile_gemm_universal.cpp @@ -1,10 +1,10 @@ // SPDX-License-Identifier: MIT // Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved. 
+#include +#include #include #include -#include -#include #include "profiler/profile_gemm_universal_impl.hpp" #include "profiler_operation_registry.hpp" @@ -27,6 +27,8 @@ enum struct GemmDataType F16_F8_F16, // 5 F16_F16_F16_F8, // 6 F8_F8_BF16, // 7 + F16_I4_F16, // 8 + BF16_I4_BF16, // 9 }; #define OP_NAME "gemm_universal" @@ -39,7 +41,7 @@ int profile_gemm_universal(int argc, char* argv[]) printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n"); printf("arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8; 4: f8@f16; 5: f16@f8; 6: " "f16->f8; 7: f8->bf16, " - "comp f8)\n"); + "comp f8; 8: f16@i4; 9: bf16@i4)\n"); printf("arg3: matrix layout (0: A[m, k] * B[k, n] = C[m, n];\n"); printf(" 1: A[m, k] * B[n, k] = C[m, n];\n"); printf(" 2: A[k, m] * B[k, n] = C[m, n];\n"); @@ -103,6 +105,7 @@ int profile_gemm_universal(int argc, char* argv[]) using BF16 = ck::bhalf_t; #if defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || defined(CK_USE_GFX94) using F8 = ck::f8_t; + using I4 = ck::pk_i4_t; #endif using Row = ck::tensor_layout::gemm::RowMajor; @@ -207,6 +210,14 @@ int profile_gemm_universal(int argc, char* argv[]) { return profile(F8{}, F8{}, F8{}, F32{}, BF16{}, Row{}, Col{}, Row{}); } + else if(data_type == GemmDataType::F16_I4_F16 && layout == GemmMatrixLayout::MK_NK_MN) + { + return profile(F16{}, I4{}, F16{}, F32{}, F16{}, Row{}, Col{}, Row{}); + } + else if(data_type == GemmDataType::BF16_I4_BF16 && layout == GemmMatrixLayout::MK_NK_MN) + { + return profile(BF16{}, I4{}, BF16{}, F32{}, BF16{}, Row{}, Col{}, Row{}); + } #endif else { diff --git a/script/cmake-ck-dev.sh b/script/cmake-ck-dev.sh index 4097ca98f..f7177a7ab 100755 --- a/script/cmake-ck-dev.sh +++ b/script/cmake-ck-dev.sh @@ -17,7 +17,7 @@ fi cmake \ -D CMAKE_PREFIX_PATH=/opt/rocm \ -D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \ --D CMAKE_CXX_FLAGS="-Xclang -mllvm -Xclang -enable-post-misched=0 -std=c++17 -O3 -ftemplate-backtrace-limit=0 -fPIE -Wno-gnu-line-marker" \ +-D CMAKE_CXX_FLAGS="-Xclang -mllvm
-Xclang -enable-post-misched=0 -std=c++17 -O3 -ftemplate-backtrace-limit=0 -fPIE -Wno-gnu-line-marker" \ -D CMAKE_BUILD_TYPE=Release \ -D BUILD_DEV=ON \ -D GPU_TARGETS=$GPU_TARGETS \ -- GitLab From 9e95d54cd2160dffc07c1197951a9ab1ca6c35f2 Mon Sep 17 00:00:00 2001 From: Muhammed Emin Ozturk Date: Thu, 2 Jan 2025 10:30:04 -0800 Subject: [PATCH 146/153] BF16 GEMM Stream-K (#1541) * initial * Cmake file * successfull compilation but validation failed * Cmake * update * gpu validation * gemm universal * gemm universal sk update * sk bf16 universal instance * gemm_universal_streamk.hpp * only build for gfx94 * Cmakelist * profiler update, bf16 sk only works at gfx42 * clang * clang * clang all * no need flags * cmake script * delete comment * gemm universal sk fix * clang * profiler fix * clang * update * update * delete comment * code formatting * cmake * fix instance * clang * argument supported * argument supported and clang * update * fix * removing unnecessary comments * clang formatting * Update library/src/tensor_operation_instance/gpu/CMakeLists.txt Co-authored-by: afagaj * CopyRight Comment 2025 * clang reformatting * copy right 2025 --------- Co-authored-by: Emin Ozturk Co-authored-by: root Co-authored-by: Muhammed Emin Ozturk Co-authored-by: root Co-authored-by: Muhammed Emin Ozturk Co-authored-by: Muhammed Emin Ozturk Co-authored-by: Muhammed Emin Ozturk Co-authored-by: Emin Ozturk Co-authored-by: Muhammed Emin Ozturk Co-authored-by: afagaj --- example/01_gemm/CMakeLists.txt | 3 + example/01_gemm/gemm_xdl_bf16.cpp | 0 example/01_gemm/gemm_xdl_bf16_streamk_v3.cpp | 59 +++ example/01_gemm/gemm_xdl_streamk.cpp | 1 - .../01_gemm/run_gemm_example_streamk_v2.inc | 0 .../device_gemm_xdl_cshuffle_streamk_v3.hpp | 6 +- .../gpu/gemm_universal_streamk.hpp | 500 ++++++++++++++++++ .../gpu/CMakeLists.txt | 8 +- .../gpu/gemm_universal_streamk/CMakeLists.txt | 39 +- ...versal_streamk_bf16_bf16_bf16_km_kn_mn.hpp | 91 ++++ ...16_bf16_km_kn_mn_comp_default_instance.cpp | 30 ++ 
...6_bf16_km_kn_mn_comp_kpadding_instance.cpp | 30 ++ ...bf16_km_kn_mn_comp_mnkpadding_instance.cpp | 30 ++ ..._bf16_km_kn_mn_comp_mnpadding_instance.cpp | 30 ++ ..._bf16_km_kn_mn_mem_v1_default_instance.cpp | 31 ++ ...bf16_km_kn_mn_mem_v1_kpadding_instance.cpp | 31 ++ ...16_km_kn_mn_mem_v1_mnkpadding_instance.cpp | 31 ++ ..._bf16_km_kn_mn_mem_v2_default_instance.cpp | 31 ++ ...bf16_km_kn_mn_mem_v2_kpadding_instance.cpp | 31 ++ ...16_km_kn_mn_mem_v2_mnkpadding_instance.cpp | 31 ++ ...versal_streamk_bf16_bf16_bf16_km_nk_mn.hpp | 97 ++++ ...16_bf16_km_nk_mn_comp_default_instance.cpp | 30 ++ ...6_bf16_km_nk_mn_comp_kpadding_instance.cpp | 30 ++ ..._bf16_km_nk_mn_comp_mkpadding_instance.cpp | 30 ++ ...6_bf16_km_nk_mn_comp_mpadding_instance.cpp | 30 ++ ..._bf16_km_nk_mn_mem_v1_default_instance.cpp | 31 ++ ...bf16_km_nk_mn_mem_v1_kpadding_instance.cpp | 31 ++ ...f16_km_nk_mn_mem_v1_mkpadding_instance.cpp | 31 ++ ..._bf16_km_nk_mn_mem_v2_default_instance.cpp | 31 ++ ...bf16_km_nk_mn_mem_v2_kpadding_instance.cpp | 31 ++ ...f16_km_nk_mn_mem_v2_mkpadding_instance.cpp | 31 ++ ...versal_streamk_bf16_bf16_bf16_mk_kn_mn.hpp | 89 ++++ ...16_bf16_mk_kn_mn_comp_default_instance.cpp | 30 ++ ...6_bf16_mk_kn_mn_comp_kpadding_instance.cpp | 30 ++ ...bf16_mk_kn_mn_comp_mnkpadding_instance.cpp | 30 ++ ..._bf16_mk_kn_mn_comp_mnpadding_instance.cpp | 30 ++ ..._bf16_mk_kn_mn_mem_v1_default_instance.cpp | 31 ++ ...bf16_mk_kn_mn_mem_v1_kpadding_instance.cpp | 31 ++ ...16_mk_kn_mn_mem_v1_mnkpadding_instance.cpp | 31 ++ ..._bf16_mk_kn_mn_mem_v2_default_instance.cpp | 31 ++ ...bf16_mk_kn_mn_mem_v2_kpadding_instance.cpp | 31 ++ ...16_mk_kn_mn_mem_v2_mnkpadding_instance.cpp | 31 ++ ...versal_streamk_bf16_bf16_bf16_mk_nk_mn.hpp | 93 ++++ ...16_bf16_mk_nk_mn_comp_default_instance.cpp | 30 ++ ...6_bf16_mk_nk_mn_comp_kpadding_instance.cpp | 30 ++ ..._bf16_mk_nk_mn_mem_v1_default_instance.cpp | 31 ++ ...bf16_mk_nk_mn_mem_v1_kpadding_instance.cpp | 31 ++ ..._bf16_mk_nk_mn_mem_v2_default_instance.cpp | 
31 ++ ...bf16_mk_nk_mn_mem_v2_kpadding_instance.cpp | 31 ++ .../src/profile_gemm_universal_streamk.cpp | 21 +- script/cmake-ck-dev.sh | 2 +- 51 files changed, 2101 insertions(+), 10 deletions(-) mode change 100644 => 100755 example/01_gemm/CMakeLists.txt mode change 100644 => 100755 example/01_gemm/gemm_xdl_bf16.cpp create mode 100755 example/01_gemm/gemm_xdl_bf16_streamk_v3.cpp mode change 100644 => 100755 example/01_gemm/gemm_xdl_streamk.cpp mode change 100755 => 100644 example/01_gemm/run_gemm_example_streamk_v2.inc mode change 100755 => 100644 include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_streamk_v3.hpp mode change 100644 => 100755 library/src/tensor_operation_instance/gpu/CMakeLists.txt create mode 100755 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn.hpp create mode 100755 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_comp_default_instance.cpp create mode 100755 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_comp_kpadding_instance.cpp create mode 100755 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_comp_mnkpadding_instance.cpp create mode 100755 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_comp_mnpadding_instance.cpp create mode 100755 
library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v1_default_instance.cpp create mode 100755 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v1_kpadding_instance.cpp create mode 100755 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v1_mnkpadding_instance.cpp create mode 100755 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v2_default_instance.cpp create mode 100755 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v2_kpadding_instance.cpp create mode 100755 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v2_mnkpadding_instance.cpp create mode 100755 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn.hpp create mode 100755 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_comp_default_instance.cpp create mode 100755 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_comp_kpadding_instance.cpp create mode 100755 
library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_comp_mkpadding_instance.cpp create mode 100755 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_comp_mpadding_instance.cpp create mode 100755 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v1_default_instance.cpp create mode 100755 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v1_kpadding_instance.cpp create mode 100755 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v1_mkpadding_instance.cpp create mode 100755 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v2_default_instance.cpp create mode 100755 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v2_kpadding_instance.cpp create mode 100755 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v2_mkpadding_instance.cpp create mode 100755 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn.hpp create mode 100755 
library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_default_instance.cpp create mode 100755 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_kpadding_instance.cpp create mode 100755 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_mnkpadding_instance.cpp create mode 100755 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_mnpadding_instance.cpp create mode 100755 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v1_default_instance.cpp create mode 100755 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v1_kpadding_instance.cpp create mode 100755 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v1_mnkpadding_instance.cpp create mode 100755 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v2_default_instance.cpp create mode 100755 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v2_kpadding_instance.cpp create mode 100755 
library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v2_mnkpadding_instance.cpp create mode 100755 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn.hpp create mode 100755 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_comp_default_instance.cpp create mode 100755 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_comp_kpadding_instance.cpp create mode 100755 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v1_default_instance.cpp create mode 100755 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v1_kpadding_instance.cpp create mode 100755 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v2_default_instance.cpp create mode 100755 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v2_kpadding_instance.cpp mode change 100755 => 100644 profiler/src/profile_gemm_universal_streamk.cpp diff --git a/example/01_gemm/CMakeLists.txt b/example/01_gemm/CMakeLists.txt old mode 100644 new mode 100755 index df7be0466..354e443b3 --- a/example/01_gemm/CMakeLists.txt +++ b/example/01_gemm/CMakeLists.txt 
@@ -35,6 +35,9 @@ add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp16_fp8_v3) add_example_executable(example_gemm_xdl_bf16_v3 gemm_xdl_bf16_v3.cpp) add_example_dependencies(example_gemm_xdl example_gemm_xdl_bf16_v3) +add_example_executable(example_gemm_xdl_bf16_streamk_v3 gemm_xdl_bf16_streamk_v3.cpp) +add_example_dependencies(example_gemm_xdl example_gemm_xdl_bf16_streamk_v3) + add_example_executable(example_gemm_xdl_wavelet_fp16 gemm_xdl_wavelet_fp16.cpp) add_example_dependencies(example_gemm_xdl example_gemm_xdl_wavelet_fp16) diff --git a/example/01_gemm/gemm_xdl_bf16.cpp b/example/01_gemm/gemm_xdl_bf16.cpp old mode 100644 new mode 100755 diff --git a/example/01_gemm/gemm_xdl_bf16_streamk_v3.cpp b/example/01_gemm/gemm_xdl_bf16_streamk_v3.cpp new file mode 100755 index 000000000..5b56a4348 --- /dev/null +++ b/example/01_gemm/gemm_xdl_bf16_streamk_v3.cpp @@ -0,0 +1,59 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved + +#include "common.hpp" + +#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_streamk_v3.hpp" + +using ADataType = ck::bhalf_t; +using BDataType = ck::bhalf_t; +using CDataType = ck::bhalf_t; +using AccDataType = float; +using CShuffleDataType = ck::bhalf_t; + +using ALayout = Row; +using BLayout = Col; +using CLayout = Row; + +using AElementOp = PassThrough; +using BElementOp = PassThrough; +using CElementOp = PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +// clang-format off +using DeviceGemmV2_Streamk_Instance = + ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle_Streamk_V3< + ALayout, BLayout, CLayout, + ADataType, BDataType, CDataType, AccDataType, CShuffleDataType, + PassThrough, PassThrough, PassThrough, GemmDefault, + 256, + 128, 128, + 64, 8, 8, + 16, 16, + 4, 4, + S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, + 2, 8, 8, 0, + S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, + 2, 8, 8, 0, + 1, 2, S<1, 
32, 1, 8>, 8, + ck::BlockGemmPipelineScheduler::Intrawave,ck::BlockGemmPipelineVersion::v3>; +// clang-format on + +using ReferenceGemmInstance = ck::tensor_operation::host:: + ReferenceGemm; + +using ReferenceGemmInstanceGPU = ck::tensor_operation::device::ReferenceGemm; + +#include "run_gemm_example_streamk_v2.inc" + +int main(int argc, char* argv[]) { return !run_gemm_universal_streamk_example(argc, argv); } diff --git a/example/01_gemm/gemm_xdl_streamk.cpp b/example/01_gemm/gemm_xdl_streamk.cpp old mode 100644 new mode 100755 index 5a02457da..dbdf7199e --- a/example/01_gemm/gemm_xdl_streamk.cpp +++ b/example/01_gemm/gemm_xdl_streamk.cpp @@ -15,7 +15,6 @@ using F16 = ck::half_t; using ALayout = Row; using BLayout = Row; -// using BLayout = Col; using CLayout = Row; using AElementOp = PassThrough; diff --git a/example/01_gemm/run_gemm_example_streamk_v2.inc b/example/01_gemm/run_gemm_example_streamk_v2.inc old mode 100755 new mode 100644 diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_streamk_v3.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_streamk_v3.hpp old mode 100755 new mode 100644 index cfd9a1204..26be5cfc6 --- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_streamk_v3.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_streamk_v3.hpp @@ -469,7 +469,11 @@ struct DeviceGemm_Xdl_CShuffle_Streamk_V3 : public DeviceGemm_Streamk_V2 && + arg.Streamk_sel > 0) + { + return false; + } if((arg.K % AK1 != 0 || arg.K % BK1 != 0) && !(GemmSpec == GemmSpecialization::MKPadding || GemmSpec == GemmSpecialization::NKPadding || GemmSpec == GemmSpecialization::MNKPadding || diff --git a/library/include/ck/library/tensor_operation_instance/gpu/gemm_universal_streamk.hpp b/library/include/ck/library/tensor_operation_instance/gpu/gemm_universal_streamk.hpp index f44c02517..18203e7d5 100644 --- 
a/library/include/ck/library/tensor_operation_instance/gpu/gemm_universal_streamk.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm_universal_streamk.hpp @@ -238,6 +238,403 @@ void add_device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v2_mnkpaddin PassThrough>>>& instances); #endif +#ifdef CK_ENABLE_BF16 +void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_default_instances( + std::vector>>& instances); + +void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_kpadding_instances( + std::vector>>& instances); + +void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_mnpadding_instances( + std::vector>>& instances); + +void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_mnkpadding_instances( + std::vector>>& instances); + +void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v1_default_instances( + std::vector>>& instances); + +void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v1_kpadding_instances( + std::vector>>& instances); + +void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v1_mnkpadding_instances( + std::vector>>& instances); + +void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v2_default_instances( + std::vector>>& instances); + +void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v2_kpadding_instances( + std::vector>>& instances); + +void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v2_mnkpadding_instances( + std::vector>>& instances); + +void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_comp_default_instances( + std::vector>>& instances); + +void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_comp_kpadding_instances( + std::vector>>& instances); + +void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v1_default_instances( + std::vector>>& instances); + +void 
add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v1_kpadding_instances( + std::vector>>& instances); + +void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v2_default_instances( + std::vector>>& instances); + +void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v2_kpadding_instances( + std::vector>>& instances); +void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_comp_default_instances( + std::vector>>& instances); + +void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_comp_kpadding_instances( + std::vector>>& instances); + +void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_comp_mnpadding_instances( + std::vector>>& instances); + +void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_comp_mnkpadding_instances( + std::vector>>& instances); + +void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v1_default_instances( + std::vector>>& instances); + +void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v1_kpadding_instances( + std::vector>>& instances); + +void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v1_mnkpadding_instances( + std::vector>>& instances); + +void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v2_default_instances( + std::vector>>& instances); + +void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v2_kpadding_instances( + std::vector>>& instances); + +void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v2_mnkpadding_instances( + std::vector>>& instances); + +void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_comp_default_instances( + std::vector>>& instances); + +void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_comp_kpadding_instances( + std::vector>>& instances); + +void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_comp_mpadding_instances( + std::vector>>& instances); + 
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_comp_mkpadding_instances( + std::vector>>& instances); + +void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v1_default_instances( + std::vector>>& instances); + +void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v1_kpadding_instances( + std::vector>>& instances); + +void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v1_mkpadding_instances( + std::vector>>& instances); + +void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v2_default_instances( + std::vector>>& instances); + +void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v2_kpadding_instances( + std::vector>>& instances); + +void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v2_mkpadding_instances( + std::vector>>& instances); +#endif + #if(defined(CK_ENABLE_FP8)) void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_default_instances( std::vector && is_same_v && + is_same_v) + { + if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_default_instances( + op_ptrs); + add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_kpadding_instances( + op_ptrs); + add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_mnpadding_instances( + op_ptrs); + add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_mnkpadding_instances( + op_ptrs); + + add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v1_default_instances( + op_ptrs); + add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v1_kpadding_instances( + op_ptrs); + add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v1_mnkpadding_instances( + op_ptrs); + + add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v2_default_instances( + op_ptrs); + 
add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v2_kpadding_instances( + op_ptrs); + add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v2_mnkpadding_instances( + op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_comp_default_instances( + op_ptrs); + add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_comp_kpadding_instances( + op_ptrs); + + add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v1_default_instances( + op_ptrs); + add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v1_kpadding_instances( + op_ptrs); + + add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v2_default_instances( + op_ptrs); + add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v2_kpadding_instances( + op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_comp_default_instances( + op_ptrs); + add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_comp_kpadding_instances( + op_ptrs); + add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_comp_mnpadding_instances( + op_ptrs); + add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_comp_mnkpadding_instances( + op_ptrs); + + add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v1_default_instances( + op_ptrs); + add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v1_kpadding_instances( + op_ptrs); + add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v1_mnkpadding_instances( + op_ptrs); + + add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v2_default_instances( + op_ptrs); + add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v2_kpadding_instances( + op_ptrs); + add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v2_mnkpadding_instances( + op_ptrs); + } + else if 
constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_comp_default_instances( + op_ptrs); + add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_comp_kpadding_instances( + op_ptrs); + add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_comp_mpadding_instances( + op_ptrs); + add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_comp_mkpadding_instances( + op_ptrs); + + add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v1_default_instances( + op_ptrs); + add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v1_kpadding_instances( + op_ptrs); + add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v1_mkpadding_instances( + op_ptrs); + + add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v2_default_instances( + op_ptrs); + add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v2_kpadding_instances( + op_ptrs); + add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v2_mkpadding_instances( + op_ptrs); + } + } +#endif + #if(defined(CK_ENABLE_FP8)) if constexpr(is_same_v && is_same_v && is_same_v) diff --git a/library/src/tensor_operation_instance/gpu/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/CMakeLists.txt old mode 100644 new mode 100755 index dd023e6b5..d72281f43 --- a/library/src/tensor_operation_instance/gpu/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/CMakeLists.txt @@ -183,6 +183,10 @@ FOREACH(subdir_path ${dir_list}) message("bf8 instance found!") set(add_inst 1) endif() + if(("${cmake_instance}" MATCHES "_bf16" OR "${cmake_instance}" MATCHES "_b16") AND DTYPES MATCHES "bf16") + message("bf16 instance found!") + set(add_inst 1) + endif() if(("${cmake_instance}" MATCHES "_fp16" OR "${cmake_instance}" MATCHES "_f16") AND DTYPES MATCHES "fp16") message("fp16 instance found!") set(add_inst 1) @@ -195,10 +199,6 @@ FOREACH(subdir_path ${dir_list}) message("fp64 instance 
found!") set(add_inst 1) endif() - if("${cmake_instance}" MATCHES "_bf16" AND DTYPES MATCHES "bf16") - message("bf16 instance found!") - set(add_inst 1) - endif() if(("${cmake_instance}" MATCHES "_int8" OR "${cmake_instance}" MATCHES "_i8") AND DTYPES MATCHES "int8") message("int8 instance found!") set(add_inst 1) diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/CMakeLists.txt index 08746a52d..e1612bcd2 100644 --- a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/CMakeLists.txt @@ -64,6 +64,43 @@ list(APPEND GEMM_UNIVERSAL_STREAMK_INSTANCES device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v1_mnkpadding_instance.cpp device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v2_default_instance.cpp device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v2_kpadding_instance.cpp - device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v2_mnkpadding_instance.cpp) + device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v2_mnkpadding_instance.cpp + + device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_comp_default_instance.cpp + device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_comp_mnkpadding_instance.cpp + device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_comp_kpadding_instance.cpp + device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_comp_mnpadding_instance.cpp + 
device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v1_kpadding_instance.cpp + device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v1_default_instance.cpp + device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v2_default_instance.cpp + device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v1_mnkpadding_instance.cpp + device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v2_kpadding_instance.cpp + device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v2_mnkpadding_instance.cpp + device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_comp_default_instance.cpp + device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_comp_kpadding_instance.cpp + device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_comp_mpadding_instance.cpp + device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_comp_mkpadding_instance.cpp + device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v1_default_instance.cpp + device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v1_kpadding_instance.cpp + device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v1_mkpadding_instance.cpp + device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v2_default_instance.cpp + 
device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_kpadding_instance.cpp + device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v2_kpadding_instance.cpp + device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v2_mkpadding_instance.cpp + device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_default_instance.cpp + device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_mnkpadding_instance.cpp + device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_mnpadding_instance.cpp + device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v1_default_instance.cpp + device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v1_kpadding_instance.cpp + device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v1_mnkpadding_instance.cpp + device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v2_default_instance.cpp + device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v2_kpadding_instance.cpp + device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v2_mnkpadding_instance.cpp + device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_comp_default_instance.cpp + device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_comp_kpadding_instance.cpp + 
device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v1_default_instance.cpp + device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v1_kpadding_instance.cpp + device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v2_default_instance.cpp + device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v2_kpadding_instance.cpp) add_instance_library(device_gemm_universal_streamk_instance ${GEMM_UNIVERSAL_STREAMK_INSTANCES}) diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn.hpp new file mode 100755 index 000000000..b4554fc6a --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn.hpp @@ -0,0 +1,91 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_streamk_v3.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using BF16 = bhalf_t; +using F32 = float; + +using Row = tensor_layout::gemm::RowMajor; +using Col = tensor_layout::gemm::ColumnMajor; + +template +using S = Sequence; + +using PassThrough = element_wise::PassThrough; + +static constexpr auto GemmDefault = GemmSpecialization::Default; +static constexpr auto GemmKPadding = GemmSpecialization::KPadding; +static constexpr auto GemmMPadding = GemmSpecialization::MPadding; +static constexpr auto GemmMNPadding = GemmSpecialization::MNPadding; +static constexpr auto GemmMKPadding = GemmSpecialization::MKPadding; +static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding; + +static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave; +static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave; + +template +using device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_comp_instances = std::tuple< + // clang-format off + //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle| A| B| C| GEMM| Block| MPer| NPer| KPer| AK1| BK1|MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| Block-wiseGemm| Block-wiseGemm| + //#########################| | | | Type| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise|Specialization| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| 
ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| Pipeline| Pipeline| + //#########################| | | | | | | | | Operation| Operation| Operation| | | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| Scheduler| Verision| + //#########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + + DeviceGemm_Xdl_CShuffle_Streamk_V3< Col, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 4, 4, 32, 32, 4, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 16, 1, 16>, 4, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Col, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 4, 4, 32, 32, 2, 2, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 16, 1, 16>, 4, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Col, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 2, 2, 32, 32, 2, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 16, 1, 16>, 4, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Col, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 4, 4, 32, 32, 4, 4, S<8, 32, 1>, S<0, 2, 
1>, S<0, 2, 1>, 1, 8, 4, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 16, 1, 16>, 4, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Col, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 2, 2, 32, 32, 4, 4, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 16, 1, 16>, 4, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Col, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 4, 4, 32, 32, 4, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 16, 1, 16>, 4, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Col, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 2, 2, 32, 32, 4, 4, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 16, 1, 16>, 4, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, + // Can we support this kind of odd case? 
224(256) = 28*8 + (4*8) + //DeviceGemm_Xdl_CShuffle_Streamk_V3< Col, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 224, 256, 64, 8, 8, 16, 16, 7, 8, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, 0, 1, 2, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Col, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 4, 4, 32, 32, 2, 2, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 16, 1, 16>, 4, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Col, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 4, 4, 32, 32, 2, 2, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 16, 1, 16>, 4, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Col, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 4, 4, 32, 32, 2, 2, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 16, 1, 16>, 4, BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1> + // clang-format on + >; + +template +using device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_instances = std::tuple< + // clang-format off + //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle| A| B| C| GEMM| Block| MPer| NPer| KPer| AK1| BK1|MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| 
BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| Block-wiseGemm| Block-wiseGemm| + //#########################| | | | Type| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise|Specialization| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| Pipeline| Pipeline| + //#########################| | | | | | | | | Operation| Operation| Operation| | | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| Scheduler| Verision| + //#########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + + // Latency friendly + DeviceGemm_Xdl_CShuffle_Streamk_V3< Col, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 16, 64, 4, 4, 16, 16, 1, 1, S<16, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, S<16, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Col, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 16, 64, 2, 2, 16, 16, 1, 1, S<32, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, 0, S<32, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Col, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 64, 4, 4, 16, 16, 1, 1, S<16, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, S<16, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, 
S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Col, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 32, 64, 4, 4, 16, 16, 1, 1, S<16, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 0, S<16, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Col, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 32, 64, 2, 2, 16, 16, 1, 1, S<32, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, 0, S<32, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + // Memory friendly + DeviceGemm_Xdl_CShuffle_Streamk_V3< Col, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 16, 64, 8, 2, 16, 16, 4, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, 0, S<32, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, 0, 1, 1, S<1, 32, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Col, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 16, 64, 2, 2, 16, 16, 4, 1, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, 0, S<32, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, 0, 1, 1, S<1, 32, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Col, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 128, 16, 64, 8, 4, 16, 16, 4, 1, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, 0, S<16, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Col, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 64, 16, 64, 4, 4, 16, 16, 2, 1, S<16, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 
8, 4, 0, S<16, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Col, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 16, 64, 4, 4, 16, 16, 1, 1, S<16, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, S<16, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Col, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 64, 4, 4, 16, 16, 1, 1, S<16, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, S<16, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Col, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 32, 64, 4, 4, 16, 16, 1, 1, S<16, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 0, S<16, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Col, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 64, 64, 4, 4, 16, 16, 1, 2, S<16, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 0, S<16, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Col, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 128, 64, 4, 4, 16, 16, 1, 4, S<16, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 0, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Col, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 16, 256, 64, 2, 4, 16, 16, 1, 
4, S<32, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 16, 1, 16>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Col, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 16, 256, 64, 2, 2, 16, 16, 1, 4, S<32, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, 0, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, 0, 1, 1, S<1, 16, 1, 16>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2> + // clang-format on + >; +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_comp_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_comp_default_instance.cpp new file mode 100755 index 000000000..9b21e0bbd --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_comp_default_instance.cpp @@ -0,0 +1,30 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_comp_default_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_comp_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_comp_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_comp_kpadding_instance.cpp new file mode 100755 index 000000000..9b9195a44 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_comp_kpadding_instance.cpp @@ -0,0 +1,30 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_comp_kpadding_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_comp_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_comp_mnkpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_comp_mnkpadding_instance.cpp new file mode 100755 index 000000000..d941d769c --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_comp_mnkpadding_instance.cpp @@ -0,0 +1,30 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_comp_mnkpadding_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_comp_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_comp_mnpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_comp_mnpadding_instance.cpp new file mode 100755 index 000000000..0cc69b589 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_comp_mnpadding_instance.cpp @@ -0,0 +1,30 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_comp_mnpadding_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_comp_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v1_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v1_default_instance.cpp new file mode 100755 index 000000000..e5cf052b3 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v1_default_instance.cpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v1_default_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v1_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v1_kpadding_instance.cpp new file mode 100755 index 000000000..9ce9f8678 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v1_kpadding_instance.cpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v1_kpadding_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v1_mnkpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v1_mnkpadding_instance.cpp new file mode 100755 index 000000000..c95d90793 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v1_mnkpadding_instance.cpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v1_mnkpadding_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v2_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v2_default_instance.cpp new file mode 100755 index 000000000..1071a2ac0 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v2_default_instance.cpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v2_default_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v2_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v2_kpadding_instance.cpp new file mode 100755 index 000000000..ad569f721 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v2_kpadding_instance.cpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v2_kpadding_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v2_mnkpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v2_mnkpadding_instance.cpp new file mode 100755 index 000000000..0dad13c7e --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v2_mnkpadding_instance.cpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v2_mnkpadding_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn.hpp new file mode 100755 index 000000000..b6a60a1f3 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn.hpp @@ -0,0 +1,97 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_streamk_v3.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using BF16 = bhalf_t; +using F32 = float; + +using Row = tensor_layout::gemm::RowMajor; +using Col = tensor_layout::gemm::ColumnMajor; + +template +using S = Sequence; + +using PassThrough = element_wise::PassThrough; + +static constexpr auto GemmDefault = GemmSpecialization::Default; +static constexpr auto GemmKPadding = GemmSpecialization::KPadding; +static constexpr auto GemmMPadding = GemmSpecialization::MPadding; +static constexpr auto GemmMNPadding = GemmSpecialization::MNPadding; +static constexpr auto GemmMKPadding = GemmSpecialization::MKPadding; +static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding; + +static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave; +static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave; + +template +using device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_comp_instances = std::tuple< + // clang-format off + //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle| A| B| C| GEMM| Block| MPer| NPer| KPer| AK1| BK1|MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| Block-wiseGemm| Block-wiseGemm| + //#########################| | | | Type| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise|Specialization| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| 
ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| Pipeline| Pipeline| + //#########################| | | | | | | | | Operation| Operation| Operation| | | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| Scheduler| Verision| + //#########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + + // Compute friendly + DeviceGemm_Xdl_CShuffle_Streamk_V3< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 4, 8, 32, 32, 4, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 16>, 4, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 4, 4, 32, 32, 4, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, 1, 1, S<1, 16, 1, 16>, 4, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 2, 2, 32, 32, 4, 4, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, 0, S<16,16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, 0, 1, 1, S<1, 16, 1, 16>, 4, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 4, 8, 32, 32, 2, 2, 
S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 16>, 4, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 4, 8, 32, 32, 4, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 16>, 4, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 4, 4, 32, 32, 4, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, 1, 1, S<1, 16, 1, 16>, 4, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 2, 2, 32, 32, 4, 4, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, 0, S<16,16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, 0, 1, 1, S<1, 16, 1, 16>, 4, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 4, 8, 32, 32, 4, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 16>, 4, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 4, 4, 32, 32, 4, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, 1, 1, S<1, 16, 1, 16>, 4, BlockGemmPipelineScheduler::Intrawave, 
BlockGemmPipelineVersion::v5>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 2, 2, 32, 32, 4, 4, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, 0, S<16,16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, 0, 1, 1, S<1, 16, 1, 16>, 4, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 224, 64, 8, 8, 16, 16, 8, 7, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 2, 1, S<1, 32, 1, 8>, 4, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 4, 8, 32, 32, 2, 2, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 16>, 4, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 4, 8, 32, 32, 2, 2, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 16>, 4, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 4, 8, 32, 32, 2, 2, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 16>, 4, BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 
64, 4, 4, 32, 32, 2, 2, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, S<16,16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, 1, 1, S<1, 16, 1, 16>, 4, BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 2, 2, 32, 32, 2, 2, S<32, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, 0, S<32, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, 0, 1, 1, S<1, 16, 1, 16>, 4, BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1> + // clang-format on + >; + +template +using device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_instances = std::tuple< + // clang-format off + //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle| A| B| C| GEMM| Block| MPer| NPer| KPer| AK1| BK1|MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| Block-wiseGemm| Block-wiseGemm| + //#########################| | | | Type| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise|Specialization| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| Pipeline| Pipeline| + //#########################| | | | | | | | | Operation| Operation| Operation| | | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| Scheduler| Verision| 
+ //#########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + + // Latency friendly + DeviceGemm_Xdl_CShuffle_Streamk_V3< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 16, 64, 4, 8, 16, 16, 1, 1, S<16, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 16, 64, 4, 4, 16, 16, 1, 1, S<16, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 16, 64, 2, 2, 16, 16, 1, 1, S<32, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, 0, S<32, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 64, 4, 8, 16, 16, 1, 1, S<16, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 32, 64, 4, 8, 16, 16, 1, 1, S<16, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + // Memory friendly + DeviceGemm_Xdl_CShuffle_Streamk_V3< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 
16, 64, 8, 8, 16, 16, 4, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 128, 16, 64, 8, 8, 16, 16, 4, 1, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 64, 16, 64, 4, 8, 16, 16, 2, 1, S<16, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 64, 16, 64, 4, 4, 16, 16, 2, 1, S<16, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 16, 64, 4, 8, 16, 16, 1, 1, S<16, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 64, 4, 8, 16, 16, 1, 1, S<16, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, 
PassThrough, PassThrough, GemmSpec, 128, 16, 32, 64, 4, 8, 16, 16, 1, 1, S<16, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 64, 64, 4, 8, 16, 16, 1, 2, S<16, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 64, 64, 4, 4, 16, 16, 1, 2, S<16, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 0, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 128, 64, 4, 8, 16, 16, 1, 4, S<16, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 16, 256, 64, 2, 8, 16, 16, 1, 4, S<32, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 16>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 16, 256, 64, 2, 2, 16, 16, 1, 4, S<32, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, 0, S<32, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, 0, 1, 1, S<1, 16, 1, 16>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2> + // clang-format on + >; +} // namespace instance 
+} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_comp_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_comp_default_instance.cpp new file mode 100755 index 000000000..1cca948e5 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_comp_default_instance.cpp @@ -0,0 +1,30 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_comp_default_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_comp_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_comp_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_comp_kpadding_instance.cpp new file mode 100755 index 000000000..85bed59c1 --- /dev/null +++ 
b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_comp_kpadding_instance.cpp @@ -0,0 +1,30 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_comp_kpadding_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_comp_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_comp_mkpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_comp_mkpadding_instance.cpp new file mode 100755 index 000000000..e6cb58f28 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_comp_mkpadding_instance.cpp @@ -0,0 +1,30 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_comp_mkpadding_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_comp_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_comp_mpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_comp_mpadding_instance.cpp new file mode 100755 index 000000000..feeed4cc7 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_comp_mpadding_instance.cpp @@ -0,0 +1,30 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_comp_mpadding_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_comp_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v1_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v1_default_instance.cpp new file mode 100755 index 000000000..32ba00037 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v1_default_instance.cpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v1_default_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v1_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v1_kpadding_instance.cpp new file mode 100755 index 000000000..2a7a683c3 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v1_kpadding_instance.cpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v1_kpadding_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v1_mkpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v1_mkpadding_instance.cpp new file mode 100755 index 000000000..975313603 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v1_mkpadding_instance.cpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v1_mkpadding_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v2_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v2_default_instance.cpp new file mode 100755 index 000000000..f9175984f --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v2_default_instance.cpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v2_default_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v2_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v2_kpadding_instance.cpp new file mode 100755 index 000000000..181faa74b --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v2_kpadding_instance.cpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v2_kpadding_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v2_mkpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v2_mkpadding_instance.cpp new file mode 100755 index 000000000..417fb4066 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v2_mkpadding_instance.cpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v2_mkpadding_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn.hpp new file mode 100755 index 000000000..763ac4fac --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn.hpp @@ -0,0 +1,89 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_streamk_v3.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using BF16 = bhalf_t; +using F32 = float; + +using Row = tensor_layout::gemm::RowMajor; +using Col = tensor_layout::gemm::ColumnMajor; + +template +using S = Sequence; + +using PassThrough = element_wise::PassThrough; + +static constexpr auto GemmDefault = GemmSpecialization::Default; +static constexpr auto GemmKPadding = GemmSpecialization::KPadding; +static constexpr auto GemmMPadding = GemmSpecialization::MPadding; +static constexpr auto GemmMNPadding = GemmSpecialization::MNPadding; +static constexpr auto GemmMKPadding = GemmSpecialization::MKPadding; +static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding; + +static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave; +static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave; + +template +using device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_instances = std::tuple< + // clang-format off + //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle| A| B| C| GEMM| Block| MPer| NPer| KPer| AK1| BK1|MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| Block-wiseGemm| Block-wiseGemm| + //#########################| | | | Type| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise|Specialization| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| 
ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| Pipeline| Pipeline| + //#########################| | | | | | | | | Operation| Operation| Operation| | | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| Scheduler| Verision| + //#########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 8, 4, 32, 32, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 16, 1, 16>, 4, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 4, 4, 32, 32, 4, 4, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 16, 1, 16>, 4, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 2, 2, 32, 32, 4, 4, S<16,16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, 0, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, 0, 1, 1, S<1, 16, 1, 16>, 4, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 8, 4, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, 
S<1, 0, 2>, 2, 8, 8, 0, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 16, 1, 16>, 4, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 8, 4, 32, 32, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 16, 1, 16>, 4, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 8, 4, 32, 32, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 16, 1, 16>, 4, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 224, 256, 64, 8, 8, 16, 16, 7, 8, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, 0, 1, 2, S<1, 16, 1, 16>, 4, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 8, 4, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 16, 1, 16>, 4, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 8, 4, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 16, 1, 16>, 4, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>, + 
DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 8, 4, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 16, 1, 16>, 4, BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1> + // clang-format on + >; + +template +using device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_instances = std::tuple< + // clang-format off + //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle| A| B| C| GEMM| Block| MPer| NPer| KPer| AK1| BK1|MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| Block-wiseGemm| Block-wiseGemm| + //#########################| | | | Type| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise|Specialization| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| Pipeline| Pipeline| + //#########################| | | | | | | | | Operation| Operation| Operation| | | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| Scheduler| Verision| + //#########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + + // Latency friendly + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, BF16, BF16, BF16, F32, BF16, 
PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 16, 64, 8, 4, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 16, 64, 4, 4, 16, 16, 1, 1, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, S<16, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 16, 64, 2, 2, 16, 16, 1, 1, S<32, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, 0, S<32, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 64, 8, 4, 16, 16, 1, 1, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 32, 64, 8, 4, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + // Memory friendly + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 16, 64, 8, 2, 16, 16, 4, 1, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<32, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, 0, 1, 1, S<1, 32, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + 
DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 16, 64, 2, 2, 16, 16, 4, 1, S<32, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, 0, S<32, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, 0, 1, 1, S<1, 32, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 128, 16, 64, 8, 4, 16, 16, 4, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 64, 16, 64, 8, 4, 16, 16, 2, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 16, 64, 8, 4, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 64, 8, 4, 16, 16, 1, 1, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 64, 4, 4, 16, 16, 1, 1, S<16, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, S<16, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 16, 1, 4>, 4, 
BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 32, 64, 8, 4, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 64, 64, 8, 4, 16, 16, 1, 2, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 128, 64, 8, 4, 16, 16, 1, 4, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 16, 256, 64, 8, 4, 16, 16, 1, 4, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 16, 1, 16>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2> + // clang-format on + >; +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_default_instance.cpp new file mode 100755 index 000000000..8b2bfb5d2 --- 
/dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_default_instance.cpp @@ -0,0 +1,30 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_default_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_kpadding_instance.cpp new file mode 100755 index 000000000..a7c33ffdc --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_kpadding_instance.cpp @@ -0,0 +1,30 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_kpadding_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_mnkpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_mnkpadding_instance.cpp new file mode 100755 index 000000000..adc2f23d4 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_mnkpadding_instance.cpp @@ -0,0 +1,30 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_mnkpadding_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_mnpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_mnpadding_instance.cpp new file mode 100755 index 000000000..0336f6466 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_mnpadding_instance.cpp @@ -0,0 +1,30 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_mnpadding_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v1_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v1_default_instance.cpp new file mode 100755 index 000000000..54488f269 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v1_default_instance.cpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v1_default_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v1_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v1_kpadding_instance.cpp new file mode 100755 index 000000000..8477a48be --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v1_kpadding_instance.cpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v1_kpadding_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v1_mnkpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v1_mnkpadding_instance.cpp new file mode 100755 index 000000000..0621df013 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v1_mnkpadding_instance.cpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v1_mnkpadding_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v2_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v2_default_instance.cpp new file mode 100755 index 000000000..49fd1ccd3 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v2_default_instance.cpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v2_default_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v2_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v2_kpadding_instance.cpp new file mode 100755 index 000000000..354231624 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v2_kpadding_instance.cpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v2_kpadding_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v2_mnkpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v2_mnkpadding_instance.cpp new file mode 100755 index 000000000..dff56ca62 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v2_mnkpadding_instance.cpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v2_mnkpadding_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn.hpp new file mode 100755 index 000000000..7a59823d9 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn.hpp @@ -0,0 +1,93 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_streamk_v3.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using BF16 = bhalf_t; +using F32 = float; + +using Row = tensor_layout::gemm::RowMajor; +using Col = tensor_layout::gemm::ColumnMajor; + +template +using S = Sequence; + +using PassThrough = element_wise::PassThrough; + +static constexpr auto GemmDefault = GemmSpecialization::Default; +static constexpr auto GemmKPadding = GemmSpecialization::KPadding; +static constexpr auto GemmMPadding = GemmSpecialization::MPadding; +static constexpr auto GemmMNPadding = GemmSpecialization::MNPadding; +static constexpr auto GemmMKPadding = GemmSpecialization::MKPadding; +static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding; + +static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave; +static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave; + +template +using device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_comp_instances = std::tuple< + // clang-format off + //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle| A| B| C| GEMM| Block| MPer| NPer| KPer| AK1| BK1|MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| Block-wiseGemm| Block-wiseGemm| + //#########################| | | | Type| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise|Specialization| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| 
ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| Pipeline| Pipeline| + //#########################| | | | | | | | | Operation| Operation| Operation| | | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| Scheduler| Verision| + //#########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + + // Compute friendly + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 8, 8, 32, 32, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 16>, 4, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 4, 4, 32, 32, 4, 4, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, 1, 1, S<1, 16, 1, 16>, 4, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 2, 2, 32, 32, 4, 4, S<16,16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, 0, S<16,16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, 0, 1, 1, S<1, 16, 1, 16>, 4, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 8, 8, 32, 32, 4, 4, 
S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 16>, 4, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 8, 8, 32, 32, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 16>, 4, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>, + // AGPR Spill + // DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 8, 8, 16, 16, 8, 8, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 2, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>, + // AGPR Spill when use permuted lds layout. so, use padding for these two. 
+ DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 224, 256, 64, 8, 8, 16, 16, 7, 8, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 2, S<1, 16, 1, 16>, 4, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 224, 64, 8, 8, 16, 16, 8, 7, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 2, 1, S<1, 32, 1, 8>, 4, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 8, 8, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 16>, 4, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 8, 8, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 16>, 4, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 8, 8, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 16>, 4, BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1> + // clang-format on + >; + +template +using device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_instances = std::tuple< + // clang-format off + 
//#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle| A| B| C| GEMM| Block| MPer| NPer| KPer| AK1| BK1|MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| Block-wiseGemm| Block-wiseGemm| + //#########################| | | | Type| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise|Specialization| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| Pipeline| Pipeline| + //#########################| | | | | | | | | Operation| Operation| Operation| | | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| Scheduler| Verision| + //#########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + + // Latency friendly + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 16, 64, 8, 8, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 64, 8, 8, 16, 16, 1, 1, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 
8, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 32, 64, 8, 8, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 32, 64, 4, 4, 16, 16, 1, 1, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 32, 64, 2, 2, 16, 16, 1, 1, S<32, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, 0, S<32, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + // Memory friendly + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 16, 64, 8, 8, 16, 16, 4, 1, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 16, 64, 4, 4, 16, 16, 4, 1, S<16,16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, 1, 1, S<1, 32, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 16, 64, 2, 2, 16, 16, 4, 1, S<32, 8, 1>, S<1, 0, 2>, 
S<1, 0, 2>, 2, 2, 2, 0, S<32, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, 0, 1, 1, S<1, 32, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 128, 16, 64, 8, 8, 16, 16, 4, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 64, 16, 64, 8, 8, 16, 16, 2, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 16, 64, 8, 8, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 64, 8, 8, 16, 16, 1, 1, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 32, 64, 8, 8, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 64, 64, 8, 8, 
16, 16, 1, 2, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 128, 64, 8, 8, 16, 16, 1, 4, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 16, 256, 64, 8, 8, 16, 16, 1, 4, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 16>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2> + // clang-format on + >; +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_comp_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_comp_default_instance.cpp new file mode 100755 index 000000000..e192bf14c --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_comp_default_instance.cpp @@ -0,0 +1,30 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_comp_default_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_comp_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_comp_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_comp_kpadding_instance.cpp new file mode 100755 index 000000000..d58ec3eb3 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_comp_kpadding_instance.cpp @@ -0,0 +1,30 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_comp_kpadding_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_comp_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v1_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v1_default_instance.cpp new file mode 100755 index 000000000..545ef40b9 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v1_default_instance.cpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v1_default_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v1_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v1_kpadding_instance.cpp new file mode 100755 index 000000000..d899d5704 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v1_kpadding_instance.cpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v1_kpadding_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v2_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v2_default_instance.cpp new file mode 100755 index 000000000..a5a5640eb --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v2_default_instance.cpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v2_default_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v2_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v2_kpadding_instance.cpp new file mode 100755 index 000000000..a4e69f0a2 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v2_kpadding_instance.cpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v2_kpadding_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/profiler/src/profile_gemm_universal_streamk.cpp b/profiler/src/profile_gemm_universal_streamk.cpp old mode 100755 new mode 100644 index 85f6c2577..a94bb866f --- a/profiler/src/profile_gemm_universal_streamk.cpp +++ b/profiler/src/profile_gemm_universal_streamk.cpp @@ -83,8 +83,9 @@ int profile_gemm_universal_streamk(int argc, char* argv[]) rotating = std::stoull(argv[18]) * 1024 * 1024; } - using F32 = float; - using F16 = ck::half_t; + using F32 = float; + using F16 = ck::half_t; + using BF16 = ck::bhalf_t; #if defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || defined(CK_USE_GFX94) using F8 = ck::f8_t; @@ -165,6 +166,22 @@ int profile_gemm_universal_streamk(int argc, char* argv[]) return profile(F8{}, F16{}, F32{}, F16{}, Row{}, Col{}, Row{}); } #endif + else if(data_type == GemmDataType::BF16_BF16_BF16 && layout == GemmMatrixLayout::MK_KN_MN) + { + return profile(BF16{}, BF16{}, F32{}, BF16{}, Row{}, Row{}, Row{}); + } + else if(data_type == GemmDataType::BF16_BF16_BF16 && layout == GemmMatrixLayout::MK_NK_MN) + { + return profile(BF16{}, BF16{}, F32{}, BF16{}, Row{}, Col{}, Row{}); + } + else if(data_type == GemmDataType::BF16_BF16_BF16 && layout == GemmMatrixLayout::KM_KN_MN) + { + return profile(BF16{}, BF16{}, F32{}, BF16{}, Col{}, Row{}, Row{}); + } + else if(data_type == GemmDataType::BF16_BF16_BF16 && layout == GemmMatrixLayout::KM_NK_MN) + { + return profile(BF16{}, BF16{}, F32{}, BF16{}, Col{}, Col{}, Row{}); + } else { std::cout << 
"this data_type & layout is not implemented" << std::endl; diff --git a/script/cmake-ck-dev.sh b/script/cmake-ck-dev.sh index f7177a7ab..6089fc7a7 100755 --- a/script/cmake-ck-dev.sh +++ b/script/cmake-ck-dev.sh @@ -15,7 +15,7 @@ else fi cmake \ --D CMAKE_PREFIX_PATH=/opt/rocm \ +-D CMAKE_PREFIX_PATH=/opt/rocm/ \ -D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \ -D CMAKE_CXX_FLAGS="-Xclang -mllvm -Xclang -enable-post-misched=0 -std=c++17 -O3 -ftemplate-backtrace-limit=0 -fPIE -Wno-gnu-line-marker" \ -D CMAKE_BUILD_TYPE=Release \ -- GitLab From 17e8efb573781febcf3256b10751e7e39b1a2197 Mon Sep 17 00:00:00 2001 From: John Afaganis Date: Thu, 2 Jan 2025 19:50:07 -0700 Subject: [PATCH 147/153] Add afagaj to CODEOWNERS (#1787) --- .github/CODEOWNERS | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index d7a6b1778..f6ab388e2 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -1,8 +1,8 @@ -* @junliume @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca +* @junliume @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj # Documentation files -docs/ @ROCm/rocm-documentation @junliume @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca -*.md @ROCm/rocm-documentation @junliume @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca -*.rst @ROCm/rocm-documentation @junliume @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca -.readthedocs.yaml @ROCm/rocm-documentation @junliume @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca +docs/ @ROCm/rocm-documentation @junliume @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj +*.md @ROCm/rocm-documentation @junliume @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj +*.rst @ROCm/rocm-documentation @junliume 
@illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj +.readthedocs.yaml @ROCm/rocm-documentation @junliume @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj # Header directory for Doxygen documentation -library/include/ @ROCm/rocm-documentation @junliume @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca +library/include/ @ROCm/rocm-documentation @junliume @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj -- GitLab From 4bc610416ada7504c62d02c5cde0187f22f59d80 Mon Sep 17 00:00:00 2001 From: feli Date: Fri, 3 Jan 2025 14:28:59 +0800 Subject: [PATCH 148/153] Ck tile/layernorm: implement naive reduce, opt performance (#1784) * add no welford * enable output raw * raw of int8 * fix build * fix smoke test err * [ck_tile]layernorm: fix welford ok, set int8 and bf16 small N as default and others open by generate * [cktile]layernorm, fix err commit files and remove uselss * fix quant 8192 err & change norm_reduce class and file name --------- Co-authored-by: coderfeli Co-authored-by: carlushuang --- example/ck_tile/02_layernorm2d/generate.py | 160 ++++++++++-------- .../02_layernorm2d/script/smoke_test.sh | 3 +- ...ayernorm2d_fwd_pipeline_default_policy.hpp | 57 ++++--- .../layernorm2d_fwd_pipeline_one_pass.hpp | 40 +++-- .../layernorm2d_fwd_pipeline_two_pass.hpp | 20 ++- .../pipeline/layernorm2d_fwd_traits.hpp | 2 + .../ops/{welford.hpp => norm_reduce.hpp} | 6 +- .../block/block_norm_reduce.hpp} | 126 ++++++++------ .../block/block_norm_reduce_problem.hpp} | 9 +- .../thread/thread_welford.hpp | 0 10 files changed, 253 insertions(+), 170 deletions(-) rename include/ck_tile/ops/{welford.hpp => norm_reduce.hpp} (54%) rename include/ck_tile/ops/{welford/block/block_welford.hpp => norm_reduce/block/block_norm_reduce.hpp} (79%) rename include/ck_tile/ops/{welford/block/block_welford_problem.hpp => 
norm_reduce/block/block_norm_reduce_problem.hpp} (66%) rename include/ck_tile/ops/{welford => norm_reduce}/thread/thread_welford.hpp (100%) diff --git a/example/ck_tile/02_layernorm2d/generate.py b/example/ck_tile/02_layernorm2d/generate.py index ca9e432a4..0581c4597 100644 --- a/example/ck_tile/02_layernorm2d/generate.py +++ b/example/ck_tile/02_layernorm2d/generate.py @@ -58,6 +58,7 @@ template @@ -120,6 +121,7 @@ struct layernorm2d_fwd_traits_ static constexpr bool kPadN = kPadN_; static constexpr bool kSaveMeanInvStd = kSaveMeanInvStd_; static constexpr bool kFastFDiv = kFastFDiv_; + static constexpr bool kWelford = kWelford_; static constexpr bool kTwoPass = kTwoPass_; static constexpr ck_tile::index_t kFusedAdd = kFusedAdd_; static constexpr ck_tile::index_t kFusedQuant = kFusedQuant_; @@ -137,6 +139,7 @@ template @@ -152,6 +155,7 @@ using traits_ = layernorm2d_fwd_traits_; @@ -184,6 +188,7 @@ float layernorm2d_fwd_(const S& s, A a) using PipelineTraits = ck_tile::Layernorm2dFwdTraits(Traits_::kFusedAdd), static_cast(Traits_::kFusedQuant)>; @@ -204,12 +209,13 @@ float layernorm2d_fwd_(const S& s, A a) using TwoPassPipeline = ck_tile::Layernorm2dFwdPipelineTwoPass; using Pipeline = std::conditional_t; - using Default2DEpilogueProblem = ck_tile::Default2DEpilogueProblem; + using Default2DEpilogueProblem = ck_tile::Default2DEpilogueProblem; using Default2DEpilogue = ck_tile::Default2DEpilogue; static constexpr bool UseSmoothInputScale = Traits_::kFusedQuant == 1; + static constexpr bool UseRawStore = sizeof(YDataType) == 4; using DynamicQuantEpilogueProblem = ck_tile::DynamicQuantEpilogueProblem>; + ck_tile::DynamicQuantEpilogueTraits>; using DynamicQuantEpilogue = ck_tile::DynamicQuantEpilogue; @@ -274,7 +280,7 @@ float layernorm2d_fwd(layernorm2d_fwd_traits t, #include "layernorm2d_fwd_api_common.hpp" // clang-format off -// prec_i prec_o prec_sy rm rn tm tn vn pd mv rpcf 2p add sweep +// prec_i prec_o prec_sy rm rn tm tn vn pd mv rpcf welford 2p add sweep 
{F_instance_def} // clang-format on @@ -362,6 +368,7 @@ float layernorm2d_fwd(layernorm2d_fwd_traits t, F_kPadN : bool F_kSaveMeanInvStd_ : bool F_kFastFDiv_ : bool + F_kWelford_ : bool F_kTwoPass_ : bool F_kFusedAdd : int F_kFusedQuant : int @@ -369,7 +376,7 @@ float layernorm2d_fwd(layernorm2d_fwd_traits t, @property def trait_name(self) ->str: t_ = f'{DATA_TYPE_MAP[self.F_XDataType]}, {DATA_TYPE_MAP[self.F_YDataType]}, {DATA_TYPE_MAP[self.F_XScaleDataType]}, {DATA_TYPE_MAP[self.F_YScaleDataType]}, {self.F_Repeat_M:2}, {self.F_Repeat_N:2}, {self.F_ThreadPerBlock_M:2}, {self.F_ThreadPerBlock_N:4}' - t_ += f', {self.F_Vector_N:2}, {BOOL_MAP(self.F_kPadN):5}, {BOOL_MAP(self.F_kSaveMeanInvStd_):5}, {BOOL_MAP(self.F_kFastFDiv_):5}' + t_ += f', {self.F_Vector_N:2}, {BOOL_MAP(self.F_kPadN):5}, {BOOL_MAP(self.F_kSaveMeanInvStd_):5}, {BOOL_MAP(self.F_kFastFDiv_):5}, {BOOL_MAP(self.F_kWelford_):5}' t_ += f', {BOOL_MAP(self.F_kTwoPass_):5}, {self.F_kFusedAdd:4}, {self.F_kFusedQuant:4}' return t_ @@ -422,11 +429,10 @@ float layernorm2d_fwd(layernorm2d_fwd_traits t, def name_common_header(self) -> str: return 'layernorm2d_fwd_api_common' - @property - def content_api(self) -> str: + def content_api(self, args) -> str: # 1 sort based on dtype t_dtype_dict = dict() - blobs = self.get_blobs() + blobs = self.get_blobs(args) for blob in blobs: if blob.F_DataTypePair not in t_dtype_dict: t_dtype_dict[blob.F_DataTypePair] = {} @@ -462,8 +468,8 @@ float layernorm2d_fwd(layernorm2d_fwd_traits t, inner_str += self.API_INNER_CASE.format(F_if = get_if_str(idx_in_n, len_in_n, False), F_VEC_COND = _cond, F_instance_func=ins.call_name) #inner_str = inner_str + vec_str - n_cnd = f'(a.n <= {n_})' if (i_n < len(blob_per_t) - 1) else '' - n_str += self.API_PER_N_CASE.format(F_if = get_if_str(i_n, len(blob_per_t)), F_N_COND=n_cnd, F_inner_dispatch=inner_str) + n_cnd = f'(a.n <= {n_})' if isinstance(n_, int) else '' + n_str += self.API_PER_N_CASE.format(F_if = get_if_str(i_n, len(blob_per_t), not 
isinstance(n_, int)), F_N_COND=n_cnd, F_inner_dispatch=inner_str) prec_i, prec_o = dtype_.split(',') d_str += self.API_PER_DTYPE.format(F_if = get_if_str(i_d, len(t_dtype_dict), False), F_i_type=prec_i, F_o_type=prec_o, F_per_n_case=n_str) @@ -474,7 +480,7 @@ float layernorm2d_fwd(layernorm2d_fwd_traits t, def content_common_header(self) -> str: return self.API_COMMON_HEADER.format(F_traits_define=self.API_TRAITS_DEFINE) - def get_blobs(self): + def get_blobs(self, args): h_traits = layernorm_fwd_codegen.h_traits h_instance = layernorm_fwd_codegen.h_instance @@ -484,60 +490,61 @@ float layernorm2d_fwd(layernorm2d_fwd_traits t, scale_list = [('fp32,fp32')] dtype_list = [('fp16,fp16'), ('bf16,bf16'), ('fp16,int8'), ('bf16,int8')] # NOTE: only fused-dynamic-quant use int8 out + types_8bit = ('int8', 'fp8') + types_16bit = ('int16', 'fp16', 'bf16') #fused_add_list = [0, 1, 2] #fused_sweep_list = [0, 1, 2] # NOTE: only single pass can use fused dynamic quant fused_add_list = [0, 1] fused_sweep_list = [0, 1] # NOTE: only single pass can use fused dynamic quant - - # rm rn tm tn vn pd mv fdiv 2p add sweep - h_trait_dict = {'64' : [ h_traits('x', 'y', 'xs', 'ys', 1, 1, 8, 8, 8, True, False, True, False, 0, 0), - h_traits('x', 'y', 'xs', 'ys', 1, 1, 4, 16, 4, True, False, True, False, 0, 0), - h_traits('x', 'y', 'xs', 'ys', 1, 1, 4, 64, 1, True, False, True, False, 0, 0)], - '128' : [ h_traits('x', 'y', 'xs', 'ys', 1, 1, 4, 16, 8, True, False, True, False, 0, 0), - h_traits('x', 'y', 'xs', 'ys', 1, 1, 4, 64, 2, True, False, True, False, 0, 0), - h_traits('x', 'y', 'xs', 'ys', 1, 2, 4, 64, 1, True, False, True, False, 0, 0)], - '256' : [ h_traits('x', 'y', 'xs', 'ys', 1, 1, 4, 64, 4, True, False, True, False, 0, 0), - h_traits('x', 'y', 'xs', 'ys', 1, 2, 4, 64, 2, True, False, True, False, 0, 0), - h_traits('x', 'y', 'xs', 'ys', 1, 4, 4, 64, 1, True, False, True, False, 0, 0)], - '512' : [ h_traits('x', 'y', 'xs', 'ys', 1, 1, 4, 64, 8, True, False, True, False, 0, 0), - 
h_traits('x', 'y', 'xs', 'ys', 1, 2, 4, 64, 4, True, False, True, False, 0, 0), - h_traits('x', 'y', 'xs', 'ys', 1, 4, 4, 64, 2, True, False, True, False, 0, 0), - h_traits('x', 'y', 'xs', 'ys', 1, 8, 4, 64, 1, True, False, True, False, 0, 0)], - '768' : [ h_traits('x', 'y', 'xs', 'ys', 1, 3, 4, 64, 4, True, False, True, False, 0, 0), - h_traits('x', 'y', 'xs', 'ys', 1, 6, 4, 64, 2, True, False, True, False, 0, 0), - h_traits('x', 'y', 'xs', 'ys', 1, 12, 4, 64, 1, True, False, True, False, 0, 0)], - '1024' :[ h_traits('x', 'y', 'xs', 'ys', 1, 1, 2, 128, 8, True, False, True, False, 0, 0), - h_traits('x', 'y', 'xs', 'ys', 1, 2, 2, 128, 4, True, False, True, False, 0, 0), - h_traits('x', 'y', 'xs', 'ys', 1, 4, 2, 128, 2, True, False, True, False, 0, 0), - h_traits('x', 'y', 'xs', 'ys', 1, 4, 1, 256, 1, True, False, True, False, 0, 0)], - '1536' :[ h_traits('x', 'y', 'xs', 'ys', 1, 3, 4, 64, 8, True, False, True, False, 0, 0), - h_traits('x', 'y', 'xs', 'ys', 1, 3, 2, 128, 4, True, False, True, False, 0, 0), - h_traits('x', 'y', 'xs', 'ys', 1, 3, 1, 256, 2, True, False, True, False, 0, 0), - h_traits('x', 'y', 'xs', 'ys', 1, 6, 1, 256, 1, True, False, True, False, 0, 0)], - '2048' :[ h_traits('x', 'y', 'xs', 'ys', 1, 1, 1, 256, 8, True, False, True, False, 0, 0), - h_traits('x', 'y', 'xs', 'ys', 1, 2, 1, 256, 4, True, False, True, False, 0, 0), - h_traits('x', 'y', 'xs', 'ys', 1, 4, 1, 256, 2, True, False, True, False, 0, 0), - h_traits('x', 'y', 'xs', 'ys', 1, 8, 1, 256, 1, True, False, True, False, 0, 0)], - '3072' :[ h_traits('x', 'y', 'xs', 'ys', 1, 3, 1, 128, 8, True, False, True, False, 0, 0), - h_traits('x', 'y', 'xs', 'ys', 1, 3, 1, 256, 4, True, False, True, False, 0, 0), - h_traits('x', 'y', 'xs', 'ys', 1, 6, 1, 256, 2, True, False, True, False, 0, 0), - h_traits('x', 'y', 'xs', 'ys', 1, 3, 1,1024, 1, True, False, True, False, 0, 0)], - '4096' :[ h_traits('x', 'y', 'xs', 'ys', 1, 2, 1, 256, 8, True, False, True, False, 0, 0), - h_traits('x', 'y', 'xs', 'ys', 
1, 4, 1, 256, 4, True, False, True, False, 0, 0), - h_traits('x', 'y', 'xs', 'ys', 1, 2, 1,1024, 2, True, False, True, False, 0, 0), - h_traits('x', 'y', 'xs', 'ys', 1, 4, 1,1024, 1, True, False, True, False, 0, 0)], - '6144' :[ h_traits('x', 'y', 'xs', 'ys', 1, 3, 1, 256, 8, True, False, True, False, 0, 0), - h_traits('x', 'y', 'xs', 'ys', 1, 3, 1, 512, 4, True, False, True, False, 0, 0), - h_traits('x', 'y', 'xs', 'ys', 1, 3, 1,1024, 2, True, False, True, False, 0, 0), - h_traits('x', 'y', 'xs', 'ys', 1, 6, 1,1024, 1, True, False, True, False, 0, 0)], - '8192' :[ h_traits('x', 'y', 'xs', 'ys', 1, 4, 1, 256, 8, True, False, True, False, 0, 0), - h_traits('x', 'y', 'xs', 'ys', 1, 4, 1, 512, 4, True, False, True, False, 0, 0), - h_traits('x', 'y', 'xs', 'ys', 1, 4, 1,1024, 2, True, False, True, False, 0, 0), - h_traits('x', 'y', 'xs', 'ys', 1, 8, 1,1024, 1, True, False, True, False, 0, 0)], - 'big' :[ h_traits('x', 'y', 'xs', 'ys', 1, 2, 1, 256, 8, True, False, True, True, 0, 0), - h_traits('x', 'y', 'xs', 'ys', 1, 4, 1, 256, 4, True, False, True, True, 0, 0), - h_traits('x', 'y', 'xs', 'ys', 1, 2, 1,1024, 2, True, False, True, True, 0, 0), - h_traits('x', 'y', 'xs', 'ys', 1, 4, 1,1024, 1, True, False, True, True, 0, 0)]} + # rm rn tm tn vn pd mv fdiv welford 2p add sweep + h_trait_dict = {'64' : [ h_traits('x', 'y', 'xs', 'ys', 1, 1, 8, 8, 8, True, False, True, True, False, 0, 0), + h_traits('x', 'y', 'xs', 'ys', 1, 1, 4, 16, 4, True, False, True, True, False, 0, 0), + h_traits('x', 'y', 'xs', 'ys', 1, 1, 4, 64, 1, True, False, True, True, False, 0, 0)], + '128' : [ h_traits('x', 'y', 'xs', 'ys', 1, 1, 4, 16, 8, True, False, True, True, False, 0, 0), + h_traits('x', 'y', 'xs', 'ys', 1, 1, 4, 64, 2, True, False, True, True, False, 0, 0), + h_traits('x', 'y', 'xs', 'ys', 1, 2, 4, 64, 1, True, False, True, True, False, 0, 0)], + '256' : [ h_traits('x', 'y', 'xs', 'ys', 1, 1, 4, 64, 4, True, False, True, True, False, 0, 0), + h_traits('x', 'y', 'xs', 'ys', 1, 2, 4, 64, 
2, True, False, True, True, False, 0, 0), + h_traits('x', 'y', 'xs', 'ys', 1, 4, 4, 64, 1, True, False, True, True, False, 0, 0)], + '512' : [ h_traits('x', 'y', 'xs', 'ys', 1, 1, 4, 64, 8, True, False, True, True, False, 0, 0), + h_traits('x', 'y', 'xs', 'ys', 1, 2, 4, 64, 4, True, False, True, True, False, 0, 0), + h_traits('x', 'y', 'xs', 'ys', 1, 4, 4, 64, 2, True, False, True, True, False, 0, 0), + h_traits('x', 'y', 'xs', 'ys', 1, 8, 4, 64, 1, True, False, True, True, False, 0, 0)], + '768' : [ h_traits('x', 'y', 'xs', 'ys', 1, 3, 4, 64, 4, True, False, True, True, False, 0, 0), + h_traits('x', 'y', 'xs', 'ys', 1, 6, 4, 64, 2, True, False, True, True, False, 0, 0), + h_traits('x', 'y', 'xs', 'ys', 1, 12, 4, 64, 1, True, False, True, True, False, 0, 0)], + '1024' :[ h_traits('x', 'y', 'xs', 'ys', 1, 1, 2, 128, 8, True, False, True, True, False, 0, 0), + h_traits('x', 'y', 'xs', 'ys', 1, 2, 2, 128, 4, True, False, True, True, False, 0, 0), + h_traits('x', 'y', 'xs', 'ys', 1, 4, 2, 128, 2, True, False, True, True, False, 0, 0), + h_traits('x', 'y', 'xs', 'ys', 1, 4, 1, 256, 1, True, False, True, True, False, 0, 0)], + '1536' :[ h_traits('x', 'y', 'xs', 'ys', 1, 3, 4, 64, 8, True, False, True, True, False, 0, 0), + h_traits('x', 'y', 'xs', 'ys', 1, 3, 2, 128, 4, True, False, True, True, False, 0, 0), + h_traits('x', 'y', 'xs', 'ys', 1, 3, 1, 256, 2, True, False, True, True, False, 0, 0), + h_traits('x', 'y', 'xs', 'ys', 1, 6, 1, 256, 1, True, False, True, True, False, 0, 0)], + '2048' :[ h_traits('x', 'y', 'xs', 'ys', 1, 1, 1, 256, 8, True, False, True, True, False, 0, 0), + h_traits('x', 'y', 'xs', 'ys', 1, 2, 1, 256, 4, True, False, True, True, False, 0, 0), + h_traits('x', 'y', 'xs', 'ys', 1, 4, 1, 256, 2, True, False, True, True, False, 0, 0), + h_traits('x', 'y', 'xs', 'ys', 1, 8, 1, 256, 1, True, False, True, True, False, 0, 0)], + '3072' :[ h_traits('x', 'y', 'xs', 'ys', 1, 3, 1, 128, 8, True, False, True, True, False, 0, 0), + h_traits('x', 'y', 'xs', 
'ys', 1, 3, 1, 256, 4, True, False, True, True, False, 0, 0), + h_traits('x', 'y', 'xs', 'ys', 1, 6, 1, 256, 2, True, False, True, True, False, 0, 0), + h_traits('x', 'y', 'xs', 'ys', 1, 3, 1,1024, 1, True, False, True, True, False, 0, 0)], + '4096' :[ h_traits('x', 'y', 'xs', 'ys', 1, 2, 1, 256, 8, True, False, True, True, False, 0, 0), + h_traits('x', 'y', 'xs', 'ys', 1, 4, 1, 256, 4, True, False, True, True, False, 0, 0), + h_traits('x', 'y', 'xs', 'ys', 1, 2, 1,1024, 2, True, False, True, True, False, 0, 0), + h_traits('x', 'y', 'xs', 'ys', 1, 4, 1,1024, 1, True, False, True, True, False, 0, 0)], + '6144' :[ h_traits('x', 'y', 'xs', 'ys', 1, 3, 1, 256, 8, True, False, True, True, False, 0, 0), + h_traits('x', 'y', 'xs', 'ys', 1, 3, 1, 512, 4, True, False, True, True, False, 0, 0), + h_traits('x', 'y', 'xs', 'ys', 1, 3, 1,1024, 2, True, False, True, True, False, 0, 0), + h_traits('x', 'y', 'xs', 'ys', 1, 6, 1,1024, 1, True, False, True, True, False, 0, 0)], + '8192' :[ h_traits('x', 'y', 'xs', 'ys', 1, 4, 1, 256, 8, True, False, True, True, False, 0, 0), + h_traits('x', 'y', 'xs', 'ys', 1, 4, 1, 512, 4, True, False, True, True, False, 0, 0), + h_traits('x', 'y', 'xs', 'ys', 1, 4, 1,1024, 2, True, False, True, True, False, 0, 0), + h_traits('x', 'y', 'xs', 'ys', 1, 8, 1,1024, 1, True, False, True, True, False, 0, 0)], + 'big' :[ h_traits('x', 'y', 'xs', 'ys', 1, 2, 1, 256, 8, True, False, True, True, True, 0, 0), + h_traits('x', 'y', 'xs', 'ys', 1, 4, 1, 256, 4, True, False, True, True, True, 0, 0), + h_traits('x', 'y', 'xs', 'ys', 1, 2, 1,1024, 2, True, False, True, True, True, 0, 0), + h_traits('x', 'y', 'xs', 'ys', 1, 4, 1,1024, 1, True, False, True, True, True, 0, 0)]} total_blob = list() for hs_key in h_trait_dict: hs = h_trait_dict[hs_key] @@ -558,16 +565,27 @@ float layernorm2d_fwd(layernorm2d_fwd_traits t, h_.F_YScaleDataType = scale_x h_.F_kFusedAdd = fused_add h_.F_kFusedQuant = fused_quant + # disable welford update for 8bit and 16 bit smallN + if not 
h_.F_kTwoPass_: + #disable 16 bit when set args disable_16b_welford + if args.disable_16b_welford and prec_i in types_16bit: + h_.F_kWelford_ = False + #disable 8bit by default + elif prec_i in types_8bit or prec_o in types_8bit: + h_.F_kWelford_ = False + #disable 16bit small N + elif prec_i in types_16bit and hs_key == '64': + h_.F_kWelford_ = False current_hs.append(h_) # + "\n" #f.write(str(f.parent / GEN_DIR / (blobs.api_common_header_ current_n_str = 'big' if hs_key == 'big' else current_n total_blob.append(h_instance(dtype, current_n_str, fused_add, fused_quant, current_hs)) return total_blob - def list_blobs(self) -> None: + def list_blobs(self, args) -> None: w_p = Path(self.working_path) list_p = w_p / 'layernorm2d_fwd_blobs.txt' - blobs = self.get_blobs() + blobs = self.get_blobs(args) with list_p.open('w') as list_f: # api related file list_f.write(str(w_p / (self.name_api + ".cpp")) + "\n") @@ -576,11 +594,12 @@ float layernorm2d_fwd(layernorm2d_fwd_traits t, for b in blobs: list_f.write(str(w_p / (b.name + ".cpp")) + "\n") - def gen_blobs(self) -> None: + def gen_blobs(self, args) -> None: w_p = Path(self.working_path) - (w_p / (self.name_api + ".cpp")).write_text(self.content_api) + w_str = self.content_api(args) + (w_p / (self.name_api + ".cpp")).write_text(w_str) (w_p / (self.name_common_header + ".hpp")).write_text(self.content_common_header) - blobs = self.get_blobs() + blobs = self.get_blobs(args) for b in blobs: (w_p / (b.name + ".cpp")).write_text(b.content) @@ -588,14 +607,14 @@ def list_blobs(args): api_list = args.api.split(',') for api in api_list: if api == 'fwd': - layernorm_fwd_codegen(args.working_path, args.filter).list_blobs() + layernorm_fwd_codegen(args.working_path, args.filter).list_blobs(args) def gen_blobs(args): api_list = args.api.split(',') for api in api_list: if api == 'fwd': - layernorm_fwd_codegen(args.working_path, args.filter).gen_blobs() + layernorm_fwd_codegen(args.working_path, args.filter).gen_blobs(args) if 
__name__ == "__main__": parser = argparse.ArgumentParser( @@ -663,6 +682,13 @@ if __name__ == "__main__": help="codegen receipt." ) + parser.add_argument( + "--disable_16b_welford", + default=False, + required=False, + help="enable/disable welford for 16bit datatype n > 64" + ) + args = parser.parse_args() # print(f'{args.list_blobs}-{args.gen_blobs}') diff --git a/example/ck_tile/02_layernorm2d/script/smoke_test.sh b/example/ck_tile/02_layernorm2d/script/smoke_test.sh index b7fd354bb..3f5c3eb13 100755 --- a/example/ck_tile/02_layernorm2d/script/smoke_test.sh +++ b/example/ck_tile/02_layernorm2d/script/smoke_test.sh @@ -27,7 +27,8 @@ $EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=7 -n=2734 $EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=1 -n=3182 $EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=9 -n=4096 $EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=3 -n=8192 -#$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=1 -n=10547 +$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=3 -n=9120 +$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=1 -n=10547 #$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=3 -n=17134 done done diff --git a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_default_policy.hpp b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_default_policy.hpp index 724f6261d..37f87b4fe 100644 --- a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_default_policy.hpp +++ b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_default_policy.hpp @@ -4,8 +4,8 @@ #pragma once #include "ck_tile/core.hpp" -#include "ck_tile/ops/welford/block/block_welford_problem.hpp" -#include "ck_tile/ops/welford/block/block_welford.hpp" +#include "ck_tile/ops/norm_reduce/block/block_norm_reduce_problem.hpp" +#include "ck_tile/ops/norm_reduce/block/block_norm_reduce.hpp" namespace ck_tile { @@ -43,36 +43,38 @@ struct Layernorm2dFwdPipelineDefaultPolicy } template - CK_TILE_HOST_DEVICE static constexpr auto GetBlockWelford() + CK_TILE_HOST_DEVICE static constexpr auto 
GetBlockNormReduce() { - using P_ = BlockWelfordProblem; - - return BlockWelford{}; + using P_ = BlockNormReduceProblem; + return BlockNormReduce{}; } template - CK_TILE_HOST_DEVICE static constexpr auto GetBlockWelfordSync() + CK_TILE_HOST_DEVICE static constexpr auto GetBlockNormReduceSync() { - using P_ = BlockWelfordProblem; + using P_ = BlockNormReduceProblem; - return BlockWelfordSync{}; + return BlockNormReduceSync{}; } template - CK_TILE_HOST_DEVICE static constexpr auto GetBlockWelfordCrossWarpSync() + CK_TILE_HOST_DEVICE static constexpr auto GetBlockNormReduceCrossWarpSync() { - using P_ = BlockWelfordProblem; + using P_ = BlockNormReduceProblem; - return BlockWelfordCrossWarpSync{}; + return BlockNormReduceCrossWarpSync{}; } template @@ -80,19 +82,20 @@ struct Layernorm2dFwdPipelineDefaultPolicy { if constexpr(Problem::kNeedCrossWarpSync) { - using P_ = BlockWelfordProblem; + using P_ = BlockNormReduceProblem; - using block_welford = BlockWelford; + using block_welford = BlockNormReduce; using x_block_tile = decltype(make_static_distributed_tensor( MakeXBlockTileDistribution())); using mean_var_block_tile = decltype(block_welford::template MakeMeanVarBlockTile()); - return GetBlockWelfordCrossWarpSync() + return GetBlockNormReduceCrossWarpSync() .template GetSmemSize(); } else diff --git a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_one_pass.hpp b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_one_pass.hpp index eefdaf917..a30a9256a 100644 --- a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_one_pass.hpp +++ b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_one_pass.hpp @@ -37,6 +37,7 @@ struct Layernorm2dFwdPipelineOnePass static constexpr bool kPadM = false; // TODO - BlockLayernorm2dFwdProblem::kPadM static constexpr bool kPadN = Problem::Traits::kPadN; static constexpr bool kFastFDiv = Problem::Traits::kFastFDiv; + static constexpr bool kWelford = 
Problem::Traits::kWelford; static constexpr auto kFusedAdd = Problem::Traits::kFusedAdd; static constexpr auto kFusedQuant = Problem::Traits::kFusedQuant; @@ -95,11 +96,16 @@ struct Layernorm2dFwdPipelineOnePass int cur_count = 0; int max_count = block_tile_welford_calculate_max_count(row_size); - auto block_welford = Policy::template GetBlockWelford(); - auto block_welford_sync = Policy::template GetBlockWelfordSync(); - auto block_welford_cross_warp_sync = - Policy::template GetBlockWelfordCrossWarpSync(); - + auto block_norm_reduce = Policy::template GetBlockNormReduce(); + auto block_norm_reduce_sync = Policy::template GetBlockNormReduceSync(); + auto block_norm_reduce_cross_warp_sync = + Policy::template GetBlockNormReduceCrossWarpSync(); + + using XTensorType = decltype(cast_tile(x)); + auto mean = block_norm_reduce.template MakeMeanVarBlockTile(); + auto var = block_norm_reduce.template MakeMeanVarBlockTile(); + clear_tile(mean); + clear_tile(var); // load gamma/beta (TODO: support no gamma/beta?) 
const auto gamma = load_tile(gamma_window); const auto beta = load_tile(beta_window); @@ -117,12 +123,21 @@ struct Layernorm2dFwdPipelineOnePass store_tile(y_residual_window, cast_tile(acc)); } - // compute welford each-thread->cross-lane->cross-warp - auto [mean, var] = block_welford(acc, cur_count, max_count); - block_welford_sync(mean, var, cur_count); - block_welford_cross_warp_sync(mean, var, cur_count, smem); - block_tile_welford_post_scale_var(var, cur_count, constant{}); - + // compute reduce each-thread->cross-lane->cross-warp + block_norm_reduce(acc, mean, var, cur_count, max_count); + block_norm_reduce_sync(mean, var, cur_count); + block_norm_reduce_cross_warp_sync(mean, var, cur_count, smem); + if(kWelford) + { + block_tile_welford_post_scale_var(var, cur_count, constant{}); + } + else + { + sweep_tile(mean, [&](auto idx) { + mean(idx) = mean(idx) / type_convert(row_size); + var(idx) = var(idx) / type_convert(row_size) - mean(idx) * mean(idx); + }); + } // compute inv-std auto inv_std = tile_elementwise_in( [&](const auto& v_) { @@ -153,8 +168,7 @@ struct Layernorm2dFwdPipelineOnePass const auto beta_ = type_convert(beta[j_idx]); auto ln_ = (acc[idx] - mean_[i_idx]) * inv_std[i_idx] * gamma_ + beta_; - - ln(idx) = ln_; + ln(idx) = ln_; }); if constexpr(kFusedQuant == Layernorm2dFusedQuantEnum::DYNAMIC_QUANT || diff --git a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_two_pass.hpp b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_two_pass.hpp index 6a86cc43c..4a37be877 100644 --- a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_two_pass.hpp +++ b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_two_pass.hpp @@ -36,6 +36,7 @@ struct Layernorm2dFwdPipelineTwoPass static constexpr bool kPadM = false; // TODO - BlockLayernorm2dFwdProblem::kPadM static constexpr bool kPadN = Problem::Traits::kPadN; static constexpr bool kFastFDiv = Problem::Traits::kFastFDiv; + static constexpr bool 
kWelford = Problem::Traits::kWelford; static constexpr auto kFusedAdd = Problem::Traits::kFusedAdd; static constexpr auto kFusedQuant = Problem::Traits::kFusedQuant; @@ -77,6 +78,7 @@ struct Layernorm2dFwdPipelineTwoPass void* smem, Epilogue) const { + static_assert(kWelford == true, "2 pass only supports welford merge"); auto x_window = make_tile_window(x_window_, Policy::template MakeXBlockTileDistribution()); auto gamma_window = make_tile_window( @@ -102,14 +104,14 @@ struct Layernorm2dFwdPipelineTwoPass int max_count = (num_n_tile_iteration - 1) * count_per_iter + block_tile_welford_calculate_max_count(last_iter_n); - auto block_welford = Policy::template GetBlockWelford(); - auto block_welford_sync = Policy::template GetBlockWelfordSync(); - auto block_welford_cross_warp_sync = - Policy::template GetBlockWelfordCrossWarpSync(); + auto block_norm_reduce = Policy::template GetBlockNormReduce(); + auto block_norm_reduce_sync = Policy::template GetBlockNormReduceSync(); + auto block_norm_reduce_cross_warp_sync = + Policy::template GetBlockNormReduceCrossWarpSync(); using XTensorType = decltype(cast_tile(load_tile(x_window))); - auto mean = block_welford.template MakeMeanVarBlockTile(); - auto var = block_welford.template MakeMeanVarBlockTile(); + auto mean = block_norm_reduce.template MakeMeanVarBlockTile(); + auto var = block_norm_reduce.template MakeMeanVarBlockTile(); for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN) { @@ -133,11 +135,11 @@ struct Layernorm2dFwdPipelineTwoPass move_tile_window(y_residual_window, {0, Block_N}); } } - block_welford(acc, mean, var, cur_count, max_count); + block_norm_reduce(acc, mean, var, cur_count, max_count); } - block_welford_sync(mean, var, cur_count); - block_welford_cross_warp_sync(mean, var, cur_count, smem); + block_norm_reduce_sync(mean, var, cur_count); + block_norm_reduce_cross_warp_sync(mean, var, cur_count, smem); block_tile_welford_post_scale_var(var, cur_count, constant{}); // compute 
inv-std diff --git a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_traits.hpp b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_traits.hpp index e8c22f8ab..045bd24e4 100644 --- a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_traits.hpp +++ b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_traits.hpp @@ -40,6 +40,7 @@ template<> struct Layernorm2dFusedQuantEnumName @@ -48,6 +49,7 @@ struct Layernorm2dFwdTraits static constexpr bool kPadN = kPadN_; static constexpr bool kSaveMeanInvStd = kSaveMeanInvStd_; static constexpr bool kFastFDiv = kFastFDiv_; + static constexpr bool kWelford = kWelford_; static constexpr bool kTwoPass = kTwoPass_; static constexpr Layernorm2dFusedAddEnum kFusedAdd = kFusedAdd_; static constexpr Layernorm2dFusedQuantEnum kFusedQuant = kFusedQuant_; diff --git a/include/ck_tile/ops/welford.hpp b/include/ck_tile/ops/norm_reduce.hpp similarity index 54% rename from include/ck_tile/ops/welford.hpp rename to include/ck_tile/ops/norm_reduce.hpp index a4c479dd9..02d8eabd8 100644 --- a/include/ck_tile/ops/welford.hpp +++ b/include/ck_tile/ops/norm_reduce.hpp @@ -3,8 +3,8 @@ #pragma once -#include "ck_tile/ops/welford/block/block_welford.hpp" -#include "ck_tile/ops/welford/block/block_welford_problem.hpp" -#include "ck_tile/ops/welford/thread/thread_welford.hpp" +#include "ck_tile/ops/norm_reduce/block/block_norm_reduce.hpp" +#include "ck_tile/ops/norm_reduce/block/block_norm_reduce_problem.hpp" +#include "ck_tile/ops/norm_reduce/thread/thread_welford.hpp" #include "ck_tile/ops/common/generic_2d_block_shape.hpp" #include "ck_tile/ops/common/tensor_layout.hpp" diff --git a/include/ck_tile/ops/welford/block/block_welford.hpp b/include/ck_tile/ops/norm_reduce/block/block_norm_reduce.hpp similarity index 79% rename from include/ck_tile/ops/welford/block/block_welford.hpp rename to include/ck_tile/ops/norm_reduce/block/block_norm_reduce.hpp index 56ca86d9d..15ac02163 100644 --- 
a/include/ck_tile/ops/welford/block/block_welford.hpp +++ b/include/ck_tile/ops/norm_reduce/block/block_norm_reduce.hpp @@ -4,22 +4,23 @@ #pragma once #include "ck_tile/core.hpp" -#include "ck_tile/ops/welford/thread/thread_welford.hpp" +#include "ck_tile/ops/norm_reduce/thread/thread_welford.hpp" namespace ck_tile { template -struct BlockWelford +struct BlockNormReduce { using Problem = remove_cvref_t; using XDataType = typename Problem::XDataType; using ComputeDataType = typename Problem::ComputeDataType; static constexpr bool kFastFDiv = Problem::kFastFDiv; + static constexpr bool kWelford = Problem::kWelford; - CK_TILE_DEVICE constexpr BlockWelford() {} + CK_TILE_DEVICE constexpr BlockNormReduce() {} // [CAUSION] - max_count_ is to deal with the padding problem - // max_count_ is depend on caller, eg: naive and splitN welford will have different + // max_count_ is depend on caller, eg: naive and splitN norm_reduce will have different // calculation of max_count_ // -> use block_welford_calculate_max_count to compute template (x_tensor[in_dstr_idx]); - - welford_update(mean_tensor(out_dstr_idx), - var_tensor(out_dstr_idx), - x, - cur_count_, - constant{}); + if(kWelford) + { + welford_update(mean_tensor(out_dstr_idx), + var_tensor(out_dstr_idx), + x, + cur_count_, + constant{}); + } + else + { + mean_tensor(out_dstr_idx) += x; + var_tensor(out_dstr_idx) += x * x; + } }); } }); @@ -91,10 +98,11 @@ struct BlockWelford }; template -struct BlockWelfordSync +struct BlockNormReduceSync { using Problem = remove_cvref_t; static constexpr bool kFastFDiv = Problem::kFastFDiv; + static constexpr bool kWelford = Problem::kWelford; template CK_TILE_DEVICE void @@ -152,36 +160,48 @@ struct BlockWelfordSync (number{}.value); // pull data from remote lane - const auto v_remote_mean = warp_shuffle(v_local_mean, src_lane); - const auto v_remote_var = warp_shuffle(v_local_var, src_lane); - const auto v_remote_count = warp_shuffle(v_local_count, src_lane); - - // welford merge - 
welford_merge(v_local_mean, - v_local_var, - v_local_count, - v_remote_mean, - v_remote_var, - v_remote_count, - constant{}); + const auto v_remote_mean = warp_shuffle(v_local_mean, src_lane); + const auto v_remote_var = warp_shuffle(v_local_var, src_lane); + if(kWelford) + { + const auto v_remote_count = warp_shuffle(v_local_count, src_lane); + + // norm_reduce merge + welford_merge(v_local_mean, + v_local_var, + v_local_count, + v_remote_mean, + v_remote_var, + v_remote_count, + constant{}); + } + else + { + v_local_mean += v_remote_mean; + v_local_var += v_remote_var; + } }); } }); mean_tensor.get_thread_buffer()(i) = v_local_mean; var_tensor.get_thread_buffer()(i) = v_local_var; - - count = v_local_count; + if(kWelford) + { + count = v_local_count; + } }); } }; template -struct BlockWelfordCrossWarpSync +struct BlockNormReduceCrossWarpSync { using Problem = remove_cvref_t; using BlockShape = typename Problem::BlockShape; static constexpr bool kFastFDiv = Problem::kFastFDiv; + static constexpr bool kWelford = Problem::kWelford; + using smem_dtype = std::conditional_t; template CK_TILE_DEVICE static constexpr index_t GetReduceWarps() @@ -252,7 +272,7 @@ struct BlockWelfordCrossWarpSync static_assert(thread_buf_size == VarDistributedTensor_::get_thread_buffer_size()); // Note: we always pack everything into fp32x4 - fp32x4_t* smem_ptr = reinterpret_cast(smem); + smem_dtype* smem_ptr = reinterpret_cast(smem); const index_t lane_id = get_lane_id(); const index_t warp_id = get_warp_id(); constexpr auto num_reduce_warps = GetReduceWarps(); @@ -267,11 +287,13 @@ struct BlockWelfordCrossWarpSync if(lane_id == 0) { static_for<0, thread_buf_size, 1>{}([&](auto i) { - fp32x4_t local_scratch_; + smem_dtype local_scratch_; local_scratch_[0] = bit_cast(mean_tensor.get_thread_buffer()[i]); local_scratch_[1] = bit_cast(var_tensor.get_thread_buffer()[i]); - local_scratch_[2] = bit_cast(count); - + if(kWelford) + { + local_scratch_[2] = bit_cast(count); + } smem_ptr[smem_offset + 
i * num_warps] = local_scratch_; }); } @@ -280,7 +302,7 @@ struct BlockWelfordCrossWarpSync // load from smem. here we let everythread to do compute :) index_t local_warp_id = warp_id / num_reduce_warps; index_t local_smem_os = local_warp_id * num_reduce_warps; - fp32x4_t all_scratch[thread_buf_size * num_reduce_warps]; + smem_dtype all_scratch[thread_buf_size * num_reduce_warps]; static_for<0, thread_buf_size, 1>{}([&](auto i_0) { static_for<0, num_reduce_warps, 1>{}([&](auto i_1) { all_scratch[i_0 * num_reduce_warps + i_1] = @@ -293,32 +315,40 @@ struct BlockWelfordCrossWarpSync static_for<0, thread_buf_size, 1>{}([&](auto i_0) { // TODO: use descriptor for this - auto v_local = all_scratch[i_0 * num_reduce_warps]; - auto v_local_mean = bit_cast(v_local[0]); - auto v_local_var = bit_cast(v_local[1]); - auto v_local_count = bit_cast(v_local[2]); + auto v_local = all_scratch[i_0 * num_reduce_warps]; + auto v_local_mean = bit_cast(v_local[0]); + auto v_local_var = bit_cast(v_local[1]); + int v_local_count = kWelford ? 
bit_cast(v_local[2]) : 0; // further reduce mean/var static_for<0, num_reduce_warps - 1, 1>{}([&](auto i_1_n1) { constexpr auto i_1 = number{}; - const fp32x4_t v_remote = all_scratch[i_0 * num_reduce_warps + i_1]; + const smem_dtype v_remote = all_scratch[i_0 * num_reduce_warps + i_1]; const auto v_remote_mean = bit_cast(v_remote[0]); const auto v_remote_var = bit_cast(v_remote[1]); - const auto v_remote_count = bit_cast(v_remote[2]); - - welford_merge(v_local_mean, - v_local_var, - v_local_count, - v_remote_mean, - v_remote_var, - v_remote_count, - constant{}); + if(kWelford) + { + const auto v_remote_count = bit_cast(v_remote[2]); + + welford_merge(v_local_mean, + v_local_var, + v_local_count, + v_remote_mean, + v_remote_var, + v_remote_count, + constant{}); + } + else + { + v_local_mean += v_remote_mean; + v_local_var += v_remote_var; + } }); mean_tensor.get_thread_buffer()(i_0) = v_local_mean; var_tensor.get_thread_buffer()(i_0) = v_local_var; - - count = v_local_count; + if(kWelford) + count = v_local_count; }); } }; diff --git a/include/ck_tile/ops/welford/block/block_welford_problem.hpp b/include/ck_tile/ops/norm_reduce/block/block_norm_reduce_problem.hpp similarity index 66% rename from include/ck_tile/ops/welford/block/block_welford_problem.hpp rename to include/ck_tile/ops/norm_reduce/block/block_norm_reduce_problem.hpp index bcbfb7d76..53f5bfc6f 100644 --- a/include/ck_tile/ops/welford/block/block_welford_problem.hpp +++ b/include/ck_tile/ops/norm_reduce/block/block_norm_reduce_problem.hpp @@ -7,13 +7,18 @@ namespace ck_tile { -template -struct BlockWelfordProblem +template +struct BlockNormReduceProblem { using XDataType = remove_cvref_t; using ComputeDataType = remove_cvref_t; using BlockShape = remove_cvref_t; static constexpr bool kFastFDiv = kFastFDiv_; + static constexpr bool kWelford = kWelford_; }; } // namespace ck_tile diff --git a/include/ck_tile/ops/welford/thread/thread_welford.hpp b/include/ck_tile/ops/norm_reduce/thread/thread_welford.hpp 
similarity index 100% rename from include/ck_tile/ops/welford/thread/thread_welford.hpp rename to include/ck_tile/ops/norm_reduce/thread/thread_welford.hpp -- GitLab From 4f62f6e9b77a41ca34a68efd1297d4b68eda06d2 Mon Sep 17 00:00:00 2001 From: Mingtao Gu <145657261+mtgu0705@users.noreply.github.com> Date: Fri, 3 Jan 2025 18:35:21 +0800 Subject: [PATCH 149/153] Implement the fp16xint4 scale weight only kernel for Ali (#1786) * enable int4 scale (weight only) kernel * format some files * Add unit test for int4 weight only * fixed and formatted code * fixed * formated * formated * fixed * fixed a bug in the ckProfiler, and formatted the code --------- Co-authored-by: mtgu0705 --- example/01_gemm/CMakeLists.txt | 1 + .../gemm_xdl_fp16_pk_i4_v3_b_scale.cpp | 357 +++ ..._gemm_pipeline_xdlops_b_scale_selector.hpp | 167 ++ ...ckwise_gemm_pipeline_xdlops_v1_b_scale.hpp | 403 +++ ...ckwise_gemm_pipeline_xdlops_v2_b_scale.hpp | 1248 ++++++++++ ...ckwise_gemm_pipeline_xdlops_v3_b_scale.hpp | 530 ++++ ...ckwise_gemm_pipeline_xdlops_v4_b_scale.hpp | 686 +++++ .../gpu/device/device_gemm_v2.hpp | 37 + .../device_gemm_xdl_cshuffle_v3_b_scale.hpp | 781 ++++++ .../element/unary_element_wise_operation.hpp | 71 +- .../gridwise_gemm_xdl_cshuffle_v3_b_scale.hpp | 2208 +++++++++++++++++ .../threadwise_tensor_slice_transfer.hpp | 200 ++ include/ck/utility/amd_inline_asm.hpp | 6 +- include/ck/utility/data_type.hpp | 2 + .../gpu/gemm_b_scale.hpp | 91 + .../gpu/gemm_b_scale/CMakeLists.txt | 10 + ...e_gemm_b_scale_xdl_f16_i4_f16_mk_nk_mn.hpp | 105 + ...4_f16_mk_nk_mn_mem_v2_default_instance.cpp | 32 + .../profiler/profile_gemm_b_scale_impl.hpp | 448 ++++ profiler/src/CMakeLists.txt | 2 + profiler/src/profile_gemm_b_scale.cpp | 181 ++ 21 files changed, 7562 insertions(+), 4 deletions(-) create mode 100644 example/01_gemm/gemm_xdl_fp16_pk_i4_v3_b_scale.cpp create mode 100644 include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_scale_selector.hpp create mode 100644 
include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v1_b_scale.hpp create mode 100644 include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v2_b_scale.hpp create mode 100644 include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v3_b_scale.hpp create mode 100644 include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v4_b_scale.hpp create mode 100644 include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3_b_scale.hpp create mode 100644 include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_b_scale.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/gemm_b_scale.hpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_b_scale/CMakeLists.txt create mode 100644 library/src/tensor_operation_instance/gpu/gemm_b_scale/device_gemm_b_scale_xdl_f16_i4_f16/device_gemm_b_scale_xdl_f16_i4_f16_mk_nk_mn.hpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_b_scale/device_gemm_b_scale_xdl_f16_i4_f16/device_gemm_b_scale_xdl_f16_i4_f16_mk_nk_mn_mem_v2_default_instance.cpp create mode 100644 profiler/include/profiler/profile_gemm_b_scale_impl.hpp create mode 100644 profiler/src/profile_gemm_b_scale.cpp diff --git a/example/01_gemm/CMakeLists.txt b/example/01_gemm/CMakeLists.txt index 354e443b3..d6df1514b 100755 --- a/example/01_gemm/CMakeLists.txt +++ b/example/01_gemm/CMakeLists.txt @@ -30,6 +30,7 @@ add_example_executable(example_gemm_xdl_fp8_v3 gemm_xdl_fp8_v3.cpp) add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp8_v3) add_example_executable(example_gemm_xdl_fp16_fp8_v3 gemm_xdl_fp16_fp8_v3.cpp) add_example_executable(example_gemm_xdl_fp16_pk_i4_v3 gemm_xdl_fp16_pk_i4_v3.cpp) +add_example_executable(example_gemm_xdl_fp16_pk_i4_v3_b_scale gemm_xdl_fp16_pk_i4_v3_b_scale.cpp) add_example_executable(example_gemm_xdl_bf16_pk_i4_v3 gemm_xdl_bf16_pk_i4_v3.cpp) add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp16_fp8_v3) 
add_example_executable(example_gemm_xdl_bf16_v3 gemm_xdl_bf16_v3.cpp) diff --git a/example/01_gemm/gemm_xdl_fp16_pk_i4_v3_b_scale.cpp b/example/01_gemm/gemm_xdl_fp16_pk_i4_v3_b_scale.cpp new file mode 100644 index 000000000..c8a40baa8 --- /dev/null +++ b/example/01_gemm/gemm_xdl_fp16_pk_i4_v3_b_scale.cpp @@ -0,0 +1,357 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +#include "common.hpp" + +#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3_b_scale.hpp" + +using ADataType = ck::half_t; +using BDataType = ck::pk_i4_t; +using BScaleDataType = ck::half_t; +using AccDataType = float; +using CShuffleDataType = ck::half_t; +using CDataType = ck::half_t; + +using ALayout = Row; +using BLayout = Col; +using CLayout = Row; + +using AElementOp = PassThrough; +using BElementOp = PassThrough; +using CElementOp = PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +static constexpr bool PermuteA = false; +static constexpr bool PermuteB = true; + +static constexpr ck::index_t Scale_Block_N = 1; +static constexpr ck::index_t Scale_Block_K = 128; + +static constexpr ck::index_t KPerBlock = 64; + +// clang-format off +using DeviceGemmV2Instance = + ck::tensor_operation::device::DeviceGemm_Xdl_CShuffleV3< + ALayout, BLayout, CLayout, + ADataType, BDataType, BScaleDataType, CDataType, AccDataType, CShuffleDataType, + AElementOp, BElementOp, CElementOp, GemmDefault, + 256, Scale_Block_N, Scale_Block_K, + 128, 128, + KPerBlock, 8, 32, + 32, 32, + 4, 1, + S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, + 2, 8, 8, 0, + S<2, 128, 1>, S<1, 0, 2>, S<1, 0, 2>, + 2, 32, 32, 0, + 1, 1, S<1, 32, 1, 8>, 8, + ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v3, CDataType, CDataType, PermuteA, PermuteB>; + +// clang-format on + +using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm; +template +bool run_gemm(const 
ProblemType& problem_size, const ExecutionConfig& config) +{ + using namespace ck::literals; + + auto M = problem_size.M; + auto N = problem_size.N; + auto K = problem_size.K; + auto StrideA = problem_size.StrideA; + auto StrideB = problem_size.StrideB; + auto StrideC = problem_size.StrideC; + auto KBatch = problem_size.KBatch; + + auto f_host_tensor_descriptor = + [](std::size_t row, std::size_t col, std::size_t stride, auto layout) { + if constexpr(std::is_same_v) + { + return HostTensorDescriptor({row, col}, {stride, 1_uz}); + } + else + { + return HostTensorDescriptor({row, col}, {1_uz, stride}); + } + }; + + auto f_get_default_stride = + [](std::size_t row, std::size_t col, ck::index_t stride, auto layout) { + if(stride == -1) + { + // give a chance if stride is -1, return a default packed stride + if constexpr(std::is_same_v) + { + return static_cast(col); + } + else + { + return static_cast(row); + } + } + else + return static_cast(stride); + }; + + ck::index_t Scale_Stride_BN = (K + Scale_Block_K - 1) / Scale_Block_K; + + StrideA = f_get_default_stride(M, K, StrideA, ALayout{}); + StrideB = f_get_default_stride(K, N, StrideB, BLayout{}); + StrideC = f_get_default_stride(M, N, StrideC, CLayout{}); + + Tensor a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{})); + Tensor b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{})); + Tensor b_k_n_permute(f_host_tensor_descriptor(K, N, StrideB, BLayout{})); + Tensor b1_k_n(f_host_tensor_descriptor((K + Scale_Block_K - 1) / Scale_Block_K, + (N + Scale_Block_N - 1) / Scale_Block_N, + Scale_Stride_BN, + BLayout{})); + + switch(config.init_method) + { + case 0: + a_m_k.GenerateTensorValue(GeneratorTensor_1{1}); + b_k_n.GenerateTensorValue(GeneratorTensor_1{1}); + b1_k_n.GenerateTensorValue(GeneratorTensor_1{1}); + break; + case 1: + a_m_k.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + b_k_n.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + b1_k_n.GenerateTensorValue(GeneratorTensor_3{0, 1.0}); + break; + case 
2: + a_m_k.GenerateTensorValue(GeneratorTensor_1{1}); + b_k_n.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + b1_k_n.GenerateTensorValue(GeneratorTensor_1{1}); + break; + case 3: + a_m_k.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + b_k_n.GenerateTensorValue(GeneratorTensor_1{1}); + b1_k_n.GenerateTensorValue(GeneratorTensor_1{1}); + break; + case 4: + a_m_k.GenerateTensorValue(GeneratorTensor_1{1}); + b_k_n.GenerateTensorValue(GeneratorTensor_1{1}); + b1_k_n.GenerateTensorValue(GeneratorTensor_3{0, 1.0}); + break; + case 5: + a_m_k.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + b_k_n.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + b1_k_n.GenerateTensorValue(GeneratorTensor_1{1}); + break; + default: + a_m_k.GenerateTensorValue(GeneratorTensor_3{0.5, 0.5}); + b_k_n.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + b1_k_n.GenerateTensorValue(GeneratorTensor_3{0, 1.0}); + } + + Tensor c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{})); + Tensor c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{})); + + std::cout << "a_m_k: " << a_m_k.mDesc << std::endl; + std::cout << "b_k_n: " << b_k_n.mDesc << std::endl; + std::cout << "b1_k_n: " << b1_k_n.mDesc << std::endl; + std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl; + + DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize()); + DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n_permute.mDesc.GetElementSpaceSize()); + DeviceMem b1_scale_device_buf(sizeof(BScaleDataType) * b1_k_n.mDesc.GetElementSpaceSize()); + DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize()); + + // weight permute + if constexpr(PermuteB) + { + int K1 = KPerBlock; + int K0 = K / KPerBlock; + + // int K0, N, K1 + for(int j = 0; j < K0; j++) + { + for(int i = 0; i < N; i++) + { + for(int jj = 0; jj < K1; jj++) + { + b_k_n_permute(j * N * K1 + i * K1 + jj) = b_k_n(i * K + (j * K1 + jj)); + } + } + } + } + else + { + for(int i = 
0; i < N; i++) + { + for(int j = 0; j < K; j++) + { + b_k_n_permute(i * K + j) = b_k_n(i * K + j); + } + } + } + + // vector pk_i4x4 permute + for(int i = 0; i < N; i++) + { + for(int j = 0; j < K; j += 8) + { + int input[8]; + + for(int k = 0; k < 4; k++) + { + int i4x2 = b_k_n_permute(j + k * 2, i).data; + input[k * 2 + 0] = (i4x2 >> 4) & 0xf; + input[k * 2 + 1] = (i4x2 >> 0) & 0xf; + } + + // permute 01234567->20643175 + { + int hi = input[2]; + int lo = input[0]; + int i4x2 = (hi << 4) | lo; + + b_k_n_permute(j + 0, i) = i4x2; + } + + { + int hi = input[6]; + int lo = input[4]; + int i4x2 = (hi << 4) | lo; + + b_k_n_permute(j + 2, i) = i4x2; + } + + { + int hi = input[3]; + int lo = input[1]; + int i4x2 = (hi << 4) | lo; + + b_k_n_permute(j + 4, i) = i4x2; + } + + { + int hi = input[7]; + int lo = input[5]; + int i4x2 = (hi << 4) | lo; + + b_k_n_permute(j + 6, i) = i4x2; + } + } + } + + a_m_k_device_buf.ToDevice(a_m_k.mData.data()); + b_k_n_device_buf.ToDevice(b_k_n_permute.mData.data()); + b1_scale_device_buf.ToDevice(b1_k_n.mData.data()); + DeviceMem workspace; + + auto a_element_op = AElementOp{}; + auto b_element_op = BElementOp{}; + auto c_element_op = CElementOp{}; + + // do GEMM + auto gemm = DeviceGemmV2Instance{}; + auto invoker = gemm.MakeInvoker(); + float ave_time = 0; + + auto argument = + gemm.MakeArgument(static_cast(a_m_k_device_buf.GetDeviceBuffer()), + static_cast(b_k_n_device_buf.GetDeviceBuffer()), + static_cast(c_m_n_device_buf.GetDeviceBuffer()), + M, + N, + K, + StrideA, + StrideB, + StrideC, + Scale_Stride_BN, + static_cast(b1_scale_device_buf.GetDeviceBuffer()), + KBatch, + a_element_op, + b_element_op, + c_element_op); + + if(!gemm.IsSupportedArgument(argument)) + { + std::cerr << gemm.GetTypeString() << " does not support this problem" << std::endl; + + return true; + } + + bool pass = true; + if(config.do_verification) + { + Tensor b_k_n_dequant({K, N}); + + float v_b = 0; + for(int n = 0; n < N; n++) + { + for(int k = 0; k < K; k++) 
+ { + ck::pk_i4_t i4x2 = b_k_n(k, n).data; + int8_t i4 = 0; + if(k % 2 == 1) + i4 = (i4x2.data >> 0) & 0xf; + else + i4 = (i4x2.data >> 4) & 0xf; + i4 = i4 - 8; + v_b = ck::type_convert(i4); + + b_k_n_dequant(k, n) = + ck::type_convert(v_b) * + ck::type_convert(b1_k_n(k / Scale_Block_K, n / Scale_Block_N)); + } + } + + auto ref_gemm = ReferenceGemmInstance{}; + auto ref_invoker = ref_gemm.MakeInvoker(); + + auto ref_argument = ref_gemm.MakeArgument( + a_m_k, b_k_n_dequant, c_m_n_host_result, PassThrough{}, PassThrough{}, PassThrough{}); + + ref_invoker.Run(ref_argument); + + ave_time = invoker.Run(argument, StreamConfig{nullptr, false, 0}); + c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data()); + + pass &= ck::utils::check_err(c_m_n_device_result, + c_m_n_host_result, + "Error: Incorrect results!", + get_rtol(), + get_atol()); + } + + if(config.time_kernel) + { + ave_time = + invoker.Run(argument, StreamConfig{nullptr, config.time_kernel, 0, 20, 50, true, 50}); + + std::size_t flop = 2_uz * M * N * K; + std::size_t num_btype = + sizeof(ADataType) * M * K + + sizeof(BDataType) * K * N / + (ck::is_same_v, ck::pk_i4_t> ? 
2 : 1) + + sizeof(CDataType) * M * N; + + float tflops = static_cast(flop) / 1.E9 / ave_time; + + float gb_per_sec = num_btype / 1.E6 / ave_time; + + std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec + << " GB/s, " << gemm.GetTypeString() << std::endl; + } + return pass; +} + +bool run_gemm_splitk_example(int argc, char* argv[]) +{ + ProblemSizeSplitK problem_size; + ExecutionConfig config; + + return !parse_cmd_args(argc, argv, problem_size, config) || run_gemm(problem_size, config); +} + +int main(int argc, char* argv[]) { return !run_gemm_splitk_example(argc, argv); } diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_scale_selector.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_scale_selector.hpp new file mode 100644 index 000000000..ea0c511da --- /dev/null +++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_scale_selector.hpp @@ -0,0 +1,167 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v1_b_scale.hpp" +#include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v2_b_scale.hpp" +#include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v3_b_scale.hpp" +#include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v4_b_scale.hpp" +#include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v5.hpp" + +namespace ck { + +enum struct BlockGemmPipelineVersion +{ + v1, // Naive + v2, // Mem + v3, // Comp + v4, // Comp, double lds buffer + v5, // Comp, double global prefetch register buffer +}; + +template +constexpr auto BlockGemmPipeline_Selector() +{ + if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v1) + { + return BlockwiseGemmXdlops_pipeline_v1_b_scale{}; + } + else if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v2) + { + return BlockwiseGemmXdlops_pipeline_v2_b_scale{}; + } + else if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v3) + { + return BlockwiseGemmXdlops_pipeline_v3_b_scale{}; + } + else if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v4) + { + return BlockwiseGemmXdlops_pipeline_v4_b_scale{}; + } + else if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v5) + { + return BlockwiseGemmXdlops_pipeline_v5{}; + } + else + { + std::cerr << "BlockGemmPipeline configuration is not available" << std::endl; + } +} + +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v1_b_scale.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v1_b_scale.hpp new file mode 100644 index 000000000..4246f4a44 --- /dev/null +++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v1_b_scale.hpp @@ -0,0 +1,403 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_base.hpp" + +namespace ck { + +// Naive pipeline with lowest resource request per WGP +// GlobalPrefetchStages: 1 +// LocalPreFillStages: 1 +// LocalPreFetchStages: 0 +// LocalSharedMemoryBuffer: 1 + +template +struct BlockwiseGemmXdlops_pipeline_v1_b_scale +{ +}; + +template +struct BlockwiseGemmXdlops_pipeline_v1_b_scale + : BlockwiseGemmXdlops_pipeline_base + +{ + using Base = BlockwiseGemmXdlops_pipeline_base; + using Base::I0; + using Base::KRepeat; + using Base::xdlops_gemm; + + using Base::CalculateCThreadOriginDataIndex; + using Base::CalculateCThreadOriginDataIndex8D; + using Base::GetCBlockDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2; + using Base::GetCBlockDescriptor_M0_N0_M1_N1_M2_M3_M4_N2; + using Base::GetCBlockDescriptor_M0_N0_M1_N1_M2_N2_N3_N4; + using Base::GetCThreadBuffer; + using Base::GetCThreadDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2; + using Base::GetCThreadDescriptor_M0_N0_M1_N1_M2_M3_M4_N2; + using Base::GetCThreadDescriptor_M0_N0_M1_N1_M2_N2_N3_N4; + using Base::MakeCGridDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2; + using Base::MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2; + + using Base::a_block_desc_m0_m1_m2_k; + using Base::b_block_desc_n0_n1_n2_k; + + using Base::AMmaKStride; + using Base::BMmaKStride; + + static constexpr index_t PrefetchStages = 1; + static constexpr index_t PrefillStages = 1; + static constexpr index_t GlobalBufferNum = 1; + + __host__ static constexpr bool BlockHasHotloop(index_t num_loop) + { + return num_loop > PrefetchStages; + } + + __host__ static constexpr TailNumber BlockLoopTailNum(index_t num_loop) + { + ignore = num_loop; + return TailNumber::Full; + } + + template + __device__ void Run( + // ABlockCopy + const AGridDesc& a_grid_desc, + const ABlockDesc& a_block_desc, + ABlockTransfer& a_blockwise_copy, + const AGridBuffer& a_grid_buf, + ABlockBuffer& a_block_buf, + const ABlockTransferStep& a_block_copy_step, + // BBlockCopy + const 
BGridDesc& b_grid_desc, + const BBlockDesc& b_block_desc, + BBlockTransfer& b_blockwise_copy, + const BGridBuffer& b_grid_buf, + BBlockBuffer& b_block_buf, + const BBlockTransferStep& b_block_copy_step, + // CThread + CThreadBuffer& c_thread_buf, + // BScaleThreadCopy + const BScaleGridDesc& b_scale_grid_desc, + const BScaleThreadDesc& b_scale_thread_desc, + BScaleThreadTransfer& b_scale_thread_copy, + const BScaleGridBuffer& b_scale_grid_buf, + const BScaleThreadTransferStep& b_scale_thread_copy_step, + // num_loop + index_t num_loop, + index_t num_loop_per_scale) const + { + // assume kperblock = scaleblockk + ignore = num_loop_per_scale; + auto a_thread_buf = make_static_buffer( + a_thread_desc_.GetElementSpaceSize()); + auto b_thread_buf = make_static_buffer( + b_thread_desc_.GetElementSpaceSize()); + + auto b_scale_thread_buf = make_static_buffer( + b_scale_thread_desc.GetElementSpaceSize()); + + // Global prefetch 1 + a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf); + b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf); + + a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step); + b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step); + + static_for<0, NRepeat, 1>{}([&](auto n0) { + b_scale_thread_copy.Run(b_scale_grid_desc, + b_scale_grid_buf, + b_scale_thread_desc, + make_tuple(n0, I0), + b_scale_thread_buf); + + b_scale_thread_copy.MoveSrcSliceWindow(b_scale_grid_desc, + b_scale_thread_copy_step.At(Number<0>{})); + }); + b_scale_thread_copy.MoveSrcSliceWindow(b_scale_grid_desc, + b_scale_thread_copy_step.At(Number<1>{})); + + // Local prefill 1 + a_blockwise_copy.RunWrite(a_block_desc, a_block_buf); + b_blockwise_copy.RunWrite(b_block_desc, b_block_buf); + + // Initialize C + c_thread_buf.Clear(); + + auto c_thread_buf_per_scale = remove_cvref_t(); + + // main body + if constexpr(HasMainLoop) + { + index_t i = 0; + do + { + // ------------------------------------------------------------------------------------------- + 
a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf); + b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf); + + a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step); + b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step); + + block_sync_lds(); + static_for<0, KRepeat, 1>{}([&](auto k) { + static_for<0, MRepeat, 1>{}([&](auto m0) { + a_thread_copy_.Run(a_block_desc_m0_m1_m2_k, + make_tuple(m0, I0, I0, Number{}), + a_block_buf, + a_thread_desc_, + make_tuple(m0, I0, k, I0), + a_thread_buf); + }); + static_for<0, NRepeat, 1>{}([&](auto n0) { + b_thread_copy_.Run(b_block_desc_n0_n1_n2_k, + make_tuple(n0, I0, I0, Number{}), + b_block_buf, + b_thread_desc_, + make_tuple(n0, I0, k, I0), + b_thread_buf); + }); + }); + + static_for<0, MRepeat, 1>{}([&](auto m0) { + static_for<0, NRepeat, 1>{}([&](auto n0) { + c_thread_buf_per_scale.Clear(); + static_for<0, KRepeat, 1>{}([&](auto k0) { + vector_type a_thread_vec; + vector_type b_thread_vec; + + static_for<0, KPack, 1>{}([&](auto ik) { + a_thread_vec.template AsType()(ik) = + a_thread_buf[Number{}]; + b_thread_vec.template AsType()(ik) = + b_thread_buf[Number{}]; + }); + + using mfma_input_type = + typename vector_type::type; + + xdlops_gemm.template Run<>( + a_thread_vec.template AsType(), + b_thread_vec.template AsType(), + c_thread_buf_per_scale.GetVectorTypeReference(I0)); + }); + static_for<0, xdlops_gemm.GetRegSizePerXdlops(), 1>{}([&](auto t) { + constexpr index_t c_offset = + c_thread_desc_.CalculateOffset(make_tuple(m0, n0, t)); + c_thread_buf(Number{}) += + c_thread_buf_per_scale[Number{}] * + type_convert(b_scale_thread_buf[n0]); + }); + }); + }); + + static_for<0, NRepeat, 1>{}([&](auto n0) { + b_scale_thread_copy.Run(b_scale_grid_desc, + b_scale_grid_buf, + b_scale_thread_desc, + make_tuple(n0, I0), + b_scale_thread_buf); + + b_scale_thread_copy.MoveSrcSliceWindow( + b_scale_grid_desc, b_scale_thread_copy_step.At(Number<0>{})); + }); + + 
b_scale_thread_copy.MoveSrcSliceWindow(b_scale_grid_desc, + b_scale_thread_copy_step.At(Number<1>{})); + + block_sync_lds(); + a_blockwise_copy.RunWrite(a_block_desc, a_block_buf); + b_blockwise_copy.RunWrite(b_block_desc, b_block_buf); + + i += 1; + + } while(i < (num_loop - 1)); + } + + // tail + if constexpr(TailNum == TailNumber::Full) + { + block_sync_lds(); + static_for<0, KRepeat, 1>{}([&](auto k) { + static_for<0, MRepeat, 1>{}([&](auto m0) { + a_thread_copy_.Run(a_block_desc_m0_m1_m2_k, + make_tuple(m0, I0, I0, Number{}), + a_block_buf, + a_thread_desc_, + make_tuple(m0, I0, k, I0), + a_thread_buf); + }); + static_for<0, NRepeat, 1>{}([&](auto n0) { + b_thread_copy_.Run(b_block_desc_n0_n1_n2_k, + make_tuple(n0, I0, I0, Number{}), + b_block_buf, + b_thread_desc_, + make_tuple(n0, I0, k, I0), + b_thread_buf); + }); + }); + + static_for<0, MRepeat, 1>{}([&](auto m0) { + static_for<0, NRepeat, 1>{}([&](auto n0) { + c_thread_buf_per_scale.Clear(); + static_for<0, KRepeat, 1>{}([&](auto k0) { + vector_type a_thread_vec; + vector_type b_thread_vec; + + static_for<0, KPack, 1>{}([&](auto ik) { + a_thread_vec.template AsType()(ik) = + a_thread_buf[Number{}]; + b_thread_vec.template AsType()(ik) = + b_thread_buf[Number{}]; + }); + + using mfma_input_type = + typename vector_type::type; + + xdlops_gemm.template Run<>( + a_thread_vec.template AsType(), + b_thread_vec.template AsType(), + c_thread_buf_per_scale.GetVectorTypeReference(I0)); + }); + static_for<0, xdlops_gemm.GetRegSizePerXdlops(), 1>{}([&](auto t) { + constexpr index_t c_offset = + c_thread_desc_.CalculateOffset(make_tuple(m0, n0, t)); + c_thread_buf(Number{}) += + c_thread_buf_per_scale[Number{}] * + type_convert(b_scale_thread_buf[n0]); + }); + }); + }); + } + } + + protected: + using Base::a_thread_copy_; + using Base::a_thread_desc_; + using Base::b_thread_copy_; + using Base::b_thread_desc_; + using Base::c_thread_desc_; +}; + +} // namespace ck diff --git 
a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v2_b_scale.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v2_b_scale.hpp new file mode 100644 index 000000000..776f66dbb --- /dev/null +++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v2_b_scale.hpp @@ -0,0 +1,1248 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_base.hpp" + +namespace ck { + +// Maximum Global Memory throughput pipeline with >=32KB data in fly +// GlobalPrefetchStages: >=2 +// LocalPreFillStages: 1 +// LocalPreFetchStages: 0 +// LocalSharedMemoryBuffer: 1 + +template +struct BlockwiseGemmXdlops_pipeline_v2_b_scale +{ +}; + +template +struct BlockwiseGemmXdlops_pipeline_v2_b_scale + : BlockwiseGemmXdlops_pipeline_base + +{ + using Base = BlockwiseGemmXdlops_pipeline_base; + using Base::I0; + using Base::KRepeat; + using Base::xdlops_gemm; + + using Base::CalculateCThreadOriginDataIndex; + using Base::CalculateCThreadOriginDataIndex8D; + using Base::GetCBlockDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2; + using Base::GetCBlockDescriptor_M0_N0_M1_N1_M2_M3_M4_N2; + using Base::GetCBlockDescriptor_M0_N0_M1_N1_M2_N2_N3_N4; + using Base::GetCThreadBuffer; + using Base::GetCThreadDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2; + using Base::GetCThreadDescriptor_M0_N0_M1_N1_M2_M3_M4_N2; + using Base::GetCThreadDescriptor_M0_N0_M1_N1_M2_N2_N3_N4; + using Base::MakeCGridDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2; + using Base::MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2; + + using Base::a_block_desc_m0_m1_m2_k; + using Base::b_block_desc_n0_n1_n2_k; + + using Base::AMmaKStride; + using Base::BMmaKStride; + + static constexpr index_t WgpPerCU = + (4 * warpSize / BlockSize) >= 1 ? 
4 * warpSize / BlockSize : 1; + static constexpr index_t FullMemBandPrefetchStages = math::integer_divide_ceil( + 32768 / WgpPerCU, + (MPerBlock * sizeof(ADataType) + NPerBlock * sizeof(BDataType)) * KPerBlock); + static constexpr index_t PrefetchStages = + FullMemBandPrefetchStages >= 2 + ? FullMemBandPrefetchStages <= 8 ? FullMemBandPrefetchStages : 8 + : 2; + + static constexpr index_t PrefillStages = 1; + static constexpr index_t GlobalBufferNum = PrefetchStages; + + __host__ __device__ static constexpr bool BlockHasHotloop(index_t num_loop) + { + return num_loop > PrefetchStages; + } + + __host__ __device__ static constexpr TailNumber BlockLoopTailNum(index_t num_loop) + { + if(num_loop % PrefetchStages == 1) + { + return TailNumber::One; + } + else if(num_loop % PrefetchStages == 2) + { + return TailNumber::Two; + } + else if(num_loop % PrefetchStages == 3) + { + return TailNumber::Three; + } + else if(num_loop % PrefetchStages == 4) + { + return TailNumber::Four; + } + else if(num_loop % PrefetchStages == 5) + { + return TailNumber::Five; + } + else if(num_loop % PrefetchStages == 6) + { + return TailNumber::Six; + } + else if(num_loop % PrefetchStages == 7) + { + return TailNumber::Seven; + } + else + { + return TailNumber::Full; + } + } + + template + __device__ void Run(const AGridDesc& a_grid_desc, + const ABlockDesc& a_block_desc, + ABlockTransfer& a_blockwise_copy, + const AGridBuffer& a_grid_buf, + ABlockBuffer& a_block_buf, + const ABlockTransferStep& a_block_copy_step, + const BGridDesc& b_grid_desc, + const BBlockDesc& b_block_desc, + BBlockTransfer& b_blockwise_copy, + const BGridBuffer& b_grid_buf, + BBlockBuffer& b_block_buf, + const BBlockTransferStep& b_block_copy_step, + CThreadBuffer& c_thread_buf, + index_t num_loop) const + { + auto a_thread_buf = make_static_buffer( + a_thread_desc_.GetElementSpaceSize()); + auto b_thread_buf = make_static_buffer( + b_thread_desc_.GetElementSpaceSize()); + + // Global prefetch 1 + 
a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf, I0); + b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf, I0); + + a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step); + b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step); + + // Initialize C + c_thread_buf.Clear(); + + // Local prefill 1 + a_blockwise_copy.RunWrite(a_block_desc, a_block_buf, I0); + b_blockwise_copy.RunWrite(b_block_desc, b_block_buf, I0); + + // Global prefetch [2, PrefetchStages] + static_for<1, PrefetchStages, 1>{}([&](auto iprefetch) { + a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf, iprefetch); + b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf, iprefetch); + + a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step); + b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step); + }); + + // main body + if constexpr(HasMainLoop) + { + index_t i = 0; + do + { + static_for<0, PrefetchStages, 1>{}([&](auto iprefetch) { + // ------------------------------------------------------------------------------------------- + block_sync_lds(); + static_for<0, KRepeat, 1>{}([&](auto k) { + static_for<0, MRepeat, 1>{}([&](auto m0) { + a_thread_copy_.Run(a_block_desc_m0_m1_m2_k, + make_tuple(m0, I0, I0, Number{}), + a_block_buf, + a_thread_desc_, + make_tuple(m0, I0, k, I0), + a_thread_buf); + static_for<0, NRepeat, 1>{}([&](auto n0) { + b_thread_copy_.Run( + b_block_desc_n0_n1_n2_k, + make_tuple(n0, I0, I0, Number{}), + b_block_buf, + b_thread_desc_, + make_tuple(n0, I0, k, I0), + b_thread_buf); + }); + }); + }); + + static_for<0, KRepeat, 1>{}([&](auto k0) { + static_for<0, MRepeat, 1>{}([&](auto m0) { + static_for<0, NRepeat, 1>{}([&](auto n0) { + vector_type a_thread_vec; + vector_type b_thread_vec; + + static_for<0, KPack, 1>{}([&](auto ik) { + a_thread_vec.template AsType()(ik) = + a_thread_buf[Number{}]; + b_thread_vec.template AsType()(ik) = + b_thread_buf[Number{}]; + }); + + using mfma_input_type = + typename vector_type::type; + + constexpr 
index_t c_offset = + c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0)); + + xdlops_gemm.Run( + a_thread_vec.template AsType(), + b_thread_vec.template AsType(), + c_thread_buf.GetVectorTypeReference(Number{})); + }); + }); + }); + + block_sync_lds(); + a_blockwise_copy.RunWrite( + a_block_desc, a_block_buf, Number<(iprefetch + 1) % PrefetchStages>{}); + b_blockwise_copy.RunWrite( + b_block_desc, b_block_buf, Number<(iprefetch + 1) % PrefetchStages>{}); + + a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf, iprefetch); + b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf, iprefetch); + + a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step); + b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step); + }); + + i += PrefetchStages; + } while(i < (num_loop - PrefetchStages)); + } + + // tail + + auto LoopTailFunc = [&](auto tail_num) { + static_for<1, tail_num, 1>{}([&](auto iprefetch) { + block_sync_lds(); + static_for<0, KRepeat, 1>{}([&](auto k) { + static_for<0, MRepeat, 1>{}([&](auto m0) { + a_thread_copy_.Run(a_block_desc_m0_m1_m2_k, + make_tuple(m0, I0, I0, Number{}), + a_block_buf, + a_thread_desc_, + make_tuple(m0, I0, k, I0), + a_thread_buf); + static_for<0, NRepeat, 1>{}([&](auto n0) { + b_thread_copy_.Run(b_block_desc_n0_n1_n2_k, + make_tuple(n0, I0, I0, Number{}), + b_block_buf, + b_thread_desc_, + make_tuple(n0, I0, k, I0), + b_thread_buf); + }); + }); + }); + + static_for<0, KRepeat, 1>{}([&](auto k0) { + static_for<0, MRepeat, 1>{}([&](auto m0) { + static_for<0, NRepeat, 1>{}([&](auto n0) { + vector_type a_thread_vec; + vector_type b_thread_vec; + + static_for<0, KPack, 1>{}([&](auto ik) { + a_thread_vec.template AsType()(ik) = + a_thread_buf[Number{}]; + b_thread_vec.template AsType()(ik) = + b_thread_buf[Number{}]; + }); + + using mfma_input_type = + typename vector_type::type; + + constexpr index_t c_offset = + c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0)); + + xdlops_gemm.Run( + a_thread_vec.template AsType(), + 
b_thread_vec.template AsType(), + c_thread_buf.GetVectorTypeReference(Number{})); + }); + }); + }); + + block_sync_lds(); + a_blockwise_copy.RunWrite(a_block_desc, a_block_buf, iprefetch); + b_blockwise_copy.RunWrite(b_block_desc, b_block_buf, iprefetch); + }); + + block_sync_lds(); + static_for<0, KRepeat, 1>{}([&](auto k) { + static_for<0, MRepeat, 1>{}([&](auto m0) { + a_thread_copy_.Run(a_block_desc_m0_m1_m2_k, + make_tuple(m0, I0, I0, Number{}), + a_block_buf, + a_thread_desc_, + make_tuple(m0, I0, k, I0), + a_thread_buf); + static_for<0, NRepeat, 1>{}([&](auto n0) { + b_thread_copy_.Run(b_block_desc_n0_n1_n2_k, + make_tuple(n0, I0, I0, Number{}), + b_block_buf, + b_thread_desc_, + make_tuple(n0, I0, k, I0), + b_thread_buf); + }); + }); + }); + + static_for<0, KRepeat, 1>{}([&](auto k0) { + static_for<0, MRepeat, 1>{}([&](auto m0) { + static_for<0, NRepeat, 1>{}([&](auto n0) { + vector_type a_thread_vec; + vector_type b_thread_vec; + + static_for<0, KPack, 1>{}([&](auto ik) { + a_thread_vec.template AsType()(ik) = + a_thread_buf[Number{}]; + b_thread_vec.template AsType()(ik) = + b_thread_buf[Number{}]; + }); + + using mfma_input_type = + typename vector_type::type; + + constexpr index_t c_offset = + c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0)); + + xdlops_gemm.Run(a_thread_vec.template AsType(), + b_thread_vec.template AsType(), + c_thread_buf.GetVectorTypeReference(Number{})); + }); + }); + }); + }; + + if constexpr(TailNum == TailNumber::One) + { + block_sync_lds(); + static_for<0, KRepeat, 1>{}([&](auto k) { + static_for<0, MRepeat, 1>{}([&](auto m0) { + a_thread_copy_.Run(a_block_desc_m0_m1_m2_k, + make_tuple(m0, I0, I0, Number{}), + a_block_buf, + a_thread_desc_, + make_tuple(m0, I0, k, I0), + a_thread_buf); + static_for<0, NRepeat, 1>{}([&](auto n0) { + b_thread_copy_.Run(b_block_desc_n0_n1_n2_k, + make_tuple(n0, I0, I0, Number{}), + b_block_buf, + b_thread_desc_, + make_tuple(n0, I0, k, I0), + b_thread_buf); + }); + }); + }); + + static_for<0, 
KRepeat, 1>{}([&](auto k0) { + static_for<0, MRepeat, 1>{}([&](auto m0) { + static_for<0, NRepeat, 1>{}([&](auto n0) { + vector_type a_thread_vec; + vector_type b_thread_vec; + + static_for<0, KPack, 1>{}([&](auto ik) { + a_thread_vec.template AsType()(ik) = + a_thread_buf[Number{}]; + b_thread_vec.template AsType()(ik) = + b_thread_buf[Number{}]; + }); + + using mfma_input_type = + typename vector_type::type; + + constexpr index_t c_offset = + c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0)); + + xdlops_gemm.Run(a_thread_vec.template AsType(), + b_thread_vec.template AsType(), + c_thread_buf.GetVectorTypeReference(Number{})); + }); + }); + }); + } + else if constexpr(TailNum == TailNumber::Two) + { + LoopTailFunc(Number<2>{}); + } + else if constexpr(TailNum == TailNumber::Three) + { + LoopTailFunc(Number<3>{}); + } + else if constexpr(TailNum == TailNumber::Four) + { + LoopTailFunc(Number<4>{}); + } + else if constexpr(TailNum == TailNumber::Five) + { + LoopTailFunc(Number<5>{}); + } + else if constexpr(TailNum == TailNumber::Six) + { + LoopTailFunc(Number<6>{}); + } + else if constexpr(TailNum == TailNumber::Seven) + { + LoopTailFunc(Number<7>{}); + } + else if constexpr(TailNum == TailNumber::Full) + { + LoopTailFunc(Number{}); + } + } + + protected: + using Base::a_thread_copy_; + using Base::a_thread_desc_; + using Base::b_thread_copy_; + using Base::b_thread_desc_; + using Base::c_thread_desc_; +}; + +template +struct BlockwiseGemmXdlops_pipeline_v2_b_scale + : BlockwiseGemmXdlops_pipeline_base + +{ + using Base = BlockwiseGemmXdlops_pipeline_base; + using Base::A_K1; + using Base::B_K1; + using Base::I0; + using Base::I1; + using Base::KPerThread; + using Base::xdlops_gemm; + + using Base::CalculateCThreadOriginDataIndex; + using Base::CalculateCThreadOriginDataIndex8D; + using Base::GetCBlockDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2; + using Base::GetCBlockDescriptor_M0_N0_M1_N1_M2_M3_M4_N2; + using Base::GetCBlockDescriptor_M0_N0_M1_N1_M2_N2_N3_N4; + 
using Base::GetCThreadBuffer; + using Base::GetCThreadDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2; + using Base::GetCThreadDescriptor_M0_N0_M1_N1_M2_M3_M4_N2; + using Base::GetCThreadDescriptor_M0_N0_M1_N1_M2_N2_N3_N4; + using Base::MakeCGridDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2; + using Base::MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2; + + using Base::a_block_desc_m0_m1_m2_k; + using Base::b_block_desc_n0_n1_n2_k; + + static constexpr index_t NumMacClusters = CK_EXPERIMENTAL_INTER_WAVE_SCHEDULING_MAC_CLUSTERS; + static constexpr index_t KPerInnerLoop = math::max(KPerThread / NumMacClusters, KPack); + static constexpr index_t KRepeat = KPerThread / KPerInnerLoop; + + static constexpr index_t WgpPerCU = + (4 * warpSize / BlockSize) >= 1 ? 4 * warpSize / BlockSize : 1; + static constexpr index_t FullMemBandPrefetchStages = math::integer_divide_ceil( + 32768 / WgpPerCU, + (MPerBlock * sizeof(ADataType) + NPerBlock * sizeof(BDataType)) * KPerBlock); + static constexpr index_t PrefetchStages = + FullMemBandPrefetchStages >= 2 + ? FullMemBandPrefetchStages <= 8 ? 
FullMemBandPrefetchStages : 8 + : 2; + + static constexpr index_t PrefillStages = 1; + static constexpr index_t GlobalBufferNum = PrefetchStages; + + __host__ __device__ static constexpr bool BlockHasHotloop(index_t num_loop) + { + return num_loop > PrefetchStages; + } + + __host__ __device__ static constexpr TailNumber BlockLoopTailNum(index_t num_loop) + { + if(num_loop % PrefetchStages == 1) + { + return TailNumber::One; + } + else if(num_loop % PrefetchStages == 2) + { + return TailNumber::Two; + } + else if(num_loop % PrefetchStages == 3) + { + return TailNumber::Three; + } + else if(num_loop % PrefetchStages == 4) + { + return TailNumber::Four; + } + else if(num_loop % PrefetchStages == 5) + { + return TailNumber::Five; + } + else if(num_loop % PrefetchStages == 6) + { + return TailNumber::Six; + } + else if(num_loop % PrefetchStages == 7) + { + return TailNumber::Seven; + } + else + { + return TailNumber::Full; + } + } + + template + __device__ void Run(const AGridDesc& a_grid_desc, + const ABlockDesc& a_block_desc, + ABlockTransfer& a_blockwise_copy, + const AGridBuffer& a_grid_buf, + ABlockBuffer& a_block_buf, + const ABlockTransferStep& a_block_copy_step, + const BGridDesc& b_grid_desc, + const BBlockDesc& b_block_desc, + BBlockTransfer& b_blockwise_copy, + const BGridBuffer& b_grid_buf, + BBlockBuffer& b_block_buf, + const BBlockTransferStep& b_block_copy_step, + CThreadBuffer& c_thread_buf, + const BScaleGridDesc& b_scale_grid_desc, + // BScaleThreadCopy + const BScaleThreadDesc& b_scale_thread_desc, + BScaleThreadTransfer& b_scale_thread_copy, + const BScaleGridBuffer& b_scale_grid_buf, + const BScaleThreadTransferStep& b_scale_thread_copy_step, + // num loop + index_t num_loop, + index_t num_loop_per_scale) const + { + ignore = num_loop_per_scale; + + auto a_thread_buf = make_static_buffer( + a_thread_desc_.GetElementSpaceSize()); + auto b_thread_buf = make_static_buffer( + b_thread_desc_.GetElementSpaceSize()); + + auto b_scale_thread_buf = 
make_static_buffer( + b_scale_thread_desc.GetElementSpaceSize()); + + // Global prefetch 1 + a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf, I0); + b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf, I0); + + a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step); + b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step); + + static_for<0, NRepeat, 1>{}([&](auto n0) { + b_scale_thread_copy.Run(b_scale_grid_desc, + b_scale_grid_buf, + b_scale_thread_desc, + make_tuple(n0, I0), + b_scale_thread_buf); + + b_scale_thread_copy.MoveSrcSliceWindow(b_scale_grid_desc, + b_scale_thread_copy_step.At(Number<0>{})); + }); + b_scale_thread_copy.MoveSrcSliceWindow(b_scale_grid_desc, + b_scale_thread_copy_step.At(Number<1>{})); + + // Initialize C + c_thread_buf.Clear(); + + // Local prefill 1 + a_blockwise_copy.RunWrite(a_block_desc, a_block_buf, I0); + b_blockwise_copy.RunWrite(b_block_desc, b_block_buf, I0); + + // Global prefetch [2, PrefetchStages] + static_for<1, PrefetchStages, 1>{}([&](auto iprefetch) { + a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf, iprefetch); + b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf, iprefetch); + + a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step); + b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step); + }); + + auto c_thread_buf_per_scale = remove_cvref_t(); // need? 
+ + // main body + if constexpr(HasMainLoop) + { + index_t i = 0; + do + { + static_for<0, PrefetchStages, 1>{}([&](auto iprefetch) { + // ------------------------------------------------------------------------------------------- + block_sync_lds(); + static_for<0, KRepeat, 1>{}([&](auto k0) { + static_for<0, MRepeat, 1>{}([&](auto m0) { + a_thread_copy_.Run(a_block_desc_m0_m1_m2_k, + make_tuple(m0, I0, I0, Number{}), + a_block_buf, + a_thread_desc_, + make_tuple(m0, I0, k0, I0), + a_thread_buf); + static_for<0, NRepeat, 1>{}([&](auto n0) { + b_thread_copy_.Run( + b_block_desc_n0_n1_n2_k, + make_tuple(n0, I0, I0, Number{}), + b_block_buf, + b_thread_desc_, + make_tuple(n0, I0, k0, I0), + b_thread_buf); + }); + }); + __builtin_amdgcn_sched_barrier(0); + // NOTE: Synchronize threads in a workgroup at the start of each MAC + // cluster, but except the first, as we can shorten non-MAC cluster a bit + // and there's no observable negative impact. The desired effect is waves in + // a workgroup executing MAC in sync. This avoids some out-of-sync waves + // hijacking MAC resource from other workgroups and reducing the chance of + // latency hiding by waiting for the rest of the workgroup at the eventual + // sync point. 
+ if constexpr(k0.value != 0 || KRepeat == 1) + { + __builtin_amdgcn_s_barrier(); + __builtin_amdgcn_sched_barrier(0); + } + static_for<0, KPerInnerLoop, KPack>{}([&](auto k_) { + static_for<0, MRepeat, 1>{}([&](auto m0) { + static_for<0, NRepeat, 1>{}([&](auto n0) { + vector_type a_thread_vec; + vector_type b_thread_vec; + + static_for<0, KPack, 1>{}([&](auto ik) { + a_thread_vec.template AsType()(ik) = + a_thread_buf[Number{}]; + b_thread_vec.template AsType()(ik) = + b_thread_buf[Number{}]; + }); + + using mfma_input_type = + typename vector_type::type; + + constexpr index_t c_offset = + c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0)); + + // The block_sync_lds() here performs double duty: + // A) safeguard against data hazard because barrier from + // blockwise_gemm is moved here B) reduce VMEM FIFO congestion + // by applying small delays to different wavefronts It is + // performed near the end of MAC cluster to minimize lgkmcnt + // penalty + if constexpr(k0.value == KRepeat - 1 && + k_.value == KPerInnerLoop - KPack && + m0.value == MRepeat - 1 && n0.value == NRepeat - 1) + { + __builtin_amdgcn_sched_barrier(0); + block_sync_lds(); + __builtin_amdgcn_sched_barrier(0); + } + xdlops_gemm.Run( + a_thread_vec.template AsType(), + b_thread_vec.template AsType(), + c_thread_buf.GetVectorTypeReference(Number{})); + if constexpr(k_.value == 0 && m0.value == 0 && n0.value == 0) + { + __builtin_amdgcn_sched_barrier(0); + __builtin_amdgcn_s_setprio(1); + __builtin_amdgcn_sched_barrier(0); + } + }); + + // static_for<0, xdlops_gemm.GetRegSizePerXdlops(), 1>{}([&](auto t) + // { + // constexpr index_t c_offset = + // c_thread_desc_.CalculateOffset(make_tuple(m0, n0, t)); + // c_thread_buf(Number{}) += + // c_thread_buf_per_scale[Number{}] * + // type_convert(b_scale_thread_buf[n0]); + // }); + }); + }); + __builtin_amdgcn_sched_barrier(0); + __builtin_amdgcn_s_setprio(0); + __builtin_amdgcn_sched_barrier(0); + }); + + // static_for<0, NRepeat, 1>{}([&](auto n0) { 
+ // b_scale_thread_copy.Run(b_scale_grid_desc, + // b_scale_grid_buf, + // b_scale_thread_desc, + // make_tuple(n0, I0), + // b_scale_thread_buf); + + // b_scale_thread_copy.MoveSrcSliceWindow( + // b_scale_grid_desc, b_scale_thread_copy_step.At(Number<0>{})); + // }); + // b_scale_thread_copy.MoveSrcSliceWindow(b_scale_grid_desc, + // b_scale_thread_copy_step.At(Number<1>{})); + + // block_sync_lds(); + a_blockwise_copy.RunWrite( + a_block_desc, a_block_buf, Number<(iprefetch + 1) % PrefetchStages>{}); + b_blockwise_copy.RunWrite( + b_block_desc, b_block_buf, Number<(iprefetch + 1) % PrefetchStages>{}); + + a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf, iprefetch); + b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf, iprefetch); + + a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step); + b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step); + }); + i += PrefetchStages; + } while(i < (num_loop - PrefetchStages)); + } + + // tail + + auto LoopTailFunc = [&](auto tail_num) { + static_for<1, tail_num, 1>{}([&](auto iprefetch) { + block_sync_lds(); + static_for<0, KRepeat, 1>{}([&](auto k0) { + static_for<0, MRepeat, 1>{}([&](auto m0) { + a_thread_copy_.Run(a_block_desc_m0_m1_m2_k, + make_tuple(m0, I0, I0, Number{}), + a_block_buf, + a_thread_desc_, + make_tuple(m0, I0, k0, I0), + a_thread_buf); + static_for<0, NRepeat, 1>{}([&](auto n0) { + b_thread_copy_.Run(b_block_desc_n0_n1_n2_k, + make_tuple(n0, I0, I0, Number{}), + b_block_buf, + b_thread_desc_, + make_tuple(n0, I0, k0, I0), + b_thread_buf); + }); + }); + + __builtin_amdgcn_sched_barrier(0); + if constexpr(k0.value != 0 || KRepeat == 1) + { + __builtin_amdgcn_s_barrier(); + __builtin_amdgcn_sched_barrier(0); + } + static_for<0, KPerInnerLoop, KPack>{}([&](auto k_) { + static_for<0, MRepeat, 1>{}([&](auto m0) { + static_for<0, NRepeat, 1>{}([&](auto n0) { + vector_type a_thread_vec; + vector_type b_thread_vec; + + static_for<0, KPack, 1>{}([&](auto ik) { + a_thread_vec.template 
AsType()(ik) = + a_thread_buf[Number{}]; + b_thread_vec.template AsType()(ik) = + b_thread_buf[Number{}]; + }); + + using mfma_input_type = + typename vector_type::type; + + constexpr index_t c_offset = + c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0)); + + if constexpr(k0.value == KRepeat - 1 && + k_.value == KPerInnerLoop - KPack && + m0.value == MRepeat - 1 && n0.value == NRepeat - 1) + { + __builtin_amdgcn_sched_barrier(0); + block_sync_lds(); + __builtin_amdgcn_sched_barrier(0); + } + xdlops_gemm.Run( + a_thread_vec.template AsType(), + b_thread_vec.template AsType(), + c_thread_buf.GetVectorTypeReference(Number{})); + if constexpr(k_.value == 0 && m0.value == 0 && n0.value == 0) + { + __builtin_amdgcn_sched_barrier(0); + __builtin_amdgcn_s_setprio(1); + __builtin_amdgcn_sched_barrier(0); + } + }); + + // static_for<0, xdlops_gemm.GetRegSizePerXdlops(), 1>{}([&](auto t) { + // constexpr index_t c_offset = + // c_thread_desc_.CalculateOffset(make_tuple(m0, n0, t)); + // c_thread_buf(Number{}) += + // c_thread_buf_per_scale[Number{}] * + // type_convert(b_scale_thread_buf[n0]); + // }); + }); + }); + __builtin_amdgcn_sched_barrier(0); + __builtin_amdgcn_s_setprio(0); + __builtin_amdgcn_sched_barrier(0); + }); + + // static_for<0, NRepeat, 1>{}([&](auto n0) { + // b_scale_thread_copy.Run(b_scale_grid_desc, + // b_scale_grid_buf, + // b_scale_thread_desc, + // make_tuple(n0, I0), + // b_scale_thread_buf); + + // b_scale_thread_copy.MoveSrcSliceWindow( + // b_scale_grid_desc, b_scale_thread_copy_step.At(Number<0>{})); + // }); + // b_scale_thread_copy.MoveSrcSliceWindow(b_scale_grid_desc, + // b_scale_thread_copy_step.At(Number<1>{})); + + a_blockwise_copy.RunWrite(a_block_desc, a_block_buf, iprefetch); + b_blockwise_copy.RunWrite(b_block_desc, b_block_buf, iprefetch); + }); + block_sync_lds(); + static_for<0, KRepeat, 1>{}([&](auto k0) { + static_for<0, MRepeat, 1>{}([&](auto m0) { + a_thread_copy_.Run(a_block_desc_m0_m1_m2_k, + make_tuple(m0, I0, I0, 
Number{}), + a_block_buf, + a_thread_desc_, + make_tuple(m0, I0, k0, I0), + a_thread_buf); + static_for<0, NRepeat, 1>{}([&](auto n0) { + b_thread_copy_.Run(b_block_desc_n0_n1_n2_k, + make_tuple(n0, I0, I0, Number{}), + b_block_buf, + b_thread_desc_, + make_tuple(n0, I0, k0, I0), + b_thread_buf); + }); + }); + + __builtin_amdgcn_sched_barrier(0); + if constexpr(k0.value != 0 || KRepeat == 1) + { + __builtin_amdgcn_s_barrier(); + __builtin_amdgcn_sched_barrier(0); + } + static_for<0, KPerInnerLoop, KPack>{}([&](auto k_) { + static_for<0, MRepeat, 1>{}([&](auto m0) { + static_for<0, NRepeat, 1>{}([&](auto n0) { + vector_type a_thread_vec; + vector_type b_thread_vec; + + static_for<0, KPack, 1>{}([&](auto ik) { + a_thread_vec.template AsType()(ik) = + a_thread_buf[Number{}]; + b_thread_vec.template AsType()(ik) = + b_thread_buf[Number{}]; + }); + + using mfma_input_type = + typename vector_type::type; + + constexpr index_t c_offset = + c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0)); + + if constexpr(k0.value == KRepeat - 1 && + k_.value == KPerInnerLoop - KPack && + m0.value == MRepeat - 1 && n0.value == NRepeat - 1) + { + __builtin_amdgcn_sched_barrier(0); + block_sync_lds(); + __builtin_amdgcn_sched_barrier(0); + } + xdlops_gemm.Run( + a_thread_vec.template AsType(), + b_thread_vec.template AsType(), + c_thread_buf.GetVectorTypeReference(Number{})); + if constexpr(k_.value == 0 && m0.value == 0 && n0.value == 0) + { + __builtin_amdgcn_sched_barrier(0); + __builtin_amdgcn_s_setprio(1); + __builtin_amdgcn_sched_barrier(0); + } + }); + + // static_for<0, xdlops_gemm.GetRegSizePerXdlops(), 1>{}([&](auto t) { + // constexpr index_t c_offset = + // c_thread_desc_.CalculateOffset(make_tuple(m0, n0, t)); + // c_thread_buf(Number{}) += + // c_thread_buf_per_scale[Number{}] * + // type_convert(b_scale_thread_buf[n0]); + // }); + }); + }); + __builtin_amdgcn_sched_barrier(0); + __builtin_amdgcn_s_setprio(0); + __builtin_amdgcn_sched_barrier(0); + }); + }; + + if 
constexpr(TailNum == TailNumber::One) + { + block_sync_lds(); + static_for<0, KRepeat, 1>{}([&](auto k0) { + static_for<0, MRepeat, 1>{}([&](auto m0) { + a_thread_copy_.Run(a_block_desc_m0_m1_m2_k, + make_tuple(m0, I0, I0, Number{}), + a_block_buf, + a_thread_desc_, + make_tuple(m0, I0, k0, I0), + a_thread_buf); + static_for<0, NRepeat, 1>{}([&](auto n0) { + b_thread_copy_.Run(b_block_desc_n0_n1_n2_k, + make_tuple(n0, I0, I0, Number{}), + b_block_buf, + b_thread_desc_, + make_tuple(n0, I0, k0, I0), + b_thread_buf); + }); + }); + + __builtin_amdgcn_sched_barrier(0); + if constexpr(k0.value != 0 || KRepeat == 1) + { + __builtin_amdgcn_s_barrier(); + __builtin_amdgcn_sched_barrier(0); + } + static_for<0, KPerInnerLoop, KPack>{}([&](auto k_) { + static_for<0, MRepeat, 1>{}([&](auto m0) { + static_for<0, NRepeat, 1>{}([&](auto n0) { + vector_type a_thread_vec; + vector_type b_thread_vec; + + static_for<0, KPack, 1>{}([&](auto ik) { + a_thread_vec.template AsType()(ik) = + a_thread_buf[Number{}]; + b_thread_vec.template AsType()(ik) = + b_thread_buf[Number{}]; + }); + + using mfma_input_type = + typename vector_type::type; + + constexpr index_t c_offset = + c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0)); + + if constexpr(k0.value == KRepeat - 1 && + k_.value == KPerInnerLoop - KPack && + m0.value == MRepeat - 1 && n0.value == NRepeat - 1) + { + __builtin_amdgcn_sched_barrier(0); + block_sync_lds(); + __builtin_amdgcn_sched_barrier(0); + } + xdlops_gemm.Run( + a_thread_vec.template AsType(), + b_thread_vec.template AsType(), + c_thread_buf.GetVectorTypeReference(Number{})); + if constexpr(k_.value == 0 && m0.value == 0 && n0.value == 0) + { + __builtin_amdgcn_sched_barrier(0); + __builtin_amdgcn_s_setprio(1); + __builtin_amdgcn_sched_barrier(0); + } + }); + + // static_for<0, xdlops_gemm.GetRegSizePerXdlops(), 1>{}([&](auto t) { + // constexpr index_t c_offset = + // c_thread_desc_.CalculateOffset(make_tuple(m0, n0, t)); + // c_thread_buf(Number{}) += + // 
c_thread_buf_per_scale[Number{}] * + // type_convert(b_scale_thread_buf[n0]); + // }); + }); + }); + __builtin_amdgcn_sched_barrier(0); + __builtin_amdgcn_s_setprio(0); + __builtin_amdgcn_sched_barrier(0); + }); + } + else if constexpr(TailNum == TailNumber::Two) + { + LoopTailFunc(Number<2>{}); + } + else if constexpr(TailNum == TailNumber::Three) + { + LoopTailFunc(Number<3>{}); + } + else if constexpr(TailNum == TailNumber::Four) + { + LoopTailFunc(Number<4>{}); + } + else if constexpr(TailNum == TailNumber::Five) + { + LoopTailFunc(Number<5>{}); + } + else if constexpr(TailNum == TailNumber::Six) + { + LoopTailFunc(Number<6>{}); + } + else if constexpr(TailNum == TailNumber::Seven) + { + LoopTailFunc(Number<7>{}); + } + else if constexpr(TailNum == TailNumber::Full) + { + LoopTailFunc(Number{}); + } + } + + protected: + // K->M loopover + static constexpr auto a_thread_desc_ = make_naive_tensor_descriptor( + make_tuple(Number{}, I1, Number{}, Number{}), + make_tuple(Number{}, + Number{}, + Number{}, + I1)); + + static constexpr auto b_thread_desc_ = make_naive_tensor_descriptor( + make_tuple(Number{}, I1, Number{}, Number{}), + make_tuple(Number{}, + Number{}, + Number{}, + I1)); + + using AThreadCopy = ThreadwiseTensorSliceTransfer_v4, + Sequence<0, 1, 2, 3>, + 3, + A_K1, + A_K1>; + + using BThreadCopy = ThreadwiseTensorSliceTransfer_v4, + Sequence<0, 1, 2, 3>, + 3, + B_K1, + B_K1>; + + AThreadCopy a_thread_copy_{Base::CalculateAThreadOriginDataIndex()}; + BThreadCopy b_thread_copy_{Base::CalculateBThreadOriginDataIndex()}; + using Base::c_thread_desc_; +}; + +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v3_b_scale.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v3_b_scale.hpp new file mode 100644 index 000000000..d1be88dd6 --- /dev/null +++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v3_b_scale.hpp @@ -0,0 +1,530 @@ +// SPDX-License-Identifier: MIT +// 
Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_base.hpp" + +namespace ck { + +// Compute optimized pipeline +// GlobalPrefetchStages: 2 +// LocalPreFillStages: 1 +// LocalPreFetchStages: 1 +// LocalSharedMemoryBuffer: 1 + +template +struct BlockwiseGemmXdlops_pipeline_v3_b_scale +{ +}; + +template +struct BlockwiseGemmXdlops_pipeline_v3_b_scale + : BlockwiseGemmXdlops_pipeline_base + +{ + using Base = BlockwiseGemmXdlops_pipeline_base; + using Base::I0; + using Base::I1; + using Base::KRepeat; + using Base::xdlops_gemm; + using typename Base::HotLoopInstList; + + using Base::CalculateCThreadOriginDataIndex; + using Base::CalculateCThreadOriginDataIndex8D; + using Base::GetCBlockDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2; + using Base::GetCBlockDescriptor_M0_N0_M1_N1_M2_M3_M4_N2; + using Base::GetCBlockDescriptor_M0_N0_M1_N1_M2_N2_N3_N4; + using Base::GetCThreadBuffer; + using Base::GetCThreadDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2; + using Base::GetCThreadDescriptor_M0_N0_M1_N1_M2_M3_M4_N2; + using Base::GetCThreadDescriptor_M0_N0_M1_N1_M2_N2_N3_N4; + using Base::MakeCGridDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2; + using Base::MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2; + + using Base::a_block_desc_m0_m1_m2_k; + using Base::b_block_desc_n0_n1_n2_k; + + using Base::AMmaKStride; + using Base::BMmaKStride; + + static constexpr index_t PrefetchStages = 2; + static constexpr index_t PrefillStages = 1; + static constexpr index_t GlobalBufferNum = 1; + + __host__ __device__ static constexpr bool BlockHasHotloop(index_t num_loop) + { + return num_loop > PrefetchStages; + } + + __host__ __device__ static constexpr TailNumber BlockLoopTailNum(index_t num_loop) + { + ignore = num_loop; + return TailNumber::Full; + } + + __device__ static constexpr auto HotLoopScheduler() + { + // A/B split schedule + // compiler is likely to use ds_read2 when instruction width smaller 
than 16bytes + constexpr auto num_ds_read_inst_a = + HotLoopInstList::A_LDS_Read_Width * sizeof(ADataType) == 16 + ? HotLoopInstList::A_LDS_Read_Inst_Num + : HotLoopInstList::A_LDS_Read_Inst_Num / 2; + constexpr auto num_ds_read_inst_b = + HotLoopInstList::B_LDS_Read_Width * sizeof(BDataType) == 16 + ? HotLoopInstList::B_LDS_Read_Inst_Num + : HotLoopInstList::B_LDS_Read_Inst_Num / 2; + + constexpr auto num_ds_write_inst_a = HotLoopInstList::A_LDS_Write_Inst_Num; + constexpr auto num_ds_write_inst_b = HotLoopInstList::B_LDS_Write_Inst_Num; + + constexpr auto num_buffer_load_inst_a = HotLoopInstList::A_Buffer_Load_Inst_Num; + constexpr auto num_buffer_load_inst_b = HotLoopInstList::B_Buffer_Load_Inst_Num; + + constexpr auto num_mfma_inst = HotLoopInstList::C_MFMA_Inst_Num; + + constexpr auto mfma_cycle = NPerXDL == 16 ? 16 : 32; + constexpr auto ds_read_a_issue_cycle = + HotLoopInstList::A_LDS_Read_Width * sizeof(ADataType) == 16 ? 8 : 4; + constexpr auto ds_read_b_issue_cycle = + HotLoopInstList::B_LDS_Read_Width * sizeof(BDataType) == 16 ? 8 : 4; + constexpr auto ds_read_a_mfma_rate = + (mfma_cycle - 4 + 2 * ds_read_a_issue_cycle - 1) / (2 * ds_read_a_issue_cycle); + constexpr auto ds_read_b_mfma_rate = + (mfma_cycle - 4 + 2 * ds_read_b_issue_cycle - 1) / (2 * ds_read_b_issue_cycle); + + constexpr auto num_dsread_a_mfma = + (num_ds_read_inst_a + ds_read_a_mfma_rate - 1) / ds_read_a_mfma_rate; + constexpr auto num_dsread_b_mfma = + (num_ds_read_inst_b + ds_read_b_mfma_rate - 1) / ds_read_b_mfma_rate; + + // stage 1 + // Separate this part? + // constexpr auto num_mfma_per_ds_read = sizeof(ComputeDataType) / sizeof(ADataType) > + // sizeof(ComputeDataType) / sizeof(BDataType) + // ? 
sizeof(ComputeDataType) / sizeof(ADataType) + // : sizeof(ComputeDataType) / sizeof(BDataType); + constexpr auto num_mfma_stage1 = num_mfma_inst - (num_dsread_a_mfma + num_dsread_b_mfma); + constexpr auto num_mfma_per_issue = + num_mfma_stage1 / (num_buffer_load_inst_a + num_buffer_load_inst_b); + constexpr auto num_dswrite_per_issue_a = num_ds_write_inst_a / num_buffer_load_inst_a; + constexpr auto num_dswrite_per_issue_b = num_ds_write_inst_b / num_buffer_load_inst_b; + + static_for<0, num_buffer_load_inst_a, 1>{}([&](auto i) { + ignore = i; + static_for<0, num_dswrite_per_issue_a, 1>{}([&](auto idswrite) { + ignore = idswrite; + __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS write + __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA + }); + __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read + __builtin_amdgcn_sched_group_barrier( + 0x008, num_mfma_per_issue - num_dswrite_per_issue_a, 0); // MFMA + }); + static_for<0, num_buffer_load_inst_b, 1>{}([&](auto i) { + ignore = i; + static_for<0, num_dswrite_per_issue_b, 1>{}([&](auto idswrite) { + ignore = idswrite; + __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS write + __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA + }); + __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read + __builtin_amdgcn_sched_group_barrier( + 0x008, num_mfma_per_issue - num_dswrite_per_issue_b, 0); // MFMA + }); + + // stage 2 + static_for<0, num_dsread_a_mfma, 1>{}([&](auto i) { + if constexpr((num_ds_read_inst_a - (i + 1) * ds_read_a_mfma_rate) >= + ds_read_a_mfma_rate) + { + __builtin_amdgcn_sched_group_barrier(0x100, ds_read_a_mfma_rate, 0); // DS read + } + else + { + __builtin_amdgcn_sched_group_barrier(0x100, + num_ds_read_inst_a - (num_dsread_a_mfma - 1) * + ds_read_a_mfma_rate, + 0); // DS read + } + __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA + }); + + static_for<0, num_dsread_b_mfma, 1>{}([&](auto i) { + if constexpr((num_ds_read_inst_b - (i + 1) * 
ds_read_b_mfma_rate) >= + ds_read_b_mfma_rate) + { + __builtin_amdgcn_sched_group_barrier(0x100, ds_read_b_mfma_rate, 0); // DS read + } + else + { + __builtin_amdgcn_sched_group_barrier(0x100, + num_ds_read_inst_b - (num_dsread_b_mfma - 1) * + ds_read_b_mfma_rate, + 0); // DS read + } + __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA + }); + } + + template + __device__ void Run(const AGridDesc& a_grid_desc, + const ABlockDesc& a_block_desc, + ABlockTransfer& a_blockwise_copy, + const AGridBuffer& a_grid_buf, + ABlockBuffer& a_block_buf, + const ABlockTransferStep& a_block_copy_step, + const BGridDesc& b_grid_desc, + const BBlockDesc& b_block_desc, + BBlockTransfer& b_blockwise_copy, + const BGridBuffer& b_grid_buf, + BBlockBuffer& b_block_buf, + const BBlockTransferStep& b_block_copy_step, + CThreadBuffer& c_thread_buf, + // BScaleThreadCopy + const BScaleGridDesc& b_scale_grid_desc, + const BScaleThreadDesc& b_scale_thread_desc, + BScaleThreadTransfer& b_scale_thread_copy, + const BScaleGridBuffer& b_scale_grid_buf, + const BScaleThreadTransferStep& b_scale_thread_copy_step, + // num loop + index_t num_loop, + index_t num_loop_per_scale) const + { + __builtin_amdgcn_sched_barrier(0); + + auto a_thread_buf = make_static_buffer( + a_thread_desc_.GetElementSpaceSize()); + auto b_thread_buf = make_static_buffer( + b_thread_desc_.GetElementSpaceSize()); + + // B scale buffer + auto b_scale_thread_buf = make_static_buffer( + b_scale_thread_desc.GetElementSpaceSize()); + + // Global prefetch 1 + a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf); + b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf); + + a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step); + b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step); + + static_for<0, NRepeat, 1>{}([&](auto n0) { + b_scale_thread_copy.Run(b_scale_grid_desc, + b_scale_grid_buf, + b_scale_thread_desc, + make_tuple(n0, I0), + b_scale_thread_buf); + + 
b_scale_thread_copy.MoveSrcSliceWindow(b_scale_grid_desc, + b_scale_thread_copy_step.At(Number<0>{})); + }); + + if(num_loop_per_scale == 1) + { + b_scale_thread_copy.MoveSrcSliceWindow(b_scale_grid_desc, + b_scale_thread_copy_step.At(Number<2>{})); + } + else + { + b_scale_thread_copy.MoveSrcSliceWindow(b_scale_grid_desc, + b_scale_thread_copy_step.At(Number<1>{})); + } + + constexpr auto num_scale_k_block = BScaleThreadDesc{}.GetLength(I1); + constexpr auto num_scale_krepeat = KRepeat / num_scale_k_block; + + // Local prefill 1 + a_blockwise_copy.RunWrite(a_block_desc, a_block_buf); + b_blockwise_copy.RunWrite(b_block_desc, b_block_buf); + + // Global prefetch 2 + a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf); + b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf); + + a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step); + b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step); + + // Initialize C + c_thread_buf.Clear(); + + // Local prefetch 1 + block_sync_lds(); + static_for<0, KRepeat, 1>{}([&](auto k0) { + static_for<0, MRepeat, 1>{}([&](auto m0) { + a_thread_copy_.Run(a_block_desc_m0_m1_m2_k, + make_tuple(m0, I0, I0, Number{}), + a_block_buf, + a_thread_desc_, + make_tuple(m0, I0, k0, I0), + a_thread_buf); + }); + static_for<0, NRepeat, 1>{}([&](auto n0) { + b_thread_copy_.Run( + b_block_desc_n0_n1_n2_k, + make_tuple(n0, I0, I0, Number{}), + b_block_buf, + b_scale_thread_buf[Number{}], + b_thread_desc_, + make_tuple(n0, I0, k0, I0), + b_thread_buf); + }); + }); + + __builtin_amdgcn_sched_barrier(0); + + // main body + if constexpr(HasMainLoop) + { + index_t i = 0; + do + { + block_sync_lds(); + + a_blockwise_copy.RunWrite(a_block_desc, a_block_buf); + b_blockwise_copy.RunWrite(b_block_desc, b_block_buf); + + a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf); + b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf); + + a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step); + 
b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step); + + static_for<0, NRepeat, 1>{}([&](auto n0) { + b_scale_thread_copy.Run(b_scale_grid_desc, + b_scale_grid_buf, + b_scale_thread_desc, + make_tuple(n0, I0), + b_scale_thread_buf); + + b_scale_thread_copy.MoveSrcSliceWindow( + b_scale_grid_desc, b_scale_thread_copy_step.At(Number<0>{})); + }); + + if((i + 2) % num_loop_per_scale == 0) + { + b_scale_thread_copy.MoveSrcSliceWindow( + b_scale_grid_desc, b_scale_thread_copy_step.At(Number<2>{})); + } + else + { + b_scale_thread_copy.MoveSrcSliceWindow( + b_scale_grid_desc, b_scale_thread_copy_step.At(Number<1>{})); + } + + static_for<0, KRepeat, 1>{}([&](auto k0) { + static_for<0, MRepeat, 1>{}([&](auto m0) { + static_for<0, NRepeat, 1>{}([&](auto n0) { + vector_type a_thread_vec; + vector_type b_thread_vec; + + static_for<0, KPack, 1>{}([&](auto ik) { + a_thread_vec.template AsType()(ik) = + a_thread_buf[Number{}]; + b_thread_vec.template AsType()(ik) = + b_thread_buf[Number{}]; + }); + + using mfma_input_type = + typename vector_type::type; + + constexpr index_t c_offset = + c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0)); + + xdlops_gemm.Run( + a_thread_vec.template AsType(), + b_thread_vec.template AsType(), + c_thread_buf.GetVectorTypeReference(Number{})); + }); + }); + }); + + block_sync_lds(); + + static_for<0, KRepeat, 1>{}([&](auto k0) { + static_for<0, MRepeat, 1>{}([&](auto m0) { + a_thread_copy_.Run(a_block_desc_m0_m1_m2_k, + make_tuple(m0, I0, I0, Number{}), + a_block_buf, + a_thread_desc_, + make_tuple(m0, I0, k0, I0), + a_thread_buf); + }); + static_for<0, NRepeat, 1>{}([&](auto n0) { + b_thread_copy_.Run(b_block_desc_n0_n1_n2_k, + make_tuple(n0, I0, I0, Number{}), + b_block_buf, + b_scale_thread_buf[Number{}], + b_thread_desc_, + make_tuple(n0, I0, k0, I0), + b_thread_buf); + }); + }); + + HotLoopScheduler(); + __builtin_amdgcn_sched_barrier(0); + + i += 1; + } while(i < (num_loop - 1)); + } + // tail + if constexpr(TailNum == 
TailNumber::Full) + { + static_for<0, KRepeat, 1>{}([&](auto k0) { + static_for<0, MRepeat, 1>{}([&](auto m0) { + static_for<0, NRepeat, 1>{}([&](auto n0) { + vector_type a_thread_vec; + vector_type b_thread_vec; + + static_for<0, KPack, 1>{}([&](auto ik) { + a_thread_vec.template AsType()(ik) = + a_thread_buf[Number{}]; + b_thread_vec.template AsType()(ik) = + b_thread_buf[Number{}]; + }); + + using mfma_input_type = + typename vector_type::type; + + constexpr index_t c_offset = + c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0)); + + xdlops_gemm.Run(a_thread_vec.template AsType(), + b_thread_vec.template AsType(), + c_thread_buf.GetVectorTypeReference(Number{})); + }); + }); + }); + __builtin_amdgcn_sched_barrier(0); + } + } + + protected: + using Base::a_thread_copy_; + using Base::a_thread_desc_; + using Base::b_thread_copy_; + using Base::b_thread_desc_; + using Base::c_thread_desc_; +}; + +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v4_b_scale.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v4_b_scale.hpp new file mode 100644 index 000000000..f35c7a97c --- /dev/null +++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v4_b_scale.hpp @@ -0,0 +1,686 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_base.hpp" + +namespace ck { + +// Compute optimal pipeline with highest resource request +// GlobalPrefetchStages: 4 +// LocalPreFillStages: 2 +// LocalPreFetchStages: 1 +// LocalSharedMemoryBuffer: 2 + +template +struct BlockwiseGemmXdlops_pipeline_v4_b_scale +{ +}; + +template +struct BlockwiseGemmXdlops_pipeline_v4_b_scale + : BlockwiseGemmXdlops_pipeline_base + +{ + using Base = BlockwiseGemmXdlops_pipeline_base; + using Base::I0; + using Base::I1; + using Base::KRepeat; + using Base::xdlops_gemm; + using typename Base::HotLoopInstList; + + using Base::CalculateCThreadOriginDataIndex; + using Base::CalculateCThreadOriginDataIndex8D; + using Base::GetCBlockDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2; + using Base::GetCBlockDescriptor_M0_N0_M1_N1_M2_M3_M4_N2; + using Base::GetCBlockDescriptor_M0_N0_M1_N1_M2_N2_N3_N4; + using Base::GetCThreadBuffer; + using Base::GetCThreadDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2; + using Base::GetCThreadDescriptor_M0_N0_M1_N1_M2_M3_M4_N2; + using Base::GetCThreadDescriptor_M0_N0_M1_N1_M2_N2_N3_N4; + using Base::MakeCGridDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2; + using Base::MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2; + + using Base::a_block_desc_m0_m1_m2_k; + using Base::b_block_desc_n0_n1_n2_k; + + using Base::AMmaKStride; + using Base::BMmaKStride; + + static constexpr index_t PrefetchStages = 3; + static constexpr index_t PrefillStages = 2; + static constexpr index_t GlobalBufferNum = 1; + static constexpr index_t HotloopUnroll = 2; + + __host__ __device__ static constexpr bool BlockHasHotloop(index_t num_loop) + { + return num_loop > PrefetchStages; + } + + __host__ __device__ static constexpr TailNumber BlockLoopTailNum(index_t num_loop) + { + if(num_loop % HotloopUnroll == 1) + { + return TailNumber::Odd; + } + else + { + return TailNumber::Even; + } + } + + __device__ static constexpr void HotLoopScheduler() + { + // TODO: Take data type into
consideration as pipe ver 3 + // A/B split schedule + constexpr auto num_ds_read_inst_a = + HotLoopInstList::A_LDS_Read_Width * sizeof(ADataType) == 16 + ? HotLoopInstList::A_LDS_Read_Inst_Num + : HotLoopInstList::A_LDS_Read_Inst_Num / 2; + constexpr auto num_ds_read_inst_b = + HotLoopInstList::B_LDS_Read_Width * sizeof(BDataType) == 16 + ? HotLoopInstList::B_LDS_Read_Inst_Num + : HotLoopInstList::B_LDS_Read_Inst_Num / 2; + + constexpr auto num_issue_a = HotLoopInstList::A_Buffer_Load_Inst_Num; + constexpr auto num_dswrite_per_issue_a = + (HotLoopInstList::A_LDS_Write_Inst_Num + num_issue_a - 1) / num_issue_a; + constexpr auto num_dsread_per_issue_a = num_ds_read_inst_a / num_issue_a; + + constexpr auto num_issue_b = HotLoopInstList::B_Buffer_Load_Inst_Num; + constexpr auto num_dswrite_per_issue_b = + (HotLoopInstList::B_LDS_Write_Inst_Num + num_issue_b - 1) / num_issue_b; + constexpr auto num_dsread_per_issue_b = num_ds_read_inst_b / num_issue_b; + + constexpr auto num_mfma_per_issue = + HotLoopInstList::C_MFMA_Inst_Num / (num_issue_a + num_issue_b); + + static_for<0, num_issue_a, 1>{}([&](auto i) { + ignore = i; + static_for<0, num_dsread_per_issue_a, 1>{}([&](auto idsread) { + ignore = idsread; + __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read + __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA + }); + + static_for<0, num_dswrite_per_issue_a, 1>{}([&](auto idswrite) { + ignore = idswrite; + __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS write + __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA + }); + + __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read + __builtin_amdgcn_sched_group_barrier(0x008, + num_mfma_per_issue - num_dsread_per_issue_a - + num_dswrite_per_issue_a, + 0); // MFMA + }); + + static_for<0, num_issue_b, 1>{}([&](auto i) { + ignore = i; + static_for<0, num_dsread_per_issue_b, 1>{}([&](auto idsread) { + ignore = idsread; + __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read +
__builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA + }); + + static_for<0, num_dswrite_per_issue_b, 1>{}([&](auto idswrite) { + ignore = idswrite; + __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS write + __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA + }); + + __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read + __builtin_amdgcn_sched_group_barrier(0x008, + num_mfma_per_issue - num_dsread_per_issue_b - + num_dswrite_per_issue_b, + 0); // MFMA: remainder after B's DS read/write pairs + }); + __builtin_amdgcn_sched_barrier(0); + } + + template + __device__ void Run(const AGridDesc& a_grid_desc, + const ABlockDesc& a_block_desc, + ABlockTransfer& a_blockwise_copy, + const AGridBuffer& a_grid_buf, + ABlockBuffer& a_block_buf, + const ABlockTransferStep& a_block_copy_step, + const BGridDesc& b_grid_desc, + const BBlockDesc& b_block_desc, + BBlockTransfer& b_blockwise_copy, + const BGridBuffer& b_grid_buf, + BBlockBuffer& b_block_buf, + const BBlockTransferStep& b_block_copy_step, + CThreadBuffer& c_thread_buf, + // BScaleThreadCopy + const BScaleGridDesc& b_scale_grid_desc, + const BScaleThreadDesc& b_scale_thread_desc, + BScaleThreadTransfer& b_scale_thread_copy, + const BScaleGridBuffer& b_scale_grid_buf, + const BScaleThreadTransferStep& b_scale_thread_copy_step, + // num loop + index_t num_loop, + index_t num_loop_per_scale) const + { + auto a_thread_buf = make_static_buffer( + a_thread_desc_.GetElementSpaceSize()); + auto b_thread_buf = make_static_buffer( + b_thread_desc_.GetElementSpaceSize()); + + // B scale buffer + auto b_scale_thread_buf = make_static_buffer( + b_scale_thread_desc.GetElementSpaceSize()); + + StaticallyIndexedArray{}> a_thread_bufs; + StaticallyIndexedArray{}> b_thread_bufs; + StaticallyIndexedArray{}> b_scale_thread_bufs; + + // Global prefetch 1 + a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf); + b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf); + + a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step); +
b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step); + + static_for<0, NRepeat, 1>{}([&](auto n0) { + b_scale_thread_copy.Run(b_scale_grid_desc, + b_scale_grid_buf, + b_scale_thread_desc, + make_tuple(n0, I0), + b_scale_thread_bufs(I0)); + + b_scale_thread_copy.MoveSrcSliceWindow(b_scale_grid_desc, + b_scale_thread_copy_step.At(Number<0>{})); + }); + + if(num_loop_per_scale == 1) + { + b_scale_thread_copy.MoveSrcSliceWindow(b_scale_grid_desc, + b_scale_thread_copy_step.At(Number<2>{})); + } + else + { + b_scale_thread_copy.MoveSrcSliceWindow(b_scale_grid_desc, + b_scale_thread_copy_step.At(Number<1>{})); + } + + // Local prefill 1 + a_blockwise_copy.RunWrite(a_block_desc, a_block_buf.At(I0)); + b_blockwise_copy.RunWrite(b_block_desc, b_block_buf.At(I0)); + + // Global prefetch 2 + a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf); + b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf); + + a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step); + b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step); + + static_for<0, NRepeat, 1>{}([&](auto n0) { + b_scale_thread_copy.Run(b_scale_grid_desc, + b_scale_grid_buf, + b_scale_thread_desc, + make_tuple(n0, I0), + b_scale_thread_bufs(I1)); + + b_scale_thread_copy.MoveSrcSliceWindow(b_scale_grid_desc, + b_scale_thread_copy_step.At(Number<0>{})); + }); + + if(2 % num_loop_per_scale == 0) + { + b_scale_thread_copy.MoveSrcSliceWindow(b_scale_grid_desc, + b_scale_thread_copy_step.At(Number<2>{})); + } + else + { + b_scale_thread_copy.MoveSrcSliceWindow(b_scale_grid_desc, + b_scale_thread_copy_step.At(Number<1>{})); + } + + // Local prefetch 1 + block_sync_lds(); + static_for<0, KRepeat, 1>{}([&](auto k) { + static_for<0, MRepeat, 1>{}([&](auto m0) { + a_thread_copy_.Run(a_block_desc_m0_m1_m2_k, + make_tuple(m0, I0, I0, Number{}), + a_block_buf.At(I0), + a_thread_desc_, + make_tuple(m0, I0, k, I0), + a_thread_bufs(I0)); + static_for<0, NRepeat, 1>{}([&](auto n0) { + 
b_thread_copy_.Run(b_block_desc_n0_n1_n2_k, + make_tuple(n0, I0, I0, Number{}), + b_block_buf.At(I0), + b_scale_thread_bufs(I0)[n0], + b_thread_desc_, + make_tuple(n0, I0, k, I0), + b_thread_bufs(I0)); + }); + }); + }); + + // Local prefill 2 + a_blockwise_copy.RunWrite(a_block_desc, a_block_buf.At(I1)); + b_blockwise_copy.RunWrite(b_block_desc, b_block_buf.At(I1)); + + // Global prefetch 3 + a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf); + b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf); + + a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step); + b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step); + + static_for<0, NRepeat, 1>{}([&](auto n0) { + b_scale_thread_copy.Run(b_scale_grid_desc, + b_scale_grid_buf, + b_scale_thread_desc, + make_tuple(n0, I0), + b_scale_thread_bufs(I0)); + + b_scale_thread_copy.MoveSrcSliceWindow(b_scale_grid_desc, + b_scale_thread_copy_step.At(Number<0>{})); + }); + + if(3 % num_loop_per_scale == 0) + { + b_scale_thread_copy.MoveSrcSliceWindow(b_scale_grid_desc, + b_scale_thread_copy_step.At(Number<2>{})); + } + else + { + b_scale_thread_copy.MoveSrcSliceWindow(b_scale_grid_desc, + b_scale_thread_copy_step.At(Number<1>{})); + } + + // Initialize C + c_thread_buf.Clear(); + + // main body + if constexpr(HasMainLoop) + { + index_t i = 0; + // This hot loop has two legacy loopover, to implement the double local buffer strategy + do + { + auto LoopFunc = [&](auto lds_read_buf, + auto lds_read_reg_buf, + auto lds_write_buf, + auto mfma_reg_buf) { + block_sync_lds(); + + static_for<0, KRepeat, 1>{}([&](auto k) { + static_for<0, MRepeat, 1>{}([&](auto m0) { + a_thread_copy_.Run(a_block_desc_m0_m1_m2_k, + make_tuple(m0, I0, I0, Number{}), + a_block_buf.At(lds_read_buf), + a_thread_desc_, + make_tuple(m0, I0, k, I0), + a_thread_bufs(lds_read_reg_buf)); + }); + static_for<0, NRepeat, 1>{}([&](auto n0) { + b_thread_copy_.Run(b_block_desc_n0_n1_n2_k, + make_tuple(n0, I0, I0, Number{}), + 
b_block_buf.At(lds_read_buf), + b_scale_thread_bufs(lds_read_buf)[n0], + b_thread_desc_, + make_tuple(n0, I0, k, I0), + b_thread_bufs(lds_read_reg_buf)); + }); + }); + + // B scale copy + static_for<0, NRepeat, 1>{}([&](auto n0) { + b_scale_thread_copy.Run(b_scale_grid_desc, + b_scale_grid_buf, + b_scale_thread_desc, + make_tuple(n0, I0), + b_scale_thread_bufs(lds_read_reg_buf)); + + b_scale_thread_copy.MoveSrcSliceWindow( + b_scale_grid_desc, b_scale_thread_copy_step.At(Number<0>{})); + }); + + if((i + 4 + mfma_reg_buf.value) % num_loop_per_scale == 0) + { + b_scale_thread_copy.MoveSrcSliceWindow( + b_scale_grid_desc, b_scale_thread_copy_step.At(Number<2>{})); + } + else + { + b_scale_thread_copy.MoveSrcSliceWindow( + b_scale_grid_desc, b_scale_thread_copy_step.At(Number<1>{})); + } + + a_blockwise_copy.RunWrite(a_block_desc, a_block_buf.At(lds_write_buf)); + b_blockwise_copy.RunWrite(b_block_desc, b_block_buf.At(lds_write_buf)); + + a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf); + b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf); + + a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step); + b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step); + + static_for<0, KRepeat, 1>{}([&](auto k0) { + static_for<0, MRepeat, 1>{}([&](auto m0) { + static_for<0, NRepeat, 1>{}([&](auto n0) { + vector_type a_thread_vec; + vector_type b_thread_vec; + + static_for<0, KPack, 1>{}([&](auto ik) { + a_thread_vec.template AsType()(ik) = + a_thread_bufs[mfma_reg_buf] + [Number{}]; + b_thread_vec.template AsType()(ik) = + b_thread_bufs[mfma_reg_buf] + [Number{}]; + }); + + using mfma_input_type = + typename vector_type::type; + + constexpr index_t c_offset = + c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0)); + + xdlops_gemm.Run( + a_thread_vec.template AsType(), + b_thread_vec.template AsType(), + c_thread_buf.GetVectorTypeReference(Number{})); + }); + }); + }); + + HotLoopScheduler(); + }; + + LoopFunc(I1, I1, I0, I0); + LoopFunc(I0, I0, I1, I1); 
+ + i += HotloopUnroll; + } while(i < (num_loop - PrefetchStages)); + } + + auto ReadWriteCompFunc = [&](auto lds_read_buf, + auto lds_read_reg_buf, + auto lds_write_buf, + auto mfma_reg_buf) { + block_sync_lds(); + + static_for<0, KRepeat, 1>{}([&](auto k) { + static_for<0, MRepeat, 1>{}([&](auto m0) { + a_thread_copy_.Run(a_block_desc_m0_m1_m2_k, + make_tuple(m0, I0, I0, Number{}), + a_block_buf.At(lds_read_buf), + a_thread_desc_, + make_tuple(m0, I0, k, I0), + a_thread_bufs(lds_read_reg_buf)); + }); + static_for<0, NRepeat, 1>{}([&](auto n0) { + b_thread_copy_.Run(b_block_desc_n0_n1_n2_k, + make_tuple(n0, I0, I0, Number{}), + b_block_buf.At(lds_read_buf), + b_scale_thread_bufs(lds_read_buf)[n0], + b_thread_desc_, + make_tuple(n0, I0, k, I0), + b_thread_bufs(lds_read_reg_buf)); + }); + }); + + a_blockwise_copy.RunWrite(a_block_desc, a_block_buf.At(lds_write_buf)); + b_blockwise_copy.RunWrite(b_block_desc, b_block_buf.At(lds_write_buf)); + + static_for<0, KRepeat, 1>{}([&](auto k0) { + static_for<0, MRepeat, 1>{}([&](auto m0) { + static_for<0, NRepeat, 1>{}([&](auto n0) { + vector_type a_thread_vec; + vector_type b_thread_vec; + + static_for<0, KPack, 1>{}([&](auto ik) { + a_thread_vec.template AsType()(ik) = + a_thread_bufs[mfma_reg_buf][Number{}]; + b_thread_vec.template AsType()(ik) = + b_thread_bufs[mfma_reg_buf][Number{}]; + }); + + using mfma_input_type = + typename vector_type::type; + + constexpr index_t c_offset = + c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0)); + + xdlops_gemm.Run(a_thread_vec.template AsType(), + b_thread_vec.template AsType(), + c_thread_buf.GetVectorTypeReference(Number{})); + }); + }); + }); + + HotLoopScheduler(); + }; + + auto ReadCompFunc = [&](auto lds_read_buf, auto lds_read_reg_buf, auto mfma_reg_buf) { + block_sync_lds(); + + static_for<0, KRepeat, 1>{}([&](auto k) { + static_for<0, MRepeat, 1>{}([&](auto m0) { + a_thread_copy_.Run(a_block_desc_m0_m1_m2_k, + make_tuple(m0, I0, I0, Number{}), + 
a_block_buf.At(lds_read_buf), + a_thread_desc_, + make_tuple(m0, I0, k, I0), + a_thread_bufs(lds_read_reg_buf)); + }); + static_for<0, NRepeat, 1>{}([&](auto n0) { + b_thread_copy_.Run(b_block_desc_n0_n1_n2_k, + make_tuple(n0, I0, I0, Number{}), + b_block_buf.At(lds_read_buf), + b_scale_thread_bufs(lds_read_buf)[n0], + b_thread_desc_, + make_tuple(n0, I0, k, I0), + b_thread_bufs(lds_read_reg_buf)); + }); + }); + + static_for<0, KRepeat, 1>{}([&](auto k0) { + static_for<0, MRepeat, 1>{}([&](auto m0) { + static_for<0, NRepeat, 1>{}([&](auto n0) { + vector_type a_thread_vec; + vector_type b_thread_vec; + + static_for<0, KPack, 1>{}([&](auto ik) { + a_thread_vec.template AsType()(ik) = + a_thread_bufs[mfma_reg_buf][Number{}]; + b_thread_vec.template AsType()(ik) = + b_thread_bufs[mfma_reg_buf][Number{}]; + }); + + using mfma_input_type = + typename vector_type::type; + + constexpr index_t c_offset = + c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0)); + + xdlops_gemm.Run(a_thread_vec.template AsType(), + b_thread_vec.template AsType(), + c_thread_buf.GetVectorTypeReference(Number{})); + }); + }); + }); + + HotLoopScheduler(); + }; + + auto CompFunc = [&](auto mfma_reg_buf) { + static_for<0, KRepeat, 1>{}([&](auto k0) { + static_for<0, MRepeat, 1>{}([&](auto m0) { + static_for<0, NRepeat, 1>{}([&](auto n0) { + vector_type a_thread_vec; + vector_type b_thread_vec; + + static_for<0, KPack, 1>{}([&](auto ik) { + a_thread_vec.template AsType()(ik) = + a_thread_bufs[mfma_reg_buf][Number{}]; + b_thread_vec.template AsType()(ik) = + b_thread_bufs[mfma_reg_buf][Number{}]; + }); + + using mfma_input_type = + typename vector_type::type; + + constexpr index_t c_offset = + c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0)); + + xdlops_gemm.Run(a_thread_vec.template AsType(), + b_thread_vec.template AsType(), + c_thread_buf.GetVectorTypeReference(Number{})); + }); + }); + }); + }; + + // tail + if constexpr(TailNum == TailNumber::Odd) + { + ReadWriteCompFunc(I1, I1, I0, I0); 
+ ReadCompFunc(I0, I0, I1); + CompFunc(I0); + } + else if constexpr(TailNum == TailNumber::Even) + { + ReadCompFunc(I1, I1, I0); + CompFunc(I1); + } + } + + protected: + using Base::a_thread_copy_; + using Base::a_thread_desc_; + using Base::b_thread_copy_; + using Base::b_thread_desc_; + using Base::c_thread_desc_; +}; + +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/device_gemm_v2.hpp b/include/ck/tensor_operation/gpu/device/device_gemm_v2.hpp index 43909f77d..78d8aa997 100644 --- a/include/ck/tensor_operation/gpu/device/device_gemm_v2.hpp +++ b/include/ck/tensor_operation/gpu/device/device_gemm_v2.hpp @@ -77,6 +77,43 @@ struct DeviceGemmV2R1 : public BaseOperator virtual std::unique_ptr MakeInvokerPointer() = 0; }; +template +struct DeviceGemmV2BScale : public BaseOperator +{ + virtual std::unique_ptr + MakeArgumentPointer(const void* p_a, + const void* p_b, + void* p_c, + ck::index_t M, + ck::index_t N, + ck::index_t K, + ck::index_t StrideA, + ck::index_t StrideB, + ck::index_t StrideC, + ck::index_t StrideScaleB, + const void* p_b_scale, + ck::index_t KSplit, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CElementwiseOperation c_element_op) = 0; + + virtual std::unique_ptr MakeInvokerPointer() = 0; + + virtual bool GetPermuteB() = 0; + virtual ck::index_t GetKPerBlock() = 0; +}; + } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3_b_scale.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3_b_scale.hpp new file mode 100644 index 000000000..044350d11 --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3_b_scale.hpp @@ -0,0 +1,781 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include +#include + +#include "ck/utility/common_header.hpp" + +#include "ck/host_utility/flush_cache.hpp" +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_gemm_v2.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_b_scale.hpp" +#include "ck/host_utility/device_prop.hpp" +#include "ck/host_utility/kernel_launch.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +template +struct DeviceGemm_Xdl_CShuffleV3 : public DeviceGemmV2BScale +{ + // GridwiseGemm + using GridwiseGemm = GridwiseGemm_xdl_cshuffle_v3< + ALayout, + BLayout, + CLayout, + ADataType, + BDataType, + GemmAccDataType, + CShuffleDataType, + CDataType, + AElementwiseOperation, + BElementwiseOperation, + CElementwiseOperation, + GemmSpec, + BlockSize, + ScaleBlockN, + ScaleBlockK, + MPerBlock, + NPerBlock, + KPerBlock, + AK1, + BK1, + MPerXDL, + NPerXDL, + MXdlPerWave, + NXdlPerWave, + ABlockTransferThreadClusterLengths_AK0_M_AK1, + ABlockTransferThreadClusterArrangeOrder, + ABlockTransferSrcAccessOrder, + ABlockTransferSrcVectorDim, + ABlockTransferSrcScalarPerVector, + ABlockTransferDstScalarPerVector_AK1, + false, + ABlockLdsExtraM, + BBlockTransferThreadClusterLengths_BK0_N_BK1, + BBlockTransferThreadClusterArrangeOrder, + BBlockTransferSrcAccessOrder, + BBlockTransferSrcVectorDim, + BBlockTransferSrcScalarPerVector, + BBlockTransferDstScalarPerVector_BK1, + false, + BBlockLdsExtraN, + CShuffleMXdlPerWavePerShuffle, + CShuffleNXdlPerWavePerShuffle, + CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, + CShuffleBlockTransferScalarPerVector_NPerBlock, + BlkGemmPipeSched, + BlkGemmPipelineVer, + ComputeTypeA, + ComputeTypeB, + PermuteA, + PermuteB>; + + using Argument = 
typename GridwiseGemm::Argument; + + // Invoker + struct Invoker : public BaseInvoker + { + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) + { + if(stream_config.log_level_ > 0) + { + arg.Print(); + } + + if(!GridwiseGemm::CheckValidity(arg)) + { + throw std::runtime_error("wrong! GridwiseGemm has invalid setting"); + } + + index_t gdx, gdy, gdz; + std::tie(gdx, gdy, gdz) = GridwiseGemm::CalculateGridSize(arg.M, arg.N, arg.KBatch); + + float ave_time = 0; + + index_t k_grain = arg.KBatch * KPerBlock; + index_t K_split = (arg.K + k_grain - 1) / k_grain * KPerBlock; + + const bool has_main_k_block_loop = GridwiseGemm::CalculateHasMainKBlockLoop(K_split); + + const auto Run = [&](const auto& kernel) { + if(stream_config.flush_cache) + { + Argument arg_ = arg; + + const auto a_grid_desc_ak0_m_ak1 = GridwiseGemm::MakeAGridDescriptor_AK0_M_AK1( + arg_.M, arg_.MPadded, arg_.K, arg_.KPadded, arg_.StrideA, arg_.AK0); + const auto b_grid_desc_bk0_n_bk1 = GridwiseGemm::MakeBGridDescriptor_BK0_N_BK1( + arg_.K, arg_.KPadded, arg_.N, arg_.NPadded, arg_.StrideB, arg_.BK0); + + auto size_a_buffer = + a_grid_desc_ak0_m_ak1.GetElementSpaceSize() * sizeof(ADataType); + auto size_b_buffer = + b_grid_desc_bk0_n_bk1.GetElementSpaceSize() * sizeof(BDataType); + + ck::utility::RotatingMemWrapper rotating_mem( + arg_, stream_config.rotating_count, size_a_buffer, size_b_buffer); + rotating_mem.Print(); + + auto run_flush_cache = [&]() { + // flush icache + ck::utility::flush_icache(); + // rotating mem + rotating_mem.Next(); + // clear c mem + if(arg_.KBatch > 1) + hipGetErrorString(hipMemsetAsync(arg_.p_c_grid, + 0, + arg_.M * arg_.N * sizeof(CDataType), + stream_config.stream_id_)); + }; + + ave_time = ck::utility::launch_and_time_kernel_with_preprocess( + stream_config, + run_flush_cache, + kernel, + dim3(gdx, gdy, gdz), + dim3(BlockSize), + 0, + arg_); + } + else + { + if(arg.KBatch > 1) + hipGetErrorString(hipMemsetAsync(arg.p_c_grid, + 0, + arg.M * 
arg.N * sizeof(CDataType), + stream_config.stream_id_)); + + ave_time = launch_and_time_kernel( + stream_config, kernel, dim3(gdx, gdy, gdz), dim3(BlockSize), 0, arg); + } + }; + + constexpr index_t minimum_occupancy = + BlkGemmPipeSched == BlockGemmPipelineScheduler::Intrawave + ? (BlkGemmPipelineVer == BlockGemmPipelineVersion::v3 && + MPerBlock * NPerBlock * KPerBlock * sizeof(ADataType) <= 128 * 128 * 64 * 2) + ? 2 + : 1 + : 2; + + if(has_main_k_block_loop) + { + // Tail number always full + if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v1 || + BlkGemmPipelineVer == BlockGemmPipelineVersion::v3) + { + if(arg.KBatch > 1) + { + const auto kernel = + kernel_gemm_xdl_cshuffle_v3; + Run(kernel); + } + else + { + const auto kernel = + kernel_gemm_xdl_cshuffle_v3; + Run(kernel); + } + } + // Tail number could be One to Seven + else if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v2) + { + if(arg.KBatch > 1) + { + if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::One) + { + const auto kernel = + kernel_gemm_xdl_cshuffle_v3; + Run(kernel); + } + else if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == + TailNumber::Full) + { + const auto kernel = + kernel_gemm_xdl_cshuffle_v3; + Run(kernel); + } + + if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 2) + { + if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Two) + { + const auto kernel = kernel_gemm_xdl_cshuffle_v3< + GridwiseGemm, + true, + InMemoryDataOperationEnum::AtomicAdd, + minimum_occupancy, + TailNumber::Two>; + Run(kernel); + } + } + + if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 3) + { + if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == + TailNumber::Three) + { + const auto kernel = kernel_gemm_xdl_cshuffle_v3< + GridwiseGemm, + true, + InMemoryDataOperationEnum::AtomicAdd, + minimum_occupancy, + TailNumber::Three>; + Run(kernel); + } + } + + if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 4) 
+ { + if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == + TailNumber::Four) + { + const auto kernel = kernel_gemm_xdl_cshuffle_v3< + GridwiseGemm, + true, + InMemoryDataOperationEnum::AtomicAdd, + minimum_occupancy, + TailNumber::Four>; + Run(kernel); + } + } + + if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 5) + { + if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == + TailNumber::Five) + { + const auto kernel = kernel_gemm_xdl_cshuffle_v3< + GridwiseGemm, + true, + InMemoryDataOperationEnum::AtomicAdd, + minimum_occupancy, + TailNumber::Five>; + Run(kernel); + } + } + + if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 6) + { + if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Six) + { + const auto kernel = kernel_gemm_xdl_cshuffle_v3< + GridwiseGemm, + true, + InMemoryDataOperationEnum::AtomicAdd, + minimum_occupancy, + TailNumber::Six>; + Run(kernel); + } + } + + if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 7) + { + if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == + TailNumber::Seven) + { + const auto kernel = kernel_gemm_xdl_cshuffle_v3< + GridwiseGemm, + true, + InMemoryDataOperationEnum::AtomicAdd, + minimum_occupancy, + TailNumber::Seven>; + Run(kernel); + } + } + } + else + { + if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::One) + { + const auto kernel = + kernel_gemm_xdl_cshuffle_v3; + Run(kernel); + } + else if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == + TailNumber::Full) + { + const auto kernel = + kernel_gemm_xdl_cshuffle_v3; + Run(kernel); + } + + if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 2) + { + if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Two) + { + const auto kernel = + kernel_gemm_xdl_cshuffle_v3; + Run(kernel); + } + } + + if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 3) + { + if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == + TailNumber::Three) + { + const auto 
kernel = + kernel_gemm_xdl_cshuffle_v3; + Run(kernel); + } + } + + if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 4) + { + if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == + TailNumber::Four) + { + const auto kernel = + kernel_gemm_xdl_cshuffle_v3; + Run(kernel); + } + } + + if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 5) + { + if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == + TailNumber::Five) + { + const auto kernel = + kernel_gemm_xdl_cshuffle_v3; + Run(kernel); + } + } + + if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 6) + { + if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Six) + { + const auto kernel = + kernel_gemm_xdl_cshuffle_v3; + Run(kernel); + } + } + + if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 7) + { + if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == + TailNumber::Seven) + { + const auto kernel = + kernel_gemm_xdl_cshuffle_v3; + Run(kernel); + } + } + } + } + // Tail number could be Odd or Even + else if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v4) + { + if(arg.KBatch > 1) + { + if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Odd) + { + const auto kernel = kernel_gemm_xdl_cshuffle_v3_2lds< + GridwiseGemm, + true, + InMemoryDataOperationEnum::AtomicAdd, + minimum_occupancy, + TailNumber::Odd>; + Run(kernel); + } + else + { + const auto kernel = kernel_gemm_xdl_cshuffle_v3_2lds< + GridwiseGemm, + true, + InMemoryDataOperationEnum::AtomicAdd, + minimum_occupancy, + TailNumber::Even>; + Run(kernel); + } + } + else + { + if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Odd) + { + const auto kernel = + kernel_gemm_xdl_cshuffle_v3_2lds; + Run(kernel); + } + else + { + const auto kernel = + kernel_gemm_xdl_cshuffle_v3_2lds; + Run(kernel); + } + } + } + else + { + if(arg.KBatch > 1) + { + if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Odd) + { + const auto kernel = + 
kernel_gemm_xdl_cshuffle_v3; + Run(kernel); + } + else + { + const auto kernel = + kernel_gemm_xdl_cshuffle_v3; + Run(kernel); + } + } + else + { + if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Odd) + { + const auto kernel = + kernel_gemm_xdl_cshuffle_v3; + Run(kernel); + } + else + { + const auto kernel = + kernel_gemm_xdl_cshuffle_v3; + Run(kernel); + } + } + } + } + else + { + // Tail number always 1 + if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v1) + { + if(arg.KBatch > 1) + { + const auto kernel = + kernel_gemm_xdl_cshuffle_v3; + Run(kernel); + } + else + { + const auto kernel = + kernel_gemm_xdl_cshuffle_v3; + Run(kernel); + } + } + } + + return ave_time; + } + + // polymorphic + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg), stream_config); + } + }; + + static constexpr bool IsValidCompilationParameter() + { + // TODO: properly implement this check + return true; + } + + static bool IsSupportedArgument(const Argument& arg) + { + if(!ck::is_xdl_supported()) + { + return false; + } + + if(!is_bf16_atomic_supported() && std::is_same_v && arg.KBatch > 1) + { + return false; + } + + if((arg.K % AK1 != 0 || arg.K % BK1 != 0) && !(GemmSpec == GemmSpecialization::MKPadding || + GemmSpec == GemmSpecialization::NKPadding || + GemmSpec == GemmSpecialization::MNKPadding || + GemmSpec == GemmSpecialization::KPadding)) + { + return false; + } + + return GridwiseGemm::CheckValidity(arg); + } + + // polymorphic + bool IsSupportedArgument(const BaseArgument* p_arg) override + { + return IsSupportedArgument(*dynamic_cast(p_arg)); + } + + index_t GetKPerBlock() override { return KPerBlock; } + + bool GetPermuteB() override { return PermuteB; } + + static auto MakeArgument(const ADataType* p_a, + const BDataType* p_b, + CDataType* p_c, + index_t M, + index_t N, + index_t K, + index_t StrideA, + index_t StrideB, + index_t StrideC, + index_t 
StrideScaleB, + const BScaleDataType* p_b_scale, + index_t KBatch, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CElementwiseOperation c_element_op) + { + return Argument{p_a, + p_b, + p_c, + M, + N, + K, + StrideA, + StrideB, + StrideC, + StrideScaleB, + p_b_scale, + KBatch, + a_element_op, + b_element_op, + c_element_op}; + } + + static auto MakeInvoker() { return Invoker{}; } + + // polymorphic + std::unique_ptr MakeArgumentPointer(const void* p_a, + const void* p_b, + void* p_c, + index_t M, + index_t N, + index_t K, + index_t StrideA, + index_t StrideB, + index_t StrideC, + index_t StrideScaleB, + const void* p_b_scale, + index_t KBatch, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CElementwiseOperation c_element_op) override + { + return std::make_unique(static_cast(p_a), + static_cast(p_b), + static_cast(p_c), + M, + N, + K, + StrideA, + StrideB, + StrideC, + StrideScaleB, + static_cast(p_b_scale), + KBatch, + a_element_op, + b_element_op, + c_element_op); + } + + // polymorphic + std::unique_ptr MakeInvokerPointer() override + { + return std::make_unique(Invoker{}); + } + + // polymorphic + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + std::map BlkGemmPipelineSchedulerToString{ + {BlockGemmPipelineScheduler::Intrawave, "Intrawave"}, + {BlockGemmPipelineScheduler::Interwave, "Interwave"}}; + + std::map BlkGemmPipelineVersionToString{ + {BlockGemmPipelineVersion::v1, "v1"}, + {BlockGemmPipelineVersion::v2, "v2"}, + {BlockGemmPipelineVersion::v3, "v3"}, + {BlockGemmPipelineVersion::v4, "v4"}, + {BlockGemmPipelineVersion::v5, "v5"}}; + + // clang-format off + str << "DeviceGemmXdlUniversal" + << "<" + << getGemmSpecializationString(GemmSpec) << ", " + << std::string(ALayout::name)[0] + << std::string(BLayout::name)[0] + << std::string(CLayout::name)[0] + << ">" + << " BlkSize: " + << BlockSize << ", " + << "BlkTile: " + << MPerBlock<<"x"<()[Number<0>{}]; } 
+__host__ __device__ inline half4_t pki4_to_half4_scale(int q, const ck::half2_t& scale) +{ + const int LO = 0x000f000f; + const int HI = 0x00f000f0; + const int EX = 0x64006400; + + // Extract the two int4 at low bit and create two fp16 number. + int lo = amd_assembly_and_or_b32(q, LO, EX); + // Extract the two int4 at hight bit and create two fp16 number. + int hi = amd_assembly_and_or_b32(q, HI, EX); + + const int SUB = 0xE408E408; // half2 {-1032, -1032} + const int MUL = 0x2c002c00; // half2 {1 / 16, 1 / 16} + const int ADD = 0xd480d480; // half2 {-72, -72} + + vector_type res; + + res.template AsType()(Number<0>{}) = + amd_assembly_pk_add_f16(bit_cast(lo), bit_cast(SUB)); + + res.template AsType()(Number<1>{}) = amd_assembly_pk_fma_f16( + bit_cast(hi), bit_cast(MUL), bit_cast(ADD)); + + asm volatile("v_pk_mul_f16 %0, %1, %2" + : "=v"(res.template AsType()(Number<0>{})) + : "v"(res.template AsType()(Number<0>{})), "v"(scale)); + + asm volatile("v_pk_mul_f16 %0, %1, %2" + : "=v"(res.template AsType()(Number<1>{})) + : "v"(res.template AsType()(Number<1>{})), "v"(scale)); + + return res.template AsType()[Number<0>{}]; +} + __host__ __device__ inline half2_t pki4_to_half2(pk_i4_t q) { #if 1 @@ -171,7 +205,42 @@ struct PassThroughPack8 dst.template AsType()(Number<3>{}) = pki4_to_bhalf2(src.template AsType()[Number<3>{}]); - y = dst.template AsType()[Number<0>{}]; + y = dst.template AsType()[Number<0>{}]; +#endif + } + constexpr const static bool is_pack8_invocable = true; +}; + +struct DequantPack8 +{ + template + __host__ __device__ void operator()(Y& y, const X& x, const Z& z) const; + + __host__ __device__ constexpr void + operator()(ck::half8_t& y, const ck::pk_i4x4_t& x, const ck::half2_t& z) const + { +#if 1 + vector_type result; + + result.template AsType()(Number<0>{}) = pki4_to_half4_scale(bit_cast(x), z); + result.template AsType()(Number<1>{}) = + pki4_to_half4_scale(bit_cast(x) >> 8, z); + + y = result.template AsType()[Number<0>{}]; +#else + 
vector_type dst; + vector_type src{x}; + + dst.template AsType()(Number<0>{}) = + pki4_to_half2(src.template AsType()[Number<0>{}]); + dst.template AsType()(Number<1>{}) = + pki4_to_half2(src.template AsType()[Number<1>{}]); + dst.template AsType()(Number<2>{}) = + pki4_to_half2(src.template AsType()[Number<2>{}]); + dst.template AsType()(Number<3>{}) = + pki4_to_half2(src.template AsType()[Number<3>{}]); + + y = dst.template AsType()[Number<0>{}]; #endif } diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_b_scale.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_b_scale.hpp new file mode 100644 index 000000000..bdb24c25a --- /dev/null +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_b_scale.hpp @@ -0,0 +1,2208 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/tensor_description/multi_index_transform_helper.hpp" +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" +#include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_scale_selector.hpp" +#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp" +#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r1.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp" +#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp" +#include "ck/utility/common_header.hpp" + +namespace ck { + +// Currently we do not have a elegant way to put single lds buffer & double lds buffer pipe in same +// kernel function Blockers: +// 1. Two separted declaration of __shared__ pointer is the key to make sure data access operate on +// two lds chunks. +// 2. 
Occupied __shared__ won't release until whole shader end, a.k.a AB and C may not use same lds +// buffer when we declare __shared__ inside blkgemmpipe +template +__global__ void +#if CK_USE_LAUNCH_BOUNDS + __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy) +#endif + // __attribute__((amdgpu_waves_per_eu(1, 1))) + kernel_gemm_xdl_cshuffle_v3(typename GridwiseGemm::Argument karg) +{ +#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx9__)) + __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()]; + + auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg); + + GridwiseGemm::template Run( + karg.p_a_grid + splitk_batch_offset.a_k_split_offset, + karg.p_b_grid + splitk_batch_offset.b_k_split_offset, + karg.p_c_grid + splitk_batch_offset.c_reduce_offset, + karg.p_b_scale_grid + splitk_batch_offset.scale_k_split_offset, + p_shared, + karg); + +#else + ignore = karg; +#endif // end of if (defined(__gfx9__)) +} + +template +__global__ void +#if CK_USE_LAUNCH_BOUNDS + __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy) +#endif + // __attribute__((amdgpu_waves_per_eu(1, 1))) + kernel_gemm_xdl_cshuffle_v3_2lds(typename GridwiseGemm::Argument karg) +{ +#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx9__)) + // Pass two lds pointer is the key to tell compiler that ds_read/write + // operate on different lds chunk at same time without order dependecy + __shared__ char p_shared_0[GridwiseGemm::GetSharedMemoryNumberOfByte()]; + __shared__ char p_shared_1[GridwiseGemm::GetSharedMemoryNumberOfByte()]; + + auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg); + + GridwiseGemm::template Run_2Lds( + karg.p_a_grid + splitk_batch_offset.a_k_split_offset, + karg.p_b_grid + splitk_batch_offset.b_k_split_offset, + karg.p_c_grid + splitk_batch_offset.c_reduce_offset, + karg.p_b_scale_grid + splitk_batch_offset.scale_k_split_offset, + p_shared_0, + p_shared_1, + karg); + +#else + ignore = karg; +#endif // end of 
if (defined(__gfx9__)) +} + +template +struct GridwiseGemm_xdl_cshuffle_v3 +{ + using BScaleType = ck::half_t; + + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + static constexpr auto I3 = Number<3>{}; + static constexpr auto I4 = Number<4>{}; + static constexpr auto I5 = Number<5>{}; + static constexpr auto I6 = Number<6>{}; + static constexpr auto I7 = Number<7>{}; + + // K1 should be Number<...> + static constexpr auto AK0Number = Number{}; + static constexpr auto BK0Number = Number{}; + static constexpr auto AK1Number = Number{}; + static constexpr auto BK1Number = Number{}; + + static constexpr index_t KPack = + math::max(math::lcm(AK1Number, BK1Number), + MfmaSelector::selected_mfma.k_per_blk); + + using ThisThreadBlock = ThisThreadBlock; + + static constexpr index_t APackedSize = []() { + if constexpr(is_same_v, pk_i4_t>) + return 2; + else + return 1; + }(); + + static constexpr index_t BPackedSize = []() { + if constexpr(is_same_v, pk_i4_t>) + return 2; + else + return 1; + }(); + + __host__ static auto CalculateGridSize(index_t M, index_t N, index_t KBatch) + { + return std::make_tuple(Block2CTileMap::CalculateGridSize(M, N), 1, KBatch); + } + + __host__ static auto CalculateMPadded(index_t M) + { + return math::integer_least_multiple(M, MPerBlock); + } + + __host__ static auto CalculateNPadded(index_t N) + { + return math::integer_least_multiple(N, NPerBlock); + } + + __host__ static auto CalculateKPadded(index_t K) + { + return math::integer_divide_ceil(K, KPerBlock) * KPerBlock; + } + + __host__ static auto CalculateAK0Padded(index_t K, index_t K_Batch = 1) + { + auto K_t = K_Batch * KPerBlock; + return (K + K_t - 1) / K_t * (KPerBlock / AK1Value); + } + + __host__ static auto CalculateBK0Padded(index_t K, index_t K_Batch = 1) + { + auto K_t = K_Batch * KPerBlock; + return (K + K_t - 1) / K_t * (KPerBlock / BK1Value); + } + + __host__ static auto CalculateKPadded(index_t K, 
index_t K_Batch = 1) + { + auto K_t = K_Batch * KPerBlock; + return (K + K_t - 1) / K_t * KPerBlock; + } + + __host__ static auto CalculateKRead(index_t K, index_t K_Batch = 1) + { + constexpr auto KReadVec = math::lcm(AK1Number, BK1Number); + auto K_t = K_Batch * KReadVec; + return (K + K_t - 1) / K_t * KReadVec; + } + + __host__ static auto CalculateMBlock(index_t M) + { + return math::integer_divide_ceil(M, MPerBlock); + } + + __host__ static auto CalculateNBlock(index_t N) + { + return math::integer_divide_ceil(N, NPerBlock); + } + + template + __host__ __device__ static constexpr auto MakeGemmMmaTileDescriptor(const TileDesc_K0_MN_K1&) + { + constexpr index_t K0 = TileDesc_K0_MN_K1{}.GetLength(Number<0>{}); + constexpr index_t K1 = TileDesc_K0_MN_K1{}.GetLength(Number<2>{}); + + return transform_tensor_descriptor( + TileDesc_K0_MN_K1{}, + make_tuple(make_merge_transform_v3_division_mod(make_tuple(Number{}, Number{})), + make_unmerge_transform(make_tuple( + Number{}, Number{}, Number{}))), + make_tuple(Sequence<0, 2>{}, Sequence<1>{}), + make_tuple(Sequence<3>{}, Sequence<0, 1, 2>{})); + } + + __host__ __device__ static auto MakeAGridDescriptor_AK0_M_AK1( + index_t M, index_t MPad, index_t K, index_t KPad, index_t StrideA, index_t AK0) + { + const auto a_grid_desc_mraw_kraw = [&]() { + if constexpr(is_same_v) + { + return make_naive_tensor_descriptor(make_tuple(M, K), make_tuple(StrideA, I1)); + } + else if constexpr(is_same_v) + { + return make_naive_tensor_descriptor(make_tuple(M, K), make_tuple(I1, StrideA)); + } + }(); + + using GemmSpecialization = tensor_operation::device::GemmSpecialization; + + if constexpr(GemmSpec == GemmSpecialization::MKPadding || + GemmSpec == GemmSpecialization::MNKPadding) + { + // pad both M and K + const auto a_grid_desc_m_k = + transform_tensor_descriptor(a_grid_desc_mraw_kraw, + make_tuple(make_right_pad_transform(M, MPad - M), + make_right_pad_transform(K, KPad - K)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + 
make_tuple(Sequence<0>{}, Sequence<1>{})); + + const auto a_grid_desc_ak0_m_ak1 = transform_tensor_descriptor( + a_grid_desc_m_k, + make_tuple(make_unmerge_transform(make_tuple(AK0, AK1Value)), + make_pass_through_transform(MPad)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + return a_grid_desc_ak0_m_ak1; + } + else if constexpr(GemmSpec == GemmSpecialization::MPadding || + GemmSpec == GemmSpecialization::MNPadding) + { + // pad M, but not K + const auto a_grid_desc_ak0_m_ak1 = transform_tensor_descriptor( + a_grid_desc_mraw_kraw, + make_tuple(make_unmerge_transform(make_tuple(AK0, AK1Value)), + make_right_pad_transform(M, MPad - M)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + return a_grid_desc_ak0_m_ak1; + } + else if constexpr(GemmSpec == GemmSpecialization::KPadding || + GemmSpec == GemmSpecialization::NKPadding) + { + // pad K, but not M + const auto a_grid_desc_m_k = transform_tensor_descriptor( + a_grid_desc_mraw_kraw, + make_tuple(make_pass_through_transform(M), make_right_pad_transform(K, KPad - K)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + const auto a_grid_desc_ak0_m_ak1 = transform_tensor_descriptor( + a_grid_desc_m_k, + make_tuple(make_unmerge_transform(make_tuple(AK0, AK1Value)), + make_pass_through_transform(M)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + return a_grid_desc_ak0_m_ak1; + } + else + { + // not pad M or K + const auto a_grid_desc_ak0_m_ak1 = transform_tensor_descriptor( + a_grid_desc_mraw_kraw, + make_tuple(make_unmerge_transform(make_tuple(AK0, AK1Value)), + make_pass_through_transform(M)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + return a_grid_desc_ak0_m_ak1; + } + } + + __host__ __device__ static auto MakeBGridDescriptor_BK0_N_BK1( + index_t K, index_t KPad, index_t N, 
index_t NPad, index_t StrideB, index_t BK0) + { + const auto b_grid_desc_nraw_kraw = [&]() { + if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(N, K), make_tuple(I1, StrideB)); + } + else if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(N, K), make_tuple(StrideB, I1)); + } + }(); + + using GemmSpecialization = tensor_operation::device::GemmSpecialization; + + static_assert(!(is_same_v, pk_i4_t> && + GemmSpec != GemmSpecialization::Default), + "pk_i4_t does not support padding"); + + if constexpr(GemmSpec == GemmSpecialization::NKPadding || + GemmSpec == GemmSpecialization::MNKPadding) + { + // pad both N and K + const auto b_grid_desc_n_k = + transform_tensor_descriptor(b_grid_desc_nraw_kraw, + make_tuple(make_right_pad_transform(N, NPad - N), + make_right_pad_transform(K, KPad - K)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + const auto b_grid_desc_bk0_n_bk1 = transform_tensor_descriptor( + b_grid_desc_n_k, + make_tuple(make_unmerge_transform(make_tuple(BK0, BK1Value)), + make_pass_through_transform(NPad)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + return b_grid_desc_bk0_n_bk1; + } + else if constexpr(GemmSpec == GemmSpecialization::NPadding || + GemmSpec == GemmSpecialization::MNPadding) + { + // pad N, but not K + const auto b_grid_desc_bk0_n_bk1 = transform_tensor_descriptor( + b_grid_desc_nraw_kraw, + make_tuple(make_unmerge_transform(make_tuple(BK0, BK1Value)), + make_right_pad_transform(N, NPad - N)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + return b_grid_desc_bk0_n_bk1; + } + else if constexpr(GemmSpec == GemmSpecialization::KPadding || + GemmSpec == GemmSpecialization::MKPadding) + { + // pad K, but not N + const auto b_grid_desc_n_k = transform_tensor_descriptor( + b_grid_desc_nraw_kraw, + make_tuple(make_pass_through_transform(N), 
make_right_pad_transform(K, KPad - K)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + const auto b_grid_desc_bk0_n_bk1 = transform_tensor_descriptor( + b_grid_desc_n_k, + make_tuple(make_unmerge_transform(make_tuple(BK0, BK1Value)), + make_pass_through_transform(N)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + return b_grid_desc_bk0_n_bk1; + } + else + { + if constexpr(!PermuteB) + { + // not pad N or K + const auto b_grid_desc_bk0_n_bk1 = transform_tensor_descriptor( + b_grid_desc_nraw_kraw, + make_tuple(make_unmerge_transform(make_tuple(BK0, BK1Value)), + make_pass_through_transform(N)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + return b_grid_desc_bk0_n_bk1; + } + else + { + // Weight Tile Permute + constexpr index_t BK01 = KPerBlock / BK1Value; + // const index_t BK00 = BK0 / BK01; + const index_t BK0_ = StrideB / BK1Value; + const index_t BK00 = BK0_ / BK01; + + const auto b_grid_desc_bk00_n_bk01_bk1_permute = + make_naive_tensor_descriptor_packed(make_tuple(BK00, N, BK01, BK1Value)); + + const auto b_grid_desc_bk0_n_bk1_permute = transform_tensor_descriptor( + b_grid_desc_bk00_n_bk01_bk1_permute, + make_tuple(make_merge_transform(make_tuple(BK00, BK01)), + make_pass_through_transform(make_tuple(N)), + make_pass_through_transform(BK1Value)), + make_tuple(Sequence<0, 2>{}, Sequence<1>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); + + return b_grid_desc_bk0_n_bk1_permute; + } + } + } + + template + __host__ __device__ static constexpr auto + MakeAMmaTileDescriptor_M0_M1_M2_K(const ABlockDesc_AK0_M_AK1&) + { + constexpr index_t MWaves = MPerBlock / (MXdlPerWave * MPerXdl); + + return MakeGemmMmaTileDescriptor(ABlockDesc_AK0_M_AK1{}); + } + + template + __host__ __device__ static constexpr auto + MakeBMmaTileDescriptor_N0_N1_N2_K(const BBlockDesc_BK0_N_BK1&) + { + constexpr index_t 
NWaves = NPerBlock / (NXdlPerWave * NPerXdl); + + return MakeGemmMmaTileDescriptor(BBlockDesc_BK0_N_BK1{}); + } + + __host__ __device__ static auto + MakeCGridDescriptor_M_N(index_t M, index_t MPad, index_t N, index_t NPad, index_t StrideC) + { + const auto c_grid_desc_mraw_nraw = [&]() { + if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(StrideC, I1)); + } + else if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(I1, StrideC)); + } + }(); + + // pad M and N + return transform_tensor_descriptor(c_grid_desc_mraw_nraw, + make_tuple(make_right_pad_transform(M, MPad - M), + make_right_pad_transform(N, NPad - N)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); +#if 0 + using GemmSpecialization = tensor_operation::device::GemmSpecialization; + + if constexpr(GemmSpec == GemmSpecialization::MNPadding || + GemmSpec == GemmSpecialization::MNKPadding) + { + // pad M and N + return transform_tensor_descriptor(c_grid_desc_mraw_nraw, + make_tuple(make_right_pad_transform(M, MPad - M), + make_right_pad_transform(N, NPad - N)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + } + else if constexpr(GemmSpec == GemmSpecialization::MPadding || + GemmSpec == GemmSpecialization::MKPadding) + { + // pad M, but not N + return transform_tensor_descriptor( + c_grid_desc_mraw_nraw, + make_tuple(make_right_pad_transform(M, MPad - M), make_pass_through_transform(N)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + } + else if constexpr(GemmSpec == GemmSpecialization::NPadding || + GemmSpec == GemmSpecialization::NKPadding) + { + // pad N, but not M + return transform_tensor_descriptor( + c_grid_desc_mraw_nraw, + make_tuple(make_pass_through_transform(M), make_right_pad_transform(N, NPad - N)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, 
Sequence<1>{})); + } + else + { + // not pad M or N + return c_grid_desc_mraw_nraw; + } +#endif + } + + struct Problem + { + __host__ Problem(index_t M_, + index_t N_, + index_t K_, + index_t StrideA_, + index_t StrideB_, + index_t StrideC_, + index_t StrideScaleB_, + index_t KBatch_) + : M{M_}, + N{N_}, + K{K_}, + StrideA{StrideA_}, + StrideB{StrideB_}, + StrideC{StrideC_}, + StrideScaleB{StrideScaleB_}, + KBatch{KBatch_}, + MPadded{CalculateMPadded(M_)}, + NPadded{CalculateNPadded(N_)}, + KRead{CalculateKRead(K_, KBatch_)}, + KPadded{CalculateKPadded(K_, KBatch_)}, + AK0{CalculateAK0Padded(K_, KBatch_)}, + BK0{CalculateBK0Padded(K_, KBatch_)}, + MBlock{CalculateMBlock(M_)}, + NBlock{CalculateNBlock(N_)} + { + } + + __host__ void Print() const + { + std::cout << "problem {" + << "M:" << M << ", " + << "N:" << N << ", " + << "K:" << K << ", " + << "SA:" << StrideA << ", " + << "SB:" << StrideB << ", " + << "SC:" << StrideC << ", " + << "SScaleB:" << StrideScaleB << ", " + << "MP:" << MPadded << ", " + << "NP:" << NPadded << ", " + << "KRead:" << KRead << ", " + << "KP:" << KPadded << ", " + << "AK0:" << AK0 << ", " + << "BK0:" << BK0 << ", " + << "MBlock: " << MBlock << ", " + << "NBlock: " << NBlock << "}" << std::endl; + } + + index_t M; + index_t N; + index_t K; + index_t StrideA; + index_t StrideB; + index_t StrideC; + index_t StrideScaleB; + index_t KBatch; + index_t MPadded; + index_t NPadded; + index_t KRead; + index_t KPadded; + index_t AK0; + index_t BK0; + index_t MBlock; + index_t NBlock; + }; + + // Argument + struct Argument : public tensor_operation::device::BaseArgument, public Problem + { + __host__ Argument(const ADataType* p_a_grid_, + const BDataType* p_b_grid_, + CDataType* p_c_grid_, + index_t M_, + index_t N_, + index_t K_, + index_t StrideA_, + index_t StrideB_, + index_t StrideC_, + index_t StrideScaleB_, + const BScaleType* p_b_scale_grid_, + index_t k_batch_, + AElementwiseOperation a_element_op_, + BElementwiseOperation b_element_op_, + 
CElementwiseOperation c_element_op_, + bool is_reduce_ = false) + : Problem{M_, N_, K_, StrideA_, StrideB_, StrideC_, StrideScaleB_, k_batch_}, + p_a_grid{p_a_grid_}, + p_b_grid{p_b_grid_}, + p_c_grid{p_c_grid_}, + p_b_scale_grid{p_b_scale_grid_}, + a_element_op{a_element_op_}, + b_element_op{b_element_op_}, + c_element_op{c_element_op_}, + is_reduce(is_reduce_) + { + } + + __host__ __device__ inline bool IsReduceAdd() const + { + return (Problem::KBatch > 1) && is_reduce; + } + + __host__ __device__ inline bool IsAtomicAdd() const + { + return (Problem::KBatch > 1) && (!is_reduce); + } + + const ADataType* p_a_grid; + const BDataType* p_b_grid; + CDataType* p_c_grid; + + const BScaleType* p_b_scale_grid; + const AElementwiseOperation a_element_op; + const BElementwiseOperation b_element_op; + const CElementwiseOperation c_element_op; + bool is_reduce; + }; + + struct SplitKBatchOffset + { + + __device__ SplitKBatchOffset(Argument& karg) + { + if constexpr(is_same_v) + { + a_k_split_offset = blockIdx.z * karg.KRead / APackedSize; + } + else if constexpr(is_same_v) + { + a_k_split_offset = blockIdx.z * karg.KRead * karg.StrideA; + } + + if constexpr(is_same_v) + { + b_k_split_offset = blockIdx.z * karg.KRead * karg.StrideB; + } + else if constexpr(is_same_v) + { + if constexpr(!PermuteB) + { + b_k_split_offset = blockIdx.z * karg.KRead / BPackedSize; + } + else + { + const int k0_offset = karg.KRead * karg.N; + b_k_split_offset = blockIdx.z * k0_offset / BPackedSize; + } + } + + // Calculate B scale offset + if constexpr(is_same_v) + { + scale_k_split_offset = blockIdx.z * (karg.KRead / ScaleBlockK) * karg.StrideB; + } + else if constexpr(is_same_v) + { + scale_k_split_offset = blockIdx.z * (karg.KRead / ScaleBlockK); + } + + if(blockIdx.z < static_cast(karg.KBatch - 1)) + { + karg.K = karg.KRead; + } + else + { + karg.K = karg.K - karg.KRead * (karg.KBatch - 1); + } + + if(karg.IsReduceAdd()) + { + c_reduce_offset = blockIdx.z * karg.M * karg.N; + } + else + { + 
c_reduce_offset = 0; + } + } + + index_t a_k_split_offset; + index_t b_k_split_offset; + index_t scale_k_split_offset; // New member for scale matrix offset + index_t c_reduce_offset; + }; + + __device__ static constexpr auto GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1() + { + // A matrix in LDS memory, dst of blockwise copy + if constexpr(ABlockLdsExtraM || BlkGemmPipelineVer == BlockGemmPipelineVersion::v4) + { + return make_naive_tensor_descriptor( + make_tuple(AK0Number, Number{}, AK1Number), + make_tuple(AK1Number, Number{}, I1)); + } + // xor tensor transformation request more unnecessary vgpr usage, would cause register spill + // in some cases. + else if constexpr(is_same::value) + { + constexpr index_t LdsSize = 32 * 4 / KPerBlock / sizeof(ADataType) / APackedSize; + constexpr auto MLdsLayer = LdsSize < 1 ? 1 : LdsSize; + constexpr auto a_lds_block_desc = make_naive_tensor_descriptor( + make_tuple( + AK0Number * Number{}, Number{}, AK1Number), + make_tuple(AK1Number, Number{}, I1)); + + constexpr auto a_lds_block_desc_permuted = transform_tensor_descriptor( + a_lds_block_desc, + make_tuple(make_xor_with_modulo_transform(make_tuple( + Number{}, Number{})), + make_pass_through_transform(AK1Number)), + make_tuple(Sequence<1, 0>{}, Sequence<2>{}), + make_tuple(Sequence<1, 0>{}, Sequence<2>{})); + + constexpr auto a_lds_block_desc_ak0_mldslayer_m_ak1 = transform_tensor_descriptor( + a_lds_block_desc_permuted, + make_tuple(make_unmerge_transform(make_tuple(AK0Number, Number{})), + make_pass_through_transform(Number{}), + make_pass_through_transform(AK1Number)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{}, Sequence<3>{})); + + constexpr auto a_lds_block_desc_ak0_m_ak1 = transform_tensor_descriptor( + a_lds_block_desc_ak0_mldslayer_m_ak1, + make_tuple(make_pass_through_transform(AK0Number), + make_merge_transform_v3_division_mod( + make_tuple(Number{}, Number{})), + 
make_pass_through_transform(AK1Number)), + make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); + + return a_lds_block_desc_ak0_m_ak1; + } + else // ColumnMajor A + { + // kfold and mpair dimension is not always required. + // more dimension in merge_transform increase the difficulty of generating immarg offset + // for compiler. + constexpr auto M0 = ABlockTransferThreadClusterLengths_AK0_M_AK1{}.At(I1); + constexpr auto M1 = MPerBlock / M0; + + constexpr auto KThreadWrite = ABlockTransferThreadClusterLengths_AK0_M_AK1{}.At(I0); + constexpr auto K0PerThreadWrite = AK0Number / KThreadWrite; + constexpr auto KThreadRead = 64 / MPerXdl; + constexpr auto K0PerThreadRead = AK0Number / KThreadRead; + + constexpr auto kfold = (AK1Number * M0 * sizeof(ADataType) > 128) + ? 1 + : 128 / (AK1Number * M0 * sizeof(ADataType)); + constexpr auto KThreadReadPerm = + (kfold * K0PerThreadWrite / K0PerThreadRead) > 1 + ? KThreadRead / (kfold * K0PerThreadWrite / K0PerThreadRead) + : KThreadRead; + + // 1<=mpair<=n0 + constexpr auto mpair = (AK1Number * MPerXdl * sizeof(ADataType) > 128) + ? 1 + : ((128 / (AK1Number * MPerXdl * sizeof(ADataType))) > M0 + ? 
M0 + : 128 / (AK1Number * MPerXdl * sizeof(ADataType))); + + constexpr auto a_lds_block_desc = make_naive_tensor_descriptor_packed( + make_tuple(Number{}, + Number{}, + Number{}, + Number{}, + Number{}, + AK1Number)); + + constexpr auto a_lds_block_desc_permuted = transform_tensor_descriptor( + a_lds_block_desc, + make_tuple( + make_pass_through_transform(Number{}), + make_pass_through_transform(Number{}), + make_xor_with_modulo_transform( + make_tuple(Number{}, Number{})), + make_pass_through_transform(Number{}), + make_pass_through_transform(AK1Number)), + make_tuple( + Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4>{}, Sequence<5>{}), + make_tuple( + Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4>{}, Sequence<5>{})); + + constexpr auto a_lds_block_desc_unmerged = transform_tensor_descriptor( + a_lds_block_desc_permuted, + make_tuple( + make_pass_through_transform(Number{}), + make_pass_through_transform(Number{}), + make_unmerge_transform(make_tuple(Number{}, Number{})), + make_unmerge_transform(make_tuple(Number{}, Number{})), + make_pass_through_transform(Number{}), + make_pass_through_transform(AK1Number)), + make_tuple(Sequence<0>{}, + Sequence<1>{}, + Sequence<2>{}, + Sequence<3>{}, + Sequence<4>{}, + Sequence<5>{}), + make_tuple(Sequence<1>{}, + Sequence<2>{}, + Sequence<0, 3>{}, + Sequence<4, 5>{}, + Sequence<6>{}, + Sequence<7>{})); + + constexpr auto a_lds_block_desc_ak0_m_ak1 = transform_tensor_descriptor( + a_lds_block_desc_unmerged, + make_tuple(make_merge_transform_v3_division_mod( + make_tuple(Number{}, + Number{}, + Number{}, + Number{})), + make_merge_transform_v3_division_mod( + make_tuple(Number{}, Number{}, Number{})), + make_pass_through_transform(AK1Number)), + make_tuple(Sequence<0, 1, 4, 2>{}, Sequence<5, 6, 3>{}, Sequence<7>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); + + return a_lds_block_desc_ak0_m_ak1; + } + } + + __device__ static constexpr auto 
GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1() + { + // B matrix in LDS memory, dst of blockwise copy + if constexpr(BBlockLdsExtraN || BlkGemmPipelineVer == BlockGemmPipelineVersion::v4) + { + return make_naive_tensor_descriptor( + make_tuple(BK0Number, Number{}, BK1Number), + make_tuple(BK1Number, Number{}, I1)); + } + else if constexpr(is_same::value) + { + // NLdsLayer * K0 as logical Bank + constexpr index_t LdsSize = 32 * 4 / KPerBlock / sizeof(BDataType) / BPackedSize; + constexpr index_t NLdsLayer = LdsSize < 1 ? 1 : LdsSize; + constexpr auto b_lds_block_desc = make_naive_tensor_descriptor( + make_tuple( + BK0Number * Number{}, Number{}, BK1Number), + make_tuple(BK1Number, Number{}, I1)); + + constexpr auto b_lds_block_desc_permuted = transform_tensor_descriptor( + b_lds_block_desc, + make_tuple(make_xor_with_modulo_transform(make_tuple( + Number{}, Number{})), + make_pass_through_transform(BK1Number)), + make_tuple(Sequence<1, 0>{}, Sequence<2>{}), + make_tuple(Sequence<1, 0>{}, Sequence<2>{})); + + constexpr auto b_lds_block_desc_bk0_nldslayer_n_bk1 = transform_tensor_descriptor( + b_lds_block_desc_permuted, + make_tuple(make_unmerge_transform(make_tuple(BK0Number, Number{})), + make_pass_through_transform(Number{}), + make_pass_through_transform(BK1Number)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{}, Sequence<3>{})); + + constexpr auto b_lds_block_desc_bk0_n_bk1 = transform_tensor_descriptor( + b_lds_block_desc_bk0_nldslayer_n_bk1, + make_tuple(make_pass_through_transform(BK0Number), + make_merge_transform_v3_division_mod( + make_tuple(Number{}, Number{})), + make_pass_through_transform(BK1Number)), + make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); + + return b_lds_block_desc_bk0_n_bk1; + } + else // RowMajor B + { + constexpr auto N0 = BBlockTransferThreadClusterLengths_BK0_N_BK1{}.At(I1); + constexpr auto N1 = NPerBlock 
/ N0; + + constexpr auto KThreadWrite = BBlockTransferThreadClusterLengths_BK0_N_BK1{}.At(I0); + constexpr auto K0PerThreadWrite = BK0Number / KThreadWrite; + constexpr auto KThreadRead = 64 / NPerXdl; + constexpr auto K0PerThreadRead = BK0Number / KThreadRead; + + constexpr auto kfold = (BK1Number * N0 * sizeof(BDataType) > 128) + ? 1 + : 128 / (BK1Number * N0 * sizeof(BDataType)); + constexpr auto KThreadReadPerm = + (kfold * K0PerThreadWrite / K0PerThreadRead) > 1 + ? KThreadRead / (kfold * K0PerThreadWrite / K0PerThreadRead) + : KThreadRead; + + // 1<=npair<=n0 + constexpr auto npair = (BK1Number * NPerXdl * sizeof(BDataType) > 128) + ? 1 + : ((128 / (BK1Number * NPerXdl * sizeof(BDataType))) > N0 + ? N0 + : 128 / (BK1Number * NPerXdl * sizeof(BDataType))); + + constexpr auto b_lds_block_desc = make_naive_tensor_descriptor_packed( + make_tuple(Number{}, + Number{}, + Number{}, + Number{}, + Number{}, + BK1Number)); + + constexpr auto b_lds_block_desc_permuted = transform_tensor_descriptor( + b_lds_block_desc, + make_tuple( + make_pass_through_transform(Number{}), + make_pass_through_transform(Number{}), + make_xor_with_modulo_transform( + make_tuple(Number{}, Number{})), + make_pass_through_transform(Number{}), + make_pass_through_transform(BK1Number)), + make_tuple( + Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4>{}, Sequence<5>{}), + make_tuple( + Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4>{}, Sequence<5>{})); + + constexpr auto b_lds_block_desc_unmerged = transform_tensor_descriptor( + b_lds_block_desc_permuted, + make_tuple( + make_pass_through_transform(Number{}), + make_pass_through_transform(Number{}), + make_unmerge_transform(make_tuple(Number{}, Number{})), + make_unmerge_transform(make_tuple(Number{}, Number{})), + make_pass_through_transform(Number{}), + make_pass_through_transform(BK1Number)), + make_tuple(Sequence<0>{}, + Sequence<1>{}, + Sequence<2>{}, + Sequence<3>{}, + Sequence<4>{}, + Sequence<5>{}), + 
make_tuple(Sequence<1>{}, + Sequence<2>{}, + Sequence<0, 3>{}, + Sequence<4, 5>{}, + Sequence<6>{}, + Sequence<7>{})); + + constexpr auto b_lds_block_desc_bk0_n_bk1 = transform_tensor_descriptor( + b_lds_block_desc_unmerged, + make_tuple(make_merge_transform_v3_division_mod( + make_tuple(Number{}, + Number{}, + Number{}, + Number{})), + make_merge_transform_v3_division_mod( + make_tuple(Number{}, Number{}, Number{})), + make_pass_through_transform(BK1Number)), + make_tuple(Sequence<0, 1, 4, 2>{}, Sequence<5, 6, 3>{}, Sequence<7>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); + + return b_lds_block_desc_bk0_n_bk1; + } + } + + __device__ static constexpr auto GetCShuffleBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock() + { + constexpr index_t MWave = MPerBlock / (MXdlPerWave * MPerXdl); + constexpr index_t NWave = NPerBlock / (NXdlPerWave * NPerXdl); + + constexpr auto c_shuffle_block_desc_mblock_mperblock_nblock_nperblock = + make_naive_tensor_descriptor_packed( + make_tuple(I1, + Number{}, + I1, + Number{})); + + return c_shuffle_block_desc_mblock_mperblock_nblock_nperblock; + } + + using BlockwiseGemmPipe = + remove_cvref_t())>; + + __device__ static constexpr index_t GetSharedMemoryNumberOfByte() + { + // LDS allocation for A and B: be careful of alignment + constexpr auto a_block_desc_ak0_m_ak1 = GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1(); + constexpr auto b_block_desc_bk0_n_bk1 = GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1(); + + // lds max alignment + constexpr auto max_lds_align = math::lcm(AK1Number, BK1Number); + + constexpr auto a_block_space_size_aligned = math::integer_least_multiple( + a_block_desc_ak0_m_ak1.GetElementSpaceSize(), max_lds_align); + + constexpr auto b_block_space_size_aligned = math::integer_least_multiple( + b_block_desc_bk0_n_bk1.GetElementSpaceSize(), max_lds_align); + + // LDS allocation for C shuffle in LDS + constexpr auto c_shuffle_block_desc_mblock_mperblock_nblock_nperblock = + 
GetCShuffleBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(); + + constexpr auto c_block_size = + c_shuffle_block_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize(); + + return math::max((a_block_space_size_aligned * sizeof(ADataType) / APackedSize + + b_block_space_size_aligned * sizeof(BDataType) / BPackedSize), + c_block_size * sizeof(CShuffleDataType)); + } + + // block_id to matrix tile idx (m0, n0) mapping are controlled by {M01, N01} + __host__ static constexpr bool CheckValidity(const Argument& karg) + { + static_assert((MPerBlock % (MPerXdl * MXdlPerWave) == 0) && + (NPerBlock % (NXdlPerWave * NPerXdl)) == 0, + "Invalid tuning param!"); + + if constexpr(!(GemmSpec == tensor_operation::device::GemmSpecialization::MPadding || + GemmSpec == tensor_operation::device::GemmSpecialization::MNPadding || + GemmSpec == tensor_operation::device::GemmSpecialization::MKPadding || + GemmSpec == tensor_operation::device::GemmSpecialization::MNKPadding) && + !(is_same::value)) + { + if(!(karg.M % MPerBlock == 0)) + { + if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING))) + { + std::cout << "Arg M value is not a multiple of MPerBlock! M: " << karg.M << " " + << __FILE__ << ":" << __LINE__ << ", in function: " << __func__ + << std::endl; + } + return false; + } + } + + if constexpr(!(GemmSpec == tensor_operation::device::GemmSpecialization::NPadding || + GemmSpec == tensor_operation::device::GemmSpecialization::MNPadding || + GemmSpec == tensor_operation::device::GemmSpecialization::NKPadding || + GemmSpec == tensor_operation::device::GemmSpecialization::MNKPadding) && + (is_same::value)) + { + if(!(karg.N % NPerBlock == 0)) + { + if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING))) + { + std::cout << "Arg N value is not a multiple of NPerBlock! 
N: " << karg.N << " " + << __FILE__ << ":" << __LINE__ << ", in function: " << __func__ + << std::endl; + } + return false; + } + } + + if constexpr(!(GemmSpec == tensor_operation::device::GemmSpecialization::KPadding || + GemmSpec == tensor_operation::device::GemmSpecialization::MKPadding || + GemmSpec == tensor_operation::device::GemmSpecialization::NKPadding || + GemmSpec == tensor_operation::device::GemmSpecialization::MNKPadding)) + { + + auto K_t = karg.KBatch * KPerBlock; + if(!(karg.K % K_t == 0)) + { + if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING))) + { + std::cout << "Arg K value is not a multiple of K_Batch * K0PerBlock * K1! K: " + << karg.K << " " << __FILE__ << ":" << __LINE__ + << ", in function: " << __func__ << std::endl; + } + return false; + } + } + else + { + constexpr auto KReadVec = math::lcm(AK1Number, BK1Number); + auto K_t = karg.KBatch * KReadVec; + auto KReadPadSplited = math::integer_divide_ceil(karg.K, K_t) * KReadVec; + if((KReadPadSplited * (karg.KBatch - 1)) >= karg.K) + { + return false; + } + } + + if constexpr(is_same::value) + { + if(karg.K % ABlockTransferSrcScalarPerVector != 0) + { + if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING))) + { + std::cout << "Arg K (" << karg.K + << ") value is not a multiple of ABlockTransferSrcScalarPerVector (" + << ABlockTransferSrcScalarPerVector << " )! " << __FILE__ << ":" + << __LINE__ << ", in function: " << __func__ << std::endl; + } + return false; + } + } + else + { + if(karg.M % ABlockTransferSrcScalarPerVector != 0) + { + if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING))) + { + std::cout << "Arg M (" << karg.M + << ") value is not a multiple of ABlockTransferSrcScalarPerVector (" + << ABlockTransferSrcScalarPerVector << " )! 
" << __FILE__ << ":" + << __LINE__ << ", in function: " << __func__ << std::endl; + } + return false; + } + } + + if constexpr(is_same::value) + { + if(karg.N % BBlockTransferSrcScalarPerVector != 0) + { + if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING))) + { + std::cout << "Arg N (" << karg.N + << ") value is not a multiple of BBlockTransferSrcScalarPerVector (" + << BBlockTransferSrcScalarPerVector << " )! " << __FILE__ << ":" + << __LINE__ << ", in function: " << __func__ << std::endl; + } + return false; + } + } + else + { + if(karg.K % BBlockTransferSrcScalarPerVector != 0) + { + if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING))) + { + std::cout << "Arg K (" << karg.K + << ") value is not a multiple of BBlockTransferSrcScalarPerVector (" + << BBlockTransferSrcScalarPerVector << " )! " << __FILE__ << ":" + << __LINE__ << ", in function: " << __func__ << std::endl; + } + return false; + } + } + + if constexpr(is_same::value) + { + if(karg.N % CShuffleBlockTransferScalarPerVector_NPerBlock != 0) + { + if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING))) + { + std::cout << "Arg N (" << karg.N + << ") value is not a multiple of " + "CShuffleBlockTransferScalarPerVector_NPerBlock (" + << CShuffleBlockTransferScalarPerVector_NPerBlock << " )! " + << __FILE__ << ":" << __LINE__ << ", in function: " << __func__ + << std::endl; + } + return false; + } + } + else + { + if(karg.M % CShuffleBlockTransferScalarPerVector_NPerBlock != 0) + { + if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING))) + { + std::cout << "Arg M (" << karg.M + << ") value is not a multiple of " + "CShuffleBlockTransferScalarPerVector_NPerBlock (" + << CShuffleBlockTransferScalarPerVector_NPerBlock << " )! 
" + << __FILE__ << ":" << __LINE__ << ", in function: " << __func__ + << std::endl; + } + return false; + } + } + + if constexpr(!(is_same, half_t>::value || + is_same, float>::value || + is_same, bhalf_t>::value || + is_same, int32_t>::value)) + { + if(!karg.IsReduceAdd()) + { + if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING))) + { + std::cout << " KBatch: " << karg.KBatch << " > 1 is not support yet" << __FILE__ + << ":" << __LINE__ << ", in function: " << __func__ << std::endl; + } + if(karg.KBatch > 1) + { + return false; + } + } + } + + // check gridwise gemm pipeline + const auto num_k_loop = karg.AK0 / (KPerBlock / AK1Value); + + if constexpr(BlkGemmPipelineVer != BlockGemmPipelineVersion::v1) + { + if(num_k_loop <= BlockwiseGemmPipe::PrefetchStages) + { + return false; + } + } + + // TODO: also check validity of all components (blockwise-copy, threadwise-copy, etc) + return true; + } + + __host__ static constexpr bool CalculateHasMainKBlockLoop(index_t K) + { + const index_t num_loop = K / KPerBlock; + + return BlockwiseGemmPipe::BlockHasHotloop(num_loop); + } + + __host__ static constexpr TailNumber CalculateKBlockLoopTailNum(index_t K) + { + const index_t num_loop = K / KPerBlock; + + return BlockwiseGemmPipe::BlockLoopTailNum(num_loop); + } + + template + __host__ __device__ static constexpr auto MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( + const CGridDesc& c_grid_desc_m_n, index_t MBlock, index_t NBlock) + { + const auto c_grid_desc_mblock_mperblock_nblock_nperblock = transform_tensor_descriptor( + c_grid_desc_m_n, + make_tuple(make_unmerge_transform(make_tuple(MBlock, Number{})), + make_unmerge_transform(make_tuple(NBlock, Number{}))), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 1>{}, Sequence<2, 3>{})); + + return c_grid_desc_mblock_mperblock_nblock_nperblock; + } + + // return block_id to C matrix tile idx (m0, n0) mapping + // if arch = gfx942 + using Block2CTileMap = BlockToCTileMap_Grouped_M00_N0_M01Adapt<8, MPerBlock, 
NPerBlock>; + // using Block2CTileMap = BlockToCTileMap_3DGrid_KSplit; + + template + __device__ static void Run(const ADataType* p_a_grid, + const BDataType* p_b_grid, + CDataType* p_c_grid, + const BScaleType* p_b_scale_grid, + void* p_shared, + const Problem& problem, + const AGridDesc_AK0_M_K1& a_grid_desc_ak0_m_ak1, + const BGridDesc_BK0_N_K1& b_grid_desc_bk0_n_bk1, + const BScaleGridDesc_BN_AK& b_scale_grid_desc_bn_ak, + const CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock& + c_grid_desc_mblock_mperblock_nblock_nperblock) + { + const auto a_grid_buf = make_dynamic_buffer( + p_a_grid, a_grid_desc_ak0_m_ak1.GetElementSpaceSize()); + const auto b_grid_buf = make_dynamic_buffer( + p_b_grid, b_grid_desc_bk0_n_bk1.GetElementSpaceSize()); + auto c_grid_buf = make_dynamic_buffer( + p_c_grid, c_grid_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize()); + + // B Scale buffer + const auto b_scale_grid_buf = make_dynamic_buffer( + p_b_scale_grid, b_scale_grid_desc_bn_ak.GetElementSpaceSize()); + + const AElementwiseOperation a_element_op{}; + const BElementwiseOperation b_element_op{}; + const CElementwiseOperation c_element_op{}; + + // divide block work by [M, N] + const auto block_2_ctile_map = Block2CTileMap{problem.M, problem.N, 4}; + + const auto block_work_idx = + block_2_ctile_map.CalculateBottomIndex(make_multi_index(get_block_1d_id())); + + if(!block_2_ctile_map.ValidCTileIndex( + block_work_idx, + make_tuple(c_grid_desc_mblock_mperblock_nblock_nperblock.GetLength(I0), + c_grid_desc_mblock_mperblock_nblock_nperblock.GetLength(I2)))) + { + return; + } + + const index_t block_m_id = __builtin_amdgcn_readfirstlane(block_work_idx[I0]); + const index_t block_n_id = __builtin_amdgcn_readfirstlane(block_work_idx[I1]); + + // HACK: this force m/n_block_data_idx_on_grid into SGPR + const index_t m_block_data_idx_on_grid = + __builtin_amdgcn_readfirstlane(block_m_id * MPerBlock); + + const index_t n_block_data_idx_on_grid = + 
__builtin_amdgcn_readfirstlane(block_n_id * NPerBlock); + + // lds max alignment + constexpr auto max_lds_align = math::lcm(AK1Number, BK1Number); + + // A matrix in LDS memory, dst of blockwise copy + constexpr auto a_block_desc_ak0_m_ak1 = GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1(); + + // B matrix in LDS memory, dst of blockwise copy + constexpr auto b_block_desc_bk0_n_bk1 = GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1(); + + // A matrix blockwise copy + auto a_blockwise_copy = + ThreadGroupTensorSliceTransfer_v4r1, + ABlockTransferThreadClusterLengths_AK0_M_AK1, + ABlockTransferThreadClusterArrangeOrder, + ADataType, + ADataType, + decltype(a_grid_desc_ak0_m_ak1), + decltype(a_block_desc_ak0_m_ak1), + ABlockTransferSrcAccessOrder, + Sequence<0, 1, 2>, + ABlockTransferSrcVectorDim, + 2, + ABlockTransferSrcScalarPerVector, + ABlockTransferDstScalarPerVector_AK1, + 1, + 1, + AThreadTransferSrcResetCoordinateAfterRun, + true, + BlockwiseGemmPipe::GlobalBufferNum>( + a_grid_desc_ak0_m_ak1, + make_multi_index(0, m_block_data_idx_on_grid, 0), + a_element_op, + a_block_desc_ak0_m_ak1, + make_multi_index(0, 0, 0), + ck::tensor_operation::element_wise::PassThrough{}); + + // B matrix blockwise copy + auto b_blockwise_copy = + ThreadGroupTensorSliceTransfer_v4r1, + BBlockTransferThreadClusterLengths_BK0_N_BK1, + BBlockTransferThreadClusterArrangeOrder, + BDataType, + BDataType, + decltype(b_grid_desc_bk0_n_bk1), + decltype(b_block_desc_bk0_n_bk1), + BBlockTransferSrcAccessOrder, + Sequence<0, 1, 2>, + BBlockTransferSrcVectorDim, + 2, + BBlockTransferSrcScalarPerVector, + BBlockTransferDstScalarPerVector_BK1, + 1, + 1, + BThreadTransferSrcResetCoordinateAfterRun, + true, + BlockwiseGemmPipe::GlobalBufferNum>( + b_grid_desc_bk0_n_bk1, + make_multi_index(0, n_block_data_idx_on_grid, 0), + b_element_op, + b_block_desc_bk0_n_bk1, + make_multi_index(0, 0, 0), + ck::tensor_operation::element_wise::PassThrough{}); + + // LDS allocation for A and B: be careful of alignment + 
constexpr auto a_block_space_size_aligned = math::integer_least_multiple( + a_block_desc_ak0_m_ak1.GetElementSpaceSize(), max_lds_align); + + // Cast after lds + auto a_block_buf = make_dynamic_buffer( + static_cast(p_shared), a_block_desc_ak0_m_ak1.GetElementSpaceSize()); + + auto b_block_buf = make_dynamic_buffer( + reinterpret_cast(static_cast(p_shared) + a_block_space_size_aligned * + sizeof(ADataType) / + APackedSize), + b_block_desc_bk0_n_bk1.GetElementSpaceSize()); + + constexpr auto a_block_slice_copy_step = make_multi_index(KPerBlock / AK1Number, 0, 0); + constexpr auto b_block_slice_copy_step = make_multi_index(KPerBlock / BK1Number, 0, 0); + + // Blockwise GEMM pipeline + static_assert(std::is_default_constructible_v); + auto blockwise_gemm_pipeline = BlockwiseGemmPipe{}; + auto c_thread_buf = blockwise_gemm_pipeline.GetCThreadBuffer(); + + const index_t num_k_block_main_loop = __builtin_amdgcn_readfirstlane( + (a_grid_desc_ak0_m_ak1.GetLength(I0) * a_grid_desc_ak0_m_ak1.GetLength(I2)) / + KPerBlock); + + // b scale + // static_assert(KPerBlock <= ScaleBlockK); + static constexpr auto mfma = MfmaSelector{}; + static constexpr auto KPerXdlops = mfma.GetKPerXdlops(); + static constexpr auto K1PerXdlops = mfma.GetK1PerXdlops(); + static constexpr auto K0PerXdlops = KPerXdlops / K1PerXdlops; + static constexpr auto KPerThread = KPerBlock / K0PerXdlops; + + static constexpr auto ScaleSliceSizeN = NXdlPerWave; + static constexpr auto ScaleSliceSizeK = (KPerThread + ScaleBlockK - 1) / ScaleBlockK; + static constexpr auto KBlockScaleSliceSizeK = (KPerBlock + ScaleBlockK - 1) / ScaleBlockK; + + constexpr auto b_scale_thread_desc = make_naive_tensor_descriptor_packed( + make_tuple(Number{}, Number{})); + + constexpr index_t NWaves = NPerBlock / (NXdlPerWave * NPerXdl); + + auto b_thread_offset_n = + get_thread_local_1d_id() % NPerXdl + (get_thread_local_1d_id() / 64) % NWaves * NPerXdl; + auto b_thread_offset_k = (get_thread_local_1d_id() % 64) / NPerXdl * 
KPerThread; + + auto b_scale_thread_copy = + ThreadwiseTensorSliceTransfer_v2, + Sequence<0, 1>, + 1, + ScaleSliceSizeK, + 1, + false>( + b_scale_grid_desc_bn_ak, + make_multi_index(block_n_id * NPerBlock / ScaleBlockN + b_thread_offset_n, + b_thread_offset_k / ScaleBlockK)); + + constexpr auto b_scale_thread_slice_copy_step = + make_tuple(make_multi_index(NWaves * NPerXdl, 0), + make_multi_index(-NPerBlock, 0), + make_multi_index(-NPerBlock, KBlockScaleSliceSizeK)); + + const index_t num_k_block_per_scale = (ScaleBlockK + KPerBlock - 1) / KPerBlock; + + blockwise_gemm_pipeline.template Run( + a_grid_desc_ak0_m_ak1, + a_block_desc_ak0_m_ak1, + a_blockwise_copy, + a_grid_buf, + a_block_buf, + a_block_slice_copy_step, + b_grid_desc_bk0_n_bk1, + b_block_desc_bk0_n_bk1, + b_blockwise_copy, + b_grid_buf, + b_block_buf, + b_block_slice_copy_step, + c_thread_buf, + b_scale_grid_desc_bn_ak, + b_scale_thread_desc, + b_scale_thread_copy, + b_scale_grid_buf, + b_scale_thread_slice_copy_step, + num_k_block_main_loop, + num_k_block_per_scale); + + // shuffle C and write out + { + static_assert(MXdlPerWave % CShuffleMXdlPerWavePerShuffle == 0 && + NXdlPerWave % CShuffleNXdlPerWavePerShuffle == 0, + "wrong!"); + + constexpr index_t MWave = MPerBlock / (MXdlPerWave * MPerXdl); + constexpr index_t NWave = NPerBlock / (NXdlPerWave * NPerXdl); + + // TODO: hacky, fix it! + constexpr auto c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2 = + blockwise_gemm_pipeline.GetCThreadDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(); + + // TODO: hacky, fix it! 
+ // c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp is only used to get lengths + constexpr auto c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp = + blockwise_gemm_pipeline.GetCBlockDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(); + + constexpr auto M0 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I0); + constexpr auto N0 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I1); + constexpr auto M1 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I2); + constexpr auto N1 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I3); + constexpr auto M2 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I4); + constexpr auto M3 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I5); + constexpr auto M4 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I6); + constexpr auto N2 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I7); + + constexpr auto c_shuffle_block_desc_mblock_mperblock_nblock_nperblock = + GetCShuffleBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(); + + auto c_shuffle_block_buf = make_dynamic_buffer( + static_cast(p_shared), + c_shuffle_block_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize()); + + constexpr auto c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2 = transform_tensor_descriptor( + c_shuffle_block_desc_mblock_mperblock_nblock_nperblock, + make_tuple( + make_freeze_transform(I0), + make_unmerge_transform(make_tuple( + Number{}, // M0 (MXdlPerWave) per shuffle + M1, // M1 = MWave + M2, // M2 * M3 * M4 = MPerXdl + M3, + M4)), + make_freeze_transform(I0), + make_unmerge_transform(make_tuple( + Number{}, // N0 (NXdlPerWave) per shuffle + N1, // N1 = NWave + N2))), // N2 = NPerXdl + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple( + Sequence<>{}, Sequence<0, 2, 4, 5, 6>{}, Sequence<>{}, Sequence<1, 3, 7>{})); + + // calculate origin of thread output tensor on global memory + // blockwise GEMM c matrix starting index + const auto c_thread_mtx_on_block = + blockwise_gemm_pipeline.CalculateCThreadOriginDataIndex(I0, 
I0, I0, I0); + + const index_t m_thread_data_on_block = c_thread_mtx_on_block[I0]; + const index_t n_thread_data_on_block = c_thread_mtx_on_block[I1]; + + const auto m_thread_data_on_block_to_m0_m1_m2_m3_m4_adaptor = + make_single_stage_tensor_adaptor( + make_tuple(make_merge_transform(make_tuple(M0, M1, M2, M3, M4))), + make_tuple(Sequence<0, 1, 2, 3, 4>{}), + make_tuple(Sequence<0>{})); + + const auto m_thread_data_on_block_idx = + m_thread_data_on_block_to_m0_m1_m2_m3_m4_adaptor.CalculateBottomIndex( + make_multi_index(m_thread_data_on_block)); + + const auto n_thread_data_on_block_to_n0_n1_n2_adaptor = + make_single_stage_tensor_adaptor( + make_tuple(make_merge_transform(make_tuple(N0, N1, N2))), + make_tuple(Sequence<0, 1, 2>{}), + make_tuple(Sequence<0>{})); + + const auto n_thread_data_on_block_idx = + n_thread_data_on_block_to_n0_n1_n2_adaptor.CalculateBottomIndex( + make_multi_index(n_thread_data_on_block)); + + // shuffle: threadwise copy C from VGPR to LDS + auto c_thread_copy_vgpr_to_lds = + ThreadwiseTensorSliceTransfer_v1r3, + Sequence<0, 1, 2, 3, 4, 5, 6, 7>, + 7, + 1, + InMemoryDataOperationEnum::Set, + 1, + true>{ + c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2, + make_multi_index(0, + 0, + m_thread_data_on_block_idx[I1], + n_thread_data_on_block_idx[I1], + m_thread_data_on_block_idx[I2], + m_thread_data_on_block_idx[I3], + m_thread_data_on_block_idx[I4], + n_thread_data_on_block_idx[I2]), + ck::tensor_operation::element_wise::PassThrough{}}; + + // shuffle: blockwise copy C from LDS to global + auto c_shuffle_block_copy_lds_to_global = ThreadGroupTensorSliceTransfer_v6r1< + ThisThreadBlock, // ThreadGroup + CElementwiseOperation, // ElementwiseOperation, + CGlobalMemoryDataOperation, // DstInMemOp, + Sequence<1, + CShuffleMXdlPerWavePerShuffle * MWave * MPerXdl, + 1, + CShuffleNXdlPerWavePerShuffle * NWave * NPerXdl>, // BlockSliceLengths, + CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, + Sequence<0, 1, 2, 3>, // typename 
ThreadClusterArrangeOrder, + CShuffleDataType, // typename SrcData, + CDataType, // typename DstData, + decltype(c_shuffle_block_desc_mblock_mperblock_nblock_nperblock), + decltype(c_grid_desc_mblock_mperblock_nblock_nperblock), + Sequence<0, 1, 2, 3>, // typename DimAccessOrder, + 3, // index_t VectorDim, + CShuffleBlockTransferScalarPerVector_NPerBlock, // index_t ScalarPerVector, + true, // bool ThreadTransferSrcResetCoordinateAfterRun, + false> // bool ThreadTransferDstResetCoordinateAfterRun> + {c_shuffle_block_desc_mblock_mperblock_nblock_nperblock, + make_multi_index(0, 0, 0, 0), + c_grid_desc_mblock_mperblock_nblock_nperblock, + make_multi_index(block_m_id, 0, block_n_id, 0), + c_element_op}; + + // space filling curve for threadwise C in VGPR + constexpr auto sfc_c_vgpr = + SpaceFillingCurve, + Sequence<0, 1, 2, 3, 4, 5, 6, 7>, + Sequence>{}; + + // space filling curve for shuffled blockwise C in global mem + constexpr auto sfc_c_global = + SpaceFillingCurve, + Sequence<0, 2, 1, 3>, + Sequence<1, + CShuffleMXdlPerWavePerShuffle * MWave * MPerXdl, + 1, + CShuffleNXdlPerWavePerShuffle * NWave * NPerXdl>>{}; + + constexpr index_t num_access = sfc_c_vgpr.GetNumOfAccess(); + + static_assert(num_access == sfc_c_global.GetNumOfAccess(), "wrong!"); + + static_for<0, num_access, 1>{}([&](auto access_id) { + // make sure it's safe to write to LDS + block_sync_lds(); + + // each thread write its data from VGPR to LDS + c_thread_copy_vgpr_to_lds.Run(c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2, + sfc_c_vgpr.GetIndexTupleOfNumber(access_id), + c_thread_buf, + c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2, + c_shuffle_block_buf); + + // make sure it's safe to read from LDS + block_sync_lds(); + + // each block copy its data from LDS to global + c_shuffle_block_copy_lds_to_global.Run( + c_shuffle_block_desc_mblock_mperblock_nblock_nperblock, + c_shuffle_block_buf, + c_grid_desc_mblock_mperblock_nblock_nperblock, + c_grid_buf); + + if constexpr(access_id < num_access - 1) + { + constexpr 
auto c_global_step = sfc_c_global.GetForwardStep(access_id); + + // move on C + c_shuffle_block_copy_lds_to_global.MoveDstSliceWindow( + c_grid_desc_mblock_mperblock_nblock_nperblock, c_global_step); + } + }); + } + } + + template + __device__ static void Run(const ADataType* p_a_grid, + const BDataType* p_b_grid, + CDataType* p_c_grid, + const BScaleType* p_b_scale_grid, + void* p_shared, + const Problem& problem) + { + const auto a_grid_desc_ak0_m_ak1 = MakeAGridDescriptor_AK0_M_AK1( + problem.M, problem.MPadded, problem.K, problem.KPadded, problem.StrideA, problem.AK0); + const auto b_grid_desc_bk0_n_bk1 = MakeBGridDescriptor_BK0_N_BK1( + problem.K, problem.KPadded, problem.N, problem.NPadded, problem.StrideB, problem.BK0); + const auto c_grid_desc_m_n = MakeCGridDescriptor_M_N( + problem.M, problem.MPadded, problem.N, problem.NPadded, problem.StrideC); + const auto c_grid_desc_mblock_mperblock_nblock_nperblock = + MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( + c_grid_desc_m_n, problem.MBlock, problem.NBlock); + + // B Scale grid + const auto b_scale_grid_desc_bn_ak = make_naive_tensor_descriptor( + make_tuple(math::integer_divide_ceil(problem.N, ScaleBlockN), + math::integer_divide_ceil(problem.K, ScaleBlockK)), + make_tuple(problem.StrideScaleB, 1)); + + Run(p_a_grid, + p_b_grid, + p_c_grid, + p_b_scale_grid, + p_shared, + problem, + a_grid_desc_ak0_m_ak1, + b_grid_desc_bk0_n_bk1, + b_scale_grid_desc_bn_ak, + c_grid_desc_mblock_mperblock_nblock_nperblock); + } + + template + __device__ static void Run_2Lds(const ADataType* p_a_grid, + const BDataType* p_b_grid, + CDataType* p_c_grid, + const BScaleType* p_b_scale_grid, + void* p_shared_0, + void* p_shared_1, + const Problem& problem, + const AGridDesc_AK0_M_K1& a_grid_desc_ak0_m_ak1, + const BGridDesc_BK0_N_K1& b_grid_desc_bk0_n_bk1, + const BScaleGridDesc_BN_AK& b_scale_grid_desc_bn_ak, + const CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock& + c_grid_desc_mblock_mperblock_nblock_nperblock) + { + const 
auto a_grid_buf = make_dynamic_buffer( + p_a_grid, a_grid_desc_ak0_m_ak1.GetElementSpaceSize()); + const auto b_grid_buf = make_dynamic_buffer( + p_b_grid, b_grid_desc_bk0_n_bk1.GetElementSpaceSize()); + auto c_grid_buf = make_dynamic_buffer( + p_c_grid, c_grid_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize()); + + // B Scale buffer + const auto b_scale_grid_buf = make_dynamic_buffer( + p_b_scale_grid, b_scale_grid_desc_bn_ak.GetElementSpaceSize()); + + const AElementwiseOperation a_element_op{}; + const BElementwiseOperation b_element_op{}; + const CElementwiseOperation c_element_op{}; + + // divide block work by [M, N] + const auto block_2_ctile_map = Block2CTileMap{problem.M, problem.N, 4}; + + const auto block_work_idx = + block_2_ctile_map.CalculateBottomIndex(make_multi_index(get_block_1d_id())); + + if(!block_2_ctile_map.ValidCTileIndex( + block_work_idx, + make_tuple(c_grid_desc_mblock_mperblock_nblock_nperblock.GetLength(I0), + c_grid_desc_mblock_mperblock_nblock_nperblock.GetLength(I2)))) + { + return; + } + + const index_t block_m_id = __builtin_amdgcn_readfirstlane(block_work_idx[I0]); + const index_t block_n_id = __builtin_amdgcn_readfirstlane(block_work_idx[I1]); + + // HACK: this force m/n_block_data_idx_on_grid into SGPR + const index_t m_block_data_idx_on_grid = + __builtin_amdgcn_readfirstlane(block_m_id * MPerBlock); + + const index_t n_block_data_idx_on_grid = + __builtin_amdgcn_readfirstlane(block_n_id * NPerBlock); + + // lds max alignment + constexpr auto max_lds_align = math::lcm(AK1Number, BK1Number); + + // A matrix in LDS memory, dst of blockwise copy + constexpr auto a_block_desc_ak0_m_ak1 = GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1(); + + // B matrix in LDS memory, dst of blockwise copy + constexpr auto b_block_desc_bk0_n_bk1 = GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1(); + + // A matrix blockwise copy + auto a_blockwise_copy = + ThreadGroupTensorSliceTransfer_v4r1, + ABlockTransferThreadClusterLengths_AK0_M_AK1, + 
ABlockTransferThreadClusterArrangeOrder, + ADataType, + ADataType, + decltype(a_grid_desc_ak0_m_ak1), + decltype(a_block_desc_ak0_m_ak1), + ABlockTransferSrcAccessOrder, + Sequence<0, 1, 2>, + ABlockTransferSrcVectorDim, + 2, + ABlockTransferSrcScalarPerVector, + ABlockTransferDstScalarPerVector_AK1, + 1, + 1, + AThreadTransferSrcResetCoordinateAfterRun, + true, + BlockwiseGemmPipe::GlobalBufferNum>( + a_grid_desc_ak0_m_ak1, + make_multi_index(0, m_block_data_idx_on_grid, 0), + a_element_op, + a_block_desc_ak0_m_ak1, + make_multi_index(0, 0, 0), + ck::tensor_operation::element_wise::PassThrough{}); + + // B matrix blockwise copy + auto b_blockwise_copy = + ThreadGroupTensorSliceTransfer_v4r1, + BBlockTransferThreadClusterLengths_BK0_N_BK1, + BBlockTransferThreadClusterArrangeOrder, + BDataType, + BDataType, + decltype(b_grid_desc_bk0_n_bk1), + decltype(b_block_desc_bk0_n_bk1), + BBlockTransferSrcAccessOrder, + Sequence<0, 1, 2>, + BBlockTransferSrcVectorDim, + 2, + BBlockTransferSrcScalarPerVector, + BBlockTransferDstScalarPerVector_BK1, + 1, + 1, + BThreadTransferSrcResetCoordinateAfterRun, + true, + BlockwiseGemmPipe::GlobalBufferNum>( + b_grid_desc_bk0_n_bk1, + make_multi_index(0, n_block_data_idx_on_grid, 0), + b_element_op, + b_block_desc_bk0_n_bk1, + make_multi_index(0, 0, 0), + ck::tensor_operation::element_wise::PassThrough{}); + + // LDS allocation for A and B: be careful of alignment + constexpr auto a_block_space_size_aligned = math::integer_least_multiple( + a_block_desc_ak0_m_ak1.GetElementSpaceSize(), max_lds_align); + + auto a_block_buf_ping = make_dynamic_buffer( + static_cast(p_shared_0), a_block_desc_ak0_m_ak1.GetElementSpaceSize()); + + auto b_block_buf_ping = make_dynamic_buffer( + bit_cast(static_cast(p_shared_0) + + a_block_space_size_aligned * sizeof(ADataType) / APackedSize), + b_block_desc_bk0_n_bk1.GetElementSpaceSize()); + + auto a_block_buf_pong = make_dynamic_buffer( + static_cast(p_shared_1), 
a_block_desc_ak0_m_ak1.GetElementSpaceSize()); + + auto b_block_buf_pong = make_dynamic_buffer( + bit_cast(bit_cast(p_shared_1) + + a_block_space_size_aligned * sizeof(ADataType) / APackedSize), + b_block_desc_bk0_n_bk1.GetElementSpaceSize()); + + auto a_block_bufs = make_tuple(a_block_buf_ping, a_block_buf_pong); + auto b_block_bufs = make_tuple(b_block_buf_ping, b_block_buf_pong); + + constexpr auto a_block_slice_copy_step = make_multi_index(KPerBlock / AK1Number, 0, 0); + constexpr auto b_block_slice_copy_step = make_multi_index(KPerBlock / BK1Number, 0, 0); + + // Blockwise GEMM pipeline + static_assert(std::is_default_constructible_v); + auto blockwise_gemm_pipeline = BlockwiseGemmPipe{}; + auto c_thread_buf = blockwise_gemm_pipeline.GetCThreadBuffer(); + + const index_t num_k_block_main_loop = __builtin_amdgcn_readfirstlane( + (a_grid_desc_ak0_m_ak1.GetLength(I0) * a_grid_desc_ak0_m_ak1.GetLength(I2)) / + KPerBlock); + + // B scale + static constexpr auto mfma = MfmaSelector{}; + static constexpr auto KPerXdlops = mfma.GetKPerXdlops(); + static constexpr auto K1PerXdlops = mfma.GetK1PerXdlops(); + static constexpr auto K0PerXdlops = KPerXdlops / K1PerXdlops; + static constexpr auto KPerThread = KPerBlock / K0PerXdlops; + + const index_t ScaleSliceSizeN = NXdlPerWave; + static constexpr auto ScaleSliceSizeK = (KPerThread + ScaleBlockK - 1) / ScaleBlockK; + static constexpr auto KBlockScaleSliceSizeK = (KPerBlock + ScaleBlockK - 1) / ScaleBlockK; + + constexpr auto b_scale_thread_desc = make_naive_tensor_descriptor_packed( + make_tuple(Number{}, Number{})); + + constexpr index_t NWaves = NPerBlock / (NXdlPerWave * NPerXdl); + + auto b_thread_offset_n = + get_thread_local_1d_id() % NPerXdl + (get_thread_local_1d_id() / 64) % NWaves * NPerXdl; + auto b_thread_offset_k = (get_thread_local_1d_id() % 64) / NPerXdl * KPerThread; + + auto b_scale_thread_copy = + ThreadwiseTensorSliceTransfer_v2, + Sequence<0, 1>, + 1, + ScaleSliceSizeK, + 1, + false>( + 
b_scale_grid_desc_bn_ak, + make_multi_index(block_n_id * NPerBlock / ScaleBlockN + b_thread_offset_n, + b_thread_offset_k / ScaleBlockK)); + + constexpr auto b_scale_thread_slice_copy_step = + make_tuple(make_multi_index(NWaves * NPerXdl, 0), + make_multi_index(-NPerBlock, 0), + make_multi_index(-NPerBlock, KBlockScaleSliceSizeK)); + + const index_t num_k_block_per_scale = (ScaleBlockK + KPerBlock - 1) / KPerBlock; + + blockwise_gemm_pipeline.template Run( + a_grid_desc_ak0_m_ak1, + a_block_desc_ak0_m_ak1, + a_blockwise_copy, + a_grid_buf, + a_block_bufs, + a_block_slice_copy_step, + b_grid_desc_bk0_n_bk1, + b_block_desc_bk0_n_bk1, + b_blockwise_copy, + b_grid_buf, + b_block_bufs, + b_block_slice_copy_step, + c_thread_buf, + + b_scale_grid_desc_bn_ak, + b_scale_thread_desc, + b_scale_thread_copy, + b_scale_grid_buf, + b_scale_thread_slice_copy_step, + + num_k_block_main_loop, + num_k_block_per_scale); + + // shuffle C and write out + { + static_assert(MXdlPerWave % CShuffleMXdlPerWavePerShuffle == 0 && + NXdlPerWave % CShuffleNXdlPerWavePerShuffle == 0, + "wrong!"); + + constexpr index_t MWave = MPerBlock / (MXdlPerWave * MPerXdl); + constexpr index_t NWave = NPerBlock / (NXdlPerWave * NPerXdl); + + // TODO: hacky, fix it! + constexpr auto c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2 = + blockwise_gemm_pipeline.GetCThreadDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(); + + // TODO: hacky, fix it! 
+ // c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp is only used to get lengths + constexpr auto c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp = + blockwise_gemm_pipeline.GetCBlockDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(); + + constexpr auto M0 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I0); + constexpr auto N0 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I1); + constexpr auto M1 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I2); + constexpr auto N1 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I3); + constexpr auto M2 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I4); + constexpr auto M3 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I5); + constexpr auto M4 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I6); + constexpr auto N2 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I7); + + constexpr auto c_shuffle_block_desc_mblock_mperblock_nblock_nperblock = + GetCShuffleBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(); + + auto c_shuffle_block_buf = make_dynamic_buffer( + static_cast(p_shared_0), + c_shuffle_block_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize()); + + constexpr auto c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2 = transform_tensor_descriptor( + c_shuffle_block_desc_mblock_mperblock_nblock_nperblock, + make_tuple( + make_freeze_transform(I0), + make_unmerge_transform(make_tuple( + Number{}, // M0 (MXdlPerWave) per shuffle + M1, // M1 = MWave + M2, // M2 * M3 * M4 = MPerXdl + M3, + M4)), + make_freeze_transform(I0), + make_unmerge_transform(make_tuple( + Number{}, // N0 (NXdlPerWave) per shuffle + N1, // N1 = NWave + N2))), // N2 = NPerXdl + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple( + Sequence<>{}, Sequence<0, 2, 4, 5, 6>{}, Sequence<>{}, Sequence<1, 3, 7>{})); + + // calculate origin of thread output tensor on global memory + // blockwise GEMM c matrix starting index + const auto c_thread_mtx_on_block = + 
blockwise_gemm_pipeline.CalculateCThreadOriginDataIndex(I0, I0, I0, I0); + + const index_t m_thread_data_on_block = c_thread_mtx_on_block[I0]; + const index_t n_thread_data_on_block = c_thread_mtx_on_block[I1]; + + const auto m_thread_data_on_block_to_m0_m1_m2_m3_m4_adaptor = + make_single_stage_tensor_adaptor( + make_tuple(make_merge_transform(make_tuple(M0, M1, M2, M3, M4))), + make_tuple(Sequence<0, 1, 2, 3, 4>{}), + make_tuple(Sequence<0>{})); + + const auto m_thread_data_on_block_idx = + m_thread_data_on_block_to_m0_m1_m2_m3_m4_adaptor.CalculateBottomIndex( + make_multi_index(m_thread_data_on_block)); + + const auto n_thread_data_on_block_to_n0_n1_n2_adaptor = + make_single_stage_tensor_adaptor( + make_tuple(make_merge_transform(make_tuple(N0, N1, N2))), + make_tuple(Sequence<0, 1, 2>{}), + make_tuple(Sequence<0>{})); + + const auto n_thread_data_on_block_idx = + n_thread_data_on_block_to_n0_n1_n2_adaptor.CalculateBottomIndex( + make_multi_index(n_thread_data_on_block)); + + // shuffle: threadwise copy C from VGPR to LDS + auto c_thread_copy_vgpr_to_lds = + ThreadwiseTensorSliceTransfer_v1r3, + Sequence<0, 1, 2, 3, 4, 5, 6, 7>, + 7, + 1, + InMemoryDataOperationEnum::Set, + 1, + true>{ + c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2, + make_multi_index(0, + 0, + m_thread_data_on_block_idx[I1], + n_thread_data_on_block_idx[I1], + m_thread_data_on_block_idx[I2], + m_thread_data_on_block_idx[I3], + m_thread_data_on_block_idx[I4], + n_thread_data_on_block_idx[I2]), + ck::tensor_operation::element_wise::PassThrough{}}; + + // shuffle: blockwise copy C from LDS to global + auto c_shuffle_block_copy_lds_to_global = ThreadGroupTensorSliceTransfer_v6r1< + ThisThreadBlock, // ThreadGroup + CElementwiseOperation, // ElementwiseOperation, + CGlobalMemoryDataOperation, // DstInMemOp, + Sequence<1, + CShuffleMXdlPerWavePerShuffle * MWave * MPerXdl, + 1, + CShuffleNXdlPerWavePerShuffle * NWave * NPerXdl>, // BlockSliceLengths, + 
CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, + Sequence<0, 1, 2, 3>, // typename ThreadClusterArrangeOrder, + CShuffleDataType, // typename SrcData, + CDataType, // typename DstData, + decltype(c_shuffle_block_desc_mblock_mperblock_nblock_nperblock), + decltype(c_grid_desc_mblock_mperblock_nblock_nperblock), + Sequence<0, 1, 2, 3>, // typename DimAccessOrder, + 3, // index_t VectorDim, + CShuffleBlockTransferScalarPerVector_NPerBlock, // index_t ScalarPerVector, + true, // bool ThreadTransferSrcResetCoordinateAfterRun, + false> // bool ThreadTransferDstResetCoordinateAfterRun> + {c_shuffle_block_desc_mblock_mperblock_nblock_nperblock, + make_multi_index(0, 0, 0, 0), + c_grid_desc_mblock_mperblock_nblock_nperblock, + make_multi_index(block_m_id, 0, block_n_id, 0), + c_element_op}; + + // space filling curve for threadwise C in VGPR + constexpr auto sfc_c_vgpr = + SpaceFillingCurve, + Sequence<0, 1, 2, 3, 4, 5, 6, 7>, + Sequence>{}; + + // space filling curve for shuffled blockwise C in global mem + constexpr auto sfc_c_global = + SpaceFillingCurve, + Sequence<0, 2, 1, 3>, + Sequence<1, + CShuffleMXdlPerWavePerShuffle * MWave * MPerXdl, + 1, + CShuffleNXdlPerWavePerShuffle * NWave * NPerXdl>>{}; + + constexpr index_t num_access = sfc_c_vgpr.GetNumOfAccess(); + + static_assert(num_access == sfc_c_global.GetNumOfAccess(), "wrong!"); + + static_for<0, num_access, 1>{}([&](auto access_id) { + // make sure it's safe to write to LDS + block_sync_lds(); + + // each thread write its data from VGPR to LDS + c_thread_copy_vgpr_to_lds.Run(c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2, + sfc_c_vgpr.GetIndexTupleOfNumber(access_id), + c_thread_buf, + c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2, + c_shuffle_block_buf); + + // make sure it's safe to read from LDS + block_sync_lds(); + + // each block copy its data from LDS to global + c_shuffle_block_copy_lds_to_global.Run( + c_shuffle_block_desc_mblock_mperblock_nblock_nperblock, + c_shuffle_block_buf, + 
c_grid_desc_mblock_mperblock_nblock_nperblock, + c_grid_buf); + + if constexpr(access_id < num_access - 1) + { + constexpr auto c_global_step = sfc_c_global.GetForwardStep(access_id); + + // move on C + c_shuffle_block_copy_lds_to_global.MoveDstSliceWindow( + c_grid_desc_mblock_mperblock_nblock_nperblock, c_global_step); + } + }); + } + } + + template + __device__ static void Run_2Lds(const ADataType* p_a_grid, + const BDataType* p_b_grid, + CDataType* p_c_grid, + const BScaleType* p_b_scale_grid, + void* p_shared_0, + void* p_shared_1, + const Problem& problem) + { + const auto a_grid_desc_ak0_m_ak1 = MakeAGridDescriptor_AK0_M_AK1( + problem.M, problem.MPadded, problem.K, problem.KPadded, problem.StrideA, problem.AK0); + const auto b_grid_desc_bk0_n_bk1 = MakeBGridDescriptor_BK0_N_BK1( + problem.K, problem.KPadded, problem.N, problem.NPadded, problem.StrideB, problem.BK0); + const auto c_grid_desc_m_n = MakeCGridDescriptor_M_N( + problem.M, problem.MPadded, problem.N, problem.NPadded, problem.StrideC); + + const auto c_grid_desc_mblock_mperblock_nblock_nperblock = + MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( + c_grid_desc_m_n, problem.MBlock, problem.NBlock); + + const auto b_scale_grid_desc_bn_ak = make_naive_tensor_descriptor( + make_tuple(math::integer_divide_ceil(problem.N, ScaleBlockN), + math::integer_divide_ceil(problem.K, ScaleBlockK)), + make_tuple(problem.StrideScaleB, 1)); + + Run_2Lds(p_a_grid, + p_b_grid, + p_c_grid, + p_b_scale_grid, + p_shared_0, + p_shared_1, + problem, + a_grid_desc_ak0_m_ak1, + b_grid_desc_bk0_n_bk1, + b_scale_grid_desc_bn_ak, + c_grid_desc_mblock_mperblock_nblock_nperblock); + } +}; + +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp index 758900200..8c65ef32a 100644 --- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp +++ 
b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp @@ -1222,6 +1222,206 @@ struct ThreadwiseTensorSliceTransfer_v4 }); } + // Fuse scale + template + __device__ void Run(const SrcDesc&, + const SrcRefToOriginDisplacement&, + const SrcBuffer& src_buf, + const DstData& scale, + const DstDesc&, + const DstOriginIdx&, + DstBuffer& dst_buf) const + { + static_assert(SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(), + "wrong! SrcDesc and DstDesc need to known at compile-time"); + + static_assert( + is_same, remove_cvref_t>::value && + is_same, remove_cvref_t>::value, + "wrong! SrcBuffer or DstBuffer data type is wrong"); + + static_assert(DstBuffer::IsStaticBuffer(), "wrong! DstBuffer need to be StaticBuffer"); + + static_assert(is_known_at_compile_time>::value && + is_known_at_compile_time>::value, + "wrong! SrcOriginToRefDistance and DstOriginToRefDistance need to be known " + "at compile-time"); + + // SrcDesc and DstDesc are known at compile-time + constexpr auto src_desc = remove_cvref_t{}; + constexpr auto dst_desc = remove_cvref_t{}; + + // SrcOriginToRefDisttance and DstOriginToRefDistance are known at compile-time + constexpr auto src_ref_to_origin_disp_idx = to_multi_index(SrcRefToOriginDisplacement{}); + constexpr auto dst_origin_idx = to_multi_index(DstOriginIdx{}); + + // scalar per access of each dim + constexpr auto src_scalar_per_access = generate_sequence_v2( + [&](auto i) constexpr { + if constexpr(i == SrcVectorDim) + { + return Number{}; + } + else + { + return Number<1>{}; + } + }, + Number{}); + + // scalar step (if steping on SrcVectorDim) of each dim + constexpr auto src_scalar_step_in_vector = generate_sequence_v2( + [&](auto i) constexpr { + if constexpr(i == SrcVectorDim) + { + return Number<1>{}; + } + else + { + return Number<0>{}; + } + }, + Number{}); + + constexpr auto access_lengths = SliceLengths{} / src_scalar_per_access; + + constexpr auto dim_access_order = DimAccessOrder{}; + + constexpr 
auto ordered_access_lengths = + container_reorder_given_new2old(access_lengths, dim_access_order); + + static_ford{}([&](auto ordered_access_idx) { +#if 0 + // TODO: unable to compile + // position in slice window + constexpr auto data_to_origin_disp_idx = + container_reorder_given_old2new(ordered_access_idx, dim_access_order) * + src_scalar_per_access; +#else + // position in slice window + constexpr auto data_to_origin_disp_idx = + ordered_access_idx.ReorderGivenOld2New(dim_access_order) * src_scalar_per_access; +#endif + // src coordinate + constexpr auto src_ref_to_data_disp_idx = + src_ref_to_origin_disp_idx + data_to_origin_disp_idx; + + constexpr auto src_ref_to_data_disp_coord_step = + make_tensor_coordinate_step(src_desc, src_ref_to_data_disp_idx); + + auto src_data_coord = src_ref_coord_; + + move_tensor_coordinate(src_desc, src_data_coord, src_ref_to_data_disp_coord_step); + + vector_type_maker_t src_tmp_vector; + + using src_vector_t = typename decltype(src_tmp_vector)::type; + + const bool is_src_valid = coordinate_has_valid_offset_assuming_visible_index_is_valid( + src_desc, src_data_coord); + + // copy data from src_buf into src_tmp_vector + if constexpr(SrcBuffer::IsDynamicBuffer()) + { + src_tmp_vector.template AsType()(Number<0>{}) = + src_buf.template Get(src_data_coord.GetOffset() / PackedSize, + is_src_valid); + } + else if constexpr(SrcBuffer::IsStaticBuffer()) + { + static_for<0, SrcScalarPerVector, 1>{}([&](auto i) { + constexpr index_t src_offset = src_desc.CalculateOffset( + src_ref_to_origin_disp_idx + data_to_origin_disp_idx + + i * src_scalar_step_in_vector); + + src_tmp_vector.template AsType()(i) = src_buf[Number{}]; + }); + } + + if constexpr(is_same, pk_i4_t>::value) + { + // copy data from src_tmp_vector to dst_tmp_vector (data cast data from SrcData to + // DstData) + vector_type_maker_t dst_tmp_vector; + vector_type scale_vector; + scale_vector.template AsType()(Number<0>{}) = scale; + scale_vector.template AsType()(Number<1>{}) 
= scale; + + constexpr index_t pack_size = 8; + + static_assert(SrcScalarPerVector % pack_size == 0, ""); + + using src_v_t = typename vector_type_maker_t::type; + using dst_v_t = typename vector_type_maker_t::type; + using scale_v_t = typename vector_type_maker_t::type; + + static_for<0, SrcScalarPerVector / pack_size, 1>{}([&](auto i) { + ck::tensor_operation::element_wise::DequantPack8{}( + dst_tmp_vector.template AsType()(i), + src_tmp_vector.template AsType()[i], + scale_vector.template AsType()[Number<0>{}]); + }); + + // copy data from dst_tmp_vector into dst_buf + static_for<0, SrcScalarPerVector, 1>{}([&](auto i) { + constexpr index_t dst_offset = dst_desc.CalculateOffset( + dst_origin_idx + data_to_origin_disp_idx + i * src_scalar_step_in_vector); + + dst_buf(Number{}) = dst_tmp_vector.template AsType()[i]; + }); + } + else if constexpr(is_same, f8_t>::value && + is_same, half_t>::value && + SrcScalarPerVector % 2 == 0) + { + // copy data from src_tmp_vector to dst_tmp_vector (data cast data from SrcData to + // DstData) + vector_type_maker_t dst_tmp_vector; + + constexpr index_t pack_size = 2; + + using dst_v_t = typename vector_type_maker_t::type; + using src_v_t = typename vector_type_maker_t::type; + static_for<0, SrcScalarPerVector / pack_size, 1>{}([&](auto i) { + ck::tensor_operation::element_wise::PassThroughPack2{}( + dst_tmp_vector.template AsType()(i), + src_tmp_vector.template AsType()[i]); + }); + + // copy data from dst_tmp_vector into dst_buf + static_for<0, SrcScalarPerVector, 1>{}([&](auto i) { + constexpr index_t dst_offset = dst_desc.CalculateOffset( + dst_origin_idx + data_to_origin_disp_idx + i * src_scalar_step_in_vector); + + dst_buf(Number{}) = dst_tmp_vector.template AsType()[i]; + }); + } + else + { + // copy data from src_tmp_vector to dst_tmp_vector (data cast data from SrcData to + // DstData) + vector_type_maker_t dst_tmp_vector; + + // TODO: if SrcData and DstData are vetor type, then static_cast may not compile + 
static_for<0, SrcScalarPerVector, 1>{}([&](auto i) { + dst_tmp_vector.template AsType()(i) = + type_convert(src_tmp_vector.template AsType()[i]); + }); + + // copy data from dst_tmp_vector into dst_buf + static_for<0, SrcScalarPerVector, 1>{}([&](auto i) { + constexpr index_t dst_offset = dst_desc.CalculateOffset( + dst_origin_idx + data_to_origin_disp_idx + i * src_scalar_step_in_vector); + + dst_buf(Number{}) = dst_tmp_vector.template AsType()[i]; + }); + } + }); + } + template __device__ void MoveSrcSliceWindow(const SrcDesc&, const SrcSliceMoveStepIdx& src_slice_move_step_idx) diff --git a/include/ck/utility/amd_inline_asm.hpp b/include/ck/utility/amd_inline_asm.hpp index 6761c08f2..113f3af4a 100644 --- a/include/ck/utility/amd_inline_asm.hpp +++ b/include/ck/utility/amd_inline_asm.hpp @@ -4,8 +4,8 @@ #ifndef CK_AMD_INLINE_ASM_HPP #define CK_AMD_INLINE_ASM_HPP -#include "data_type.hpp" #include "c_style_pointer_cast.hpp" +#include "data_type.hpp" // TODO: deprecate all amd_assembly_outer_product_xxx @@ -21,14 +21,14 @@ inline __device__ int amd_assembly_and_or_b32(int a, int b, int d) inline __device__ half2_t amd_assembly_pk_fma_f16(half2_t a, half2_t b, half2_t c) { half2_t d; - asm volatile("v_pk_fma_f16 %0, %1, %2, %3;\n" : "=v"(d) : "v"(a), "v"(b), "v"(c)); + asm volatile("v_pk_fma_f16 %0, %1, %2, %3" : "=v"(d) : "v"(a), "v"(b), "v"(c)); return d; } inline __device__ half2_t amd_assembly_pk_add_f16(half2_t a, half2_t b) { half2_t c; - asm volatile("v_pk_add_f16 %0, %1, %2;\n" : "=v"(c) : "v"(a), "v"(b)); + asm volatile("v_pk_add_f16 %0, %1, %2" : "=v"(c) : "v"(a), "v"(b)); return c; } diff --git a/include/ck/utility/data_type.hpp b/include/ck/utility/data_type.hpp index 86bc3c394..94608f5dc 100644 --- a/include/ck/utility/data_type.hpp +++ b/include/ck/utility/data_type.hpp @@ -19,6 +19,8 @@ struct pk_i4_t type data; __host__ __device__ constexpr pk_i4_t() : data{type{}} {} __host__ __device__ constexpr pk_i4_t(type init) : data{init} {} + + __host__ 
__device__ constexpr operator float() const { return static_cast(data); } }; inline constexpr auto next_pow2(uint32_t x) diff --git a/library/include/ck/library/tensor_operation_instance/gpu/gemm_b_scale.hpp b/library/include/ck/library/tensor_operation_instance/gpu/gemm_b_scale.hpp new file mode 100644 index 000000000..93eed31bc --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm_b_scale.hpp @@ -0,0 +1,91 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3_b_scale.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include +#include + +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { +#if(defined(CK_ENABLE_FP16) || defined(CK_ENABLE_FP8)) +void add_device_gemm_b_scale_xdl_f16_i4_f16_mk_nk_mn_mem_v2_default_instances( + std::vector>>& instances); +#endif + +template +struct DeviceOperationInstanceFactory> +{ + using DeviceOp = DeviceGemmV2BScale; + + static auto GetInstances() + { + std::vector> op_ptrs; + + if constexpr(is_same_v && is_same_v && + is_same_v) + { + if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_gemm_b_scale_xdl_f16_i4_f16_mk_nk_mn_mem_v2_default_instances(op_ptrs); + } + } + + return op_ptrs; + } +}; + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_b_scale/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/gemm_b_scale/CMakeLists.txt new file mode 100644 index 000000000..424320fa8 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_b_scale/CMakeLists.txt @@ -0,0 +1,10 @@ +# ONLY 
XDL_KERNELS +set(GEMM_B_SCALE_INSTANCES) + +list(APPEND GEMM_B_SCALE_INSTANCES + device_gemm_b_scale_xdl_f16_i4_f16/device_gemm_b_scale_xdl_f16_i4_f16_mk_nk_mn_mem_v2_default_instance.cpp + ) + +set_source_files_properties(device_gemm_b_scale_xdl_f16_i4_f16/device_gemm_b_scale_xdl_f16_i4_f16_mk_nk_mn_mem_v2_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1") + +add_instance_library(device_gemm_b_scale_instance ${GEMM_B_SCALE_INSTANCES}) \ No newline at end of file diff --git a/library/src/tensor_operation_instance/gpu/gemm_b_scale/device_gemm_b_scale_xdl_f16_i4_f16/device_gemm_b_scale_xdl_f16_i4_f16_mk_nk_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_b_scale/device_gemm_b_scale_xdl_f16_i4_f16/device_gemm_b_scale_xdl_f16_i4_f16_mk_nk_mn.hpp new file mode 100644 index 000000000..52735e9df --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_b_scale/device_gemm_b_scale_xdl_f16_i4_f16/device_gemm_b_scale_xdl_f16_i4_f16_mk_nk_mn.hpp @@ -0,0 +1,105 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3_b_scale.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using I4 = pk_i4_t; +using F16 = half_t; +using F32 = float; + +using Row = tensor_layout::gemm::RowMajor; +using Col = tensor_layout::gemm::ColumnMajor; + +template +using S = Sequence; + +using PassThrough = element_wise::PassThrough; + +static constexpr auto GemmDefault = GemmSpecialization::Default; +static constexpr auto GemmKPadding = GemmSpecialization::KPadding; +static constexpr auto GemmMNPadding = GemmSpecialization::MNPadding; +static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding; + +static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave; +static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave; + +#if 0 +template +using device_gemm_xdl_b_scale_f16_i4_f16_mk_nk_mn_comp_instances = std::tuple< + +#endif + +template +using device_gemm_b_scale_xdl_f16_i4_f16_mk_nk_mn_mem_instances = std::tuple< + // clang-format off + //#########################| ALayout| BLayout| CLayout|AData| BData| BScale| CData| AccData| Cshuffle| A| B| C| GEMM| Block| Scale| Scale| MPer| NPer| KPer| AK1| BK1|MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| Block-wiseGemm| Block-wiseGemm| + //#########################| | | | Type| Type| Data| Type| Type| Type| Elementwise| Elementwise| Elementwise|Specialization| Size| Block| Block| Block| Block| Block| | | XDL| XDL| Per| Per| 
ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| Pipeline| Pipeline| + //#########################| | | | | | Type| | | | Operation| Operation| Operation| | | N| K| | | | | |Wave| Wave| | | Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| Scheduler| Verision| + //#########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + + //Compute friendly + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, I4, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 1, 128, 128, 128, 128, 8, 32, 32, 32, 2, 2, S<16, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 32, 32, 0, 1, 1, S<1, 32, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v3, half_t, half_t, false, false>, + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, I4, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 1, 128, 128, 128, 64, 8, 32, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<2, 128, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 32, 32, 0, 1, 1, S<1, 32, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v4, half_t, half_t, false, false>, + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, I4, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 1, 128, 128, 128, 128, 8, 32, 32, 32, 2, 2, S<16, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 32, 32, 0, 1, 1, S<1, 32, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v3, half_t, half_t, false, false>, + + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, I4, F16, F16, F32, F16, PassThrough, PassThrough, 
PassThrough, GemmSpec, 256, 1, 128, 128, 128, 64, 8, 32, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<2, 128, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v4, half_t, half_t, false, false>, + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, I4, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 1, 128, 128, 128, 64, 8, 32, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<2, 128, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v3, half_t, half_t, false, false>, + + //Latency friendly + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, I4, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 1, 128, 32, 16, 128, 8, 16, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v3, half_t, half_t, false, false>, + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, I4, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 1, 128, 16, 16, 128, 8, 16, 16, 16, 1, 1, S<16, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v3, half_t, half_t, false, false>, + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, I4, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 1, 128, 16, 16, 128, 8, 16, 16, 16, 1, 1, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v3, half_t, half_t, false, false>, + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, I4, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 1, 128, 16, 32, 128, 8, 32, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 
8, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 32, 32, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v3, half_t, half_t, false, false>, + + // Memory friendly v3 + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, I4, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 1, 128, 128, 32, 128, 8, 32, 32, 32, 2, 1, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 32, 32, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v3, half_t, half_t, false, false>, + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, I4, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 1, 128, 128, 16, 128, 8, 16, 16, 16, 4, 1, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v3, half_t, half_t, false, false>, + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, I4, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 1, 128, 64, 32, 128, 8, 32, 32, 32, 1, 1, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 32, 32, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v3, half_t, half_t, false, false>, + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, I4, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 1, 128, 64, 16, 128, 8, 16, 16, 16, 2, 1, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v3, half_t, half_t, false, false>, + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, I4, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 1, 128, 32, 16, 128, 8, 16, 16, 16, 1, 1, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, 
BlockGemmPipelineVersion::v3, half_t, half_t, false, false>, + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, I4, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 1, 128, 16, 16, 128, 8, 16, 16, 16, 1, 1, S<16, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v3, half_t, half_t, false, false>, + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, I4, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 1, 128, 16, 16, 128, 8, 16, 16, 16, 1, 1, S<16, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v3, half_t, half_t, false, false>, + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, I4, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 1, 128, 16, 32, 128, 8, 32, 16, 16, 1, 1, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 32, 32, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v3, half_t, half_t, false, false>, + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, I4, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 1, 128, 16, 64, 128, 8, 32, 16, 16, 1, 2, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 32, 32, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v3, half_t, half_t, false, false>, + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, I4, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 1, 128, 32, 64, 128, 8, 32, 32, 32, 1, 1, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 32, 32, 0, 1, 1, S<1, 16, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v3, half_t, half_t, false, false>, + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, I4, F16, F16, F32, F16, 
PassThrough, PassThrough, PassThrough, GemmSpec, 128, 1, 128, 16, 128, 128, 8, 32, 16, 16, 1, 4, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 32, 32, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v3, half_t, half_t, false, false>, + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, I4, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 1, 128, 32, 128, 128, 8, 32, 32, 32, 1, 2, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 32, 32, 0, 1, 1, S<1, 16, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v3, half_t, half_t, false, false>, + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, I4, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 1, 128, 16, 256, 128, 8, 32, 16, 16, 1, 4, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 32, 32, 0, 1, 1, S<1, 16, 1, 16>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v3, half_t, half_t, false, false>, + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, I4, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 1, 128, 32, 256, 128, 8, 32, 32, 32, 1, 2, S<16, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 32, 32, 0, 1, 1, S<1, 16, 1, 16>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v3, half_t, half_t, false, false>, + + // Memory friendly v4 + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, I4, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 1, 128, 64, 32, 128, 8, 32, 32, 32, 1, 1, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 32, 32, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v4, half_t, half_t, false, false>, + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, I4, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 1, 128, 64, 16, 128, 8, 16, 16, 16, 2, 1, S<16, 
8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v4, half_t, half_t, false, false>, + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, I4, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 1, 128, 32, 16, 128, 8, 16, 16, 16, 1, 1, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v4, half_t, half_t, false, false>, + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, I4, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 1, 128, 16, 16, 128, 8, 16, 16, 16, 1, 1, S<16, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v4, half_t, half_t, false, false>, + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, I4, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 1, 128, 16, 16, 128, 8, 16, 16, 16, 1, 1, S<16, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v4, half_t, half_t, false, false>, + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, I4, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 1, 128, 16, 32, 128, 8, 32, 16, 16, 1, 1, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 32, 32, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v4, half_t, half_t, false, false>, + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, I4, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 1, 128, 16, 64, 128, 8, 32, 16, 16, 1, 2, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 32, 32, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, 
BlockGemmPipelineVersion::v4, half_t, half_t, false, false>, + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, I4, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 1, 128, 32, 64, 128, 8, 32, 32, 32, 1, 1, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 32, 32, 0, 1, 1, S<1, 16, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v4, half_t, half_t, false, false>, + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, I4, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 1, 128, 16, 128, 128, 8, 32, 16, 16, 1, 4, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 32, 32, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v4, half_t, half_t, false, false>, + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, I4, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 1, 128, 32, 128, 128, 8, 32, 32, 32, 1, 2, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 32, 32, 0, 1, 1, S<1, 16, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v4, half_t, half_t, false, false>, + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, I4, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 1, 128, 16, 256, 128, 8, 32, 16, 16, 1, 4, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 32, 32, 0, 1, 1, S<1, 16, 1, 16>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v4, half_t, half_t, false, false>, + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, I4, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 1, 128, 32, 256, 128, 8, 32, 32, 32, 1, 2, S<16, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 32, 32, 0, 1, 1, S<1, 16, 1, 16>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v4, half_t, half_t, false, false>, + + //new Compute friendly kernel + DeviceGemm_Xdl_CShuffleV3< 
Row, Col, Row, F16, I4, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 1, 128, 128, 128, 64, 8, 32, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<2, 128, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 32, 32, 0, 1, 1, S<1, 32, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v3, half_t, half_t, false, false>, + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, I4, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 1, 128, 128, 128, 64, 8, 32, 32, 32, 4, 1, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<2, 128, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 32, 32, 0, 1, 1, S<1, 32, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v3, half_t, half_t, false, false>, + + //new Memory friendly kernel + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F16, I4, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 1, 128, 16, 64, 256, 8, 32, 16, 16, 1, 1, S<32, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 32, 32, 0, 1, 1, S<1, 16, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v3, half_t, half_t, false, false> + // clang-format on + >; +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_b_scale/device_gemm_b_scale_xdl_f16_i4_f16/device_gemm_b_scale_xdl_f16_i4_f16_mk_nk_mn_mem_v2_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_b_scale/device_gemm_b_scale_xdl_f16_i4_f16/device_gemm_b_scale_xdl_f16_i4_f16_mk_nk_mn_mem_v2_default_instance.cpp new file mode 100644 index 000000000..18788a2a1 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_b_scale/device_gemm_b_scale_xdl_f16_i4_f16/device_gemm_b_scale_xdl_f16_i4_f16_mk_nk_mn_mem_v2_default_instance.cpp @@ -0,0 +1,32 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_b_scale_xdl_f16_i4_f16_mk_nk_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { +void add_device_gemm_b_scale_xdl_f16_i4_f16_mk_nk_mn_mem_v2_default_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_gemm_b_scale_xdl_f16_i4_f16_mk_nk_mn_mem_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/profiler/include/profiler/profile_gemm_b_scale_impl.hpp b/profiler/include/profiler/profile_gemm_b_scale_impl.hpp new file mode 100644 index 000000000..d01d48892 --- /dev/null +++ b/profiler/include/profiler/profile_gemm_b_scale_impl.hpp @@ -0,0 +1,448 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3_b_scale.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/gpu/gemm_b_scale.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" + +namespace ck { +namespace profiler { + +template +bool profile_gemm_b_scale_impl(int do_verification, + int init_method, + bool do_log, + bool time_kernel, + int M, + int N, + int K, + int StrideA, + int StrideB, + int StrideC, + int KBatch, + int n_warmup, + int n_iter, + uint64_t rotating = 0) +{ + bool pass = true; + + auto f_host_tensor_descriptor = + [](std::size_t row, std::size_t col, std::size_t stride, auto layout) { + using namespace 
ck::literals; + + if(is_same::value) + { + return HostTensorDescriptor({row, col}, {stride, 1_uz}); + } + else + { + return HostTensorDescriptor({row, col}, {1_uz, stride}); + } + }; + + ck::index_t Scale_Stride_BN = ck::is_same_v + ? ((K + ScaleBlockK - 1) / ScaleBlockK) + : N; + + Tensor a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{})); + Tensor b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{})); + Tensor b_k_n_permute(f_host_tensor_descriptor(K, N, StrideB, BLayout{})); + Tensor b1_k_n(f_host_tensor_descriptor( + (K + ScaleBlockK - 1) / ScaleBlockK, // K direction group size is ScaleBlockK + N, // N direction group size is 1 + Scale_Stride_BN, + BLayout{})); + Tensor c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{})); + Tensor c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{})); + + int total_gemm_needed = a_m_k.GetElementSpaceSizeInBytes() + + b_k_n.GetElementSpaceSizeInBytes() + + b1_k_n.GetElementSpaceSizeInBytes(); + + int rotating_count = std::max( + 1, + std::min(n_iter, + static_cast(std::ceil(static_cast(rotating) / total_gemm_needed)))); + + std::cout << "a_m_k: " << a_m_k.mDesc << std::endl; + std::cout << "b_k_n: " << b_k_n.mDesc << std::endl; + std::cout << "b1_k_n: " << b1_k_n.mDesc << std::endl; + std::cout << "c_m_n: " << c_m_n_device_result.mDesc << std::endl; + std::cout << "rotating count: " << rotating_count << std::endl; + + switch(init_method) + { + case 0: break; + case 1: + a_m_k.GenerateTensorValue(GeneratorTensor_2{-1, 2}); + b_k_n.GenerateTensorValue(GeneratorTensor_2{-1, 2}); + b1_k_n.GenerateTensorValue(GeneratorTensor_3{0, 1.0}); + break; + case 2: + a_m_k.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + b_k_n.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + b1_k_n.GenerateTensorValue(GeneratorTensor_3{0, 1.0}); + break; + default: + a_m_k.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + b_k_n.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + 
b1_k_n.GenerateTensorValue(GeneratorTensor_3{0, 1.0}); + } + + using AElementOp = ck::tensor_operation::element_wise::PassThrough; + using BElementOp = ck::tensor_operation::element_wise::PassThrough; + using CElementOp = ck::tensor_operation::element_wise::PassThrough; + + const auto a_element_op = AElementOp{}; + const auto b_element_op = BElementOp{}; + const auto c_element_op = CElementOp{}; + + DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize()); + DeviceMem b_device_buf(sizeof(BDataType) * b_k_n_permute.mDesc.GetElementSpaceSize()); + DeviceMem b1_device_buf(sizeof(BScaleDataType) * b1_k_n.mDesc.GetElementSpaceSize()); + DeviceMem c_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize()); + + a_device_buf.ToDevice(a_m_k.mData.data()); + b1_device_buf.ToDevice(b1_k_n.mData.data()); + + using DeviceOp = ck::tensor_operation::device::DeviceGemmV2BScale; + + // get device op instances + const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "found " << op_ptrs.size() << " instances" << std::endl; + + // Run reference GEMM + if(do_verification) + { + Tensor b_k_n_dequant({K, N}); + + float v_b = 0; + for(int n = 0; n < N; n++) + { + for(int k = 0; k < K; k++) + { + ck::pk_i4_t i4x2 = b_k_n(k, n).data; + int8_t i4 = 0; + if(k % 2 == 1) + i4 = (i4x2.data >> 0) & 0xf; + else + i4 = (i4x2.data >> 4) & 0xf; + i4 = i4 - 8; + v_b = ck::type_convert(i4); + + b_k_n_dequant(k, n) = ck::type_convert(v_b) * + ck::type_convert(b1_k_n(k / ScaleBlockK, n)); + } + } + using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm; + + auto ref_gemm = ReferenceGemmInstance{}; + auto ref_invoker = ref_gemm.MakeInvoker(); + + auto ref_argument = ref_gemm.MakeArgument( + a_m_k, b_k_n_dequant, c_m_n_host_result, a_element_op, b_element_op, c_element_op); + + ref_invoker.Run(ref_argument); + } + + std::string best_op_name; + float best_ave_time 
= 0; + float best_tflops = 0; + float best_gb_per_sec = 0; + float best_kbatch = 0; + + // profile device GEMM instances + for(auto& op_ptr : op_ptrs) + { + const int KPerBlock = op_ptr->GetKPerBlock(); + + if(op_ptr->GetPermuteB()) + { + int K1 = KPerBlock; + int K0 = K / KPerBlock; + + // int K0, N, K1 + for(int j = 0; j < K0; j++) + { + for(int i = 0; i < N; i++) + { + for(int jj = 0; jj < K1; jj++) + { + b_k_n_permute(j * N * K1 + i * K1 + jj) = b_k_n(i * K + (j * K1 + jj)); + } + } + } + + if(is_same_v && is_same_v) + { + // vector pk_i4x4 permute + for(int i = 0; i < N; i++) + { + for(int j = 0; j < K; j += 8) + { + int input[8]; + + for(int k = 0; k < 4; k++) + { + int i4x2 = b_k_n_permute(j + k * 2, i).data; + input[k * 2 + 0] = (i4x2 >> 4) & 0xf; + input[k * 2 + 1] = (i4x2 >> 0) & 0xf; + } + + // permute 01234567->20643175 + { + int hi = input[2]; + int lo = input[0]; + int i4x2 = (hi << 4) | lo; + + b_k_n_permute(j + 0, i) = i4x2; + } + + { + int hi = input[6]; + int lo = input[4]; + int i4x2 = (hi << 4) | lo; + + b_k_n_permute(j + 2, i) = i4x2; + } + + { + int hi = input[3]; + int lo = input[1]; + int i4x2 = (hi << 4) | lo; + + b_k_n_permute(j + 4, i) = i4x2; + } + + { + int hi = input[7]; + int lo = input[5]; + int i4x2 = (hi << 4) | lo; + + b_k_n_permute(j + 6, i) = i4x2; + } + } + } + } + } + else + { + b_k_n_permute = b_k_n; + } + + b_device_buf.ToDevice(b_k_n_permute.mData.data()); + + std::vector kbatch_list = {1, 2, 4, 8, 16, 19, 32, 38}; + + if(KBatch > 0) + { + kbatch_list = {KBatch}; + } + + for(std::size_t i = 0; i < kbatch_list.size(); i++) + { + auto kbatch_curr = kbatch_list[i]; + + auto argument_ptr = op_ptr->MakeArgumentPointer( + static_cast(a_device_buf.GetDeviceBuffer()), + static_cast(b_device_buf.GetDeviceBuffer()), + static_cast(c_device_buf.GetDeviceBuffer()), + M, + N, + K, + StrideA, + StrideB, + StrideC, + Scale_Stride_BN, + static_cast(b1_device_buf.GetDeviceBuffer()), + kbatch_curr, + a_element_op, + b_element_op, + 
c_element_op); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + + // re-init C to zero before profiling next kernel + c_device_buf.SetZero(); + + invoker_ptr->Run(argument_ptr.get(), + StreamConfig{nullptr, false, 0, n_warmup, n_iter}); + + if(do_verification) + { + c_device_buf.FromDevice(c_m_n_device_result.mData.data()); + +#if defined CK_ENABLE_FP8 + // set softer tolerances for fp8 + if constexpr(is_same_v || is_same_v || + is_same_v) + { + std::string msg = "Error: Incorrect results!"; + double rtol = 1e-1; + double atol = 1e-1; + pass = pass & ck::utils::check_err( + c_m_n_device_result, c_m_n_host_result, msg, rtol, atol); + } + else + { +#endif + pass = pass & ck::utils::check_err(c_m_n_device_result, c_m_n_host_result); +#if defined CK_ENABLE_FP8 + } +#endif + + if(do_log) + { + LogRangeAsType(std::cout << "a : ", a_m_k.mData, ",") << std::endl; + LogRangeAsType(std::cout << "b: ", b_k_n.mData, ",") << std::endl; + LogRangeAsType( + std::cout << "c_host : ", c_m_n_host_result.mData, ",") + << std::endl; + LogRangeAsType( + std::cout << "c_device: ", c_m_n_device_result.mData, ",") + << std::endl; + } + } + + std::string op_name = op_ptr->GetTypeString(); + + float ave_time = invoker_ptr->Run(argument_ptr.get(), + StreamConfig{nullptr, + time_kernel, + 0, + n_warmup, + n_iter, + rotating_count > 1, + rotating_count}); + + std::size_t flop = std::size_t(2) * M * N * K; + + static constexpr index_t BPackedSize = []() { + if constexpr(is_same_v, pk_i4_t>) + return 2; + else + return 1; + }(); + + std::size_t num_btype = sizeof(ADataType) * M * K + + sizeof(BDataType) * K * N / BPackedSize + + sizeof(CDataType) * M * N; + + float tflops = static_cast(flop) / 1.E9 / ave_time; + + float gb_per_sec = num_btype / 1.E6 / ave_time; + + std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops + << " TFlops, " << gb_per_sec << " GB/s, " << op_name << ", KBatch " + << kbatch_curr << 
std::endl; + + if(tflops > best_tflops && ave_time > 1e-10) + { + best_op_name = op_name; + best_tflops = tflops; + best_ave_time = ave_time; + best_gb_per_sec = gb_per_sec; + best_kbatch = kbatch_curr; + } + } + else + { + std::cout << op_ptr->GetTypeString() << " does not support this problem" + << std::endl; + } + } + } + + if constexpr(is_same::value) + { + std::cout << "Best Perf for datatype = f32"; + } + else if constexpr(is_same::value) + { + std::cout << "Best Perf for datatype = f16"; + } + else if constexpr(is_same::value) + { + std::cout << "Best Perf for datatype = bf16"; + } + else if constexpr(is_same::value) + { + std::cout << "Best Perf for datatype = int8"; + } + + if constexpr(is_same::value) + { + std::cout << " ALayout = RowMajor"; + } + else if constexpr(is_same::value) + { + std::cout << " ALayout = ColumnMajor"; + } + + if constexpr(is_same::value) + { + std::cout << " BLayout = RowMajor"; + } + else if constexpr(is_same::value) + { + std::cout << " BLayout = ColumnMajor"; + } + + std::cout << " M = " << M << " N = " << N << " K = " << K << " StrideA = " << StrideA + << " StrideB = " << StrideB << " StrideC = " << StrideC << " KBatch = " << best_kbatch + << " : " << best_ave_time << " ms, " << best_tflops << " TFlops, " << best_gb_per_sec + << " GB/s, " << best_op_name << std::endl; + + return pass; +} + +} // namespace profiler +} // namespace ck diff --git a/profiler/src/CMakeLists.txt b/profiler/src/CMakeLists.txt index a0978eb6b..61017d4b3 100644 --- a/profiler/src/CMakeLists.txt +++ b/profiler/src/CMakeLists.txt @@ -58,6 +58,7 @@ if(SUPPORTED_GPU_TARGETS MATCHES "gfx9") list(APPEND PROFILER_SOURCES profile_gemm_bias_add_reduce.cpp) list(APPEND PROFILER_SOURCES profile_gemm_splitk.cpp) list(APPEND PROFILER_SOURCES profile_gemm_universal.cpp) + list(APPEND PROFILER_SOURCES profile_gemm_b_scale.cpp) list(APPEND PROFILER_SOURCES profile_gemm_universal_batched.cpp) list(APPEND PROFILER_SOURCES profile_gemm_universal_reduce.cpp) list(APPEND 
PROFILER_SOURCES profile_gemm_universal_streamk.cpp) @@ -141,6 +142,7 @@ if(SUPPORTED_GPU_TARGETS MATCHES "gfx9") endif() target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_splitk_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_universal_instance) + target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_b_scale_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_universal_batched_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_universal_reduce_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_universal_streamk_instance) diff --git a/profiler/src/profile_gemm_b_scale.cpp b/profiler/src/profile_gemm_b_scale.cpp new file mode 100644 index 000000000..443ebff83 --- /dev/null +++ b/profiler/src/profile_gemm_b_scale.cpp @@ -0,0 +1,181 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include +#include + +#include "profiler/profile_gemm_b_scale_impl.hpp" +#include "profiler_operation_registry.hpp" + +enum struct GemmMatrixLayout +{ + MK_KN_MN, // 0 + MK_NK_MN, // 1 + KM_KN_MN, // 2 + KM_NK_MN, // 3 +}; + +enum struct GemmDataType +{ + F32_F32_F32, // 0 + F16_F16_F16, // 1 + BF16_BF16_BF16, // 2 + INT8_INT8_INT8, // 3 + F8_F16_F16, // 4 + F16_F8_F16, // 5 + F16_F16_F16_F8, // 6 + F8_F8_BF16, // 7 + F16_I4_F16, // 8 +}; + +enum struct BScaleBlockTile +{ + K_64, // 0 + K_128, // 1 +}; + +#define OP_NAME "gemm_b_scale" +#define OP_DESC "Int4-dequant GEMM" + +int profile_gemm_b_scale(int argc, char* argv[]) +{ + if(argc != 16 && argc != 19) + { + printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n"); + printf("arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8; 4: f8@f16; 5: f16@f8; 6: " + "f16->f8; 7: f8->bf16, " + "comp f8; 8: f16@i4)\n"); + printf("arg3: matrix layout (0: A[m, k] * B[k, n] = C[m, n];\n"); + printf(" 1: A[m, k] * B[n, k] = C[m, n];\n"); + 
printf(" 2: A[k, m] * B[k, n] = C[m, n];\n"); + printf(" 3: A[k, m] * B[n, k] = C[m, n])\n"); + printf("arg4: B scale block tile (0: 64, 1: 128):\n"); + printf("arg5: verification (0: no; 1: yes)\n"); + printf("arg6: initialization (0: no init; 1: integer value; 2: decimal value)\n"); + printf("arg7: print tensor value (0: no; 1: yes)\n"); + printf("arg8: time kernel (0=no, 1=yes)\n"); + printf("arg9 to 14: M, N, K, StrideA, StrideB, StrideC\n"); + printf("arg15: split k into mulitiple batch\n"); + printf("optional:\n"); + printf("arg16: number of warm-up cycles (default 1)\n"); + printf("arg17: number of iterations (default 10)\n"); + printf("arg18: memory for rotating buffer (default 0, size in MB)\n"); + exit(1); + } + + printf("Start profiling\n"); + const auto data_type = static_cast(std::stoi(argv[2])); + const auto layout = static_cast(std::stoi(argv[3])); + const auto B_scale_block = static_cast(std::stoi(argv[4])); + const bool do_verification = std::stoi(argv[5]); + const int init_method = std::stoi(argv[6]); + const bool do_log = std::stoi(argv[7]); + const bool time_kernel = std::stoi(argv[8]); + + const int M = std::stoi(argv[9]); + const int N = std::stoi(argv[10]); + const int K = std::stoi(argv[11]); + + const int StrideA = std::stoi(argv[12]); + const int StrideB = std::stoi(argv[13]); + const int StrideC = std::stoi(argv[14]); + const int KBatch = std::stoi(argv[15]); + printf("M:%d, N:%d, K:%d, StrideA:%d, StrideB:%d, StrideC:%d, KBatch:%d\n", + M, + N, + K, + StrideA, + StrideB, + StrideC, + KBatch); + + int n_warmup = 1; + int n_iter = 10; + uint64_t rotating = 0; + if(argc == 19) + { + n_warmup = std::stoi(argv[16]); + n_iter = std::stoi(argv[17]); + rotating = std::stoull(argv[18]) * 1024 * 1024; + + printf("n_warmup:%d, n_iter:%d, rotating:%lu\n", n_warmup, n_iter, rotating); + } + + using F32 = float; + using F16 = ck::half_t; + using I4 = ck::pk_i4_t; + + using Row = ck::tensor_layout::gemm::RowMajor; + using Col = 
ck::tensor_layout::gemm::ColumnMajor; + + auto profile = [&](auto a_type, + auto b_type, + auto b_scale_type, + auto comp_type, + auto acc_type, + auto c_type, + auto scale_block_k, + auto a_layout, + auto b_layout, + auto c_layout) { + using ADataType = decltype(a_type); + using BDataType = decltype(b_type); + using BScaleDataType = decltype(b_scale_type); + using ComputeDataType = decltype(comp_type); + using AccDataType = decltype(acc_type); + using CDataType = decltype(c_type); + + using ALayout = decltype(a_layout); + using BLayout = decltype(b_layout); + using CLayout = decltype(c_layout); + + const int DefaultStrideA = ck::is_same_v ? K : M; + const int DefaultStrideB = ck::is_same_v ? N : K; + const int DefaultStrideC = ck::is_same_v ? N : M; + + bool pass = ck::profiler::profile_gemm_b_scale_impl( + do_verification, + init_method, + do_log, + time_kernel, + M, + N, + K, + (StrideA < 0) ? DefaultStrideA : StrideA, + (StrideB < 0) ? DefaultStrideB : StrideB, + (StrideC < 0) ? DefaultStrideC : StrideC, + KBatch, + n_warmup, + n_iter, + rotating); + + return pass ? 
0 : 1; + }; + + if(data_type == GemmDataType::F16_I4_F16 && layout == GemmMatrixLayout::MK_NK_MN && + B_scale_block == BScaleBlockTile::K_128) + { + printf("F16_I4_F16 MK_NK_MN K_128\n"); + return profile( + F16{}, I4{}, F16{}, F16{}, F32{}, F16{}, ck::Number<128>{}, Row{}, Col{}, Row{}); + } + else + { + std::cout << "this data_type & layout is not implemented" << std::endl; + + return 1; + } +} + +REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_gemm_b_scale); -- GitLab From 6df5fe2ad8fb6ff054a3e75250ccef7c878c3455 Mon Sep 17 00:00:00 2001 From: carlushuang Date: Fri, 3 Jan 2025 18:43:07 +0800 Subject: [PATCH 150/153] [CK_TILE]naive attn support FP8 KVCache quant (#1747) * quant * fix bug * simple smoothquant after softmax * update kv-quant * update stride * fix fp8-pertoken-kvcache * update int8/fp8 quant support --------- Co-authored-by: so Co-authored-by: Po Yen Chen --- example/ck_tile/01_fmha/fmha_fwd.cpp | 19 +- include/ck_tile/ref/naive_attention.hpp | 422 ++++++++++++++++-------- 2 files changed, 301 insertions(+), 140 deletions(-) diff --git a/example/ck_tile/01_fmha/fmha_fwd.cpp b/example/ck_tile/01_fmha/fmha_fwd.cpp index 08d263da9..b3855e59d 100644 --- a/example/ck_tile/01_fmha/fmha_fwd.cpp +++ b/example/ck_tile/01_fmha/fmha_fwd.cpp @@ -1131,15 +1131,16 @@ bool run(const ck_tile::ArgParser& arg_parser) { // NOTE: use gpu to do validation ck_tile::naive_attention_fwd_traits naive_t; - naive_t.q_type = data_type; - naive_t.k_type = data_type; - naive_t.v_type = data_type; - naive_t.o_type = data_type; - naive_t.q_layout = i_perm == 1 ? "bhsd" : "bshd"; - naive_t.k_layout = i_perm == 1 ? "bhsd" : "bshd"; - naive_t.v_layout = i_perm == 1 ? "bhsd" : "bshd"; - naive_t.o_layout = o_perm == 1 ? "bhsd" : "bshd"; - naive_t.variation = 0; // TODO? + naive_t.q_type = data_type; + naive_t.k_type = data_type; + naive_t.v_type = data_type; + naive_t.o_type = data_type; + naive_t.q_layout = i_perm == 1 ? "bhsd" : "bshd"; + naive_t.k_layout = i_perm == 1 ? 
"bhsd" : "bshd"; + naive_t.v_layout = i_perm == 1 ? "bhsd" : "bshd"; + naive_t.o_layout = o_perm == 1 ? "bhsd" : "bshd"; + naive_t.variation = 0; // TODO? + naive_t.quant_algo = 0; ck_tile::DeviceMem o_naive_buf(o_host.get_element_space_size_in_bytes()); diff --git a/include/ck_tile/ref/naive_attention.hpp b/include/ck_tile/ref/naive_attention.hpp index 09ded761e..98ceab699 100644 --- a/include/ck_tile/ref/naive_attention.hpp +++ b/include/ck_tile/ref/naive_attention.hpp @@ -13,13 +13,18 @@ namespace ck_tile { enum class naive_attention_layout_enum { - BSHD, // [batch, seqlen, nhead, hdim] - BHSD, // [batch, nhead, seqlen, hdim] - BS3HD, // [batch, nhead, 3, seqlen, hdim], used when qkv are packed - PHSD, // [pages, nhead, page_size, hdim] + DEFAULT, // maybe this tensor is not used, set some irrelevant value + BSHD, // [batch, seqlen, nhead, hdim] + BHSD, // [batch, nhead, seqlen, hdim] + BS3HD, // [batch, nhead, 3, seqlen, hdim], used when qkv are packed + PHSD, // [pages, nhead, page_size, hdim] // PHSDX, // [pages, nhead, page_size/x, hdim, x], where <# used pages>*page_size = seqlen PHDSX, // [pages, nhead, hdim/x, page_size, x], where <# used pages>*page_size = seqlen PHDS, // [pages, nhead, hdim, page_size], where <# used pages>*page_size = seqlen + + // scale layout used for dynamic dequant + SCALE_HS, // [nhead, tokens] or [nhead, tokens-per-group], nhe KVCache quant + SCALE_SH, // [tokens, nhead] }; // will used to specialize kernel variation @@ -30,6 +35,15 @@ enum class naive_attention_variation_enum DECODE_PAGED, // decode attn, where kv token from another buffer called kvcache }; +enum class naive_attention_quant_algo +{ + NO = 0, + KV_8BIT_PERHEAD = 1, + // FP8/INT8 quant for KVCache, per-token quant + // [num_tokens, nhead, hdim] -> [nhead, num_tokens] + KV_8BIT_PERTOKEN = 2, +}; + // TODO: for simplicity, this will be used as host/device arg struct naive_attention_fwd_args { @@ -40,7 +54,8 @@ struct naive_attention_fwd_args void* context_len_ptr; 
// [batch] used when seqlen kv come from a pointer(each element is a // number, not cumsum) void* page_table_ptr; // [batch, max_pages_per_seq] seqlen_kv is in different block(paged attn) - void* kvscale_ptr; // [nhead, 2(kv), hdim] used for kvcache dequant + void* kscale_ptr; // [nhead, max_kv_tokens] used for kvcache dequant + void* vscale_ptr; // [nhead, max_kv_tokens] used for kvcache dequant float scale_s; int hdim; int hdim_v; // could be cross-attn, where V and Q/K hdim are different @@ -54,6 +69,7 @@ struct naive_attention_fwd_args int nhead_ratio_kv; // nhead_q / nhead_kv int page_size; // if paged, the seqlen-kv per each block int max_pages_per_seq; + int max_kv_tokens; // used as stride to access kv scale ptr }; // this is trait for host API @@ -67,14 +83,16 @@ struct naive_attention_fwd_traits std::string k_layout; std::string v_layout; std::string o_layout; - int variation; // sync with naive_attention_variation_enum + int variation; // sync with naive_attention_variation_enum + int quant_algo; // sync with naive_attention_quant_algo }; // this is trait for kernel template -template +template struct naive_attention_fwd_kernel_traits { static constexpr naive_attention_variation_enum variation = variation_; + static constexpr naive_attention_quant_algo quant_algo = quant_algo_; }; // for simplicity, please do not use const-reference type for the template type @@ -83,28 +101,39 @@ template struct naive_attention_fwd_kernel { static constexpr bool is_kvcache_i8 = - std::is_same_v && std::is_same_v && sizeof(QType) != 1; + std::is_same_v && std::is_same_v; + static constexpr bool is_kvcache_fp8 = + std::is_same_v && std::is_same_v; - // kvcache-i8 will have per head scale, we apply this scale to Q/P matrix instead of original - // K/V matrix. 
This can speed up conversion since Q/P usually is fp16/bf16/fp32 - static constexpr bool is_kvcache_i8_forward_quant = is_kvcache_i8; + static constexpr int v_per_token_quant_group_size = 64; // TODO: hardcode - using KVScaleType = float; - using SoftmaxType = float; - using PType = VType; // src A of gemm2, same type as V + using SoftmaxType = float; // always using float to do softmax compute + using QuantComputeType = float; // used for quant/dequant scale compute + using QCompute = KType; // src A of gemm1, same type as K + using PType = VType; // src A of gemm2, same type as V + using OAccType = float; // always float, in case int8 FA using p_vec_type = ext_vector_t; static constexpr int p_vec_elem = vector_traits::vector_size; + // clang-format off + template struct scale_max { static constexpr float value = 1; /* dummy code */ }; + template <> struct scale_max { static constexpr float value = 127.0; }; + template <> struct scale_max { static constexpr float value = 240.0; }; + // clang-format on + __host__ __device__ naive_attention_fwd_kernel() {} template @@ -198,24 +227,31 @@ struct naive_attention_fwd_kernel __device__ void store(T /*value*/, int /*i_s*/, int /*i_d*/) {} }; - template + template struct kvscale_addresser { - int h, d; // nhead, hdim + int s, h, d; // seqlen(tokens), nhead, hdim T* base_ptr; - __device__ kvscale_addresser(int h_, int d_, void* p_) - : h(h_), d(d_), base_ptr(reinterpret_cast(p_)) + __device__ kvscale_addresser(int s_, int h_, int d_, void* p_) + : s(s_), h(h_), d(d_), base_ptr(reinterpret_cast(p_)) { } - __device__ int get_offset(int i_h, int i_d, int i_kv /*0 or 1*/) + __device__ int get_offset(int i_s, int i_h, int i_d) { + if constexpr(Layout == naive_attention_layout_enum::SCALE_HS) + { + // [nhead, tokens] + (void)i_d; + return i_h * s + i_s; + } + else if constexpr(Layout == naive_attention_layout_enum::DEFAULT) + { + return 0; + } // [h, 2, d] - return i_h * 2 * d + i_kv * d + i_d; - } - __device__ T load(int i_h, 
int i_d, int i_kv) - { - return base_ptr[get_offset(i_h, i_d, i_kv)]; + // return i_h * 2 * d + i_kv * d + i_d; } + __device__ T load(int i_s, int i_h, int i_d) { return base_ptr[get_offset(i_s, i_h, i_d)]; } }; __device__ __host__ static constexpr int get_block_size() { return 256; } @@ -282,12 +318,13 @@ struct naive_attention_fwd_kernel __device__ void operator()(naive_attention_fwd_args args) { constexpr int wg_size = get_block_size(); - __shared__ char smem[wg_size * 4 * sizeof(float)]; // should enough - int i_dv = blockIdx.x * wg_size + threadIdx.x; // index of hdim_v - int i_sq = blockIdx.y; // index of seqlen_q - int i_batch = blockIdx.z; // index of batch_q * nhead_q - int i_bq = i_batch / args.nhead_q; // index of batch_q - int i_hq = i_batch % args.nhead_q; // index of nhead_q + __shared__ char smem[wg_size * 4 * sizeof(float)]; // should enough + char* smem_quant_q = smem + wg_size * 2 * sizeof(float); // second half, should enough + int i_dv = blockIdx.x * wg_size + threadIdx.x; // index of hdim_v + int i_sq = blockIdx.y; // index of seqlen_q + int i_batch = blockIdx.z; // index of batch_q * nhead_q + int i_bq = i_batch / args.nhead_q; // index of batch_q + int i_hq = i_batch % args.nhead_q; // index of nhead_q int i_bk = i_bq / args.batch_ratio_kv; int i_hk = i_hq / args.nhead_ratio_kv; @@ -360,9 +397,10 @@ struct naive_attention_fwd_kernel auto f_max = [](auto x_, auto y_) { return max(x_, y_); }; auto f_sum = [](auto x_, auto y_) { return x_ + y_; }; auto f_absmax_f32 = [](float v_0_, float v_1_) { - float rtn; - asm volatile("v_max_f32 %0, abs(%1), abs(%2)" : "=v"(rtn) : "v"(v_0_), "v"(v_1_)); - return rtn; + // float rtn; + // asm volatile("v_max_f32 %0, abs(%1), abs(%2)" : "=v"(rtn) : "v"(v_0_), "v"(v_1_)); + // return rtn; + return max(abs(v_0_), abs(v_1_)); }; int seqlen_kv = [&]() { @@ -378,45 +416,82 @@ struct naive_attention_fwd_kernel SoftmaxType row_max = -numeric::infinity(); SoftmaxType l{0}; - AccType o_acc = {0}; + // AccType o_acc = 
{0}; + OAccType o_acc = {0}; - int sk_loops = (seqlen_kv + wg_size - 1) / wg_size; - float qf_scale = .0f; - kvscale_addresser kvscale_addr{args.nhead_kv, args.hdim, args.kvscale_ptr}; + int sk_loops = (seqlen_kv + wg_size - 1) / wg_size; + QuantComputeType q_dequant_scale = .0f; + kvscale_addresser kscale_addr{ + args.max_kv_tokens, args.nhead_kv, args.hdim, args.kscale_ptr}; + kvscale_addresser vscale_addr{ + args.max_kv_tokens, args.nhead_kv, args.hdim_v, args.vscale_ptr}; - if constexpr(is_kvcache_i8_forward_quant) + if constexpr(Traits::quant_algo == naive_attention_quant_algo::KV_8BIT_PERHEAD) { // AccType is i32 now, seqlen_q = 1, hdim up to 256 - float q = 0; - float k_s = 0; + AccType q = 0; + AccType k_s = 0; if(static_cast(threadIdx.x) < args.hdim) { - q = type_convert(q_addr.load(0, threadIdx.x)); - k_s = type_convert(kvscale_addr.load(i_hk, threadIdx.x, 0)); + q = type_convert(q_addr.load(0, threadIdx.x)); + k_s = type_convert(kscale_addr.load(i_hk, threadIdx.x, 0)); } // 1) we apply the k scale to q - float q_forwarded = q * k_s; + AccType q_forwarded = q * k_s; // 2) apply smooth-quant // find absmax - float qf_max = wave_reduce(q_forwarded, f_absmax_f32); - qf_max = cross_wave_reduce(qf_max, f_absmax_f32, reinterpret_cast(smem)); + AccType qf_max = wave_reduce(q_forwarded, f_absmax_f32); + qf_max = cross_wave_reduce(qf_max, f_absmax_f32, reinterpret_cast(smem)); // per-token scale - qf_scale = qf_max / 127.0; + q_dequant_scale = type_convert(qf_max) / scale_max::value; // devide by scale - q = q / qf_scale; + q = q / q_dequant_scale; // fp32->i8 - int8_t quantized_q = static_cast(q); + QCompute quantized_q = static_cast(q); __syncthreads(); - reinterpret_cast(smem)[threadIdx.x] = quantized_q; + reinterpret_cast(smem)[threadIdx.x] = quantized_q; __syncthreads(); // after above process, we have 2 data // 1) int8 q data stored in smem(no need to reload) - // 2) per-token scale qf_scale, to be mul after 1st gemm + // 2) per-token scale q_dequant_scale, 
to be mul after 1st gemm + } + else if constexpr(Traits::quant_algo == naive_attention_quant_algo::KV_8BIT_PERTOKEN) + { + if(std::is_same_v || std::is_same_v) + { + // dyanmic quant q here + float q = 0; + if(static_cast(threadIdx.x) < args.hdim) + { + q = type_convert(q_addr.load(i_sq, threadIdx.x)); + } + + // apply smooth-quant + // find absmax + float q_max = wave_reduce(q, f_absmax_f32); + q_max = cross_wave_reduce(q_max, f_absmax_f32, reinterpret_cast(smem)); + + // per-token scale + q_dequant_scale = + type_convert(q_max) / scale_max::value; + + // devide by scale + q = q / q_dequant_scale; + + QCompute quantized_q = type_convert(q); + __syncthreads(); + reinterpret_cast(smem_quant_q)[threadIdx.x] = quantized_q; + __syncthreads(); + + // after above process, we have 2 data + // 1) fp8 q data stored in smem(no need to reload from global) + // 2) per-token scale q_dequant_scale, to be mul after 1st gemm + } } for(int i_loop1 = 0; i_loop1 < sk_loops; i_loop1++) @@ -429,33 +504,41 @@ struct naive_attention_fwd_kernel AccType s_acc{0}; // clear for every loop for(auto i_dq = 0; i_dq < args.hdim; i_dq++) { - if constexpr(is_kvcache_i8_forward_quant) - { - int8_t q = reinterpret_cast(smem)[i_dq]; - auto k = k_addr.load(i_sk, i_dq); - - s_acc += type_convert(q) * type_convert(k); - } - else - { - auto q = q_addr.load(i_sq, i_dq); // q will have duplicate load - auto k = k_addr.load(i_sk, i_dq); + auto q = [&]() { + if constexpr(Traits::quant_algo == + naive_attention_quant_algo::KV_8BIT_PERHEAD || + Traits::quant_algo == + naive_attention_quant_algo::KV_8BIT_PERTOKEN) + { + return reinterpret_cast(smem_quant_q)[i_dq]; + } + else + return q_addr.load(i_sq, i_dq); // q will have duplicate load + }(); + auto k = [&]() { return k_addr.load(i_sk, i_dq); }(); - s_acc += type_convert(q) * type_convert(k); - } + s_acc += type_convert(q) * type_convert(k); } // scale s_softmax = type_convert(s_acc); s_softmax *= type_convert(args.scale_s * ck_tile::log2e_v); - if 
constexpr(is_kvcache_i8_forward_quant) + if constexpr(Traits::quant_algo == naive_attention_quant_algo::KV_8BIT_PERHEAD) + { + s_softmax *= q_dequant_scale; // post scale the per-token factor + } + else if constexpr(Traits::quant_algo == + naive_attention_quant_algo::KV_8BIT_PERTOKEN) { - s_softmax *= qf_scale; // post scale the per-token factor + SoftmaxType k_per_token_scale = + type_convert(kscale_addr.load(i_sk, i_hk, 0)); + s_softmax *= q_dequant_scale; + s_softmax *= k_per_token_scale; } } // s->p - float pf_scale = 0.; // used for i8 quant + QuantComputeType p_dequant_scale = 1.; { // softmax, find max SoftmaxType old_max = row_max; @@ -473,41 +556,69 @@ struct naive_attention_fwd_kernel // l, pre-scall o_acc SoftmaxType tmp = __builtin_amdgcn_exp2f(old_max - row_max); l = tmp * l + row_sum; - o_acc = type_convert(type_convert(o_acc) * tmp); + o_acc = type_convert(type_convert(o_acc) * tmp); // prepare the p_compute into smem, to let every thread read same p_compute and do // 2nd gemm - if constexpr(is_kvcache_i8_forward_quant) + if constexpr(Traits::quant_algo == naive_attention_quant_algo::KV_8BIT_PERHEAD) { - float v_s = 0; + QuantComputeType v_s = 0; if(static_cast(threadIdx.x) < args.hdim_v) { - v_s = type_convert(kvscale_addr.load(i_hk, threadIdx.x, 1)); + v_s = + type_convert(vscale_addr.load(i_hk, threadIdx.x, 1)); } // 1) we apply the v scale to p - float p_forwarded = p_compute * v_s; + QuantComputeType p_forwarded = p_compute * v_s; // 2) apply smooth-quant // find absmax - float pf_max = wave_reduce(p_forwarded, f_absmax_f32); - pf_max = - cross_wave_reduce(pf_max, f_absmax_f32, reinterpret_cast(smem)); + QuantComputeType pf_max = wave_reduce(p_forwarded, f_absmax_f32); + pf_max = cross_wave_reduce( + pf_max, f_absmax_f32, reinterpret_cast(smem)); // per-token scale - pf_scale = pf_max / 127.0; + p_dequant_scale = pf_max / scale_max::value; // 127.0; // devide by scale - p_compute = p_compute / pf_scale; + p_compute = p_compute / p_dequant_scale; 
// fp32->i8 - int8_t quantized_p = static_cast(p_compute); + PType quantized_p = static_cast(p_compute); __syncthreads(); - reinterpret_cast(smem)[threadIdx.x] = quantized_p; + reinterpret_cast(smem)[threadIdx.x] = quantized_p; __syncthreads(); // after above process, we have 2 data // 1) int8 p data stored in smem(no need to reload) - // 2) per-token scale pf_scale, to be mul after 2nd gemm + // 2) per-token scale p_dequant_scale, to be mul after 2nd gemm + } + else if constexpr(Traits::quant_algo == + naive_attention_quant_algo::KV_8BIT_PERTOKEN) + { + // forward apply the v scale to p_compute, this is compute friendly + auto v_scale = type_convert(vscale_addr.load(i_sk, i_hk, 0)); + p_compute *= v_scale; + // smooth-quant + // find absmax + QuantComputeType p_max = wave_reduce(p_compute, f_absmax_f32); + p_max = cross_wave_reduce( + p_max, f_absmax_f32, reinterpret_cast(smem)); + + // per-token scale + p_dequant_scale = p_max / scale_max::value; // 240.0; + + // devide by scale + p_compute = p_compute / p_dequant_scale; + + // fp32->i8 + PType quantized_p = type_convert(p_compute); + __syncthreads(); + reinterpret_cast(smem)[threadIdx.x] = quantized_p; + __syncthreads(); + // after above process, we have 2 data + // 1) fp8_t p data stored in smem(no need to reload) + // 2) per-token scale p_dequant_scale, to be mul after 2nd gemm } else { @@ -531,29 +642,45 @@ struct naive_attention_fwd_kernel int sv_offset = i_loop2 * p_vec_elem + i_j; int i_sv = sk_start + sv_offset; - VType v = 0.f; + VType v = 0; if(i_dv < args.hdim_v && i_sv < seqlen_kv) { v = v_addr.load(i_sv, i_dv); } - o_acc_local += type_convert(p_vec[i_j]) * type_convert(v); + AccType v_compute = [&]() { return type_convert(v); }(); + + o_acc_local += type_convert(p_vec[i_j]) * v_compute; } } - if constexpr(is_kvcache_i8_forward_quant) - { - // apply pr scale to local acc - o_acc_local = - type_convert(type_convert(o_acc_local) * pf_scale); - } - o_acc += o_acc_local; + + OAccType 
post_scale_o_acc_local = [&]() { + if constexpr(Traits::quant_algo == naive_attention_quant_algo::KV_8BIT_PERHEAD) + { + // apply pr scale to local acc + return type_convert(type_convert(o_acc_local) * + p_dequant_scale); + } + else if constexpr(Traits::quant_algo == + naive_attention_quant_algo::KV_8BIT_PERTOKEN) + { + // apply pr scale to local acc + return type_convert(type_convert(o_acc_local) * + p_dequant_scale); + } + else + { + return type_convert(o_acc_local); + } + }(); + o_acc += post_scale_o_acc_local; } } // post scale o_acc { SoftmaxType tmp = l == 0.f ? 0.f : 1.f / l; // in case masking - o_acc = type_convert(type_convert(o_acc) * tmp); + o_acc = type_convert(type_convert(o_acc) * tmp); } // store O @@ -564,18 +691,21 @@ struct naive_attention_fwd_kernel #define CK_TILE_DISPATCH_NAIVE_ATTEN_FWD_INTERNAL_() \ { \ - using ktraits_ = \ - naive_attention_fwd_kernel_traits( \ - variation_)>; \ + using ktraits_ = naive_attention_fwd_kernel_traits< \ + static_cast(variation_), \ + static_cast(quant_algo_)>; \ using k_ = naive_attention_fwd_kernel; \ dim3 grids = k_::get_grid_size(a); \ r = ck_tile::launch_kernel(s, \ @@ -586,31 +716,37 @@ struct naive_attention_fwd_kernel if(t.variation == 0 && t.q_layout == "bshd" && t.k_layout == "bshd" && t.v_layout == "bshd" && \ t.o_layout == "bshd") \ { \ - constexpr auto q_layout_ = naive_attention_layout_enum::BSHD; \ - constexpr auto k_layout_ = naive_attention_layout_enum::BSHD; \ - constexpr auto v_layout_ = naive_attention_layout_enum::BSHD; \ - constexpr auto o_layout_ = naive_attention_layout_enum::BSHD; \ - constexpr int variation_ = 0; \ + constexpr auto q_layout_ = naive_attention_layout_enum::BSHD; \ + constexpr auto k_layout_ = naive_attention_layout_enum::BSHD; \ + constexpr auto v_layout_ = naive_attention_layout_enum::BSHD; \ + constexpr auto o_layout_ = naive_attention_layout_enum::BSHD; \ + constexpr auto k_scale_layout_ = naive_attention_layout_enum::DEFAULT; \ + constexpr auto v_scale_layout_ = 
naive_attention_layout_enum::DEFAULT; \ + constexpr int variation_ = 0; \ CK_TILE_DISPATCH_NAIVE_ATTEN_FWD_INTERNAL_(); \ } \ else if(t.variation == 0 && t.q_layout == "bhsd" && t.k_layout == "bhsd" && \ t.v_layout == "bhsd" && t.o_layout == "bhsd") \ { \ - constexpr auto q_layout_ = naive_attention_layout_enum::BHSD; \ - constexpr auto k_layout_ = naive_attention_layout_enum::BHSD; \ - constexpr auto v_layout_ = naive_attention_layout_enum::BHSD; \ - constexpr auto o_layout_ = naive_attention_layout_enum::BHSD; \ - constexpr int variation_ = 0; \ + constexpr auto q_layout_ = naive_attention_layout_enum::BHSD; \ + constexpr auto k_layout_ = naive_attention_layout_enum::BHSD; \ + constexpr auto v_layout_ = naive_attention_layout_enum::BHSD; \ + constexpr auto o_layout_ = naive_attention_layout_enum::BHSD; \ + constexpr auto k_scale_layout_ = naive_attention_layout_enum::DEFAULT; \ + constexpr auto v_scale_layout_ = naive_attention_layout_enum::DEFAULT; \ + constexpr int variation_ = 0; \ CK_TILE_DISPATCH_NAIVE_ATTEN_FWD_INTERNAL_(); \ } \ else if(t.variation == 2 && t.q_layout == "bhsd" && t.k_layout == "phdsx" && \ t.v_layout == "phds" && t.o_layout == "bhsd") \ { \ - constexpr auto q_layout_ = naive_attention_layout_enum::BHSD; \ - constexpr auto k_layout_ = naive_attention_layout_enum::PHDSX; \ - constexpr auto v_layout_ = naive_attention_layout_enum::PHDS; \ - constexpr auto o_layout_ = naive_attention_layout_enum::BHSD; \ - constexpr int variation_ = 2; \ + constexpr auto q_layout_ = naive_attention_layout_enum::BHSD; \ + constexpr auto k_layout_ = naive_attention_layout_enum::PHDSX; \ + constexpr auto v_layout_ = naive_attention_layout_enum::PHDS; \ + constexpr auto o_layout_ = naive_attention_layout_enum::BHSD; \ + constexpr auto k_scale_layout_ = naive_attention_layout_enum::SCALE_HS; \ + constexpr auto v_scale_layout_ = naive_attention_layout_enum::SCALE_HS; \ + constexpr int variation_ = 2; \ CK_TILE_DISPATCH_NAIVE_ATTEN_FWD_INTERNAL_(); \ } @@ -621,40 
+757,64 @@ CK_TILE_HOST float naive_attention_fwd(naive_attention_fwd_traits t, { float r = -1; // TODO: do not explicitly create too much instance! - if(t.q_type == "fp16" && t.k_type == "fp16" && t.v_type == "fp16" && t.o_type == "fp16") + if(t.q_type == "fp16" && t.k_type == "fp16" && t.v_type == "fp16" && t.o_type == "fp16" && + t.quant_algo == 0) + { + using q_type_ = fp16_t; + using k_type_ = fp16_t; + using v_type_ = fp16_t; + using o_type_ = fp16_t; + using acc_type_ = float; + using kvscale_type_ = float; + constexpr int quant_algo_ = 0; + CK_TILE_DISPATCH_NAIVE_ATTEN_FWD_LAOYUT_(); + } + else if(t.q_type == "bf16" && t.k_type == "bf16" && t.v_type == "bf16" && t.o_type == "bf16" && + t.quant_algo == 0) { - using q_type_ = fp16_t; - using k_type_ = fp16_t; - using v_type_ = fp16_t; - using o_type_ = fp16_t; - using acc_type_ = float; + using q_type_ = bf16_t; + using k_type_ = bf16_t; + using v_type_ = bf16_t; + using o_type_ = bf16_t; + using acc_type_ = float; + using kvscale_type_ = float; + constexpr int quant_algo_ = 0; CK_TILE_DISPATCH_NAIVE_ATTEN_FWD_LAOYUT_(); } - else if(t.q_type == "bf16" && t.k_type == "bf16" && t.v_type == "bf16" && t.o_type == "bf16") + else if(t.q_type == "bf16" && t.k_type == "fp8" && t.v_type == "fp8" && t.o_type == "bf16" && + t.quant_algo == 2) { - using q_type_ = bf16_t; - using k_type_ = bf16_t; - using v_type_ = bf16_t; - using o_type_ = bf16_t; - using acc_type_ = float; + using q_type_ = bf16_t; + using k_type_ = fp8_t; + using v_type_ = fp8_t; + using o_type_ = bf16_t; + using acc_type_ = float; // NOTE! 
+ using kvscale_type_ = float; + constexpr int quant_algo_ = 2; CK_TILE_DISPATCH_NAIVE_ATTEN_FWD_LAOYUT_(); } - else if(t.q_type == "bf16" && t.k_type == "int8" && t.v_type == "int8" && t.o_type == "bf16") + else if(t.q_type == "fp16" && t.k_type == "fp8" && t.v_type == "fp8" && t.o_type == "fp16" && + t.quant_algo == 2) { - using q_type_ = bf16_t; - using k_type_ = int8_t; - using v_type_ = int8_t; - using o_type_ = bf16_t; - using acc_type_ = int32_t; // NOTE! + using q_type_ = fp16_t; + using k_type_ = fp8_t; + using v_type_ = fp8_t; + using o_type_ = fp16_t; + using acc_type_ = float; // NOTE! + using kvscale_type_ = float; + constexpr int quant_algo_ = 2; CK_TILE_DISPATCH_NAIVE_ATTEN_FWD_LAOYUT_(); } - else if(t.q_type == "fp16" && t.k_type == "int8" && t.v_type == "int8" && t.o_type == "fp16") + else if(t.q_type == "bf16" && t.k_type == "int8" && t.v_type == "int8" && t.o_type == "bf16" && + t.quant_algo == 2) { - using q_type_ = fp16_t; - using k_type_ = int8_t; - using v_type_ = int8_t; - using o_type_ = fp16_t; - using acc_type_ = int32_t; // NOTE! + using q_type_ = bf16_t; + using k_type_ = int8_t; + using v_type_ = int8_t; + using o_type_ = bf16_t; + using acc_type_ = int32_t; // NOTE! 
+ using kvscale_type_ = float; + constexpr int quant_algo_ = 2; CK_TILE_DISPATCH_NAIVE_ATTEN_FWD_LAOYUT_(); } return r; -- GitLab From 8ea375bb58243b943918d3673434fd13a59d5a01 Mon Sep 17 00:00:00 2001 From: Illia Silin <98187287+illsilin@users.noreply.github.com> Date: Fri, 3 Jan 2025 16:38:22 -0800 Subject: [PATCH 151/153] terminology clean-up (#1792) --- .../gpu/thread/threadwise_tensor_slice_transfer.hpp | 4 ++-- include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp index 8c65ef32a..bb1871ae6 100644 --- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp +++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp @@ -1544,7 +1544,7 @@ struct ThreadwiseTensorSliceTransfer_StaticToStatic ElementwiseOperation element_op_; }; -// Specilized for WMMA-Navi3 +// Specialized for gfx11 // A single Wave32 is composed by double row // Data exchange allowed between these two rows // This RowLane Dst buf will be filled from two Src buf @@ -1679,7 +1679,7 @@ struct ThreadwiseTensorSliceTransfer_StaticToStatic_InterRow ElementwiseOperation element_op_{}; }; -// Specilized for WMMA-Navi4 +// Specialized for gfx12 template {}; - // * Fixed in Navi3x, Will be wave mode dependent on Navi4x + // * Fixed for gfx11, Will be wave mode dependent on gfx12 // static constexpr index_t num_src_a_vgprs_per_wave = k_per_wmma / 2 * src_a_data_size / 4; // static constexpr index_t num_src_b_vgprs_per_wave = k_per_wmma / 2 * src_b_data_size / 4; // * num_acc_vgprs_per_wave alone M direction -- GitLab From 37b35146482a69189928320ea06a77f3e3109c9e Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 3 Jan 2025 17:47:48 -0800 Subject: [PATCH 152/153] Bump rocm-docs-core from 
1.12.0 to 1.12.1 in /docs/sphinx (#1788) Bumps [rocm-docs-core](https://github.com/ROCm/rocm-docs-core) from 1.12.0 to 1.12.1. - [Release notes](https://github.com/ROCm/rocm-docs-core/releases) - [Changelog](https://github.com/ROCm/rocm-docs-core/blob/develop/CHANGELOG.md) - [Commits](https://github.com/ROCm/rocm-docs-core/compare/v1.12.0...v1.12.1) --- updated-dependencies: - dependency-name: rocm-docs-core dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- docs/sphinx/requirements.in | 2 +- docs/sphinx/requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/sphinx/requirements.in b/docs/sphinx/requirements.in index 46a61a87f..2c7961c37 100644 --- a/docs/sphinx/requirements.in +++ b/docs/sphinx/requirements.in @@ -1,2 +1,2 @@ -rocm-docs-core==1.12.0 +rocm-docs-core==1.12.1 sphinxcontrib-bibtex==2.6.3 diff --git a/docs/sphinx/requirements.txt b/docs/sphinx/requirements.txt index c2e74baae..3b84d1477 100644 --- a/docs/sphinx/requirements.txt +++ b/docs/sphinx/requirements.txt @@ -103,7 +103,7 @@ requests==2.32.3 # via # pygithub # sphinx -rocm-docs-core==1.12.0 +rocm-docs-core==1.12.1 # via -r requirements.in six==1.16.0 # via pybtex -- GitLab From 888317e698e9803c62bd38568abc9e05d7709f33 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bart=C5=82omiej=20Kocot?= Date: Sat, 4 Jan 2025 14:01:33 +0100 Subject: [PATCH 153/153] Fix universal gemm profiler for pk_i4_t (#1790) * Fix universal gemm profiler for pk_i4_t * fix --- include/ck/library/utility/host_tensor.hpp | 13 +++++++++++-- include/ck/utility/type_convert.hpp | 15 ++++++++++++++- .../profiler/profile_gemm_universal_impl.hpp | 6 +++--- 3 files changed, 28 insertions(+), 6 deletions(-) diff --git a/include/ck/library/utility/host_tensor.hpp b/include/ck/library/utility/host_tensor.hpp index ef5738be0..f1730de0e 100644 --- 
a/include/ck/library/utility/host_tensor.hpp +++ b/include/ck/library/utility/host_tensor.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -44,10 +44,19 @@ std::ostream& LogRangeAsType(std::ostream& os, Range&& range, std::string delim) else os << delim; - if constexpr(std::is_same_v || std::is_same_v) + using RangeType = ck::remove_cvref_t; + if constexpr(std::is_same_v || std::is_same_v || + std::is_same_v) { os << ck::type_convert(v); } + else if constexpr(std::is_same_v) + { + const auto packed_floats = ck::type_convert(v); + const ck::vector_type vector_of_floats{packed_floats}; + os << vector_of_floats.template AsType()[ck::Number<0>{}] << delim + << vector_of_floats.template AsType()[ck::Number<1>{}]; + } else { os << static_cast(v); diff --git a/include/ck/utility/type_convert.hpp b/include/ck/utility/type_convert.hpp index f372756e6..9120ce62c 100644 --- a/include/ck/utility/type_convert.hpp +++ b/include/ck/utility/type_convert.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. 
#pragma once @@ -465,6 +465,19 @@ inline __host__ __device__ float2_t type_convert(f8x2_ocp_ #endif } +template <> +inline __host__ __device__ float2_t type_convert(pk_i4_t x) +{ + uint8_t x_u8 = ck::bit_cast(x); + uint8_t x_l = (x_u8 & 0x0f) >> 0; + uint8_t x_h = (x_u8 & 0xf0) >> 4; + + auto l_f32 = ck::type_convert(x_l); + auto h_f32 = ck::type_convert(x_h); + + return {l_f32, h_f32}; +} + template <> inline __host__ __device__ half2_t type_convert(float2_t x) { diff --git a/profiler/include/profiler/profile_gemm_universal_impl.hpp b/profiler/include/profiler/profile_gemm_universal_impl.hpp index ed7e86ded..2054ffbbb 100644 --- a/profiler/include/profiler/profile_gemm_universal_impl.hpp +++ b/profiler/include/profiler/profile_gemm_universal_impl.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2023-2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -177,7 +177,7 @@ bool profile_gemm_universal_impl(int do_verification, } } - if(is_same_v && is_same_v) + if constexpr(is_same_v && is_same_v) { // vector pk_i4x4 permute for(int i = 0; i < N; i++) @@ -188,7 +188,7 @@ bool profile_gemm_universal_impl(int do_verification, for(int k = 0; k < 4; k++) { - int i4x2 = b_k_n_permute(j + k * 2, i); + int i4x2 = b_k_n_permute(j + k * 2, i).data; input[k * 2 + 0] = (i4x2 >> 4) & 0xf; input[k * 2 + 1] = (i4x2 >> 0) & 0xf; } -- GitLab