Commit 22ab193c authored by carlushuang

add ck_tile for matrix_core swizzle kernel

parent b2e95e21
# generate a list of kernels, but not actually emit files at config stage
execute_process(
COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_LIST_DIR}/generate.py
--api fwd,fwd_splitkv --list_blobs ${CMAKE_CURRENT_BINARY_DIR}/fwd_blob_list.txt
)
execute_process(
COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_LIST_DIR}/generate.py
--api bwd --list_blobs ${CMAKE_CURRENT_BINARY_DIR}/bwd_blob_list.txt
)
# NOTE: for cmake, the FMHA_FWD_GEN_BLOBS/FMHA_BWD_GEN_BLOBS files must be in the same directory
# as the current cmake list, otherwise cmake will not figure out the dependency properly
file(STRINGS ${CMAKE_CURRENT_BINARY_DIR}/fwd_blob_list.txt FMHA_FWD_GEN_BLOBS)
file(STRINGS ${CMAKE_CURRENT_BINARY_DIR}/bwd_blob_list.txt FMHA_BWD_GEN_BLOBS)
add_custom_command(
OUTPUT ${FMHA_FWD_GEN_BLOBS}
COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_LIST_DIR}/generate.py
--api fwd,fwd_splitkv --output_dir ${CMAKE_CURRENT_BINARY_DIR}
)
add_custom_command(
OUTPUT ${FMHA_BWD_GEN_BLOBS}
COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_LIST_DIR}/generate.py
--api bwd --output_dir ${CMAKE_CURRENT_BINARY_DIR}
)
set(EXAMPLE_FMHA_FWD "tile_example_fmha_fwd")
# not using add_example_executable() to add this target, since we don't want it
# to be included in "make all/install/check"
message("adding example ${EXAMPLE_FMHA_FWD}")
add_executable(${EXAMPLE_FMHA_FWD} EXCLUDE_FROM_ALL fmha_fwd.cpp)
target_include_directories(${EXAMPLE_FMHA_FWD} PRIVATE ${CMAKE_CURRENT_LIST_DIR})
target_sources(${EXAMPLE_FMHA_FWD} PRIVATE ${FMHA_FWD_GEN_BLOBS})
set(EXAMPLE_FMHA_BWD "tile_example_fmha_bwd")
# not using add_example_executable() to add this target, since we don't want it
# to be included in "make all/install/check"
message("adding example ${EXAMPLE_FMHA_BWD}")
add_executable(${EXAMPLE_FMHA_BWD} EXCLUDE_FROM_ALL fmha_bwd.cpp)
target_include_directories(${EXAMPLE_FMHA_BWD} PRIVATE ${CMAKE_CURRENT_LIST_DIR})
target_sources(${EXAMPLE_FMHA_BWD} PRIVATE ${FMHA_BWD_GEN_BLOBS})
# NOTE: this is dangerous since it will make the whole kernel flush denormals
# WIP with the compiler team on an exp2 intrinsic; remove this once that lands
if(NOT DEFINED FMHA_FWD_FAST_EXP2)
set(FMHA_FWD_FAST_EXP2 true)
endif()
set(EXAMPLE_FMHA_FWD_COMPILE_OPTIONS)
set(EXAMPLE_FMHA_BWD_COMPILE_OPTIONS)
# NOTE: we turn off undefined-func-template to let the source compile without explicitly
# declared function specializations, because they are auto-generated
if(FMHA_FWD_FAST_EXP2)
list(APPEND EXAMPLE_FMHA_FWD_COMPILE_OPTIONS -Wno-undefined-func-template -DCK_TILE_FMHA_FWD_FAST_EXP2=1 -fgpu-flush-denormals-to-zero)
list(APPEND EXAMPLE_FMHA_BWD_COMPILE_OPTIONS -Wno-undefined-func-template -DCK_TILE_FMHA_FWD_FAST_EXP2=1 -fgpu-flush-denormals-to-zero)
else()
list(APPEND EXAMPLE_FMHA_FWD_COMPILE_OPTIONS -Wno-undefined-func-template -DCK_TILE_FMHA_FWD_FAST_EXP2=0)
list(APPEND EXAMPLE_FMHA_BWD_COMPILE_OPTIONS -Wno-undefined-func-template -DCK_TILE_FMHA_FWD_FAST_EXP2=0)
endif()
# Allow comparing floating points directly in order to check sentinel values
list(APPEND EXAMPLE_FMHA_FWD_COMPILE_OPTIONS -Wno-float-equal)
list(APPEND EXAMPLE_FMHA_BWD_COMPILE_OPTIONS -Wno-float-equal)
target_compile_options(${EXAMPLE_FMHA_FWD} PRIVATE ${EXAMPLE_FMHA_FWD_COMPILE_OPTIONS})
target_compile_options(${EXAMPLE_FMHA_BWD} PRIVATE ${EXAMPLE_FMHA_BWD_COMPILE_OPTIONS})
# TODO: we have to turn off this global prop, otherwise the progress bar generated
# by cmake will print too many files and fail with "execvp: /bin/sh: Argument list too long"
# however, this property affects the whole build, not just this target
# TODO: consider generating a makefile ourselves via codegen
set_property(GLOBAL PROPERTY RULE_MESSAGES OFF)
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core.hpp"
namespace ck_tile {
template <typename FusedMoeTileShape_>
struct FusedMoeTilePartitioner_PersistentSplitD
{
using FusedMoeTileShape = ck_tile::remove_cvref_t<FusedMoeTileShape_>;
static constexpr index_t kM_a = FusedMoeTileShape::kM_a;
static constexpr index_t kN_g = FusedMoeTileShape::kN_g;
static constexpr index_t kN_u = FusedMoeTileShape::kN_u;
static constexpr index_t kK_a = FusedMoeTileShape::kK_a;
static constexpr index_t kN_d = FusedMoeTileShape::kN_d;
static constexpr const char* name = "psd"; // expert x hidden
CK_TILE_DEVICE auto operator()(ck_tile::index_t tile_id,
ck_tile::index_t /*num_sorted_tiles*/,
ck_tile::index_t hidden_size)
{
// divmod: split the flat tile id into (quotient, remainder)
const auto f = [](index_t dividend, index_t divisor) {
index_t quotient = dividend / divisor;
index_t modulus = dividend - quotient * divisor;
return ck_tile::make_tuple(quotient, modulus);
};
const index_t num_hidden_tiles = ck_tile::integer_divide_ceil(hidden_size, kN_g);
const auto [sorted_tile_id, hidden_tile_id] = f(tile_id, num_hidden_tiles);
return ck_tile::make_tuple(sorted_tile_id, hidden_tile_id);
}
// persistent
CK_TILE_HOST static constexpr auto GridSize(index_t num_cu, index_t blocks_per_cu)
{
// TODO: this may need tuning
index_t grids = num_cu * blocks_per_cu;
return dim3(grids);
}
};
} // namespace ck_tile
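`operator()` above only decodes a flat `tile_id`; a persistent grid sized by `GridSize` is expected to stride over all `(sorted_tile, hidden_tile)` pairs. A minimal host-side sketch of that walk, assuming a grid-strided driver loop (hypothetical, not part of this commit):
```
// hypothetical persistent walk mirroring the divmod in operator() above
#include <cstdint>

void persistent_walk(int32_t grid_size, int32_t num_sorted_tiles,
                     int32_t hidden_size, int32_t kN_g, int32_t block_id)
{
    const int32_t num_hidden_tiles = (hidden_size + kN_g - 1) / kN_g; // integer_divide_ceil
    const int32_t num_tiles        = num_sorted_tiles * num_hidden_tiles;
    for(int32_t tile_id = block_id; tile_id < num_tiles; tile_id += grid_size)
    {
        const int32_t sorted_tile_id = tile_id / num_hidden_tiles; // quotient
        const int32_t hidden_tile_id = tile_id % num_hidden_tiles; // modulus
        // ... process tile (sorted_tile_id, hidden_tile_id)
        (void)sorted_tile_id;
        (void)hidden_tile_id;
    }
}
```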
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core.hpp"
namespace ck_tile {
template <typename ADataType_,
typename GDataType_,
typename UDataType_,
typename DDataType_,
typename ODataType_,
typename AccDataType_,
typename ScaleDataType_,
typename GateActivation_, // = ck_tile::element_wise::Silu,
typename FusedMoeTileShape_,
typename Traits_>
struct FusedMoePipelineProblem
{
using ADataType = remove_cvref_t<ADataType_>;
using YDataType = ADataType;
using GDataType = remove_cvref_t<GDataType_>;
using UDataType = remove_cvref_t<UDataType_>;
using DDataType = remove_cvref_t<DDataType_>;
using ODataType = remove_cvref_t<ODataType_>;
using AccDataType = remove_cvref_t<AccDataType_>;
using ScaleDataType = remove_cvref_t<ScaleDataType_>;
using FusedMoeTileShape = remove_cvref_t<FusedMoeTileShape_>;
using Traits = remove_cvref_t<Traits_>;
static constexpr index_t kBlockSize = FusedMoeTileShape::NumWarps * get_warp_size();
// attributes from traits
// static constexpr bool kPadSeqLenQ = Traits::kPadSeqLenQ;
// static constexpr bool kPadSeqLenK = Traits::kPadSeqLenK;
// static constexpr bool kPadHeadDimQ = Traits::kPadHeadDimQ;
// static constexpr bool kPadHeadDimV = Traits::kPadHeadDimV;
// static constexpr auto BiasEnum = Traits::BiasEnum;
// static constexpr bool kStoreLSE = Traits::kStoreLSE;
// static constexpr bool kHasDropout = Traits::kHasDropout;
// static constexpr bool kDoFp8StaticQuant = Traits::kDoFp8StaticQuant;
static constexpr index_t kBlockPerCu = Traits::kBlockPerCu;
using GateActivation = remove_cvref_t<GateActivation_>;
};
} // namespace ck_tile
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core.hpp"
namespace ck_tile {
/*
tensors:
1. act (A): input feature map
2. gate (G): B matrix for first gemm, output will do activation(Silu)
3. up (U): B matrix for first gemm
4. down (D): B matrix for second gemm
N_d
/ \
+----------+ |
| Down | |
x----------x |
hidden hidden K_d | | |
N_g N_u x----------x |
| +------x-----x------+------x-----x------+ | | |
dim | | Gate | | | Up | | | | | |
contiguous | | | | | | | | | | |
| | | | | | | | | | |
v +------x-----x------+------x-----x------+ +----------+ V
K_a | | | | | contiguous
/ \ v v v v |
+---------+ +------x-----x------+------x-----x------+ |
M_a | A | | | | | | | | |
+---------+ +------x-----x------+------x-----x------+ |
--------> | | |
contiguous | V V
| x-----x +----------+
+-------------> | Y | ---------> | Out(O) |
SILU x-----x +----------+
K_y = N_g = N_u dim
*/
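// in short (reading the diagram as a typical gated FFN; the exact elementwise
// combine comes from GateActivation in the pipeline problem):
//   Y = Activation(A @ Gate) * (A @ Up), Out = Y @ Down, with K_y = N_g = N_u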
template <typename BlockTile_, // sequence<M_a, N_g, N_u, K_a, N_d>
typename Gemm0BlockWarps_,
typename Gemm0WarpTile_,
typename Gemm1BlockWarps_,
typename Gemm1WarpTile_,
bool IsDLayoutRowMajor_>
struct FusedMoeTileShape
{
using BlockTile = remove_cvref_t<BlockTile_>;
using Gemm0BlockWarps = remove_cvref_t<Gemm0BlockWarps_>;
using Gemm0WarpTile = remove_cvref_t<Gemm0WarpTile_>;
using Gemm1BlockWarps = remove_cvref_t<Gemm1BlockWarps_>;
using Gemm1WarpTile = remove_cvref_t<Gemm1WarpTile_>;
static constexpr index_t NumWarps =
reduce_on_sequence(Gemm0BlockWarps{}, multiplies{}, number<1>{});
static_assert(NumWarps == reduce_on_sequence(Gemm1BlockWarps{}, multiplies{}, number<1>{}));
static constexpr index_t kM_a = BlockTile::at(number<0>{});
static constexpr index_t kN_g = BlockTile::at(number<1>{});
static constexpr index_t kN_u = BlockTile::at(number<2>{});
static constexpr index_t kK_a = BlockTile::at(number<3>{});
static constexpr index_t kN_d = BlockTile::at(number<4>{});
static_assert(kN_g == kN_u);
static constexpr index_t kK_y = kN_g;
static constexpr index_t kM_0 = kM_a;
    static constexpr index_t kN_0 = kN_g; // note: effective N is 2x (gate and up computed side by side)
static constexpr index_t kK_0 = kK_a;
static constexpr index_t kM_1 = kM_0;
static constexpr index_t kN_1 = kN_d;
static constexpr index_t kK_1 = kN_g;
    // d layout: rowmajor = hidden*dim, colmajor = dim*hidden (vLLM uses this layout)
static constexpr bool IsDLayoutRowMajor = IsDLayoutRowMajor_;
using DLayout = std::conditional_t<IsDLayoutRowMajor,
ck_tile::tensor_layout::gemm::RowMajor,
ck_tile::tensor_layout::gemm::ColumnMajor>;
};
} // namespace ck_tile
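For reference, an instantiation might look like the sketch below; the tile sizes are illustrative only, not taken from this commit:
```
// hypothetical tile shape; BlockTile is sequence<M_a, N_g, N_u, K_a, N_d>
using Shape = ck_tile::FusedMoeTileShape<
    ck_tile::sequence<32, 128, 128, 64, 128>, // kM_a, kN_g, kN_u, kK_a, kN_d
    ck_tile::sequence<1, 4, 1>,               // Gemm0BlockWarps, product = NumWarps = 4
    ck_tile::sequence<32, 32, 8>,             // Gemm0WarpTile
    ck_tile::sequence<1, 4, 1>,               // Gemm1BlockWarps, must give the same NumWarps
    ck_tile::sequence<32, 32, 8>,             // Gemm1WarpTile
    /*IsDLayoutRowMajor_=*/false>;            // ColumnMajor D, i.e. the vLLM dim*hidden layout
static_assert(Shape::kK_y == Shape::kN_g);    // gemm1 reduces over the gate/up N tile
```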
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/ops/common/tensor_layout.hpp"
#include "ck_tile/ops/fmha/block/block_attention_bias_enum.hpp"
#include "ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_async_default_policy.hpp"
#include "ck_tile/ops/fmha/block/block_dropout.hpp"
#include "ck_tile/ops/reduce/block/block_reduce.hpp"
namespace ck_tile {
template <bool GateUpPreShuffled_ = false,
bool DownPreShuffled_ = false,
index_t NumPrefetchA_ = 2,
index_t NumPrefetchG_ = 2,
index_t NumPrefetchU_ = 2,
index_t NumPrefetchD_ = 2,
index_t kBlockPerCu_ = -1 /* overwrite occupancy if not -1 */>
struct FusedMoeTraits
{
static constexpr bool GateUpPreShuffled = GateUpPreShuffled_;
static constexpr bool DownPreShuffled = DownPreShuffled_;
static constexpr index_t NumPrefetchA = NumPrefetchA_;
static constexpr index_t NumPrefetchG = NumPrefetchG_;
static constexpr index_t NumPrefetchU = NumPrefetchU_;
static constexpr index_t NumPrefetchD = NumPrefetchD_;
static constexpr index_t kBlockPerCu = kBlockPerCu_;
};
} // namespace ck_tile
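Putting the pieces together, a pipeline problem could be assembled as in the sketch below; the data types are illustrative, and `Shape` is the hypothetical tile shape from the previous sketch:
```
// hypothetical problem assembly from the traits/shape/problem templates above
using Traits  = ck_tile::FusedMoeTraits<>; // defaults: no pre-shuffle, 2-deep prefetch, kBlockPerCu = -1
using Problem = ck_tile::FusedMoePipelineProblem<
    ck_tile::fp16_t,             // ADataType
    ck_tile::fp16_t,             // GDataType
    ck_tile::fp16_t,             // UDataType
    ck_tile::fp16_t,             // DDataType
    ck_tile::fp16_t,             // ODataType
    float,                       // AccDataType
    float,                       // ScaleDataType (assumed here)
    ck_tile::element_wise::Silu, // GateActivation
    Shape,
    Traits>;
```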
# not using add_example_executable() to add this target, since we don't want it
# to be included in "make all/install/check"
add_executable(tile_example_permute EXCLUDE_FROM_ALL permute.cpp)
if(NOT DEFINED PERMUTE_USE_ALTERNATIVE_IMPL)
# set(PERMUTE_USE_ALTERNATIVE_IMPL false)
set(PERMUTE_USE_ALTERNATIVE_IMPL true)
endif()
if(PERMUTE_USE_ALTERNATIVE_IMPL)
target_compile_options(tile_example_permute PRIVATE -DPERMUTE_USE_ALTERNATIVE_IMPL)
target_sources(tile_example_permute PRIVATE alternative_impl/matrix_core_swizzle.cpp)
endif()
# target_compile_options(tile_example_permute PRIVATE -v --save-temps -Wno-gnu-line-marker)
...@@ -36,3 +36,11 @@ or you can try the smoke_test
# in the root of ck_tile, after you build this example
sh example/ck_tile/06_permute/script/smoke_test.sh
```
### alternative implementation
We have an alternative implementation under the `alternative_impl/` folder that swizzles the tensor into a layout friendlier for matrix-core data loading. It is used when dealing with a rank-7 tensor whose permute pattern is either `0,1,4,2,5,3,6` or `0,1,2,4,5,3,6`. There are further shape limitations in this implementation; check the source code of `permute.cpp` for details.
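For these patterns the rank-7 input is interpreted as `b, n0, n1, n2, k0, k1, k2` (so `n = n0*n1*n2`, `k = k0*k1*k2`), and the output re-tiles the same data as `b, n0, k0, n1, k1, n2, k2` or `b, n0, n1, k0, k1, n2, k2` respectively; see the comment block in `alternative_impl/matrix_core_swizzle_kernel.hpp` for the per-instruction dimension constraints.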
```
# example
./build/bin/tile_example_permute -shape=3,6,4,32,16,2,8 -perm=0,1,4,2,5,3,6 # b_n0_k0_n1_k1_n2_k2
./build/bin/tile_example_permute -shape=3,8,4,16,16,4,8 -perm=0,1,2,4,5,3,6 # b_n0_n1_k0_k1_n2_k2
```
#include "matrix_core_swizzle.hpp"
#include "matrix_core_swizzle_kernel.hpp"
float matrix_core_swizzle(matrix_core_swizzle_traits t,
matrix_core_swizzle_args a,
const ck_tile::stream_config& s)
{
if(t.data_type.compare("fp16") == 0)
{
if(t.inst.compare("32x32x8") == 0)
{
constexpr int BLOCK_SIZE = 256;
constexpr int NPerBlock = 256;
constexpr int KPerBlock = 128;
constexpr matrix_core_inst_enum Inst = matrix_core_inst_enum::MFMA_32x32x8_F16;
if(t.permute.compare("0,1,4,2,5,3,6") == 0)
{
constexpr matrix_core_permute_style pstyle =
matrix_core_permute_style::permute_b_n0_k0_n1_k1_n2_k2;
using Kernel =
matrix_core_swizzle_kernel<BLOCK_SIZE, NPerBlock, KPerBlock, pstyle, Inst>;
auto k = Kernel(a);
float ave_time = ck_tile::launch_kernel(s, k);
return ave_time;
}
else if(t.permute.compare("0,1,2,4,5,3,6") == 0)
{
constexpr matrix_core_permute_style pstyle =
matrix_core_permute_style::permute_b_n0_n1_k0_k1_n2_k2;
using Kernel =
matrix_core_swizzle_kernel<BLOCK_SIZE, NPerBlock, KPerBlock, pstyle, Inst>;
auto k = Kernel(a);
float ave_time = ck_tile::launch_kernel(s, k);
return ave_time;
}
}
else if(t.inst.compare("16x16x16") == 0)
{
constexpr int BLOCK_SIZE = 256;
constexpr int NPerBlock = 256;
constexpr int KPerBlock = 128;
constexpr matrix_core_inst_enum Inst = matrix_core_inst_enum::MFMA_16x16x16_F16;
if(t.permute.compare("0,1,4,2,5,3,6") == 0)
{
constexpr matrix_core_permute_style pstyle =
matrix_core_permute_style::permute_b_n0_k0_n1_k1_n2_k2;
using Kernel =
matrix_core_swizzle_kernel<BLOCK_SIZE, NPerBlock, KPerBlock, pstyle, Inst>;
auto k = Kernel(a);
float ave_time = ck_tile::launch_kernel(s, k);
return ave_time;
}
else if(t.permute.compare("0,1,2,4,5,3,6") == 0)
{
constexpr matrix_core_permute_style pstyle =
matrix_core_permute_style::permute_b_n0_n1_k0_k1_n2_k2;
using Kernel =
matrix_core_swizzle_kernel<BLOCK_SIZE, NPerBlock, KPerBlock, pstyle, Inst>;
auto k = Kernel(a);
float ave_time = ck_tile::launch_kernel(s, k);
return ave_time;
}
}
}
return -1;
}
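Host-side usage is string-keyed; a minimal sketch, assuming device buffers `p_src`/`p_dst` and a `stream_config` set up as in `permute.cpp`:
```
// hypothetical call into the dispatcher above
matrix_core_swizzle_traits t;
t.data_type = "fp16";
t.inst      = "32x32x8";
t.permute   = "0,1,4,2,5,3,6";
matrix_core_swizzle_args a;
a.p_src = p_src;
a.p_dst = p_dst;
a.batch = 3;
a.n     = 6 * 4 * 32; // n0*n1*n2 for -shape=3,6,4,32,16,2,8
a.k     = 16 * 2 * 8; // k0*k1*k2
float ave_time = matrix_core_swizzle(t, a, stream_config);
if(ave_time < 0) { /* unsupported data_type/inst/permute combination */ }
```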
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "matrix_core_swizzle_kernel.hpp"
#include <string>
struct matrix_core_swizzle_traits
{
std::string data_type; // fp16 only
std::string inst; // 32x32x8, 16x16x16
std::string permute; // "0,1,4,2,5,3,6" or "0,1,2,4,5,3,6"
};
using matrix_core_swizzle_args = matrix_core_swizzle_host_args;
// host API
float matrix_core_swizzle(matrix_core_swizzle_traits,
matrix_core_swizzle_args,
const ck_tile::stream_config&);
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/host.hpp"
#include "ck_tile/ops/gemm.hpp"
enum class matrix_core_inst_enum
{
MFMA_32x32x8_F16 = 0,
MFMA_16x16x16_F16 = 1,
};
namespace detail {
template <matrix_core_inst_enum>
struct to_warp_gemm;
template <>
struct to_warp_gemm<matrix_core_inst_enum::MFMA_32x32x8_F16>
{
using type = ck_tile::WarpGemmMfmaF16F16F32M32N32K8;
};
template <>
struct to_warp_gemm<matrix_core_inst_enum::MFMA_16x16x16_F16>
{
using type = ck_tile::WarpGemmMfmaF16F16F32M16N16K16;
};
} // namespace detail
template <matrix_core_inst_enum Inst>
using to_warp_gemm_t = typename detail::to_warp_gemm<Inst>::type;
enum class matrix_core_permute_style
{
permute_b_n0_k0_n1_k1_n2_k2 = 0, // 0,1,4,2,5,3,6
permute_b_n0_n1_k0_k1_n2_k2 = 1, // 0,1,2,4,5,3,6
};
// assume this is B matrix, originally we have batch*n*k
// now batch* n0*n1*n2*k0*k1*k2 -> batch* n0*k0*n1*k1*n2*k2
// assume using 32x32x8-f16, 4 waves and extend the KPerLane to 8xfp16(dwordx4)
//
// 4(waves) 32(mfma_m lane)
// | |
// batch* n0*n1*n2*k0*k1*k2 -> batch* n0*k0*n1*k1*n2*k2 -> 8(thread loading)
// nr kr |
// nr 4 32 kr 2 8 2(klane)
//
// permute: 0,1,4,2,5,3,6
// or
// batch* n0*n1*n2*k0*k1*k2 -> batch* n0*n1*k0*k1*n2*k2 -> 8(thread loading)
// permute: 0,1,2,4,5,3,6
//
// this kernel only deals with fp16/bf16 data (16-bit), and uses a 2d block size to do the swizzling
// for simplicity, we only consider n/k that are multiples of the block tile size
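//
// worked numbers for the defaults below (BLOCK_SIZE=256, NPerBlock=256, KPerBlock=128, 32x32x8-f16):
//   n1 = 256/64 = 4 (waves), n2 = 32 (mfma m-lanes), k1 = 2 (mfma k-lanes), k2 = 8 (8xfp16 = dwordx4)
//   so one 256x128 block tile is decomposed as n0=2, n1=4, n2=32 and k0=8, k1=2, k2=8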
// independent host arg with no template
struct matrix_core_swizzle_host_args
{
const void* p_src; // input, contiguous batch * n * k (k fastest-varying)
void* p_dst;       // output, swizzled layout selected by the permute style
int32_t batch;
int32_t n; // n = n0*n1*n2
int32_t k; // k = k0*k1*k2
};
// NOTE: this kernel could follow the style of the generic permute kernel,
// but here we purposely pass the fixed layout in as a template arg and
// generate a different kernel instance per layout
template <int BLOCK_SIZE_ = 256,
int NPerBlock_ = 256,
int KPerBlock_ = 128,
matrix_core_permute_style pstyle_ =
matrix_core_permute_style::permute_b_n0_k0_n1_k1_n2_k2,
matrix_core_inst_enum Inst_ = matrix_core_inst_enum::MFMA_32x32x8_F16>
struct matrix_core_swizzle_kernel
{
using karg = matrix_core_swizzle_host_args;
using harg = matrix_core_swizzle_host_args;
static constexpr int BLOCK_SIZE = BLOCK_SIZE_;
static constexpr int NPerBlock = NPerBlock_;
static constexpr int KPerBlock = KPerBlock_;
static constexpr matrix_core_permute_style pstyle = pstyle_;
static constexpr matrix_core_inst_enum Inst = Inst_;
static constexpr ck_tile::index_t Alignment = 8;
karg a;
dim3 grids;
using WarpGemm = to_warp_gemm_t<Inst>;
__host__ matrix_core_swizzle_kernel(harg h)
{
a = h;
ck_tile::index_t ns = (h.n + NPerBlock - 1) / NPerBlock; // tiles along n
ck_tile::index_t ks = (h.k + KPerBlock - 1) / KPerBlock; // tiles along k
grids = dim3(ks, ns, h.batch); // (k tiles, n tiles, batch)
}
__host__ bool is_applicable(harg h) { return h.n % NPerBlock == 0 && h.k % KPerBlock == 0; }
__host__ void operator()(const ck_tile::stream_config& s) const
{
ck_tile::kentry<BLOCK_SIZE, 1, kernel><<<grids, BLOCK_SIZE, 0, s.stream_id_>>>(a);
}
struct kernel
{
__device__ static constexpr auto get_src_dist()
{
using namespace ck_tile;
constexpr index_t K2 = Alignment;
constexpr index_t N2 = WarpGemm::WarpGemmAttribute::Impl::kAMLane;
constexpr index_t K1 = WarpGemm::WarpGemmAttribute::Impl::kABKLane;
constexpr index_t N1 = BLOCK_SIZE / get_warp_size();
static_assert(NPerBlock % (N1 * N2) == 0);
static_assert(KPerBlock % (K1 * K2) == 0);
constexpr index_t K0 = KPerBlock / (K1 * K2);
constexpr index_t N0 = NPerBlock / (N1 * N2);
// clang-format off
return make_static_tile_distribution(
tile_distribution_encoding<
sequence<1>,// 0
// 1 2 3 4 5 6
tuple<sequence<N0>, sequence<N1>, sequence<N2>, sequence<K0>, sequence<K1>, sequence<K2>>,
// N1 K1 N2
tuple<sequence<2>, sequence<5, 3>>,
tuple<sequence<0>, sequence<0, 0>>,
// N0 K0 K2
sequence<1, 4, 6>,
sequence<0, 0, 0>>{});
// clang-format on
}
__device__ static constexpr auto get_dst_dist()
{
using namespace ck_tile;
constexpr index_t K2 = Alignment;
constexpr index_t N2 = WarpGemm::WarpGemmAttribute::Impl::kAMLane;
constexpr index_t K1 = WarpGemm::WarpGemmAttribute::Impl::kABKLane;
constexpr index_t N1 = BLOCK_SIZE / get_warp_size();
static_assert(NPerBlock % (N1 * N2) == 0);
static_assert(KPerBlock % (K1 * K2) == 0);
constexpr index_t K0 = KPerBlock / (K1 * K2);
constexpr index_t N0 = NPerBlock / (N1 * N2);
if constexpr(pstyle == matrix_core_permute_style::permute_b_n0_k0_n1_k1_n2_k2)
{
// clang-format off
return make_static_tile_distribution(
tile_distribution_encoding<
sequence<1>,// 0
// 1 2 3 4 5 6
tuple<sequence<N0>, sequence<K0>, sequence<N1>, sequence<K1>, sequence<N2>, sequence<K2>>,
// N1 K1 N2
tuple<sequence<3>, sequence<4, 5>>,
tuple<sequence<0>, sequence<0, 0>>,
// N0 K0 K2
sequence<1, 2, 6>,
sequence<0, 0, 0>>{});
// clang-format on
}
else
{
// clang-format off
return make_static_tile_distribution(
tile_distribution_encoding<
sequence<1>,// 0
// 1 2 3 4 5 6
tuple<sequence<N0>, sequence<N1>, sequence<K0>, sequence<K1>, sequence<N2>, sequence<K2>>,
// N1 K1 N2
tuple<sequence<2>, sequence<4, 5>>,
tuple<sequence<0>, sequence<0, 0>>,
// N0 K0 K2
sequence<1, 3, 6>,
sequence<0, 0, 0>>{});
// clang-format on
}
}
__device__ void operator()(karg a_)
{
using namespace ck_tile;
index_t i_k = blockIdx.x;
index_t i_n = blockIdx.y;
index_t i_b = blockIdx.z;
constexpr index_t k2 = Alignment;
constexpr index_t n2 = WarpGemm::WarpGemmAttribute::Impl::kAMLane;
constexpr index_t k1 = WarpGemm::WarpGemmAttribute::Impl::kABKLane;
constexpr index_t n1 = BLOCK_SIZE / get_warp_size();
const index_t k0 = a_.k / (k1 * k2);
const index_t n0 = a_.n / (n1 * n2);
constexpr index_t k2_tile = Alignment;
constexpr index_t n2_tile = WarpGemm::WarpGemmAttribute::Impl::kAMLane;
constexpr index_t k1_tile = WarpGemm::WarpGemmAttribute::Impl::kABKLane;
constexpr index_t n1_tile = BLOCK_SIZE / get_warp_size();
constexpr index_t k0_tile = KPerBlock / (k1_tile * k2_tile);
constexpr index_t n0_tile = NPerBlock / (n1_tile * n2_tile);
const fp16_t* p_src = reinterpret_cast<const fp16_t*>(a_.p_src) + i_b * a_.k * a_.n;
fp16_t* p_dst = reinterpret_cast<fp16_t*>(a_.p_dst) + i_b * a_.k * a_.n;
const auto src_view = [&]() {
const auto tmp = make_naive_tensor_view_packed<address_space_enum::global>(
p_src,
make_tuple(n0, n1, n2, k0, k1, k2),
number<Alignment>{}); // control vector load
return tmp;
}();
const auto src_window = make_tile_window(src_view,
make_tuple(number<n0_tile>{},
number<n1_tile>{},
number<n2_tile>{},
number<k0_tile>{},
number<k1_tile>{},
number<k2_tile>{}),
{i_n * n0_tile, 0, 0, i_k * k0_tile, 0, 0},
get_src_dist());
auto dst_view = [&]() {
if constexpr(pstyle == matrix_core_permute_style::permute_b_n0_k0_n1_k1_n2_k2)
{
auto tmp = make_naive_tensor_view_packed<address_space_enum::global>(
p_dst,
make_tuple(n0, k0, n1, k1, n2, k2),
number<Alignment>{}); // control vector load
return tmp;
}
else
{
auto tmp = make_naive_tensor_view_packed<address_space_enum::global>(
p_dst,
make_tuple(n0, n1, k0, k1, n2, k2),
number<Alignment>{}); // control vector load
return tmp;
}
}();
auto dst_window = [&]() {
if constexpr(pstyle == matrix_core_permute_style::permute_b_n0_k0_n1_k1_n2_k2)
{
return make_tile_window(dst_view,
make_tuple(number<n0_tile>{},
number<k0_tile>{},
number<n1_tile>{},
number<k1_tile>{},
number<n2_tile>{},
number<k2_tile>{}),
{i_n * n0_tile, i_k * k0_tile, 0, 0, 0, 0},
get_dst_dist());
}
else
{
return make_tile_window(dst_view,
make_tuple(number<n0_tile>{},
number<n1_tile>{},
number<k0_tile>{},
number<k1_tile>{},
number<n2_tile>{},
number<k2_tile>{}),
{i_n * n0_tile, 0, i_k * k0_tile, 0, 0, 0},
get_dst_dist());
}
}();
// actual load store
auto src_tile = load_tile(src_window);
// now we only swap the distribution from src to dst, no extra movement occurs
auto dst_tile = make_static_distributed_tensor<fp16_t>(get_dst_dist());
dst_tile.get_thread_buffer() = src_tile.get_thread_buffer();
// final store
store_tile(dst_window, dst_tile);
}
};
};
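Bypassing the string dispatch, the kernel can also be instantiated directly from its template parameters; a sketch under the same assumptions as the dispatcher's fp16/32x32x8 branch (`p_src`/`p_dst` and a stream config `s` assumed to exist):
```
// hypothetical direct instantiation
using Kernel = matrix_core_swizzle_kernel<
    /*BLOCK_SIZE=*/256, /*NPerBlock=*/256, /*KPerBlock=*/128,
    matrix_core_permute_style::permute_b_n0_k0_n1_k1_n2_k2,
    matrix_core_inst_enum::MFMA_32x32x8_F16>;
matrix_core_swizzle_host_args h{p_src, p_dst, /*batch=*/3, /*n=*/768, /*k=*/256};
auto k = Kernel(h);
if(k.is_applicable(h))
{
    float ave_time = ck_tile::launch_kernel(s, k);
}
```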
...@@ -14,6 +14,10 @@
#include <utility>
#include <vector>
#ifdef PERMUTE_USE_ALTERNATIVE_IMPL
#include "alternative_impl/matrix_core_swizzle.hpp"
#endif
namespace detail {
template <int bytes>
struct to_integer_type;
...@@ -191,7 +195,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
std::string data_type = arg_parser.get_str("prec");
int do_validation = arg_parser.get_int("v");
auto shape = decode_vec(arg_parser.get_str("shape"));
auto perm = decode_vec(arg_parser.get_str("perm"));
int stream_warmup = arg_parser.get_int("warmup");
int stream_repeat = arg_parser.get_int("repeat");
...@@ -206,7 +210,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
return false;
}
ck_tile::HostTensor<DataType> x(shape);
ck_tile::FillUniformDistributionIntegerValue<DataType>{-15, 15, seed}(x);
std::vector<ck_tile::index_t> y_shape = [&]() {
...@@ -217,7 +221,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
// std::cout << " i:" << i << ", perm:" << perm[i] << ", rank:" <<
// static_cast<int>(rank)
// << std::endl;
tmp[i] = shape[perm[i]];
}
// std::cout << "@@@" << tmp << std::endl;
return tmp;
...@@ -230,26 +234,78 @@ bool run(const ck_tile::ArgParser& arg_parser)
x_buf.ToDevice(x.data());
std::cout << "[" << data_type << "] shape:" << shape << "->" << y_shape << ", permute:" << perm
          << std::flush;
ck_tile::stream_config stream_config{nullptr,
                                     true,
                                     /* log_level = */ (kname ? 1 : 0),
                                     stream_warmup,
                                     stream_repeat};
float ave_time = 0.f;
auto run_permute = [&]() {
permute_traits t;
t.data_type = data_type;
permute_args a;
a.p_src = x_buf.GetDeviceBuffer();
a.p_dst = y_buf.GetDeviceBuffer();
a.rank = rank;
std::copy(shape.begin(), shape.end(), a.shape);
std::copy(perm.begin(), perm.end(), a.perm);
return permute(t, a, stream_config);
};
#ifdef PERMUTE_USE_ALTERNATIVE_IMPL
// batch* n0*n1*n2*k0*k1*k2 -> batch* n0*k0*n1*k1*n2*k2
if(rank == 7 && (arg_parser.get_str("perm") == std::string("0,1,4,2,5,3,6") ||
arg_parser.get_str("perm") == std::string("0,1,2,4,5,3,6")))
{
matrix_core_swizzle_traits t;
t.data_type = data_type;
t.permute = arg_parser.get_str("perm");
matrix_core_swizzle_args a;
a.p_src = x_buf.GetDeviceBuffer();
a.p_dst = y_buf.GetDeviceBuffer();
a.batch = shape[0];
a.n = shape[1] * shape[2] * shape[3];
a.k = shape[4] * shape[5] * shape[6];
if(shape[6] == 8 && shape[3] == 32 && shape[5] == 2 && shape[2] == 4 && shape[4] % 8 == 0 &&
shape[1] % 2 == 0)
{
// 32x32x8 inst
// perm=0,1,4,2,5,3,6
// y_shape=*,2x,8x,4,2,32,8 (3,6,16,4,2,32,8)
// shape = *,2x,4,32,8x,2,8 (3,6,4,32,16,2,8)
t.inst = "32x32x8";
std::cout << ", matrix_core_swizzle_" << t.inst << std::flush;
ave_time = matrix_core_swizzle(t, a, stream_config);
}
else if(shape[6] == 8 && shape[3] == 16 && shape[5] == 4 && shape[2] == 4 &&
shape[4] % 4 == 0 && shape[1] % 4 == 0)
{
// 16x16x16 inst
// perm=0,1,4,2,5,3,6
// y_shape=*,4x,4x,4,4,16,8
// shape = *,4x,4,16,4x,4,8 (3,8,4,16,16,4,8)
t.inst = "16x16x16";
std::cout << ", matrix_core_swizzle_" << t.inst << std::flush;
ave_time = matrix_core_swizzle(t, a, stream_config);
}
else
{
ave_time = run_permute();
}
}
else
#endif
{
ave_time = run_permute();
}
std::cout << ", time:" << ave_time << "ms" << std::flush; std::cout << ", time:" << ave_time << "ms" << std::flush;
bool pass = true; bool pass = true;
......
...@@ -9,6 +9,14 @@ if [ $# -ge 1 ] ; then
set -x
fi
$EXE -prec=fp16 -shape=3,6,4,32,16,2,8 -perm=0,1,4,2,5,3,6 $COMMON_ARGS
$EXE -prec=fp16 -shape=5,10,4,32,8,2,8 -perm=0,1,4,2,5,3,6 $COMMON_ARGS
$EXE -prec=fp16 -shape=3,8,4,16,16,4,8 -perm=0,1,4,2,5,3,6 $COMMON_ARGS
$EXE -prec=fp16 -shape=3,6,4,32,16,2,8 -perm=0,1,2,4,5,3,6 $COMMON_ARGS
$EXE -prec=fp16 -shape=5,10,4,32,8,2,8 -perm=0,1,2,4,5,3,6 $COMMON_ARGS
$EXE -prec=fp16 -shape=3,8,4,16,16,4,8 -perm=0,1,2,4,5,3,6 $COMMON_ARGS
echo "------------------------------------------------------------------"
for prec in "fp8" "fp16" "fp32" ; do
$EXE -prec=$prec -shape=3,8 -perm=1,0 $COMMON_ARGS
...
...@@ -4,7 +4,9 @@
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/ops/gemm/block/block_gemm_areg_bgmem_creg_v1_default_policy.hpp"
#include "ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_v1.hpp"
#include "ck_tile/ops/gemm/block/block_gemm_problem.hpp"
namespace ck_tile {
...
...@@ -4,6 +4,7 @@
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/ops/gemm/pipeline/block_gemm_pipeline_agmem_bgmem_creg_v1_default_policy.hpp"
namespace ck_tile {
...
...@@ -4,6 +4,7 @@
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/ops/gemm/pipeline/block_gemm_pipeline_agmem_bgmem_creg_v2_default_policy.hpp"
namespace ck_tile {
...