Commit 401e643e authored by Po Yen Chen

Merge branch 'develop' into feature/use-larger-tile-size-for-chunk-prefill

parents d783a8cf fdfe2102
......@@ -33,7 +33,8 @@ auto create_args(int argc, char* argv[])
ck_tile::ArgParser arg_parser;
arg_parser.insert("m", "3328", "m dimension")
.insert("n", "4096", "n dimension")
.insert("stride", "-1", "stride per row, if -1 then equal to n")
.insert("x_stride", "-1", "input stride per row, if -1 then equal to n")
.insert("y_stride", "-1", "output stride per row, if -1 then equal to n")
.insert("v", "1", "cpu validation or not")
.insert("kname", "1", "print kernel name or not")
.insert("prec", "fp16", "precision")
......@@ -47,18 +48,21 @@ auto create_args(int argc, char* argv[])
template <typename DataType>
bool run(const ck_tile::ArgParser& arg_parser)
{
ck_tile::index_t m = arg_parser.get_int("m");
ck_tile::index_t n = arg_parser.get_int("n");
ck_tile::index_t stride = arg_parser.get_int("stride");
if(stride < 0)
stride = n;
ck_tile::index_t m = arg_parser.get_int("m");
ck_tile::index_t n = arg_parser.get_int("n");
ck_tile::index_t x_stride = arg_parser.get_int("x_stride");
if(x_stride < 0)
x_stride = n;
ck_tile::index_t y_stride = arg_parser.get_int("y_stride");
if(y_stride < 0)
y_stride = n;
std::string data_type = arg_parser.get_str("prec");
int kname = arg_parser.get_int("kname");
int do_validation = arg_parser.get_int("v");
int warmup = arg_parser.get_int("warmup");
int repeat = arg_parser.get_int("repeat");
assert(stride >= n);
assert(x_stride >= n);
using TypeConfig = SmoothquantTypeConfig<DataType>;
......@@ -69,14 +73,14 @@ bool run(const ck_tile::ArgParser& arg_parser)
using ComputeDataType = typename TypeConfig::ComputeDataType;
// host verify
ck_tile::HostTensor<XDataType> x_host({m, n}, {stride, 1});
ck_tile::HostTensor<XDataType> x_host({m, n}, {x_stride, 1});
ck_tile::HostTensor<XScaleDataType> xscale_host({n});
ck_tile::HostTensor<YScaleDataType> yscale_host_ref({m}, {1});
ck_tile::HostTensor<YScaleDataType> yscale_host_dev({m}, {1});
ck_tile::HostTensor<QYDataType> qy_host_ref({m, n}, {stride, 1});
ck_tile::HostTensor<QYDataType> qy_host_dev({m, n}, {stride, 1});
ck_tile::HostTensor<QYDataType> qy_host_ref({m, n}, {y_stride, 1});
ck_tile::HostTensor<QYDataType> qy_host_dev({m, n}, {y_stride, 1});
ck_tile::FillUniformDistribution<XDataType>{-.5f, .5f}(x_host);
ck_tile::FillUniformDistribution<XScaleDataType>{1e-3, .5f}(xscale_host);
......@@ -90,7 +94,8 @@ bool run(const ck_tile::ArgParser& arg_parser)
xscale_buf.ToDevice(xscale_host.data());
std::cout << "[" << data_type << "]"
<< " m:" << m << ", n:" << n << ", stride:" << stride << std::flush;
<< " m:" << m << ", n:" << n << ", x_stride:" << x_stride << ", y_stride:" << y_stride
<< std::flush;
smoothquant_traits traits{data_type};
......@@ -100,7 +105,8 @@ bool run(const ck_tile::ArgParser& arg_parser)
qy_buf.GetDeviceBuffer(),
m,
n,
stride};
x_stride,
y_stride};
float ave_time = smoothquant(
traits, args, ck_tile::stream_config{nullptr, true, kname ? 1 : 0, warmup, repeat});
......@@ -116,7 +122,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
if(do_validation)
{
using YDataType = ComputeDataType;
ck_tile::HostTensor<ComputeDataType> y_host({m, n}, {stride, 1});
ck_tile::HostTensor<ComputeDataType> y_host({m, n}, {y_stride, 1});
// smooth outlier
{
auto f = [&](auto n_) {
......@@ -166,7 +172,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
qy_buf.FromDevice(qy_host_dev.data());
auto [rtol, atol] = get_elimit<QYDataType>();
if(stride == n)
if(y_stride == n)
{
pass = ck_tile::check_err(qy_host_dev,
qy_host_ref,
......@@ -178,10 +184,12 @@ bool run(const ck_tile::ArgParser& arg_parser)
{
for(int i_r = 0; i_r < m; i_r++)
{
std::vector<QYDataType> qy_host_dev_row(qy_host_dev.begin() + i_r * stride,
qy_host_dev.begin() + i_r * stride + n);
std::vector<QYDataType> qy_host_ref_row(qy_host_ref.begin() + i_r * stride,
qy_host_ref.begin() + i_r * stride + n);
std::vector<QYDataType> qy_host_dev_row(qy_host_dev.begin() + i_r * y_stride,
qy_host_dev.begin() + i_r * y_stride +
n);
std::vector<QYDataType> qy_host_ref_row(qy_host_ref.begin() + i_r * y_stride,
qy_host_ref.begin() + i_r * y_stride +
n);
pass &= ck_tile::check_err(qy_host_dev_row,
qy_host_ref_row,
std::string("qy[") + std::to_string(i_r) +
......
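The hunks above split the single "stride" argument into independent x_stride (input) and y_stride (output), so the two tensors may use different leading dimensions. A minimal host-side sketch of the addressing and per-row comparison this enables, assuming row-major storage (all names here are illustrative, not part of the change):

#include <cassert>
#include <cstddef>

// Row-major element offset with an explicit row stride (leading dimension).
// With stride >= n, each row stays contiguous but rows may be padded.
inline std::size_t at(std::size_t row, std::size_t col, std::size_t stride)
{
    return row * stride + col;
}

// Mirrors the per-row validation above: when y_stride != n the padding
// between rows is unspecified, so only the first n elements of each row
// are compared.
template <typename T>
bool rows_equal(const T* dev, const T* ref, std::size_t m, std::size_t n, std::size_t y_stride)
{
    assert(y_stride >= n);
    for(std::size_t r = 0; r < m; r++)
        for(std::size_t c = 0; c < n; c++)
            if(dev[at(r, c, y_stride)] != ref[at(r, c, y_stride)])
                return false;
    return true;
}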
......@@ -89,7 +89,8 @@ struct DeviceBatchedGemmV2MultiD : public BaseOperator
index_t BatchStrideE,
AElementwiseOperation a_element_op,
BElementwiseOperation b_element_op,
CDEElementwiseOperation cde_element_op) = 0;
CDEElementwiseOperation cde_element_op,
index_t KBatch) = 0;
virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
};
......
......@@ -41,12 +41,15 @@ __global__ void
__shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
const index_t g_idx = blockIdx.z % karg.Batch;
const index_t k_idx = blockIdx.z / karg.Batch;
const auto a_batch_offset = karg.compute_ptr_offset_of_batch.GetAPtrOffset(g_idx);
const auto b_batch_offset = karg.compute_ptr_offset_of_batch.GetBPtrOffset(g_idx);
const auto ds_batch_offset = karg.compute_ptr_offset_of_batch.GetDsPtrOffset(g_idx);
const auto c_batch_offset = karg.compute_ptr_offset_of_batch.GetCPtrOffset(g_idx);
auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg, k_idx);
// populate pointer, desc for Ds
static_for<0, GridwiseGemm::NumDTensor, 1>{}([&](auto i) {
// D pointer
......@@ -54,8 +57,8 @@ __global__ void
});
GridwiseGemm::template Run<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
karg.p_a_grid + a_batch_offset,
karg.p_b_grid + b_batch_offset,
karg.p_a_grid + a_batch_offset + splitk_batch_offset.a_k_split_offset,
karg.p_b_grid + b_batch_offset + splitk_batch_offset.b_k_split_offset,
karg.p_ds_grid,
karg.p_c_grid + c_batch_offset,
p_shared,
......@@ -87,12 +90,15 @@ __global__ void
__shared__ char p_shared_1[GridwiseGemm::GetSharedMemoryNumberOfByte()];
const index_t g_idx = blockIdx.z % karg.Batch;
const index_t k_idx = blockIdx.z / karg.Batch;
const auto a_batch_offset = karg.compute_ptr_offset_of_batch.GetAPtrOffset(g_idx);
const auto b_batch_offset = karg.compute_ptr_offset_of_batch.GetBPtrOffset(g_idx);
const auto ds_batch_offset = karg.compute_ptr_offset_of_batch.GetDsPtrOffset(g_idx);
const auto c_batch_offset = karg.compute_ptr_offset_of_batch.GetCPtrOffset(g_idx);
auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg, k_idx);
// populate pointer, desc for Ds
static_for<0, GridwiseGemm::NumDTensor, 1>{}([&](auto i) {
// D pointer
......@@ -100,8 +106,8 @@ __global__ void
});
GridwiseGemm::template Run_2Lds<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
karg.p_a_grid + a_batch_offset,
karg.p_b_grid + b_batch_offset,
karg.p_a_grid + a_batch_offset + splitk_batch_offset.a_k_split_offset,
karg.p_b_grid + b_batch_offset + splitk_batch_offset.b_k_split_offset,
karg.p_ds_grid,
karg.p_c_grid + c_batch_offset,
p_shared_0,
......@@ -303,7 +309,8 @@ struct DeviceBatchedGemmMultiD_Xdl_CShuffle_V3
index_t Batch_,
AElementwiseOperation a_element_op_,
BElementwiseOperation b_element_op_,
CElementwiseOperation c_element_op_)
CElementwiseOperation c_element_op_,
index_t KBatch_)
: GridwiseGemm::Argument{p_a_grid_,
p_b_grid_,
p_ds_grid_,
......@@ -315,7 +322,7 @@ struct DeviceBatchedGemmMultiD_Xdl_CShuffle_V3
StrideB_,
StrideDs_,
StrideE_,
1,
KBatch_,
a_element_op_,
b_element_op_,
c_element_op_},
......@@ -336,13 +343,14 @@ struct DeviceBatchedGemmMultiD_Xdl_CShuffle_V3
arg.Print();
}
if(!GridwiseGemm::CheckValidity(arg) || arg.KBatch > 1)
if(!GridwiseGemm::CheckValidity(arg))
{
throw std::runtime_error("wrong! GridwiseGemm has invalid setting");
}
index_t gdx, gdy, gdz;
std::tie(gdx, gdy, gdz) = GridwiseGemm::CalculateGridSize(arg.M, arg.N, arg.Batch);
std::tie(gdx, gdy, gdz) =
GridwiseGemm::CalculateGridSize(arg.M, arg.N, arg.Batch * arg.KBatch);
float ave_time = 0;
......@@ -387,10 +395,11 @@ struct DeviceBatchedGemmMultiD_Xdl_CShuffle_V3
rotating_mem.Next();
// clear c mem
if(arg_.KBatch > 1)
hipGetErrorString(hipMemsetAsync(arg_.p_c_grid,
0,
arg_.M * arg_.N * sizeof(CDataType),
stream_config.stream_id_));
hipGetErrorString(
hipMemsetAsync(arg_.p_c_grid,
0,
arg_.Batch * arg_.M * arg_.N * sizeof(CDataType),
stream_config.stream_id_));
};
ave_time = ck::utility::launch_and_time_kernel_with_preprocess<false>(
......@@ -889,7 +898,8 @@ struct DeviceBatchedGemmMultiD_Xdl_CShuffle_V3
index_t BatchStrideE,
AElementwiseOperation a_element_op,
BElementwiseOperation b_element_op,
CElementwiseOperation c_element_op)
CElementwiseOperation c_element_op,
index_t KBatch = 1)
{
return Argument{static_cast<const ADataType*>(p_a),
static_cast<const BDataType*>(p_b),
......@@ -909,7 +919,8 @@ struct DeviceBatchedGemmMultiD_Xdl_CShuffle_V3
Batch,
a_element_op,
b_element_op,
c_element_op};
c_element_op,
KBatch};
}
static auto MakeInvoker() { return Invoker{}; }
......@@ -934,7 +945,8 @@ struct DeviceBatchedGemmMultiD_Xdl_CShuffle_V3
index_t BatchStrideE,
AElementwiseOperation a_element_op,
BElementwiseOperation b_element_op,
CElementwiseOperation c_element_op) override
CElementwiseOperation c_element_op,
index_t KBatch = 1) override
{
return std::make_unique<Argument>(static_cast<const ADataType*>(p_a),
static_cast<const BDataType*>(p_b),
......@@ -954,7 +966,8 @@ struct DeviceBatchedGemmMultiD_Xdl_CShuffle_V3
Batch,
a_element_op,
b_element_op,
c_element_op);
c_element_op,
KBatch);
}
// polymorphic
......
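The hunks above thread a split-K factor (KBatch) through DeviceBatchedGemmMultiD_Xdl_CShuffle_V3: MakeArgument gains a defaulted KBatch parameter, the grid's z dimension grows to Batch * KBatch, and the kernel decodes g_idx = blockIdx.z % Batch and k_idx = blockIdx.z / Batch. Each K slice contributes a partial product, which is why the invoker memsets the whole output when KBatch > 1 before the partial sums are combined through the kernel's CGlobalMemoryDataOperation. A host-side sketch of the index packing, assuming this decode order (sizes illustrative):

#include <cstdio>

// Model of the blockIdx.z packing used by the kernels above: the z grid
// dimension is Batch * KBatch, so each (batch, k-slice) pair owns one
// z index.
int main()
{
    const int Batch = 3, KBatch = 4;
    for(int z = 0; z < Batch * KBatch; z++)
    {
        const int g_idx = z % Batch; // which GEMM of the batch
        const int k_idx = z / Batch; // which K slice of that GEMM
        std::printf("blockIdx.z=%2d -> batch %d, k-slice %d\n", z, g_idx, k_idx);
    }
    return 0;
}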
// SPDX-License-Identifier: MIT
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
......@@ -106,89 +106,35 @@ struct DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle
static constexpr auto I3 = Number<3>{};
static constexpr index_t KPerBlock = K0PerBlock * K1;
static constexpr auto transform_conv_to_gemm =
TransformConvBwdDataToGemm_v1<NDimSpatial,
ConvBackwardDataSpecialization,
K1,
K1,
MPerBlock,
NPerBlock,
KPerBlock,
true /* DoPadGemmM */,
true /* DoPadGemmN */>{};
static auto GetDummyABDsEGridDescriptor()
{
const std::array<index_t, NDimSpatial + 3> dummy_tensor_lengths = {1};
const std::array<index_t, NDimSpatial + 3> dummy_tensor_strides = {1};
const std::array<index_t, NDimSpatial> dummy_spatial_lengths = {1};
const auto a_grid_desc_ak0_m_ak1 =
transform_conv_to_gemm.template MakeADescriptor_AK0_M_AK1<ALayout>(
dummy_tensor_lengths,
dummy_tensor_strides,
dummy_tensor_lengths,
dummy_tensor_strides,
dummy_tensor_lengths,
dummy_tensor_strides,
dummy_spatial_lengths,
dummy_spatial_lengths,
dummy_spatial_lengths,
dummy_spatial_lengths,
dummy_spatial_lengths);
const auto b_grid_desc_bk0_n_bk1 =
transform_conv_to_gemm.template MakeBDescriptor_BK0_N_BK1<BLayout>(
dummy_tensor_lengths,
dummy_tensor_strides,
dummy_tensor_lengths,
dummy_tensor_strides,
dummy_tensor_lengths,
dummy_tensor_strides,
dummy_spatial_lengths,
dummy_spatial_lengths,
dummy_spatial_lengths,
dummy_spatial_lengths,
dummy_spatial_lengths);
const auto ds_grid_desc_m_n = generate_tuple(
[&](auto i) {
using DLayout = remove_cvref_t<tuple_element_t<i.value, DsLayout>>;
return transform_conv_to_gemm.template MakeCDescriptor_M_N<DLayout>(
dummy_tensor_lengths,
dummy_tensor_strides,
dummy_tensor_lengths,
dummy_tensor_strides,
dummy_tensor_lengths,
dummy_tensor_strides,
dummy_spatial_lengths,
dummy_spatial_lengths,
dummy_spatial_lengths,
dummy_spatial_lengths,
dummy_spatial_lengths);
},
Number<NumDTensor>{});
const auto e_grid_desc_m_n =
transform_conv_to_gemm.template MakeCDescriptor_M_N<ELayout>(dummy_tensor_lengths,
dummy_tensor_strides,
dummy_tensor_lengths,
dummy_tensor_strides,
dummy_tensor_lengths,
dummy_tensor_strides,
dummy_spatial_lengths,
dummy_spatial_lengths,
dummy_spatial_lengths,
dummy_spatial_lengths,
dummy_spatial_lengths);
using ConvToGemmBwdDataTransform = TransformConvBwdDataToGemm_v1<NDimSpatial,
ConvBackwardDataSpecialization,
K1,
K1,
MPerBlock,
NPerBlock,
KPerBlock,
true /* DoPadGemmM */,
true /* DoPadGemmN */,
ALayout,
BLayout,
ELayout>;
static auto
GetDummyABDsEGridDescriptor(const ConvToGemmBwdDataTransform& conv_to_gemm_transform)
{
const auto a_grid_desc_ak0_m_ak1 = conv_to_gemm_transform.MakeADescriptor_AK0_M_AK1();
const auto b_grid_desc_bk0_n_bk1 = conv_to_gemm_transform.MakeBDescriptor_BK0_N_BK1();
const auto ds_grid_desc_m_n =
generate_tuple([&](auto) { return conv_to_gemm_transform.MakeCDescriptor_M_N(); },
Number<NumDTensor>{});
const auto e_grid_desc_m_n = conv_to_gemm_transform.MakeCDescriptor_M_N();
return make_tuple(
a_grid_desc_ak0_m_ak1, b_grid_desc_bk0_n_bk1, ds_grid_desc_m_n, e_grid_desc_m_n);
}
// desc
using ABDsEGridDesc = decltype(GetDummyABDsEGridDescriptor());
constexpr static ConvToGemmBwdDataTransform dummy_conv_to_gemm_transform;
using ABDsEGridDesc = decltype(GetDummyABDsEGridDescriptor(dummy_conv_to_gemm_transform));
using AGridDesc_AK0_M_AK1 = remove_cvref_t<tuple_element_t<0, ABDsEGridDesc>>;
using BGridDesc_BK0_N_BK1 = remove_cvref_t<tuple_element_t<1, ABDsEGridDesc>>;
......@@ -270,7 +216,7 @@ struct DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle
const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_lengths,
const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_strides,
const std::array<std::array<index_t, NDimSpatial + 3>, NumDTensor>&
ds_g_n_c_wis_lengths,
/*ds_g_n_c_wis_lengths*/,
const std::array<std::array<index_t, NDimSpatial + 3>, NumDTensor>&
ds_g_n_c_wis_strides,
const std::array<index_t, NDimSpatial + 3>& e_g_n_c_wis_lengths,
......@@ -291,15 +237,8 @@ struct DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle
b_element_op_{b_element_op},
cde_element_op_{cde_element_op},
a_g_n_k_wos_lengths_{a_g_n_k_wos_lengths},
a_g_n_k_wos_strides_{a_g_n_k_wos_strides},
b_g_k_c_xs_lengths_{b_g_k_c_xs_lengths},
b_g_k_c_xs_strides_{b_g_k_c_xs_strides},
ds_g_n_c_wis_lengths_{ds_g_n_c_wis_lengths},
ds_g_n_c_wis_strides_{ds_g_n_c_wis_strides},
e_g_n_c_wis_lengths_{e_g_n_c_wis_lengths},
e_g_n_c_wis_strides_{e_g_n_c_wis_strides},
conv_filter_strides_{conv_filter_strides},
conv_filter_dilations_{conv_filter_dilations},
input_left_pads_{input_left_pads},
input_right_pads_{input_right_pads}
{
......@@ -382,68 +321,47 @@ struct DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle
tildes = {i_ztilde, i_ytilde, i_xtilde};
}
ConvToGemmBwdDataTransform conv_to_gemm_transform_{a_g_n_k_wos_lengths,
a_g_n_k_wos_strides,
b_g_k_c_xs_lengths,
b_g_k_c_xs_strides,
e_g_n_c_wis_lengths,
e_g_n_c_wis_strides,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads,
tildes};
const auto a_grid_desc_ak0_m_ak1 =
transform_conv_to_gemm.template MakeADescriptor_AK0_M_AK1<ALayout>(
a_g_n_k_wos_lengths,
a_g_n_k_wos_strides,
b_g_k_c_xs_lengths,
b_g_k_c_xs_strides,
e_g_n_c_wis_lengths,
e_g_n_c_wis_strides,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads,
tildes);
conv_to_gemm_transform_.MakeADescriptor_AK0_M_AK1();
const auto b_grid_desc_bk0_n_bk1 =
transform_conv_to_gemm.template MakeBDescriptor_BK0_N_BK1<BLayout>(
a_g_n_k_wos_lengths,
a_g_n_k_wos_strides,
b_g_k_c_xs_lengths,
b_g_k_c_xs_strides,
e_g_n_c_wis_lengths,
e_g_n_c_wis_strides,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads,
tildes);
conv_to_gemm_transform_.MakeBDescriptor_BK0_N_BK1();
DsGridDesc_M_N ds_grid_desc_m_n;
// populate Ds desc
static_for<0, NumDTensor, 1>{}([&](auto i) {
using DLayout = remove_cvref_t<tuple_element_t<i.value, DsLayout>>;
ds_grid_desc_m_n(i) =
transform_conv_to_gemm.template MakeCDescriptor_M_N<DLayout>(
a_g_n_k_wos_lengths,
a_g_n_k_wos_strides,
b_g_k_c_xs_lengths,
b_g_k_c_xs_strides,
ds_g_n_c_wis_lengths[i],
ds_g_n_c_wis_strides[i],
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads,
tildes);
});
const auto e_grid_desc_m_n =
transform_conv_to_gemm.template MakeCDescriptor_M_N<ELayout>(
static_assert(is_same_v<DLayout, ELayout>);
ConvToGemmBwdDataTransform conv_to_gemm_transform_d{
a_g_n_k_wos_lengths,
a_g_n_k_wos_strides,
b_g_k_c_xs_lengths,
b_g_k_c_xs_strides,
e_g_n_c_wis_lengths,
e_g_n_c_wis_strides,
ds_g_n_c_wis_strides[i],
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads,
tildes);
tildes};
ds_grid_desc_m_n(i) = conv_to_gemm_transform_d.MakeCDescriptor_M_N();
});
const auto e_grid_desc_m_n = conv_to_gemm_transform_.MakeCDescriptor_M_N();
// for check validity
ds_grid_desc_m_n_container_.push_back(ds_grid_desc_m_n);
......@@ -522,17 +440,9 @@ struct DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle
BElementwiseOp b_element_op_;
CDEElementwiseOp cde_element_op_;
// for checking IsSupportedArgument()
std::array<index_t, NDimSpatial + 3> a_g_n_k_wos_lengths_;
std::array<index_t, NDimSpatial + 3> a_g_n_k_wos_strides_;
std::array<index_t, NDimSpatial + 3> b_g_k_c_xs_lengths_;
std::array<index_t, NDimSpatial + 3> b_g_k_c_xs_strides_;
std::array<std::array<index_t, NDimSpatial + 3>, NumDTensor> ds_g_n_c_wis_lengths_;
std::array<std::array<index_t, NDimSpatial + 3>, NumDTensor> ds_g_n_c_wis_strides_;
std::array<index_t, NDimSpatial + 3> e_g_n_c_wis_lengths_;
std::array<index_t, NDimSpatial + 3> e_g_n_c_wis_strides_;
std::array<index_t, NDimSpatial> conv_filter_strides_;
std::array<index_t, NDimSpatial> conv_filter_dilations_;
std::array<index_t, NDimSpatial> input_left_pads_;
std::array<index_t, NDimSpatial> input_right_pads_;
};
......
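In this Wmma backward-data device op, the file-scope transform_conv_to_gemm constant and its eleven-argument Make* template calls give way to a ConvToGemmBwdDataTransform type that captures the whole convolution problem at construction and exposes argument-free descriptor factories; a separate transform is built per D tensor because only the strides differ. A reduced sketch of the pattern, with placeholder types standing in for the CK descriptor machinery (none of these names come from the diff):

#include <array>

// Reduced model of the refactor: the problem description moves from every
// Make* call into the transform object's state, so call sites shrink to
// t.MakeADescriptor_AK0_M_AK1() and t.MakeCDescriptor_M_N(). The int
// results are placeholders for the real tensor descriptors.
template <int NDimSpatial>
struct ConvToGemmTransformSketch
{
    std::array<int, NDimSpatial + 3> out_lengths{}; // A (output-grad) tensor
    std::array<int, NDimSpatial + 3> in_lengths{};  // C (input-grad) tensor
    std::array<int, NDimSpatial + 3> in_strides{};

    // A D tensor sharing the E layout reuses this factory with the D
    // strides, which is why the diff builds one transform per D tensor.
    int MakeADescriptor_AK0_M_AK1() const { return out_lengths[1]; } // placeholder
    int MakeCDescriptor_M_N() const { return in_lengths[1]; }        // placeholder
};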
......@@ -54,15 +54,16 @@ template <typename GridwiseGemm,
typename ABDataType,
typename DsPointer,
typename EDataType,
typename AElementwiseOperation,
typename BElementwiseOperation,
typename CDEElementwiseOperation,
typename AElementwiseOp,
typename BElementwiseOp,
typename CDEElementwiseOp,
typename AGridDesc_AK0_M_AK1,
typename BGridDesc_BK0_N_BK1,
typename DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
typename EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
typename Block2ETileMap,
typename ComputePtrOffsetOfBatch,
typename ComputePtrOffsetOfN,
bool HasMainKBlockLoop>
__global__ void
#if CK_USE_LAUNCH_BOUNDS
......@@ -73,10 +74,9 @@ __global__ void
const ABDataType* __restrict__ p_b_grid,
DsPointer p_ds_grid,
EDataType* __restrict__ p_e_grid,
const AElementwiseOperation a_element_op,
const BElementwiseOperation b_element_op,
const CDEElementwiseOperation cde_element_op,
const index_t batch_count,
const AElementwiseOp a_element_op,
const BElementwiseOp b_element_op,
const CDEElementwiseOp cde_element_op,
const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1,
const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1,
const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
......@@ -84,24 +84,29 @@ __global__ void
const EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock
e_grid_desc_mblock_mperblock_nblock_nperblock_,
const Block2ETileMap block_2_ctile_map,
const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch)
const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch,
const ComputePtrOffsetOfN compute_ptr_offset_of_n)
{
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
defined(__gfx94__))
// offset base pointer for each work-group
const index_t num_blocks_per_batch =
__builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
const index_t n_idx = __builtin_amdgcn_readfirstlane(blockIdx.z);
const index_t g_idx = __builtin_amdgcn_readfirstlane(blockIdx.y);
const long_index_t a_batch_offset = amd_wave_read_first_lane(
static_cast<long_index_t>(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx)));
const long_index_t b_batch_offset = amd_wave_read_first_lane(
static_cast<long_index_t>(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx)));
const long_index_t e_batch_offset = amd_wave_read_first_lane(
static_cast<long_index_t>(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx)));
const long_index_t a_batch_offset =
amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx));
const long_index_t b_batch_offset =
amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx));
const long_index_t e_batch_offset =
amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx));
const auto ds_batch_offset = compute_ptr_offset_of_batch.GetDsPtrOffset(g_idx);
const long_index_t a_n_offset =
amd_wave_read_first_lane(compute_ptr_offset_of_n.GetAPtrOffset(n_idx));
const long_index_t e_n_offset =
amd_wave_read_first_lane(compute_ptr_offset_of_n.GetEPtrOffset(n_idx));
__shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
DsPointer p_ds_grid_grp;
......@@ -112,10 +117,10 @@ __global__ void
static_for<0, NumDTensor, 1>{}(
[&](auto i) { p_ds_grid_grp(i) = p_ds_grid[i] + ds_batch_offset[i]; });
GridwiseGemm::template Run<HasMainKBlockLoop>(p_a_grid + a_batch_offset,
GridwiseGemm::template Run<HasMainKBlockLoop>(p_a_grid + a_batch_offset + a_n_offset,
p_b_grid + b_batch_offset,
p_ds_grid_grp,
p_e_grid + e_batch_offset,
p_e_grid + e_batch_offset + e_n_offset,
p_shared,
a_element_op,
b_element_op,
......@@ -130,7 +135,6 @@ __global__ void
ignore = p_b_grid;
ignore = p_ds_grid;
ignore = p_e_grid;
ignore = batch_count;
ignore = a_grid_desc_ak0_m_ak1;
ignore = b_grid_desc_bk0_n_bk1;
ignore = ds_grid_desc_mblock_mperblock_nblock_nperblock;
......@@ -139,6 +143,7 @@ __global__ void
ignore = b_element_op;
ignore = cde_element_op;
ignore = compute_ptr_offset_of_batch;
ignore = compute_ptr_offset_of_n;
ignore = block_2_ctile_map;
#endif
}
......@@ -233,82 +238,54 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
static constexpr auto I2 = Number<2>{};
static constexpr auto I3 = Number<3>{};
static constexpr auto transform_conv_to_gemm =
TransformConvBwdDataToGemm_v1<NDimSpatial,
ConvBackwardDataSpecialization,
AK1,
BK1,
MPerBlock,
NPerBlock,
KPerBlock,
DoPadGemmM,
DoPadGemmN>{};
static auto GetDummyABDsEGridDescriptor()
using ConvToGemmBwdDataTransform = TransformConvBwdDataToGemm_v1<NDimSpatial,
ConvBackwardDataSpecialization,
AK1,
BK1,
MPerBlock,
NPerBlock,
KPerBlock,
DoPadGemmM,
DoPadGemmN,
ALayout,
BLayout,
ELayout,
true, /*SplitConvN*/
ABDataType,
EDataType>;
static auto
GetDummyABDsEGridDescriptor(const ConvToGemmBwdDataTransform& conv_to_gemm_transform)
{
const std::array<index_t, NDimSpatial + 3> dummy_tensor_lengths = {1};
const std::array<index_t, NDimSpatial + 3> dummy_tensor_strides = {1};
const std::array<index_t, NDimSpatial> dummy_spatial_lengths = {1};
const auto a_grid_desc_ak0_m_ak1 =
transform_conv_to_gemm.template MakeADescriptor_AK0_M_AK1<ALayout>(
dummy_tensor_lengths,
dummy_tensor_strides,
dummy_tensor_lengths,
dummy_tensor_strides,
dummy_tensor_lengths,
dummy_tensor_strides,
dummy_spatial_lengths,
dummy_spatial_lengths,
dummy_spatial_lengths,
dummy_spatial_lengths,
dummy_spatial_lengths);
const auto b_grid_desc_bk0_n_bk1 =
transform_conv_to_gemm.template MakeBDescriptor_BK0_N_BK1<BLayout>(
dummy_tensor_lengths,
dummy_tensor_strides,
dummy_tensor_lengths,
dummy_tensor_strides,
dummy_tensor_lengths,
dummy_tensor_strides,
dummy_spatial_lengths,
dummy_spatial_lengths,
dummy_spatial_lengths,
dummy_spatial_lengths,
dummy_spatial_lengths);
const auto a_grid_desc_ak0_m_ak1 = conv_to_gemm_transform.MakeADescriptor_AK0_M_AK1();
const auto b_grid_desc_bk0_n_bk1 = conv_to_gemm_transform.MakeBDescriptor_BK0_N_BK1();
const auto ds_grid_desc_m_n = generate_tuple(
[&](auto i) {
using DLayout = remove_cvref_t<tuple_element_t<i.value, DsLayout>>;
return transform_conv_to_gemm.template MakeCDescriptor_M_N<DLayout>(
dummy_tensor_lengths,
dummy_tensor_strides,
dummy_tensor_lengths,
dummy_tensor_strides,
dummy_tensor_lengths,
dummy_tensor_strides,
dummy_spatial_lengths,
dummy_spatial_lengths,
dummy_spatial_lengths,
dummy_spatial_lengths,
dummy_spatial_lengths);
using DLayout = remove_cvref_t<tuple_element_t<i.value, DsLayout>>;
using DDataType = remove_cvref_t<tuple_element_t<i.value, DsDataType>>;
using ConvToGemmBwdDataTransformD =
TransformConvBwdDataToGemm_v1<NDimSpatial,
ConvBackwardDataSpecialization,
AK1,
BK1,
MPerBlock,
NPerBlock,
KPerBlock,
DoPadGemmM,
DoPadGemmN,
ALayout,
BLayout,
DLayout,
true, /*SplitConvN*/
ABDataType,
DDataType>;
return ConvToGemmBwdDataTransformD{}.MakeCDescriptor_M_N();
},
Number<NumDTensor>{});
const auto e_grid_desc_m_n =
transform_conv_to_gemm.template MakeCDescriptor_M_N<ELayout>(dummy_tensor_lengths,
dummy_tensor_strides,
dummy_tensor_lengths,
dummy_tensor_strides,
dummy_tensor_lengths,
dummy_tensor_strides,
dummy_spatial_lengths,
dummy_spatial_lengths,
dummy_spatial_lengths,
dummy_spatial_lengths,
dummy_spatial_lengths);
const auto e_grid_desc_m_n = conv_to_gemm_transform.MakeCDescriptor_M_N();
return make_tuple(
a_grid_desc_ak0_m_ak1, b_grid_desc_bk0_n_bk1, ds_grid_desc_m_n, e_grid_desc_m_n);
......@@ -377,7 +354,8 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
}
// desc
using ABDsEGridDesc = decltype(GetDummyABDsEGridDescriptor());
constexpr static ConvToGemmBwdDataTransform dummy_conv_to_gemm_transform;
using ABDsEGridDesc = decltype(GetDummyABDsEGridDescriptor(dummy_conv_to_gemm_transform));
using AGridDesc_AK0_M_AK1 = remove_cvref_t<tuple_element_t<0, ABDsEGridDesc>>;
using BGridDesc_BK0_N_BK1 = remove_cvref_t<tuple_element_t<1, ABDsEGridDesc>>;
......@@ -431,15 +409,8 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
b_element_op_{b_element_op},
cde_element_op_{cde_element_op},
a_g_n_k_wos_lengths_{a_g_n_k_wos_lengths},
a_g_n_k_wos_strides_{a_g_n_k_wos_strides},
b_g_k_c_xs_lengths_{b_g_k_c_xs_lengths},
b_g_k_c_xs_strides_{b_g_k_c_xs_strides},
ds_g_n_c_wis_lengths_{ds_g_n_c_wis_lengths},
ds_g_n_c_wis_strides_{ds_g_n_c_wis_strides},
e_g_n_c_wis_lengths_{e_g_n_c_wis_lengths},
e_g_n_c_wis_strides_{e_g_n_c_wis_strides},
conv_filter_strides_{conv_filter_strides},
conv_filter_dilations_{conv_filter_dilations},
input_left_pads_{input_left_pads},
input_right_pads_{input_right_pads}
{
......@@ -450,11 +421,6 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
p_ds_grid_(i) = static_cast<const DDataType*>(p_ds[i]);
});
// A/B/Ds/E Batch Stride
compute_ptr_offset_of_batch_.BatchStrideA_ = a_g_n_k_wos_strides[0];
compute_ptr_offset_of_batch_.BatchStrideB_ = b_g_k_c_xs_strides[0];
compute_ptr_offset_of_batch_.BatchStrideE_ = e_g_n_c_wis_strides[0];
static_for<0, NumDTensor, 1>{}([&](auto i) {
compute_ptr_offset_of_batch_.BatchStrideDs_(i) = ds_g_n_c_wis_strides[i][0];
});
......@@ -526,68 +492,65 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
throw std::runtime_error("wrong! only implemented for 2D and 3D now");
}
ConvToGemmBwdDataTransform conv_to_gemm_transform_{a_g_n_k_wos_lengths,
a_g_n_k_wos_strides,
b_g_k_c_xs_lengths,
b_g_k_c_xs_strides,
e_g_n_c_wis_lengths,
e_g_n_c_wis_strides,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads,
tildes};
conv_N_per_block_ = conv_to_gemm_transform_.N_;
const auto a_grid_desc_ak0_m_ak1 =
transform_conv_to_gemm.template MakeADescriptor_AK0_M_AK1<ALayout>(
a_g_n_k_wos_lengths,
a_g_n_k_wos_strides,
b_g_k_c_xs_lengths,
b_g_k_c_xs_strides,
e_g_n_c_wis_lengths,
e_g_n_c_wis_strides,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads,
tildes);
conv_to_gemm_transform_.MakeADescriptor_AK0_M_AK1();
const auto b_grid_desc_bk0_n_bk1 =
transform_conv_to_gemm.template MakeBDescriptor_BK0_N_BK1<BLayout>(
a_g_n_k_wos_lengths,
a_g_n_k_wos_strides,
b_g_k_c_xs_lengths,
b_g_k_c_xs_strides,
e_g_n_c_wis_lengths,
e_g_n_c_wis_strides,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads,
tildes);
conv_to_gemm_transform_.MakeBDescriptor_BK0_N_BK1();
DsGridDesc_M_N ds_grid_desc_m_n;
// populate Ds desc
static_for<0, NumDTensor, 1>{}([&](auto i) {
using DLayout = remove_cvref_t<tuple_element_t<i.value, DsLayout>>;
ds_grid_desc_m_n(i) =
transform_conv_to_gemm.template MakeCDescriptor_M_N<DLayout>(
a_g_n_k_wos_lengths,
a_g_n_k_wos_strides,
b_g_k_c_xs_lengths,
b_g_k_c_xs_strides,
ds_g_n_c_wis_lengths[i],
ds_g_n_c_wis_strides[i],
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads,
tildes);
});
const auto e_grid_desc_m_n =
transform_conv_to_gemm.template MakeCDescriptor_M_N<ELayout>(
using DLayout = remove_cvref_t<tuple_element_t<i.value, DsLayout>>;
using DDataType = remove_cvref_t<tuple_element_t<i.value, DsDataType>>;
using ConvToGemmBwdDataTransformD =
TransformConvBwdDataToGemm_v1<NDimSpatial,
ConvBackwardDataSpecialization,
AK1,
BK1,
MPerBlock,
NPerBlock,
KPerBlock,
DoPadGemmM,
DoPadGemmN,
ALayout,
BLayout,
DLayout,
true, /*SplitConvN*/
ABDataType,
DDataType>;
ConvToGemmBwdDataTransformD conv_to_gemm_transform_d{
a_g_n_k_wos_lengths,
a_g_n_k_wos_strides,
b_g_k_c_xs_lengths,
b_g_k_c_xs_strides,
e_g_n_c_wis_lengths,
e_g_n_c_wis_strides,
ds_g_n_c_wis_lengths[i],
ds_g_n_c_wis_strides[i],
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads,
tildes);
tildes};
ds_grid_desc_m_n(i) = conv_to_gemm_transform_d.MakeCDescriptor_M_N();
});
const auto e_grid_desc_m_n = conv_to_gemm_transform_.MakeCDescriptor_M_N();
// desc for problem definition
const auto a_grid_desc_m_k =
......@@ -628,6 +591,13 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
}
}
}
// A/B/Ds/E Batch Stride
compute_ptr_offset_of_batch_.BatchStrideA_ = a_g_n_k_wos_strides[0];
compute_ptr_offset_of_batch_.BatchStrideB_ = b_g_k_c_xs_strides[0];
compute_ptr_offset_of_batch_.BatchStrideE_ = e_g_n_c_wis_strides[0];
compute_ptr_offset_of_n_.BatchStrideA_ = a_g_n_k_wos_strides[1] * conv_N_per_block_;
compute_ptr_offset_of_n_.BatchStrideE_ = e_g_n_c_wis_strides[1] * conv_N_per_block_;
}
void Print() const
......@@ -660,6 +630,7 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
// tensor descriptor for problem definition
index_t num_group_;
index_t conv_N_per_block_;
std::vector<AGridDesc_M_K> a_grid_desc_m_k_container_;
std::vector<BGridDesc_N_K> b_grid_desc_n_k_container_;
std::vector<DsGridDesc_M_N> ds_grid_desc_m_n_container_;
......@@ -678,23 +649,16 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
// for computing batch offset
ComputePtrOffsetOfStridedBatch<I1, I1, NumDTensor> compute_ptr_offset_of_batch_;
ComputePtrOffsetOfStridedBatch<I1, I1, I0> compute_ptr_offset_of_n_;
// element-wise op
AElementwiseOp a_element_op_;
BElementwiseOp b_element_op_;
CDEElementwiseOp cde_element_op_;
// for checking IsSupportedArgument()
std::array<index_t, NDimSpatial + 3> a_g_n_k_wos_lengths_;
std::array<index_t, NDimSpatial + 3> a_g_n_k_wos_strides_;
std::array<index_t, NDimSpatial + 3> b_g_k_c_xs_lengths_;
std::array<index_t, NDimSpatial + 3> b_g_k_c_xs_strides_;
std::array<std::array<index_t, NDimSpatial + 3>, NumDTensor> ds_g_n_c_wis_lengths_;
std::array<std::array<index_t, NDimSpatial + 3>, NumDTensor> ds_g_n_c_wis_strides_;
std::array<index_t, NDimSpatial + 3> e_g_n_c_wis_lengths_;
std::array<index_t, NDimSpatial + 3> e_g_n_c_wis_strides_;
std::array<index_t, NDimSpatial> conv_filter_strides_;
std::array<index_t, NDimSpatial> conv_filter_dilations_;
std::array<index_t, NDimSpatial> input_left_pads_;
std::array<index_t, NDimSpatial> input_right_pads_;
};
......@@ -711,8 +675,12 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
arg.Print();
}
float ave_time = 0;
const index_t gdy = arg.num_group_;
const index_t num_workgroups_per_Conv_N =
arg.a_g_n_k_wos_lengths_[I1] / arg.conv_N_per_block_;
const index_t gdz = num_workgroups_per_Conv_N;
float ave_time = 0;
for(std::size_t i = 0; i < arg.a_grid_desc_ak0_m_ak1_container_.size(); i++)
{
if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_m_k_container_[i],
......@@ -724,9 +692,8 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
throw std::runtime_error("wrong! device_op has invalid setting");
}
const index_t grid_size = arg.block_2_etile_map_container_[i].CalculateGridSize(
arg.e_grid_desc_m_n_container_[i]) *
arg.num_group_;
const index_t gdx = arg.block_2_etile_map_container_[i].CalculateGridSize(
arg.e_grid_desc_m_n_container_[i]);
const auto GemmK = arg.a_grid_desc_m_k_container_[i].GetLength(I1);
......@@ -747,12 +714,13 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
DeviceOp::EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
Block2ETileMap,
ComputePtrOffsetOfStridedBatch<I1, I1, NumDTensor>,
ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
has_main_loop>;
return launch_and_time_kernel(
stream_config,
kernel,
dim3(grid_size),
dim3(gdx, gdy, gdz),
dim3(BlockSize),
0,
arg.p_a_grid_,
......@@ -762,13 +730,13 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
arg.a_element_op_,
arg.b_element_op_,
arg.cde_element_op_,
arg.a_g_n_k_wos_lengths_[0], // Group count
arg.a_grid_desc_ak0_m_ak1_container_[i],
arg.b_grid_desc_bk0_n_bk1_container_[i],
arg.ds_grid_desc_mblock_mperblock_nblock_nperblock_container_[i],
arg.e_grid_desc_mblock_mperblock_nblock_nperblock_container_[i],
arg.block_2_etile_map_container_[i],
arg.compute_ptr_offset_of_batch_);
arg.compute_ptr_offset_of_batch_,
arg.compute_ptr_offset_of_n_);
};
if(GridwiseGemm::CalculateHasMainKBlockLoop(GemmK))
......
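The invoker above switches from a flat grid (tile count times group count folded into x) to an explicit dim3(gdx, gdy, gdz): x walks the block-to-E-tile map, y the convolution groups, and z the split-N slices, with gdz = a_g_n_k_wos_lengths[1] / conv_N_per_block. A host-side model of how a block's pointers are offset under this convention, mirroring the g_idx / n_idx reads added to the kernel (all names illustrative):

struct BlockCoords
{
    int tile;    // blockIdx.x: which M/N tile of the GEMM
    int group;   // blockIdx.y: which convolution group
    int n_slice; // blockIdx.z: which slice of the split N dimension
};

// Pointer offsetting as in the kernel: a batch (group) offset plus a
// split-N offset; the diff sets the per-slice N stride to
// strides[1] * conv_N_per_block for the A and E tensors.
const float* a_block_base(const float* a,
                          BlockCoords b,
                          long a_batch_stride,
                          long a_n_stride)
{
    return a + b.group * a_batch_stride + b.n_slice * a_n_stride;
}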
......@@ -41,7 +41,7 @@ __global__ void
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx9__))
__shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg);
auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg, blockIdx.z);
GridwiseGemm::template Run<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
karg.p_a_grid + splitk_batch_offset.a_k_split_offset,
......@@ -76,7 +76,7 @@ __global__ void
__shared__ char p_shared_0[GridwiseGemm::GetSharedMemoryNumberOfByte()];
__shared__ char p_shared_1[GridwiseGemm::GetSharedMemoryNumberOfByte()];
auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg);
auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg, blockIdx.z);
GridwiseGemm::template Run_2Lds<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
karg.p_a_grid + splitk_batch_offset.a_k_split_offset,
......@@ -639,27 +639,27 @@ struct GridwiseGemmMultiD_xdl_cshuffle_v3
struct SplitKBatchOffset
{
__device__ SplitKBatchOffset(Argument& karg)
__device__ SplitKBatchOffset(Argument& karg, index_t k_id)
{
if constexpr(is_same_v<tensor_layout::gemm::RowMajor, ALayout>)
{
a_k_split_offset = blockIdx.z * karg.KRead;
a_k_split_offset = k_id * karg.KRead;
}
else if constexpr(is_same_v<tensor_layout::gemm::ColumnMajor, ALayout>)
{
a_k_split_offset = blockIdx.z * karg.KRead * karg.StrideA;
a_k_split_offset = k_id * karg.KRead * karg.StrideA;
}
if constexpr(is_same_v<tensor_layout::gemm::RowMajor, BLayout>)
{
b_k_split_offset = blockIdx.z * karg.KRead * karg.StrideB;
b_k_split_offset = k_id * karg.KRead * karg.StrideB;
}
else if constexpr(is_same_v<tensor_layout::gemm::ColumnMajor, BLayout>)
{
b_k_split_offset = blockIdx.z * karg.KRead;
b_k_split_offset = k_id * karg.KRead;
}
if(blockIdx.z < static_cast<uint32_t>(karg.KBatch - 1))
if(k_id < karg.KBatch - 1)
{
karg.K = karg.KRead;
}
......
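SplitKBatchOffset now receives the split-K index as an explicit k_id instead of reading blockIdx.z directly, which lets the batched kernels above pass blockIdx.z / Batch. The offsets themselves depend only on layout; a host-side restatement, assuming KRead is the K extent each slice reads (illustrative names):

#include <cstdint>

// Row-major A stores K contiguously, so the k-th slice starts
// k_id * KRead elements in; column-major A strides by StrideA per K step.
int64_t a_split_offset(bool a_row_major, int k_id, int64_t KRead, int64_t StrideA)
{
    return a_row_major ? int64_t(k_id) * KRead : int64_t(k_id) * KRead * StrideA;
}

// B mirrors A: row-major B strides by StrideB per K step, while
// column-major B stores K contiguously.
int64_t b_split_offset(bool b_row_major, int k_id, int64_t KRead, int64_t StrideB)
{
    return b_row_major ? int64_t(k_id) * KRead * StrideB : int64_t(k_id) * KRead;
}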
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
......@@ -13,245 +13,614 @@
namespace ck {
namespace tensor_operation {
namespace {
template <
index_t NDimSpatial,
ck::tensor_operation::device::ConvolutionBackwardDataSpecialization ConvBwdDataSpecialization,
index_t AK1,
index_t BK1,
index_t GemmMPerBlock,
index_t GemmNPerBlock,
index_t GemmKPerBlock,
bool DoPadGemmM,
bool DoPadGemmN,
typename ALayout,
ck::tensor_operation::device::ConvolutionBackwardDataSpecialization ConvBwdDataSpecialization>
constexpr auto make_out_grid_desc(const index_t N,
const index_t Do,
const index_t Ho,
const index_t Wo,
const index_t K,
const std::array<index_t, NDimSpatial + 3>& out_g_n_k_wos_strides)
typename BLayout,
typename CLayout,
bool SplitN = false,
typename ADataType = float,
typename CDataType = float,
index_t NumGroupsToMerge = 1,
typename IndexType = index_t>
struct TransformConvBwdDataToGemm_v1
{
const auto KStride = Number<1>{};
private:
static constexpr auto I0 = Number<0>{};
static constexpr auto I1 = Number<1>{};
static constexpr auto I2 = Number<2>{};
static constexpr auto I3 = Number<3>{};
if constexpr(is_same_v<ALayout, tensor_layout::convolution::NHWGK>)
{
const index_t NStride = out_g_n_k_wos_strides[1];
const index_t HiStride = out_g_n_k_wos_strides[3];
const index_t WiStride = out_g_n_k_wos_strides[4];
if constexpr(ConvBwdDataSpecialization ==
ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::
Filter1x1Stride1Pad0)
{
static constexpr auto NonSpatialDimsNum = Number<3>{};
return make_naive_tensor_descriptor(make_tuple(N * Ho * Wo, K),
make_tuple(WiStride, KStride));
}
else
static constexpr auto DIdx = NonSpatialDimsNum;
static constexpr auto HIdx =
NDimSpatial == 2 ? NonSpatialDimsNum : Number<NonSpatialDimsNum + 1>{};
static constexpr auto WIdx =
NDimSpatial == 2 ? Number<NonSpatialDimsNum + 1>{} : Number<NonSpatialDimsNum + 2>{};
static constexpr auto ZIdx = NonSpatialDimsNum;
static constexpr auto YIdx =
NDimSpatial == 2 ? NonSpatialDimsNum : Number<NonSpatialDimsNum + 1>{};
static constexpr auto XIdx =
NDimSpatial == 2 ? Number<NonSpatialDimsNum + 1>{} : Number<NonSpatialDimsNum + 2>{};
template <typename ConvDimsType>
static long_index_t calculate_element_space_size_impl(const ConvDimsType& lengths,
const ConvDimsType& strides,
index_t i)
{
long_index_t acc = 1;
for(; i < (NDimSpatial + 3); i++)
{
return make_naive_tensor_descriptor(make_tuple(N, Ho, Wo, K),
make_tuple(NStride, HiStride, WiStride, KStride));
acc +=
static_cast<long_index_t>(lengths[i] - I1) * static_cast<long_index_t>(strides[i]);
}
return acc;
}
else if constexpr(is_same_v<ALayout, tensor_layout::convolution::NDHWGK>)
template <typename ConvDimsType>
static IndexType GetSplitedNSize(const ConvDimsType& a_g_n_k_wos_lengths,
const ConvDimsType& a_g_n_k_wos_strides,
const ConvDimsType& c_g_n_c_wis_lengths,
const ConvDimsType& c_g_n_c_wis_strides)
{
const index_t NStride = out_g_n_k_wos_strides[1];
const index_t DoStride = out_g_n_k_wos_strides[3];
const index_t HoStride = out_g_n_k_wos_strides[4];
const index_t WoStride = out_g_n_k_wos_strides[5];
if constexpr(ConvBwdDataSpecialization ==
ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::
Filter1x1Stride1Pad0)
const long_index_t a_element_space_size =
calculate_element_space_size_impl(a_g_n_k_wos_lengths, a_g_n_k_wos_strides, I1);
const long_index_t c_element_space_size =
calculate_element_space_size_impl(c_g_n_c_wis_lengths, c_g_n_c_wis_strides, I1);
const long_index_t element_space_size = math::max(a_element_space_size * sizeof(ADataType),
c_element_space_size * sizeof(CDataType));
constexpr long_index_t TwoGB = (long_index_t{1} << 31);
const IndexType N = a_g_n_k_wos_lengths[I1];
if(element_space_size > TwoGB)
{
// Minimum divisor of N to not exceed 2GB
const auto divisor = math::integer_divide_ceil(element_space_size, TwoGB);
return make_naive_tensor_descriptor(make_tuple(N * Do * Ho * Wo, K),
make_tuple(WoStride, KStride));
if(divisor <= static_cast<double>(N))
{
// Find the least divisor of N at or above element_space_size / TwoGB.
// Search only up to sqrt(N); if no divisor is found there, fall back
// to processing one convolution N per block.
for(IndexType least_divisor = divisor; least_divisor * least_divisor <= N;
least_divisor++)
{
if(N % least_divisor == 0)
{
return N / least_divisor;
}
}
// No divisor found, process one convolution N per block
return 1;
}
else
{
// Not possible to support even after splitting N.
// The tensor is too large.
return N;
}
}
else
{
return make_naive_tensor_descriptor(
make_tuple(N, Do, Ho, Wo, K),
make_tuple(NStride, DoStride, HoStride, WoStride, KStride));
// Split N is not needed.
return N;
}
}
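GetSplitedNSize above keeps each launch's addressed footprint under 2 GB: it computes the ceiling divisor that would bring the larger of the A and C footprints below the limit, then looks for the least actual divisor of N at or above it, bounded at sqrt(N); failing that it falls back to one convolution N per block, and if even a single N cannot fit it returns N unchanged. A standalone restatement with a worked case (host-only sketch, illustrative names):

#include <cstdint>

// Worked case: a 5 GB footprint with N = 12 gives divisor = 3; 3 divides
// 12, so 4 images are processed per launch.
int64_t split_n(int64_t footprint_bytes, int64_t N)
{
    constexpr int64_t TwoGB = int64_t{1} << 31;
    if(footprint_bytes <= TwoGB)
        return N; // no split needed
    const int64_t divisor = (footprint_bytes + TwoGB - 1) / TwoGB;
    if(divisor > N)
        return N; // too large to support even with one N per launch
    // Least divisor of N at or above divisor; the diff bounds the search
    // at sqrt(N) and falls back to one N per block when nothing is found.
    for(int64_t d = divisor; d * d <= N; d++)
        if(N % d == 0)
            return N / d;
    return 1; // none found: process one convolution N per block
}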
else if constexpr(is_same_v<ALayout, tensor_layout::convolution::GNHWK>)
public:
__host__ __device__ constexpr TransformConvBwdDataToGemm_v1() {}
template <typename TransformConvBwdDataToGemm_v1Base>
__host__ __device__ TransformConvBwdDataToGemm_v1(
const TransformConvBwdDataToGemm_v1Base& transform_conv_bwd_data_to_gemm_base)
: N_{static_cast<IndexType>(transform_conv_bwd_data_to_gemm_base.N_)},
Di_{static_cast<IndexType>(transform_conv_bwd_data_to_gemm_base.Di_)},
Hi_{static_cast<IndexType>(transform_conv_bwd_data_to_gemm_base.Hi_)},
Wi_{static_cast<IndexType>(transform_conv_bwd_data_to_gemm_base.Wi_)},
Do_{static_cast<IndexType>(transform_conv_bwd_data_to_gemm_base.Do_)},
Ho_{static_cast<IndexType>(transform_conv_bwd_data_to_gemm_base.Ho_)},
Wo_{static_cast<IndexType>(transform_conv_bwd_data_to_gemm_base.Wo_)},
Z_{static_cast<IndexType>(transform_conv_bwd_data_to_gemm_base.Z_)},
Y_{static_cast<IndexType>(transform_conv_bwd_data_to_gemm_base.Y_)},
X_{static_cast<IndexType>(transform_conv_bwd_data_to_gemm_base.X_)},
K_{static_cast<IndexType>(transform_conv_bwd_data_to_gemm_base.K_)},
C_{static_cast<IndexType>(transform_conv_bwd_data_to_gemm_base.C_)},
DiStride_{static_cast<IndexType>(transform_conv_bwd_data_to_gemm_base.DiStride_)},
HiStride_{static_cast<IndexType>(transform_conv_bwd_data_to_gemm_base.HiStride_)},
WiStride_{static_cast<IndexType>(transform_conv_bwd_data_to_gemm_base.WiStride_)},
DoStride_{static_cast<IndexType>(transform_conv_bwd_data_to_gemm_base.DoStride_)},
HoStride_{static_cast<IndexType>(transform_conv_bwd_data_to_gemm_base.HoStride_)},
WoStride_{static_cast<IndexType>(transform_conv_bwd_data_to_gemm_base.WoStride_)},
CStrideTensorB_{
static_cast<IndexType>(transform_conv_bwd_data_to_gemm_base.CStrideTensorB_)},
CStrideTensorC_{
static_cast<IndexType>(transform_conv_bwd_data_to_gemm_base.CStrideTensorC_)},
KStrideTensorA_{
static_cast<IndexType>(transform_conv_bwd_data_to_gemm_base.KStrideTensorA_)},
KStrideTensorB_{
static_cast<IndexType>(transform_conv_bwd_data_to_gemm_base.KStrideTensorB_)},
NStrideTensorA_{
static_cast<IndexType>(transform_conv_bwd_data_to_gemm_base.NStrideTensorA_)},
NStrideTensorC_{
static_cast<IndexType>(transform_conv_bwd_data_to_gemm_base.NStrideTensorC_)},
ConvStrideD_{static_cast<IndexType>(transform_conv_bwd_data_to_gemm_base.ConvStrideD_)},
ConvStrideH_{static_cast<IndexType>(transform_conv_bwd_data_to_gemm_base.ConvStrideH_)},
ConvStrideW_{static_cast<IndexType>(transform_conv_bwd_data_to_gemm_base.ConvStrideW_)},
ConvDilationD_{
static_cast<IndexType>(transform_conv_bwd_data_to_gemm_base.ConvDilationD_)},
ConvDilationH_{
static_cast<IndexType>(transform_conv_bwd_data_to_gemm_base.ConvDilationH_)},
ConvDilationW_{
static_cast<IndexType>(transform_conv_bwd_data_to_gemm_base.ConvDilationW_)},
InLeftPadD_{static_cast<IndexType>(transform_conv_bwd_data_to_gemm_base.InLeftPadD_)},
InLeftPadH_{static_cast<IndexType>(transform_conv_bwd_data_to_gemm_base.InLeftPadH_)},
InLeftPadW_{static_cast<IndexType>(transform_conv_bwd_data_to_gemm_base.InLeftPadW_)},
InRightPadD_{static_cast<IndexType>(transform_conv_bwd_data_to_gemm_base.InRightPadD_)},
InRightPadH_{static_cast<IndexType>(transform_conv_bwd_data_to_gemm_base.InRightPadH_)},
InRightPadW_{static_cast<IndexType>(transform_conv_bwd_data_to_gemm_base.InRightPadW_)},
IdxZTilde_{static_cast<IndexType>(transform_conv_bwd_data_to_gemm_base.IdxZTilde_)},
IdxYTilde_{static_cast<IndexType>(transform_conv_bwd_data_to_gemm_base.IdxYTilde_)},
IdxXTilde_{static_cast<IndexType>(transform_conv_bwd_data_to_gemm_base.IdxXTilde_)},
GcdStrideDilationD_{
static_cast<IndexType>(transform_conv_bwd_data_to_gemm_base.GcdStrideDilationD_)},
GcdStrideDilationH_{
static_cast<IndexType>(transform_conv_bwd_data_to_gemm_base.GcdStrideDilationH_)},
GcdStrideDilationW_{
static_cast<IndexType>(transform_conv_bwd_data_to_gemm_base.GcdStrideDilationW_)},
ZTilde_{static_cast<IndexType>(transform_conv_bwd_data_to_gemm_base.ZTilde_)},
YTilde_{static_cast<IndexType>(transform_conv_bwd_data_to_gemm_base.YTilde_)},
XTilde_{static_cast<IndexType>(transform_conv_bwd_data_to_gemm_base.XTilde_)},
DTilde_{static_cast<IndexType>(transform_conv_bwd_data_to_gemm_base.DTilde_)},
HTilde_{static_cast<IndexType>(transform_conv_bwd_data_to_gemm_base.HTilde_)},
WTilde_{static_cast<IndexType>(transform_conv_bwd_data_to_gemm_base.WTilde_)},
ZDot_{static_cast<IndexType>(transform_conv_bwd_data_to_gemm_base.ZDot_)},
YDot_{static_cast<IndexType>(transform_conv_bwd_data_to_gemm_base.YDot_)},
XDot_{static_cast<IndexType>(transform_conv_bwd_data_to_gemm_base.XDot_)}
{
// assume packed
if constexpr(ConvBwdDataSpecialization ==
ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::
Filter1x1Stride1Pad0)
}
template <typename ConvDimsType, typename ConvSpatialDimsType>
__host__ __device__
TransformConvBwdDataToGemm_v1(const ConvDimsType& a_g_n_k_wos_lengths,
const ConvDimsType& a_g_n_k_wos_strides,
const ConvDimsType& b_g_k_c_xs_lengths,
const ConvDimsType& b_g_k_c_xs_strides,
const ConvDimsType& c_g_n_c_wis_lengths,
const ConvDimsType& c_g_n_c_wis_strides,
const ConvSpatialDimsType& conv_filter_strides,
const ConvSpatialDimsType& conv_filter_dilations,
const ConvSpatialDimsType& input_left_pads,
const ConvSpatialDimsType& input_right_pads,
const ConvSpatialDimsType& tildes)
: Hi_{c_g_n_c_wis_lengths[HIdx]},
Wi_{c_g_n_c_wis_lengths[WIdx]},
Ho_{a_g_n_k_wos_lengths[HIdx]},
Wo_{a_g_n_k_wos_lengths[WIdx]},
Y_{b_g_k_c_xs_lengths[YIdx]},
X_{b_g_k_c_xs_lengths[XIdx]},
K_{a_g_n_k_wos_lengths[I2]},
C_{b_g_k_c_xs_lengths[I2]},
HiStride_{c_g_n_c_wis_strides[HIdx]},
WiStride_{c_g_n_c_wis_strides[WIdx]},
HoStride_{a_g_n_k_wos_strides[HIdx]},
WoStride_{a_g_n_k_wos_strides[WIdx]},
CStrideTensorB_{b_g_k_c_xs_strides[I2]},
CStrideTensorC_{c_g_n_c_wis_strides[I2]},
KStrideTensorA_{a_g_n_k_wos_strides[I2]},
KStrideTensorB_{b_g_k_c_xs_strides[I1]},
NStrideTensorA_{a_g_n_k_wos_strides[I1]},
NStrideTensorC_{c_g_n_c_wis_strides[I1]},
ConvStrideH_{conv_filter_strides[HIdx - NonSpatialDimsNum]},
ConvStrideW_{conv_filter_strides[WIdx - NonSpatialDimsNum]},
ConvDilationH_{conv_filter_dilations[HIdx - NonSpatialDimsNum]},
ConvDilationW_{conv_filter_dilations[WIdx - NonSpatialDimsNum]},
InLeftPadH_{input_left_pads[HIdx - NonSpatialDimsNum]},
InLeftPadW_{input_left_pads[WIdx - NonSpatialDimsNum]},
InRightPadH_{input_right_pads[HIdx - NonSpatialDimsNum]},
InRightPadW_{input_right_pads[WIdx - NonSpatialDimsNum]},
IdxYTilde_{tildes[YIdx - NonSpatialDimsNum]},
IdxXTilde_{tildes[XIdx - NonSpatialDimsNum]}
{
static_assert(is_same_v<ConvSpatialDimsType, std::array<IndexType, NDimSpatial>> ||
is_same_v<ConvSpatialDimsType, ck::Array<IndexType, NDimSpatial>>);
static_assert(is_same_v<ConvDimsType, std::array<IndexType, NDimSpatial + I3>> ||
is_same_v<ConvDimsType, ck::Array<IndexType, NDimSpatial + I3>>);
if constexpr(SplitN)
{
return make_naive_tensor_descriptor_packed(make_tuple(N * Ho * Wo, K));
N_ = GetSplitedNSize(
a_g_n_k_wos_lengths, a_g_n_k_wos_strides, c_g_n_c_wis_lengths, c_g_n_c_wis_strides);
}
else
{
return make_naive_tensor_descriptor_packed(make_tuple(N, Ho, Wo, K));
N_ = c_g_n_c_wis_lengths[I1];
}
}
else if constexpr(is_same_v<ALayout, tensor_layout::convolution::GNDHWK>)
{
// assume packed
if constexpr(ConvBwdDataSpecialization ==
ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::
Filter1x1Stride1Pad0)
if constexpr(NDimSpatial == 3)
{
return make_naive_tensor_descriptor_packed(make_tuple(N * Do * Ho * Wo, K));
Di_ = c_g_n_c_wis_lengths[DIdx];
Do_ = a_g_n_k_wos_lengths[DIdx];
Z_ = b_g_k_c_xs_lengths[ZIdx];
DiStride_ = c_g_n_c_wis_strides[DIdx];
DoStride_ = a_g_n_k_wos_strides[DIdx];
ConvStrideD_ = conv_filter_strides[DIdx - NonSpatialDimsNum];
ConvDilationD_ = conv_filter_dilations[DIdx - NonSpatialDimsNum];
InLeftPadD_ = input_left_pads[DIdx - NonSpatialDimsNum];
InRightPadD_ = input_right_pads[DIdx - NonSpatialDimsNum];
IdxZTilde_ = tildes[ZIdx - NonSpatialDimsNum];
GcdStrideDilationD_ = math::gcd(ConvStrideD_, ConvDilationD_);
ZTilde_ = ConvStrideD_ / GcdStrideDilationD_;
DTilde_ = Do_ + math::integer_divide_ceil(ConvDilationD_ * (Z_ - I1), ConvStrideD_);
ZDot_ = math::integer_divide_ceil(Z_, ZTilde_);
}
else
{
return make_naive_tensor_descriptor_packed(make_tuple(N, Do, Ho, Wo, K));
Di_ = Do_ = Z_ = ZTilde_ = ConvStrideD_ = DTilde_ = ZDot_ = 1;
InLeftPadD_ = InRightPadD_ = DiStride_ = DoStride_ = IdxZTilde_ = 0;
}
}
else
{
throw std::runtime_error("wrong! unsupported layout: " + ALayout::name());
}
}
template <typename BLayout>
constexpr auto make_wei_grid_desc(
const index_t K, const index_t Z, const index_t Y, const index_t X, const index_t C)
{
GcdStrideDilationH_ = math::gcd(ConvStrideH_, ConvDilationH_);
GcdStrideDilationW_ = math::gcd(ConvStrideW_, ConvDilationW_);
if constexpr(is_same_v<BLayout, tensor_layout::convolution::GKYXC>)
{
return make_naive_tensor_descriptor_packed(make_tuple(K, Y, X, C));
}
else if constexpr(is_same_v<BLayout, tensor_layout::convolution::GKZYXC>)
{
return make_naive_tensor_descriptor_packed(make_tuple(K, Z, Y, X, C));
}
else
{
throw std::runtime_error("wrong! unsupported layout: " + BLayout::name());
}
}
template <index_t NDimSpatial, typename CLayout>
constexpr auto make_in_grid_desc(const index_t N,
const index_t Di,
const index_t Hi,
const index_t Wi,
const index_t C,
const std::array<index_t, NDimSpatial + 3>& in_g_n_c_wis_strides)
{
YTilde_ = ConvStrideH_ / GcdStrideDilationH_;
XTilde_ = ConvStrideW_ / GcdStrideDilationW_;
if constexpr(is_same_v<CLayout, tensor_layout::convolution::GNHWC> ||
is_same_v<CLayout, tensor_layout::convolution::NHWGC> ||
is_same_v<CLayout, tensor_layout::convolution::G_NHW_C>)
{
return make_naive_tensor_descriptor(make_tuple(N, Hi, Wi, C),
make_tuple(in_g_n_c_wis_strides[1],
in_g_n_c_wis_strides[3],
in_g_n_c_wis_strides[4],
in_g_n_c_wis_strides[2]));
HTilde_ = Ho_ + math::integer_divide_ceil(ConvDilationH_ * (Y_ - I1), ConvStrideH_);
WTilde_ = Wo_ + math::integer_divide_ceil(ConvDilationW_ * (X_ - I1), ConvStrideW_);
YDot_ = math::integer_divide_ceil(Y_, YTilde_);
XDot_ = math::integer_divide_ceil(X_, XTilde_);
}
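The constructor body above also derives the backward-data (transposed convolution) decomposition per spatial dimension: YTilde = stride / gcd(stride, dilation) filter phases, an expanded output window HTilde, and YDot filter taps per phase. A worked one-dimensional example under assumed sizes (stride 2, dilation 1, Y = 3, Ho = 16; numbers are illustrative):

#include <cstdio>
#include <numeric>

int main()
{
    const int ConvStrideH = 2, ConvDilationH = 1, Y = 3, Ho = 16;
    const int gcd    = std::gcd(ConvStrideH, ConvDilationH);        // 1
    const int YTilde = ConvStrideH / gcd;                           // 2 filter phases
    // integer_divide_ceil(a, b) written out as (a + b - 1) / b:
    const int HTilde = Ho + (ConvDilationH * (Y - 1) + ConvStrideH - 1) / ConvStrideH; // 17
    const int YDot   = (Y + YTilde - 1) / YTilde;                   // 2 taps per phase
    std::printf("YTilde=%d HTilde=%d YDot=%d\n", YTilde, HTilde, YDot);
    return 0;
}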
else if constexpr(is_same_v<CLayout, tensor_layout::convolution::GNDHWC> ||
is_same_v<CLayout, tensor_layout::convolution::NDHWGC>)
#if 0 // Splitting the tensor is not supported for now
__host__ bool AreDescriptorsSmallerThan2GB() const
{
return make_naive_tensor_descriptor(make_tuple(N, Di, Hi, Wi, C),
make_tuple(in_g_n_c_wis_strides[1],
in_g_n_c_wis_strides[3],
in_g_n_c_wis_strides[4],
in_g_n_c_wis_strides[5],
in_g_n_c_wis_strides[2]));
constexpr long_index_t TwoGB = (long_index_t{1} << 31);
const long_index_t in_desc_space_size =
I1 + (N_ - I1) * NStrideTensorC_ + (Di_ - I1) * DiStride_ + (Hi_ - I1) * HiStride_ +
(Wi_ - I1) * WiStride_ + (C_ - I1) * CStrideTensorC_;
const long_index_t out_desc_space_size =
I1 + (N_ - I1) * NStrideTensorA_ + (Do_ - I1) * DoStride_ + (Ho_ - I1) * HoStride_ +
(Wo_ - I1) * WoStride_ + (K_ - I1) * KStrideTensorA_;
bool is_a_descriptor_smaller_than_2GB = (out_desc_space_size * sizeof(ADataType)) <= TwoGB;
bool is_c_descriptor_smaller_than_2GB = (in_desc_space_size * sizeof(CDataType)) <= TwoGB;
return is_a_descriptor_smaller_than_2GB && is_c_descriptor_smaller_than_2GB;
}
else
__host__ auto SplitConvProblem(const ADataType* a_grid_ptr_base,
CDataType* c_grid_ptr_base) const
{
throw std::runtime_error("wrong! unsupported layout: " + CLayout::name());
}
}
// Create copies
auto conv_to_gemm_transformer_left = *this;
auto conv_to_gemm_transformer_right = *this;
IndexType a_right_offset = 0;
IndexType c_right_offset = 0;
// Calculate effective filter size
const IndexType z_eff = (Z_ - 1) * ConvDilationD_ + 1;
const IndexType y_eff = (Y_ - 1) * ConvDilationH_ + 1;
const IndexType x_eff = (X_ - 1) * ConvDilationW_ + 1;
// Calculate start position in input for right tensor
const IndexType di_right_transformer_start_idx = (Do_ / 2) * ConvStrideD_;
const IndexType hi_right_transformer_start_idx = (Ho_ / 2) * ConvStrideH_;
const IndexType wi_right_transformer_start_idx = (Wo_ / 2) * ConvStrideW_;
// Calculate last position in input for left tensor
const IndexType di_left_transformer_end_idx = (Do_ / 2 - 1) * ConvStrideD_ + z_eff;
const IndexType hi_left_transformer_end_idx = (Ho_ / 2 - 1) * ConvStrideH_ + y_eff;
const IndexType wi_left_transformer_end_idx = (Wo_ / 2 - 1) * ConvStrideW_ + x_eff;
// Allow the split only if the whole left padding lands in the left
// tensor and the whole right padding in the right tensor
const bool is_possible_to_split_d = Do_ != 1 &&
di_right_transformer_start_idx > InLeftPadD_ &&
di_left_transformer_end_idx <= (InLeftPadD_ + Di_);
const bool is_possible_to_split_h = Ho_ != 1 &&
hi_right_transformer_start_idx > InLeftPadH_ &&
hi_left_transformer_end_idx <= (InLeftPadH_ + Hi_);
const bool is_possible_to_split_w = Wo_ != 1 &&
wi_right_transformer_start_idx > InLeftPadW_ &&
wi_left_transformer_end_idx <= (InLeftPadW_ + Wi_);
if(is_possible_to_split_d)
{
// Apply new sizes
// Split the output in half
conv_to_gemm_transformer_left.Do_ = Do_ / 2;
conv_to_gemm_transformer_right.Do_ = Do_ - Do_ / 2;
// Assign left padding to left convolution
conv_to_gemm_transformer_left.InLeftPadD_ = InLeftPadD_;
conv_to_gemm_transformer_right.InLeftPadD_ = 0;
// Assign right padding to right convolution
conv_to_gemm_transformer_left.InRightPadD_ = 0;
conv_to_gemm_transformer_right.InRightPadD_ = InRightPadD_;
// Calculate new input size
conv_to_gemm_transformer_left.Di_ = di_left_transformer_end_idx - InLeftPadD_;
conv_to_gemm_transformer_right.Di_ =
math::min(Di_ - (di_right_transformer_start_idx - InLeftPadD_),
(conv_to_gemm_transformer_right.Do_ - 1) * ConvStrideD_ + z_eff);
// Calculate offsets
a_right_offset = (Do_ / 2) * DoStride_;
        c_right_offset = ((Do_ / 2) * ConvStrideD_ - InLeftPadD_) * DiStride_;
    }
    else if(is_possible_to_split_h)
    {
        conv_to_gemm_transformer_left.Ho_  = Ho_ / 2;
        conv_to_gemm_transformer_right.Ho_ = Ho_ - Ho_ / 2;
        conv_to_gemm_transformer_left.InLeftPadH_  = InLeftPadH_;
        conv_to_gemm_transformer_right.InLeftPadH_ = 0;
        conv_to_gemm_transformer_left.InRightPadH_  = 0;
        conv_to_gemm_transformer_right.InRightPadH_ = InRightPadH_;
        conv_to_gemm_transformer_left.Hi_ = hi_left_transformer_end_idx - InLeftPadH_;
        conv_to_gemm_transformer_right.Hi_ =
            math::min(Hi_ - (hi_right_transformer_start_idx - InLeftPadH_),
                      (conv_to_gemm_transformer_right.Ho_ - 1) * ConvStrideH_ + y_eff);
        a_right_offset = (Ho_ / 2) * HoStride_;
        c_right_offset = ((Ho_ / 2) * ConvStrideH_ - InLeftPadH_) * HiStride_;
    }
    else if(is_possible_to_split_w)
    {
        conv_to_gemm_transformer_left.Wo_  = Wo_ / 2;
        conv_to_gemm_transformer_right.Wo_ = Wo_ - Wo_ / 2;
        conv_to_gemm_transformer_left.InLeftPadW_  = InLeftPadW_;
        conv_to_gemm_transformer_right.InLeftPadW_ = 0;
        conv_to_gemm_transformer_left.InRightPadW_  = 0;
        conv_to_gemm_transformer_right.InRightPadW_ = InRightPadW_;
        conv_to_gemm_transformer_left.Wi_ = wi_left_transformer_end_idx - InLeftPadW_;
        conv_to_gemm_transformer_right.Wi_ =
            math::min(Wi_ - (wi_right_transformer_start_idx - InLeftPadW_),
                      (conv_to_gemm_transformer_right.Wo_ - 1) * ConvStrideW_ + x_eff);
        a_right_offset = (Wo_ / 2) * WoStride_;
        c_right_offset = ((Wo_ / 2) * ConvStrideW_ - InLeftPadW_) * WiStride_;
    }
    // Return left transformer, right transformer, right offset into A and right offset into C
    return ck::make_tuple(conv_to_gemm_transformer_left,
                          conv_to_gemm_transformer_right,
                          a_grid_ptr_base + a_right_offset,
                          c_grid_ptr_base + c_right_offset);
}

template <
    index_t NDimSpatial,
    ck::tensor_operation::device::ConvolutionBackwardDataSpecialization ConvBwdDataSpecialization,
    index_t AK1,
    index_t BK1,
    index_t GemmMPerBlock,
    index_t GemmNPerBlock,
    index_t GemmKPerBlock,
    bool DoPadGemmM,
    bool DoPadGemmN>
struct TransformConvBwdDataToGemm_v1
{
    static constexpr auto I0 = Number<0>{};
    static constexpr auto I1 = Number<1>{};
    static constexpr auto NonSpatialDimsNum = Number<3>{};
    static constexpr auto DIdx = Number<NonSpatialDimsNum>{};
    static constexpr auto HIdx =
        NDimSpatial == 2 ? Number<NonSpatialDimsNum>{} : Number<NonSpatialDimsNum + 1>{};
    static constexpr auto WIdx =
        NDimSpatial == 2 ? Number<NonSpatialDimsNum + 1>{} : Number<NonSpatialDimsNum + 2>{};
    static constexpr auto ZIdx = Number<NonSpatialDimsNum>{};
    static constexpr auto YIdx =
        NDimSpatial == 2 ? Number<NonSpatialDimsNum>{} : Number<NonSpatialDimsNum + 1>{};
    static constexpr auto XIdx =
        NDimSpatial == 2 ? Number<NonSpatialDimsNum + 1>{} : Number<NonSpatialDimsNum + 2>{};

    template <typename ALayout,
              typename std::enable_if<(NDimSpatial == 2 || NDimSpatial == 3) &&
                                          (is_same_v<ALayout, tensor_layout::convolution::GNHWK> ||
                                           is_same_v<ALayout, tensor_layout::convolution::GNDHWK> ||
                                           is_same_v<ALayout, tensor_layout::convolution::NHWGK> ||
                                           is_same_v<ALayout, tensor_layout::convolution::NDHWGK>),
                                      bool>::type = false>
    static auto MakeADescriptor_AK0_M_AK1(
        const std::array<index_t, NDimSpatial + 3>& out_g_n_k_wos_lengths,
        const std::array<index_t, NDimSpatial + 3>& out_g_n_k_wos_strides,
        const std::array<index_t, NDimSpatial + 3>& wei_g_k_c_xs_lengths,
        const std::array<index_t, NDimSpatial + 3>& /* wei_g_k_c_xs_strides */,
        const std::array<index_t, NDimSpatial + 3>& in_g_n_c_wis_lengths,
        const std::array<index_t, NDimSpatial + 3>& /* in_g_n_c_wis_strides */,
        const std::array<index_t, NDimSpatial>& conv_filter_strides,
        const std::array<index_t, NDimSpatial>& conv_filter_dilations,
        const std::array<index_t, NDimSpatial>& input_left_pads,
        const std::array<index_t, NDimSpatial>& /* input_right_pads */,
        const std::array<index_t, NDimSpatial>& tildes)
    {
        index_t i_ztilde = tildes[ZIdx - NonSpatialDimsNum];
        index_t i_ytilde = tildes[YIdx - NonSpatialDimsNum];
        index_t i_xtilde = tildes[XIdx - NonSpatialDimsNum];

__host__ auto SplitConvProblem(const ADataType* a_grid_ptr_base,
                               CDataType* c_grid_ptr_base) const
{
// Create copies
auto conv_to_gemm_transformer_left = *this;
auto conv_to_gemm_transformer_right = *this;
IndexType a_right_offset = 0;
IndexType c_right_offset = 0;
// Calculate start position in input for right tensor
const IndexType do_right_transformer_start_idx = math::integer_divide_ceil((Di_ / 2) + InLeftPadD_ - ((Z_ - 1) * ConvDilationD_), ConvStrideD_);
const IndexType ho_right_transformer_start_idx = math::integer_divide_ceil((Hi_ / 2) + InLeftPadH_ - ((Y_ - 1) * ConvDilationH_), ConvStrideH_);
const IndexType wo_right_transformer_start_idx = math::integer_divide_ceil((Wi_ / 2) + InLeftPadW_ - ((X_ - 1) * ConvDilationW_), ConvStrideW_);
// Calculate last position in input for left tensor
const IndexType do_left_transformer_end_idx = math::integer_divide_ceil((Di_ / 2 - 1) + InLeftPadD_, ConvStrideD_);
const IndexType ho_left_transformer_end_idx = math::integer_divide_ceil((Hi_ / 2 - 1) + InLeftPadH_, ConvStrideH_);
const IndexType wo_left_transformer_end_idx = math::integer_divide_ceil((Wi_ / 2 - 1) + InLeftPadW_, ConvStrideW_);
    if(Di_ != 1)
    {
        // Split Di_ in half
        conv_to_gemm_transformer_left.Di_  = Di_ / 2;
        conv_to_gemm_transformer_right.Di_ = Di_ - Di_ / 2;
        // Assign left padding to left convolution
        conv_to_gemm_transformer_left.InLeftPadD_  = InLeftPadD_;
        conv_to_gemm_transformer_right.InLeftPadD_ = 0;
        // Assign right padding to right convolution
        conv_to_gemm_transformer_left.InRightPadD_  = 0;
        conv_to_gemm_transformer_right.InRightPadD_ = InRightPadD_;
        // Calculate new Do_ for each half
        conv_to_gemm_transformer_left.Do_  = do_left_transformer_end_idx;
        conv_to_gemm_transformer_right.Do_ = Do_ - do_right_transformer_start_idx;
        // Calculate offsets
        a_right_offset = do_right_transformer_start_idx * DoStride_;
        c_right_offset = (Di_ / 2) * DiStride_;
    }
    else if(Hi_ != 1)
    {
        // Split Hi_ in half
        conv_to_gemm_transformer_left.Hi_  = Hi_ / 2;
        conv_to_gemm_transformer_right.Hi_ = Hi_ - Hi_ / 2;
        // Assign left padding to left convolution
        conv_to_gemm_transformer_left.InLeftPadH_  = InLeftPadH_;
        conv_to_gemm_transformer_right.InLeftPadH_ = 0;
        // Assign right padding to right convolution
        conv_to_gemm_transformer_left.InRightPadH_  = 0;
        conv_to_gemm_transformer_right.InRightPadH_ = InRightPadH_;
        // Calculate new Ho_ for each half
        conv_to_gemm_transformer_left.Ho_  = ho_left_transformer_end_idx;
        conv_to_gemm_transformer_right.Ho_ = Ho_ - ho_right_transformer_start_idx;
        // Calculate offsets
        a_right_offset = ho_right_transformer_start_idx * HoStride_;
        c_right_offset = (Hi_ / 2) * HiStride_;
    }
    else if(Wi_ != 1)
    {
        // Split Wi_ in half
        conv_to_gemm_transformer_left.Wi_  = Wi_ / 2;
        conv_to_gemm_transformer_right.Wi_ = Wi_ - Wi_ / 2;
        // Assign left padding to left convolution
        conv_to_gemm_transformer_left.InLeftPadW_  = InLeftPadW_;
        conv_to_gemm_transformer_right.InLeftPadW_ = 0;
        // Assign right padding to right convolution
        conv_to_gemm_transformer_left.InRightPadW_  = 0;
        conv_to_gemm_transformer_right.InRightPadW_ = InRightPadW_;
        // Calculate new Wo_ for each half
        conv_to_gemm_transformer_left.Wo_  = wo_left_transformer_end_idx;
        conv_to_gemm_transformer_right.Wo_ = Wo_ - wo_right_transformer_start_idx;
        // Calculate offsets
        a_right_offset = wo_right_transformer_start_idx * WoStride_;
        c_right_offset = (Wi_ / 2) * WiStride_;
    }
    // Return left transformer, right transformer, right offset into A and right offset into C
return ck::make_tuple(conv_to_gemm_transformer_left,
conv_to_gemm_transformer_right,
a_grid_ptr_base + a_right_offset,
c_grid_ptr_base + c_right_offset);
}
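    // Worked example (illustrative numbers, not from the source): with Hi_ = 16,
    // InLeftPadH_ = 1, Y_ = 3, ConvDilationH_ = 1, ConvStrideH_ = 2:
    //   ho_right_transformer_start_idx = ceil((16/2 + 1 - (3 - 1) * 1) / 2) = ceil(7 / 2) = 4
    //   ho_left_transformer_end_idx    = ceil((16/2 - 1 + 1) / 2)           = ceil(8 / 2) = 4
    // so the left half computes output rows [0, 4) from input rows [0, 8), the right half
    // computes output rows [4, Ho_) starting at input row 8, and a_right_offset /
    // c_right_offset advance the A and C base pointers to those starting rows.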
#endif
const index_t N = in_g_n_c_wis_lengths[1];
const index_t K = wei_g_k_c_xs_lengths[1];
__host__ __device__ auto MakeOutGridDesc() const
{
if constexpr(is_same_v<ALayout, tensor_layout::convolution::NHWGK>)
{
if constexpr(ConvBwdDataSpecialization ==
ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::
Filter1x1Stride1Pad0)
{
const index_t Di = NDimSpatial == 3 ? in_g_n_c_wis_lengths[DIdx] : 1;
const index_t Hi = in_g_n_c_wis_lengths[HIdx];
const index_t Wi = in_g_n_c_wis_lengths[WIdx];
return make_naive_tensor_descriptor(make_tuple(N_ * Ho_ * Wo_, K_),
make_tuple(WoStride_, KStrideTensorA_));
}
else
{
return make_naive_tensor_descriptor(
make_tuple(N_, Ho_, Wo_, K_),
make_tuple(NStrideTensorA_, HoStride_, WoStride_, KStrideTensorA_));
}
}
else if constexpr(is_same_v<ALayout, tensor_layout::convolution::NDHWGK>)
{
if constexpr(ConvBwdDataSpecialization ==
ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::
Filter1x1Stride1Pad0)
{
const index_t Do = NDimSpatial == 3 ? out_g_n_k_wos_lengths[DIdx] : 1;
const index_t Ho = out_g_n_k_wos_lengths[HIdx];
const index_t Wo = out_g_n_k_wos_lengths[WIdx];
return make_naive_tensor_descriptor(make_tuple(N_ * Do_ * Ho_ * Wo_, K_),
make_tuple(WoStride_, KStrideTensorA_));
}
else
{
return make_naive_tensor_descriptor(
make_tuple(N_, Do_, Ho_, Wo_, K_),
make_tuple(NStrideTensorA_, DoStride_, HoStride_, WoStride_, KStrideTensorA_));
}
}
else if constexpr(is_same_v<ALayout, tensor_layout::convolution::GNHWK>)
{
// assume packed
if constexpr(ConvBwdDataSpecialization ==
ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::
Filter1x1Stride1Pad0)
{
return make_naive_tensor_descriptor_packed(make_tuple(N_ * Ho_ * Wo_, K_));
}
else
{
return make_naive_tensor_descriptor_packed(make_tuple(N_, Ho_, Wo_, K_));
}
}
else if constexpr(is_same_v<ALayout, tensor_layout::convolution::GNDHWK>)
{
// assume packed
if constexpr(ConvBwdDataSpecialization ==
ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::
Filter1x1Stride1Pad0)
{
return make_naive_tensor_descriptor_packed(make_tuple(N_ * Do_ * Ho_ * Wo_, K_));
}
else
{
return make_naive_tensor_descriptor_packed(make_tuple(N_, Do_, Ho_, Wo_, K_));
}
}
else
{
throw std::runtime_error("wrong! unsupported layout: " + ALayout::name());
}
}
const index_t Z = NDimSpatial == 3 ? wei_g_k_c_xs_lengths[ZIdx] : 1;
const index_t Y = wei_g_k_c_xs_lengths[YIdx];
const index_t X = wei_g_k_c_xs_lengths[XIdx];
        const index_t InLeftPadD = input_left_pads[DIdx - NonSpatialDimsNum];
        const index_t InLeftPadH = input_left_pads[HIdx - NonSpatialDimsNum];
        const index_t InLeftPadW = input_left_pads[WIdx - NonSpatialDimsNum];
    __host__ __device__ auto MakeWeiGridDesc() const
    {
if constexpr(is_same_v<BLayout, tensor_layout::convolution::GKYXC>)
{
return make_naive_tensor_descriptor_packed(make_tuple(K_, Y_, X_, C_));
}
else if constexpr(is_same_v<BLayout, tensor_layout::convolution::GKZYXC>)
{
return make_naive_tensor_descriptor_packed(make_tuple(K_, Z_, Y_, X_, C_));
}
else
{
throw std::runtime_error("wrong! unsupported layout: " + BLayout::name());
}
}
const index_t ConvStrideD = conv_filter_strides[DIdx - NonSpatialDimsNum];
const index_t ConvStrideH = conv_filter_strides[HIdx - NonSpatialDimsNum];
const index_t ConvStrideW = conv_filter_strides[WIdx - NonSpatialDimsNum];
        const index_t ConvDilationD = conv_filter_dilations[DIdx - NonSpatialDimsNum];
        const index_t ConvDilationH = conv_filter_dilations[HIdx - NonSpatialDimsNum];
        const index_t ConvDilationW = conv_filter_dilations[WIdx - NonSpatialDimsNum];
    __host__ __device__ auto MakeInGridDesc() const
    {
if constexpr(is_same_v<CLayout, tensor_layout::convolution::GNHWC> ||
is_same_v<CLayout, tensor_layout::convolution::NHWGC> ||
is_same_v<CLayout, tensor_layout::convolution::G_NHW_C>)
{
return make_naive_tensor_descriptor(
make_tuple(N_, Hi_, Wi_, C_),
make_tuple(NStrideTensorC_, HiStride_, WiStride_, CStrideTensorC_));
}
else if constexpr(is_same_v<CLayout, tensor_layout::convolution::GNDHWC> ||
is_same_v<CLayout, tensor_layout::convolution::NDHWGC>)
{
return make_naive_tensor_descriptor(
make_tuple(N_, Di_, Hi_, Wi_, C_),
make_tuple(NStrideTensorC_, DiStride_, HiStride_, WiStride_, CStrideTensorC_));
}
else
{
throw std::runtime_error("wrong! unsupported layout: " + CLayout::name());
}
}
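    // Note: the three helpers above differ only in the lengths/strides they hand to
    // make_naive_tensor_descriptor; the resulting descriptor addresses element
    // (i0, i1, ...) at linear offset sum_k i_k * stride_k. E.g. for a fully packed
    // (N_, Hi_, Wi_, C_) input, the strides are the suffix products
    // (Hi_ * Wi_ * C_, Wi_ * C_, C_, 1).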
template <
typename ALayout_ = ALayout,
typename std::enable_if<(NDimSpatial == 2 || NDimSpatial == 3) &&
(is_same_v<ALayout_, tensor_layout::convolution::GNHWK> ||
is_same_v<ALayout_, tensor_layout::convolution::GNDHWK> ||
is_same_v<ALayout_, tensor_layout::convolution::NHWGK> ||
is_same_v<ALayout_, tensor_layout::convolution::NDHWGK>),
bool>::type = false>
__host__ __device__ auto MakeADescriptor_AK0_M_AK1() const
{
// n_do_ho_wo_k for 3d or n_ho_wo_k for 2d
const auto out_grid_desc =
make_out_grid_desc<NDimSpatial, ALayout, ConvBwdDataSpecialization>(
N, Do, Ho, Wo, K, out_g_n_k_wos_strides);
const auto out_grid_desc = MakeOutGridDesc();
if constexpr(ConvBwdDataSpecialization ==
ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::
Filter1x1Stride1Pad0)
{
const index_t AK0 = math::integer_divide_ceil(K, AK1);
const index_t AK0 = math::integer_divide_ceil(K_, AK1);
// A: output tensor
const auto out_gemmak0_gemmmraw_gemmak1_grid_desc = transform_tensor_descriptor(
out_grid_desc,
make_tuple(make_pass_through_transform(N * Do * Ho * Wo),
make_tuple(make_pass_through_transform(N_ * Do_ * Ho_ * Wo_),
make_unmerge_transform(make_tuple(AK0, AK1))),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<1>{}, Sequence<0, 2>{}));
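            // Worked example (illustrative numbers only): with K_ = 96 and AK1 = 8,
            // AK0 = integer_divide_ceil(96, 8) = 12, so the K axis is unmerged into
            // (AK0, AK1) = (12, 8) and the descriptor becomes (AK0, M, AK1) with
            // M = N_ * Do_ * Ho_ * Wo_.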
......@@ -266,82 +635,63 @@ struct TransformConvBwdDataToGemm_v1
}
else
{
const auto GcdStrideDilationD = math::gcd(ConvStrideD, ConvDilationD);
const auto GcdStrideDilationH = math::gcd(ConvStrideH, ConvDilationH);
const auto GcdStrideDilationW = math::gcd(ConvStrideW, ConvDilationW);
const auto ZTilde = ConvStrideD / GcdStrideDilationD;
const auto YTilde = ConvStrideH / GcdStrideDilationH;
const auto XTilde = ConvStrideW / GcdStrideDilationW;
const auto ZDot = math::integer_divide_ceil(Z, ZTilde);
const auto YDot = math::integer_divide_ceil(Y, YTilde);
const auto XDot = math::integer_divide_ceil(X, XTilde);
const auto DTilde =
Do + math::integer_divide_ceil(ConvDilationD * (Z - I1), ConvStrideD);
const auto HTilde =
Ho + math::integer_divide_ceil(ConvDilationH * (Y - I1), ConvStrideH);
const auto WTilde =
Wo + math::integer_divide_ceil(ConvDilationW * (X - I1), ConvStrideW);
// only work on HTilde and WTilde that contribute to non-padding area of input tensor
const auto IDTildeSliceBegin = math::integer_divide_floor(
math::max(I0, InLeftPadD - ConvDilationD * (ZTilde - I1)), ConvStrideD);
math::max(I0, InLeftPadD_ - ConvDilationD_ * (ZTilde_ - I1)), ConvStrideD_);
const auto IHTildeSliceBegin = math::integer_divide_floor(
math::max(I0, InLeftPadH - ConvDilationH * (YTilde - I1)), ConvStrideH);
math::max(I0, InLeftPadH_ - ConvDilationH_ * (YTilde_ - I1)), ConvStrideH_);
const auto IWTildeSliceBegin = math::integer_divide_floor(
math::max(I0, InLeftPadW - ConvDilationW * (XTilde - I1)), ConvStrideW);
math::max(I0, InLeftPadW_ - ConvDilationW_ * (XTilde_ - I1)), ConvStrideW_);
const auto IDTildeSliceEnd = math::min(
DTilde, math::integer_divide_ceil(InLeftPadD + Di - I1, ConvStrideD) + I1);
DTilde_, math::integer_divide_ceil(InLeftPadD_ + Di_ - I1, ConvStrideD_) + I1);
const auto IHTildeSliceEnd = math::min(
HTilde, math::integer_divide_ceil(InLeftPadH + Hi - I1, ConvStrideH) + I1);
HTilde_, math::integer_divide_ceil(InLeftPadH_ + Hi_ - I1, ConvStrideH_) + I1);
const auto IWTildeSliceEnd = math::min(
WTilde, math::integer_divide_ceil(InLeftPadW + Wi - I1, ConvStrideW) + I1);
WTilde_, math::integer_divide_ceil(InLeftPadW_ + Wi_ - I1, ConvStrideW_) + I1);
const auto DTildeSlice = IDTildeSliceEnd - IDTildeSliceBegin;
const auto HTildeSlice = IHTildeSliceEnd - IHTildeSliceBegin;
const auto WTildeSlice = IWTildeSliceEnd - IWTildeSliceBegin;
// GemmK is different for each GEMM
const auto ZDotSlice = math::integer_divide_ceil(Z - i_ztilde, ZTilde);
const auto YDotSlice = math::integer_divide_ceil(Y - i_ytilde, YTilde);
const auto XDotSlice = math::integer_divide_ceil(X - i_xtilde, XTilde);
const auto ZDotSlice = math::integer_divide_ceil(Z_ - IdxZTilde_, ZTilde_);
const auto YDotSlice = math::integer_divide_ceil(Y_ - IdxYTilde_, YTilde_);
const auto XDotSlice = math::integer_divide_ceil(X_ - IdxXTilde_, XTilde_);
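            // Worked example (illustrative numbers only): ConvStrideH_ = 4, ConvDilationH_ = 2
            // gives GcdStrideDilationH_ = 2 and YTilde_ = 4 / 2 = 2, i.e. the filter rows fall
            // into 2 residue classes; with Y_ = 3, YDot_ = ceil(3 / 2) = 2 rows per class, and
            // for the GEMM with IdxYTilde_ = 1, YDotSlice = ceil((3 - 1) / 2) = 1.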
if constexpr(NDimSpatial == 2)
{
// A: output tensor
const auto out_n_hop_wop_k_grid_desc = transform_tensor_descriptor(
out_grid_desc,
make_tuple(make_pass_through_transform(N),
make_pad_transform(Ho, I0, I0),
make_pad_transform(Wo, I0, I0),
make_pass_through_transform(K)),
make_tuple(make_pass_through_transform(N_),
make_pad_transform(Ho_, I0, I0),
make_pad_transform(Wo_, I0, I0),
make_pass_through_transform(K_)),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
const auto out_n_ydot_htilde_xdot_wtilde_k_grid_desc = transform_tensor_descriptor(
out_n_hop_wop_k_grid_desc,
make_tuple(
make_pass_through_transform(N),
make_embed_transform(make_tuple(YDot, HTilde),
make_tuple(-ConvDilationH / GcdStrideDilationH, I1)),
make_embed_transform(make_tuple(XDot, WTilde),
make_tuple(-ConvDilationW / GcdStrideDilationW, I1)),
make_pass_through_transform(K)),
make_pass_through_transform(N_),
make_embed_transform(make_tuple(YDot_, HTilde_),
make_tuple(-ConvDilationH_ / GcdStrideDilationH_, I1)),
make_embed_transform(make_tuple(XDot_, WTilde_),
make_tuple(-ConvDilationW_ / GcdStrideDilationW_, I1)),
make_pass_through_transform(K_)),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{}));
const auto out_n_ydotslice_htildeslice_xdotslice_wtildeslice_k_grid_desc =
transform_tensor_descriptor(
out_n_ydot_htilde_xdot_wtilde_k_grid_desc,
make_tuple(make_pass_through_transform(N),
make_slice_transform(YDot, I0, YDotSlice),
make_slice_transform(HTilde, IHTildeSliceBegin, HTildeSlice),
make_slice_transform(XDot, I0, XDotSlice),
make_slice_transform(WTilde, IWTildeSliceBegin, WTildeSlice),
make_pass_through_transform(K)),
make_tuple(make_pass_through_transform(N_),
make_slice_transform(YDot_, I0, YDotSlice),
make_slice_transform(HTilde_, IHTildeSliceBegin, HTildeSlice),
make_slice_transform(XDot_, I0, XDotSlice),
make_slice_transform(WTilde_, IWTildeSliceBegin, WTildeSlice),
make_pass_through_transform(K_)),
make_tuple(Sequence<0>{},
Sequence<1>{},
Sequence<2>{},
......@@ -357,8 +707,8 @@ struct TransformConvBwdDataToGemm_v1
const auto out_gemmk_gemmmraw_grid_desc = transform_tensor_descriptor(
out_n_ydotslice_htildeslice_xdotslice_wtildeslice_k_grid_desc,
make_tuple(make_merge_transform(make_tuple(YDotSlice, XDotSlice, K)),
make_merge_transform(make_tuple(N, HTildeSlice, WTildeSlice))),
make_tuple(make_merge_transform(make_tuple(YDotSlice, XDotSlice, K_)),
make_merge_transform(make_tuple(N_, HTildeSlice, WTildeSlice))),
make_tuple(Sequence<1, 3, 5>{}, Sequence<0, 2, 4>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
......@@ -385,11 +735,11 @@ struct TransformConvBwdDataToGemm_v1
// A: output tensor
const auto out_n_hop_wop_k_grid_desc = transform_tensor_descriptor(
out_grid_desc,
make_tuple(make_pass_through_transform(N),
make_pad_transform(Do, I0, I0),
make_pad_transform(Ho, I0, I0),
make_pad_transform(Wo, I0, I0),
make_pass_through_transform(K)),
make_tuple(make_pass_through_transform(N_),
make_pad_transform(Do_, I0, I0),
make_pad_transform(Ho_, I0, I0),
make_pad_transform(Wo_, I0, I0),
make_pass_through_transform(K_)),
make_tuple(
Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}),
make_tuple(
......@@ -398,17 +748,17 @@ struct TransformConvBwdDataToGemm_v1
const auto out_n_zdot_dtilde_ydot_htilde_xdot_wtilde_k_grid_desc =
transform_tensor_descriptor(
out_n_hop_wop_k_grid_desc,
make_tuple(make_pass_through_transform(N),
make_tuple(make_pass_through_transform(N_),
make_embed_transform(
make_tuple(ZDot, DTilde),
make_tuple(-ConvDilationD / GcdStrideDilationD, I1)),
make_tuple(ZDot_, DTilde_),
make_tuple(-ConvDilationD_ / GcdStrideDilationD_, I1)),
make_embed_transform(
make_tuple(YDot, HTilde),
make_tuple(-ConvDilationH / GcdStrideDilationH, I1)),
make_tuple(YDot_, HTilde_),
make_tuple(-ConvDilationH_ / GcdStrideDilationH_, I1)),
make_embed_transform(
make_tuple(XDot, WTilde),
make_tuple(-ConvDilationW / GcdStrideDilationW, I1)),
make_pass_through_transform(K)),
make_tuple(XDot_, WTilde_),
make_tuple(-ConvDilationW_ / GcdStrideDilationW_, I1)),
make_pass_through_transform(K_)),
make_tuple(Sequence<0>{},
Sequence<1>{},
Sequence<2>{},
......@@ -424,14 +774,15 @@ struct TransformConvBwdDataToGemm_v1
out_n_zdotslice_dtildeslice_ydotslice_htildeslice_xdotslice_wtildeslice_k_grid_desc =
transform_tensor_descriptor(
out_n_zdot_dtilde_ydot_htilde_xdot_wtilde_k_grid_desc,
make_tuple(make_pass_through_transform(N),
make_slice_transform(ZDot, I0, ZDotSlice),
make_slice_transform(DTilde, IDTildeSliceBegin, DTildeSlice),
make_slice_transform(YDot, I0, YDotSlice),
make_slice_transform(HTilde, IHTildeSliceBegin, HTildeSlice),
make_slice_transform(XDot, I0, XDotSlice),
make_slice_transform(WTilde, IWTildeSliceBegin, WTildeSlice),
make_pass_through_transform(K)),
make_tuple(
make_pass_through_transform(N_),
make_slice_transform(ZDot_, I0, ZDotSlice),
make_slice_transform(DTilde_, IDTildeSliceBegin, DTildeSlice),
make_slice_transform(YDot_, I0, YDotSlice),
make_slice_transform(HTilde_, IHTildeSliceBegin, HTildeSlice),
make_slice_transform(XDot_, I0, XDotSlice),
make_slice_transform(WTilde_, IWTildeSliceBegin, WTildeSlice),
make_pass_through_transform(K_)),
make_tuple(Sequence<0>{},
Sequence<1>{},
Sequence<2>{},
......@@ -452,8 +803,9 @@ struct TransformConvBwdDataToGemm_v1
const auto out_gemmk_gemmmraw_grid_desc = transform_tensor_descriptor(
out_n_zdotslice_dtildeslice_ydotslice_htildeslice_xdotslice_wtildeslice_k_grid_desc,
make_tuple(
make_merge_transform(make_tuple(ZDotSlice, YDotSlice, XDotSlice, K)),
make_merge_transform(make_tuple(N, DTildeSlice, HTildeSlice, WTildeSlice))),
make_merge_transform(make_tuple(ZDotSlice, YDotSlice, XDotSlice, K_)),
make_merge_transform(
make_tuple(N_, DTildeSlice, HTildeSlice, WTildeSlice))),
make_tuple(Sequence<1, 3, 5, 7>{}, Sequence<0, 2, 4, 6>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
......@@ -482,66 +834,31 @@ struct TransformConvBwdDataToGemm_v1
}
}
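    // Shape summary for the non-1x1 paths above (illustrative): with a 3x3 filter, stride 2
    // and dilation 1, YTilde_ = XTilde_ = 2, so the backward-data problem is decomposed into
    // one GEMM per (IdxYTilde_, IdxXTilde_) residue pair; for the (0, 0) pair
    // GemmK = YDotSlice * XDotSlice * K_ = 2 * 2 * K_ and
    // GemmM = N_ * HTildeSlice * WTildeSlice (times DTildeSlice in 3-D), before padding up
    // to multiples of GemmKPerBlock / GemmMPerBlock.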
template <typename BLayout,
template <typename BLayout_ = BLayout,
typename std::enable_if<(NDimSpatial == 2 || NDimSpatial == 3) &&
(is_same_v<BLayout, tensor_layout::convolution::GKYXC> ||
is_same_v<BLayout, tensor_layout::convolution::GKZYXC>),
(is_same_v<BLayout_, tensor_layout::convolution::GKYXC> ||
is_same_v<BLayout_, tensor_layout::convolution::GKZYXC>),
bool>::type = false>
static auto MakeBDescriptor_BK0_N_BK1(
const std::array<index_t, NDimSpatial + 3>& out_g_n_k_wos_lengths,
const std::array<index_t, NDimSpatial + 3>& /* out_g_n_k_wos_strides */,
const std::array<index_t, NDimSpatial + 3>& wei_g_k_c_xs_lengths,
const std::array<index_t, NDimSpatial + 3>& /* wei_g_k_c_xs_strides */,
const std::array<index_t, NDimSpatial + 3>& in_g_n_c_wis_lengths,
const std::array<index_t, NDimSpatial + 3>& /* in_g_n_c_wis_strides */,
const std::array<index_t, NDimSpatial>& conv_filter_strides,
const std::array<index_t, NDimSpatial>& conv_filter_dilations,
const std::array<index_t, NDimSpatial>& /* input_left_pads */,
const std::array<index_t, NDimSpatial>& /* input_right_pads */,
const std::array<index_t, NDimSpatial>& tildes)
__host__ __device__ auto MakeBDescriptor_BK0_N_BK1() const
{
index_t i_ztilde = tildes[ZIdx - NonSpatialDimsNum];
index_t i_ytilde = tildes[YIdx - NonSpatialDimsNum];
index_t i_xtilde = tildes[XIdx - NonSpatialDimsNum];
const index_t N = in_g_n_c_wis_lengths[1];
const index_t K = wei_g_k_c_xs_lengths[1];
const index_t C = wei_g_k_c_xs_lengths[2];
const index_t Do = NDimSpatial == 3 ? out_g_n_k_wos_lengths[DIdx] : 1;
const index_t Ho = out_g_n_k_wos_lengths[HIdx];
const index_t Wo = out_g_n_k_wos_lengths[WIdx];
const index_t Z = NDimSpatial == 3 ? wei_g_k_c_xs_lengths[ZIdx] : 1;
const index_t Y = wei_g_k_c_xs_lengths[YIdx];
const index_t X = wei_g_k_c_xs_lengths[XIdx];
const index_t ConvStrideD = conv_filter_strides[DIdx - NonSpatialDimsNum];
const index_t ConvStrideH = conv_filter_strides[HIdx - NonSpatialDimsNum];
const index_t ConvStrideW = conv_filter_strides[WIdx - NonSpatialDimsNum];
const index_t ConvDilationD = conv_filter_dilations[DIdx - NonSpatialDimsNum];
const index_t ConvDilationH = conv_filter_dilations[HIdx - NonSpatialDimsNum];
const index_t ConvDilationW = conv_filter_dilations[WIdx - NonSpatialDimsNum];
// assume packed
// k_y_x_c for 2d or k_z_y_x_c for 3d
const auto wei_grid_desc = make_wei_grid_desc<BLayout>(K, Z, Y, X, C);
const auto wei_grid_desc = MakeWeiGridDesc();
if constexpr(ConvBwdDataSpecialization ==
ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::
Filter1x1Stride1Pad0)
{
const index_t BK0 = math::integer_divide_ceil(K, BK1);
const index_t BK0 = math::integer_divide_ceil(K_, BK1);
// B: weight tensor
const auto wei_gemmbk0_gemmnraw_gemmbk1_grid_desc =
transform_tensor_descriptor(make_naive_tensor_descriptor_packed(make_tuple(K, C)),
transform_tensor_descriptor(make_naive_tensor_descriptor_packed(make_tuple(K_, C_)),
make_tuple(make_unmerge_transform(make_tuple(BK0, BK1)),
make_pass_through_transform(C)),
make_pass_through_transform(C_)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
make_naive_tensor_descriptor(make_tuple(N * Do * Ho * Wo, C), make_tuple(I0, I1));
make_naive_tensor_descriptor(make_tuple(N_ * Do_ * Ho_ * Wo_, C_), make_tuple(I0, I1));
const auto wei_gemmbk0_gemmn_gemmbk1_grid_desc =
ck::tensor_operation::device::PadTensorDescriptor(
......@@ -553,22 +870,10 @@ struct TransformConvBwdDataToGemm_v1
}
else
{
const auto GcdStrideDilationD = math::gcd(ConvStrideD, ConvDilationD);
const auto GcdStrideDilationH = math::gcd(ConvStrideH, ConvDilationH);
const auto GcdStrideDilationW = math::gcd(ConvStrideW, ConvDilationW);
const auto ZTilde = ConvStrideD / GcdStrideDilationD;
const auto YTilde = ConvStrideH / GcdStrideDilationH;
const auto XTilde = ConvStrideW / GcdStrideDilationW;
const auto ZDot = math::integer_divide_ceil(Z, ZTilde);
const auto YDot = math::integer_divide_ceil(Y, YTilde);
const auto XDot = math::integer_divide_ceil(X, XTilde);
// GemmK is different for each GEMM
const auto ZDotSlice = math::integer_divide_ceil(Z - i_ztilde, ZTilde);
const auto YDotSlice = math::integer_divide_ceil(Y - i_ytilde, YTilde);
const auto XDotSlice = math::integer_divide_ceil(X - i_xtilde, XTilde);
const auto ZDotSlice = math::integer_divide_ceil(Z_ - IdxZTilde_, ZTilde_);
const auto YDotSlice = math::integer_divide_ceil(Y_ - IdxYTilde_, YTilde_);
const auto XDotSlice = math::integer_divide_ceil(X_ - IdxXTilde_, XTilde_);
// B weight tensor
if constexpr(NDimSpatial == 2)
......@@ -576,23 +881,23 @@ struct TransformConvBwdDataToGemm_v1
const auto wei_k_ydot_ytilde_xdot_xtilde_c_grid_desc = transform_tensor_descriptor(
wei_grid_desc,
make_tuple(
make_pass_through_transform(K),
make_embed_transform(make_tuple(YDot, YTilde),
make_tuple(ConvStrideH / GcdStrideDilationH, I1)),
make_embed_transform(make_tuple(XDot, XTilde),
make_tuple(ConvStrideW / GcdStrideDilationW, I1)),
make_pass_through_transform(C)),
make_pass_through_transform(K_),
make_embed_transform(make_tuple(YDot_, YTilde_),
make_tuple(ConvStrideH_ / GcdStrideDilationH_, I1)),
make_embed_transform(make_tuple(XDot_, XTilde_),
make_tuple(ConvStrideW_ / GcdStrideDilationW_, I1)),
make_pass_through_transform(C_)),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{}));
const auto wei_k_ydotslice_xdotslice_c_grid_desc = transform_tensor_descriptor(
wei_k_ydot_ytilde_xdot_xtilde_c_grid_desc,
make_tuple(make_pass_through_transform(K),
make_slice_transform(YDot, I0, YDotSlice),
make_slice_transform(XDot, I0, XDotSlice),
make_freeze_transform(i_ytilde),
make_freeze_transform(i_xtilde),
make_pass_through_transform(C)),
make_tuple(make_pass_through_transform(K_),
make_slice_transform(YDot_, I0, YDotSlice),
make_slice_transform(XDot_, I0, XDotSlice),
make_freeze_transform(IdxYTilde_),
make_freeze_transform(IdxXTilde_),
make_pass_through_transform(C_)),
make_tuple(Sequence<0>{},
Sequence<1>{},
Sequence<3>{},
......@@ -608,8 +913,8 @@ struct TransformConvBwdDataToGemm_v1
const auto wei_gemmk_gemmnraw_grid_desc = transform_tensor_descriptor(
wei_k_ydotslice_xdotslice_c_grid_desc,
make_tuple(make_merge_transform(make_tuple(YDotSlice, XDotSlice, K)),
make_pass_through_transform(C)),
make_tuple(make_merge_transform(make_tuple(YDotSlice, XDotSlice, K_)),
make_pass_through_transform(C_)),
make_tuple(Sequence<1, 2, 0>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
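            // The slice/freeze pair above selects, for this GEMM, only the filter taps whose
            // row/column indices fall in the (IdxYTilde_, IdxXTilde_) residue class:
            // make_freeze_transform removes the YTilde_/XTilde_ axes at those fixed indices,
            // while make_slice_transform keeps the first YDotSlice/XDotSlice taps along
            // YDot_/XDot_.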
......@@ -636,15 +941,17 @@ struct TransformConvBwdDataToGemm_v1
const auto wei_k_zdot_ztilde_ydot_ytilde_xdot_xtilde_c_grid_desc =
transform_tensor_descriptor(
wei_grid_desc,
make_tuple(
make_pass_through_transform(K),
make_embed_transform(make_tuple(ZDot, ZTilde),
make_tuple(ConvStrideD / GcdStrideDilationD, I1)),
make_embed_transform(make_tuple(YDot, YTilde),
make_tuple(ConvStrideH / GcdStrideDilationH, I1)),
make_embed_transform(make_tuple(XDot, XTilde),
make_tuple(ConvStrideW / GcdStrideDilationW, I1)),
make_pass_through_transform(C)),
make_tuple(make_pass_through_transform(K_),
make_embed_transform(
make_tuple(ZDot_, ZTilde_),
make_tuple(ConvStrideD_ / GcdStrideDilationD_, I1)),
make_embed_transform(
make_tuple(YDot_, YTilde_),
make_tuple(ConvStrideH_ / GcdStrideDilationH_, I1)),
make_embed_transform(
make_tuple(XDot_, XTilde_),
make_tuple(ConvStrideW_ / GcdStrideDilationW_, I1)),
make_pass_through_transform(C_)),
make_tuple(Sequence<0>{},
Sequence<1>{},
Sequence<2>{},
......@@ -659,14 +966,14 @@ struct TransformConvBwdDataToGemm_v1
const auto wei_gemmk_zdotslice_ydotslice_xdotslice_c_grid_desc =
transform_tensor_descriptor(
wei_k_zdot_ztilde_ydot_ytilde_xdot_xtilde_c_grid_desc,
make_tuple(make_pass_through_transform(K),
make_slice_transform(ZDot, I0, ZDotSlice),
make_slice_transform(YDot, I0, YDotSlice),
make_slice_transform(XDot, I0, XDotSlice),
make_freeze_transform(i_ztilde),
make_freeze_transform(i_ytilde),
make_freeze_transform(i_xtilde),
make_pass_through_transform(C)),
make_tuple(make_pass_through_transform(K_),
make_slice_transform(ZDot_, I0, ZDotSlice),
make_slice_transform(YDot_, I0, YDotSlice),
make_slice_transform(XDot_, I0, XDotSlice),
make_freeze_transform(IdxZTilde_),
make_freeze_transform(IdxYTilde_),
make_freeze_transform(IdxXTilde_),
make_pass_through_transform(C_)),
make_tuple(Sequence<0>{},
Sequence<1>{},
Sequence<3>{},
......@@ -686,8 +993,9 @@ struct TransformConvBwdDataToGemm_v1
const auto wei_gemmk_gemmnraw_grid_desc = transform_tensor_descriptor(
wei_gemmk_zdotslice_ydotslice_xdotslice_c_grid_desc,
make_tuple(make_merge_transform(make_tuple(ZDotSlice, YDotSlice, XDotSlice, K)),
make_pass_through_transform(C)),
make_tuple(
make_merge_transform(make_tuple(ZDotSlice, YDotSlice, XDotSlice, K_)),
make_pass_through_transform(C_)),
make_tuple(Sequence<1, 2, 3, 0>{}, Sequence<4>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
......@@ -716,66 +1024,20 @@ struct TransformConvBwdDataToGemm_v1
}
}
template <typename CLayout,
typename std::enable_if<(NDimSpatial == 2 || NDimSpatial == 3) &&
(is_same_v<CLayout, tensor_layout::convolution::GNHWC> ||
is_same_v<CLayout, tensor_layout::convolution::GNDHWC> ||
is_same_v<CLayout, tensor_layout::convolution::NHWGC> ||
is_same_v<CLayout, tensor_layout::convolution::NDHWGC> ||
is_same_v<CLayout, tensor_layout::convolution::G_NHW_C>),
bool>::type = false>
static auto
MakeCDescriptor_M_N(const std::array<index_t, NDimSpatial + 3>& out_g_n_k_wos_lengths,
const std::array<index_t, NDimSpatial + 3>& /* out_g_n_k_wos_strides */,
const std::array<index_t, NDimSpatial + 3>& wei_g_k_c_xs_lengths,
const std::array<index_t, NDimSpatial + 3>& /* wei_g_k_c_xs_strides */,
const std::array<index_t, NDimSpatial + 3>& in_g_n_c_wis_lengths,
const std::array<index_t, NDimSpatial + 3>& in_g_n_c_wis_strides,
const std::array<index_t, NDimSpatial>& conv_filter_strides,
const std::array<index_t, NDimSpatial>& conv_filter_dilations,
const std::array<index_t, NDimSpatial>& input_left_pads,
const std::array<index_t, NDimSpatial>& input_right_pads,
const std::array<index_t, NDimSpatial>& tildes)
template <
typename CLayout_ = CLayout,
typename std::enable_if<(NDimSpatial == 2 || NDimSpatial == 3) &&
(is_same_v<CLayout_, tensor_layout::convolution::GNHWC> ||
is_same_v<CLayout_, tensor_layout::convolution::GNDHWC> ||
is_same_v<CLayout_, tensor_layout::convolution::NHWGC> ||
is_same_v<CLayout_, tensor_layout::convolution::NDHWGC> ||
is_same_v<CLayout_, tensor_layout::convolution::G_NHW_C>),
bool>::type = false>
__host__ __device__ auto MakeCDescriptor_M_N() const
{
index_t i_ztilde = tildes[ZIdx - NonSpatialDimsNum];
index_t i_ytilde = tildes[YIdx - NonSpatialDimsNum];
index_t i_xtilde = tildes[XIdx - NonSpatialDimsNum];
const index_t N = in_g_n_c_wis_lengths[1];
const index_t C = wei_g_k_c_xs_lengths[2];
const index_t Di = NDimSpatial == 3 ? in_g_n_c_wis_lengths[DIdx] : 1;
const index_t Hi = in_g_n_c_wis_lengths[HIdx];
const index_t Wi = in_g_n_c_wis_lengths[WIdx];
const index_t Do = NDimSpatial == 3 ? out_g_n_k_wos_lengths[DIdx] : 1;
const index_t Ho = out_g_n_k_wos_lengths[HIdx];
const index_t Wo = out_g_n_k_wos_lengths[WIdx];
const index_t Z = NDimSpatial == 3 ? wei_g_k_c_xs_lengths[ZIdx] : 1;
const index_t Y = wei_g_k_c_xs_lengths[YIdx];
const index_t X = wei_g_k_c_xs_lengths[XIdx];
const index_t InLeftPadD = input_left_pads[DIdx - NonSpatialDimsNum];
const index_t InLeftPadH = input_left_pads[HIdx - NonSpatialDimsNum];
const index_t InLeftPadW = input_left_pads[WIdx - NonSpatialDimsNum];
const index_t InRightPadD = input_right_pads[DIdx - NonSpatialDimsNum];
const index_t InRightPadH = input_right_pads[HIdx - NonSpatialDimsNum];
const index_t InRightPadW = input_right_pads[WIdx - NonSpatialDimsNum];
const index_t ConvStrideD = conv_filter_strides[DIdx - NonSpatialDimsNum];
const index_t ConvStrideH = conv_filter_strides[HIdx - NonSpatialDimsNum];
const index_t ConvStrideW = conv_filter_strides[WIdx - NonSpatialDimsNum];
const index_t ConvDilationD = conv_filter_dilations[DIdx - NonSpatialDimsNum];
const index_t ConvDilationH = conv_filter_dilations[HIdx - NonSpatialDimsNum];
const index_t ConvDilationW = conv_filter_dilations[WIdx - NonSpatialDimsNum];
        // assume strided
        // n_hi_wi_c for 2d, n_di_hi_wi_c for 3d
const auto in_grid_desc =
make_in_grid_desc<NDimSpatial, CLayout>(N, Di, Hi, Wi, C, in_g_n_c_wis_strides);
const auto in_grid_desc = MakeInGridDesc();
if constexpr(ConvBwdDataSpecialization ==
ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::
......@@ -787,10 +1049,10 @@ struct TransformConvBwdDataToGemm_v1
const auto in_n_y_ho_x_wo_c_grid_desc = transform_tensor_descriptor(
in_grid_desc,
make_tuple(
make_pass_through_transform(N),
make_embed_transform(make_tuple(I1, Ho), make_tuple(I1, ConvStrideH)),
make_embed_transform(make_tuple(I1, Wo), make_tuple(I1, ConvStrideW)),
make_pass_through_transform(C)),
make_pass_through_transform(N_),
make_embed_transform(make_tuple(I1, Ho_), make_tuple(I1, ConvStrideH_)),
make_embed_transform(make_tuple(I1, Wo_), make_tuple(I1, ConvStrideW_)),
make_pass_through_transform(C_)),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{}));
......@@ -798,8 +1060,8 @@ struct TransformConvBwdDataToGemm_v1
in_n_y_ho_x_wo_c_grid_desc,
make_tuple(make_freeze_transform(I0),
make_freeze_transform(I0),
make_merge_transform(make_tuple(N, Ho, Wo)),
make_pass_through_transform(C)),
make_merge_transform(make_tuple(N_, Ho_, Wo_)),
make_pass_through_transform(C_)),
make_tuple(Sequence<1>{}, Sequence<3>{}, Sequence<0, 2, 4>{}, Sequence<5>{}),
make_tuple(Sequence<>{}, Sequence<>{}, Sequence<0>{}, Sequence<1>{}));
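            // In this 1x1-filter specialization the embed transforms introduce unit-length
            // y/x axes, which the freeze transforms then pin at 0, so the input collapses
            // directly to an (N_ * Ho_ * Wo_, C_) GEMM matrix with no filter-tap enumeration.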
......@@ -818,11 +1080,11 @@ struct TransformConvBwdDataToGemm_v1
const auto in_n_x_do_y_ho_x_wo_c_grid_desc = transform_tensor_descriptor(
in_grid_desc,
make_tuple(
make_pass_through_transform(N),
make_embed_transform(make_tuple(I1, Do), make_tuple(I1, ConvStrideD)),
make_embed_transform(make_tuple(I1, Ho), make_tuple(I1, ConvStrideH)),
make_embed_transform(make_tuple(I1, Wo), make_tuple(I1, ConvStrideW)),
make_pass_through_transform(C)),
make_pass_through_transform(N_),
make_embed_transform(make_tuple(I1, Do_), make_tuple(I1, ConvStrideD_)),
make_embed_transform(make_tuple(I1, Ho_), make_tuple(I1, ConvStrideH_)),
make_embed_transform(make_tuple(I1, Wo_), make_tuple(I1, ConvStrideW_)),
make_pass_through_transform(C_)),
make_tuple(
Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}),
make_tuple(Sequence<0>{},
......@@ -836,8 +1098,8 @@ struct TransformConvBwdDataToGemm_v1
make_tuple(make_freeze_transform(I0),
make_freeze_transform(I0),
make_freeze_transform(I0),
make_merge_transform(make_tuple(N, Do, Ho, Wo)),
make_pass_through_transform(C)),
make_merge_transform(make_tuple(N_, Do_, Ho_, Wo_)),
make_pass_through_transform(C_)),
make_tuple(Sequence<1>{},
Sequence<3>{},
Sequence<5>{},
......@@ -861,36 +1123,21 @@ struct TransformConvBwdDataToGemm_v1
}
else
{
const auto GcdStrideDilationD = math::gcd(ConvStrideD, ConvDilationD);
const auto GcdStrideDilationH = math::gcd(ConvStrideH, ConvDilationH);
const auto GcdStrideDilationW = math::gcd(ConvStrideW, ConvDilationW);
const auto ZTilde = ConvStrideD / GcdStrideDilationD;
const auto YTilde = ConvStrideH / GcdStrideDilationH;
const auto XTilde = ConvStrideW / GcdStrideDilationW;
const auto DTilde =
Do + math::integer_divide_ceil(ConvDilationD * (Z - I1), ConvStrideD);
const auto HTilde =
Ho + math::integer_divide_ceil(ConvDilationH * (Y - I1), ConvStrideH);
const auto WTilde =
Wo + math::integer_divide_ceil(ConvDilationW * (X - I1), ConvStrideW);
// only work on DTilde, HTilde and WTilde that contribute to
// non-padding area of input tensor
const auto IDTildeSliceBegin = math::integer_divide_floor(
math::max(I0, InLeftPadD - ConvDilationD * (ZTilde - I1)), ConvStrideD);
math::max(I0, InLeftPadD_ - ConvDilationD_ * (ZTilde_ - I1)), ConvStrideD_);
const auto IHTildeSliceBegin = math::integer_divide_floor(
math::max(I0, InLeftPadH - ConvDilationH * (YTilde - I1)), ConvStrideH);
math::max(I0, InLeftPadH_ - ConvDilationH_ * (YTilde_ - I1)), ConvStrideH_);
const auto IWTildeSliceBegin = math::integer_divide_floor(
math::max(I0, InLeftPadW - ConvDilationW * (XTilde - I1)), ConvStrideW);
math::max(I0, InLeftPadW_ - ConvDilationW_ * (XTilde_ - I1)), ConvStrideW_);
const auto IDTildeSliceEnd = math::min(
DTilde, math::integer_divide_ceil(InLeftPadD + Di - I1, ConvStrideD) + I1);
DTilde_, math::integer_divide_ceil(InLeftPadD_ + Di_ - I1, ConvStrideD_) + I1);
const auto IHTildeSliceEnd = math::min(
HTilde, math::integer_divide_ceil(InLeftPadH + Hi - I1, ConvStrideH) + I1);
HTilde_, math::integer_divide_ceil(InLeftPadH_ + Hi_ - I1, ConvStrideH_) + I1);
const auto IWTildeSliceEnd = math::min(
WTilde, math::integer_divide_ceil(InLeftPadW + Wi - I1, ConvStrideW) + I1);
WTilde_, math::integer_divide_ceil(InLeftPadW_ + Wi_ - I1, ConvStrideW_) + I1);
const auto DTildeSlice = IDTildeSliceEnd - IDTildeSliceBegin;
const auto HTildeSlice = IHTildeSliceEnd - IHTildeSliceBegin;
......@@ -901,34 +1148,34 @@ struct TransformConvBwdDataToGemm_v1
{
const auto in_n_hip_wip_c_grid_desc = transform_tensor_descriptor(
in_grid_desc,
make_tuple(make_pass_through_transform(N),
make_pad_transform(Hi, InLeftPadH, InRightPadH),
make_pad_transform(Wi, InLeftPadW, InRightPadW),
make_pass_through_transform(C)),
make_tuple(make_pass_through_transform(N_),
make_pad_transform(Hi_, InLeftPadH_, InRightPadH_),
make_pad_transform(Wi_, InLeftPadW_, InRightPadW_),
make_pass_through_transform(C_)),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
const auto in_n_ytilde_htilde_xtilde_wtilde_c_grid_desc =
transform_tensor_descriptor(
in_n_hip_wip_c_grid_desc,
make_tuple(make_pass_through_transform(N),
make_embed_transform(make_tuple(YTilde, HTilde),
make_tuple(ConvDilationH, ConvStrideH)),
make_embed_transform(make_tuple(XTilde, WTilde),
make_tuple(ConvDilationW, ConvStrideW)),
make_pass_through_transform(C)),
make_tuple(make_pass_through_transform(N_),
make_embed_transform(make_tuple(YTilde_, HTilde_),
make_tuple(ConvDilationH_, ConvStrideH_)),
make_embed_transform(make_tuple(XTilde_, WTilde_),
make_tuple(ConvDilationW_, ConvStrideW_)),
make_pass_through_transform(C_)),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(
Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{}));
const auto in_n_htildeslice_wtildeslice_c_grid_desc = transform_tensor_descriptor(
in_n_ytilde_htilde_xtilde_wtilde_c_grid_desc,
make_tuple(make_pass_through_transform(N),
make_freeze_transform(i_ytilde),
make_slice_transform(HTilde, IHTildeSliceBegin, HTildeSlice),
make_freeze_transform(i_xtilde),
make_slice_transform(WTilde, IWTildeSliceBegin, WTildeSlice),
make_pass_through_transform(C)),
make_tuple(make_pass_through_transform(N_),
make_freeze_transform(IdxYTilde_),
make_slice_transform(HTilde_, IHTildeSliceBegin, HTildeSlice),
make_freeze_transform(IdxXTilde_),
make_slice_transform(WTilde_, IWTildeSliceBegin, WTildeSlice),
make_pass_through_transform(C_)),
make_tuple(Sequence<0>{},
Sequence<1>{},
Sequence<2>{},
......@@ -944,8 +1191,8 @@ struct TransformConvBwdDataToGemm_v1
const auto in_gemmmraw_gemmnraw_grid_desc = transform_tensor_descriptor(
in_n_htildeslice_wtildeslice_c_grid_desc,
make_tuple(make_merge_transform(make_tuple(N, HTildeSlice, WTildeSlice)),
make_pass_through_transform(C)),
make_tuple(make_merge_transform(make_tuple(N_, HTildeSlice, WTildeSlice)),
make_pass_through_transform(C_)),
make_tuple(Sequence<0, 1, 2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
......@@ -961,11 +1208,11 @@ struct TransformConvBwdDataToGemm_v1
{
const auto in_n_dip_hip_wip_c_grid_desc = transform_tensor_descriptor(
in_grid_desc,
make_tuple(make_pass_through_transform(N),
make_pad_transform(Di, InLeftPadD, InRightPadD),
make_pad_transform(Hi, InLeftPadH, InRightPadH),
make_pad_transform(Wi, InLeftPadW, InRightPadW),
make_pass_through_transform(C)),
make_tuple(make_pass_through_transform(N_),
make_pad_transform(Di_, InLeftPadD_, InRightPadD_),
make_pad_transform(Hi_, InLeftPadH_, InRightPadH_),
make_pad_transform(Wi_, InLeftPadW_, InRightPadW_),
make_pass_through_transform(C_)),
make_tuple(
Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}),
make_tuple(
......@@ -974,14 +1221,14 @@ struct TransformConvBwdDataToGemm_v1
const auto in_n_ztilde_dtilde_ytilde_htilde_xtilde_wtilde_c_grid_desc =
transform_tensor_descriptor(
in_n_dip_hip_wip_c_grid_desc,
make_tuple(make_pass_through_transform(N),
make_embed_transform(make_tuple(ZTilde, DTilde),
make_tuple(ConvDilationD, ConvStrideD)),
make_embed_transform(make_tuple(YTilde, HTilde),
make_tuple(ConvDilationH, ConvStrideH)),
make_embed_transform(make_tuple(XTilde, WTilde),
make_tuple(ConvDilationW, ConvStrideW)),
make_pass_through_transform(C)),
make_tuple(make_pass_through_transform(N_),
make_embed_transform(make_tuple(ZTilde_, DTilde_),
make_tuple(ConvDilationD_, ConvStrideD_)),
make_embed_transform(make_tuple(YTilde_, HTilde_),
make_tuple(ConvDilationH_, ConvStrideH_)),
make_embed_transform(make_tuple(XTilde_, WTilde_),
make_tuple(ConvDilationW_, ConvStrideW_)),
make_pass_through_transform(C_)),
make_tuple(Sequence<0>{},
Sequence<1>{},
Sequence<2>{},
......@@ -996,14 +1243,14 @@ struct TransformConvBwdDataToGemm_v1
const auto in_n_dtildeslice_htildeslice_wtildeslice_c_grid_desc =
transform_tensor_descriptor(
in_n_ztilde_dtilde_ytilde_htilde_xtilde_wtilde_c_grid_desc,
make_tuple(make_pass_through_transform(N),
make_freeze_transform(i_ztilde),
make_slice_transform(DTilde, IDTildeSliceBegin, DTildeSlice),
make_freeze_transform(i_ytilde),
make_slice_transform(HTilde, IHTildeSliceBegin, HTildeSlice),
make_freeze_transform(i_xtilde),
make_slice_transform(WTilde, IWTildeSliceBegin, WTildeSlice),
make_pass_through_transform(C)),
make_tuple(make_pass_through_transform(N_),
make_freeze_transform(IdxZTilde_),
make_slice_transform(DTilde_, IDTildeSliceBegin, DTildeSlice),
make_freeze_transform(IdxYTilde_),
make_slice_transform(HTilde_, IHTildeSliceBegin, HTildeSlice),
make_freeze_transform(IdxXTilde_),
make_slice_transform(WTilde_, IWTildeSliceBegin, WTildeSlice),
make_pass_through_transform(C_)),
make_tuple(Sequence<0>{},
Sequence<1>{},
Sequence<2>{},
......@@ -1024,8 +1271,8 @@ struct TransformConvBwdDataToGemm_v1
const auto in_gemmmraw_gemmnraw_grid_desc = transform_tensor_descriptor(
in_n_dtildeslice_htildeslice_wtildeslice_c_grid_desc,
make_tuple(
make_merge_transform(make_tuple(N, DTildeSlice, HTildeSlice, WTildeSlice)),
make_pass_through_transform(C)),
make_merge_transform(make_tuple(N_, DTildeSlice, HTildeSlice, WTildeSlice)),
make_pass_through_transform(C_)),
make_tuple(Sequence<0, 1, 2, 3>{}, Sequence<4>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
......@@ -1044,84 +1291,41 @@ struct TransformConvBwdDataToGemm_v1
}
// for input bias
template <typename CLayout,
template <typename CLayout_ = CLayout,
typename std::enable_if<NDimSpatial == 2 &&
(is_same_v<CLayout, tensor_layout::convolution::GC> ||
is_same_v<CLayout, tensor_layout::convolution::G_C>),
(is_same_v<CLayout_, tensor_layout::convolution::GC> ||
is_same_v<CLayout_, tensor_layout::convolution::G_C>),
bool>::type = false>
static auto
MakeCDescriptor_M_N(const std::array<index_t, NDimSpatial + 3>& out_g_n_k_wos_lengths,
const std::array<index_t, NDimSpatial + 3>& /* out_g_n_k_wos_strides */,
const std::array<index_t, NDimSpatial + 3>& wei_g_k_c_xs_lengths,
const std::array<index_t, NDimSpatial + 3>& /* wei_g_k_c_xs_strides */,
const std::array<index_t, NDimSpatial + 3>& in_g_n_c_wis_lengths,
const std::array<index_t, NDimSpatial + 3>& /* in_g_n_c_wis_strides */,
const std::array<index_t, NDimSpatial>& conv_filter_strides,
const std::array<index_t, NDimSpatial>& conv_filter_dilations,
const std::array<index_t, NDimSpatial>& input_left_pads,
const std::array<index_t, NDimSpatial>& /* input_right_pads */,
const std::array<index_t, NDimSpatial>& /* tildes */)
__host__ __device__ auto MakeCDescriptor_M_N() const
{
const index_t N = in_g_n_c_wis_lengths[1];
const index_t C = wei_g_k_c_xs_lengths[2];
const index_t Hi = in_g_n_c_wis_lengths[3];
const index_t Wi = in_g_n_c_wis_lengths[4];
const index_t Ho = out_g_n_k_wos_lengths[3];
const index_t Wo = out_g_n_k_wos_lengths[4];
const index_t Y = wei_g_k_c_xs_lengths[3];
const index_t X = wei_g_k_c_xs_lengths[4];
const index_t InLeftPadH = input_left_pads[0];
const index_t InLeftPadW = input_left_pads[1];
const index_t ConvStrideH = conv_filter_strides[0];
const index_t ConvStrideW = conv_filter_strides[1];
const index_t ConvDilationH = conv_filter_dilations[0];
const index_t ConvDilationW = conv_filter_dilations[1];
if constexpr(ConvBwdDataSpecialization ==
ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::
Filter1x1Stride1Pad0)
{
const auto in_gemmm_gemmn_grid_desc =
make_naive_tensor_descriptor(make_tuple(N * Ho * Wo, C), make_tuple(I0, I1));
make_naive_tensor_descriptor(make_tuple(N_ * Ho_ * Wo_, C_), make_tuple(I0, I1));
return in_gemmm_gemmn_grid_desc;
}
else
{
const auto GcdStrideDilationH = math::gcd(ConvStrideH, ConvDilationH);
const auto GcdStrideDilationW = math::gcd(ConvStrideW, ConvDilationW);
const auto YTilde = ConvStrideH / GcdStrideDilationH;
const auto XTilde = ConvStrideW / GcdStrideDilationW;
const auto HTilde =
Ho + math::integer_divide_ceil(ConvDilationH * (Y - I1), ConvStrideH);
const auto WTilde =
Wo + math::integer_divide_ceil(ConvDilationW * (X - I1), ConvStrideW);
// only work on HTilde and WTilde that contribute to non-padding area of input tensor
const auto IHTildeSliceBegin = math::integer_divide_floor(
math::max(I0, InLeftPadH - ConvDilationH * (YTilde - I1)), ConvStrideH);
math::max(I0, InLeftPadH_ - ConvDilationH_ * (YTilde_ - I1)), ConvStrideH_);
const auto IWTildeSliceBegin = math::integer_divide_floor(
math::max(I0, InLeftPadW - ConvDilationW * (XTilde - I1)), ConvStrideW);
math::max(I0, InLeftPadW_ - ConvDilationW_ * (XTilde_ - I1)), ConvStrideW_);
const auto IHTildeSliceEnd = math::min(
HTilde, math::integer_divide_ceil(InLeftPadH + Hi - I1, ConvStrideH) + I1);
HTilde_, math::integer_divide_ceil(InLeftPadH_ + Hi_ - I1, ConvStrideH_) + I1);
const auto IWTildeSliceEnd = math::min(
WTilde, math::integer_divide_ceil(InLeftPadW + Wi - I1, ConvStrideW) + I1);
WTilde_, math::integer_divide_ceil(InLeftPadW_ + Wi_ - I1, ConvStrideW_) + I1);
const auto HTildeSlice = IHTildeSliceEnd - IHTildeSliceBegin;
const auto WTildeSlice = IWTildeSliceEnd - IWTildeSliceBegin;
// bias tensor
const auto in_gemmmraw_gemmnraw_grid_desc = make_naive_tensor_descriptor(
make_tuple(N * HTildeSlice * WTildeSlice, C), make_tuple(I0, I1));
make_tuple(N_ * HTildeSlice * WTildeSlice, C_), make_tuple(I0, I1));
const auto in_gemmm_gemmn_grid_desc = ck::tensor_operation::device::PadTensorDescriptor(
in_gemmmraw_gemmnraw_grid_desc,
......@@ -1131,6 +1335,25 @@ struct TransformConvBwdDataToGemm_v1
return in_gemmm_gemmn_grid_desc;
}
}
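    // Note: both bias descriptors above are built with strides (I0, I1), i.e. stride 0 along
    // GemmM, so a single length-C_ bias row is broadcast to every
    // (n, htildeslice, wtildeslice) position of the GEMM output.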
IndexType N_;
IndexType Di_, Hi_, Wi_;
IndexType Do_, Ho_, Wo_;
IndexType Z_, Y_, X_;
IndexType K_, C_;
IndexType DiStride_, HiStride_, WiStride_;
IndexType DoStride_, HoStride_, WoStride_;
IndexType CStrideTensorB_, CStrideTensorC_, KStrideTensorA_, KStrideTensorB_;
IndexType NStrideTensorA_, NStrideTensorC_;
IndexType ConvStrideD_, ConvStrideH_, ConvStrideW_;
IndexType ConvDilationD_, ConvDilationH_, ConvDilationW_;
IndexType InLeftPadD_, InLeftPadH_, InLeftPadW_;
IndexType InRightPadD_, InRightPadH_, InRightPadW_;
IndexType IdxZTilde_, IdxYTilde_, IdxXTilde_;
IndexType GcdStrideDilationD_, GcdStrideDilationH_, GcdStrideDilationW_;
IndexType ZTilde_, YTilde_, XTilde_;
IndexType DTilde_, HTilde_, WTilde_;
IndexType ZDot_, YDot_, XDot_;
};
} // namespace tensor_operation
......
......@@ -611,7 +611,7 @@ inline __device__ int8_t neg<int8_t>(int8_t x)
template <>
inline __device__ half_t neg<half_t>(half_t x)
{
return __hneg(x);
return __hneg(static_cast<__half>(x));
};
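// Note: the added static_cast is presumably needed because ck's half_t is a distinct
// 16-bit float type that does not convert implicitly to the __half argument type that
// __hneg expects; the explicit conversion makes the call unambiguous.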
template <typename T>
......
......@@ -45,5 +45,8 @@ our implementation of different device operators.
**[ops/epilogue]**
Epilogue part of our kernels. We may extend this epilogue so that users can build their own customized epilogues.
**[ref]**
Reference implementations for CPU or GPU. Headers in this folder are intended to be included individually, on demand.
## examples
Currently all ck_tile-related examples live under the [/example/ck_tile](/example/ck_tile/) folder. Please check each example's subfolder.
......@@ -54,6 +54,7 @@
#include "ck_tile/core/tensor/tile_window_linear.hpp"
#include "ck_tile/core/tensor/tile_window_utils.hpp"
#include "ck_tile/core/tensor/update_tile.hpp"
#include "ck_tile/core/utility/amd_address_space.hpp"
#include "ck_tile/core/utility/bit_cast.hpp"
#include "ck_tile/core/utility/functional.hpp"
#include "ck_tile/core/utility/functional_with_tuple.hpp"
......
......@@ -5,6 +5,7 @@
#include "ck_tile/ops/flatmm/block/flatmm_32x512x128_1x4x1_16x16x32.hpp"
#include "ck_tile/ops/flatmm/block/flatmm_sn_32x128x512_1x4x1_16x16x32.hpp"
#include "ck_tile/ops/flatmm/block/flatmm_sn_32x128x512_1x4x1_16x16x32_itl.hpp"
#include "ck_tile/ops/flatmm/block/flatmm_uk_config.hpp"
#include "ck_tile/ops/common/generic_2d_block_shape.hpp"
#include "ck_tile/ops/common/tensor_layout.hpp"
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/ops/gemm/warp/warp_gemm.hpp"
#include "ck_tile/ops/flatmm/block/flatmm_uk_config.hpp"
#include "ck_tile/ops/flatmm/block/flatmm_sn_32x128x512_1x4x1_16x16x32.hpp"
namespace ck_tile {
// "S"tream update output along "N"
// A in smem, B load from global
// requires 4 waves, occupancy = 1
struct FlatmmSn_32x128x512_1x4x1_16x16x32_BF16_itl : public FlatmmSn_32x128x512_1x4x1_16x16x32_Base
{
using BDataType = bf16_t;
using ODataType = bf16_t;
    // TODO: must be paired with tile_window_linear!
    // TODO: init_raw() must be called before calling this function!
    // template <typename AWindow, typename BWindow, typename OWindow, typename ScaleTensor>
template <typename BRes,
typename BCoords,
typename ORes,
typename OCoords,
typename OFlags,
typename ScaleTensor>
CK_TILE_DEVICE auto
operator()(const BRes& res_b,
const BCoords& cached_coords_b,
const ORes& res_o,
const OCoords& cached_coords_o,
const OFlags& o_flags, // this should be in sgpr
CK_TILE_LDS_ADDR void* smem,
index_t n, // loop along n dim
const ScaleTensor& scale_,
index_t tile_offset_b, // stride b is fixed to blockKr * blockW, but still can adjust
index_t tile_offset_o)
{
static_assert(BCoords::size() == 8); // 8
static_assert(OCoords::size() == 8);
const index_t tile_stride_b_bytes = tile_offset_b * sizeof(BDataType);
const index_t tile_stride_o_bytes = tile_offset_o * sizeof(ODataType);
static_assert(ScaleTensor::size() == 2);
float s0 = scale_[number<0>{}];
float s1 = scale_[number<1>{}];
// index_t loop_cnt = n / Block_N;
register float v_c0 asm("v64");
register float v_c1 asm("v65");
register float v_c2 asm("v66");
register float v_c3 asm("v67");
register float v_c4 asm("v68");
register float v_c5 asm("v69");
register float v_c6 asm("v70");
register float v_c7 asm("v71");
register float v_c8 asm("v72");
register float v_c9 asm("v73");
register float v_c10 asm("v74");
register float v_c11 asm("v75");
register float v_c12 asm("v76");
register float v_c13 asm("v77");
register float v_c14 asm("v78");
register float v_c15 asm("v79");
register float v_c16 asm("v80");
register float v_c17 asm("v81");
register float v_c18 asm("v82");
register float v_c19 asm("v83");
register float v_c20 asm("v84");
register float v_c21 asm("v85");
register float v_c22 asm("v86");
register float v_c23 asm("v87");
register float v_c24 asm("v88");
register float v_c25 asm("v89");
register float v_c26 asm("v90");
register float v_c27 asm("v91");
register float v_c28 asm("v92");
register float v_c29 asm("v93");
register float v_c30 asm("v94");
register float v_c31 asm("v95");
int32_t nan_hi = 0x7fff0000;
int32_t nan_lo = 0x00007fff;
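        // Assumption (not stated in the source): these bit patterns look like per-half
        // 0x7fff addends / NaN masks used by the ukernel's f32 -> bf16 packing
        // (round-to-nearest-even plus NaN quieting on the high and low 16-bit lanes);
        // they are forwarded to the inline asm as v_nan_hi / v_nan_lo below.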
// in smem, the layout is M0(2)*K0(128)*M1(16)*K1(4)
// every threads need 8xK in contiguous register
// ... and every wave need the same data
int lane_id = threadIdx.x % 64;
int sld_y_os = (lane_id % 16) * 4 + (lane_id / 16) * 128;
sld_y_os *= 2;
// y y p p p y
// reg before shfl M0(2)*N0(2)*Nl(4)*Nw(4)*Mw(16)*Nv(4)
// but order is N0*M0*Nv
// in LDS we need store as
// M0(2)* N0(2) * Nl(4) * Nw(4) * (Mw(16)*Nv(4) + 4)
// y y wave-id lid/16 lid%16 v
// sst(v3) = (v0/16*34 + v0%16 * 2 + wid*136) * 4
int sfl_sst = (threadIdx.x % 16 * 4) + (threadIdx.x / 16) * (64 + 4);
sfl_sst *= 2;
// from LDS we need load as
// M0(2)* N0(2) * Nl(4) * Nw(4) * (Mw(16) * Nv(4) + 4)
        // ( 2 issue)  (rem 32-lane)  (4 wave * 4 issue)  2 lane * 1 issue (pk2)
// sld(v4) = v0/2 *34*4 + v0 % 2 *4 + wid*2 *4
int sfl_sld = (lane_id % 2) * 2 + (lane_id / 2) * (64 + 4) + (threadIdx.x / 64) * 4;
sfl_sld *= 2;
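        // Worked example (illustrative): for threadIdx.x = 81 (wave 1, lane 17):
        //   sld_y_os = (17 % 16) * 4 + (17 / 16) * 128 = 132, doubled to a byte offset
        //              for 16-bit elements -> 264
        //   sfl_sst  = (81 % 16) * 4 + (81 / 16) * (64 + 4) = 344, doubled -> 688
        // The (64 + 4) row pitch adds 4 elements of padding per 64-element row, presumably
        // to stagger rows across LDS banks and avoid conflicts during the shuffle.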
// B nr->kr
// clang-format off
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Winline-asm"
asm volatile(
#define CK_TILE_FLATMM_UK_MFMA CK_TILE_FLATMM_UK_MFMA_BF16
#include "uk/flatmm_sn_uk_gfx9_32x128x512_1x4x1_16x16x16_itl.inc"
#undef CK_TILE_FLATMM_UK_MFMA
:[smem_]"+r"(smem),
// [s_loop_cnt]"+s"(loop_cnt),
[s_loop_cnt]"+s"(n),
[c0]"+v" (v_c0),
[c1]"+v" (v_c1),
[c2]"+v" (v_c2),
[c3]"+v" (v_c3),
[c4]"+v" (v_c4),
[c5]"+v" (v_c5),
[c6]"+v" (v_c6),
[c7]"+v" (v_c7),
[c8]"+v" (v_c8),
[c9]"+v" (v_c9),
[c10]"+v"(v_c10),
[c11]"+v"(v_c11),
[c12]"+v"(v_c12),
[c13]"+v"(v_c13),
[c14]"+v"(v_c14),
[c15]"+v"(v_c15),
[c16]"+v"(v_c16),
[c17]"+v"(v_c17),
[c18]"+v"(v_c18),
[c19]"+v"(v_c19),
[c20]"+v"(v_c20),
[c21]"+v"(v_c21),
[c22]"+v"(v_c22),
[c23]"+v"(v_c23),
[c24]"+v"(v_c24),
[c25]"+v"(v_c25),
[c26]"+v"(v_c26),
[c27]"+v"(v_c27),
[c28]"+v"(v_c28),
[c29]"+v"(v_c29),
[c30]"+v"(v_c30),
[c31]"+v"(v_c31)
:
[sld_a_base]"n"(0),
[shfl_base]"n"(0),
[v_sld_y_os]"v"(sld_y_os),
[v_sfl_sld]"v"(sfl_sld),
[v_sfl_sst]"v"(sfl_sst),
[s_res_o0]"s"(res_o[0]),
[s_res_o1]"s"(res_o[1]),
//[s_res_o2]"s"(res_o[2]),
//[s_res_o3]"s"(res_o[3]),
[s_res_b0]"s"(res_b[0]),
[s_res_b1]"s"(res_b[1]),
[s_res_b2]"s"(res_b[2]),
[s_res_b3]"s"(res_b[3]),
[v_os_o0]"v"(static_cast<index_t>(cached_coords_o[number<0>{}] * sizeof(ODataType))),
[v_os_o1]"v"(static_cast<index_t>(cached_coords_o[number<1>{}] * sizeof(ODataType))),
[v_os_o2]"v"(static_cast<index_t>(cached_coords_o[number<2>{}] * sizeof(ODataType))),
[v_os_o3]"v"(static_cast<index_t>(cached_coords_o[number<3>{}] * sizeof(ODataType))),
[v_os_o4]"v"(static_cast<index_t>(cached_coords_o[number<4>{}] * sizeof(ODataType))),
[v_os_o5]"v"(static_cast<index_t>(cached_coords_o[number<5>{}] * sizeof(ODataType))),
[v_os_o6]"v"(static_cast<index_t>(cached_coords_o[number<6>{}] * sizeof(ODataType))),
[v_os_o7]"v"(static_cast<index_t>(cached_coords_o[number<7>{}] * sizeof(ODataType))),
[v_os_b0]"v"(static_cast<index_t>(cached_coords_b[number<0>{}] * sizeof(BDataType))),
[v_os_b1]"v"(static_cast<index_t>(cached_coords_b[number<1>{}] * sizeof(BDataType))),
[v_os_b2]"v"(static_cast<index_t>(cached_coords_b[number<2>{}] * sizeof(BDataType))),
[v_os_b3]"v"(static_cast<index_t>(cached_coords_b[number<3>{}] * sizeof(BDataType))),
[v_os_b4]"v"(static_cast<index_t>(cached_coords_b[number<4>{}] * sizeof(BDataType))),
[v_os_b5]"v"(static_cast<index_t>(cached_coords_b[number<5>{}] * sizeof(BDataType))),
[v_os_b6]"v"(static_cast<index_t>(cached_coords_b[number<6>{}] * sizeof(BDataType))),
[v_os_b7]"v"(static_cast<index_t>(cached_coords_b[number<7>{}] * sizeof(BDataType))),
[s_tile_os_o]"s"(tile_stride_o_bytes),
[s_tile_os_b]"s"(tile_stride_b_bytes),
[scale_0]"v"(s0),
[scale_1]"v"(s1),
[v_nan_lo]"v"(nan_lo),
[v_nan_hi]"v"(nan_hi),
[s_execflag_0]"s"(o_flags[number<0>{}]),
[s_execflag_1]"s"(o_flags[number<1>{}]),
[s_execflag_2]"s"(o_flags[number<2>{}]),
[s_execflag_3]"s"(o_flags[number<3>{}]),
[s_execflag_4]"s"(o_flags[number<4>{}]),
[s_execflag_5]"s"(o_flags[number<5>{}]),
[s_execflag_6]"s"(o_flags[number<6>{}]),
[s_execflag_7]"s"(o_flags[number<7>{}])
:
"memory", "a0", "a1", "a2", "a3", "a4", "a5", "a6", "a7", "a8", "a9",
"a10", "a11", "a12", "a13", "a14", "a15", "a16", "a17", "a18", "a19",
"a20", "a21", "a22", "a23", "a24", "a25", "a26", "a27", "a28", "a29",
"a30", "a31", "a32", "a33", "a34", "a35", "a36", "a37", "a38", "a39",
"a40", "a41", "a42", "a43", "a44", "a45", "a46", "a47", "a48", "a49",
"a50", "a51", "a52", "a53", "a54", "a55", "a56", "a57", "a58", "a59",
"a60", "a61", "a62", "a63", "a64", "a65", "a66", "a67", "a68", "a69",
"a70", "a71", "a72", "a73", "a74", "a75", "a76", "a77", "a78", "a79",
"a80", "a81", "a82", "a83", "a84", "a85", "a86", "a87", "a88", "a89",
"a90", "a91", "a92", "a93", "a94", "a95", "a96", "a97", "a98", "a99",
"a100", "a101", "a102", "a103", "a104", "a105", "a106", "a107",
"a108", "a109", "a110", "a111", "a112", "a113", "a114", "a115",
"a116", "a117", "a118", "a119", "a120", "a121", "a122", "a123",
"a124", "a125", "a126", "a127", "a128", "a129", "a130", "a131",
"a132", "a133", "a134", "a135", "a136", "a137", "a138", "a139",
"a140", "a141", "a142", "a143", "a144", "a145", "a146", "a147",
"a148", "a149", "a150", "a151", "a152", "a153", "a154", "a155",
"a156", "a157", "a158", "a159", "a160", "a161", "a162", "a163",
"a164", "a165", "a166", "a167", "a168", "a169", "a170", "a171",
"a172", "a173", "a174", "a175", "a176", "a177", "a178", "a179",
"a180", "a181", "a182", "a183", "a184", "a185", "a186", "a187",
"a188", "a189", "a190", "a191", "a192", "a193", "a194", "a195",
"a196", "a197", "a198", "a199", "a200", "a201", "a202", "a203",
"a204", "a205", "a206", "a207", "a208", "a209", "a210", "a211",
"a212", "a213", "a214", "a215", "a216", "a217", "a218", "a219",
"a220", "a221", "a222", "a223", "a224", "a225", "a226", "a227",
"a228", "a229", "a230", "a231", "a232", "a233", "a234", "a235",
"a236", "a237", "a238", "a239", "a240", "a241", "a242", "a243",
"a244", "a245", "a246", "a247", "a248", "a249", "a250", "a251",
"a252", "a253", "a254", "a255",
"s8", "s9", "s12", "s13", "s14", "s15", "s38", "s39", "s52", "s86",
"s36", "s37","s59","s80",
"v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17",
"v50", "v54", "v55",
"v64","v65","v66","v67","v68","v69","v70","v71",
"v72","v73","v74","v75","v76","v77","v78","v79",
"v80","v81","v82","v83","v84","v85","v86","v87",
"v88","v89","v90","v91","v92","v93","v94","v95",
"v128", "v129", "v130", "v131",
"v132", "v133", "v134", "v135", "v136", "v137", "v138", "v139",
"v140", "v141", "v142", "v143", "v144", "v145", "v146", "v147",
"v148", "v149", "v150", "v151", "v152", "v153", "v154", "v155",
"v156", "v157", "v158", "v159", "v160", "v161", "v162", "v163",
"v164", "v165", "v166", "v167", "v168", "v169", "v170", "v171",
"v172", "v173", "v174", "v175", "v176", "v177", "v178", "v179",
"v180", "v181", "v182", "v183", "v184", "v185", "v186", "v187",
"v188", "v189", "v190", "v191", "v192", "v193", "v194", "v195",
"v196", "v197", "v198", "v199", "v200", "v201", "v202", "v203",
"v204", "v205", "v206", "v207", "v208", "v209", "v210", "v211",
"v212", "v213", "v214", "v215", "v216", "v217", "v218", "v219",
"v220", "v221", "v222", "v223", "v224", "v225", "v226", "v227",
"v228", "v229", "v230", "v231", "v232", "v233", "v234", "v235",
"v236", "v237", "v238", "v239", "v240", "v241", "v242", "v243",
"v244", "v245", "v246", "v247", "v248", "v249", "v250", "v251",
"v252", "v253", "v254", "v255"
);
#pragma clang diagnostic pop
// clang-format on
}
};
struct FlatmmSn_32x128x512_1x4x1_16x16x32_FP16_itl : public FlatmmSn_32x128x512_1x4x1_16x16x32_Base
{
using BDataType = fp16_t;
using ODataType = fp16_t;
// TODO: needs to be paired with tile_window_linear!
// TODO: init_raw() must be called before calling this function!
// template <typename AWindow, typename BWindow, typename OWindow, typename ScaleTensor>
template <typename BRes,
typename BCoords,
typename ORes,
typename OCoords,
typename OFlags,
typename ScaleTensor>
CK_TILE_DEVICE auto
operator()(const BRes& res_b,
const BCoords& cached_coords_b,
const ORes& res_o,
const OCoords& cached_coords_o,
               const OFlags& o_flags, // these flags should live in SGPRs
CK_TILE_LDS_ADDR void* smem,
index_t n, // loop along n dim
const ScaleTensor& scale_,
               index_t tile_offset_b, // B stride is fixed to blockKr * blockW, but can still be adjusted
index_t tile_offset_o)
{
static_assert(BCoords::size() == 8);
static_assert(OCoords::size() == 8);
const index_t tile_stride_b_bytes = tile_offset_b * sizeof(BDataType);
const index_t tile_stride_o_bytes = tile_offset_o * sizeof(ODataType);
static_assert(ScaleTensor::size() == 2);
float s0 = scale_[number<0>{}];
float s1 = scale_[number<1>{}];
// index_t loop_cnt = n / Block_N;
register float v_c0 asm("v64");
register float v_c1 asm("v65");
register float v_c2 asm("v66");
register float v_c3 asm("v67");
register float v_c4 asm("v68");
register float v_c5 asm("v69");
register float v_c6 asm("v70");
register float v_c7 asm("v71");
register float v_c8 asm("v72");
register float v_c9 asm("v73");
register float v_c10 asm("v74");
register float v_c11 asm("v75");
register float v_c12 asm("v76");
register float v_c13 asm("v77");
register float v_c14 asm("v78");
register float v_c15 asm("v79");
register float v_c16 asm("v80");
register float v_c17 asm("v81");
register float v_c18 asm("v82");
register float v_c19 asm("v83");
register float v_c20 asm("v84");
register float v_c21 asm("v85");
register float v_c22 asm("v86");
register float v_c23 asm("v87");
register float v_c24 asm("v88");
register float v_c25 asm("v89");
register float v_c26 asm("v90");
register float v_c27 asm("v91");
register float v_c28 asm("v92");
register float v_c29 asm("v93");
register float v_c30 asm("v94");
register float v_c31 asm("v95");
int32_t nan_hi = 0x7fff0000;
int32_t nan_lo = 0x00007fff;
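// rounding/NaN constants consumed by the bf16 flavor of _UK_PK_CVT_ in the
// .inc file: nan_lo (0x7fff) is the rounding bias added to the mantissa before
// the high half is kept, nan_hi (0x7fff0000) is the quiet-NaN pattern
// substituted for unordered inputs (the fp16 flavor ignores both operands)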
// in smem, the layout is M0(2)*K0(128)*M1(16)*K1(4)
// every thread needs 8xK in contiguous registers
// ... and every wave needs the same data
int lane_id = threadIdx.x % 64;
int sld_y_os = (lane_id % 16) * 4 + (lane_id / 16) * 128;
sld_y_os *= 2;
// y y p p p y
// reg before shfl M0(2)*N0(2)*Nl(4)*Nw(4)*Mw(16)*Nv(4)
// but order is N0*M0*Nv
// in LDS we need store as
// M0(2)* N0(2) * Nl(4) * Nw(4) * (Mw(16)*Nv(4) + 4)
// y y wave-id lid/16 lid%16 v
// sst(v3) = (v0/16*34 + v0%16 * 2 + wid*136) * 4
int sfl_sst = (threadIdx.x % 16 * 4) + (threadIdx.x / 16) * (64 + 4);
sfl_sst *= 2;
// from LDS we need load as
// M0(2)* N0(2) * Nl(4) * Nw(4) * (Mw(16) * Nv(4) + 4)
//  ( 2 issue) (rem 32-lane)   (4 wave*4issue)    2lane*1issue(pk2)
// sld(v4) = v0/2 *34*4 + v0 % 2 *4 + wid*2 *4
int sfl_sld = (lane_id % 2) * 2 + (lane_id / 2) * (64 + 4) + (threadIdx.x / 64) * 4;
sfl_sld *= 2;
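// note: the (64 + 4) pitch above is Mw(16)*Nv(4) = 64 elements plus a
// 4-element pad per row (presumably to dodge LDS bank conflicts); the
// trailing *= 2 turns 16-bit element offsets into byte offsets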
// B nr->kr
// clang-format off
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Winline-asm"
asm volatile(
#define CK_TILE_FLATMM_UK_MFMA CK_TILE_FLATMM_UK_MFMA_FP16
#include "uk/flatmm_sn_uk_gfx9_32x128x512_1x4x1_16x16x16_itl.inc"
#undef CK_TILE_FLATMM_UK_MFMA
:[smem_]"+r"(smem),
[s_loop_cnt]"+s"(n),
[c0]"+v" (v_c0),
[c1]"+v" (v_c1),
[c2]"+v" (v_c2),
[c3]"+v" (v_c3),
[c4]"+v" (v_c4),
[c5]"+v" (v_c5),
[c6]"+v" (v_c6),
[c7]"+v" (v_c7),
[c8]"+v" (v_c8),
[c9]"+v" (v_c9),
[c10]"+v"(v_c10),
[c11]"+v"(v_c11),
[c12]"+v"(v_c12),
[c13]"+v"(v_c13),
[c14]"+v"(v_c14),
[c15]"+v"(v_c15),
[c16]"+v"(v_c16),
[c17]"+v"(v_c17),
[c18]"+v"(v_c18),
[c19]"+v"(v_c19),
[c20]"+v"(v_c20),
[c21]"+v"(v_c21),
[c22]"+v"(v_c22),
[c23]"+v"(v_c23),
[c24]"+v"(v_c24),
[c25]"+v"(v_c25),
[c26]"+v"(v_c26),
[c27]"+v"(v_c27),
[c28]"+v"(v_c28),
[c29]"+v"(v_c29),
[c30]"+v"(v_c30),
[c31]"+v"(v_c31)
:
[sld_a_base]"n"(0),
[shfl_base]"n"(0),
[v_sld_y_os]"v"(sld_y_os),
[v_sfl_sld]"v"(sfl_sld),
[v_sfl_sst]"v"(sfl_sst),
[s_res_o0]"s"(res_o[0]),
[s_res_o1]"s"(res_o[1]),
//[s_res_o2]"s"(res_o[2]),
//[s_res_o3]"s"(res_o[3]),
[s_res_b0]"s"(res_b[0]),
[s_res_b1]"s"(res_b[1]),
[s_res_b2]"s"(res_b[2]),
[s_res_b3]"s"(res_b[3]),
[v_os_o0]"v"(static_cast<index_t>(cached_coords_o[number<0>{}] * sizeof(ODataType))),
[v_os_o1]"v"(static_cast<index_t>(cached_coords_o[number<1>{}] * sizeof(ODataType))),
[v_os_o2]"v"(static_cast<index_t>(cached_coords_o[number<2>{}] * sizeof(ODataType))),
[v_os_o3]"v"(static_cast<index_t>(cached_coords_o[number<3>{}] * sizeof(ODataType))),
[v_os_o4]"v"(static_cast<index_t>(cached_coords_o[number<4>{}] * sizeof(ODataType))),
[v_os_o5]"v"(static_cast<index_t>(cached_coords_o[number<5>{}] * sizeof(ODataType))),
[v_os_o6]"v"(static_cast<index_t>(cached_coords_o[number<6>{}] * sizeof(ODataType))),
[v_os_o7]"v"(static_cast<index_t>(cached_coords_o[number<7>{}] * sizeof(ODataType))),
[v_os_b0]"v"(static_cast<index_t>(cached_coords_b[number<0>{}] * sizeof(BDataType))),
[v_os_b1]"v"(static_cast<index_t>(cached_coords_b[number<1>{}] * sizeof(BDataType))),
[v_os_b2]"v"(static_cast<index_t>(cached_coords_b[number<2>{}] * sizeof(BDataType))),
[v_os_b3]"v"(static_cast<index_t>(cached_coords_b[number<3>{}] * sizeof(BDataType))),
[v_os_b4]"v"(static_cast<index_t>(cached_coords_b[number<4>{}] * sizeof(BDataType))),
[v_os_b5]"v"(static_cast<index_t>(cached_coords_b[number<5>{}] * sizeof(BDataType))),
[v_os_b6]"v"(static_cast<index_t>(cached_coords_b[number<6>{}] * sizeof(BDataType))),
[v_os_b7]"v"(static_cast<index_t>(cached_coords_b[number<7>{}] * sizeof(BDataType))),
[s_tile_os_o]"s"(tile_stride_o_bytes),
[s_tile_os_b]"s"(tile_stride_b_bytes),
[scale_0]"v"(s0),
[scale_1]"v"(s1),
[v_nan_lo]"v"(nan_lo),
[v_nan_hi]"v"(nan_hi),
[s_execflag_0]"s"(o_flags[number<0>{}]),
[s_execflag_1]"s"(o_flags[number<1>{}]),
[s_execflag_2]"s"(o_flags[number<2>{}]),
[s_execflag_3]"s"(o_flags[number<3>{}]),
[s_execflag_4]"s"(o_flags[number<4>{}]),
[s_execflag_5]"s"(o_flags[number<5>{}]),
[s_execflag_6]"s"(o_flags[number<6>{}]),
[s_execflag_7]"s"(o_flags[number<7>{}])
:
"memory", "a0", "a1", "a2", "a3", "a4", "a5", "a6", "a7", "a8", "a9",
"a10", "a11", "a12", "a13", "a14", "a15", "a16", "a17", "a18", "a19",
"a20", "a21", "a22", "a23", "a24", "a25", "a26", "a27", "a28", "a29",
"a30", "a31", "a32", "a33", "a34", "a35", "a36", "a37", "a38", "a39",
"a40", "a41", "a42", "a43", "a44", "a45", "a46", "a47", "a48", "a49",
"a50", "a51", "a52", "a53", "a54", "a55", "a56", "a57", "a58", "a59",
"a60", "a61", "a62", "a63", "a64", "a65", "a66", "a67", "a68", "a69",
"a70", "a71", "a72", "a73", "a74", "a75", "a76", "a77", "a78", "a79",
"a80", "a81", "a82", "a83", "a84", "a85", "a86", "a87", "a88", "a89",
"a90", "a91", "a92", "a93", "a94", "a95", "a96", "a97", "a98", "a99",
"a100", "a101", "a102", "a103", "a104", "a105", "a106", "a107",
"a108", "a109", "a110", "a111", "a112", "a113", "a114", "a115",
"a116", "a117", "a118", "a119", "a120", "a121", "a122", "a123",
"a124", "a125", "a126", "a127", "a128", "a129", "a130", "a131",
"a132", "a133", "a134", "a135", "a136", "a137", "a138", "a139",
"a140", "a141", "a142", "a143", "a144", "a145", "a146", "a147",
"a148", "a149", "a150", "a151", "a152", "a153", "a154", "a155",
"a156", "a157", "a158", "a159", "a160", "a161", "a162", "a163",
"a164", "a165", "a166", "a167", "a168", "a169", "a170", "a171",
"a172", "a173", "a174", "a175", "a176", "a177", "a178", "a179",
"a180", "a181", "a182", "a183", "a184", "a185", "a186", "a187",
"a188", "a189", "a190", "a191", "a192", "a193", "a194", "a195",
"a196", "a197", "a198", "a199", "a200", "a201", "a202", "a203",
"a204", "a205", "a206", "a207", "a208", "a209", "a210", "a211",
"a212", "a213", "a214", "a215", "a216", "a217", "a218", "a219",
"a220", "a221", "a222", "a223", "a224", "a225", "a226", "a227",
"a228", "a229", "a230", "a231", "a232", "a233", "a234", "a235",
"a236", "a237", "a238", "a239", "a240", "a241", "a242", "a243",
"a244", "a245", "a246", "a247", "a248", "a249", "a250", "a251",
"a252", "a253", "a254", "a255",
"s8", "s9", "s12", "s13", "s14", "s15", "s38", "s39", "s52", "s86",
"s36", "s37","s59","s80",
"v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17",
"v50", "v54", "v55",
"v64","v65","v66","v67","v68","v69","v70","v71",
"v72","v73","v74","v75","v76","v77","v78","v79",
"v80","v81","v82","v83","v84","v85","v86","v87",
"v88","v89","v90","v91","v92","v93","v94","v95",
"v128", "v129", "v130", "v131",
"v132", "v133", "v134", "v135", "v136", "v137", "v138", "v139",
"v140", "v141", "v142", "v143", "v144", "v145", "v146", "v147",
"v148", "v149", "v150", "v151", "v152", "v153", "v154", "v155",
"v156", "v157", "v158", "v159", "v160", "v161", "v162", "v163",
"v164", "v165", "v166", "v167", "v168", "v169", "v170", "v171",
"v172", "v173", "v174", "v175", "v176", "v177", "v178", "v179",
"v180", "v181", "v182", "v183", "v184", "v185", "v186", "v187",
"v188", "v189", "v190", "v191", "v192", "v193", "v194", "v195",
"v196", "v197", "v198", "v199", "v200", "v201", "v202", "v203",
"v204", "v205", "v206", "v207", "v208", "v209", "v210", "v211",
"v212", "v213", "v214", "v215", "v216", "v217", "v218", "v219",
"v220", "v221", "v222", "v223", "v224", "v225", "v226", "v227",
"v228", "v229", "v230", "v231", "v232", "v233", "v234", "v235",
"v236", "v237", "v238", "v239", "v240", "v241", "v242", "v243",
"v244", "v245", "v246", "v247", "v248", "v249", "v250", "v251",
"v252", "v253", "v254", "v255"
);
#pragma clang diagnostic pop
// clang-format on
}
};
} // namespace ck_tile
#ifndef CK_TILE_FLATMM_UK_MFMA
#define CK_TILE_FLATMM_UK_MFMA CK_TILE_FLATMM_UK_MFMA_BF16
#endif
#if CK_TILE_FLATMM_UK_MFMA == CK_TILE_FLATMM_UK_MFMA_BF16
# define _UK_MFMA_ "v_mfma_f32_16x16x16_bf16"
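// packed f32->bf16 convert: v_add3 adds the rounding bias (%[v_nan_lo] + 1)
// to the mantissa, v_cndmask substitutes the quiet-NaN pattern (%[v_nan_hi])
// when the input compares unordered, and v_perm (selector 0x07060302 kept in
// s52) packs the two high halves into one dword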
# define _UK_PK_CVT_(x0_, x1_, y_) \
" v_cmp_u_f32 s[36:37], " x0_ ", " x0_ " \n" \
" v_add3_u32 v50, " x0_ ", %[v_nan_lo], 1 \n" \
" v_cndmask_b32 v54, v50, %[v_nan_hi], s[36:37] \n" \
" v_cmp_u_f32 s[36:37], " x1_ ", " x1_ " \n" \
" v_add3_u32 v50, " x1_ ", %[v_nan_lo], 1 \n" \
" v_cndmask_b32 v55, v50, %[v_nan_hi], s[36:37] \n" \
" v_perm_b32 " y_ ", v55, v54, s52 \n"
# define _UK_ATOMIC_ADD_ "global_atomic_pk_add_bf16"
#elif CK_TILE_FLATMM_UK_MFMA == CK_TILE_FLATMM_UK_MFMA_FP16
# define _UK_MFMA_ "v_mfma_f32_16x16x16_f16"
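// packed f32->fp16 convert: two v_cvt_f16_f32 then v_pack_b32_f16; the
// hardware conversion already handles rounding and NaNs, so no bias is needed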
# define _UK_PK_CVT_(x0_, x1_, y_) \
" v_cvt_f16_f32 v54, " x0_ " \n" \
" v_cvt_f16_f32 v55, " x1_ " \n" \
" v_pack_b32_f16 " y_ ", v54, v55 \n"
# define _UK_ATOMIC_ADD_ "global_atomic_pk_add_f16"
#endif
";-------------------------------------------------------------\n"
" s_mov_b32 s52, 0x07060302 ; v_perm\n"
" s_mov_b64 s[38:39], exec ; save current exec\n"
" s_mov_b32 s8, %[s_res_o0] \n"
" s_mov_b32 s9, %[s_res_o1] \n"
" s_mov_b32 s12, %[s_res_b0] \n"
" s_mov_b32 s13, %[s_res_b1] \n"
" s_mov_b32 s14, %[s_res_b2] \n"
" s_mov_b32 s15, %[s_res_b3] \n"
" s_mov_b32 s59, 0 \n"
" ds_read_b64 v[128:129], %[v_sld_y_os] offset:0 + %[sld_a_base] \n"
" ds_read_b64 v[130:131], %[v_sld_y_os] offset:128 + %[sld_a_base] \n"
" ds_read_b64 v[132:133], %[v_sld_y_os] offset:1024 + %[sld_a_base] \n"
" ds_read_b64 v[134:135], %[v_sld_y_os] offset:1152 + %[sld_a_base] \n"
" ds_read_b64 v[136:137], %[v_sld_y_os] offset:2048 + %[sld_a_base] \n"
" ds_read_b64 v[138:139], %[v_sld_y_os] offset:2176 + %[sld_a_base] \n"
" ds_read_b64 v[140:141], %[v_sld_y_os] offset:3072 + %[sld_a_base] \n"
" ds_read_b64 v[142:143], %[v_sld_y_os] offset:3200 + %[sld_a_base] \n"
" ds_read_b64 v[144:145], %[v_sld_y_os] offset:4096 + %[sld_a_base] \n"
" ds_read_b64 v[146:147], %[v_sld_y_os] offset:4224 + %[sld_a_base] \n"
" ds_read_b64 v[148:149], %[v_sld_y_os] offset:5120 + %[sld_a_base] \n"
" ds_read_b64 v[150:151], %[v_sld_y_os] offset:5248 + %[sld_a_base] \n"
" ds_read_b64 v[152:153], %[v_sld_y_os] offset:6144 + %[sld_a_base] \n"
" ds_read_b64 v[154:155], %[v_sld_y_os] offset:6272 + %[sld_a_base] \n"
" ds_read_b64 v[156:157], %[v_sld_y_os] offset:7168 + %[sld_a_base] \n"
" ds_read_b64 v[158:159], %[v_sld_y_os] offset:7296 + %[sld_a_base] \n"
" ds_read_b64 v[160:161], %[v_sld_y_os] offset:8192 + %[sld_a_base] \n"
" ds_read_b64 v[162:163], %[v_sld_y_os] offset:8320 + %[sld_a_base] \n"
" ds_read_b64 v[164:165], %[v_sld_y_os] offset:9216 + %[sld_a_base] \n"
" ds_read_b64 v[166:167], %[v_sld_y_os] offset:9344 + %[sld_a_base] \n"
" ds_read_b64 v[168:169], %[v_sld_y_os] offset:10240 + %[sld_a_base] \n"
" ds_read_b64 v[170:171], %[v_sld_y_os] offset:10368 + %[sld_a_base] \n"
" ds_read_b64 v[172:173], %[v_sld_y_os] offset:11264 + %[sld_a_base] \n"
" ds_read_b64 v[174:175], %[v_sld_y_os] offset:11392 + %[sld_a_base] \n"
" ds_read_b64 v[176:177], %[v_sld_y_os] offset:12288 + %[sld_a_base] \n"
" ds_read_b64 v[178:179], %[v_sld_y_os] offset:12416 + %[sld_a_base] \n"
" ds_read_b64 v[180:181], %[v_sld_y_os] offset:13312 + %[sld_a_base] \n"
" ds_read_b64 v[182:183], %[v_sld_y_os] offset:13440 + %[sld_a_base] \n"
" ds_read_b64 v[184:185], %[v_sld_y_os] offset:14336 + %[sld_a_base] \n"
" ds_read_b64 v[186:187], %[v_sld_y_os] offset:14464 + %[sld_a_base] \n"
" ds_read_b64 v[188:189], %[v_sld_y_os] offset:15360 + %[sld_a_base] \n"
" ds_read_b64 v[190:191], %[v_sld_y_os] offset:15488 + %[sld_a_base] \n"
" ds_read_b64 v[192:193], %[v_sld_y_os] offset:16384 + %[sld_a_base] \n"
" ds_read_b64 v[194:195], %[v_sld_y_os] offset:16512 + %[sld_a_base] \n"
" ds_read_b64 v[196:197], %[v_sld_y_os] offset:17408 + %[sld_a_base] \n"
" ds_read_b64 v[198:199], %[v_sld_y_os] offset:17536 + %[sld_a_base] \n"
" ds_read_b64 v[200:201], %[v_sld_y_os] offset:18432 + %[sld_a_base] \n"
" ds_read_b64 v[202:203], %[v_sld_y_os] offset:18560 + %[sld_a_base] \n"
" ds_read_b64 v[204:205], %[v_sld_y_os] offset:19456 + %[sld_a_base] \n"
" ds_read_b64 v[206:207], %[v_sld_y_os] offset:19584 + %[sld_a_base] \n"
" ds_read_b64 v[208:209], %[v_sld_y_os] offset:20480 + %[sld_a_base] \n"
" ds_read_b64 v[210:211], %[v_sld_y_os] offset:20608 + %[sld_a_base] \n"
" ds_read_b64 v[212:213], %[v_sld_y_os] offset:21504 + %[sld_a_base] \n"
" ds_read_b64 v[214:215], %[v_sld_y_os] offset:21632 + %[sld_a_base] \n"
" ds_read_b64 v[216:217], %[v_sld_y_os] offset:22528 + %[sld_a_base] \n"
" ds_read_b64 v[218:219], %[v_sld_y_os] offset:22656 + %[sld_a_base] \n"
" ds_read_b64 v[220:221], %[v_sld_y_os] offset:23552 + %[sld_a_base] \n"
" ds_read_b64 v[222:223], %[v_sld_y_os] offset:23680 + %[sld_a_base] \n"
" ds_read_b64 v[224:225], %[v_sld_y_os] offset:24576 + %[sld_a_base] \n"
" ds_read_b64 v[226:227], %[v_sld_y_os] offset:24704 + %[sld_a_base] \n"
" ds_read_b64 v[228:229], %[v_sld_y_os] offset:25600 + %[sld_a_base] \n"
" ds_read_b64 v[230:231], %[v_sld_y_os] offset:25728 + %[sld_a_base] \n"
" ds_read_b64 v[232:233], %[v_sld_y_os] offset:26624 + %[sld_a_base] \n"
" ds_read_b64 v[234:235], %[v_sld_y_os] offset:26752 + %[sld_a_base] \n"
" ds_read_b64 v[236:237], %[v_sld_y_os] offset:27648 + %[sld_a_base] \n"
" ds_read_b64 v[238:239], %[v_sld_y_os] offset:27776 + %[sld_a_base] \n"
" ds_read_b64 v[240:241], %[v_sld_y_os] offset:28672 + %[sld_a_base] \n"
" ds_read_b64 v[242:243], %[v_sld_y_os] offset:28800 + %[sld_a_base] \n"
" ds_read_b64 v[244:245], %[v_sld_y_os] offset:29696 + %[sld_a_base] \n"
" ds_read_b64 v[246:247], %[v_sld_y_os] offset:29824 + %[sld_a_base] \n"
" ds_read_b64 v[248:249], %[v_sld_y_os] offset:30720 + %[sld_a_base] \n"
" ds_read_b64 v[250:251], %[v_sld_y_os] offset:30848 + %[sld_a_base] \n"
" ds_read_b64 v[252:253], %[v_sld_y_os] offset:31744 + %[sld_a_base] \n"
" ds_read_b64 v[254:255], %[v_sld_y_os] offset:31872 + %[sld_a_base] \n"
" s_waitcnt 0 \n"
" buffer_load_dwordx4 acc[0:3], %[v_os_b0], s[12:15], 0 offen \n"
" buffer_load_dwordx4 acc[4:7], %[v_os_b0], s[12:15], 0 offen offset:1024 \n"
" buffer_load_dwordx4 acc[8:11], %[v_os_b0], s[12:15], 0 offen offset:2048 \n"
" buffer_load_dwordx4 acc[12:15], %[v_os_b0], s[12:15], 0 offen offset:3072 \n"
" buffer_load_dwordx4 acc[16:19], %[v_os_b1], s[12:15], 0 offen \n"
" buffer_load_dwordx4 acc[20:23], %[v_os_b1], s[12:15], 0 offen offset:1024 \n"
" buffer_load_dwordx4 acc[24:27], %[v_os_b1], s[12:15], 0 offen offset:2048 \n"
" buffer_load_dwordx4 acc[28:31], %[v_os_b1], s[12:15], 0 offen offset:3072 \n"
" buffer_load_dwordx4 acc[32:35], %[v_os_b2], s[12:15], 0 offen \n"
" buffer_load_dwordx4 acc[36:39], %[v_os_b2], s[12:15], 0 offen offset:1024 \n"
" buffer_load_dwordx4 acc[40:43], %[v_os_b2], s[12:15], 0 offen offset:2048 \n"
" buffer_load_dwordx4 acc[44:47], %[v_os_b2], s[12:15], 0 offen offset:3072 \n"
" buffer_load_dwordx4 acc[48:51], %[v_os_b3], s[12:15], 0 offen \n"
" buffer_load_dwordx4 acc[52:55], %[v_os_b3], s[12:15], 0 offen offset:1024 \n"
" buffer_load_dwordx4 acc[56:59], %[v_os_b3], s[12:15], 0 offen offset:2048 \n"
" buffer_load_dwordx4 acc[60:63], %[v_os_b3], s[12:15], 0 offen offset:3072 \n"
" buffer_load_dwordx4 acc[64:67], %[v_os_b4], s[12:15], 0 offen \n"
" buffer_load_dwordx4 acc[68:71], %[v_os_b4], s[12:15], 0 offen offset:1024 \n"
" buffer_load_dwordx4 acc[72:75], %[v_os_b4], s[12:15], 0 offen offset:2048 \n"
" buffer_load_dwordx4 acc[76:79], %[v_os_b4], s[12:15], 0 offen offset:3072 \n"
" buffer_load_dwordx4 acc[80:83], %[v_os_b5], s[12:15], 0 offen \n"
" buffer_load_dwordx4 acc[84:87], %[v_os_b5], s[12:15], 0 offen offset:1024 \n"
" buffer_load_dwordx4 acc[88:91], %[v_os_b5], s[12:15], 0 offen offset:2048 \n"
" buffer_load_dwordx4 acc[92:95], %[v_os_b5], s[12:15], 0 offen offset:3072 \n"
" buffer_load_dwordx4 acc[96:99], %[v_os_b6], s[12:15], 0 offen \n"
" buffer_load_dwordx4 acc[100:103], %[v_os_b6], s[12:15], 0 offen offset:1024 \n"
" buffer_load_dwordx4 acc[104:107], %[v_os_b6], s[12:15], 0 offen offset:2048 \n"
" buffer_load_dwordx4 acc[108:111], %[v_os_b6], s[12:15], 0 offen offset:3072 \n"
" buffer_load_dwordx4 acc[112:115], %[v_os_b7], s[12:15], 0 offen \n"
" buffer_load_dwordx4 acc[116:119], %[v_os_b7], s[12:15], 0 offen offset:1024 \n"
" buffer_load_dwordx4 acc[120:123], %[v_os_b7], s[12:15], 0 offen offset:2048 \n"
" buffer_load_dwordx4 acc[124:127], %[v_os_b7], s[12:15], 0 offen offset:3072 \n"
" s_add_u32 s12, %[s_tile_os_b], s12 \n"
" s_addc_u32 s13, 0, s13 \n"
" v_mov_b32 v64, 0 \n"
" v_mov_b32 v80, 0 \n"
" v_mov_b32 v65, 0 \n"
" v_mov_b32 v81, 0 \n"
" v_mov_b32 v66, 0 \n"
" v_mov_b32 v82, 0 \n"
" v_mov_b32 v67, 0 \n"
" v_mov_b32 v83, 0 \n"
" v_mov_b32 v68, 0 \n"
" v_mov_b32 v84, 0 \n"
" v_mov_b32 v69, 0 \n"
" v_mov_b32 v85, 0 \n"
" v_mov_b32 v70, 0 \n"
" v_mov_b32 v86, 0 \n"
" v_mov_b32 v71, 0 \n"
" v_mov_b32 v87, 0 \n"
" ds_write_b64 %[v_sfl_sst], [%[c0],%[c1]] offset:16640 \n"
" ds_write_b64 %[v_sfl_sst], [%[c2],%[c3]] offset:20992 \n"
" ds_write_b64 %[v_sfl_sst], [%[c4],%[c5]] offset:18816 \n"
" ds_write_b64 %[v_sfl_sst], [%[c6],%[c7]] offset:23168 \n"
" s_mov_b32 s80, 0 \n"
" s_waitcnt vmcnt(24) \n"
"label_0AA6: \n"
" s_waitcnt vmcnt(30) & lgkmcnt(0) \n"
" s_barrier \n"
_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[0:1], v[128:129], 0 \n"
" ds_read_b32 v10, %[v_sfl_sld] offset:16640 \n"
" ds_read_b32 v11, %[v_sfl_sld] offset:16672 \n"
" ds_write_b64 %[v_sfl_sst], [%[c16],%[c17]] offset:25344 \n"
_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[2:3], v[130:131], v[64:67] \n"
" buffer_load_dwordx4 acc[128:131], %[v_os_b0], s[12:15], 0 offen \n"
" ds_write_b64 %[v_sfl_sst], [%[c18],%[c19]] offset:29696 \n"
_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[4:5], v[132:133], v[64:67] \n"
" ds_read_b32 v12, %[v_sfl_sld] offset:16704 \n"
" ds_read_b32 v13, %[v_sfl_sld] offset:16736 \n"
" ds_write_b64 %[v_sfl_sst], [%[c20],%[c21]] offset:27520 \n"
_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[6:7], v[134:135], v[64:67] \n"
" ds_write_b64 %[v_sfl_sst], [%[c22],%[c23]] offset:31872 \n"
_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[8:9], v[136:137], v[64:67] \n"
" ds_read_b32 v14, %[v_sfl_sld] offset:20992 \n"
" ds_read_b32 v15, %[v_sfl_sld] offset:21024 \n"
_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[10:11], v[138:139], v[64:67] \n"
" buffer_load_dwordx4 acc[132:135], %[v_os_b0], s[12:15], 0 offen offset:1024 \n"
_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[12:13], v[140:141], v[64:67] \n"
" ds_read_b32 v16, %[v_sfl_sld] offset:21056 \n"
" ds_read_b32 v17, %[v_sfl_sld] offset:21088 \n"
_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[14:15], v[142:143], v[64:67] \n"
_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[0:1], v[192:193], 0 \n"
_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[2:3], v[194:195], v[68:71] \n"
" buffer_load_dwordx4 acc[136:139], %[v_os_b0], s[12:15], 0 offen offset:2048 \n"
_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[4:5], v[196:197], v[68:71] \n"
_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[6:7], v[198:199], v[68:71] \n"
_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[8:9], v[200:201], v[68:71] \n"
_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[10:11], v[202:203], v[68:71] \n"
" buffer_load_dwordx4 acc[140:143], %[v_os_b0], s[12:15], 0 offen offset:3072 \n"
_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[12:13], v[204:205], v[68:71] \n"
_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[14:15], v[206:207], v[68:71] \n"
" s_waitcnt lgkmcnt(0) \n"
" s_mov_b64 exec, %[s_execflag_0] \n"
_UK_ATOMIC_ADD_ " %[v_os_o0], v10, s[8:9] \n"
" s_mov_b64 exec, s[38:39] \n"
_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[16:17], v[128:129], 0 \n"
_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[18:19], v[130:131], v[72:75] \n"
" buffer_load_dwordx4 acc[144:147], %[v_os_b1], s[12:15], 0 offen \n"
_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[20:21], v[132:133], v[72:75] \n"
_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[22:23], v[134:135], v[72:75] \n"
_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[24:25], v[136:137], v[72:75] \n"
_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[26:27], v[138:139], v[72:75] \n"
" buffer_load_dwordx4 acc[148:151], %[v_os_b1], s[12:15], 0 offen offset:1024 \n"
_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[28:29], v[140:141], v[72:75] \n"
_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[30:31], v[142:143], v[72:75] \n"
_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[16:17], v[192:193], 0 \n"
_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[18:19], v[194:195], v[76:79] \n"
" buffer_load_dwordx4 acc[152:155], %[v_os_b1], s[12:15], 0 offen offset:2048 \n"
_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[20:21], v[196:197], v[76:79] \n"
_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[22:23], v[198:199], v[76:79] \n"
_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[24:25], v[200:201], v[76:79] \n"
_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[26:27], v[202:203], v[76:79] \n"
" buffer_load_dwordx4 acc[156:159], %[v_os_b1], s[12:15], 0 offen offset:3072 \n"
_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[28:29], v[204:205], v[76:79] \n"
_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[30:31], v[206:207], v[76:79] \n"
" s_mov_b64 exec, %[s_execflag_1] \n"
_UK_ATOMIC_ADD_ " %[v_os_o1], v11, s[8:9] \n"
" s_mov_b64 exec, s[38:39] \n"
" s_waitcnt vmcnt(30) \n"
_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[32:33], v[144:145], v[64:67] \n"
_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[34:35], v[146:147], v[64:67] \n"
" buffer_load_dwordx4 acc[160:163], %[v_os_b2], s[12:15], 0 offen \n"
_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[36:37], v[148:149], v[64:67] \n"
_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[38:39], v[150:151], v[64:67] \n"
_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[40:41], v[152:153], v[64:67] \n"
_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[42:43], v[154:155], v[64:67] \n"
" buffer_load_dwordx4 acc[164:167], %[v_os_b2], s[12:15], 0 offen offset:1024 \n"
_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[44:45], v[156:157], v[64:67] \n"
_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[46:47], v[158:159], v[64:67] \n"
_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[32:33], v[208:209], v[68:71] \n"
_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[34:35], v[210:211], v[68:71] \n"
" buffer_load_dwordx4 acc[168:171], %[v_os_b2], s[12:15], 0 offen offset:2048 \n"
_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[36:37], v[212:213], v[68:71] \n"
_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[38:39], v[214:215], v[68:71] \n"
_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[40:41], v[216:217], v[68:71] \n"
_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[42:43], v[218:219], v[68:71] \n"
" buffer_load_dwordx4 acc[172:175], %[v_os_b2], s[12:15], 0 offen offset:3072 \n"
_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[44:45], v[220:221], v[68:71] \n"
_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[46:47], v[222:223], v[68:71] \n"
" s_mov_b64 exec, %[s_execflag_2] \n"
_UK_ATOMIC_ADD_ " %[v_os_o2], v12, s[8:9] \n"
" s_mov_b64 exec, s[38:39] \n"
_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[48:49], v[144:145], v[72:75] \n"
_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[50:51], v[146:147], v[72:75] \n"
" buffer_load_dwordx4 acc[176:179], %[v_os_b3], s[12:15], 0 offen \n"
_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[52:53], v[148:149], v[72:75] \n"
_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[54:55], v[150:151], v[72:75] \n"
_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[56:57], v[152:153], v[72:75] \n"
_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[58:59], v[154:155], v[72:75] \n"
" buffer_load_dwordx4 acc[180:183], %[v_os_b3], s[12:15], 0 offen offset:1024 \n"
_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[60:61], v[156:157], v[72:75] \n"
_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[62:63], v[158:159], v[72:75] \n"
_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[48:49], v[208:209], v[76:79] \n"
_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[50:51], v[210:211], v[76:79] \n"
" buffer_load_dwordx4 acc[184:187], %[v_os_b3], s[12:15], 0 offen offset:2048 \n"
_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[52:53], v[212:213], v[76:79] \n"
_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[54:55], v[214:215], v[76:79] \n"
_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[56:57], v[216:217], v[76:79] \n"
_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[58:59], v[218:219], v[76:79] \n"
" buffer_load_dwordx4 acc[188:191], %[v_os_b3], s[12:15], 0 offen offset:3072 \n"
_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[60:61], v[220:221], v[76:79] \n"
_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[62:63], v[222:223], v[76:79] \n"
" s_mov_b64 exec, %[s_execflag_3] \n"
_UK_ATOMIC_ADD_ " %[v_os_o3], v13, s[8:9] \n"
" s_mov_b64 exec, s[38:39] \n"
" s_waitcnt vmcnt(30) \n"
_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[64:65], v[160:161], v[64:67] \n"
_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[66:67], v[162:163], v[64:67] \n"
" buffer_load_dwordx4 acc[192:195], %[v_os_b4], s[12:15], 0 offen \n"
_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[68:69], v[164:165], v[64:67] \n"
_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[70:71], v[166:167], v[64:67] \n"
_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[72:73], v[168:169], v[64:67] \n"
_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[74:75], v[170:171], v[64:67] \n"
" buffer_load_dwordx4 acc[196:199], %[v_os_b4], s[12:15], 0 offen offset:1024 \n"
_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[76:77], v[172:173], v[64:67] \n"
_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[78:79], v[174:175], v[64:67] \n"
_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[64:65], v[224:225], v[68:71] \n"
_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[66:67], v[226:227], v[68:71] \n"
" buffer_load_dwordx4 acc[200:203], %[v_os_b4], s[12:15], 0 offen offset:2048 \n"
_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[68:69], v[228:229], v[68:71] \n"
_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[70:71], v[230:231], v[68:71] \n"
_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[72:73], v[232:233], v[68:71] \n"
_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[74:75], v[234:235], v[68:71] \n"
" buffer_load_dwordx4 acc[204:207], %[v_os_b4], s[12:15], 0 offen offset:3072 \n"
_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[76:77], v[236:237], v[68:71] \n"
_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[78:79], v[238:239], v[68:71] \n"
" s_mov_b64 exec, %[s_execflag_4] \n"
_UK_ATOMIC_ADD_ " %[v_os_o4], v14, s[8:9] \n"
" s_mov_b64 exec, s[38:39] \n"
_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[80:81], v[160:161], v[72:75] \n"
_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[82:83], v[162:163], v[72:75] \n"
" buffer_load_dwordx4 acc[208:211], %[v_os_b5], s[12:15], 0 offen \n"
_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[84:85], v[164:165], v[72:75] \n"
_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[86:87], v[166:167], v[72:75] \n"
_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[88:89], v[168:169], v[72:75] \n"
_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[90:91], v[170:171], v[72:75] \n"
" buffer_load_dwordx4 acc[212:215], %[v_os_b5], s[12:15], 0 offen offset:1024 \n"
_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[92:93], v[172:173], v[72:75] \n"
_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[94:95], v[174:175], v[72:75] \n"
_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[80:81], v[224:225], v[76:79] \n"
_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[82:83], v[226:227], v[76:79] \n"
" buffer_load_dwordx4 acc[216:219], %[v_os_b5], s[12:15], 0 offen offset:2048 \n"
_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[84:85], v[228:229], v[76:79] \n"
_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[86:87], v[230:231], v[76:79] \n"
_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[88:89], v[232:233], v[76:79] \n"
_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[90:91], v[234:235], v[76:79] \n"
" buffer_load_dwordx4 acc[220:223], %[v_os_b5], s[12:15], 0 offen offset:3072 \n"
_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[92:93], v[236:237], v[76:79] \n"
_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[94:95], v[238:239], v[76:79] \n"
" s_mov_b64 exec, %[s_execflag_5] \n"
_UK_ATOMIC_ADD_ " %[v_os_o5], v15, s[8:9] \n"
" s_mov_b64 exec, s[38:39] \n"
" s_waitcnt vmcnt(30) \n"
_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[96:97], v[176:177], v[64:67] \n"
_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[98:99], v[178:179], v[64:67] \n"
" buffer_load_dwordx4 acc[224:227], %[v_os_b6], s[12:15], 0 offen \n"
_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[100:101], v[180:181], v[64:67] \n"
_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[102:103], v[182:183], v[64:67] \n"
_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[104:105], v[184:185], v[64:67] \n"
_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[106:107], v[186:187], v[64:67] \n"
" buffer_load_dwordx4 acc[228:231], %[v_os_b6], s[12:15], 0 offen offset:1024 \n"
_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[108:109], v[188:189], v[64:67] \n"
_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[110:111], v[190:191], v[64:67] \n"
_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[96:97], v[240:241], v[68:71] \n"
_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[98:99], v[242:243], v[68:71] \n"
" buffer_load_dwordx4 acc[232:235], %[v_os_b6], s[12:15], 0 offen offset:2048 \n"
_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[100:101], v[244:245], v[68:71] \n"
_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[102:103], v[246:247], v[68:71] \n"
_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[104:105], v[248:249], v[68:71] \n"
_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[106:107], v[250:251], v[68:71] \n"
" buffer_load_dwordx4 acc[236:239], %[v_os_b6], s[12:15], 0 offen offset:3072 \n"
_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[108:109], v[252:253], v[68:71] \n"
_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[110:111], v[254:255], v[68:71] \n"
" s_mov_b64 exec, %[s_execflag_6] \n"
_UK_ATOMIC_ADD_ " %[v_os_o6], v16, s[8:9] \n"
" s_mov_b64 exec, s[38:39] \n"
_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[112:113], v[176:177], v[72:75] \n"
_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[114:115], v[178:179], v[72:75] \n"
" buffer_load_dwordx4 acc[240:243], %[v_os_b7], s[12:15], 0 offen \n"
_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[116:117], v[180:181], v[72:75] \n"
_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[118:119], v[182:183], v[72:75] \n"
_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[120:121], v[184:185], v[72:75] \n"
_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[122:123], v[186:187], v[72:75] \n"
" buffer_load_dwordx4 acc[244:247], %[v_os_b7], s[12:15], 0 offen offset:1024 \n"
_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[124:125], v[188:189], v[72:75] \n"
_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[126:127], v[190:191], v[72:75] \n"
_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[112:113], v[240:241], v[76:79] \n"
_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[114:115], v[242:243], v[76:79] \n"
" buffer_load_dwordx4 acc[248:251], %[v_os_b7], s[12:15], 0 offen offset:2048 \n"
_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[116:117], v[244:245], v[76:79] \n"
_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[118:119], v[246:247], v[76:79] \n"
_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[120:121], v[248:249], v[76:79] \n"
_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[122:123], v[250:251], v[76:79] \n"
" buffer_load_dwordx4 acc[252:255], %[v_os_b7], s[12:15], 0 offen offset:3072 \n"
_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[124:125], v[252:253], v[76:79] \n"
_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[126:127], v[254:255], v[76:79] \n"
" s_mov_b64 exec, %[s_execflag_7] \n"
_UK_ATOMIC_ADD_ " %[v_os_o7], v17, s[8:9] \n"
" s_mov_b64 exec, s[38:39] \n"
" s_add_u32 s60, 0x00000100, s80 \n"
" s_cmp_lt_u32 s60, %[s_loop_cnt] \n"
" s_cselect_b32 s56, %[s_tile_os_b], 0 \n"
" s_add_u32 s12, s56, s12 \n"
" s_addc_u32 s13, 0, s13 \n"
" s_cmp_ge_u32 s80, 0x00000100 \n"
" s_cselect_b32 s59, %[s_tile_os_o], s59 \n"
" s_add_u32 s8, s59, s8 \n"
" s_addc_u32 s9, 0, s9 \n"
" v_mul_f32 %[c0], %[scale_0], %[c0] \n"
" v_mul_f32 %[c1], %[scale_0], %[c1] \n"
" v_mul_f32 %[c2], %[scale_0], %[c2] \n"
" v_mul_f32 %[c3], %[scale_0], %[c3] \n"
" v_mul_f32 %[c4], %[scale_1], %[c4] \n"
" v_mul_f32 %[c5], %[scale_1], %[c5] \n"
" v_mul_f32 %[c6], %[scale_1], %[c6] \n"
" v_mul_f32 %[c7], %[scale_1], %[c7] \n"
" v_mul_f32 %[c8], %[scale_0], %[c8] \n"
" v_mul_f32 %[c9], %[scale_0], %[c9] \n"
" v_mul_f32 %[c10], %[scale_0], %[c10] \n"
" v_mul_f32 %[c11], %[scale_0], %[c11] \n"
" v_mul_f32 %[c12], %[scale_1], %[c12] \n"
" v_mul_f32 %[c13], %[scale_1], %[c13] \n"
" v_mul_f32 %[c14], %[scale_1], %[c14] \n"
" v_mul_f32 %[c15], %[scale_1], %[c15] \n"
_UK_PK_CVT_("%[c0]", "%[c1]", "%[c0]")
_UK_PK_CVT_("%[c2]", "%[c3]", "%[c1]")
_UK_PK_CVT_("%[c4]", "%[c5]", "%[c2]")
_UK_PK_CVT_("%[c6]", "%[c7]", "%[c3]")
_UK_PK_CVT_("%[c8]", "%[c9]", "%[c4]")
_UK_PK_CVT_("%[c10]", "%[c11]", "%[c5]")
_UK_PK_CVT_("%[c12]", "%[c13]", "%[c6]")
_UK_PK_CVT_("%[c14]", "%[c15]", "%[c7]")
" s_addk_i32 s80, 0x0080 \n"
" s_cmp_lt_i32 s80, %[s_loop_cnt] \n"
" s_cbranch_scc0 label_0EC1 \n"
" s_waitcnt vmcnt(30) & lgkmcnt(0) \n"
" s_barrier \n"
_UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[128:129], v[128:129], 0 \n"
" ds_read_b32 v10, %[v_sfl_sld] offset:25344 \n"
" ds_read_b32 v11, %[v_sfl_sld] offset:25376 \n"
" ds_write_b64 v3, v[64:65] offset:16640 \n"
_UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[130:131], v[130:131], v[80:83] \n"
" buffer_load_dwordx4 acc[0:3], %[v_os_b0], s[12:15], 0 offen \n"
" ds_write_b64 v3, v[66:67] offset:20992 \n"
_UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[132:133], v[132:133], v[80:83] \n"
" ds_read_b32 v12, %[v_sfl_sld] offset:25408 \n"
" ds_read_b32 v13, %[v_sfl_sld] offset:25440 \n"
" ds_write_b64 v3, v[68:69] offset:18816 \n"
_UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[134:135], v[134:135], v[80:83] \n"
" ds_write_b64 v3, v[70:71] offset:23168 \n"
_UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[136:137], v[136:137], v[80:83] \n"
" ds_read_b32 v14, %[v_sfl_sld] offset:29696 \n"
" ds_read_b32 v15, %[v_sfl_sld] offset:29728 \n"
_UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[138:139], v[138:139], v[80:83] \n"
" buffer_load_dwordx4 acc[4:7], %[v_os_b0], s[12:15], 0 offen offset:1024 \n"
_UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[140:141], v[140:141], v[80:83] \n"
" ds_read_b32 v16, %[v_sfl_sld] offset:29760 \n"
" ds_read_b32 v17, %[v_sfl_sld] offset:29792 \n"
_UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[142:143], v[142:143], v[80:83] \n"
_UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[128:129], v[192:193], 0 \n"
_UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[130:131], v[194:195], v[84:87] \n"
" buffer_load_dwordx4 acc[8:11], %[v_os_b0], s[12:15], 0 offen offset:2048 \n"
_UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[132:133], v[196:197], v[84:87] \n"
_UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[134:135], v[198:199], v[84:87] \n"
_UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[136:137], v[200:201], v[84:87] \n"
_UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[138:139], v[202:203], v[84:87] \n"
" buffer_load_dwordx4 acc[12:15], %[v_os_b0], s[12:15], 0 offen offset:3072 \n"
_UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[140:141], v[204:205], v[84:87] \n"
_UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[142:143], v[206:207], v[84:87] \n"
" s_waitcnt lgkmcnt(0) \n"
" s_mov_b64 exec, %[s_execflag_0] \n"
_UK_ATOMIC_ADD_ " %[v_os_o0], v10, s[8:9] \n"
" s_mov_b64 exec, s[38:39] \n"
_UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[144:145], v[128:129], 0 \n"
_UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[146:147], v[130:131], v[88:91] \n"
" buffer_load_dwordx4 acc[16:19], %[v_os_b1], s[12:15], 0 offen \n"
_UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[148:149], v[132:133], v[88:91] \n"
_UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[150:151], v[134:135], v[88:91] \n"
_UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[152:153], v[136:137], v[88:91] \n"
_UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[154:155], v[138:139], v[88:91] \n"
" buffer_load_dwordx4 acc[20:23], %[v_os_b1], s[12:15], 0 offen offset:1024 \n"
_UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[156:157], v[140:141], v[88:91] \n"
_UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[158:159], v[142:143], v[88:91] \n"
_UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[144:145], v[192:193], 0 \n"
_UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[146:147], v[194:195], v[92:95] \n"
" buffer_load_dwordx4 acc[24:27], %[v_os_b1], s[12:15], 0 offen offset:2048 \n"
_UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[148:149], v[196:197], v[92:95] \n"
_UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[150:151], v[198:199], v[92:95] \n"
_UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[152:153], v[200:201], v[92:95] \n"
_UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[154:155], v[202:203], v[92:95] \n"
" buffer_load_dwordx4 acc[28:31], %[v_os_b1], s[12:15], 0 offen offset:3072 \n"
_UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[156:157], v[204:205], v[92:95] \n"
_UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[158:159], v[206:207], v[92:95] \n"
" s_mov_b64 exec, %[s_execflag_1] \n"
_UK_ATOMIC_ADD_ " %[v_os_o1], v11, s[8:9] \n"
" s_mov_b64 exec, s[38:39] \n"
" s_waitcnt vmcnt(30) \n"
_UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[160:161], v[144:145], v[80:83] \n"
_UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[162:163], v[146:147], v[80:83] \n"
" buffer_load_dwordx4 acc[32:35], %[v_os_b2], s[12:15], 0 offen \n"
_UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[164:165], v[148:149], v[80:83] \n"
_UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[166:167], v[150:151], v[80:83] \n"
_UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[168:169], v[152:153], v[80:83] \n"
_UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[170:171], v[154:155], v[80:83] \n"
" buffer_load_dwordx4 acc[36:39], %[v_os_b2], s[12:15], 0 offen offset:1024 \n"
_UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[172:173], v[156:157], v[80:83] \n"
_UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[174:175], v[158:159], v[80:83] \n"
_UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[160:161], v[208:209], v[84:87] \n"
_UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[162:163], v[210:211], v[84:87] \n"
" buffer_load_dwordx4 acc[40:43], %[v_os_b2], s[12:15], 0 offen offset:2048 \n"
_UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[164:165], v[212:213], v[84:87] \n"
_UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[166:167], v[214:215], v[84:87] \n"
_UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[168:169], v[216:217], v[84:87] \n"
_UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[170:171], v[218:219], v[84:87] \n"
" buffer_load_dwordx4 acc[44:47], %[v_os_b2], s[12:15], 0 offen offset:3072 \n"
_UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[172:173], v[220:221], v[84:87] \n"
_UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[174:175], v[222:223], v[84:87] \n"
" s_mov_b64 exec, %[s_execflag_2] \n"
_UK_ATOMIC_ADD_ " %[v_os_o2], v12, s[8:9] \n"
" s_mov_b64 exec, s[38:39] \n"
_UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[176:177], v[144:145], v[88:91] \n"
_UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[178:179], v[146:147], v[88:91] \n"
" buffer_load_dwordx4 acc[48:51], %[v_os_b3], s[12:15], 0 offen \n"
_UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[180:181], v[148:149], v[88:91] \n"
_UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[182:183], v[150:151], v[88:91] \n"
_UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[184:185], v[152:153], v[88:91] \n"
_UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[186:187], v[154:155], v[88:91] \n"
" buffer_load_dwordx4 acc[52:55], %[v_os_b3], s[12:15], 0 offen offset:1024 \n"
_UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[188:189], v[156:157], v[88:91] \n"
_UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[190:191], v[158:159], v[88:91] \n"
_UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[176:177], v[208:209], v[92:95] \n"
_UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[178:179], v[210:211], v[92:95] \n"
" buffer_load_dwordx4 acc[56:59], %[v_os_b3], s[12:15], 0 offen offset:2048 \n"
_UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[180:181], v[212:213], v[92:95] \n"
_UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[182:183], v[214:215], v[92:95] \n"
_UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[184:185], v[216:217], v[92:95] \n"
_UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[186:187], v[218:219], v[92:95] \n"
" buffer_load_dwordx4 acc[60:63], %[v_os_b3], s[12:15], 0 offen offset:3072 \n"
_UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[188:189], v[220:221], v[92:95] \n"
_UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[190:191], v[222:223], v[92:95] \n"
" s_mov_b64 exec, %[s_execflag_3] \n"
_UK_ATOMIC_ADD_ " %[v_os_o3], v13, s[8:9] \n"
" s_mov_b64 exec, s[38:39] \n"
" s_waitcnt vmcnt(30) \n"
_UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[192:193], v[160:161], v[80:83] \n"
_UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[194:195], v[162:163], v[80:83] \n"
" buffer_load_dwordx4 acc[64:67], %[v_os_b4], s[12:15], 0 offen \n"
_UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[196:197], v[164:165], v[80:83] \n"
_UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[198:199], v[166:167], v[80:83] \n"
_UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[200:201], v[168:169], v[80:83] \n"
_UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[202:203], v[170:171], v[80:83] \n"
" buffer_load_dwordx4 acc[68:71], %[v_os_b4], s[12:15], 0 offen offset:1024 \n"
_UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[204:205], v[172:173], v[80:83] \n"
_UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[206:207], v[174:175], v[80:83] \n"
_UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[192:193], v[224:225], v[84:87] \n"
_UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[194:195], v[226:227], v[84:87] \n"
" buffer_load_dwordx4 acc[72:75], %[v_os_b4], s[12:15], 0 offen offset:2048 \n"
_UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[196:197], v[228:229], v[84:87] \n"
_UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[198:199], v[230:231], v[84:87] \n"
_UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[200:201], v[232:233], v[84:87] \n"
_UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[202:203], v[234:235], v[84:87] \n"
" buffer_load_dwordx4 acc[76:79], %[v_os_b4], s[12:15], 0 offen offset:3072 \n"
_UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[204:205], v[236:237], v[84:87] \n"
_UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[206:207], v[238:239], v[84:87] \n"
" s_mov_b64 exec, %[s_execflag_4] \n"
_UK_ATOMIC_ADD_ " %[v_os_o4], v14, s[8:9] \n"
" s_mov_b64 exec, s[38:39] \n"
_UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[208:209], v[160:161], v[88:91] \n"
_UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[210:211], v[162:163], v[88:91] \n"
" buffer_load_dwordx4 acc[80:83], %[v_os_b5], s[12:15], 0 offen \n"
_UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[212:213], v[164:165], v[88:91] \n"
_UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[214:215], v[166:167], v[88:91] \n"
_UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[216:217], v[168:169], v[88:91] \n"
_UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[218:219], v[170:171], v[88:91] \n"
" buffer_load_dwordx4 acc[84:87], %[v_os_b5], s[12:15], 0 offen offset:1024 \n"
_UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[220:221], v[172:173], v[88:91] \n"
_UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[222:223], v[174:175], v[88:91] \n"
_UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[208:209], v[224:225], v[92:95] \n"
_UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[210:211], v[226:227], v[92:95] \n"
" buffer_load_dwordx4 acc[88:91], %[v_os_b5], s[12:15], 0 offen offset:2048 \n"
_UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[212:213], v[228:229], v[92:95] \n"
_UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[214:215], v[230:231], v[92:95] \n"
_UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[216:217], v[232:233], v[92:95] \n"
_UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[218:219], v[234:235], v[92:95] \n"
" buffer_load_dwordx4 acc[92:95], %[v_os_b5], s[12:15], 0 offen offset:3072 \n"
_UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[220:221], v[236:237], v[92:95] \n"
_UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[222:223], v[238:239], v[92:95] \n"
" s_mov_b64 exec, %[s_execflag_5] \n"
_UK_ATOMIC_ADD_ " %[v_os_o5], v15, s[8:9] \n"
" s_mov_b64 exec, s[38:39] \n"
" s_waitcnt vmcnt(30) \n"
_UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[224:225], v[176:177], v[80:83] \n"
_UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[226:227], v[178:179], v[80:83] \n"
" buffer_load_dwordx4 acc[96:99], %[v_os_b6], s[12:15], 0 offen \n"
_UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[228:229], v[180:181], v[80:83] \n"
_UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[230:231], v[182:183], v[80:83] \n"
_UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[232:233], v[184:185], v[80:83] \n"
_UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[234:235], v[186:187], v[80:83] \n"
" buffer_load_dwordx4 acc[100:103], %[v_os_b6], s[12:15], 0 offen offset:1024 \n"
_UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[236:237], v[188:189], v[80:83] \n"
_UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[238:239], v[190:191], v[80:83] \n"
_UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[224:225], v[240:241], v[84:87] \n"
_UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[226:227], v[242:243], v[84:87] \n"
" buffer_load_dwordx4 acc[104:107], %[v_os_b6], s[12:15], 0 offen offset:2048 \n"
_UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[228:229], v[244:245], v[84:87] \n"
_UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[230:231], v[246:247], v[84:87] \n"
_UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[232:233], v[248:249], v[84:87] \n"
_UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[234:235], v[250:251], v[84:87] \n"
" buffer_load_dwordx4 acc[108:111], %[v_os_b6], s[12:15], 0 offen offset:3072 \n"
_UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[236:237], v[252:253], v[84:87] \n"
_UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[238:239], v[254:255], v[84:87] \n"
" s_mov_b64 exec, %[s_execflag_6] \n"
_UK_ATOMIC_ADD_ " %[v_os_o6], v16, s[8:9] \n"
" s_mov_b64 exec, s[38:39] \n"
_UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[240:241], v[176:177], v[88:91] \n"
_UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[242:243], v[178:179], v[88:91] \n"
" buffer_load_dwordx4 acc[112:115], %[v_os_b7], s[12:15], 0 offen \n"
_UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[244:245], v[180:181], v[88:91] \n"
_UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[246:247], v[182:183], v[88:91] \n"
_UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[248:249], v[184:185], v[88:91] \n"
_UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[250:251], v[186:187], v[88:91] \n"
" buffer_load_dwordx4 acc[116:119], %[v_os_b7], s[12:15], 0 offen offset:1024 \n"
_UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[252:253], v[188:189], v[88:91] \n"
_UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[254:255], v[190:191], v[88:91] \n"
_UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[240:241], v[240:241], v[92:95] \n"
_UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[242:243], v[242:243], v[92:95] \n"
" buffer_load_dwordx4 acc[120:123], %[v_os_b7], s[12:15], 0 offen offset:2048 \n"
_UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[244:245], v[244:245], v[92:95] \n"
_UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[246:247], v[246:247], v[92:95] \n"
_UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[248:249], v[248:249], v[92:95] \n"
_UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[250:251], v[250:251], v[92:95] \n"
" buffer_load_dwordx4 acc[124:127], %[v_os_b7], s[12:15], 0 offen offset:3072 \n"
_UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[252:253], v[252:253], v[92:95] \n"
_UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[254:255], v[254:255], v[92:95] \n"
" s_mov_b64 exec, %[s_execflag_7] \n"
_UK_ATOMIC_ADD_ " %[v_os_o7], v17, s[8:9] \n"
" s_mov_b64 exec, s[38:39] \n"
" s_add_u32 s60, 0x00000100, s80 \n"
" s_cmp_lt_u32 s60, %[s_loop_cnt] \n"
" s_cselect_b32 s56, s56, 0 \n"
" s_add_u32 s12, s56, s12 \n"
" s_addc_u32 s13, 0, s13 \n"
" s_cmp_ge_u32 s80, 0x00000100 \n"
" s_cselect_b32 s59, 0x00000100, s59 \n"
" s_add_u32 s8, s59, s8 \n"
" s_addc_u32 s9, 0, s9 \n"
" v_mul_f32 %[c16], %[scale_0], %[c16] \n"
" v_mul_f32 %[c17], %[scale_0], %[c17] \n"
" v_mul_f32 %[c18], %[scale_0], %[c18] \n"
" v_mul_f32 %[c19], %[scale_0], %[c19] \n"
" v_mul_f32 %[c20], %[scale_1], %[c20] \n"
" v_mul_f32 %[c21], %[scale_1], %[c21] \n"
" v_mul_f32 %[c22], %[scale_1], %[c22] \n"
" v_mul_f32 %[c23], %[scale_1], %[c23] \n"
" v_mul_f32 %[c24], %[scale_0], %[c24] \n"
" v_mul_f32 %[c25], %[scale_0], %[c25] \n"
" v_mul_f32 %[c26], %[scale_0], %[c26] \n"
" v_mul_f32 %[c27], %[scale_0], %[c27] \n"
" v_mul_f32 %[c28], %[scale_1], %[c28] \n"
" v_mul_f32 %[c29], %[scale_1], %[c29] \n"
" v_mul_f32 %[c30], %[scale_1], %[c30] \n"
" v_mul_f32 %[c31], %[scale_1], %[c31] \n"
_UK_PK_CVT_("%[c16]", "%[c17]", "%[c16]")
_UK_PK_CVT_("%[c18]", "%[c19]", "%[c17]")
_UK_PK_CVT_("%[c20]", "%[c21]", "%[c18]")
_UK_PK_CVT_("%[c22]", "%[c23]", "%[c19]")
_UK_PK_CVT_("%[c24]", "%[c25]", "%[c20]")
_UK_PK_CVT_("%[c26]", "%[c27]", "%[c21]")
_UK_PK_CVT_("%[c28]", "%[c29]", "%[c22]")
_UK_PK_CVT_("%[c30]", "%[c31]", "%[c23]")
" s_addk_i32 s80, 0x0080 \n"
" s_cmp_lt_i32 s80, %[s_loop_cnt] \n"
" s_cbranch_scc0 label_0EC1 \n"
" s_branch label_0AA6 \n"
" label_0EC1: \n"
" s_waitcnt lgkmcnt(0) \n"
" s_barrier \n"
" ds_read_b32 v10, %[v_sfl_sld] offset:16640 \n"
" ds_read_b32 v11, %[v_sfl_sld] offset:16672 \n"
" ds_read_b32 v12, %[v_sfl_sld] offset:16704 \n"
" ds_read_b32 v13, %[v_sfl_sld] offset:16736 \n"
" ds_read_b32 v14, %[v_sfl_sld] offset:20992 \n"
" ds_read_b32 v15, %[v_sfl_sld] offset:21024 \n"
" ds_read_b32 v16, %[v_sfl_sld] offset:21056 \n"
" ds_read_b32 v17, %[v_sfl_sld] offset:21088 \n"
" s_waitcnt lgkmcnt(0) \n"
" s_mov_b64 exec, %[s_execflag_0] \n"
_UK_ATOMIC_ADD_ " %[v_os_o0], v10, s[8:9] \n"
" s_mov_b64 exec, %[s_execflag_1] \n"
_UK_ATOMIC_ADD_ " %[v_os_o1], v11, s[8:9] \n"
" s_mov_b64 exec, %[s_execflag_2] \n"
_UK_ATOMIC_ADD_ " %[v_os_o2], v12, s[8:9] \n"
" s_mov_b64 exec, %[s_execflag_3] \n"
_UK_ATOMIC_ADD_ " %[v_os_o3], v13, s[8:9] \n"
" s_mov_b64 exec, %[s_execflag_4] \n"
_UK_ATOMIC_ADD_ " %[v_os_o4], v14, s[8:9] \n"
" s_mov_b64 exec, %[s_execflag_5] \n"
_UK_ATOMIC_ADD_ " %[v_os_o5], v15, s[8:9] \n"
" s_mov_b64 exec, %[s_execflag_6] \n"
_UK_ATOMIC_ADD_ " %[v_os_o6], v16, s[8:9] \n"
" s_mov_b64 exec, %[s_execflag_7] \n"
_UK_ATOMIC_ADD_ " %[v_os_o7], v17, s[8:9] \n"
" s_mov_b64 exec, s[38:39] \n"
" s_add_u32 s8, s59, s8 \n"
" s_addc_u32 s9, 0, s9 \n"
" ds_write_b64 %[v_sfl_sst], [%[c16],%[c17]] offset:25344 \n"
" ds_write_b64 %[v_sfl_sst], [%[c18],%[c19]] offset:29696 \n"
" ds_write_b64 %[v_sfl_sst], [%[c20],%[c21]] offset:27520 \n"
" ds_write_b64 %[v_sfl_sst], [%[c22],%[c23]] offset:31872 \n"
" s_waitcnt lgkmcnt(0) \n"
" s_barrier \n"
" ds_read_b32 v10, %[v_sfl_sld] offset:25344 \n"
" ds_read_b32 v11, %[v_sfl_sld] offset:25376 \n"
" ds_read_b32 v12, %[v_sfl_sld] offset:25408 \n"
" ds_read_b32 v13, %[v_sfl_sld] offset:25440 \n"
" ds_read_b32 v14, %[v_sfl_sld] offset:29696 \n"
" ds_read_b32 v15, %[v_sfl_sld] offset:29728 \n"
" ds_read_b32 v16, %[v_sfl_sld] offset:29760 \n"
" ds_read_b32 v17, %[v_sfl_sld] offset:29792 \n"
" s_waitcnt lgkmcnt(0) \n"
" s_mov_b64 exec, %[s_execflag_0] \n"
_UK_ATOMIC_ADD_ " %[v_os_o0], v10, s[8:9] \n"
" s_mov_b64 exec, %[s_execflag_1] \n"
_UK_ATOMIC_ADD_ " %[v_os_o1], v11, s[8:9] \n"
" s_mov_b64 exec, %[s_execflag_2] \n"
_UK_ATOMIC_ADD_ " %[v_os_o2], v12, s[8:9] \n"
" s_mov_b64 exec, %[s_execflag_3] \n"
_UK_ATOMIC_ADD_ " %[v_os_o3], v13, s[8:9] \n"
" s_mov_b64 exec, %[s_execflag_4] \n"
_UK_ATOMIC_ADD_ " %[v_os_o4], v14, s[8:9] \n"
" s_mov_b64 exec, %[s_execflag_5] \n"
_UK_ATOMIC_ADD_ " %[v_os_o5], v15, s[8:9] \n"
" s_mov_b64 exec, %[s_execflag_6] \n"
_UK_ATOMIC_ADD_ " %[v_os_o6], v16, s[8:9] \n"
" s_mov_b64 exec, %[s_execflag_7] \n"
_UK_ATOMIC_ADD_ " %[v_os_o7], v17, s[8:9] \n"
" s_mov_b64 exec, s[38:39] \n"
#undef _UK_MFMA_
#undef _UK_PK_CVT_
#undef _UK_ATOMIC_ADD_
......@@ -998,14 +998,14 @@ struct FmhaFwdKernel
return pad_tensor_view(
q_dram_naive,
make_tuple(number<FmhaPipeline::kM0>{}, number<FmhaPipeline::kSubQKHeaddim>{}),
sequence<false, kPadHeadDimQ>{});
sequence<kPadSeqLenQ, kPadHeadDimQ>{});
}
else
{
return pad_tensor_view(
q_dram_naive,
make_tuple(number<FmhaPipeline::kM0>{}, number<FmhaPipeline::kK0>{}),
sequence<false, kPadHeadDimQ>{});
sequence<kPadSeqLenQ, kPadHeadDimQ>{});
}
}();
const auto k_dram = [&]() {
......@@ -1019,7 +1019,7 @@ struct FmhaFwdKernel
return pad_tensor_view(
k_dram_naive,
make_tuple(number<FmhaPipeline::kN0>{}, number<FmhaPipeline::kK0>{}),
sequence<false, kPadHeadDimQ>{});
sequence<kPadSeqLenK, kPadHeadDimQ>{});
}();
const auto v_dram = [&]() {
if constexpr(std::is_same_v<VLayout, ck_tile::tensor_layout::gemm::RowMajor>)
......@@ -1041,7 +1041,7 @@ struct FmhaFwdKernel
return pad_tensor_view(
v_dram_transposed,
make_tuple(number<FmhaPipeline::kN1>{}, number<FmhaPipeline::kK1>{}),
sequence<kPadHeadDimV, false>{});
sequence<kPadHeadDimV, kPadSeqLenK>{});
}
else
{
......@@ -1055,7 +1055,7 @@ struct FmhaFwdKernel
return pad_tensor_view(
v_dram_naive,
make_tuple(number<FmhaPipeline::kN1>{}, number<FmhaPipeline::kK1>{}),
sequence<false, kPadSeqLenK>{});
sequence<kPadHeadDimV, kPadSeqLenK>{});
}
}();
......@@ -1097,8 +1097,9 @@ struct FmhaFwdKernel
number<FmhaPipeline::kAlignmentBias>{},
number<1>{});
return pad_tensor_view(
bias_dram_naive, bias_dram_window_lengths, sequence<false, kPadSeqLenK>{});
return pad_tensor_view(bias_dram_naive,
bias_dram_window_lengths,
sequence<kPadSeqLenQ, kPadSeqLenK>{});
}();
return make_tile_window(bias_dram, bias_dram_window_lengths, {i_m0, 0});
......
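The FmhaFwdKernel hunks above replace hard-coded `false` entries with the kernel's own pad flags, so the sequence-length dimensions of Q, K, V, and the bias are padded whenever `kPadSeqLenQ` / `kPadSeqLenK` are set, instead of never. What padding a view dimension to a tile boundary means, as a standalone sketch (names assumed):

```cpp
#include <cstdint>

// Standalone sketch: the effective length of a padded view dimension,
// e.g. pad_to_tile(seqlen_q, kM0, kPadSeqLenQ) for the Q rows above.
constexpr std::int64_t pad_to_tile(std::int64_t len, std::int64_t tile, bool pad)
{
    return pad ? ((len + tile - 1) / tile) * tile : len;
}

static_assert(pad_to_tile(100, 64, true) == 128);  // rounded up to whole tiles
static_assert(pad_to_tile(100, 64, false) == 100); // unpadded: must divide evenly
```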
......@@ -810,21 +810,46 @@ struct FusedMoeGemmPipelineFlatmmPolicy
CK_TILE_HOST_DEVICE static constexpr auto GetUK_1()
{
using S_ = typename Problem::BlockShape;
using T_ = typename Problem::Traits;
if constexpr(std::is_same_v<typename Problem::YDataType, ck_tile::bf16_t> &&
std::is_same_v<typename Problem::DDataType, ck_tile::bf16_t> &&
std::is_same_v<typename Problem::TopkWeightDataType, float> &&
S_::Block_M1 == 32 && S_::Block_N1 == 128 && S_::Block_K1 == 512 &&
S_::Warp_M0 == 16 && S_::Warp_N0 == 16 && S_::Warp_K0 == 32)
S_::Warp_M0 == 16 && S_::Warp_N0 == 16 && S_::Warp_K0 == 32 &&
T_::PipeInterleave == false)
{
return FlatmmSn_32x128x512_1x4x1_16x16x32_BF16{};
// return FlatmmSn_32x128x512_1x4x1_16x16x32_BF16_itl{};
}
else if constexpr(std::is_same_v<typename Problem::YDataType, ck_tile::fp16_t> &&
std::is_same_v<typename Problem::DDataType, ck_tile::fp16_t> &&
std::is_same_v<typename Problem::TopkWeightDataType, float> &&
S_::Block_M1 == 32 && S_::Block_N1 == 128 && S_::Block_K1 == 512 &&
S_::Warp_M0 == 16 && S_::Warp_N0 == 16 && S_::Warp_K0 == 32)
S_::Warp_M0 == 16 && S_::Warp_N0 == 16 && S_::Warp_K0 == 32 &&
T_::PipeInterleave == false)
{
return FlatmmSn_32x128x512_1x4x1_16x16x32_FP16{};
// return FlatmmSn_32x128x512_1x4x1_16x16x32_FP16_itl{};
}
else if constexpr(std::is_same_v<typename Problem::YDataType, ck_tile::bf16_t> &&
std::is_same_v<typename Problem::DDataType, ck_tile::bf16_t> &&
std::is_same_v<typename Problem::TopkWeightDataType, float> &&
S_::Block_M1 == 32 && S_::Block_N1 == 128 && S_::Block_K1 == 512 &&
S_::Warp_M0 == 16 && S_::Warp_N0 == 16 && S_::Warp_K0 == 32 &&
T_::PipeInterleave == true)
{
// return FlatmmSn_32x128x512_1x4x1_16x16x32_FP16{};
return FlatmmSn_32x128x512_1x4x1_16x16x32_BF16_itl{};
}
else if constexpr(std::is_same_v<typename Problem::YDataType, ck_tile::fp16_t> &&
std::is_same_v<typename Problem::DDataType, ck_tile::fp16_t> &&
std::is_same_v<typename Problem::TopkWeightDataType, float> &&
S_::Block_M1 == 32 && S_::Block_N1 == 128 && S_::Block_K1 == 512 &&
S_::Warp_M0 == 16 && S_::Warp_N0 == 16 && S_::Warp_K0 == 32 &&
T_::PipeInterleave == true)
{
// return FlatmmSn_32x128x512_1x4x1_16x16x32_FP16{};
return FlatmmSn_32x128x512_1x4x1_16x16x32_FP16_itl{};
}
}
};
......
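`GetUK_1` now also keys its `if constexpr` dispatch on `T_::PipeInterleave`, returning the `_itl` micro-kernel variants when the trait is set. A self-contained miniature of the same compile-time selection (type names are stand-ins):

```cpp
// Miniature of the dispatch style above: the same tile shape resolves to
// either the plain or the interleaved micro-kernel purely from a trait.
struct PlainUK {};
struct InterleavedUK {};

template <typename Traits>
constexpr auto pick_uk()
{
    if constexpr(Traits::PipeInterleave)
        return InterleavedUK{};
    else
        return PlainUK{};
}
```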
......@@ -22,7 +22,8 @@ template <bool IsGateOnly_,
FusedMoeGemmWeightPermuteEnum PermuteEnum_ =
FusedMoeGemmWeightPermuteEnum::b_nr_kr_waveflatten,
bool PadHiddenSize_ = false,
bool PadIntermediateSize_ = false>
bool PadIntermediateSize_ = false,
bool PipeInterleave_ = true>
struct FusedMoeGemmTraits
{
// Gate+Up or Gate only
......@@ -32,6 +33,7 @@ struct FusedMoeGemmTraits
static constexpr FusedMoeGemmWeightPermuteEnum PermuteEnum = PermuteEnum_;
static constexpr bool PadHiddenSize = PadHiddenSize_;
static constexpr bool PadIntermediateSize = PadIntermediateSize_;
static constexpr bool PipeInterleave = PipeInterleave_;
};
// Note: this needs to be a bit mask
......
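Because `PipeInterleave_` is appended with a default of `true`, existing `FusedMoeGemmTraits` instantiations stay source-compatible and silently opt into the interleaved pipeline; passing `false` selects the non-interleaved micro-kernels in `GetUK_1` above. A self-contained miniature of the pattern:

```cpp
// Miniature of a defaulted trailing trait flag: old instantiations keep
// compiling unchanged, and the new knob is opt-out.
template <bool IsGateOnly_, bool PipeInterleave_ = true>
struct MiniTraits
{
    static constexpr bool IsGateOnly     = IsGateOnly_;
    static constexpr bool PipeInterleave = PipeInterleave_;
};

static_assert(MiniTraits<true>::PipeInterleave);         // default: interleaved
static_assert(!MiniTraits<true, false>::PipeInterleave); // explicit opt-out
```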
......@@ -23,10 +23,10 @@
#include "ck_tile/ops/gemm/block/block_gemm_asmem_bsmem_creg_v1_default_policy.hpp"
#include "ck_tile/ops/gemm/block/block_gemm_problem.hpp"
#include "ck_tile/ops/gemm/block/block_universal_gemm_as_bs_cr.hpp"
#include "ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp"
#include "ck_tile/ops/gemm/kernel/gemm_kernel.hpp"
#include "ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp"
#include "ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp"
#include "ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp"
#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_base.hpp"
#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v3.hpp"
#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp"
......
......@@ -66,6 +66,79 @@ struct GemmKernel
return max(GemmPipeline::GetSmemSize(), EpiloguePipeline::GetSmemSize());
}
CK_TILE_HOST static bool IsSupportedArgument(const GemmCommonKargs& kargs)
{
if constexpr(std::is_same_v<ALayout, tensor_layout::gemm::RowMajor>)
{
if(kargs.K % TilePartitioner::kK != 0 && GemmPipeline::kPadK == false)
{
return false;
}
if(kargs.K % GemmPipeline::VectorSizeA != 0)
{
return false;
}
}
else
{
if(kargs.M % TilePartitioner::kM != 0 && GemmPipeline::kPadM == false)
{
return false;
}
if(kargs.M % GemmPipeline::VectorSizeA != 0)
{
return false;
}
}
if constexpr(std::is_same_v<BLayout, tensor_layout::gemm::RowMajor>)
{
if(kargs.N % TilePartitioner::kN != 0 && GemmPipeline::kPadN == false)
{
return false;
}
if(kargs.N % GemmPipeline::VectorSizeB != 0)
{
return false;
}
}
else
{
if(kargs.K % TilePartitioner::kK != 0 && GemmPipeline::kPadK == false)
{
return false;
}
if(kargs.K % GemmPipeline::VectorSizeB != 0)
{
return false;
}
}
if constexpr(std::is_same_v<CLayout, tensor_layout::gemm::RowMajor>)
{
if(kargs.N % TilePartitioner::kN != 0 && GemmPipeline::kPadN == false)
{
return false;
}
if(kargs.N % GemmPipeline::VectorSizeC != 0)
{
return false;
}
}
else
{
if(kargs.M % TilePartitioner::kM != 0 && GemmPipeline::kPadM == false)
{
return false;
}
if(kargs.M % GemmPipeline::VectorSizeC != 0)
{
return false;
}
}
return true;
}
CK_TILE_DEVICE void operator()(GemmCommonKargs kargs) const
{
const auto [i_m, i_n] = TilePartitioner{}();
......
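The new `IsSupportedArgument` applies one predicate per layout-dependent dimension: the length must be tile-divisible unless the pipeline pads that dimension, and it must always be divisible by the vectorized access width. Distilled into a standalone check (names assumed):

```cpp
#include <cstdint>

// Standalone distillation of the per-dimension check above.
constexpr bool dim_supported(std::int64_t len, std::int64_t tile, bool pad,
                             std::int64_t vector_size)
{
    if(len % tile != 0 && !pad)
        return false;              // not tile-divisible and padding disabled
    return len % vector_size == 0; // vector accesses must stay aligned
}

static_assert(dim_supported(4096, 128, false, 8));  // tile-divisible
static_assert(!dim_supported(4000, 128, false, 8)); // needs padding
static_assert(dim_supported(4000, 128, true, 8));   // padded and vector-aligned
```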
......@@ -19,7 +19,8 @@ struct SmoothquantHostArgs
index_t m;
index_t n;
index_t stride; // row_stride
index_t x_stride; // input row_stride
index_t y_stride; // output row_stride
};
// TODO: Extract some type to wrapper class
......@@ -58,14 +59,21 @@ struct Smoothquant
index_t m;
index_t n;
index_t stride; // row_stride
index_t x_stride; // input row_stride
index_t y_stride; // output row_stride
};
using Hargs = SmoothquantHostArgs;
CK_TILE_HOST static constexpr Kargs MakeKargs(const Hargs& hargs)
{
return Kargs{
hargs.p_x, hargs.p_xscale, hargs.p_yscale, hargs.p_qy, hargs.m, hargs.n, hargs.stride};
return Kargs{hargs.p_x,
hargs.p_xscale,
hargs.p_yscale,
hargs.p_qy,
hargs.m,
hargs.n,
hargs.x_stride,
hargs.y_stride};
}
CK_TILE_HOST static constexpr auto GridSize(const Hargs& hargs)
......@@ -116,7 +124,7 @@ struct Smoothquant
const auto tmp_ = make_naive_tensor_view<address_space_enum::global>(
static_cast<const XDataType*>(kargs.p_x),
make_tuple(kargs.m, kargs.n),
make_tuple(kargs.stride, 1),
make_tuple(kargs.x_stride, 1),
number<Vector_N>{},
number<1>{});
......@@ -157,7 +165,7 @@ struct Smoothquant
auto tmp_ = make_naive_tensor_view<address_space_enum::global>(
static_cast<QYDataType*>(kargs.p_qy),
make_tuple(kargs.m, kargs.n),
make_tuple(kargs.stride, 1),
make_tuple(kargs.y_stride, 1),
number<Vector_N>{},
number<1>{});
......
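With the stride split, the input and output views no longer have to share one row pitch: `x_stride` lays out reads of `p_x`, while `y_stride` lays out writes of `p_qy`. A minimal host-side sketch of what that buys (pure illustration, not the ck_tile kernel):

```cpp
#include <cstddef>

// Minimal sketch: a row-wise op reading from one padded buffer and writing
// to a differently padded one, as the split x_stride / y_stride allows.
template <typename In, typename Out, typename F>
void for_each_row(const In* x, Out* y, std::size_t m, std::size_t n,
                  std::size_t x_stride, std::size_t y_stride, F f)
{
    for(std::size_t i = 0; i < m; ++i)
        for(std::size_t j = 0; j < n; ++j)
            y[i * y_stride + j] = f(x[i * x_stride + j]);
}
```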
# reference
This folder contains reference implementations of specific ops. Note that by including a specific header you are pulling the implementation (especially the GPU implementation) into your source code and compiling that kernel into the fatbin, which may increase your kernel object code size. Headers starting with `reference_` are CPU reference implementations; headers starting with `naive_` contain a GPU implementation with a small launcher.
TODO: move `host/reference` under this folder
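A hypothetical illustration of the naming convention (header names assumed, not taken from this repo):

```cpp
#include "reference_softmax.hpp" // CPU reference only: no device code emitted
#include "naive_softmax.hpp"     // GPU kernel + small launcher: grows the fatbin
```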