Merge branch 'gfx950' into lwpck-2619

efab74a3 · Rostyslav Geyyer · 86950b3a · bcef33c1 · efab74a3 · efab74a3
Commit efab74a3 authored Jan 24, 2025 by Rostyslav Geyyer
20 changed files
--- a/example/01_gemm/run_gemm_example.inc
+++ b/example/01_gemm/run_gemm_example.inc
@@ -5,88 +5,6 @@
 #include "ck/tensor_operation/gpu/device/device_gemm_streamk.hpp"
-template <typename DataType>
-inline __host__ __device__ constexpr double get_rtol()
-{
-    if constexpr(std::is_same_v<DataType, float>)
-    {
-        return 1e-3;
-    }
-    else if constexpr(std::is_same_v<DataType, double>)
-    {
-        return 1e-6;
-    }
-    else if constexpr(std::is_same_v<DataType, ck::half_t>)
-    {
-        return 1e-3;
-    }
-    else if constexpr(std::is_same_v<DataType, ck::bhalf_t>)
-    {
-        return 5e-2;
-    }
-    else if constexpr(std::is_same_v<DataType, int32_t>)
-    {
-        return 1e-1;
-    }
-    else if constexpr(std::is_same_v<DataType, int8_t>)
-    {
-        return 1e-1;
-    }
-    else if constexpr(std::is_same_v<DataType, ck::f8_t>)
-    {
-        return 2e-1;
-    }
-    else if constexpr(std::is_same_v<DataType, ck::bf8_t>)
-    {
-        return 2e-1;
-    }
-    else
-    {
-        return 1e-3;
-    }
-}
-template <typename DataType>
-inline __host__ __device__ constexpr double get_atol()
-{
-    if constexpr(std::is_same_v<DataType, float>)
-    {
-        return 1e-3;
-    }
-    else if constexpr(std::is_same_v<DataType, double>)
-    {
-        return 1e-6;
-    }
-    else if constexpr(std::is_same_v<DataType, ck::half_t>)
-    {
-        return 1e-3;
-    }
-    else if constexpr(std::is_same_v<DataType, ck::bhalf_t>)
-    {
-        return 5e-2;
-    }
-    else if constexpr(std::is_same_v<DataType, int32_t>)
-    {
-        return 1e-1;
-    }
-    else if constexpr(std::is_same_v<DataType, int8_t>)
-    {
-        return 1e-1;
-    }
-    else if constexpr(std::is_same_v<DataType, ck::f8_t>)
-    {
-        return 2e-1;
-    }
-    else if constexpr(std::is_same_v<DataType, ck::bf8_t>)
-    {
-        return 2e-1;
-    }
-    else
-    {
-        return 1e-3;
-    }
-}
 template <typename ProblemType>
 bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
 {

--- a/example/01_gemm/run_gemm_example_streamk_v2.inc
+++ b/example/01_gemm/run_gemm_example_streamk_v2.inc
@@ -3,88 +3,6 @@
 #pragma once
-template <typename DataType>
-inline __host__ __device__ constexpr double get_rtol()
-{
-    if constexpr(std::is_same_v<DataType, float>)
-    {
-        return 1e-3;
-    }
-    else if constexpr(std::is_same_v<DataType, double>)
-    {
-        return 1e-6;
-    }
-    else if constexpr(std::is_same_v<DataType, ck::half_t>)
-    {
-        return 1e-3;
-    }
-    else if constexpr(std::is_same_v<DataType, ck::bhalf_t>)
-    {
-        return 5e-2;
-    }
-    else if constexpr(std::is_same_v<DataType, int32_t>)
-    {
-        return 1e-1;
-    }
-    else if constexpr(std::is_same_v<DataType, int8_t>)
-    {
-        return 1e-1;
-    }
-    else if constexpr(std::is_same_v<DataType, ck::f8_t>)
-    {
-        return 1e-1; // 240 and 224 are acceptable
-    }
-    else if constexpr(std::is_same_v<DataType, ck::bf8_t>)
-    {
-        return 1.5e-1; // 57344 and 49152 are acceptable
-    }
-    else
-    {
-        return 1e-3;
-    }
-}
-template <typename DataType>
-inline __host__ __device__ constexpr double get_atol()
-{
-    if constexpr(std::is_same_v<DataType, float>)
-    {
-        return 1e-3;
-    }
-    else if constexpr(std::is_same_v<DataType, double>)
-    {
-        return 1e-6;
-    }
-    else if constexpr(std::is_same_v<DataType, ck::half_t>)
-    {
-        return 1e-3;
-    }
-    else if constexpr(std::is_same_v<DataType, ck::bhalf_t>)
-    {
-        return 5e-2;
-    }
-    else if constexpr(std::is_same_v<DataType, int32_t>)
-    {
-        return 1e-1;
-    }
-    else if constexpr(std::is_same_v<DataType, int8_t>)
-    {
-        return 1e-1;
-    }
-    else if constexpr(std::is_same_v<DataType, ck::f8_t>)
-    {
-        return 16.1; // 240 and 224 are acceptable
-    }
-    else if constexpr(std::is_same_v<DataType, ck::bf8_t>)
-    {
-        return 8192.1; // 57344 and 49152 are acceptable
-    }
-    else
-    {
-        return 1e-3;
-    }
-}
 template <typename ProblemType>
 bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
 {

--- a/example/01_gemm/run_gemm_example_v2.inc
+++ b/example/01_gemm/run_gemm_example_v2.inc
@@ -3,88 +3,6 @@
 #pragma once
-template <typename DataType>
-inline __host__ __device__ constexpr double get_rtol()
-{
-    if constexpr(std::is_same_v<DataType, float>)
-    {
-        return 1e-3;
-    }
-    else if constexpr(std::is_same_v<DataType, double>)
-    {
-        return 1e-6;
-    }
-    else if constexpr(std::is_same_v<DataType, ck::half_t>)
-    {
-        return 1e-3;
-    }
-    else if constexpr(std::is_same_v<DataType, ck::bhalf_t>)
-    {
-        return 5e-2;
-    }
-    else if constexpr(std::is_same_v<DataType, int32_t>)
-    {
-        return 1e-1;
-    }
-    else if constexpr(std::is_same_v<DataType, int8_t>)
-    {
-        return 1e-1;
-    }
-    else if constexpr(std::is_same_v<DataType, ck::f8_t>)
-    {
-        return 1e-1; // 240 and 224 are acceptable
-    }
-    else if constexpr(std::is_same_v<DataType, ck::bf8_t>)
-    {
-        return 1.5e-1; // 57344 and 49152 are acceptable
-    }
-    else
-    {
-        return 1e-3;
-    }
-}
-template <typename DataType>
-inline __host__ __device__ constexpr double get_atol()
-{
-    if constexpr(std::is_same_v<DataType, float>)
-    {
-        return 1e-3;
-    }
-    else if constexpr(std::is_same_v<DataType, double>)
-    {
-        return 1e-6;
-    }
-    else if constexpr(std::is_same_v<DataType, ck::half_t>)
-    {
-        return 1e-3;
-    }
-    else if constexpr(std::is_same_v<DataType, ck::bhalf_t>)
-    {
-        return 5e-2;
-    }
-    else if constexpr(std::is_same_v<DataType, int32_t>)
-    {
-        return 1e-1;
-    }
-    else if constexpr(std::is_same_v<DataType, int8_t>)
-    {
-        return 1e-1;
-    }
-    else if constexpr(std::is_same_v<DataType, ck::f8_t>)
-    {
-        return 16.1; // 240 and 224 are acceptable
-    }
-    else if constexpr(std::is_same_v<DataType, ck::bf8_t>)
-    {
-        return 8192.1; // 57344 and 49152 are acceptable
-    }
-    else
-    {
-        return 1e-3;
-    }
-}
 template <typename ProblemType>
 bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
 {

--- a/example/67_gemm_microscaling/CMakeLists.txt
+++ b/example/67_gemm_microscaling/CMakeLists.txt
+# Umbrella target that groups the microscaling (MX) GEMM examples.
+add_custom_target(example_gemm_mx)
+# FP8 example built from gemm_mx_fp8.cpp.
+add_example_executable(example_gemm_mx_fp8 gemm_mx_fp8.cpp)
+# Attach the FP8 example to the umbrella target.
+add_example_dependencies(example_gemm_mx example_gemm_mx_fp8)
--- a/example/67_gemm_microscaling/README.md
+++ b/example/67_gemm_microscaling/README.md
+# GEMM Examples for Microscaling Formats
+## example_gemm_mx_fp8
+```bash
+# arg1: verification (0=no, 1=CPU)
+# arg2: initialization (0=no init, 1=integer value, 2=decimal value)
+# arg3: time kernel (0=no, 1=yes)
+# arg4: verbosity (0=no info, 1=verbose info)
+# arg5 to arg10: M (16x), N (16x), K (16x), StrideA, StrideB, StrideC
+./bin/example_gemm_mx_fp8 1 1 0 1
+```
+```bash
+# Running without arguments is equivalent to: ./bin/example_gemm_mx_fp8 1 2 0 0
+./bin/example_gemm_mx_fp8
+```
\ No newline at end of file
--- a/example/67_gemm_microscaling/gemm_mx_common.hpp
+++ b/example/67_gemm_microscaling/gemm_mx_common.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+#include <array>
+#include <cmath>
+#include <exception>
+#include <iostream>
+#include <stdexcept>
+#include <string>
+#include <type_traits>
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle_v3_ab_scale.hpp"
+#include "ck/utility/blkgemmpipe_scheduler.hpp"
+#include "ck/utility/data_type.hpp"
+#include "ck/utility/sequence.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/fill.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+// Scale element type of the MX formats.
+// NOTE(review): not referenced in the visible part of this header; the examples pass their own
+// XDataType instead - confirm whether this alias is still needed.
+using ScaleDataType = ck::e8m0_bexp_t;
+// Shorthand for a compile-time integer sequence, used in the device-op instance table.
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+// Layout tags used to select row-major or column-major host tensor descriptors.
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+// Element-wise operation passed for A, B and to the host reference GEMM.
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+// Run-time switches of the example; each field is filled from one command-line argument.
+struct ExecutionConfig final
+{
+    // Result verification (0=no, 1=CPU).
+    int do_verification{1};
+    // Input initialization (0=no init, 1=integer value, 2=decimal value).
+    int init_method{2};
+    // Kernel timing (0=no, 1=yes).
+    bool time_kernel{false};
+    // Console output (0=no info, 1=verbose info).
+    int verbosity{0};
+};
+// GEMM problem description: matrix extents plus one stride per matrix.
+struct ProblemSize final
+{
+    // Matrix extents.
+    ck::index_t M{3840};
+    ck::index_t N{4096};
+    ck::index_t K{4096};
+    // Strides; -1 asks run_mx_gemm to substitute the packed default for the matrix layout.
+    ck::index_t StrideA{-1};
+    ck::index_t StrideB{-1};
+    ck::index_t StrideC{-1};
+};
+// Parses the command line of the example.
+//
+// Accepted forms:
+//   no arguments  - keep the defaults already stored in `problem_size` and `config`;
+//   4 arguments   - verification, initialization, time kernel, verbosity;
+//   10 arguments  - the four above followed by M, N, K, StrideA, StrideB, StrideC.
+//
+// Returns true on success. On any other argument count, or when an argument is not a valid
+// integer, prints the usage text to std::cerr and returns false; in that case `problem_size`
+// and `config` are left untouched.
+//
+// Declared inline because this header defines the function and may be included by more than one
+// translation unit.
+inline bool
+parse_cmd_args(int argc, char* argv[], ProblemSize& problem_size, ExecutionConfig& config)
+{
+    const auto print_usage = [] {
+        std::cerr << "arg1: verification (0=no, 1=CPU)" << std::endl
+                  << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)"
+                  << std::endl
+                  << "arg3: time kernel (0=no, 1=yes)" << std::endl
+                  << "arg4: verbosity (0=no info, 1=verbose info)" << std::endl
+                  << "arg5 to 10: M (16x), N(16x), K(16x), StrideA, StrideB, StrideC" << std::endl;
+    };
+    if(argc == 1)
+    {
+        // use default case
+        return true;
+    }
+    if(argc != 5 && argc != 11)
+    {
+        print_usage();
+        return false;
+    }
+    // Parse into copies so that a failure half-way through does not leave the outputs partially
+    // updated.
+    ExecutionConfig parsed_config = config;
+    ProblemSize parsed_size       = problem_size;
+    try
+    {
+        parsed_config.do_verification = std::stoi(argv[1]);
+        parsed_config.init_method     = std::stoi(argv[2]);
+        parsed_config.time_kernel     = std::stoi(argv[3]) != 0;
+        parsed_config.verbosity       = std::stoi(argv[4]);
+        if(argc == 11)
+        {
+            parsed_size.M       = std::stoi(argv[5]);
+            parsed_size.N       = std::stoi(argv[6]);
+            parsed_size.K       = std::stoi(argv[7]);
+            parsed_size.StrideA = std::stoi(argv[8]);
+            parsed_size.StrideB = std::stoi(argv[9]);
+            parsed_size.StrideC = std::stoi(argv[10]);
+        }
+    }
+    catch(const std::exception& e)
+    {
+        // std::stoi throws std::invalid_argument / std::out_of_range on malformed input.
+        std::cerr << "Invalid argument: " << e.what() << std::endl;
+        print_usage();
+        return false;
+    }
+    config       = parsed_config;
+    problem_size = parsed_size;
+    return true;
+}
+// Runs one block-scaled GEMM on the device and, if requested, verifies it on the host.
+//
+// Every Scale_Block_K (= MXVectorSize) consecutive elements along K share one XDataType scale.
+// The host reference multiplies each A/B element by its scale and then runs a plain float GEMM.
+//
+// problem_size: M, N, K and strides; a stride of -1 selects the packed stride for the layout.
+// config:       verification / initialization / timing / verbosity switches.
+// Returns true when verification passed or was not requested.
+// Throws std::runtime_error when K is not a multiple of Scale_Block_K or when the device
+// operation reports the argument as unsupported.
+template <typename ADataType,
+          typename BDataType,
+          typename XDataType,
+          typename CDataType,
+          typename ALayout,
+          typename BLayout,
+          typename CLayout,
+          typename CElementWiseOp,
+          typename AccDataType,
+          typename CShuffleDataType,
+          ck::index_t MXVectorSize>
+bool run_mx_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
+{
+    using ELayout      = CLayout;
+    using DsLayout     = ck::Tuple<>;
+    using DsDataType   = ck::Tuple<>;
+    using AElementOp   = PassThrough;
+    using BElementOp   = PassThrough;
+    using CDEElementOp = CElementWiseOp;
+    static constexpr auto GemmSpec      = ck::tensor_operation::device::GemmSpecialization::Default;
+    static constexpr auto BlkGemmPSched = ck::BlockGemmPipelineScheduler::Intrawave;
+    static constexpr auto BlkGemmPVer   = ck::BlockGemmPipelineVersion::v3;
+#if 1
+    // XXX: These parameters should not exist in MX-native GEMM kernel
+    static constexpr ck::index_t Scale_Block_M = 128;
+    static constexpr ck::index_t Scale_Block_N = 128;
+#endif
+    static constexpr ck::index_t Scale_Block_K = MXVectorSize;
+    // XXX: DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3 is not designed to utilize MX-specific MFMA
+    //      instructions.
+    //
+    // XXX: DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3 is not designed to utilize device-optimized
+    //      scaled type convert functions.
+    //
+    // XXX: In DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3, KPerBlock is expected to be equal to
+    //      ScaleBlockK (aka MXVectorSize).
+    //      Additionally, the following is also expected:
+    //         static_assert(ScaleBlockM % MPerBlock == 0);
+    //         static_assert(ScaleBlockN % NPerBlock == 0);
+    //         In MX-native GEMM kernel these requirements should be relaxed.
+    //
+    // XXX: It appears, by default we are using mfma_f32_16x16x4xf32
+    //      MfmaSelector<ComputeTypeA, MPerXdl, NPerXdl, ComputeTypeB>::selected_mfma.k_per_blk =
+    //          MfmaSelector<float, 16, 16, float>::selected_mfma.k_per_blk = mfma_f32_16x16x4xf32
+    // XXX: GridwiseGemmMultiD_ABScale_xdl_cshuffle_v3 assumes scale type is float
+    // clang-format off
+    using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3
+    // ######| ALayout| BLayout| DsLayout| CLayout| ADataType|    AScale| BDataType|    BScale| DsDataType| CDataType|     GemmAcc| CShuffleDataType|AElementwise|BElementwise| CElementwise| GemmSpec|Block|   ScaleBlockM|   ScaleBlockN|   ScaleBlockK|    M|    N|             K| AK1| BK1|   M|   N|MXdl|NXdl|ABlockTransfer|ABlockTransfer|ABlockTransfer|ABlockTransfer|ABlockTransfer|ABlockTransfer|   ABlock|BBlockTransfer|BBlockTransfer|BBlockTransfer|BBlockTransfer|BBlockTransfer|BBlockTransfer|   BBlock|  CShuffle|  CShuffle|CShuffleBlockTransfer|CDEShuffleBlockTransfer|       BlkGemm|     BlkGemm|ComputeTypeA|ComputeTypeB|LDSTypeA|LDSTypeB|
+    // ######|        |        |         |        |          |  DataType|          |  DataType|           |          |    DataType|                 |   Operation|   Operation|    Operation|         | Size|              |              |              |  Per|  Per|           Per|    |    | Per| Per| Per| Per| ThreadCluster| ThreadCluster|SrcAccessOrder|  SrcVectorDim|     SrcScalar|     DstScalar|LdsExtraM| ThreadCluster| ThreadCluster|SrcAccessOrder|     SrcVector|     SrcScalar|     DstScalar|LdsExtraN|      MXdl|      NXdl|       ClusterLengths|                 Scalar|     PipeSched| PipelineVer|            |            |        |        |
+    // ######|        |        |         |        |          |          |          |          |           |          |            |                 |            |            |             |         |     |              |              |              |Block|Block|         Block|    |    | XDL| XDL|Wave|Wave|       Lengths|  ArrangeOrder|              |              |     PerVector| PerVector_AK1|         |       Lengths|  ArrangeOrder|              |           Dim|     PerVector| PerVector_BK1|         |   PerWave|   PerWave|     MBlock_MPerBlock|             PerVectors|              |            |            |            |        |        |
+    // ######|        |        |         |        |          |          |          |          |           |          |            |                 |            |            |             |         |     |              |              |              |     |     |              |    |    |    |    |    |    |     AK0_M_AK1|              |              |              |              |              |         |     BK0_N_BK1|              |              |                             |              |         |PerShuffle|PerShuffle|     NBlock_NPerBlock|                       |              |            |            |            |        |        |
+             < ALayout, BLayout, DsLayout, ELayout, ADataType, XDataType, BDataType, XDataType, DsDataType, CDataType, AccDataType, CShuffleDataType,  AElementOp,  BElementOp, CDEElementOp, GemmSpec,  256, Scale_Block_M, Scale_Block_N, Scale_Block_K,  128,  128,           128,  16,  16,  16,  16,   4,   4,   S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,             2,            16,            16,        0,   S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,             2,            16,            16,        0,         1,         2,       S<1, 32, 1, 8>,             S<8, 8, 1>, BlkGemmPSched, BlkGemmPVer,       float,       float,    float,  float>;
+    // clang-format on
+    auto M       = problem_size.M;
+    auto N       = problem_size.N;
+    auto K       = problem_size.K;
+    auto StrideA = problem_size.StrideA;
+    auto StrideB = problem_size.StrideB;
+    auto StrideC = problem_size.StrideC;
+    // Builds a 2D host descriptor whose unit-stride dimension depends on the layout tag.
+    auto f_host_tensor_descriptor =
+        [](ck::index_t row, ck::index_t col, ck::index_t stride, auto layout) {
+            if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
+            {
+                return HostTensorDescriptor({row, col}, {stride, 1});
+            }
+            else
+            {
+                return HostTensorDescriptor({row, col}, {1, stride});
+            }
+        };
+    // Returns `stride` unchanged unless it is -1, in which case the packed stride is returned.
+    auto f_get_default_stride =
+        [](ck::index_t row, ck::index_t col, ck::index_t stride, auto layout) {
+            if(stride == -1)
+            {
+                // give a chance if stride is -1, return a default packed stride
+                if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
+                {
+                    return static_cast<ck::index_t>(col);
+                }
+                else
+                {
+                    return static_cast<ck::index_t>(row);
+                }
+            }
+            else
+                return static_cast<ck::index_t>(stride);
+        };
+    StrideA = f_get_default_stride(M, K, StrideA, ALayout{});
+    StrideB = f_get_default_stride(K, N, StrideB, BLayout{});
+    StrideC = f_get_default_stride(M, N, StrideC, CLayout{});
+    if(K % Scale_Block_K != 0)
+    {
+        throw std::runtime_error("wrong! K must be multiple of Scale_Block_K (" +
+                                 std::to_string(Scale_Block_K) + ")");
+    }
+    // The scale tensors have their own, smaller shape along K, so they need their own packed
+    // stride. StrideA/StrideB are already resolved at this point and would never take the
+    // default branch, hence -1 is passed explicitly.
+    auto Scale_Stride_AM = f_get_default_stride(M, K / Scale_Block_K, -1, ALayout{});
+    auto Scale_Stride_BN = f_get_default_stride(K / Scale_Block_K, N, -1, BLayout{});
+    Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
+    Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
+    Tensor<XDataType> a_m_k_scale(
+        f_host_tensor_descriptor(M, K / Scale_Block_K, Scale_Stride_AM, ALayout{})); // scales for A
+    Tensor<XDataType> b_k_n_scale(
+        f_host_tensor_descriptor(K / Scale_Block_K, N, Scale_Stride_BN, BLayout{})); // scales for B
+    Tensor<CDataType> c_m_n_device_result(
+        f_host_tensor_descriptor(M, N, StrideC, CLayout{})); // device result downloaded to host
+    // NOTE(review): `>= 0` prints the descriptors even at the default verbosity of 0 - confirm
+    // that this is intended.
+    if(config.verbosity >= 0)
+    {
+        std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
+        std::cout << "a_m_k_scale: " << a_m_k_scale.mDesc << std::endl;
+        std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
+        std::cout << "b_k_n_scale: " << b_k_n_scale.mDesc << std::endl;
+        std::cout << "c_m_n_device_result: " << c_m_n_device_result.mDesc << std::endl;
+    }
+    switch(config.init_method)
+    {
+    case 0:
+        if(config.verbosity > 0)
+        {
+            std::cout << "NOTE: No input data initialization." << std::endl;
+        }
+        break;
+    case 1:
+    case 2:
+        // Both methods currently use the same constant fill: (1 * 0.5) * (1 * 2.0) = 1 per
+        // product, so every element of C is expected to equal K.
+        ck::utils::FillConstant<ADataType>{ck::type_convert<ADataType>(1.0f)}(a_m_k);
+        ck::utils::FillConstant<XDataType>{ck::type_convert<XDataType>(0.5f)}(a_m_k_scale);
+        ck::utils::FillConstant<BDataType>{ck::type_convert<BDataType>(1.0f)}(b_k_n);
+        ck::utils::FillConstant<XDataType>{ck::type_convert<XDataType>(2.0f)}(b_k_n_scale);
+        if(config.verbosity > 0)
+        {
+            std::cout << "Init A = {1}" << std::endl;
+            std::cout << "Init A scale = {0.5}" << std::endl;
+            std::cout << "Init B = {1}" << std::endl;
+            std::cout << "Init B scale = {2.0}" << std::endl;
+            std::cout << "Expect C = {K}" << std::endl;
+        }
+        break;
+    default:
+        if(config.verbosity > 0)
+        {
+            std::cout << "NOTE: No input data initialization." << std::endl;
+        }
+    }
+    if(config.verbosity > 0)
+        std::cout << "Device memory allocation..." << std::endl;
+    DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
+    DeviceMem a_scale_device_buf(sizeof(XDataType) * a_m_k_scale.mDesc.GetElementSpaceSize());
+    DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
+    DeviceMem b_scale_device_buf(sizeof(XDataType) * b_k_n_scale.mDesc.GetElementSpaceSize());
+    DeviceMem c_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize());
+    if(config.verbosity > 0)
+        std::cout << "Upload data to device..." << std::endl;
+    a_device_buf.ToDevice(a_m_k.mData.data());
+    a_scale_device_buf.ToDevice(a_m_k_scale.mData.data());
+    b_device_buf.ToDevice(b_k_n.mData.data());
+    b_scale_device_buf.ToDevice(b_k_n_scale.mData.data());
+    if(config.verbosity > 0)
+        std::cout << "Done." << std::endl;
+    auto a_element_op   = AElementOp{};
+    auto b_element_op   = BElementOp{};
+    auto cde_element_op = CDEElementOp{};
+    constexpr ck::index_t NumDTensor = DsDataType::Size();
+    // do GEMM
+    auto device_op = DeviceOpInstance{};
+    auto invoker   = device_op.MakeInvoker();
+    auto argument  = device_op.MakeArgument(a_device_buf.GetDeviceBuffer(),
+                                           b_device_buf.GetDeviceBuffer(),
+                                           std::array<const void*, NumDTensor>{},
+                                           c_device_buf.GetDeviceBuffer(),
+                                           M,
+                                           N,
+                                           K,
+                                           StrideA,
+                                           StrideB,
+                                           std::array<ck::index_t, NumDTensor>{},
+                                           StrideC,
+                                           a_scale_device_buf.GetDeviceBuffer(),
+                                           b_scale_device_buf.GetDeviceBuffer(),
+                                           a_element_op,
+                                           b_element_op,
+                                           cde_element_op);
+    if(!device_op.IsSupportedArgument(argument))
+    {
+        throw std::runtime_error("wrong!\n"
+                                 "Provided combination of compilation and runtime parameters is "
+                                 "not consistent with the supported device_gemm arguments.");
+    }
+    if(config.verbosity > 0)
+        std::cout << "Computing GEMM on device..." << std::endl;
+    float ave_time =
+        invoker.Run(argument, StreamConfig{nullptr, config.time_kernel, config.verbosity, 20, 50});
+    bool res_verified = true;
+    if(config.do_verification > 0)
+    {
+        c_device_buf.FromDevice(c_m_n_device_result.mData.data());
+        if(config.verbosity > 0)
+        {
+            std::cout << "Done." << std::endl;
+            std::cout << "Computing GEMM on host..." << std::endl;
+        }
+        // Host reference: apply the scales up front, then run an unscaled float GEMM.
+        Tensor<CDataType> c({M, N});
+        Tensor<float> a({M, K});
+        Tensor<float> b({K, N});
+        for(int m = 0; m < M; m++)
+        {
+            for(int k = 0; k < K; k++)
+            {
+                a(m, k) = ck::type_convert<float>(a_m_k(m, k)) *
+                          ck::type_convert<float>(a_m_k_scale(m, k / Scale_Block_K));
+            }
+        }
+        for(int n = 0; n < N; n++)
+        {
+            for(int k = 0; k < K; k++)
+            {
+                b(k, n) = ck::type_convert<float>(b_k_n(k, n)) *
+                          ck::type_convert<float>(b_k_n_scale(k / Scale_Block_K, n));
+            }
+        }
+        using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<float,
+                                                                                float,
+                                                                                CShuffleDataType,
+                                                                                CDataType,
+                                                                                PassThrough,
+                                                                                PassThrough,
+                                                                                PassThrough>;
+        auto ref_gemm               = ReferenceGemmInstance{};
+        auto ref_invoker            = ref_gemm.MakeInvoker();
+        auto ref_argument =
+            ref_gemm.MakeArgument(a, b, c, PassThrough{}, PassThrough{}, PassThrough{});
+        ref_invoker.Run(ref_argument);
+        if(config.verbosity > 0)
+        {
+            std::cout << "Done." << std::endl;
+            std::cout << "Comparing results..." << std::endl;
+        }
+        if(config.init_method == 1)
+        {
+            // Quick sanity check of a single element against the analytically expected value K.
+            res_verified =
+                res_verified && std::abs(static_cast<float>(K) - c_m_n_device_result(0, 0)) <= 0.0f;
+            std::cout << "Expected vs Computed: " << 1.0f * K << " vs " << c_m_n_device_result(0, 0)
+                      << ((res_verified) ? " (PASSED!)" : " (FAILED!)") << std::endl;
+        }
+        res_verified = res_verified &&
+                       ck::utils::check_err(c_m_n_device_result, c, "Error: Incorrect results!");
+        if(config.verbosity > 0 && res_verified)
+            std::cout << "Done." << std::endl;
+    }
+    else
+    {
+        if(config.verbosity > 0)
+            std::cout << "Done." << std::endl;
+    }
+    if(config.time_kernel)
+    {
+        // Widen first so the products below are evaluated in std::size_t, not in ck::index_t.
+        const auto m_sz          = static_cast<std::size_t>(M);
+        const auto n_sz          = static_cast<std::size_t>(N);
+        const auto k_sz          = static_cast<std::size_t>(K);
+        const auto scale_block_k = static_cast<std::size_t>(Scale_Block_K);
+        std::size_t flop =
+            2 * m_sz * n_sz * k_sz + m_sz * k_sz + k_sz * n_sz; // GEMM + A scale + B scale
+        std::size_t num_btype = sizeof(ADataType) * m_sz * k_sz + sizeof(BDataType) * k_sz * n_sz +
+                                sizeof(CDataType) * m_sz * n_sz +
+                                sizeof(XDataType) * (m_sz * k_sz + k_sz * n_sz) / scale_block_k;
+        // ave_time is printed in milliseconds below, hence the 1e9 / 1e6 divisors.
+        float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+        float gb_per_sec = num_btype / 1.E6 / ave_time;
+        std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec
+                  << " GB/s" << std::endl;
+    }
+    return res_verified;
+}
+// Example driver: parses the command line and, when that succeeds, runs run_mx_gemm with the
+// same template arguments. Returns false if parsing fails, otherwise run_mx_gemm's result.
+template <typename ADataType,
+          typename BDataType,
+          typename XDataType,
+          typename CDataType,
+          typename ALayout,
+          typename BLayout,
+          typename CLayout,
+          typename CElementWiseOp,
+          typename AccDataType,
+          typename CShuffleDataType,
+          ck::index_t MXVectorSize>
+bool run_mx_gemm_example(int argc, char* argv[])
+{
+    ProblemSize problem_size{};
+    ExecutionConfig config{};
+    // Nothing is launched when the arguments are rejected.
+    if(!parse_cmd_args(argc, argv, problem_size, config))
+    {
+        return false;
+    }
+    return run_mx_gemm<ADataType,
+                       BDataType,
+                       XDataType,
+                       CDataType,
+                       ALayout,
+                       BLayout,
+                       CLayout,
+                       CElementWiseOp,
+                       AccDataType,
+                       CShuffleDataType,
+                       MXVectorSize>(problem_size, config);
+}
--- a/example/67_gemm_microscaling/gemm_mx_fp8.cpp
+++ b/example/67_gemm_microscaling/gemm_mx_fp8.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+#include "gemm_mx_common.hpp"
+// Element types of the A and B input matrices.
+using ADataType = ck::f8_t;
+using BDataType = ck::f8_t;
+// Scale element type shared by the A and B scale tensors.
+#if 1
+// XXX: MX-native GEMM kernel will work with e8m0_bexp_t scale type
+using XDataType = float;
+#else
+using XDataType = ck::e8m0_bexp_t;
+#endif
+// Accumulator, C-shuffle staging and output element types.
+using AccDataType      = float;
+using CShuffleDataType = float;
+using CDataType        = float;
+// Memory layouts of A, B and C.
+using ALayout = Row;
+using BLayout = Col;
+using CLayout = Row;
+using CElementOp = PassThrough; // elementwise transformation for C matrix
+// Forwarded as MXVectorSize; run_mx_gemm uses it as Scale_Block_K and rejects K values that
+// are not a multiple of it.
+constexpr ck::index_t mx_vector_size = 128; // scaling block size
+// Program entry point: exit status 0 when run_mx_gemm_example reports success, -1 otherwise.
+int main(int argc, char* argv[])
+{
+    const bool passed = run_mx_gemm_example<ADataType,
+                                            BDataType,
+                                            XDataType,
+                                            CDataType,
+                                            ALayout,
+                                            BLayout,
+                                            CLayout,
+                                            CElementOp,
+                                            AccDataType,
+                                            CShuffleDataType,
+                                            mx_vector_size>(argc, argv);
+    if(passed)
+    {
+        return 0;
+    }
+    return -1;
+}
--- a/example/CMakeLists.txt
+++ b/example/CMakeLists.txt
@@ -5,6 +5,14 @@ include_directories(BEFORE
 add_custom_target(examples)
+# List of examples labelled REGRESSION_EXAMPLE for `make regression` (runtime longer than 30 seconds).
+# All other examples are labelled SMOKE_EXAMPLE.
+set(REGRESSION_EXAMPLES
+    example_sparse_embedding3_forward_layernorm
+)
 function(add_example_dependencies EXAMPLE_NAME FILE_NAME)
    if(FILE_NAME)
        add_dependencies(EXAMPLE_NAME FILE_NAME)
@@ -15,34 +23,34 @@ function(add_example_executable EXAMPLE_NAME FILE_NAME)
    message("adding example ${EXAMPLE_NAME}")
    set(result 1)
    if(DEFINED DTYPES)
-    foreach(source IN LISTS FILE_NAME)
+        foreach(source IN LISTS FILE_NAME)
-        set(test 0)
+            set(test 0)
-        if((source MATCHES "_fp16" OR source MATCHES "_f16") AND NOT "fp16" IN_LIST DTYPES)
+            if((source MATCHES "_fp16" OR source MATCHES "_f16") AND NOT "fp16" IN_LIST DTYPES)
-            set(test 1)
+                set(test 1)
-        endif()
+            endif()
-        if((source MATCHES "_fp32" OR source MATCHES "_f32") AND NOT "fp32" IN_LIST DTYPES)
+            if((source MATCHES "_fp32" OR source MATCHES "_f32") AND NOT "fp32" IN_LIST DTYPES)
-            set(test 1)
+                set(test 1)
-        endif()
+            endif()
-        if((source MATCHES "_fp64" OR source MATCHES "_f64") AND NOT "fp64" IN_LIST DTYPES)
+            if((source MATCHES "_fp64" OR source MATCHES "_f64") AND NOT "fp64" IN_LIST DTYPES)
-            set(test 1)
+                set(test 1)
-        endif()
+            endif()
-        if((source MATCHES "_fp8" OR source MATCHES "_f8") AND NOT "fp8" IN_LIST DTYPES)
+            if((source MATCHES "_fp8" OR source MATCHES "_f8") AND NOT "fp8" IN_LIST DTYPES)
-            set(test 1)
+                set(test 1)
-        endif()
+            endif()
-        if((source MATCHES "_bf8" OR source MATCHES "_bf8") AND NOT "bf8" IN_LIST DTYPES)
+            if((source MATCHES "_bf8" OR source MATCHES "_bf8") AND NOT "bf8" IN_LIST DTYPES)
-            set(test 1)
+                set(test 1)
-        endif()
+            endif()
-        if((source MATCHES "_bf16" OR source MATCHES "_b16") AND NOT "bf16" IN_LIST DTYPES)
+            if((source MATCHES "_bf16" OR source MATCHES "_b16") AND NOT "bf16" IN_LIST DTYPES)
-            set(test 1)
+                set(test 1)
-        endif()
+            endif()
-        if((source MATCHES "_int8" OR source MATCHES "_i8") AND NOT "int8" IN_LIST DTYPES)
+            if((source MATCHES "_int8" OR source MATCHES "_i8") AND NOT "int8" IN_LIST DTYPES)
-            set(test 1)
+                set(test 1)
-        endif()
+            endif()
-        if(test EQUAL 1)
+            if(test EQUAL 1)
-            message("removing example source file ${source} ")
+                message("removing example source file ${source} ")
-            list(REMOVE_ITEM FILE_NAME "${source}")
+                list(REMOVE_ITEM FILE_NAME "${source}")
-        endif()
+            endif()
-    endforeach()
+        endforeach()
    endif()
    set(EX_TARGETS ${SUPPORTED_GPU_TARGETS})
@@ -54,9 +62,9 @@ function(add_example_executable EXAMPLE_NAME FILE_NAME)
            list(REMOVE_ITEM FILE_NAME "${source}")
        endif()
    endforeach()
-    #Do not build any DPP examples if DL_KERNELS not set
+    #Do not build any DPP examples if DPP_KERNELS not set
    foreach(source IN LISTS FILE_NAME)
-        if(NOT DEFINED DL_KERNELS AND source MATCHES "_dpp")
+        if(NOT DEFINED DPP_KERNELS AND source MATCHES "_dpp")
            message("removing dpp example ${source} ")
            list(REMOVE_ITEM FILE_NAME "${source}")
        endif()
@@ -75,6 +83,13 @@ function(add_example_executable EXAMPLE_NAME FILE_NAME)
            list(REMOVE_ITEM FILE_NAME "${source}")
        endif()
    endforeach()
+    #Do not build any microscaling examples if gfx950 target is not on the list
+    foreach(source IN LISTS FILE_NAME)
+        if(NOT EX_TARGETS MATCHES "gfx950" AND source MATCHES "_mx")
+            message("removing microscaling example ${source} ")
+            list(REMOVE_ITEM FILE_NAME "${source}")
+        endif()
+    endforeach()
    #Do not build any FP8 examples if CK_ENABLE_FP8 not set
    foreach(source IN LISTS FILE_NAME)
        if(NOT DEFINED CK_ENABLE_FP8 AND source MATCHES "_fp8")
@@ -95,6 +110,8 @@ function(add_example_executable EXAMPLE_NAME FILE_NAME)
            list(REMOVE_ITEM EX_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201 gfx10.3-generic gfx11-generic gfx12-generic)
        elseif(FILE_NAME MATCHES "_wmma")
            list(REMOVE_ITEM EX_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack+ gfx908:xnack- gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030 gfx950)
+        elseif(FILE_NAME MATCHES "_mx") #only build mx example for gfx950
+            list(REMOVE_ITEM EX_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack+ gfx908:xnack- gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201 gfx10.3-generic gfx11-generic gfx12-generic)
        endif()
        set_source_files_properties(${FILE_NAME} PROPERTIES LANGUAGE HIP)
        add_executable(${EXAMPLE_NAME} ${FILE_NAME})
@@ -107,6 +124,15 @@ function(add_example_executable EXAMPLE_NAME FILE_NAME)
        set(result 0)
    endif()
    #message("add_example returns ${result}")
+    if(result EQUAL 0 AND NOT "${EXAMPLE_NAME}" IN_LIST REGRESSION_EXAMPLES)
+        #message("adding to SMOKE EXAMPLE FILTER ${EXAMPLE_NAME}")
+        set_tests_properties(${EXAMPLE_NAME} PROPERTIES LABELS "SMOKE_TEST")
+        add_dependencies(smoke ${EXAMPLE_NAME})
+    elseif(result EQUAL 0 AND "${EXAMPLE_NAME}" IN_LIST REGRESSION_EXAMPLES)
+        #message("Adding to REGRESSION EXAMPLE FILTER ${EXAMPLE_NAME}")
+        set_tests_properties(${EXAMPLE_NAME} PROPERTIES LABELS "REGRESSION_TEST")
+        add_dependencies(regression ${EXAMPLE_NAME})
+    endif()
    set(result ${result} PARENT_SCOPE)
 endfunction(add_example_executable EXAMPLE_NAME)
@@ -188,8 +214,10 @@ function(add_example_executable_no_testing EXAMPLE_NAME FILE_NAME)
        rocm_install(TARGETS ${EXAMPLE_NAME} COMPONENT examples)
        set(result 0)
    endif()
    #message("add_example returns ${result}")
    set(result ${result} PARENT_SCOPE)
 endfunction(add_example_executable_no_testing EXAMPLE_NAME)
 # add all example subdir

--- a/example/ck_tile/01_fmha/README.md
+++ b/example/ck_tile/01_fmha/README.md
@@ -15,8 +15,7 @@ This will result in an executable `build/bin/tile_example_fmha_fwd`
 ## kernel
 The kernel template is `fmha_fwd_kernel.hpp`; this is the grid-wise op in old ck_tile's terminology. We put it here on purpose, to demonstrate that one can construct a kernel by using various internal components from ck_tile. We may still provide an implementation of the kernel template under ck_tile's include path in the future.
-There are 3 template parameters for this kernel template.
+There are 2 template parameters for this kernel template.
-* `TilePartitioner` is used to map the workgroup to corresponding tile, `fmha_fwd_tile_partitioner.hpp` in this folder served as this purpose.
 * `FmhaPipeline` is one of the block_tile_pipelines (under `include/ck_tile/tile_program/block_tile_pipeline`), which is a performance-critical component. Indeed, we did a lot of optimization and experimentation to tune the pipeline, and may still work out more performant pipelines and add them to that folder. People only need to replace this pipeline type to enjoy the benefit of different performant implementations (stay tuned for updated pipeline(s)).
 * `EpiloguePipeline` modifies and stores out the result in the last phase. People usually do a lot of post-fusion at this stage, so we also abstract this concept. Currently we don't do much at the epilogue stage, but leave room for possible future support.

--- a/example/ck_tile/01_fmha/codegen/cpp_symbol_map.py
+++ b/example/ck_tile/01_fmha/codegen/cpp_symbol_map.py
@@ -119,6 +119,7 @@ PIPELINE_MAP = {
 PIPELINE_ENUM_MAP = {
    "qr" : "ck_tile::BlockFmhaPipelineEnum::QRKSVS",
    "qr_async" : "ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC",
+    "qr_nwarp_sshuffle" : "ck_tile::BlockFmhaPipelineEnum::QRKSVS",
 }
 BOOL_MAP = {

--- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py
+++ b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py
@@ -29,11 +29,6 @@ K0_MAX_SUBMAX_MAP = {
    256: 256
 }
-TILE_PARTITIONER_MAP = {
-    "shb" : "ck_tile::FmhaFwdTilePartitioner_SHB",
-    "hbs" : "ck_tile::FmhaFwdTilePartitioner_HBS",
-}
 FMHA_FWD_KERNEL_HEADER = """// SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.\n
 // auto generated by generate.py
@@ -44,13 +39,12 @@ FMHA_FWD_KERNEL_BODY="""
 using fmha_dtype_{F_idx} = {F_dtype};
 using fmha_block_tile_{F_idx} = ck_tile::sequence<{F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0max}>;
-using fmha_warp_tile_{F_idx} = ck_tile::sequence<{F_wm}, {F_wn}, {F_wk}>;
 using fmha_shape_{F_idx} = ck_tile::TileFmhaShape<fmha_block_tile_{F_idx},
                                      ck_tile::sequence<{F_rm0}, {F_rn0}, {F_rk0}>,
-                                      fmha_warp_tile_{F_idx},
+                                      ck_tile::sequence<{F_wm0}, {F_wn0}, {F_wk0}>,
                                      ck_tile::sequence<{F_rm1}, {F_rn1}, {F_rk1}>,
-                                      fmha_warp_tile_{F_idx},
+                                      ck_tile::sequence<{F_wm1}, {F_wn1}, {F_wk1}>,
                                      {F_vlayout}>;
 using fmha_trait_{F_idx} = ck_tile::TileFmhaTraits<{F_spad},
@@ -91,9 +85,7 @@ using fmha_epilogue_{F_idx} =
                                           {F_spad}, {F_dvpad}>>;
 using fmha_kernel_{F_idx} =
-    ck_tile::FmhaFwdKernel<{F_tile_partitioner}<fmha_shape_{F_idx}>,
+    ck_tile::FmhaFwdKernel<fmha_pipeline_{F_idx}, fmha_epilogue_{F_idx}>;
-                  fmha_pipeline_{F_idx},
-                  fmha_epilogue_{F_idx}>;
 using trait_{F_idx} = fmha_fwd_traits_<{F_hdim}, {F_dtype}, {F_mode},{F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0max}, {F_vlayout},
                        {F_pipeline_enum}, fmha_mask_{F_idx}, {F_bias}, {F_lse}, {F_dropout}, {F_squant}, {F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}>;
@@ -306,15 +298,19 @@ class FmhaFwdTileSize:
    F_rm1       : int  # number of warps for gemm1 along q seqlen
    F_rn1       : int  # number of warps for gemm1 along head dim v
    F_rk1       : int  # number of warps for gemm1 along k seqlen (not used)
-    F_wm        : int  # warp size along m (warp size)
+    F_wm0       : int  # gemm0 warp size along m
-    F_wn        : int  # warp size along n
+    F_wn0       : int  # gemm0 warp size along n
-    F_wk        : int  # warp size along k
+    F_wk0       : int  # gemm0 warp size along k
+    F_wm1       : int  # gemm1 warp size along m
+    F_wn1       : int  # gemm1 warp size along n
+    F_wk1       : int  # gemm1 warp size along k
    F_occupancy : int  # occupancy, -1 will let pipeline decide the occupancy, other value will overwrite occupancy
    @property
    def name(self) -> str:
        return f"b{self.F_bm0}x{self.F_bn0}x{self.F_bk0}x{self.F_bn1}x{self.F_bk1}x{self.F_bk0max}" +\
        f"_r{self.F_rm0}x{self.F_rn0}x{self.F_rk0}_r{self.F_rm1}x{self.F_rn1}x{self.F_rk1}" +\
-        f"_w{self.F_wm}x{self.F_wn}x{self.F_wk}" + ("" if self.F_occupancy == -1 else f"_o{self.F_occupancy}")
+        f"_w{self.F_wm0}x{self.F_wn0}x{self.F_wk0}_w{self.F_wm1}x{self.F_wn1}x{self.F_wk1}" +\
+        ("" if self.F_occupancy == -1 else f"_o{self.F_occupancy}")
 @dataclass
 class FmhaFwdKernel:
@@ -326,12 +322,6 @@ class FmhaFwdKernel:
    F_pipeline      : FmhaFwdPipeline
    mask_impl       : str
-    def get_tp(self) -> str:
-        if self.F_mode == 'group':
-            return 'hbs'
-        else:
-            return 'shb'
    @property
    def template(self) -> str:
        kernel_body = str()
@@ -352,9 +342,12 @@ class FmhaFwdKernel:
                F_rm1           = self.F_tile.F_rm1,
                F_rn1           = self.F_tile.F_rn1,
                F_rk1           = self.F_tile.F_rk1,
-                F_wm            = self.F_tile.F_wm,
+                F_wm0           = self.F_tile.F_wm0,
-                F_wn            = self.F_tile.F_wn,
+                F_wn0           = self.F_tile.F_wn0,
-                F_wk            = self.F_tile.F_wk,
+                F_wk0           = self.F_tile.F_wk0,
+                F_wm1           = self.F_tile.F_wm1,
+                F_wn1           = self.F_tile.F_wn1,
+                F_wk1           = self.F_tile.F_wk1,
                F_vlayout       = LAYOUT_MAP[self.F_pipeline.F_vlayout],
                F_spad          = BOOL_MAP[self.F_pipeline.F_spad],
                F_skpad         = BOOL_MAP[self.F_pipeline.F_skpad],
@@ -368,13 +361,12 @@ class FmhaFwdKernel:
                F_pipeline_enum = PIPELINE_ENUM_MAP[self.F_pipeline.tag],
                F_mask          = get_mask_map(self.mask_impl)[self.F_pipeline.F_mask],
                F_mode          = MODE_MAP[self.F_mode],
-                F_pipeline      = PIPELINE_MAP[self.F_pipeline.tag],
+                F_pipeline      = PIPELINE_MAP[self.F_pipeline.tag])
-                F_tile_partitioner = TILE_PARTITIONER_MAP[self.get_tp()])
    @property
    def name(self) -> str:
        # TODO: we don't encode idx here
-        return f"fmha_fwd_d{self.F_hdim}_{self.F_dtype}_{self.F_mode}_{self.get_tp()}_" + \
+        return f"fmha_fwd_d{self.F_hdim}_{self.F_dtype}_{self.F_mode}_" + \
                self.F_tile.name + '_' + self.F_pipeline.name
    @property
@@ -409,17 +401,17 @@ class FmhaFwdKernel:
 def get_fmha_fwd_tile_dict_from_dtype(dtype : str) -> Optional[dict]:
    if dtype == 'fp16' or dtype == 'bf16':
        return {
-            '32'  : FmhaFwdTileSize(128, 64, 16, 32, 32, 32,     2, 1, 1,  2, 1, 1,  32, 32, 16, -1),
+            '32'  : FmhaFwdTileSize(128, 64,  16, 32,  32,  32,   2, 1, 1,  2, 1, 1,  32, 32, 16,  32, 32, 16,  -1),
-            '64'  : FmhaFwdTileSize(128, 64, 32, 64, 32, 64,     4, 1, 1,  4, 1, 1,  32, 32, 16, -1),
+            '64'  : FmhaFwdTileSize(128, 64,  32, 64,  32,  64,   4, 1, 1,  4, 1, 1,  32, 32, 16,  32, 32, 16,  -1),
-            ## '96'  : FmhaFwdTileSize(128, 128, 32, 128, 32, 96,   4, 1, 1,  4, 1, 1,  32, 32, 16, -1),
+        ### '96'  : FmhaFwdTileSize(128, 128, 32, 128, 32,  96,   4, 1, 1,  4, 1, 1,  32, 32, 16,  32, 32, 16,  -1),
-            '128' : FmhaFwdTileSize(128, 128, 32, 128, 32, 128,  4, 1, 1,  4, 1, 1,  32, 32, 16, -1),
+            '128' : FmhaFwdTileSize(128, 128, 32, 128, 32,  128,  4, 1, 1,  4, 1, 1,  32, 32, 16,  32, 32, 16,  -1),
-            '256' : FmhaFwdTileSize(128, 128, 32, 256, 32, 256,  4, 1, 1,  4, 1, 1,  32, 32, 16, -1),
+            '256' : FmhaFwdTileSize(128, 128, 32, 256, 32,  256,  4, 1, 1,  4, 1, 1,  32, 32, 16,  32, 32, 16,  -1),
        }
    elif dtype == 'fp8' or dtype == 'bf8':
        return {
-            '64'  : FmhaFwdTileSize(128, 64, 32, 64, 32, 64,     2, 1, 1,  2, 1, 1,  32, 32, 32, -1),
+            '64'  : FmhaFwdTileSize(128, 64,  32, 64,  32,  64,   2, 1, 1,  2, 1, 1,  32, 32, 32,  32, 32, 32,  -1),
-            '128' : FmhaFwdTileSize(128, 128, 32, 128, 32, 128,  4, 1, 1,  4, 1, 1,  32, 32, 32, -1),
+            '128' : FmhaFwdTileSize(128, 128, 32, 128, 32,  128,  4, 1, 1,  4, 1, 1,  32, 32, 32,  32, 32, 32,  -1),
-            '256' : FmhaFwdTileSize(128, 128, 32, 256, 32, 256,  4, 1, 1,  4, 1, 1,  32, 32, 32, -1)
+            '256' : FmhaFwdTileSize(128, 128, 32, 256, 32,  256,  4, 1, 1,  4, 1, 1,  32, 32, 32,  32, 32, 32,  -1),
        }
    else:
        return None

--- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_appendkv.py
+++ b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_appendkv.py
@@ -46,9 +46,7 @@ using fmha_pipeline_problem_{F_idx} = ck_tile::BlockFmhaFwdAppendKVPipelineProbl
 using fmha_pipeline_{F_idx} = ck_tile::BlockFmhaFwdAppendKVPipeline<
    fmha_pipeline_problem_{F_idx}>;
-using fmha_kernel_{F_idx} =
+using fmha_kernel_{F_idx} = ck_tile::FmhaFwdAppendKVKernel<fmha_pipeline_{F_idx}>;
-    ck_tile::FmhaFwdAppendKVKernel<ck_tile::FmhaFwdAppendKVTilePartitioner<{F_bs}, {F_bsk}, {F_bd}, {F_bdv}>,
-                  fmha_pipeline_{F_idx}>;
 using trait_{F_idx} = fmha_fwd_appendkv_traits_<{F_hdim}, {F_dtype}, {F_bs}, {F_bsk}, {F_bd}, {F_bdv}, {F_vlayout},
                        {F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}, {F_rope}, {F_pagedkv}>;
@@ -355,4 +353,4 @@ def list_blobs(file_path : Path, kernel_filter : Optional[str], receipt, mask_im
        _, kernels = get_fwd_appendkv_blobs(kernel_filter, receipt, mask_impl)
        for kernel in kernels:
            f.write(str(file_path.parent / GEN_DIR / kernel.filename) + "\n")
        f.write(str(file_path.parent / GEN_DIR / FMHA_FWD_APPENDKV_API_FILENAME) + "\n")
\ No newline at end of file
--- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py
+++ b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py
@@ -39,6 +39,7 @@ K0_MAX_SUBMAX_MAP = {
 FMHA_FWD_SPLITKV_PIPELINE_MAP = {
    "qr" : "ck_tile::BlockFmhaFwdSplitKVPipelineQRKSVS",
+    "qr_nwarp_sshuffle" : "ck_tile::BlockFmhaFwdSplitKVPipelineNWarpSShuffleQRKSVS",
    "qr_async" : "ck_tile::BlockFmhaFwdSplitKVPipelineQRKSVSAsync",
 }
@@ -47,16 +48,15 @@ using fmha_dtype_{F_idx} = {F_dtype};
 using fmha_mask_{F_idx} = {F_mask};
 namespace {{
-template <bool kHasUnevenSplits>
+template <bool kHasUnevenSplits, bool kMergeNumHeadGroupsSeqLenQ = false>
-struct kernel_runner {{
+struct instance {{
 using fmha_block_tile = ck_tile::sequence<{F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0max}>;
-using fmha_warp_tile = ck_tile::sequence<{F_wm}, {F_wn}, {F_wk}>;
 using fmha_shape = ck_tile::TileFmhaShape<fmha_block_tile,
                                          ck_tile::sequence<{F_rm0}, {F_rn0}, {F_rk0}>,
-                                          fmha_warp_tile,
+                                          ck_tile::sequence<{F_wm0}, {F_wn0}, {F_wk0}>,
                                          ck_tile::sequence<{F_rm1}, {F_rn1}, {F_rk1}>,
-                                          fmha_warp_tile,
+                                          ck_tile::sequence<{F_wm1}, {F_wn1}, {F_wk1}>,
                                          {F_vlayout}>;
 using fmha_trait = ck_tile::TileFmhaFwdSplitKVTraits<{F_spad},
@@ -64,11 +64,12 @@ using fmha_trait = ck_tile::TileFmhaFwdSplitKVTraits<{F_spad},
                                                     {F_dpad},
                                                     {F_dvpad},
                                                     {F_bias},
-                                                     false,
+                                                     /*kHasBiasGrad=*/false,
                                                     {F_lse},
                                                     {F_squant},
                                                     {F_pagedkv},
                                                     kHasUnevenSplits,
+                                                     kMergeNumHeadGroupsSeqLenQ,
                                                     {F_occupancy}>;
 using fmha_pipeline_problem = ck_tile::BlockFmhaFwdSplitKVPipelineProblem<
@@ -96,9 +97,7 @@ using fmha_epilogue =
                                           {F_spad}, {F_dvpad}>>;
 using fmha_kernel =
-    ck_tile::FmhaFwdSplitKVKernel<ck_tile::FmhaFwdSplitKVTilePartitioner<fmha_shape>,
+    ck_tile::FmhaFwdSplitKVKernel<fmha_pipeline, fmha_epilogue>;
-                  fmha_pipeline,
-                  fmha_epilogue>;
 static void run(const ck_tile::stream_config& s, fmha_fwd_splitkv_args a)
 {{
@@ -117,28 +116,50 @@ using trait_{F_idx} = fmha_fwd_splitkv_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F
 #include <iostream>
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wtautological-compare"
+namespace {{
+template <bool kHasUnevenSplits>
+void run_instance(const ck_tile::stream_config& s, fmha_fwd_splitkv_args a) {{
+    if constexpr ({F_hdim} == 128 && {F_bias} == ck_tile::BlockAttentionBiasEnum::NO_BIAS
+                  && (std::is_same_v<{F_mask}, ck_tile::SimplifiedGenericAttentionMask<false>>
+                      || std::is_same_v<{F_mask}, FmhaMasks::NoMask>)) {{
+        if (a.max_seqlen_q == 1 && a.nhead_k < a.nhead_q) {{
+            instance<kHasUnevenSplits, /*kMergeNumHeadGroupsSeqLenQ=*/true>::run(s, a);
+        }} else {{
+            instance<kHasUnevenSplits>::run(s, a);
+        }}
+    }} else {{
+        instance<kHasUnevenSplits>::run(s, a);
+    }}
+}}
+}} // anonymous namespace
+#pragma clang diagnostic pop
 template<>
 void fmha_fwd_splitkv_oneshot_<trait_{F_idx}>(const ck_tile::stream_config& s, fmha_fwd_splitkv_args a)
 {{
    if constexpr({F_mode} == false) {{ // batch mode
        // we don't check every seqlen_k values for kvcache
        if (a.seqlen_k_ptr != nullptr) {{
-            kernel_runner<true>::run(s, a);
+            run_instance</*kHasUnevenSplits=*/true>(s, a);
        // make sure F_bn0 is divisible by F_bk1
        }} else if (a.seqlen_k % (a.num_splits * {F_bn0}) == 0) {{
-            kernel_runner<false>::run(s, a);
+            run_instance</*kHasUnevenSplits=*/false>(s, a);
        }} else {{
-            kernel_runner<true>::run(s, a);
+            run_instance</*kHasUnevenSplits=*/true>(s, a);
        }}
    }} else {{
-        kernel_runner<true>::run(s, a);
+        run_instance</*kHasUnevenSplits=*/true>(s, a);
    }}
 }}
 template<>
 std::string fmha_fwd_splitkv_get_name_<trait_{F_idx}>()
 {{
-    using k_ = kernel_runner<true>::fmha_kernel; /// FIXME: choose real kernel type
+    using k_ = instance<true>::fmha_kernel; /// FIXME: choose real kernel type
    return k_::GetName();
 }}
 """
@@ -148,7 +169,7 @@ using fmha_dtype_{F_idx} = {F_dtype};
 namespace {{
 template <ck_tile::index_t kLogMaxSplits>
-struct kernel_runner {{
+struct instance {{
 using fmha_trait = ck_tile::TileFmhaFwdSplitKVCombineTraits<{F_spad},
                                                    {F_dvpad},
                                                    {F_lse},
@@ -161,9 +182,8 @@ using fmha_pipeline_problem = ck_tile::BlockFmhaSplitKVCombinePipelineProblem<
    typename FmhaFwdTypeConfig<fmha_dtype_{F_idx}>::OaccDataType,
    typename FmhaFwdTypeConfig<fmha_dtype_{F_idx}>::ODataType,
    {F_hdim},
-    {F_bm0},
-    {F_bn1},
    {F_mode},
+    {F_bn1},
    fmha_trait>;
 using fmha_pipeline = ck_tile::BlockFmhaFwdSplitKVCombinePipeline<
@@ -177,9 +197,7 @@ using fmha_epilogue =
                                           false, false>>;
 using fmha_kernel =
-    ck_tile::FmhaFwdSplitKVCombineKernel<ck_tile::FmhaFwdSplitKVCombineTilePartitioner<{F_bm0}, {F_bn1}>,
+    ck_tile::FmhaFwdSplitKVCombineKernel<fmha_pipeline, fmha_epilogue>;
-                  fmha_pipeline,
-                  fmha_epilogue>;
 static void run(const ck_tile::stream_config& s, fmha_fwd_splitkv_args a)
 {{
@@ -192,7 +210,7 @@ static void run(const ck_tile::stream_config& s, fmha_fwd_splitkv_args a)
 }};
 }}
-using trait_{F_idx} = fmha_fwd_splitkv_combine_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_bm0}, {F_bn1},
+using trait_{F_idx} = fmha_fwd_splitkv_combine_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_bn1},
                        {F_lse}, {F_squant}, {F_spad}, {F_dvpad}>;
 #include <iostream>
@@ -201,22 +219,22 @@ template<>
 void fmha_fwd_splitkv_combine_oneshot_<trait_{F_idx}>(const ck_tile::stream_config& s, fmha_fwd_splitkv_args a)
 {{
    if (a.num_splits <= 8) {{
-        kernel_runner<3>::run(s, a);
+        instance<3>::run(s, a);
    }} else if (a.num_splits <= 16) {{
-        kernel_runner<4>::run(s, a);
+        instance<4>::run(s, a);
    }} else if (a.num_splits <= 32) {{
-        kernel_runner<5>::run(s, a);
+        instance<5>::run(s, a);
    }} else if (a.num_splits <= 64) {{
-        kernel_runner<6>::run(s, a);
+        instance<6>::run(s, a);
    }} else if (a.num_splits <= 128) {{
-        kernel_runner<7>::run(s, a);
+        instance<7>::run(s, a);
    }}
 }}
 template<>
 std::string fmha_fwd_splitkv_combine_get_name_<trait_{F_idx}>()
 {{
-    using k_ = kernel_runner<6>::fmha_kernel; /// FIXME: choose real kernel type
+    using k_ = instance<6>::fmha_kernel; /// FIXME: choose real kernel type
    return k_::GetName();
 }}
 """
@@ -250,16 +268,25 @@ float fmha_fwd_splitkv(fmha_fwd_splitkv_traits t, fmha_fwd_splitkv_args a, const
 FMHA_FWD_SPLITKV_API_INNER_DISPATCH="""            {F_if}((t.is_group_mode == {F_mode}) && (t.is_v_rowmajor == {F_vlayout}) && ({F_mask_check}) && (t.bias_type == {F_bias_check}) && (t.do_fp8_static_quant == {F_squant}) &&
                        ((a.block_table_ptr != nullptr) == {F_pagedkv}) && ({F_scheck}) && ({F_skcheck}) && ({F_dcheck}) && ({F_dvcheck})) {{
                using traits_ = fmha_fwd_splitkv_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0max}, {F_vlayout}, {F_pipeline_enum}, {F_mask}, {F_bias}, true, {F_squant}, {F_pagedkv}, {F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}>;
+                // get combine kernel tile sizes
+                using OaccDataType = typename FmhaFwdTypeConfig<{F_dtype}>::OaccDataType;
+                constexpr ck_tile::index_t kM0 = ck_tile::BlockFmhaSplitKVCombinePipelineTileSizes<OaccDataType, /*F_bn1=*/32>::kM0;
+                // make sure we can reuse the padding flags in combine kernels
+                static_assert({F_bm0} % kM0 == 0);
+                static_assert({F_bn1} % 32 == 0);
                if (t.has_lse) {{
-                    if constexpr (std::is_same_v<{F_dtype}, ck_tile::fp8_t>) {{
+                    if constexpr (std::is_same_v<{F_dtype}, FmhaFwdFp8>) {{
                        return -1;
                    }} else {{
-                        using traits2_ = fmha_fwd_splitkv_combine_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_bm0}/2, {F_bn1}/2, true, {F_squant}, {F_spad}, {F_dvpad}>;
+                        using traits2_ = fmha_fwd_splitkv_combine_traits_<{F_hdim}, {F_dtype}, {F_mode}, /*F_bn1=*/32, true, {F_squant}, {F_spad}, {F_dvpad}>;
                        return fmha_fwd_splitkv_<traits_, traits2_>(s, a);
                    }}
                }} else {{
-                    using traits2_ = fmha_fwd_splitkv_combine_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_bm0}/2, {F_bn1}/2, false, {F_squant}, {F_spad}, {F_dvpad}>;
+                    using traits2_ = fmha_fwd_splitkv_combine_traits_<{F_hdim}, {F_dtype}, {F_mode}, /*F_bn1=*/32, false, {F_squant}, {F_spad}, {F_dvpad}>;
                    return fmha_fwd_splitkv_<traits_, traits2_>(s, a);
                }}
@@ -302,7 +329,7 @@ class FmhaFwdSplitKVApiTrait:
        if self.pipeline_tag == 'qr_async':
            if self.spad == 't' : return 'true' # always support
            else :                return 'true'
-        elif self.pipeline_tag in ['qr']:
+        elif self.pipeline_tag in ['qr', 'qr_nwarp_sshuffle']:
            if self.spad == 't' : return f'true /*a.seqlen_q % {self.bm0} != 0*/'  # TODO: order of get_pipelines() matters! (ugly)
            else :                return f'a.seqlen_q % {self.bm0} == 0'
        else: assert False
@@ -313,7 +340,7 @@ class FmhaFwdSplitKVApiTrait:
        if self.pipeline_tag == 'qr_async':
            if self.skpad == 't' : return f'a.seqlen_k == 0 || a.seqlen_k % {self.bn0} != 0'
            else :                 return f'a.seqlen_k != 0 && a.seqlen_k % {self.bn0} == 0'
-        elif self.pipeline_tag in ['qr', 'qr_fp8']:
+        elif self.pipeline_tag in ['qr', 'qr_nwarp_sshuffle']:
            if self.skpad == 't' : return f'true /*a.seqlen_k % {self.bn0} != 0*/' # TODO: order of get_pipelines() matters! (ugly)
            else :                return f'a.seqlen_k % {self.bn0} == 0'
        else: assert False
@@ -324,7 +351,7 @@ class FmhaFwdSplitKVApiTrait:
            vec = int((32 * 4) / DTYPE_BITS[self.dtype])
            if self.dpad == 't': return f'a.hdim_q % {vec} == 0'
            else :               assert False
-        elif self.pipeline_tag in ['qr']:
+        elif self.pipeline_tag in ['qr', 'qr_nwarp_sshuffle']:
            bk0submax = K0_MAX_SUBMAX_MAP[self.bk0max]
            if self.dpad == 't': return f'true /*a.hdim_q % {bk0submax} != 0*/' # TODO: order of get_pipelines() matters! (ugly)
            else :               return f'a.hdim_q % {bk0submax} == 0'
@@ -336,7 +363,7 @@ class FmhaFwdSplitKVApiTrait:
            vec = int((32 * 4) / DTYPE_BITS[self.dtype])
            if self.dvpad == 't': return f'a.hdim_v % {vec} == 0'
            else :                assert False
-        elif self.pipeline_tag in ['qr']:
+        elif self.pipeline_tag in ['qr', 'qr_nwarp_sshuffle']:
            bk0submax = K0_MAX_SUBMAX_MAP[self.bk0max]
            if self.dvpad == 't': return f'true /*a.hdim_v % {bk0submax} != 0*/' # TODO: order of get_pipelines() matters! (ugly)
            else :                return f'a.hdim_v % {bk0submax} == 0'
@@ -447,12 +474,11 @@ class FmhaFwdSplitKVApiPool:
 @dataclass
 class FmhaFwdSplitKVCombineTileSize:
-    F_bm0       : int  # tile size along q seqlen
    F_bn1       : int  # tile size along v head_dim
    F_occupancy : int  # occupancy, -1 will let pipeline decide the occupancy, other value will overwrite occupancy
    @property
    def name(self) -> str:
-        return f"b{self.F_bm0}x{self.F_bn1}" +\
+        return f"b{self.F_bn1}" +\
            ("" if self.F_occupancy == -1 else f"_o{self.F_occupancy}")
 @dataclass
@@ -485,9 +511,12 @@ class FmhaFwdSplitKVKernel:
                F_rm1           = self.F_tile.F_rm1,
                F_rn1           = self.F_tile.F_rn1,
                F_rk1           = self.F_tile.F_rk1,
-                F_wm            = self.F_tile.F_wm,
+                F_wm0           = self.F_tile.F_wm0,
-                F_wn            = self.F_tile.F_wn,
+                F_wn0           = self.F_tile.F_wn0,
-                F_wk            = self.F_tile.F_wk,
+                F_wk0           = self.F_tile.F_wk0,
+                F_wm1           = self.F_tile.F_wm1,
+                F_wn1           = self.F_tile.F_wn1,
+                F_wk1           = self.F_tile.F_wk1,
                F_vlayout       = LAYOUT_MAP[self.F_pipeline.F_vlayout],
                F_spad          = BOOL_MAP[self.F_pipeline.F_spad],
                F_skpad         = BOOL_MAP[self.F_pipeline.F_skpad],
@@ -553,7 +582,6 @@ class FmhaFwdSplitKVCombineKernel:
                F_idx           = self.F_idx,
                F_hdim          = self.F_hdim,
                F_dtype         = FWD_DTYPE_MAP[self.F_dtype],
-                F_bm0           = self.F_tile.F_bm0,
                F_bn1           = self.F_tile.F_bn1,
                F_spad          = BOOL_MAP[self.F_pipeline.F_spad],
                F_dvpad         = BOOL_MAP[self.F_pipeline.F_dvpad],
@@ -577,17 +605,17 @@ class FmhaFwdSplitKVCombineKernel:
 def get_fmha_fwd_tile_dict_from_dtype(dtype : str) -> Optional[dict]:
    if dtype == 'fp16' or dtype == 'bf16':
        return {
-            '32'  : FmhaFwdTileSize(32, 64,  16, 32,  32,  32,   2, 1, 1,  2, 1, 1,  16, 16, 16, -1),
+            '32'  : FmhaFwdTileSize(32, 64,  16, 32,  32,  32,   2, 1, 1,  2, 1, 1,  16, 16, 16,  16, 16, 16,  -1),
-            '64'  : FmhaFwdTileSize(64, 64,  32, 64,  32,  64,   4, 1, 1,  4, 1, 1,  16, 16, 16, -1),
+            '64'  : FmhaFwdTileSize(64, 64,  32, 64,  32,  64,   4, 1, 1,  4, 1, 1,  16, 16, 16,  16, 16, 16,  -1),
-            ## '96'  : FmhaFwdTileSize(64, 128, 32, 128, 32,  96,   4, 1, 1,  4, 1, 1,  16, 16, 16, -1),
+        ### '96'  : FmhaFwdTileSize(64, 128, 32, 128, 32,  96,   4, 1, 1,  4, 1, 1,  16, 16, 16,  16, 16, 16,  -1),
-            '128' : FmhaFwdTileSize(64, 128, 32, 128, 32,  128,  4, 1, 1,  4, 1, 1,  16, 16, 16, -1),
+            '128' : FmhaFwdTileSize(64, 128, 32, 128, 32,  128,  4, 1, 1,  4, 1, 1,  16, 16, 16,  16, 16, 16,  -1),
-            '256' : FmhaFwdTileSize(64, 128, 32, 256, 32,  256,  4, 1, 1,  4, 1, 1,  16, 16, 16, -1),
+            '256' : FmhaFwdTileSize(64, 128, 32, 256, 32,  256,  4, 1, 1,  4, 1, 1,  16, 16, 16,  16, 16, 16,  -1),
        }
    elif dtype == 'fp8' or dtype == 'bf8':
        return {
-            '64'  : FmhaFwdTileSize(128, 64,  32, 64,  32,  64,   2, 1, 1,  2, 1, 1,  32, 32, 32, -1),
+            '64'  : FmhaFwdTileSize(128, 64,  32, 64,  32,  64,   2, 1, 1,  2, 1, 1,  32, 32, 32,  32, 32, 32,  -1),
-            '128' : FmhaFwdTileSize(128, 128, 32, 128, 32,  128,  4, 1, 1,  4, 1, 1,  32, 32, 32, -1),
+            '128' : FmhaFwdTileSize(128, 128, 32, 128, 32,  128,  4, 1, 1,  4, 1, 1,  32, 32, 32,  32, 32, 32,  -1),
-            '256' : FmhaFwdTileSize(128, 128, 32, 256, 32,  256,  4, 1, 1,  4, 1, 1,  32, 32, 32, -1)
+            '256' : FmhaFwdTileSize(128, 128, 32, 256, 32,  256,  4, 1, 1,  4, 1, 1,  32, 32, 32,  32, 32, 32,  -1),
        }
    else:
        return None
@@ -595,17 +623,17 @@ def get_fmha_fwd_tile_dict_from_dtype(dtype : str) -> Optional[dict]:
 def get_fmha_fwd_splitkv_combine_tile_dict_from_dtype(dtype : str) -> Optional[dict]:
    if dtype == 'fp16' or dtype == 'bf16':
        return {
-            '32'  : FmhaFwdSplitKVCombineTileSize(16, 16,  -1),
+            '32'  : FmhaFwdSplitKVCombineTileSize(32,  -1),
-            '64'  : FmhaFwdSplitKVCombineTileSize(32, 32,  -1),
+            '64'  : FmhaFwdSplitKVCombineTileSize(32,  -1),
-            ## '96' : FmhaFwdSplitKVCombineTileSize(32, 64,  -1),
+        ### '96'  : FmhaFwdSplitKVCombineTileSize(32,  -1),
-            '128' : FmhaFwdSplitKVCombineTileSize(32, 64,  -1),
+            '128' : FmhaFwdSplitKVCombineTileSize(32,  -1),
-            '256' : FmhaFwdSplitKVCombineTileSize(32, 128, -1),
+            '256' : FmhaFwdSplitKVCombineTileSize(32,  -1),
    }
    elif dtype == 'fp8' or dtype == 'bf8':
        return {
-            '64'  : FmhaFwdSplitKVCombineTileSize(64, 32,  -1),
+            '64'  : FmhaFwdSplitKVCombineTileSize(32,  -1),
-            '128' : FmhaFwdSplitKVCombineTileSize(64, 64,  -1),
+            '128' : FmhaFwdSplitKVCombineTileSize(32,  -1),
-            '256' : FmhaFwdSplitKVCombineTileSize(64, 128, -1),
+            '256' : FmhaFwdSplitKVCombineTileSize(32,  -1),
        }
    else:
        return None

--- a/example/ck_tile/01_fmha/fmha_fwd.cpp
+++ b/example/ck_tile/01_fmha/fmha_fwd.cpp
@@ -1131,15 +1131,16 @@ bool run(const ck_tile::ArgParser& arg_parser)
    {
        // NOTE: use gpu to do validation
        ck_tile::naive_attention_fwd_traits naive_t;
-        naive_t.q_type    = data_type;
+        naive_t.q_type     = data_type;
-        naive_t.k_type    = data_type;
+        naive_t.k_type     = data_type;
-        naive_t.v_type    = data_type;
+        naive_t.v_type     = data_type;
-        naive_t.o_type    = data_type;
+        naive_t.o_type     = data_type;
-        naive_t.q_layout  = i_perm == 1 ? "bhsd" : "bshd";
+        naive_t.q_layout   = i_perm == 1 ? "bhsd" : "bshd";
-        naive_t.k_layout  = i_perm == 1 ? "bhsd" : "bshd";
+        naive_t.k_layout   = i_perm == 1 ? "bhsd" : "bshd";
-        naive_t.v_layout  = i_perm == 1 ? "bhsd" : "bshd";
+        naive_t.v_layout   = i_perm == 1 ? "bhsd" : "bshd";
-        naive_t.o_layout  = o_perm == 1 ? "bhsd" : "bshd";
+        naive_t.o_layout   = o_perm == 1 ? "bhsd" : "bshd";
-        naive_t.variation = 0; // TODO?
+        naive_t.variation  = 0; // TODO?
+        naive_t.quant_algo = 0;
        ck_tile::DeviceMem o_naive_buf(o_host.get_element_space_size_in_bytes());

--- a/example/ck_tile/01_fmha/fmha_fwd.hpp
+++ b/example/ck_tile/01_fmha/fmha_fwd.hpp
@@ -400,8 +400,18 @@ auto fmha_fwd_create_kargs_and_grids(fmha_fwd_args args)
        }
    }();
-    dim3 grids = FmhaKernel::GridSize(args.batch, args.nhead_q, args.max_seqlen_q, args.hdim_v);
+    if constexpr(FmhaKernel::kIsGroupMode)
-    return ck_tile::make_tuple(kargs, grids);
+    {
+        dim3 grids = FmhaKernel::GridSize(
+            args.batch, args.nhead_q, args.max_seqlen_q, args.hdim_v, args.seqlen_k_ptr != nullptr);
+        return ck_tile::make_tuple(kargs, grids);
+    }
+    else
+    {
+        dim3 grids =
+            FmhaKernel::GridSize(args.batch, args.nhead_q, args.max_seqlen_q, args.hdim_v, false);
+        return ck_tile::make_tuple(kargs, grids);
+    }
 }
 template <typename Kernel>
@@ -500,8 +510,8 @@ auto fmha_fwd_splitkv_create_kargs_and_grids(fmha_fwd_splitkv_args args)
        }
    }();
-    dim3 grids =
+    dim3 grids = Kernel::GridSize(
-        Kernel::GridSize(args.batch, args.nhead_q, args.max_seqlen_q, args.hdim_v, args.num_splits);
+        args.batch, args.nhead_q, args.nhead_k, args.max_seqlen_q, args.hdim_v, args.num_splits);
    return ck_tile::make_tuple(kargs, grids);
 }
@@ -709,7 +719,6 @@ std::string fmha_fwd_splitkv_get_name_();
 template <ck_tile::index_t HDim_,
          typename DataType_,
          bool kIsGroupMode_,
-          ck_tile::index_t kM0_,
          ck_tile::index_t kN1_,
          bool kStoreLse_,
          bool kDoFp8StaticQuant_,
@@ -720,7 +729,6 @@ struct fmha_fwd_splitkv_combine_traits_
    static constexpr ck_tile::index_t HDim  = HDim_;
    using DataType                          = ck_tile::remove_cvref_t<DataType_>;
    static constexpr bool kIsGroupMode      = kIsGroupMode_;
-    static constexpr ck_tile::index_t kM0   = kM0_;
    static constexpr ck_tile::index_t kN1   = kN1_;
    static constexpr bool kStoreLse         = kStoreLse_;
    static constexpr bool kDoFp8StaticQuant = kDoFp8StaticQuant_;

--- a/example/ck_tile/02_layernorm2d/README.md
+++ b/example/ck_tile/02_layernorm2d/README.md
@@ -59,7 +59,7 @@ args:
      -kname    print kernel name or not (default:1)
     -prec_i    input precision (default:fp16)
     -prec_o    output precision, set auto will be the same as input (default:auto)
-    -prec_sx    output quant scale type, set auto will be the same as input. used when fquant=1 (default:auto)
+    -prec_sm    output quant scale type, set auto will be the same as input. used when fquant=1 (default:auto)
    -prec_sy    output quant scale type, set auto will be the same as input. used when fquant=1 or 2 (default:auto)
       -fadd    fused-add, 0:no fused add, 1:preadd+store, 2:preadd only (default:0)
     -fquant    fused-quant, 0:no, 1:smooth-dynamic-quant, 2:dynamic-quant (default:0)
@@ -69,7 +69,7 @@ args:
 ```
 ## limitations
-Note that `fquant=2`, `fadd=2`, `prec_sx/prec_sy` other than `fp32` are not by default generated. Though our kernel template suppor this. (TBD: add some flag in generate.py) to generate those instance on demand. Beside, `N>8192` case will by default using two-pass pipeline, and `-fquant=1/2` are not supported yet. If need suport `N>8192` and `fused+residual+store`, you can use this example together with `12_smoothquant`, to construct layernorm+residual, and smoothquant, 2 kernels for this purpose.
+Note that instances for `fquant=2`, `fadd=2`, and `prec_sm/prec_sy` other than `fp32` are not generated by default, although our kernel template supports them (TBD: add some flag in generate.py to generate those instances on demand). Besides, the `N>8192` case uses the two-pass pipeline by default, and `-fquant=1/2` is not supported yet. If you need to support `N>8192` together with `fused+residual+store`, you can use this example together with `12_smoothquant` to construct two kernels for this purpose: layernorm+residual, and smoothquant.
 ```
 # some case

--- a/example/ck_tile/02_layernorm2d/generate.py
+++ b/example/ck_tile/02_layernorm2d/generate.py
 # SPDX-License-Identifier: MIT
-# Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 # generate kernel instances to speed up compilation
 import argparse
@@ -23,6 +23,10 @@ def get_if_str(idx, total, lase_else = True):
        else:
            return 'else if'
+XBIAS_ENUM_STR_MAP = [
+    'no',
+    'xbias']      # pre-norm add bias
 FUSED_ADD_ENUM_STR_MAP = [
    'no',
    'pras',      # pre-norm
@@ -48,7 +52,7 @@ class layernorm_fwd_codegen:
 // this is used to pattern-match internl kernel implementation, not to instantiate kernel
 template <typename XDataType_,
          typename YDataType_,
-          typename XScaleDataType_,
+          typename SmoothScaleDataType_,
          typename YScaleDataType_,
          ck_tile::index_t Repeat_M_,         // each thread repeat along M
          ck_tile::index_t Repeat_N_,         // each thread repeat along N
@@ -58,14 +62,16 @@ template <typename XDataType_,
          bool kPadN_,
          bool kSaveMeanInvStd_,
          bool kFastFDiv_,
+          bool kWelford_,
          bool kTwoPass_,
+          ck_tile::index_t kXbias_ = 0,
          ck_tile::index_t kFusedAdd_ = 0,
          ck_tile::index_t kFusedQuant_ = 0>
 struct layernorm2d_fwd_traits_
 {
    using XDataType = ck_tile::remove_cvref_t<XDataType_>;
    using YDataType = ck_tile::remove_cvref_t<YDataType_>;
-    using XScaleDataType = ck_tile::remove_cvref_t<XScaleDataType_>;
+    using SmoothScaleDataType = ck_tile::remove_cvref_t<SmoothScaleDataType_>;
    using YScaleDataType = ck_tile::remove_cvref_t<YScaleDataType_>;
    static constexpr bool is_warp_per_row = ThreadPerBlock_N_ <= warpSize;
@@ -120,14 +126,16 @@ struct layernorm2d_fwd_traits_
    static constexpr bool kPadN           = kPadN_;
    static constexpr bool kSaveMeanInvStd = kSaveMeanInvStd_;
    static constexpr bool kFastFDiv       = kFastFDiv_;
+    static constexpr bool kWelford        = kWelford_;
    static constexpr bool kTwoPass        = kTwoPass_;
+    static constexpr ck_tile::index_t kXbias = kXbias_;
    static constexpr ck_tile::index_t kFusedAdd = kFusedAdd_;
    static constexpr ck_tile::index_t kFusedQuant = kFusedQuant_;
 };
 template <typename XDataType_,
          typename YDataType_,
-          typename XScaleDataType_,
+          typename SmoothScaleDataType_,
          typename YScaleDataType_,
          ck_tile::index_t Repeat_M_,         // each thread repeat along M
          ck_tile::index_t Repeat_N_,         // each thread repeat along N
@@ -137,12 +145,14 @@ template <typename XDataType_,
          bool kPadN_,
          bool kSaveMeanInvStd_,
          bool kFastFDiv_,
+          bool kWelford_,
          bool kTwoPass_,
+          int  kXbias_,
          int  kFusedAdd_,
          int  kFusedQuant_>
 using traits_ = layernorm2d_fwd_traits_<XDataType_,
                                       YDataType_,
-                                       XScaleDataType_,
+                                       SmoothScaleDataType_,
                                       YScaleDataType_,
                                       Repeat_M_,
                                       Repeat_N_,
@@ -152,13 +162,15 @@ using traits_ = layernorm2d_fwd_traits_<XDataType_,
                                       kPadN_,
                                       kSaveMeanInvStd_,
                                       kFastFDiv_,
+                                       kWelford_,
                                       kTwoPass_,
+                                       kXbias_,
                                       kFusedAdd_,
                                       kFusedQuant_>;
 """
    API_COMMON_HEADER = """
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
 #include <ck_tile/core.hpp>
 #include "layernorm2d_fwd.hpp"
@@ -177,26 +189,29 @@ float layernorm2d_fwd_(const S& s, A a)
 {{
    using XDataType = typename Traits_::XDataType;
    using YDataType = typename Traits_::YDataType;
-    using XScaleDataType = typename Traits_::XScaleDataType;
+    using SmoothScaleDataType = typename Traits_::SmoothScaleDataType;
    using YScaleDataType = typename Traits_::YScaleDataType;
-    using ComputeDataType = typename LayerNormTypeConfig<XDataType, YDataType, XScaleDataType, YScaleDataType>::ComputeDataType;
+    using ComputeDataType = typename LayerNormTypeConfig<XDataType, YDataType, SmoothScaleDataType, YScaleDataType>::ComputeDataType;
    using PipelineTraits = ck_tile::Layernorm2dFwdTraits<Traits_::kPadN,
        Traits_::kSaveMeanInvStd,
        Traits_::kFastFDiv,
+        Traits_::kWelford,
        Traits_::kTwoPass,
+        static_cast<ck_tile::Layernorm2dXBiasEnum>(Traits_::kXbias),
        static_cast<ck_tile::Layernorm2dFusedAddEnum>(Traits_::kFusedAdd),
        static_cast<ck_tile::Layernorm2dFusedQuantEnum>(Traits_::kFusedQuant)>;
    using PipelineProblem = ck_tile::Layernorm2dFwdPipelineProblem<
-        typename LayerNormTypeConfig<XDataType, YDataType, XScaleDataType, YScaleDataType>::XDataType,
+        typename LayerNormTypeConfig<XDataType, YDataType, SmoothScaleDataType, YScaleDataType>::XDataType,
-        typename LayerNormTypeConfig<XDataType, YDataType, XScaleDataType, YScaleDataType>::GammaDataType,
+        typename LayerNormTypeConfig<XDataType, YDataType, SmoothScaleDataType, YScaleDataType>::XBiasDataType,
-        typename LayerNormTypeConfig<XDataType, YDataType, XScaleDataType, YScaleDataType>::BetaDataType,
+        typename LayerNormTypeConfig<XDataType, YDataType, SmoothScaleDataType, YScaleDataType>::GammaDataType,
-        typename LayerNormTypeConfig<XDataType, YDataType, XScaleDataType, YScaleDataType>::ComputeDataType,
+        typename LayerNormTypeConfig<XDataType, YDataType, SmoothScaleDataType, YScaleDataType>::BetaDataType,
-        typename LayerNormTypeConfig<XDataType, YDataType, XScaleDataType, YScaleDataType>::YDataType,
+        typename LayerNormTypeConfig<XDataType, YDataType, SmoothScaleDataType, YScaleDataType>::ComputeDataType,
-        typename LayerNormTypeConfig<XDataType, YDataType, XScaleDataType, YScaleDataType>::MeanDataType,
+        typename LayerNormTypeConfig<XDataType, YDataType, SmoothScaleDataType, YScaleDataType>::YDataType,
-        typename LayerNormTypeConfig<XDataType, YDataType, XScaleDataType, YScaleDataType>::InvStdDataType,
+        typename LayerNormTypeConfig<XDataType, YDataType, SmoothScaleDataType, YScaleDataType>::MeanDataType,
-        typename LayerNormTypeConfig<XDataType, YDataType, XScaleDataType, YScaleDataType>::XScaleDataType,
+        typename LayerNormTypeConfig<XDataType, YDataType, SmoothScaleDataType, YScaleDataType>::InvStdDataType,
-        typename LayerNormTypeConfig<XDataType, YDataType, XScaleDataType, YScaleDataType>::YScaleDataType,
+        typename LayerNormTypeConfig<XDataType, YDataType, SmoothScaleDataType, YScaleDataType>::SmoothScaleDataType,
+        typename LayerNormTypeConfig<XDataType, YDataType, SmoothScaleDataType, YScaleDataType>::YScaleDataType,
        typename Traits_::Shape,
        PipelineTraits>;
@@ -204,12 +219,13 @@ float layernorm2d_fwd_(const S& s, A a)
    using TwoPassPipeline = ck_tile::Layernorm2dFwdPipelineTwoPass<PipelineProblem>;
    using Pipeline        = std::conditional_t<Traits_::kTwoPass, TwoPassPipeline, OnePassPipeline>;
-    using Default2DEpilogueProblem = ck_tile::Default2DEpilogueProblem<ComputeDataType, YDataType, false, Traits_::kPadN, false>;
+    using Default2DEpilogueProblem = ck_tile::Default2DEpilogueProblem<ComputeDataType, YDataType, false, Traits_::kPadN, true>;
    using Default2DEpilogue = ck_tile::Default2DEpilogue<Default2DEpilogueProblem>;
    static constexpr bool UseSmoothInputScale = Traits_::kFusedQuant == 1;
-    using DynamicQuantEpilogueProblem = ck_tile::DynamicQuantEpilogueProblem<ComputeDataType, XScaleDataType, YScaleDataType, YDataType, typename Traits_::Shape,
+    static constexpr bool UseRawStore = sizeof(YDataType) == 4;
-            ck_tile::DynamicQuantEpilogueTraits<false, Traits_::kPadN, UseSmoothInputScale, false,  true/*max3*/>>;
+    using DynamicQuantEpilogueProblem = ck_tile::DynamicQuantEpilogueProblem<ComputeDataType, SmoothScaleDataType, YScaleDataType, YDataType, typename Traits_::Shape,
+            ck_tile::DynamicQuantEpilogueTraits<false, Traits_::kPadN, UseSmoothInputScale, UseRawStore,  true/*max3*/>>;
    using DynamicQuantEpilogue = ck_tile::DynamicQuantEpilogue<DynamicQuantEpilogueProblem>;
@@ -233,7 +249,7 @@ float layernorm2d_fwd_(const S& s, A a)
    API_BASE = """
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
 #include <ck_tile/core.hpp>
 #include "layernorm2d_fwd.hpp"
@@ -269,12 +285,12 @@ float layernorm2d_fwd(layernorm2d_fwd_traits t,
    INSTANCE_BASE = """
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
 #include "layernorm2d_fwd_api_common.hpp"
 // clang-format off
-//                                      prec_i           prec_o           prec_sy           rm  rn  tm    tn  vn  pd     mv    rpcf    2p      add  sweep
+//                                      prec_i           prec_o           prec_sy           rm  rn  tm    tn  vn  pd     mv    rpcf   welford   2p   xbias   add  sweep
 {F_instance_def}
 // clang-format on
@@ -284,6 +300,10 @@ float layernorm2d_fwd(layernorm2d_fwd_traits t,
        self.working_path = working_path
        self.kernel_filter = kernel_filter
+    class k_xbias_enum(IntEnum):
+        F_NO_XBIAS = 0
+        F_ADD_XBIAS = 1
    class k_fuesd_add_enum(IntEnum):
        F_NO_ADD = 0
        F_PRE_ADD = 1
@@ -299,6 +319,7 @@ float layernorm2d_fwd(layernorm2d_fwd_traits t,
        F_kPadN : bool
        F_kSaveMeanInvStd : bool
        F_kTwoPass : bool
+        F_kXbias : Any #: layernorm_fwd_codegen.k_xbias_enum
        F_kFusedAdd : Any #: layernorm_fwd_codegen.k_fuesd_add_enum
        F_kFusedQuant : Any  #: layernorm_fwd_codegen.k_fused_sweep_enum
@@ -315,6 +336,7 @@ float layernorm2d_fwd(layernorm2d_fwd_traits t,
    @dataclass
    class k_problem:
        F_XDataType       : str
+        F_XBiasDataType   : str
        F_GammaDataType   : str
        F_BetaDataType    : str
        F_ComputeDataType : str
@@ -352,7 +374,7 @@ float layernorm2d_fwd(layernorm2d_fwd_traits t,
    class h_traits:
        F_XDataType : str
        F_YDataType : str
-        F_XScaleDataType : str
+        F_SmoothScaleDataType : str
        F_YScaleDataType : str
        F_Repeat_M : int
        F_Repeat_N : int
@@ -362,15 +384,17 @@ float layernorm2d_fwd(layernorm2d_fwd_traits t,
        F_kPadN : bool
        F_kSaveMeanInvStd_ : bool
        F_kFastFDiv_ : bool
+        F_kWelford_ : bool
        F_kTwoPass_ : bool
+        F_kXbias_ : int
        F_kFusedAdd : int
        F_kFusedQuant : int
        @property
        def trait_name(self) ->str:
-            t_ = f'{DATA_TYPE_MAP[self.F_XDataType]}, {DATA_TYPE_MAP[self.F_YDataType]}, {DATA_TYPE_MAP[self.F_XScaleDataType]}, {DATA_TYPE_MAP[self.F_YScaleDataType]}, {self.F_Repeat_M:2}, {self.F_Repeat_N:2}, {self.F_ThreadPerBlock_M:2}, {self.F_ThreadPerBlock_N:4}'
+            t_ = f'{DATA_TYPE_MAP[self.F_XDataType]}, {DATA_TYPE_MAP[self.F_YDataType]}, {DATA_TYPE_MAP[self.F_SmoothScaleDataType]}, {DATA_TYPE_MAP[self.F_YScaleDataType]}, {self.F_Repeat_M:2}, {self.F_Repeat_N:2}, {self.F_ThreadPerBlock_M:2}, {self.F_ThreadPerBlock_N:4}'
-            t_ += f', {self.F_Vector_N:2}, {BOOL_MAP(self.F_kPadN):5}, {BOOL_MAP(self.F_kSaveMeanInvStd_):5}, {BOOL_MAP(self.F_kFastFDiv_):5}'
+            t_ += f', {self.F_Vector_N:2}, {BOOL_MAP(self.F_kPadN):5}, {BOOL_MAP(self.F_kSaveMeanInvStd_):5}, {BOOL_MAP(self.F_kFastFDiv_):5}, {BOOL_MAP(self.F_kWelford_):5}'
-            t_ += f', {BOOL_MAP(self.F_kTwoPass_):5}, {self.F_kFusedAdd:4}, {self.F_kFusedQuant:4}'
+            t_ += f', {BOOL_MAP(self.F_kTwoPass_):5}, {self.F_kXbias:4}, {self.F_kFusedAdd:4}, {self.F_kFusedQuant:4}'
            return t_
        # string when calling this kernel
@@ -388,6 +412,7 @@ float layernorm2d_fwd(layernorm2d_fwd_traits t,
    class h_instance:
        F_DataTypePair : str
        F_N : str
+        F_xbias : int
        F_add : int
        F_sweep : int
        instance_list : List[Any] # List[h_traits]
@@ -397,6 +422,8 @@ float layernorm2d_fwd(layernorm2d_fwd_traits t,
            prec_i, prec_o = self.F_DataTypePair.split(',')
            dtype_str = f'{prec_i}' if prec_i == prec_o else f'{prec_i}_{prec_o}'
            nnn = f'layernorm2d_fwd_{dtype_str}_n{self.F_N}'
+            if self.F_xbias != 0:
+                nnn = nnn + '_' + XBIAS_ENUM_STR_MAP[self.F_xbias] 
            if self.F_add != 0:
                nnn = nnn + '_' + FUSED_ADD_ENUM_STR_MAP[self.F_add]
            if self.F_sweep != 0:
@@ -422,11 +449,10 @@ float layernorm2d_fwd(layernorm2d_fwd_traits t,
    def name_common_header(self) -> str:
        return 'layernorm2d_fwd_api_common'
-    @property
+    def content_api(self, args) -> str:
-    def content_api(self) -> str:
        # 1 sort based on dtype
        t_dtype_dict = dict()
-        blobs = self.get_blobs()
+        blobs = self.get_blobs(args)
        for blob in blobs:
            if blob.F_DataTypePair not in t_dtype_dict:
                t_dtype_dict[blob.F_DataTypePair] = {}
@@ -451,19 +477,19 @@ float layernorm2d_fwd(layernorm2d_fwd_traits t,
                        if ins.F_kFusedQuant == 0:
                            _sweep_cond = 't.fused_quant == {f_fused_sweep}'.format(f_fused_sweep = ins.F_kFusedQuant)
                        elif ins.F_kFusedQuant == 1:
-                            _sweep_cond = 't.fused_quant == {f_fused_sweep} && (t.prec_sx == \"{f_sx_type}\" && t.prec_sy == \"{f_sy_type}\")'.format(
+                            _sweep_cond = 't.fused_quant == {f_fused_sweep} && (t.prec_sm == \"{f_sx_type}\" && t.prec_sy == \"{f_sy_type}\")'.format(
-                                f_fused_sweep = ins.F_kFusedQuant, f_sx_type=ins.F_XScaleDataType, f_sy_type=ins.F_YScaleDataType)
+                                f_fused_sweep = ins.F_kFusedQuant, f_sx_type=ins.F_SmoothScaleDataType, f_sy_type=ins.F_YScaleDataType)
                        elif ins.F_kFusedQuant == 2:
                            _sweep_cond = 't.fused_quant == {f_fused_sweep} && (t.prec_sy == \"{f_sy_type}\")'.format(
                                f_fused_sweep = ins.F_kFusedQuant, f_sy_type=ins.F_YScaleDataType)
-                        _cond = '((a.n % {f_vec_n} == 0) && (t.fused_add == {f_fused_add}) && ({f_sweep_cond}))'.format(
+                        _cond = '((a.n % {f_vec_n} == 0) && (t.xbias == {f_xbias}) && (t.fused_add == {f_fused_add}) && ({f_sweep_cond}))'.format(
-                                        f_vec_n = ins.F_Vector_N, f_fused_add = ins.F_kFusedAdd,
+                                        f_vec_n = ins.F_Vector_N, f_xbias = ins.F_kXbias, f_fused_add = ins.F_kFusedAdd,
                                        f_sweep_cond = _sweep_cond)
                        inner_str += self.API_INNER_CASE.format(F_if = get_if_str(idx_in_n, len_in_n, False),
                                            F_VEC_COND = _cond, F_instance_func=ins.call_name)
                    #inner_str = inner_str + vec_str
-                n_cnd = f'(a.n <= {n_})' if (i_n < len(blob_per_t) - 1) else ''
+                n_cnd = f'(a.n <= {n_})' if isinstance(n_, int) else ''
-                n_str += self.API_PER_N_CASE.format(F_if = get_if_str(i_n, len(blob_per_t)), F_N_COND=n_cnd, F_inner_dispatch=inner_str)
+                n_str += self.API_PER_N_CASE.format(F_if = get_if_str(i_n, len(blob_per_t), not isinstance(n_, int)), F_N_COND=n_cnd, F_inner_dispatch=inner_str)
            prec_i, prec_o = dtype_.split(',')
            d_str += self.API_PER_DTYPE.format(F_if = get_if_str(i_d, len(t_dtype_dict), False), F_i_type=prec_i, F_o_type=prec_o, F_per_n_case=n_str)
@@ -474,7 +500,7 @@ float layernorm2d_fwd(layernorm2d_fwd_traits t,
    def content_common_header(self) -> str:
        return self.API_COMMON_HEADER.format(F_traits_define=self.API_TRAITS_DEFINE)
-    def get_blobs(self):
+    def get_blobs(self, args):
        h_traits = layernorm_fwd_codegen.h_traits
        h_instance = layernorm_fwd_codegen.h_instance
@@ -484,67 +510,69 @@ float layernorm2d_fwd(layernorm2d_fwd_traits t,
        scale_list = [('fp32,fp32')]
        dtype_list = [('fp16,fp16'), ('bf16,bf16'),
                        ('fp16,int8'), ('bf16,int8')] # NOTE: only fused-dynamic-quant use int8 out
+        types_8bit = ('int8', 'fp8')
+        types_16bit = ('int16', 'fp16', 'bf16')
        #fused_add_list = [0, 1, 2]
        #fused_sweep_list = [0, 1, 2] # NOTE: only single pass can use fused dynamic quant
+        xbias_list = [0, 1]
        fused_add_list = [0, 1]
        fused_sweep_list = [0, 1] # NOTE: only single pass can use fused dynamic quant
+        #                                                       rm  rn  tm   tn  vn  pd     mv     fdiv  welford   2p     xbias    add   sweep
-        #                                                       rm  rn  tm   tn  vn  pd     mv     fdiv  2p     add    sweep
+        h_trait_dict = {'64'  : [ h_traits('x', 'y', 'xs', 'ys', 1,  1,  8,  8,  8,  True,  False, True, True,   False,   0,    0,    0),
-        h_trait_dict = {'64'  : [ h_traits('x', 'y', 'xs', 'ys', 1,  1,  8,  8,  8,  True,  False, True, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  1,  4,  16, 4,  True,  False, True, True,   False,   0,    0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  1,  4,  16, 4,  True,  False, True, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  1,  4,  64, 1,  True,  False, True, True,   False,   0,    0,    0)],
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  1,  4,  64, 1,  True,  False, True, False,   0,    0)],
+                        '128' : [ h_traits('x', 'y', 'xs', 'ys', 1,  1,  4,  16, 8,  True,  False, True, True,   False,   0,    0,    0),
-                        '128' : [ h_traits('x', 'y', 'xs', 'ys', 1,  1,  4,  16, 8,  True,  False, True, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  1,  4,  64, 2,  True,  False, True, True,   False,   0,    0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  1,  4,  64, 2,  True,  False, True, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  2,  4,  64, 1,  True,  False, True, True,   False,   0,    0,    0)],
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  2,  4,  64, 1,  True,  False, True, False,   0,    0)],
+                        '256' : [ h_traits('x', 'y', 'xs', 'ys', 1,  1,  4,  64, 4,  True,  False, True, True,   False,   0,    0,    0),
-                        '256' : [ h_traits('x', 'y', 'xs', 'ys', 1,  1,  4,  64, 4,  True,  False, True, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  2,  4,  64, 2,  True,  False, True, True,   False,   0,    0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  2,  4,  64, 2,  True,  False, True, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  4,  64, 1,  True,  False, True, True,   False,   0,    0,    0)],
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  4,  64, 1,  True,  False, True, False,   0,    0)],
+                        '512' : [ h_traits('x', 'y', 'xs', 'ys', 1,  1,  4,  64, 8,  True,  False, True, True,   False,   0,    0,    0),
-                        '512' : [ h_traits('x', 'y', 'xs', 'ys', 1,  1,  4,  64, 8,  True,  False, True, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  2,  4,  64, 4,  True,  False, True, True,   False,   0,    0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  2,  4,  64, 4,  True,  False, True, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  4,  64, 2,  True,  False, True, True,   False,   0,    0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  4,  64, 2,  True,  False, True, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  8,  4,  64, 1,  True,  False, True, True,   False,   0,    0,    0)],
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  8,  4,  64, 1,  True,  False, True, False,   0,    0)],
+                        '768' : [ h_traits('x', 'y', 'xs', 'ys', 1,  3,  4,  64, 4,  True,  False, True, True,   False,   0,    0,    0),
-                        '768' : [ h_traits('x', 'y', 'xs', 'ys', 1,  3,  4,  64, 4,  True,  False, True, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  6,  4,  64, 2,  True,  False, True, True,   False,   0,    0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  6,  4,  64, 2,  True,  False, True, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1, 12,  4,  64, 1,  True,  False, True, True,   False,   0,    0,    0)],
-                                  h_traits('x', 'y', 'xs', 'ys', 1, 12,  4,  64, 1,  True,  False, True, False,   0,    0)],
+                        '1024' :[ h_traits('x', 'y', 'xs', 'ys', 1,  1,  2, 128, 8,  True,  False, True, True,   False,   0,    0,    0),
-                        '1024' :[ h_traits('x', 'y', 'xs', 'ys', 1,  1,  2, 128, 8,  True,  False, True, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  2,  2, 128, 4,  True,  False, True, True,   False,   0,    0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  2,  2, 128, 4,  True,  False, True, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  2, 128, 2,  True,  False, True, True,   False,   0,    0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  2, 128, 2,  True,  False, True, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  1, 256, 1,  True,  False, True, True,   False,   0,    0,    0)],
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  1, 256, 1,  True,  False, True, False,   0,    0)],
+                        '1536' :[ h_traits('x', 'y', 'xs', 'ys', 1,  3,  4,  64, 8,  True,  False, True, True,   False,   0,    0,    0),
-                        '1536' :[ h_traits('x', 'y', 'xs', 'ys', 1,  3,  4,  64, 8,  True,  False, True, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  3,  2, 128, 4,  True,  False, True, True,   False,   0,    0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  3,  2, 128, 4,  True,  False, True, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  3,  1, 256, 2,  True,  False, True, True,   False,   0,    0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  3,  1, 256, 2,  True,  False, True, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  6,  1, 256, 1,  True,  False, True, True,   False,   0,    0,    0)],
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  6,  1, 256, 1,  True,  False, True, False,   0,    0)],
+                        '2048' :[ h_traits('x', 'y', 'xs', 'ys', 1,  1,  1, 256, 8,  True,  False, True, True,   False,   0,    0,    0),
-                        '2048' :[ h_traits('x', 'y', 'xs', 'ys', 1,  1,  1, 256, 8,  True,  False, True, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  2,  1, 256, 4,  True,  False, True, True,   False,   0,    0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  2,  1, 256, 4,  True,  False, True, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  1, 256, 2,  True,  False, True, True,   False,   0,    0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  1, 256, 2,  True,  False, True, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  8,  1, 256, 1,  True,  False, True, True,   False,   0,    0,    0)],
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  8,  1, 256, 1,  True,  False, True, False,   0,    0)],
+                        '3072' :[ h_traits('x', 'y', 'xs', 'ys', 1,  3,  1, 128, 8,  True,  False, True, True,   False,   0,    0,    0),
-                        '3072' :[ h_traits('x', 'y', 'xs', 'ys', 1,  3,  1, 128, 8,  True,  False, True, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  3,  1, 256, 4,  True,  False, True, True,   False,   0,    0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  3,  1, 256, 4,  True,  False, True, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  6,  1, 256, 2,  True,  False, True, True,   False,   0,    0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  6,  1, 256, 2,  True,  False, True, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  3,  1,1024, 1,  True,  False, True, True,   False,   0,    0,    0)],
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  3,  1,1024, 1,  True,  False, True, False,   0,    0)],
+                        '4096' :[ h_traits('x', 'y', 'xs', 'ys', 1,  2,  1, 256, 8,  True,  False, True, True,   False,   0,    0,    0),
-                        '4096' :[ h_traits('x', 'y', 'xs', 'ys', 1,  2,  1, 256, 8,  True,  False, True, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  1, 256, 4,  True,  False, True, True,   False,   0,    0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  1, 256, 4,  True,  False, True, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  2,  1,1024, 2,  True,  False, True, True,   False,   0,    0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  2,  1,1024, 2,  True,  False, True, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  1,1024, 1,  True,  False, True, True,   False,   0,    0,    0)],
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  1,1024, 1,  True,  False, True, False,   0,    0)],
+                        '6144' :[ h_traits('x', 'y', 'xs', 'ys', 1,  3,  1, 256, 8,  True,  False, True, True,   False,   0,    0,    0),
-                        '6144' :[ h_traits('x', 'y', 'xs', 'ys', 1,  3,  1, 256, 8,  True,  False, True, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  3,  1, 512, 4,  True,  False, True, True,   False,   0,    0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  3,  1, 512, 4,  True,  False, True, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  3,  1,1024, 2,  True,  False, True, True,   False,   0,    0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  3,  1,1024, 2,  True,  False, True, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  6,  1,1024, 1,  True,  False, True, True,   False,   0,    0,    0)],
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  6,  1,1024, 1,  True,  False, True, False,   0,    0)],
+                        '8192' :[ h_traits('x', 'y', 'xs', 'ys', 1,  4,  1, 256, 8,  True,  False, True, True,   False,   0,    0,    0),
-                        '8192' :[ h_traits('x', 'y', 'xs', 'ys', 1,  4,  1, 256, 8,  True,  False, True, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  1, 512, 4,  True,  False, True, True,   False,   0,    0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  1, 512, 4,  True,  False, True, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  1,1024, 2,  True,  False, True, True,   False,   0,    0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  1,1024, 2,  True,  False, True, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  8,  1,1024, 1,  True,  False, True, True,   False,   0,    0,    0)],
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  8,  1,1024, 1,  True,  False, True, False,   0,    0)],
+                        'big'  :[ h_traits('x', 'y', 'xs', 'ys', 1,  2,  1, 256, 8,  True,  False, True, True,    True,   0,    0,    0),
-                        'big'  :[ h_traits('x', 'y', 'xs', 'ys', 1,  2,  1, 256, 8,  True,  False, True,  True,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  1, 256, 4,  True,  False, True, True,    True,   0,    0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  1, 256, 4,  True,  False, True,  True,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  2,  1,1024, 2,  True,  False, True, True,    True,   0,    0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  2,  1,1024, 2,  True,  False, True,  True,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  1,1024, 1,  True,  False, True, True,    True,   0,    0,    0)]}
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  1,1024, 1,  True,  False, True,  True,   0,    0)]}
        total_blob = list()
        for hs_key in h_trait_dict:
            hs = h_trait_dict[hs_key]
            current_n = hs[0].F_Repeat_N * hs[0].F_ThreadPerBlock_N * hs[0].F_Vector_N
-            for dtype, scale_type, fused_add, fused_quant in itertools.product(dtype_list, scale_list, fused_add_list, fused_sweep_list):
+            for dtype, scale_type, xbias, fused_add, fused_quant in itertools.product(dtype_list, scale_list, xbias_list, fused_add_list, fused_sweep_list):
                prec_i, prec_o = dtype.split(',')
-                scale_x, scale_y = scale_type.split(',')
+                scale_sm, scale_y = scale_type.split(',')
                if prec_o in dynamic_quant_out_dtype and fused_quant != 1:
                    continue # skip non dynamic quant case
                if fused_quant == 1 and hs_key == 'big':
@@ -554,20 +582,32 @@ float layernorm2d_fwd(layernorm2d_fwd_traits t,
                    h_ = copy.copy(chs_) # copy the base instance out
                    h_.F_XDataType = prec_i
                    h_.F_YDataType = prec_o
-                    h_.F_XScaleDataType = scale_y
+                    h_.F_SmoothScaleDataType = scale_sm
-                    h_.F_YScaleDataType = scale_x
+                    h_.F_YScaleDataType = scale_y
+                    h_.F_kXbias = xbias
                    h_.F_kFusedAdd = fused_add
                    h_.F_kFusedQuant = fused_quant
+                    # disable welford update for 8bit and 16 bit smallN
+                    if not h_.F_kTwoPass_:
+                        #disable 16 bit when set args disable_16b_welford
+                        if args.disable_16b_welford and prec_i in types_16bit:
+                            h_.F_kWelford_ = False
+                        #disable 8bit by default
+                        elif prec_i in types_8bit or prec_o in types_8bit:
+                            h_.F_kWelford_ = False
+                        #disable 16bit small N
+                        elif prec_i in types_16bit and hs_key == '64':
+                            h_.F_kWelford_ = False
                    current_hs.append(h_) # + "\n"
                #f.write(str(f.parent / GEN_DIR / (blobs.api_common_header_
                current_n_str = 'big' if hs_key == 'big' else current_n
-                total_blob.append(h_instance(dtype, current_n_str, fused_add, fused_quant, current_hs))
+                total_blob.append(h_instance(dtype, current_n_str, xbias, fused_add, fused_quant, current_hs))
        return total_blob
-    def list_blobs(self) -> None:
+    def list_blobs(self, args) -> None:
        w_p = Path(self.working_path)
        list_p = w_p / 'layernorm2d_fwd_blobs.txt'
-        blobs = self.get_blobs()
+        blobs = self.get_blobs(args)
        with list_p.open('w') as list_f:
            # api related file
            list_f.write(str(w_p / (self.name_api + ".cpp"))  + "\n")
@@ -576,11 +616,12 @@ float layernorm2d_fwd(layernorm2d_fwd_traits t,
            for b in blobs:
                list_f.write(str(w_p / (b.name + ".cpp")) + "\n")
-    def gen_blobs(self) -> None:
+    def gen_blobs(self, args) -> None:
        w_p = Path(self.working_path)
-        (w_p / (self.name_api + ".cpp")).write_text(self.content_api)
+        w_str = self.content_api(args)
+        (w_p / (self.name_api + ".cpp")).write_text(w_str)
        (w_p / (self.name_common_header + ".hpp")).write_text(self.content_common_header)
-        blobs = self.get_blobs()
+        blobs = self.get_blobs(args)
        for b in blobs:
            (w_p / (b.name + ".cpp")).write_text(b.content)
@@ -588,14 +629,14 @@ def list_blobs(args):
    api_list = args.api.split(',')
    for api in api_list:
        if api == 'fwd':
-            layernorm_fwd_codegen(args.working_path, args.filter).list_blobs()
+            layernorm_fwd_codegen(args.working_path, args.filter).list_blobs(args)
 def gen_blobs(args):
    api_list = args.api.split(',')
    for api in api_list:
        if api == 'fwd':
-            layernorm_fwd_codegen(args.working_path, args.filter).gen_blobs()
+            layernorm_fwd_codegen(args.working_path, args.filter).gen_blobs(args)
 if __name__ == "__main__":
    parser = argparse.ArgumentParser(
@@ -663,6 +704,13 @@ if __name__ == "__main__":
        help="codegen receipt."
    )
+    parser.add_argument(
+        "--disable_16b_welford",
+        default=False,
+        required=False,
+        # argparse hands over the raw CLI string; without a converter "False"/"0" would be truthy
+        type=lambda v: str(v).strip().lower() in ("1", "true", "yes", "on"),
+        help="disable welford for 16bit datatype n > 64 (accepts true/false, 1/0, yes/no, on/off)"
+    )
    args = parser.parse_args()
    # print(f'{args.list_blobs}-{args.gen_blobs}')

--- a/example/ck_tile/02_layernorm2d/layernorm2d_fwd.cpp
+++ b/example/ck_tile/02_layernorm2d/layernorm2d_fwd.cpp
@@ -35,12 +35,13 @@ auto create_args(int argc, char* argv[])
        .insert("kname", "1", "print kernel name or not")
        .insert("prec_i", "fp16", "input precision")
        .insert("prec_o", "auto", "output precision, set auto will be the same as input")
-        .insert("prec_sx",
+        .insert("prec_sm",
                "auto",
                "output quant scale type, set auto will use fp32. used when fquant=1")
        .insert("prec_sy",
                "auto",
                "output quant scale type, set auto will use fp32. used when fquant=1 or 2")
+        .insert("xbias", "0", "add bias, 0:no add, 1:add bias before fadd")
        .insert("fadd", "0", "fused-add, 0:no fused add, 1:preadd+store, 2:preadd only")
        .insert("fquant", "0", "fused-quant, 0:no, 1:smooth-dynamic-quant, 2:dynamic-quant")
        .insert("warmup", "5", "cold iter")
@@ -52,7 +53,7 @@ auto create_args(int argc, char* argv[])
 template <typename InDataType,
          typename OutDataType,
-          typename XScaleDataType,
+          typename SmoothScaleDataType,
          typename YScaleDataType,
          bool SaveMeanVar>
 bool run(const ck_tile::ArgParser& arg_parser)
@@ -74,15 +75,15 @@ bool run(const ck_tile::ArgParser& arg_parser)
    float epsilon       = arg_parser.get_float("e");
    std::string prec_i  = arg_parser.get_str("prec_i");
    std::string prec_o  = arg_parser.get_str("prec_o");
-    std::string prec_sx = arg_parser.get_str("prec_sx");
+    std::string prec_sm = arg_parser.get_str("prec_sm");
    std::string prec_sy = arg_parser.get_str("prec_sy");
    if(prec_o == "auto")
    {
        prec_o = prec_i;
    }
-    if(prec_sx == "auto")
+    if(prec_sm == "auto")
    {
-        prec_sx = "fp32";
+        prec_sm = "fp32";
    }
    if(prec_sy == "auto")
    {
@@ -93,6 +94,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
    int do_validation = arg_parser.get_int("v");
    int warmup        = arg_parser.get_int("warmup");
    int repeat        = arg_parser.get_int("repeat");
+    int xbias         = arg_parser.get_int("xbias");
    int fused_add     = arg_parser.get_int("fadd");
    int fused_quant   = arg_parser.get_int("fquant");
    if(fused_quant == 1 && prec_o != "int8")
@@ -103,10 +105,12 @@ bool run(const ck_tile::ArgParser& arg_parser)
    assert(x_stride >= n);
-    using TypeConfig = LayerNormTypeConfig<InDataType, OutDataType, XScaleDataType, YScaleDataType>;
+    using TypeConfig =
+        LayerNormTypeConfig<InDataType, OutDataType, SmoothScaleDataType, YScaleDataType>;
    using XDataType         = typename TypeConfig::XDataType;
    using YDataType         = typename TypeConfig::YDataType;
+    using XBiasDataType     = typename TypeConfig::XBiasDataType;
    using GammaDataType     = typename TypeConfig::GammaDataType;
    using BetaDataType      = typename TypeConfig::BetaDataType;
    using XResidualDataType = XDataType;
@@ -121,6 +125,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
    // host verify
    ck_tile::HostTensor<XDataType> x_host({m, n}, {x_stride, 1});
+    ck_tile::HostTensor<XBiasDataType> x_bias_host({n});
    ck_tile::HostTensor<GammaDataType> gamma_host({n});
    ck_tile::HostTensor<BetaDataType> beta_host({n});
@@ -135,30 +140,33 @@ bool run(const ck_tile::ArgParser& arg_parser)
    ck_tile::HostTensor<YScaleDataType> y_scale_host_ref({m});
    ck_tile::HostTensor<YScaleDataType> y_scale_host_dev({m});
-    ck_tile::HostTensor<XScaleDataType> x_scale_host({n});
+    ck_tile::HostTensor<SmoothScaleDataType> sm_scale_host({n});
-    ck_tile::HostTensor<XScaleDataType> x_scale_host_dev({n});
+    ck_tile::HostTensor<SmoothScaleDataType> sm_scale_host_dev({n});
    ck_tile::FillUniformDistribution<XDataType>{-.5f, .5f}(x_host);
    ck_tile::FillUniformDistribution<XResidualDataType>{-.5f, .5f}(x_residual_host);
-    ck_tile::FillUniformDistribution<XScaleDataType>{-1.f, 1.f}(x_scale_host);
+    ck_tile::FillUniformDistribution<SmoothScaleDataType>{-1.f, 1.f}(sm_scale_host);
+    ck_tile::FillUniformDistribution<XBiasDataType>{-.5f, .5f}(x_bias_host);
    ck_tile::FillUniformDistribution<GammaDataType>{-.5f, .5f}(gamma_host);
    ck_tile::FillUniformDistribution<BetaDataType>{-.5f, .5f}(beta_host);
    ck_tile::DeviceMem x_buf(x_host.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem x_bias_buf(x_bias_host.get_element_space_size_in_bytes());
    ck_tile::DeviceMem gamma_buf(gamma_host.get_element_space_size_in_bytes());
    ck_tile::DeviceMem beta_buf(beta_host.get_element_space_size_in_bytes());
    ck_tile::DeviceMem y_buf(y_host_dev.get_element_space_size_in_bytes());
    ck_tile::DeviceMem y_scale_buf(y_scale_host_dev.get_element_space_size_in_bytes());
-    ck_tile::DeviceMem x_scale_buf(x_scale_host_dev.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem sm_scale_buf(sm_scale_host_dev.get_element_space_size_in_bytes());
    ck_tile::DeviceMem x_residual_buf(x_residual_host.get_element_space_size_in_bytes());
    ck_tile::DeviceMem y_residual_buf(y_residual_host.get_element_space_size_in_bytes());
    x_buf.ToDevice(x_host.data());
+    x_bias_buf.ToDevice(x_bias_host.data());
    gamma_buf.ToDevice(gamma_host.data());
    beta_buf.ToDevice(beta_host.data());
    x_residual_buf.ToDevice(x_residual_host.data());
-    x_scale_buf.ToDevice(x_scale_host.data());
+    sm_scale_buf.ToDevice(sm_scale_host.data());
    auto prec_str = [&]() {
        auto base_str = prec_i;
@@ -179,11 +187,12 @@ bool run(const ck_tile::ArgParser& arg_parser)
              << ", yr_stride:" << yr_stride << std::flush;
    layernorm2d_fwd_traits traits{
-        prec_i, prec_o, prec_sx, prec_sy, SaveMeanVar, fused_add, fused_quant};
+        prec_i, prec_o, prec_sm, prec_sy, SaveMeanVar, xbias, fused_add, fused_quant};
    layernorm2d_fwd_args args{x_buf.GetDeviceBuffer(),
                              fused_add != 0 ? x_residual_buf.GetDeviceBuffer() : nullptr,
-                              fused_quant == 1 ? x_scale_buf.GetDeviceBuffer() : nullptr,
+                              fused_quant == 1 ? sm_scale_buf.GetDeviceBuffer() : nullptr,
+                              x_bias_buf.GetDeviceBuffer(),
                              gamma_buf.GetDeviceBuffer(),
                              beta_buf.GetDeviceBuffer(),
@@ -210,8 +219,9 @@ bool run(const ck_tile::ArgParser& arg_parser)
        return false;
    }
-    std::size_t num_byte = sizeof(XDataType) * m * n + sizeof(GammaDataType) * n +
+    std::size_t num_byte = sizeof(XDataType) * m * n + sizeof(XBiasDataType) * n +
-                           sizeof(BetaDataType) * n + sizeof(YDataType) * m * n;
+                           sizeof(GammaDataType) * n + sizeof(BetaDataType) * n +
+                           sizeof(YDataType) * m * n;
    float gb_per_sec = num_byte / 1.E6 / ave_time;
    std::cout << ", " << ave_time * 1.E3 << " us, " << gb_per_sec << " GB/s" << std::flush;
@@ -221,6 +231,22 @@ bool run(const ck_tile::ArgParser& arg_parser)
    if(do_validation)
    {
        // reference
+        if(xbias != 0)
+        {
+            // add bias before fadd
+            int M = x_host.mDesc.get_lengths()[0];
+            int N = x_host.mDesc.get_lengths()[1];
+            for(int idx_m = 0; idx_m < M; ++idx_m)
+            {
+                for(int idx_n = 0; idx_n < N; ++idx_n)
+                {
+                    x_host(idx_m, idx_n) = ck_tile::type_convert<XDataType>(
+                        ck_tile::type_convert<ComputeDataType>(x_host(idx_m, idx_n)) +
+                        ck_tile::type_convert<ComputeDataType>(x_bias_host(idx_n)));
+                }
+            }
+        }
        if(fused_add != 0)
        {
            // fused pre_add/pre_add_store
@@ -254,8 +280,8 @@ bool run(const ck_tile::ArgParser& arg_parser)
                    for(int n_ = 0; n_ < N_; n_++)
                    {
                        // input smooth outlier
-                        acc_(m_, n_) =
+                        acc_(m_, n_) = acc_(m_, n_) *
-                            acc_(m_, n_) * ck_tile::type_convert<ComputeDataType>(x_scale_host(n_));
+                                       ck_tile::type_convert<ComputeDataType>(sm_scale_host(n_));
                    }
                }
                ComputeDataType absmax = static_cast<ComputeDataType>(0);
@@ -377,16 +403,16 @@ int main(int argc, char* argv[])
    std::string prec_i  = arg_parser.get_str("prec_i");
    std::string prec_o  = arg_parser.get_str("prec_o");
-    std::string prec_sx = arg_parser.get_str("prec_sx");
+    std::string prec_sm = arg_parser.get_str("prec_sm");
    std::string prec_sy = arg_parser.get_str("prec_sy");
    if(prec_o == "auto")
    {
        prec_o = prec_i;
    }
-    if(prec_sx == "auto")
+    if(prec_sm == "auto")
    {
-        prec_sx = "fp32";
+        prec_sm = "fp32";
    }
    if(prec_sy == "auto")
    {
@@ -395,33 +421,33 @@ int main(int argc, char* argv[])
    int save_mv = arg_parser.get_int("save_mv");
    // no dynamic quant case
-    if(prec_i == "fp16" && prec_o == "fp16" && prec_sx == "fp32" && prec_sy == "fp32" && save_mv)
+    if(prec_i == "fp16" && prec_o == "fp16" && prec_sm == "fp32" && prec_sy == "fp32" && save_mv)
    {
        return run<ck_tile::half_t, ck_tile::half_t, float, float, true>(arg_parser) ? 0 : -2;
    }
-    else if(prec_i == "fp16" && prec_o == "fp16" && prec_sx == "fp32" && prec_sy == "fp32" &&
+    else if(prec_i == "fp16" && prec_o == "fp16" && prec_sm == "fp32" && prec_sy == "fp32" &&
            !save_mv)
    {
        return run<ck_tile::half_t, ck_tile::half_t, float, float, false>(arg_parser) ? 0 : -2;
    }
-    else if(prec_i == "bf16" && prec_o == "bf16" && prec_sx == "fp32" && prec_sy == "fp32" &&
+    else if(prec_i == "bf16" && prec_o == "bf16" && prec_sm == "fp32" && prec_sy == "fp32" &&
            save_mv)
    {
        return run<ck_tile::bf16_t, ck_tile::bf16_t, float, float, true>(arg_parser) ? 0 : -2;
    }
-    else if(prec_i == "bf16" && prec_o == "bf16" && prec_sx == "fp32" && prec_sy == "fp32" &&
+    else if(prec_i == "bf16" && prec_o == "bf16" && prec_sm == "fp32" && prec_sy == "fp32" &&
            !save_mv)
    {
-        return run<ck_tile::bf16_t, ck_tile::bf16_t, float, float, true>(arg_parser) ? 0 : -2;
+        // !save_mv selects SaveMeanVar = false, mirroring the fp16 branch above
+        return run<ck_tile::bf16_t, ck_tile::bf16_t, float, float, false>(arg_parser) ? 0 : -2;
    }
    // dynamic quant case, only in inference
-    else if(prec_i == "fp16" && prec_o == "int8" && prec_sx == "fp32" && prec_sy == "fp32" &&
+    else if(prec_i == "fp16" && prec_o == "int8" && prec_sm == "fp32" && prec_sy == "fp32" &&
            !save_mv)
    {
        return run<ck_tile::half_t, ck_tile::int8_t, float, float, false>(arg_parser) ? 0 : -2;
    }
-    else if(prec_i == "bf16" && prec_o == "int8" && prec_sx == "fp32" && prec_sy == "fp32" &&
+    else if(prec_i == "bf16" && prec_o == "int8" && prec_sm == "fp32" && prec_sy == "fp32" &&
            !save_mv)
    {
        return run<ck_tile::bf16_t, ck_tile::int8_t, float, float, false>(arg_parser) ? 0 : -2;

--- a/example/ck_tile/02_layernorm2d/layernorm2d_fwd.hpp
+++ b/example/ck_tile/02_layernorm2d/layernorm2d_fwd.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
@@ -8,35 +8,40 @@
 #include "ck_tile/ops/layernorm2d.hpp"
 #include <string>
-template <typename InType, typename OutType, typename XScaleDataType_, typename YScaleDataType_>
+// Primary template, declared only; the partial specializations on InType supply the type aliases.
+template <typename InType,
+          typename OutType,
+          typename SmoothScaleDataType_,
+          typename YScaleDataType_>
 struct LayerNormTypeConfig;
-template <typename OutType, typename XScaleDataType_, typename YScaleDataType_>
+template <typename OutType, typename SmoothScaleDataType_, typename YScaleDataType_>
-struct LayerNormTypeConfig<ck_tile::half_t, OutType, XScaleDataType_, YScaleDataType_>
+struct LayerNormTypeConfig<ck_tile::half_t, OutType, SmoothScaleDataType_, YScaleDataType_>
 {
-    using XDataType       = ck_tile::half_t;
+    using XDataType           = ck_tile::half_t;
-    using YDataType       = OutType;
+    using YDataType           = OutType;
-    using GammaDataType   = ck_tile::half_t;
+    using XBiasDataType       = ck_tile::half_t;
-    using BetaDataType    = ck_tile::half_t;
+    using GammaDataType       = ck_tile::half_t;
-    using MeanDataType    = ck_tile::half_t;
+    using BetaDataType        = ck_tile::half_t;
-    using InvStdDataType  = ck_tile::half_t;
+    using MeanDataType        = ck_tile::half_t;
-    using ComputeDataType = float;
+    using InvStdDataType      = ck_tile::half_t;
-    using XScaleDataType  = XScaleDataType_;
+    using ComputeDataType     = float;
-    using YScaleDataType  = YScaleDataType_;
+    using SmoothScaleDataType = SmoothScaleDataType_;
+    using YScaleDataType      = YScaleDataType_;
 };
-template <typename OutType, typename XScaleDataType_, typename YScaleDataType_>
+template <typename OutType, typename SmoothScaleDataType_, typename YScaleDataType_>
-struct LayerNormTypeConfig<ck_tile::bf16_t, OutType, XScaleDataType_, YScaleDataType_>
+struct LayerNormTypeConfig<ck_tile::bf16_t, OutType, SmoothScaleDataType_, YScaleDataType_>
 {
-    using XDataType       = ck_tile::bf16_t;
+    using XDataType           = ck_tile::bf16_t;
-    using YDataType       = OutType;
+    using YDataType           = OutType;
-    using GammaDataType   = ck_tile::bf16_t;
+    using XBiasDataType       = ck_tile::bf16_t;
-    using BetaDataType    = ck_tile::bf16_t;
+    using GammaDataType       = ck_tile::bf16_t;
-    using MeanDataType    = ck_tile::bf16_t;
+    using BetaDataType        = ck_tile::bf16_t;
-    using InvStdDataType  = ck_tile::bf16_t;
+    using MeanDataType        = ck_tile::bf16_t;
-    using ComputeDataType = float;
+    using InvStdDataType      = ck_tile::bf16_t;
-    using XScaleDataType  = XScaleDataType_;
+    using ComputeDataType     = float;
-    using YScaleDataType  = YScaleDataType_;
+    using SmoothScaleDataType = SmoothScaleDataType_;
+    using YScaleDataType      = YScaleDataType_;
 };
 // runtime args
@@ -50,13 +55,14 @@ struct layernorm2d_fwd_traits
    std::string prec_i; // input precision
    std::string prec_o; // output precision
-    // if fused_quant == 1, need set prec_sx/prec_sy to proper string, otherwise can set
+    // if fused_quant == 1, need set prec_sm/prec_sy to proper string, otherwise can set
    // arbitrary(will skip check) if fused_quant == 2, need set prec_sy to proper string, otherwise
    // can set arbitrary(will skip check)
-    std::string prec_sx; // x-scale, used for [1*N] input smooth quant
+    std::string prec_sm; // smooth-scale, used for [1*N] input smooth quant
    std::string prec_sy; // y-scale, used for [M*1] output for next layer
    bool save_mean_var; //
+    int xbias;          // 0:no-bias, 1:add bias (applied before fused-add)
    int fused_add;      // 0:no-add, 1:pre-add-store, 2:pre-add
    int fused_quant;    // 0:no-sweep, 1:smooth-dynamic-quant, 2:dynamic-quant
 };

--- a/example/ck_tile/02_layernorm2d/script/smoke_test.sh
+++ b/example/ck_tile/02_layernorm2d/script/smoke_test.sh
@@ -27,7 +27,8 @@ $EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=7   -n=2734
 $EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=1   -n=3182
 $EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=9   -n=4096
 $EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=3   -n=8192
-#$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=1   -n=10547
+$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=3   -n=9120
+$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=1   -n=10547
 #$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=3   -n=17134
 done
 done