Commit 0eb75e21 authored by carlushuang

Merge remote-tracking branch 'origin/develop' into ck_tile/moe

parents 1b4b640b c8b6b642
-rocm-docs-core==1.6.1
+rocm-docs-core==1.7.0
sphinxcontrib-bibtex==2.6.2
@@ -103,7 +103,7 @@ requests==2.32.3
# via
# pygithub
# sphinx
-rocm-docs-core==1.6.1
+rocm-docs-core==1.7.0
# via -r requirements.in
six==1.16.0
# via pybtex
...
// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#include <algorithm>
#include <cassert>
@@ -139,7 +139,7 @@ inline bool parse_cmd_args(int argc,
inline HostTensorDescriptor
make_r0_host_tensor_descriptor(const ck::utils::conv::ConvParam& problem_size)
{
-    std::vector<ck::index_t> dimensions{problem_size.G_, problem_size.N_};
+    std::vector<ck::long_index_t> dimensions{problem_size.G_, problem_size.N_};
    ck::ranges::copy(problem_size.output_spatial_lengths_, std::back_inserter(dimensions));
...
// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <initializer_list>
@@ -255,34 +255,61 @@ int main(int argc, char* argv[])
    else
    {
        // for testing half_t
+        pass =
+            pass && reduce_blockwise_test<ck::half_t, float, ReduceOpId, PropagateNan, OutputIndex>(
+                true, 2, true, {3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3}, {0, 1, 2}, 1.0f, 0.0f);
        pass =
            pass && reduce_blockwise_test<ck::half_t, float, ReduceOpId, PropagateNan, OutputIndex>(
                true, 2, true, {16, 64, 32, 960}, {0, 1, 2}, 1.0f, 0.0f);
        // for testing float
+        pass =
+            pass && reduce_blockwise_test<float, float, ReduceOpId, PropagateNan, OutputIndex>(
+                true, 2, true, {3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3}, {0, 1, 2}, 1.0f, 0.0f);
        pass = pass && reduce_blockwise_test<float, float, ReduceOpId, PropagateNan, OutputIndex>(
            true, 2, true, {16, 64, 32, 960}, {0, 1, 2}, 1.0f, 0.0f);
        // for testing double
+        pass =
+            pass && reduce_blockwise_test<float, float, ReduceOpId, PropagateNan, OutputIndex>(
+                true, 2, true, {3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3}, {0, 1, 2}, 1.0f, 0.0f);
        pass = pass && reduce_blockwise_test<float, float, ReduceOpId, PropagateNan, OutputIndex>(
            true, 2, true, {16, 64, 32, 960}, {0, 1, 2}, 1.0f, 0.0f);
        // for testing bhalf_t
+        pass = pass &&
+               reduce_blockwise_test<ck::bhalf_t, float, ReduceOpId, PropagateNan, OutputIndex>(
+                   true, 2, true, {3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3}, {0, 1, 2}, 1.0f, 0.0f);
        pass = pass &&
               reduce_blockwise_test<ck::bhalf_t, float, ReduceOpId, PropagateNan, OutputIndex>(
                   true, 2, true, {16, 64, 32, 960}, {0, 1, 2}, 1.0f, 0.0f);
        // for testing int8_t
+        pass =
+            pass && reduce_blockwise_test<int8_t, int32_t, ReduceOpId, PropagateNan, OutputIndex>(
+                true, 2, true, {3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3}, {0, 1, 2}, 1.0f, 0.0f);
        pass =
            pass && reduce_blockwise_test<int8_t, int32_t, ReduceOpId, PropagateNan, OutputIndex>(
                true, 2, true, {16, 64, 32, 960}, {0, 1, 2}, 1.0f, 0.0f);
#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
        // for testing int4_t using AVG operation
+        pass =
+            pass && reduce_blockwise_test<int4_t, int32_t, ReduceTensorOp::AVG, false, false>(
+                true, 2, true, {3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3}, {0, 1, 2}, 1.0f, 0.0f);
        pass = pass && reduce_blockwise_test<int4_t, int32_t, ReduceTensorOp::AVG, false, false>(
            true, 2, true, {16, 64, 32, 960}, {0, 1, 2}, 1.0f, 0.0f);
        // for testing int4_t using MAX operation
+        pass =
+            pass && reduce_blockwise_test<int4_t, int8_t, ReduceTensorOp::MAX, false, false>(
+                true, 2, true, {3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3}, {0, 1, 2}, 1.0f, 0.0f);
        pass = pass && reduce_blockwise_test<int4_t, int8_t, ReduceTensorOp::MAX, false, false>(
            true, 2, true, {16, 64, 32, 960}, {0, 1, 2}, 1.0f, 0.0f);
#endif
...
// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
@@ -316,7 +316,17 @@ int reduce_blockwise_impl(bool do_verification,
    auto invoker_ptr = reduce.MakeInvokerPointer();
-    float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
+    int log_level = 0, cold_niters = 5, nrepeat = 50;
+    if(beta != 0.0f)
+    {
+        std::cerr << "Warning: With beta != 0.0f there must be only one repeat for correct results "
+                     "since out memory is being overwritten."
+                  << std::endl;
+        cold_niters = 0;
+        nrepeat     = 1;
+    }
+    float avg_time = invoker_ptr->Run(
+        argument_ptr.get(), StreamConfig{nullptr, time_kernel, log_level, cold_niters, nrepeat});
    std::size_t num_bytes = invariant_total_length * reduce_total_length * sizeof(InOutDataType) +
                            invariant_total_length * sizeof(InOutDataType);
...
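The warning above is easy to see with a scalar stand-in: when beta != 0.0f the destination feeds back into the next launch, so a timing loop that repeats the kernel no longer reproduces the single-launch result. A minimal sketch, with hypothetical alpha/beta/input values not taken from the example:

#include <iostream>

int main()
{
    const float alpha = 1.0f, beta = 0.5f;
    float out = 2.0f; // destination as initialized by the host

    // stands in for one kernel launch computing out = alpha * reduce(in) + beta * out
    auto launch = [&](float reduced) { out = alpha * reduced + beta * out; };

    launch(10.0f);            // first launch:  10 + 0.5 * 2  = 11
    std::cout << out << "\n"; // 11
    launch(10.0f);            // timing repeat: 10 + 0.5 * 11 = 15.5, a different answer
    std::cout << out << "\n"; // 15.5 -- hence cold_niters = 0 and nrepeat = 1 when beta != 0
    return 0;
}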
// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
@@ -38,7 +38,8 @@ struct ReduceShape
    static constexpr ck::index_t NumReduceDim_ = NumReduceDim;
};
-using reduce_shape_instances = std::tuple<ReduceShape<3, 1>,
+using reduce_shape_instances = std::tuple<ReduceShape<12, 3>,
+                                          ReduceShape<3, 1>,
                                          ReduceShape<3, 2>,
                                          ReduceShape<4, 1>,
                                          ReduceShape<4, 2>,
...
// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
@@ -80,6 +80,29 @@ int run_conv_bwd_data(bool do_verification,
    // reset input to zero
    in_device_buf.SetZero();
+    std::vector<ck::index_t> input_spatial_lengths_i32(NDimSpatial);
+    std::vector<ck::index_t> filter_spatial_lengths_i32(NDimSpatial);
+    std::vector<ck::index_t> output_spatial_lengths_i32(NDimSpatial);
+    std::vector<ck::index_t> conv_filter_strides_i32(NDimSpatial);
+    std::vector<ck::index_t> conv_filter_dilations_i32(NDimSpatial);
+    std::vector<ck::index_t> input_left_pads_i32(NDimSpatial);
+    std::vector<ck::index_t> input_right_pads_i32(NDimSpatial);
+    for(ck::index_t d = 0; d < NDimSpatial; d++)
+    {
+        input_spatial_lengths_i32[d] =
+            static_cast<ck::index_t>(conv_param.input_spatial_lengths_[d]);
+        filter_spatial_lengths_i32[d] =
+            static_cast<ck::index_t>(conv_param.filter_spatial_lengths_[d]);
+        output_spatial_lengths_i32[d] =
+            static_cast<ck::index_t>(conv_param.GetOutputSpatialLengths()[d]);
+        conv_filter_strides_i32[d] = static_cast<ck::index_t>(conv_param.conv_filter_strides_[d]);
+        conv_filter_dilations_i32[d] =
+            static_cast<ck::index_t>(conv_param.conv_filter_dilations_[d]);
+        input_left_pads_i32[d] = static_cast<ck::index_t>(conv_param.input_left_pads_[d]);
+        input_right_pads_i32[d] = static_cast<ck::index_t>(conv_param.input_right_pads_[d]);
+    }
    // do GEMM
    auto conv = DeviceConvNdBwdDataInstance{};
    auto invoker = conv.MakeInvoker();
@@ -87,16 +110,16 @@ int run_conv_bwd_data(bool do_verification,
        conv.MakeArgumentPointer(static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
                                 static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
                                 static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
-                                 conv_param.N_,
-                                 conv_param.K_,
-                                 conv_param.C_,
-                                 conv_param.input_spatial_lengths_,
-                                 conv_param.filter_spatial_lengths_,
-                                 conv_param.GetOutputSpatialLengths(),
-                                 conv_param.conv_filter_strides_,
-                                 conv_param.conv_filter_dilations_,
-                                 conv_param.input_left_pads_,
-                                 conv_param.input_right_pads_,
+                                 static_cast<ck::index_t>(conv_param.N_),
+                                 static_cast<ck::index_t>(conv_param.K_),
+                                 static_cast<ck::index_t>(conv_param.C_),
+                                 input_spatial_lengths_i32,
+                                 filter_spatial_lengths_i32,
+                                 output_spatial_lengths_i32,
+                                 conv_filter_strides_i32,
+                                 conv_filter_dilations_i32,
+                                 input_left_pads_i32,
+                                 input_right_pads_i32,
                                 in_element_op,
                                 wei_element_op,
                                 out_element_op);
...
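The new *_i32 vectors and static_casts above narrow the wider ConvParam fields down to ck::index_t before calling MakeArgumentPointer. A minimal sketch of a checked alternative to a raw static_cast, assuming ck::index_t is a 32-bit signed integer; the checked_narrow helper below is illustrative and not part of this diff:

#include <cstdint>
#include <limits>
#include <stdexcept>

// Hypothetical helper: narrow a wide convolution size, throwing instead of silently truncating.
template <typename Narrow, typename Wide>
Narrow checked_narrow(Wide v)
{
    if(v > static_cast<Wide>(std::numeric_limits<Narrow>::max()) ||
       v < static_cast<Wide>(std::numeric_limits<Narrow>::lowest()))
        throw std::out_of_range("convolution size does not fit into the narrow index type");
    return static_cast<Narrow>(v);
}

int main()
{
    const long long spatial_length = 960; // stands in for a wide ConvParam field
    const int32_t narrowed = checked_narrow<int32_t>(spatial_length); // would throw on overflow
    return narrowed == 960 ? 0 : 1;
}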
@@ -23,12 +23,8 @@
using BF16 = ck::bhalf_t;
using F16 = ck::half_t;
using F32 = float;
-#ifdef CK_ENABLE_FP8
using F8 = ck::f8_t;
-#endif
-#ifdef CK_ENABLE_BF8
using BF8 = ck::bf8_t;
-#endif
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
...
@@ -208,6 +208,7 @@ int main(int argc, char* argv[])
                             StrideB,
                             std::array<ck::index_t, NumDTensor>{StrideD, StrideD},
                             StrideE,
+                             1,
                             a_element_op,
                             b_element_op,
                             cde_element_op);
...
@@ -69,7 +69,7 @@ using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CDEElementOp = MultiplyMultiply;
-static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::Default;
+static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNPadding;
using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultiD_Xdl_CShuffle_V3
// clang-format off
@@ -99,6 +99,8 @@ int main(int argc, char* argv[])
    ck::index_t StrideD = 0;
    ck::index_t StrideE = N;
+    ck::index_t KBatch = 1;
    if(argc == 1)
    {
        // use default case
@@ -109,7 +111,7 @@ int main(int argc, char* argv[])
        init_method = std::stoi(argv[2]);
        time_kernel = std::stoi(argv[3]);
    }
-    else if(argc == 11)
+    else if(argc == 12)
    {
        do_verification = std::stoi(argv[1]);
        init_method = std::stoi(argv[2]);
@@ -123,13 +125,16 @@ int main(int argc, char* argv[])
        StrideB = std::stoi(argv[8]);
        StrideD = std::stoi(argv[9]);
        StrideE = std::stoi(argv[10]);
+        KBatch = std::stoi(argv[11]);
    }
    else
    {
        printf("arg1: verification (0=no, 1=yes)\n");
        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
        printf("arg3: time kernel (0=no, 1=yes)\n");
-        printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideD, StrideE\n");
+        printf(
+            "arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideD, StrideE, KBatch\n");
        exit(0);
    }
@@ -212,6 +217,7 @@ int main(int argc, char* argv[])
                             StrideB,
                             std::array<ck::index_t, NumDTensor>{I0, I0},
                             StrideE,
+                             KBatch,
                             a_element_op,
                             b_element_op,
                             cde_element_op);
@@ -236,10 +242,12 @@ int main(int argc, char* argv[])
    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
              << std::endl;
-    e_device_buf.FromDevice(e_m_n_device_result.mData.data());
    if(do_verification)
    {
+        invoker.Run(argument, StreamConfig{nullptr, false});
+        e_device_buf.FromDevice(e_m_n_device_result.mData.data());
        Tensor<CShuffleDataType> c_m_n({M, N});
        using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<A0DataType,
...
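The new KBatch argument exposes a batch count over the K dimension (split-K) to the GEMM instance. As a rough host-side illustration only, not the device kernel: split-K partitions K into KBatch chunks and accumulates the partial dot products, which leaves the final result unchanged:

#include <iostream>
#include <vector>

int main()
{
    const int K = 8, KBatch = 2;
    const std::vector<float> a{1, 2, 3, 4, 5, 6, 7, 8};
    const std::vector<float> b{8, 7, 6, 5, 4, 3, 2, 1};

    float full = 0.f; // single pass over K
    for(int k = 0; k < K; ++k)
        full += a[k] * b[k];

    float split = 0.f; // KBatch partial reductions, then accumulate
    const int chunk = K / KBatch;
    for(int kb = 0; kb < KBatch; ++kb)
        for(int k = kb * chunk; k < (kb + 1) * chunk; ++k)
            split += a[k] * b[k];

    std::cout << full << " == " << split << "\n"; // both print 120
    return 0;
}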
@@ -72,10 +72,24 @@ function(add_example_executable EXAMPLE_NAME FILE_NAME)
            list(REMOVE_ITEM FILE_NAME "${source}")
        endif()
    endforeach()
+    #Do not build any FP8 examples if CK_ENABLE_FP8 not set
+    foreach(source IN LISTS FILE_NAME)
+        if(NOT DEFINED CK_ENABLE_FP8 AND source MATCHES "_fp8")
+            message("removing fp8 example ${source} ")
+            list(REMOVE_ITEM FILE_NAME "${source}")
+        endif()
+    endforeach()
+    #Do not build any BF8 examples if CK_ENABLE_BF8 not set
+    foreach(source IN LISTS FILE_NAME)
+        if(NOT DEFINED CK_ENABLE_BF8 AND source MATCHES "_bf8")
+            message("removing bf8 example ${source} ")
+            list(REMOVE_ITEM FILE_NAME "${source}")
+        endif()
+    endforeach()
    #only continue if there are some source files left on the list
    if(FILE_NAME)
        if(FILE_NAME MATCHES "_xdl")
-            list(REMOVE_ITEM EX_TARGETS gfx1030 gfx1100 gfx1101 gfx1102 gfx1103)
+            list(REMOVE_ITEM EX_TARGETS gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201)
        elseif(FILE_NAME MATCHES "_wmma")
            list(REMOVE_ITEM EX_TARGETS gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030)
        endif()
@@ -162,7 +176,7 @@ function(add_example_executable_no_testing EXAMPLE_NAME FILE_NAME)
    #only continue if there are some source files left on the list
    if(FILE_NAME)
        if(FILE_NAME MATCHES "_xdl")
-            list(REMOVE_ITEM EX_TARGETS gfx900 gfx906 gfx1030 gfx1100 gfx1101 gfx1102 gfx1103)
+            list(REMOVE_ITEM EX_TARGETS gfx900 gfx906 gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201)
        elseif(FILE_NAME MATCHES "_wmma")
            list(REMOVE_ITEM EX_TARGETS gfx900 gfx906 gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030)
        endif()
...
@@ -6,7 +6,7 @@ execute_process(
execute_process(
    COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_LIST_DIR}/generate.py
-    --api bwd --list_blobs ${CMAKE_CURRENT_BINARY_DIR}/bwd_blob_list.txt
+    --api bwd --list_blobs ${CMAKE_CURRENT_BINARY_DIR}/bwd_blob_list.txt --receipt 3
)
# NOTE: for cmake, the FMHA_FWD_GEN_BLOBS/FMHA_BWD_GEN_BLOBS files must be in the same directory
@@ -23,7 +23,7 @@ add_custom_command(
add_custom_command(
    OUTPUT ${FMHA_BWD_GEN_BLOBS}
    COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_LIST_DIR}/generate.py
-    --api bwd --output_dir ${CMAKE_CURRENT_BINARY_DIR}
+    --api bwd --output_dir ${CMAKE_CURRENT_BINARY_DIR} --receipt 3
)
set(EXAMPLE_FMHA_FWD "tile_example_fmha_fwd")
@@ -55,11 +55,10 @@ set(EXAMPLE_FMHA_BWD_COMPILE_OPTIONS)
# ... because they are auto-generated
if(FMHA_FWD_FAST_EXP2)
    list(APPEND EXAMPLE_FMHA_FWD_COMPILE_OPTIONS -Wno-undefined-func-template -DCK_TILE_FMHA_FWD_FAST_EXP2=1 -fgpu-flush-denormals-to-zero)
-    list(APPEND EXAMPLE_FMHA_BWD_COMPILE_OPTIONS -Wno-undefined-func-template -DCK_TILE_FMHA_FWD_FAST_EXP2=1 -fgpu-flush-denormals-to-zero)
else()
    list(APPEND EXAMPLE_FMHA_FWD_COMPILE_OPTIONS -Wno-undefined-func-template -DCK_TILE_FMHA_FWD_FAST_EXP2=0)
-    list(APPEND EXAMPLE_FMHA_BWD_COMPILE_OPTIONS -Wno-undefined-func-template -DCK_TILE_FMHA_FWD_FAST_EXP2=0)
endif()
+list(APPEND EXAMPLE_FMHA_BWD_COMPILE_OPTIONS -Wno-undefined-func-template -fgpu-flush-denormals-to-zero)
# Allow comparing floating points directly in order to check sentinel values
list(APPEND EXAMPLE_FMHA_FWD_COMPILE_OPTIONS -Wno-float-equal)
...
@@ -66,6 +66,22 @@ BIAS_CHECK_MAP = {
    "alibi" : "bias_enum::alibi"
}
+DROPOUT_MAP = {
+    "no" : "ck_tile::BlockDropoutBwd<false, true, false>",
+    "dropout_wg32" : "ck_tile::BlockDropoutBwd<true, true, false>",
+    "dropout_wg32_storerandval" : "ck_tile::BlockDropoutBwd<true, true, true >",
+    "dropout_wg16" : "ck_tile::BlockDropoutBwd<true, false, false>",
+    "dropout_wg16_storerandval" : "ck_tile::BlockDropoutBwd<true, false, true >"
+}
+
+DROPOUT_CHECK_MAP = {
+    "no" : "t.has_dropout == false",
+    "dropout_wg32" : "t.has_dropout == true && t.is_store_randval == false",
+    "dropout_wg32_storerandval" : "t.has_dropout == true && t.is_store_randval == true",
+    "dropout_wg16" : "t.has_dropout == true && t.is_store_randval == false",
+    "dropout_wg16_storerandval" : "t.has_dropout == true && t.is_store_randval == true",
+}
MODE_MAP = {
    "batch" : "false",
    "group" : "true"
...
@@ -87,7 +87,11 @@ auto create_args(int argc, char* argv[])
        .insert("drop_offset", "0", "offset for random number generator")
        .insert("timer", "gpu", "gpu:gpu timer, cpu:cpu timer")
        .insert("warmup", "5", "number of iterations before benchmark the kernel")
-        .insert("repeat", "20", "number of iterations to benchmark the kernel");
+        .insert("repeat", "20", "number of iterations to benchmark the kernel")
+        .insert("deterministic",
+                "0",
+                "if set to 1 will use multi-buffer reduction strategy for dq, atomic operation "
+                "will not be used");
    bool result = arg_parser.parse(argc, argv);
    return std::make_tuple(result, arg_parser);
@@ -128,11 +132,6 @@ bool run(const ck_tile::ArgParser& arg_parser)
    ck_tile::index_t hdim_v = arg_parser.get_int("d_v");
    if(hdim_v < 0)
        hdim_v = hdim_q;
-    if(hdim_q % 2 != 0 || hdim_v % 2 != 0)
-    {
-        std::cerr << "FMHA Bwd kernel currently only supports even headdim" << std::endl;
-        return false;
-    }
    bool i_perm = arg_parser.get_bool("iperm"); // if true, will be batch * nhead * seqlen * hdim
    bool o_perm = arg_parser.get_bool("operm"); // if false, will be batch * seqlen * nhead * hdim
@@ -180,6 +179,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
    int stream_warmup = arg_parser.get_int("warmup");
    int stream_repeat = arg_parser.get_int("repeat");
    bool kname = arg_parser.get_bool("kname");
+    bool deterministic = arg_parser.get_bool("deterministic");
    ck_tile::stream_config stream_config{nullptr,
                                         true,
@@ -265,6 +265,9 @@ bool run(const ck_tile::ArgParser& arg_parser)
        (mode == mode_enum::batch ? seqlen_q : seqstart_q_host.back());
    const ck_tile::index_t shape_seqlen_k =
        (mode == mode_enum::batch ? seqlen_k : seqstart_k_host.back());
+    const ck_tile::index_t kN0 = (hdim_q <= 128) ? 128 : 64;
+    const ck_tile::index_t nsplits =
+        deterministic ? ck_tile::integer_divide_ceil(max_seqlen_k, kN0) : 1;
    ck_tile::HostTensor<QDataType> q_host(
        get_lengths(i_perm, shape_batch, nhead, shape_seqlen_q, hdim_q));
@@ -284,9 +287,9 @@ bool run(const ck_tile::ArgParser& arg_parser)
    ck_tile::HostTensor<ODataType> o_host(
        get_lengths(o_perm, shape_batch, nhead, shape_seqlen_q, hdim_v));
    ck_tile::HostTensor<LSEDataType> lse_host(
-        std::array<ck_tile::index_t, 3>{batch, nhead, max_seqlen_q});
+        std::array<ck_tile::index_t, 3>{shape_batch, nhead, shape_seqlen_q});
    ck_tile::HostTensor<DDataType> d_host(
-        std::array<ck_tile::index_t, 3>{batch, nhead, max_seqlen_q});
+        std::array<ck_tile::index_t, 3>{shape_batch, nhead, shape_seqlen_q});
    ck_tile::HostTensor<RandValOutputDataType> randval_host(
        p_drop > 0 ? get_lengths(true, shape_batch, nhead, shape_seqlen_q, max_seqlen_k)
                   : std::array<ck_tile::index_t, 4>{1, 1, 1, 1});
@@ -302,6 +305,10 @@ bool run(const ck_tile::ArgParser& arg_parser)
        use_dbias
            ? get_lengths(i_perm, shape_batch, nhead, shape_seqlen_q, max_seqlen_k)
            : std::array<ck_tile::index_t, 4>{1, 1, 1, 1} /* dummy shape for simplifying code */);
+    ck_tile::HostTensor<AccDataType> dq_acc_host(
+        i_perm
+            ? std::array<ck_tile::index_t, 5>{nsplits, shape_batch, nhead, shape_seqlen_q, hdim_q}
+            : std::array<ck_tile::index_t, 5>{nsplits, shape_batch, shape_seqlen_q, nhead, hdim_q});
    if(init_method == 0)
    {
@@ -362,6 +369,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
    ck_tile::DeviceMem seqstart_q(seqstart_q_host.size() * sizeof(int32_t));
    ck_tile::DeviceMem seqstart_k(seqstart_k_host.size() * sizeof(int32_t));
    ck_tile::DeviceMem alibi_slope_buf(alibi_slope_host.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem dq_acc_buf(dq_acc_host.get_element_space_size_in_bytes());
    q_buf.ToDevice(q_host.data());
    k_buf.ToDevice(k_host.data());
@@ -387,8 +395,17 @@ bool run(const ck_tile::ArgParser& arg_parser)
    std::cout << "[" << prec << "|" << mode << "|" << io_layout(i_perm, o_perm) << "] b:" << batch
              << ", h:" << nhead << "/" << nhead_k << ", s:" << seqlen_q << "/" << seqlen_k
              << ", d:" << hdim_q << "/" << hdim_v << ", scale:" << scale << ", bias:" << bias
-              << ", dbias:" << use_dbias << ", p_drop:" << p_drop << ", mask:" << mask
-              << std::flush;
+              << ", dbias:" << use_dbias << ", p_drop:" << p_drop << ", s_randval:" << s_randval
+              << ", deterministic:" << deterministic << ", mask:" << mask << std::flush;
+    std::size_t workspace_size =
+        dq_acc_host.get_element_space_size_in_bytes() * sizeof(AccDataType) / (1024 * 1024);
+    if(deterministic == 1)
+    {
+        std::cout << "\nDeterministic mode ON: " << workspace_size
+                  << " MByte memory workspace allocated" << std::endl;
+    }
    auto fmha_traits = fmha_bwd_traits{hdim_q,
                                       hdim_v,
@@ -397,7 +414,9 @@ bool run(const ck_tile::ArgParser& arg_parser)
                                       mask.type,
                                       bias.type,
                                       use_dbias,
-                                       p_drop > 0.0f};
+                                       p_drop > 0.0f,
+                                       s_randval,
+                                       deterministic};
    auto fmha_args = [&]() {
        assert(nhead % nhead_k == 0);
        /// NOTE: we broadcast bias from [1, 1, seqlen_q, seqlen_k] to [batch, nhead, seqlen_q,
@@ -422,7 +441,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
        const ck_tile::index_t nhead_stride_o = (o_perm ? shape_seqlen_q * hdim_v : hdim_v);
        const ck_tile::index_t nhead_stride_randval = (shape_seqlen_q * max_seqlen_k);
        const ck_tile::index_t nhead_stride_do = (o_perm ? shape_seqlen_q * hdim_v : hdim_v);
-        const ck_tile::index_t nhead_stride_lsed = max_seqlen_q;
+        const ck_tile::index_t nhead_stride_lsed = shape_seqlen_q;
        const ck_tile::index_t nhead_stride_dbias =
            (i_perm ? shape_seqlen_q * max_seqlen_k : max_seqlen_k);
        // setup batch_stride_* arguments
@@ -433,10 +452,12 @@ bool run(const ck_tile::ArgParser& arg_parser)
        const ck_tile::index_t batch_stride_o = (nhead * shape_seqlen_q * hdim_v);
        const ck_tile::index_t batch_stride_randval = (nhead * shape_seqlen_q * max_seqlen_k);
        const ck_tile::index_t batch_stride_do = (nhead * shape_seqlen_q * hdim_v);
-        const ck_tile::index_t batch_stride_lsed = (nhead * max_seqlen_q);
+        const ck_tile::index_t batch_stride_lsed = (nhead * shape_seqlen_q);
        const ck_tile::index_t batch_stride_dk = (nhead * shape_seqlen_k * hdim_q);
        const ck_tile::index_t batch_stride_dv = (nhead * shape_seqlen_k * hdim_v);
        const ck_tile::index_t batch_stride_dbias = (nhead * shape_seqlen_q * max_seqlen_k);
+        const ck_tile::index_t split_stride_dq_acc =
+            (shape_batch * nhead * shape_seqlen_q * hdim_q);
        return fmha_bwd_args{q_buf.GetDeviceBuffer(),
                             k_buf.GetDeviceBuffer(),
@@ -452,6 +473,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
                             dk_buf.GetDeviceBuffer(),
                             dv_buf.GetDeviceBuffer(),
                             dbias_buf.GetDeviceBuffer(),
+                             dq_acc_buf.GetDeviceBuffer(),
                             seqstart_q.GetDeviceBuffer(),
                             seqstart_k.GetDeviceBuffer(),
                             nullptr,
@@ -473,6 +495,8 @@ bool run(const ck_tile::ArgParser& arg_parser)
                             stride_o,
                             stride_randval,
                             stride_do,
+                             stride_q, // stride_dq_acc
+                             stride_q, // stride_dq
                             stride_dk,
                             stride_dv,
                             stride_dbias,
@@ -484,6 +508,10 @@ bool run(const ck_tile::ArgParser& arg_parser)
                             nhead_stride_randval,
                             nhead_stride_do,
                             nhead_stride_lsed,
+                             nhead_stride_q, // nhead_stride_dq_acc
+                             nhead_stride_q, // nhead_stride_dq
+                             nhead_stride_k, // nhead_stride_dk
+                             nhead_stride_v, // nhead_stride_dv
                             nhead_stride_dbias,
                             batch_stride_q,
                             batch_stride_k,
@@ -493,15 +521,17 @@ bool run(const ck_tile::ArgParser& arg_parser)
                             batch_stride_randval,
                             batch_stride_do,
                             batch_stride_lsed,
+                             batch_stride_q, // batch_stride_dq_acc
+                             batch_stride_q, // batch_stride_dq
                             batch_stride_dk,
                             batch_stride_dv,
                             batch_stride_dbias,
+                             split_stride_dq_acc,
                             mask.left,
                             mask.right,
                             static_cast<ck_tile::index_t>(mask.type),
                             p_drop,
                             p_undrop,
-                             s_randval,
                             {drop_seed, drop_offset}};
    }();
@@ -719,7 +749,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
        if(o_perm) o_host_ref.ForEach([&](auto& self, auto idx) { o_host(b, idx[0], idx[1] + query_offset, idx[2]) = self(idx); });
        else o_host_ref.ForEach([&](auto& self, auto idx) { o_host(b, idx[1] + query_offset, idx[0], idx[2]) = self(idx); });
-        lse_host_ref.ForEach([&](auto& self, auto idx) { lse_host(wb, idx[0], idx[1]) = self(idx); });
+        lse_host_ref.ForEach([&](auto& self, auto idx) { lse_host(b, idx[0], idx[1] + query_offset) = self(idx); });
        // clang-format on
        q_host_refs.push_back(q_host_ref);
@@ -738,6 +768,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
    lse_buf.ToDevice(lse_host.data());
    dq_buf.SetZero();
    dbias_buf.SetZero();
+    dq_acc_buf.SetZero();
    ck_tile::stream_config stream_config_v{
        nullptr, true, 0, 0, 1, arg_parser.get_str("timer") == std::string("gpu")};
...
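A note on the deterministic-mode sizing added in the example above: dq appears to be accumulated into nsplits slices of the new dq_acc workspace (the "multi-buffer reduction strategy" mentioned in the option help) instead of through atomics, with kN0 and nsplits computed as shown. A small worked example with hypothetical sizes, mirroring those two lines:

#include <iostream>

int main()
{
    const int hdim_q       = 64;   // hypothetical head dimension
    const int max_seqlen_k = 1000; // hypothetical key sequence length
    const int kN0          = (hdim_q <= 128) ? 128 : 64;     // same rule as in the example
    const int nsplits      = (max_seqlen_k + kN0 - 1) / kN0; // integer_divide_ceil by hand
    std::cout << "kN0=" << kN0 << " nsplits=" << nsplits << "\n"; // kN0=128 nsplits=8
    return 0;
}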
@@ -77,6 +77,7 @@ struct fmha_bwd_args
    void* dk_ptr;
    void* dv_ptr;
    void* dbias_ptr;
+    void* dq_acc_ptr;
    const void* seqstart_q_ptr;
    const void* seqstart_k_ptr;
    const void* seqlen_k_ptr;
@@ -97,6 +98,8 @@ struct fmha_bwd_args
    ck_tile::index_t stride_o;
    ck_tile::index_t stride_randval;
    ck_tile::index_t stride_do;
+    ck_tile::index_t stride_dq_acc;
+    ck_tile::index_t stride_dq;
    ck_tile::index_t stride_dk;
    ck_tile::index_t stride_dv;
    ck_tile::index_t stride_dbias;
@@ -108,6 +111,10 @@ struct fmha_bwd_args
    ck_tile::index_t nhead_stride_randval;
    ck_tile::index_t nhead_stride_do;
    ck_tile::index_t nhead_stride_lsed;
+    ck_tile::index_t nhead_stride_dq_acc;
+    ck_tile::index_t nhead_stride_dq;
+    ck_tile::index_t nhead_stride_dk;
+    ck_tile::index_t nhead_stride_dv;
    ck_tile::index_t nhead_stride_dbias;
    ck_tile::index_t batch_stride_q;
    ck_tile::index_t batch_stride_k;
@@ -117,15 +124,17 @@ struct fmha_bwd_args
    ck_tile::index_t batch_stride_randval;
    ck_tile::index_t batch_stride_do;
    ck_tile::index_t batch_stride_lsed;
+    ck_tile::index_t batch_stride_dq_acc;
+    ck_tile::index_t batch_stride_dq;
    ck_tile::index_t batch_stride_dk;
    ck_tile::index_t batch_stride_dv;
    ck_tile::index_t batch_stride_dbias;
+    ck_tile::index_t split_stride_dq_acc;
    ck_tile::index_t window_size_left;
    ck_tile::index_t window_size_right;
    ck_tile::index_t mask_type;
    float p_drop;
    float p_undrop;
-    bool s_randval;
    std::tuple<uint64_t, uint64_t> drop_seed_offset;
};
@@ -145,10 +154,10 @@ auto fmha_bwd_dq_dk_dv_create_kargs_and_grids(fmha_bwd_args args)
            args.do_ptr,
            args.d_ptr,
            args.rand_val_ptr,
-            args.dq_ptr,
            args.dk_ptr,
            args.dv_ptr,
            args.dbias_ptr,
+            args.dq_acc_ptr,
            args.seqstart_q_ptr,
            args.seqstart_k_ptr,
            args.seqlen_k_ptr,
@@ -163,6 +172,7 @@ auto fmha_bwd_dq_dk_dv_create_kargs_and_grids(fmha_bwd_args args)
            args.stride_bias,
            args.stride_randval,
            args.stride_do,
+            args.stride_dq_acc,
            args.stride_dk,
            args.stride_dv,
            args.stride_dbias,
@@ -173,13 +183,15 @@ auto fmha_bwd_dq_dk_dv_create_kargs_and_grids(fmha_bwd_args args)
            args.nhead_stride_randval,
            args.nhead_stride_do,
            args.nhead_stride_lsed,
+            args.nhead_stride_dq_acc,
+            args.nhead_stride_dk,
+            args.nhead_stride_dv,
            args.nhead_stride_dbias,
-            args.batch_stride_lsed,
+            args.split_stride_dq_acc,
            args.window_size_left,
            args.window_size_right,
            args.mask_type,
            args.p_drop,
-            args.s_randval,
            args.drop_seed_offset);
    }
    else
@@ -192,10 +204,10 @@ auto fmha_bwd_dq_dk_dv_create_kargs_and_grids(fmha_bwd_args args)
            args.do_ptr,
            args.d_ptr,
            args.rand_val_ptr,
-            args.dq_ptr,
            args.dk_ptr,
            args.dv_ptr,
            args.dbias_ptr,
+            args.dq_acc_ptr,
            args.seqlen_q,
            args.seqlen_k,
            args.hdim_q,
@@ -209,6 +221,7 @@ auto fmha_bwd_dq_dk_dv_create_kargs_and_grids(fmha_bwd_args args)
            args.stride_bias,
            args.stride_randval,
            args.stride_do,
+            args.stride_dq_acc,
            args.stride_dk,
            args.stride_dv,
            args.stride_dbias,
@@ -219,6 +232,9 @@ auto fmha_bwd_dq_dk_dv_create_kargs_and_grids(fmha_bwd_args args)
            args.nhead_stride_randval,
            args.nhead_stride_do,
            args.nhead_stride_lsed,
+            args.nhead_stride_dq_acc,
+            args.nhead_stride_dk,
+            args.nhead_stride_dv,
            args.nhead_stride_dbias,
            args.batch_stride_q,
            args.batch_stride_k,
@@ -227,14 +243,15 @@ auto fmha_bwd_dq_dk_dv_create_kargs_and_grids(fmha_bwd_args args)
            args.batch_stride_randval,
            args.batch_stride_do,
            args.batch_stride_lsed,
+            args.batch_stride_dq_acc,
            args.batch_stride_dk,
            args.batch_stride_dv,
            args.batch_stride_dbias,
+            args.split_stride_dq_acc,
            args.window_size_left,
            args.window_size_right,
            args.mask_type,
            args.p_drop,
-            args.s_randval,
            args.drop_seed_offset);
    }
    }();
@@ -260,8 +277,7 @@ auto fmha_bwd_dot_do_o_create_kargs_and_grids(fmha_bwd_args args)
            args.stride_o,
            args.nhead_stride_do,
            args.nhead_stride_o,
-            args.nhead_stride_lsed,
-            args.batch_stride_lsed);
+            args.nhead_stride_lsed);
    }
    else
    { // create batch mode kernel arguments
@@ -286,19 +302,59 @@ auto fmha_bwd_dot_do_o_create_kargs_and_grids(fmha_bwd_args args)
    return ck_tile::make_tuple(kargs, grids);
}
+template <typename FmhaBwdConvertQGradKernel>
+auto fmha_bwd_convert_dq_create_kargs_and_grids(fmha_bwd_args args)
+{
+    auto kargs = [&] {
+        // create group mode kernel arguments
+        if constexpr(FmhaBwdConvertQGradKernel::kIsGroupMode)
+        {
+            return FmhaBwdConvertQGradKernel::MakeKargs(args.dq_acc_ptr,
+                                                        args.dq_ptr,
+                                                        args.seqstart_q_ptr,
+                                                        args.seqstart_k_ptr,
+                                                        args.hdim_q,
+                                                        args.stride_dq,
+                                                        args.stride_dq_acc,
+                                                        args.nhead_stride_dq,
+                                                        args.nhead_stride_dq_acc,
+                                                        args.split_stride_dq_acc);
+        }
+        else
+        { // create batch mode kernel arguments
+            return FmhaBwdConvertQGradKernel::MakeKargs(args.dq_acc_ptr,
+                                                        args.dq_ptr,
+                                                        args.seqlen_q,
+                                                        args.seqlen_k,
+                                                        args.hdim_q,
+                                                        args.stride_dq,
+                                                        args.stride_dq_acc,
+                                                        args.nhead_stride_dq,
+                                                        args.nhead_stride_dq_acc,
+                                                        args.batch_stride_dq,
+                                                        args.batch_stride_dq_acc,
+                                                        args.split_stride_dq_acc);
+        }
+    }();
+
+    dim3 grids = FmhaBwdConvertQGradKernel::GridSize(args.batch, args.nhead_q, args.max_seqlen_q);
+    return ck_tile::make_tuple(kargs, grids);
+}
// this is used to pattern-match internal kernel implementation, not to instantiate kernel
template <ck_tile::index_t HDim_,
          typename DataType_,
          bool kIsGroupMode_,
          ck_tile::BlockFmhaBwdPipelineEnum FmhaBwdPipelineEnum_,
          typename FmhaMask_,
+          typename FmhaDropout_,
          ck_tile::BlockAttentionBiasEnum BiasEnum_,
          bool kHasBiasGrad_,
-          bool kHasDropout_,
          bool kPadS_,
          bool kPadSK_,
          bool kPadD_,
-          bool kPadDv_>
+          bool kPadDv_,
+          bool kIsDeterministic_>
struct fmha_bwd_dq_dk_dv_traits_
{
    static constexpr ck_tile::index_t HDim = HDim_;
@@ -306,13 +362,14 @@ struct fmha_bwd_dq_dk_dv_traits_
    static constexpr bool kIsGroupMode = kIsGroupMode_;
    static constexpr auto FmhaBwdPipelineEnum = FmhaBwdPipelineEnum_;
    using FmhaMask = ck_tile::remove_cvref_t<FmhaMask_>;
+    using FmhaDropout = ck_tile::remove_cvref_t<FmhaDropout_>;
    static constexpr auto BiasEnum = BiasEnum_;
    static constexpr bool kHasBiasGrad = kHasBiasGrad_;
-    static constexpr bool kHasDropout = kHasDropout_;
    static constexpr bool kPadS = kPadS_;
    static constexpr bool kPadSK = kPadSK_;
    static constexpr bool kPadD = kPadD_;
    static constexpr bool kPadDv = kPadDv_;
+    static constexpr bool kIsDeterministic = kIsDeterministic_;
};
template <typename Traits_>
@@ -343,6 +400,31 @@ void fmha_bwd_dot_do_o_oneshot_(const ck_tile::stream_config&, fmha_bwd_args);
template <typename Traits_>
std::string fmha_bwd_dot_do_o_get_name_();
+template <ck_tile::index_t HDim_,
+          typename DataType_,
+          bool kIsGroupMode_,
+          bool kPadS_,
+          bool kPadD_,
+          bool kIsDeterministic_>
+struct fmha_bwd_convert_dq_traits_
+{
+    static constexpr ck_tile::index_t HDim = HDim_;
+    using DataType = ck_tile::remove_cvref_t<DataType_>;
+    static constexpr bool kIsGroupMode = kIsGroupMode_;
+    static constexpr bool kPadS = kPadS_;
+    static constexpr bool kPadD = kPadD_;
+    static constexpr bool kIsDeterministic = kIsDeterministic_;
+};
+
+template <typename Traits_>
+float fmha_bwd_convert_dq_(const ck_tile::stream_config&, fmha_bwd_args);
+
+template <typename Traits_>
+void fmha_bwd_convert_dq_oneshot_(const ck_tile::stream_config&, fmha_bwd_args);
+
+template <typename Traits_>
+std::string fmha_bwd_convert_dq_get_name_();
// This is the public API, will be generated by script
struct fmha_bwd_traits
{
@@ -354,6 +436,8 @@ struct fmha_bwd_traits
    bias_enum bias_type; // 0:no bias, 1:elementwise bias, 2:alibi. sync with BlockAttentionBiasEnum
    bool has_dbias;
    bool has_dropout;
+    bool is_store_randval;
+    bool is_deterministic;
    // TODO: padding check is inside this api
};
float fmha_bwd(fmha_bwd_traits, fmha_bwd_args, const ck_tile::stream_config&);
@@ -479,16 +479,18 @@ bool run(const ck_tile::ArgParser& arg_parser)
                   : std::array<ck_tile::index_t, 2>{1, 1});
    ck_tile::HostTensor<LSEDataType> lse_acc_host(
-        1 < num_splits ? std::array<ck_tile::index_t, 4>{num_splits, batch, nhead, max_seqlen_q}
+        1 < num_splits
+            ? std::array<ck_tile::index_t, 4>{num_splits, shape_batch, nhead, shape_seqlen_q}
            : std::array<ck_tile::index_t, 4>{1, 1, 1, 1});
    ck_tile::HostTensor<OaccDataType> o_acc_host(
        1 < num_splits
            ? std::array<ck_tile::index_t, 5>{num_splits, batch, nhead, max_seqlen_q, hdim_v}
            : std::array<ck_tile::index_t, 5>{1, 1, 1, 1, 1});
-    // self define lse data layout as [batch, nhead, max_seqlen_q]
+    // batch mode of lse data layout is [batch, nhead, seqlen_q]
+    // group mode of lse data layout is [nhead, total_seqlen_q]
    ck_tile::HostTensor<LSEDataType> lse_host(
-        lse ? std::array<ck_tile::index_t, 3>{batch, nhead, max_seqlen_q}
+        lse ? std::array<ck_tile::index_t, 3>{shape_batch, nhead, shape_seqlen_q}
            : std::array<ck_tile::index_t, 3>{1, 1, 1} /* dummy shape for simplifying code */);
    ck_tile::HostTensor<ODataType> o_host(
@@ -669,8 +671,8 @@ bool run(const ck_tile::ArgParser& arg_parser)
    const ck_tile::index_t nhead_stride_bias =
        (i_perm ? 0 * shape_seqlen_q * shape_seqlen_k : 0 * shape_seqlen_k);
    const ck_tile::index_t nhead_stride_randval = (shape_seqlen_q * max_seqlen_k);
-    const ck_tile::index_t nhead_stride_lse = max_seqlen_q;
-    const ck_tile::index_t nhead_stride_lse_acc = max_seqlen_q;
+    const ck_tile::index_t nhead_stride_lse = shape_seqlen_q;
+    const ck_tile::index_t nhead_stride_lse_acc = shape_seqlen_q;
    const ck_tile::index_t nhead_stride_o_acc = (max_seqlen_q * hdim_v);
    const ck_tile::index_t nhead_stride_o = (o_perm ? shape_seqlen_q * hdim_v : hdim_v);
    // setup batch_stride_* arguments
@@ -679,12 +681,12 @@ bool run(const ck_tile::ArgParser& arg_parser)
    const ck_tile::index_t batch_stride_v = (nhead_k * hdim_v * shape_seqlen_k);
    const ck_tile::index_t batch_stride_bias = (0 * nhead * shape_seqlen_q * shape_seqlen_k);
    const ck_tile::index_t batch_stride_randval = (nhead * shape_seqlen_q * max_seqlen_k);
-    const ck_tile::index_t batch_stride_lse = (nhead * max_seqlen_q);
-    const ck_tile::index_t batch_stride_lse_acc = (nhead * max_seqlen_q);
+    const ck_tile::index_t batch_stride_lse = (nhead * shape_seqlen_q);
+    const ck_tile::index_t batch_stride_lse_acc = (nhead * shape_seqlen_q);
    const ck_tile::index_t batch_stride_o_acc = (nhead * max_seqlen_q * hdim_v);
    const ck_tile::index_t batch_stride_o = (nhead * shape_seqlen_q * hdim_v);
    // setup split_stride_* arguments (only used in split-kv kernel)
-    const ck_tile::index_t split_stride_lse_acc = (batch * nhead * max_seqlen_q);
+    const ck_tile::index_t split_stride_lse_acc = (shape_batch * nhead * shape_seqlen_q);
    const ck_tile::index_t split_stride_o_acc = (batch * nhead * max_seqlen_q * hdim_v);
    return fmha_fwd_args{q_buf.GetDeviceBuffer(),
@@ -996,8 +998,9 @@ bool run(const ck_tile::ArgParser& arg_parser)
        if(lse)
        {
            ck_tile::HostTensor<SMPLComputeDataType> lse_host_result({nhead, real_seqlen_q});
-            lse_host_result.ForEach(
-                [&](auto& self, auto idx) { self(idx) = lse_host(wb, idx[0], idx[1]); });
+            lse_host_result.ForEach([&](auto& self, auto idx) {
+                self(idx) = lse_host(b, idx[0], idx[1] + query_offset);
+            });
            cur_pass = ck_tile::check_err(lse_host_result,
                                          lse_host_ref,
...
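The lse tensors in the example above now use the batch-mode layout [shape_batch, nhead, shape_seqlen_q], with nhead_stride_lse = shape_seqlen_q and batch_stride_lse = nhead * shape_seqlen_q as set in the diff. A minimal sketch of the resulting element offset, using hypothetical sizes and indices:

#include <cstddef>
#include <iostream>

int main()
{
    const int nhead            = 8;   // hypothetical number of heads
    const int shape_seqlen_q   = 512; // hypothetical query sequence length
    const int nhead_stride_lse = shape_seqlen_q;         // as in the diff
    const int batch_stride_lse = nhead * shape_seqlen_q; // as in the diff

    const int b = 2, h = 3, s = 100; // hypothetical (batch, head, query position)
    const std::size_t offset =
        std::size_t(b) * batch_stride_lse + std::size_t(h) * nhead_stride_lse + s;
    std::cout << "lse offset = " << offset << "\n"; // 2*4096 + 3*512 + 100 = 9828
    return 0;
}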
@@ -185,7 +185,6 @@ auto fmha_fwd_create_kargs_and_grids(fmha_fwd_args args)
            args.nhead_stride_randval,
            args.nhead_stride_lse,
            args.nhead_stride_o,
-            args.batch_stride_lse,
            args.window_size_left,
            args.window_size_right,
            args.mask_type,
@@ -284,7 +283,6 @@ auto fmha_fwd_splitkv_create_kargs_and_grids(fmha_fwd_args args)
            args.nhead_stride_randval,
            args.nhead_stride_lse_acc,
            args.nhead_stride_o_acc,
-            args.batch_stride_lse_acc,
            args.batch_stride_o_acc,
            args.split_stride_lse_acc,
            args.split_stride_o_acc,
@@ -376,9 +374,7 @@ auto fmha_fwd_splitkv_combine_create_kargs_and_grids(fmha_fwd_args args)
            args.nhead_stride_o_acc,
            args.nhead_stride_lse,
            args.nhead_stride_o,
-            args.batch_stride_lse_acc,
            args.batch_stride_o_acc,
-            args.batch_stride_lse,
            args.split_stride_lse_acc,
            args.split_stride_o_acc);
    }
...
#!/bin/bash
#
# in order to run this script you'd first need to build the tile_example_fmha_fwd and tile_example_fmha_bwd executables in ../build/bin/
#
# run the script as "./run_full_test.sh <tag for your test environment> <branch name> <host name> <gpu_arch>"
# input arguments:
# environment tag : a string describing the specifics of your test environment
# branch name : name of the branch in git repo (git status | grep -e 'On branch')
# host name : $hostname
# gpu architecture: e.g., gfx90a, or gfx942, etc.
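# example invocation (hypothetical values): ./run_full_test.sh rocm-docker-env develop my-test-node gfx942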
#get the command line arguments:
export env_type=$1
echo 'Environment type: ' $env_type
export branch=$2
echo 'Branch name: ' $branch
export host_name=$3
echo 'Host name: ' $host_name
export GPU_arch=$4
echo 'GPU_arch: ' $GPU_arch
function print_log_header(){
rm -f $1;
echo 'On branch ' $3 &> $1;
echo 'Node name: ' $4 >> $1;
#get GPU_arch and number of compute units from rocminfo
echo -n "GPU_arch: " >> $1; rocminfo | grep "Name:" | grep "gfx" >> $1;
rocminfo | grep "Compute Unit:" >> $1;
hipcc --version | grep -e 'HIP version' >> $1;
echo 'Environment type: ' $2 >> $1;
/opt/rocm/bin/amdclang++ --version | grep -e 'InstalledDir' >> $1;
}
#run verification tests
example/ck_tile/01_fmha/script/smoke_test_fwd.sh
example/ck_tile/01_fmha/script/smoke_test_bwd.sh
#run performance benchmarks
export fmha_fwd_log="perf_fmha_fwd_$GPU_arch.log"
print_log_header $fmha_fwd_log $env_type $branch $host_name
example/ck_tile/01_fmha/script/benchmark_fwd.sh 2>&1 | tee -a $fmha_fwd_log
export fmha_bwd_log="perf_fmha_bwd_$GPU_arch.log"
print_log_header $fmha_bwd_log $env_type $branch $host_name
example/ck_tile/01_fmha/script/benchmark_bwd.sh 2>&1 | tee -a $fmha_bwd_log