Merge branch 'develop' into amd-develop

d39c3f5d · Jun Liu · 88b978c5 · ac58cc5d · d39c3f5d · d39c3f5d
Commit d39c3f5d authored Jun 06, 2024 by Jun Liu
20 changed files
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -652,8 +652,8 @@ def process_results(Map conf=[:]){
 }
 //launch develop branch daily at 23:00 UT in FULL_QA mode and at 19:00 UT with latest staging compiler version
-CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;ROCMVERSION=6.1;COMPILER_VERSION=
+CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;ROCMVERSION=6.1;
-                                              0 21 * * * % ROCMVERSION=6.1;COMPILER_VERSION=;COMPILER_COMMIT=
+                                              0 21 * * * % ROCMVERSION=6.1;hipTensor_test=true
                                              0 19 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-staging;COMPILER_COMMIT=;USE_SCCACHE=false
                                              0 17 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-mainline-open;COMPILER_COMMIT=;USE_SCCACHE=false
                                              0 15 * * * % BUILD_INSTANCES_ONLY=true;RUN_CODEGEN_TESTS=false;RUN_PERFORMANCE_TESTS=false;USE_SCCACHE=false''' : ""
@@ -701,8 +701,8 @@ pipeline {
            description: "Select whether to build DL kernels (default: OFF)")
        booleanParam(
            name: "hipTensor_test",
-            defaultValue: true,
+            defaultValue: false,
-            description: "Use the CK build to verify hipTensor build and tests (default: ON)")
+            description: "Use the CK build to verify hipTensor build and tests (default: OFF)")
        string(
            name: 'hipTensor_branch',
            defaultValue: 'mainline',

--- a/client_example/24_grouped_conv_activation/CMakeLists.txt
+++ b/client_example/24_grouped_conv_activation/CMakeLists.txt
@@ -35,6 +35,10 @@ target_link_libraries(client_grouped_convnd_fwd_scaleadd_ab_int8 PRIVATE composa
 add_executable(client_grouped_convnd_fwd_bilinear_residual_fp16
               grouped_convnd_fwd_bilinear/grouped_conv_fwd_bilinear_residual_fp16.cpp)
 target_link_libraries(client_grouped_convnd_fwd_bilinear_residual_fp16 PRIVATE composable_kernel::device_conv_operations)
+# Fwd convscale
+add_executable(client_conv3d_fwd_convscale_fp8 
+               grouped_convnd_fwd_convscale/conv3d_fwd_convscale_fp8.cpp)
+target_link_libraries(client_conv3d_fwd_convscale_fp8 PRIVATE composable_kernel::device_conv_operations)
 # Bwd data bilinear
 add_executable(client_grouped_convnd_bwd_data_bilinear_residual_fp16
               grouped_convnd_bwd_data_bilinear/grouped_conv_bwd_data_bilinear_residual_fp16.cpp)

--- a/client_example/24_grouped_conv_activation/grouped_convnd_fwd_convscale/common.hpp
+++ b/client_example/24_grouped_conv_activation/grouped_convnd_fwd_convscale/common.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+#include <cstdlib>
+#include <iomanip>
+#include <iostream>
+#include <iterator>
+#include <numeric>
+#include <string>
+#include <vector>
+#include "ck/ck.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_convscale.hpp"
+#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_abd.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+using PassThrough = ck::tensor_operation::element_wise::PassThrough; // element-wise op passed for the input and weight tensors below
+using ConvScale   = ck::tensor_operation::element_wise::ConvScale; // output element-wise op, constructed below from three float scale values
+// Minimal owning wrapper around one device allocation: the constructor calls hipMalloc and
+// the destructor calls hipFree on the same pointer.
+struct SimpleDeviceMem
+{
+    SimpleDeviceMem() = delete;
+    // Allocates mem_size bytes with hipMalloc. When the allocation fails a message is written
+    // to stderr and GetDeviceBuffer() returns nullptr instead of an unspecified pointer.
+    SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
+    {
+        if(hipMalloc(&p_mem_, mem_size) != hipSuccess)
+        {
+            std::cerr << "SimpleDeviceMem: hipMalloc of " << mem_size << " bytes failed"
+                      << std::endl;
+            p_mem_ = nullptr;
+        }
+    }
+    // The object owns p_mem_; an implicit copy would make two objects free the same pointer.
+    SimpleDeviceMem(const SimpleDeviceMem&)            = delete;
+    SimpleDeviceMem& operator=(const SimpleDeviceMem&) = delete;
+    // Returns the raw device pointer (nullptr if the allocation failed).
+    void* GetDeviceBuffer() { return p_mem_; }
+    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }
+    void* p_mem_;
+};
+// Returns the operation count used for the TFlops figure:
+//   G * N * C * <output spatial lengths product> *
+//       (2 * K * <filter spatial lengths product> + <number of scale factors>)
+// G, K and C are read from weights_lengths[0..2] and N from output_lengths[1]; the spatial
+// lengths are the entries from index NumNonSpatialDim onwards. ds_size is the number of
+// scale factors applied per output element.
+template <ck::index_t NumDimSpatial, ck::index_t NumNonSpatialDim = 3>
+std::size_t
+GetFlops(const std::array<ck::index_t, NumDimSpatial + NumNonSpatialDim>& output_lengths,
+         const std::array<ck::index_t, NumDimSpatial + NumNonSpatialDim>& weights_lengths,
+         const std::size_t& ds_size)
+{
+    // Widen every factor before multiplying. Previously G * N * C was evaluated in
+    // ck::index_t first, which can overflow when that type is narrower than std::size_t.
+    const auto G = static_cast<std::size_t>(weights_lengths[0]);
+    const auto N = static_cast<std::size_t>(output_lengths[1]);
+    const auto K = static_cast<std::size_t>(weights_lengths[1]);
+    const auto C = static_cast<std::size_t>(weights_lengths[2]);
+    const std::size_t output_spatial =
+        std::accumulate(std::next(std::begin(output_lengths), NumNonSpatialDim),
+                        std::end(output_lengths),
+                        static_cast<std::size_t>(1),
+                        std::multiplies<>());
+    const std::size_t filter_spatial =
+        std::accumulate(std::next(std::begin(weights_lengths), NumNonSpatialDim),
+                        std::end(weights_lengths),
+                        static_cast<std::size_t>(1),
+                        std::multiplies<>());
+    return G * N * C * output_spatial * (2 * K * filter_spatial + ds_size);
+}
+// Size in bytes of the input tensor: sizeof(InDataType) times the product of every entry
+// of input_lengths, i.e. sizeof(InDataType) * (G * N * C * <input spatial lengths product>).
+template <typename InDataType, ck::index_t NumDimSpatial, ck::index_t NumNonSpatialDim = 3>
+std::size_t
+GetInputByte(const std::array<ck::index_t, NumDimSpatial + NumNonSpatialDim>& input_lengths)
+{
+    std::size_t num_elements = 1;
+    for(const auto length : input_lengths)
+    {
+        num_elements = num_elements * length;
+    }
+    return sizeof(InDataType) * num_elements;
+}
+// Size in bytes of the weight tensor: sizeof(WeiDataType) times the product of every entry
+// of weights_lengths, i.e. sizeof(WeiDataType) * (G * K * C * <filter spatial lengths product>).
+template <typename WeiDataType, ck::index_t NumDimSpatial, ck::index_t NumNonSpatialDim = 3>
+std::size_t
+GetWeightByte(const std::array<ck::index_t, NumDimSpatial + NumNonSpatialDim>& weights_lengths)
+{
+    std::size_t num_elements = 1;
+    for(const auto length : weights_lengths)
+    {
+        num_elements = num_elements * length;
+    }
+    return sizeof(WeiDataType) * num_elements;
+}
+// Size in bytes of the output tensor: sizeof(OutDataType) times the product of every entry
+// of output_lengths, i.e. sizeof(OutDataType) * (G * N * K * <output spatial lengths product>).
+template <typename OutDataType, ck::index_t NumDimSpatial, ck::index_t NumNonSpatialDim = 3>
+std::size_t
+GetOutputByte(const std::array<ck::index_t, NumDimSpatial + NumNonSpatialDim>& output_lengths)
+{
+    std::size_t num_elements = 1;
+    for(const auto length : output_lengths)
+    {
+        // Each length is converted to std::size_t before the multiplication.
+        num_elements = num_elements * static_cast<std::size_t>(length);
+    }
+    return sizeof(OutDataType) * num_elements;
+}
+// Profiles every available device instance of the grouped forward convolution with a
+// ConvScale output operation, prints the timing of each, and finally runs the fastest
+// instance once more without timing.
+//
+// in_lengths / wei_lengths / out_lengths are taken by value: packed strides are derived from
+// them in the order they are passed, then lengths and strides are rotated in place (see the
+// layout comment below) before being handed to the device operation. Filter strides and
+// dilations are fixed to 1 and both paddings to 1.
+//
+// Returns false when no instance supports the problem, true otherwise. Results are not
+// verified and the device buffers are not initialised by this function.
+template <ck::index_t NumDimSpatial,
+          typename InDataType,
+          typename WeiDataType,
+          typename OutDataType,
+          typename InLayout,
+          typename WeiLayout,
+          typename OutLayout,
+          ck::index_t NumNonSpatialDim = 3,
+          typename AComputeType        = InDataType,
+          typename BComputeType        = AComputeType>
+bool run_grouped_conv_fwd_convscale(
+    std::array<ck::index_t, NumDimSpatial + NumNonSpatialDim> in_lengths,
+    std::array<ck::index_t, NumDimSpatial + NumNonSpatialDim> wei_lengths,
+    std::array<ck::index_t, NumDimSpatial + NumNonSpatialDim> out_lengths)
+{
+    std::size_t in_mem_size  = GetInputByte<InDataType, NumDimSpatial>(in_lengths);
+    std::size_t wei_mem_size = GetWeightByte<WeiDataType, NumDimSpatial>(wei_lengths);
+    std::size_t out_mem_size = GetOutputByte<OutDataType, NumDimSpatial>(out_lengths);
+    SimpleDeviceMem in(in_mem_size);
+    SimpleDeviceMem wei(wei_mem_size);
+    SimpleDeviceMem out(out_mem_size);
+    // Scale factors handed to ConvScale. They used to be left uninitialised and were then
+    // read when constructing ConvScale; give them a fixed, well-defined value instead.
+    float scale_in  = 1.0f;
+    float scale_wei = 1.0f;
+    float scale_out = 1.0f;
+    std::array<ck::index_t, NumDimSpatial + NumNonSpatialDim> in_strides;
+    std::array<ck::index_t, NumDimSpatial + NumNonSpatialDim> wei_strides;
+    std::array<ck::index_t, NumDimSpatial + NumNonSpatialDim> out_strides;
+    in_strides.fill(0);
+    wei_strides.fill(0);
+    out_strides.fill(0);
+    // Packed strides: the last dimension has stride 1 and every earlier stride is the running
+    // product of the lengths that follow it.
+    in_strides.back()  = 1;
+    wei_strides.back() = 1;
+    out_strides.back() = 1;
+    std::partial_sum(rbegin(in_lengths),
+                     std::prev(rend(in_lengths)),
+                     std::next(rbegin(in_strides)),
+                     std::multiplies<>{});
+    std::partial_sum(rbegin(wei_lengths),
+                     std::prev(rend(wei_lengths)),
+                     std::next(rbegin(wei_strides)),
+                     std::multiplies<>{});
+    std::partial_sum(rbegin(out_lengths),
+                     std::prev(rend(out_lengths)),
+                     std::next(rbegin(out_strides)),
+                     std::multiplies<>{});
+    // transpose NDHWGC/KZYXGC/NDHWGK to GNDHWC/GKZYXC/GNDHWK to GNCDHW/GKCZYX/GNKDHW
+    // (lengths and strides are rotated identically so each stride stays with its length)
+    std::rotate(std::next(rbegin(in_lengths)), std::next(rbegin(in_lengths), 2), rend(in_lengths));
+    std::rotate(rbegin(in_lengths),
+                std::next(rbegin(in_lengths)),
+                std::next(rbegin(in_lengths), NumDimSpatial + 1));
+    std::rotate(std::next(rbegin(in_strides)), std::next(rbegin(in_strides), 2), rend(in_strides));
+    std::rotate(rbegin(in_strides),
+                std::next(rbegin(in_strides)),
+                std::next(rbegin(in_strides), NumDimSpatial + 1));
+    std::rotate(rbegin(wei_lengths),
+                std::next(rbegin(wei_lengths)),
+                std::next(rbegin(wei_lengths), NumDimSpatial + 1));
+    std::rotate(rbegin(wei_strides),
+                std::next(rbegin(wei_strides)),
+                std::next(rbegin(wei_strides), NumDimSpatial + 1));
+    std::rotate(
+        std::next(rbegin(out_lengths)), std::next(rbegin(out_lengths), 2), rend(out_lengths));
+    std::rotate(rbegin(out_lengths),
+                std::next(rbegin(out_lengths)),
+                std::next(rbegin(out_lengths), NumDimSpatial + 1));
+    std::rotate(
+        std::next(rbegin(out_strides)), std::next(rbegin(out_strides), 2), rend(out_strides));
+    std::rotate(rbegin(out_strides),
+                std::next(rbegin(out_strides)),
+                std::next(rbegin(out_strides), NumDimSpatial + 1));
+    std::array<ck::index_t, NumDimSpatial> conv_filter_strides;
+    std::array<ck::index_t, NumDimSpatial> conv_filter_dilations;
+    std::array<ck::index_t, NumDimSpatial> input_left_pads;
+    std::array<ck::index_t, NumDimSpatial> input_right_pads;
+    conv_filter_strides.fill(1);
+    conv_filter_dilations.fill(1);
+    input_left_pads.fill(1);
+    input_right_pads.fill(1);
+    std::size_t ds_size = 3; // 3 element-wise scale multipliers
+    std::size_t flop    = GetFlops<NumDimSpatial>(out_lengths, wei_lengths, ds_size);
+    // Bytes moved: both inputs, the output and the three float scale factors.
+    std::size_t num_bytes =
+        in_mem_size + wei_mem_size + sizeof(float) + sizeof(float) + sizeof(float) + out_mem_size;
+    using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD<NumDimSpatial,
+                                                                                   InLayout,
+                                                                                   WeiLayout,
+                                                                                   ck::Tuple<>,
+                                                                                   OutLayout,
+                                                                                   InDataType,
+                                                                                   WeiDataType,
+                                                                                   ck::Tuple<>,
+                                                                                   OutDataType,
+                                                                                   PassThrough,
+                                                                                   PassThrough,
+                                                                                   ConvScale,
+                                                                                   AComputeType,
+                                                                                   BComputeType>;
+    // get device op instances
+    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+        DeviceOp>::GetInstances();
+    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
+    std::string best_op_name;
+    int best_op_id        = -1;
+    float best_avg_time   = std::numeric_limits<float>::max();
+    float best_gb_per_sec = 0;
+    float best_tflops     = 0;
+    // profile device operation instances
+    std::cout << "Run all instances and do timing" << std::endl;
+    // Index with std::size_t so the comparison against op_ptrs.size() is not signed/unsigned.
+    for(std::size_t i = 0; i < op_ptrs.size(); ++i)
+    {
+        auto& op_ptr      = op_ptrs[i];
+        auto argument_ptr = op_ptr->MakeArgumentPointer(
+            in.GetDeviceBuffer(),
+            wei.GetDeviceBuffer(),
+            std::array<const void*, 0>{},
+            out.GetDeviceBuffer(),
+            in_lengths,
+            in_strides,
+            wei_lengths,
+            wei_strides,
+            std::array<std::array<ck::index_t, NumDimSpatial + NumNonSpatialDim>, 0>{},
+            std::array<std::array<ck::index_t, NumDimSpatial + NumNonSpatialDim>, 0>{},
+            out_lengths,
+            out_strides,
+            conv_filter_strides,
+            conv_filter_dilations,
+            input_left_pads,
+            input_right_pads,
+            PassThrough{},
+            PassThrough{},
+            ConvScale{scale_in, scale_wei, scale_out});
+        auto invoker_ptr    = op_ptr->MakeInvokerPointer();
+        std::string op_name = op_ptr->GetTypeString();
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
+            // avg_time is printed in ms, hence 1e9 (not 1e12) for TFlops and 1e6 for GB/s.
+            float tflops     = static_cast<float>(flop) / 1.E9 / avg_time;
+            float gb_per_sec = num_bytes / 1.E6 / avg_time;
+            std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, "
+                      << gb_per_sec << " GB/s, " << op_name << std::endl;
+            if(tflops > best_tflops)
+            {
+                best_op_id      = static_cast<int>(i);
+                best_op_name    = op_name;
+                best_avg_time   = avg_time;
+                best_gb_per_sec = gb_per_sec;
+                best_tflops     = tflops;
+            }
+        }
+        else
+        {
+            std::cerr << op_name << " does not support this problem" << std::endl;
+        }
+    }
+    if(best_op_id < 0)
+    {
+        std::cerr << "no suitable instance" << std::endl;
+        return false;
+    }
+    std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops
+              << " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
+    // run the best instance
+    {
+        auto& op_ptr = op_ptrs[best_op_id];
+        std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
+                  << std::endl;
+        auto argument_ptr = op_ptr->MakeArgumentPointer(
+            in.GetDeviceBuffer(),
+            wei.GetDeviceBuffer(),
+            std::array<const void*, 0>{},
+            out.GetDeviceBuffer(),
+            in_lengths,
+            in_strides,
+            wei_lengths,
+            wei_strides,
+            std::array<std::array<ck::index_t, NumDimSpatial + NumNonSpatialDim>, 0>{},
+            std::array<std::array<ck::index_t, NumDimSpatial + NumNonSpatialDim>, 0>{},
+            out_lengths,
+            out_strides,
+            conv_filter_strides,
+            conv_filter_dilations,
+            input_left_pads,
+            input_right_pads,
+            PassThrough{},
+            PassThrough{},
+            ConvScale{scale_in, scale_wei, scale_out});
+        auto invoker_ptr = op_ptr->MakeInvokerPointer();
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
+        }
+        std::cout << "Done" << std::endl;
+    }
+    return true;
+}
--- a/client_example/24_grouped_conv_activation/grouped_convnd_fwd_convscale/conv3d_fwd_convscale_fp8.cpp
+++ b/client_example/24_grouped_conv_activation/grouped_convnd_fwd_convscale/conv3d_fwd_convscale_fp8.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+#include "common.hpp"
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+using InDataType       = ck::f8_t; // data types and layouts forwarded to run_grouped_conv_fwd_convscale in main()
+using WeiDataType      = ck::f8_t;
+using CShuffleDataType = float; // NOTE(review): not referenced in the visible part of this file
+using OutDataType      = ck::f8_t;
+using AComputeDataType = ck::f8_t; // passed as the AComputeType template argument
+using BComputeDataType = ck::f8_t; // passed as the BComputeType template argument
+using InLayout  = ck::tensor_layout::convolution::NDHWGC;
+using WeiLayout = ck::tensor_layout::convolution::GKZYXC;
+using OutLayout = ck::tensor_layout::convolution::NDHWGK;
+static constexpr ck::index_t NumDimSpatial = 3; // three spatial dimensions (D, H, W below)
+static constexpr ck::index_t G             = 1; // problem sizes; main() passes them as {N, Di, Hi, Wi, G, C}, {G, K, Z, Y, X, C}, {N, Do, Ho, Wo, G, K}
+static constexpr ck::index_t N             = 64;
+static constexpr ck::index_t K             = 128;
+static constexpr ck::index_t C             = 64;
+static constexpr ck::index_t Z             = 3; // Z, Y, X: filter spatial lengths
+static constexpr ck::index_t Y             = 3;
+static constexpr ck::index_t X             = 3;
+static constexpr ck::index_t Di            = 28; // Di, Hi, Wi: input spatial lengths
+static constexpr ck::index_t Hi            = 28;
+static constexpr ck::index_t Wi            = 3;
+static constexpr ck::index_t Do            = 28; // Do, Ho, Wo: output spatial lengths (same values as the input ones here)
+static constexpr ck::index_t Ho            = 28;
+static constexpr ck::index_t Wo            = 3;
+// Entry point: runs the 3D fp8 convscale client example with the problem sizes defined
+// above and turns its boolean result into the process exit code.
+int main()
+{
+    const bool succeeded =
+        run_grouped_conv_fwd_convscale<NumDimSpatial,
+                                       InDataType,
+                                       WeiDataType,
+                                       OutDataType,
+                                       InLayout,
+                                       WeiLayout,
+                                       OutLayout,
+                                       3,
+                                       AComputeDataType,
+                                       BComputeDataType>(
+            {N, Di, Hi, Wi, G, C}, {G, K, Z, Y, X, C}, {N, Do, Ho, Wo, G, K});
+    if(!succeeded)
+    {
+        return EXIT_FAILURE;
+    }
+    return EXIT_SUCCESS;
+}
--- a/docs/sphinx/requirements.in
+++ b/docs/sphinx/requirements.in
-rocm-docs-core==1.2.0
+rocm-docs-core==1.3.0
 sphinxcontrib-bibtex==2.6.2
--- a/docs/sphinx/requirements.txt
+++ b/docs/sphinx/requirements.txt
@@ -103,7 +103,7 @@ requests==2.31.0
    # via
    #   pygithub
    #   sphinx
-rocm-docs-core==1.2.0
+rocm-docs-core==1.3.0
    # via -r requirements.in
 six==1.16.0
    # via

--- a/example/62_convnd_activ/CMakeLists.txt
+++ b/example/62_convnd_activ/CMakeLists.txt
 add_subdirectory(binary)
+add_subdirectory(convscale)
 add_subdirectory(multi_AB)
 add_subdirectory(unary)

--- a/example/62_convnd_activ/convscale/CMakeLists.txt
+++ b/example/62_convnd_activ/convscale/CMakeLists.txt
+list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942) # architectures for which the convscale example is added
+set(target 0) # flag: set to 1 once the example targets have been created
+foreach(gpu IN LISTS GPU_TARGETS)
+ if(gpu IN_LIST gpu_list AND target EQUAL 0) # only the first matching entry of GPU_TARGETS creates the targets
+      add_custom_target(example_convnd_activ_xdl_convscale)
+      add_example_executable(example_convnd_fwd_xdl_convscale_fp8 convnd_fwd_xdl_convscale_fp8.cpp)
+      add_example_dependencies(example_convnd_activ_xdl_convscale example_convnd_fwd_xdl_convscale_fp8)
+      set(target 1)
+ endif()
+endforeach()
--- a/example/62_convnd_activ/convscale/convnd_fwd_convscale_common.hpp
+++ b/example/62_convnd_activ/convscale/convnd_fwd_convscale_common.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+#include <cstdlib>
+#include <iostream>
+#include <numeric>
+#include <type_traits>
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/library/utility/algorithm.hpp"
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/convolution_parameter.hpp"
+#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
+using PassThrough = ck::tensor_operation::element_wise::PassThrough; // used below as the output op of the host reference convolution
+using ConvScale   = ck::tensor_operation::element_wise::ConvScale; // NOTE(review): not referenced in the visible part of this header; presumably for the including example source
+// Prints the meaning of the three leading command-line arguments followed by the text
+// returned by ck::utils::conv::get_conv_param_parser_helper_msg().
+// Marked inline because the definition lives in a header: without it, including this
+// header from more than one translation unit produces multiple-definition link errors.
+inline void print_helper_msg()
+{
+    std::cout << "arg1: verification (0=no, 1=yes)\n"
+              << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"
+              << "arg3: time kernel (0=no, 1=yes)\n"
+              << ck::utils::conv::get_conv_param_parser_helper_msg() << std::endl;
+}
+// Relative tolerance used when comparing results of element type DataType.
+//   double               -> 1e-6
+//   ck::bhalf_t          -> 5e-2
+//   int32_t, int8_t, f8  -> 1e-1   (f8: 240 and 224 are acceptable)
+//   ck::bf8_t            -> 1.5e-1 (57344 and 49152 are acceptable)
+//   everything else, including float and ck::half_t -> 1e-3
+template <typename DataType>
+inline __host__ __device__ constexpr double get_rtol()
+{
+    constexpr bool uses_tenth = std::is_same_v<DataType, int32_t> ||
+                                std::is_same_v<DataType, int8_t> ||
+                                std::is_same_v<DataType, ck::f8_t>;
+    if constexpr(std::is_same_v<DataType, double>)
+    {
+        return 1e-6;
+    }
+    else if constexpr(std::is_same_v<DataType, ck::bhalf_t>)
+    {
+        return 5e-2;
+    }
+    else if constexpr(uses_tenth)
+    {
+        return 1e-1;
+    }
+    else if constexpr(std::is_same_v<DataType, ck::bf8_t>)
+    {
+        return 1.5e-1;
+    }
+    else
+    {
+        // float, ck::half_t and any type not listed above
+        return 1e-3;
+    }
+}
+// Absolute tolerance used when comparing results of element type DataType.
+//   double           -> 1e-6
+//   ck::bhalf_t      -> 5e-2
+//   int32_t, int8_t  -> 1e-1
+//   ck::f8_t         -> 16.1   (240 and 224 are acceptable)
+//   ck::bf8_t        -> 8192.1 (57344 and 49152 are acceptable)
+//   everything else, including float and ck::half_t -> 1e-3
+template <typename DataType>
+inline __host__ __device__ constexpr double get_atol()
+{
+    constexpr bool is_integer =
+        std::is_same_v<DataType, int32_t> || std::is_same_v<DataType, int8_t>;
+    if constexpr(std::is_same_v<DataType, double>)
+    {
+        return 1e-6;
+    }
+    else if constexpr(std::is_same_v<DataType, ck::bhalf_t>)
+    {
+        return 5e-2;
+    }
+    else if constexpr(is_integer)
+    {
+        return 1e-1;
+    }
+    else if constexpr(std::is_same_v<DataType, ck::f8_t>)
+    {
+        return 16.1;
+    }
+    else if constexpr(std::is_same_v<DataType, ck::bf8_t>)
+    {
+        return 8192.1;
+    }
+    else
+    {
+        // float, ck::half_t and any type not listed above
+        return 1e-3;
+    }
+}
+// Returns the operation count used for the TFlops figure:
+//   G * N * C * <output spatial lengths product> *
+//       (2 * K * <filter spatial lengths product> + <number of scale factors>)
+// G, K and C are read from weights_lengths[0..2] and N from output_lengths[1]; the spatial
+// lengths are the entries from index NumNonSpatialDim onwards. ds_size is the number of
+// scale factors applied per output element.
+template <ck::index_t NumDimSpatial, ck::index_t NumNonSpatialDim = 3>
+std::size_t
+GetFlops(const std::array<ck::index_t, NumDimSpatial + NumNonSpatialDim>& output_lengths,
+         const std::array<ck::index_t, NumDimSpatial + NumNonSpatialDim>& weights_lengths,
+         const std::size_t& ds_size)
+{
+    // Widen every factor before multiplying. Previously G * N * C was evaluated in
+    // ck::index_t first, which can overflow when that type is narrower than std::size_t.
+    const auto G = static_cast<std::size_t>(weights_lengths[0]);
+    const auto N = static_cast<std::size_t>(output_lengths[1]);
+    const auto K = static_cast<std::size_t>(weights_lengths[1]);
+    const auto C = static_cast<std::size_t>(weights_lengths[2]);
+    const std::size_t output_spatial =
+        std::accumulate(std::next(std::begin(output_lengths), NumNonSpatialDim),
+                        std::end(output_lengths),
+                        static_cast<std::size_t>(1),
+                        std::multiplies<>());
+    const std::size_t filter_spatial =
+        std::accumulate(std::next(std::begin(weights_lengths), NumNonSpatialDim),
+                        std::end(weights_lengths),
+                        static_cast<std::size_t>(1),
+                        std::multiplies<>());
+    return G * N * C * output_spatial * (2 * K * filter_spatial + ds_size);
+}
+// Host driver for one grouped forward convolution whose output goes through OutElementOp.
+//
+// Steps: build host tensors from the three descriptors, optionally fill input and weight
+// (init_method 0 = leave as is, 1 = integer values, anything else = decimal values), copy
+// them to the device, run DeviceConvNDFwdInstance with three scale factors drawn from
+// std::rand(), print the performance line and, when do_verification is set, compare the
+// device output with a host reference convolution followed by the same output operation.
+//
+// Returns the comparison result when verifying, true otherwise. Throws std::runtime_error
+// when the device instance does not support the problem.
+template <ck::index_t NDimSpatial,
+          typename InDataType,
+          typename WeiDataType,
+          typename CShuffleDataType,
+          typename DsDataType,
+          typename OutDataType,
+          typename InElementOp,
+          typename WeiElementOp,
+          typename OutElementOp,
+          typename DeviceConvNDFwdInstance>
+bool run_grouped_conv_fwd(bool do_verification,
+                          int init_method,
+                          bool time_kernel,
+                          const ck::utils::conv::ConvParam& conv_param,
+                          const HostTensorDescriptor& in_g_n_c_wis_desc,
+                          const HostTensorDescriptor& wei_g_k_c_xs_desc,
+                          const HostTensorDescriptor& out_g_n_k_wos_desc,
+                          const InElementOp& in_element_op,
+                          const WeiElementOp& wei_element_op)
+{
+    using DimArray     = std::array<ck::index_t, NDimSpatial + 3>;
+    using SpatialArray = std::array<ck::index_t, NDimSpatial>;
+    Tensor<InDataType> input(in_g_n_c_wis_desc);
+    Tensor<WeiDataType> weight(wei_g_k_c_xs_desc);
+    Tensor<CShuffleDataType> conv_result(out_g_n_k_wos_desc);
+    Tensor<OutDataType> host_output(out_g_n_k_wos_desc);
+    Tensor<OutDataType> device_output(out_g_n_k_wos_desc);
+    std::cout << "in: " << input.mDesc << std::endl;
+    std::cout << "wei: " << weight.mDesc << std::endl;
+    std::cout << "out: " << host_output.mDesc << std::endl;
+    // init_method 0 leaves the host tensors untouched.
+    if(init_method == 1)
+    {
+        input.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5});
+        weight.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5});
+    }
+    else if(init_method != 0)
+    {
+        input.GenerateTensorValue(GeneratorTensor_3<InDataType>{0.0, 1.0});
+        weight.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.5, 0.5});
+    }
+    DeviceMem in_device_buf(sizeof(InDataType) * input.mDesc.GetElementSpaceSize());
+    DeviceMem wei_device_buf(sizeof(WeiDataType) * weight.mDesc.GetElementSpaceSize());
+    DeviceMem out_device_buf(sizeof(OutDataType) * device_output.mDesc.GetElementSpaceSize());
+    in_device_buf.ToDevice(input.mData.data());
+    wei_device_buf.ToDevice(weight.mData.data());
+    // Fixed-size copies of the descriptor and convolution parameters for MakeArgument.
+    DimArray a_g_n_c_wis_lengths{};
+    DimArray a_g_n_c_wis_strides{};
+    DimArray b_g_k_c_xs_lengths{};
+    DimArray b_g_k_c_xs_strides{};
+    DimArray e_g_n_k_wos_lengths{};
+    DimArray e_g_n_k_wos_strides{};
+    SpatialArray conv_filter_strides{};
+    SpatialArray conv_filter_dilations{};
+    SpatialArray input_left_pads{};
+    SpatialArray input_right_pads{};
+    const auto fill_from = [](const auto& source, auto& destination) {
+        ck::ranges::copy(source, destination.begin());
+    };
+    fill_from(in_g_n_c_wis_desc.GetLengths(), a_g_n_c_wis_lengths);
+    fill_from(in_g_n_c_wis_desc.GetStrides(), a_g_n_c_wis_strides);
+    fill_from(wei_g_k_c_xs_desc.GetLengths(), b_g_k_c_xs_lengths);
+    fill_from(wei_g_k_c_xs_desc.GetStrides(), b_g_k_c_xs_strides);
+    fill_from(out_g_n_k_wos_desc.GetLengths(), e_g_n_k_wos_lengths);
+    fill_from(out_g_n_k_wos_desc.GetStrides(), e_g_n_k_wos_strides);
+    fill_from(conv_param.conv_filter_strides_, conv_filter_strides);
+    fill_from(conv_param.conv_filter_dilations_, conv_filter_dilations);
+    fill_from(conv_param.input_left_pads_, input_left_pads);
+    fill_from(conv_param.input_right_pads_, input_right_pads);
+    // Three scale factors in [0, 1], drawn in the order in, wei, out.
+    const float scale_in  = static_cast<float>(std::rand()) / static_cast<float>(RAND_MAX);
+    const float scale_wei = static_cast<float>(std::rand()) / static_cast<float>(RAND_MAX);
+    const float scale_out = static_cast<float>(std::rand()) / static_cast<float>(RAND_MAX);
+    const auto out_element_op = OutElementOp{scale_in, scale_wei, scale_out};
+    // Device convolution.
+    auto conv     = DeviceConvNDFwdInstance{};
+    auto invoker  = conv.MakeInvoker();
+    auto argument = conv.MakeArgument(in_device_buf.GetDeviceBuffer(),
+                                      wei_device_buf.GetDeviceBuffer(),
+                                      std::array<const void*, 0>{},
+                                      out_device_buf.GetDeviceBuffer(),
+                                      a_g_n_c_wis_lengths,
+                                      a_g_n_c_wis_strides,
+                                      b_g_k_c_xs_lengths,
+                                      b_g_k_c_xs_strides,
+                                      std::array<DimArray, 0>{},
+                                      std::array<DimArray, 0>{},
+                                      e_g_n_k_wos_lengths,
+                                      e_g_n_k_wos_strides,
+                                      conv_filter_strides,
+                                      conv_filter_dilations,
+                                      input_left_pads,
+                                      input_right_pads,
+                                      in_element_op,
+                                      wei_element_op,
+                                      out_element_op);
+    if(!conv.IsSupportedArgument(argument))
+    {
+        throw std::runtime_error(
+            "wrong! device_conv with the specified compilation parameters does "
+            "not support this Conv problem");
+    }
+    const float avg_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
+    // Performance figures: flop count and bytes moved (tensors plus three float scales).
+    const std::size_t ds_size = 3; // 3 element-wise scale multipliers
+    const std::size_t flop =
+        GetFlops<NDimSpatial>(e_g_n_k_wos_lengths, b_g_k_c_xs_lengths, ds_size);
+    const std::size_t num_btype =
+        conv_param.GetInputByte<InDataType>() + conv_param.GetWeightByte<WeiDataType>() +
+        sizeof(float) + sizeof(float) + sizeof(float) + conv_param.GetOutputByte<OutDataType>();
+    const float tflops     = static_cast<float>(flop) / 1.E9 / avg_time;
+    const float gb_per_sec = num_btype / 1.E6 / avg_time;
+    std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
+              << conv.GetTypeString() << std::endl;
+    if(!do_verification)
+    {
+        return true;
+    }
+    // Host reference: plain convolution into conv_result, then the output operation is
+    // applied element by element to produce host_output.
+    using ReferenceConv = ck::tensor_operation::host::ReferenceConvFwd<NDimSpatial,
+                                                                       InDataType,
+                                                                       WeiDataType,
+                                                                       CShuffleDataType,
+                                                                       InElementOp,
+                                                                       WeiElementOp,
+                                                                       PassThrough>;
+    auto ref_conv     = ReferenceConv();
+    auto ref_invoker  = ref_conv.MakeInvoker();
+    auto ref_argument = ref_conv.MakeArgument(input,
+                                              weight,
+                                              conv_result,
+                                              conv_param.conv_filter_strides_,
+                                              conv_param.conv_filter_dilations_,
+                                              conv_param.input_left_pads_,
+                                              conv_param.input_right_pads_,
+                                              in_element_op,
+                                              wei_element_op,
+                                              PassThrough{});
+    ref_invoker.Run(ref_argument);
+    host_output.ForEach(
+        [&](auto&, auto idx) { out_element_op(host_output(idx), conv_result(idx)); });
+    out_device_buf.FromDevice(device_output.mData.data());
+    return ck::utils::check_err(device_output,
+                                host_output,
+                                "Error: incorrect results!",
+                                get_rtol<OutDataType>(),
+                                get_atol<OutDataType>());
+}
--- a/example/62_convnd_activ/convscale/convnd_fwd_xdl_convscale_fp8.cpp
+++ b/example/62_convnd_activ/convscale/convnd_fwd_xdl_convscale_fp8.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+#include "convnd_fwd_convscale_common.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp"
+#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
+using InDataType       = ck::f8_t; // this example uses ck::f8_t for the input, weight and output tensors
+using WeiDataType      = ck::f8_t;
+using AccDataType      = float; // passed to the device op instance below
+using CShuffleDataType = float; // passed to the device op instance and to run_grouped_conv_fwd (see the .inc)
+using DsDataType       = ck::Tuple<>; // empty tuple: no D tensor types are listed
+using OutDataType      = ck::f8_t;
+using AComputeDataType = ck::f8_t; // supplied as the last two arguments of the device op instance below
+using BComputeDataType = ck::f8_t;
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>; // shorthand used for the sequence-valued template arguments below
+using InElementOp  = PassThrough; // PassThrough and ConvScale are not declared in this file
+using WeiElementOp = PassThrough;
+using OutElementOp = ConvScale;
+static constexpr auto ConvSpec =
+    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
+static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+template <ck::index_t NDimSpatial, // alias for the device op used by this example; only rank and layouts vary
+          typename InLayout,
+          typename WeiLayout,
+          typename DsLayout,
+          typename OutLayout>
+using DeviceGroupedConvNDFwdInstance =
+    ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<
+        NDimSpatial,
+        InLayout,
+        WeiLayout,
+        DsLayout,
+        OutLayout,
+        InDataType,
+        WeiDataType,
+        AccDataType,
+        CShuffleDataType,
+        DsDataType,
+        OutDataType,
+        InElementOp,
+        WeiElementOp,
+        OutElementOp,
+        ConvSpec,    // ConvForwardSpecialization
+        GemmSpec,    // GemmSpecialization
+        1,           // presumably NumGemmKPrefetchStage -- TODO confirm against the device op template
+        256,         // BlockSize
+        128,         // MPerBlock
+        256,         // NPerBlock
+        32,          // KPerBlock
+        8,           // AK1
+        8,           // BK1
+        32,          // MPerXdl
+        32,          // NPerXdl
+        2,           // MXdlPerWave
+        4,           // NXdlPerWave
+        S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
+        S<1, 0, 2>,  // ABlockTransferThreadClusterArrangeOrder
+        S<1, 0, 2>,  // ABlockTransferSrcAccessOrder
+        2,           // ABlockTransferSrcVectorDim
+        8,           // ABlockTransferSrcScalarPerVector
+        8,           // ABlockTransferDstScalarPerVector_AK1
+        1,           // ABlockLdsExtraM
+        S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
+        S<1, 0, 2>,  // BBlockTransferThreadClusterArrangeOrder
+        S<1, 0, 2>,  // BBlockTransferSrcAccessOrder
+        2,           // BBlockTransferSrcVectorDim
+        8,           // BBlockTransferSrcScalarPerVector
+        8,           // BBlockTransferDstScalarPerVector_BK1
+        1,           // BBlockLdsExtraN
+        1, // presumably CShuffleMXdlPerWavePerShuffle -- TODO confirm against the device op template
+        1, // presumably CShuffleNXdlPerWavePerShuffle -- TODO confirm
+        S<1, 32, 1, 8>, // presumably the C/DE block-transfer cluster lengths -- TODO confirm
+        8, // presumably the C/DE block-transfer scalars per vector -- TODO confirm
+        AComputeDataType, // both compute types are ck::f8_t in this example
+        BComputeDataType>;
+#include "run_convnd_fwd_convscale_example.inc"
+// Program entry point: runs the example and maps its boolean result to the
+// process exit status (0 when the example returns true, 1 otherwise).
+int main(int argc, char* argv[])
+{
+    const bool passed = run_convnd_fwd_example(argc, argv);
+    if(passed)
+    {
+        return 0;
+    }
+    return 1;
+}
--- a/example/62_convnd_activ/convscale/run_convnd_fwd_convscale_example.inc
+++ b/example/62_convnd_activ/convscale/run_convnd_fwd_convscale_example.inc
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+// Parses the command line, builds the convolution problem description and
+// dispatches to run_grouped_conv_fwd for the requested number of spatial
+// dimensions.
+//
+// Accepted invocations:
+//   <no arguments>                                  built-in defaults
+//   <do_verification> <init_method> <time_kernel>   default convolution problem
+//   <do_verification> <init_method> <time_kernel> <num_dim_spatial> <conv params...>
+//
+// Returns the result of run_grouped_conv_fwd, or false when the argument count
+// is not one of the forms above or the number of spatial dimensions is not
+// 1, 2 or 3.
+bool run_convnd_fwd_example(int argc, char* argv[])
+{
+    print_helper_msg();
+    bool do_verification = true;
+    int init_method      = 1;
+    bool time_kernel     = false;
+    ck::utils::conv::ConvParam conv_param{
+        2, 1, 128, 256, 192, {3, 3}, {71, 71}, {2, 2}, {1, 1}, {1, 1}, {1, 1}};
+    if(argc == 1)
+    {
+        // use default
+    }
+    else if(argc >= 4)
+    {
+        do_verification = std::stoi(argv[1]);
+        init_method     = std::stoi(argv[2]);
+        time_kernel     = std::stoi(argv[3]);
+        if(argc > 4)
+        {
+            // NOTE(review): parse_conv_param reads further entries of argv starting at
+            // index 5; the exact count it needs is not visible here -- confirm and
+            // validate argc against it.
+            const ck::index_t num_dim_spatial = std::stoi(argv[4]);
+            conv_param = ck::utils::conv::parse_conv_param(num_dim_spatial, 5, argv);
+        }
+    }
+    else
+    {
+        // argc is 2 or 3: reading argv[2] / argv[3] would hand a null or
+        // out-of-range pointer to std::stoi, so reject the command line instead.
+        std::cerr << "Error: expected 0, 3, or at least 4 arguments, got " << (argc - 1)
+                  << std::endl;
+        return false;
+    }
+    // instantiate in and wei element ops, will
+    // instantiate out_element_op below for every iteration
+    const auto in_element_op  = InElementOp{};
+    const auto wei_element_op = WeiElementOp{};
+    // Generic runner: the spatial rank and the layouts are carried in the argument
+    // types so that the matching device instance is selected at compile time.
+    const auto run =
+        [&](auto ndim_spatial, auto in_layout, auto wei_layout, auto ds_layout, auto out_layout) {
+            constexpr ck::index_t ndim_spatial_value = ndim_spatial.value;
+            using InLayout  = decltype(in_layout);
+            using WeiLayout = decltype(wei_layout);
+            using DsLayout  = decltype(ds_layout);
+            using OutLayout = decltype(out_layout);
+            const auto in_g_n_c_wis_desc =
+                ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(
+                    conv_param);
+            const auto wei_g_k_c_xs_desc =
+                ck::utils::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed<WeiLayout>(
+                    conv_param);
+            const auto out_g_n_k_wos_desc =
+                ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(
+                    conv_param);
+            return run_grouped_conv_fwd<ndim_spatial_value,
+                                        InDataType,
+                                        WeiDataType,
+                                        CShuffleDataType,
+                                        DsDataType,
+                                        OutDataType,
+                                        InElementOp,
+                                        WeiElementOp,
+                                        OutElementOp,
+                                        DeviceGroupedConvNDFwdInstance<ndim_spatial_value,
+                                                                       InLayout,
+                                                                       WeiLayout,
+                                                                       DsLayout,
+                                                                       OutLayout>>(
+                do_verification,
+                init_method,
+                time_kernel,
+                conv_param,
+                in_g_n_c_wis_desc,
+                wei_g_k_c_xs_desc,
+                out_g_n_k_wos_desc,
+                in_element_op,
+                wei_element_op);
+        };
+    namespace ctc = ck::tensor_layout::convolution;
+    if(conv_param.num_dim_spatial_ == 1)
+    {
+        return run(ck::Number<1>{}, ctc::GNWC{}, ctc::GKXC{}, ck::Tuple<>{}, ctc::GNWK{});
+    }
+    else if(conv_param.num_dim_spatial_ == 2)
+    {
+        return run(ck::Number<2>{}, ctc::GNHWC{}, ctc::GKYXC{}, ck::Tuple<>{}, ctc::GNHWK{});
+    }
+    else if(conv_param.num_dim_spatial_ == 3)
+    {
+        return run(ck::Number<3>{}, ctc::GNDHWC{}, ctc::GKZYXC{}, ck::Tuple<>{}, ctc::GNDHWK{});
+    }
+    // Nothing was executed or verified for this rank, so do not report success.
+    std::cerr << "Error: unsupported number of spatial dimensions: "
+              << conv_param.num_dim_spatial_ << " (expected 1, 2 or 3)" << std::endl;
+    return false;
+}
--- a/example/ck_tile/01_fmha/CMakeLists.txt
+++ b/example/ck_tile/01_fmha/CMakeLists.txt
 # generate a list of kernels, but not actually emit files at config stage
 execute_process(
  COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_LIST_DIR}/generate.py
-  --list_blobs ${CMAKE_CURRENT_BINARY_DIR}/blob_list.txt
+  --direction fwd --list_blobs ${CMAKE_CURRENT_BINARY_DIR}/fwd_blob_list.txt
 )
-# NOTE: for cmake, the FMHA_FWD_GEN_BLOBS files must be in the same directory
+execute_process(
+  COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_LIST_DIR}/generate.py
+  --direction bwd --list_blobs ${CMAKE_CURRENT_BINARY_DIR}/bwd_blob_list.txt
+)
+# NOTE: for cmake, the FMHA_FWD_GEN_BLOBS/FMHA_BWD_GEN_BLOBS files must be in the same directory
 #       as current cmake list, otherwise will not figure out the dependency properly
-file(STRINGS ${CMAKE_CURRENT_BINARY_DIR}/blob_list.txt FMHA_FWD_GEN_BLOBS)
+file(STRINGS ${CMAKE_CURRENT_BINARY_DIR}/fwd_blob_list.txt FMHA_FWD_GEN_BLOBS)
+file(STRINGS ${CMAKE_CURRENT_BINARY_DIR}/bwd_blob_list.txt FMHA_BWD_GEN_BLOBS)
 add_custom_command(
  OUTPUT ${FMHA_FWD_GEN_BLOBS}
  COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_LIST_DIR}/generate.py
-  --output_dir ${CMAKE_CURRENT_BINARY_DIR}
+  --direction fwd --output_dir ${CMAKE_CURRENT_BINARY_DIR}
+)
+add_custom_command(
+  OUTPUT ${FMHA_BWD_GEN_BLOBS}
+  COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_LIST_DIR}/generate.py
+  --direction bwd --output_dir ${CMAKE_CURRENT_BINARY_DIR}
 )
 set(EXAMPLE_FMHA_FWD "tile_example_fmha_fwd")
@@ -22,6 +34,14 @@ add_executable(${EXAMPLE_FMHA_FWD} EXCLUDE_FROM_ALL fmha_fwd.cpp)
 target_include_directories(${EXAMPLE_FMHA_FWD} PRIVATE ${CMAKE_CURRENT_LIST_DIR})
 target_sources(${EXAMPLE_FMHA_FWD} PRIVATE ${FMHA_FWD_GEN_BLOBS})
+set(EXAMPLE_FMHA_BWD "tile_example_fmha_bwd")
+# not using add_example_executable() to add this target, since we don't want this to have
+# to be included in "make all/install/check"
+message("adding example ${EXAMPLE_FMHA_BWD}")
+add_executable(${EXAMPLE_FMHA_BWD} EXCLUDE_FROM_ALL fmha_bwd.cpp)
+target_include_directories(${EXAMPLE_FMHA_BWD} PRIVATE ${CMAKE_CURRENT_LIST_DIR})
+target_sources(${EXAMPLE_FMHA_BWD} PRIVATE ${FMHA_BWD_GEN_BLOBS})
 # NOTE: this is dangerous since will change the whole kernel to flush denormals
 #       WIP with compiler team for an exp2 intrinsic..., then remove this
 if(NOT DEFINED FMHA_FWD_FAST_EXP2)
@@ -29,16 +49,27 @@ if(NOT DEFINED FMHA_FWD_FAST_EXP2)
 endif()
 set(EXAMPLE_FMHA_FWD_COMPILE_OPTIONS)
+set(EXAMPLE_FMHA_BWD_COMPILE_OPTIONS)
 # NOTE: we turn off undefined-func-template to let source compile without explicit declare function specializations
 #       ... because they are auto-generated
 if(FMHA_FWD_FAST_EXP2)
 	list(APPEND EXAMPLE_FMHA_FWD_COMPILE_OPTIONS -Wno-undefined-func-template -DCK_TILE_FMHA_FWD_FAST_EXP2=1 -fgpu-flush-denormals-to-zero)
+  list(APPEND EXAMPLE_FMHA_BWD_COMPILE_OPTIONS -Wno-undefined-func-template -DCK_TILE_FMHA_FWD_FAST_EXP2=1 -fgpu-flush-denormals-to-zero)
 else()
 	list(APPEND EXAMPLE_FMHA_FWD_COMPILE_OPTIONS -Wno-undefined-func-template -DCK_TILE_FMHA_FWD_FAST_EXP2=0)
+  list(APPEND EXAMPLE_FMHA_BWD_COMPILE_OPTIONS -Wno-undefined-func-template -DCK_TILE_FMHA_FWD_FAST_EXP2=0)
 endif()
 # Allow comparing floating points directly in order to check sentinel values
 list(APPEND EXAMPLE_FMHA_FWD_COMPILE_OPTIONS -Wno-float-equal)
+list(APPEND EXAMPLE_FMHA_BWD_COMPILE_OPTIONS -Wno-float-equal)
 target_compile_options(${EXAMPLE_FMHA_FWD} PRIVATE ${EXAMPLE_FMHA_FWD_COMPILE_OPTIONS})
+target_compile_options(${EXAMPLE_FMHA_BWD} PRIVATE ${EXAMPLE_FMHA_BWD_COMPILE_OPTIONS})
+# TODO: we have to turn off this global property; otherwise the progress message generated
+# by cmake lists too many files and fails with "execvp: /bin/sh: Argument list too long".
+# However, this property is global and may affect other targets.
+# TODO: consider generating a makefile ourselves
+set_property(GLOBAL PROPERTY RULE_MESSAGES OFF)
--- a/example/ck_tile/01_fmha/fmha_bwd.cpp
+++ b/example/ck_tile/01_fmha/fmha_bwd.cpp
--- a/example/ck_tile/01_fmha/fmha_bwd.hpp
+++ b/example/ck_tile/01_fmha/fmha_bwd.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+#include "ck_tile/core.hpp"
+#include "ck_tile/host/kernel_launch.hpp"
+#include "ck_tile/ops/fmha.hpp"
+#include "ck_tile/ops/epilogue.hpp"
+#include "mask.hpp"
+#include "bias.hpp"
+#include <type_traits>
+template <typename DataType> // primary template is only declared; element types are specialized below
+struct FmhaBwdTypeConfig;
+template <>
+struct FmhaBwdTypeConfig<ck_tile::half_t> // half_t tensors and gradients; float LSE, accumulator and D
+{
+    using QDataType             = ck_tile::half_t;
+    using KDataType             = ck_tile::half_t;
+    using VDataType             = ck_tile::half_t;
+    using GemmDataType          = ck_tile::half_t;
+    using BiasDataType          = ck_tile::half_t;
+    using LSEDataType           = float;
+    using AccDataType           = float; // data type for gemm accumulation
+    using DDataType             = float;
+    using RandValOutputDataType = uint8_t;
+    using ODataType             = ck_tile::half_t;
+    using OGradDataType         = ck_tile::half_t;
+    using QGradDataType         = ck_tile::half_t;
+    using KGradDataType         = ck_tile::half_t;
+    using VGradDataType         = ck_tile::half_t;
+    using BiasGradDataType      = ck_tile::half_t;
+};
+template <>
+struct FmhaBwdTypeConfig<ck_tile::bf16_t> // same layout as the half_t specialization, with bf16_t tensors
+{
+    using QDataType             = ck_tile::bf16_t;
+    using KDataType             = ck_tile::bf16_t;
+    using VDataType             = ck_tile::bf16_t;
+    using GemmDataType          = ck_tile::bf16_t;
+    using BiasDataType          = ck_tile::bf16_t;
+    using LSEDataType           = float;
+    using AccDataType           = float; // data type for gemm accumulation
+    using DDataType             = float;
+    using RandValOutputDataType = uint8_t;
+    using ODataType             = ck_tile::bf16_t;
+    using OGradDataType         = ck_tile::bf16_t;
+    using QGradDataType         = ck_tile::bf16_t;
+    using KGradDataType         = ck_tile::bf16_t;
+    using VGradDataType         = ck_tile::bf16_t;
+    using BiasGradDataType      = ck_tile::bf16_t;
+};
+struct FmhaMasks // named aliases for the ck_tile::GenericAttentionMask instantiations used by this example
+{
+    using NoMask      = ck_tile::GenericAttentionMask<false>; // presumably masking disabled -- TODO confirm
+    using GenericMask = ck_tile::GenericAttentionMask<true, true>; // NOTE(review): meaning of the 2nd flag is not visible here
+    using CausalMask  = ck_tile::GenericAttentionMask<true, false>;
+};
+// Runtime arguments: some are passed into the kernel arguments (kargs), others are used to compute grids/blocks
+struct fmha_bwd_args
+{
+    const void* q_ptr;
+    const void* k_ptr;
+    const void* v_ptr;
+    const void* bias_ptr; // bias or alibi_slope pointer
+    const void* o_ptr; // in this header, only passed to the dot_do_o kernel
+    const void* lse_ptr; // in this header, only passed to the dq_dk_dv kernel
+    const void* do_ptr; // passed to both kernels
+    void* d_ptr; // passed to both kernels
+    void* rand_val_ptr;
+    void* dq_ptr;
+    void* dk_ptr;
+    void* dv_ptr;
+    void* dbias_ptr;
+    const void* seqstart_q_ptr; // passed only in group mode
+    const void* seqstart_k_ptr; // passed only in group mode
+    const void* seqlen_k_ptr; // passed only in group mode
+    ck_tile::index_t seqlen_q; // passed only in batch mode
+    ck_tile::index_t seqlen_k; // passed only in batch mode
+    ck_tile::index_t batch; // used for GridSize() of both kernels
+    ck_tile::index_t max_seqlen_q; // used for GridSize() of the dot_do_o kernel
+    ck_tile::index_t max_seqlen_k; // used for GridSize() of the dq_dk_dv kernel
+    ck_tile::index_t hdim_q;
+    ck_tile::index_t hdim_v;
+    ck_tile::index_t nhead_q;
+    ck_tile::index_t nhead_k; // in this header, only used to form the ratio nhead_q / nhead_k
+    float scale;
+    ck_tile::index_t stride_q;
+    ck_tile::index_t stride_k;
+    ck_tile::index_t stride_v;
+    ck_tile::index_t stride_bias; // if alibi, b*h need set this to h, 1*h need set this to 0
+    ck_tile::index_t stride_o;
+    ck_tile::index_t stride_randval;
+    ck_tile::index_t stride_do;
+    ck_tile::index_t stride_dk;
+    ck_tile::index_t stride_dv;
+    ck_tile::index_t stride_dbias;
+    ck_tile::index_t nhead_stride_q;
+    ck_tile::index_t nhead_stride_k;
+    ck_tile::index_t nhead_stride_v;
+    ck_tile::index_t nhead_stride_bias;
+    ck_tile::index_t nhead_stride_o;
+    ck_tile::index_t nhead_stride_randval;
+    ck_tile::index_t nhead_stride_do;
+    ck_tile::index_t nhead_stride_lsed;
+    ck_tile::index_t nhead_stride_dbias;
+    ck_tile::index_t batch_stride_q; // batch_stride_* other than batch_stride_lsed are passed only in batch mode
+    ck_tile::index_t batch_stride_k;
+    ck_tile::index_t batch_stride_v;
+    ck_tile::index_t batch_stride_bias;
+    ck_tile::index_t batch_stride_o;
+    ck_tile::index_t batch_stride_randval;
+    ck_tile::index_t batch_stride_do;
+    ck_tile::index_t batch_stride_lsed; // passed in both group and batch mode
+    ck_tile::index_t batch_stride_dk;
+    ck_tile::index_t batch_stride_dv;
+    ck_tile::index_t batch_stride_dbias;
+    ck_tile::index_t window_size_left;
+    ck_tile::index_t window_size_right;
+    ck_tile::index_t mask_type;
+    float p_drop; // passed to the dq_dk_dv kernel
+    float p_undrop; // in this header, only passed to the dot_do_o kernel
+    bool s_randval;
+    std::tuple<uint64_t, uint64_t> drop_seed_offset; // NOTE(review): <tuple>/<cstdint> are not included directly by this header
+};
+template <typename FmhaBwdDQDKDVKernel> // kernel type supplying kIsGroupMode, MakeKargs() and GridSize()
+auto fmha_bwd_dq_dk_dv_create_kargs_and_grids(fmha_bwd_args args) // returns ck_tile::make_tuple(kargs, grids)
+{
+    assert(args.nhead_q % args.nhead_k == 0); // the ratio nhead_q / nhead_k is passed to MakeKargs below
+    auto kargs = [&] { // immediately-invoked lambda: the MakeKargs call is chosen at compile time
+        // create group mode kernel arguments
+        if constexpr(FmhaBwdDQDKDVKernel::kIsGroupMode)
+        {
+            return FmhaBwdDQDKDVKernel::MakeKargs(args.q_ptr,
+                                                  args.k_ptr,
+                                                  args.v_ptr,
+                                                  args.bias_ptr,
+                                                  args.lse_ptr,
+                                                  args.do_ptr,
+                                                  args.d_ptr,
+                                                  args.rand_val_ptr,
+                                                  args.dq_ptr,
+                                                  args.dk_ptr,
+                                                  args.dv_ptr,
+                                                  args.dbias_ptr,
+                                                  args.seqstart_q_ptr, // group mode passes these three pointers in place of seqlen_q / seqlen_k
+                                                  args.seqstart_k_ptr,
+                                                  args.seqlen_k_ptr,
+                                                  args.hdim_q,
+                                                  args.hdim_v,
+                                                  args.nhead_q,
+                                                  args.nhead_q / args.nhead_k, // ratio of q heads to k heads
+                                                  args.scale,
+                                                  args.stride_q,
+                                                  args.stride_k,
+                                                  args.stride_v,
+                                                  args.stride_bias,
+                                                  args.stride_randval,
+                                                  args.stride_do,
+                                                  args.stride_dk,
+                                                  args.stride_dv,
+                                                  args.stride_dbias,
+                                                  args.nhead_stride_q,
+                                                  args.nhead_stride_k,
+                                                  args.nhead_stride_v,
+                                                  args.nhead_stride_bias,
+                                                  args.nhead_stride_randval,
+                                                  args.nhead_stride_do,
+                                                  args.nhead_stride_lsed,
+                                                  args.nhead_stride_dbias,
+                                                  args.batch_stride_lsed, // the only batch stride passed in group mode
+                                                  args.window_size_left,
+                                                  args.window_size_right,
+                                                  args.mask_type,
+                                                  args.p_drop,
+                                                  args.s_randval,
+                                                  args.drop_seed_offset);
+        }
+        else
+        { // create batch mode kernel arguments
+            return FmhaBwdDQDKDVKernel::MakeKargs(args.q_ptr,
+                                                  args.k_ptr,
+                                                  args.v_ptr,
+                                                  args.bias_ptr,
+                                                  args.lse_ptr,
+                                                  args.do_ptr,
+                                                  args.d_ptr,
+                                                  args.rand_val_ptr,
+                                                  args.dq_ptr,
+                                                  args.dk_ptr,
+                                                  args.dv_ptr,
+                                                  args.dbias_ptr,
+                                                  args.seqlen_q, // batch mode passes sequence lengths directly
+                                                  args.seqlen_k,
+                                                  args.hdim_q,
+                                                  args.hdim_v,
+                                                  args.nhead_q,
+                                                  args.nhead_q / args.nhead_k, // ratio of q heads to k heads
+                                                  args.scale,
+                                                  args.stride_q,
+                                                  args.stride_k,
+                                                  args.stride_v,
+                                                  args.stride_bias,
+                                                  args.stride_randval,
+                                                  args.stride_do,
+                                                  args.stride_dk,
+                                                  args.stride_dv,
+                                                  args.stride_dbias,
+                                                  args.nhead_stride_q,
+                                                  args.nhead_stride_k,
+                                                  args.nhead_stride_v,
+                                                  args.nhead_stride_bias,
+                                                  args.nhead_stride_randval,
+                                                  args.nhead_stride_do,
+                                                  args.nhead_stride_lsed,
+                                                  args.nhead_stride_dbias,
+                                                  args.batch_stride_q, // batch mode additionally passes the per-batch strides
+                                                  args.batch_stride_k,
+                                                  args.batch_stride_v,
+                                                  args.batch_stride_bias,
+                                                  args.batch_stride_randval,
+                                                  args.batch_stride_do,
+                                                  args.batch_stride_lsed,
+                                                  args.batch_stride_dk,
+                                                  args.batch_stride_dv,
+                                                  args.batch_stride_dbias,
+                                                  args.window_size_left,
+                                                  args.window_size_right,
+                                                  args.mask_type,
+                                                  args.p_drop,
+                                                  args.s_randval,
+                                                  args.drop_seed_offset);
+        }
+    }();
+    dim3 grids = FmhaBwdDQDKDVKernel::GridSize(args.batch, args.nhead_q, args.max_seqlen_k); // grid is sized from max_seqlen_k
+    return ck_tile::make_tuple(kargs, grids);
+}
+template <typename FmhaBwdOGradDotOKernel> // kernel type supplying kIsGroupMode, MakeKargs() and GridSize()
+auto fmha_bwd_dot_do_o_create_kargs_and_grids(fmha_bwd_args args) // returns ck_tile::make_tuple(kargs, grids)
+{
+    auto kargs = [&] { // immediately-invoked lambda: the MakeKargs call is chosen at compile time
+        // create group mode kernel arguments
+        if constexpr(FmhaBwdOGradDotOKernel::kIsGroupMode)
+        {
+            return FmhaBwdOGradDotOKernel::MakeKargs(args.o_ptr,
+                                                     args.do_ptr,
+                                                     args.d_ptr,
+                                                     args.p_undrop,
+                                                     args.seqstart_q_ptr, // group mode passes seqstart_q_ptr in place of seqlen_q
+                                                     args.hdim_v,
+                                                     args.stride_do,
+                                                     args.stride_o,
+                                                     args.nhead_stride_do,
+                                                     args.nhead_stride_o,
+                                                     args.nhead_stride_lsed,
+                                                     args.batch_stride_lsed); // the only batch stride passed in group mode
+        }
+        else
+        { // create batch mode kernel arguments
+            return FmhaBwdOGradDotOKernel::MakeKargs(args.o_ptr,
+                                                     args.do_ptr,
+                                                     args.d_ptr,
+                                                     args.p_undrop,
+                                                     args.seqlen_q,
+                                                     args.hdim_v,
+                                                     args.stride_do,
+                                                     args.stride_o,
+                                                     args.nhead_stride_do,
+                                                     args.nhead_stride_o,
+                                                     args.nhead_stride_lsed,
+                                                     args.batch_stride_do, // batch mode additionally passes the do/o batch strides
+                                                     args.batch_stride_o,
+                                                     args.batch_stride_lsed);
+        }
+    }();
+    dim3 grids = FmhaBwdOGradDotOKernel::GridSize(args.batch, args.nhead_q, args.max_seqlen_q); // grid is sized from max_seqlen_q
+    return ck_tile::make_tuple(kargs, grids);
+}
+// This is used to pattern-match the internal kernel implementation, not to instantiate the kernel
+template <ck_tile::index_t HDim_,
+          typename DataType_,
+          bool kIsGroupMode_,
+          ck_tile::BlockFmhaBwdPipelineEnum FmhaBwdPipelineEnum_,
+          typename FmhaMask_,
+          ck_tile::BlockAttentionBiasEnum BiasEnum_,
+          bool kHasBiasGrad_,
+          bool kHasDropout_,
+          bool kPadS_,
+          bool kPadSK_,
+          bool kPadD_,
+          bool kPadDv_>
+struct fmha_bwd_dq_dk_dv_traits_ // re-exposes each template parameter as a static member or type alias
+{
+    static constexpr ck_tile::index_t HDim    = HDim_;
+    using DataType                            = ck_tile::remove_cvref_t<DataType_>; // cv/ref qualifiers are stripped
+    static constexpr bool kIsGroupMode        = kIsGroupMode_;
+    static constexpr auto FmhaBwdPipelineEnum = FmhaBwdPipelineEnum_;
+    using FmhaMask                            = ck_tile::remove_cvref_t<FmhaMask_>; // cv/ref qualifiers are stripped
+    static constexpr auto BiasEnum            = BiasEnum_;
+    static constexpr bool kHasBiasGrad        = kHasBiasGrad_;
+    static constexpr bool kHasDropout         = kHasDropout_;
+    static constexpr bool kPadS               = kPadS_;
+    static constexpr bool kPadSK              = kPadSK_;
+    static constexpr bool kPadD               = kPadD_;
+    static constexpr bool kPadDv              = kPadDv_;
+};
+template <typename Traits_> // declarations only; presumably defined per traits type in generated sources -- TODO confirm
+float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config&, fmha_bwd_args);
+template <typename Traits_>
+void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config&, fmha_bwd_args);
+template <typename Traits_>
+std::string fmha_bwd_dq_dk_dv_get_name_(); // NOTE(review): <string> is not included directly by this header
+template <ck_tile::index_t HDim_, typename DataType_, bool kIsGroupMode_, bool kPadS_, bool kPadDv_>
+struct fmha_bwd_dot_do_o_traits_ // re-exposes each template parameter as a static member or type alias
+{
+    static constexpr ck_tile::index_t HDim = HDim_;
+    using DataType                         = ck_tile::remove_cvref_t<DataType_>; // cv/ref qualifiers are stripped
+    static constexpr bool kIsGroupMode     = kIsGroupMode_;
+    static constexpr bool kPadS            = kPadS_;
+    static constexpr bool kPadDv           = kPadDv_;
+};
+template <typename Traits_> // declarations only; presumably defined per traits type in generated sources -- TODO confirm
+float fmha_bwd_dot_do_o_(const ck_tile::stream_config&, fmha_bwd_args);
+template <typename Traits_>
+void fmha_bwd_dot_do_o_oneshot_(const ck_tile::stream_config&, fmha_bwd_args);
+template <typename Traits_>
+std::string fmha_bwd_dot_do_o_get_name_();
+// This is the public API; its implementation will be generated by script
+struct fmha_bwd_traits // runtime (non-template) description of the requested kernel variant
+{
+    int hdim_q;
+    int hdim_v;
+    std::string data_type;
+    bool is_group_mode;
+    mask_enum mask_type;
+    bias_enum bias_type; // 0:no bias, 1:elementwise bias, 2:alibi. sync with BlockAttentionBiasEnum
+    bool has_dbias;
+    bool has_dropout;
+    // TODO: padding check is inside this api
+};
+float fmha_bwd(fmha_bwd_traits, fmha_bwd_args, const ck_tile::stream_config&); // declared here, not defined in this header
--- a/example/ck_tile/01_fmha/fmha_fwd.cpp
+++ b/example/ck_tile/01_fmha/fmha_fwd.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 #include "fmha_fwd.hpp"
 #include "ck_tile/host.hpp"
@@ -110,6 +110,9 @@ auto create_args(int argc, char* argv[])
                "11939",
                "random seed used for initializing input tensors. 0 for "
                "non-deterministic seed")
+        .insert("p_drop", "0", "0~1 probability of dropout")
+        .insert("drop_seed", "1", "seed for random number generator")
+        .insert("drop_offset", "0", "offset for random number generator")
        .insert("timer", "gpu", "gpu:gpu timer, cpu:cpu timer")
        .insert("warmup", "5", "number of iterations before benchmark the kernel")
        .insert("repeat", "20", "number of iterations to benchmark the kernel");
@@ -128,26 +131,11 @@ auto get_elimit(std::string /*init_method*/)
 }
 template <>
-auto get_elimit<ck_tile::bf16_t>(std::string init_method)
+auto get_elimit<ck_tile::bf16_t>(std::string /*init_method*/)
 {
-    if(init_method == "ui" || init_method == "ni")
+    double rtol = 1e-2;
-    {
+    double atol = 1e-2;
-        double rtol = 1e-2;
+    return ck_tile::make_tuple(rtol, atol);
-        double atol = 1e-2;
-        return ck_tile::make_tuple(rtol, atol);
-    }
-    else if(init_method == "nf")
-    {
-        double rtol = 1e-2;
-        double atol = 1e-2;
-        return ck_tile::make_tuple(rtol, atol);
-    }
-    else
-    {
-        double rtol = 3e-3;
-        double atol = 3e-3;
-        return ck_tile::make_tuple(rtol, atol);
-    }
 }
 template <>
@@ -250,6 +238,21 @@ bool run(const ck_tile::ArgParser& arg_parser)
    mask_info mask = mask_info::decode(
        arg_parser.get_str("mask"), seqlen_qs[0], seqlen_ks[0]); // TODO: we don't need x/y anymore
+    float p_drop         = arg_parser.get_float("p_drop");
+    uint64_t drop_seed   = arg_parser.get_uint64("drop_seed");
+    uint64_t drop_offset = arg_parser.get_uint64("drop_offset");
+    if(p_drop < 0.0f || p_drop > 1.0f)
+    {
+        std::cerr << "The value of p_drop should be 0~1" << std::endl;
+        return false;
+    }
+    bool s_randval = false;
+    if(p_drop > 0.0f && do_validation)
+    {
+        s_randval = true;
+    }
    std::string init_method      = arg_parser.get_str("init");
    std::optional<uint32_t> seed = arg_parser.get_uint32("seed");
    if(*seed == 0)
@@ -274,21 +277,23 @@ bool run(const ck_tile::ArgParser& arg_parser)
    using TypeConfig = FmhaFwdTypeConfig<DataType>;
-    using QDataType           = typename TypeConfig::QDataType;
+    using QDataType             = typename TypeConfig::QDataType;
-    using KDataType           = typename TypeConfig::KDataType;
+    using KDataType             = typename TypeConfig::KDataType;
-    using VDataType           = typename TypeConfig::VDataType;
+    using VDataType             = typename TypeConfig::VDataType;
-    using BiasDataType        = typename TypeConfig::BiasDataType;
+    using BiasDataType          = typename TypeConfig::BiasDataType;
-    using LSEDataType         = typename TypeConfig::LSEDataType;
+    using RandValOutputDataType = typename TypeConfig::RandValOutputDataType;
-    using SaccDataType        = typename TypeConfig::SaccDataType;
+    using LSEDataType           = typename TypeConfig::LSEDataType;
-    using SMPLComputeDataType = typename TypeConfig::SMPLComputeDataType;
+    using SaccDataType          = typename TypeConfig::SaccDataType;
-    using PDataType           = typename TypeConfig::PDataType;
+    using SMPLComputeDataType   = typename TypeConfig::SMPLComputeDataType;
-    using OaccDataType        = typename TypeConfig::OaccDataType;
+    using PDataType             = typename TypeConfig::PDataType;
-    using ODataType           = typename TypeConfig::ODataType;
+    using OaccDataType          = typename TypeConfig::OaccDataType;
+    using ODataType             = typename TypeConfig::ODataType;
    // accumulation numbers for performance evaluation
    std::size_t flop = 0, num_byte = 0;
    auto max_seqlen_q =
        std::numeric_limits<int32_t>::min(); // we will use max seqlen to decide grid size
+    auto max_seqlen_k = std::numeric_limits<int32_t>::min();
    {
        for(ck_tile::index_t wb = 0; wb < batch; ++wb)
        {
@@ -300,6 +305,11 @@ bool run(const ck_tile::ArgParser& arg_parser)
                max_seqlen_q = real_seqlen_q;
            }
+            if(max_seqlen_k < real_seqlen_k)
+            {
+                max_seqlen_k = real_seqlen_k;
+            }
            flop += nhead * (static_cast<std::size_t>(2) * real_seqlen_q * real_seqlen_k * hdim_q +
                             static_cast<std::size_t>(2) * real_seqlen_q * hdim_v * real_seqlen_k);
@@ -353,12 +363,16 @@ bool run(const ck_tile::ArgParser& arg_parser)
    // self define lse data layout as [shape_batch, nhead, shape_seqlen_q]
    ck_tile::HostTensor<LSEDataType> lse_host(
-        lse ? std::array<ck_tile::index_t, 3>{shape_batch, nhead, shape_seqlen_q}
+        lse ? std::array<ck_tile::index_t, 3>{batch, nhead, max_seqlen_q}
            : std::array<ck_tile::index_t, 3>{1, 1, 1} /* dummy shape for simplifying code */);
    ck_tile::HostTensor<ODataType> o_host(
        get_lengths(o_perm, shape_batch, nhead, shape_seqlen_q, hdim_v));
+    ck_tile::HostTensor<RandValOutputDataType> randval_host(
+        p_drop > 0 ? get_lengths(true, shape_batch, nhead, shape_seqlen_q, max_seqlen_k)
+                   : std::array<ck_tile::index_t, 4>{1, 1, 1, 1});
    if(init_method == "ui" || init_method == "0")
    {
        ck_tile::FillUniformDistributionIntegerValue<QDataType>{-3.f, 3.f, seed}(q_host);
@@ -434,6 +448,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
    ck_tile::DeviceMem seqstart_q(seqstart_q_host.size() * sizeof(int32_t));
    ck_tile::DeviceMem seqstart_k(seqstart_k_host.size() * sizeof(int32_t));
    ck_tile::DeviceMem seqlen_k_buf(seqlen_kpads[0] < 0 ? 0 : seqlen_ks.size() * sizeof(int32_t));
+    ck_tile::DeviceMem randval_buf(randval_host.get_element_space_size_in_bytes());
    ck_tile::DeviceMem alibi_slope_buf(alibi_slope_host.get_element_space_size_in_bytes());
    q_buf.ToDevice(q_host.data());
@@ -463,8 +478,8 @@ bool run(const ck_tile::ArgParser& arg_parser)
              << (seqlen_kpads[0] < 0 ? ""
                                      : (std::string("(") + std::to_string(seqlen_kpads[0]) + ")"))
              << ", d:" << hdim_q << "/" << hdim_v << ", scale_s:" << scale_s << ", bias:" << bias
-              << ", lse:" << lse << ", squant:" << squant << ", mask:" << mask << ", v:" << vlayout
+              << ", p_drop:" << p_drop << ", lse:" << lse << ", squant:" << squant
-              << std::flush;
+              << ", mask:" << mask << ", v:" << vlayout << std::flush;
    auto fmha_traits = fmha_fwd_traits{hdim_q,
                                       hdim_v,
@@ -474,6 +489,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
                                       mask.type,
                                       bias.type,
                                       lse,
+                                       p_drop > 0.0f,
                                       squant};
    auto p_compute_element_func = [&]() {
@@ -505,8 +521,9 @@ bool run(const ck_tile::ArgParser& arg_parser)
            else
                return i_perm ? shape_seqlen_k : nhead_k * shape_seqlen_k;
        }();
-        const ck_tile::index_t stride_bias = (i_perm ? shape_seqlen_k : 1 * shape_seqlen_k);
+        const ck_tile::index_t stride_bias    = (i_perm ? shape_seqlen_k : 1 * shape_seqlen_k);
-        const ck_tile::index_t stride_o    = (o_perm ? hdim_v : nhead * hdim_v);
+        const ck_tile::index_t stride_randval = (max_seqlen_k);
+        const ck_tile::index_t stride_o       = (o_perm ? hdim_v : nhead * hdim_v);
        // setup nhead_stride_* arguments
        const ck_tile::index_t nhead_stride_q = (i_perm ? shape_seqlen_q * hdim_q : hdim_q);
        const ck_tile::index_t nhead_stride_k = (i_perm ? shape_seqlen_k * hdim_q : hdim_q);
@@ -518,21 +535,24 @@ bool run(const ck_tile::ArgParser& arg_parser)
        }();
        const ck_tile::index_t nhead_stride_bias =
            (i_perm ? 0 * shape_seqlen_q * shape_seqlen_k : 0 * shape_seqlen_k);
-        const ck_tile::index_t nhead_stride_lse = (shape_seqlen_q * 1);
+        const ck_tile::index_t nhead_stride_randval = (shape_seqlen_q * max_seqlen_k);
-        const ck_tile::index_t nhead_stride_o   = (o_perm ? shape_seqlen_q * hdim_v : hdim_v);
+        const ck_tile::index_t nhead_stride_lse     = max_seqlen_q;
+        const ck_tile::index_t nhead_stride_o       = (o_perm ? shape_seqlen_q * hdim_v : hdim_v);
        // setup batch_stride_* arguments
-        const ck_tile::index_t batch_stride_q    = (nhead * shape_seqlen_q * hdim_q);
+        const ck_tile::index_t batch_stride_q       = (nhead * shape_seqlen_q * hdim_q);
-        const ck_tile::index_t batch_stride_k    = (nhead_k * shape_seqlen_k * hdim_q);
+        const ck_tile::index_t batch_stride_k       = (nhead_k * shape_seqlen_k * hdim_q);
-        const ck_tile::index_t batch_stride_v    = (nhead_k * hdim_v * shape_seqlen_k);
+        const ck_tile::index_t batch_stride_v       = (nhead_k * hdim_v * shape_seqlen_k);
-        const ck_tile::index_t batch_stride_bias = (0 * nhead * shape_seqlen_q * shape_seqlen_k);
+        const ck_tile::index_t batch_stride_bias    = (0 * nhead * shape_seqlen_q * shape_seqlen_k);
-        const ck_tile::index_t batch_stride_lse  = (nhead * shape_seqlen_q * 1);
+        const ck_tile::index_t batch_stride_randval = (nhead * shape_seqlen_q * max_seqlen_k);
-        const ck_tile::index_t batch_stride_o    = (nhead * shape_seqlen_q * hdim_v);
+        const ck_tile::index_t batch_stride_lse     = (nhead * max_seqlen_q);
+        const ck_tile::index_t batch_stride_o       = (nhead * shape_seqlen_q * hdim_v);
        return fmha_fwd_args{q_buf.GetDeviceBuffer(),
                             k_buf.GetDeviceBuffer(),
                             v_buf.GetDeviceBuffer(),
                             bias.type == bias_enum::alibi ? alibi_slope_buf.GetDeviceBuffer()
                                                           : bias_buf.GetDeviceBuffer(),
+                             randval_buf.GetDeviceBuffer(),
                             lse_buf.GetDeviceBuffer(),
                             o_buf.GetDeviceBuffer(),
                             seqstart_q.GetDeviceBuffer(),
@@ -554,22 +574,28 @@ bool run(const ck_tile::ArgParser& arg_parser)
                             stride_v,
                             bias.type == bias_enum::alibi ? (bias.rank_info == 0 ? 0 : nhead)
                                                           : stride_bias,
+                             stride_randval,
                             stride_o,
                             nhead_stride_q,
                             nhead_stride_k,
                             nhead_stride_v,
                             nhead_stride_bias,
+                             nhead_stride_randval,
                             nhead_stride_lse,
                             nhead_stride_o,
                             batch_stride_q,
                             batch_stride_k,
                             batch_stride_v,
                             batch_stride_bias,
+                             batch_stride_randval,
                             batch_stride_lse,
                             batch_stride_o,
                             mask.left,
                             mask.right,
-                             static_cast<ck_tile::index_t>(mask.type)};
+                             static_cast<ck_tile::index_t>(mask.type),
+                             p_drop,
+                             s_randval,
+                             {drop_seed, drop_offset}};
    }();
    float ave_time = fmha_fwd(fmha_traits, fmha_args, stream_config);
@@ -596,6 +622,11 @@ bool run(const ck_tile::ArgParser& arg_parser)
    o_buf.FromDevice(o_host.data());
    lse_buf.FromDevice(lse_host.data());
+    randval_buf.FromDevice(randval_host.data()); // read back the random values for validation
+    float p_undrop = 1.0 - p_drop; // keep probability
+    uint8_t p_undrop_in_uint8_t = // keep probability scaled to the uint8_t range, rounded down
+        uint8_t(std::floor(p_undrop * std::numeric_limits<uint8_t>::max()));
+    float rp_undrop = 1.0 / p_undrop; // NOTE(review): p_drop == 1 passes the range check -> inf
    bool pass = true;
@@ -771,6 +802,17 @@ bool run(const ck_tile::ArgParser& arg_parser)
                s_host_ref, p_host_ref, p_compute_element_func);
        }
+        if(p_drop > 0) // dropout active: the host reference needs the device's random values
+        {
+            ck_tile::HostTensor<RandValOutputDataType> randval_host_ref(
+                {nhead, real_seqlen_q, real_seqlen_k}); // per-batch view: [nhead, seqlen_q, seqlen_k]
+            randval_host_ref.ForEach([&](auto& self, auto idx) {
+                self(idx) = randval_host(b, idx[0], idx[1] + query_offset, idx[2]); // slice batch b
+            });
+            ck_tile::reference_batched_dropout( // presumably updates p_host_ref in place - confirm
+                p_host_ref, randval_host_ref, p_undrop_in_uint8_t, rp_undrop);
+        }
        ck_tile::reference_batched_gemm<PDataType, VDataType, OaccDataType, ODataType>(
            p_host_ref,
            v_host_ref,
@@ -804,9 +846,8 @@ bool run(const ck_tile::ArgParser& arg_parser)
        if(lse)
        {
            ck_tile::HostTensor<SMPLComputeDataType> lse_host_result({nhead, real_seqlen_q});
-            lse_host_result.ForEach([&](auto& self, auto idx) {
+            lse_host_result.ForEach(
-                self(idx) = lse_host(b, idx[0], idx[1] + query_offset);
+                [&](auto& self, auto idx) { self(idx) = lse_host(wb, idx[0], idx[1]); });
-            });
            bool lse_pass = ck_tile::check_err(lse_host_result,
                                               lse_host_ref,

--- a/example/ck_tile/01_fmha/fmha_fwd.hpp
+++ b/example/ck_tile/01_fmha/fmha_fwd.hpp
@@ -17,61 +17,65 @@ struct FmhaFwdTypeConfig;
 template <>
 struct FmhaFwdTypeConfig<ck_tile::half_t>
 {
-    using QDataType           = ck_tile::half_t;
+    using QDataType             = ck_tile::half_t;
-    using KDataType           = ck_tile::half_t;
+    using KDataType             = ck_tile::half_t;
-    using VDataType           = ck_tile::half_t;
+    using VDataType             = ck_tile::half_t;
-    using BiasDataType        = ck_tile::half_t;
+    using BiasDataType          = ck_tile::half_t;
-    using LSEDataType         = float; // data type for lse(logsumexp L_j = max_j + log(l_j))
+    using RandValOutputDataType = uint8_t; // element type of the dropout random-value buffer
-    using SaccDataType        = float; // data type for first gemm accumulation
+    using LSEDataType           = float; // data type for lse(logsumexp L_j = max_j + log(l_j))
-    using SMPLComputeDataType = float; // data type for reduction, softmax
+    using SaccDataType          = float; // data type for first gemm accumulation
-    using PDataType           = ck_tile::half_t; // data type for A matrix of second gemm
+    using SMPLComputeDataType   = float; // data type for reduction, softmax
-    using OaccDataType        = float;           // data type for second gemm accumulation
+    using PDataType             = ck_tile::half_t; // data type for A matrix of second gemm
-    using ODataType           = ck_tile::half_t;
+    using OaccDataType          = float;           // data type for second gemm accumulation
+    using ODataType             = ck_tile::half_t; // data type for the final output (o)
 };
 template <>
 struct FmhaFwdTypeConfig<ck_tile::bf16_t>
 {
-    using QDataType           = ck_tile::bf16_t;
+    using QDataType             = ck_tile::bf16_t;
-    using KDataType           = ck_tile::bf16_t;
+    using KDataType             = ck_tile::bf16_t;
-    using VDataType           = ck_tile::bf16_t;
+    using VDataType             = ck_tile::bf16_t;
-    using BiasDataType        = ck_tile::bf16_t;
+    using BiasDataType          = ck_tile::bf16_t;
-    using LSEDataType         = float; // data type for lse(logsumexp L_j = max_j + log(l_j))
+    using RandValOutputDataType = uint8_t; // element type of the dropout random-value buffer
-    using SaccDataType        = float; // data type for first gemm accumulation
+    using LSEDataType           = float; // data type for lse(logsumexp L_j = max_j + log(l_j))
-    using SMPLComputeDataType = float; // data type for reduction, softmax
+    using SaccDataType          = float; // data type for first gemm accumulation
-    using PDataType           = ck_tile::bf16_t; // data type for A matrix of second gemm
+    using SMPLComputeDataType   = float; // data type for reduction, softmax
-    using OaccDataType        = float;           // data type for second gemm accumulation
+    using PDataType             = ck_tile::bf16_t; // data type for A matrix of second gemm
-    using ODataType           = ck_tile::bf16_t;
+    using OaccDataType          = float;           // data type for second gemm accumulation
+    using ODataType             = ck_tile::bf16_t; // data type for the final output (o)
 };
 template <>
 struct FmhaFwdTypeConfig<ck_tile::fp8_t>
 {
-    using QDataType           = ck_tile::fp8_t;
+    using QDataType             = ck_tile::fp8_t;
-    using KDataType           = ck_tile::fp8_t;
+    using KDataType             = ck_tile::fp8_t;
-    using VDataType           = ck_tile::fp8_t;
+    using VDataType             = ck_tile::fp8_t;
-    using BiasDataType        = float;
+    using BiasDataType          = float; // unlike q/k/v, bias is kept in float for fp8
-    using LSEDataType         = float; // data type for lse(logsumexp L_j = max_j + log(l_j))
+    using RandValOutputDataType = uint8_t; // element type of the dropout random-value buffer
-    using SaccDataType        = float; // data type for first gemm accumulation
+    using LSEDataType           = float; // data type for lse(logsumexp L_j = max_j + log(l_j))
-    using SMPLComputeDataType = float; // data type for reduction, softmax
+    using SaccDataType          = float; // data type for first gemm accumulation
-    using PDataType           = ck_tile::fp8_t; // data type for A matrix of second gemm
+    using SMPLComputeDataType   = float; // data type for reduction, softmax
-    using OaccDataType        = float;          // data type for second gemm accumulation
+    using PDataType             = ck_tile::fp8_t; // data type for A matrix of second gemm
-    using ODataType           = ck_tile::fp8_t;
+    using OaccDataType          = float;          // data type for second gemm accumulation
+    using ODataType             = ck_tile::fp8_t; // data type for the final output (o)
 };
 template <>
 struct FmhaFwdTypeConfig<ck_tile::bf8_t>
 {
-    using QDataType           = ck_tile::bf8_t;
+    using QDataType             = ck_tile::bf8_t;
-    using KDataType           = ck_tile::bf8_t;
+    using KDataType             = ck_tile::bf8_t;
-    using VDataType           = ck_tile::bf8_t;
+    using VDataType             = ck_tile::bf8_t;
-    using BiasDataType        = ck_tile::bf8_t;
+    using BiasDataType          = ck_tile::bf8_t;
-    using LSEDataType         = float; // data type for lse(logsumexp L_j = max_j + log(l_j))
+    using RandValOutputDataType = uint8_t; // element type of the dropout random-value buffer
-    using SaccDataType        = float; // data type for first gemm accumulation
+    using LSEDataType           = float; // data type for lse(logsumexp L_j = max_j + log(l_j))
-    using SMPLComputeDataType = float; // data type for reduction, softmax
+    using SaccDataType          = float; // data type for first gemm accumulation
-    using PDataType           = ck_tile::bf8_t; // data type for A matrix of second gemm
+    using SMPLComputeDataType   = float; // data type for reduction, softmax
-    using OaccDataType        = float;          // data type for second gemm accumulation
+    using PDataType             = ck_tile::bf8_t; // data type for A matrix of second gemm
-    using ODataType           = ck_tile::bf8_t;
+    using OaccDataType          = float;          // data type for second gemm accumulation
+    using ODataType             = ck_tile::bf8_t; // data type for the final output (o)
 };
 struct FmhaMasks
@@ -88,6 +92,7 @@ struct fmha_fwd_args
    const void* k_ptr;
    const void* v_ptr;
    const void* bias_ptr; // bias or alibi_slope pointer
+    void* rand_val_ptr;
    void* lse_ptr;
    void* o_ptr;
    const void* seqstart_q_ptr;
@@ -108,22 +113,28 @@ struct fmha_fwd_args
    ck_tile::index_t stride_k;
    ck_tile::index_t stride_v;
    ck_tile::index_t stride_bias; // if alibi, b*h need set this to h, 1*h need set this to 0
+    ck_tile::index_t stride_randval; // stride for rand_val_ptr (the example passes max_seqlen_k)
    ck_tile::index_t stride_o;
    ck_tile::index_t nhead_stride_q;
    ck_tile::index_t nhead_stride_k;
    ck_tile::index_t nhead_stride_v;
    ck_tile::index_t nhead_stride_bias;
+    ck_tile::index_t nhead_stride_randval; // per-head stride for rand_val_ptr
    ck_tile::index_t nhead_stride_lse;
    ck_tile::index_t nhead_stride_o;
    ck_tile::index_t batch_stride_q;
    ck_tile::index_t batch_stride_k;
    ck_tile::index_t batch_stride_v;
    ck_tile::index_t batch_stride_bias;
+    ck_tile::index_t batch_stride_randval; // per-batch stride for rand_val_ptr
    ck_tile::index_t batch_stride_lse;
    ck_tile::index_t batch_stride_o;
    ck_tile::index_t window_size_left;
    ck_tile::index_t window_size_right;
    ck_tile::index_t mask_type;
+    float p_drop; // dropout probability, 0~1
+    bool s_randval; // the example sets this only when p_drop > 0 and validation is enabled
+    std::tuple<uint64_t, uint64_t> drop_seed_offset; // {drop_seed, drop_offset} for the RNG
 };
 template <typename FmhaKernel>
@@ -138,6 +149,7 @@ auto fmha_fwd_create_kargs_and_grids(fmha_fwd_args args)
                                         args.k_ptr,
                                         args.v_ptr,
                                         args.bias_ptr,
+                                         args.rand_val_ptr,
                                         args.lse_ptr,
                                         args.o_ptr,
                                         args.seqstart_q_ptr,
@@ -145,6 +157,7 @@ auto fmha_fwd_create_kargs_and_grids(fmha_fwd_args args)
                                         args.seqlen_k_ptr,
                                         args.hdim_q,
                                         args.hdim_v,
+                                         args.nhead_q,
                                         args.nhead_q / args.nhead_k,
                                         args.scale_s,
                                         args.scale_p,
@@ -153,16 +166,22 @@ auto fmha_fwd_create_kargs_and_grids(fmha_fwd_args args)
                                         args.stride_k,
                                         args.stride_v,
                                         args.stride_bias,
+                                         args.stride_randval,
                                         args.stride_o,
                                         args.nhead_stride_q,
                                         args.nhead_stride_k,
                                         args.nhead_stride_v,
                                         args.nhead_stride_bias,
+                                         args.nhead_stride_randval,
                                         args.nhead_stride_lse,
                                         args.nhead_stride_o,
+                                         args.batch_stride_lse,
                                         args.window_size_left,
                                         args.window_size_right,
-                                         args.mask_type);
+                                         args.mask_type,
+                                         args.p_drop,
+                                         args.s_randval,
+                                         args.drop_seed_offset);
        }
        else
        { // create batch mode kernel arguments
@@ -170,12 +189,14 @@ auto fmha_fwd_create_kargs_and_grids(fmha_fwd_args args)
                                         args.k_ptr,
                                         args.v_ptr,
                                         args.bias_ptr,
+                                         args.rand_val_ptr,
                                         args.lse_ptr,
                                         args.o_ptr,
                                         args.seqlen_q,
                                         args.seqlen_k,
                                         args.hdim_q,
                                         args.hdim_v,
+                                         args.nhead_q,
                                         args.nhead_q / args.nhead_k,
                                         args.scale_s,
                                         args.scale_p,
@@ -184,22 +205,28 @@ auto fmha_fwd_create_kargs_and_grids(fmha_fwd_args args)
                                         args.stride_k,
                                         args.stride_v,
                                         args.stride_bias,
+                                         args.stride_randval,
                                         args.stride_o,
                                         args.nhead_stride_q,
                                         args.nhead_stride_k,
                                         args.nhead_stride_v,
                                         args.nhead_stride_bias,
+                                         args.nhead_stride_randval,
                                         args.nhead_stride_lse,
                                         args.nhead_stride_o,
                                         args.batch_stride_q,
                                         args.batch_stride_k,
                                         args.batch_stride_v,
                                         args.batch_stride_bias,
+                                         args.batch_stride_randval,
                                         args.batch_stride_lse,
                                         args.batch_stride_o,
                                         args.window_size_left,
                                         args.window_size_right,
-                                         args.mask_type);
+                                         args.mask_type,
+                                         args.p_drop,
+                                         args.s_randval,
+                                         args.drop_seed_offset);
        }
    }();
@@ -222,6 +249,7 @@ template <ck_tile::index_t HDim_,
          typename FmhaMask_,
          ck_tile::BlockAttentionBiasEnum BiasEnum_,
          bool kStoreLse_,
+          bool kHasDropout_,
          bool kDoFp8StaticQuant_,
          bool kPadS_,
          bool kPadSK_,
@@ -243,6 +271,7 @@ struct fmha_fwd_traits_
    using FmhaMask                                   = ck_tile::remove_cvref_t<FmhaMask_>;
    static constexpr auto BiasEnum                   = BiasEnum_;
    static constexpr bool kStoreLse                  = kStoreLse_;
+    static constexpr bool kHasDropout                = kHasDropout_;
    static constexpr bool kDoFp8StaticQuant          = kDoFp8StaticQuant_;
    static constexpr bool kPadS                      = kPadS_;
    static constexpr bool kPadSK                     = kPadSK_;
@@ -264,6 +293,7 @@ struct fmha_fwd_traits
    mask_enum mask_type;
    bias_enum bias_type; // 0:no bias, 1:elementwise bias, 2:alibi. sync with BlockAttentionBiasEnum
    bool has_lse;
+    bool has_dropout;
    bool do_fp8_static_quant;
    // TODO: padding check is inside this api
 };

--- a/example/ck_tile/01_fmha/generate.py
+++ b/example/ck_tile/01_fmha/generate.py
--- a/example/ck_tile/01_fmha/script/benchmark_bwd.sh
+++ b/example/ck_tile/01_fmha/script/benchmark_bwd.sh
+#!/bin/sh
+# TODO: run this script from CK root
+BUILD=build
+EXE=$BUILD/bin/tile_example_fmha_bwd
+VALID=0
+for prec in "fp16" "bf16" ; do
+for perm in 0 1 ; do
+for hdim in 32 64 128 ; do
+nhead=$((2048 / $hdim))     # follow fav2 setup
+$EXE -prec=$prec -b=32 -h=$nhead -d=$hdim -s=512   -iperm=$perm -operm=$perm -kname=1 -v=$VALID ; sleep 3
+$EXE -prec=$prec -b=16 -h=$nhead -d=$hdim -s=1024  -iperm=$perm -operm=$perm -kname=1 -v=$VALID ; sleep 3
+$EXE -prec=$prec -b=8  -h=$nhead -d=$hdim -s=2048  -iperm=$perm -operm=$perm -kname=1 -v=$VALID ; sleep 3
+$EXE -prec=$prec -b=4  -h=$nhead -d=$hdim -s=4096  -iperm=$perm -operm=$perm -kname=1 -v=$VALID ; sleep 3
+$EXE -prec=$prec -b=2  -h=$nhead -d=$hdim -s=8192  -iperm=$perm -operm=$perm -kname=1 -v=$VALID ; sleep 3
+$EXE -prec=$prec -b=1  -h=$nhead -d=$hdim -s=16384 -iperm=$perm -operm=$perm -kname=1 -v=$VALID ; sleep 3
+done
+done
+done
--- a/example/ck_tile/01_fmha/script/benchmark.sh
+++ b/example/ck_tile/01_fmha/script/benchmark.sh
--- a/example/ck_tile/01_fmha/script/smoke_test_bwd.sh
+++ b/example/ck_tile/01_fmha/script/smoke_test_bwd.sh
+#!/bin/sh
+# TODO: run this script from CK root
+BUILD=build
+EXE=$BUILD/bin/tile_example_fmha_bwd
+KNAME=1
+export CK_WARMUP=0
+export CK_REPEAT=1
+COMMON_ARGS='-v=1'
+for prec in "fp16" "bf16" ; do
+for perm in 0 1 ; do
+for hdim in 32 64 128 ; do
+for mode in 0 1 ; do
+for bias in "n" "e" "a"; do
+for dbias in 0 1 ; do
+for p_drop in 0.0 0.2; do
+$EXE -prec=$prec -b=1 -h=4 -h_k=2 -d=$hdim -s=259 -bias=$bias -dbias=$dbias -p_drop=$p_drop -iperm=$perm -operm=$perm -v=1 -mode=$mode -kname=$KNAME $COMMON_ARGS
+$EXE -prec=$prec -b=2 -h=2 -d=$hdim -s=516 -s_k=253 -bias=$bias -dbias=$dbias -p_drop=$p_drop -iperm=$perm -operm=$perm -v=1 -mode=$mode -kname=$KNAME $COMMON_ARGS
+$EXE -prec=$prec -b=1 -h=4 -h_k=1 -d=$hdim -s=500 -s_k=251 -bias=$bias -dbias=$dbias -p_drop=$p_drop -iperm=$perm -operm=$perm -mask=1 -v=1 -mode=$mode -kname=$KNAME $COMMON_ARGS
+$EXE -prec=$prec -b=1 -h=2 -d=$hdim -s=900 -s_k=258 -bias=$bias -dbias=$dbias -p_drop=$p_drop -iperm=$perm -operm=$perm -mask=2 -v=1 -mode=$mode -kname=$KNAME $COMMON_ARGS
+$EXE -prec=$prec -b=2 -h=1 -d=$hdim -s=987 -s_k=219 -bias=$bias -dbias=$dbias -p_drop=$p_drop -iperm=$perm -operm=$perm -mask=t:128,30 -v=1 -mode=$mode -kname=$KNAME $COMMON_ARGS
+$EXE -prec=$prec -b=2 -h=3 -h_k=1 -d=$hdim -s=244 -s_k=499 -bias=$bias -dbias=$dbias -p_drop=$p_drop -iperm=$perm -operm=$perm -mask=b:4,35 -v=1 -mode=$mode -kname=$KNAME $COMMON_ARGS
+done
+done
+done
+done
+done
+done
+done