Merge remote-tracking branch 'origin/develop' into ck_tile/fav2_fwd_sept

6a25d081 · carlushuang · 02f8c487 · ceaed8e0 · 6a25d081 · 6a25d081
Commit 6a25d081 authored Oct 09, 2024 by carlushuang
20 changed files
--- a/example/01_gemm/gemm_xdl_int8.cpp
+++ b/example/01_gemm/gemm_xdl_int8.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 #include "common.hpp"

@@ -33,6 +33,17 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle
 using ReferenceGemmInstance = ck::tensor_operation::host::
    ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>;

+using ReferenceGemmInstanceGPU = ck::tensor_operation::device::ReferenceGemm<ALayout,
+                                                                             BLayout,
+                                                                             CLayout,
+                                                                             ADataType,
+                                                                             BDataType,
+                                                                             CDataType,
+                                                                             AccDataType,
+                                                                             AElementOp,
+                                                                             BElementOp,
+                                                                             CElementOp>;
+
 #include "run_gemm_example.inc"

 int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); }
--- a/example/01_gemm/gemm_xdl_lds_direct_load_fp16.cpp
+++ b/example/01_gemm/gemm_xdl_lds_direct_load_fp16.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.

 #include <iostream>

@@ -53,6 +53,17 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle
 using ReferenceGemmInstance = ck::tensor_operation::host::
    ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>;

+using ReferenceGemmInstanceGPU = ck::tensor_operation::device::ReferenceGemm<ALayout,
+                                                                             BLayout,
+                                                                             CLayout,
+                                                                             ADataType,
+                                                                             BDataType,
+                                                                             CDataType,
+                                                                             AccDataType,
+                                                                             AElementOp,
+                                                                             BElementOp,
+                                                                             CElementOp>;
+
 #include "run_gemm_example.inc"

 int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); }
--- a/example/01_gemm/gemm_xdl_lds_direct_load_fp32.cpp
+++ b/example/01_gemm/gemm_xdl_lds_direct_load_fp32.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.

 #include <iostream>

@@ -52,6 +52,17 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle
 using ReferenceGemmInstance = ck::tensor_operation::host::
    ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>;

+using ReferenceGemmInstanceGPU = ck::tensor_operation::device::ReferenceGemm<ALayout,
+                                                                             BLayout,
+                                                                             CLayout,
+                                                                             ADataType,
+                                                                             BDataType,
+                                                                             CDataType,
+                                                                             AccDataType,
+                                                                             AElementOp,
+                                                                             BElementOp,
+                                                                             CElementOp>;
+
 #include "run_gemm_example.inc"

 int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); }
--- a/example/01_gemm/gemm_xdl_streamk.cpp
+++ b/example/01_gemm/gemm_xdl_streamk.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 #include "common.hpp"

@@ -44,6 +44,17 @@ using DeviceGemmInstance = DeviceGemmStreamK;
 using ReferenceGemmInstance = ck::tensor_operation::host::
    ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>;

+using ReferenceGemmInstanceGPU = ck::tensor_operation::device::ReferenceGemm<ALayout,
+                                                                             BLayout,
+                                                                             CLayout,
+                                                                             ADataType,
+                                                                             BDataType,
+                                                                             CDataType,
+                                                                             AccDataType,
+                                                                             AElementOp,
+                                                                             BElementOp,
+                                                                             CElementOp>;
+
 #include "run_gemm_example.inc"

 int main(int argc, char* argv[]) { return !run_gemm_streamk_example(argc, argv); }
--- a/example/01_gemm/gemm_xdl_wavelet_fp16.cpp
+++ b/example/01_gemm/gemm_xdl_wavelet_fp16.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 #include "common.hpp"

@@ -37,6 +37,17 @@ using DeviceGemmInstance = DeviceGemmInstance;
 using ReferenceGemmInstance = ck::tensor_operation::host::
    ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>;

+using ReferenceGemmInstanceGPU = ck::tensor_operation::device::ReferenceGemm<ALayout,
+                                                                             BLayout,
+                                                                             CLayout,
+                                                                             ADataType,
+                                                                             BDataType,
+                                                                             CDataType,
+                                                                             AccDataType,
+                                                                             AElementOp,
+                                                                             BElementOp,
+                                                                             CElementOp>;
+
 #include "run_gemm_example.inc"

 int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); }
--- a/example/01_gemm/run_gemm_example.inc
+++ b/example/01_gemm/run_gemm_example.inc
@@ -173,6 +173,7 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)

    Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
    Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
+    Tensor<CDataType> c_m_n_device_ref_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));

    std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
    std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
@@ -193,6 +194,8 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
    DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
    DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
    DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize());
+    DeviceMem c_m_n_device_ref_buf(sizeof(CDataType) *
+                                   c_m_n_device_ref_result.mDesc.GetElementSpaceSize());

    a_m_k_device_buf.ToDevice(a_m_k.mData.data());
    b_k_n_device_buf.ToDevice(b_k_n.mData.data());
@@ -325,14 +328,18 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
              << gemm.GetTypeString() << std::endl;

+    bool pass = true;
+
    if(config.do_verification)
    {
+        // CPU verification
        auto ref_gemm    = ReferenceGemmInstance{};
        auto ref_invoker = ref_gemm.MakeInvoker();

        auto ref_argument = ref_gemm.MakeArgument(
            a_m_k, b_k_n, c_m_n_host_result, a_element_op, b_element_op, c_element_op);

+        std::cout << "Running verification on CPU." << std::endl;
        ref_invoker.Run(ref_argument);

 #ifdef BUILD_INT4_EXAMPLE
@@ -346,15 +353,42 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
 #else
        c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data());

-        return ck::utils::check_err(c_m_n_device_result,
-                                    c_m_n_host_result,
-                                    "Error: Incorrect results!",
-                                    get_rtol<CDataType>(),
-                                    get_atol<CDataType>());
+        pass &= !ck::utils::check_err(c_m_n_device_result,
+                                      c_m_n_host_result,
+                                      "Error: Incorrect results!",
+                                      get_rtol<CDataType>(),
+                                      get_atol<CDataType>());
 #endif
+
+        // GPU verification
+        auto ref_gemm_gpu    = ReferenceGemmInstanceGPU{};
+        auto ref_invoker_gpu = ref_gemm_gpu.MakeInvoker();
+
+        auto ref_argument_gpu = ref_gemm_gpu.MakeArgument(
+            static_cast<ADataType*>(a_m_k_device_buf.GetDeviceBuffer()),
+            static_cast<BDataType*>(b_k_n_device_buf.GetDeviceBuffer()),
+            static_cast<CDataType*>(c_m_n_device_ref_buf.GetDeviceBuffer()),
+            M,
+            N,
+            K,
+            a_element_op,
+            b_element_op,
+            c_element_op);
+
+        std::cout << "Running verification on GPU." << std::endl;
+        ref_invoker_gpu.Run(ref_argument_gpu, StreamConfig{});
+
+        c_m_n_device_ref_buf.FromDevice(c_m_n_device_ref_result.mData.data());
+        c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data());
+
+        pass &= !ck::utils::check_err(c_m_n_device_result,
+                                      c_m_n_device_ref_result,
+                                      "Error: Incorrect results!",
+                                      get_rtol<CDataType>(),
+                                      get_atol<CDataType>());
    }

-    return true;
+    return !pass;
 }

 bool run_gemm_example(int argc, char* argv[])

--- a/example/66_complex_contraction_bilinear/CMakeLists.txt
+++ b/example/66_complex_contraction_bilinear/CMakeLists.txt
+add_example_executable(example_complex_contraction_bilinear_xdl_fp32 complex_contraction_bilinear_xdl_fp32.cpp)
+add_example_executable(example_complex_contraction_bilinear_xdl_fp64 complex_contraction_bilinear_xdl_fp64.cpp)
+
--- a/example/66_complex_contraction_bilinear/README.md
+++ b/example/66_complex_contraction_bilinear/README.md
+# Instructions for ```example_complex_contraction_bilinear_xdl_fp32```
+
+## Run
+```bash
+#arg1: verification (0=no, 1=yes)
+#arg2: initialization (0=no init, 1=integer value, 2=decimal value)
+#arg3: time kernel (0=no, 1=yes)
+./bin/example_contraction_bilinear_xdl_fp32 1 1 1
+```
+
+
--- a/example/66_complex_contraction_bilinear/common_instances.hpp
+++ b/example/66_complex_contraction_bilinear/common_instances.hpp
--- a/example/66_complex_contraction_bilinear/complex_contraction_bilinear_xdl_fp32.cpp
+++ b/example/66_complex_contraction_bilinear/complex_contraction_bilinear_xdl_fp32.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "common_instances.hpp"
+
+using ADataType        = F32;
+using BDataType        = F32;
+using AccDataType      = F32;
+using CShuffleDataType = F32;
+using DDataType        = F32;
+using DsDataType       = ck::Tuple<DDataType>;
+using EDataType        = F32;
+using ComputeDataType  = F32;
+
+static constexpr ck::index_t NumDimM = 2;
+static constexpr ck::index_t NumDimN = 2;
+static constexpr ck::index_t NumDimK = 2;
+
+using AElementOp   = ck::tensor_operation::element_wise::PassThrough;
+using BElementOp   = ck::tensor_operation::element_wise::PassThrough;
+using CDEElementOp = ck::tensor_operation::element_wise::Bilinear;
+
+using DeviceOpInstanceKKNN = DeviceOpInstanceKK_Generic<NumDimM,
+                                                        NumDimN,
+                                                        NumDimK,
+                                                        ADataType,
+                                                        BDataType,
+                                                        AccDataType,
+                                                        CShuffleDataType,
+                                                        DsDataType,
+                                                        EDataType,
+                                                        ComputeDataType,
+                                                        AElementOp,
+                                                        BElementOp,
+                                                        CDEElementOp>;
+
+using DeviceOpInstanceKNNN = DeviceOpInstanceKN_Generic<NumDimM,
+                                                        NumDimN,
+                                                        NumDimK,
+                                                        ADataType,
+                                                        BDataType,
+                                                        AccDataType,
+                                                        CShuffleDataType,
+                                                        DsDataType,
+                                                        EDataType,
+                                                        ComputeDataType,
+                                                        AElementOp,
+                                                        BElementOp,
+                                                        CDEElementOp>;
+
+using DeviceOpInstanceMKNN = DeviceOpInstanceMK_Generic<NumDimM,
+                                                        NumDimN,
+                                                        NumDimK,
+                                                        ADataType,
+                                                        BDataType,
+                                                        AccDataType,
+                                                        CShuffleDataType,
+                                                        DsDataType,
+                                                        EDataType,
+                                                        ComputeDataType,
+                                                        AElementOp,
+                                                        BElementOp,
+                                                        CDEElementOp>;
+
+using DeviceOpInstanceMNNN = DeviceOpInstanceMN_Generic<NumDimM,
+                                                        NumDimN,
+                                                        NumDimK,
+                                                        ADataType,
+                                                        BDataType,
+                                                        AccDataType,
+                                                        CShuffleDataType,
+                                                        DsDataType,
+                                                        EDataType,
+                                                        ComputeDataType,
+                                                        AElementOp,
+                                                        BElementOp,
+                                                        CDEElementOp>;
+
+using DeviceOpInstance = DeviceOpInstanceKKNN;
+
+#include "run_complex_contraction_bilinear_example.inc"
+
+int main(int argc, char* argv[]) { return run_complex_contraction_bilinear_example(argc, argv); }
--- a/example/66_complex_contraction_bilinear/complex_contraction_bilinear_xdl_fp64.cpp
+++ b/example/66_complex_contraction_bilinear/complex_contraction_bilinear_xdl_fp64.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "common_instances.hpp"
+
+using ADataType        = F64;
+using BDataType        = F64;
+using AccDataType      = F64;
+using CShuffleDataType = F64;
+using DDataType        = F64;
+using DsDataType       = ck::Tuple<DDataType>;
+using EDataType        = F64;
+using ComputeDataType  = F64;
+
+static constexpr ck::index_t NumDimM = 2;
+static constexpr ck::index_t NumDimN = 2;
+static constexpr ck::index_t NumDimK = 2;
+
+using AElementOp   = ck::tensor_operation::element_wise::PassThrough;
+using BElementOp   = ck::tensor_operation::element_wise::PassThrough;
+using CDEElementOp = ck::tensor_operation::element_wise::Bilinear;
+
+using DeviceOpInstanceKKNN = DeviceOpInstanceKK_FP64<NumDimM,
+                                                     NumDimN,
+                                                     NumDimK,
+                                                     ADataType,
+                                                     BDataType,
+                                                     AccDataType,
+                                                     CShuffleDataType,
+                                                     DsDataType,
+                                                     EDataType,
+                                                     ComputeDataType,
+                                                     AElementOp,
+                                                     BElementOp,
+                                                     CDEElementOp>;
+
+using DeviceOpInstanceKNNN = DeviceOpInstanceKN_FP64<NumDimM,
+                                                     NumDimN,
+                                                     NumDimK,
+                                                     ADataType,
+                                                     BDataType,
+                                                     AccDataType,
+                                                     CShuffleDataType,
+                                                     DsDataType,
+                                                     EDataType,
+                                                     ComputeDataType,
+                                                     AElementOp,
+                                                     BElementOp,
+                                                     CDEElementOp>;
+
+using DeviceOpInstanceMKNN = DeviceOpInstanceMK_FP64<NumDimM,
+                                                     NumDimN,
+                                                     NumDimK,
+                                                     ADataType,
+                                                     BDataType,
+                                                     AccDataType,
+                                                     CShuffleDataType,
+                                                     DsDataType,
+                                                     EDataType,
+                                                     ComputeDataType,
+                                                     AElementOp,
+                                                     BElementOp,
+                                                     CDEElementOp>;
+
+using DeviceOpInstanceMNNN = DeviceOpInstanceMN_FP64<NumDimM,
+                                                     NumDimN,
+                                                     NumDimK,
+                                                     ADataType,
+                                                     BDataType,
+                                                     AccDataType,
+                                                     CShuffleDataType,
+                                                     DsDataType,
+                                                     EDataType,
+                                                     ComputeDataType,
+                                                     AElementOp,
+                                                     BElementOp,
+                                                     CDEElementOp>;
+
+using DeviceOpInstance = DeviceOpInstanceKKNN;
+
+#include "run_complex_contraction_bilinear_example.inc"
+
+int main(int argc, char* argv[]) { return run_complex_contraction_bilinear_example(argc, argv); }
--- a/example/66_complex_contraction_bilinear/run_complex_contraction_bilinear_example.inc
+++ b/example/66_complex_contraction_bilinear/run_complex_contraction_bilinear_example.inc
--- a/example/CMakeLists.txt
+++ b/example/CMakeLists.txt
@@ -45,11 +45,7 @@ function(add_example_executable EXAMPLE_NAME FILE_NAME)
    endforeach()
    endif()

-    if(INSTANCES_ONLY)
-        set(EX_TARGETS ${DEFAULT_GPU_TARGETS})
-    else()
-        set(EX_TARGETS ${GPU_TARGETS})
-    endif()
+    set(EX_TARGETS ${SUPPORTED_GPU_TARGETS})

    #Do not build any DL examples if DL_KERNELS not set
    foreach(source IN LISTS FILE_NAME)
@@ -147,11 +143,8 @@ function(add_example_executable_no_testing EXAMPLE_NAME FILE_NAME)
    endforeach()
    endif()

-    if(INSTANCES_ONLY)
-        set(EX_TARGETS ${DEFAULT_GPU_TARGETS})
-    else()
-        set(EX_TARGETS ${GPU_TARGETS})
-    endif()
+    set(EX_TARGETS ${SUPPORTED_GPU_TARGETS})
+
    #Do not build any DL examples if DL_KERNELS not set
    foreach(source IN LISTS FILE_NAME)
        if(NOT DEFINED DL_KERNELS AND source MATCHES "_dl")

--- a/example/ck_tile/01_fmha/README.md
+++ b/example/ck_tile/01_fmha/README.md
@@ -6,7 +6,8 @@ This folder contains example for fmha(fused multi-head attention) using ck_tile
 ```
 # in the root of ck_tile
 mkdir build && cd build
-sh ../script/cmake-ck-dev.sh  ../ <arch>  # you can replace this <arch> to gfx90a, gfx942...
+# you can replace <arch> with the appropriate architecture (for example gfx90a or gfx942) or leave it blank
+sh ../script/cmake-ck-dev.sh  ../ <arch>
 make tile_example_fmha_fwd -j
 ```
 This will result in an executable `build/bin/tile_example_fmha_fwd`
@@ -23,7 +24,7 @@ There are 3 template parameters for this kernel template.
 To speed up compile time, we instantiate the kernels into separate file. In this way we can benefit from parallel building from CMake/Make system. This is achieved by `generate.py` script. Besides, you can look into this script to learn how to instantiate a kernel instance step by step, which is described in `FMHA_FWD_KERNEL_BODY` variable.

 ## executable
-`tile_example_fmha_fwd` is the example executable, implemented in `fmha_fwd.cpp`. You can type `./bin/tile_example_fmha_fwd -?` to list all supported args. Below is an example of the output (may subject to change)
+`tile_example_fmha_fwd` is the example executable, implemented in `fmha_fwd.cpp`. You can type `./bin/tile_example_fmha_fwd -?` to list all the arguments. Below is an example of the output (may subject to change)
 ```
 args:
          -v    weather do CPU validation or not (default:1)
@@ -31,47 +32,52 @@ args:
          -b    batch size (default:2)
          -h    num of head, for q (default:8)
        -h_k    num of head, for k/v, -1 means equal to h (default:-1)
-                 if not equal to h, then this is GQA/MQA case
+                if not equal to h, then this is GQA/MQA case
          -s    seqlen_q. if group-mode, means the average value of seqlen_q (default:3328)
-                 total_seqlen_q = seqlen_q * batch, and seqlen_q per batch may vary
-                 also with "-s=s0,s1,s2..." comma seperated int to set per batch seqlen(group-mode)
-        -s_k    seqlen_k, -1 means equal to s (default:-1)
+                total_seqlen_q = seqlen_q * batch, and seqlen_q per batch may vary
+                also with "-s=s0,s1,s2..." comma seperated int to set per batch seqlen(group-mode)
+        -s_k    seqlen_k (including new key/value), -1 means equal to s (default:-1)
          -d    head dim for q, k (default:128)
        -d_v    head dim for v, -1 means equal to d (default:-1)
    -scale_s    scale factor of S. 0 means equal to 1/sqrt(hdim). (default:0)
-                 note when squant=1, this value will be modified by range_q/k
+                note when squant=1, this value will be modified by range_q/k
    -range_q    per-tensor quantization range of q. used if squant=1. (default:16)
    -range_k    per-tensor quantization range of k. used if squant=1. (default:16)
    -range_v    per-tensor quantization range of v. used if squant=1. (default:16)
    -range_p    per-tensor quantization range of p [e^(s-m)]. used if squant=1. (default:1)
    -range_o    per-tensor quantization range of o (p*v). used if squant=1. (default:16)
     -squant    if using static quantization fusion or not. auto: fp8 will default use squant, other will not (default:auto)
-                 0: no static quant(not implemented) 1: apply scale_p and scale_o with respect to P and O.
-                 calculate scale_s, scale_p, scale_o according to range_q, range_k, range_v, range_p, range_o
+                0: no static quant(not implemented) 1: apply scale_p and scale_o with respect to P and O.
+                calculate scale_s, scale_p, scale_o according to range_q, range_k, range_v, range_p, range_o
      -iperm    permute input (default:1)
-                 if true, will be b*h*s*d, else b*s*h*d
+                if true, will be b*h*s*d, else b*s*h*d
      -operm    permute output (default:1)
       -bias    n or 0, no bias (default:n)
-                 e(lementwise) or 1, elementwise bias with 1*1*s*s. e:1, 1*h*s*s. e:2, b*h*s*s
-                 a(libi) or 2, alibi with 1*h. a:1, b*h
+                e(lementwise) or 1, elementwise bias with 1*1*s*s. e:1, 1*h*s*s. e:2, b*h*s*s
+                a(libi) or 2, alibi with 1*h. a:1, b*h
       -prec    data type. fp16/bf16/fp8/bf8 (default:fp16)
       -mask    0: no mask, 1: top-left(same as 't'), 2:bottom-right(same as 'b') (default:0)
-                 't', top-left causal mask, 'b', bottom-r causal mask
-                 't:l,r', top-left sliding window attn(swa) with FA style left right size
-                 'b:l,r', bottom-r sliding window attn(swa) with FA style left right size
-                 'xt:window_size', xformer style masking from top-left, window_size negative is causal, positive is swa
-                 'xb:window_size', xformer style masking from bottom-r, window_size negative is causal, positive is swa
-                 'g:y,x', generic attention mask coordinate with y/x size (only debug purpose for now)
+                't', top-left causal mask, 'b', bottom-r causal mask
+                't:l,r', top-left sliding window attn(swa) with FA style left right size
+                'b:l,r', bottom-r sliding window attn(swa) with FA style left right size
+                'xt:window_size', xformer style masking from top-left, window_size negative is causal, positive is swa
+                'xb:window_size', xformer style masking from bottom-r, window_size negative is causal, positive is swa
+                'g:y,x', generic attention mask coordinate with y/x size (only debug purpose for now)
    -vlayout    r for row-major(seqlen*hdim), c for col-major(hdim*seqlen) (default:r)
        -lse    0 not store lse, 1 store lse (default:0)
      -kname    if set to 1 will print kernel name (default:0)
       -init    init method. ui, uniform random int, ni, normalized random int (default:uf)
-                 uf, uniform random float, nf, normalized random float, tf, trig float, uf:q, quantization
+                uf, uniform random float, nf, normalized random float, tf, trig float, uf:q, quantization
       -seed    random seed used for initializing input tensors. 0 for non-deterministic seed (default:11939)
+  -drop_seed    seed for random number generator (default:1)
+-drop_offset    offset for random number generator (default:0)
+ -drop_prefs    seed and offset values are present on GPU; 0 - host, 1 - device/GPU (default:0)
     -warmup    number of iterations before benchmark the kernel (default:5)
     -repeat    number of iterations to benchmark the kernel (default:20)
 ```
-Example: `./bin/tile_example_fmha_fwd -b=1 -h=16 -s=16384 -d=128` will run a fmha case with batch=1, nhead=16, sequence length=16384, hdim=128, fp16 case.
+Example 1: `./bin/tile_example_fmha_fwd -b=1 -h=16 -s=16384 -d=128` will run a fmha case with batch=1, nhead=16, sequence length=16384, hdim=128, fp16 case.
+Example 2: `./bin/tile_example_fmha_fwd -b=1 -h=8 -s=16384 -d=64 -drop_prefs=1 -drop_seed=10 -drop_offset=1234` will run a fmha case with 
+  batch=1, nhead=8, sequence length=16384, hdim=64, drop_seed=0 (in GPU memory), drop_offset=1234 (in GPU memory) fp16 case

 ## support features
 Currently we are still in rapid development stage, so more features/optimizations will be coming soon.

--- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py
+++ b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py
@@ -600,8 +600,8 @@ def get_fwd_splitkv_blobs(kernel_filter : Optional[str], receipt, mask_impl) ->
                # TODO: use async pipeline when compiler is more stable 
                if hdim == 256 or hdim in [32, 64, 128]:
                # if True:
-                    pipelines.append(Pipeline('qr', 'row', 'f', 'f', 'f', 'f', bias, lse, squant, pagedkv, mask))
-                    pipelines.append(Pipeline('qr', 'col', 'f', 'f', 'f', 'f', bias, lse, squant, pagedkv, mask))
+                    pipelines.append(Pipeline('qr', 'row', 'f', 't', 'f', 'f', bias, lse, squant, pagedkv, mask))
+                    pipelines.append(Pipeline('qr', 'col', 'f', 't', 'f', 'f', bias, lse, squant, pagedkv, mask))

                    pipelines.append(Pipeline('qr', 'row', 't', 't', 't', 't', bias, lse, squant, pagedkv, mask))
                    pipelines.append(Pipeline('qr', 'col', 't', 't', 't', 't', bias, lse, squant, pagedkv, mask))

--- a/example/ck_tile/01_fmha/fmha_bwd.cpp
+++ b/example/ck_tile/01_fmha/fmha_bwd.cpp
@@ -85,6 +85,9 @@ auto create_args(int argc, char* argv[])
        .insert("p_drop", "0", "0~1 probability of dropout")
        .insert("drop_seed", "1", "seed for random number generator")
        .insert("drop_offset", "0", "offset for random number generator")
+        .insert("drop_prefs",
+                "0",
+                "seed and offset values are present on GPU; 0 - host, 1 - device/GPU")
        .insert("timer", "gpu", "gpu:gpu timer, cpu:cpu timer")
        .insert("warmup", "5", "number of iterations before benchmark the kernel")
        .insert("repeat", "20", "number of iterations to benchmark the kernel")
@@ -158,6 +161,8 @@ bool run(const ck_tile::ArgParser& arg_parser)
    float p_drop         = arg_parser.get_float("p_drop");
    uint64_t drop_seed   = arg_parser.get_uint64("drop_seed");
    uint64_t drop_offset = arg_parser.get_uint64("drop_offset");
+    bool drop_prefs      = arg_parser.get_bool("drop_prefs");
+
    if(use_dbias && bias.type != bias_enum::elementwise_bias)
    {
        std::cerr << "dbias only exists when bias type is elementwise" << std::endl;
@@ -381,6 +386,8 @@ bool run(const ck_tile::ArgParser& arg_parser)
    ck_tile::DeviceMem dbias_buf(dbias_host.get_element_space_size_in_bytes());
    ck_tile::DeviceMem seqstart_q(seqstart_q_host.size() * sizeof(int32_t));
    ck_tile::DeviceMem seqstart_k(seqstart_k_host.size() * sizeof(int32_t));
+    ck_tile::DeviceMem drop_seed_buf(drop_prefs ? sizeof(uint64_t) : 0);
+    ck_tile::DeviceMem drop_offset_buf(drop_prefs ? sizeof(uint64_t) : 0);
    ck_tile::DeviceMem alibi_slope_buf(alibi_slope_host.get_element_space_size_in_bytes());
    ck_tile::DeviceMem dq_acc_buf(dq_acc_host.get_element_space_size_in_bytes());

@@ -391,6 +398,8 @@ bool run(const ck_tile::ArgParser& arg_parser)
    do_buf.ToDevice(do_host.data());
    seqstart_q.ToDevice(seqstart_q_host.data());
    seqstart_k.ToDevice(seqstart_k_host.data());
+    drop_seed_buf.ToDevice(drop_prefs ? &drop_seed : nullptr);
+    drop_offset_buf.ToDevice(drop_prefs ? &drop_offset : nullptr);
    alibi_slope_buf.ToDevice(alibi_slope_host.data());

    // clang-format off
@@ -472,6 +481,18 @@ bool run(const ck_tile::ArgParser& arg_parser)
        const ck_tile::index_t split_stride_dq_acc =
            (shape_batch * nhead * shape_seqlen_q * hdim_q);

+        const auto drop_seed_offset = [&]() -> decltype(fmha_bwd_args::drop_seed_offset) {
+            if(drop_prefs)
+            {
+                return std::make_pair(drop_seed_buf.GetDeviceBuffer(),
+                                      drop_offset_buf.GetDeviceBuffer());
+            }
+            else
+            {
+                return std::make_pair(drop_seed, drop_offset);
+            }
+        }();
+
        return fmha_bwd_args{q_buf.GetDeviceBuffer(),
                             k_buf.GetDeviceBuffer(),
                             v_buf.GetDeviceBuffer(),
@@ -545,7 +566,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
                             static_cast<ck_tile::index_t>(mask.type),
                             p_drop,
                             p_undrop,
-                             {drop_seed, drop_offset}};
+                             drop_seed_offset};
    }();

    float ave_time = fmha_bwd(fmha_traits, fmha_args, stream_config);

--- a/example/ck_tile/01_fmha/fmha_bwd.hpp
+++ b/example/ck_tile/01_fmha/fmha_bwd.hpp
@@ -9,7 +9,10 @@
 #include "ck_tile/ops/epilogue.hpp"
 #include "mask.hpp"
 #include "bias.hpp"
+
 #include <type_traits>
+#include <utility>
+#include <variant>

 template <typename DataType>
 struct FmhaBwdTypeConfig;
@@ -135,7 +138,8 @@ struct fmha_bwd_args
    ck_tile::index_t mask_type;
    float p_drop;
    float p_undrop;
-    std::tuple<uint64_t, uint64_t> drop_seed_offset;
+    std::variant<std::pair<uint64_t, uint64_t>, std::pair<const void*, const void*>>
+        drop_seed_offset;
 };

 template <typename FmhaBwdDQDKDVKernel>

--- a/example/ck_tile/01_fmha/fmha_fwd.cpp
+++ b/example/ck_tile/01_fmha/fmha_fwd.cpp
@@ -122,6 +122,9 @@ auto create_args(int argc, char* argv[])
        .insert("p_drop", "0", "0~1 probability of dropout")
        .insert("drop_seed", "1", "seed for random number generator")
        .insert("drop_offset", "0", "offset for random number generator")
+        .insert("drop_prefs",
+                "0",
+                "seed and offset values are present on GPU; 0 - host, 1 - device/GPU")
        .insert("timer", "gpu", "gpu:gpu timer, cpu:cpu timer")
        .insert(
            "rotary_dim", "0", "RoPE rotary dimension. rotary_dim <= 0 means not apply RoPE at all")
@@ -442,6 +445,8 @@ bool run(const ck_tile::ArgParser& arg_parser)
    float p_drop         = arg_parser.get_float("p_drop");
    uint64_t drop_seed   = arg_parser.get_uint64("drop_seed");
    uint64_t drop_offset = arg_parser.get_uint64("drop_offset");
+    bool drop_prefs      = arg_parser.get_bool("drop_prefs");
+
    if(p_drop < 0.0f || p_drop > 1.0f)
    {
        std::cerr << "The value of p_drop should be 0~1" << std::endl;
@@ -552,16 +557,33 @@ bool run(const ck_tile::ArgParser& arg_parser)
    }
 #endif

-    auto get_lengths = [&](bool permute,
-                           ck_tile::index_t b /*batch*/,
-                           ck_tile::index_t h /*nhead*/,
-                           ck_tile::index_t s /*seqlen*/,
-                           ck_tile::index_t d /*hdim*/) {
-        if(permute)
-            return std::array<ck_tile::index_t, 4>{b, h, s, d};
-        else
-            return std::array<ck_tile::index_t, 4>{b, s, h, d};
-    };
+    struct
+    {
+        auto operator()(bool permute,
+                        ck_tile::index_t b /*batch*/,
+                        ck_tile::index_t h /*nhead*/,
+                        ck_tile::index_t s /*seqlen*/,
+                        ck_tile::index_t d /*hdim*/)
+        {
+            if(permute)
+                return std::array<ck_tile::index_t, 4>{b, h, s, d};
+            else
+                return std::array<ck_tile::index_t, 4>{b, s, h, d};
+        }
+
+        auto operator()(bool permute,
+                        ck_tile::index_t ns /*num_splits*/,
+                        ck_tile::index_t b /*batch*/,
+                        ck_tile::index_t h /*nhead*/,
+                        ck_tile::index_t s /*seqlen*/,
+                        ck_tile::index_t d /*hdim*/)
+        {
+            if(permute)
+                return std::array<ck_tile::index_t, 5>{ns, b, h, s, d};
+            else
+                return std::array<ck_tile::index_t, 5>{ns, b, s, h, d};
+        }
+    } get_lengths;

    bool is_v_rowmajor = vlayout == std::string("r");

@@ -617,7 +639,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
            : std::array<ck_tile::index_t, 4>{1, 1, 1, 1});
    ck_tile::HostTensor<OaccDataType> o_acc_host(
        1 < num_splits || use_kvcache
-            ? std::array<ck_tile::index_t, 5>{num_splits, batch, nhead, max_seqlen_q, hdim_v}
+            ? get_lengths(o_perm, num_splits, shape_batch, nhead, shape_seqlen_q, hdim_v)
            : std::array<ck_tile::index_t, 5>{1, 1, 1, 1, 1});

    // batch mode of lse data layout is [batch, nhead, seqlen_q]
@@ -739,6 +761,8 @@ bool run(const ck_tile::ArgParser& arg_parser)
        need_append_kvcache ? cache_seqlen_ks.size() * sizeof(int32_t) : 0);
    ck_tile::DeviceMem rotary_cos_buf(rotary_cos_host.get_element_space_size_in_bytes());
    ck_tile::DeviceMem rotary_sin_buf(rotary_sin_host.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem drop_seed_buf(drop_prefs ? sizeof(uint64_t) : 0);
+    ck_tile::DeviceMem drop_offset_buf(drop_prefs ? sizeof(uint64_t) : 0);
    ck_tile::DeviceMem randval_buf(randval_host.get_element_space_size_in_bytes());
    ck_tile::DeviceMem alibi_slope_buf(alibi_slope_host.get_element_space_size_in_bytes());
    ck_tile::DeviceMem block_table_buf(block_table_host.get_element_space_size_in_bytes());
@@ -757,6 +781,8 @@ bool run(const ck_tile::ArgParser& arg_parser)
    cache_seqlen_k_buf.ToDevice(need_append_kvcache ? cache_seqlen_ks.data() : nullptr);
    rotary_cos_buf.ToDevice(rotary_cos_host.data());
    rotary_sin_buf.ToDevice(rotary_sin_host.data());
+    drop_seed_buf.ToDevice(drop_prefs ? &drop_seed : nullptr);
+    drop_offset_buf.ToDevice(drop_prefs ? &drop_offset : nullptr);
    alibi_slope_buf.ToDevice(alibi_slope_host.data());
    block_table_buf.ToDevice(block_table_host.data());
    cache_batch_idx_buf.ToDevice(cache_batch_idx_host.data());
@@ -854,7 +880,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
        }();
        const ck_tile::index_t stride_bias    = (i_perm ? shape_seqlen_k : 1 * shape_seqlen_k);
        const ck_tile::index_t stride_randval = (max_seqlen_k);
-        const ck_tile::index_t stride_o_acc   = hdim_v;
+        const ck_tile::index_t stride_o_acc   = (o_perm ? hdim_v : nhead * hdim_v);
        const ck_tile::index_t stride_o       = (o_perm ? hdim_v : nhead * hdim_v);
        // setup nhead_stride_* arguments
        const ck_tile::index_t nhead_stride_q = (i_perm ? shape_seqlen_q * hdim_q : hdim_q);
@@ -881,7 +907,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
        const ck_tile::index_t nhead_stride_randval = (shape_seqlen_q * max_seqlen_k);
        const ck_tile::index_t nhead_stride_lse     = shape_seqlen_q;
        const ck_tile::index_t nhead_stride_lse_acc = shape_seqlen_q;
-        const ck_tile::index_t nhead_stride_o_acc   = (max_seqlen_q * hdim_v);
+        const ck_tile::index_t nhead_stride_o_acc   = (o_perm ? shape_seqlen_q * hdim_v : hdim_v);
        const ck_tile::index_t nhead_stride_o       = (o_perm ? shape_seqlen_q * hdim_v : hdim_v);
        // setup batch_stride_* arguments
        const ck_tile::index_t batch_stride_q = (nhead * shape_seqlen_q * hdim_q);
@@ -897,12 +923,12 @@ bool run(const ck_tile::ArgParser& arg_parser)
        const ck_tile::index_t batch_stride_randval = (nhead * shape_seqlen_q * max_seqlen_k);
        const ck_tile::index_t batch_stride_lse     = (nhead * shape_seqlen_q);
        const ck_tile::index_t batch_stride_lse_acc = (nhead * shape_seqlen_q);
-        const ck_tile::index_t batch_stride_o_acc   = (nhead * max_seqlen_q * hdim_v);
+        const ck_tile::index_t batch_stride_o_acc   = (nhead * shape_seqlen_q * hdim_v);
        const ck_tile::index_t batch_stride_o       = (nhead * shape_seqlen_q * hdim_v);
        const ck_tile::index_t batch_stride_block_table = (max_num_page_blocks / batch);
        // setup split_stride_* arguments (only used in split-kv kernel)
        const ck_tile::index_t split_stride_lse_acc = (shape_batch * nhead * shape_seqlen_q);
-        const ck_tile::index_t split_stride_o_acc   = (batch * nhead * max_seqlen_q * hdim_v);
+        const ck_tile::index_t split_stride_o_acc = (shape_batch * nhead * shape_seqlen_q * hdim_v);

        args.q_ptr = q_buf.GetDeviceBuffer();
        args.k_ptr = k_buf.GetDeviceBuffer();
@@ -996,9 +1022,17 @@ bool run(const ck_tile::ArgParser& arg_parser)
                args.nhead_stride_randval = nhead_stride_randval;
                args.batch_stride_randval = batch_stride_randval;

-                args.p_drop           = p_drop;
-                args.s_randval        = s_randval;
-                args.drop_seed_offset = std::tie(drop_seed, drop_offset);
+                args.p_drop    = p_drop;
+                args.s_randval = s_randval;
+                if(drop_prefs)
+                {
+                    args.drop_seed_offset = std::make_pair(drop_seed_buf.GetDeviceBuffer(),
+                                                           drop_offset_buf.GetDeviceBuffer());
+                }
+                else
+                {
+                    args.drop_seed_offset = std::make_pair(drop_seed, drop_offset);
+                }
            }
            else if constexpr(std::is_same_v<fmha_fwd_splitkv_args, std::decay_t<decltype(args)>>)
            {

--- a/example/ck_tile/01_fmha/fmha_fwd.hpp
+++ b/example/ck_tile/01_fmha/fmha_fwd.hpp
@@ -13,6 +13,8 @@
 #include "rotary.hpp"

 #include <type_traits>
+#include <utility>
+#include <variant>

 template <typename DataType>
 struct FmhaFwdTypeConfig;
@@ -144,7 +146,9 @@ struct fmha_fwd_args

    float p_drop;
    bool s_randval;
-    std::tuple<uint64_t, uint64_t> drop_seed_offset;
+
+    std::variant<std::pair<uint64_t, uint64_t>, std::pair<const void*, const void*>>
+        drop_seed_offset;
 };

 struct fmha_fwd_splitkv_args
@@ -398,10 +402,8 @@ auto fmha_fwd_splitkv_create_kargs_and_grids(fmha_fwd_splitkv_args args)
                                     args.nhead_stride_bias,
                                     args.nhead_stride_lse_acc,
                                     args.nhead_stride_o_acc,
-                                     args.batch_stride_k,
-                                     args.batch_stride_v,
-                                     args.batch_stride_lse_acc,
-                                     args.batch_stride_o_acc,
+                                     args.batch_stride_k, // only used for paged-kvcache
+                                     args.batch_stride_v, // only used for paged-kvcache
                                     args.split_stride_lse_acc,
                                     args.split_stride_o_acc,
                                     args.window_size_left,
@@ -475,7 +477,6 @@ auto fmha_fwd_splitkv_combine_create_kargs_and_grids(fmha_fwd_splitkv_args args)
                                     args.lse_ptr,
                                     args.o_ptr,
                                     args.batch,
-                                     args.max_seqlen_q,
                                     args.seqstart_q_ptr,
                                     args.hdim_v,
                                     args.num_splits,
@@ -486,7 +487,6 @@ auto fmha_fwd_splitkv_combine_create_kargs_and_grids(fmha_fwd_splitkv_args args)
                                     args.nhead_stride_o_acc,
                                     args.nhead_stride_lse,
                                     args.nhead_stride_o,
-                                     args.batch_stride_o_acc,
                                     args.split_stride_lse_acc,
                                     args.split_stride_o_acc);
        }
@@ -497,7 +497,6 @@ auto fmha_fwd_splitkv_combine_create_kargs_and_grids(fmha_fwd_splitkv_args args)
                                     args.lse_ptr,
                                     args.o_ptr,
                                     args.batch,
-                                     args.max_seqlen_q,
                                     args.seqlen_q,
                                     args.hdim_v,
                                     args.num_splits,

--- a/example/ck_tile/02_layernorm2d/README.md
+++ b/example/ck_tile/02_layernorm2d/README.md
@@ -6,7 +6,8 @@ This folder contains example for Layernorm2D forward using ck_tile tile-programm
 ```
 # in the root of ck_tile
 mkdir build && cd build
-sh ../script/cmake-ck-dev.sh  ../ <arch>  # you can replace this <arch> to gfx90a, gfx942...
+# you can replace <arch> with the appropriate architecture (for example gfx90a or gfx942) or leave it blank
+sh ../script/cmake-ck-dev.sh  ../ <arch>
 make tile_example_layernorm2d_fwd -j
 ```
 This will result in an executable `build/bin/tile_example_layernorm2d_fwd`