Merge branch 'develop' into ck_migraphx_integration

dda18da0 · Illia Silin · GitHub · 3b2a7aee · 4cf70b36 · dda18da0
Unverified Commit dda18da0 authored Oct 14, 2024 by Illia Silin Committed by GitHub Oct 14, 2024
20 changed files
--- a/example/01_gemm/gemm_xdl_fp8_bf8.cpp
+++ b/example/01_gemm/gemm_xdl_fp8_bf8.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 #include "common.hpp"
@@ -44,6 +44,17 @@ using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataTyp
                                                                        ComputeTypeA,
                                                                        ComputeTypeB>;
+using ReferenceGemmInstanceGPU = ck::tensor_operation::device::ReferenceGemm<ALayout,
+                                                                             BLayout,
+                                                                             CLayout,
+                                                                             ADataType,
+                                                                             BDataType,
+                                                                             CDataType,
+                                                                             AccDataType,
+                                                                             AElementOp,
+                                                                             BElementOp,
+                                                                             CElementOp>; // GPU reference GEMM type consumed by run_gemm_example.inc. NOTE(review): the host ReferenceGemmInstance above passes ComputeTypeA/ComputeTypeB, this alias does not - confirm the device reference uses the intended compute types.
 #include "run_gemm_example.inc"
 int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); }
--- a/example/01_gemm/gemm_xdl_int8.cpp
+++ b/example/01_gemm/gemm_xdl_int8.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 #include "common.hpp"
@@ -33,6 +33,17 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle
 using ReferenceGemmInstance = ck::tensor_operation::host::
    ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>;
+using ReferenceGemmInstanceGPU = ck::tensor_operation::device::ReferenceGemm<ALayout,
+                                                                             BLayout,
+                                                                             CLayout,
+                                                                             ADataType,
+                                                                             BDataType,
+                                                                             CDataType,
+                                                                             AccDataType,
+                                                                             AElementOp,
+                                                                             BElementOp,
+                                                                             CElementOp>; // GPU reference GEMM type; must be declared before run_gemm_example.inc is included, which instantiates it for GPU verification.
 #include "run_gemm_example.inc"
 int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); }
--- a/example/01_gemm/gemm_xdl_lds_direct_load_fp16.cpp
+++ b/example/01_gemm/gemm_xdl_lds_direct_load_fp16.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
 #include <iostream>
@@ -53,6 +53,17 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle
 using ReferenceGemmInstance = ck::tensor_operation::host::
    ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>;
+using ReferenceGemmInstanceGPU = ck::tensor_operation::device::ReferenceGemm<ALayout,
+                                                                             BLayout,
+                                                                             CLayout,
+                                                                             ADataType,
+                                                                             BDataType,
+                                                                             CDataType,
+                                                                             AccDataType,
+                                                                             AElementOp,
+                                                                             BElementOp,
+                                                                             CElementOp>; // GPU reference GEMM type; must be declared before run_gemm_example.inc is included, which instantiates it for GPU verification.
 #include "run_gemm_example.inc"
 int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); }
--- a/example/01_gemm/gemm_xdl_lds_direct_load_fp32.cpp
+++ b/example/01_gemm/gemm_xdl_lds_direct_load_fp32.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
 #include <iostream>
@@ -52,6 +52,17 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle
 using ReferenceGemmInstance = ck::tensor_operation::host::
    ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>;
+using ReferenceGemmInstanceGPU = ck::tensor_operation::device::ReferenceGemm<ALayout,
+                                                                             BLayout,
+                                                                             CLayout,
+                                                                             ADataType,
+                                                                             BDataType,
+                                                                             CDataType,
+                                                                             AccDataType,
+                                                                             AElementOp,
+                                                                             BElementOp,
+                                                                             CElementOp>; // GPU reference GEMM type; must be declared before run_gemm_example.inc is included, which instantiates it for GPU verification.
 #include "run_gemm_example.inc"
 int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); }
--- a/example/01_gemm/gemm_xdl_streamk.cpp
+++ b/example/01_gemm/gemm_xdl_streamk.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 #include "common.hpp"
@@ -44,6 +44,17 @@ using DeviceGemmInstance = DeviceGemmStreamK;
 using ReferenceGemmInstance = ck::tensor_operation::host::
    ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>;
+using ReferenceGemmInstanceGPU = ck::tensor_operation::device::ReferenceGemm<ALayout,
+                                                                             BLayout,
+                                                                             CLayout,
+                                                                             ADataType,
+                                                                             BDataType,
+                                                                             CDataType,
+                                                                             AccDataType,
+                                                                             AElementOp,
+                                                                             BElementOp,
+                                                                             CElementOp>; // GPU reference GEMM type; must be declared before run_gemm_example.inc is included, which instantiates it for GPU verification.
 #include "run_gemm_example.inc"
 int main(int argc, char* argv[]) { return !run_gemm_streamk_example(argc, argv); }
--- a/example/01_gemm/gemm_xdl_wavelet_fp16.cpp
+++ b/example/01_gemm/gemm_xdl_wavelet_fp16.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 #include "common.hpp"
@@ -37,6 +37,17 @@ using DeviceGemmInstance = DeviceGemmInstance;
 using ReferenceGemmInstance = ck::tensor_operation::host::
    ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>;
+using ReferenceGemmInstanceGPU = ck::tensor_operation::device::ReferenceGemm<ALayout,
+                                                                             BLayout,
+                                                                             CLayout,
+                                                                             ADataType,
+                                                                             BDataType,
+                                                                             CDataType,
+                                                                             AccDataType,
+                                                                             AElementOp,
+                                                                             BElementOp,
+                                                                             CElementOp>; // GPU reference GEMM type; must be declared before run_gemm_example.inc is included, which instantiates it for GPU verification.
 #include "run_gemm_example.inc"
 int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); }
--- a/example/01_gemm/run_gemm_example.inc
+++ b/example/01_gemm/run_gemm_example.inc
@@ -173,6 +173,7 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
    Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
    Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
+    Tensor<CDataType> c_m_n_device_ref_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
    std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
    std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
@@ -193,6 +194,8 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
    DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
    DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
    DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize());
+    DeviceMem c_m_n_device_ref_buf(sizeof(CDataType) *
+                                   c_m_n_device_ref_result.mDesc.GetElementSpaceSize());
    a_m_k_device_buf.ToDevice(a_m_k.mData.data());
    b_k_n_device_buf.ToDevice(b_k_n.mData.data());
@@ -325,14 +328,18 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
              << gemm.GetTypeString() << std::endl;
+    bool pass = true;
    if(config.do_verification)
    {
+        // CPU verification
        auto ref_gemm    = ReferenceGemmInstance{};
        auto ref_invoker = ref_gemm.MakeInvoker();
        auto ref_argument = ref_gemm.MakeArgument(
            a_m_k, b_k_n, c_m_n_host_result, a_element_op, b_element_op, c_element_op);
+        std::cout << "Running verification on CPU." << std::endl;
        ref_invoker.Run(ref_argument);
 #ifdef BUILD_INT4_EXAMPLE
@@ -346,15 +353,42 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
 #else
        c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data());
-        return ck::utils::check_err(c_m_n_device_result,
+        pass &= !ck::utils::check_err(c_m_n_device_result,
-                                    c_m_n_host_result,
+                                      c_m_n_host_result,
-                                    "Error: Incorrect results!",
+                                      "Error: Incorrect results!",
-                                    get_rtol<CDataType>(),
+                                      get_rtol<CDataType>(),
-                                    get_atol<CDataType>());
+                                      get_atol<CDataType>());
 #endif
+        // GPU verification
+        auto ref_gemm_gpu    = ReferenceGemmInstanceGPU{};
+        auto ref_invoker_gpu = ref_gemm_gpu.MakeInvoker();
+        auto ref_argument_gpu = ref_gemm_gpu.MakeArgument(
+            static_cast<ADataType*>(a_m_k_device_buf.GetDeviceBuffer()),
+            static_cast<BDataType*>(b_k_n_device_buf.GetDeviceBuffer()),
+            static_cast<CDataType*>(c_m_n_device_ref_buf.GetDeviceBuffer()),
+            M,
+            N,
+            K,
+            a_element_op,
+            b_element_op,
+            c_element_op);
+        std::cout << "Running verification on GPU." << std::endl;
+        ref_invoker_gpu.Run(ref_argument_gpu, StreamConfig{});
+        c_m_n_device_ref_buf.FromDevice(c_m_n_device_ref_result.mData.data());
+        c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data());
+        pass &= !ck::utils::check_err(c_m_n_device_result,
+                                      c_m_n_device_ref_result,
+                                      "Error: Incorrect results!",
+                                      get_rtol<CDataType>(),
+                                      get_atol<CDataType>());
    }
-    return true;
+    return !pass;
 }
 bool run_gemm_example(int argc, char* argv[])

--- a/example/01_gemm/run_gemm_example_streamk_v2.inc
+++ b/example/01_gemm/run_gemm_example_streamk_v2.inc
@@ -117,9 +117,9 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
    auto f_get_default_stride =
        [](std::size_t row, std::size_t col, ck::index_t stride, auto layout) {
-            if(stride == -1)
+            if(stride == 0)
            {
-                // give a chance if stride is -1, return a default packed stride
+                // give a chance if stride is 0, return a default packed stride
                if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
                {
                    return static_cast<std::size_t>(col);

--- a/example/44_elementwise_permute/CMakeLists.txt
+++ b/example/44_elementwise_permute/CMakeLists.txt
@@ -5,3 +5,4 @@ add_example_executable(example_elementwise_permute_4D_fp32_col elementwise_permu
 add_example_executable(example_elementwise_permute_4D_fp16_col elementwise_permute_4D_fp16_col.cpp)
 add_example_executable(example_elementwise_binary_4D_fp16 elementwise_binary_4D_fp16.cpp)
 add_example_executable(example_elementwise_trinary_4D_fp16 elementwise_trinary_4D_fp16.cpp)
+# fp16 -> fp8 scale + permute + amax example; the target carries the same example_ prefix as the other entries in this file.
+add_example_executable(example_elementwise_scale_permute_amax_2D_fp16_fp8 elementwise_scale_permute_amax_2D_fp16_fp8.cpp)
--- a/example/44_elementwise_permute/elementwise_scale_permute_amax_2D_fp16_fp8.cpp
+++ b/example/44_elementwise_permute/elementwise_scale_permute_amax_2D_fp16_fp8.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+#include <iostream>
+#include <cstdlib>
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_elementwise_dynamic_vector_dims_impl.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp"
+#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_elementwise.hpp"
+#include "ck/library/utility/algorithm.hpp"
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/utility/reduction_enums.hpp"
+using F16 = ck::half_t;
+using F32 = float;
+using F8  = ck::f8_t;
+using InputDataType  = F16;
+using ScaleDataType  = F32;
+using OutputDataType = F8;
+static constexpr ck::index_t NumDim = 2;
+constexpr ck::ReduceTensorOp ReduceOpId = ck::ReduceTensorOp::MAX;
+constexpr bool PropagateNan             = true;
+constexpr bool OutputIndex              = false;
+using ReduceOperation = typename ck::reduce_binary_operator<ReduceOpId>::opType;
+// Elementwise functor with one input and two outputs: the input is converted to ScaleDataType,
+// multiplied by alpha_, converted to OutputDataType, and that single value is stored in both
+// outputs.
+struct ScalePassThrough
+{
+    ScalePassThrough(const float alpha = 1.f) : alpha_{alpha} {}
+    // y0 and y1 both receive type_convert<OutputDataType>(type_convert<ScaleDataType>(x0) * alpha_).
+    __host__ __device__ constexpr void
+    operator()(OutputDataType& y0, OutputDataType& y1, const InputDataType& x0) const
+    {
+        const ScaleDataType scaled     = ck::type_convert<ScaleDataType>(x0) * alpha_;
+        const OutputDataType converted = ck::type_convert<OutputDataType>(scaled);
+        y0                             = converted;
+        y1                             = converted;
+    }
+    // Multiplier applied before the conversion to OutputDataType.
+    const ScaleDataType alpha_;
+};
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using UnaryAbs    = ck::tensor_operation::element_wise::UnaryAbs;
+using DeviceElementwisePermuteInstance = ck::tensor_operation::device::DeviceElementwiseImpl<
+    ck::Tuple<InputDataType>,                  // InDataTypeTuple
+    ck::Tuple<OutputDataType, OutputDataType>, // OutDataTypeTuple
+    ScalePassThrough,                          // Elementwise
+    NumDim,                                    // NumDim
+    256,                                       // BlockSize
+    128,                                       // M0PerBlock
+    128,                                       // M1PerBlock
+    8,                                         // M0PerThread
+    8,                                         // M1PerThread
+    ck::Sequence<1, 0>,                        // ThreadClusterArrangeOrder
+    ck::Sequence<8>,                           // InScalarPerVectorSeq
+    ck::Sequence<8, 1>>;                       // OutScalarPerVectorSeq
+using DeviceReduceInstance =
+    ck::tensor_operation::device::DeviceReduceMultiBlock<OutputDataType,
+                                                         OutputDataType,
+                                                         OutputDataType,
+                                                         NumDim,
+                                                         NumDim,
+                                                         ReduceOperation,
+                                                         UnaryAbs,
+                                                         PassThrough,
+                                                         ck::InMemoryDataOperationEnum::Set,
+                                                         PropagateNan,
+                                                         OutputIndex,
+                                                         false, // HaveIndexInputIfOutputIndex
+                                                         1024,  // BlockSize
+                                                         1,     // MThreadClusterSize
+                                                         1024,  // KThreadClusterSize
+                                                         1,     // MThreadSliceSize
+                                                         16,    // KThreadSliceSize
+                                                         1,     // InSrcVectorDim
+                                                         16,    // InSrceVectorSize
+                                                         1>;    // OutDstVectorSize
+// Host reference for the scale / cast / amax example.
+// For every (m, k) element of `input` it runs ScalePassThrough(scale), writes the two results to
+// the two output tensors at the same (m, k) index, and folds the absolute value of the scaled
+// result into host_output_amax(0) with ck::math::max.
+// host_output_amax(0) is read before it is written, so its incoming value participates in the max.
+void reference_scale_permute_amax(Tensor<InputDataType>& input,
+                                  Tensor<OutputDataType>& host_output_scaled_casted_transposed,
+                                  Tensor<OutputDataType>& host_output_scaled_casted,
+                                  Tensor<OutputDataType>& host_output_amax,
+                                  const float scale)
+{
+    const ck::index_t num_rows = input.GetLengths()[0];
+    const ck::index_t num_cols = input.GetLengths()[1];
+    ScalePassThrough scale_and_cast(scale);
+    for(ck::index_t row = 0; row < num_rows; ++row)
+    {
+        for(ck::index_t col = 0; col < num_cols; ++col)
+        {
+            OutputDataType scaled, scaled_copy;
+            scale_and_cast(scaled, scaled_copy, input(row, col));
+            host_output_scaled_casted(row, col)            = scaled;
+            host_output_scaled_casted_transposed(row, col) = scaled_copy;
+            // The absolute value is taken in float, then converted back to the output type.
+            const auto magnitude               = ck::math::abs(ck::type_convert<float>(scaled));
+            const OutputDataType magnitude_out = ck::type_convert<OutputDataType>(magnitude);
+            host_output_amax(0) = ck::math::max(magnitude_out, host_output_amax(0));
+        }
+    }
+}
+// Example driver.
+// 1. Runs DeviceElementwisePermuteInstance with ScalePassThrough{scale} on an M x K input,
+//    producing two outputs: one with strides {1, M} and one with the input strides {K, 1}.
+// 2. Runs DeviceReduceInstance over the second output (UnaryAbs + MAX) to obtain one amax value.
+// 3. Compares all three device results against reference_scale_permute_amax.
+// Usage: <exe> [M K]. M and K keep their defaults (1024) unless exactly two arguments are given.
+// Returns 0 when every check passes, 1 on a failed check or on invalid M/K arguments.
+int main(int argc, char* argv[])
+{
+    const bool do_verification = true;
+    const bool time_kernel     = true;
+    const float scale = 2.f;
+    ck::index_t M = 1024;
+    ck::index_t K = 1024;
+    if(argc == 3)
+    {
+        // std::stoi throws on non-numeric or out-of-range text; report that as a usage error
+        // instead of terminating through an uncaught exception.
+        try
+        {
+            M = std::stoi(argv[1]);
+            K = std::stoi(argv[2]);
+        }
+        catch(const std::exception&)
+        {
+            std::cerr << "Usage: " << argv[0] << " [M K]  (M and K must be integers)" << std::endl;
+            return 1;
+        }
+        // M and K are used below as tensor lengths, strides and buffer sizes.
+        if(M <= 0 || K <= 0)
+        {
+            std::cerr << "M and K must be positive, got M=" << M << ", K=" << K << std::endl;
+            return 1;
+        }
+    }
+    std::array<ck::index_t, 2> dims        = {M, K};
+    std::array<ck::index_t, 2> in_strides  = {K, 1};
+    std::array<ck::index_t, 2> out_strides = {1, M};
+    Tensor<InputDataType> input(dims, in_strides);
+    Tensor<OutputDataType> output_scaled_casted_transposed(dims, out_strides);
+    Tensor<OutputDataType> output_scaled_casted(dims, in_strides);
+    Tensor<OutputDataType> output_amax({1});
+    input.GenerateTensorValue(GeneratorTensor_3<InputDataType>{0.0, 1.0});
+    DeviceMem input_dev_buf(sizeof(InputDataType) * input.mDesc.GetElementSpaceSize());
+    DeviceMem output_scaled_casted_transposed_dev_buf(
+        sizeof(OutputDataType) * output_scaled_casted_transposed.mDesc.GetElementSpaceSize());
+    DeviceMem output_scaled_casted_dev_buf(sizeof(OutputDataType) *
+                                           output_scaled_casted.mDesc.GetElementSpaceSize());
+    DeviceMem output_amax_dev_buf(sizeof(OutputDataType) * output_amax.mDesc.GetElementSpaceSize());
+    input_dev_buf.ToDevice(input.mData.data());
+    std::array<const void*, 1> inputs = {input_dev_buf.GetDeviceBuffer()};
+    // Output order must match the stride order passed to MakeArgumentPointer below.
+    std::array<void*, 2> outputs      = {output_scaled_casted_transposed_dev_buf.GetDeviceBuffer(),
+                                    output_scaled_casted_dev_buf.GetDeviceBuffer()};
+    std::cout << "Input: " << input.mDesc << std::endl;
+    std::cout << "Scale: " << scale << std::endl;
+    std::cout << "Output scaled casted transposed: " << output_scaled_casted_transposed.mDesc
+              << std::endl;
+    std::cout << "Output scaled casted: " << output_scaled_casted.mDesc << std::endl;
+    std::cout << "Output amax: " << output_amax.mDesc << std::endl;
+    // Launches the elementwise kernel and returns the value reported by the invoker's Run.
+    auto launch_transpose_scale = [&]() {
+        auto transposeScale = DeviceElementwisePermuteInstance{};
+        auto argument       = transposeScale.MakeArgumentPointer(dims,
+                                                           {in_strides},
+                                                           {out_strides, in_strides},
+                                                           inputs,
+                                                           outputs,
+                                                           ScalePassThrough{scale});
+        if(!transposeScale.IsSupportedArgument(argument.get()))
+        {
+            throw std::runtime_error(
+                "The runtime parameters seems not supported by the device instance, exiting!");
+        }
+        auto transposeScale_invoker_ptr = transposeScale.MakeInvokerPointer();
+        return transposeScale_invoker_ptr->Run(argument.get(), StreamConfig{nullptr, time_kernel});
+    };
+    // Launches the reduction over the non-transposed output and returns the invoker's Run value.
+    auto launch_reduce = [&]() {
+        auto reduce = DeviceReduceInstance{};
+        auto reduce_argument_ptr =
+            reduce.MakeArgumentPointer(dims,
+                                       in_strides,
+                                       {1},    // Output Lengths
+                                       {1},    // Output Strides
+                                       {0, 1}, // Reduce Dims
+                                       static_cast<double>(1.f),
+                                       static_cast<double>(0.f),
+                                       output_scaled_casted_dev_buf.GetDeviceBuffer(),
+                                       nullptr,
+                                       output_amax_dev_buf.GetDeviceBuffer(),
+                                       nullptr,
+                                       UnaryAbs{},
+                                       PassThrough{});
+        if(!reduce.IsSupportedArgument(reduce_argument_ptr.get()))
+        {
+            throw std::runtime_error(
+                "The runtime parameters seems not supported by the device instance, exiting!");
+        }
+        auto invoker_ptr = reduce.MakeInvokerPointer();
+        return invoker_ptr->Run(reduce_argument_ptr.get(), StreamConfig{nullptr, time_kernel});
+    };
+    // The reported time is the sum of the two Run results.
+    float ave_time = launch_transpose_scale();
+    ave_time += launch_reduce();
+    std::cout << "Perf: " << ave_time << " ms" << std::endl;
+    bool pass = true;
+    if(do_verification)
+    {
+        Tensor<OutputDataType> host_output_scaled_casted_transposed(dims, out_strides);
+        Tensor<OutputDataType> host_output_scaled_casted(dims, in_strides);
+        Tensor<OutputDataType> host_output_amax({1});
+        reference_scale_permute_amax(input,
+                                     host_output_scaled_casted_transposed,
+                                     host_output_scaled_casted,
+                                     host_output_amax,
+                                     scale);
+        output_scaled_casted_transposed_dev_buf.FromDevice(
+            output_scaled_casted_transposed.mData.data());
+        output_scaled_casted_dev_buf.FromDevice(output_scaled_casted.mData.data());
+        output_amax_dev_buf.FromDevice(output_amax.mData.data());
+        pass &= ck::utils::check_err(output_scaled_casted_transposed.mData,
+                                     host_output_scaled_casted_transposed.mData,
+                                     "Error: Incorrect results scaled transposed",
+                                     1e-3,
+                                     1e-3);
+        pass &= ck::utils::check_err(output_scaled_casted.mData,
+                                     host_output_scaled_casted.mData,
+                                     "Error: Incorrect results scaled",
+                                     1e-3,
+                                     1e-3);
+        pass &= ck::utils::check_err(
+            output_amax.mData, host_output_amax.mData, "Error: Incorrect results amax", 1e-3, 1e-3);
+    }
+    return pass ? 0 : 1;
+}
--- a/example/CMakeLists.txt
+++ b/example/CMakeLists.txt
@@ -45,11 +45,7 @@ function(add_example_executable EXAMPLE_NAME FILE_NAME)
    endforeach()
    endif()
-    if(INSTANCES_ONLY)
+    set(EX_TARGETS ${SUPPORTED_GPU_TARGETS})
-        set(EX_TARGETS ${DEFAULT_GPU_TARGETS})
-    else()
-        set(EX_TARGETS ${GPU_TARGETS})
-    endif()
    #Do not build any DL examples if DL_KERNELS not set
    foreach(source IN LISTS FILE_NAME)
@@ -147,11 +143,8 @@ function(add_example_executable_no_testing EXAMPLE_NAME FILE_NAME)
    endforeach()
    endif()
-    if(INSTANCES_ONLY)
+    set(EX_TARGETS ${SUPPORTED_GPU_TARGETS})
-        set(EX_TARGETS ${DEFAULT_GPU_TARGETS})
-    else()
-        set(EX_TARGETS ${GPU_TARGETS})
-    endif()
    #Do not build any DL examples if DL_KERNELS not set
    foreach(source IN LISTS FILE_NAME)
        if(NOT DEFINED DL_KERNELS AND source MATCHES "_dl")

--- a/example/ck_tile/01_fmha/README.md
+++ b/example/ck_tile/01_fmha/README.md
@@ -6,7 +6,8 @@ This folder contains example for fmha(fused multi-head attention) using ck_tile
 ```
 # in the root of ck_tile
 mkdir build && cd build
-sh ../script/cmake-ck-dev.sh  ../ <arch>  # you can replace this <arch> to gfx90a, gfx942...
+# you can replace <arch> with the appropriate architecture (for example gfx90a or gfx942) or leave it blank
+sh ../script/cmake-ck-dev.sh  ../ <arch>
 make tile_example_fmha_fwd -j
 ```
 This will result in an executable `build/bin/tile_example_fmha_fwd`
@@ -23,7 +24,7 @@ There are 3 template parameters for this kernel template.
 To speed up compile time, we instantiate the kernels into separate file. In this way we can benefit from parallel building from CMake/Make system. This is achieved by `generate.py` script. Besides, you can look into this script to learn how to instantiate a kernel instance step by step, which is described in `FMHA_FWD_KERNEL_BODY` variable.
 ## executable
-`tile_example_fmha_fwd` is the example executable, implemented in `fmha_fwd.cpp`. You can type `./bin/tile_example_fmha_fwd -?` to list all supported args. Below is an example of the output (may subject to change)
+`tile_example_fmha_fwd` is the example executable, implemented in `fmha_fwd.cpp`. You can type `./bin/tile_example_fmha_fwd -?` to list all the arguments. Below is an example of the output (may be subject to change)
 ```
 args:
          -v    weather do CPU validation or not (default:1)
@@ -31,47 +32,52 @@ args:
          -b    batch size (default:2)
          -h    num of head, for q (default:8)
        -h_k    num of head, for k/v, -1 means equal to h (default:-1)
-                 if not equal to h, then this is GQA/MQA case
+                if not equal to h, then this is GQA/MQA case
          -s    seqlen_q. if group-mode, means the average value of seqlen_q (default:3328)
-                 total_seqlen_q = seqlen_q * batch, and seqlen_q per batch may vary
+                total_seqlen_q = seqlen_q * batch, and seqlen_q per batch may vary
-                 also with "-s=s0,s1,s2..." comma seperated int to set per batch seqlen(group-mode)
+                also with "-s=s0,s1,s2..." comma seperated int to set per batch seqlen(group-mode)
-        -s_k    seqlen_k, -1 means equal to s (default:-1)
+        -s_k    seqlen_k (including new key/value), -1 means equal to s (default:-1)
          -d    head dim for q, k (default:128)
        -d_v    head dim for v, -1 means equal to d (default:-1)
    -scale_s    scale factor of S. 0 means equal to 1/sqrt(hdim). (default:0)
-                 note when squant=1, this value will be modified by range_q/k
+                note when squant=1, this value will be modified by range_q/k
    -range_q    per-tensor quantization range of q. used if squant=1. (default:16)
    -range_k    per-tensor quantization range of k. used if squant=1. (default:16)
    -range_v    per-tensor quantization range of v. used if squant=1. (default:16)
    -range_p    per-tensor quantization range of p [e^(s-m)]. used if squant=1. (default:1)
    -range_o    per-tensor quantization range of o (p*v). used if squant=1. (default:16)
     -squant    if using static quantization fusion or not. auto: fp8 will default use squant, other will not (default:auto)
-                 0: no static quant(not implemented) 1: apply scale_p and scale_o with respect to P and O.
+                0: no static quant(not implemented) 1: apply scale_p and scale_o with respect to P and O.
-                 calculate scale_s, scale_p, scale_o according to range_q, range_k, range_v, range_p, range_o
+                calculate scale_s, scale_p, scale_o according to range_q, range_k, range_v, range_p, range_o
      -iperm    permute input (default:1)
-                 if true, will be b*h*s*d, else b*s*h*d
+                if true, will be b*h*s*d, else b*s*h*d
      -operm    permute output (default:1)
       -bias    n or 0, no bias (default:n)
-                 e(lementwise) or 1, elementwise bias with 1*1*s*s. e:1, 1*h*s*s. e:2, b*h*s*s
+                e(lementwise) or 1, elementwise bias with 1*1*s*s. e:1, 1*h*s*s. e:2, b*h*s*s
-                 a(libi) or 2, alibi with 1*h. a:1, b*h
+                a(libi) or 2, alibi with 1*h. a:1, b*h
       -prec    data type. fp16/bf16/fp8/bf8 (default:fp16)
       -mask    0: no mask, 1: top-left(same as 't'), 2:bottom-right(same as 'b') (default:0)
-                 't', top-left causal mask, 'b', bottom-r causal mask
+                't', top-left causal mask, 'b', bottom-r causal mask
-                 't:l,r', top-left sliding window attn(swa) with FA style left right size
+                't:l,r', top-left sliding window attn(swa) with FA style left right size
-                 'b:l,r', bottom-r sliding window attn(swa) with FA style left right size
+                'b:l,r', bottom-r sliding window attn(swa) with FA style left right size
-                 'xt:window_size', xformer style masking from top-left, window_size negative is causal, positive is swa
+                'xt:window_size', xformer style masking from top-left, window_size negative is causal, positive is swa
-                 'xb:window_size', xformer style masking from bottom-r, window_size negative is causal, positive is swa
+                'xb:window_size', xformer style masking from bottom-r, window_size negative is causal, positive is swa
-                 'g:y,x', generic attention mask coordinate with y/x size (only debug purpose for now)
+                'g:y,x', generic attention mask coordinate with y/x size (only debug purpose for now)
    -vlayout    r for row-major(seqlen*hdim), c for col-major(hdim*seqlen) (default:r)
        -lse    0 not store lse, 1 store lse (default:0)
      -kname    if set to 1 will print kernel name (default:0)
       -init    init method. ui, uniform random int, ni, normalized random int (default:uf)
-                 uf, uniform random float, nf, normalized random float, tf, trig float, uf:q, quantization
+                uf, uniform random float, nf, normalized random float, tf, trig float, uf:q, quantization
       -seed    random seed used for initializing input tensors. 0 for non-deterministic seed (default:11939)
+  -drop_seed    seed for random number generator (default:1)
+-drop_offset    offset for random number generator (default:0)
+ -drop_prefs    seed and offset values are present on GPU; 0 - host, 1 - device/GPU (default:0)
     -warmup    number of iterations before benchmark the kernel (default:5)
     -repeat    number of iterations to benchmark the kernel (default:20)
 ```
-Example: `./bin/tile_example_fmha_fwd -b=1 -h=16 -s=16384 -d=128` will run a fmha case with batch=1, nhead=16, sequence length=16384, hdim=128, fp16 case.
+Example 1: `./bin/tile_example_fmha_fwd -b=1 -h=16 -s=16384 -d=128` will run a fmha case with batch=1, nhead=16, sequence length=16384, hdim=128, fp16 case.
+Example 2: `./bin/tile_example_fmha_fwd -b=1 -h=8 -s=16384 -d=64 -drop_prefs=1 -drop_seed=10 -drop_offset=1234` will run a fmha case with
+  batch=1, nhead=8, sequence length=16384, hdim=64, drop_seed=10 (in GPU memory), drop_offset=1234 (in GPU memory), fp16 case.
 ## support features
 Currently we are still in rapid development stage, so more features/optimizations will be coming soon.

--- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py
+++ b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py
@@ -600,8 +600,8 @@ def get_fwd_splitkv_blobs(kernel_filter : Optional[str], receipt, mask_impl) ->
                # TODO: use async pipeline when compiler is more stable 
                if hdim == 256 or hdim in [32, 64, 128]:
                # if True:
-                    pipelines.append(Pipeline('qr', 'row', 'f', 'f', 'f', 'f', bias, lse, squant, pagedkv, mask))
+                    pipelines.append(Pipeline('qr', 'row', 'f', 't', 'f', 'f', bias, lse, squant, pagedkv, mask))
-                    pipelines.append(Pipeline('qr', 'col', 'f', 'f', 'f', 'f', bias, lse, squant, pagedkv, mask))
+                    pipelines.append(Pipeline('qr', 'col', 'f', 't', 'f', 'f', bias, lse, squant, pagedkv, mask))
                    pipelines.append(Pipeline('qr', 'row', 't', 't', 't', 't', bias, lse, squant, pagedkv, mask))
                    pipelines.append(Pipeline('qr', 'col', 't', 't', 't', 't', bias, lse, squant, pagedkv, mask))

--- a/example/ck_tile/01_fmha/fmha_bwd.cpp
+++ b/example/ck_tile/01_fmha/fmha_bwd.cpp
@@ -85,6 +85,9 @@ auto create_args(int argc, char* argv[])
        .insert("p_drop", "0", "0~1 probability of dropout")
        .insert("drop_seed", "1", "seed for random number generator")
        .insert("drop_offset", "0", "offset for random number generator")
+        .insert("drop_prefs",
+                "0",
+                "seed and offset values are present on GPU; 0 - host, 1 - device/GPU")
        .insert("timer", "gpu", "gpu:gpu timer, cpu:cpu timer")
        .insert("warmup", "5", "number of iterations before benchmark the kernel")
        .insert("repeat", "20", "number of iterations to benchmark the kernel")
@@ -158,6 +161,8 @@ bool run(const ck_tile::ArgParser& arg_parser)
    float p_drop         = arg_parser.get_float("p_drop");
    uint64_t drop_seed   = arg_parser.get_uint64("drop_seed");
    uint64_t drop_offset = arg_parser.get_uint64("drop_offset");
+    bool drop_prefs      = arg_parser.get_bool("drop_prefs");
    if(use_dbias && bias.type != bias_enum::elementwise_bias)
    {
        std::cerr << "dbias only exists when bias type is elementwise" << std::endl;
@@ -381,6 +386,8 @@ bool run(const ck_tile::ArgParser& arg_parser)
    ck_tile::DeviceMem dbias_buf(dbias_host.get_element_space_size_in_bytes());
    ck_tile::DeviceMem seqstart_q(seqstart_q_host.size() * sizeof(int32_t));
    ck_tile::DeviceMem seqstart_k(seqstart_k_host.size() * sizeof(int32_t));
+    ck_tile::DeviceMem drop_seed_buf(drop_prefs ? sizeof(uint64_t) : 0);
+    ck_tile::DeviceMem drop_offset_buf(drop_prefs ? sizeof(uint64_t) : 0);
    ck_tile::DeviceMem alibi_slope_buf(alibi_slope_host.get_element_space_size_in_bytes());
    ck_tile::DeviceMem dq_acc_buf(dq_acc_host.get_element_space_size_in_bytes());
@@ -391,6 +398,8 @@ bool run(const ck_tile::ArgParser& arg_parser)
    do_buf.ToDevice(do_host.data());
    seqstart_q.ToDevice(seqstart_q_host.data());
    seqstart_k.ToDevice(seqstart_k_host.data());
+    drop_seed_buf.ToDevice(drop_prefs ? &drop_seed : nullptr);
+    drop_offset_buf.ToDevice(drop_prefs ? &drop_offset : nullptr);
    alibi_slope_buf.ToDevice(alibi_slope_host.data());
    // clang-format off
@@ -472,6 +481,18 @@ bool run(const ck_tile::ArgParser& arg_parser)
        const ck_tile::index_t split_stride_dq_acc =
            (shape_batch * nhead * shape_seqlen_q * hdim_q);
+        const auto drop_seed_offset = [&]() -> decltype(fmha_bwd_args::drop_seed_offset) {
+            if(drop_prefs)
+            {
+                return std::make_pair(drop_seed_buf.GetDeviceBuffer(),
+                                      drop_offset_buf.GetDeviceBuffer());
+            }
+            else
+            {
+                return std::make_pair(drop_seed, drop_offset);
+            }
+        }();
        return fmha_bwd_args{q_buf.GetDeviceBuffer(),
                             k_buf.GetDeviceBuffer(),
                             v_buf.GetDeviceBuffer(),
@@ -545,7 +566,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
                             static_cast<ck_tile::index_t>(mask.type),
                             p_drop,
                             p_undrop,
-                             {drop_seed, drop_offset}};
+                             drop_seed_offset};
    }();
    float ave_time = fmha_bwd(fmha_traits, fmha_args, stream_config);

--- a/example/ck_tile/01_fmha/fmha_bwd.hpp
+++ b/example/ck_tile/01_fmha/fmha_bwd.hpp
@@ -9,7 +9,10 @@
 #include "ck_tile/ops/epilogue.hpp"
 #include "mask.hpp"
 #include "bias.hpp"
 #include <type_traits>
+#include <utility>
+#include <variant>
 template <typename DataType>
 struct FmhaBwdTypeConfig;
@@ -135,7 +138,8 @@ struct fmha_bwd_args
    ck_tile::index_t mask_type;
    float p_drop;
    float p_undrop;
-    std::tuple<uint64_t, uint64_t> drop_seed_offset;
+    std::variant<std::pair<uint64_t, uint64_t>, std::pair<const void*, const void*>>
+        drop_seed_offset;
 };
 template <typename FmhaBwdDQDKDVKernel>

--- a/example/ck_tile/01_fmha/fmha_fwd.cpp
+++ b/example/ck_tile/01_fmha/fmha_fwd.cpp
@@ -122,6 +122,9 @@ auto create_args(int argc, char* argv[])
        .insert("p_drop", "0", "0~1 probability of dropout")
        .insert("drop_seed", "1", "seed for random number generator")
        .insert("drop_offset", "0", "offset for random number generator")
+        .insert("drop_prefs",
+                "0",
+                "seed and offset values are present on GPU; 0 - host, 1 - device/GPU")
        .insert("timer", "gpu", "gpu:gpu timer, cpu:cpu timer")
        .insert(
            "rotary_dim", "0", "RoPE rotary dimension. rotary_dim <= 0 means not apply RoPE at all")
@@ -442,6 +445,8 @@ bool run(const ck_tile::ArgParser& arg_parser)
    float p_drop         = arg_parser.get_float("p_drop");
    uint64_t drop_seed   = arg_parser.get_uint64("drop_seed");
    uint64_t drop_offset = arg_parser.get_uint64("drop_offset");
+    bool drop_prefs      = arg_parser.get_bool("drop_prefs");
    if(p_drop < 0.0f || p_drop > 1.0f)
    {
        std::cerr << "The value of p_drop should be 0~1" << std::endl;
@@ -756,6 +761,8 @@ bool run(const ck_tile::ArgParser& arg_parser)
        need_append_kvcache ? cache_seqlen_ks.size() * sizeof(int32_t) : 0);
    ck_tile::DeviceMem rotary_cos_buf(rotary_cos_host.get_element_space_size_in_bytes());
    ck_tile::DeviceMem rotary_sin_buf(rotary_sin_host.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem drop_seed_buf(drop_prefs ? sizeof(uint64_t) : 0);
+    ck_tile::DeviceMem drop_offset_buf(drop_prefs ? sizeof(uint64_t) : 0);
    ck_tile::DeviceMem randval_buf(randval_host.get_element_space_size_in_bytes());
    ck_tile::DeviceMem alibi_slope_buf(alibi_slope_host.get_element_space_size_in_bytes());
    ck_tile::DeviceMem block_table_buf(block_table_host.get_element_space_size_in_bytes());
@@ -774,6 +781,8 @@ bool run(const ck_tile::ArgParser& arg_parser)
    cache_seqlen_k_buf.ToDevice(need_append_kvcache ? cache_seqlen_ks.data() : nullptr);
    rotary_cos_buf.ToDevice(rotary_cos_host.data());
    rotary_sin_buf.ToDevice(rotary_sin_host.data());
+    drop_seed_buf.ToDevice(drop_prefs ? &drop_seed : nullptr);
+    drop_offset_buf.ToDevice(drop_prefs ? &drop_offset : nullptr);
    alibi_slope_buf.ToDevice(alibi_slope_host.data());
    block_table_buf.ToDevice(block_table_host.data());
    cache_batch_idx_buf.ToDevice(cache_batch_idx_host.data());
@@ -1013,9 +1022,17 @@ bool run(const ck_tile::ArgParser& arg_parser)
                args.nhead_stride_randval = nhead_stride_randval;
                args.batch_stride_randval = batch_stride_randval;
-                args.p_drop           = p_drop;
+                args.p_drop    = p_drop;
-                args.s_randval        = s_randval;
+                args.s_randval = s_randval;
-                args.drop_seed_offset = std::tie(drop_seed, drop_offset);
+                if(drop_prefs)
+                {
+                    args.drop_seed_offset = std::make_pair(drop_seed_buf.GetDeviceBuffer(),
+                                                           drop_offset_buf.GetDeviceBuffer());
+                }
+                else
+                {
+                    args.drop_seed_offset = std::make_pair(drop_seed, drop_offset);
+                }
            }
            else if constexpr(std::is_same_v<fmha_fwd_splitkv_args, std::decay_t<decltype(args)>>)
            {

--- a/example/ck_tile/01_fmha/fmha_fwd.hpp
+++ b/example/ck_tile/01_fmha/fmha_fwd.hpp
@@ -13,6 +13,8 @@
 #include "rotary.hpp"
 #include <type_traits>
+#include <utility>
+#include <variant>
 template <typename DataType>
 struct FmhaFwdTypeConfig;
@@ -144,7 +146,9 @@ struct fmha_fwd_args
    float p_drop;
    bool s_randval;
-    std::tuple<uint64_t, uint64_t> drop_seed_offset;
+    std::variant<std::pair<uint64_t, uint64_t>, std::pair<const void*, const void*>>
+        drop_seed_offset;
 };
 struct fmha_fwd_splitkv_args

--- a/example/ck_tile/02_layernorm2d/README.md
+++ b/example/ck_tile/02_layernorm2d/README.md
@@ -6,7 +6,8 @@ This folder contains example for Layernorm2D forward using ck_tile tile-programm
 ```
 # in the root of ck_tile
 mkdir build && cd build
-sh ../script/cmake-ck-dev.sh  ../ <arch>  # you can replace this <arch> to gfx90a, gfx942...
+# you can replace <arch> with the appropriate architecture (for example gfx90a or gfx942) or leave it blank
+sh ../script/cmake-ck-dev.sh  ../ <arch>
 make tile_example_layernorm2d_fwd -j
 ```
 This will result in an executable `build/bin/tile_example_layernorm2d_fwd`

--- a/example/ck_tile/02_layernorm2d/layernorm2d_fwd.cpp
+++ b/example/ck_tile/02_layernorm2d/layernorm2d_fwd.cpp
@@ -35,7 +35,9 @@ float layernorm2d_fwd(layernorm2d_fwd_traits t,
                                                                    YDataType,
                                                                    MeanDataType,
                                                                    InvStdDataType,
-                                                                    Shape>;
+                                                                    Shape,
+                                                                    true,
+                                                                    true>;
        using Kernel = ck_tile::Layernorm2dFwd<PipelineProblem>;

--- a/example/ck_tile/03_gemm/README.md
+++ b/example/ck_tile/03_gemm/README.md
@@ -6,7 +6,8 @@ This folder contains example for GEMM using ck_tile tile-programming implementat
 ```
 # in the root of ck_tile
 mkdir build && cd build
-sh ../script/cmake-ck-dev.sh  ../ <arch>  # you can replace this <arch> to gfx90a, gfx942...
+# you can replace <arch> with the appropriate architecture (for example gfx90a or gfx942) or leave it blank
+sh ../script/cmake-ck-dev.sh  ../ <arch>
 make tile_example_gemm_basic -j
 ```
 This will result in an executable `build/bin/tile_example_gemm_basic`
@@ -14,10 +15,17 @@ This will result in an executable `build/bin/tile_example_gemm_basic`
 ## example
 ```
 args:
-          -m    m dimension (default:3328)
+          -b    batch size (default:1)
-          -n    m dimension (default:4096)
+          -m    m dimension (default:1024)
+          -n    n dimension (default:2048)
          -k    k dimension (default:64)
-          -e    epsilon (default:1e-5)
+   -stride_a    Tensor A stride (default:0)
-          -v    cpu validation or not (default:1)
+   -stride_b    Tensor B stride (default:0)
-       -prec    precision (default:fp16)
+   -stride_c    Tensor C stride (default:0)
+          -v    0. No validation, 1. Validation on CPU, 2. Validation on GPU (default:2)
+          -e    Absolute error tolerance (default:1e-5)
+       -prec    data type. fp16/bf16/fp8/bf8 (default:fp16)
+     -warmup    number of iterations before benchmark the kernel (default:10)
+     -repeat    number of iterations to benchmark the kernel (default:100)
+      -timer    gpu:gpu timer, cpu:cpu timer (default:gpu)
 ```