Merge branch 'amd-develop' into amd-master

3dc5db72 · Jun Liu · b924e330 · e547c141 · 3dc5db72 · 3dc5db72
Commit 3dc5db72 authored Oct 21, 2024 by Jun Liu
20 changed files
--- a/example/01_gemm/gemm_xdl_fp64.cpp
+++ b/example/01_gemm/gemm_xdl_fp64.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 #include "common.hpp"
@@ -41,6 +41,17 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdl
                                                                            BElementOp,
                                                                            CElementOp>;
+using ReferenceGemmInstanceGPU = ck::tensor_operation::device::ReferenceGemm<ALayout,
+                                                                             BLayout,
+                                                                             CLayout,
+                                                                             ADataType,
+                                                                             BDataType,
+                                                                             CDataType,
+                                                                             AccDataType,
+                                                                             AElementOp,
+                                                                             BElementOp,
+                                                                             CElementOp>;
 #include "run_gemm_example.inc"
 int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); }
--- a/example/01_gemm/gemm_xdl_fp8.cpp
+++ b/example/01_gemm/gemm_xdl_fp8.cpp
@@ -37,6 +37,20 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle
 using ReferenceGemmInstance = ck::tensor_operation::host::
    ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>;
+using ReferenceComputeType     = float;
+using ReferenceGemmInstanceGPU = ck::tensor_operation::device::ReferenceGemm<ALayout,
+                                                                             BLayout,
+                                                                             CLayout,
+                                                                             ADataType,
+                                                                             BDataType,
+                                                                             CDataType,
+                                                                             AccDataType,
+                                                                             AElementOp,
+                                                                             BElementOp,
+                                                                             CElementOp,
+                                                                             ReferenceComputeType,
+                                                                             ReferenceComputeType>;
 #include "run_gemm_example.inc"
 int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); }
--- a/example/01_gemm/gemm_xdl_fp8_bf8.cpp
+++ b/example/01_gemm/gemm_xdl_fp8_bf8.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 #include "common.hpp"
@@ -44,6 +44,17 @@ using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataTyp
                                                                        ComputeTypeA,
                                                                        ComputeTypeB>;
+using ReferenceGemmInstanceGPU = ck::tensor_operation::device::ReferenceGemm<ALayout,
+                                                                             BLayout,
+                                                                             CLayout,
+                                                                             ADataType,
+                                                                             BDataType,
+                                                                             CDataType,
+                                                                             AccDataType,
+                                                                             AElementOp,
+                                                                             BElementOp,
+                                                                             CElementOp>;
 #include "run_gemm_example.inc"
 int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); }
--- a/example/01_gemm/gemm_xdl_int8.cpp
+++ b/example/01_gemm/gemm_xdl_int8.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 #include "common.hpp"
@@ -33,6 +33,17 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle
 using ReferenceGemmInstance = ck::tensor_operation::host::
    ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>;
+using ReferenceGemmInstanceGPU = ck::tensor_operation::device::ReferenceGemm<ALayout,
+                                                                             BLayout,
+                                                                             CLayout,
+                                                                             ADataType,
+                                                                             BDataType,
+                                                                             CDataType,
+                                                                             AccDataType,
+                                                                             AElementOp,
+                                                                             BElementOp,
+                                                                             CElementOp>;
 #include "run_gemm_example.inc"
 int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); }
--- a/example/01_gemm/gemm_xdl_lds_direct_load_fp16.cpp
+++ b/example/01_gemm/gemm_xdl_lds_direct_load_fp16.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
 #include <iostream>
@@ -53,6 +53,17 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle
 using ReferenceGemmInstance = ck::tensor_operation::host::
    ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>;
+using ReferenceGemmInstanceGPU = ck::tensor_operation::device::ReferenceGemm<ALayout,
+                                                                             BLayout,
+                                                                             CLayout,
+                                                                             ADataType,
+                                                                             BDataType,
+                                                                             CDataType,
+                                                                             AccDataType,
+                                                                             AElementOp,
+                                                                             BElementOp,
+                                                                             CElementOp>;
 #include "run_gemm_example.inc"
 int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); }
--- a/example/01_gemm/gemm_xdl_lds_direct_load_fp32.cpp
+++ b/example/01_gemm/gemm_xdl_lds_direct_load_fp32.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
 #include <iostream>
@@ -52,6 +52,17 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle
 using ReferenceGemmInstance = ck::tensor_operation::host::
    ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>;
+using ReferenceGemmInstanceGPU = ck::tensor_operation::device::ReferenceGemm<ALayout,
+                                                                             BLayout,
+                                                                             CLayout,
+                                                                             ADataType,
+                                                                             BDataType,
+                                                                             CDataType,
+                                                                             AccDataType,
+                                                                             AElementOp,
+                                                                             BElementOp,
+                                                                             CElementOp>;
 #include "run_gemm_example.inc"
 int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); }
--- a/example/01_gemm/gemm_xdl_streamk.cpp
+++ b/example/01_gemm/gemm_xdl_streamk.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 #include "common.hpp"
@@ -44,6 +44,17 @@ using DeviceGemmInstance = DeviceGemmStreamK;
 using ReferenceGemmInstance = ck::tensor_operation::host::
    ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>;
+using ReferenceGemmInstanceGPU = ck::tensor_operation::device::ReferenceGemm<ALayout,
+                                                                             BLayout,
+                                                                             CLayout,
+                                                                             ADataType,
+                                                                             BDataType,
+                                                                             CDataType,
+                                                                             AccDataType,
+                                                                             AElementOp,
+                                                                             BElementOp,
+                                                                             CElementOp>;
 #include "run_gemm_example.inc"
 int main(int argc, char* argv[]) { return !run_gemm_streamk_example(argc, argv); }
--- a/example/01_gemm/gemm_xdl_wavelet_fp16.cpp
+++ b/example/01_gemm/gemm_xdl_wavelet_fp16.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 #include "common.hpp"
@@ -37,6 +37,17 @@ using DeviceGemmInstance = DeviceGemmInstance;
 using ReferenceGemmInstance = ck::tensor_operation::host::
    ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>;
+using ReferenceGemmInstanceGPU = ck::tensor_operation::device::ReferenceGemm<ALayout,
+                                                                             BLayout,
+                                                                             CLayout,
+                                                                             ADataType,
+                                                                             BDataType,
+                                                                             CDataType,
+                                                                             AccDataType,
+                                                                             AElementOp,
+                                                                             BElementOp,
+                                                                             CElementOp>;
 #include "run_gemm_example.inc"
 int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); }
--- a/example/01_gemm/run_gemm_example.inc
+++ b/example/01_gemm/run_gemm_example.inc
@@ -173,6 +173,7 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
    Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
    Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
+    Tensor<CDataType> c_m_n_device_ref_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
    std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
    std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
@@ -193,6 +194,8 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
    DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
    DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
    DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize());
+    DeviceMem c_m_n_device_ref_buf(sizeof(CDataType) *
+                                   c_m_n_device_ref_result.mDesc.GetElementSpaceSize());
    a_m_k_device_buf.ToDevice(a_m_k.mData.data());
    b_k_n_device_buf.ToDevice(b_k_n.mData.data());
@@ -325,14 +328,18 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
              << gemm.GetTypeString() << std::endl;
+    bool pass = true;
    if(config.do_verification)
    {
+        // CPU verification (check_err returns true when the results match)
        auto ref_gemm    = ReferenceGemmInstance{};
        auto ref_invoker = ref_gemm.MakeInvoker();
        auto ref_argument = ref_gemm.MakeArgument(
            a_m_k, b_k_n, c_m_n_host_result, a_element_op, b_element_op, c_element_op);
+        std::cout << "Running verification on CPU." << std::endl;
        ref_invoker.Run(ref_argument);
 #ifdef BUILD_INT4_EXAMPLE
@@ -346,15 +353,42 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
 #else
        c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data());
-        return ck::utils::check_err(c_m_n_device_result,
+        pass &= ck::utils::check_err(c_m_n_device_result,
-                                    c_m_n_host_result,
+                                     c_m_n_host_result,
-                                    "Error: Incorrect results!",
+                                     "Error: Incorrect results!",
-                                    get_rtol<CDataType>(),
+                                     get_rtol<CDataType>(),
-                                    get_atol<CDataType>());
+                                     get_atol<CDataType>());
 #endif
+        // GPU verification: compare against the output of ReferenceGemmInstanceGPU
+        auto ref_gemm_gpu    = ReferenceGemmInstanceGPU{};
+        auto ref_invoker_gpu = ref_gemm_gpu.MakeInvoker();
+        auto ref_argument_gpu = ref_gemm_gpu.MakeArgument(
+            static_cast<ADataType*>(a_m_k_device_buf.GetDeviceBuffer()),
+            static_cast<BDataType*>(b_k_n_device_buf.GetDeviceBuffer()),
+            static_cast<CDataType*>(c_m_n_device_ref_buf.GetDeviceBuffer()),
+            M,
+            N,
+            K,
+            a_element_op,
+            b_element_op,
+            c_element_op);
+        std::cout << "Running verification on GPU." << std::endl;
+        ref_invoker_gpu.Run(ref_argument_gpu, StreamConfig{});
+        c_m_n_device_ref_buf.FromDevice(c_m_n_device_ref_result.mData.data());
+        c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data());
+        pass &= ck::utils::check_err(c_m_n_device_result,
+                                     c_m_n_device_ref_result,
+                                     "Error: Incorrect results!",
+                                     get_rtol<CDataType>(),
+                                     get_atol<CDataType>());
    }
-    return true;
+    return pass; // true when verification passed or was skipped
 }
 bool run_gemm_example(int argc, char* argv[])

--- a/example/01_gemm/run_gemm_example_streamk_v2.inc
+++ b/example/01_gemm/run_gemm_example_streamk_v2.inc
@@ -117,9 +117,9 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
    auto f_get_default_stride =
        [](std::size_t row, std::size_t col, ck::index_t stride, auto layout) {
-            if(stride == -1)
+            if(stride == 0)
            {
-                // give a chance if stride is -1, return a default packed stride
+                // give a chance if stride is 0, return a default packed stride
                if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
                {
                    return static_cast<std::size_t>(col);

--- a/example/44_elementwise_permute/CMakeLists.txt
+++ b/example/44_elementwise_permute/CMakeLists.txt
@@ -5,3 +5,4 @@ add_example_executable(example_elementwise_permute_4D_fp32_col elementwise_permu
 add_example_executable(example_elementwise_permute_4D_fp16_col elementwise_permute_4D_fp16_col.cpp)
 add_example_executable(example_elementwise_binary_4D_fp16 elementwise_binary_4D_fp16.cpp)
 add_example_executable(example_elementwise_trinary_4D_fp16 elementwise_trinary_4D_fp16.cpp)
+add_example_executable(example_elementwise_scale_permute_amax_2D_fp16_fp8 elementwise_scale_permute_amax_2D_fp16_fp8.cpp)
--- a/example/44_elementwise_permute/elementwise_scale_permute_amax_2D_fp16_fp8.cpp
+++ b/example/44_elementwise_permute/elementwise_scale_permute_amax_2D_fp16_fp8.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+#include <iostream>
+#include <cstdlib>
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_elementwise_dynamic_vector_dims_impl.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp"
+#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_elementwise.hpp"
+#include "ck/library/utility/algorithm.hpp"
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/utility/reduction_enums.hpp"
+using F16 = ck::half_t;
+using F32 = float;
+using F8  = ck::f8_t;
+using InputDataType  = F16;
+using ScaleDataType  = F32;
+using OutputDataType = F8;
+static constexpr ck::index_t NumDim = 2;
+constexpr ck::ReduceTensorOp ReduceOpId = ck::ReduceTensorOp::MAX;
+constexpr bool PropagateNan             = true;
+constexpr bool OutputIndex              = false;
+using ReduceOperation = typename ck::reduce_binary_operator<ReduceOpId>::opType;
+// Element-wise functor: converts x0 to ScaleDataType, multiplies it by alpha_, converts the
+// product to OutputDataType, and stores that same value in both y0 and y1.
+struct ScalePassThrough
+{
+    // alpha defaults to 1, i.e. the input is only converted, not rescaled.
+    ScalePassThrough(const float alpha = 1.f) : alpha_{alpha} {}
+    __host__ __device__ constexpr void
+    operator()(OutputDataType& y0, OutputDataType& y1, const InputDataType& x0) const
+    {
+        const ScaleDataType scaled = ck::type_convert<ScaleDataType>(x0) * alpha_;
+        y0                         = ck::type_convert<OutputDataType>(scaled);
+        y1                         = y0;
+    }
+    const ScaleDataType alpha_; // scale factor applied before the output conversion
+};
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using UnaryAbs    = ck::tensor_operation::element_wise::UnaryAbs;
+using DeviceElementwisePermuteInstance = ck::tensor_operation::device::DeviceElementwiseImpl<
+    ck::Tuple<InputDataType>,                  // InDataTypeTuple
+    ck::Tuple<OutputDataType, OutputDataType>, // OutDataTypeTuple
+    ScalePassThrough,                          // Elementwise
+    NumDim,                                    // NumDim
+    256,                                       // BlockSize
+    128,                                       // M0PerBlock
+    128,                                       // M1PerBlock
+    8,                                         // M0PerThread
+    8,                                         // M1PerThread
+    ck::Sequence<1, 0>,                        // ThreadClusterArrangeOrder
+    ck::Sequence<8>,                           // InScalarPerVectorSeq
+    ck::Sequence<8, 1>>;                       // OutScalarPerVectorSeq
+using DeviceReduceInstance =
+    ck::tensor_operation::device::DeviceReduceMultiBlock<OutputDataType,
+                                                         OutputDataType,
+                                                         OutputDataType,
+                                                         NumDim,
+                                                         NumDim,
+                                                         ReduceOperation,
+                                                         UnaryAbs,
+                                                         PassThrough,
+                                                         ck::InMemoryDataOperationEnum::Set,
+                                                         PropagateNan,
+                                                         OutputIndex,
+                                                         false, // HaveIndexInputIfOutputIndex
+                                                         1024,  // BlockSize
+                                                         1,     // MThreadClusterSize
+                                                         1024,  // KThreadClusterSize
+                                                         1,     // MThreadSliceSize
+                                                         16,    // KThreadSliceSize
+                                                         1,     // InSrcVectorDim
+                                                         16,    // InSrcVectorSize
+                                                         1>;    // OutDstVectorSize
+// Host reference for the example. For every (m, k) element of `input` it applies
+// ScalePassThrough with `scale`, writes the result at the same logical index (m, k) of both
+// output tensors, and folds the absolute value of the result into host_output_amax(0).
+// The running maximum starts from whatever host_output_amax(0) holds on entry.
+void reference_scale_permute_amax(Tensor<InputDataType>& input,
+                                  Tensor<OutputDataType>& host_output_scaled_casted_transposed,
+                                  Tensor<OutputDataType>& host_output_scaled_casted,
+                                  Tensor<OutputDataType>& host_output_amax,
+                                  const float scale)
+{
+    ScalePassThrough scale_and_cast(scale);
+    const ck::index_t num_rows = input.GetLengths()[0];
+    const ck::index_t num_cols = input.GetLengths()[1];
+    for(ck::index_t row = 0; row < num_rows; row++)
+    {
+        for(ck::index_t col = 0; col < num_cols; col++)
+        {
+            OutputDataType scaled, scaled_copy;
+            scale_and_cast(scaled, scaled_copy, input(row, col));
+            host_output_scaled_casted(row, col)            = scaled;
+            host_output_scaled_casted_transposed(row, col) = scaled_copy;
+            // The absolute value is taken in float and converted back before the comparison.
+            const auto magnitude = ck::math::abs(ck::type_convert<float>(scaled));
+            const OutputDataType magnitude_out = ck::type_convert<OutputDataType>(magnitude);
+            host_output_amax(0) = ck::math::max(magnitude_out, host_output_amax(0));
+        }
+    }
+}
+// Example driver: scales an M x K input, converts it to OutputDataType into two outputs (one
+// with strides {K, 1}, one with strides {1, M}), reduces the {K, 1} output to a single
+// absolute maximum, and compares all three device results with the host reference.
+// Returns 0 when every comparison passes and 1 otherwise.
+int main(int argc, char* argv[])
+{
+    const float scale_factor = 2.f;
+    bool verify_on_host      = true;
+    bool measure_kernel_time = true;
+    // Defaults to 1024 x 1024; exactly two extra command-line arguments override M and K.
+    ck::index_t rows = 1024;
+    ck::index_t cols = 1024;
+    if(argc == 3)
+    {
+        rows = std::stoi(argv[1]);
+        cols = std::stoi(argv[2]);
+    }
+    std::array<ck::index_t, 2> lengths            = {rows, cols};
+    std::array<ck::index_t, 2> row_major_strides  = {cols, 1};
+    std::array<ck::index_t, 2> transposed_strides = {1, rows};
+    Tensor<InputDataType> host_input(lengths, row_major_strides);
+    Tensor<OutputDataType> result_transposed(lengths, transposed_strides);
+    Tensor<OutputDataType> result_scaled(lengths, row_major_strides);
+    Tensor<OutputDataType> result_amax({1});
+    host_input.GenerateTensorValue(GeneratorTensor_3<InputDataType>{0.0, 1.0});
+    DeviceMem input_buf(sizeof(InputDataType) * host_input.mDesc.GetElementSpaceSize());
+    DeviceMem transposed_buf(sizeof(OutputDataType) *
+                             result_transposed.mDesc.GetElementSpaceSize());
+    DeviceMem scaled_buf(sizeof(OutputDataType) * result_scaled.mDesc.GetElementSpaceSize());
+    DeviceMem amax_buf(sizeof(OutputDataType) * result_amax.mDesc.GetElementSpaceSize());
+    input_buf.ToDevice(host_input.mData.data());
+    std::array<const void*, 1> in_ptrs = {input_buf.GetDeviceBuffer()};
+    // Output order matches the stride list passed to the elementwise op: transposed, then scaled.
+    std::array<void*, 2> out_ptrs = {transposed_buf.GetDeviceBuffer(),
+                                     scaled_buf.GetDeviceBuffer()};
+    std::cout << "Input: " << host_input.mDesc << std::endl
+              << "Scale: " << scale_factor << std::endl
+              << "Output scaled casted transposed: " << result_transposed.mDesc << std::endl
+              << "Output scaled casted: " << result_scaled.mDesc << std::endl
+              << "Output amax: " << result_amax.mDesc << std::endl;
+    // Runs the elementwise scale/convert kernel and returns the value reported by Run().
+    auto run_scale_and_permute = [&]() {
+        auto elementwise_op = DeviceElementwisePermuteInstance{};
+        auto arg_ptr =
+            elementwise_op.MakeArgumentPointer(lengths,
+                                               {row_major_strides},
+                                               {transposed_strides, row_major_strides},
+                                               in_ptrs,
+                                               out_ptrs,
+                                               ScalePassThrough{scale_factor});
+        if(elementwise_op.IsSupportedArgument(arg_ptr.get()))
+        {
+            auto invoker_ptr = elementwise_op.MakeInvokerPointer();
+            return invoker_ptr->Run(arg_ptr.get(), StreamConfig{nullptr, measure_kernel_time});
+        }
+        throw std::runtime_error(
+            "The runtime parameters seems not supported by the device instance, exiting!");
+    };
+    // Runs the reduction over both dimensions of the scaled output; returns the Run() value.
+    auto run_amax_reduction = [&]() {
+        auto reduce_op = DeviceReduceInstance{};
+        auto arg_ptr   = reduce_op.MakeArgumentPointer(lengths,
+                                                     row_major_strides,
+                                                     {1},    // Output Lengths
+                                                     {1},    // Output Strides
+                                                     {0, 1}, // Reduce Dims
+                                                     static_cast<double>(1.f),
+                                                     static_cast<double>(0.f),
+                                                     scaled_buf.GetDeviceBuffer(),
+                                                     nullptr,
+                                                     amax_buf.GetDeviceBuffer(),
+                                                     nullptr,
+                                                     UnaryAbs{},
+                                                     PassThrough{});
+        if(reduce_op.IsSupportedArgument(arg_ptr.get()))
+        {
+            auto invoker_ptr = reduce_op.MakeInvokerPointer();
+            return invoker_ptr->Run(arg_ptr.get(), StreamConfig{nullptr, measure_kernel_time});
+        }
+        throw std::runtime_error(
+            "The runtime parameters seems not supported by the device instance, exiting!");
+    };
+    // The reported figure is the sum of both kernels' times.
+    const float elementwise_ms = run_scale_and_permute();
+    const float reduce_ms      = run_amax_reduction();
+    const float total_ms       = elementwise_ms + reduce_ms;
+    std::cout << "Perf: " << total_ms << " ms" << std::endl;
+    bool pass = true;
+    if(verify_on_host)
+    {
+        Tensor<OutputDataType> ref_transposed(lengths, transposed_strides);
+        Tensor<OutputDataType> ref_scaled(lengths, row_major_strides);
+        Tensor<OutputDataType> ref_amax({1});
+        reference_scale_permute_amax(
+            host_input, ref_transposed, ref_scaled, ref_amax, scale_factor);
+        transposed_buf.FromDevice(result_transposed.mData.data());
+        scaled_buf.FromDevice(result_scaled.mData.data());
+        amax_buf.FromDevice(result_amax.mData.data());
+        // All three comparisons always run, so every mismatch is reported.
+        pass &= ck::utils::check_err(result_transposed.mData,
+                                     ref_transposed.mData,
+                                     "Error: Incorrect results scaled transposed",
+                                     1e-3,
+                                     1e-3);
+        pass &= ck::utils::check_err(
+            result_scaled.mData, ref_scaled.mData, "Error: Incorrect results scaled", 1e-3, 1e-3);
+        pass &= ck::utils::check_err(
+            result_amax.mData, ref_amax.mData, "Error: Incorrect results amax", 1e-3, 1e-3);
+    }
+    if(!pass)
+    {
+        return 1;
+    }
+    return 0;
+}
--- a/example/66_complex_contraction_bilinear/CMakeLists.txt
+++ b/example/66_complex_contraction_bilinear/CMakeLists.txt
+add_example_executable(example_complex_contraction_bilinear_xdl_fp32 complex_contraction_bilinear_xdl_fp32.cpp)
+add_example_executable(example_complex_contraction_bilinear_xdl_fp64 complex_contraction_bilinear_xdl_fp64.cpp)
--- a/example/66_complex_contraction_bilinear/README.md
+++ b/example/66_complex_contraction_bilinear/README.md
+# Instructions for ```example_complex_contraction_bilinear_xdl_fp32```
+## Run
+```bash
+#arg1: verification (0=no, 1=yes)
+#arg2: initialization (0=no init, 1=integer value, 2=decimal value)
+#arg3: time kernel (0=no, 1=yes)
+./bin/example_complex_contraction_bilinear_xdl_fp32 1 1 1
+```
--- a/example/66_complex_contraction_bilinear/common_instances.hpp
+++ b/example/66_complex_contraction_bilinear/common_instances.hpp
--- a/example/66_complex_contraction_bilinear/complex_contraction_bilinear_xdl_fp32.cpp
+++ b/example/66_complex_contraction_bilinear/complex_contraction_bilinear_xdl_fp32.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "common_instances.hpp"
+using ADataType        = F32; // every element type in this example is F32
+using BDataType        = F32;
+using AccDataType      = F32;
+using CShuffleDataType = F32;
+using DDataType        = F32;
+using DsDataType       = ck::Tuple<DDataType>; // tuple holding the single D element type
+using EDataType        = F32;
+using ComputeDataType  = F32;
+static constexpr ck::index_t NumDimM = 2; // NumDimM/N/K are the first three template arguments of each alias below
+static constexpr ck::index_t NumDimN = 2;
+static constexpr ck::index_t NumDimK = 2;
+using AElementOp   = ck::tensor_operation::element_wise::PassThrough;
+using BElementOp   = ck::tensor_operation::element_wise::PassThrough;
+using CDEElementOp = ck::tensor_operation::element_wise::Bilinear;
+using DeviceOpInstanceKKNN = DeviceOpInstanceKK_Generic<NumDimM, // KK variant; template presumably declared in common_instances.hpp (confirm)
+                                                        NumDimN,
+                                                        NumDimK,
+                                                        ADataType,
+                                                        BDataType,
+                                                        AccDataType,
+                                                        CShuffleDataType,
+                                                        DsDataType,
+                                                        EDataType,
+                                                        ComputeDataType,
+                                                        AElementOp,
+                                                        BElementOp,
+                                                        CDEElementOp>;
+using DeviceOpInstanceKNNN = DeviceOpInstanceKN_Generic<NumDimM, // KN variant, same argument list as above
+                                                        NumDimN,
+                                                        NumDimK,
+                                                        ADataType,
+                                                        BDataType,
+                                                        AccDataType,
+                                                        CShuffleDataType,
+                                                        DsDataType,
+                                                        EDataType,
+                                                        ComputeDataType,
+                                                        AElementOp,
+                                                        BElementOp,
+                                                        CDEElementOp>;
+using DeviceOpInstanceMKNN = DeviceOpInstanceMK_Generic<NumDimM, // MK variant, same argument list as above
+                                                        NumDimN,
+                                                        NumDimK,
+                                                        ADataType,
+                                                        BDataType,
+                                                        AccDataType,
+                                                        CShuffleDataType,
+                                                        DsDataType,
+                                                        EDataType,
+                                                        ComputeDataType,
+                                                        AElementOp,
+                                                        BElementOp,
+                                                        CDEElementOp>;
+using DeviceOpInstanceMNNN = DeviceOpInstanceMN_Generic<NumDimM, // MN variant, same argument list as above
+                                                        NumDimN,
+                                                        NumDimK,
+                                                        ADataType,
+                                                        BDataType,
+                                                        AccDataType,
+                                                        CShuffleDataType,
+                                                        DsDataType,
+                                                        EDataType,
+                                                        ComputeDataType,
+                                                        AElementOp,
+                                                        BElementOp,
+                                                        CDEElementOp>;
+using DeviceOpInstance = DeviceOpInstanceKKNN; // selects the KKNN alias; the other three aliases are not selected in this file
+#include "run_complex_contraction_bilinear_example.inc" // included after the aliases; presumably consumes them and defines the run function (confirm)
+int main(int argc, char* argv[]) { return run_complex_contraction_bilinear_example(argc, argv); } // the run function's result becomes the process exit code
--- a/example/66_complex_contraction_bilinear/complex_contraction_bilinear_xdl_fp64.cpp
+++ b/example/66_complex_contraction_bilinear/complex_contraction_bilinear_xdl_fp64.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "common_instances.hpp"
+using ADataType        = F64; // every element type in this example is F64
+using BDataType        = F64;
+using AccDataType      = F64;
+using CShuffleDataType = F64;
+using DDataType        = F64;
+using DsDataType       = ck::Tuple<DDataType>; // tuple holding the single D element type
+using EDataType        = F64;
+using ComputeDataType  = F64;
+static constexpr ck::index_t NumDimM = 2; // NumDimM/N/K are the first three template arguments of each alias below
+static constexpr ck::index_t NumDimN = 2;
+static constexpr ck::index_t NumDimK = 2;
+using AElementOp   = ck::tensor_operation::element_wise::PassThrough;
+using BElementOp   = ck::tensor_operation::element_wise::PassThrough;
+using CDEElementOp = ck::tensor_operation::element_wise::Bilinear;
+using DeviceOpInstanceKKNN = DeviceOpInstanceKK_FP64<NumDimM, // KK variant; template presumably declared in common_instances.hpp (confirm)
+                                                     NumDimN,
+                                                     NumDimK,
+                                                     ADataType,
+                                                     BDataType,
+                                                     AccDataType,
+                                                     CShuffleDataType,
+                                                     DsDataType,
+                                                     EDataType,
+                                                     ComputeDataType,
+                                                     AElementOp,
+                                                     BElementOp,
+                                                     CDEElementOp>;
+using DeviceOpInstanceKNNN = DeviceOpInstanceKN_FP64<NumDimM, // KN variant, same argument list as above
+                                                     NumDimN,
+                                                     NumDimK,
+                                                     ADataType,
+                                                     BDataType,
+                                                     AccDataType,
+                                                     CShuffleDataType,
+                                                     DsDataType,
+                                                     EDataType,
+                                                     ComputeDataType,
+                                                     AElementOp,
+                                                     BElementOp,
+                                                     CDEElementOp>;
+using DeviceOpInstanceMKNN = DeviceOpInstanceMK_FP64<NumDimM, // MK variant, same argument list as above
+                                                     NumDimN,
+                                                     NumDimK,
+                                                     ADataType,
+                                                     BDataType,
+                                                     AccDataType,
+                                                     CShuffleDataType,
+                                                     DsDataType,
+                                                     EDataType,
+                                                     ComputeDataType,
+                                                     AElementOp,
+                                                     BElementOp,
+                                                     CDEElementOp>;
+using DeviceOpInstanceMNNN = DeviceOpInstanceMN_FP64<NumDimM, // MN variant, same argument list as above
+                                                     NumDimN,
+                                                     NumDimK,
+                                                     ADataType,
+                                                     BDataType,
+                                                     AccDataType,
+                                                     CShuffleDataType,
+                                                     DsDataType,
+                                                     EDataType,
+                                                     ComputeDataType,
+                                                     AElementOp,
+                                                     BElementOp,
+                                                     CDEElementOp>;
+using DeviceOpInstance = DeviceOpInstanceKKNN; // selects the KKNN alias; the other three aliases are not selected in this file
+#include "run_complex_contraction_bilinear_example.inc" // included after the aliases; presumably consumes them and defines the run function (confirm)
+int main(int argc, char* argv[]) { return run_complex_contraction_bilinear_example(argc, argv); } // the run function's result becomes the process exit code
--- a/example/66_complex_contraction_bilinear/run_complex_contraction_bilinear_example.inc
+++ b/example/66_complex_contraction_bilinear/run_complex_contraction_bilinear_example.inc
--- a/example/CMakeLists.txt
+++ b/example/CMakeLists.txt
@@ -45,11 +45,7 @@ function(add_example_executable EXAMPLE_NAME FILE_NAME)
    endforeach()
    endif()
-    if(INSTANCES_ONLY)
+    set(EX_TARGETS ${SUPPORTED_GPU_TARGETS})
-        set(EX_TARGETS ${DEFAULT_GPU_TARGETS})
-    else()
-        set(EX_TARGETS ${GPU_TARGETS})
-    endif()
    #Do not build any DL examples if DL_KERNELS not set
    foreach(source IN LISTS FILE_NAME)
@@ -147,11 +143,8 @@ function(add_example_executable_no_testing EXAMPLE_NAME FILE_NAME)
    endforeach()
    endif()
-    if(INSTANCES_ONLY)
+    set(EX_TARGETS ${SUPPORTED_GPU_TARGETS})
-        set(EX_TARGETS ${DEFAULT_GPU_TARGETS})
-    else()
-        set(EX_TARGETS ${GPU_TARGETS})
-    endif()
    #Do not build any DL examples if DL_KERNELS not set
    foreach(source IN LISTS FILE_NAME)
        if(NOT DEFINED DL_KERNELS AND source MATCHES "_dl")

--- a/example/ck_tile/01_fmha/README.md
+++ b/example/ck_tile/01_fmha/README.md
@@ -6,7 +6,8 @@ This folder contains example for fmha(fused multi-head attention) using ck_tile
 ```
 # in the root of ck_tile
 mkdir build && cd build
-sh ../script/cmake-ck-dev.sh  ../ <arch>  # you can replace this <arch> to gfx90a, gfx942...
+# you can replace <arch> with the appropriate architecture (for example gfx90a or gfx942) or leave it blank
+sh ../script/cmake-ck-dev.sh  ../ <arch>
 make tile_example_fmha_fwd -j
 ```
 This will result in an executable `build/bin/tile_example_fmha_fwd`
@@ -23,7 +24,7 @@ There are 3 template parameters for this kernel template.
 To speed up compile time, we instantiate the kernels into separate file. In this way we can benefit from parallel building from CMake/Make system. This is achieved by `generate.py` script. Besides, you can look into this script to learn how to instantiate a kernel instance step by step, which is described in `FMHA_FWD_KERNEL_BODY` variable.
 ## executable
-`tile_example_fmha_fwd` is the example executable, implemented in `fmha_fwd.cpp`. You can type `./bin/tile_example_fmha_fwd -?` to list all supported args. Below is an example of the output (may subject to change)
+`tile_example_fmha_fwd` is the example executable, implemented in `fmha_fwd.cpp`. You can type `./bin/tile_example_fmha_fwd -?` to list all the arguments. Below is an example of the output (may be subject to change)
 ```
 args:
          -v    weather do CPU validation or not (default:1)
@@ -31,47 +32,52 @@ args:
          -b    batch size (default:2)
          -h    num of head, for q (default:8)
        -h_k    num of head, for k/v, -1 means equal to h (default:-1)
-                 if not equal to h, then this is GQA/MQA case
+                if not equal to h, then this is GQA/MQA case
          -s    seqlen_q. if group-mode, means the average value of seqlen_q (default:3328)
-                 total_seqlen_q = seqlen_q * batch, and seqlen_q per batch may vary
+                total_seqlen_q = seqlen_q * batch, and seqlen_q per batch may vary
-                 also with "-s=s0,s1,s2..." comma seperated int to set per batch seqlen(group-mode)
+                also with "-s=s0,s1,s2..." comma seperated int to set per batch seqlen(group-mode)
-        -s_k    seqlen_k, -1 means equal to s (default:-1)
+        -s_k    seqlen_k (including new key/value), -1 means equal to s (default:-1)
          -d    head dim for q, k (default:128)
        -d_v    head dim for v, -1 means equal to d (default:-1)
    -scale_s    scale factor of S. 0 means equal to 1/sqrt(hdim). (default:0)
-                 note when squant=1, this value will be modified by range_q/k
+                note when squant=1, this value will be modified by range_q/k
    -range_q    per-tensor quantization range of q. used if squant=1. (default:16)
    -range_k    per-tensor quantization range of k. used if squant=1. (default:16)
    -range_v    per-tensor quantization range of v. used if squant=1. (default:16)
    -range_p    per-tensor quantization range of p [e^(s-m)]. used if squant=1. (default:1)
    -range_o    per-tensor quantization range of o (p*v). used if squant=1. (default:16)
     -squant    if using static quantization fusion or not. auto: fp8 will default use squant, other will not (default:auto)
-                 0: no static quant(not implemented) 1: apply scale_p and scale_o with respect to P and O.
+                0: no static quant(not implemented) 1: apply scale_p and scale_o with respect to P and O.
-                 calculate scale_s, scale_p, scale_o according to range_q, range_k, range_v, range_p, range_o
+                calculate scale_s, scale_p, scale_o according to range_q, range_k, range_v, range_p, range_o
      -iperm    permute input (default:1)
-                 if true, will be b*h*s*d, else b*s*h*d
+                if true, will be b*h*s*d, else b*s*h*d
      -operm    permute output (default:1)
       -bias    n or 0, no bias (default:n)
-                 e(lementwise) or 1, elementwise bias with 1*1*s*s. e:1, 1*h*s*s. e:2, b*h*s*s
+                e(lementwise) or 1, elementwise bias with 1*1*s*s. e:1, 1*h*s*s. e:2, b*h*s*s
-                 a(libi) or 2, alibi with 1*h. a:1, b*h
+                a(libi) or 2, alibi with 1*h. a:1, b*h
       -prec    data type. fp16/bf16/fp8/bf8 (default:fp16)
       -mask    0: no mask, 1: top-left(same as 't'), 2:bottom-right(same as 'b') (default:0)
-                 't', top-left causal mask, 'b', bottom-r causal mask
+                't', top-left causal mask, 'b', bottom-r causal mask
-                 't:l,r', top-left sliding window attn(swa) with FA style left right size
+                't:l,r', top-left sliding window attn(swa) with FA style left right size
-                 'b:l,r', bottom-r sliding window attn(swa) with FA style left right size
+                'b:l,r', bottom-r sliding window attn(swa) with FA style left right size
-                 'xt:window_size', xformer style masking from top-left, window_size negative is causal, positive is swa
+                'xt:window_size', xformer style masking from top-left, window_size negative is causal, positive is swa
-                 'xb:window_size', xformer style masking from bottom-r, window_size negative is causal, positive is swa
+                'xb:window_size', xformer style masking from bottom-r, window_size negative is causal, positive is swa
-                 'g:y,x', generic attention mask coordinate with y/x size (only debug purpose for now)
+                'g:y,x', generic attention mask coordinate with y/x size (only debug purpose for now)
    -vlayout    r for row-major(seqlen*hdim), c for col-major(hdim*seqlen) (default:r)
        -lse    0 not store lse, 1 store lse (default:0)
      -kname    if set to 1 will print kernel name (default:0)
       -init    init method. ui, uniform random int, ni, normalized random int (default:uf)
-                 uf, uniform random float, nf, normalized random float, tf, trig float, uf:q, quantization
+                uf, uniform random float, nf, normalized random float, tf, trig float, uf:q, quantization
       -seed    random seed used for initializing input tensors. 0 for non-deterministic seed (default:11939)
+  -drop_seed    seed for random number generator (default:1)
+-drop_offset    offset for random number generator (default:0)
+ -drop_prefs    seed and offset values are present on GPU; 0 - host, 1 - device/GPU (default:0)
     -warmup    number of iterations before benchmark the kernel (default:5)
     -repeat    number of iterations to benchmark the kernel (default:20)
 ```
-Example: `./bin/tile_example_fmha_fwd -b=1 -h=16 -s=16384 -d=128` will run a fmha case with batch=1, nhead=16, sequence length=16384, hdim=128, fp16 case.
+Example 1: `./bin/tile_example_fmha_fwd -b=1 -h=16 -s=16384 -d=128` will run a fmha case with batch=1, nhead=16, sequence length=16384, hdim=128, fp16 case.
+Example 2: `./bin/tile_example_fmha_fwd -b=1 -h=8 -s=16384 -d=64 -drop_prefs=1 -drop_seed=10 -drop_offset=1234` will run a fmha case with
+  batch=1, nhead=8, sequence length=16384, hdim=64, drop_seed=10 (in GPU memory), drop_offset=1234 (in GPU memory), fp16 case.
 ## support features
 Currently we are still in rapid development stage, so more features/optimizations will be coming soon.