Narrowing the scope of PR to OCP FP8 enablement only

3289a5c9 · Andriy Roshchenko · dbfb222d · 3289a5c9 · 3289a5c9 · 3289a5c9
Commit 3289a5c9 authored Nov 21, 2024 by Andriy Roshchenko
20 changed files
--- a/example/01_gemm/gemm_xdl_int8.cpp
+++ b/example/01_gemm/gemm_xdl_int8.cpp
@@ -46,4 +46,4 @@ using ReferenceGemmInstanceGPU = ck::tensor_operation::device::ReferenceGemm<ALa
 #include "run_gemm_example.inc"
-int main(int argc, char* argv[]) { return (run_gemm_example(argc, argv) ? 0 : -1); }
+int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); }
--- a/example/01_gemm/gemm_xdl_lds_direct_load_fp16.cpp
+++ b/example/01_gemm/gemm_xdl_lds_direct_load_fp16.cpp
@@ -66,4 +66,4 @@ using ReferenceGemmInstanceGPU = ck::tensor_operation::device::ReferenceGemm<ALa
 #include "run_gemm_example.inc"
-int main(int argc, char* argv[]) { return (run_gemm_example(argc, argv) ? 0 : -1); }
+int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); }
--- a/example/01_gemm/gemm_xdl_lds_direct_load_fp32.cpp
+++ b/example/01_gemm/gemm_xdl_lds_direct_load_fp32.cpp
@@ -65,4 +65,4 @@ using ReferenceGemmInstanceGPU = ck::tensor_operation::device::ReferenceGemm<ALa
 #include "run_gemm_example.inc"
-int main(int argc, char* argv[]) { return (run_gemm_example(argc, argv) ? 0 : -1); }
+int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); }
--- a/example/01_gemm/gemm_xdl_streamk.cpp
+++ b/example/01_gemm/gemm_xdl_streamk.cpp
@@ -57,4 +57,4 @@ using ReferenceGemmInstanceGPU = ck::tensor_operation::device::ReferenceGemm<ALa
 #include "run_gemm_example.inc"
-int main(int argc, char* argv[]) { return (run_gemm_streamk_example(argc, argv) ? 0 : -1); }
+int main(int argc, char* argv[]) { return !run_gemm_streamk_example(argc, argv); }
--- a/example/01_gemm/gemm_xdl_wavelet_fp16.cpp
+++ b/example/01_gemm/gemm_xdl_wavelet_fp16.cpp
@@ -50,4 +50,4 @@ using ReferenceGemmInstanceGPU = ck::tensor_operation::device::ReferenceGemm<ALa
 #include "run_gemm_example.inc"
-int main(int argc, char* argv[]) { return (run_gemm_example(argc, argv) ? 0 : -1); }
+int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); }
--- a/example/01_gemm/run_gemm_example.inc
+++ b/example/01_gemm/run_gemm_example.inc
@@ -166,14 +166,6 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
        ck::utils::FillUniformDistributionIntegerValue<ADataType>{-2.f, 2.f}(a_m_k);
        ck::utils::FillUniformDistributionIntegerValue<BDataType>{-2.f, 2.f}(b_k_n);
        break;
-    case 6:
-        a_m_k.GenerateTensorValue(GeneratorTensor_PI<ADataType>{});
-        b_k_n.GenerateTensorValue(GeneratorTensor_1<BDataType>{1});
-        break;
-    case 7:
-        a_m_k.GenerateTensorValue(GeneratorTensor_PI_A<ADataType>{});
-        b_k_n.GenerateTensorValue(GeneratorTensor_PI_B<BDataType>{});
-        break;
    default:
        ck::utils::FillUniformDistribution<ADataType>{-0.1f, 0.1f}(a_m_k);
        ck::utils::FillUniformDistribution<BDataType>{-0.1f, 0.1f}(b_k_n);
@@ -256,7 +248,7 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
        {
            std::cerr << gemm.GetTypeString() << " does not support this problem" << std::endl;
-            return false;
+            return true;
        }
        ave_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel});
@@ -289,7 +281,7 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
        {
            std::cerr << gemm.GetTypeString() << " does not support this problem" << std::endl;
-            return false;
+            return true;
        }
        std::size_t workspace_size = gemm.GetWorkSpaceSize(&argument);
@@ -322,26 +314,19 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
        std::cerr << gemm.GetTypeString() << ": the instance does not support the problem config."
                  << std::endl;
-        return false;
+        return true;
    }
-    if(config.time_kernel)
+    std::size_t flop = 2_uz * M * N * K;
-    {
+    std::size_t num_btype =
-        std::size_t flop = 2_uz * M * N * K;
+        sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(CDataType) * M * N;
-        std::size_t num_btype =
-            sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(CDataType) * M * N;
-        float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+    float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
-        float gb_per_sec = num_btype / 1.E6 / ave_time;
+    float gb_per_sec = num_btype / 1.E6 / ave_time;
-        std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec
+    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
-                  << " GB/s, " << gemm.GetTypeString() << std::endl;
+              << gemm.GetTypeString() << std::endl;
-    }
-    else
-    {
-        std::cout << "FINISHED: " << gemm.GetTypeString() << std::endl;
-    }
    bool pass = true;
@@ -368,29 +353,12 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
 #else
        c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data());
-        pass = ck::utils::check_err(c_m_n_device_result,
+        pass &= ck::utils::check_err(c_m_n_device_result,
-                                    c_m_n_host_result,
+                                     c_m_n_host_result,
-                                    "Error: Incorrect results!",
+                                     "Error: Incorrect results!",
-                                    get_rtol<CDataType>(),
+                                     get_rtol<CDataType>(),
-                                    get_atol<CDataType>());
+                                     get_atol<CDataType>());
 #endif
-        if(pass)
-            std::cout << "Verification on CPU: PASS" << std::endl;
-        if(config.init_method == 6 || config.init_method == 7)
-        {
-            std::cout << std::fixed << std::setprecision(16);
-            AccDataType d = ck::type_convert<AccDataType>(c_m_n_device_result(0, 10));
-            AccDataType h = ck::type_convert<AccDataType>(c_m_n_host_result(10, 0));
-            std::cout << "device result: " << d << std::endl;
-            std::cout << "host result: " << h << std::endl;
-            std::cout << "expected result: " << M_PI << std::endl;
-            std::cout << "device - host: " << std::abs(d - h) << std::endl;
-            std::cout << "device - expected: " << std::abs(d - M_PI) << std::endl;
-            std::cout << "atol: " << get_atol<CDataType>() << std::endl;
-            std::cout << std::endl << std::endl;
-        }
    }
    if((config.do_verification == 2) || (config.do_verification == 3))
@@ -416,18 +384,14 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
        c_m_n_device_ref_buf.FromDevice(c_m_n_device_ref_result.mData.data());
        c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data());
-        auto gpu_pass = ck::utils::check_err(c_m_n_device_result,
+        pass &= ck::utils::check_err(c_m_n_device_result,
-                                             c_m_n_device_ref_result,
+                                     c_m_n_device_ref_result,
-                                             "Error: Incorrect results!",
+                                     "Error: Incorrect results!",
-                                             get_rtol<CDataType>(),
+                                     get_rtol<CDataType>(),
-                                             get_atol<CDataType>());
+                                     get_atol<CDataType>());
-        if(gpu_pass)
-            std::cout << "Verification on GPU: PASS" << std::endl;
-        pass = pass && gpu_pass;
    }
-    return pass;
+    return pass == true;
 }
 bool run_gemm_example(int argc, char* argv[])
@@ -435,7 +399,7 @@ bool run_gemm_example(int argc, char* argv[])
    ProblemSize problem_size;
    ExecutionConfig config;
-    return parse_cmd_args(argc, argv, problem_size, config) && run_gemm(problem_size, config);
+    return !parse_cmd_args(argc, argv, problem_size, config) || run_gemm(problem_size, config);
 }
 bool run_gemm_streamk_example(int argc, char* argv[])
@@ -443,5 +407,5 @@ bool run_gemm_streamk_example(int argc, char* argv[])
    ProblemSizeStreamK problem_size;
    ExecutionConfig config;
-    return parse_cmd_args(argc, argv, problem_size, config) && run_gemm(problem_size, config);
+    return !parse_cmd_args(argc, argv, problem_size, config) || run_gemm(problem_size, config);
 }
--- a/example/01_gemm/run_gemm_example_streamk_v2.inc
+++ b/example/01_gemm/run_gemm_example_streamk_v2.inc
@@ -162,12 +162,12 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
        b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-2, 2});
        break;
    case 2:
-        a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
+        a_m_k.GenerateTensorValue(GeneratorTensor_1<ADataType>{1});
-        b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-2, 2});
+        b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-2, 2});
        break;
    case 3:
-        a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{-2, 2});
+        a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-2, 2});
-        b_k_n.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
+        b_k_n.GenerateTensorValue(GeneratorTensor_1<BDataType>{1});
        break;
    default:
        a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
@@ -237,13 +237,12 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
    {
        std::cerr << gemm.GetTypeString() << " does not support this problem" << std::endl;
-        return false;
+        return true;
    }
    bool pass = true;
    if((config.do_verification == 1) || (config.do_verification == 3))
    {
-        std::cout << "Compute reference GEMM on CPU...  ";
        auto ref_gemm    = ReferenceGemmInstance{};
        auto ref_invoker = ref_gemm.MakeInvoker();
@@ -251,11 +250,8 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
            a_m_k, b_k_n, c_m_n_host_result, PassThrough{}, PassThrough{}, PassThrough{});
        ref_invoker.Run(ref_argument);
-        std::cout << "DONE!" << std::endl;
-        std::cout << "Compute GEMM on device...  \n";
        ave_time = invoker.Run(argument, StreamConfig{nullptr, false, 1});
-        std::cout << "DONE!" << std::endl;
 #ifdef BUILD_INT4_EXAMPLE
        Tensor<CDataType> c_m_n_device_result_converted(c_m_n_host_result.mDesc);
@@ -267,19 +263,16 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
 #else
        c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data());
-        pass = ck::utils::check_err(c_m_n_device_result,
+        pass &= ck::utils::check_err(c_m_n_device_result,
-                                    c_m_n_host_result,
+                                     c_m_n_host_result,
-                                    "Error: Incorrect results!",
+                                     "Error: Incorrect results!",
-                                    get_rtol<CDataType>(),
+                                     get_rtol<CDataType>(),
-                                    get_atol<CDataType>());
+                                     get_atol<CDataType>());
-        if(pass)
-            std::cout << "Verification on CPU: PASS" << std::endl;
 #endif
    }
    if(config.time_kernel)
    {
-        std::cout << "Time GEMM on device...  \n";
        ave_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel});
        std::size_t flop = 2_uz * M * N * K;
@@ -301,5 +294,5 @@ bool run_gemm_universal_streamk_example(int argc, char* argv[])
    ProblemSizeStreamK_universal problem_size;
    ExecutionConfig config;
-    return parse_cmd_args(argc, argv, problem_size, config) && run_gemm(problem_size, config);
+    return !parse_cmd_args(argc, argv, problem_size, config) || run_gemm(problem_size, config);
 }
--- a/example/01_gemm/run_gemm_example_v2.inc
+++ b/example/01_gemm/run_gemm_example_v2.inc
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
@@ -146,11 +146,19 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
        b_k_n.GenerateTensorValue(GeneratorTensor_1<BDataType>{1});
        break;
    case 1:
-        a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{-2, 2});
+        a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-2, 2});
-        b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-3, 3});
+        b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-2, 2});
+        break;
+    case 2:
+        a_m_k.GenerateTensorValue(GeneratorTensor_1<ADataType>{1});
+        b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-2, 2});
+        break;
+    case 3:
+        a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-2, 2});
+        b_k_n.GenerateTensorValue(GeneratorTensor_1<BDataType>{1});
        break;
    default:
-        a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{-1.0, 1.0});
+        a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
        b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
    }
@@ -216,13 +224,12 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
    {
        std::cerr << gemm.GetTypeString() << " does not support this problem" << std::endl;
-        return false;
+        return true;
    }
    bool pass = true;
    if((config.do_verification == 1) || (config.do_verification == 3))
    {
-        std::cout << "Compute reference GEMM on CPU...  ";
        auto ref_gemm    = ReferenceGemmInstance{};
        auto ref_invoker = ref_gemm.MakeInvoker();
@@ -230,11 +237,8 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
            a_m_k, b_k_n, c_m_n_host_result, PassThrough{}, PassThrough{}, PassThrough{});
        ref_invoker.Run(ref_argument);
-        std::cout << "DONE!" << std::endl;
-        std::cout << "Compute GEMM on device...  \n";
        ave_time = invoker.Run(argument, StreamConfig{nullptr, false, 1});
-        std::cout << "DONE!" << std::endl;
 #ifdef BUILD_INT4_EXAMPLE
        Tensor<CDataType> c_m_n_device_result_converted(c_m_n_host_result.mDesc);
@@ -246,19 +250,16 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
 #else
        c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data());
-        pass = ck::utils::check_err(c_m_n_device_result,
+        pass &= ck::utils::check_err(c_m_n_device_result,
-                                    c_m_n_host_result,
+                                     c_m_n_host_result,
-                                    "Error: Incorrect results!",
+                                     "Error: Incorrect results!",
-                                    get_rtol<CDataType>(),
+                                     get_rtol<CDataType>(),
-                                    get_atol<CDataType>());
+                                     get_atol<CDataType>());
-        if(pass)
-            std::cout << "Verification on CPU: PASS" << std::endl;
 #endif
    }
    if(config.time_kernel)
    {
-        std::cout << "Time GEMM on device...  \n";
        ave_time =
            invoker.Run(argument, StreamConfig{nullptr, config.time_kernel, 0, 5, 10, true, 4});
@@ -281,5 +282,5 @@ bool run_gemm_splitk_example(int argc, char* argv[])
    ProblemSizeSplitK problem_size;
    ExecutionConfig config;
-    return parse_cmd_args(argc, argv, problem_size, config) && run_gemm(problem_size, config);
+    return !parse_cmd_args(argc, argv, problem_size, config) || run_gemm(problem_size, config);
 }
--- a/example/04_gemm_add_add_fastgelu/common.hpp
+++ b/example/04_gemm_add_add_fastgelu/common.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
@@ -57,7 +57,7 @@ struct ProblemSize final
 struct ExecutionConfig final
 {
    bool do_verification = true;
-    int init_method      = 2;
+    int init_method      = 1;
    bool time_kernel     = false;
 };

--- a/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_bf16.cpp
+++ b/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_bf16.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 #include "common.hpp"
@@ -7,7 +7,7 @@ using ADataType        = BF16;
 using BDataType        = BF16;
 using AccDataType      = F32;
 using CShuffleDataType = F32;
-using CDataType  = F32; // C matrix doesn't exist in GPU memory, this is used for host verification
+using CDataType  = F32; // C matrix doesn't exsit in GPU memory, this is used for host verification
 using D0DataType = BF16;
 using D1DataType = BF16;
 using DsDataType = ck::Tuple<D0DataType, D1DataType>;

--- a/example/04_gemm_add_add_fastgelu/run_gemm_add_add_fastgelu_example.inc
+++ b/example/04_gemm_add_add_fastgelu/run_gemm_add_add_fastgelu_example.inc
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
-template <typename DataType>
-inline __host__ __device__ constexpr double get_rtol()
-{
-    if constexpr(std::is_same_v<DataType, float>)
-    {
-        return 1e-3;
-    }
-    else if constexpr(std::is_same_v<DataType, double>)
-    {
-        return 1e-6;
-    }
-    else if constexpr(std::is_same_v<DataType, ck::half_t>)
-    {
-        return 1e-3;
-    }
-    else if constexpr(std::is_same_v<DataType, ck::bhalf_t>)
-    {
-        return 5e-2;
-    }
-    else if constexpr(std::is_same_v<DataType, int32_t>)
-    {
-        return 1e-1;
-    }
-    else if constexpr(std::is_same_v<DataType, int8_t>)
-    {
-        return 1e-1;
-    }
-    else if constexpr(std::is_same_v<DataType, ck::f8_t>)
-    {
-        return 2e-1;
-    }
-    else if constexpr(std::is_same_v<DataType, ck::bf8_t>)
-    {
-        return 2e-1;
-    }
-    else
-    {
-        return 1e-3;
-    }
-}
-template <typename DataType>
-inline __host__ __device__ constexpr double get_atol()
-{
-    if constexpr(std::is_same_v<DataType, float>)
-    {
-        return 1e-3;
-    }
-    else if constexpr(std::is_same_v<DataType, double>)
-    {
-        return 1e-6;
-    }
-    else if constexpr(std::is_same_v<DataType, ck::half_t>)
-    {
-        return 1e-3;
-    }
-    else if constexpr(std::is_same_v<DataType, ck::bhalf_t>)
-    {
-        return 5e-2;
-    }
-    else if constexpr(std::is_same_v<DataType, int32_t>)
-    {
-        return 1e-1;
-    }
-    else if constexpr(std::is_same_v<DataType, int8_t>)
-    {
-        return 1e-1;
-    }
-    else if constexpr(std::is_same_v<DataType, ck::f8_t>)
-    {
-        return 2e-1;
-    }
-    else if constexpr(std::is_same_v<DataType, ck::bf8_t>)
-    {
-        return 2e-1;
-    }
-    else
-    {
-        return 1e-3;
-    }
-}
 bool run_gemm_add_add_fastgelu(const ProblemSize& problem_size, const ExecutionConfig& config)
 {
 #if defined(BUILD_INT4_EXAMPLE) && defined(CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4)
@@ -235,11 +150,7 @@ bool run_gemm_add_add_fastgelu(const ProblemSize& problem_size, const ExecutionC
        return ck::utils::check_err(e_m_n_device_result_converted, e_m_n_host_result);
 #else
-        return ck::utils::check_err(e_m_n_device_result,
+        return ck::utils::check_err(e_m_n_device_result, e_m_n_host_result);
-                                    e_m_n_host_result,
-                                    "Error: Incorrect results!",
-                                    get_rtol<EDataType>(),
-                                    get_atol<EDataType>());
 #endif
    }

--- a/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_fp16.cpp
+++ b/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_fp16.cpp
@@ -157,8 +157,8 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
            b_tensors[i].GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
            break;
        default:
-            a_tensors[i].GenerateTensorValue(GeneratorTensor_1<ADataType>{1.0});
+            a_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<ADataType, 0>{});
-            b_tensors[i].GenerateTensorValue(GeneratorTensor_1<BDataType>{1.0});
+            b_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<BDataType, 1>{});
        }
    }

--- a/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_fp16_fp8.cpp
+++ b/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_fp16_fp8.cpp
@@ -75,7 +75,7 @@ struct ProblemSize final
 struct ExecutionConfig final
 {
    bool do_verification = true;
-    int init_method      = 2;
+    int init_method      = 1;
    int k_batch          = 1;
    bool time_kernel     = false;
 };
@@ -154,12 +154,12 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
            b_tensors[i].GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
            break;
        case 2:
-            a_tensors[i].GenerateTensorValue(GeneratorTensor_3<ADataType>{-1.0, 1.0});
+            a_tensors[i].GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
            b_tensors[i].GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
            break;
        default:
-            a_tensors[i].GenerateTensorValue(GeneratorTensor_1<ADataType>{1.0});
+            a_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<ADataType, 0>{});
-            b_tensors[i].GenerateTensorValue(GeneratorTensor_1<BDataType>{1.0});
+            b_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<BDataType, 1>{});
        }
    }
@@ -266,7 +266,6 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
                                                                                BElementOp,
                                                                                CDEElementOp>;
-        std::cout << "Running verification on CPU." << std::endl;
        for(std::size_t i = 0; i < gemm_descs.size(); i++)
        {
            c_tensors_device[i]->FromDevice(c_device_tensors[i].mData.data(),
@@ -286,9 +285,6 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
            pass &= ck::utils::check_err(c_device_tensors[i], c_host_tensors[i]);
        }
-        if(pass)
-            std::cout << "Verification on CPU: PASS" << std::endl;
    }
    return pass;

--- a/example/15_grouped_gemm/run_grouped_gemm_example.inc
+++ b/example/15_grouped_gemm/run_grouped_gemm_example.inc
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
@@ -123,12 +123,12 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
            b_tensors[i].GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
            break;
        case 2:
-            a_tensors[i].GenerateTensorValue(GeneratorTensor_3<ADataType>{-1.0, 1.0});
+            a_tensors[i].GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
            b_tensors[i].GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
            break;
        default:
-            a_tensors[i].GenerateTensorValue(GeneratorTensor_1<ADataType>{1});
+            a_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<ADataType, 0>{});
-            b_tensors[i].GenerateTensorValue(GeneratorTensor_1<BDataType>{1});
+            b_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<BDataType, 1>{});
        }
    }
@@ -187,7 +187,6 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
    bool pass = true;
    if(config.do_verification)
    {
-        std::cout << "Running verification on CPU." << std::endl;
        using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
                                                                                BDataType,
                                                                                EDataType,
@@ -219,8 +218,6 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
            pass &= ck::utils::check_err(c_device_tensors[i], c_host_tensors[i]);
 #endif
        }
-        if(pass)
-            std::cout << "Verification on CPU: PASS" << std::endl;
    }
    if(config.time_kernel)

--- a/example/20_grouped_conv_bwd_weight/common.hpp
+++ b/example/20_grouped_conv_bwd_weight/common.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
@@ -72,7 +72,7 @@ using OutputLayout = typename CommonLayoutSettingSelector<NDimSpatial>::OutputLa
 struct ExecutionConfig final
 {
    bool do_verification = true;
-    int init_method      = 2;
+    int init_method      = 1;
    bool time_kernel     = false;
 };

--- a/example/20_grouped_conv_bwd_weight/run_grouped_conv_bwd_weight_example.inc
+++ b/example/20_grouped_conv_bwd_weight/run_grouped_conv_bwd_weight_example.inc
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 template <ck::index_t NDimSpatial>
 bool run_grouped_conv_bwd_weight(const ExecutionConfig& config,
@@ -37,8 +37,8 @@ bool run_grouped_conv_bwd_weight(const ExecutionConfig& config,
        out.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-5, 5});
        break;
    default:
-        in.GenerateTensorValue(GeneratorTensor_3<InDataType>{0.0, 1.0});
+        in.GenerateTensorValue(GeneratorTensor_3<InDataType>{0.0, 0.2});
-        out.GenerateTensorValue(GeneratorTensor_3<OutDataType>{-1, 1});
+        out.GenerateTensorValue(GeneratorTensor_3<OutDataType>{-0.1, 0.1});
    }
    DeviceMem in_device_buf(sizeof(InDataType) * in.mDesc.GetElementSpaceSize());
@@ -128,12 +128,7 @@ bool run_grouped_conv_bwd_weight(const ExecutionConfig& config,
        wei_device_buf.FromDevice(wei_device_result.mData.data());
-        return ck::utils::check_err(
+        return ck::utils::check_err(wei_device_result.mData, wei_host_result.mData);
-            wei_device_result.mData,
-            wei_host_result.mData,
-            "Error: Incorrect results!",
-            1e-3,
-            1e-3); // the errors must be consistent with the less precise type of In/Out DataTypes
    }
    float avg_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel});

--- a/example/21_gemm_layernorm/gemm_xdl_layernorm_naive_single_kernel_fp16.cpp
+++ b/example/21_gemm_layernorm/gemm_xdl_layernorm_naive_single_kernel_fp16.cpp
@@ -175,8 +175,8 @@ int main(int argc, char* argv[])
        b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
        break;
    default:
-        a_m_k.GenerateTensorValue(GeneratorTensor_1<ADataType>{1});
+        a_m_k.GenerateTensorValue(GeneratorTensor_Sequential<ADataType, 0>{});
-        b_k_n.GenerateTensorValue(GeneratorTensor_1<BDataType>{1});
+        b_k_n.GenerateTensorValue(GeneratorTensor_Sequential<BDataType, 1>{});
    }
    c0_n_bias.GenerateTensorValue(GeneratorTensor_2<C0DataType>{-5, 5});

--- a/example/35_splitK_gemm/run_splitK_gemm_example.inc
+++ b/example/35_splitK_gemm/run_splitK_gemm_example.inc
@@ -3,88 +3,6 @@
 #pragma once
-template <typename DataType>
-inline __host__ __device__ constexpr double get_rtol()
-{
-    if constexpr(std::is_same_v<DataType, float>)
-    {
-        return 1e-3;
-    }
-    else if constexpr(std::is_same_v<DataType, double>)
-    {
-        return 1e-6;
-    }
-    else if constexpr(std::is_same_v<DataType, ck::half_t>)
-    {
-        return 1e-3;
-    }
-    else if constexpr(std::is_same_v<DataType, ck::bhalf_t>)
-    {
-        return 5e-2;
-    }
-    else if constexpr(std::is_same_v<DataType, int32_t>)
-    {
-        return 1e-1;
-    }
-    else if constexpr(std::is_same_v<DataType, int8_t>)
-    {
-        return 1e-1;
-    }
-    else if constexpr(std::is_same_v<DataType, ck::f8_t>)
-    {
-        return 2e-1;
-    }
-    else if constexpr(std::is_same_v<DataType, ck::bf8_t>)
-    {
-        return 2e-1;
-    }
-    else
-    {
-        return 1e-3;
-    }
-}
-template <typename DataType>
-inline __host__ __device__ constexpr double get_atol()
-{
-    if constexpr(std::is_same_v<DataType, float>)
-    {
-        return 1e-3;
-    }
-    else if constexpr(std::is_same_v<DataType, double>)
-    {
-        return 1e-6;
-    }
-    else if constexpr(std::is_same_v<DataType, ck::half_t>)
-    {
-        return 1e-3;
-    }
-    else if constexpr(std::is_same_v<DataType, ck::bhalf_t>)
-    {
-        return 5e-2;
-    }
-    else if constexpr(std::is_same_v<DataType, int32_t>)
-    {
-        return 1e-1;
-    }
-    else if constexpr(std::is_same_v<DataType, int8_t>)
-    {
-        return 1e-1;
-    }
-    else if constexpr(std::is_same_v<DataType, ck::f8_t>)
-    {
-        return 2e-1;
-    }
-    else if constexpr(std::is_same_v<DataType, ck::bf8_t>)
-    {
-        return 2e-1;
-    }
-    else
-    {
-        return 1e-3;
-    }
-}
 struct ProblemSize final
 {
    ck::index_t M = 3840;
@@ -100,10 +18,9 @@ struct ProblemSize final
 struct ExecutionConfig final
 {
-    // 0 - no verification, 1 - CPU, 2 - GPU, 3 - CPU + GPU
+    bool do_verification = true;
-    int do_verification = 1;
+    int init_method      = 1;
-    int init_method     = 7;
+    bool time_kernel     = false;
-    bool time_kernel    = false;
 };
 bool run_splitK_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
@@ -151,17 +68,9 @@ bool run_splitK_gemm(const ProblemSize& problem_size, const ExecutionConfig& con
        a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
        b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
        break;
-    case 6:
-        a_m_k.GenerateTensorValue(GeneratorTensor_PI<ADataType>{});
-        b_k_n.GenerateTensorValue(GeneratorTensor_1<BDataType>{1});
-        break;
-    case 7:
-        a_m_k.GenerateTensorValue(GeneratorTensor_PI_A<ADataType>{});
-        b_k_n.GenerateTensorValue(GeneratorTensor_PI_B<BDataType>{});
-        break;
    default:
-        a_m_k.GenerateTensorValue(GeneratorTensor_1<ADataType>{1});
+        a_m_k.GenerateTensorValue(GeneratorTensor_Sequential<ADataType, 0>{});
-        b_k_n.GenerateTensorValue(GeneratorTensor_1<BDataType>{1});
+        b_k_n.GenerateTensorValue(GeneratorTensor_Sequential<BDataType, 1>{});
    }
    DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
@@ -217,7 +126,7 @@ bool run_splitK_gemm(const ProblemSize& problem_size, const ExecutionConfig& con
    invoker.Run(argument, StreamConfig{nullptr, false});
    bool pass = true;
-    if((config.do_verification == 1) || (config.do_verification == 3))
+    if(config.do_verification)
    {
        c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data());
        using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
@@ -236,7 +145,6 @@ bool run_splitK_gemm(const ProblemSize& problem_size, const ExecutionConfig& con
        auto ref_argument = ref_gemm.MakeArgument(
            a_m_k, b_k_n, c_m_n_host_result, a_element_op, b_element_op, c_element_op);
-        std::cout << "Running verification on CPU." << std::endl;
        ref_invoker.Run(ref_argument);
        if(std::is_same<CDataType, ck::half_t>::value)
@@ -246,82 +154,10 @@ bool run_splitK_gemm(const ProblemSize& problem_size, const ExecutionConfig& con
        }
        else
        {
-            pass &= ck::utils::check_err(c_m_n_device_result,
+            pass &= ck::utils::check_err(c_m_n_device_result, c_m_n_host_result);
-                                         c_m_n_host_result,
-                                         "Error: Incorrect results!",
-                                         get_rtol<CDataType>(),
-                                         get_atol<CDataType>());
-        }
-        if(pass)
-            std::cout << "Verification on CPU: PASS" << std::endl;
-        if(config.init_method == 6 || config.init_method == 7)
-        {
-            std::cout << std::fixed << std::setprecision(16);
-            AccDataType d = ck::type_convert<AccDataType>(c_m_n_device_result(0, 10));
-            AccDataType h = ck::type_convert<AccDataType>(c_m_n_host_result(10, 0));
-            std::cout << "device result: " << d << std::endl;
-            std::cout << "host result: " << h << std::endl;
-            std::cout << "expected result: " << M_PI << std::endl;
-            std::cout << "device - host: " << std::abs(d - h) << std::endl;
-            std::cout << "device - expected: " << std::abs(d - M_PI) << std::endl;
-            std::cout << "atol: " << get_atol<CDataType>() << std::endl;
-            std::cout << std::endl << std::endl;
        }
    }
-    if((config.do_verification == 2) || (config.do_verification == 3))
-    {
-        Tensor<CDataType> c_m_n_device_ref_result(
-            f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
-        DeviceMem c_m_n_device_ref_buf(sizeof(CDataType) *
-                                       c_m_n_device_ref_result.mDesc.GetElementSpaceSize());
-        // GPU verification
-        using ReferenceComputeType = float;
-        using ReferenceGemmInstanceGPU =
-            ck::tensor_operation::device::ReferenceGemm<ALayout,
-                                                        BLayout,
-                                                        CLayout,
-                                                        ADataType,
-                                                        BDataType,
-                                                        CDataType,
-                                                        AccDataType,
-                                                        AElementOp,
-                                                        BElementOp,
-                                                        CElementOp,
-                                                        ReferenceComputeType,
-                                                        ReferenceComputeType>;
-        auto ref_gemm_gpu    = ReferenceGemmInstanceGPU{};
-        auto ref_invoker_gpu = ref_gemm_gpu.MakeInvoker();
-        auto ref_argument_gpu = ref_gemm_gpu.MakeArgument(
-            static_cast<ADataType*>(a_m_k_device_buf.GetDeviceBuffer()),
-            static_cast<BDataType*>(b_k_n_device_buf.GetDeviceBuffer()),
-            static_cast<CDataType*>(c_m_n_device_ref_buf.GetDeviceBuffer()),
-            M,
-            N,
-            K,
-            a_element_op,
-            b_element_op,
-            c_element_op);
-        std::cout << "Running verification on GPU." << std::endl;
-        ref_invoker_gpu.Run(ref_argument_gpu, StreamConfig{});
-        c_m_n_device_ref_buf.FromDevice(c_m_n_device_ref_result.mData.data());
-        c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data());
-        auto gpu_pass = ck::utils::check_err(c_m_n_device_result,
-                                             c_m_n_device_ref_result,
-                                             "Error: Incorrect results!",
-                                             get_rtol<CDataType>(),
-                                             get_atol<CDataType>());
-        if(gpu_pass)
-            std::cout << "Verification on GPU: PASS" << std::endl;
-        pass &= gpu_pass;
-    }
    if(config.time_kernel)
    {
        float ave_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel, 1});
@@ -372,7 +208,7 @@ bool run_splitK_gemm_example(int argc, char* argv[])
    }
    else
    {
-        printf("arg1: verification (0=no, 1=CPU, 2=GPU, 3=CPU and GPU)\n");
+        printf("arg1: verification (0=no, 1=yes)\n");
        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
        printf("arg3: time kernel (0=no, 1=yes)\n");
        printf("arg4: KBatch\n");

--- a/example/35_splitK_gemm/splitK_gemm_xdl_bf16.cpp
+++ b/example/35_splitK_gemm/splitK_gemm_xdl_bf16.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 #include <iostream>
 #include <numeric>
@@ -16,7 +16,6 @@
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
-#include "ck/library/reference_tensor_operation/gpu/reference_gemm.hpp"
 #include "ck/library/utility/literals.hpp"
 template <ck::index_t... Is>

--- a/example/35_splitK_gemm/splitK_gemm_xdl_fp16.cpp
+++ b/example/35_splitK_gemm/splitK_gemm_xdl_fp16.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 #include <iostream>
 #include <numeric>
@@ -16,7 +16,6 @@
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
-#include "ck/library/reference_tensor_operation/gpu/reference_gemm.hpp"
 #include "ck/library/utility/literals.hpp"
 template <ck::index_t... Is>