Optionaly run either CPU or GPU verifications with GEMM examples.

24771ab7 · Andriy Roshchenko · 02958ba5 · 24771ab7 · 24771ab7 · 24771ab7
Commit 24771ab7 authored Oct 24, 2024 by Andriy Roshchenko
5 changed files
--- a/example/01_gemm/common.hpp
+++ b/example/01_gemm/common.hpp
@@ -75,7 +75,8 @@ struct ProblemSizeSplitK final
 struct ExecutionConfig final
 {
-    bool do_verification = true;
+    // 0 - no verification, 1 - CPU, 2 - GPU, 3 - CPU + GPU
+    int do_verification = 3;
    int init_method     = 2;
    bool time_kernel    = false;
 };
@@ -126,7 +127,7 @@ bool parse_cmd_args<ProblemSize>(int argc,
    }
    else
    {
-        std::cerr << "arg1: verification (0=no, 1=CPU and GPU)" << std::endl
+        std::cerr << "arg1: verification (0=no, 1=CPU, 2=GPU, 3=CPU and GPU)" << std::endl
                  << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)"
                  << std::endl
                  << "arg3: time kernel (0=no, 1=yes)" << std::endl
@@ -176,7 +177,7 @@ bool parse_cmd_args<ProblemSizeStreamK_universal>(int argc,
    else
    {
        std::cerr
-            << "arg1: verification (0=no, 1=CPU and GPU)" << std::endl
+            << "arg1: verification (0=no, 1=CPU, 2=GPU, 3=CPU and GPU)" << std::endl
            << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)" << std::endl
            << "arg3: time kernel (0=no, 1=yes)" << std::endl
            << "arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC" << std::endl
@@ -225,7 +226,7 @@ bool parse_cmd_args<ProblemSizeStreamK>(int argc,
    }
    else
    {
-        std::cerr << "arg1: verification (0=no, 1=CPU and GPU)" << std::endl
+        std::cerr << "arg1: verification (0=no, 1=CPU, 2=GPU, 3=CPU and GPU)" << std::endl
                  << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)"
                  << std::endl
                  << "arg3: time kernel (0=no, 1=yes)" << std::endl
@@ -275,7 +276,7 @@ bool parse_cmd_args<ProblemSizeSplitK>(int argc,
    }
    else
    {
-        std::cerr << "arg1: verification (0=no, 1=CPU and GPU)" << std::endl
+        std::cerr << "arg1: verification (0=no, 1=CPU, 2=GPU, 3=CPU and GPU)" << std::endl
                  << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)"
                  << std::endl
                  << "arg3: time kernel (0=no, 1=yes)" << std::endl

--- a/example/01_gemm/run_gemm_example.inc
+++ b/example/01_gemm/run_gemm_example.inc
@@ -337,7 +337,7 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
    bool pass = true;
-    if(config.do_verification)
+    if((config.do_verification == 1) || (config.do_verification == 3))
    {
        // CPU verification
        auto ref_gemm    = ReferenceGemmInstance{};
@@ -368,7 +368,10 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
 #endif
        if(pass)
            std::cout << "Verification on CPU: PASS" << std::endl;
+    }
+    if((config.do_verification == 2) || (config.do_verification == 3))
+    {
        // GPU verification
        auto ref_gemm_gpu    = ReferenceGemmInstanceGPU{};
        auto ref_invoker_gpu = ref_gemm_gpu.MakeInvoker();

--- a/example/01_gemm/run_gemm_example_streamk_v2.inc
+++ b/example/01_gemm/run_gemm_example_streamk_v2.inc
@@ -241,7 +241,7 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
    }
    bool pass = true;
-    if(config.do_verification)
+    if((config.do_verification == 1) || (config.do_verification == 3))
    {
        std::cout << "Compute reference GEMM on CPU...  ";
        auto ref_gemm    = ReferenceGemmInstance{};

--- a/example/01_gemm/run_gemm_example_v2.inc
+++ b/example/01_gemm/run_gemm_example_v2.inc
@@ -228,7 +228,7 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
    }
    bool pass = true;
-    if(config.do_verification)
+    if((config.do_verification == 1) || (config.do_verification == 3))
    {
        std::cout << "Compute reference GEMM on CPU...  ";
        auto ref_gemm    = ReferenceGemmInstance{};

--- a/library/include/ck/library/reference_tensor_operation/gpu/reference_gemm.hpp
+++ b/library/include/ck/library/reference_tensor_operation/gpu/reference_gemm.hpp
@@ -76,7 +76,7 @@ __global__ void
            // apply b_element_op
            b_element_op(v_b, p_b_grid[element_idx_b]);
            // multiply and accumulate
-            v_acc += static_cast<AccDataType>(v_a) * static_cast<AccDataType>(v_b);
+            v_acc += type_convert<AccDataType>(v_a) * type_convert<AccDataType>(v_b);
        }
        // apply c_element_op
        c_element_op(v_c, v_acc);