clean up

a7361926 · Chao Liu · 81b26528 · a7361926
Commit a7361926 authored Nov 23, 2021 by Chao Liu
Hide whitespace changes
Inline Side-by-side

Showing with 20 additions and 9 deletions

example/2_gemm_xdl_bias_add/gemm_xdl_bias_add.cpp example/2_gemm_xdl_bias_add/gemm_xdl_bias_add.cpp +20 -9

No files found.
--- a/example/2_gemm_xdl_bias_add/gemm_xdl_bias_add.cpp
+++ b/example/2_gemm_xdl_bias_add/gemm_xdl_bias_add.cpp
@@ -36,28 +36,37 @@ struct PassThrough
 // v2 is from bias vector
 struct BiasAdd
 {
-#if 1
+#if 0
    // correct result
    // no scratch memory, good VGPR allocation (59)
    // good perf (101Tflops)
    template <typename T1, typename T2>
    __host__ __device__ constexpr float operator()(float v0, T1 v1, T2 v2) const
    {
+        constexpr float alpha = 0.1;
+        constexpr float beta  = 0.2;
+        constexpr float gamma = 0.3;
        // compiler seems very sensitive to the order of these calculations:
        // compiler is very eager to read AccVgpr (v0) out prematurely, resulting in register
        // over-allocation. Therefore, move v0 calculation to the very end
-        float a = T1(0.2) * v1 + T2(0.3) * v2;
+        float a = T1(beta) * v1 + T2(gamma) * v2;
-        float b = a + float(0.1) * v0;
+        float b = a + float(alpha) * v0;
        return b;
    }
-#elif 0
+#elif 1
-    // correct result
+    float alpha = 0.1;
-    // some scratch memory (68), large VGPR usage (126)
+    float beta  = 0.2;
-    // very little perf drop (101Tflops)
+    float gamma = 0.3;
-    __host__ __device__ constexpr auto operator()(float v0, ck::half_t v1, ck::half_t v2) const
+    // wrong result
+    // lots of scratch memory
+    // huge perf drop
+    template <typename T1, typename T2>
+    __host__ __device__ constexpr float operator()(float v0, T1 v1, T2 v2) const
    {
-        return float(0.1) * v0 + ck::half_t(0.2) * v1 + ck::half_t(0.3) * v2;
+        return alpha * v0 + beta * v1 + gamma * v2;
    }
 #elif 0
    // correct result
@@ -361,5 +370,7 @@ int main(int argc, char* argv[])
                    PassThrough{},
                    PassThrough{},
                    c_element_op);
+        check_error(c_m_n_host_result, c_m_n_device_result);
    }
 }