gaoqiong / composable_kernel · Commits

Commit 1f45a250
Authored May 02, 2023 by Rostyslav Geyyer

Use a PassThrough instead of ConvertBF16RTN to calculate reference

Parent: 82414279
Showing 2 changed files with 27 additions and 20 deletions.
include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp (+9, -18)
library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp (+18, -2)
include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp

@@ -94,28 +94,19 @@ struct UnaryConvert
 struct ConvertBF16RTN
 {
     // convert to bf16 using round to nearest (rtn)
     template <typename Y, typename X>
-    __host__ __device__ void operator()(Y& y, const X& x) const;
-
-    // convert fp16->bf16 using rounding to nearest (rtn) via fp32
-    template <>
-    __host__ __device__ void operator()<bhalf_t, half_t>(bhalf_t& y, const half_t& x) const
+    __host__ __device__ void operator()(Y& y, const X& x) const
     {
-        y = bf16_convert_rtn<bhalf_t>(x);
-    }
-    // convert fp32->bf16 using rounding to nearest (rtn)
-    template <>
-    __host__ __device__ void operator()<bhalf_t, float>(bhalf_t& y, const float& x) const
-    {
-        y = bf16_convert_rtn<bhalf_t>(x);
-    }
-    // need to keep this specialization for fp16->fp16 ops
-    template <>
-    __host__ __device__ void operator()<half_t, half_t>(half_t& y, const half_t& x) const
-    {
-        y = x;
+        // check Y datatype
+        static_assert(is_same<Y, bhalf_t>::value, "Data type is not supported by this operation!");
+
+        // check X datatype
+        static_assert(is_same<X, float>::value || is_same<X, half_t>::value,
+                      "Data type is not supported by this operation!");
+
+        y = bf16_convert_rtn<Y>(x);
     }
 };
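The hunk above collapses three explicit specializations into one generic operator() guarded by static_asserts, so unsupported Y/X combinations now fail at compile time instead of depending on a matching specialization. For context on the rounding mode the operation's name refers to, here is a minimal, self-contained sketch of a round-to-nearest-even fp32 to bf16 conversion. It is an illustration only, not the library's bf16_convert_rtn: the helper name fp32_to_bf16_rtn is invented for this example and NaN handling is omitted.

```cpp
#include <cstdint>
#include <cstdio>
#include <cstring>

// Hypothetical helper (not composable_kernel's bf16_convert_rtn): convert
// fp32 to bf16 by rounding to nearest, ties to even. NaN handling omitted.
static std::uint16_t fp32_to_bf16_rtn(float f)
{
    std::uint32_t bits;
    std::memcpy(&bits, &f, sizeof(bits)); // reinterpret the fp32 bit pattern

    // bf16 keeps the upper 16 bits of fp32; add a bias derived from the 16
    // bits that will be discarded so the truncation rounds to nearest even.
    bits += 0x7FFFu + ((bits >> 16) & 1u);
    return static_cast<std::uint16_t>(bits >> 16);
}

int main()
{
    // 0.3f is 0x3E99999A in fp32: plain truncation keeps 0x3E99, while
    // round-to-nearest yields 0x3E9A because the discarded low half (0x999A)
    // is past the halfway point.
    std::printf("bf16(0.3f) = 0x%04X\n",
                static_cast<unsigned>(fp32_to_bf16_rtn(0.3f)));
    return 0;
}
```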
library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp

@@ -66,8 +66,24 @@ struct ReferenceGemm : public device::BaseOperator
                 ADataType v_a;
                 BDataType v_b;

-                arg.a_element_op_(v_a, arg.a_m_k_(m, k));
-                arg.b_element_op_(v_b, arg.b_k_n_(k, n));
+                // use PassThrough instead of ConvertBF16RTN for reference calculation
+                if constexpr(is_same_v<AElementwiseOperation,
+                                       ck::tensor_operation::element_wise::ConvertBF16RTN>)
+                {
+                    ck::tensor_operation::element_wise::PassThrough{}(v_a, arg.a_m_k_(m, k));
+                }
+                else
+                {
+                    arg.a_element_op_(v_a, arg.a_m_k_(m, k));
+                }
+                // same for B matrix
+                if constexpr(is_same_v<BElementwiseOperation,
+                                       ck::tensor_operation::element_wise::ConvertBF16RTN>)
+                {
+                    ck::tensor_operation::element_wise::PassThrough{}(v_b, arg.b_k_n_(k, n));
+                }
+                else
+                {
+                    arg.b_element_op_(v_b, arg.b_k_n_(k, n));
+                }

                 v_acc += ck::type_convert<AccDataType>(v_a) * ck::type_convert<AccDataType>(v_b);
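The reference GEMM change above makes the CPU reference apply PassThrough (an identity) to the A and B operands whenever the configured element-wise operation is ConvertBF16RTN, rather than rounding each operand to bf16 before the multiply-accumulate. The sketch below only demonstrates that these two choices yield numerically different reference accumulations; it uses an invented helper (round_through_bf16) and assumed fp32 inputs, and is not the library's reference path.

```cpp
#include <cstdint>
#include <cstdio>
#include <cstring>

// Illustrative helper (not CK code): round an fp32 value to bf16 precision
// (round to nearest even) and widen it back to fp32.
static float round_through_bf16(float f)
{
    std::uint32_t bits;
    std::memcpy(&bits, &f, sizeof(bits));
    bits += 0x7FFFu + ((bits >> 16) & 1u); // round to nearest even
    bits &= 0xFFFF0000u;                   // drop the 16 mantissa bits bf16 lacks
    float out;
    std::memcpy(&out, &bits, sizeof(out));
    return out;
}

int main()
{
    // A four-element dot product accumulated in fp32, once with the operands
    // passed through unchanged and once with each operand rounded to bf16 first.
    const float a[4] = {0.1f, 0.2f, 0.3f, 0.4f};
    const float b[4] = {1.1f, 1.2f, 1.3f, 1.4f};

    float acc_pass_through = 0.0f; // PassThrough-style reference
    float acc_bf16_rounded = 0.0f; // ConvertBF16RTN-style reference
    for(int k = 0; k < 4; ++k)
    {
        acc_pass_through += a[k] * b[k];
        acc_bf16_rounded += round_through_bf16(a[k]) * round_through_bf16(b[k]);
    }
    std::printf("pass-through: %.9f\nbf16-rounded: %.9f\n",
                acc_pass_through, acc_bf16_rounded);
    return 0;
}
```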