clean

c390260d · Chao Liu · 428f6fd2 · c390260d · c390260d · c390260d
Commit c390260d authored Feb 16, 2023 by Chao Liu
3 changed files
--- a/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp
@@ -304,6 +304,20 @@ struct AddFastGelu

        ck::tensor_operation::element_wise::FastGelu{}.template operator()<half_t, half_t>(e, x);
    }
+
+    // Mixed-precision overload: e = half(FastGelu(c + d)), with FastGelu invoked as <float, float>.
+    template <>
+    __host__ __device__ constexpr void
+    operator()<half_t, float, half_t>(half_t& e, const float& c, const half_t& d) const
+    {
+        // The sum is kept in a float so FastGelu sees the un-narrowed value.
+        const float x0_f = c + d;
+        // Initialised: a constexpr function may not declare an uninitialised local before C++20.
+        float x1_f = 0.0f;
+        ck::tensor_operation::element_wise::FastGelu{}.template operator()<float, float>(x1_f,
+                                                                                         x0_f);
+        // Narrow to half_t once, on the final result only.
+        e = type_convert<half_t>(x1_f);
+    }
 };

 } // namespace element_wise

--- a/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp
@@ -221,6 +221,20 @@ struct AddAddFastGelu

        ck::tensor_operation::element_wise::FastGelu{}.template operator()<half_t, half_t>(e, x);
    }
+
+    // Mixed-precision overload: e = half(FastGelu(c + d0 + d1)), FastGelu invoked as <float, float>.
+    template <>
+    __host__ __device__ constexpr void operator()<half_t, float, half_t, half_t>(
+        half_t& e, const float& c, const half_t& d0, const half_t& d1) const
+    {
+        // The sum is kept in a float so FastGelu sees the un-narrowed value.
+        const float x0_f = c + d0 + d1;
+        // Initialised: a constexpr function may not declare an uninitialised local before C++20.
+        float x1_f = 0.0f;
+        ck::tensor_operation::element_wise::FastGelu{}.template operator()<float, float>(x1_f,
+                                                                                         x0_f);
+        // Narrow to half_t once, on the final result only.
+        e = type_convert<half_t>(x1_f);
+    }
 };

 struct Normalize

--- a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
@@ -253,7 +253,6 @@ struct FastGelu
        y = x * cdf;
    }

-    // device code, use lower precision "__expf" and "rcp"
    template <>
    __device__ void operator()<half_t, half_t>(half_t& y, const half_t& x) const
    {