fixed threadwise_copy: the element-wise op now writes a DstData value directly, with no trailing type_convert

4cf9a393 · Jing Zhang · 405a15ec · 4cf9a393 · 4cf9a393 · 4cf9a393
Commit 4cf9a393 authored Aug 28, 2023 by Jing Zhang
4 changed files
--- a/example/01_gemm/gemm_xdl_bf16_rtn.cpp
+++ b/example/01_gemm/gemm_xdl_bf16_rtn.cpp
@@ -16,26 +16,9 @@ using ALayout = Row;
 using BLayout = Col;
 using CLayout = Row;

-struct ConvertBF16RTN_
-{
-    // convert to bf16 using round to nearest (rtn)
-    template <typename Y, typename X>
-    __host__ __device__ void operator()(Y& y, const X& x) const
-    {
-        y = x;
-    }
-
-    template <>
-    __host__ __device__ void operator()<ck::bhalf_t, float>(ck::bhalf_t& y, const float& x) const
-    {
-        y = ck::bf16_convert_rtn<ck::bhalf_t, float>(x);
-    }
-};
-
 using AElementOp = PassThrough;
 using BElementOp = PassThrough;
-using CElementOp = ConvertBF16RTN_;
-
+using CElementOp = ck::tensor_operation::element_wise::ConvertBF16RTN;

 static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;


--- a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
@@ -39,6 +39,12 @@ struct PassThrough
        y = x;
    }

+    template <>
+    __host__ __device__ void operator()<half_t, float>(half_t& y, const float& x) const
+    {
+        y = type_convert<half_t>(x);
+    }
+
    template <>
    __host__ __device__ void operator()<bhalf_t, bhalf_t>(bhalf_t& y, const bhalf_t& x) const
    {

--- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r1.hpp
+++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r1.hpp
@@ -104,13 +104,13 @@ struct ThreadwiseTensorSliceTransfer_v6r1

            // apply pointwise operation
            static_for<0, ScalarPerVector, 1>{}([&](auto i) {
-                SrcData v;
+                DstData v;

                // apply element-wise operation
                element_op_(v, src_vector_container.template AsType<SrcData>()[i]);

                // apply type convert
-                dst_vector_container.template AsType<DstData>()(i) = type_convert<DstData>(v);
+                dst_vector_container.template AsType<DstData>()(i) = v;
            });

            const bool is_dst_valid =

--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp
@@ -92,11 +92,11 @@ struct ReferenceGemm : public device::BaseOperator
                        ck::type_convert<AccDataType>(v_a) * ck::type_convert<AccDataType>(v_b);
                }

-                AccDataType v_c;
+                CDataType v_c;

                arg.c_element_op_(v_c, v_acc);

-                arg.c_m_n_(m, n) = ck::type_convert<CDataType>(v_c);
+                arg.c_m_n_(m, n) = v_c;
            };

            make_ParallelTensorFunctor(