Fix saving of bfloat16x4_t

5f1d777b · ltqin · 27d764eb · 5f1d777b · 5f1d777b · 5f1d777b
Commit 5f1d777b authored Mar 10, 2023 by ltqin
3 changed files
--- a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
@@ -41,7 +41,8 @@ struct PassThrough
    }

    template <>
-    __host__ __device__ void operator()<bfloat16_t, bfloat16_t>(bfloat16_t& y, const bfloat16_t& x) const
+    __host__ __device__ void operator()<bfloat16_t, bfloat16_t>(bfloat16_t& y,
+                                                                const bfloat16_t& x) const
    {
        y = x;
    }

--- a/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp
+++ b/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp
@@ -544,7 +544,6 @@ struct MfmaSelector
 #endif
    }

-
    template <>
    static constexpr auto GetMfma<int8_t, 32, 32>()
    {
@@ -756,7 +755,8 @@ struct XdlopsGemm
    __device__ void Run(const FloatA& p_a_wave, const FloatB& p_b_wave, FloatC& p_c_thread) const
    {
        static_assert(is_same<base_type, double>::value || is_same<base_type, float>::value ||
-                          is_same<base_type, half_t>::value || is_same<base_type, bhalf_t>::value|| is_same<base_type, bfloat16_t>::value ||
+                          is_same<base_type, half_t>::value || is_same<base_type, bhalf_t>::value ||
+                          is_same<base_type, bfloat16_t>::value ||
                          is_same<base_type, int8_t>::value,
                      "base base_type must be double, float, half, bfloat16, and int8_t!");


--- a/include/ck/utility/amd_buffer_addressing.hpp
+++ b/include/ck/utility/amd_buffer_addressing.hpp
@@ -424,18 +424,21 @@ __device__ typename vector_type<T, N>::type amd_buffer_load_impl(int32x4_t src_w
    {
        if constexpr(N == 1)
        {
-            return llvm_amdgcn_raw_buffer_load_i16(
+            auto tmp = llvm_amdgcn_raw_buffer_load_i16(
                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
+            return bit_cast<bfloat16_t>(tmp);
        }
        else if constexpr(N == 2)
        {
-            return llvm_amdgcn_raw_buffer_load_i16x2(
+            auto tmp = llvm_amdgcn_raw_buffer_load_i16x2(
                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
+            return bit_cast<bfloat16x2_t>(tmp);
        }
        else if constexpr(N == 4)
        {
-            return llvm_amdgcn_raw_buffer_load_i16x4(
+            auto tmp = llvm_amdgcn_raw_buffer_load_i16x4(
                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
+            return bit_cast<bfloat16x4_t>(tmp);
        }
        else if constexpr(N == 8)
        {