Commit f2ac7832 authored by Chao Liu

make inner_product compatible with gfx900

parent 4e57b30a
@@ -97,10 +97,9 @@ struct ThreadwiseGemmDlops_km0m1_kn0n1_m0m1n0n1
                     CThreadDesc_TM0_TM1_TN0_TN1{}.CalculateOffset(
                         c_origin_idx + make_multi_index(tm0, tm1, tn0, tn1));

-                    amd_inner_product_dlop<FloatA, FloatB, FloatC>(
-                        a_buf[Number<a_offset>{}],
-                        b_buf[Number<b_offset>{}],
-                        c_buf(Number<c_offset>{}));
+                    inner_product<FloatA, FloatB, FloatC>(a_buf[Number<a_offset>{}],
+                                                          b_buf[Number<b_offset>{}],
+                                                          c_buf(Number<c_offset>{}));
                 });
             });
         });
@@ -214,7 +213,7 @@ struct ThreadwiseContractionDlops_A_TK0_TM0_TM1_TK1_B_TK0_TN0_TN1_TK1_C_TM0_TM1_
                     CThreadDesc_TM0_TM1_TN0_TN1{}.CalculateOffset(
                         c_origin_idx + make_multi_index(tm0, tm1, tn0, tn1));

-                    amd_inner_product_dlop<a_vector_t, b_vector_t, FloatC>(
+                    inner_product<a_vector_t, b_vector_t, FloatC>(
                         a_vec.template AsType<a_vector_t>()[I0],
                         b_vec.template AsType<b_vector_t>()[I0],
                         c_buf(Number<c_offset>{}));
amd_dlop.hpp (deleted by this commit):

#ifndef CK_AMD_DLOP_HPP
#define CK_AMD_DLOP_HPP

#include "data_type.hpp"

namespace ck {
template <typename TA, typename TB, typename TC>
__device__ void amd_inner_product_dlop(const TA& a, const TB& b, TC& c);
template <>
__device__ void
amd_inner_product_dlop<float, float, float>(const float& a, const float& b, float& c)
{
#if CK_USE_AMD_DLOP_INLINE_ASM
asm volatile("\n \
v_fmac_f32 %0, %1, %2 \n \
"
: "=v"(c)
: "v"(a), "v"(b), "0"(c));
#else
c += a * b;
#endif
}
template <>
__device__ void
amd_inner_product_dlop<float2_t, float2_t, float>(const float2_t& a, const float2_t& b, float& c)
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
amd_inner_product_dlop(vector_type<float, 2>{a}.AsType<float>()[I0],
vector_type<float, 2>{b}.AsType<float>()[I0],
c);
amd_inner_product_dlop(vector_type<float, 2>{a}.AsType<float>()[I1],
vector_type<float, 2>{b}.AsType<float>()[I1],
c);
}
template <>
__device__ void
amd_inner_product_dlop<float4_t, float4_t, float>(const float4_t& a, const float4_t& b, float& c)
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
amd_inner_product_dlop(vector_type<float, 4>{a}.AsType<float>()[I0],
vector_type<float, 4>{b}.AsType<float>()[I0],
c);
amd_inner_product_dlop(vector_type<float, 4>{a}.AsType<float>()[I1],
vector_type<float, 4>{b}.AsType<float>()[I1],
c);
amd_inner_product_dlop(vector_type<float, 4>{a}.AsType<float>()[I2],
vector_type<float, 4>{b}.AsType<float>()[I2],
c);
amd_inner_product_dlop(vector_type<float, 4>{a}.AsType<float>()[I3],
vector_type<float, 4>{b}.AsType<float>()[I3],
c);
}
#if CK_USE_AMD_DLOP
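// NOTE: this guard is all-or-nothing: with CK_USE_AMD_DLOP enabled, the
// specializations below emit dot-product instructions that pre-gfx906 targets
// such as gfx900 do not have; with it disabled, these specializations do not
// exist at all, so fp16/int8 call sites fail to compile.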
template <>
__device__ void
amd_inner_product_dlop<half2_t, half2_t, float>(const half2_t& a, const half2_t& b, float& c)
{
#if CK_USE_AMD_DLOP_INLINE_ASM
asm volatile("\n \
v_dot2_f32_f16 %0, %1, %2, %0\n \
"
: "=v"(c)
: "v"(a), "v"(b), "0"(c));
#else
c = __builtin_amdgcn_fdot2(a, b, c, false); // fdot2 is the packed-f16 dot; sdot2 is the i16 variant
#endif
}
template <>
__device__ void
amd_inner_product_dlop<half4_t, half4_t, float>(const half4_t& a, const half4_t& b, float& c)
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
amd_inner_product_dlop(vector_type<half_t, 4>{a}.AsType<half2_t>()[I0],
vector_type<half_t, 4>{b}.AsType<half2_t>()[I0],
c);
amd_inner_product_dlop(vector_type<half_t, 4>{a}.AsType<half2_t>()[I1],
vector_type<half_t, 4>{b}.AsType<half2_t>()[I1],
c);
}
template <>
__device__ void
amd_inner_product_dlop<half8_t, half8_t, float>(const half8_t& a, const half8_t& b, float& c)
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
amd_inner_product_dlop(vector_type<half_t, 8>{a}.AsType<half2_t>()[I0],
vector_type<half_t, 8>{b}.AsType<half2_t>()[I0],
c);
amd_inner_product_dlop(vector_type<half_t, 8>{a}.AsType<half2_t>()[I1],
vector_type<half_t, 8>{b}.AsType<half2_t>()[I1],
c);
amd_inner_product_dlop(vector_type<half_t, 8>{a}.AsType<half2_t>()[I2],
vector_type<half_t, 8>{b}.AsType<half2_t>()[I2],
c);
amd_inner_product_dlop(vector_type<half_t, 8>{a}.AsType<half2_t>()[I3],
vector_type<half_t, 8>{b}.AsType<half2_t>()[I3],
c);
}
template <>
__device__ void amd_inner_product_dlop<int8x4_t, int8x4_t, int32_t>(const int8x4_t& a,
const int8x4_t& b,
int32_t& c)
{
#if CK_USE_AMD_DLOP_INLINE_ASM
asm volatile("\n \
v_dot4_i32_i8 %0, %1, %2, %0\n \
"
: "=v"(c)
: "v"(as_type<int32_t>(a)), "v"(as_type<int32_t>(b)), "0"(c));
#else
c = __builtin_amdgcn_sdot4(as_type<int32_t>(a), as_type<int32_t>(b), c, false);
#endif
}
template <>
__device__ void amd_inner_product_dlop<int8x8_t, int8x8_t, int32_t>(const int8x8_t& a,
const int8x8_t& b,
int32_t& c)
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
amd_inner_product_dlop(vector_type<int8_t, 8>{a}.AsType<int8x4_t>()[I0],
vector_type<int8_t, 8>{b}.AsType<int8x4_t>()[I0],
c);
amd_inner_product_dlop(vector_type<int8_t, 8>{a}.AsType<int8x4_t>()[I1],
vector_type<int8_t, 8>{b}.AsType<int8x4_t>()[I1],
c);
}
template <>
__device__ void amd_inner_product_dlop<int8x16_t, int8x16_t, int32_t>(const int8x16_t& a,
const int8x16_t& b,
int32_t& c)
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
amd_inner_product_dlop(vector_type<int8_t, 16>{a}.AsType<int8x4_t>()[I0],
vector_type<int8_t, 16>{b}.AsType<int8x4_t>()[I0],
c);
amd_inner_product_dlop(vector_type<int8_t, 16>{a}.AsType<int8x4_t>()[I1],
vector_type<int8_t, 16>{b}.AsType<int8x4_t>()[I1],
c);
amd_inner_product_dlop(vector_type<int8_t, 16>{a}.AsType<int8x4_t>()[I2],
vector_type<int8_t, 16>{b}.AsType<int8x4_t>()[I2],
c);
amd_inner_product_dlop(vector_type<int8_t, 16>{a}.AsType<int8x4_t>()[I3],
vector_type<int8_t, 16>{b}.AsType<int8x4_t>()[I3],
c);
}
#endif // CK_USE_AMD_DLOP
} // namespace ck
#endif
@@ -4,6 +4,8 @@
 #include "data_type.hpp"
 #include "c_style_pointer_cast.hpp"

+// TODO: deprecate all amd_assembly_outer_product_xxx
+
 namespace ck {

 // c0 += inner_product(a, b0)
@@ -31,15 +31,13 @@
 #include "static_buffer.hpp"
 #include "dynamic_buffer.hpp"
+#include "inner_product.hpp"

 // TODO: remove this
 #if CK_USE_AMD_INLINE_ASM
 #include "amd_inline_asm.hpp"
 #endif

-#if CK_USE_AMD_DLOP
-#include "amd_dlop.hpp"
-#endif

 #if CK_USE_AMD_XDLOPS
 #include "amd_xdlops.hpp"
 #endif
@@ -14,12 +14,7 @@
 // should enable one and only one GPU target
 #if !(defined(CK_AMD_GPU_GFX803) || defined(CK_AMD_GPU_GFX900) || defined(CK_AMD_GPU_GFX906) || \
       defined(CK_AMD_GPU_GFX908) || defined(CK_AMD_GPU_GFX90A) || defined(CK_AMD_GPU_GFX1030))
-#error Need to define a single GPU target
+#error Need to define (only) one GPU target
 #endif

-// HIP version
-#ifndef CK_HIP_VERSION_FLAT
-#define CK_HIP_VERSION_FLAT 0
-#endif
-
 // launch bounds
@@ -38,6 +33,16 @@
 #define CK_BUFFER_RESOURCE_3RD_DWORD 0x31014000
 #endif

+// FMA instruction
+#if defined(CK_AMD_GPU_GFX803) || defined(CK_AMD_GPU_GFX900)
+#define CK_USE_AMD_V_MAC_F32
+#elif defined(CK_AMD_GPU_GFX906) || defined(CK_AMD_GPU_GFX908) || defined(CK_AMD_GPU_GFX90A) || \
+    defined(CK_AMD_GPU_GFX1030)
+#define CK_USE_AMD_V_FMAC_F32
+#define CK_USE_AMD_V_DOT2_F32_F16
+#define CK_USE_AMD_V_DOT4_I32_I8
+#endif
+
 // multi index
 #define CK_USE_DYNAMICALLY_INDEXED_MULTI_INDEX 0
@@ -46,13 +51,9 @@
 #define CK_USE_AMD_INLINE_ASM 1
 #endif

-// AMD DLOPS
-#ifndef CK_USE_AMD_DLOP
-#define CK_USE_AMD_DLOP 1
-#endif
-
-#ifndef CK_USE_AMD_DLOP_INLINE_ASM
-#define CK_USE_AMD_DLOP_INLINE_ASM 1
+// AMD inner product (DLOP)
+#ifndef CK_USE_AMD_INNER_PRODUCT_INLINE_ASM
+#define CK_USE_AMD_INNER_PRODUCT_INLINE_ASM 1
 #endif

 // AMD buffer addressing
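For context, a sketch of how the new per-target instruction macros are meant to compose (illustrative only, not part of the commit; the checks below are assumptions derived from the macro block above):

// Hypothetical compile-time checks (not in the commit): pre-gfx906 targets
// get plain multiply-accumulate only, while gfx906 and newer also get the
// packed dot-product paths that inner_product.hpp keys off.
#if defined(CK_AMD_GPU_GFX900) && defined(CK_USE_AMD_V_DOT2_F32_F16)
#error gfx900 must not select v_dot2_f32_f16
#endif

#if defined(CK_AMD_GPU_GFX906) && !defined(CK_USE_AMD_V_DOT4_I32_I8)
#error gfx906 should select v_dot4_i32_i8
#endif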
inner_product.hpp (new file in this commit):

#ifndef CK_INNER_PRODUCT_HPP
#define CK_INNER_PRODUCT_HPP

#include "data_type.hpp"

namespace ck {
template <typename TA, typename TB, typename TC>
__device__ void inner_product(const TA& a, const TB& b, TC& c);
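// Each specialization below picks an implementation in order of preference:
// hand-written inline asm (when CK_USE_AMD_INNER_PRODUCT_INLINE_ASM is set and
// the target defines the matching CK_USE_AMD_V_* instruction macro), then a
// compiler builtin, then a plain scalar multiply-accumulate that works on any
// target, including gfx900.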
template <>
__device__ void inner_product<float, float, float>(const float& a, const float& b, float& c)
{
#if CK_USE_AMD_INNER_PRODUCT_INLINE_ASM && defined(CK_USE_AMD_V_MAC_F32)
asm volatile("\n \
v_mac_f32 %0, %1, %2 \n \
"
: "=v"(c)
: "v"(a), "v"(b), "0"(c));
#elif CK_USE_AMD_INNER_PRODUCT_INLINE_ASM && defined(CK_USE_AMD_V_FMAC_F32)
asm volatile("\n \
v_fmac_f32 %0, %1, %2 \n \
"
: "=v"(c)
: "v"(a), "v"(b), "0"(c));
#else
c += a * b;
#endif
}
template <>
__device__ void
inner_product<float2_t, float2_t, float>(const float2_t& a, const float2_t& b, float& c)
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
inner_product(vector_type<float, 2>{a}.AsType<float>()[I0],
vector_type<float, 2>{b}.AsType<float>()[I0],
c);
inner_product(vector_type<float, 2>{a}.AsType<float>()[I1],
vector_type<float, 2>{b}.AsType<float>()[I1],
c);
}
template <>
__device__ void
inner_product<float4_t, float4_t, float>(const float4_t& a, const float4_t& b, float& c)
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
inner_product(vector_type<float, 4>{a}.AsType<float>()[I0],
vector_type<float, 4>{b}.AsType<float>()[I0],
c);
inner_product(vector_type<float, 4>{a}.AsType<float>()[I1],
vector_type<float, 4>{b}.AsType<float>()[I1],
c);
inner_product(vector_type<float, 4>{a}.AsType<float>()[I2],
vector_type<float, 4>{b}.AsType<float>()[I2],
c);
inner_product(vector_type<float, 4>{a}.AsType<float>()[I3],
vector_type<float, 4>{b}.AsType<float>()[I3],
c);
}
template <>
__device__ void inner_product<half2_t, half2_t, float>(const half2_t& a, const half2_t& b, float& c)
{
#if defined(CK_USE_AMD_V_DOT2_F32_F16)
#if CK_USE_AMD_INNER_PRODUCT_INLINE_ASM
asm volatile("\n \
v_dot2_f32_f16 %0, %1, %2, %0\n \
"
: "=v"(c)
: "v"(a), "v"(b), "0"(c));
#else
c = __builtin_amdgcn_fdot2(a, b, c, false); // fdot2 is the packed-f16 dot; sdot2 is the i16 variant
#endif
#else
// scalar fallback for targets without v_dot2_f32_f16 (e.g. gfx900):
// widen each half to float and multiply-accumulate
const auto convert = type_convert<float>{};
const vector_type<half_t, 2> a_vector{a};
const vector_type<half_t, 2> b_vector{b};
static_for<0, 2, 1>{}([&](auto i) {
c += convert(a_vector.AsType<half_t>()[i]) * convert(b_vector.AsType<half_t>()[i]);
});
#endif
}
template <>
__device__ void inner_product<half4_t, half4_t, float>(const half4_t& a, const half4_t& b, float& c)
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
inner_product(vector_type<half_t, 4>{a}.AsType<half2_t>()[I0],
vector_type<half_t, 4>{b}.AsType<half2_t>()[I0],
c);
inner_product(vector_type<half_t, 4>{a}.AsType<half2_t>()[I1],
vector_type<half_t, 4>{b}.AsType<half2_t>()[I1],
c);
}
template <>
__device__ void inner_product<half8_t, half8_t, float>(const half8_t& a, const half8_t& b, float& c)
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
inner_product(vector_type<half_t, 8>{a}.AsType<half2_t>()[I0],
vector_type<half_t, 8>{b}.AsType<half2_t>()[I0],
c);
inner_product(vector_type<half_t, 8>{a}.AsType<half2_t>()[I1],
vector_type<half_t, 8>{b}.AsType<half2_t>()[I1],
c);
inner_product(vector_type<half_t, 8>{a}.AsType<half2_t>()[I2],
vector_type<half_t, 8>{b}.AsType<half2_t>()[I2],
c);
inner_product(vector_type<half_t, 8>{a}.AsType<half2_t>()[I3],
vector_type<half_t, 8>{b}.AsType<half2_t>()[I3],
c);
}
template <>
__device__ void
inner_product<int8x4_t, int8x4_t, int32_t>(const int8x4_t& a, const int8x4_t& b, int32_t& c)
{
#if defined(CK_USE_AMD_V_DOT4_I32_I8)
#if CK_USE_AMD_INNER_PRODUCT_INLINE_ASM
asm volatile("\n \
v_dot4_i32_i8 %0, %1, %2, %0\n \
"
: "=v"(c)
: "v"(as_type<int32_t>(a)), "v"(as_type<int32_t>(b)), "0"(c));
#else
c = __builtin_amdgcn_sdot4(as_type<int32_t>(a), as_type<int32_t>(b), c, false);
#endif
#else
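// scalar fallback for targets without v_dot4_i32_i8: widen each int8_t to
// int32_t and multiply-accumulate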
const auto convert = type_convert<int32_t>{};
const vector_type<int8_t, 4> a_vector{a};
const vector_type<int8_t, 4> b_vector{b};
static_for<0, 4, 1>{}([&](auto i) {
c += convert(a_vector.AsType<int8_t>()[i]) * convert(b_vector.AsType<int8_t>()[i]);
});
#endif
}
template <>
__device__ void
inner_product<int8x8_t, int8x8_t, int32_t>(const int8x8_t& a, const int8x8_t& b, int32_t& c)
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
inner_product(vector_type<int8_t, 8>{a}.AsType<int8x4_t>()[I0],
vector_type<int8_t, 8>{b}.AsType<int8x4_t>()[I0],
c);
inner_product(vector_type<int8_t, 8>{a}.AsType<int8x4_t>()[I1],
vector_type<int8_t, 8>{b}.AsType<int8x4_t>()[I1],
c);
}
template <>
__device__ void
inner_product<int8x16_t, int8x16_t, int32_t>(const int8x16_t& a, const int8x16_t& b, int32_t& c)
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
inner_product(vector_type<int8_t, 16>{a}.AsType<int8x4_t>()[I0],
vector_type<int8_t, 16>{b}.AsType<int8x4_t>()[I0],
c);
inner_product(vector_type<int8_t, 16>{a}.AsType<int8x4_t>()[I1],
vector_type<int8_t, 16>{b}.AsType<int8x4_t>()[I1],
c);
inner_product(vector_type<int8_t, 16>{a}.AsType<int8x4_t>()[I2],
vector_type<int8_t, 16>{b}.AsType<int8x4_t>()[I2],
c);
inner_product(vector_type<int8_t, 16>{a}.AsType<int8x4_t>()[I3],
vector_type<int8_t, 16>{b}.AsType<int8x4_t>()[I3],
c);
}
} // namespace ck
#endif
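
A minimal usage sketch of the new entry point (assumed, not part of the commit; function and variable names are invented for illustration):

// Hypothetical device-side callers of ck::inner_product. On gfx906 and newer
// these lower to v_dot2_f32_f16 / v_dot4_i32_i8; on gfx803/gfx900 they take
// the scalar fallback paths above, so the same source builds on all targets.
#include "inner_product.hpp"

__device__ float dot8_f16(const ck::half8_t& a, const ck::half8_t& b)
{
    float c = 0.f;
    ck::inner_product<ck::half8_t, ck::half8_t, float>(a, b, c); // c += a.b
    return c;
}

__device__ int32_t dot16_i8(const ck::int8x16_t& a, const ck::int8x16_t& b)
{
    int32_t c = 0;
    ck::inner_product<ck::int8x16_t, ck::int8x16_t, int32_t>(a, b, c);
    return c;
}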
@@ -48,7 +48,7 @@ void device_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw(
     const auto wei_desc_k_c_y_x   = make_naive_tensor_descriptor_packed(wei_k_c_y_x_lengths);
     const auto out_desc_n_k_ho_wo = make_naive_tensor_descriptor_packed(out_n_k_ho_wo_lengths);

-#if 0
+#if 1
     // [8, 1, 128, 1] * [8, 4, 32, 1] = [1, 128, 4, 32] for fp32
     // cdata = 64, BlockSize = 256
     constexpr index_t BlockSize = 256;
@@ -20,9 +20,9 @@
 #include "device_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk.hpp"

 #define USE_MODE 1
-#define USE_CONV_FWD_V4R4_NCHW 0
-#define USE_CONV_FWD_V4R4R2_NHWC 1
-#define USE_CONV_FWD_V6R1_NCHW 1
+#define USE_CONV_FWD_V4R4_NCHW 1
+#define USE_CONV_FWD_V4R4R2_NHWC 0
+#define USE_CONV_FWD_V6R1_NCHW 0
 #define USE_CONV_FWD_V5R1_NCHW 0
 #define USE_CONV_FWD_V4R4R2_XDL_NCHW 0
 #define USE_CONV_FWD_V4R4R4_XDL_NHWC 0
@@ -126,7 +126,7 @@ int main(int argc, char* argv[])
     const index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
 #endif

-#if 0
+#if 1
     using in_data_t  = float;
     using acc_data_t = float;
     using out_data_t = float;