Commit f5654649 authored by Chao Liu
Browse files

added back amd_assembly_outer_product_1x2 and amd_assembly_outer_product_1x4

parent 9d5d6afa
...@@ -212,6 +212,8 @@ struct ThreadwiseGemm_km_kn_mn_v1r1 ...@@ -212,6 +212,8 @@ struct ThreadwiseGemm_km_kn_mn_v1r1
constexpr auto I0 = Number<0>{}; constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{}; constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto M = CDesc{}.GetLength(I0); constexpr auto M = CDesc{}.GetLength(I0);
constexpr auto N = CDesc{}.GetLength(I1); constexpr auto N = CDesc{}.GetLength(I1);
...@@ -223,24 +225,65 @@ struct ThreadwiseGemm_km_kn_mn_v1r1 ...@@ -223,24 +225,65 @@ struct ThreadwiseGemm_km_kn_mn_v1r1
static_for<0, K, 1>{}([&](auto k) { static_for<0, K, 1>{}([&](auto k) {
static_for<0, M, 1>{}([&](auto m) { static_for<0, M, 1>{}([&](auto m) {
static_for<0, N, 1>{}([&](auto n) { constexpr index_t a_offset = ADesc{}.CalculateOffset(make_tuple(k, m));
if constexpr(N == 2)
{
constexpr index_t b_offset_0 = BDesc{}.CalculateOffset(make_tuple(k, I0));
constexpr index_t b_offset_1 = BDesc{}.CalculateOffset(make_tuple(k, I1));
constexpr index_t a_offset = constexpr index_t c_offset_0 = CDesc{}.CalculateOffset(make_tuple(m, I0));
ADesc{}.CalculateOffset(a_origin_idx + make_tuple(k, m)); constexpr index_t c_offset_1 = CDesc{}.CalculateOffset(make_tuple(m, I1));
constexpr index_t b_offset =
BDesc{}.CalculateOffset(b_origin_idx + make_tuple(k, n)); amd_assembly_outer_product_1x2(a_buf[Number<a_offset>{}],
constexpr index_t c_offset = b_buf[Number<b_offset_0>{}],
CDesc{}.CalculateOffset(c_origin_idx + make_tuple(m, n)); b_buf[Number<b_offset_1>{}],
c_buf(Number<c_offset_0>{}),
c_buf(Number<c_offset_1>{}));
}
else if constexpr(N == 4)
{
constexpr index_t b_offset_0 = BDesc{}.CalculateOffset(make_tuple(k, I0));
constexpr index_t b_offset_1 = BDesc{}.CalculateOffset(make_tuple(k, I1));
constexpr index_t b_offset_2 = BDesc{}.CalculateOffset(make_tuple(k, I2));
constexpr index_t b_offset_3 = BDesc{}.CalculateOffset(make_tuple(k, I3));
constexpr index_t c_offset_0 = CDesc{}.CalculateOffset(make_tuple(m, I0));
constexpr index_t c_offset_1 = CDesc{}.CalculateOffset(make_tuple(m, I1));
constexpr index_t c_offset_2 = CDesc{}.CalculateOffset(make_tuple(m, I2));
constexpr index_t c_offset_3 = CDesc{}.CalculateOffset(make_tuple(m, I3));
amd_assembly_outer_product_1x4(a_buf[Number<a_offset>{}],
b_buf[Number<b_offset_0>{}],
b_buf[Number<b_offset_1>{}],
b_buf[Number<b_offset_2>{}],
b_buf[Number<b_offset_3>{}],
c_buf(Number<c_offset_0>{}),
c_buf(Number<c_offset_1>{}),
c_buf(Number<c_offset_2>{}),
c_buf(Number<c_offset_3>{}));
}
else
{
static_for<0, N, 1>{}([&](auto n) {
constexpr index_t a_offset =
ADesc{}.CalculateOffset(a_origin_idx + make_tuple(k, m));
constexpr index_t b_offset =
BDesc{}.CalculateOffset(b_origin_idx + make_tuple(k, n));
constexpr index_t c_offset =
CDesc{}.CalculateOffset(c_origin_idx + make_tuple(m, n));
#if CK_THREADWISE_GEMM_USE_AMD_INLINE_ASM #if CK_THREADWISE_GEMM_USE_AMD_INLINE_ASM
amd_assembly_inner_product(a_buf[Number<a_offset>{}], amd_assembly_inner_product(a_buf[Number<a_offset>{}],
b_buf[Number<b_offset>{}], b_buf[Number<b_offset>{}],
c_buf(Number<c_offset>{})); c_buf(Number<c_offset>{}));
#else #else
c_buf(Number<c_offset>{}) += inner_product_with_conversion<FloatC>{}( c_buf(Number<c_offset>{}) += inner_product_with_conversion<FloatC>{}(
a_buf[Number<a_offset>{}], b_buf[Number<b_offset>{}]); a_buf[Number<a_offset>{}], b_buf[Number<b_offset>{}]);
#endif #endif
}); });
}
}); });
}); });
} }
......
...@@ -61,85 +61,74 @@ struct ThreadwiseGemm_km_kn_mn_v3 ...@@ -61,85 +61,74 @@ struct ThreadwiseGemm_km_kn_mn_v3
static_for<0, E, 1>{}([&](auto e) { static_for<0, E, 1>{}([&](auto e) {
static_for<0, K, 1>{}([&](auto k) { static_for<0, K, 1>{}([&](auto k) {
#if 0 constexpr index_t a_offset = ADesc{}.CalculateOffset(make_tuple(e, k));
constexpr auto a_offset = ADesc{}.CalculateOffset(make_tuple(e, k));
if constexpr(H == 2 && W == 2) if constexpr(H == 2 && W == 2)
{ {
constexpr auto b_offset_0 = BDesc{}.CalculateOffset(make_tuple(e, 0, 0, 0)); constexpr index_t b_offset_0 = BDesc{}.CalculateOffset(make_tuple(e, 0, 0, 0));
constexpr auto b_offset_1 = BDesc{}.CalculateOffset(make_tuple(e, 0, 0, 1)); constexpr index_t b_offset_1 = BDesc{}.CalculateOffset(make_tuple(e, 0, 0, 1));
constexpr auto b_offset_2 = BDesc{}.CalculateOffset(make_tuple(e, 0, 1, 0)); constexpr index_t b_offset_2 = BDesc{}.CalculateOffset(make_tuple(e, 0, 1, 0));
constexpr auto b_offset_3 = BDesc{}.CalculateOffset(make_tuple(e, 0, 1, 1)); constexpr index_t b_offset_3 = BDesc{}.CalculateOffset(make_tuple(e, 0, 1, 1));
constexpr auto c_offset_0 = CDesc{}.CalculateOffset(make_tuple(k, 0, 0, 0)); constexpr index_t c_offset_0 = CDesc{}.CalculateOffset(make_tuple(k, 0, 0, 0));
constexpr auto c_offset_1 = CDesc{}.CalculateOffset(make_tuple(k, 0, 0, 1)); constexpr index_t c_offset_1 = CDesc{}.CalculateOffset(make_tuple(k, 0, 0, 1));
constexpr auto c_offset_2 = CDesc{}.CalculateOffset(make_tuple(k, 0, 1, 0)); constexpr index_t c_offset_2 = CDesc{}.CalculateOffset(make_tuple(k, 0, 1, 0));
constexpr auto c_offset_3 = CDesc{}.CalculateOffset(make_tuple(k, 0, 1, 1)); constexpr index_t c_offset_3 = CDesc{}.CalculateOffset(make_tuple(k, 0, 1, 1));
amd_assembly_outer_product_1x4(p_a[a_offset], amd_assembly_outer_product_1x4(p_a[Number<a_offset>{}],
p_b[b_offset_0], p_b[Number<b_offset_0>{}],
p_b[b_offset_1], p_b[Number<b_offset_1>{}],
p_b[b_offset_2], p_b[Number<b_offset_2>{}],
p_b[b_offset_3], p_b[Number<b_offset_3>{}],
p_c[c_offset_0], p_c[Number<c_offset_0>{}],
p_c[c_offset_1], p_c[Number<c_offset_1>{}],
p_c[c_offset_2], p_c[Number<c_offset_2>{}],
p_c[c_offset_3]); p_c[Number<c_offset_3>{}]);
} }
else if constexpr(H == 4 && W == 1) else if constexpr(H == 4 && W == 1)
{ {
constexpr auto b_offset_0 = BDesc{}.CalculateOffset(make_tuple(e, 0, 0, 0)); constexpr index_t b_offset_0 = BDesc{}.CalculateOffset(make_tuple(e, 0, 0, 0));
constexpr auto b_offset_1 = BDesc{}.CalculateOffset(make_tuple(e, 0, 1, 0)); constexpr index_t b_offset_1 = BDesc{}.CalculateOffset(make_tuple(e, 0, 1, 0));
constexpr auto b_offset_2 = BDesc{}.CalculateOffset(make_tuple(e, 0, 2, 0)); constexpr index_t b_offset_2 = BDesc{}.CalculateOffset(make_tuple(e, 0, 2, 0));
constexpr auto b_offset_3 = BDesc{}.CalculateOffset(make_tuple(e, 0, 3, 0)); constexpr index_t b_offset_3 = BDesc{}.CalculateOffset(make_tuple(e, 0, 3, 0));
constexpr auto c_offset_0 = CDesc{}.CalculateOffset(make_tuple(k, 0, 0, 0)); constexpr index_t c_offset_0 = CDesc{}.CalculateOffset(make_tuple(k, 0, 0, 0));
constexpr auto c_offset_1 = CDesc{}.CalculateOffset(make_tuple(k, 0, 1, 0)); constexpr index_t c_offset_1 = CDesc{}.CalculateOffset(make_tuple(k, 0, 1, 0));
constexpr auto c_offset_2 = CDesc{}.CalculateOffset(make_tuple(k, 0, 2, 0)); constexpr index_t c_offset_2 = CDesc{}.CalculateOffset(make_tuple(k, 0, 2, 0));
constexpr auto c_offset_3 = CDesc{}.CalculateOffset(make_tuple(k, 0, 3, 0)); constexpr index_t c_offset_3 = CDesc{}.CalculateOffset(make_tuple(k, 0, 3, 0));
amd_assembly_outer_product_1x4(p_a[a_offset], amd_assembly_outer_product_1x4(p_a[Number<a_offset>{}],
p_b[b_offset_0], p_b[Number<b_offset_0>{}],
p_b[b_offset_1], p_b[Number<b_offset_1>{}],
p_b[b_offset_2], p_b[Number<b_offset_2>{}],
p_b[b_offset_3], p_b[Number<b_offset_3>{}],
p_c[c_offset_0], p_c[Number<c_offset_0>{}],
p_c[c_offset_1], p_c[Number<c_offset_1>{}],
p_c[c_offset_2], p_c[Number<c_offset_2>{}],
p_c[c_offset_3]); p_c[Number<c_offset_3>{}]);
} }
else else
{ {
static_for<0, H, 1>{}([&](auto h) { static_for<0, H, 1>{}([&](auto h) {
static_for<0, W, 1>{}([&](auto w) { static_for<0, W, 1>{}([&](auto w) {
constexpr auto b_offset = constexpr index_t b_offset =
BDesc{}.CalculateOffset(make_tuple(e, 0, h, w)); BDesc{}.CalculateOffset(make_tuple(e, 0, h, w));
constexpr auto c_offset = constexpr index_t c_offset =
CDesc{}.CalculateOffset(make_tuple(k, 0, h, w)); CDesc{}.CalculateOffset(make_tuple(k, 0, h, w));
p_c[c_offset] += inner_product_with_conversion<FloatC>{}(p_a[a_offset], #if 0
p_b[b_offset]); p_c[Number<c_offset>{}] += inner_product_with_conversion<FloatC>{}(p_a[Number<a_offset>{}],
p_b[Number<b_offset>{}]);
#else
amd_assembly_inner_product(p_a[Number<a_offset>{}],
p_b[Number<b_offset>{}],
p_c[Number<c_offset>{}]);
#endif
}); });
}); });
} }
#else
constexpr index_t a_offset = ADesc{}.CalculateOffset(make_tuple(e, k));
static_for<0, H, 1>{}([&](auto h) {
static_for<0, W, 1>{}([&](auto w) {
constexpr index_t b_offset =
BDesc{}.CalculateOffset(make_tuple(e, 0, h, w));
constexpr index_t c_offset =
CDesc{}.CalculateOffset(make_tuple(k, 0, h, w));
amd_assembly_inner_product(p_a[Number<a_offset>{}],
p_b[Number<b_offset>{}],
p_c[Number<c_offset>{}]);
});
});
#endif
}); });
}); });
} }
......
...@@ -74,7 +74,6 @@ __device__ void amd_assembly_inner_product(const int8x16_t& a, const int8x16_t& ...@@ -74,7 +74,6 @@ __device__ void amd_assembly_inner_product(const int8x16_t& a, const int8x16_t&
c); c);
} }
#if 0
// c0 += inner_product(a, b0) // c0 += inner_product(a, b0)
// c1 += inner_product(a, b1) // c1 += inner_product(a, b1)
__device__ void amd_assembly_outer_product_1x2(float a, float b0, float b1, float& c0, float& c1) __device__ void amd_assembly_outer_product_1x2(float a, float b0, float b1, float& c0, float& c1)
...@@ -438,7 +437,6 @@ __device__ void amd_assembly_outer_product_1x4(int8x16_t a, ...@@ -438,7 +437,6 @@ __device__ void amd_assembly_outer_product_1x4(int8x16_t a,
c2, c2,
c3); c3);
} }
#endif
} // namespace ck } // namespace ck
#endif #endif
...@@ -28,11 +28,11 @@ ...@@ -28,11 +28,11 @@
#endif #endif
// launch bounds // launch bounds
#define CK_USE_LAUNCH_BOUNDS 0 #define CK_USE_LAUNCH_BOUNDS 1
#ifdef CK_USE_LAUNCH_BOUNDS #ifdef CK_USE_LAUNCH_BOUNDS
#define CK_MAX_THREAD_PER_BLOCK 256 #define CK_MAX_THREAD_PER_BLOCK 256
#define CK_MIN_BLOCK_PER_CU 2 #define CK_MIN_BLOCK_PER_CU 1
#endif #endif
// buffer resourse // buffer resourse
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment