Added back amd_assembly_outer_product_1x2 and amd_assembly_outer_product_1x4

f5654649 · Chao Liu · 9d5d6afa · f5654649 · f5654649 · f5654649
Commit f5654649 authored Apr 22, 2021 by Chao Liu
4 changed files
--- a/composable_kernel/include/tensor_operation/threadwise_gemm_v2.hpp
+++ b/composable_kernel/include/tensor_operation/threadwise_gemm_v2.hpp
@@ -212,6 +212,8 @@ struct ThreadwiseGemm_km_kn_mn_v1r1

        constexpr auto I0 = Number<0>{};
        constexpr auto I1 = Number<1>{};
+        constexpr auto I2 = Number<2>{};
+        constexpr auto I3 = Number<3>{};

        constexpr auto M = CDesc{}.GetLength(I0);
        constexpr auto N = CDesc{}.GetLength(I1);
@@ -223,6 +225,46 @@ struct ThreadwiseGemm_km_kn_mn_v1r1

        static_for<0, K, 1>{}([&](auto k) {
            static_for<0, M, 1>{}([&](auto m) {
+                constexpr index_t a_offset = ADesc{}.CalculateOffset(make_tuple(k, m));
+
+                if constexpr(N == 2)
+                {
+                    constexpr index_t b_offset_0 = BDesc{}.CalculateOffset(make_tuple(k, I0));
+                    constexpr index_t b_offset_1 = BDesc{}.CalculateOffset(make_tuple(k, I1));
+
+                    constexpr index_t c_offset_0 = CDesc{}.CalculateOffset(make_tuple(m, I0));
+                    constexpr index_t c_offset_1 = CDesc{}.CalculateOffset(make_tuple(m, I1));
+
+                    amd_assembly_outer_product_1x2(a_buf[Number<a_offset>{}],
+                                                   b_buf[Number<b_offset_0>{}],
+                                                   b_buf[Number<b_offset_1>{}],
+                                                   c_buf(Number<c_offset_0>{}),
+                                                   c_buf(Number<c_offset_1>{}));
+                }
+                else if constexpr(N == 4)
+                {
+                    constexpr index_t b_offset_0 = BDesc{}.CalculateOffset(make_tuple(k, I0));
+                    constexpr index_t b_offset_1 = BDesc{}.CalculateOffset(make_tuple(k, I1));
+                    constexpr index_t b_offset_2 = BDesc{}.CalculateOffset(make_tuple(k, I2));
+                    constexpr index_t b_offset_3 = BDesc{}.CalculateOffset(make_tuple(k, I3));
+
+                    constexpr index_t c_offset_0 = CDesc{}.CalculateOffset(make_tuple(m, I0));
+                    constexpr index_t c_offset_1 = CDesc{}.CalculateOffset(make_tuple(m, I1));
+                    constexpr index_t c_offset_2 = CDesc{}.CalculateOffset(make_tuple(m, I2));
+                    constexpr index_t c_offset_3 = CDesc{}.CalculateOffset(make_tuple(m, I3));
+
+                    amd_assembly_outer_product_1x4(a_buf[Number<a_offset>{}],
+                                                   b_buf[Number<b_offset_0>{}],
+                                                   b_buf[Number<b_offset_1>{}],
+                                                   b_buf[Number<b_offset_2>{}],
+                                                   b_buf[Number<b_offset_3>{}],
+                                                   c_buf(Number<c_offset_0>{}),
+                                                   c_buf(Number<c_offset_1>{}),
+                                                   c_buf(Number<c_offset_2>{}),
+                                                   c_buf(Number<c_offset_3>{}));
+                }
+                else
+                {
                    static_for<0, N, 1>{}([&](auto n) {

                        constexpr index_t a_offset =
@@ -241,6 +283,7 @@ struct ThreadwiseGemm_km_kn_mn_v1r1
                            a_buf[Number<a_offset>{}], b_buf[Number<b_offset>{}]);
 #endif
                    });
+                }
            });
        });
    }

--- a/composable_kernel/include/tensor_operation/threadwise_gemm_v3.hpp
+++ b/composable_kernel/include/tensor_operation/threadwise_gemm_v3.hpp
@@ -61,72 +61,56 @@ struct ThreadwiseGemm_km_kn_mn_v3

        static_for<0, E, 1>{}([&](auto e) {
            static_for<0, K, 1>{}([&](auto k) {
-#if 0
-                constexpr auto a_offset = ADesc{}.CalculateOffset(make_tuple(e, k));
+                constexpr index_t a_offset = ADesc{}.CalculateOffset(make_tuple(e, k));

                if constexpr(H == 2 && W == 2)
                {

-                    constexpr auto b_offset_0 = BDesc{}.CalculateOffset(make_tuple(e, 0, 0, 0));
-                    constexpr auto b_offset_1 = BDesc{}.CalculateOffset(make_tuple(e, 0, 0, 1));
-                    constexpr auto b_offset_2 = BDesc{}.CalculateOffset(make_tuple(e, 0, 1, 0));
-                    constexpr auto b_offset_3 = BDesc{}.CalculateOffset(make_tuple(e, 0, 1, 1));
-
-                    constexpr auto c_offset_0 = CDesc{}.CalculateOffset(make_tuple(k, 0, 0, 0));
-                    constexpr auto c_offset_1 = CDesc{}.CalculateOffset(make_tuple(k, 0, 0, 1));
-                    constexpr auto c_offset_2 = CDesc{}.CalculateOffset(make_tuple(k, 0, 1, 0));
-                    constexpr auto c_offset_3 = CDesc{}.CalculateOffset(make_tuple(k, 0, 1, 1));
-
-                    amd_assembly_outer_product_1x4(p_a[a_offset],
-                                                   p_b[b_offset_0],
-                                                   p_b[b_offset_1],
-                                                   p_b[b_offset_2],
-                                                   p_b[b_offset_3],
-                                                   p_c[c_offset_0],
-                                                   p_c[c_offset_1],
-                                                   p_c[c_offset_2],
-                                                   p_c[c_offset_3]);
+                    constexpr index_t b_offset_0 = BDesc{}.CalculateOffset(make_tuple(e, 0, 0, 0));
+                    constexpr index_t b_offset_1 = BDesc{}.CalculateOffset(make_tuple(e, 0, 0, 1));
+                    constexpr index_t b_offset_2 = BDesc{}.CalculateOffset(make_tuple(e, 0, 1, 0));
+                    constexpr index_t b_offset_3 = BDesc{}.CalculateOffset(make_tuple(e, 0, 1, 1));
+
+                    constexpr index_t c_offset_0 = CDesc{}.CalculateOffset(make_tuple(k, 0, 0, 0));
+                    constexpr index_t c_offset_1 = CDesc{}.CalculateOffset(make_tuple(k, 0, 0, 1));
+                    constexpr index_t c_offset_2 = CDesc{}.CalculateOffset(make_tuple(k, 0, 1, 0));
+                    constexpr index_t c_offset_3 = CDesc{}.CalculateOffset(make_tuple(k, 0, 1, 1));
+
+                    amd_assembly_outer_product_1x4(p_a[Number<a_offset>{}],
+                                                   p_b[Number<b_offset_0>{}],
+                                                   p_b[Number<b_offset_1>{}],
+                                                   p_b[Number<b_offset_2>{}],
+                                                   p_b[Number<b_offset_3>{}],
+                                                   p_c[Number<c_offset_0>{}],
+                                                   p_c[Number<c_offset_1>{}],
+                                                   p_c[Number<c_offset_2>{}],
+                                                   p_c[Number<c_offset_3>{}]);
                }
                else if constexpr(H == 4 && W == 1)
                {

-                    constexpr auto b_offset_0 = BDesc{}.CalculateOffset(make_tuple(e, 0, 0, 0));
-                    constexpr auto b_offset_1 = BDesc{}.CalculateOffset(make_tuple(e, 0, 1, 0));
-                    constexpr auto b_offset_2 = BDesc{}.CalculateOffset(make_tuple(e, 0, 2, 0));
-                    constexpr auto b_offset_3 = BDesc{}.CalculateOffset(make_tuple(e, 0, 3, 0));
-
-                    constexpr auto c_offset_0 = CDesc{}.CalculateOffset(make_tuple(k, 0, 0, 0));
-                    constexpr auto c_offset_1 = CDesc{}.CalculateOffset(make_tuple(k, 0, 1, 0));
-                    constexpr auto c_offset_2 = CDesc{}.CalculateOffset(make_tuple(k, 0, 2, 0));
-                    constexpr auto c_offset_3 = CDesc{}.CalculateOffset(make_tuple(k, 0, 3, 0));
-
-                    amd_assembly_outer_product_1x4(p_a[a_offset],
-                                                   p_b[b_offset_0],
-                                                   p_b[b_offset_1],
-                                                   p_b[b_offset_2],
-                                                   p_b[b_offset_3],
-                                                   p_c[c_offset_0],
-                                                   p_c[c_offset_1],
-                                                   p_c[c_offset_2],
-                                                   p_c[c_offset_3]);
+                    constexpr index_t b_offset_0 = BDesc{}.CalculateOffset(make_tuple(e, 0, 0, 0));
+                    constexpr index_t b_offset_1 = BDesc{}.CalculateOffset(make_tuple(e, 0, 1, 0));
+                    constexpr index_t b_offset_2 = BDesc{}.CalculateOffset(make_tuple(e, 0, 2, 0));
+                    constexpr index_t b_offset_3 = BDesc{}.CalculateOffset(make_tuple(e, 0, 3, 0));
+
+                    constexpr index_t c_offset_0 = CDesc{}.CalculateOffset(make_tuple(k, 0, 0, 0));
+                    constexpr index_t c_offset_1 = CDesc{}.CalculateOffset(make_tuple(k, 0, 1, 0));
+                    constexpr index_t c_offset_2 = CDesc{}.CalculateOffset(make_tuple(k, 0, 2, 0));
+                    constexpr index_t c_offset_3 = CDesc{}.CalculateOffset(make_tuple(k, 0, 3, 0));
+
+                    amd_assembly_outer_product_1x4(p_a[Number<a_offset>{}],
+                                                   p_b[Number<b_offset_0>{}],
+                                                   p_b[Number<b_offset_1>{}],
+                                                   p_b[Number<b_offset_2>{}],
+                                                   p_b[Number<b_offset_3>{}],
+                                                   p_c[Number<c_offset_0>{}],
+                                                   p_c[Number<c_offset_1>{}],
+                                                   p_c[Number<c_offset_2>{}],
+                                                   p_c[Number<c_offset_3>{}]);
                }
                else
                {
-                    static_for<0, H, 1>{}([&](auto h) {
-                        static_for<0, W, 1>{}([&](auto w) {
-                            constexpr auto b_offset =
-                                BDesc{}.CalculateOffset(make_tuple(e, 0, h, w));
-                            constexpr auto c_offset =
-                                CDesc{}.CalculateOffset(make_tuple(k, 0, h, w));
-
-                            p_c[c_offset] += inner_product_with_conversion<FloatC>{}(p_a[a_offset],
-                                                                                     p_b[b_offset]);
-                        });
-                    });
-                }
-#else
-                constexpr index_t a_offset = ADesc{}.CalculateOffset(make_tuple(e, k));
-
                    static_for<0, H, 1>{}([&](auto h) {
                        static_for<0, W, 1>{}([&](auto w) {
                            constexpr index_t b_offset =
@@ -134,12 +118,17 @@ struct ThreadwiseGemm_km_kn_mn_v3
                            constexpr index_t c_offset =
                                CDesc{}.CalculateOffset(make_tuple(k, 0, h, w));

+#if 0
+                            p_c[Number<c_offset>{}] += inner_product_with_conversion<FloatC>{}(p_a[Number<a_offset>{}],
+                                                                                               p_b[Number<b_offset>{}]);
+#else
                            amd_assembly_inner_product(p_a[Number<a_offset>{}],
                                                       p_b[Number<b_offset>{}],
                                                       p_c[Number<c_offset>{}]);
+#endif
                        });
                    });
-#endif
+                }
            });
        });
    }

--- a/composable_kernel/include/utility/amd_inline_asm.hpp
+++ b/composable_kernel/include/utility/amd_inline_asm.hpp
@@ -74,7 +74,6 @@ __device__ void amd_assembly_inner_product(const int8x16_t& a, const int8x16_t&
                               c);
 }

-#if 0
 // c0 += inner_product(a, b0)
 // c1 += inner_product(a, b1)
 __device__ void amd_assembly_outer_product_1x2(float a, float b0, float b1, float& c0, float& c1)
@@ -438,7 +437,6 @@ __device__ void amd_assembly_outer_product_1x4(int8x16_t a,
                                   c2,
                                   c3);
 }
-#endif

 } // namespace ck
 #endif
--- a/composable_kernel/include/utility/config.amd.hpp.in
+++ b/composable_kernel/include/utility/config.amd.hpp.in
@@ -28,11 +28,11 @@
 #endif

 // launch bounds
-#define CK_USE_LAUNCH_BOUNDS 0
+#define CK_USE_LAUNCH_BOUNDS 1

 #ifdef CK_USE_LAUNCH_BOUNDS
 #define CK_MAX_THREAD_PER_BLOCK 256
-#define CK_MIN_BLOCK_PER_CU 2
+#define CK_MIN_BLOCK_PER_CU 1
 #endif

 // buffer resourse