Commit 8e8c6ea1 authored by aska-0096

Sanity Pass. Inefficient ILP

parent 088a4f2b
@@ -39,7 +39,7 @@ using DeviceGemmInstance1 = ck::tensor_operation::device::DeviceGemm_Xdl_CShuffl
// ######| | | | Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
// ######| | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
// ######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
- < ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType, AElementOp, BElementOp, CElementOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>;
+ < ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType, AElementOp, BElementOp, CElementOp, GemmDefault, 1, 64, 32, 32, 32, 8, 8, 32, 32, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>;
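// Note: the removed instance ran BlockSize 256 with a 256x128x32 M/N/K block tile and 4x2
// M/NXdlPerWave; the replacement runs BlockSize 64 with a 32x32x32 tile, 1x1 M/NXdlPerWave,
// and a S<1, 16, 1, 4> CShuffle thread cluster.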
// clang-format on
using DeviceGemmInstance = DeviceGemmInstance1;
......
@@ -76,6 +76,19 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
+ #if 0
+ printf("A Matrix\n");
+ for(int im = 0; im < M; im++)
+ {
+     for(int ik = 0; ik < K; ik++)
+     {
+         if(ik % 8 == 0)
+             printf("|");
+         printf("%04x ", *(reinterpret_cast<uint16_t*>(&(a_m_k(im, ik)))));
+     }
+     printf("\n");
+ }
+ #endif
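// When enabled, the block above dumps each A element's raw 16-bit bit pattern (fp16/bf16 via
// the uint16_t reinterpret), inserting a '|' every 8 elements, presumably to mark the
// K1-wide vector boundaries.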
std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl;
......
@@ -333,15 +333,10 @@ struct BlockwiseGemmXdlops_pipeline_v1
CThreadBuffer& c_thread_buf,
index_t num_loop) const
{
- auto a_thread_read_buf = make_static_buffer<AddressSpaceEnum::Vgpr, FloatAB>(
+ auto a_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, FloatAB>(
a_thread_desc_.GetElementSpaceSize());
- auto a_thread_compute_buf = make_static_buffer<AddressSpaceEnum::Vgpr, FloatAB>(
-     a_thread_desc_.GetElementSpaceSize());
- auto b_thread_read_buf = make_static_buffer<AddressSpaceEnum::Vgpr, FloatAB>(
-     b_thread_desc_.GetElementSpaceSize());
- auto b_thread_compute_buf = make_static_buffer<AddressSpaceEnum::Vgpr, FloatAB>(
+ auto b_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, FloatAB>(
b_thread_desc_.GetElementSpaceSize());
// preload data into LDS
a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf);
@@ -361,8 +356,8 @@ struct BlockwiseGemmXdlops_pipeline_v1
make_tuple(m0, I0, I0, I0),
a_block_buf,
a_thread_desc_,
- make_tuple(I0, I0, I0, I0),
- a_thread_read_buf);
+ make_tuple(m0, I0, I0, I0),
+ a_thread_buf);
static_for<0, NRepeat, 1>{}([&](auto n0) {
// read B
@@ -370,59 +365,64 @@ struct BlockwiseGemmXdlops_pipeline_v1
make_tuple(n0, I0, I0, I0),
b_block_buf,
b_thread_desc_,
- make_tuple(I0, I0, I0, I0),
- b_thread_read_buf);
+ make_tuple(n0, I0, I0, I0),
+ b_thread_buf);
});
});
// Initialize C
c_thread_buf.Clear();
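// At this point the prologue has staged k-slice 0 of A and B into half 0 of the VGPR
// double buffers (the I0 in dimension 1 of the copy destinations above).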
// main body
if constexpr(HasMainLoop)
{
+ #if 0
+ if(get_thread_local_1d_id() == 0)
+ {
+     printf("HasMainLoop=True\n");
+ }
+ #endif
index_t i = 0;
do
{
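// __builtin_amdgcn_sched_group_barrier(mask, size, syncID) is a compiler scheduling hint: it
// requests that `size` instructions matching `mask` be placed at this point in program order.
// Mask bits used below: 0x008 = MFMA, 0x020 = VMEM read, 0x100 = DS read, 0x200 = DS write.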
+ __builtin_amdgcn_sched_group_barrier(0x020, 4, 0); // VMEM read
+ __builtin_amdgcn_sched_group_barrier(0x100, 2, 0); // DS read
+ __builtin_amdgcn_sched_group_barrier(0x008, 2, 0); // MFMA
+ __builtin_amdgcn_sched_group_barrier(0x200, 4, 0); // DS write
+ __builtin_amdgcn_sched_group_barrier(0x008, 2, 0); // MFMA
+ __builtin_amdgcn_sched_group_barrier(0x100, 2, 0); // DS read
a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf);
// Here only KRepeat-1 iterations: read k-slice k (k = 1..KRepeat-1) & compute k-slice k-1 of this k-loop
static_for<1, KRepeat, 1>{}([&](auto k) { // k=1,2 instead of kpack*1, ...
- // Switch read/compute VGPR buffer
- a_thread_compute_buf = a_thread_read_buf;
- b_thread_compute_buf = b_thread_read_buf;
/* Read N+1 */
static_for<0, MRepeat, 1>{}([&](auto m0) {
// read A
- a_thread_copy_.Run(
-     a_block_desc_m0_m1_m2_k,
+ a_thread_copy_.Run(a_block_desc_m0_m1_m2_k,
make_tuple(m0, I0, I0, Number<k * AMmaKStride>{}),
a_block_buf,
a_thread_desc_,
- make_tuple(I0, I0, I0, I0),
- a_thread_read_buf);
+ make_tuple(m0, Number<k % 2>{}, I0, I0),
+ a_thread_buf);
static_for<0, NRepeat, 1>{}([&](auto n0) {
// read B
- b_thread_copy_.Run(
-     b_block_desc_n0_n1_n2_k,
+ b_thread_copy_.Run(b_block_desc_n0_n1_n2_k,
make_tuple(n0, I0, I0, Number<k * BMmaKStride>{}),
b_block_buf,
b_thread_desc_,
- make_tuple(I0, I0, I0, I0),
- b_thread_read_buf);
+ make_tuple(n0, Number<k % 2>{}, I0, I0),
+ b_thread_buf);
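// Ping-pong indexing: iteration k writes its A/B slice into buffer half (k % 2) while the
// MFMAs below consume half ((k - 1) % 2), filled in the previous iteration, letting the LDS
// reads for slice k overlap the math on slice k-1.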
/* Compute N */
vector_type<FloatAB, KPack> a_thread_vec;
vector_type<FloatAB, KPack> b_thread_vec;
static_for<0, KPack, 1>{}([&](auto ik) {
a_thread_vec.template AsType<FloatAB>()(ik) =
-     a_thread_compute_buf[Number<a_thread_desc_.CalculateOffset(
-         make_tuple(0, 0, 0, ik))>{}];
+     a_thread_buf[Number<a_thread_desc_.CalculateOffset(
+         make_tuple(m0, (k - 1) % 2, 0, ik))>{}];
b_thread_vec.template AsType<FloatAB>()(ik) =
-     b_thread_compute_buf[Number<b_thread_desc_.CalculateOffset(
-         make_tuple(0, 0, 0, ik))>{}];
+     b_thread_buf[Number<b_thread_desc_.CalculateOffset(
+         make_tuple(n0, (k - 1) % 2, 0, ik))>{}];
});
using mfma_input_type =
@@ -452,9 +452,6 @@ struct BlockwiseGemmXdlops_pipeline_v1
// Wait all wave produce next k-loop data
block_sync_lds();
- // switch read/compute VGPR buffer
- a_thread_compute_buf = a_thread_read_buf;
- b_thread_compute_buf = b_thread_read_buf;
// Here one read (k-slice 0) for the next k-loop & the last compute (k-slice KRepeat-1) of this k-loop
static_for<0, MRepeat, 1>{}([&](auto m0) {
// read A
@@ -462,8 +459,8 @@ struct BlockwiseGemmXdlops_pipeline_v1
make_tuple(m0, I0, I0, I0),
a_block_buf,
a_thread_desc_,
- make_tuple(I0, I0, I0, I0),
- a_thread_read_buf);
+ make_tuple(m0, I0, I0, I0),
+ a_thread_buf);
static_for<0, NRepeat, 1>{}([&](auto n0) {
// read B
@@ -471,17 +468,19 @@ struct BlockwiseGemmXdlops_pipeline_v1
make_tuple(n0, I0, I0, I0),
b_block_buf,
b_thread_desc_,
- make_tuple(I0, I0, I0, I0),
- b_thread_read_buf);
+ make_tuple(n0, I0, I0, I0),
+ b_thread_buf);
/* Compute N */
vector_type<FloatAB, KPack> a_thread_vec;
vector_type<FloatAB, KPack> b_thread_vec;
static_for<0, KPack, 1>{}([&](auto ik) {
- a_thread_vec.template AsType<FloatAB>()(ik) = a_thread_compute_buf
-     [Number<a_thread_desc_.CalculateOffset(make_tuple(0, 0, 0, ik))>{}];
- b_thread_vec.template AsType<FloatAB>()(ik) = b_thread_compute_buf
-     [Number<b_thread_desc_.CalculateOffset(make_tuple(0, 0, 0, ik))>{}];
+ a_thread_vec.template AsType<FloatAB>()(ik) =
+     a_thread_buf[Number<a_thread_desc_.CalculateOffset(
+         make_tuple(m0, (KRepeat - 1) % 2, 0, ik))>{}];
+ b_thread_vec.template AsType<FloatAB>()(ik) =
+     b_thread_buf[Number<b_thread_desc_.CalculateOffset(
+         make_tuple(n0, (KRepeat - 1) % 2, 0, ik))>{}];
});
using mfma_input_type =
@@ -503,40 +502,64 @@ struct BlockwiseGemmXdlops_pipeline_v1
{
// Here only KRepeat-1 times read & compute
static_for<1, KRepeat, 1>{}([&](auto k) { // k=1,2 instead of kpack*1, ...
- // switch read/compute VGPR buffer
- a_thread_compute_buf = a_thread_read_buf;
- b_thread_compute_buf = b_thread_read_buf;
/* Read N+1 */
static_for<0, MRepeat, 1>{}([&](auto m0) {
// read A
- a_thread_copy_.Run(
-     a_block_desc_m0_m1_m2_k,
+ a_thread_copy_.Run(a_block_desc_m0_m1_m2_k,
make_tuple(m0, I0, I0, Number<k * AMmaKStride>{}),
a_block_buf,
a_thread_desc_,
- make_tuple(I0, I0, I0, I0),
- a_thread_read_buf);
+ make_tuple(m0, Number<k % 2>{}, I0, I0),
+ a_thread_buf);
static_for<0, NRepeat, 1>{}([&](auto n0) {
// read B
- b_thread_copy_.Run(
-     b_block_desc_n0_n1_n2_k,
+ b_thread_copy_.Run(b_block_desc_n0_n1_n2_k,
make_tuple(n0, I0, I0, Number<k * BMmaKStride>{}),
b_block_buf,
b_thread_desc_,
- make_tuple(I0, I0, I0, I0),
- b_thread_read_buf);
+ make_tuple(n0, Number<k % 2>{}, I0, I0),
+ b_thread_buf);
/* Compute N */
vector_type<FloatAB, KPack> a_thread_vec;
vector_type<FloatAB, KPack> b_thread_vec;
static_for<0, KPack, 1>{}([&](auto ik) {
- a_thread_vec.template AsType<FloatAB>()(ik) = a_thread_compute_buf
-     [Number<a_thread_desc_.CalculateOffset(make_tuple(0, 0, 0, ik))>{}];
- b_thread_vec.template AsType<FloatAB>()(ik) = b_thread_compute_buf
-     [Number<b_thread_desc_.CalculateOffset(make_tuple(0, 0, 0, ik))>{}];
+ a_thread_vec.template AsType<FloatAB>()(ik) =
+     a_thread_buf[Number<a_thread_desc_.CalculateOffset(
+         make_tuple(m0, (k - 1) % 2, 0, ik))>{}];
+ b_thread_vec.template AsType<FloatAB>()(ik) =
+     b_thread_buf[Number<b_thread_desc_.CalculateOffset(
+         make_tuple(n0, (k - 1) % 2, 0, ik))>{}];
});
+ #if 0
+ if(get_thread_local_1d_id() == 0)
+ {
+     printf("rep of m.n.k (%01d, %01d, %01d)\n",
+            m0.value,
+            n0.value,
+            k.value - 1);
+ }
+ printf(
+     "Tid: %03d, A_compute_buf: %04x %04x %04x %04x %04x %04x %04x %04x\n",
+     get_thread_local_1d_id(),
+     *(reinterpret_cast<uint16_t*>(
+         &(a_thread_vec.template AsType<FloatAB>()(Number<0>{})))),
+     *(reinterpret_cast<uint16_t*>(
+         &(a_thread_vec.template AsType<FloatAB>()(Number<1>{})))),
+     *(reinterpret_cast<uint16_t*>(
+         &(a_thread_vec.template AsType<FloatAB>()(Number<2>{})))),
+     *(reinterpret_cast<uint16_t*>(
+         &(a_thread_vec.template AsType<FloatAB>()(Number<3>{})))),
+     *(reinterpret_cast<uint16_t*>(
+         &(a_thread_vec.template AsType<FloatAB>()(Number<4>{})))),
+     *(reinterpret_cast<uint16_t*>(
+         &(a_thread_vec.template AsType<FloatAB>()(Number<5>{})))),
+     *(reinterpret_cast<uint16_t*>(
+         &(a_thread_vec.template AsType<FloatAB>()(Number<6>{})))),
+     *(reinterpret_cast<uint16_t*>(
+         &(a_thread_vec.template AsType<FloatAB>()(Number<7>{})))));
+ #endif
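// When enabled, the block above prints which (m, n, k) repeat is being issued and the raw
// 16-bit hex of the eight A operands fed to the MFMA (the label "A_compute_buf" is left over
// from the old buffer naming; the values now come from a_thread_vec).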
using mfma_input_type =
typename vector_type<FloatAB, xdlops_gemm.K1PerXdlops>::type;
@@ -557,12 +580,40 @@ struct BlockwiseGemmXdlops_pipeline_v1
vector_type<FloatAB, KPack> b_thread_vec;
static_for<0, KPack, 1>{}([&](auto i) {
- a_thread_vec.template AsType<FloatAB>()(i) = a_thread_compute_buf
-     [Number<a_thread_desc_.CalculateOffset(make_tuple(0, 0, 0, i))>{}];
- b_thread_vec.template AsType<FloatAB>()(i) = b_thread_compute_buf
-     [Number<b_thread_desc_.CalculateOffset(make_tuple(0, 0, 0, i))>{}];
+ a_thread_vec.template AsType<FloatAB>()(i) =
+     a_thread_buf[Number<a_thread_desc_.CalculateOffset(
+         make_tuple(m0, (KRepeat - 1) % 2, 0, i))>{}];
+ b_thread_vec.template AsType<FloatAB>()(i) =
+     b_thread_buf[Number<b_thread_desc_.CalculateOffset(
+         make_tuple(n0, (KRepeat - 1) % 2, 0, i))>{}];
});
+ #if 0
+ if(get_thread_local_1d_id() == 0)
+ {
+     printf(
+         "rep of m.n.k (%01d, %01d, %01d)\n", m0.value, n0.value, KRepeat - 1);
+ }
+ printf("Tid: %03d, A_compute_buf: %04x %04x %04x %04x %04x %04x %04x %04x\n",
+        get_thread_local_1d_id(),
+        *(reinterpret_cast<uint16_t*>(
+            &(a_thread_vec.template AsType<FloatAB>()(Number<0>{})))),
+        *(reinterpret_cast<uint16_t*>(
+            &(a_thread_vec.template AsType<FloatAB>()(Number<1>{})))),
+        *(reinterpret_cast<uint16_t*>(
+            &(a_thread_vec.template AsType<FloatAB>()(Number<2>{})))),
+        *(reinterpret_cast<uint16_t*>(
+            &(a_thread_vec.template AsType<FloatAB>()(Number<3>{})))),
+        *(reinterpret_cast<uint16_t*>(
+            &(a_thread_vec.template AsType<FloatAB>()(Number<4>{})))),
+        *(reinterpret_cast<uint16_t*>(
+            &(a_thread_vec.template AsType<FloatAB>()(Number<5>{})))),
+        *(reinterpret_cast<uint16_t*>(
+            &(a_thread_vec.template AsType<FloatAB>()(Number<6>{})))),
+        *(reinterpret_cast<uint16_t*>(
+            &(a_thread_vec.template AsType<FloatAB>()(Number<7>{})))));
+ #endif
using mfma_input_type =
typename vector_type<FloatAB, xdlops_gemm.K1PerXdlops>::type;
@@ -579,13 +630,15 @@ struct BlockwiseGemmXdlops_pipeline_v1
}
protected:
+ // M1, N1 as double buffer index
- // Read buffer + Compute buffer
// A[M0, M1, M2, KPack]
static constexpr auto a_thread_desc_ =
- make_naive_tensor_descriptor_packed(make_tuple(I1, I1, I1, Number<KPack>{}));
+ make_naive_tensor_descriptor_packed(make_tuple(Number<MRepeat>{}, I2, I1, Number<KPack>{}));
// B[N0, N1, N2, KPack]
static constexpr auto b_thread_desc_ =
- make_naive_tensor_descriptor_packed(make_tuple(I1, I1, I1, Number<KPack>{}));
+ make_naive_tensor_descriptor_packed(make_tuple(Number<NRepeat>{}, I2, I1, Number<KPack>{}));
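// A minimal sketch (not CK code) of the ping-pong scheme that the I2 dimension above
// encodes, assuming generic load()/compute() helpers:
//
//   FloatAB buf[2][KPack];               // two halves, selected by k % 2
//   load(buf[0], /*slice*/ 0);           // prologue stages slice 0
//   for(int k = 1; k < KRepeat; ++k)
//   {
//       load(buf[k % 2], k);             // fill one half with slice k
//       compute(buf[(k - 1) % 2]);       // consume the half filled last iteration
//   }
//   compute(buf[(KRepeat - 1) % 2]);     // tail: consume the final slice
//
// This removes the explicit read-buffer to compute-buffer copies of the previous version.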
// C[M, N, NumRegXdlops]
static constexpr auto c_thread_desc_ = make_naive_tensor_descriptor_packed(
......
@@ -22,6 +22,7 @@ __global__ void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
#endif
+ // __attribute__((amdgpu_waves_per_eu(1, 1)))
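// If uncommented, amdgpu_waves_per_eu(1, 1) would pin occupancy to exactly one wave per
// execution unit, maximizing the VGPR budget per wave at the cost of inter-wave latency hiding.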
kernel_gemm_xdl_cshuffle_v1(typename GridwiseGemm::Argument karg)
{
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
......