fixed bug in global writes

8a891bbd · root · 86580888 · 8a891bbd · 8a891bbd
Commit 8a891bbd authored Jun 23, 2022 by root
Showing with 19 additions and 3 deletions

example/01_gemm/gemm_xdl_fp16.cpp example/01_gemm/gemm_xdl_fp16.cpp +3 -3

include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_waveletmodel_cshuffle.hpp ...tion/gpu/grid/gridwise_gemm_xdl_waveletmodel_cshuffle.hpp +16 -0

No files found.
--- a/example/01_gemm/gemm_xdl_fp16.cpp
+++ b/example/01_gemm/gemm_xdl_fp16.cpp
@@ -57,7 +57,7 @@ using DeviceGemmInstance_WaveletModel = ck::tensor_operation::device::DeviceGemm
 //######|        |        |        |      |      |      |        |         |   Operation|   Operation|   Operation|               |    Stage|                |                |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
 //######|        |        |        |      |      |      |        |         |            |            |            |               |         |                |                |      |      |      |    |    |
 //######|        |        |        |      |      |      |        |         |            |            |            |               |         |                |                      
-        <     Row,     Col,     Row,   F16,   F16,   F16,     F32,      F16,  AElementOp,  BElementOp,  CElementOp,    GemmDefault,        1,             256,             256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 64, 1, 8>,               8>;
+        <     Row,     Col,     Row,   F16,   F16,   F16,     F32,      F16,  AElementOp,  BElementOp,  CElementOp,    GemmDefault,        1,             256,             256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>;
 // clang-format on
 // clang-format on
@@ -160,8 +160,8 @@ int main(int argc, char* argv[])
    // do GEMM
    //replace DeviceGemmInstance_WaveletModel for wavelet gemm pipeline
-    auto gemm     = DeviceGemmInstance_WaveletModel{};
+    //auto gemm     = DeviceGemmInstance_WaveletModel{};
-    //auto gemm     = DeviceGemmInstance{};
+    auto gemm     = DeviceGemmInstance{};
    auto invoker  = gemm.MakeInvoker();
    auto argument = gemm.MakeArgument(static_cast<ADataType*>(a_m_k_device_buf.GetDeviceBuffer()),
                                      static_cast<BDataType*>(b_k_n_device_buf.GetDeviceBuffer()),

--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_waveletmodel_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_waveletmodel_cshuffle.hpp
@@ -505,6 +505,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_waveletmodel_cshuffle
        auto c_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_c_grid, c_grid_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize());
+	// TODO re-architect LDS+math stages 
        GridwiseGemmMath::template RunMathWavePipeline<HasMainKBlockLoop>(a_block_buf,
                                                          b_block_buf,
                                                          blockwise_gemm,
@@ -684,6 +685,21 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_waveletmodel_cshuffle
            static_assert(num_access == sfc_c_global.GetNumOfAccess(), "wrong!");
+	    //TODO  
+	    //      1. writing f32 elements to LDS and reading them back for f16/bf16 output over-utilizes LDS BW;
+	    //         change the pipeline to do the f32-to-f16 conversion before the write to reduce LDS BW by 50%
+	    //      2. we do not need to do an LDS swizzle to align global writes with cache lines
+	    //         v_mfma  cmat, amat, bmat, cmat   - c-mat register layout is 1xN elements  (N is the vertical or strided dimension)
+	    //         v_mfma  cmat, bmat, amat, cmat   - c-mat register layout is Mx1 elements  (M is the coalescing dimension)
+	    //         by enumerating the M index in amat, bmat you can align the cmat register(s) to contiguous M elements
+	    //         for example
+	    //              1st mfma instruction  output space : 0 4 8  12 16 ....
+	    //              2nd mfma instruction  output space : 1 5 9  13 17 ....
+	    //              3rd mfma instruction  output space : 2 6 10 14 18 ....
+	    //              4th mfma instruction  output space : 3 7 11 15 19 ....
+	    //              you can pack 4 registers output space into 2WORD and do global write (no LDS swizzling required)
+	    //     3. avoid using s_barrier 
            static_for<0, num_access, 1>{}([&](auto access_id) {
                // make sure it's safe to write to LDS
                block_sync_lds();