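tweak: switch the gridwise xdlops GEMM to BlockwiseGemmXdlops_km_kn_m0m1m2n_v1_2x2pipeline and retune the v4r4 xdlops driver (GemmM/NPerWave 64 -> 32, M/NRepeat 1 -> 2)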

776721ab · Jing Zhang · 0e5848a4 · 776721ab · 776721ab · 776721ab
Commit 776721ab authored May 21, 2021 by Jing Zhang
3 changed files
--- a/composable_kernel/include/tensor_operation/blockwise_gemm_xdlops.hpp
+++ b/composable_kernel/include/tensor_operation/blockwise_gemm_xdlops.hpp
@@ -285,13 +285,14 @@ struct BlockwiseGemmXdlops_km_kn_m0m1m2n_v1_2x2pipeline
        }
    }

+    template <index_t m0, index_t n0, index_t xdlops_i, index_t blk_i>
    __device__ static CIndex
-    CalculateCThreadOriginDataIndex(const index_t m0, const index_t n0, const index_t blk_i)
+        CalculateCThreadOriginDataIndex(Number<m0>, Number<n0>, Number<xdlops_i>, Number<blk_i>)
    {

        const index_t waveId = get_thread_local_1d_id() / WaveSize;

-        const auto thread_mtx_on_blk = xdlops_gemm.GetBeginOfThreadBlk(blk_i);
+        const auto thread_mtx_on_blk = xdlops_gemm.GetBeginOfThreadBlk(xdlops_i, blk_i);

        const index_t waveId_m = waveId / NWaves;
        const index_t waveId_n = waveId % NWaves;

--- a/composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_xdlops.hpp
+++ b/composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_xdlops.hpp
@@ -302,14 +302,14 @@ struct GridwiseDynamicGemm_km_kn_m0m1n0n1_xdlops_v1
            make_tuple(Sequence<0>{}, Sequence<1, 2>{}));

        const auto blockwise_gemm =
-            BlockwiseGemmXdlops_km_kn_m0m1m2n_v1<BlockSize,
-                                                 FloatAB,
-                                                 FloatAB,
-                                                 decltype(a_k_m0_m1_block_desc),
-                                                 decltype(b_k_n0_n1_block_desc),
-                                                 MPerWave,
-                                                 NPerWave,
-                                                 KPerWave>{};
+            BlockwiseGemmXdlops_km_kn_m0m1m2n_v1_2x2pipeline<BlockSize,
+                                                             FloatAB,
+                                                             FloatAB,
+                                                             decltype(a_k_m0_m1_block_desc),
+                                                             decltype(b_k_n0_n1_block_desc),
+                                                             MPerWave,
+                                                             NPerWave,
+                                                             KPerWave>{};
        constexpr auto CLayout = blockwise_gemm.GetCLayout();

        constexpr index_t BlkSize   = CLayout.GetBlkSize();

--- a/driver/include/device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.hpp
+++ b/driver/include/device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.hpp
@@ -108,12 +108,12 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw
    constexpr index_t GemmNPerBlock = 128;
    constexpr index_t GemmKPerBlock = 16;

-    constexpr index_t GemmMPerWave = 64;
-    constexpr index_t GemmNPerWave = 64;
+    constexpr index_t GemmMPerWave = 32;
+    constexpr index_t GemmNPerWave = 32;
    constexpr index_t GemmKPerWave = 4;

-    constexpr index_t MRepeat = 1;
-    constexpr index_t NRepeat = 1;
+    constexpr index_t MRepeat = 2;
+    constexpr index_t NRepeat = 2;

    using GemmABlockTransferThreadSliceLengths_GemmK_GemmM   = Sequence<4, 2>;
    using GemmABlockTransferThreadClusterLengths_GemmK_GemmM = Sequence<4, 64>;