debugging

0e70dfe9 · Jing Zhang · 268c497c · 0e70dfe9 · 0e70dfe9 · 0e70dfe9
Commit 0e70dfe9 authored Apr 27, 2024 by Jing Zhang
3 changed files
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle.hpp
@@ -101,8 +101,8 @@ struct DeviceGemmMultipleD_Wmma_CShuffle : public DeviceGemmMultipleD<ALayout,
        (MWaves == 1 && is_same<tensor_layout::gemm::ColumnMajor, BLayout>::value) ? false : true;
    // If true, LDS is used unconditionally
-    static constexpr auto AEnableLds_manu = true;
+    static constexpr auto AEnableLds_manu = false;
-    static constexpr auto BEnableLds_manu = true;
+    static constexpr auto BEnableLds_manu = false;
    static constexpr auto AEnableLds = AEnableLds_auto || AEnableLds_manu || (NumPrefetch > 1);
    static constexpr auto BEnableLds = BEnableLds_auto || BEnableLds_manu || (NumPrefetch > 1);

--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp
@@ -375,8 +375,9 @@ struct GridwiseGemmMultipleD_Wmma
            }
            else
            {
+		constexpr auto A_KRow = I2;
                constexpr auto KWmmaPerblock = KPerBlock / WmmaK;
-                constexpr auto K0PerWmma     = WmmaK / 2 / K1;
+                constexpr auto K0PerWmma     = WmmaK / A_KRow / K1;
                // KWmma->MRepeat->MWave->K0PerWmma->KRow->MPerWmma->K1 Per Thread
                return make_naive_tensor_descriptor(
                    make_tuple(Number<KWmmaPerblock>{},
@@ -422,8 +423,9 @@ struct GridwiseGemmMultipleD_Wmma
            }
            else
            {
+		constexpr auto B_KRow = I2;
                constexpr auto KWmmaPerblock = KPerBlock / WmmaK;
-                constexpr auto K0PerWmma     = WmmaK / 2 / K1;
+                constexpr auto K0PerWmma     = WmmaK / B_KRow / K1;
                // KWmma->NRepeat->MWave->K0PerWmma->KRow->MPerWmma->K1 Per Thread
                return make_naive_tensor_descriptor(
                    make_tuple(Number<KWmmaPerblock>{},
@@ -497,7 +499,11 @@ struct GridwiseGemmMultipleD_Wmma
                // AK0_M_AK1 -> AK0_MRepeat_Mwaves_AKRow_MPerWmma_AK1
                constexpr auto A_K0   = ABlockDesc_{}.GetLength(I0);
                constexpr auto A_K1   = ABlockDesc_{}.GetLength(I2);
+#ifdef __gfx12__
+                constexpr auto A_KRow = I2;
+#else
                constexpr auto A_KRow = I1;
+#endif
                return transform_tensor_descriptor(
                    ABlockDesc_{},
                    make_tuple(make_unmerge_transform(make_tuple(Number<A_K0>{}, A_KRow)),
@@ -536,7 +542,11 @@ struct GridwiseGemmMultipleD_Wmma
                // BK0_N_BK1 -> BK0_NRepeat_Nwaves_NPerWmma_BK1
                constexpr auto B_K0   = BBlockDesc_{}.GetLength(I0);
                constexpr auto B_K1   = BBlockDesc_{}.GetLength(I2);
+#ifdef __gfx12__
+                constexpr auto B_KRow = I2;
+#else
                constexpr auto B_KRow = I1;
+#endif
                return transform_tensor_descriptor(
                    BBlockDesc_{},
                    make_tuple(make_unmerge_transform(make_tuple(Number<B_K0>{}, B_KRow)),

--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp
@@ -295,7 +295,12 @@ struct GridwiseGemm_Wmma
                // AK0_M_AK1 -> AK0_MRepeat_Mwaves_AKRow_MPerWmma_AK1
                constexpr auto A_K0   = ABlockDesc_{}.GetLength(I0);
                constexpr auto A_K1   = ABlockDesc_{}.GetLength(I2);
+#ifdef __gfx12__
                constexpr auto A_KRow = I2;
+#else
+                constexpr auto A_KRow = I1;
+#endif
                return transform_tensor_descriptor(
                    ABlockDesc_{},
                    make_tuple(make_unmerge_transform(make_tuple(Number<A_K0>{}, A_KRow)),
@@ -310,6 +315,7 @@ struct GridwiseGemm_Wmma
                // KWmma_MRepeat_MWave_K0PerWmma_KRow_MPerWmma_K1 -> K0_MRepeat_Mwaves_MPerWmma_K1
                constexpr auto KWmma     = ABlockDesc_{}.GetLength(I0);
                constexpr auto K0PerWmma = ABlockDesc_{}.GetLength(I3);
+                constexpr auto A_KRow    = ABlockDesc_{}.GetLength(I4);
                constexpr auto A_K1      = ABlockDesc_{}.GetLength(I6);
                // Err: merge transform cause non-constexpr issue
@@ -334,7 +340,7 @@ struct GridwiseGemm_Wmma
                return make_naive_tensor_descriptor_packed(make_tuple(Number<KWmma * K0PerWmma>{},
                                                                      Number<MRepeat>{},
                                                                      I1,
-                                                                      I1,
+                                                                      Number<A_KRow>{},
                                                                      I1,
                                                                      Number<A_K1>{}));
            }
@@ -352,7 +358,11 @@ struct GridwiseGemm_Wmma
                // BK0_N_BK1 -> BK0_NRepeat_Nwaves_NPerWmma_BK1
                constexpr auto B_K0   = BBlockDesc_{}.GetLength(I0);
                constexpr auto B_K1   = BBlockDesc_{}.GetLength(I2);
+#ifdef __gfx12__
                constexpr auto B_KRow = I2;
+#else
+                constexpr auto B_KRow = I1;
+#endif
                return transform_tensor_descriptor(
                    BBlockDesc_{},
                    make_tuple(make_unmerge_transform(make_tuple(Number<B_K0>{}, B_KRow)),
@@ -367,13 +377,14 @@ struct GridwiseGemm_Wmma
                // KWmma_MRepeat_MWave_K0PerWmma_KRow_MPerWmma_K1 -> K0_MRepeat_Mwaves_MPerWmma_K1
                constexpr auto KWmma     = BBlockDesc_{}.GetLength(I0);
                constexpr auto K0PerWmma = BBlockDesc_{}.GetLength(I3);
+                constexpr auto B_KRow    = BBlockDesc_{}.GetLength(I4);
                constexpr auto B_K1      = BBlockDesc_{}.GetLength(I6);
                // Workaround, Freeze transform
                return make_naive_tensor_descriptor_packed(make_tuple(Number<KWmma * K0PerWmma>{},
                                                                      Number<NRepeat>{},
                                                                      I1,
-                                                                      I1,
+                                                                      Number<B_KRow>{},
                                                                      I1,
                                                                      Number<B_K1>{}));
            }