debugging

f1d8217d · Jing Zhang · fe728dc5 · f1d8217d · f1d8217d · f1d8217d
Commit f1d8217d authored Apr 21, 2024 by Jing Zhang
3 changed files
--- a/example/01_gemm/gemm_wmma_fp16.cpp
+++ b/example/01_gemm/gemm_wmma_fp16.cpp
@@ -19,48 +19,51 @@ using AElementOp = PassThrough;
 using BElementOp = PassThrough;
 using CElementOp = PassThrough;

-static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::MNKPadding;

-using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmWmma_CShuffle<ALayout,
-                                                                                 BLayout,
-                                                                                 CLayout,
-                                                                                 ADataType,
-                                                                                 BDataType,
-                                                                                 CDataType,
-                                                                                 AccDataType,
-                                                                                 CShuffleDataType,
-                                                                                 AElementOp,
-                                                                                 BElementOp,
-                                                                                 CElementOp,
-                                                                                 GemmDefault,
-                                                                                 1,
-                                                                                 32,
-                                                                                 16,
-                                                                                 32,
-                                                                                 64,
-                                                                                 8,
-                                                                                 16,
-                                                                                 16,
-                                                                                 1,
-                                                                                 2,
-                                                                                 S<2, 16, 1>,
-                                                                                 S<1, 0, 2>,
-                                                                                 S<1, 0, 2>,
-                                                                                 2,
-                                                                                 8,
-                                                                                 8,
-                                                                                 true,
-                                                                                 S<2, 16, 1>,
-                                                                                 S<1, 0, 2>,
-                                                                                 S<1, 0, 2>,
-                                                                                 2,
-                                                                                 8,
-                                                                                 8,
-                                                                                 true,
-                                                                                 1,
-                                                                                 1,
-                                                                                 S<1, 16, 1, 2>,
-                                                                                 8>;
+// clang-format off
+using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmWmma_CShuffle // device GEMM configuration for the gemm_wmma_fp16 example
+         < ALayout,
+           BLayout,
+           CLayout,
+           ADataType,
+           BDataType,
+           CDataType,
+           AccDataType,
+           CShuffleDataType,
+           AElementOp,
+           BElementOp,
+           CElementOp,
+           GemmDefault, // NOTE: despite its name, this constant is set to GemmSpecialization::MNKPadding above
+           1,           // Prefetch stage
+           128,         // BlockSize
+           64,          // MPerBlock
+           128,         // NPerBlock
+           64,          // KPerBlock
+           8,           // K1
+           16,          // MPerWmma
+           16,          // NPerWmma
+           2,           // M-Repeat // presumably M-Wave = MPerBlock / (M-Repeat * MPerWmma) = 64 / (2 * 16) = 2 -- TODO confirm against the device op
+           4,           // N-Repeat // presumably N-Wave = NPerBlock / (N-Repeat * NPerWmma) = 128 / (4 * 16) = 2 -- TODO confirm against the device op
+           S<4, 32, 1>, // presumably the A-side block-transfer arguments start here (7 values); 4 * 32 * 1 = 128 matches BlockSize -- TODO confirm names
+           S<1, 0, 2>,
+           S<1, 0, 2>,
+           2,
+           8,
+           8,
+           true,
+           S<4, 32, 1>, // presumably the B-side block-transfer arguments start here (same 7 values as the A side) -- TODO confirm names
+           S<1, 0, 2>,
+           S<1, 0, 2>,
+           2,
+           8,
+           8,
+           true,
+           1,           // C shuffle (M Repeat) Per store
+           1,           // C shuffle (N Repeat) Per store
+           S<1, 32, 1,  4>, // presumably the C-shuffle store thread cluster; 1 * 32 * 1 * 4 = 128 matches BlockSize -- TODO confirm
+           8>;
+// clang-format on

 using ReferenceGemmInstance = ck::tensor_operation::host::
    ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>;

--- a/example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_wmma_fp16.cpp
+++ b/example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_wmma_fp16.cpp
@@ -83,14 +83,14 @@ using DeviceOpInstanceKKNN =
                                                                                  2,
                                                                                  4,
                                                                                  4,
-                                                                                  true,
+                                                                                  false,
                                                                                  S<4, 32, 1>,
                                                                                  S<1, 0, 2>,
                                                                                  S<1, 0, 2>,
                                                                                  2,
                                                                                  4,
                                                                                  4,
-                                                                                  true,
+                                                                                  false,
                                                                                  1,
                                                                                  1,
                                                                                  S<1, 64, 1, 2>,

--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp
@@ -137,8 +137,8 @@ struct DeviceBatchedContractionMultipleD_Wmma_CShuffle
    static constexpr auto BEnableLds_auto = MWaves == 1 ? false : true;

    // If true, LDS is used unconditionally
-    static constexpr auto AEnableLds_manu = true;
-    static constexpr auto BEnableLds_manu = true;
+    static constexpr auto AEnableLds_manu = false; // A: LDS not forced; AEnableLds_auto / NumPrefetch decide
+    static constexpr auto BEnableLds_manu = false; // B: LDS not forced; BEnableLds_auto / NumPrefetch decide

    static constexpr auto AEnableLds = AEnableLds_auto || AEnableLds_manu || (NumPrefetch > 1);
    static constexpr auto BEnableLds = BEnableLds_auto || BEnableLds_manu || (NumPrefetch > 1);