debugging

14b422d7 · Jing Zhang · f221c68e · 14b422d7 · 14b422d7 · 14b422d7
Commit 14b422d7 authored Feb 28, 2024 by Jing Zhang
5 changed files
--- a/example/01_gemm/gemm_wmma_fp16.cpp
+++ b/example/01_gemm/gemm_wmma_fp16.cpp
@@ -21,9 +21,8 @@ using CElementOp = PassThrough;
 static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
-// clang-format off
+using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmWmma_CShuffle<
-using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmWmma_CShuffle
+    ALayout,
-         < ALayout,             
    BLayout,
    CLayout,
    ADataType,
@@ -36,23 +35,23 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmWmma_CShuffle
    CElementOp,
    GemmDefault,
    1,  // Prefetch stage
-           128,         // BlockSize
+    32, // BlockSize
-           64,          // MPerBlock
+    16, // MPerBlock
-           128,         // NPerBlock
+    16, // NPerBlock
    64, // KPerBlock
    8,  // K1
    16, // MPerWmma
    16, // NPerWmma
-           2,           // M-Repeat // M-PerWmma / M-Repeat = M-Wave
+    1,  // M-Repeat // M-PerWmma / M-Repeat = M-Wave
-           4,           // N-Repeat // N-PerWmma / N-Repeat = N-Wave
+    1,  // N-Repeat // N-PerWmma / N-Repeat = N-Wave
-           S<4, 32, 1>,     
+    S<4, 8, 1>,
    S<1, 0, 2>,
    S<1, 0, 2>,
    2,
    8,
    8,
    true,
-           S<4, 32, 1>,     
+    S<4, 8, 1>,
    S<1, 0, 2>,
    S<1, 0, 2>,
    2,
@@ -61,9 +60,8 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmWmma_CShuffle
    true,
    1, // C shuffle (M Repeat) Per store
    1, // C shuffle (N Repeat) Per store
-           S<1, 32, 1,  4>,               
+    S<1, 16, 1, 2>,
    8>;
-// clang-format on
 using ReferenceGemmInstance = ck::tensor_operation::host::
    ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>;

--- a/example/01_gemm/run_gemm_example.inc
+++ b/example/01_gemm/run_gemm_example.inc
@@ -73,12 +73,12 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
        ck::utils::FillUniformDistribution<BDataType>{-1.f, 1.f}(b_k_n);
        break;
    case 3:
-        ck::utils::FillUniformDistributionIntegerValue<ADataType>{1.f, 1.f}(a_m_k);
+        ck::utils::FillConstant<ADataType>{static_cast<ADataType>(1.f)}(a_m_k);
        ck::utils::FillUniformDistributionIntegerValue<BDataType>{-5.f, 5.f}(b_k_n);
        break;
    case 4:
-        ck::utils::FillUniformDistributionIntegerValue<ADataType>{1.f, 1.f}(a_m_k);
+        ck::utils::FillUniformDistributionIntegerValue<ADataType>{-5.f, 5.f}(a_m_k);
-        ck::utils::FillUniformDistributionIntegerValue<BDataType>{1.f, 1.f}(b_k_n);
+        ck::utils::FillConstant<BDataType>{static_cast<BDataType>(1.f)}(b_k_n);
        break;
    case 5:
        ck::utils::FillUniformDistributionIntegerValue<ADataType>{-2.f, 2.f}(a_m_k);

--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp
@@ -453,6 +453,7 @@ struct BlockwiseGemmWMMA
                                             A_K1>;
    };
+#if 0
    template <>
    struct AThreadCopySelector<false>
    {
@@ -467,6 +468,7 @@ struct BlockwiseGemmWMMA
            5,
            A_K1>;
    };
+#endif
    template <bool EnableLds>
    struct BThreadCopySelector;
@@ -486,6 +488,7 @@ struct BlockwiseGemmWMMA
                                             B_K1>;
    };
+#if 0
    template <>
    struct BThreadCopySelector<false>
    {
@@ -500,6 +503,7 @@ struct BlockwiseGemmWMMA
            5,
            B_K1>;
    };
+#endif
    typename AThreadCopySelector<AEnableLds>::type a_thread_copy_;
    typename BThreadCopySelector<BEnableLds>::type b_thread_copy_;

--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp
@@ -97,8 +97,10 @@ struct DeviceGemmWmma_CShuffle : public DeviceGemm<ALayout,
    static constexpr auto AEnableLds_manu = false;
    static constexpr auto BEnableLds_manu = false;
-    static constexpr auto AEnableLds = AEnableLds_auto || AEnableLds_manu || (NumPrefetch > 1);
+    static constexpr auto AEnableLds =
-    static constexpr auto BEnableLds = BEnableLds_auto || BEnableLds_manu || (NumPrefetch > 1);
+        true; // AEnableLds_auto || AEnableLds_manu || (NumPrefetch > 1);
+    static constexpr auto BEnableLds =
+        true; // BEnableLds_auto || BEnableLds_manu || (NumPrefetch > 1);
    static constexpr auto matrix_padder =
        MatrixPadder<GemmSpec, index_t, index_t, index_t>{MPerBlock, NPerBlock, KPerBlock};

--- a/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp
+++ b/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp
@@ -141,8 +141,8 @@ struct wmma_type<WmmaInstr::wmma_f32_16x16x16_f16_gfx12,
    // Wave mode dependent property
    static constexpr index_t wave_size = Number<WaveSize>{};
    // * Fixed in Navi3x, Will be wave mode dependent on Navi4x
-    static constexpr index_t num_src_a_vgprs_per_wave = k_per_wmma / 2 * src_a_data_size / 4;
+    // static constexpr index_t num_src_a_vgprs_per_wave = k_per_wmma / 2 * src_a_data_size / 4;
-    static constexpr index_t num_src_b_vgprs_per_wave = k_per_wmma / 2 * src_b_data_size / 4;
+    // static constexpr index_t num_src_b_vgprs_per_wave = k_per_wmma / 2 * src_b_data_size / 4;
    // * num_acc_vgprs_per_wave along M direction
    // * num_subgroups along M direction
    static constexpr index_t num_acc_vgprs_per_wave =
@@ -158,6 +158,7 @@ struct wmma_type<WmmaInstr::wmma_f32_16x16x16_f16_gfx12,
        }
        else if constexpr(wave_size == 64)
        {
+            static_assert(1, "");
        }
    }
 };