gaoqiong / composable_kernel_ROCM · Commits

Commit 8c0e03ba, authored Feb 11, 2025 by mtgu0705
General fix.
parent f0fba871

Showing 6 changed files with 38 additions and 12 deletions (+38 -12)
example/01_gemm/gemm_xdl_fp8_pk_i4_bpreshuffle_v3.cpp  (+5, -4)
include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_selector.hpp  (+9, -0)
include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_base.hpp  (+16, -3)
include/ck/tensor_operation/gpu/device/device_gemm_v2.hpp  (+4, -2)
include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3_b_preshuffle.hpp  (+1, -0)
include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp  (+3, -3)
example/01_gemm/gemm_xdl_fp8_pk_i4_bpreshuffle_v3.cpp

@@ -22,7 +22,7 @@ using CLayout = Row;
 void preShuffleBuffer(const I4* src, I4* dst, int N, int K, int NXdl)
 {
-    int KPack = 32;
+    int KPack = 32; // int4 -> 32, fp8 -> 16, fp16 -> 8
     int NLane = NXdl;
     int KLane = 64 / NLane;

@@ -174,7 +174,10 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
     DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n_permute.mDesc.GetElementSpaceSize());
     DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize());

-    int NperXdl = GetPreShuffleParameters;
+    // do GEMM
+    auto gemm = DeviceGemmV2Instance{};
+    int NperXdl = gemm.GetPreShuffleParameters();

     preShuffleBuffer(b_k_n.mData.data(), b_k_n_preshuffled.mData.data(), N, K, NperXdl);

     // weight permute

@@ -263,8 +266,6 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
     auto b_element_op = BElementOp{};
     auto c_element_op = CElementOp{};

-    // do GEMM
-    auto gemm = DeviceGemmV2Instance{};
     auto invoker = gemm.MakeInvoker();

     float ave_time = 0;
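The net effect of the second and third hunks is that run_gemm now constructs the DeviceGemmV2Instance before the weight pre-shuffle and queries NperXdl from it via gemm.GetPreShuffleParameters(), instead of naming GetPreShuffleParameters without an object as before. Below is a minimal, self-contained sketch (not taken from the CK sources) of the sizing arithmetic that preShuffleBuffer is built on; the NXdl value of 16 and the reading of 64 as the wavefront width are illustrative assumptions, not facts from this diff.

// Sketch only: the per-wave tiling arithmetic used by preShuffleBuffer, with an assumed NXdl.
#include <cstdio>

int main()
{
    const int NXdl  = 16;         // assumption: in the example this comes from gemm.GetPreShuffleParameters()
    const int KPack = 32;         // int4 -> 32, fp8 -> 16, fp16 -> 8 (comment added in this commit)
    const int NLane = NXdl;       // one N lane per XDL output column
    const int KLane = 64 / NLane; // remaining lanes of the (assumed) 64-wide wave stack along K
    std::printf("NLane=%d KLane=%d KPack=%d\n", NLane, KLane, KPack);
    return 0;
}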
include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_selector.hpp

@@ -11,6 +11,15 @@
 namespace ck {

+enum struct BlockGemmPipelineVersion
+{
+    v1, // Naive
+    v2, // Mem
+    v3, // Comp
+    v4, // Comp, double lds buffer
+    v5, // Comp, double global prefetch register buffer
+};
+
 template <BlockGemmPipelineVersion BlkGemmPipelineVer,
           BlockGemmPipelineScheduler BlkGemmPipeSche,
           index_t BlockSize,
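The selector header now carries the BlockGemmPipelineVersion enumeration itself, one enumerator per pipeline flavour (naive, memory-bound, compute-bound, and the two double-buffered compute variants). As a purely hypothetical illustration of compile-time dispatch on such an enum -- this is not the CK selector's actual implementation -- a small if constexpr switch could look like this:

// Hypothetical sketch, not the CK selector: compile-time dispatch on BlockGemmPipelineVersion.
#include <cstdio>

enum struct BlockGemmPipelineVersion
{
    v1, // Naive
    v2, // Mem
    v3, // Comp
    v4, // Comp, double lds buffer
    v5, // Comp, double global prefetch register buffer
};

template <BlockGemmPipelineVersion Ver>
constexpr const char* pipeline_name()
{
    if constexpr(Ver == BlockGemmPipelineVersion::v1) { return "naive"; }
    else if constexpr(Ver == BlockGemmPipelineVersion::v2) { return "memory-bound"; }
    else if constexpr(Ver == BlockGemmPipelineVersion::v3) { return "compute-bound"; }
    else if constexpr(Ver == BlockGemmPipelineVersion::v4) { return "compute-bound, double LDS buffer"; }
    else { return "compute-bound, double global prefetch register buffer"; }
}

int main()
{
    std::printf("selected pipeline: %s\n", pipeline_name<BlockGemmPipelineVersion::v3>());
    return 0;
}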
include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_base.hpp

@@ -46,7 +46,8 @@ struct BlockwiseGemmXdlops_pipeline_base
     static constexpr index_t A_K0 = ATileDesc{}.GetLength(I0);
     static constexpr index_t B_K0 = BTileDesc{}.GetLength(I0);
     static constexpr index_t A_K1 = ATileDesc{}.GetLength(I2);
-    static constexpr index_t B_K1 = BTileDesc{}.GetLength(I2);
+    // static constexpr index_t B_K1 = BTileDesc{}.GetLength(I2);
+    static constexpr index_t B_K1 = BBlockTransferSrcScalarPerVector;

     static constexpr auto xdlops_gemm =
         XdlopsGemm<ComputeDataType, MPerXDL, NPerXDL, KPack, ComputeDataType, TransposeC>{};

@@ -54,8 +55,9 @@ struct BlockwiseGemmXdlops_pipeline_base
     static constexpr index_t AMmaKStride = KPack;
     static constexpr index_t BMmaKStride = KPack;

     static constexpr index_t KPerThread = KPerBlock / xdlops_gemm.K0PerXdlops;
     static constexpr index_t KRepeat    = KPerThread / KPack;
+    static constexpr index_t KPerInnerLoop = KPack;

     static constexpr index_t MWaves = MPerBlock / (MRepeat * MPerXDL);
     static constexpr index_t NWaves = NPerBlock / (NRepeat * NPerXDL);

@@ -111,6 +113,17 @@ struct BlockwiseGemmXdlops_pipeline_base
         return make_tuple(0, waveId_m, xdlops_a_idx[I1], KPerThread * xdlops_a_idx[I0]);
     }

+    __device__ static auto CalculateAThreadOriginDataIndex6D()
+    {
+        const auto wave_idx = GetWaveIdx();
+
+        const auto waveId_m = wave_idx[I0];
+
+        const auto xdlops_a_idx = xdlops_gemm.CalculateAThreadOriginDataIndex();
+
+        return make_tuple(0, waveId_m, xdlops_a_idx[I1], 0, xdlops_a_idx[I0], 0);
+    }
+
     __device__ static auto CalculateBThreadOriginDataIndex()
     {
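Two things change in the pipeline base: B_K1 is now taken from BBlockTransferSrcScalarPerVector rather than from the B tile descriptor (whose original definition is kept as a comment), and a KPerInnerLoop constant joins KPerThread and KRepeat, alongside a new 6D variant of the A-thread origin calculation. A small worked example of how these constants relate, using assumed numbers that are not part of this diff, is:

// Worked example with assumed parameters (illustration only, not from this diff).
#include <cstdio>

int main()
{
    const int KPerBlock   = 128; // assumed block tile along K
    const int KPack       = 32;  // assumed packed-K granularity (e.g. packed int4)
    const int K0PerXdlops = 1;   // assumed xdlops_gemm.K0PerXdlops for the chosen MFMA

    const int KPerThread    = KPerBlock / K0PerXdlops; // 128
    const int KRepeat       = KPerThread / KPack;      // 4 inner-loop repeats along K
    const int KPerInnerLoop = KPack;                   // the constant added in this commit

    std::printf("KPerThread=%d KRepeat=%d KPerInnerLoop=%d\n",
                KPerThread, KRepeat, KPerInnerLoop);
    return 0;
}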
include/ck/tensor_operation/gpu/device/device_gemm_v2.hpp

@@ -142,8 +142,10 @@ struct DeviceGemmV2BPreshuffle : public BaseOperator
     virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;

-    virtual bool GetPermuteB()         = 0;
-    virtual ck::index_t GetKPerBlock() = 0;
+    virtual bool GetPermuteA()            = 0;
+    virtual bool GetPermuteB()            = 0;
+    virtual ck::index_t GetKPerBlock()    = 0;
+    virtual int GetPreShuffleParameters() = 0;
 };

 } // namespace device
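The DeviceGemmV2BPreshuffle base interface grows two pure-virtual query hooks, GetPermuteA() and GetPreShuffleParameters(), next to the existing GetPermuteB() and GetKPerBlock(); the latter new hook is what the example above uses to fetch NperXdl from the concrete instance. The toy sketch below shows only the override-through-a-base-pointer pattern; the class names, template-free signatures, and returned numbers are invented for illustration and are not the real CK types.

// Toy sketch of the interface pattern (hypothetical classes, not the CK templates).
#include <cstdio>
#include <memory>

struct DeviceGemmBase
{
    virtual ~DeviceGemmBase() = default;
    virtual bool GetPermuteA()            = 0;
    virtual bool GetPermuteB()            = 0;
    virtual int GetKPerBlock()            = 0;
    virtual int GetPreShuffleParameters() = 0;
};

struct MyPreshuffleGemm final : DeviceGemmBase
{
    bool GetPermuteA() override { return false; }          // assumed
    bool GetPermuteB() override { return true; }           // assumed
    int GetKPerBlock() override { return 128; }            // assumed K tile
    int GetPreShuffleParameters() override { return 16; }  // assumed NPerXdl
};

int main()
{
    std::unique_ptr<DeviceGemmBase> op = std::make_unique<MyPreshuffleGemm>();
    std::printf("NperXdl=%d\n", op->GetPreShuffleParameters());
    return 0;
}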
include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3_b_preshuffle.hpp

@@ -328,6 +328,7 @@ struct DeviceGemm_Xdl_CShuffleV3_BPreshuffle : public DeviceGemmV2BPreshuffle<AL
         else
         {
             throw std::runtime_error("Only support pipeline ver v1, v2, v3 now!");
         }
+        }
 #if 0
         else
include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp

@@ -1134,7 +1134,7 @@ struct GridwiseGemm_xdl_cshuffle_v3_b_preshuffle
         p_c_grid, c_grid_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize());

     const AElementwiseOperation a_element_op{};
-    const BElementwiseOperation b_element_op{};
+    // const BElementwiseOperation b_element_op{};
     const CElementwiseOperation c_element_op{};

     // divide block work by [M, N]

@@ -1514,7 +1514,7 @@ struct GridwiseGemm_xdl_cshuffle_v3_b_preshuffle
         p_c_grid, c_grid_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize());

     const AElementwiseOperation a_element_op{};
-    const BElementwiseOperation b_element_op{};
+    // const BElementwiseOperation b_element_op{};
     const CElementwiseOperation c_element_op{};

     // divide block work by [M, N]

@@ -1614,7 +1614,7 @@ struct GridwiseGemm_xdl_cshuffle_v3_b_preshuffle
     auto a_block_bufs = make_tuple(a_block_buf_ping, a_block_buf_pong);

     constexpr auto a_block_slice_copy_step = make_multi_index(KPerBlock / AK1Number, 0, 0);
-    constexpr auto b_block_slice_copy_step = make_multi_index(, 0, KRepeat, 0);
+    constexpr auto b_block_slice_copy_step = make_multi_index(0, 0, KRepeat, 0);

     // Blockwise GEMM pipeline
     static_assert(std::is_default_constructible_v<BlockwiseGemmPipe>);