weight permute with splitki

9fed0ade · Jing Zhang · 35d8627b · 9fed0ade · 9fed0ade
Commit 9fed0ade authored Oct 22, 2024 by Jing Zhang
2 changed files
--- a/example/01_gemm/gemm_xdl_fp16_pk_i4_v3.cpp
+++ b/example/01_gemm/gemm_xdl_fp16_pk_i4_v3.cpp
@@ -39,6 +39,9 @@ using DeviceGemmV2Instance =
        2, 32, 32, 1,
        1, 1, S<1, 16, 1, 4>, 4,
        ck::BlockGemmPipelineScheduler::Interwave, ck::BlockGemmPipelineVersion::v1>;
+    static int NPerBlock = 16;
+    static int KPerBlock = 256;
 #else
        128,
        16, 32, 
@@ -51,8 +54,11 @@ using DeviceGemmV2Instance =
        2, 32, 32, 0,
        1, 1, S<1, 16, 1, 8>, 4,
        ck::BlockGemmPipelineScheduler::Interwave, ck::BlockGemmPipelineVersion::v1>;
+    static int NPerBlock = 32;
+    static int KPerBlock = 128;
 #endif
-      // clang-format on
+// clang-format on
    using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
                                                                            BDataType,
@@ -146,30 +152,37 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
    DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
    DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize());
-#if 1
+//weight permute
-    int NPerBlock = 32;
+#if 0
-    int KPerBlock = 128;
    int N1 = NPerBlock;
    int K1 = KPerBlock;
    int N0 = N / N1;
    int K0 = K / K1;
+    int K01 = K0 / KBatch;
+    int K00 = KBatch;
+    std::cout << "K00 = " << K00 << " K01 = " << K01 << std::endl;
+    for(int k = 0; k < K00; k++)
+    {
        for(int i = 0; i < N0; i++)
        {
-        for(int j = 0; j < K0; j++)
+            for(int j = 0; j < K01; j++)
            {
                for(int ii = 0; ii < N1; ii++)
                {
                    for(int jj = 0; jj < K1; jj++)
                    {
-                    b_k_n_permute(i * K0 * N1 * K1 + j * N1 * K1 + ii * K1 + jj) =
+                        b_k_n_permute(k * N0 * K01 * N1 * K1 + i * K01 * N1 * K1 + j * N1 * K1 + ii * K1 + jj) =
-                        b_k_n((i * N1 + ii) * K + (j * K1 + jj));
+                            b_k_n((i * N1 + ii) * K + (k * K01 * K1 + j * K1 + jj));
+                    }
                }
            }
        }
    }
 #else
    for(int i = 0; i < N; i++)
    {

--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp
@@ -14,6 +14,8 @@
 #include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+//#define WEIGHT_PERMUTE
 namespace ck {
 // Currently we do not have a elegant way to put single lds buffer & double lds buffer pipe in same
@@ -387,8 +389,8 @@ struct GridwiseGemm_xdl_cshuffle_v3
        }
        else
        {
-            // B Tile Permute
+            // Weight Tile Permute
-#if 0
+#ifndef WEIGHT_PERMUTE
            // not pad N or K
            const auto b_grid_desc_bk0_n_bk1 = transform_tensor_descriptor(
                b_grid_desc_nraw_kraw,
@@ -619,10 +621,10 @@ struct GridwiseGemm_xdl_cshuffle_v3
            }
            else if constexpr(is_same_v<tensor_layout::gemm::ColumnMajor, BLayout>)
            {
-#if 0
+#ifndef WEIGHT_PERMUTE
                b_k_split_offset = blockIdx.z * karg.KRead / BPackedSize;
 #else
-                const int k0_offset = karg.KRead * NPerBlock;
+                const int k0_offset = karg.KRead * karg.N;
                b_k_split_offset    = blockIdx.z * k0_offset / BPackedSize;
 #endif
            }