gemm0 debugged

072dfbfe · letaoqin · 69114f25 · 072dfbfe · 072dfbfe · 072dfbfe
Commit 072dfbfe authored Dec 03, 2024 by letaoqin
5 changed files
--- a/example/ck_tile/16_fused_moe_general/main.cpp
+++ b/example/ck_tile/16_fused_moe_general/main.cpp
@@ -241,6 +241,9 @@ bool run(const ck_tile::ArgParser& arg_parser)

    ck_tile::FillUniformDistribution<ADataType>{-.5f, .5f}(a_host);
    ck_tile::FillUniformDistribution<GDataType>{-.5f, .5f}(g_host);
+    // ck_tile::FillConstant<ADataType>{1}(a_host);
+    // ck_tile::FillConstant<GDataType>{1}(g_host);
+    //ck_tile::FillStepRange<GDataType>{0.0f, 32.0f*128,1.0f}(g_host); 
    ck_tile::FillUniformDistribution<DDataType>{-.5f, .5f}(d_host);
    ck_tile::FillUniformDistribution<AScaleDataType>{-.5f, .5f}(sa_host);
    ck_tile::FillUniformDistribution<GScaleDataType>{-.5f, .5f}(sg_host);
@@ -282,7 +285,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
    // output_matrix_2d(a_host, tokens, hidden_size);
    std::cout << sorted_token_ids_host << std::endl;
    // std::cout << num_sorted_tiles_host << std::endl;
-    output_matrix_3d(g_host, experts, shared_intermediate_size_0, hidden_size);
+    // output_matrix_3d(g_host, experts, shared_intermediate_size_0, hidden_size);
    std::cout << sorted_expert_ids_host << std::endl;
    // std::cout << topk_weight_host << std::endl;

@@ -290,8 +293,8 @@ bool run(const ck_tile::ArgParser& arg_parser)

    // done, preparing GPU buffer
    ck_tile::DeviceMem a_buf(a_host);
-    ck_tile::DeviceMem g_perm_buf(g_perm_host);
-    ck_tile::DeviceMem d_perm_buf(d_perm_host);
+    ck_tile::DeviceMem g_perm_buf(g_host);
+    ck_tile::DeviceMem d_perm_buf(d_host);
    ck_tile::DeviceMem sa_buf(sa_host);
    ck_tile::DeviceMem sg_buf(sg_host);
    ck_tile::DeviceMem sd_buf(sd_host);

--- a/include/ck_tile/host/fill.hpp
+++ b/include/ck_tile/host/fill.hpp
@@ -278,6 +278,7 @@ struct FillConstant
 {
    T value_{0};

+    FillConstant(float value):value_(ck_tile::type_convert<T>(value)){}
    template <typename ForwardIter>
    void operator()(ForwardIter first, ForwardIter last) const
    {

--- a/include/ck_tile/ops/fused_moe/kernel/fused_moegemm_general_kernel.hpp
+++ b/include/ck_tile/ops/fused_moe/kernel/fused_moegemm_general_kernel.hpp
@@ -6,6 +6,7 @@
 #include "ck_tile/core.hpp"
 #include "ck_tile/ops/common.hpp"
 #include "ck_tile/ops/elementwise.hpp"
+#include "ck_tile/core/tensor/tensor_view.hpp"
 #include <string>
 #include <type_traits>

@@ -298,7 +299,6 @@ struct FusedMoeGemmGlKernel
            return a_window_;
        }();

-        // TODO: gtile using NSub to have less register pressure
        const auto g_window = [&]() {
            const GDataType* g_ptr = reinterpret_cast<const GDataType*>(kargs.g_ptr) +
                                     static_cast<long_index_t>(expert_id) * expert_stride_0;
@@ -313,6 +313,17 @@ struct FusedMoeGemmGlKernel
                g_view_,
                make_tuple(number<BlockShape::Block_N0>{}, number<BlockShape::Block_K0>{}),
                {idx_n0, 0});
+
+            // if(threadIdx.x == 0 && blockIdx.x == 0 && blockIdx.y == 0 && blockIdx.z == 0)
+            // {
+            //     for(int i = 0; i < 16; i++)
+            //     {
+            //         printf("in G index is %d ,  value is: %f\n",
+            //                i,
+            //                ck_tile::type_convert<float>(g_ptr[i]));
+            //     }
+            // }
+
            return g_window_;
        }();


--- a/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_general.hpp
+++ b/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_general.hpp
@@ -115,6 +115,7 @@ struct FusedMoeGemmPipeline_General
            a_window_.get_window_origin(),
            Policy::template MakeGlobalTileDistribution_A<Problem>());

+        // load g to register
        auto g_global_to_dram_window = make_tile_window(
            g_window_.get_bottom_tensor_view(),
            make_tuple(number<BlockShape::Block_N0>{}, number<BlockShape::Block_K0>{}),
@@ -153,27 +154,26 @@ struct FusedMoeGemmPipeline_General
        }
 #endif

-        // load g to register
        auto g_dram_block = load_tile(g_global_to_dram_window);

 #if 0
        {
-            constexpr auto a_spans = decltype(g_dram_block)::get_distributed_spans();
+            constexpr auto g_spans = decltype(g_dram_block)::get_distributed_spans();
            int counter            = 0;
-            sweep_tile_span(a_spans[number<0>{}], [&](auto idxn) {
-                sweep_tile_span(a_spans[number<1>{}], [&](auto idxk) {
+            sweep_tile_span(g_spans[number<0>{}], [&](auto idxn) {
+                sweep_tile_span(g_spans[number<1>{}], [&](auto idxk) {
                    constexpr auto i_j_idx = make_tuple(idxn, idxk);
+                    const auto tile_idx = get_x_indices_from_distributed_indices(
+                            g_dram_block.get_tile_distribution(), i_j_idx);
                    if(threadIdx.x == 0 && blockIdx.x == 0 && blockIdx.y == 0 && blockIdx.z == 0)
                    {
                        counter       = counter + 1;
-                        index_t idn_0 = idxn.impl_.at(0);
-                        index_t idk_0 = idxk.impl_.at(0);
-                        index_t idk_1 = idxk.impl_.at(1);
-                        printf("in A idn is %d , idk_0 is %d idk_1 is %d, counter is %d, value is: "
-                               "%f \n",
-                               idn_0,
-                               idk_0,
-                               idk_1,
+                        const auto row = tile_idx.at(number<0>{});
+                        const auto col = tile_idx.at(number<1>{});
+                        printf("in G row is %d , col is %d, counter is %d, value is: %f"
+                               " \n",
+                               row,
+                               col,
                               counter,
                               ck_tile::type_convert<float>(g_dram_block(i_j_idx)));
                    }
@@ -185,7 +185,7 @@ struct FusedMoeGemmPipeline_General

        clear_tile(s_acc); // initialize C
        constexpr index_t kK0  = BlockShape::Block_K0;
-        const index_t k0_loops = ck_tile::integer_divide_ceil(intermediate_size, kK0);
+        const index_t k0_loops = ck_tile::integer_divide_ceil(hidden_size, kK0);
        index_t iCounter0      = k0_loops - 1;
        while(iCounter0 > 0)
        {
@@ -208,25 +208,25 @@ struct FusedMoeGemmPipeline_General
            block_sync_lds();
            gemm_0(s_acc, a_lds_win, g_dram_block);
        }
-#if 1
+#if 0
        {
            constexpr auto a_spans = decltype(s_acc)::get_distributed_spans();
            int counter            = 0;
            //a_spans[0] = 1;
            sweep_tile_span(a_spans[number<0>{}], [&](auto idxm) {
                sweep_tile_span(a_spans[number<1>{}], [&](auto idxn) {
-                    constexpr auto i_j_idx = make_tuple(idxn, idxn);
-                    if(threadIdx.x == 0 && blockIdx.x == 0 && blockIdx.y == 0 && blockIdx.z == 0)
+                    constexpr auto i_j_idx = make_tuple(idxm, idxn);
+                    const auto tile_idx = get_x_indices_from_distributed_indices(
+                            g_dram_block.get_tile_distribution(), i_j_idx);
+                    if(threadIdx.x == 1 && blockIdx.x == 0 && blockIdx.y == 0 && blockIdx.z == 0)
                    {
                        counter       = counter + 1;
-                        index_t idm_0 = idxm.impl_.at(0);
-                        index_t idn_0 = idxn.impl_.at(0);
-                        index_t idn_1 = idxn.impl_.at(1);
-                        printf("in A idn is %d , idn_0 is %d, idn_1 is %d, counter is %d, value is: "
+                        const auto row = tile_idx.at(number<0>{});
+                        const auto col = tile_idx.at(number<1>{});
+                        printf("in c row is %d , col is %d,  counter is %d, value is: "
                               "%f \n",
-                               idm_0,
-                               idn_0,
-                               idn_1,
+                               row,
+                               col,
                               counter,
                               ck_tile::type_convert<float>(s_acc(i_j_idx)));
                    }

--- a/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_general_policy.hpp
+++ b/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_general_policy.hpp
@@ -186,14 +186,6 @@ struct FusedMoeGemmPipelineGeneralPolicy
        constexpr auto g_block_dstr_encode = detail::make_embed_tile_distribution_encoding(
            g_outer_dstr_enc, typename WG::BWarpDstrEncoding{});

-        // constexpr auto g_block_dstr_encode = tile_distribution_encoding<
-        //     sequence<1>,
-        //     tuple<sequence<1, 4, 32>, sequence<4, 2, 4>>,
-        //     tuple<sequence<0, 1>, sequence<2, 1>>,
-        //     tuple<sequence<0, 1>, sequence<1, 2>>,
-        //     sequence<1, 2, 2>,
-        //     sequence<0, 0, 2>>{};
-
        return make_static_tile_distribution(g_block_dstr_encode);
    }