add debugging code and format

b616b254 · letaoqin · 2baf9422 · b616b254 · b616b254 · b616b254
Commit b616b254 authored Dec 05, 2024 by letaoqin
7 changed files
--- a/example/ck_tile/17_fused_moe_general/instances/fused_moegemm_api_traits.hpp
+++ b/example/ck_tile/17_fused_moe_general/instances/fused_moegemm_api_traits.hpp
@@ -45,7 +45,7 @@ struct fmoe_ // traits, ugly name, only used for internal
    using WarpTile_0     = ck_tile::remove_cvref_t<WarpTile_>;

    using BlockTile_1    = ck_tile::sequence<BT_, BD_, BI_>;
-    using WarpPerBlock_1 = ck_tile::sequence<1, 1, 4>;//ck_tile::remove_cvref_t<WarpPerBlock_>;
+    using WarpPerBlock_1 = ck_tile::sequence<1, 1, 4>; // ck_tile::remove_cvref_t<WarpPerBlock_>;
    using WarpTile_1     = ck_tile::remove_cvref_t<WarpTile_>;

    static constexpr ck_tile::index_t GateOnly   = GateOnly_;

--- a/example/ck_tile/17_fused_moe_general/main.cpp
+++ b/example/ck_tile/17_fused_moe_general/main.cpp
@@ -83,13 +83,43 @@ void topid_unique_gen(
        host_tensor[i] = current_v;
    }
 }
-
+// Debug helper: prints data(i, j) for i < m, j < n to stdout as floats, one row per line.
+template <typename IndexType>
+void output_matrix_2d(ck_tile::HostTensor<IndexType>& data, int m, int n)
+{
+    auto& out = std::cout;
+    out << std::endl;
+    for(int row = 0; row < m; ++row)
+    {
+        out << "Line " << row << "\t";
+        for(int col = 0; col < n; ++col)
+            out << ck_tile::type_convert<float>(data(row, col)) << "\t";
+        out << std::endl;
+    }
+}
+// Debug helper: prints data(m, n, j) for m < M, n < N, j < J as floats, one (m, n) row per line.
+template <typename IndexType>
+void output_matrix_3d(ck_tile::HostTensor<IndexType>& data, int M, int N, int J)
+{
+    auto& out = std::cout;
+    out << std::endl;
+    for(int expert = 0; expert < M; ++expert)
+    {
+        for(int line = 0; line < N; ++line)
+        {
+            out << "experts: " << expert << " Line: " << line << "\t";
+            for(int col = 0; col < J; ++col)
+                out << ck_tile::type_convert<float>(data(expert, line, col)) << "\t";
+            out << std::endl;
+        }
+    }
+}
 auto create_args(int argc, char* argv[])
 {
    ck_tile::ArgParser arg_parser;
    arg_parser.insert("t", "128", "num input tokens")
        .insert("e", "32", "num of experts")
-        .insert("k", "5", "topk")
+        .insert("k", "2", "topk")
        .insert("h", "8192", "hidden_size of this model")
        .insert("i", "8192", "intermediate_size between 2 gemms of FFN")
        .insert("stride", "-1", "stride per row, if -1 then equal to hidden_size")
@@ -112,7 +142,7 @@ auto create_args(int argc, char* argv[])
                "0",
                "if set to 1, will try balance the expert in topk-ids(convenient for testing)")
        .insert("init",
-                "2",
+                "1",
                "init method. 0:random stepped float(fast). 1: random uniform, 2:rand normalized"
                "normalized(slow)")
        .insert("seed", "11939", "seed used to do random")
@@ -176,9 +206,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
        }
        return base_str;
    }();
-    auto api_str = [&]() {
-        return std::string("moeg");
-    }();
+    auto api_str = [&]() { return std::string("moeg"); }();

    auto stride_str = [&]() {
        if(stride == hidden_size)
@@ -245,7 +273,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
        ck_tile::FillUniformDistribution<GScaleDataType>{-.5f, .5f, seed, true}(sg_host);
        ck_tile::FillUniformDistribution<DScaleDataType>{-.5f, .5f, seed, true}(sd_host);
        ck_tile::FillUniformDistribution<YSmoothScaleDataType>{-.5f, .5f, seed, true}(sy_host);
-        ck_tile::FillUniformDistribution<TopkWeightDataType>{-.5f, .5f, seed, true}(
+        ck_tile::FillUniformDistribution<TopkWeightDataType>{0.0f, 1.0f, seed, true}(
            topk_weight_host);
    }
    else if(init == 2)
@@ -343,6 +371,14 @@ bool run(const ck_tile::ArgParser& arg_parser)
        experts,
        block_m);

+    // output_matrix_2d(a_host, tokens, hidden_size);
+    std::cout << sorted_token_ids_host << std::endl;
+    std::cout << num_sorted_tiles_host << std::endl;
+    // output_matrix_3d(g_host, experts, shared_intermediate_size_0, hidden_size);
+    std::cout << sorted_expert_ids_host << std::endl;
+    // std::cout << topk_weight_host << std::endl;
+
+    // std::cout << sorted_weight_host << std::endl;
    // done, preparing GPU buffer
    ck_tile::DeviceMem a_buf(a_host);
    ck_tile::DeviceMem g_perm_buf(g_host);
@@ -441,8 +477,6 @@ bool run(const ck_tile::ArgParser& arg_parser)
    std::cout << std::flush << std::endl;

    return pass;
-    
-
 }

 int main(int argc, char* argv[])

--- a/include/ck_tile/ops/fused_moe/kernel/fused_moegemm_general_kernel.hpp
+++ b/include/ck_tile/ops/fused_moe/kernel/fused_moegemm_general_kernel.hpp
@@ -213,9 +213,9 @@ struct FusedMoeGemmGlKernel

    CK_TILE_HOST static constexpr auto GridSize(const Hargs& hargs)
    {
-        //constexpr index_t block_m = BlockShape::Block_M0;
+        // constexpr index_t block_m = BlockShape::Block_M0;
        int max_num_tokens_padded = hargs.max_num_tokens_padded;
-            //hargs.topk * hargs.num_tokens + hargs.num_experts * block_m - hargs.topk;
+        // hargs.topk * hargs.num_tokens + hargs.num_experts * block_m - hargs.topk;
        // printf("xxx max_num_tokens_padded:%d\n", max_num_tokens_padded);
        return Partitioner::GridSize(max_num_tokens_padded, hargs.intermediate_size);
    }

--- a/include/ck_tile/ops/fused_moe/kernel/fused_moegemm_kernel.hpp
+++ b/include/ck_tile/ops/fused_moe/kernel/fused_moegemm_kernel.hpp
--- a/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_general.hpp
+++ b/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_general.hpp
@@ -191,12 +191,13 @@ struct FusedMoeGemmPipeline_General
            block_sync_lds();
            gemm_0(s_acc, a_lds_win, g_dram_block);
        }
+#if 1
+        PrintMem(s_acc);
+#endif
        // relu
        const auto activation = ck_tile::element_wise::Gelu{};
        tile_elementwise_inout(activation, s_acc, s_acc);
-#if 0
-        PrintMem(s_acc);
-#endif
+
        // move sacc to LDS
        auto bridge_lds_view = make_tensor_view<address_space_enum::lds>(
            smem_0, Policy::template MakeBridgeLdsBlockDesc<Problem>());

--- a/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_general_policy.hpp
+++ b/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_general_policy.hpp
@@ -175,7 +175,7 @@ struct FusedMoeGemmPipelineGeneralPolicy
    {
        using WG = decltype(GetWarpGemm0<Problem>());
        using S_ = typename Problem::BlockShape;
-        static_assert(S_::WarpPerBlock_N0==4);
+        static_assert(S_::WarpPerBlock_N0 == 4);
        constexpr auto g_outer_dstr_enc = tile_distribution_encoding<
            sequence<S_::WarpPerBlock_M0>,
            tuple<sequence<S_::Repeat_N0, S_::WarpPerBlock_N0>, sequence<S_::Repeat_K0>>,
@@ -240,9 +240,10 @@ struct FusedMoeGemmPipelineGeneralPolicy
        using S_       = remove_cvref_t<typename Problem::BlockShape>;
        using WarpGemm = remove_cvref_t<decltype(GetWarpGemm1<Problem>())>;

-        constexpr auto y_outer_dstr_enc = tile_distribution_encoding<
-            sequence<1>,
-            tuple<sequence<S_::Repeat_M1, S_::WarpPerBlock_M1>, sequence<S_::WarpPerBlock_K1, S_::Repeat_K1>>,
+        constexpr auto y_outer_dstr_enc =
+            tile_distribution_encoding<sequence<1>,
+                                       tuple<sequence<S_::Repeat_M1, S_::WarpPerBlock_M1>,
+                                             sequence<S_::WarpPerBlock_K1, S_::Repeat_K1>>,
                                       tuple<sequence<1, 2>>,
                                       tuple<sequence<1, 0>>,
                                       sequence<1, 2>,
@@ -260,9 +261,10 @@ struct FusedMoeGemmPipelineGeneralPolicy
        using S_       = remove_cvref_t<typename Problem::BlockShape>;
        using WarpGemm = remove_cvref_t<decltype(GetWarpGemm1<Problem>())>;

-        constexpr auto d_outer_dstr_enc = tile_distribution_encoding<
-            sequence<1>,
-            tuple<sequence<S_::Repeat_N1, S_::WarpPerBlock_N1>, sequence<S_::WarpPerBlock_K1, S_::Repeat_K1>>,
+        constexpr auto d_outer_dstr_enc =
+            tile_distribution_encoding<sequence<1>,
+                                       tuple<sequence<S_::Repeat_N1, S_::WarpPerBlock_N1>,
+                                             sequence<S_::WarpPerBlock_K1, S_::Repeat_K1>>,
                                       tuple<sequence<1, 2>>,
                                       tuple<sequence<1, 0>>,
                                       sequence<1, 2>,

--- a/include/ck_tile/ops/gemm/block/block_gemm_areg_breg_creg_v2.hpp
+++ b/include/ck_tile/ops/gemm/block/block_gemm_areg_breg_creg_v2.hpp
@@ -52,16 +52,16 @@ struct BlockGemmARegBRegCRegV2
        // M->N Warp
        // constexpr auto a_block_outer_dstr_encoding =
        //     tile_distribution_encoding<sequence<NWarp>,
-        //                                tuple<sequence<MIterPerWarp, MWarp>, sequence<KIterPerWarp>>,
-        //                                tuple<sequence<1, 0>>,
+        //                                tuple<sequence<MIterPerWarp, MWarp>,
+        //                                sequence<KIterPerWarp>>, tuple<sequence<1, 0>>,
        //                                tuple<sequence<1, 0>>,
        //                                sequence<1, 2>,
        //                                sequence<0, 0>>{};

        // constexpr auto b_block_outer_dstr_encoding =
        //     tile_distribution_encoding<sequence<MWarp>,
-        //                                tuple<sequence<NIterPerWarp, NWarp>, sequence<KIterPerWarp>>,
-        //                                tuple<sequence<0, 1>>,
+        //                                tuple<sequence<NIterPerWarp, NWarp>,
+        //                                sequence<KIterPerWarp>>, tuple<sequence<0, 1>>,
        //                                tuple<sequence<0, 1>>,
        //                                sequence<1, 2>,
        //                                sequence<0, 0>>{};