revert gemm2 swizz

4b91d1ce · coderfeli · fcc2c867 · 4b91d1ce
Commit 4b91d1ce authored Feb 17, 2025 by coderfeli
Hide whitespace changes
Inline Side-by-side

Showing with 5 additions and 20 deletions

include/ck/tensor_operation/gpu/grid/gridwise_moe_gemm_scatter.hpp ...k/tensor_operation/gpu/grid/gridwise_moe_gemm_scatter.hpp +5 -20

No files found.
--- a/include/ck/tensor_operation/gpu/grid/gridwise_moe_gemm_scatter.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_moe_gemm_scatter.hpp
@@ -197,11 +197,8 @@ struct GridwiseMoeGemmScatter

    __host__ static auto CalculateGridSize(index_t M, index_t N)
    {
-        // return std::make_tuple(math::integer_divide_ceil(N, NPerBlock),
-        //                        math::integer_divide_ceil(M, MPerBlock),
-        //                        1);
-        return std::make_tuple(math::integer_divide_ceil(N, NPerBlock) * math::integer_divide_ceil(M, MPerBlock),
-                               1,
+        return std::make_tuple(math::integer_divide_ceil(N, NPerBlock),
+                               math::integer_divide_ceil(M, MPerBlock),
                               1);
    }

@@ -1152,22 +1149,10 @@ struct GridwiseMoeGemmScatter
            MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
                c_grid_desc_m_n, problem.MBlock, problem.NBlock);

-
+        const index_t block_n_id = __builtin_amdgcn_readfirstlane(blockIdx.x);
+        const index_t block_m_id = __builtin_amdgcn_readfirstlane(blockIdx.y);
+        const index_t expert_id = __builtin_amdgcn_readfirstlane(p_sorted_expert_ids[block_m_id]);
        const index_t max_token_id = __builtin_amdgcn_readfirstlane(p_max_token_id[0]);     
-        const index_t expert_block_id = blockIdx.x / problem.NBlock;
-        // const index_t b_block_id = blockIdx.x % problem.NBlock;
-        const index_t expert_id = __builtin_amdgcn_readfirstlane(p_sorted_expert_ids[expert_block_id]);
-        const index_t es = __builtin_amdgcn_readfirstlane(p_max_token_id[expert_block_id + 1]);
-        const index_t expert_swizzle = es > 0 ? es : 1; //p_max_token_id[expert_id + 1];
-        const index_t expert_block_swizzle = expert_block_id / expert_swizzle;
-        const index_t b_block_id_swizzle = blockIdx.x % (problem.NBlock * expert_swizzle);
-        const index_t block_n_id = __builtin_amdgcn_readfirstlane(b_block_id_swizzle % 8 +  b_block_id_swizzle / (8 * expert_swizzle) * 8);
-        const index_t block_m_id = __builtin_amdgcn_readfirstlane(expert_block_swizzle * expert_swizzle + b_block_id_swizzle / 8 % expert_swizzle);
-
-        // const index_t block_n_id = __builtin_amdgcn_readfirstlane(blockIdx.x);
-        // const index_t block_m_id = __builtin_amdgcn_readfirstlane(blockIdx.y);
-        // const index_t expert_id = __builtin_amdgcn_readfirstlane(p_sorted_expert_ids[block_m_id]);
-        // const index_t max_token_id = __builtin_amdgcn_readfirstlane(p_max_token_id[0]);     
        const index_t token0 = __builtin_amdgcn_readfirstlane(p_sorted_token_ids[block_m_id * MPerBlock] & 0xffffff);
        
        // constexpr auto M0 = ABlockTransferThreadClusterLengths_AK0_M_AK1{}.At(I1);