Update the envs for moe_sum and moe_align: add dedicated VLLM_USE_LIGHTOP_MOE_SUM and VLLM_USE_LIGHTOP_MOE_ALIGN flags

766663e6 · zhuwenwen · 1277ff09 · 766663e6 · 766663e6 · 766663e6
Commit 766663e6 authored Oct 13, 2025 by zhuwenwen
3 changed files
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -167,6 +167,8 @@ if TYPE_CHECKING:
    VLLM_USE_LIGHTOP: bool = False
    VLLM_USE_OPT_CAT: bool = False
    VLLM_USE_OPT_MOE_SUM: bool = False
+    VLLM_USE_LIGHTOP_MOE_SUM: bool = False
+    VLLM_USE_LIGHTOP_MOE_ALIGN: bool = False
    VLLM_USE_MERGE_ATTN_STATES_OPT: bool = False
    USE_FUSED_RMS_QUANT: bool = False
    USE_FUSED_SILU_MUL_QUANT: bool = False
@@ -1109,6 +1111,14 @@ environment_variables: dict[str, Callable[[], Any]] = {
    "VLLM_USE_OPT_MOE_SUM":
        lambda: (os.environ.get("VLLM_USE_OPT_MOE_SUM", "False").lower() in
                 ("true", "1")),  
+    # If set, vLLM uses lightop's moe_sum op (disabled by default)
+    "VLLM_USE_LIGHTOP_MOE_SUM":
+        lambda: (os.environ.get("VLLM_USE_LIGHTOP_MOE_SUM", "False").lower() in
+                 ("true", "1")),  
+    # If set, vLLM uses lightop's moe_align_block_size op (disabled by default)
+    "VLLM_USE_LIGHTOP_MOE_ALIGN":
+        lambda: (os.environ.get("VLLM_USE_LIGHTOP_MOE_ALIGN", "False").lower() in
+                 ("true", "1")),     
    # vLLM will use opt merge_aatn_states, not triton
    "VLLM_USE_MERGE_ATTN_STATES_OPT":
        lambda: (os.environ.get("VLLM_USE_MERGE_ATTN_STATES_OPT", "True").lower() in

--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -1918,11 +1918,11 @@ def fused_experts_impl(
        #             ops.moe_sum(intermediate_cache3.view(*intermediate_cache3.size()),
        #                     out_hidden_states[begin_chunk_idx:end_chunk_idx]) * routed_scaling_factor
        else:
-            if envs.VLLM_USE_LIGHTOP:
+            if envs.VLLM_USE_LIGHTOP_MOE_SUM:
                from lightop import op as op
                op.moe_sum(input=intermediate_cache3.view(*intermediate_cache3.size()),
                    output=out_hidden_states[begin_chunk_idx:end_chunk_idx], bias=None, 
-                    expert_mask=None, num_local_tokens=None, factor=None)
+                    expert_mask=None, num_local_tokens=None, factor=1.0)
            elif envs.VLLM_USE_OPT_MOE_SUM:
                moe_reduce_dispatch(intermediate_cache3.view(*intermediate_cache3.size()), out_hidden_states[begin_chunk_idx:end_chunk_idx], begin_chunk_idx, end_chunk_idx)
            else:

--- a/vllm/model_executor/layers/fused_moe/moe_align_block_size.py
+++ b/vllm/model_executor/layers/fused_moe/moe_align_block_size.py
@@ -240,16 +240,16 @@ def moe_align_block_size(
                                expert_mask = expert_mask,
                                num_local_tokens = None)
    else:
-        if envs.VLLM_USE_LIGHTOP:
+        if envs.VLLM_USE_LIGHTOP_MOE_ALIGN:
            from lightop import op as op
-            ops.moe_align_block_size(topk_ids, num_experts, block_size, sorted_ids,
-                                    expert_ids, num_tokens_post_pad)
-        else:
            op.moe_align_block_size(topk_ids, num_experts, block_size, sorted_ids,
                                expert_ids, num_tokens_post_pad,
                                expert_map = None,
                                expert_mask = None,
                                num_local_tokens = None)
+        else:
+            ops.moe_align_block_size(topk_ids, num_experts, block_size, sorted_ids,
+                                    expert_ids, num_tokens_post_pad)
        if expert_map is not None:
            expert_ids = expert_map[expert_ids]