add VLLM_USE_LIGHTOP_FILL_MOE_ALIN

cff5c2d2 · zhuwenwen · ed53dfb0 · cff5c2d2 · cff5c2d2 · cff5c2d2
Commit cff5c2d2 authored Nov 17, 2025 by zhuwenwen
3 changed files
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -179,6 +179,7 @@ if TYPE_CHECKING:
    VLLM_SCHED_ENABLE_MINIMAL_INJECTION: bool = False
    VLLM_USE_PD_SPLIT: bool = False
    VLLM_USE_PP_SYNC: bool = False
+    VLLM_USE_LIGHTOP_FILL_MOE_ALIN: bool = False

 def get_default_cache_root():
    return os.getenv(
@@ -1161,6 +1162,10 @@ environment_variables: dict[str, Callable[[], Any]] = {
    "VLLM_USE_PP_SYNC":
        lambda: (os.environ.get("VLLM_USE_PP_SYNC", "False").lower() in
                 ("true", "1")), 
+    # If set, vLLM uses lightop to fuse the sorted_ids fill into MoE align
+    "VLLM_USE_LIGHTOP_FILL_MOE_ALIN":
+        lambda: (os.environ.get("VLLM_USE_LIGHTOP_FILL_MOE_ALIN", "False").lower() in
+                 ("true", "1")), 
 }

 # --8<-- [end:env-vars-definition]

--- a/vllm/model_executor/layers/fused_moe/moe_align_block_size.py
+++ b/vllm/model_executor/layers/fused_moe/moe_align_block_size.py
@@ -216,7 +216,9 @@ def moe_align_block_size(
        sorted_ids = torch.empty((max_num_tokens_padded, ),
                                 dtype=torch.int32,
                                 device=topk_ids.device)
-        sorted_ids.fill_(topk_ids.numel())
+        if not envs.VLLM_USE_LIGHTOP_FILL_MOE_ALIN:
+            sorted_ids.fill_(topk_ids.numel())
+            
    max_num_m_blocks = triton.cdiv(max_num_tokens_padded, block_size)
    # Expert ids must be zeroed out to prevent index out of bounds error while
    # mapping global expert ids to local expert ids in expert parallelism.

--- a/vllm/model_executor/model_loader/utils.py
+++ b/vllm/model_executor/model_loader/utils.py
@@ -251,6 +251,8 @@ def get_model_architecture(
                    os.environ['VLLM_USE_LIGHTOP_MOE_SUM_MUL_ADD'] = '1'
                if not envs.is_set("VLLM_USE_OPT_CAT"):
                    os.environ['VLLM_USE_OPT_CAT'] = '1'
+                if not envs.is_set("VLLM_USE_LIGHTOP_FILL_MOE_ALIN"):
+                    os.environ['VLLM_USE_LIGHTOP_FILL_MOE_ALIN'] = '1'
                
            if os.getenv('GEMM_PAD') != '1': 
                os.environ['GEMM_PAD'] = '0'