Merge branch 'v0.15.1-dev_lightop_fill_moe_align' into 'v0.15.1-dev'

moe: 补齐 fill+moe_align 融合开关语义

See merge request dcutoolkit/deeplearing/vllm!484

Merge branch 'v0.15.1-dev_lightop_fill_moe_align' into 'v0.15.1-dev'
moe: 补齐 fill+moe_align 融合开关语义 See merge request dcutoolkit/deeplearing/vllm!484
16f88a8a · wangmin6 · 5a14b60c · 706c031c · 16f88a8a · 16f88a8a
Commit 16f88a8a authored Mar 12, 2026 by wangmin6
3 changed files
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -306,6 +306,7 @@ if TYPE_CHECKING:
    VLLM_V1_USE_REDUCED_TOPK_TOPP_SAMPLER: bool = False
    VLLM_V1_USE_FUSED_QKV_SPLIT_RMS_ROPE_KVSTORE: bool = False
    VLLM_USE_FUSED_DTBMM: bool = False # DOUBLE TRANS BMM FP8
+    VLLM_USE_LIGHTOP_FILL_MOE_ALIGN: bool = False
    VLLM_USE_CUDA_GRAPH_SIZES: bool = False


@@ -1910,6 +1911,10 @@ environment_variables: dict[str, Callable[[], Any]] = {
            ).lower()
            in ("true", "1")
        ),
+    # If set to 1/True, pass Is_fuse_fill=True to lightop's moe_align_block_size and skip the separate sorted_ids.fill_() call.
+    "VLLM_USE_LIGHTOP_FILL_MOE_ALIGN":
+        lambda: (os.environ.get("VLLM_USE_LIGHTOP_FILL_MOE_ALIGN", "False").lower() in
+                 ("true", "1")),

    #If set to 1/True, enable fuse split qkv+rmsnorm+rope+kv update just like glm4.7 moe attention.
    "VLLM_V1_USE_FUSED_QKV_SPLIT_RMS_ROPE_KVSTORE":

--- a/vllm/model_executor/layers/fused_moe/moe_align_block_size.py
+++ b/vllm/model_executor/layers/fused_moe/moe_align_block_size.py
@@ -92,6 +92,8 @@ def moe_align_block_size(
        sorted_ids = torch.empty(
            (max_num_tokens_padded,), dtype=torch.int32, device=topk_ids.device
        )
+        if not envs.VLLM_USE_LIGHTOP_FILL_MOE_ALIGN:
+            sorted_ids.fill_(topk_ids.numel())
    max_num_m_blocks = triton.cdiv(max_num_tokens_padded, block_size)
    if expert_map is not None:
        expert_ids = torch.zeros(
@@ -102,6 +104,7 @@ def moe_align_block_size(
            (max_num_m_blocks,), dtype=torch.int32, device=topk_ids.device
        )
    num_tokens_post_pad = torch.empty((1), dtype=torch.int32, device=topk_ids.device)
+    is_fuse_fill = envs.VLLM_USE_LIGHTOP_FILL_MOE_ALIGN

    if envs.VLLM_USE_LIGHTOP or expert_mask is not None:
        from lightop import op as op
@@ -115,7 +118,7 @@ def moe_align_block_size(
            expert_map = expert_map, 
            expert_mask = expert_mask, 
            num_local_tokens = None, 
-            Is_fuse_fill = True,
+            Is_fuse_fill = is_fuse_fill,
        )
    else:
        if envs.VLLM_USE_LIGHTOP_MOE_ALIGN:
@@ -130,7 +133,7 @@ def moe_align_block_size(
                expert_map = None, 
                expert_mask = None, 
                num_local_tokens = None, 
-                Is_fuse_fill = True,
+                Is_fuse_fill = is_fuse_fill,
            )
        else:
            ops.moe_align_block_size(

--- a/vllm/model_executor/model_loader/utils.py
+++ b/vllm/model_executor/model_loader/utils.py
@@ -194,6 +194,9 @@ def _get_model_architecture(model_config: ModelConfig) -> tuple[type[nn.Module],
                    os.environ['VLLM_USE_OPT_CAT'] = '1'
                if not envs.is_set("VLLM_USE_FUSED_FILL_RMS_CAT"):
                    os.environ['VLLM_USE_FUSED_FILL_RMS_CAT'] = '1'
+                if model_config.quantization in {"slimquant_w4a8", "slimquant_w4a8_marlin", "slimquant_compressed_tensors_marlin", "compressed-tensors"}:
+                    if not envs.is_set("VLLM_USE_LIGHTOP_FILL_MOE_ALIGN"):
+                        os.environ['VLLM_USE_LIGHTOP_FILL_MOE_ALIGN'] = '1'
                # if model_config.quantization in {"slimquant_w4a8", "slimquant_w4a8_marlin", "slimquant_compressed_tensors_marlin", "compressed-tensors"}:
                #     if not envs.is_set("USE_FUSED_RMS_QUANT"):
                #         os.environ['USE_FUSED_RMS_QUANT'] = '1'
@@ -205,6 +208,8 @@ def _get_model_architecture(model_config: ModelConfig) -> tuple[type[nn.Module],
                if architectures in [['Qwen3MoeForCausalLM']]:
                    if not envs.is_set("VLLM_USE_LIGHTOP_MOE_ALIGN"):
                        os.environ['VLLM_USE_LIGHTOP_MOE_ALIGN'] = '1'
+                    if not envs.is_set("VLLM_USE_LIGHTOP_FILL_MOE_ALIGN"):
+                        os.environ['VLLM_USE_LIGHTOP_FILL_MOE_ALIGN'] = '1'
                    if not envs.is_set("VLLM_USE_LIGHTOP_MOE_SUM"):
                        os.environ['VLLM_USE_LIGHTOP_MOE_SUM'] = '1'    
                    if not envs.is_set("VLLM_USE_FUSE_SILU_AND_MUL"):
@@ -231,6 +236,9 @@ def _get_model_architecture(model_config: ModelConfig) -> tuple[type[nn.Module],
                    os.environ['VLLM_USE_OPT_CAT'] = '1'
                if not envs.is_set("VLLM_USE_FUSED_FILL_RMS_CAT"):
                    os.environ['VLLM_USE_FUSED_FILL_RMS_CAT'] = '1'
+                if model_config.quantization in {"slimquant_w4a8", "slimquant_w4a8_marlin", "slimquant_compressed_tensors_marlin", "compressed-tensors"}:
+                    if not envs.is_set("VLLM_USE_LIGHTOP_FILL_MOE_ALIGN"):
+                        os.environ['VLLM_USE_LIGHTOP_FILL_MOE_ALIGN'] = '1'
                # if model_config.quantization in {"slimquant_w4a8", "slimquant_w4a8_marlin", "slimquant_compressed_tensors_marlin", "compressed-tensors"}:
                #     if not envs.is_set("USE_FUSED_RMS_QUANT"):
                #         os.environ['USE_FUSED_RMS_QUANT'] = '1'
@@ -242,6 +250,8 @@ def _get_model_architecture(model_config: ModelConfig) -> tuple[type[nn.Module],
                if architectures in [['Qwen3MoeForCausalLM']]:
                    if not envs.is_set("VLLM_USE_LIGHTOP_MOE_ALIGN"):
                        os.environ['VLLM_USE_LIGHTOP_MOE_ALIGN'] = '1'
+                    if not envs.is_set("VLLM_USE_LIGHTOP_FILL_MOE_ALIGN"):
+                        os.environ['VLLM_USE_LIGHTOP_FILL_MOE_ALIGN'] = '1'
                    if not envs.is_set("VLLM_USE_LIGHTOP_MOE_SUM"):
                        os.environ['VLLM_USE_LIGHTOP_MOE_SUM'] = '1'    
                    if not envs.is_set("VLLM_USE_FUSE_SILU_AND_MUL"):
@@ -307,6 +317,7 @@ def get_model_architecture(model_config: ModelConfig) -> tuple[type[nn.Module],
            model_config.runner_type,
            model_config.trust_remote_code,
            model_config.model_impl,
+            model_config.quantization,
            tuple(getattr(model_config.hf_config, "architectures", [])),
        )
    )