fix tests of kernels

set VLLM_USE_PD_SPLIT=1 update moe_align_block_size

fix tests of kernels
set VLLM_USE_PD_SPLIT=1 update moe_align_block_size
0e607f8e · zhuwenwen · cbdc58ec · 0e607f8e · 0e607f8e · 0e607f8e
Commit 0e607f8e authored Jan 14, 2026 by zhuwenwen
10 changed files
--- a/tests/kernels/test_flex_attention.py
+++ b/tests/kernels/test_flex_attention.py
--- a/tests/kernels/test_onednn.py
+++ b/tests/kernels/test_onednn.py
--- a/tests/kernels/test_shuffle_rows.py
+++ b/tests/kernels/test_shuffle_rows.py
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -401,7 +401,7 @@ _EMBEDDING_EXAMPLE_MODELS = {
    "LlavaNextForConditionalGeneration": _HfExamplesInfo(os.path.join(models_path_prefix, "royokong/e5-v")),
    "Phi3VForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "TIGER-Lab/VLM2Vec-Full"),
                                         trust_remote_code=True),
-    "Qwen2VLForConditionalGeneration": _HfExamplesInfo(os.path.join(models_path_prefix, "MrLight/dse-qwen2-2b-mrl-v1"), # noqa: E501
+    "Qwen2VLForConditionalGeneration": _HfExamplesInfo(os.path.join(models_path_prefix, "MrLight/dse-qwen2-2b-mrl-v1")), # noqa: E501
    "PrithviGeoSpatialMAE": _HfExamplesInfo(os.path.join(models_path_prefix, "ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11"), # noqa: E501
                                            dtype=torch.float16,
                                            enforce_eager=True,
@@ -656,9 +656,9 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = {
        os.path.join(models_path_prefix, "meituan-longcat/LongCat-Flash-Chat"),
        trust_remote_code=True,
        speculative_model=os.path.join(models_path_prefix, "meituan-longcat/LongCat-Flash-Chat")),
-    "MiMoMTPModel": _HfExamplesInfo(os.path.join(models_path_prefix, "XiaomiMiMo/MiMo-7B-RL")),
+    "MiMoMTPModel": _HfExamplesInfo(os.path.join(models_path_prefix, "XiaomiMiMo/MiMo-7B-RL"),
                                    trust_remote_code=True,
-                                    speculative_model=os.path.join(models_path_prefix, "XiaomiMiMo/MiMo-7B-RL"),
+                                    speculative_model=os.path.join(models_path_prefix, "XiaomiMiMo/MiMo-7B-RL")),
    "Qwen3NextMTP": _HfExamplesInfo(os.path.join(models_path_prefix, "Qwen/Qwen3-Next-80B-A3B-Instruct"),
                                     min_transformers_version="4.56.3"),
 }

--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -233,6 +233,8 @@ if TYPE_CHECKING:
    VLLM_USE_LIGHTOP_MOE_SUM: bool = False
    VLLM_USE_LIGHTOP_MOE_ALIGN: bool = False
    VLLM_USE_MERGE_ATTN_STATES_OPT: bool = False
+    USE_FUSED_RMS_QUANT: bool = False
+    USE_FUSED_SILU_MUL_QUANT: bool = False
    VLLM_USE_PD_SPLIT: bool = False
    VLLM_USE_PP_SYNC: bool = False
    VLLM_USE_PIECEWISE: bool = False
@@ -1635,9 +1637,19 @@ environment_variables: dict[str, Callable[[], Any]] = {
    "VLLM_USE_MERGE_ATTN_STATES_OPT":
        lambda: (os.environ.get("VLLM_USE_MERGE_ATTN_STATES_OPT", "True").lower() in
                 ("true", "1")),  
+    # vLLM will use the rmsquant fused op (disabled by default)
+    "USE_FUSED_RMS_QUANT": 
+    lambda: bool(int(os.getenv("USE_FUSED_RMS_QUANT", "0"))),
+    
+    # vLLM will use the silu_mul_quant fused op.
+    # This variable defaults to false ("0") when unset,
+    # and it is still controlled by CRQ and RQ.
+    "USE_FUSED_SILU_MUL_QUANT":
+    lambda: bool(int(os.getenv("USE_FUSED_SILU_MUL_QUANT", "0"))),
+
    # vLLM will split prefill and decode, not mix up
    "VLLM_USE_PD_SPLIT":
-        lambda: (os.environ.get("VLLM_USE_PD_SPLIT", "False").lower() in
+        lambda: (os.environ.get("VLLM_USE_PD_SPLIT", "True").lower() in
                 ("true", "1")), 
    # vLLM will sync to avoid pp vmfault
    "VLLM_USE_PP_SYNC":

--- a/vllm/model_executor/layers/fused_moe/moe_align_block_size.py
+++ b/vllm/model_executor/layers/fused_moe/moe_align_block_size.py
@@ -102,7 +102,7 @@ def moe_align_block_size(
                                expert_map = expert_map,
                                expert_mask = expert_mask,
                                num_local_tokens = None,
-                                Is_fuse_fill = False)
+                                Is_fuse_fill = True)
    else:
        if envs.VLLM_USE_LIGHTOP_MOE_ALIGN:
            from lightop import op as op
@@ -111,7 +111,7 @@ def moe_align_block_size(
                                expert_map = None,
                                expert_mask = None,
                                num_local_tokens = None,
-                                Is_fuse_fill = False)
+                                Is_fuse_fill = True)
        else:
            ops.moe_align_block_size(topk_ids, num_experts, block_size, sorted_ids,
                                    expert_ids, num_tokens_post_pad)

--- a/vllm/model_executor/layers/rotary_embedding/__init__.py
+++ b/vllm/model_executor/layers/rotary_embedding/__init__.py
@@ -137,15 +137,9 @@ def get_rope(
                    scaling_alpha, dtype)
            elif "factor" in rope_scaling:
                scaling_factor = rope_scaling["factor"]
-                scaling_alpha = rope_scaling["alpha"]
-                if scaling_alpha:
-                    rotary_emb = DynamicNTKAlphaRotaryEmbedding(
-                        head_size, rotary_dim, max_position, base, is_neox_style,
-                        scaling_alpha, dtype)
-                else:
-                    rotary_emb = DynamicNTKScalingRotaryEmbedding(
-                        head_size, rotary_dim, max_position, base, is_neox_style,
-                        scaling_factor, dtype)
+                rotary_emb = DynamicNTKScalingRotaryEmbedding(
+                    head_size, rotary_dim, max_position, base, is_neox_style,
+                    scaling_factor, dtype)
            else:
                raise ValueError("Dynamic rope scaling must contain either "
                                 "'alpha' or 'factor' field")

--- a/vllm/model_executor/model_loader/utils.py
+++ b/vllm/model_executor/model_loader/utils.py
@@ -199,11 +199,11 @@ def _get_model_architecture(
                if not envs.is_set("VLLM_USE_OPT_CAT"):
                    os.environ['VLLM_USE_OPT_CAT'] = '1'
            else:
-                if not envs.is_set("VLLM_USE_PD_SPLIT"):
-                    os.environ['VLLM_USE_PD_SPLIT'] = '1'
+                # if not envs.is_set("VLLM_USE_PD_SPLIT"):
+                #     os.environ['VLLM_USE_PD_SPLIT'] = '1'
                if architectures in [['Qwen3MoeForCausalLM']]:
-                    # if not envs.is_set("VLLM_USE_LIGHTOP_MOE_ALIGN"):
-                    #     os.environ['VLLM_USE_LIGHTOP_MOE_ALIGN'] = '1'
+                    if not envs.is_set("VLLM_USE_LIGHTOP_MOE_ALIGN"):
+                        os.environ['VLLM_USE_LIGHTOP_MOE_ALIGN'] = '1'
                    if not envs.is_set("VLLM_USE_LIGHTOP_MOE_SUM"):
                        os.environ['VLLM_USE_LIGHTOP_MOE_SUM'] = '1'    
                    if not envs.is_set("VLLM_USE_FUSE_SILU_AND_MUL"):
@@ -226,11 +226,11 @@ def _get_model_architecture(
                if not envs.is_set("VLLM_USE_OPT_CAT"):
                    os.environ['VLLM_USE_OPT_CAT'] = '1'
            else:
-                if not envs.is_set("VLLM_USE_PD_SPLIT"):
-                    os.environ['VLLM_USE_PD_SPLIT'] = '1'
+                # if not envs.is_set("VLLM_USE_PD_SPLIT"):
+                #     os.environ['VLLM_USE_PD_SPLIT'] = '1'
                if architectures in [['Qwen3MoeForCausalLM']]:
-                    # if not envs.is_set("VLLM_USE_LIGHTOP_MOE_ALIGN"):
-                    #     os.environ['VLLM_USE_LIGHTOP_MOE_ALIGN'] = '1'
+                    if not envs.is_set("VLLM_USE_LIGHTOP_MOE_ALIGN"):
+                        os.environ['VLLM_USE_LIGHTOP_MOE_ALIGN'] = '1'
                    if not envs.is_set("VLLM_USE_LIGHTOP_MOE_SUM"):
                        os.environ['VLLM_USE_LIGHTOP_MOE_SUM'] = '1'    
                    if not envs.is_set("VLLM_USE_FUSE_SILU_AND_MUL"):

--- a/vllm/utils/__init__.py
+++ b/vllm/utils/__init__.py
@@ -129,8 +129,8 @@ STR_DTYPE_TO_TORCH_DTYPE = {
    "bfloat16": torch.bfloat16,
    "float": torch.float,
    "fp8": torch.uint8,
-    # "fp8_e4m3": torch.uint8,
-    # "fp8_e5m2": torch.uint8,
+    "fp8_e4m3": torch.uint8,
+    "fp8_e5m2": torch.uint8,
    "int8": torch.int8,
    "fp8_inc": torch.float8_e4m3fn,
    "fp8_ds_mla": torch.uint8,

--- a/vllm/v1/core/sched/scheduler.py
+++ b/vllm/v1/core/sched/scheduler.py
@@ -1089,14 +1089,15 @@ class Scheduler(SchedulerInterface):
    
    def schedule(self) -> SchedulerOutput:
        if envs.VLLM_USE_PD_SPLIT: 
-            return self.schedule_split_pd()
-        else:
-            if self.connector is not None:
-                return self.schedule_default()
-            if self.full_cuda_graph and self.use_mla and self.num_spec_tokens > 0 :
-                return self.schedule_split_pd()
+            if self.use_mla:  # MLA splits PD only with full CUDA graph + spec tokens
+                if self.full_cuda_graph and self.num_spec_tokens > 0:
+                    return self.schedule_split_pd()
+                else:
+                    return self.schedule_default()  # must return, else schedule() yields None
            else:
-                return self.schedule_default()
+                return self.schedule_split_pd()
+        else:
+            return self.schedule_default()

    def _update_after_schedule(
        self,