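Fix Optional annotation errors: replace Optional[...]/Union[...] with `X | None` unions and fix the indentation under `if use_moe_wna16_cuda:` in fused_moe.py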

b8ef3436 · zhuwenwen · cffe15ef · b8ef3436 · b8ef3436 · b8ef3436
Commit b8ef3436 authored Dec 13, 2025 by zhuwenwen
4 changed files
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -237,7 +237,7 @@ if TYPE_CHECKING:
    VLLM_COMPILE_CACHE_SAVE_FORMAT: Literal["binary", "unpacked"] = "binary"
    VLLM_USE_V2_MODEL_RUNNER: bool = False
    # add envs
-    VLLM_OPTEST_URLS_PORT: Optional[int] = None
+    VLLM_OPTEST_URLS_PORT: int | None = None
    VLLM_OPTEST_MODELS_PATH: str = ""
    VLLM_USE_TRITON_PREFIX_FLASH_ATTN: bool = False
    VLLM_USE_FLASH_MLA: bool = False
@@ -248,7 +248,7 @@ if TYPE_CHECKING:
    VLLM_SPEC_DECODE_EAGER: bool = False
    VLLM_PCIE_USE_CUSTOM_ALLREDUCE: bool = False
    VLLM_CUSTOM_ALLREDUCE_SUPPORTED_WORLDSIZE_MAX: int = 16
-    VLLM_ENFORCE_EAGER_BS_THRESHOLD: Optional[int] = None
+    VLLM_ENFORCE_EAGER_BS_THRESHOLD: int | None  = None
    VLLM_HAS_CONTEXT_DEFAULT: bool = False
    VLLM_USE_NN: bool = False
    VLLM_ENABLE_TBO: bool = False

--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -960,24 +960,24 @@ def invoke_fused_moe_kernel(
            )
            if use_moe_wna16_cuda:
-            bit = 4 if use_int4_w4a16 else 8
+                bit = 4 if use_int4_w4a16 else 8
-            ops.moe_wna16_gemm(
+                ops.moe_wna16_gemm(
-                A,
+                    A,
-                C,
+                    C,
-                B,
+                    B,
-                B_scale,
+                    B_scale,
-                B_zp,
+                    B_zp,
-                topk_weights if mul_routed_weight else None,
+                    topk_weights if mul_routed_weight else None,
-                sorted_token_ids,
+                    sorted_token_ids,
-                expert_ids,
+                    expert_ids,
-                num_tokens_post_padded,
+                    num_tokens_post_padded,
-                top_k,
+                    top_k,
-                config["BLOCK_SIZE_M"],
+                    config["BLOCK_SIZE_M"],
-                config["BLOCK_SIZE_N"],
+                    config["BLOCK_SIZE_N"],
-                config["BLOCK_SIZE_K"],
+                    config["BLOCK_SIZE_K"],
-                bit,
+                    bit,
-            )
+                )
-            return
+                return
        if os.environ.get('AWQ_MOE_SZ') == '1':
            fused_moe_kernel_awq[grid](
@@ -1208,7 +1208,7 @@ def get_moe_configs(
    dtype: str | None,
    block_n: int | None = None,
    block_k: int | None = None,
-    use_nn_moe: Optional[bool] = False,
+    use_nn_moe: bool | None = False,
 ) -> dict[int, Any] | None:
    """
    Return optimized configurations for the fused MoE kernel.
@@ -1365,7 +1365,7 @@ def get_default_config(
    topk: int,
    dtype: str | None,
    block_shape: list[int] | None = None,
-    use_nn_moe: Optional[bool]=False,
+    use_nn_moe: bool | None = False,
 ) -> dict[str, int]:
    if vllm_is_batch_invariant():
        config = {
@@ -1434,7 +1434,7 @@ def try_get_optimal_moe_config(
    dtype: str | None,
    M: int,
    block_shape: list[int] | None = None,
-    use_nn_moe: Optional[bool] = False,
+    use_nn_moe: bool | None = False,
 ) -> dict[str, int]:
    from vllm.model_executor.layers.fused_moe import get_config
@@ -1791,7 +1791,7 @@ def inplace_fused_experts(
    block_shape: list[int] | None = None,
    w1_bias: torch.Tensor | None = None,
    w2_bias: torch.Tensor | None = None,
-    use_nn_moe: Optional[bool] = False,
+    use_nn_moe: bool | None = False,
 ) -> None:
    fused_experts_impl(
        hidden_states,
@@ -1850,7 +1850,7 @@ def inplace_fused_experts_fake(
    block_shape: list[int] | None = None,
    w1_bias: torch.Tensor | None = None,
    w2_bias: torch.Tensor | None = None,
-    use_nn_moe: Optional[bool] = False,
+    use_nn_moe: bool | None = False,
 ) -> None:
    pass
@@ -1952,7 +1952,7 @@ def outplace_fused_experts_fake(
    block_shape: list[int] | None = None,
    w1_bias: torch.Tensor | None = None,
    w2_bias: torch.Tensor | None = None,
-    use_nn_moe: Optional[bool] = False,
+    use_nn_moe: bool | None = False,
 ) -> torch.Tensor:
    return torch.empty_like(hidden_states)
@@ -2002,7 +2002,7 @@ def fused_experts(
    allow_deep_gemm: bool = False,
    allow_cutlass_block_scaled_grouped_gemm: bool = False,
    use_int4_w4a8: bool = False,
-    use_nn_moe: Optional[bool] = False,
+    use_nn_moe: bool | None = False,
 ) -> torch.Tensor:
    if quant_config is None:
        quant_config = FUSED_MOE_UNQUANTIZED_CONFIG
@@ -2145,7 +2145,7 @@ def fused_experts_impl(
    block_shape: list[int] | None = None,
    w1_bias: torch.Tensor | None = None,
    w2_bias: torch.Tensor | None = None,
-    use_nn_moe: Optional[bool] = False,
+    use_nn_moe: bool | None = False,
 ) -> torch.Tensor:
    # Check constraints.
    num_tokens = hidden_states.size(0)

--- a/vllm/model_executor/layers/layernorm.py
+++ b/vllm/model_executor/layers/layernorm.py
@@ -249,8 +249,8 @@ class RMSNorm(CustomOp):
    def forward_apex(
        self,
        x: torch.Tensor,
-        residual: Optional[torch.Tensor] = None,
+        residual: torch.Tensor | None = None,
-    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
        if self.variance_size_override is not None:
            return self.forward_native(x, residual)

--- a/vllm/model_executor/layers/linear.py
+++ b/vllm/model_executor/layers/linear.py
@@ -740,10 +740,10 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
    def forward(
        self, input_,
-        rms_weight: Optional[torch.Tensor] = None,
+        rms_weight: torch.Tensor | None = None,
-        residual: Optional[torch.Tensor] = None,
+        residual: torch.Tensor | None = None,
-        update_hd: Optional[bool] = True
+        update_hd:  bool | None = True
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, Optional[Parameter]]]:
+    ) -> torch.Tensor | tuple[torch.Tensor, Parameter] | None:
        if envs.USE_FUSED_RMS_QUANT and rms_weight is not None:
            input_quant_args = None
            assert residual is not None and rms_weight is not None 
@@ -795,7 +795,7 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
        skip_bias_add: bool = False,
        params_dtype: torch.dtype | None = None,
        quant_config: QuantizationConfig | None = None,
-        eps: Optional[float] = 1e-6,
+        eps: float | None = 1e-6,
        prefix: str = "",
        *,
        return_bias: bool = True,