Commit b8ef3436 authored by zhuwenwen
Browse files

Fix Optional error: replace `Optional[X]` / `Union[...]` annotations with `X | None` syntax

parent cffe15ef
...@@ -237,7 +237,7 @@ if TYPE_CHECKING: ...@@ -237,7 +237,7 @@ if TYPE_CHECKING:
VLLM_COMPILE_CACHE_SAVE_FORMAT: Literal["binary", "unpacked"] = "binary" VLLM_COMPILE_CACHE_SAVE_FORMAT: Literal["binary", "unpacked"] = "binary"
VLLM_USE_V2_MODEL_RUNNER: bool = False VLLM_USE_V2_MODEL_RUNNER: bool = False
# add envs # add envs
VLLM_OPTEST_URLS_PORT: Optional[int] = None VLLM_OPTEST_URLS_PORT: int | None = None
VLLM_OPTEST_MODELS_PATH: str = "" VLLM_OPTEST_MODELS_PATH: str = ""
VLLM_USE_TRITON_PREFIX_FLASH_ATTN: bool = False VLLM_USE_TRITON_PREFIX_FLASH_ATTN: bool = False
VLLM_USE_FLASH_MLA: bool = False VLLM_USE_FLASH_MLA: bool = False
...@@ -248,7 +248,7 @@ if TYPE_CHECKING: ...@@ -248,7 +248,7 @@ if TYPE_CHECKING:
VLLM_SPEC_DECODE_EAGER: bool = False VLLM_SPEC_DECODE_EAGER: bool = False
VLLM_PCIE_USE_CUSTOM_ALLREDUCE: bool = False VLLM_PCIE_USE_CUSTOM_ALLREDUCE: bool = False
VLLM_CUSTOM_ALLREDUCE_SUPPORTED_WORLDSIZE_MAX: int = 16 VLLM_CUSTOM_ALLREDUCE_SUPPORTED_WORLDSIZE_MAX: int = 16
VLLM_ENFORCE_EAGER_BS_THRESHOLD: Optional[int] = None VLLM_ENFORCE_EAGER_BS_THRESHOLD: int | None = None
VLLM_HAS_CONTEXT_DEFAULT: bool = False VLLM_HAS_CONTEXT_DEFAULT: bool = False
VLLM_USE_NN: bool = False VLLM_USE_NN: bool = False
VLLM_ENABLE_TBO: bool = False VLLM_ENABLE_TBO: bool = False
......
...@@ -1208,7 +1208,7 @@ def get_moe_configs( ...@@ -1208,7 +1208,7 @@ def get_moe_configs(
dtype: str | None, dtype: str | None,
block_n: int | None = None, block_n: int | None = None,
block_k: int | None = None, block_k: int | None = None,
use_nn_moe: Optional[bool] = False, use_nn_moe: bool | None = False,
) -> dict[int, Any] | None: ) -> dict[int, Any] | None:
""" """
Return optimized configurations for the fused MoE kernel. Return optimized configurations for the fused MoE kernel.
...@@ -1365,7 +1365,7 @@ def get_default_config( ...@@ -1365,7 +1365,7 @@ def get_default_config(
topk: int, topk: int,
dtype: str | None, dtype: str | None,
block_shape: list[int] | None = None, block_shape: list[int] | None = None,
use_nn_moe: Optional[bool]=False, use_nn_moe: bool | None = False,
) -> dict[str, int]: ) -> dict[str, int]:
if vllm_is_batch_invariant(): if vllm_is_batch_invariant():
config = { config = {
...@@ -1434,7 +1434,7 @@ def try_get_optimal_moe_config( ...@@ -1434,7 +1434,7 @@ def try_get_optimal_moe_config(
dtype: str | None, dtype: str | None,
M: int, M: int,
block_shape: list[int] | None = None, block_shape: list[int] | None = None,
use_nn_moe: Optional[bool] = False, use_nn_moe: bool | None = False,
) -> dict[str, int]: ) -> dict[str, int]:
from vllm.model_executor.layers.fused_moe import get_config from vllm.model_executor.layers.fused_moe import get_config
...@@ -1791,7 +1791,7 @@ def inplace_fused_experts( ...@@ -1791,7 +1791,7 @@ def inplace_fused_experts(
block_shape: list[int] | None = None, block_shape: list[int] | None = None,
w1_bias: torch.Tensor | None = None, w1_bias: torch.Tensor | None = None,
w2_bias: torch.Tensor | None = None, w2_bias: torch.Tensor | None = None,
use_nn_moe: Optional[bool] = False, use_nn_moe: bool | None = False,
) -> None: ) -> None:
fused_experts_impl( fused_experts_impl(
hidden_states, hidden_states,
...@@ -1850,7 +1850,7 @@ def inplace_fused_experts_fake( ...@@ -1850,7 +1850,7 @@ def inplace_fused_experts_fake(
block_shape: list[int] | None = None, block_shape: list[int] | None = None,
w1_bias: torch.Tensor | None = None, w1_bias: torch.Tensor | None = None,
w2_bias: torch.Tensor | None = None, w2_bias: torch.Tensor | None = None,
use_nn_moe: Optional[bool] = False, use_nn_moe: bool | None = False,
) -> None: ) -> None:
pass pass
...@@ -1952,7 +1952,7 @@ def outplace_fused_experts_fake( ...@@ -1952,7 +1952,7 @@ def outplace_fused_experts_fake(
block_shape: list[int] | None = None, block_shape: list[int] | None = None,
w1_bias: torch.Tensor | None = None, w1_bias: torch.Tensor | None = None,
w2_bias: torch.Tensor | None = None, w2_bias: torch.Tensor | None = None,
use_nn_moe: Optional[bool] = False, use_nn_moe: bool | None = False,
) -> torch.Tensor: ) -> torch.Tensor:
return torch.empty_like(hidden_states) return torch.empty_like(hidden_states)
...@@ -2002,7 +2002,7 @@ def fused_experts( ...@@ -2002,7 +2002,7 @@ def fused_experts(
allow_deep_gemm: bool = False, allow_deep_gemm: bool = False,
allow_cutlass_block_scaled_grouped_gemm: bool = False, allow_cutlass_block_scaled_grouped_gemm: bool = False,
use_int4_w4a8: bool = False, use_int4_w4a8: bool = False,
use_nn_moe: Optional[bool] = False, use_nn_moe: bool | None = False,
) -> torch.Tensor: ) -> torch.Tensor:
if quant_config is None: if quant_config is None:
quant_config = FUSED_MOE_UNQUANTIZED_CONFIG quant_config = FUSED_MOE_UNQUANTIZED_CONFIG
...@@ -2145,7 +2145,7 @@ def fused_experts_impl( ...@@ -2145,7 +2145,7 @@ def fused_experts_impl(
block_shape: list[int] | None = None, block_shape: list[int] | None = None,
w1_bias: torch.Tensor | None = None, w1_bias: torch.Tensor | None = None,
w2_bias: torch.Tensor | None = None, w2_bias: torch.Tensor | None = None,
use_nn_moe: Optional[bool] = False, use_nn_moe: bool | None = False,
) -> torch.Tensor: ) -> torch.Tensor:
# Check constraints. # Check constraints.
num_tokens = hidden_states.size(0) num_tokens = hidden_states.size(0)
......
...@@ -249,8 +249,8 @@ class RMSNorm(CustomOp): ...@@ -249,8 +249,8 @@ class RMSNorm(CustomOp):
def forward_apex( def forward_apex(
self, self,
x: torch.Tensor, x: torch.Tensor,
residual: Optional[torch.Tensor] = None, residual: torch.Tensor | None = None,
) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
if self.variance_size_override is not None: if self.variance_size_override is not None:
return self.forward_native(x, residual) return self.forward_native(x, residual)
......
...@@ -740,10 +740,10 @@ class MergedColumnParallelLinear(ColumnParallelLinear): ...@@ -740,10 +740,10 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
def forward( def forward(
self, input_, self, input_,
rms_weight: Optional[torch.Tensor] = None, rms_weight: torch.Tensor | None = None,
residual: Optional[torch.Tensor] = None, residual: torch.Tensor | None = None,
update_hd: Optional[bool] = True update_hd: bool | None = True
) -> Union[torch.Tensor, tuple[torch.Tensor, Optional[Parameter]]]: ) -> torch.Tensor | tuple[torch.Tensor, Parameter] | None:
if envs.USE_FUSED_RMS_QUANT and rms_weight is not None: if envs.USE_FUSED_RMS_QUANT and rms_weight is not None:
input_quant_args = None input_quant_args = None
assert residual is not None and rms_weight is not None assert residual is not None and rms_weight is not None
...@@ -795,7 +795,7 @@ class MergedColumnParallelLinear(ColumnParallelLinear): ...@@ -795,7 +795,7 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
skip_bias_add: bool = False, skip_bias_add: bool = False,
params_dtype: torch.dtype | None = None, params_dtype: torch.dtype | None = None,
quant_config: QuantizationConfig | None = None, quant_config: QuantizationConfig | None = None,
eps: Optional[float] = 1e-6, eps: float | None = 1e-6,
prefix: str = "", prefix: str = "",
*, *,
return_bias: bool = True, return_bias: bool = True,
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment