Unverified commit a3205bef, authored by Lucas Kabela, committed by GitHub
Browse files

[CI] Enable mypy coverage for individual excluded files (#34292)


Signed-off-by: Lucas Kabela <lucaskabela@meta.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
parent 6930becd
...@@ -36,7 +36,6 @@ SEPARATE_GROUPS = [ ...@@ -36,7 +36,6 @@ SEPARATE_GROUPS = [
# TODO(woosuk): Include the code from Megatron and HuggingFace. # TODO(woosuk): Include the code from Megatron and HuggingFace.
EXCLUDE = [ EXCLUDE = [
"vllm/engine/arg_utils.py",
"vllm/model_executor/parallel_utils", "vllm/model_executor/parallel_utils",
"vllm/model_executor/models", "vllm/model_executor/models",
"vllm/model_executor/layers/fla/ops", "vllm/model_executor/layers/fla/ops",
...@@ -49,9 +48,6 @@ EXCLUDE = [ ...@@ -49,9 +48,6 @@ EXCLUDE = [
"vllm/profiler", "vllm/profiler",
"vllm/reasoning", "vllm/reasoning",
"vllm/tool_parser", "vllm/tool_parser",
"vllm/v1/cudagraph_dispatcher.py",
"vllm/outputs.py",
"vllm/logger.py",
] ]
......
...@@ -39,7 +39,7 @@ KVOffloadingBackend = Literal["native", "lmcache"] ...@@ -39,7 +39,7 @@ KVOffloadingBackend = Literal["native", "lmcache"]
class CacheConfig: class CacheConfig:
"""Configuration for the KV cache.""" """Configuration for the KV cache."""
block_size: SkipValidation[BlockSize] = None # type: ignore block_size: SkipValidation[BlockSize] = None # type: ignore[assignment]
"""Size of a contiguous cache block in number of tokens. On CUDA devices, """Size of a contiguous cache block in number of tokens. On CUDA devices,
only block sizes up to 32 are supported. only block sizes up to 32 are supported.
......
...@@ -182,7 +182,7 @@ class ParallelConfig: ...@@ -182,7 +182,7 @@ class ParallelConfig:
threshold, microbatching will be used. Otherwise, the request will be threshold, microbatching will be used. Otherwise, the request will be
processed in a single batch.""" processed in a single batch."""
disable_nccl_for_dp_synchronization: bool = Field(default=None) disable_nccl_for_dp_synchronization: bool | None = Field(default=None)
"""Forces the dp synchronization logic in vllm/v1/worker/dp_utils.py """Forces the dp synchronization logic in vllm/v1/worker/dp_utils.py
to use Gloo instead of NCCL for its all reduce. to use Gloo instead of NCCL for its all reduce.
......
...@@ -115,7 +115,7 @@ class SchedulerConfig: ...@@ -115,7 +115,7 @@ class SchedulerConfig:
# scheduler class or path. "vllm.v1.core.sched.scheduler.Scheduler" # scheduler class or path. "vllm.v1.core.sched.scheduler.Scheduler"
# (default) or "mod.custom_class". # (default) or "mod.custom_class".
scheduler_cls: str | type[object] = Field(default=None) scheduler_cls: str | type[object] | None = Field(default=None)
"""The scheduler class to use. "vllm.v1.core.sched.scheduler.Scheduler" is """The scheduler class to use. "vllm.v1.core.sched.scheduler.Scheduler" is
the default scheduler. Can be a class directly or the path to a class of the default scheduler. Can be a class directly or the path to a class of
form "mod.custom_class".""" form "mod.custom_class"."""
...@@ -128,7 +128,7 @@ class SchedulerConfig: ...@@ -128,7 +128,7 @@ class SchedulerConfig:
and starting configuration. and starting configuration.
""" """
async_scheduling: bool = Field(default=None) async_scheduling: bool | None = Field(default=None)
"""If set to False, disable async scheduling. Async scheduling helps to """If set to False, disable async scheduling. Async scheduling helps to
avoid gaps in GPU utilization, leading to better latency and throughput. avoid gaps in GPU utilization, leading to better latency and throughput.
""" """
......
...@@ -10,7 +10,7 @@ import json ...@@ -10,7 +10,7 @@ import json
import pathlib import pathlib
import textwrap import textwrap
from collections.abc import Callable, Mapping, Sequence, Set from collections.abc import Callable, Mapping, Sequence, Set
from dataclasses import MISSING, Field, field, fields, is_dataclass from dataclasses import MISSING, field, fields, is_dataclass
from itertools import pairwise from itertools import pairwise
from typing import TYPE_CHECKING, Any, Protocol, TypeVar, cast from typing import TYPE_CHECKING, Any, Protocol, TypeVar, cast
...@@ -66,7 +66,7 @@ def config( ...@@ -66,7 +66,7 @@ def config(
return decorator(cls) return decorator(cls)
def get_field(cls: ConfigType, name: str) -> Field: def get_field(cls: ConfigType, name: str) -> Any:
"""Get the default factory field of a dataclass by name. Used for getting """Get the default factory field of a dataclass by name. Used for getting
default factory fields in `EngineArgs`.""" default factory fields in `EngineArgs`."""
if not is_dataclass(cls): if not is_dataclass(cls):
......
...@@ -67,6 +67,7 @@ from vllm.config.cache import ( ...@@ -67,6 +67,7 @@ from vllm.config.cache import (
PrefixCachingHashAlgo, PrefixCachingHashAlgo,
) )
from vllm.config.device import Device from vllm.config.device import Device
from vllm.config.lora import MaxLoRARanks
from vllm.config.model import ( from vllm.config.model import (
ConvertOption, ConvertOption,
HfOverrides, HfOverrides,
...@@ -77,7 +78,12 @@ from vllm.config.model import ( ...@@ -77,7 +78,12 @@ from vllm.config.model import (
) )
from vllm.config.multimodal import MMCacheType, MMEncoderTPMode from vllm.config.multimodal import MMCacheType, MMEncoderTPMode
from vllm.config.observability import DetailedTraceModules from vllm.config.observability import DetailedTraceModules
from vllm.config.parallel import DistributedExecutorBackend, ExpertPlacementStrategy from vllm.config.parallel import (
All2AllBackend,
DataParallelBackend,
DistributedExecutorBackend,
ExpertPlacementStrategy,
)
from vllm.config.scheduler import SchedulerPolicy from vllm.config.scheduler import SchedulerPolicy
from vllm.config.utils import get_field from vllm.config.utils import get_field
from vllm.config.vllm import OptimizationLevel from vllm.config.vllm import OptimizationLevel
...@@ -257,7 +263,7 @@ def _compute_kwargs(cls: ConfigType) -> dict[str, dict[str, Any]]: ...@@ -257,7 +263,7 @@ def _compute_kwargs(cls: ConfigType) -> dict[str, dict[str, Any]]:
# VllmConfig's Fields have default_factory set to config classes. # VllmConfig's Fields have default_factory set to config classes.
# These could emit logs on init, which would be confusing. # These could emit logs on init, which would be confusing.
with suppress_logging(): with suppress_logging():
default = default.default_factory() default = default.default_factory() # type: ignore[call-arg]
elif field.default_factory is not MISSING: elif field.default_factory is not MISSING:
default = field.default_factory() default = field.default_factory()
...@@ -373,7 +379,7 @@ class EngineArgs: ...@@ -373,7 +379,7 @@ class EngineArgs:
dtype: ModelDType = ModelConfig.dtype dtype: ModelDType = ModelConfig.dtype
kv_cache_dtype: CacheDType = CacheConfig.cache_dtype kv_cache_dtype: CacheDType = CacheConfig.cache_dtype
seed: int = ModelConfig.seed seed: int = ModelConfig.seed
max_model_len: int | None = ModelConfig.max_model_len max_model_len: int = ModelConfig.max_model_len
cudagraph_capture_sizes: list[int] | None = ( cudagraph_capture_sizes: list[int] | None = (
CompilationConfig.cudagraph_capture_sizes CompilationConfig.cudagraph_capture_sizes
) )
...@@ -405,9 +411,9 @@ class EngineArgs: ...@@ -405,9 +411,9 @@ class EngineArgs:
data_parallel_rpc_port: int | None = None data_parallel_rpc_port: int | None = None
data_parallel_hybrid_lb: bool = False data_parallel_hybrid_lb: bool = False
data_parallel_external_lb: bool = False data_parallel_external_lb: bool = False
data_parallel_backend: str = ParallelConfig.data_parallel_backend data_parallel_backend: DataParallelBackend = ParallelConfig.data_parallel_backend
enable_expert_parallel: bool = ParallelConfig.enable_expert_parallel enable_expert_parallel: bool = ParallelConfig.enable_expert_parallel
all2all_backend: str = ParallelConfig.all2all_backend all2all_backend: All2AllBackend = ParallelConfig.all2all_backend
enable_dbo: bool = ParallelConfig.enable_dbo enable_dbo: bool = ParallelConfig.enable_dbo
ubatch_size: int = ParallelConfig.ubatch_size ubatch_size: int = ParallelConfig.ubatch_size
dbo_decode_token_threshold: int = ParallelConfig.dbo_decode_token_threshold dbo_decode_token_threshold: int = ParallelConfig.dbo_decode_token_threshold
...@@ -425,7 +431,7 @@ class EngineArgs: ...@@ -425,7 +431,7 @@ class EngineArgs:
max_parallel_loading_workers: int | None = ( max_parallel_loading_workers: int | None = (
ParallelConfig.max_parallel_loading_workers ParallelConfig.max_parallel_loading_workers
) )
block_size: BlockSize | None = CacheConfig.block_size block_size: BlockSize = CacheConfig.block_size
enable_prefix_caching: bool | None = None enable_prefix_caching: bool | None = None
prefix_caching_hash_algo: PrefixCachingHashAlgo = ( prefix_caching_hash_algo: PrefixCachingHashAlgo = (
CacheConfig.prefix_caching_hash_algo CacheConfig.prefix_caching_hash_algo
...@@ -451,7 +457,7 @@ class EngineArgs: ...@@ -451,7 +457,7 @@ class EngineArgs:
hf_token: bool | str | None = ModelConfig.hf_token hf_token: bool | str | None = ModelConfig.hf_token
hf_overrides: HfOverrides = get_field(ModelConfig, "hf_overrides") hf_overrides: HfOverrides = get_field(ModelConfig, "hf_overrides")
tokenizer_revision: str | None = ModelConfig.tokenizer_revision tokenizer_revision: str | None = ModelConfig.tokenizer_revision
quantization: QuantizationMethods | None = ModelConfig.quantization quantization: QuantizationMethods | str | None = ModelConfig.quantization
allow_deprecated_quantization: bool = ModelConfig.allow_deprecated_quantization allow_deprecated_quantization: bool = ModelConfig.allow_deprecated_quantization
enforce_eager: bool = ModelConfig.enforce_eager enforce_eager: bool = ModelConfig.enforce_eager
disable_custom_all_reduce: bool = ParallelConfig.disable_custom_all_reduce disable_custom_all_reduce: bool = ParallelConfig.disable_custom_all_reduce
...@@ -479,11 +485,11 @@ class EngineArgs: ...@@ -479,11 +485,11 @@ class EngineArgs:
) )
io_processor_plugin: str | None = None io_processor_plugin: str | None = None
skip_mm_profiling: bool = MultiModalConfig.skip_mm_profiling skip_mm_profiling: bool = MultiModalConfig.skip_mm_profiling
video_pruning_rate: float = MultiModalConfig.video_pruning_rate video_pruning_rate: float | None = MultiModalConfig.video_pruning_rate
# LoRA fields # LoRA fields
enable_lora: bool = False enable_lora: bool = False
max_loras: int = LoRAConfig.max_loras max_loras: int = LoRAConfig.max_loras
max_lora_rank: int = LoRAConfig.max_lora_rank max_lora_rank: MaxLoRARanks = LoRAConfig.max_lora_rank
default_mm_loras: dict[str, str] | None = LoRAConfig.default_mm_loras default_mm_loras: dict[str, str] | None = LoRAConfig.default_mm_loras
fully_sharded_loras: bool = LoRAConfig.fully_sharded_loras fully_sharded_loras: bool = LoRAConfig.fully_sharded_loras
max_cpu_loras: int | None = LoRAConfig.max_cpu_loras max_cpu_loras: int | None = LoRAConfig.max_cpu_loras
...@@ -557,7 +563,7 @@ class EngineArgs: ...@@ -557,7 +563,7 @@ class EngineArgs:
ModelConfig, "override_generation_config" ModelConfig, "override_generation_config"
) )
model_impl: str = ModelConfig.model_impl model_impl: str = ModelConfig.model_impl
override_attention_dtype: str = ModelConfig.override_attention_dtype override_attention_dtype: str | None = ModelConfig.override_attention_dtype
attention_backend: AttentionBackendEnum | None = AttentionConfig.backend attention_backend: AttentionBackendEnum | None = AttentionConfig.backend
calculate_kv_scales: bool = CacheConfig.calculate_kv_scales calculate_kv_scales: bool = CacheConfig.calculate_kv_scales
...@@ -569,7 +575,7 @@ class EngineArgs: ...@@ -569,7 +575,7 @@ class EngineArgs:
additional_config: dict[str, Any] = get_field(VllmConfig, "additional_config") additional_config: dict[str, Any] = get_field(VllmConfig, "additional_config")
use_tqdm_on_load: bool = LoadConfig.use_tqdm_on_load use_tqdm_on_load: bool = LoadConfig.use_tqdm_on_load
pt_load_map_location: str = LoadConfig.pt_load_map_location pt_load_map_location: str | dict[str, str] = LoadConfig.pt_load_map_location
logits_processors: list[str | type[LogitsProcessor]] | None = ( logits_processors: list[str | type[LogitsProcessor]] | None = (
ModelConfig.logits_processors ModelConfig.logits_processors
...@@ -1280,7 +1286,7 @@ class EngineArgs: ...@@ -1280,7 +1286,7 @@ class EngineArgs:
hf_config_path=self.hf_config_path, hf_config_path=self.hf_config_path,
runner=self.runner, runner=self.runner,
convert=self.convert, convert=self.convert,
tokenizer=self.tokenizer, tokenizer=self.tokenizer, # type: ignore[arg-type]
tokenizer_mode=self.tokenizer_mode, tokenizer_mode=self.tokenizer_mode,
trust_remote_code=self.trust_remote_code, trust_remote_code=self.trust_remote_code,
allowed_local_media_path=self.allowed_local_media_path, allowed_local_media_path=self.allowed_local_media_path,
...@@ -1445,12 +1451,16 @@ class EngineArgs: ...@@ -1445,12 +1451,16 @@ class EngineArgs:
self.kv_cache_dtype, model_config self.kv_cache_dtype, model_config
) )
assert self.enable_prefix_caching is not None, (
"enable_prefix_caching must be set by this point"
)
cache_config = CacheConfig( cache_config = CacheConfig(
block_size=self.block_size, block_size=self.block_size,
gpu_memory_utilization=self.gpu_memory_utilization, gpu_memory_utilization=self.gpu_memory_utilization,
kv_cache_memory_bytes=self.kv_cache_memory_bytes, kv_cache_memory_bytes=self.kv_cache_memory_bytes,
swap_space=self.swap_space, swap_space=self.swap_space,
cache_dtype=resolved_cache_dtype, cache_dtype=resolved_cache_dtype, # type: ignore[arg-type]
is_attention_free=model_config.is_attention_free, is_attention_free=model_config.is_attention_free,
num_gpu_blocks_override=self.num_gpu_blocks_override, num_gpu_blocks_override=self.num_gpu_blocks_override,
sliding_window=sliding_window, sliding_window=sliding_window,
...@@ -1676,6 +1686,16 @@ class EngineArgs: ...@@ -1676,6 +1686,16 @@ class EngineArgs:
target_parallel_config=parallel_config, target_parallel_config=parallel_config,
) )
assert self.max_num_batched_tokens is not None, (
"max_num_batched_tokens must be set by this point"
)
assert self.max_num_seqs is not None, "max_num_seqs must be set by this point"
assert self.enable_chunked_prefill is not None, (
"enable_chunked_prefill must be set by this point"
)
assert model_config.max_model_len is not None, (
"max_model_len must be set by this point"
)
scheduler_config = SchedulerConfig( scheduler_config = SchedulerConfig(
runner_type=model_config.runner_type, runner_type=model_config.runner_type,
max_num_batched_tokens=self.max_num_batched_tokens, max_num_batched_tokens=self.max_num_batched_tokens,
...@@ -2043,6 +2063,9 @@ class EngineArgs: ...@@ -2043,6 +2063,9 @@ class EngineArgs:
) )
if orig_max_num_batched_tokens is None: if orig_max_num_batched_tokens is None:
assert model_config.max_model_len is not None, (
"max_model_len must be set by this point"
)
if not self.enable_chunked_prefill: if not self.enable_chunked_prefill:
# If max_model_len is too short, use the default for higher throughput. # If max_model_len is too short, use the default for higher throughput.
self.max_num_batched_tokens = max( self.max_num_batched_tokens = max(
......
...@@ -38,7 +38,7 @@ def _use_color() -> bool: ...@@ -38,7 +38,7 @@ def _use_color() -> bool:
return False return False
DEFAULT_LOGGING_CONFIG = { DEFAULT_LOGGING_CONFIG: dict[str, dict[str, Any] | Any] = {
"formatters": { "formatters": {
"vllm": { "vllm": {
"class": "vllm.logging_utils.NewLineFormatter", "class": "vllm.logging_utils.NewLineFormatter",
...@@ -157,7 +157,7 @@ _METHODS_TO_PATCH = { ...@@ -157,7 +157,7 @@ _METHODS_TO_PATCH = {
def _configure_vllm_root_logger() -> None: def _configure_vllm_root_logger() -> None:
logging_config = dict[str, dict[str, Any] | Any]() logging_config: dict[str, dict[str, Any] | Any] = {}
if not envs.VLLM_CONFIGURE_LOGGING and envs.VLLM_LOGGING_CONFIG_PATH: if not envs.VLLM_CONFIGURE_LOGGING and envs.VLLM_LOGGING_CONFIG_PATH:
raise RuntimeError( raise RuntimeError(
...@@ -225,7 +225,8 @@ def suppress_logging(level: int = logging.INFO) -> Generator[None, Any, None]: ...@@ -225,7 +225,8 @@ def suppress_logging(level: int = logging.INFO) -> Generator[None, Any, None]:
logging.disable(current_level) logging.disable(current_level)
def current_formatter_type(lgr: Logger) -> Literal["color", "newline", None]: def current_formatter_type(logger: Logger) -> Literal["color", "newline", None]:
lgr: Logger | None = logger
while lgr is not None: while lgr is not None:
if lgr.handlers and len(lgr.handlers) == 1 and lgr.handlers[0].name == "vllm": if lgr.handlers and len(lgr.handlers) == 1 and lgr.handlers[0].name == "vllm":
formatter = lgr.handlers[0].formatter formatter = lgr.handlers[0].formatter
......
...@@ -162,7 +162,7 @@ class RequestOutput: ...@@ -162,7 +162,7 @@ class RequestOutput:
completion.token_ids.extend(next_completion.token_ids) completion.token_ids.extend(next_completion.token_ids)
if next_completion.logprobs: if next_completion.logprobs:
assert completion.logprobs is not None assert completion.logprobs is not None
completion.logprobs.extend(next_completion.logprobs) completion.logprobs.extend(next_completion.logprobs) # type: ignore[arg-type]
completion.cumulative_logprob = ( completion.cumulative_logprob = (
next_completion.cumulative_logprob next_completion.cumulative_logprob
) )
......
...@@ -71,6 +71,9 @@ class CudagraphDispatcher: ...@@ -71,6 +71,9 @@ class CudagraphDispatcher:
"""Pre-compute the mapping from batch size to padded graph size.""" """Pre-compute the mapping from batch size to padded graph size."""
max_size = self.compilation_config.max_cudagraph_capture_size max_size = self.compilation_config.max_cudagraph_capture_size
capture_sizes = self.compilation_config.cudagraph_capture_sizes capture_sizes = self.compilation_config.cudagraph_capture_sizes
assert capture_sizes is not None, (
"Cudagraph capture sizes must be set when cudagraphs are enabled."
)
self._bs_to_padded_graph_size: list[int] = [0] * (max_size + 1) self._bs_to_padded_graph_size: list[int] = [0] * (max_size + 1)
for end, start in zip( for end, start in zip(
capture_sizes + [max_size + 1], capture_sizes + [max_size + 1],
...@@ -89,6 +92,7 @@ class CudagraphDispatcher: ...@@ -89,6 +92,7 @@ class CudagraphDispatcher:
and self.cudagraph_mode != CUDAGraphMode.NONE and self.cudagraph_mode != CUDAGraphMode.NONE
): ):
for size in self.compilation_config.compile_sizes: for size in self.compilation_config.compile_sizes:
size = int(size)
if size <= self.compilation_config.max_cudagraph_capture_size: if size <= self.compilation_config.max_cudagraph_capture_size:
padded = self._bs_to_padded_graph_size[size] padded = self._bs_to_padded_graph_size[size]
if padded != size: if padded != size:
...@@ -178,6 +182,9 @@ class CudagraphDispatcher: ...@@ -178,6 +182,9 @@ class CudagraphDispatcher:
# guarantee all keys would be used. For example, if we allow lazy # guarantee all keys would be used. For example, if we allow lazy
# capturing in future PR, some keys may never be triggered. # capturing in future PR, some keys may never be triggered.
if cudagraph_mode.mixed_mode() != CUDAGraphMode.NONE: if cudagraph_mode.mixed_mode() != CUDAGraphMode.NONE:
assert self.compilation_config.cudagraph_capture_sizes is not None, (
"Cudagraph capture sizes must be set when mixed mode is enabled."
)
for bs, num_active_loras in product( for bs, num_active_loras in product(
self.compilation_config.cudagraph_capture_sizes, lora_cases self.compilation_config.cudagraph_capture_sizes, lora_cases
): ):
...@@ -200,6 +207,9 @@ class CudagraphDispatcher: ...@@ -200,6 +207,9 @@ class CudagraphDispatcher:
uniform_decode_query_len uniform_decode_query_len
* self.vllm_config.scheduler_config.max_num_seqs * self.vllm_config.scheduler_config.max_num_seqs
) )
assert self.compilation_config.cudagraph_capture_sizes is not None, (
"Cudagraph capture sizes must be set when full mode is enabled."
)
cudagraph_capture_sizes_for_decode = [ cudagraph_capture_sizes_for_decode = [
x x
for x in self.compilation_config.cudagraph_capture_sizes for x in self.compilation_config.cudagraph_capture_sizes
...@@ -262,6 +272,9 @@ class CudagraphDispatcher: ...@@ -262,6 +272,9 @@ class CudagraphDispatcher:
else: else:
# When not specializing, graphs are captured only with max_loras + 1, # When not specializing, graphs are captured only with max_loras + 1,
# so we must use max_loras + 1 for dispatch to find a matching graph. # so we must use max_loras + 1 for dispatch to find a matching graph.
assert self.vllm_config.lora_config is not None, (
"LoRA config must be set when has_lora is True."
)
effective_num_active_loras = self.vllm_config.lora_config.max_loras + 1 effective_num_active_loras = self.vllm_config.lora_config.max_loras + 1
batch_desc = self._create_padded_batch_descriptor( batch_desc = self._create_padded_batch_descriptor(
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment