Unverified commit 662205d3, authored by Matthew Bonanni and committed by GitHub
Browse files

[Bugfix] Fix Basic Models Test (#34818)


Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
parent 4fb8beef
...@@ -13,6 +13,7 @@ import torch.nn as nn ...@@ -13,6 +13,7 @@ import torch.nn as nn
from PIL import Image from PIL import Image
from vllm.config import ModelConfig, VllmConfig, set_current_vllm_config from vllm.config import ModelConfig, VllmConfig, set_current_vllm_config
from vllm.config.cache import CacheConfig
from vllm.config.multimodal import ( from vllm.config.multimodal import (
AudioDummyOptions, AudioDummyOptions,
BaseDummyOptions, BaseDummyOptions,
...@@ -131,7 +132,9 @@ def initialize_dummy_model( ...@@ -131,7 +132,9 @@ def initialize_dummy_model(
): ):
temp_file = tempfile.mkstemp()[1] temp_file = tempfile.mkstemp()[1]
current_device = torch.get_default_device() current_device = torch.get_default_device()
vllm_config = VllmConfig(model_config=model_config) vllm_config = VllmConfig(
model_config=model_config, cache_config=CacheConfig(block_size=16)
)
with set_current_vllm_config(vllm_config=vllm_config): with set_current_vllm_config(vllm_config=vllm_config):
init_distributed_environment( init_distributed_environment(
world_size=1, world_size=1,
......
...@@ -457,6 +457,9 @@ def dummy_hf_overrides( ...@@ -457,6 +457,9 @@ def dummy_hf_overrides(
# Kimi uses `num_expert_group` instead of `n_group`. # Kimi uses `num_expert_group` instead of `n_group`.
if n_group is None: if n_group is None:
n_group = getattr(text_config, "num_expert_group", None) n_group = getattr(text_config, "num_expert_group", None)
# InternS1Pro uses `router_n_groups` instead of `n_group`.
if n_group is None:
n_group = getattr(text_config, "router_n_groups", None)
num_experts = n_group * 2 if n_group is not None else 2 num_experts = n_group * 2 if n_group is not None else 2
# we use three layers for Gemma-3n to check # we use three layers for Gemma-3n to check
...@@ -486,12 +489,14 @@ def dummy_hf_overrides( ...@@ -486,12 +489,14 @@ def dummy_hf_overrides(
# Only set MoE related config when the model has MoE layers. # Only set MoE related config when the model has MoE layers.
# Otherwise all models detected as MoE by _get_transformers_backend_cls. # Otherwise all models detected as MoE by _get_transformers_backend_cls.
if model_arch_config.num_experts > 0: if model_arch_config.num_experts > 0:
orig_topk = getattr(text_config, "num_experts_per_tok", 2)
topk = min(orig_topk, 2)
update_dict.update( update_dict.update(
{ {
"num_experts": num_experts, "num_experts": num_experts,
"num_experts_per_tok": 2, "num_experts_per_tok": topk,
# Kimi uses `num_experts_per_token`. # Kimi uses `num_experts_per_token`.
"num_experts_per_token": 2, "num_experts_per_token": topk,
"num_local_experts": num_experts, "num_local_experts": num_experts,
# Otherwise there will not be any expert layers # Otherwise there will not be any expert layers
"first_k_dense_replace": 0, "first_k_dense_replace": 0,
......
...@@ -78,7 +78,7 @@ def _create_proposer( ...@@ -78,7 +78,7 @@ def _create_proposer(
device = current_platform.device_type device = current_platform.device_type
vllm_config = VllmConfig( vllm_config = VllmConfig(
model_config=model_config, model_config=model_config,
cache_config=CacheConfig(), cache_config=CacheConfig(block_size=16),
speculative_config=speculative_config, speculative_config=speculative_config,
device_config=DeviceConfig(device=device), device_config=DeviceConfig(device=device),
parallel_config=ParallelConfig(), parallel_config=ParallelConfig(),
......
...@@ -41,8 +41,8 @@ class CacheConfig: ...@@ -41,8 +41,8 @@ class CacheConfig:
block_size: SkipValidation[int] = None # type: ignore[assignment] block_size: SkipValidation[int] = None # type: ignore[assignment]
"""Size of a contiguous cache block in number of tokens. """Size of a contiguous cache block in number of tokens.
This is None until `Platform.check_and_update_config()` sets it based on This is None until the platform sets it. Always an int by the time
the current platform. Always an int by the time the engine starts.""" the engine starts."""
gpu_memory_utilization: float = Field(default=0.9, gt=0, le=1) gpu_memory_utilization: float = Field(default=0.9, gt=0, le=1)
"""The fraction of GPU memory to be used for the model executor, which can """The fraction of GPU memory to be used for the model executor, which can
range from 0 to 1. For example, a value of 0.5 would imply 50% GPU memory range from 0 to 1. For example, a value of 0.5 would imply 50% GPU memory
......
...@@ -915,32 +915,6 @@ class VllmConfig: ...@@ -915,32 +915,6 @@ class VllmConfig:
) )
current_platform.check_and_update_config(self) current_platform.check_and_update_config(self)
# If DCP, ensure the block size is right.
if self.parallel_config.decode_context_parallel_size > 1:
if self.parallel_config.dcp_kv_cache_interleave_size > 1 and (
self.parallel_config.cp_kv_cache_interleave_size
!= self.parallel_config.dcp_kv_cache_interleave_size
):
self.parallel_config.cp_kv_cache_interleave_size = (
self.parallel_config.dcp_kv_cache_interleave_size
)
logger.warning_once(
"cp_kv_cache_interleave_size is overridden by dcp_kv_cache"
"_interleave_size. And dcp-kv-cache-interleave-size will be "
"deprecated when PCP is fully supported."
)
assert (
self.parallel_config.cp_kv_cache_interleave_size
<= self.cache_config.block_size
and self.cache_config.block_size
% self.parallel_config.cp_kv_cache_interleave_size
== 0
), (
f"Block_size({self.cache_config.block_size}) should be greater "
"than or equal to and divisible by cp_kv_cache_interleave_size "
f"({self.parallel_config.cp_kv_cache_interleave_size})."
)
# Do this after all the updates to compilation_config.mode # Do this after all the updates to compilation_config.mode
effective_dp_size = ( effective_dp_size = (
self.parallel_config.data_parallel_size self.parallel_config.data_parallel_size
...@@ -1108,26 +1082,6 @@ class VllmConfig: ...@@ -1108,26 +1082,6 @@ class VllmConfig:
# Default to enable HMA if not explicitly disabled by user or logic above. # Default to enable HMA if not explicitly disabled by user or logic above.
self.scheduler_config.disable_hybrid_kv_cache_manager = False self.scheduler_config.disable_hybrid_kv_cache_manager = False
if self.cache_config.mamba_cache_mode == "align":
assert (
self.cache_config.block_size
<= self.scheduler_config.max_num_batched_tokens
), (
"In Mamba cache align mode, block_size "
f"({self.cache_config.block_size}) must be <= "
"max_num_batched_tokens "
f"({self.scheduler_config.max_num_batched_tokens})."
)
if self.scheduler_config.long_prefill_token_threshold > 0:
assert (
self.scheduler_config.long_prefill_token_threshold
>= self.cache_config.block_size
)
assert not self.scheduler_config.disable_chunked_mm_input, (
"Chunked MM input is required because we need the flexibility to "
"schedule a multiple of block_size tokens even if they are in the "
"middle of a mm input"
)
if self.compilation_config.debug_dump_path: if self.compilation_config.debug_dump_path:
self.compilation_config.debug_dump_path = ( self.compilation_config.debug_dump_path = (
self.compilation_config.debug_dump_path.absolute().expanduser() self.compilation_config.debug_dump_path.absolute().expanduser()
...@@ -1488,6 +1442,57 @@ class VllmConfig: ...@@ -1488,6 +1442,57 @@ class VllmConfig:
f"compilation_config={self.compilation_config!r}" f"compilation_config={self.compilation_config!r}"
) )
def validate_block_size(self) -> None:
"""Validate block_size against DCP and mamba constraints.
Called after Platform.update_block_size_for_backend() has
finalised block_size, so that the checks see the real value
rather than the initial None sentinel.

Side effect: when decode context parallelism is enabled and
dcp_kv_cache_interleave_size is > 1 and differs from
cp_kv_cache_interleave_size, the latter is overwritten with the
former and a warning is emitted through logger.warning_once.

Raises:
AssertionError: if block_size is still None, or if any of the
constraints checked below is violated.
"""
# Read the value once; every check below uses this local copy.
block_size = self.cache_config.block_size
# Fail early if this is called while block_size is still unset.
assert block_size is not None, (
"validate_block_size called before block_size was set"
)
# DCP interleave-size compatibility
if self.parallel_config.decode_context_parallel_size > 1:
# dcp_kv_cache_interleave_size wins when it is set (> 1) and disagrees
# with cp_kv_cache_interleave_size.
if self.parallel_config.dcp_kv_cache_interleave_size > 1 and (
self.parallel_config.cp_kv_cache_interleave_size
!= self.parallel_config.dcp_kv_cache_interleave_size
):
self.parallel_config.cp_kv_cache_interleave_size = (
self.parallel_config.dcp_kv_cache_interleave_size
)
logger.warning_once(
"cp_kv_cache_interleave_size is overridden by dcp_kv_cache"
"_interleave_size. And dcp-kv-cache-interleave-size will be "
"deprecated when PCP is fully supported."
)
# The interleave size must not exceed block_size and must divide it
# evenly.
assert (
self.parallel_config.cp_kv_cache_interleave_size <= block_size
and block_size % self.parallel_config.cp_kv_cache_interleave_size == 0
), (
f"Block_size({block_size}) should be greater "
"than or equal to and divisible by cp_kv_cache_interleave_size "
f"({self.parallel_config.cp_kv_cache_interleave_size})."
)
# Mamba cache align-mode constraints
# NOTE(review): indentation is not visible in this view; the checks that
# follow presumably all sit inside the align-mode branch -- confirm
# against the real file.
if self.cache_config.mamba_cache_mode == "align":
# block_size may not exceed max_num_batched_tokens.
assert block_size <= self.scheduler_config.max_num_batched_tokens, (
"In Mamba cache align mode, block_size "
f"({block_size}) must be <= "
"max_num_batched_tokens "
f"({self.scheduler_config.max_num_batched_tokens})."
)
# A positive long-prefill threshold must be at least block_size.
if self.scheduler_config.long_prefill_token_threshold > 0:
assert self.scheduler_config.long_prefill_token_threshold >= block_size
# Chunked multimodal input must stay enabled (see message below).
assert not self.scheduler_config.disable_chunked_mm_input, (
"Chunked MM input is required because we need the flexibility "
"to schedule a multiple of block_size tokens even if they are "
"in the middle of a mm input"
)
@model_validator(mode="after") @model_validator(mode="after")
def validate_mamba_block_size(self) -> "VllmConfig": def validate_mamba_block_size(self) -> "VllmConfig":
if self.model_config is None: if self.model_config is None:
......
...@@ -30,9 +30,8 @@ from vllm.v1.kv_cache_interface import ( ...@@ -30,9 +30,8 @@ from vllm.v1.kv_cache_interface import (
def create_chunked_local_attention_backend( def create_chunked_local_attention_backend(
underlying_attn_backend: AttentionBackend, underlying_attn_backend: AttentionBackend,
attention_chunk_size: int, attention_chunk_size: int,
block_size: int,
) -> type[AttentionBackend]: ) -> type[AttentionBackend]:
prefix = f"ChunkedLocalAttention_{attention_chunk_size}_{block_size}_" prefix = f"ChunkedLocalAttention_{attention_chunk_size}_"
underlying_builder = underlying_attn_backend.get_builder_cls() underlying_builder = underlying_attn_backend.get_builder_cls()
assert issubclass(underlying_builder, AttentionMetadataBuilder) assert issubclass(underlying_builder, AttentionMetadataBuilder)
...@@ -55,7 +54,9 @@ def create_chunked_local_attention_backend( ...@@ -55,7 +54,9 @@ def create_chunked_local_attention_backend(
fast_build: bool = False, fast_build: bool = False,
): ):
cm, make_virtual_batches_block_table = make_local_attention_virtual_batches( cm, make_virtual_batches_block_table = make_local_attention_virtual_batches(
attention_chunk_size, common_attn_metadata, block_size attention_chunk_size,
common_attn_metadata,
self.kv_cache_spec.block_size,
) )
metadata = super().build(common_prefix_len, cm, fast_build) metadata = super().build(common_prefix_len, cm, fast_build)
metadata.make_virtual_batches_block_table = make_virtual_batches_block_table metadata.make_virtual_batches_block_table = make_virtual_batches_block_table
...@@ -97,13 +98,13 @@ class ChunkedLocalAttention(Attention): ...@@ -97,13 +98,13 @@ class ChunkedLocalAttention(Attention):
block_size = cache_config.block_size block_size = cache_config.block_size
else: else:
kv_cache_dtype = "auto" kv_cache_dtype = "auto"
block_size = 16 block_size = None
underlying_attn_backend = get_attn_backend( underlying_attn_backend = get_attn_backend(
head_size, dtype, kv_cache_dtype, block_size head_size, dtype, kv_cache_dtype, block_size
) )
attn_backend = create_chunked_local_attention_backend( attn_backend = create_chunked_local_attention_backend(
underlying_attn_backend, attention_chunk_size, block_size underlying_attn_backend, attention_chunk_size
) )
super().__init__( super().__init__(
......
...@@ -407,17 +407,24 @@ class MLAAttention(nn.Module, AttentionLayerBase): ...@@ -407,17 +407,24 @@ class MLAAttention(nn.Module, AttentionLayerBase):
) )
# Attributes for forward_impl method # Attributes for forward_impl method
self.chunked_prefill_workspace_size = ( self._vllm_config = get_current_vllm_config()
MLACommonMetadataBuilder.determine_chunked_prefill_workspace_size( self._chunked_prefill_workspace_size: int | None = None
get_current_vllm_config()
)
)
self._decode_concat_quant_fp8_op = _DecodeConcatQuantFP8( self._decode_concat_quant_fp8_op = _DecodeConcatQuantFP8(
static=True, static=True,
group_shape=GroupShape.PER_TENSOR, group_shape=GroupShape.PER_TENSOR,
compile_native=True, compile_native=True,
) )
@property
def chunked_prefill_workspace_size(self) -> int:
    """Return the chunked-prefill workspace size, computing it on first use.

    While ``self._chunked_prefill_workspace_size`` is ``None`` the value is
    obtained from
    ``MLACommonMetadataBuilder.determine_chunked_prefill_workspace_size``
    using the config held in ``self._vllm_config``; the result is stored
    back on the instance and returned directly on later reads.
    """
    size = self._chunked_prefill_workspace_size
    if size is not None:
        # Already resolved on an earlier access.
        return size
    size = MLACommonMetadataBuilder.determine_chunked_prefill_workspace_size(
        self._vllm_config
    )
    self._chunked_prefill_workspace_size = size
    return size
def forward( def forward(
self, self,
q: torch.Tensor, q: torch.Tensor,
......
...@@ -169,21 +169,6 @@ class CudaPlatformBase(Platform): ...@@ -169,21 +169,6 @@ class CudaPlatformBase(Platform):
if parallel_config.worker_cls == "auto": if parallel_config.worker_cls == "auto":
parallel_config.worker_cls = "vllm.v1.worker.gpu_worker.Worker" parallel_config.worker_cls = "vllm.v1.worker.gpu_worker.Worker"
cache_config = vllm_config.cache_config
user_specified_block_size = cache_config.block_size is not None
if not user_specified_block_size:
cache_config.block_size = 16
# Ensure block_size is compatible with the attention backend.
# Note: model_config may be None during testing.
# Skip hybrid (attention+mamba) models — their block_size is
# managed by HybridAttentionMambaModelConfig
if model_config is not None and not model_config.is_hybrid:
cls._update_block_size_for_backend(
vllm_config,
user_specified_block_size,
)
scheduler_config = vllm_config.scheduler_config scheduler_config = vllm_config.scheduler_config
# Note: model_config may be None during testing # Note: model_config may be None during testing
if ( if (
...@@ -199,148 +184,47 @@ class CudaPlatformBase(Platform): ...@@ -199,148 +184,47 @@ class CudaPlatformBase(Platform):
scheduler_config.disable_chunked_mm_input = True scheduler_config.disable_chunked_mm_input = True
@classmethod @classmethod
def _update_block_size_for_backend( def update_block_size_for_backend(cls, vllm_config: "VllmConfig") -> None:
cls,
vllm_config: "VllmConfig",
user_specified_block_size: bool,
) -> None:
"""Ensure block_size is compatible with the attention backend.
If the user specified --block-size, the selector validates/filters
backends by that block size (raising on incompatibility). Otherwise,
the backend is selected unconstrained and block_size is set to the
backend's preferred value.
"""
from vllm.config.vllm import set_current_vllm_config
from vllm.v1.attention.selector import AttentionSelectorConfig
model_config = vllm_config.model_config
cache_config = vllm_config.cache_config cache_config = vllm_config.cache_config
if cache_config.block_size is not None:
# User specified --block-size; keep it.
return
device_capability = cls.get_device_capability() model_config = vllm_config.model_config
if device_capability is None: # model_config may be None during testing.
# Skip hybrid models — their block_size is managed by
# HybridAttentionMambaModelConfig.
if model_config is None or model_config.is_hybrid:
cache_config.block_size = 16
return return
use_mla = model_config.use_mla from vllm.config.vllm import (
attn_selector_config = AttentionSelectorConfig( get_layers_from_vllm_config,
head_size=model_config.get_head_size(), set_current_vllm_config,
dtype=model_config.dtype, # type: ignore[arg-type] )
kv_cache_dtype=cache_config.cache_dtype, from vllm.model_executor.layers.attention_layer_base import (
block_size=cache_config.block_size if user_specified_block_size else None, AttentionLayerBase,
use_mla=use_mla,
has_sink=False,
use_sparse=use_mla and hasattr(model_config.hf_config, "index_topk"),
use_mm_prefix=model_config.is_mm_prefix_lm,
) )
user_specified_backend = vllm_config.attention_config.backend attn_layers = get_layers_from_vllm_config(
num_heads = model_config.get_num_attention_heads( vllm_config,
vllm_config.parallel_config, AttentionLayerBase,
) )
if not attn_layers:
cache_config.block_size = 16
return
first_layer = next(iter(attn_layers.values()))
backend_cls = first_layer.get_attn_backend()
with set_current_vllm_config(vllm_config): with set_current_vllm_config(vllm_config):
chosen_backend = cls.select_attention_backend( preferred = backend_cls.get_preferred_block_size(16)
selected_backend=user_specified_backend, if preferred != 16:
attn_selector_config=attn_selector_config, logger.info(
device_capability=device_capability, "Setting kv cache block size to %d for %s backend.",
# Don't raise here — we produce better errors below. preferred,
raise_on_invalid=False, backend_cls.get_name(),
num_heads=num_heads,
) )
cache_config.block_size = preferred
# If the user's --block-size forced a non-optimal backend,
# warn them. Only relevant when the user didn't also specify
# --attention-backend (in which case the choice is explicit).
if (
chosen_backend is not None
and user_specified_block_size
and user_specified_backend is None
):
optimal = cls.select_attention_backend(
selected_backend=None,
attn_selector_config=attn_selector_config._replace(
block_size=None,
),
device_capability=device_capability,
raise_on_invalid=False,
num_heads=num_heads,
)
if optimal is not None and optimal != chosen_backend:
logger.warning(
"--block-size %d is not supported by the preferred "
"%s backend. Using %s instead, which may result "
"in reduced performance. Consider removing "
"--block-size to auto-select the optimal "
"block size.",
cache_config.block_size,
optimal.name,
chosen_backend.name,
)
if chosen_backend is not None:
if user_specified_block_size:
# User's block_size is compatible with the chosen
# backend.
return
# User didn't specify --block-size, so auto-select the
# preferred block size for the chosen backend.
try:
backend_class = chosen_backend.get_class()
except ImportError:
return # Will fail later with a better error
preferred = backend_class.get_preferred_block_size(
cache_config.block_size,
)
if cache_config.block_size != preferred:
logger.info(
"Setting kv cache block size to %d for %s backend.",
preferred,
chosen_backend.name,
)
cache_config.block_size = preferred
return
# No valid backend found. If the user didn't constrain the
# selection, defer the error to get_attn_backend_cls where
# the full config (including per-layer settings) is
# available.
if not user_specified_block_size:
return
if user_specified_backend is not None:
# User specified --block-size and --attention-backend
# and they are incompatible.
try:
backend_class = user_specified_backend.get_class()
supported = backend_class.get_supported_kernel_block_sizes()
except ImportError:
supported = None
raise ValueError(
f"User-specified --block-size "
f"{cache_config.block_size} is incompatible with "
f"the specified --attention-backend "
f"{user_specified_backend.name} (supported kernel "
f"block sizes: {supported}). Either remove "
f"--block-size to auto-select, or choose a "
f"compatible value."
)
else:
# User specified --block-size but no backend supports
# it.
_, invalid_reasons = cls.get_valid_backends(
device_capability=device_capability,
attn_selector_config=attn_selector_config,
num_heads=num_heads,
)
reasons_str = ", ".join(
f"{b.name}: [{', '.join(r)}]" for b, r in invalid_reasons.items()
)
raise ValueError(
f"No valid attention backend found for "
f"--block-size {cache_config.block_size}. "
f"Reasons: {{{reasons_str}}}. Either remove "
f"--block-size to auto-select, or choose a "
f"compatible value."
)
@classmethod @classmethod
def get_current_memory_usage( def get_current_memory_usage(
...@@ -358,10 +242,10 @@ class CudaPlatformBase(Platform): ...@@ -358,10 +242,10 @@ class CudaPlatformBase(Platform):
num_heads: int | None = None, num_heads: int | None = None,
) -> tuple[ ) -> tuple[
list[tuple["AttentionBackendEnum", int]], list[tuple["AttentionBackendEnum", int]],
dict["AttentionBackendEnum", list[str]], dict["AttentionBackendEnum", tuple[int, list[str]]],
]: ]:
valid_backends_priorities = [] valid_backends_priorities = []
invalid_reasons = {} invalid_reasons: dict[AttentionBackendEnum, tuple[int, list[str]]] = {}
backend_priorities = _get_backend_priorities( backend_priorities = _get_backend_priorities(
attn_selector_config.use_mla, attn_selector_config.use_mla,
...@@ -378,7 +262,7 @@ class CudaPlatformBase(Platform): ...@@ -378,7 +262,7 @@ class CudaPlatformBase(Platform):
except ImportError: except ImportError:
invalid_reasons_i = ["ImportError"] invalid_reasons_i = ["ImportError"]
if invalid_reasons_i: if invalid_reasons_i:
invalid_reasons[backend] = invalid_reasons_i invalid_reasons[backend] = (priority, invalid_reasons_i)
else: else:
valid_backends_priorities.append((backend, priority)) valid_backends_priorities.append((backend, priority))
...@@ -439,7 +323,7 @@ class CudaPlatformBase(Platform): ...@@ -439,7 +323,7 @@ class CudaPlatformBase(Platform):
"{" "{"
+ ", ".join( + ", ".join(
f"{backend.name}: [{', '.join(reasons)}]" f"{backend.name}: [{', '.join(reasons)}]"
for backend, reasons in invalid_reasons.items() for backend, (_, reasons) in invalid_reasons.items()
) )
+ "}" + "}"
) )
...@@ -452,7 +336,30 @@ class CudaPlatformBase(Platform): ...@@ -452,7 +336,30 @@ class CudaPlatformBase(Platform):
# Select the one with the highest priority (lowest index). # Select the one with the highest priority (lowest index).
sorted_backends = sorted(valid_backends_priorities, key=lambda x: x[1]) sorted_backends = sorted(valid_backends_priorities, key=lambda x: x[1])
return sorted_backends[0][0] chosen_backend, chosen_priority = sorted_backends[0]
# If the user specified --block-size (but not --attention-backend),
# check whether that constraint precluded any higher-priority backends.
if attn_selector_config.block_size is not None:
excluded = [
backend
for backend, (priority, reasons) in invalid_reasons.items()
if priority < chosen_priority
and reasons == ["block_size not supported"]
]
if excluded:
names = ", ".join(b.name for b in excluded)
logger.warning(
"--block-size %d excluded higher-priority backend(s) "
"%s. Using %s instead, which may result in reduced "
"performance. Consider removing --block-size to "
"auto-select the optimal block size.",
attn_selector_config.block_size,
names,
chosen_backend.name,
)
return chosen_backend
@classmethod @classmethod
def get_attn_backend_cls( def get_attn_backend_cls(
...@@ -487,7 +394,7 @@ class CudaPlatformBase(Platform): ...@@ -487,7 +394,7 @@ class CudaPlatformBase(Platform):
"{" "{"
+ ", ".join( + ", ".join(
f"{backend.name}: [{', '.join(reasons)}]" f"{backend.name}: [{', '.join(reasons)}]"
for backend, reasons in invalid_reasons.items() for backend, (_, reasons) in invalid_reasons.items()
) )
+ "}" + "}"
) )
...@@ -499,7 +406,7 @@ class CudaPlatformBase(Platform): ...@@ -499,7 +406,7 @@ class CudaPlatformBase(Platform):
logger.info_once( logger.info_once(
"Using %s attention backend out of potential backends: %s", "Using %s attention backend out of potential backends: %s",
chosen_backend.name, chosen_backend.name,
tuple(b[0].name for b in valid_backends_priorities), tuple(backend.name for backend, _ in valid_backends_priorities),
scope="local", scope="local",
) )
......
...@@ -406,6 +406,13 @@ class Platform: ...@@ -406,6 +406,13 @@ class Platform:
""" """
pass pass
@classmethod
def update_block_size_for_backend(cls, vllm_config: "VllmConfig") -> None:
    """Ensure block_size is compatible with the attention backend.

    The base implementation is a no-op; platforms that need to adjust
    the block size override this hook.

    Args:
        vllm_config: The configuration a platform override may update.
    """
    return None
@classmethod @classmethod
def verify_model_arch(cls, model_arch: str) -> None: def verify_model_arch(cls, model_arch: str) -> None:
""" """
......
...@@ -114,7 +114,14 @@ class EngineCore: ...@@ -114,7 +114,14 @@ class EngineCore:
num_gpu_blocks, num_cpu_blocks, kv_cache_config = self._initialize_kv_caches( num_gpu_blocks, num_cpu_blocks, kv_cache_config = self._initialize_kv_caches(
vllm_config vllm_config
) )
if kv_cache_config.kv_cache_groups:
vllm_config.cache_config.block_size = min(
g.kv_cache_spec.block_size for g in kv_cache_config.kv_cache_groups
)
elif vllm_config.cache_config.block_size is None:
# Attention-free models (encoder-only, SSM) — use default.
vllm_config.cache_config.block_size = 16
vllm_config.validate_block_size()
vllm_config.cache_config.num_gpu_blocks = num_gpu_blocks vllm_config.cache_config.num_gpu_blocks = num_gpu_blocks
vllm_config.cache_config.num_cpu_blocks = num_cpu_blocks vllm_config.cache_config.num_cpu_blocks = num_cpu_blocks
self.collective_rpc("initialize_cache", args=(num_gpu_blocks, num_cpu_blocks)) self.collective_rpc("initialize_cache", args=(num_gpu_blocks, num_cpu_blocks))
......
...@@ -41,6 +41,7 @@ from vllm.distributed.parallel_state import ( ...@@ -41,6 +41,7 @@ from vllm.distributed.parallel_state import (
) )
from vllm.envs import enable_envs_cache from vllm.envs import enable_envs_cache
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.platforms import current_platform
from vllm.tracing import instrument, maybe_init_worker_tracer from vllm.tracing import instrument, maybe_init_worker_tracer
from vllm.utils.network_utils import ( from vllm.utils.network_utils import (
get_distributed_init_method, get_distributed_init_method,
...@@ -579,6 +580,9 @@ class WorkerProc: ...@@ -579,6 +580,9 @@ class WorkerProc:
self._init_message_queues(input_shm_handle, vllm_config) self._init_message_queues(input_shm_handle, vllm_config)
self.worker.load_model() self.worker.load_model()
# Set block size based on the attention backends
current_platform.update_block_size_for_backend(vllm_config)
# Enable environment variable cache (e.g. assume no more # Enable environment variable cache (e.g. assume no more
# environment variable overrides after this point) # environment variable overrides after this point)
enable_envs_cache() enable_envs_cache()
......
...@@ -385,6 +385,11 @@ class RayDistributedExecutor(Executor): ...@@ -385,6 +385,11 @@ class RayDistributedExecutor(Executor):
self.collective_rpc("init_device") self.collective_rpc("init_device")
self.collective_rpc("load_model") self.collective_rpc("load_model")
def _update_block_size(worker):
current_platform.update_block_size_for_backend(worker.vllm_config)
self.collective_rpc(_update_block_size)
for pp_rank in range(self.parallel_config.pipeline_parallel_size): for pp_rank in range(self.parallel_config.pipeline_parallel_size):
self.pp_tp_workers.append([]) self.pp_tp_workers.append([])
for tp_rank in range(self.parallel_config.tensor_parallel_size): for tp_rank in range(self.parallel_config.tensor_parallel_size):
......
...@@ -12,6 +12,7 @@ import torch.distributed as dist ...@@ -12,6 +12,7 @@ import torch.distributed as dist
import vllm.envs as envs import vllm.envs as envs
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.platforms import current_platform
from vllm.utils.network_utils import get_distributed_init_method, get_ip, get_open_port from vllm.utils.network_utils import get_distributed_init_method, get_ip, get_open_port
from vllm.v1.core.sched.output import GrammarOutput, SchedulerOutput from vllm.v1.core.sched.output import GrammarOutput, SchedulerOutput
from vllm.v1.engine import ReconfigureDistributedRequest, ReconfigureRankType from vllm.v1.engine import ReconfigureDistributedRequest, ReconfigureRankType
...@@ -46,6 +47,7 @@ class UniProcExecutor(Executor): ...@@ -46,6 +47,7 @@ class UniProcExecutor(Executor):
self.driver_worker.init_worker(all_kwargs=[kwargs]) self.driver_worker.init_worker(all_kwargs=[kwargs])
self.driver_worker.init_device() self.driver_worker.init_device()
self.driver_worker.load_model() self.driver_worker.load_model()
current_platform.update_block_size_for_backend(self.vllm_config)
def _distributed_args(self) -> tuple[str, int, int]: def _distributed_args(self) -> tuple[str, int, int]:
"""Return (distributed_init_method, rank, local_rank).""" """Return (distributed_init_method, rank, local_rank)."""
......
...@@ -513,6 +513,7 @@ class GPUModelRunner( ...@@ -513,6 +513,7 @@ class GPUModelRunner(
custom_logitsprocs: Sequence[str | type[LogitsProcessor]] = ( custom_logitsprocs: Sequence[str | type[LogitsProcessor]] = (
tuple(logits_processors) if logits_processors is not None else () tuple(logits_processors) if logits_processors is not None else ()
) )
placeholder_block_size = self.cache_config.block_size or 16
self.input_batch = InputBatch( self.input_batch = InputBatch(
max_num_reqs=self.max_num_reqs, max_num_reqs=self.max_num_reqs,
# We need to use the encoder length for encoder-decoder
...@@ -522,8 +523,8 @@ class GPUModelRunner( ...@@ -522,8 +523,8 @@ class GPUModelRunner(
device=self.device, device=self.device,
pin_memory=self.pin_memory, pin_memory=self.pin_memory,
vocab_size=self.model_config.get_vocab_size(), vocab_size=self.model_config.get_vocab_size(),
block_sizes=[self.cache_config.block_size], block_sizes=[placeholder_block_size],
kernel_block_sizes=[self.cache_config.block_size], kernel_block_sizes=[placeholder_block_size],
is_spec_decode=bool(self.vllm_config.speculative_config), is_spec_decode=bool(self.vllm_config.speculative_config),
logitsprocs=build_logitsprocs( logitsprocs=build_logitsprocs(
self.vllm_config, self.vllm_config,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment