Commit fcfc474d authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.8.3' into v0.8.3-dev

parents bb94d2e5 296c6572
...@@ -726,15 +726,13 @@ def add_cli_args(parser: argparse.ArgumentParser): ...@@ -726,15 +726,13 @@ def add_cli_args(parser: argparse.ArgumentParser):
default="ttft,tpot,itl", default="ttft,tpot,itl",
help="Comma-seperated list of selected metrics to report percentils. " help="Comma-seperated list of selected metrics to report percentils. "
"This argument specifies the metrics to report percentiles. " "This argument specifies the metrics to report percentiles. "
"Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\". " "Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\". ")
"Default value is \"ttft,tpot,itl\".")
parser.add_argument( parser.add_argument(
"--metric-percentiles", "--metric-percentiles",
type=str, type=str,
default="99", default="99",
help="Comma-seperated list of percentiles for selected metrics. " help="Comma-seperated list of percentiles for selected metrics. "
"To report 25-th, 50-th, and 75-th percentiles, use \"25,50,75\". " "To report 25-th, 50-th, and 75-th percentiles, use \"25,50,75\". "
"Default value is \"99\". "
"Use \"--percentile-metrics\" to select metrics.", "Use \"--percentile-metrics\" to select metrics.",
) )
parser.add_argument( parser.add_argument(
......
...@@ -381,8 +381,8 @@ class VllmBackend: ...@@ -381,8 +381,8 @@ class VllmBackend:
with open(filepath) as f: with open(filepath) as f:
hash_content.append(f.read()) hash_content.append(f.read())
import hashlib import hashlib
code_hash = hashlib.md5( code_hash = hashlib.md5("\n".join(hash_content).encode(),
"\n".join(hash_content).encode()).hexdigest() usedforsecurity=False).hexdigest()
factors.append(code_hash) factors.append(code_hash)
# 3. compiler hash # 3. compiler hash
...@@ -390,7 +390,8 @@ class VllmBackend: ...@@ -390,7 +390,8 @@ class VllmBackend:
factors.append(compiler_hash) factors.append(compiler_hash)
# combine all factors to generate the cache dir # combine all factors to generate the cache dir
hash_key = hashlib.md5(str(factors).encode()).hexdigest()[:10] hash_key = hashlib.md5(str(factors).encode(),
usedforsecurity=False).hexdigest()[:10]
cache_dir = os.path.join( cache_dir = os.path.join(
envs.VLLM_CACHE_ROOT, envs.VLLM_CACHE_ROOT,
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
import contextlib
import copy import copy
import hashlib import hashlib
import importlib.metadata
import os import os
from contextlib import ExitStack from contextlib import ExitStack
from typing import Any, Callable, Dict, List, Optional, Tuple from typing import Any, Callable, Dict, List, Optional, Tuple
...@@ -9,6 +11,7 @@ from unittest.mock import patch ...@@ -9,6 +11,7 @@ from unittest.mock import patch
import torch import torch
import torch._inductor.compile_fx import torch._inductor.compile_fx
import torch.fx as fx import torch.fx as fx
from packaging.version import Version
from vllm.config import VllmConfig from vllm.config import VllmConfig
...@@ -139,10 +142,12 @@ class InductorAdaptor(CompilerInterface): ...@@ -139,10 +142,12 @@ class InductorAdaptor(CompilerInterface):
from torch._inductor.codecache import torch_key from torch._inductor.codecache import torch_key
torch_factors = torch_key() torch_factors = torch_key()
factors.append(torch_factors) factors.append(torch_factors)
hash_str = hashlib.md5(str(factors).encode()).hexdigest()[:10] hash_str = hashlib.md5(str(factors).encode(),
usedforsecurity=False).hexdigest()[:10]
return hash_str return hash_str
def initialize_cache(self, cache_dir: str, disable_cache: bool = False): def initialize_cache(self, cache_dir: str, disable_cache: bool = False):
self.cache_dir = cache_dir
if disable_cache: if disable_cache:
return return
# redirect the cache directory to a sub-directory # redirect the cache directory to a sub-directory
...@@ -155,7 +160,6 @@ class InductorAdaptor(CompilerInterface): ...@@ -155,7 +160,6 @@ class InductorAdaptor(CompilerInterface):
triton_cache = os.path.join(cache_dir, "triton_cache") triton_cache = os.path.join(cache_dir, "triton_cache")
os.makedirs(triton_cache, exist_ok=True) os.makedirs(triton_cache, exist_ok=True)
os.environ["TRITON_CACHE_DIR"] = triton_cache os.environ["TRITON_CACHE_DIR"] = triton_cache
self.cache_dir = cache_dir
def compile( def compile(
self, self,
...@@ -228,7 +232,20 @@ class InductorAdaptor(CompilerInterface): ...@@ -228,7 +232,20 @@ class InductorAdaptor(CompilerInterface):
inductor_compiled_graph = output inductor_compiled_graph = output
if inductor_compiled_graph is not None: if inductor_compiled_graph is not None:
nonlocal file_path nonlocal file_path
file_path = inductor_compiled_graph.current_callable.__code__.co_filename # noqa compiled_fn = inductor_compiled_graph.current_callable
file_path = compiled_fn.__code__.co_filename # noqa
if not file_path.startswith(self.cache_dir):
# hooked in the align_inputs_from_check_idxs function
# in torch/_inductor/utils.py
for cell in compiled_fn.__closure__:
if not callable(cell.cell_contents):
continue
code = cell.cell_contents.__code__
if code.co_filename.startswith(self.cache_dir):
# this is the real file path
# compiled from Inductor
file_path = code.co_filename
break
hash_str = inductor_compiled_graph._fx_graph_cache_key hash_str = inductor_compiled_graph._fx_graph_cache_key
return output return output
...@@ -271,6 +288,9 @@ class InductorAdaptor(CompilerInterface): ...@@ -271,6 +288,9 @@ class InductorAdaptor(CompilerInterface):
"torch._inductor.codecache.FxGraphCache._check_can_cache", "torch._inductor.codecache.FxGraphCache._check_can_cache",
_check_can_cache)) _check_can_cache))
# Dynamo metrics context, see method for more details.
stack.enter_context(self.metrics_context())
compiled_graph = compile_fx( compiled_graph = compile_fx(
graph, graph,
example_inputs, example_inputs,
...@@ -295,8 +315,14 @@ class InductorAdaptor(CompilerInterface): ...@@ -295,8 +315,14 @@ class InductorAdaptor(CompilerInterface):
hash_str = handle[0] hash_str = handle[0]
from torch._inductor.codecache import FxGraphCache from torch._inductor.codecache import FxGraphCache
with patch("torch._inductor.codecache.FxGraphCache._get_shape_env", with ExitStack() as exit_stack:
lambda *args, **kwargs: AlwaysHitShapeEnv()): exit_stack.enter_context(
patch("torch._inductor.codecache.FxGraphCache._get_shape_env",
lambda *args, **kwargs: AlwaysHitShapeEnv()))
# Dynamo metrics context, see method for more details.
exit_stack.enter_context(self.metrics_context())
if torch.__version__.startswith("2.5"): if torch.__version__.startswith("2.5"):
inductor_compiled_graph = FxGraphCache._lookup_graph( inductor_compiled_graph = FxGraphCache._lookup_graph(
hash_str, example_inputs, True, False) hash_str, example_inputs, True, False)
...@@ -337,6 +363,28 @@ class InductorAdaptor(CompilerInterface): ...@@ -337,6 +363,28 @@ class InductorAdaptor(CompilerInterface):
return compiled_graph return compiled_graph
def metrics_context(self) -> contextlib.AbstractContextManager:
"""
This method returns the Dynamo metrics context (if it exists,
otherwise a null context). It is used by various compile components.
Present in torch>=2.6, it's used inside FxGraphCache in
torch==2.6 (but not after). It might also be used in various other
torch.compile internal functions.
Because it is re-entrant, we always set it (even if entering via Dynamo
and the context was already entered). We might want to revisit if it
should be set at a different level of compilation.
This is likely a bug in PyTorch: public APIs should not rely on
manually setting up internal contexts. But we also rely on non-public
APIs which might not provide these guarantees.
"""
if Version(importlib.metadata.version('torch')) >= Version("2.6"):
import torch._dynamo.utils
return torch._dynamo.utils.get_metrics_context()
else:
return contextlib.nullcontext()
class EagerAdaptor(CompilerInterface): class EagerAdaptor(CompilerInterface):
name = "eager" name = "eager"
......
...@@ -4,8 +4,6 @@ from typing import Callable, Dict, List, NamedTuple, Optional, Tuple ...@@ -4,8 +4,6 @@ from typing import Callable, Dict, List, NamedTuple, Optional, Tuple
import torch import torch
import torch._inductor.pattern_matcher as pm import torch._inductor.pattern_matcher as pm
# TODO(luka) use vllm.utils once #10836 landed
from compressed_tensors.quantization import FP8_DTYPE
from torch import fx from torch import fx
from torch._higher_order_ops.auto_functionalize import auto_functionalized from torch._higher_order_ops.auto_functionalize import auto_functionalized
from torch._inductor.pattern_matcher import PatternMatcherPass from torch._inductor.pattern_matcher import PatternMatcherPass
...@@ -13,12 +11,14 @@ from torch._ops import OpOverload ...@@ -13,12 +11,14 @@ from torch._ops import OpOverload
from vllm.config import CompilationConfig from vllm.config import CompilationConfig
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.platforms import current_platform
from .fx_utils import find_getitem_maybe from .fx_utils import find_getitem_maybe
from .multi_output_match import MultiOutputMatch from .multi_output_match import MultiOutputMatch
from .vllm_inductor_pass import VllmInductorPass from .vllm_inductor_pass import VllmInductorPass
logger = init_logger(__name__) logger = init_logger(__name__)
FP8_DTYPE = current_platform.fp8_dtype()
def empty_bf16(*args, **kwargs): def empty_bf16(*args, **kwargs):
......
...@@ -29,7 +29,7 @@ from vllm.logger import init_logger ...@@ -29,7 +29,7 @@ from vllm.logger import init_logger
from vllm.model_executor.layers.quantization import (QUANTIZATION_METHODS, from vllm.model_executor.layers.quantization import (QUANTIZATION_METHODS,
get_quantization_config) get_quantization_config)
from vllm.model_executor.models import ModelRegistry from vllm.model_executor.models import ModelRegistry
from vllm.platforms import CpuArchEnum from vllm.platforms import CpuArchEnum, current_platform
from vllm.sampling_params import GuidedDecodingParams from vllm.sampling_params import GuidedDecodingParams
from vllm.tracing import is_otel_available, otel_import_error_traceback from vllm.tracing import is_otel_available, otel_import_error_traceback
from vllm.transformers_utils.config import ( from vllm.transformers_utils.config import (
...@@ -38,9 +38,10 @@ from vllm.transformers_utils.config import ( ...@@ -38,9 +38,10 @@ from vllm.transformers_utils.config import (
get_sentence_transformer_tokenizer_config, is_encoder_decoder, get_sentence_transformer_tokenizer_config, is_encoder_decoder,
try_get_generation_config, uses_mrope) try_get_generation_config, uses_mrope)
from vllm.transformers_utils.s3_utils import S3Model from vllm.transformers_utils.s3_utils import S3Model
from vllm.transformers_utils.utils import is_s3 from vllm.transformers_utils.utils import is_s3, maybe_model_redirect
from vllm.utils import (GiB_bytes, LayerBlockType, cuda_device_count_stateless, from vllm.utils import (GiB_bytes, LayerBlockType, cuda_device_count_stateless,
get_cpu_memory, random_uuid, resolve_obj_by_qualname) get_cpu_memory, get_open_port, random_uuid,
resolve_obj_by_qualname)
if TYPE_CHECKING: if TYPE_CHECKING:
from ray.util.placement_group import PlacementGroup from ray.util.placement_group import PlacementGroup
...@@ -221,6 +222,9 @@ class ModelConfig: ...@@ -221,6 +222,9 @@ class ModelConfig:
factors.append(self.trust_remote_code) factors.append(self.trust_remote_code)
factors.append(self.rope_scaling) factors.append(self.rope_scaling)
factors.append(self.rope_theta) factors.append(self.rope_theta)
# rope cos/sin cache depends on the max_position_embeddings
factors.append(
getattr(self.hf_config, "max_position_embeddings", "None"))
return hashlib.sha256(str(factors).encode()).hexdigest() return hashlib.sha256(str(factors).encode()).hexdigest()
def __init__( def __init__(
...@@ -263,9 +267,13 @@ class ModelConfig: ...@@ -263,9 +267,13 @@ class ModelConfig:
override_generation_config: Optional[dict[str, Any]] = None, override_generation_config: Optional[dict[str, Any]] = None,
model_impl: Union[str, ModelImpl] = ModelImpl.AUTO, model_impl: Union[str, ModelImpl] = ModelImpl.AUTO,
) -> None: ) -> None:
self.model = model self.model = maybe_model_redirect(model)
self.tokenizer = maybe_model_redirect(tokenizer)
self.hf_config_path = hf_config_path self.hf_config_path = hf_config_path
self.tokenizer = tokenizer if isinstance(hf_config_path, str):
self.hf_config_path = maybe_model_redirect(hf_config_path)
self.tokenizer_mode = tokenizer_mode self.tokenizer_mode = tokenizer_mode
self.trust_remote_code = trust_remote_code self.trust_remote_code = trust_remote_code
self.allowed_local_media_path = allowed_local_media_path self.allowed_local_media_path = allowed_local_media_path
...@@ -309,8 +317,8 @@ class ModelConfig: ...@@ -309,8 +317,8 @@ class ModelConfig:
) and backend == "FLASHINFER" and find_spec("flashinfer") is None: ) and backend == "FLASHINFER" and find_spec("flashinfer") is None:
raise ValueError( raise ValueError(
"VLLM_ATTENTION_BACKEND is set to FLASHINFER, but flashinfer " "VLLM_ATTENTION_BACKEND is set to FLASHINFER, but flashinfer "
"module was not found." "module was not found. See "
"See https://github.com/vllm-project/vllm/blob/main/Dockerfile" "https://github.com/vllm-project/vllm/blob/main/docker/Dockerfile " # noqa: E501
"for instructions on how to install it.") "for instructions on how to install it.")
# The tokenizer version is consistent with the model version by default. # The tokenizer version is consistent with the model version by default.
...@@ -346,6 +354,8 @@ class ModelConfig: ...@@ -346,6 +354,8 @@ class ModelConfig:
self.hf_config = hf_config self.hf_config = hf_config
self.hf_text_config = get_hf_text_config(self.hf_config) self.hf_text_config = get_hf_text_config(self.hf_config)
self.attention_chunk_size = getattr(self.hf_text_config,
"attention_chunk_size", None)
self.encoder_config = self._get_encoder_config() self.encoder_config = self._get_encoder_config()
self.hf_image_processor_config = get_hf_image_processor_config( self.hf_image_processor_config = get_hf_image_processor_config(
self.model, revision) self.model, revision)
...@@ -403,6 +413,7 @@ class ModelConfig: ...@@ -403,6 +413,7 @@ class ModelConfig:
self.is_attention_free = self._init_attention_free() self.is_attention_free = self._init_attention_free()
self.is_hybrid = self._init_is_hybrid() self.is_hybrid = self._init_is_hybrid()
self.has_noops = self._init_has_noops()
self.has_inner_state = self._init_has_inner_state() self.has_inner_state = self._init_has_inner_state()
if current_platform.is_neuron(): if current_platform.is_neuron():
...@@ -502,6 +513,10 @@ class ModelConfig: ...@@ -502,6 +513,10 @@ class ModelConfig:
def _init_is_hybrid(self) -> bool: def _init_is_hybrid(self) -> bool:
return self.registry.is_hybrid_model(self.architectures) return self.registry.is_hybrid_model(self.architectures)
def _init_has_noops(self) -> bool:
architectures = getattr(self.hf_config, "architectures", [])
return self.registry.is_noops_model(architectures)
def _init_has_inner_state(self) -> bool: def _init_has_inner_state(self) -> bool:
return self.registry.model_has_inner_state(self.architectures) return self.registry.model_has_inner_state(self.architectures)
...@@ -671,11 +686,19 @@ class ModelConfig: ...@@ -671,11 +686,19 @@ class ModelConfig:
self.max_seq_len_to_capture = self.max_model_len self.max_seq_len_to_capture = self.max_model_len
self.max_seq_len_to_capture = min(self.max_seq_len_to_capture, self.max_seq_len_to_capture = min(self.max_seq_len_to_capture,
self.max_model_len) self.max_model_len)
ROCM_UNSUPPORTED_MODELS = ['mllama']
if (self.hf_config.model_type in ROCM_UNSUPPORTED_MODELS
and not self.enforce_eager and current_platform.is_rocm()):
logger.warning(
"CUDA graph is not supported for %s on ROCm yet, fallback "
"to the eager mode.", self.hf_config.model_type)
self.enforce_eager = True
def _verify_bnb_config(self) -> None: def _verify_bnb_config(self) -> None:
""" """
The current version of bitsandbytes (0.44.0) with 8-bit models does not The current version of bitsandbytes (0.45.3) with 8-bit models does not
yet support CUDA graph. yet support CUDA graph.
# TODO Remove this when bitsandbytes supports.
""" """
is_bitsandbytes = self.quantization == "bitsandbytes" is_bitsandbytes = self.quantization == "bitsandbytes"
has_quantization_config = (getattr(self.hf_config, has_quantization_config = (getattr(self.hf_config,
...@@ -690,8 +713,9 @@ class ModelConfig: ...@@ -690,8 +713,9 @@ class ModelConfig:
not self.enforce_eager, not self.enforce_eager,
]): ]):
logger.warning( logger.warning(
"CUDA graph is not supported on BitAndBytes 8bit yet, " "CUDA graph is not supported on BitsAndBytes 8bit yet, "
"fallback to the eager mode.") "fallback to the eager mode.")
self.enforce_eager = True self.enforce_eager = True
def _verify_with_expert_parallelism(self) -> None: def _verify_with_expert_parallelism(self) -> None:
...@@ -746,6 +770,12 @@ class ModelConfig: ...@@ -746,6 +770,12 @@ class ModelConfig:
self, self,
parallel_config: "ParallelConfig", parallel_config: "ParallelConfig",
) -> None: ) -> None:
if parallel_config.distributed_executor_backend == "external_launcher":
assert self.seed is not None, (
"Seed must be set when using external launcher backend to "
"make sure sampling results are the same across workers.")
total_num_attention_heads = getattr(self.hf_text_config, total_num_attention_heads = getattr(self.hf_text_config,
"num_attention_heads", 0) "num_attention_heads", 0)
tensor_parallel_size = parallel_config.tensor_parallel_size tensor_parallel_size = parallel_config.tensor_parallel_size
...@@ -797,10 +827,18 @@ class ModelConfig: ...@@ -797,10 +827,18 @@ class ModelConfig:
@property @property
def is_deepseek_mla(self) -> bool: def is_deepseek_mla(self) -> bool:
return (hasattr(self.hf_text_config, "model_type")) \ if not hasattr(self.hf_text_config, "model_type"):
and (self.hf_text_config.model_type in \ return False
('deepseek_v2', 'deepseek_v3', 'deepseek_mtp'))\ elif self.hf_text_config.model_type in \
and (self.hf_text_config.kv_lora_rank is not None) ('deepseek_v2', 'deepseek_v3', 'deepseek_mtp'):
return self.hf_text_config.kv_lora_rank is not None
elif self.hf_text_config.model_type == 'eagle':
# if the model is an EAGLE module, check for the
# underlying architecture
return self.hf_text_config.model.model_type in \
('deepseek_v2', 'deepseek_v3') \
and self.hf_text_config.kv_lora_rank is not None
return False
def get_head_size(self) -> int: def get_head_size(self) -> int:
# TODO remove hard code # TODO remove hard code
...@@ -854,6 +892,14 @@ class ModelConfig: ...@@ -854,6 +892,14 @@ class ModelConfig:
return getattr(self.hf_config.attn_config, "kv_n_heads", return getattr(self.hf_config.attn_config, "kv_n_heads",
self.hf_config.num_attention_heads) self.hf_config.num_attention_heads)
if self.hf_config.model_type == "nemotron-nas":
for block in self.hf_config.block_configs:
if not block.attention.no_op:
return self.hf_config.num_attention_heads \
// block.attention.n_heads_in_group
raise RuntimeError("Couldn't determine number of kv heads")
if self.is_attention_free: if self.is_attention_free:
return 0 return 0
...@@ -922,7 +968,9 @@ class ModelConfig: ...@@ -922,7 +968,9 @@ class ModelConfig:
# This function relies on 'layers_block_type' in hf_config, # This function relies on 'layers_block_type' in hf_config,
# for w/o this attribute, we will need to have workarounds like so # for w/o this attribute, we will need to have workarounds like so
attn_block_type = block_type == LayerBlockType.attention attn_block_type = block_type == LayerBlockType.attention
is_transformer = not self.is_hybrid and not self.is_attention_free is_transformer = not self.is_hybrid and \
not self.has_noops and \
not self.is_attention_free
start, end = self.get_layers_start_end_indices(parallel_config) start, end = self.get_layers_start_end_indices(parallel_config)
if is_transformer: if is_transformer:
...@@ -933,27 +981,39 @@ class ModelConfig: ...@@ -933,27 +981,39 @@ class ModelConfig:
# Note that this code assumes there # Note that this code assumes there
# is only one type of attention-free block type. # is only one type of attention-free block type.
return 0 if attn_block_type else end - start return 0 if attn_block_type else end - start
elif self.has_noops:
block_configs = self.hf_config.block_configs
return sum(not bc.attention.no_op
for bc in block_configs[start:end])
else: else:
# Hybrid model # Hybrid model Jamba
layers_block_type_value = getattr(self.hf_config, layers_block_type_value = getattr(self.hf_config,
"layers_block_type", None) "layers_block_type", None)
if layers_block_type_value is None: if layers_block_type_value is not None:
raise ValueError("The model is an hybrid without a " if hasattr(self.hf_text_config,
"layers_block_type in the hf_config, " "model_type") and (self.hf_text_config.model_type
"cannot determine the num of " == "zamba2"):
f"{block_type.value} layers") if attn_block_type:
return sum(t == "hybrid"
if hasattr(self.hf_text_config, for t in layers_block_type_value[start:end])
"model_type") and (self.hf_text_config.model_type else:
== "zamba2"): return self.get_num_layers(parallel_config)
if attn_block_type: return sum(t == block_type.value
return sum(t == "hybrid" for t in layers_block_type_value[start:end])
for t in layers_block_type_value[start:end])
else: # Hybrid model Minimax
return self.get_num_layers(parallel_config) attn_type_list = getattr(self.hf_config, "attn_type_list", None)
if attn_type_list:
return sum(t == 1 for t in attn_type_list[start:end])
if layers_block_type_value is None and attn_type_list is None:
raise ValueError(
"The model is an hybrid without a"
"layers_block_type or an attn_type_list in the hf_config,"
"cannot determine the num of "
f"{block_type.value} layers")
return sum(t == block_type.value return sum(t == 1 for t in attn_type_list[start:end])
for t in layers_block_type_value[start:end])
def get_multimodal_config(self) -> "MultiModalConfig": def get_multimodal_config(self) -> "MultiModalConfig":
""" """
...@@ -1079,8 +1139,7 @@ class CacheConfig: ...@@ -1079,8 +1139,7 @@ class CacheConfig:
is_attention_free: Whether the model is attention-free. is_attention_free: Whether the model is attention-free.
num_gpu_blocks_override: Number of GPU blocks to use. This overrides the num_gpu_blocks_override: Number of GPU blocks to use. This overrides the
profiled num_gpu_blocks if specified. Does nothing if None. profiled num_gpu_blocks if specified. Does nothing if None.
sliding_window: Sliding window size for the KV cache. Can not work with sliding_window: Sliding window size for the KV cache.
prefix caching enabled.
enable_prefix_caching: Whether to enable prefix caching. enable_prefix_caching: Whether to enable prefix caching.
cpu_offload_gb: Size of the CPU offload buffer in GiB. cpu_offload_gb: Size of the CPU offload buffer in GiB.
""" """
...@@ -1100,7 +1159,8 @@ class CacheConfig: ...@@ -1100,7 +1159,8 @@ class CacheConfig:
factors: list[Any] = [] factors: list[Any] = []
factors.append(self.cache_dtype) factors.append(self.cache_dtype)
# `cpu_offload_gb` does not use `torch.compile` yet. # `cpu_offload_gb` does not use `torch.compile` yet.
hash_str = hashlib.md5(str(factors).encode()).hexdigest() hash_str = hashlib.md5(str(factors).encode(),
usedforsecurity=False).hexdigest()
return hash_str return hash_str
def __init__( def __init__(
...@@ -1113,6 +1173,7 @@ class CacheConfig: ...@@ -1113,6 +1173,7 @@ class CacheConfig:
num_gpu_blocks_override: Optional[int] = None, num_gpu_blocks_override: Optional[int] = None,
sliding_window: Optional[int] = None, sliding_window: Optional[int] = None,
enable_prefix_caching: bool = False, enable_prefix_caching: bool = False,
prefix_caching_hash_algo: str = "builtin",
cpu_offload_gb: float = 0, cpu_offload_gb: float = 0,
calculate_kv_scales: Optional[bool] = None, calculate_kv_scales: Optional[bool] = None,
) -> None: ) -> None:
...@@ -1124,6 +1185,7 @@ class CacheConfig: ...@@ -1124,6 +1185,7 @@ class CacheConfig:
self.is_attention_free = is_attention_free self.is_attention_free = is_attention_free
self.sliding_window = sliding_window self.sliding_window = sliding_window
self.enable_prefix_caching = enable_prefix_caching self.enable_prefix_caching = enable_prefix_caching
self.prefix_caching_hash_algo = prefix_caching_hash_algo
self.cpu_offload_gb = cpu_offload_gb self.cpu_offload_gb = cpu_offload_gb
self.calculate_kv_scales = calculate_kv_scales self.calculate_kv_scales = calculate_kv_scales
self._verify_args() self._verify_args()
...@@ -1174,6 +1236,13 @@ class CacheConfig: ...@@ -1174,6 +1236,13 @@ class CacheConfig:
"Prefix caching is not supported with sliding window. " "Prefix caching is not supported with sliding window. "
"Run with --disable-sliding-window to use prefix caching.") "Run with --disable-sliding-window to use prefix caching.")
if self.enable_prefix_caching and self.prefix_caching_hash_algo not in (
"builtin", "sha256"):
raise ValueError(
"Unknown prefix caching hash algorithm: "
f"{self.prefix_caching_hash_algo}. Must be either "
"'builtin' or 'sha256'.")
def verify_with_parallel_config( def verify_with_parallel_config(
self, self,
parallel_config: "ParallelConfig", parallel_config: "ParallelConfig",
...@@ -1223,7 +1292,8 @@ class TokenizerPoolConfig: ...@@ -1223,7 +1292,8 @@ class TokenizerPoolConfig:
# no factors to consider. # no factors to consider.
# this config will not affect the computation graph. # this config will not affect the computation graph.
factors: list[Any] = [] factors: list[Any] = []
hash_str = hashlib.md5(str(factors).encode()).hexdigest() hash_str = hashlib.md5(str(factors).encode(),
usedforsecurity=False).hexdigest()
return hash_str return hash_str
def __post_init__(self): def __post_init__(self):
...@@ -1334,7 +1404,8 @@ class LoadConfig: ...@@ -1334,7 +1404,8 @@ class LoadConfig:
# no factors to consider. # no factors to consider.
# this config will not affect the computation graph. # this config will not affect the computation graph.
factors: list[Any] = [] factors: list[Any] = []
hash_str = hashlib.md5(str(factors).encode()).hexdigest() hash_str = hashlib.md5(str(factors).encode(),
usedforsecurity=False).hexdigest()
return hash_str return hash_str
def __post_init__(self): def __post_init__(self):
...@@ -1362,6 +1433,8 @@ class ParallelConfig: ...@@ -1362,6 +1433,8 @@ class ParallelConfig:
tensor_parallel_size: int = 1 # Number of tensor parallel groups. tensor_parallel_size: int = 1 # Number of tensor parallel groups.
data_parallel_size: int = 1 # Number of data parallel groups. data_parallel_size: int = 1 # Number of data parallel groups.
data_parallel_rank: int = 0 # Rank of the data parallel group. data_parallel_rank: int = 0 # Rank of the data parallel group.
# Local rank of the data parallel group, defaults to global rank.
data_parallel_rank_local: Optional[int] = None
# IP of the data parallel master. # IP of the data parallel master.
data_parallel_master_ip: str = "127.0.0.1" data_parallel_master_ip: str = "127.0.0.1"
data_parallel_master_port: int = 29500 # Port of the data parallel master. data_parallel_master_port: int = 29500 # Port of the data parallel master.
...@@ -1466,10 +1539,18 @@ class ParallelConfig: ...@@ -1466,10 +1539,18 @@ class ParallelConfig:
self.world_size = self.pipeline_parallel_size * \ self.world_size = self.pipeline_parallel_size * \
self.tensor_parallel_size self.tensor_parallel_size
self.data_parallel_size = envs.VLLM_DP_SIZE if self.data_parallel_size > 1:
self.data_parallel_rank = envs.VLLM_DP_RANK # Data parallel was specified in the engine args.
self.data_parallel_master_ip = envs.VLLM_DP_MASTER_IP self.data_parallel_master_port = get_open_port()
self.data_parallel_master_port = envs.VLLM_DP_MASTER_PORT # TODO multi-node
else:
# Otherwise fall back to env vars (e.g. for offline SPMD case).
self.data_parallel_size = envs.VLLM_DP_SIZE
self.data_parallel_rank = envs.VLLM_DP_RANK
self.data_parallel_rank_local = envs.VLLM_DP_RANK_LOCAL
self.data_parallel_master_ip = envs.VLLM_DP_MASTER_IP
self.data_parallel_master_port = envs.VLLM_DP_MASTER_PORT
self.world_size_across_dp = self.world_size * self.data_parallel_size self.world_size_across_dp = self.world_size * self.data_parallel_size
if self.distributed_executor_backend == "external_launcher": if self.distributed_executor_backend == "external_launcher":
...@@ -1547,11 +1628,11 @@ class ParallelConfig: ...@@ -1547,11 +1628,11 @@ class ParallelConfig:
if self.use_ray: if self.use_ray:
from vllm.executor import ray_utils from vllm.executor import ray_utils
ray_utils.assert_ray_available() ray_utils.assert_ray_available()
# if current_platform.is_rocm(): # if not current_platform.use_custom_allreduce():
# self.disable_custom_all_reduce = True # self.disable_custom_all_reduce = True
# logger.info( # logger.info(
# "Disabled the custom all-reduce kernel because it is not " # "Disabled the custom all-reduce kernel because it is not "
# "supported on hcus.") # "supported on current platform.")
if self.ray_workers_use_nsight and not self.use_ray: if self.ray_workers_use_nsight and not self.use_ray:
raise ValueError("Unable to use nsight profiling unless workers " raise ValueError("Unable to use nsight profiling unless workers "
"run with Ray.") "run with Ray.")
...@@ -1654,7 +1735,8 @@ class SchedulerConfig: ...@@ -1654,7 +1735,8 @@ class SchedulerConfig:
# no factors to consider. # no factors to consider.
# this config will not affect the computation graph. # this config will not affect the computation graph.
factors: list[Any] = [] factors: list[Any] = []
hash_str = hashlib.md5(str(factors).encode()).hexdigest() hash_str = hashlib.md5(str(factors).encode(),
usedforsecurity=False).hexdigest()
return hash_str return hash_str
def __post_init__(self) -> None: def __post_init__(self) -> None:
...@@ -1790,7 +1872,8 @@ class DeviceConfig: ...@@ -1790,7 +1872,8 @@ class DeviceConfig:
# the device/platform information will be summarized # the device/platform information will be summarized
# by torch/vllm automatically. # by torch/vllm automatically.
factors: list[Any] = [] factors: list[Any] = []
hash_str = hashlib.md5(str(factors).encode()).hexdigest() hash_str = hashlib.md5(str(factors).encode(),
usedforsecurity=False).hexdigest()
return hash_str return hash_str
def __init__(self, device: str = "auto") -> None: def __init__(self, device: str = "auto") -> None:
...@@ -1799,7 +1882,10 @@ class DeviceConfig: ...@@ -1799,7 +1882,10 @@ class DeviceConfig:
from vllm.platforms import current_platform from vllm.platforms import current_platform
self.device_type = current_platform.device_type self.device_type = current_platform.device_type
if not self.device_type: if not self.device_type:
raise RuntimeError("Failed to infer device type") raise RuntimeError(
"Failed to infer device type, please set "
"the environment variable `VLLM_LOGGING_LEVEL=DEBUG` "
"to turn on verbose logging to help debug the issue.")
else: else:
# Device type is assigned explicitly # Device type is assigned explicitly
self.device_type = device self.device_type = device
...@@ -1963,7 +2049,8 @@ class SpeculativeConfig: ...@@ -1963,7 +2049,8 @@ class SpeculativeConfig:
# no factors to consider. # no factors to consider.
# spec decode does not use `torch.compile` yet. # spec decode does not use `torch.compile` yet.
factors: list[Any] = [] factors: list[Any] = []
hash_str = hashlib.md5(str(factors).encode()).hexdigest() hash_str = hashlib.md5(str(factors).encode(),
usedforsecurity=False).hexdigest()
return hash_str return hash_str
@classmethod @classmethod
...@@ -1985,14 +2072,13 @@ class SpeculativeConfig: ...@@ -1985,14 +2072,13 @@ class SpeculativeConfig:
def __post_init__(self): def __post_init__(self):
# Note: After next release, the method parameter will be used to # Note: "method" is a new parameter that helps to extend the
# specify the speculative method, which helps to extend the # configuration of non-model-based proposers, and the "model" parameter
# configuration of non-model-based proposers, and the model parameter # will be used to set the draft model, eagle head, or additional weight
# will be used when the draft model or head is needed. # when needed. If users do not specify "method", the speculative method
# If users do not specify the method, the speculative method will # will be detected automatically if possible. If the speculative method
# be detected automatically if possible. If the speculative method can # can not be detected, it will be considered as the "draft_model" by
# not be detected, it will be considered as the draft-model-based # default.
# method by default.
if self.model is None and self.num_speculative_tokens is not None: if self.model is None and self.num_speculative_tokens is not None:
# TODO(Shangming): Refactor mtp configuration logic when supporting # TODO(Shangming): Refactor mtp configuration logic when supporting
...@@ -2007,8 +2093,8 @@ class SpeculativeConfig: ...@@ -2007,8 +2093,8 @@ class SpeculativeConfig:
raise ValueError("num_speculative_tokens was provided without " raise ValueError("num_speculative_tokens was provided without "
"speculative model.") "speculative model.")
# Automatically configure the ngram method during configuration # Automatically configure the method for ngram when "model" is used
# refactoring to ensure a smooth transition. # instead of "method"
if self.method is None and (self.model is not None if self.method is None and (self.model is not None
and self.model in ("ngram", "[ngram]")): and self.model in ("ngram", "[ngram]")):
self.method = "ngram" self.method = "ngram"
...@@ -2090,9 +2176,10 @@ class SpeculativeConfig: ...@@ -2090,9 +2176,10 @@ class SpeculativeConfig:
# Replace hf_config for EAGLE draft_model # Replace hf_config for EAGLE draft_model
if self.method == "eagle": if self.method == "eagle":
if self.enable_chunked_prefill: if self.enable_chunked_prefill and not envs.VLLM_USE_V1:
raise ValueError( raise ValueError(
"Chunked prefill and EAGLE are not compatible.") "Chunked prefill and EAGLE are not compatible "
"when using V0.")
from vllm.transformers_utils.configs.eagle import ( from vllm.transformers_utils.configs.eagle import (
EAGLEConfig) EAGLEConfig)
...@@ -2302,12 +2389,10 @@ class SpeculativeConfig: ...@@ -2302,12 +2389,10 @@ class SpeculativeConfig:
return self.num_speculative_tokens return self.num_speculative_tokens
def __repr__(self) -> str: def __repr__(self) -> str:
if self.prompt_lookup_max is not None and self.prompt_lookup_max > 0: method = self.method
draft_model = "ngram" model = None if method == "ngram" else self.draft_model_config.model
else:
draft_model = self.draft_model_config.model
num_spec_tokens = self.num_speculative_tokens num_spec_tokens = self.num_speculative_tokens
return f"SpeculativeConfig({draft_model=}, {num_spec_tokens=})" return f"SpeculativeConfig({method=}, {model=}, {num_spec_tokens=})"
@dataclass @dataclass
...@@ -2343,7 +2428,8 @@ class LoRAConfig: ...@@ -2343,7 +2428,8 @@ class LoRAConfig:
factors.append(self.lora_extra_vocab_size) factors.append(self.lora_extra_vocab_size)
factors.append(self.long_lora_scaling_factors) factors.append(self.long_lora_scaling_factors)
factors.append(self.bias_enabled) factors.append(self.bias_enabled)
hash_str = hashlib.md5(str(factors).encode()).hexdigest() hash_str = hashlib.md5(str(factors).encode(),
usedforsecurity=False).hexdigest()
return hash_str return hash_str
def __post_init__(self): def __post_init__(self):
...@@ -2373,9 +2459,9 @@ class LoRAConfig: ...@@ -2373,9 +2459,9 @@ class LoRAConfig:
f"max_loras ({self.max_loras}) is 1") f"max_loras ({self.max_loras}) is 1")
def verify_with_cache_config(self, cache_config: CacheConfig): def verify_with_cache_config(self, cache_config: CacheConfig):
# TODO LoRA supports CPU offload. if cache_config.cpu_offload_gb > 0 and not envs.VLLM_USE_V1:
if cache_config.cpu_offload_gb > 0: raise ValueError(
raise ValueError("CPU offload is not supported with LoRA yet.") "V0 LoRA does not support CPU offload, please use V1.")
def verify_with_model_config(self, model_config: ModelConfig): def verify_with_model_config(self, model_config: ModelConfig):
if self.lora_dtype in (None, "auto"): if self.lora_dtype in (None, "auto"):
...@@ -2413,7 +2499,8 @@ class PromptAdapterConfig: ...@@ -2413,7 +2499,8 @@ class PromptAdapterConfig:
# no factors to consider. # no factors to consider.
# this config will not affect the computation graph. # this config will not affect the computation graph.
factors: list[Any] = [] factors: list[Any] = []
hash_str = hashlib.md5(str(factors).encode()).hexdigest() hash_str = hashlib.md5(str(factors).encode(),
usedforsecurity=False).hexdigest()
return hash_str return hash_str
def __post_init__(self): def __post_init__(self):
...@@ -2458,7 +2545,8 @@ class MultiModalConfig: ...@@ -2458,7 +2545,8 @@ class MultiModalConfig:
# no factors to consider. # no factors to consider.
# this config will not affect the computation graph. # this config will not affect the computation graph.
factors: list[Any] = [] factors: list[Any] = []
hash_str = hashlib.md5(str(factors).encode()).hexdigest() hash_str = hashlib.md5(str(factors).encode(),
usedforsecurity=False).hexdigest()
return hash_str return hash_str
def get_limit_per_prompt(self, modality: str) -> int: def get_limit_per_prompt(self, modality: str) -> int:
...@@ -2524,7 +2612,8 @@ class PoolerConfig: ...@@ -2524,7 +2612,8 @@ class PoolerConfig:
# no factors to consider. # no factors to consider.
# this config will not affect the computation graph. # this config will not affect the computation graph.
factors: list[Any] = [] factors: list[Any] = []
hash_str = hashlib.md5(str(factors).encode()).hexdigest() hash_str = hashlib.md5(str(factors).encode(),
usedforsecurity=False).hexdigest()
return hash_str return hash_str
@staticmethod @staticmethod
...@@ -2661,6 +2750,10 @@ def _get_and_verify_max_len( ...@@ -2661,6 +2750,10 @@ def _get_and_verify_max_len(
max_len_key = key if max_len < derived_max_model_len \ max_len_key = key if max_len < derived_max_model_len \
else max_len_key else max_len_key
derived_max_model_len = min(derived_max_model_len, max_len) derived_max_model_len = min(derived_max_model_len, max_len)
# For Command-R / Cohere, Cohere2 / Aya Vision models
if tmp_max_len := getattr(hf_config, "model_max_length", None):
max_len_key = "model_max_length"
derived_max_model_len = tmp_max_len
# If sliding window is manually disabled, max_length should be less # If sliding window is manually disabled, max_length should be less
# than the sliding window length in the model config. # than the sliding window length in the model config.
...@@ -2805,7 +2898,8 @@ class DecodingConfig: ...@@ -2805,7 +2898,8 @@ class DecodingConfig:
# no factors to consider. # no factors to consider.
# this config will not affect the computation graph. # this config will not affect the computation graph.
factors: list[Any] = [] factors: list[Any] = []
hash_str = hashlib.md5(str(factors).encode()).hexdigest() hash_str = hashlib.md5(str(factors).encode(),
usedforsecurity=False).hexdigest()
return hash_str return hash_str
def __post_init__(self): def __post_init__(self):
...@@ -2855,7 +2949,8 @@ class ObservabilityConfig: ...@@ -2855,7 +2949,8 @@ class ObservabilityConfig:
# no factors to consider. # no factors to consider.
# this config will not affect the computation graph. # this config will not affect the computation graph.
factors: list[Any] = [] factors: list[Any] = []
hash_str = hashlib.md5(str(factors).encode()).hexdigest() hash_str = hashlib.md5(str(factors).encode(),
usedforsecurity=False).hexdigest()
return hash_str return hash_str
def __post_init__(self): def __post_init__(self):
...@@ -2917,7 +3012,8 @@ class KVTransferConfig(BaseModel): ...@@ -2917,7 +3012,8 @@ class KVTransferConfig(BaseModel):
# no factors to consider. # no factors to consider.
# this config will not affect the computation graph. # this config will not affect the computation graph.
factors: list[Any] = [] factors: list[Any] = []
hash_str = hashlib.md5(str(factors).encode()).hexdigest() hash_str = hashlib.md5(str(factors).encode(),
usedforsecurity=False).hexdigest()
return hash_str return hash_str
@classmethod @classmethod
...@@ -2945,12 +3041,6 @@ class KVTransferConfig(BaseModel): ...@@ -2945,12 +3041,6 @@ class KVTransferConfig(BaseModel):
return self.kv_connector is not None and \ return self.kv_connector is not None and \
self.kv_role in ["kv_producer", "kv_consumer", "kv_both"] self.kv_role in ["kv_producer", "kv_consumer", "kv_both"]
@property
def need_kv_parallel_group(self) -> bool:
# for those database-based connector, vLLM does not need to create
# parallel group, and in that case the kv parallel size will be 1.
return self.kv_connector is not None and self.kv_parallel_size > 1
@property @property
def is_kv_producer(self) -> bool: def is_kv_producer(self) -> bool:
return self.kv_connector is not None and \ return self.kv_connector is not None and \
...@@ -3414,7 +3504,8 @@ class VllmConfig: ...@@ -3414,7 +3504,8 @@ class VllmConfig:
vllm_factors.append("None") vllm_factors.append("None")
factors.append(vllm_factors) factors.append(vllm_factors)
hash_str = hashlib.md5(str(factors).encode()).hexdigest()[:10] hash_str = hashlib.md5(str(factors).encode(),
usedforsecurity=False).hexdigest()[:10]
return hash_str return hash_str
def pad_for_cudagraph(self, batch_size: int) -> int: def pad_for_cudagraph(self, batch_size: int) -> int:
...@@ -3526,9 +3617,10 @@ class VllmConfig: ...@@ -3526,9 +3617,10 @@ class VllmConfig:
if self.cache_config is not None and \ if self.cache_config is not None and \
self.cache_config.cpu_offload_gb > 0 and \ self.cache_config.cpu_offload_gb > 0 and \
self.compilation_config.level != CompilationLevel.NO_COMPILATION: self.compilation_config.level != CompilationLevel.NO_COMPILATION \
and not envs.VLLM_USE_V1:
logger.warning( logger.warning(
"CPU offload is not supported with `torch.compile` yet." "CPU offload is not supported with `torch.compile` in v0 yet."
" Disabling `torch.compile`.") " Disabling `torch.compile`.")
self.compilation_config.level = CompilationLevel.NO_COMPILATION self.compilation_config.level = CompilationLevel.NO_COMPILATION
......
...@@ -8,6 +8,7 @@ ...@@ -8,6 +8,7 @@
# not sure why, they are created from a different context. # not sure why, they are created from a different context.
# the only successful approach is to call cuda driver API in C. # the only successful approach is to call cuda driver API in C.
import dataclasses import dataclasses
import gc
import os import os
from contextlib import contextmanager from contextlib import contextmanager
from typing import Any, Callable, Dict, Optional, Tuple, Union from typing import Any, Callable, Dict, Optional, Tuple, Union
...@@ -175,7 +176,7 @@ class CuMemAllocator: ...@@ -175,7 +176,7 @@ class CuMemAllocator:
str]] = None) -> None: str]] = None) -> None:
""" """
Put the allocator in sleep mode. Put the allocator in sleep mode.
All data in the memory allocation with the specified tag will be All data in the memory allocation with the specified tag will be
offloaded to CPU memory, and others will be discarded. offloaded to CPU memory, and others will be discarded.
:param offload_tags: The tags of the memory allocation that will be :param offload_tags: The tags of the memory allocation that will be
...@@ -204,28 +205,37 @@ class CuMemAllocator: ...@@ -204,28 +205,37 @@ class CuMemAllocator:
data.cpu_backup_tensor = cpu_backup_tensor data.cpu_backup_tensor = cpu_backup_tensor
unmap_and_release(handle) unmap_and_release(handle)
def wake_up(self): gc.collect()
torch.cuda.empty_cache()
def wake_up(self, tags: Optional[list[str]] = None) -> None:
""" """
Wake up the allocator from sleep mode. Wake up the allocator from sleep mode.
All data that is previously offloaded will be loaded back to GPU All data that is previously offloaded will be loaded back to GPU
memory, and the rest of the data will have empty memory.""" memory, and the rest of the data will have empty memory.
:param tags: The tags of the memory allocation that will be loaded
back to GPU memory. If None, all memory allocation will be loaded
back to GPU memory.
"""
for ptr, data in self.pointer_to_data.items(): for ptr, data in self.pointer_to_data.items():
handle = data.handle if tags is None or data.tag in tags:
create_and_map(handle) handle = data.handle
if data.cpu_backup_tensor is not None: create_and_map(handle)
cpu_backup_tensor = data.cpu_backup_tensor if data.cpu_backup_tensor is not None:
if cpu_backup_tensor is not None: cpu_backup_tensor = data.cpu_backup_tensor
size_in_bytes = cpu_backup_tensor.numel( if cpu_backup_tensor is not None:
) * cpu_backup_tensor.element_size() size_in_bytes = cpu_backup_tensor.numel(
cpu_ptr = cpu_backup_tensor.data_ptr() ) * cpu_backup_tensor.element_size()
libcudart.cudaMemcpy(ptr, cpu_ptr, size_in_bytes) cpu_ptr = cpu_backup_tensor.data_ptr()
data.cpu_backup_tensor = None libcudart.cudaMemcpy(ptr, cpu_ptr, size_in_bytes)
data.cpu_backup_tensor = None
@contextmanager @contextmanager
def use_memory_pool(self, tag: Optional[str] = None): def use_memory_pool(self, tag: Optional[str] = None):
""" """
A context manager to use the memory pool. A context manager to use the memory pool.
All memory allocation created inside the context will be allocated All memory allocation created inside the context will be allocated
in the memory pool, and has the specified tag. in the memory pool, and has the specified tag.
:param tag: The tag of the memory allocation. If None, the default tag :param tag: The tag of the memory allocation. If None, the default tag
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
from typing import Optional import os
from typing import List, Optional
import torch import torch
from torch.distributed import ProcessGroup from torch.distributed import ProcessGroup
from vllm.platforms import current_platform
from vllm.platforms.interface import CpuArchEnum
from .base_device_communicator import DeviceCommunicatorBase from .base_device_communicator import DeviceCommunicatorBase
...@@ -16,19 +20,120 @@ class CpuCommunicator(DeviceCommunicatorBase): ...@@ -16,19 +20,120 @@ class CpuCommunicator(DeviceCommunicatorBase):
device_group: Optional[ProcessGroup] = None, device_group: Optional[ProcessGroup] = None,
unique_name: str = ""): unique_name: str = ""):
super().__init__(cpu_group, device, device_group, unique_name) super().__init__(cpu_group, device, device_group, unique_name)
self.ipex_available = False
self.dist_module = torch.distributed self.dist_module = torch.distributed
try:
import intel_extension_for_pytorch as ipex if current_platform.get_cpu_architecture() == CpuArchEnum.X86:
self.ipex_available = True self.dist_module = _CPUSHMDistributed(self)
self.dist_module = ipex.distributed
except ImportError:
"""
Intel IPEX not found. Falling back to PyTorch native
all_reduce for CPU (e.g. MacOS)
"""
pass
def all_reduce(self, input_): def all_reduce(self, input_):
self.dist_module.all_reduce(input_, group=self.device_group) self.dist_module.all_reduce(input_, group=self.device_group)
return input_ return input_
def gather(self,
input_: torch.Tensor,
dst: int = 0,
dim: int = -1) -> Optional[torch.Tensor]:
"""
NOTE: We assume that the input tensor is on the same device across
all the ranks.
NOTE: `dst` is the local rank of the destination rank.
"""
world_size = self.world_size
assert -input_.dim() <= dim < input_.dim(), (
f"Invalid dim ({dim}) for input tensor with shape {input_.size()}")
if dim < 0:
# Convert negative dim to positive.
dim += input_.dim()
# Allocate output tensor.
if self.rank_in_group == dst:
gather_list = [torch.empty_like(input_) for _ in range(world_size)]
else:
gather_list = None
# Gather.
self.dist_module.gather(input_,
gather_list,
dst=self.ranks[dst],
group=self.device_group)
if self.rank_in_group == dst:
output_tensor = torch.cat(gather_list, dim=dim)
else:
output_tensor = None
return output_tensor
def all_gather(self, input_: torch.Tensor, dim: int = -1) -> torch.Tensor:
if dim < 0:
# Convert negative dim to positive.
dim += input_.dim()
input_size = input_.size()
# NOTE: we have to use concat-style all-gather here,
# stack-style all-gather has compatibility issues with
# torch.compile . see https://github.com/pytorch/pytorch/issues/138795
output_size = (input_size[0] * self.world_size, ) + input_size[1:]
# Allocate output tensor.
output_tensor = torch.empty(output_size,
dtype=input_.dtype,
device=input_.device)
# All-gather.
self.dist_module.all_gather_into_tensor(output_tensor,
input_,
group=self.device_group)
# Reshape
output_tensor = output_tensor.reshape((self.world_size, ) + input_size)
output_tensor = output_tensor.movedim(0, dim)
output_tensor = output_tensor.reshape(input_size[:dim] +
(self.world_size *
input_size[dim], ) +
input_size[dim + 1:])
return output_tensor
class _CPUSHMDistributed:
def __init__(self, communicator: CpuCommunicator):
instance_identifier = os.environ["VLLM_DIST_IDENT"]
self.communicator = communicator
group_ranks = [str(rank) for rank in self.communicator.ranks]
shm_group_identifier = f"[{'-'.join(group_ranks)}]"
self.group_name = f"{instance_identifier}-{shm_group_identifier}-cpushm"
self.handle = self._init_cpu_shm()
def _init_cpu_shm(self) -> int:
handle = torch.ops._C.init_shm_manager(
self.group_name,
self.communicator.world_size,
self.communicator.rank,
)
torch.distributed.barrier(self.communicator.device_group)
torch.ops._C.join_shm_manager(
handle,
self.group_name,
)
torch.distributed.barrier(self.communicator.device_group)
return handle
def all_reduce(self,
input: torch.Tensor,
group: Optional[ProcessGroup] = None) -> None:
torch.ops._C.shm_allreduce(self.handle, input)
def gather(self,
input: torch.Tensor,
gather_list: Optional[List[torch.Tensor]],
dst: int = -1,
group: Optional[ProcessGroup] = None) -> None:
# Note: different from the torch gather, here we use local dst rank.
torch.ops._C.shm_gather(self.handle, input, gather_list,
torch.distributed.get_group_rank(group, dst))
def all_gather_into_tensor(self,
output: torch.Tensor,
input: torch.Tensor,
group: Optional[ProcessGroup] = None) -> None:
torch.ops._C.shm_all_gather(self.handle, input, output)
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
import ctypes
from contextlib import contextmanager from contextlib import contextmanager
from typing import List, Optional, Union from typing import List, Optional, Union
...@@ -10,7 +9,6 @@ from torch.distributed import ProcessGroup ...@@ -10,7 +9,6 @@ from torch.distributed import ProcessGroup
import vllm.envs as envs import vllm.envs as envs
from vllm import _custom_ops as ops from vllm import _custom_ops as ops
# from vllm.distributed.device_communicators.cuda_wrapper import CudaRTLibrary
from vllm.distributed.device_communicators.custom_all_reduce_utils import ( from vllm.distributed.device_communicators.custom_all_reduce_utils import (
gpu_p2p_access_check) gpu_p2p_access_check)
from vllm.distributed.parallel_state import in_the_same_node_as from vllm.distributed.parallel_state import in_the_same_node_as
...@@ -23,7 +21,7 @@ try: ...@@ -23,7 +21,7 @@ try:
custom_ar = True custom_ar = True
except Exception: except Exception:
# For AMD GPUs and CPUs # For CPUs
custom_ar = False custom_ar = False
logger = init_logger(__name__) logger = init_logger(__name__)
...@@ -56,7 +54,7 @@ class CustomAllreduce: ...@@ -56,7 +54,7 @@ class CustomAllreduce:
def __init__(self, def __init__(self,
group: ProcessGroup, group: ProcessGroup,
device: Union[int, str, torch.device], device: Union[int, str, torch.device],
max_size=8192 * 1024 * 2) -> None: max_size=8192 * 1024) -> None:
""" """
Args: Args:
group: the process group to work on. If None, it will use the group: the process group to work on. If None, it will use the
...@@ -72,9 +70,9 @@ class CustomAllreduce: ...@@ -72,9 +70,9 @@ class CustomAllreduce:
if not custom_ar: if not custom_ar:
# disable because of missing custom allreduce library # disable because of missing custom allreduce library
# e.g. in a non-cuda environment # e.g. in a non-GPU environment
logger.warning("Custom allreduce is disabled because " logger.info("Custom allreduce is disabled because "
"of missing custom allreduce library") "of missing custom allreduce library")
return return
self.group = group self.group = group
...@@ -91,7 +89,6 @@ class CustomAllreduce: ...@@ -91,7 +89,6 @@ class CustomAllreduce:
rank = dist.get_rank(group=self.group) rank = dist.get_rank(group=self.group)
self.rank = rank self.rank = rank
self.rank = rank
world_size = dist.get_world_size(group=self.group) world_size = dist.get_world_size(group=self.group)
if world_size == 1: if world_size == 1:
# No need to initialize custom allreduce for single GPU case. # No need to initialize custom allreduce for single GPU case.
...@@ -135,10 +132,11 @@ class CustomAllreduce: ...@@ -135,10 +132,11 @@ class CustomAllreduce:
# this checks hardware and driver support for NVLink # this checks hardware and driver support for NVLink
assert current_platform.is_cuda_alike() assert current_platform.is_cuda_alike()
full_nvlink = current_platform.is_fully_connected_nvlink_or_xgmi( fully_connected = current_platform.is_fully_connected(
physical_device_ids) physical_device_ids)
if not full_nvlink: # if world_size > 2 and not fully_connected:
if not fully_connected:
logger.warning( logger.warning(
"Custom allreduce is disabled because it's not supported on" "Custom allreduce is disabled because it's not supported on"
" more than two PCIe-only GPUs. To silence this warning, " " more than two PCIe-only GPUs. To silence this warning, "
...@@ -147,12 +145,13 @@ class CustomAllreduce: ...@@ -147,12 +145,13 @@ class CustomAllreduce:
# test P2P capability, this checks software/cudaruntime support # test P2P capability, this checks software/cudaruntime support
# this is expensive to compute at the first time # this is expensive to compute at the first time
# then we cache the result # then we cache the result
# if not _can_p2p(rank, world_size): # On AMD GPU, p2p is always enabled between XGMI connected GPUs
# logger.warning( if not current_platform.is_rocm() and not _can_p2p(rank, world_size):
# "Custom allreduce is disabled because your platform lacks " logger.warning(
# "GPU P2P capability or P2P test failed. To silence this " "Custom allreduce is disabled because your platform lacks "
# "warning, specify disable_custom_all_reduce=True explicitly.") "GPU P2P capability or P2P test failed. To silence this "
# return "warning, specify disable_custom_all_reduce=True explicitly.")
return
self.disabled = False self.disabled = False
# Buffers memory are owned by this Python class and passed to C++. # Buffers memory are owned by this Python class and passed to C++.
...@@ -175,46 +174,11 @@ class CustomAllreduce: ...@@ -175,46 +174,11 @@ class CustomAllreduce:
self.max_size = max_size self.max_size = max_size
self.rank = rank self.rank = rank
self.world_size = world_size self.world_size = world_size
self.full_nvlink = full_nvlink self.fully_connected = fully_connected
self._ptr = ops.init_custom_ar(self.meta_ptrs, self.rank_data, rank, self._ptr = ops.init_custom_ar(self.meta_ptrs, self.rank_data, rank,
self.full_nvlink) self.fully_connected)
ops.register_buffer(self._ptr, self.buffer_ptrs) ops.register_buffer(self._ptr, self.buffer_ptrs)
# @staticmethod
# def create_shared_buffer(
# size_in_bytes: int,
# group: Optional[ProcessGroup] = None) -> List[int]:
# """
# Creates a shared buffer and returns a list of pointers
# representing the buffer on all processes in the group.
# """
# lib = CudaRTLibrary()
# pointer = lib.cudaMalloc(size_in_bytes)
# handle = lib.cudaIpcGetMemHandle(pointer)
# world_size = dist.get_world_size(group=group)
# rank = dist.get_rank(group=group)
# handles = [None] * world_size
# dist.all_gather_object(handles, handle, group=group)
# pointers: List[int] = []
# for i, h in enumerate(handles):
# if i == rank:
# pointers.append(pointer.value) # type: ignore
# else:
# pointers.append(
# lib.cudaIpcOpenMemHandle(h).value) # type: ignore
# return pointers
# @staticmethod
# def free_shared_buffer(pointers: List[int],
# group: Optional[ProcessGroup] = None,
# rank: Optional[int] = None) -> None:
# if rank is None:
# rank = dist.get_rank(group=group)
# lib = CudaRTLibrary()
# lib.cudaFree(ctypes.c_void_p(pointers[rank]))
@contextmanager @contextmanager
def capture(self): def capture(self):
""" """
...@@ -261,7 +225,7 @@ class CustomAllreduce: ...@@ -261,7 +225,7 @@ class CustomAllreduce:
return False return False
# for 4 or more non NVLink-capable GPUs, custom allreduce provides # for 4 or more non NVLink-capable GPUs, custom allreduce provides
# little performance improvement over NCCL. # little performance improvement over NCCL.
if self.world_size == 2 or self.full_nvlink: if self.world_size == 2 or self.fully_connected:
return inp_size < self.max_size return inp_size < self.max_size
return False return False
...@@ -312,8 +276,8 @@ class CustomAllreduce: ...@@ -312,8 +276,8 @@ class CustomAllreduce:
def __del__(self): def __del__(self):
self.close() self.close()
@staticmethod @staticmethod
def create_shared_buffer(size_in_bytes: int, def create_shared_buffer(size_in_bytes: int,
group: Optional[ProcessGroup] = None, group: Optional[ProcessGroup] = None,
...@@ -340,4 +304,3 @@ class CustomAllreduce: ...@@ -340,4 +304,3 @@ class CustomAllreduce:
if rank is None: if rank is None:
rank = dist.get_rank(group=group) rank = dist.get_rank(group=group)
ops.free_shared_buffer(pointers[rank]) ops.free_shared_buffer(pointers[rank])
...@@ -125,8 +125,13 @@ class ShmRingBuffer: ...@@ -125,8 +125,13 @@ class ShmRingBuffer:
lambda *args, **kwargs: None): lambda *args, **kwargs: None):
try: try:
self.shared_memory = shared_memory.SharedMemory(name=name) self.shared_memory = shared_memory.SharedMemory(name=name)
assert ( # See https://docs.python.org/3/library/multiprocessing.shared_memory.html # noqa
self.shared_memory.size == self.total_bytes_of_buffer) # Some platforms allocate memory based on page size,
# so the shared memory block size may be larger or equal
# to the requested size. The size parameter is ignored
# when attaching to an existing block.
assert (self.shared_memory.size
>= self.total_bytes_of_buffer)
except FileNotFoundError: except FileNotFoundError:
# we might deserialize the object in a different node # we might deserialize the object in a different node
# in this case, this object is not used, # in this case, this object is not used,
......
...@@ -22,6 +22,8 @@ if current_platform.is_tpu(): ...@@ -22,6 +22,8 @@ if current_platform.is_tpu():
import torch_xla.core.xla_model as xm import torch_xla.core.xla_model as xm
import torch_xla.runtime as xr import torch_xla.runtime as xr
from torch_xla._internal import pjrt from torch_xla._internal import pjrt
from torch_xla.distributed.xla_multiprocessing import (
create_optimized_replica_groups)
if USE_RAY: if USE_RAY:
from vllm.executor import ray_utils from vllm.executor import ray_utils
...@@ -79,9 +81,12 @@ class TpuCommunicator(DeviceCommunicatorBase): ...@@ -79,9 +81,12 @@ class TpuCommunicator(DeviceCommunicatorBase):
pjrt.initialize_multiprocess(local_rank, local_world_size) pjrt.initialize_multiprocess(local_rank, local_world_size)
xr._init_world_size_ordinal() xr._init_world_size_ordinal()
self.groups = create_optimized_replica_groups()
def all_reduce(self, input_: torch.Tensor) -> torch.Tensor: def all_reduce(self, input_: torch.Tensor) -> torch.Tensor:
return xm.all_reduce(xm.REDUCE_SUM, input_) # TODO: Remove the groups specification after XLA compiler can support
# auto-reordering the ring order for all-reduce.
return xm.all_reduce(xm.REDUCE_SUM, input_, groups=self.groups)
def all_gather(self, input_: torch.Tensor, dim: int = -1) -> torch.Tensor: def all_gather(self, input_: torch.Tensor, dim: int = -1) -> torch.Tensor:
assert dim == -1, "TPUs only support dim=-1 for all-gather." assert dim == -1, "TPUs only support dim=-1 for all-gather."
......
...@@ -53,3 +53,8 @@ KVConnectorFactory.register_connector( ...@@ -53,3 +53,8 @@ KVConnectorFactory.register_connector(
"LMCacheConnector", "LMCacheConnector",
"vllm.distributed.kv_transfer.kv_connector.lmcache_connector", "vllm.distributed.kv_transfer.kv_connector.lmcache_connector",
"LMCacheConnector") "LMCacheConnector")
KVConnectorFactory.register_connector(
"MooncakeStoreConnector",
"vllm.distributed.kv_transfer.kv_connector.mooncake_store_connector",
"MooncakeStoreConnector")
\ No newline at end of file
# SPDX-License-Identifier: Apache-2.0
"""
MooncakeStore Connector for Distributed Machine Learning Inference
The MooncakeStoreConnector transfers KV caches between prefill vLLM workers
(KV cache producer) and decode vLLM workers (KV cache consumer) using a
database-style KVStore.
"""
import hashlib
from typing import TYPE_CHECKING, List, Tuple, Union
import torch
from vllm import _custom_ops as ops
from vllm.config import VllmConfig
from vllm.distributed.kv_transfer.kv_connector.base import KVConnectorBase
from vllm.logger import init_logger
from vllm.sequence import IntermediateTensors
if TYPE_CHECKING:
from vllm.worker.model_runner import ModelInputForGPUWithSamplingMetadata
logger = init_logger(__name__)
class MooncakeStoreConnector(KVConnectorBase):
def __init__(
self,
rank: int,
local_rank: int,
config: VllmConfig,
):
self.config = config.kv_transfer_config
self.tp_size = config.parallel_config.tensor_parallel_size
self.local_tp_rank = local_rank
# Init kv_store
if self.config.kv_connector == "MooncakeStoreConnector":
# Check if MOONCAKE_CONFIG_PATH is set
import os
use_mooncake_store = os.getenv('MOONCAKE_CONFIG_PATH') is not None
if not use_mooncake_store:
raise ValueError(
"To use MooncakeStoreConnector, you need to pass the ENV: "
"'MOONCAKE_CONFIG_PATH=/path/to/mooncake_config.json'.")
else:
from vllm.distributed.kv_transfer.kv_lookup_buffer.mooncake_store import ( # noqa: E501
MooncakeStore)
logger.info(
"Initializing KVStoreConnector under kv_transfer_config %s",
self.config)
self.kv_store = MooncakeStore(config)
else:
logger.error("Can not find %s", self.config.kv_connector)
assert self.kv_store is not None
def close(self) -> None:
"""Close the buffer and release resources.
This method is responsible for cleaning up resources related to the
connector when it is no longer needed.
Raises:
NotImplementedError: This method must be implemented in subclasses.
"""
self.kv_store.close()
def send_kv_caches_and_hidden_states(
self,
model_executable: torch.nn.Module,
model_input: "ModelInputForGPUWithSamplingMetadata",
kv_caches: List[torch.Tensor],
hidden_or_intermediate_states: Union[torch.Tensor,
IntermediateTensors],
) -> None:
input_tokens_tensor = model_input.input_tokens
seq_lens = model_input.attn_metadata.seq_lens
slot_mapping_flat = model_input.attn_metadata.slot_mapping.flatten()
start_layer = model_executable.model.start_layer
end_layer = model_executable.model.end_layer
model_config = model_executable.model.config
num_heads = int(model_config.num_key_value_heads / self.tp_size)
hidden_size = model_config.hidden_size
num_attention_heads = model_config.num_attention_heads
head_size = int(hidden_size / num_attention_heads)
for idx, slen in enumerate(seq_lens):
start_pos = sum(seq_lens[:idx])
end_pos = start_pos + slen
current_tokens = input_tokens_tensor[start_pos:end_pos]
store_key_prefix = self.tensor_hash(current_tokens)
keys, values = [], []
for layer_id in range(start_layer, end_layer):
kv_cache = kv_caches[layer_id - start_layer]
key_cache = kv_cache[0].reshape(-1, num_heads, head_size)
value_cache = kv_cache[1].reshape(-1, num_heads, head_size)
current_slot_mapping = slot_mapping_flat[start_pos:end_pos]
keys.append(key_cache[current_slot_mapping].unsqueeze(0))
values.append(value_cache[current_slot_mapping].unsqueeze(0))
keys = torch.cat(keys, dim=0)
values = torch.cat(values, dim=0)
kvcache_to_sent = torch.stack((keys, values), dim=0)
store_kvcache_key = f"{store_key_prefix}_{self.local_tp_rank}"
self.kv_store.put(store_kvcache_key, kvcache_to_sent)
hidden_key = f"{store_key_prefix}_hidden_{self.local_tp_rank}"
self.kv_store.put(hidden_key,
hidden_or_intermediate_states[start_pos:end_pos])
logger.debug("[rank%d]: KV send DONE.", torch.distributed.get_rank())
def recv_kv_caches_and_hidden_states(
self, model_executable: torch.nn.Module,
model_input: "ModelInputForGPUWithSamplingMetadata",
kv_caches: List[torch.Tensor]
) -> Tuple[Union[torch.Tensor, IntermediateTensors], bool,
"ModelInputForGPUWithSamplingMetadata"]:
bypass_model_exec = True
input_tokens_tensor = model_input.input_tokens
seq_lens = model_input.attn_metadata.seq_lens
num_prefill_tokens = model_input.attn_metadata.num_prefill_tokens
slot_mapping = model_input.attn_metadata.slot_mapping.flatten()
start_layer = model_executable.model.start_layer
end_layer = model_executable.model.end_layer
hidden_or_intermediate_states_for_one_req = []
for idx, slen in enumerate(seq_lens):
start_pos = sum(seq_lens[:idx])
end_pos = start_pos + slen
if start_pos >= num_prefill_tokens:
# This can happen during inflight batching. See:
# vllm/worker/model_runner.py::_prepare_model_input_tensors:
# - input_tokens[:num_prefill_tokens] contains prefill tokens.
# - input_tokens[num_prefill_tokens:] contains decode tokens.
logger.warning("You should set --enable_chunked_prefill=False "
"and --max_num_batched_tokens "
"should be equal to max_seq_len_to_capture")
bypass_model_exec = False
assert start_pos == num_prefill_tokens
break
current_tokens = input_tokens_tensor[start_pos:end_pos]
# get roi for current seq
load_key_prefix = self.tensor_hash(current_tokens)
load_kvcache_key = f"{load_key_prefix}_{self.local_tp_rank}"
remote_kv = self.kv_store.get(load_kvcache_key)
hidden_key = f"{load_key_prefix}_hidden_{self.local_tp_rank}"
hidden = self.kv_store.get(hidden_key)
if remote_kv is None or hidden is None:
# didn't find any match.
bypass_model_exec = False
continue
num_computed_tokens = current_tokens.shape[0]
# update the end position based on how many tokens are cached.
end_pos = start_pos + num_computed_tokens
# call self.kv_store to get kv layer by layer
for layer_id in range(start_layer, end_layer):
layer = model_executable.model.layers[layer_id]
# get kvcache object
kv_cache = kv_caches[layer_id - start_layer]
key_cache, value_cache = kv_cache[0], kv_cache[1]
# get remote kvcache
remote_k, remote_v = remote_kv[0][layer_id], remote_kv[1][
layer_id]
# use ops.reshape_and_cache_flash to put kv into kvcache
ops.reshape_and_cache_flash(
remote_k.to(key_cache.device),
remote_v.to(value_cache.device),
key_cache,
value_cache,
slot_mapping[start_pos:end_pos],
layer.self_attn.attn.kv_cache_dtype,
layer.self_attn.attn._k_scale,
layer.self_attn.attn._v_scale,
)
hidden_or_intermediate_states_for_one_req.append(hidden)
if not bypass_model_exec:
logger.warning(
"[rank%d]: Failed to receive all KVs and hidden "
"states, redo model forwarding.", torch.distributed.get_rank())
hidden_or_intermediate_states = None
else:
logger.debug(
"[rank%d]: Successfully received all KVs and hidden "
"states, skip model forwarding.", torch.distributed.get_rank())
hidden_or_intermediate_states = torch.cat(
hidden_or_intermediate_states_for_one_req, dim=0)
return hidden_or_intermediate_states, bypass_model_exec, model_input
@staticmethod
def tensor_hash(tensor: torch.Tensor) -> int:
"""Calculate the hash value of the tensor."""
tensor_bytes = tensor.clone().detach().cpu().numpy().tobytes()
hash_object = hashlib.blake2b(tensor_bytes)
hash_hex = hash_object.hexdigest()
return int(hash_hex[:16], 16)
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
""" """
This file contains a new class `KVLookupBufferBase` that allows developers to This file contains a new class `KVLookupBufferBase` that allows developers to
think of KV cache operations as inserting new KV cache entries (`insert`) think of KV cache operations as inserting new KV cache entries (`insert`)
into the lookup buffer and querying existing KV caches (`drop_select`) into the lookup buffer and querying existing KV caches (`drop_select`)
from the lookup buffer. from the lookup buffer.
All distributed communications are abstracted behind this class. This file also contains a new class `KVStoreBufferBase` that allows developers
to manage the KVCache buffer as a simple key-value storage buffer with basic
put/get operations.
These classes above are abstracted behind class `KVCacheBufferBase`.
""" """
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
...@@ -14,9 +18,27 @@ from typing import List, Optional ...@@ -14,9 +18,27 @@ from typing import List, Optional
import torch import torch
class KVLookupBufferBase(ABC): class KVCacheBufferBase(ABC):
"""
Abstract base class for a KVCache buffer.
""" """
Abstract base class for a lookup buffer.
@abstractmethod
def close(self) -> None:
"""Close the buffer and release resources.
This method is responsible for cleaning up resources related to the
KVCache buffer when it is no longer needed.
Raises:
NotImplementedError: This method must be implemented in subclasses.
"""
raise NotImplementedError
class KVLookupBufferBase(KVCacheBufferBase):
"""
Abstract base class for a KVCache lookup buffer.
This class provides an abstraction for a key-value (KV) cache lookup buffer. This class provides an abstraction for a key-value (KV) cache lookup buffer.
...@@ -96,12 +118,55 @@ class KVLookupBufferBase(ABC): ...@@ -96,12 +118,55 @@ class KVLookupBufferBase(ABC):
""" """
raise NotImplementedError raise NotImplementedError
class KVStoreBufferBase(KVCacheBufferBase):
"""
Abstract base class for a KVCache storage buffer with key-value semantics.
This class provides a simple key-value storage buffer abstract with basic
put/get operations, which enables flexible KVCache transfer granular
control.
The functionality is similar to a distributed key-value store, where:
- Key: A unique string identifier for the cached entry
- Value:
- Tensor to be stored and retrieved
- None (indicating deletion or empty value)
"""
@abstractmethod
def put(
self,
key: str,
value: Optional[torch.Tensor],
) -> None:
"""Store a key-value pair in the buffer.
Args:
key (str): Unique identifier for a tensor, this tensor could be the
key cache tensor, value cache tensor, or hidden state tensor
generated during model forwarding.
value (Optional[torch.Tensor]): Tensor to be stored.
Raises:
NotImplementedError: This method must be implemented in subclasses.
"""
raise NotImplementedError
@abstractmethod @abstractmethod
def close(self) -> None: def get(
"""Close the buffer and release resources. self,
key: str,
) -> Optional[torch.Tensor]:
"""Retrieve a value from the buffer by key.
Args:
key (str): Unique identifier for a tensor, this tensor could be the
key cache tensor, value cache tensor, or hidden state tensor
generated during model forwarding.
This method is responsible for cleaning up resources related to the Returns:
lookup buffer when it is no longer needed. Optional[torch.Tensor]: Stored tensor if exists, None otherwise.
Raises: Raises:
NotImplementedError: This method must be implemented in subclasses. NotImplementedError: This method must be implemented in subclasses.
......
# SPDX-License-Identifier: Apache-2.0
"""
This file contains a new class `MooncakeStore` that allows developers to
think of KV cache transfer operations as putting new KV cache entries
into a remote KVStore-based lookup buffer and getting existing KV caches
from this remote lookup buffer.
"""
import json
import os
from dataclasses import dataclass
from typing import Optional
import torch
from safetensors.torch import load as safetensors_load
from safetensors.torch import save as safetensors_save
from vllm.config import VllmConfig
from vllm.distributed.kv_transfer.kv_lookup_buffer.base import (
KVStoreBufferBase)
from vllm.logger import init_logger
DEFAULT_GLOBAL_SEGMENT_SIZE = 3355443200 # 3.125 GiB
DEFAULT_LOCAL_BUFFER_SIZE = 1073741824 # 1.0 GiB
logger = init_logger(__name__)
@dataclass
class MooncakeStoreConfig:
local_hostname: str
metadata_server: str
global_segment_size: int
local_buffer_size: int
protocol: str
device_name: str
master_server_address: str
@staticmethod
def from_file(file_path: str) -> 'MooncakeStoreConfig':
"""Load the config from a JSON file."""
with open(file_path) as fin:
config = json.load(fin)
return MooncakeStoreConfig(
local_hostname=config.get("local_hostname"),
metadata_server=config.get("metadata_server"),
global_segment_size=config.get("global_segment_size",
DEFAULT_GLOBAL_SEGMENT_SIZE),
local_buffer_size=config.get("local_buffer_size",
DEFAULT_LOCAL_BUFFER_SIZE),
protocol=config.get("protocol", "tcp"),
device_name=config.get("device_name", ""),
master_server_address=config.get("master_server_address"),
)
@staticmethod
def load_from_env() -> 'MooncakeStoreConfig':
"""Load config from a file specified in the environment variable."""
config_file_path = os.getenv('MOONCAKE_CONFIG_PATH')
if config_file_path is None:
raise ValueError(
"The environment variable 'MOONCAKE_CONFIG_PATH' is not set.")
return MooncakeStoreConfig.from_file(config_file_path)
class MooncakeStore(KVStoreBufferBase):
def __init__(
self,
config: VllmConfig,
):
try:
from mooncake_vllm_adaptor import MooncakeDistributedStore
except ImportError as e:
raise ImportError(
"Please install mooncake by following the instructions at "
"https://github.com/kvcache-ai/Mooncake/blob/main/doc/en/build.md " # noqa: E501
"to run vLLM with MooncakeConnector.") from e
try:
self.store = MooncakeDistributedStore()
self.config = MooncakeStoreConfig.load_from_env()
logger.info("Mooncake Configuration loaded successfully.")
self.store.setup(self.config.local_hostname,
self.config.metadata_server,
self.config.global_segment_size,
self.config.local_buffer_size,
self.config.protocol, self.config.device_name,
self.config.master_server_address)
except ValueError as e:
logger.error("Configuration loading failed: %s", e)
raise
except Exception as exc:
logger.error(
"An error occurred while loading the configuration: %s", exc)
raise
def close(self):
# MooncakeDistributedStore will automatically call the destructor, so
# it is unnecessary to close it manually.
pass
def put(
self,
key: str,
value: Optional[torch.Tensor],
) -> None:
# A message queue needs to be introduced before making it asynchronous.
if value is not None:
self._put_impl(key, value)
def get(
self,
key: str,
) -> Optional[torch.Tensor]:
# A message queue needs to be introduced before making it asynchronous.
value = self._get_impl(key)
return value
def _put_impl(
self,
key: str,
value: torch.Tensor,
) -> None:
"""Put KVCache to Mooncake Store"""
device_id = value.device.index if value.device.type == 'cuda' else -1
device_tensor = torch.tensor(device_id, dtype=torch.int32)
value_bytes = safetensors_save({
"tensor": value,
"device_id": device_tensor
})
try:
self.store.put(key, value_bytes)
except TypeError as err:
logger.error("Failed to put value into Mooncake Store: %s", err)
raise TypeError("Mooncake Store Put Type Error.") from err
def _get_impl(
self,
key: str,
) -> Optional[torch.Tensor]:
"""Get KVCache from Mooncake Store"""
try:
data = self.store.get(key)
except TypeError as err:
logger.error("Failed to get value from Mooncake Store: %s", err)
raise TypeError("Mooncake Store Get Type Error.") from err
if data:
loaded_tensors = safetensors_load(data)
tensor = loaded_tensors["tensor"]
device_id_tensor = loaded_tensors["device_id"]
device_id = int(device_id_tensor.item())
device = torch.device(
'cuda', device_id) if device_id >= 0 else torch.device('cpu')
return tensor.to(device)
return None
...@@ -119,11 +119,13 @@ def all_reduce_fake(tensor: torch.Tensor, group_name: str) -> torch.Tensor: ...@@ -119,11 +119,13 @@ def all_reduce_fake(tensor: torch.Tensor, group_name: str) -> torch.Tensor:
if supports_custom_op(): if supports_custom_op():
from vllm.platforms import current_platform
direct_register_custom_op( direct_register_custom_op(
op_name="all_reduce", op_name="all_reduce",
op_func=all_reduce, op_func=all_reduce,
mutates_args=[], mutates_args=[],
fake_impl=all_reduce_fake, fake_impl=all_reduce_fake,
dispatch_key=current_platform.dispatch_key,
) )
...@@ -219,7 +221,8 @@ class GroupCoordinator: ...@@ -219,7 +221,8 @@ class GroupCoordinator:
self.cpu_group, 1 << 22, 6) self.cpu_group, 1 << 22, 6)
from vllm.platforms import current_platform from vllm.platforms import current_platform
self.use_custom_op_call = current_platform.is_cuda_alike() self.use_custom_op_call = (current_platform.is_cuda_alike()
or current_platform.is_tpu())
@property @property
def first_rank(self): def first_rank(self):
......
...@@ -15,6 +15,7 @@ import torch ...@@ -15,6 +15,7 @@ import torch
from torch.distributed import ProcessGroup, TCPStore from torch.distributed import ProcessGroup, TCPStore
from torch.distributed.distributed_c10d import (Backend, PrefixStore, from torch.distributed.distributed_c10d import (Backend, PrefixStore,
_get_default_timeout, _get_default_timeout,
_unregister_process_group,
is_nccl_available) is_nccl_available)
from torch.distributed.rendezvous import rendezvous from torch.distributed.rendezvous import rendezvous
...@@ -206,10 +207,7 @@ class StatelessProcessGroup: ...@@ -206,10 +207,7 @@ class StatelessProcessGroup:
def barrier(self): def barrier(self):
"""A barrier to synchronize all ranks.""" """A barrier to synchronize all ranks."""
for i in range(self.world_size): for i in range(self.world_size):
if i == self.rank: self.broadcast_obj(None, src=i)
self.broadcast_obj(None, src=self.rank)
else:
self.broadcast_obj(None, src=i)
@staticmethod @staticmethod
def create( def create(
...@@ -333,3 +331,15 @@ def stateless_init_torch_distributed_process_group( ...@@ -333,3 +331,15 @@ def stateless_init_torch_distributed_process_group(
pg._register_backend(device, backend_type, backend_class) pg._register_backend(device, backend_type, backend_class)
return pg return pg
def stateless_destroy_torch_distributed_process_group(
pg: ProcessGroup) -> None:
"""
Destroy ProcessGroup returned by
stateless_init_torch_distributed_process_group().
"""
# Lazy import for non-CUDA backends.
from torch.distributed.distributed_c10d import _shutdown_backend
_shutdown_backend(pg)
_unregister_process_group(pg.group_name)
...@@ -23,6 +23,7 @@ from vllm.executor.executor_base import ExecutorBase ...@@ -23,6 +23,7 @@ from vllm.executor.executor_base import ExecutorBase
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
from vllm.plugins import load_general_plugins from vllm.plugins import load_general_plugins
from vllm.reasoning import ReasoningParserManager
from vllm.test_utils import MODEL_WEIGHTS_S3_BUCKET, MODELS_ON_S3 from vllm.test_utils import MODEL_WEIGHTS_S3_BUCKET, MODELS_ON_S3
from vllm.transformers_utils.utils import check_gguf_file from vllm.transformers_utils.utils import check_gguf_file
from vllm.usage.usage_lib import UsageContext from vllm.usage.usage_lib import UsageContext
...@@ -114,10 +115,12 @@ class EngineArgs: ...@@ -114,10 +115,12 @@ class EngineArgs:
# number of P/D disaggregation (or other disaggregation) workers # number of P/D disaggregation (or other disaggregation) workers
pipeline_parallel_size: int = 1 pipeline_parallel_size: int = 1
tensor_parallel_size: int = 1 tensor_parallel_size: int = 1
data_parallel_size: int = 1
enable_expert_parallel: bool = False enable_expert_parallel: bool = False
max_parallel_loading_workers: Optional[int] = None max_parallel_loading_workers: Optional[int] = None
block_size: Optional[int] = None block_size: Optional[int] = None
enable_prefix_caching: Optional[bool] = None enable_prefix_caching: Optional[bool] = None
prefix_caching_hash_algo: str = "builtin"
disable_sliding_window: bool = False disable_sliding_window: bool = False
disable_cascade_attn: bool = False disable_cascade_attn: bool = False
use_v2_block_manager: bool = True use_v2_block_manager: bool = True
...@@ -180,23 +183,7 @@ class EngineArgs: ...@@ -180,23 +183,7 @@ class EngineArgs:
guided_decoding_backend: str = 'xgrammar' guided_decoding_backend: str = 'xgrammar'
logits_processor_pattern: Optional[str] = None logits_processor_pattern: Optional[str] = None
speculative_config: Optional[Union[str, Dict[str, Any]]] = None speculative_config: Optional[Dict[str, Any]] = None
# TODO(Shangming): Deprecate these out-of-date params after next release
speculative_model: Optional[str] = None
speculative_model_quantization: Optional[str] = None
speculative_draft_tensor_parallel_size: Optional[int] = None
num_speculative_tokens: Optional[int] = None
num_speculative_heads: Optional[int] = None
speculative_disable_mqa_scorer: Optional[bool] = False
speculative_max_model_len: Optional[int] = None
speculative_disable_by_batch_size: Optional[int] = None
ngram_prompt_lookup_max: Optional[int] = None
ngram_prompt_lookup_min: Optional[int] = None
spec_decoding_acceptance_method: str = 'rejection_sampler'
typical_acceptance_sampler_posterior_threshold: Optional[float] = None
typical_acceptance_sampler_posterior_alpha: Optional[float] = None
disable_logprobs_during_spec_decoding: Optional[bool] = None
qlora_adapter_name_or_path: Optional[str] = None qlora_adapter_name_or_path: Optional[str] = None
show_hidden_metrics_for_version: Optional[str] = None show_hidden_metrics_for_version: Optional[str] = None
...@@ -323,9 +310,7 @@ class EngineArgs: ...@@ -323,9 +310,7 @@ class EngineArgs:
parser.add_argument('--download-dir', parser.add_argument('--download-dir',
type=nullable_str, type=nullable_str,
default=EngineArgs.download_dir, default=EngineArgs.download_dir,
help='Directory to download and load the weights, ' help='Directory to download and load the weights.')
'default to the default cache dir of '
'huggingface.')
parser.add_argument( parser.add_argument(
'--load-format', '--load-format',
type=str, type=str,
...@@ -400,8 +385,7 @@ class EngineArgs: ...@@ -400,8 +385,7 @@ class EngineArgs:
'Valid backend values are "xgrammar", "guidance", and "auto". ' 'Valid backend values are "xgrammar", "guidance", and "auto". '
'With "auto", we will make opinionated choices based on request' 'With "auto", we will make opinionated choices based on request'
'contents and what the backend libraries currently support, so ' 'contents and what the backend libraries currently support, so '
'the behavior is subject to change in each release. ' 'the behavior is subject to change in each release.')
'The default is xgrammar.')
parser.add_argument( parser.add_argument(
'--logits-processor-pattern', '--logits-processor-pattern',
type=nullable_str, type=nullable_str,
...@@ -445,6 +429,14 @@ class EngineArgs: ...@@ -445,6 +429,14 @@ class EngineArgs:
type=int, type=int,
default=EngineArgs.tensor_parallel_size, default=EngineArgs.tensor_parallel_size,
help='Number of tensor parallel replicas.') help='Number of tensor parallel replicas.')
parser.add_argument('--data-parallel-size',
'-dp',
type=int,
default=EngineArgs.data_parallel_size,
help='Number of data parallel replicas. '
'MoE layers will be sharded according to the '
'product of the tensor-parallel-size and '
'data-parallel-size.')
parser.add_argument( parser.add_argument(
'--enable-expert-parallel', '--enable-expert-parallel',
action='store_true', action='store_true',
...@@ -479,6 +471,15 @@ class EngineArgs: ...@@ -479,6 +471,15 @@ class EngineArgs:
help="Enables automatic prefix caching. " help="Enables automatic prefix caching. "
"Use ``--no-enable-prefix-caching`` to disable explicitly.", "Use ``--no-enable-prefix-caching`` to disable explicitly.",
) )
parser.add_argument(
"--prefix-caching-hash-algo",
type=str,
choices=["builtin", "sha256"],
default=EngineArgs.prefix_caching_hash_algo,
help="Set the hash algorithm for prefix caching. "
"Options are 'builtin' (Python's built-in hash) or 'sha256' "
"(collision resistant but with certain overheads).",
)
parser.add_argument('--disable-sliding-window', parser.add_argument('--disable-sliding-window',
action='store_true', action='store_true',
help='Disables sliding window, ' help='Disables sliding window, '
...@@ -551,9 +552,7 @@ class EngineArgs: ...@@ -551,9 +552,7 @@ class EngineArgs:
type=int, type=int,
default=EngineArgs.max_num_partial_prefills, default=EngineArgs.max_num_partial_prefills,
help="For chunked prefill, the max number of concurrent \ help="For chunked prefill, the max number of concurrent \
partial prefills." partial prefills.")
"Defaults to 1",
)
parser.add_argument( parser.add_argument(
"--max-long-partial-prefills", "--max-long-partial-prefills",
type=int, type=int,
...@@ -562,15 +561,13 @@ class EngineArgs: ...@@ -562,15 +561,13 @@ class EngineArgs:
"than --long-prefill-token-threshold that will be prefilled " "than --long-prefill-token-threshold that will be prefilled "
"concurrently. Setting this less than --max-num-partial-prefills " "concurrently. Setting this less than --max-num-partial-prefills "
"will allow shorter prompts to jump the queue in front of longer " "will allow shorter prompts to jump the queue in front of longer "
"prompts in some cases, improving latency. Defaults to 1.") "prompts in some cases, improving latency.")
parser.add_argument( parser.add_argument(
"--long-prefill-token-threshold", "--long-prefill-token-threshold",
type=float, type=float,
default=EngineArgs.long_prefill_token_threshold, default=EngineArgs.long_prefill_token_threshold,
help="For chunked prefill, a request is considered long if the " help="For chunked prefill, a request is considered long if the "
"prompt is longer than this number of tokens. Defaults to 4%% of " "prompt is longer than this number of tokens.")
"the model's context length.",
)
parser.add_argument('--max-num-seqs', parser.add_argument('--max-num-seqs',
type=int, type=int,
default=EngineArgs.max_num_seqs, default=EngineArgs.max_num_seqs,
...@@ -658,7 +655,7 @@ class EngineArgs: ...@@ -658,7 +655,7 @@ class EngineArgs:
type=nullable_kvs, type=nullable_kvs,
default=EngineArgs.limit_mm_per_prompt, default=EngineArgs.limit_mm_per_prompt,
# The default value is given in # The default value is given in
# MultiModalRegistry.init_mm_limits_per_prompt # MultiModalConfig.get_limit_per_prompt
help=('For each multimodal plugin, limit how many ' help=('For each multimodal plugin, limit how many '
'input instances to allow for each prompt. ' 'input instances to allow for each prompt. '
'Expects a comma-separated list of items, ' 'Expects a comma-separated list of items, '
...@@ -730,8 +727,7 @@ class EngineArgs: ...@@ -730,8 +727,7 @@ class EngineArgs:
type=int, type=int,
default=EngineArgs.max_cpu_loras, default=EngineArgs.max_cpu_loras,
help=('Maximum number of LoRAs to store in CPU memory. ' help=('Maximum number of LoRAs to store in CPU memory. '
'Must be >= than max_loras. ' 'Must be >= than max_loras.'))
'Defaults to max_loras.'))
parser.add_argument( parser.add_argument(
'--fully-sharded-loras', '--fully-sharded-loras',
action='store_true', action='store_true',
...@@ -793,129 +789,10 @@ class EngineArgs: ...@@ -793,129 +789,10 @@ class EngineArgs:
help='If set, the prefill requests can be chunked based on the ' help='If set, the prefill requests can be chunked based on the '
'max_num_batched_tokens.') 'max_num_batched_tokens.')
parser.add_argument('--speculative-config', parser.add_argument('--speculative-config',
type=nullable_str, type=json.loads,
default=None, default=None,
help='The configurations for speculative decoding.' help='The configurations for speculative decoding.'
' Should be a JSON string.') ' Should be a JSON string.')
parser.add_argument(
'--speculative-model',
type=nullable_str,
default=EngineArgs.speculative_model,
help=
'The name of the draft model to be used in speculative decoding.')
# Quantization settings for speculative model.
parser.add_argument(
'--speculative-model-quantization',
type=nullable_str,
choices=[*QUANTIZATION_METHODS, None],
default=EngineArgs.speculative_model_quantization,
help='Method used to quantize the weights of speculative model. '
'If None, we first check the `quantization_config` '
'attribute in the model config file. If that is '
'None, we assume the model weights are not '
'quantized and use `dtype` to determine the data '
'type of the weights.')
parser.add_argument(
'--num-speculative-tokens',
type=int,
default=EngineArgs.num_speculative_tokens,
help='The number of speculative tokens to sample from '
'the draft model in speculative decoding.')
parser.add_argument(
'--num-speculative-heads',
type=int,
default=EngineArgs.num_speculative_heads,
help='The number of speculative heads to sample from '
'the draft model in speculative decoding.')
parser.add_argument(
'--speculative-disable-mqa-scorer',
action='store_true',
help=
'If set to True, the MQA scorer will be disabled in speculative '
' and fall back to batch expansion')
parser.add_argument(
'--speculative-draft-tensor-parallel-size',
'-spec-draft-tp',
type=int,
default=EngineArgs.speculative_draft_tensor_parallel_size,
help='Number of tensor parallel replicas for '
'the draft model in speculative decoding.')
parser.add_argument(
'--speculative-max-model-len',
type=int,
default=EngineArgs.speculative_max_model_len,
help='The maximum sequence length supported by the '
'draft model. Sequences over this length will skip '
'speculation.')
parser.add_argument(
'--speculative-disable-by-batch-size',
type=int,
default=EngineArgs.speculative_disable_by_batch_size,
help='Disable speculative decoding for new incoming requests '
'if the number of enqueue requests is larger than this value.')
parser.add_argument(
'--ngram-prompt-lookup-max',
type=int,
default=EngineArgs.ngram_prompt_lookup_max,
help='Max size of window for ngram prompt lookup in speculative '
'decoding.')
parser.add_argument(
'--ngram-prompt-lookup-min',
type=int,
default=EngineArgs.ngram_prompt_lookup_min,
help='Min size of window for ngram prompt lookup in speculative '
'decoding.')
parser.add_argument(
'--spec-decoding-acceptance-method',
type=str,
default=EngineArgs.spec_decoding_acceptance_method,
choices=['rejection_sampler', 'typical_acceptance_sampler'],
help='Specify the acceptance method to use during draft token '
'verification in speculative decoding. Two types of acceptance '
'routines are supported: '
'1) RejectionSampler which does not allow changing the '
'acceptance rate of draft tokens, '
'2) TypicalAcceptanceSampler which is configurable, allowing for '
'a higher acceptance rate at the cost of lower quality, '
'and vice versa.')
parser.add_argument(
'--typical-acceptance-sampler-posterior-threshold',
type=float,
default=EngineArgs.typical_acceptance_sampler_posterior_threshold,
help='Set the lower bound threshold for the posterior '
'probability of a token to be accepted. This threshold is '
'used by the TypicalAcceptanceSampler to make sampling decisions '
'during speculative decoding. Defaults to 0.09')
parser.add_argument(
'--typical-acceptance-sampler-posterior-alpha',
type=float,
default=EngineArgs.typical_acceptance_sampler_posterior_alpha,
help='A scaling factor for the entropy-based threshold for token '
'acceptance in the TypicalAcceptanceSampler. Typically defaults '
'to sqrt of --typical-acceptance-sampler-posterior-threshold '
'i.e. 0.3')
parser.add_argument(
'--disable-logprobs-during-spec-decoding',
action=StoreBoolean,
default=EngineArgs.disable_logprobs_during_spec_decoding,
nargs="?",
const="True",
help='If set to True, token log probabilities are not returned '
'during speculative decoding. If set to False, log probabilities '
'are returned according to the settings in SamplingParams. If '
'not specified, it defaults to True. Disabling log probabilities '
'during speculative decoding reduces latency by skipping logprob '
'calculation in proposal sampling, target sampling, and after '
'accepted tokens are determined.')
parser.add_argument('--model-loader-extra-config', parser.add_argument('--model-loader-extra-config',
type=nullable_str, type=nullable_str,
default=EngineArgs.model_loader_extra_config, default=EngineArgs.model_loader_extra_config,
...@@ -1117,7 +994,7 @@ class EngineArgs: ...@@ -1117,7 +994,7 @@ class EngineArgs:
parser.add_argument( parser.add_argument(
"--reasoning-parser", "--reasoning-parser",
type=str, type=str,
choices=["deepseek_r1"], choices=list(ReasoningParserManager.reasoning_parsers),
default=None, default=None,
help= help=
"Select the reasoning parser depending on the model that you're " "Select the reasoning parser depending on the model that you're "
...@@ -1228,58 +1105,14 @@ class EngineArgs: ...@@ -1228,58 +1105,14 @@ class EngineArgs:
This function utilizes `speculative_config` to create a This function utilizes `speculative_config` to create a
SpeculativeConfig object. The `speculative_config` can either be SpeculativeConfig object. The `speculative_config` can either be
provided as a JSON string input via CLI arguments or directly as a provided as a JSON string input via CLI arguments or directly as a
dictionary from the engine. If `speculative_config` is not set, this dictionary from the engine.
function will attempt to construct a configuration dictionary using
certain parameters, which are scheduled for deprecation in the next
release. Note that in next releases, `speculative_config` must be
provided, and the deprecated standalone speculative-related parameters
will be removed.
""" """
if self.speculative_config is None: if self.speculative_config is None:
if (self.speculative_model is None return None
and self.num_speculative_tokens is None):
return None
# TODO(Shangming): Deprecate this way of setting SpeculativeConfig,
# only allow '--speculative-config' after next release
logger.warning_once(
"Please use '--speculative-config' to set all configurations "
"related to speculative decoding. The current method of "
"specifying the model through '--speculative-model' and "
"adding related parameters (e.g., '--num-speculative-tokens') "
"separately will be deprecated in the next release.")
spec_config_dict = {
"model": self.speculative_model,
"quantization": self.speculative_model_quantization,
"max_model_len": self.speculative_max_model_len,
"draft_tensor_parallel_size":
self.speculative_draft_tensor_parallel_size,
"num_speculative_tokens": self.num_speculative_tokens,
"disable_mqa_scorer": self.speculative_disable_mqa_scorer,
"disable_by_batch_size":
self.speculative_disable_by_batch_size,
"prompt_lookup_max": self.ngram_prompt_lookup_max,
"prompt_lookup_min": self.ngram_prompt_lookup_min,
"acceptance_method": self.spec_decoding_acceptance_method,
"posterior_threshold":
self.typical_acceptance_sampler_posterior_threshold,
"posterior_alpha":
self.typical_acceptance_sampler_posterior_alpha,
"disable_logprobs": self.disable_logprobs_during_spec_decoding,
}
self.speculative_config = spec_config_dict
else:
if isinstance(self.speculative_config, str):
import ast
self.speculative_config = ast.literal_eval(
self.speculative_config)
# Note(Shangming): These parameters are not obtained from the cli arg # Note(Shangming): These parameters are not obtained from the cli arg
# '--speculative-config' and must be passed in when creating the engine # '--speculative-config' and must be passed in when creating the engine
# config. # config.
assert isinstance(self.speculative_config, dict)
self.speculative_config.update({ self.speculative_config.update({
"target_model_config": target_model_config, "target_model_config": target_model_config,
"target_parallel_config": target_parallel_config, "target_parallel_config": target_parallel_config,
...@@ -1349,6 +1182,7 @@ class EngineArgs: ...@@ -1349,6 +1182,7 @@ class EngineArgs:
num_gpu_blocks_override=self.num_gpu_blocks_override, num_gpu_blocks_override=self.num_gpu_blocks_override,
sliding_window=model_config.get_sliding_window(), sliding_window=model_config.get_sliding_window(),
enable_prefix_caching=self.enable_prefix_caching, enable_prefix_caching=self.enable_prefix_caching,
prefix_caching_hash_algo=self.prefix_caching_hash_algo,
cpu_offload_gb=self.cpu_offload_gb, cpu_offload_gb=self.cpu_offload_gb,
calculate_kv_scales=self.calculate_kv_scales, calculate_kv_scales=self.calculate_kv_scales,
) )
...@@ -1367,6 +1201,7 @@ class EngineArgs: ...@@ -1367,6 +1201,7 @@ class EngineArgs:
parallel_config = ParallelConfig( parallel_config = ParallelConfig(
pipeline_parallel_size=self.pipeline_parallel_size, pipeline_parallel_size=self.pipeline_parallel_size,
tensor_parallel_size=self.tensor_parallel_size, tensor_parallel_size=self.tensor_parallel_size,
data_parallel_size=self.data_parallel_size,
enable_expert_parallel=self.enable_expert_parallel, enable_expert_parallel=self.enable_expert_parallel,
max_parallel_loading_workers=self.max_parallel_loading_workers, max_parallel_loading_workers=self.max_parallel_loading_workers,
disable_custom_all_reduce=self.disable_custom_all_reduce, disable_custom_all_reduce=self.disable_custom_all_reduce,
...@@ -1561,7 +1396,8 @@ class EngineArgs: ...@@ -1561,7 +1396,8 @@ class EngineArgs:
# Xgrammar and Guidance are supported. # Xgrammar and Guidance are supported.
SUPPORTED_GUIDED_DECODING = [ SUPPORTED_GUIDED_DECODING = [
"xgrammar", "xgrammar:disable-any-whitespace", "guidance", "auto" "xgrammar", "xgrammar:disable-any-whitespace", "guidance",
"guidance:disable-any-whitespace", "auto"
] ]
if self.guided_decoding_backend not in SUPPORTED_GUIDED_DECODING: if self.guided_decoding_backend not in SUPPORTED_GUIDED_DECODING:
_raise_or_fallback(feature_name="--guided-decoding-backend", _raise_or_fallback(feature_name="--guided-decoding-backend",
...@@ -1603,12 +1439,6 @@ class EngineArgs: ...@@ -1603,12 +1439,6 @@ class EngineArgs:
recommend_to_remove=False) recommend_to_remove=False)
return False return False
# No CPU offloading yet.
if self.cpu_offload_gb != EngineArgs.cpu_offload_gb:
_raise_or_fallback(feature_name="--cpu-offload-gb",
recommend_to_remove=False)
return False
# Only Fp16 and Bf16 dtypes since we only support FA. # Only Fp16 and Bf16 dtypes since we only support FA.
V1_SUPPORTED_DTYPES = [torch.bfloat16, torch.float16] V1_SUPPORTED_DTYPES = [torch.bfloat16, torch.float16]
if model_config.dtype not in V1_SUPPORTED_DTYPES: if model_config.dtype not in V1_SUPPORTED_DTYPES:
...@@ -1617,7 +1447,7 @@ class EngineArgs: ...@@ -1617,7 +1447,7 @@ class EngineArgs:
return False return False
# Some quantization is not compatible with torch.compile. # Some quantization is not compatible with torch.compile.
V1_UNSUPPORTED_QUANT = ["bitsandbytes", "gguf"] V1_UNSUPPORTED_QUANT = ["gguf"]
if model_config.quantization in V1_UNSUPPORTED_QUANT: if model_config.quantization in V1_UNSUPPORTED_QUANT:
_raise_or_fallback( _raise_or_fallback(
feature_name=f"--quantization {model_config.quantization}", feature_name=f"--quantization {model_config.quantization}",
...@@ -1636,21 +1466,11 @@ class EngineArgs: ...@@ -1636,21 +1466,11 @@ class EngineArgs:
recommend_to_remove=False) recommend_to_remove=False)
return False return False
# No TransformersModel support so far.
if (model_config.model_impl == ModelImpl.TRANSFORMERS
or model_config.model_impl == "transformers"):
_raise_or_fallback(
feature_name=f"model_impl={model_config.model_impl}",
recommend_to_remove=False)
return False
# No Concurrent Partial Prefills so far. # No Concurrent Partial Prefills so far.
if (self.max_num_partial_prefills if (self.max_num_partial_prefills
!= EngineArgs.max_num_partial_prefills != EngineArgs.max_num_partial_prefills
or self.max_long_partial_prefills or self.max_long_partial_prefills
!= EngineArgs.max_long_partial_prefills != EngineArgs.max_long_partial_prefills):
or self.long_prefill_token_threshold
!= EngineArgs.long_prefill_token_threshold):
_raise_or_fallback(feature_name="Concurrent Partial Prefill", _raise_or_fallback(feature_name="Concurrent Partial Prefill",
recommend_to_remove=False) recommend_to_remove=False)
return False return False
...@@ -1662,12 +1482,22 @@ class EngineArgs: ...@@ -1662,12 +1482,22 @@ class EngineArgs:
return False return False
# Only Ngram speculative decoding so far. # Only Ngram speculative decoding so far.
if (self.speculative_model is not None is_ngram_enabled = False
or self.num_speculative_tokens is not None): is_eagle_enabled = False
if self.speculative_config is not None:
# This is supported but experimental (handled below). # This is supported but experimental (handled below).
if self.speculative_model in ("ngram", "[ngram]"): speculative_method = self.speculative_config.get("method")
pass if speculative_method:
if speculative_method in ("ngram", "[ngram]"):
is_ngram_enabled = True
elif speculative_method == "eagle":
is_eagle_enabled = True
else: else:
speculative_model = self.speculative_config.get("model")
if speculative_model in ("ngram", "[ngram]"):
is_ngram_enabled = True
if not (is_ngram_enabled or is_eagle_enabled):
# Other speculative decoding methods are not supported yet.
_raise_or_fallback(feature_name="Speculative Decoding", _raise_or_fallback(feature_name="Speculative Decoding",
recommend_to_remove=False) recommend_to_remove=False)
return False return False
...@@ -1689,9 +1519,8 @@ class EngineArgs: ...@@ -1689,9 +1519,8 @@ class EngineArgs:
_raise_or_fallback(feature_name=name, recommend_to_remove=True) _raise_or_fallback(feature_name=name, recommend_to_remove=True)
return False return False
# No support for device type other than CUDA, AMD (experiemntal) or # Platforms must decide if they can support v1 for this model
# TPU (experimental) so far. if not current_platform.supports_v1(model_config=model_config):
if not (current_platform.is_cuda_alike() or current_platform.is_tpu()):
_raise_or_fallback( _raise_or_fallback(
feature_name=f"device type={current_platform.device_type}", feature_name=f"device type={current_platform.device_type}",
recommend_to_remove=False) recommend_to_remove=False)
...@@ -1704,23 +1533,26 @@ class EngineArgs: ...@@ -1704,23 +1533,26 @@ class EngineArgs:
and _warn_or_fallback("Engine in background thread")): and _warn_or_fallback("Engine in background thread")):
return False return False
# LoRA is supported on V1, but off by default for now. # PP is supported on V1 with Ray distributed executor,
if self.enable_lora and _warn_or_fallback("LORA"): # but off for MP distributed executor for now.
if (self.pipeline_parallel_size > 1
and self.distributed_executor_backend != "ray"):
name = "Pipeline Parallelism without Ray distributed executor"
_raise_or_fallback(feature_name=name, recommend_to_remove=False)
return False return False
# PP is supported on V1, but off by default for now. # ngram is supported on V1, but off by default for now.
if self.pipeline_parallel_size > 1 and _warn_or_fallback("PP"): if is_ngram_enabled and _warn_or_fallback("ngram"):
return False return False
# ngram is supported on V1, but off by default for now. # Eagle is under development, so we don't support it yet.
if self.speculative_model in ( if is_eagle_enabled and _warn_or_fallback("Eagle"):
"ngram", "[ngram]") and _warn_or_fallback("ngram"):
return False return False
# Non-CUDA is supported on V1, but off by default for now. # Non-CUDA is supported on V1, but off by default for now.
not_cuda = not current_platform.is_cuda() not_cuda = not current_platform.is_cuda()
if not_cuda and _warn_or_fallback( # noqa: SIM103 if not_cuda and _warn_or_fallback( # noqa: SIM103
current_platform.device_type): current_platform.device_name):
return False return False
############################################################# #############################################################
...@@ -1743,7 +1575,7 @@ class EngineArgs: ...@@ -1743,7 +1575,7 @@ class EngineArgs:
is_gpu = current_platform.is_cuda() is_gpu = current_platform.is_cuda()
use_sliding_window = (model_config.get_sliding_window() use_sliding_window = (model_config.get_sliding_window()
is not None) is not None)
use_spec_decode = self.speculative_model is not None use_spec_decode = self.speculative_config is not None
if (is_gpu and not use_sliding_window and not use_spec_decode if (is_gpu and not use_sliding_window and not use_spec_decode
and not self.enable_lora and not self.enable_lora
...@@ -1771,12 +1603,22 @@ class EngineArgs: ...@@ -1771,12 +1603,22 @@ class EngineArgs:
msg = "Chunked prefill is not supported for pooling models" msg = "Chunked prefill is not supported for pooling models"
raise ValueError(msg) raise ValueError(msg)
# Disable prefix caching for multimodal models for VLLM_V0. # if using prefix caching, we must set a hash algo
if (model_config.is_multimodal_model and self.enable_prefix_caching): if self.enable_prefix_caching:
logger.warning( # Disable prefix caching for multimodal models for VLLM_V0.
"--enable-prefix-caching is not supported for multimodal " if model_config.is_multimodal_model:
"models in V0 and has been disabled.") logger.warning(
self.enable_prefix_caching = False "--enable-prefix-caching is not supported for multimodal "
"models in V0 and has been disabled.")
self.enable_prefix_caching = False
# VLLM_V0 only supports builtin hash algo for prefix caching.
if self.prefix_caching_hash_algo is None:
self.prefix_caching_hash_algo = "builtin"
elif self.prefix_caching_hash_algo == "sha256":
raise ValueError(
"sha256 is not supported for prefix caching in V0 engine. "
"Please use 'builtin'.")
# Set max_num_seqs to 256 for VLLM_V0. # Set max_num_seqs to 256 for VLLM_V0.
if self.max_num_seqs is None: if self.max_num_seqs is None:
...@@ -1792,6 +1634,10 @@ class EngineArgs: ...@@ -1792,6 +1634,10 @@ class EngineArgs:
if self.enable_prefix_caching is None: if self.enable_prefix_caching is None:
self.enable_prefix_caching = True self.enable_prefix_caching = True
# if using prefix caching, we must set a hash algo
if self.enable_prefix_caching and self.prefix_caching_hash_algo is None:
self.prefix_caching_hash_algo = "builtin"
# V1 should use the new scheduler by default. # V1 should use the new scheduler by default.
# Swap it only if this arg is set to the original V0 default # Swap it only if this arg is set to the original V0 default
if self.scheduler_cls == EngineArgs.scheduler_cls: if self.scheduler_cls == EngineArgs.scheduler_cls:
......
...@@ -303,8 +303,11 @@ class _AsyncLLMEngine(LLMEngine): ...@@ -303,8 +303,11 @@ class _AsyncLLMEngine(LLMEngine):
ctx.seq_group_metadata_list = seq_group_metadata_list ctx.seq_group_metadata_list = seq_group_metadata_list
ctx.scheduler_outputs = scheduler_outputs ctx.scheduler_outputs = scheduler_outputs
finished_requests_ids = self.scheduler[ if not scheduler_outputs.is_empty():
virtual_engine].get_and_reset_finished_requests_ids() # this will cause mamba_cache/minimax_cache failed
# to release finished_requests_ids of the last steps
finished_requests_ids = self.scheduler[
virtual_engine].get_and_reset_finished_requests_ids()
# Maybe switch from async mode to sync mode # Maybe switch from async mode to sync mode
if not allow_async_output_proc and len(ctx.output_queue) > 0: if not allow_async_output_proc and len(ctx.output_queue) > 0:
...@@ -1222,8 +1225,8 @@ class AsyncLLMEngine(EngineClient): ...@@ -1222,8 +1225,8 @@ class AsyncLLMEngine(EngineClient):
async def sleep(self, level: int = 1) -> None: async def sleep(self, level: int = 1) -> None:
self.engine.sleep(level) self.engine.sleep(level)
async def wake_up(self) -> None: async def wake_up(self, tags: Optional[list[str]] = None) -> None:
self.engine.wake_up() self.engine.wake_up(tags)
async def is_sleeping(self) -> bool: async def is_sleeping(self) -> bool:
return self.engine.is_sleeping() return self.engine.is_sleeping()
......
...@@ -8,8 +8,8 @@ from collections import deque ...@@ -8,8 +8,8 @@ from collections import deque
from contextlib import contextmanager from contextlib import contextmanager
from dataclasses import dataclass from dataclasses import dataclass
from functools import partial from functools import partial
from typing import (TYPE_CHECKING, Callable, ClassVar, Deque, Dict, Iterable, from typing import (TYPE_CHECKING, Any, Callable, ClassVar, Deque, Dict,
List, Mapping, NamedTuple, Optional) Iterable, List, Mapping, NamedTuple, Optional)
from typing import Sequence as GenericSequence from typing import Sequence as GenericSequence
from typing import Set, Type, Union, cast, overload from typing import Set, Type, Union, cast, overload
...@@ -31,8 +31,8 @@ from vllm.entrypoints.openai.logits_processors import ( ...@@ -31,8 +31,8 @@ from vllm.entrypoints.openai.logits_processors import (
get_logits_processors as get_openai_logits_processors) get_logits_processors as get_openai_logits_processors)
from vllm.executor.executor_base import ExecutorBase from vllm.executor.executor_base import ExecutorBase
from vllm.inputs import (INPUT_REGISTRY, InputRegistry, ProcessorInputs, from vllm.inputs import (INPUT_REGISTRY, InputRegistry, ProcessorInputs,
PromptType, SingletonInputsAdapter) PromptType)
from vllm.inputs.parse import is_encoder_decoder_inputs, is_token_prompt from vllm.inputs.parse import is_token_prompt, split_enc_dec_inputs
from vllm.inputs.preprocess import InputPreprocessor from vllm.inputs.preprocess import InputPreprocessor
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.logits_process import get_bad_words_logits_processors from vllm.logits_process import get_bad_words_logits_processors
...@@ -68,6 +68,7 @@ _LOCAL_LOGGING_INTERVAL_SEC = 5 ...@@ -68,6 +68,7 @@ _LOCAL_LOGGING_INTERVAL_SEC = 5
_G = TypeVar("_G", bound=BaseTokenizerGroup, default=BaseTokenizerGroup) _G = TypeVar("_G", bound=BaseTokenizerGroup, default=BaseTokenizerGroup)
_O = TypeVar("_O", RequestOutput, PoolingRequestOutput) _O = TypeVar("_O", RequestOutput, PoolingRequestOutput)
_R = TypeVar("_R", default=Any)
@dataclass @dataclass
...@@ -612,12 +613,7 @@ class LLMEngine: ...@@ -612,12 +613,7 @@ class LLMEngine:
seq_id = next(self.seq_counter) seq_id = next(self.seq_counter)
eos_token_id = self.input_preprocessor.get_eos_token_id(lora_request) eos_token_id = self.input_preprocessor.get_eos_token_id(lora_request)
if is_encoder_decoder_inputs(processed_inputs): encoder_inputs, decoder_inputs = split_enc_dec_inputs(processed_inputs)
decoder_inputs = processed_inputs["decoder"]
encoder_inputs = processed_inputs["encoder"]
else:
decoder_inputs = processed_inputs
encoder_inputs = None
seq = Sequence(seq_id, decoder_inputs, block_size, eos_token_id, seq = Sequence(seq_id, decoder_inputs, block_size, eos_token_id,
lora_request, prompt_adapter_request) lora_request, prompt_adapter_request)
...@@ -1959,10 +1955,10 @@ class LLMEngine: ...@@ -1959,10 +1955,10 @@ class LLMEngine:
"Sleep mode is not enabled in the model config") "Sleep mode is not enabled in the model config")
self.model_executor.sleep(level=level) self.model_executor.sleep(level=level)
def wake_up(self) -> None: def wake_up(self, tags: Optional[list[str]] = None) -> None:
assert self.vllm_config.model_config.enable_sleep_mode, ( assert self.vllm_config.model_config.enable_sleep_mode, (
"Sleep mode is not enabled in the model config") "Sleep mode is not enabled in the model config")
self.model_executor.wake_up() self.model_executor.wake_up(tags)
def is_sleeping(self) -> bool: def is_sleeping(self) -> bool:
return self.model_executor.is_sleeping return self.model_executor.is_sleeping
...@@ -2048,15 +2044,16 @@ class LLMEngine: ...@@ -2048,15 +2044,16 @@ class LLMEngine:
def _validate_model_inputs(self, inputs: ProcessorInputs, def _validate_model_inputs(self, inputs: ProcessorInputs,
lora_request: Optional[LoRARequest]): lora_request: Optional[LoRARequest]):
if is_encoder_decoder_inputs(inputs): encoder_inputs, decoder_inputs = split_enc_dec_inputs(inputs)
# For encoder-decoder multimodal models, the max_prompt_len
# restricts the decoder prompt length # For encoder-decoder multimodal models, the max_prompt_len
prompt_inputs = inputs["decoder" if self.model_config. # restricts the decoder prompt length
is_multimodal_model else "encoder"] if self.model_config.is_multimodal_model:
prompt_inputs = decoder_inputs
else: else:
prompt_inputs = inputs prompt_inputs = encoder_inputs or decoder_inputs
prompt_ids = SingletonInputsAdapter(prompt_inputs).prompt_token_ids prompt_ids = prompt_inputs["prompt_token_ids"]
if prompt_ids is None or len(prompt_ids) == 0: if prompt_ids is None or len(prompt_ids) == 0:
raise ValueError("Prompt cannot be empty") raise ValueError("Prompt cannot be empty")
...@@ -2101,8 +2098,9 @@ class LLMEngine: ...@@ -2101,8 +2098,9 @@ class LLMEngine:
guided_decoding.backend = guided_decoding.backend or \ guided_decoding.backend = guided_decoding.backend or \
self.decoding_config.guided_decoding_backend self.decoding_config.guided_decoding_backend
logger.debug("Reasoning backend: %s", if self.decoding_config.reasoning_backend is not None:
self.decoding_config.reasoning_backend) logger.debug("Building with reasoning backend %s",
self.decoding_config.reasoning_backend)
processor = get_local_guided_decoding_logits_processor( processor = get_local_guided_decoding_logits_processor(
guided_params=guided_decoding, guided_params=guided_decoding,
...@@ -2143,6 +2141,14 @@ class LLMEngine: ...@@ -2143,6 +2141,14 @@ class LLMEngine:
return sampling_params return sampling_params
def collective_rpc(self,
method: Union[str, Callable[..., _R]],
timeout: Optional[float] = None,
args: tuple = (),
kwargs: Optional[dict[str, Any]] = None) -> list[_R]:
return self.model_executor.collective_rpc(method, timeout, args,
kwargs)
if envs.is_set("VLLM_USE_V1") and envs.VLLM_USE_V1: if envs.is_set("VLLM_USE_V1") and envs.VLLM_USE_V1:
from vllm.v1.engine.llm_engine import LLMEngine as V1LLMEngine from vllm.v1.engine.llm_engine import LLMEngine as V1LLMEngine
......
...@@ -52,6 +52,11 @@ class Metrics: ...@@ -52,6 +52,11 @@ class Metrics:
max_model_len = vllm_config.model_config.max_model_len max_model_len = vllm_config.model_config.max_model_len
# Use this flag to hide metrics that were deprecated in
# a previous release and which will be removed future
self.show_hidden_metrics = \
vllm_config.observability_config.show_hidden_metrics
# System stats # System stats
# Scheduler State # Scheduler State
self.gauge_scheduler_running = self._gauge_cls( self.gauge_scheduler_running = self._gauge_cls(
...@@ -76,14 +81,15 @@ class Metrics: ...@@ -76,14 +81,15 @@ class Metrics:
) )
# Deprecated in 0.8 - KV cache offloading is not used in V1 # Deprecated in 0.8 - KV cache offloading is not used in V1
# TODO: in 0.9, only enable if show_hidden_metrics=True # Hidden in 0.9, due to be removed in 0.10
self.gauge_scheduler_swapped = self._gauge_cls( if self.show_hidden_metrics:
name="vllm:num_requests_swapped", self.gauge_scheduler_swapped = self._gauge_cls(
documentation=( name="vllm:num_requests_swapped",
"Number of requests swapped to CPU. " documentation=(
"DEPRECATED: KV cache offloading is not used in V1"), "Number of requests swapped to CPU. "
labelnames=labelnames, "DEPRECATED: KV cache offloading is not used in V1"),
multiprocess_mode="sum") labelnames=labelnames,
multiprocess_mode="sum")
# KV Cache Usage in % # KV Cache Usage in %
self.gauge_gpu_cache_usage = self._gauge_cls( self.gauge_gpu_cache_usage = self._gauge_cls(
...@@ -93,34 +99,33 @@ class Metrics: ...@@ -93,34 +99,33 @@ class Metrics:
multiprocess_mode="sum") multiprocess_mode="sum")
# Deprecated in 0.8 - KV cache offloading is not used in V1 # Deprecated in 0.8 - KV cache offloading is not used in V1
# TODO: in 0.9, only enable if show_hidden_metrics=True # Hidden in 0.9, due to be removed in 0.10
self.gauge_cpu_cache_usage = self._gauge_cls( if self.show_hidden_metrics:
name="vllm:cpu_cache_usage_perc", self.gauge_cpu_cache_usage = self._gauge_cls(
documentation=( name="vllm:cpu_cache_usage_perc",
"CPU KV-cache usage. 1 means 100 percent usage. " documentation=(
"DEPRECATED: KV cache offloading is not used in V1"), "CPU KV-cache usage. 1 means 100 percent usage. "
labelnames=labelnames, "DEPRECATED: KV cache offloading is not used in V1"),
multiprocess_mode="sum") labelnames=labelnames,
multiprocess_mode="sum")
# Deprecated in 0.8 - KV cache offloading is not used in V1 self.gauge_cpu_prefix_cache_hit_rate = self._gauge_cls(
# TODO: in 0.9, only enable if show_hidden_metrics=True name="vllm:cpu_prefix_cache_hit_rate",
self.gauge_cpu_prefix_cache_hit_rate = self._gauge_cls( documentation=(
name="vllm:cpu_prefix_cache_hit_rate", "CPU prefix cache block hit rate. "
documentation=( "DEPRECATED: KV cache offloading is not used in V1"),
"CPU prefix cache block hit rate. " labelnames=labelnames,
"DEPRECATED: KV cache offloading is not used in V1"), multiprocess_mode="sum")
labelnames=labelnames,
multiprocess_mode="sum")
# Deprecated in 0.8 - replaced by queries+hits counters in V1 # Deprecated in 0.8 - replaced by queries+hits counters in V1
# TODO: in 0.9, only enable if show_hidden_metrics=True # Hidden in 0.9, due to be removed in 0.10
self.gauge_gpu_prefix_cache_hit_rate = self._gauge_cls( if self.show_hidden_metrics:
name="vllm:gpu_prefix_cache_hit_rate", self.gauge_gpu_prefix_cache_hit_rate = self._gauge_cls(
documentation=("GPU prefix cache block hit rate. " name="vllm:gpu_prefix_cache_hit_rate",
"DEPRECATED: use vllm:gpu_prefix_cache_queries and " documentation=("GPU prefix cache block hit rate. "
"vllm:gpu_prefix_cache_queries in V1"), "DEPRECATED: use vllm:gpu_prefix_cache_queries "
labelnames=labelnames, "and vllm:gpu_prefix_cache_queries in V1"),
multiprocess_mode="sum") labelnames=labelnames,
multiprocess_mode="sum")
# Iteration stats # Iteration stats
self.counter_num_preemption = self._counter_cls( self.counter_num_preemption = self._counter_cls(
...@@ -198,33 +203,35 @@ class Metrics: ...@@ -198,33 +203,35 @@ class Metrics:
labelnames=labelnames, labelnames=labelnames,
buckets=request_latency_buckets) buckets=request_latency_buckets)
# Deprecated in 0.8 - duplicates vllm:request_queue_time_seconds: # Deprecated in 0.8 - duplicates vllm:request_queue_time_seconds:
# TODO: in 0.9, only enable if show_hidden_metrics=True # Hidden in 0.9, due to be removed in 0.10
self.histogram_time_in_queue_request = self._histogram_cls( if self.show_hidden_metrics:
name="vllm:time_in_queue_requests", self.histogram_time_in_queue_request = self._histogram_cls(
documentation=( name="vllm:time_in_queue_requests",
"Histogram of time the request spent in the queue in seconds. " documentation=
"DEPRECATED: use vllm:request_queue_time_seconds instead."), ("Histogram of time the request spent in the queue in seconds. "
labelnames=labelnames, "DEPRECATED: use vllm:request_queue_time_seconds instead."),
buckets=request_latency_buckets) labelnames=labelnames,
buckets=request_latency_buckets)
# Deprecated in 0.8 - use prefill/decode/inference time metrics # Deprecated in 0.8 - use prefill/decode/inference time metrics
# TODO: in 0.9, only enable if show_hidden_metrics=True # Hidden in 0.9, due to be removed in 0.10
self.histogram_model_forward_time_request = self._histogram_cls( if self.show_hidden_metrics:
name="vllm:model_forward_time_milliseconds", self.histogram_model_forward_time_request = self._histogram_cls(
documentation=( name="vllm:model_forward_time_milliseconds",
"Histogram of time spent in the model forward pass in ms. " documentation=
"DEPRECATED: use prefill/decode/inference time metrics instead." ("Histogram of time spent in the model forward pass in ms. "
), "DEPRECATED: use prefill/decode/inference time metrics instead"
labelnames=labelnames, ),
buckets=build_1_2_3_5_8_buckets(3000)) labelnames=labelnames,
self.histogram_model_execute_time_request = self._histogram_cls( buckets=build_1_2_3_5_8_buckets(3000))
name="vllm:model_execute_time_milliseconds", self.histogram_model_execute_time_request = self._histogram_cls(
documentation=( name="vllm:model_execute_time_milliseconds",
"Histogram of time spent in the model execute function in ms." documentation=
"DEPRECATED: use prefill/decode/inference time metrics instead." ("Histogram of time spent in the model execute function in ms."
), "DEPRECATED: use prefill/decode/inference time metrics instead"
labelnames=labelnames, ),
buckets=build_1_2_3_5_8_buckets(3000)) labelnames=labelnames,
buckets=build_1_2_3_5_8_buckets(3000))
# Metadata # Metadata
self.histogram_num_prompt_tokens_request = self._histogram_cls( self.histogram_num_prompt_tokens_request = self._histogram_cls(
...@@ -543,11 +550,6 @@ class PrometheusStatLogger(StatLoggerBase): ...@@ -543,11 +550,6 @@ class PrometheusStatLogger(StatLoggerBase):
self.metrics = self._metrics_cls(labelnames=list(labels.keys()), self.metrics = self._metrics_cls(labelnames=list(labels.keys()),
vllm_config=vllm_config) vllm_config=vllm_config)
# Use this flag to hide metrics that were deprecated in
# a previous release and which will be removed future
self.show_hidden_metrics = \
vllm_config.observability_config.show_hidden_metrics
def _log_gauge(self, gauge, data: Union[int, float]) -> None: def _log_gauge(self, gauge, data: Union[int, float]) -> None:
# Convenience function for logging to gauge. # Convenience function for logging to gauge.
gauge.labels(**self.labels).set(data) gauge.labels(**self.labels).set(data)
...@@ -580,18 +582,20 @@ class PrometheusStatLogger(StatLoggerBase): ...@@ -580,18 +582,20 @@ class PrometheusStatLogger(StatLoggerBase):
# System state data # System state data
self._log_gauge(self.metrics.gauge_scheduler_running, self._log_gauge(self.metrics.gauge_scheduler_running,
stats.num_running_sys) stats.num_running_sys)
self._log_gauge(self.metrics.gauge_scheduler_swapped, if self.metrics.show_hidden_metrics:
stats.num_swapped_sys) self._log_gauge(self.metrics.gauge_scheduler_swapped,
stats.num_swapped_sys)
self._log_gauge(self.metrics.gauge_scheduler_waiting, self._log_gauge(self.metrics.gauge_scheduler_waiting,
stats.num_waiting_sys) stats.num_waiting_sys)
self._log_gauge(self.metrics.gauge_gpu_cache_usage, self._log_gauge(self.metrics.gauge_gpu_cache_usage,
stats.gpu_cache_usage_sys) stats.gpu_cache_usage_sys)
self._log_gauge(self.metrics.gauge_cpu_cache_usage, if self.metrics.show_hidden_metrics:
stats.cpu_cache_usage_sys) self._log_gauge(self.metrics.gauge_cpu_cache_usage,
self._log_gauge(self.metrics.gauge_cpu_prefix_cache_hit_rate, stats.cpu_cache_usage_sys)
stats.cpu_prefix_cache_hit_rate) self._log_gauge(self.metrics.gauge_cpu_prefix_cache_hit_rate,
self._log_gauge(self.metrics.gauge_gpu_prefix_cache_hit_rate, stats.cpu_prefix_cache_hit_rate)
stats.gpu_prefix_cache_hit_rate) self._log_gauge(self.metrics.gauge_gpu_prefix_cache_hit_rate,
stats.gpu_prefix_cache_hit_rate)
# Including max-lora in metric, in future this property of lora # Including max-lora in metric, in future this property of lora
# config maybe extended to be dynamic. # config maybe extended to be dynamic.
lora_info = { lora_info = {
...@@ -629,12 +633,15 @@ class PrometheusStatLogger(StatLoggerBase): ...@@ -629,12 +633,15 @@ class PrometheusStatLogger(StatLoggerBase):
stats.time_prefill_requests) stats.time_prefill_requests)
self._log_histogram(self.metrics.histogram_decode_time_request, self._log_histogram(self.metrics.histogram_decode_time_request,
stats.time_decode_requests) stats.time_decode_requests)
self._log_histogram(self.metrics.histogram_time_in_queue_request, if self.metrics.show_hidden_metrics:
stats.time_in_queue_requests) self._log_histogram(self.metrics.histogram_time_in_queue_request,
self._log_histogram(self.metrics.histogram_model_forward_time_request, stats.time_in_queue_requests)
stats.model_forward_time_requests) self._log_histogram(
self._log_histogram(self.metrics.histogram_model_execute_time_request, self.metrics.histogram_model_forward_time_request,
stats.model_execute_time_requests) stats.model_forward_time_requests)
self._log_histogram(
self.metrics.histogram_model_execute_time_request,
stats.model_execute_time_requests)
# Metadata # Metadata
finished_reason_counter = CollectionsCounter( finished_reason_counter = CollectionsCounter(
stats.finished_reason_requests) stats.finished_reason_requests)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment