Commit cc7f22a8 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.9.1' into v0.9.1-ori

parents b9ea0c09 b6553be1
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
from collections.abc import Sequence
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Sequence
from typing import Union
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json
from collections.abc import Sequence
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json
from collections.abc import Sequence
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json
from collections.abc import Sequence
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json
from collections.abc import Sequence
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json
from collections.abc import Sequence
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import ast
import json
from collections.abc import Sequence
......@@ -7,6 +8,7 @@ from typing import Any, Union
import regex as re
from transformers import PreTrainedTokenizerBase
import vllm.envs as envs
from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
DeltaFunctionCall, DeltaMessage,
DeltaToolCall,
......@@ -64,7 +66,19 @@ class Llama4PythonicToolParser(ToolParser):
if model_output.startswith("<|python_start|>"):
model_output = model_output[len("<|python_start|>"):]
model_output = model_output.replace("<|python_end|>", "")
if not (self.TOOL_CALL_REGEX.match(model_output)):
is_tool_call_pattern = False
try:
is_tool_call_pattern = self.TOOL_CALL_REGEX.match(
model_output,
timeout=envs.VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS) is not None
except TimeoutError:
logger.warning(
"Regex timeout occurred when matching tool call pattern.")
logger.debug("Regex timeout occurred when matching user input: %s",
model_output)
if not is_tool_call_pattern:
return ExtractedToolCallInformation(tools_called=False,
tool_calls=[],
content=model_output)
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json
from collections.abc import Sequence
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json
from collections.abc import Sequence
......@@ -43,11 +44,17 @@ class MistralToolCall(ToolCall):
return id.isalnum() and len(id) == 9
def _is_fn_name_regex_support(model_tokenizer: AnyTokenizer) -> bool:
return isinstance(model_tokenizer, MistralTokenizer) \
and model_tokenizer.version >= 11
@ToolParserManager.register_module("mistral")
class MistralToolParser(ToolParser):
"""
Tool call parser for Mistral 7B Instruct v0.3, intended for use with the
examples/tool_chat_template_mistral.jinja template.
Tool call parser for Mistral 7B Instruct v0.3, intended for use with
- [`mistral_common`](https://github.com/mistralai/mistral-common/)
- the examples/tool_chat_template_mistral.jinja template.
Used when --enable-auto-tool-choice --tool-call-parser mistral are all set
"""
......@@ -69,6 +76,12 @@ class MistralToolParser(ToolParser):
self.bot_token = "[TOOL_CALLS]"
self.bot_token_id = self.vocab.get(self.bot_token)
self.tool_call_regex = re.compile(r"\[{.*}\]", re.DOTALL)
if _is_fn_name_regex_support(self.model_tokenizer):
self.fn_name_regex = re.compile(r'([a-zA-Z0-9_-]+)(\{.*?\})',
re.DOTALL)
else:
self.fn_name_regex = None
if self.bot_token_id is None:
raise RuntimeError(
"Mistral Tool Parser could not locate the tool call token in "
......@@ -108,11 +121,25 @@ class MistralToolParser(ToolParser):
tool_content = model_output.replace(self.bot_token, "").strip()
try:
# we first try to directly load the json as parsing very nested
# jsons is difficult
try:
function_call_arr = json.loads(tool_content)
if self.fn_name_regex:
matches = self.fn_name_regex.findall(tool_content)
function_call_arr = []
for match in matches:
fn_name = match[0]
args = match[1]
# fn_name is encoded outside serialized json dump
# only arguments are serialized
function_call_arr.append({
"name": fn_name,
"arguments": json.loads(args)
})
else:
function_call_arr = json.loads(tool_content)
except json.JSONDecodeError:
# use a regex to find the part corresponding to the tool call.
# NOTE: This use case should not happen if the model is trained
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json
from collections.abc import Sequence
......@@ -68,8 +69,8 @@ class Phi4MiniJsonToolParser(ToolParser):
len(function_call_arr))
except json.JSONDecodeError as e:
logger.error(
"Failed to parse function calls from model output: %s. "
"Error: %s", model_output, str(e))
"Failed to parse function calls from model output. "
"Error: %s", str(e))
tool_calls: list[ToolCall] = [
ToolCall(
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import ast
import json
......@@ -8,6 +9,7 @@ from typing import Any, Union
import regex as re
from transformers import PreTrainedTokenizerBase
import vllm.envs as envs
from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
DeltaFunctionCall, DeltaMessage,
DeltaToolCall,
......@@ -61,8 +63,18 @@ class PythonicToolParser(ToolParser):
"""
Extract the tool calls from a complete model response.
"""
if not (self.TOOL_CALL_REGEX.match(model_output)):
is_tool_call_pattern = False
try:
is_tool_call_pattern = self.TOOL_CALL_REGEX.match(
model_output,
timeout=envs.VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS) is not None
except TimeoutError:
logger.warning(
"Regex timeout occurred when matching tool call pattern.")
logger.debug("Regex timeout occurred when matching user input: %s",
model_output)
if not is_tool_call_pattern:
return ExtractedToolCallInformation(tools_called=False,
tool_calls=[],
content=model_output)
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json
from json import JSONDecodeError, JSONDecoder
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Union
from torch.nn import CosineSimilarity
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import asyncio
from ssl import SSLContext
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import asyncio
import functools
......@@ -13,8 +14,9 @@ from vllm.logger import init_logger
logger = init_logger(__name__)
VLLM_SERVE_PARSER_EPILOG = (
"Tip: Use `vllm serve --help=<keyword>` to explore arguments from help.\n"
VLLM_SUBCMD_PARSER_EPILOG = (
"Tip: Use `vllm [serve|run-batch] --help=<keyword>` "
"to explore arguments from help.\n"
" - To view a argument group: --help=ModelConfig\n"
" - To view a single argument: --help=max-num-seqs\n"
" - To search by keyword: --help=max\n"
......@@ -26,6 +28,11 @@ async def listen_for_disconnect(request: Request) -> None:
while True:
message = await request.receive()
if message["type"] == "http.disconnect":
if request.app.state.enable_server_load_tracking:
# on timeout/cancellation the BackgroundTask in load_aware_call
# cannot decrement the server load metrics.
# Must be decremented by with_cancellation instead.
request.app.state.server_load_metrics -= 1
break
......@@ -167,8 +174,15 @@ def _validate_truncation_size(
return truncate_prompt_tokens
def show_filtered_argument_or_group_from_help(parser):
def show_filtered_argument_or_group_from_help(parser, subcommand_name):
import sys
# Only handle --help=<keyword> for the current subcommand.
# Since subparser_init() runs for all subcommands during CLI setup,
# we skip processing if the subcommand name is not in sys.argv.
if subcommand_name not in sys.argv:
return
for arg in sys.argv:
if arg.startswith('--help='):
search_keyword = arg.split('=', 1)[1]
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
import torch
from vllm.logger import init_logger
logger = init_logger(__name__)
# set some common config/environment variables that should be set
# for all processes created by vllm and all processes
# that interact with vllm workers.
# they are executed whenever `import vllm` is called.
if not os.path.exists('/dev/nvidia-caps-imex-channels'):
# normally, we disable NCCL_CUMEM_ENABLE because it
# will cost 1~2 GiB GPU memory with cudagraph+allreduce,
# see https://github.com/NVIDIA/nccl/issues/1234
# for more details.
# However, NCCL requires NCCL_CUMEM_ENABLE to work with
if 'NCCL_CUMEM_ENABLE' in os.environ:
logger.warning(
"NCCL_CUMEM_ENABLE is set to %s, skipping override. "
"This may increase memory overhead with cudagraph+allreduce: "
"https://github.com/NVIDIA/nccl/issues/1234",
os.environ['NCCL_CUMEM_ENABLE'])
elif not os.path.exists('/dev/nvidia-caps-imex-channels'):
# NCCL requires NCCL_CUMEM_ENABLE to work with
# multi-node NVLink, typically on GB200-NVL72 systems.
# The ultimate way to detect multi-node NVLink is to use
# NVML APIs, which are too expensive to call here.
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import hashlib
import os
......@@ -15,6 +16,7 @@ if TYPE_CHECKING:
VLLM_NCCL_SO_PATH: Optional[str] = None
LD_LIBRARY_PATH: Optional[str] = None
VLLM_USE_TRITON_FLASH_ATTN: bool = False
VLLM_V1_USE_PREFILL_DECODE_ATTENTION: bool = False
VLLM_FLASH_ATTN_VERSION: Optional[int] = None
LOCAL_RANK: int = 0
CUDA_VISIBLE_DEVICES: Optional[str] = None
......@@ -42,6 +44,7 @@ if TYPE_CHECKING:
VLLM_PP_LAYER_PARTITION: Optional[str] = None
VLLM_CPU_KVCACHE_SPACE: int = 0
VLLM_CPU_OMP_THREADS_BIND: str = ""
VLLM_CPU_NUM_OF_RESERVED_CPU: int = 0
VLLM_CPU_MOE_PREPACK: bool = True
VLLM_XLA_CACHE_PATH: str = os.path.join(VLLM_CACHE_ROOT, "xla_cache")
VLLM_XLA_CHECK_RECOMPILATION: bool = False
......@@ -50,6 +53,7 @@ if TYPE_CHECKING:
VLLM_USE_RAY_COMPILED_DAG: bool = False
VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE: str = "auto"
VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM: bool = False
VLLM_XLA_USE_SPMD: bool = False
VLLM_WORKER_MULTIPROC_METHOD: str = "spawn"
VLLM_ASSETS_CACHE: str = os.path.join(VLLM_CACHE_ROOT, "assets")
VLLM_IMAGE_FETCH_TIMEOUT: int = 5
......@@ -68,6 +72,7 @@ if TYPE_CHECKING:
VERBOSE: bool = False
VLLM_ALLOW_LONG_MAX_MODEL_LEN: bool = False
VLLM_RPC_TIMEOUT: int = 10000 # ms
VLLM_HTTP_TIMEOUT_KEEP_ALIVE: int = 5 # seconds
VLLM_PLUGINS: Optional[list[str]] = None
VLLM_LORA_RESOLVER_CACHE_DIR: Optional[str] = None
VLLM_TORCH_PROFILER_DIR: Optional[str] = None
......@@ -107,6 +112,7 @@ if TYPE_CHECKING:
VLLM_DP_SIZE: int = 1
VLLM_DP_MASTER_IP: str = ""
VLLM_DP_MASTER_PORT: int = 0
VLLM_RANDOMIZE_DP_DUMMY_INPUTS: bool = False
VLLM_MARLIN_USE_ATOMIC_ADD: bool = False
VLLM_V0_USE_OUTLINES_CACHE: bool = False
VLLM_TPU_BUCKET_PADDING_GAP: int = 0
......@@ -118,6 +124,9 @@ if TYPE_CHECKING:
VLLM_NIXL_SIDE_CHANNEL_PORT: int = 5557
VLLM_ALL2ALL_BACKEND: str = "naive"
VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: int = 163840
VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS: int = 1
VLLM_SLEEP_WHEN_IDLE: bool = False
VLLM_MQ_MAX_CHUNK_BYTES_MB: int = 16
def get_default_cache_root():
......@@ -142,10 +151,10 @@ def maybe_convert_int(value: Optional[str]) -> Optional[int]:
def get_vllm_port() -> Optional[int]:
"""Get the port from VLLM_PORT environment variable.
Returns:
The port number as an integer if VLLM_PORT is set, None otherwise.
Raises:
ValueError: If VLLM_PORT is a URI, suggest k8s service discovery issue.
"""
......@@ -158,17 +167,13 @@ def get_vllm_port() -> Optional[int]:
return int(port)
except ValueError as err:
from urllib.parse import urlparse
try:
parsed = urlparse(port)
if parsed.scheme:
raise ValueError(
f"VLLM_PORT '{port}' appears to be a URI. "
"This may be caused by a Kubernetes service discovery issue"
"check the warning in: https://docs.vllm.ai/en/stable/usage/env_vars.html"
)
except Exception:
pass
parsed = urlparse(port)
if parsed.scheme:
raise ValueError(
f"VLLM_PORT '{port}' appears to be a URI. "
"This may be caused by a Kubernetes service discovery issue,"
"check the warning in: https://docs.vllm.ai/en/stable/serving/env_vars.html"
) from None
raise ValueError(
f"VLLM_PORT '{port}' must be a valid integer") from err
......@@ -290,6 +295,13 @@ environment_variables: dict[str, Callable[[], Any]] = {
lambda: (os.environ.get("VLLM_USE_TRITON_FLASH_ATTN", "False").lower() in
("true", "1")),
# Use separate prefill and decode kernels for V1 attention instead of
# the unified triton kernel.
"VLLM_V1_USE_PREFILL_DECODE_ATTENTION":
lambda:
(os.getenv("VLLM_V1_USE_PREFILL_DECODE_ATTENTION", "False").lower() in
("true", "1")),
# Force vllm to use a specific flash-attention version (2 or 3), only valid
# when using the flash-attention backend.
"VLLM_FLASH_ATTN_VERSION":
......@@ -300,9 +312,11 @@ environment_variables: dict[str, Callable[[], Any]] = {
lambda: bool(
os.environ.get("VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE", "1") != "0"),
# Internal flag to enable/disable Inductor standalone compile
"VLLM_TEST_STANDALONE_COMPILE":
lambda: os.environ.get("VLLM_TEST_STANDALONE_COMPILE", "0") != "0",
# Feature flag to enable/disable Inductor standalone compile.
# In torch <= 2.7 we ignore this flag; in torch >= 2.8 this is
# enabled by default.
"VLLM_USE_STANDALONE_COMPILE":
lambda: os.environ.get("VLLM_USE_STANDALONE_COMPILE", "1") == "1",
# local rank of the process in the distributed setting, used to determine
# the GPU device id
......@@ -323,8 +337,8 @@ environment_variables: dict[str, Callable[[], Any]] = {
# Whether to log responses from API Server for debugging
"VLLM_DEBUG_LOG_API_SERVER_RESPONSE":
lambda: os.environ.get("VLLM_DEBUG_LOG_API_SERVER_RESPONSE", "False").
lower() == "true",
lambda: os.environ.get("VLLM_DEBUG_LOG_API_SERVER_RESPONSE", "False"
).lower() == "true",
# S3 access information, used for tensorizer to load model from S3
"S3_ACCESS_KEY_ID":
......@@ -409,7 +423,12 @@ environment_variables: dict[str, Callable[[], Any]] = {
# (CPU backend only) CPU core ids bound by OpenMP threads, e.g., "0-31",
# "0,1,2", "0-31,33". CPU cores of different ranks are separated by '|'.
"VLLM_CPU_OMP_THREADS_BIND":
lambda: os.getenv("VLLM_CPU_OMP_THREADS_BIND", "all"),
lambda: os.getenv("VLLM_CPU_OMP_THREADS_BIND", "auto"),
# (CPU backend only) CPU cores not used by OMP threads .
# Those CPU cores will not be used by OMP threads of a rank.
"VLLM_CPU_NUM_OF_RESERVED_CPU":
lambda: int(os.getenv("VLLM_CPU_NUM_OF_RESERVED_CPU", "0")),
# (CPU backend only) whether to use prepack for MoE layer. This will be
# passed to ipex.llm.modules.GatedMLPMOE. On unsupported CPUs, you might
......@@ -506,6 +525,10 @@ environment_variables: dict[str, Callable[[], Any]] = {
# If set, assert on XLA recompilation after each execution step.
"VLLM_XLA_CHECK_RECOMPILATION":
lambda: bool(int(os.getenv("VLLM_XLA_CHECK_RECOMPILATION", "0"))),
# Enable SPMD mode for TPU backend.
"VLLM_XLA_USE_SPMD":
lambda: bool(int(os.getenv("VLLM_XLA_USE_SPMD", "0"))),
"VLLM_FUSED_MOE_CHUNK_SIZE":
lambda: int(os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "32768")),
......@@ -541,6 +564,10 @@ environment_variables: dict[str, Callable[[], Any]] = {
"VLLM_RPC_TIMEOUT":
lambda: int(os.getenv("VLLM_RPC_TIMEOUT", "10000")),
# Timeout in seconds for keeping HTTP connections alive in API server
"VLLM_HTTP_TIMEOUT_KEEP_ALIVE":
lambda: int(os.environ.get("VLLM_HTTP_TIMEOUT_KEEP_ALIVE", "5")),
# a list of plugin names to load, separated by commas.
# if this is not set, it means all plugins will be loaded
# if this is set to an empty string, no plugins will be loaded
......@@ -746,6 +773,10 @@ environment_variables: dict[str, Callable[[], Any]] = {
"VLLM_DP_MASTER_PORT":
lambda: int(os.getenv("VLLM_DP_MASTER_PORT", "0")),
# Randomize inputs during dummy runs when using Data Parallel
"VLLM_RANDOMIZE_DP_DUMMY_INPUTS":
lambda: os.environ.get("VLLM_RANDOMIZE_DP_DUMMY_INPUTS", "0") == "1",
# Whether to use S3 path for model loading in CI via RunAI Streamer
"VLLM_CI_USE_S3":
lambda: os.environ.get("VLLM_CI_USE_S3", "0") == "1",
......@@ -813,6 +844,8 @@ environment_variables: dict[str, Callable[[], Any]] = {
# Available options:
# - "naive": naive all2all implementation using all-reduce
# - "pplx": use pplx kernels
# - "deepep_high_throughput", use deepep high-throughput kernels
# - "deepep_low_latency", use deepep low-latency kernels
"VLLM_ALL2ALL_BACKEND":
lambda: os.getenv("VLLM_ALL2ALL_BACKEND", "naive"),
......@@ -822,6 +855,21 @@ environment_variables: dict[str, Callable[[], Any]] = {
# This is used to prevent the kernel from running out of memory.
"VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE":
lambda: int(os.getenv("VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE", "163840")),
# Regex timeout for use by the vLLM tool parsing plugins.
"VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS":
lambda: int(os.getenv("VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS", "1")),
# Reduce CPU usage when vLLM is idle. Enabling this will incur small
# latency penalty when a request eventually comes.
"VLLM_SLEEP_WHEN_IDLE":
lambda: bool(int(os.getenv("VLLM_SLEEP_WHEN_IDLE", "0"))),
# Control the max chunk bytes (in MB) for the rpc message queue.
# Object larger than this threshold will be broadcast to worker
# processes via zmq.
"VLLM_MQ_MAX_CHUNK_BYTES_MB":
lambda: int(os.getenv("VLLM_MQ_MAX_CHUNK_BYTES_MB", "16")),
}
# --8<-- [end:env-vars-definition]
......@@ -884,7 +932,7 @@ def compute_hash() -> str:
"VLLM_USE_TRITON_AWQ",
"VLLM_DP_RANK",
"VLLM_DP_SIZE",
"VLLM_TEST_STANDALONE_COMPILE",
"VLLM_USE_STANDALONE_COMPILE",
]
for key in environment_variables_to_hash:
if key in environment_variables:
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import asyncio
import time
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import asyncio
import os
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment