Commit cc7f22a8 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.9.1' into v0.9.1-ori

parents b9ea0c09 b6553be1
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os import os
from collections.abc import Sequence from collections.abc import Sequence
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Sequence from collections.abc import Sequence
from typing import Union from typing import Union
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json import json
from collections.abc import Sequence from collections.abc import Sequence
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json import json
from collections.abc import Sequence from collections.abc import Sequence
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json import json
from collections.abc import Sequence from collections.abc import Sequence
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json import json
from collections.abc import Sequence from collections.abc import Sequence
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json import json
from collections.abc import Sequence from collections.abc import Sequence
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import ast import ast
import json import json
from collections.abc import Sequence from collections.abc import Sequence
...@@ -7,6 +8,7 @@ from typing import Any, Union ...@@ -7,6 +8,7 @@ from typing import Any, Union
import regex as re import regex as re
from transformers import PreTrainedTokenizerBase from transformers import PreTrainedTokenizerBase
import vllm.envs as envs
from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
DeltaFunctionCall, DeltaMessage, DeltaFunctionCall, DeltaMessage,
DeltaToolCall, DeltaToolCall,
...@@ -64,7 +66,19 @@ class Llama4PythonicToolParser(ToolParser): ...@@ -64,7 +66,19 @@ class Llama4PythonicToolParser(ToolParser):
if model_output.startswith("<|python_start|>"): if model_output.startswith("<|python_start|>"):
model_output = model_output[len("<|python_start|>"):] model_output = model_output[len("<|python_start|>"):]
model_output = model_output.replace("<|python_end|>", "") model_output = model_output.replace("<|python_end|>", "")
if not (self.TOOL_CALL_REGEX.match(model_output)):
is_tool_call_pattern = False
try:
is_tool_call_pattern = self.TOOL_CALL_REGEX.match(
model_output,
timeout=envs.VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS) is not None
except TimeoutError:
logger.warning(
"Regex timeout occurred when matching tool call pattern.")
logger.debug("Regex timeout occurred when matching user input: %s",
model_output)
if not is_tool_call_pattern:
return ExtractedToolCallInformation(tools_called=False, return ExtractedToolCallInformation(tools_called=False,
tool_calls=[], tool_calls=[],
content=model_output) content=model_output)
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json import json
from collections.abc import Sequence from collections.abc import Sequence
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json import json
from collections.abc import Sequence from collections.abc import Sequence
...@@ -43,11 +44,17 @@ class MistralToolCall(ToolCall): ...@@ -43,11 +44,17 @@ class MistralToolCall(ToolCall):
return id.isalnum() and len(id) == 9 return id.isalnum() and len(id) == 9
def _is_fn_name_regex_support(model_tokenizer: AnyTokenizer) -> bool:
    """Tell whether the function-name regex can be used with this tokenizer.

    Returns True only when ``model_tokenizer`` is a ``MistralTokenizer``
    whose ``version`` is at least 11; any other tokenizer yields False.
    """
    # Non-Mistral tokenizers are rejected before ``version`` is touched.
    if not isinstance(model_tokenizer, MistralTokenizer):
        return False
    return model_tokenizer.version >= 11
@ToolParserManager.register_module("mistral") @ToolParserManager.register_module("mistral")
class MistralToolParser(ToolParser): class MistralToolParser(ToolParser):
""" """
Tool call parser for Mistral 7B Instruct v0.3, intended for use with the Tool call parser for Mistral 7B Instruct v0.3, intended for use with
examples/tool_chat_template_mistral.jinja template. - [`mistral_common`](https://github.com/mistralai/mistral-common/)
- the examples/tool_chat_template_mistral.jinja template.
Used when --enable-auto-tool-choice --tool-call-parser mistral are all set Used when --enable-auto-tool-choice --tool-call-parser mistral are all set
""" """
...@@ -69,6 +76,12 @@ class MistralToolParser(ToolParser): ...@@ -69,6 +76,12 @@ class MistralToolParser(ToolParser):
self.bot_token = "[TOOL_CALLS]" self.bot_token = "[TOOL_CALLS]"
self.bot_token_id = self.vocab.get(self.bot_token) self.bot_token_id = self.vocab.get(self.bot_token)
self.tool_call_regex = re.compile(r"\[{.*}\]", re.DOTALL) self.tool_call_regex = re.compile(r"\[{.*}\]", re.DOTALL)
if _is_fn_name_regex_support(self.model_tokenizer):
self.fn_name_regex = re.compile(r'([a-zA-Z0-9_-]+)(\{.*?\})',
re.DOTALL)
else:
self.fn_name_regex = None
if self.bot_token_id is None: if self.bot_token_id is None:
raise RuntimeError( raise RuntimeError(
"Mistral Tool Parser could not locate the tool call token in " "Mistral Tool Parser could not locate the tool call token in "
...@@ -108,11 +121,25 @@ class MistralToolParser(ToolParser): ...@@ -108,11 +121,25 @@ class MistralToolParser(ToolParser):
tool_content = model_output.replace(self.bot_token, "").strip() tool_content = model_output.replace(self.bot_token, "").strip()
try: try:
# we first try to directly load the json as parsing very nested # we first try to directly load the json as parsing very nested
# jsons is difficult # jsons is difficult
try: try:
function_call_arr = json.loads(tool_content) if self.fn_name_regex:
matches = self.fn_name_regex.findall(tool_content)
function_call_arr = []
for match in matches:
fn_name = match[0]
args = match[1]
# fn_name is encoded outside serialized json dump
# only arguments are serialized
function_call_arr.append({
"name": fn_name,
"arguments": json.loads(args)
})
else:
function_call_arr = json.loads(tool_content)
except json.JSONDecodeError: except json.JSONDecodeError:
# use a regex to find the part corresponding to the tool call. # use a regex to find the part corresponding to the tool call.
# NOTE: This use case should not happen if the model is trained # NOTE: This use case should not happen if the model is trained
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json import json
from collections.abc import Sequence from collections.abc import Sequence
...@@ -68,8 +69,8 @@ class Phi4MiniJsonToolParser(ToolParser): ...@@ -68,8 +69,8 @@ class Phi4MiniJsonToolParser(ToolParser):
len(function_call_arr)) len(function_call_arr))
except json.JSONDecodeError as e: except json.JSONDecodeError as e:
logger.error( logger.error(
"Failed to parse function calls from model output: %s. " "Failed to parse function calls from model output. "
"Error: %s", model_output, str(e)) "Error: %s", str(e))
tool_calls: list[ToolCall] = [ tool_calls: list[ToolCall] = [
ToolCall( ToolCall(
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import ast import ast
import json import json
...@@ -8,6 +9,7 @@ from typing import Any, Union ...@@ -8,6 +9,7 @@ from typing import Any, Union
import regex as re import regex as re
from transformers import PreTrainedTokenizerBase from transformers import PreTrainedTokenizerBase
import vllm.envs as envs
from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
DeltaFunctionCall, DeltaMessage, DeltaFunctionCall, DeltaMessage,
DeltaToolCall, DeltaToolCall,
...@@ -61,8 +63,18 @@ class PythonicToolParser(ToolParser): ...@@ -61,8 +63,18 @@ class PythonicToolParser(ToolParser):
""" """
Extract the tool calls from a complete model response. Extract the tool calls from a complete model response.
""" """
is_tool_call_pattern = False
if not (self.TOOL_CALL_REGEX.match(model_output)): try:
is_tool_call_pattern = self.TOOL_CALL_REGEX.match(
model_output,
timeout=envs.VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS) is not None
except TimeoutError:
logger.warning(
"Regex timeout occurred when matching tool call pattern.")
logger.debug("Regex timeout occurred when matching user input: %s",
model_output)
if not is_tool_call_pattern:
return ExtractedToolCallInformation(tools_called=False, return ExtractedToolCallInformation(tools_called=False,
tool_calls=[], tool_calls=[],
content=model_output) content=model_output)
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json import json
from json import JSONDecodeError, JSONDecoder from json import JSONDecodeError, JSONDecoder
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Union from typing import Union
from torch.nn import CosineSimilarity from torch.nn import CosineSimilarity
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import asyncio import asyncio
from ssl import SSLContext from ssl import SSLContext
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import asyncio import asyncio
import functools import functools
...@@ -13,8 +14,9 @@ from vllm.logger import init_logger ...@@ -13,8 +14,9 @@ from vllm.logger import init_logger
logger = init_logger(__name__) logger = init_logger(__name__)
VLLM_SERVE_PARSER_EPILOG = ( VLLM_SUBCMD_PARSER_EPILOG = (
"Tip: Use `vllm serve --help=<keyword>` to explore arguments from help.\n" "Tip: Use `vllm [serve|run-batch] --help=<keyword>` "
"to explore arguments from help.\n"
" - To view a argument group: --help=ModelConfig\n" " - To view a argument group: --help=ModelConfig\n"
" - To view a single argument: --help=max-num-seqs\n" " - To view a single argument: --help=max-num-seqs\n"
" - To search by keyword: --help=max\n" " - To search by keyword: --help=max\n"
...@@ -26,6 +28,11 @@ async def listen_for_disconnect(request: Request) -> None: ...@@ -26,6 +28,11 @@ async def listen_for_disconnect(request: Request) -> None:
while True: while True:
message = await request.receive() message = await request.receive()
if message["type"] == "http.disconnect": if message["type"] == "http.disconnect":
if request.app.state.enable_server_load_tracking:
# on timeout/cancellation the BackgroundTask in load_aware_call
# cannot decrement the server load metrics.
# Must be decremented by with_cancellation instead.
request.app.state.server_load_metrics -= 1
break break
...@@ -167,8 +174,15 @@ def _validate_truncation_size( ...@@ -167,8 +174,15 @@ def _validate_truncation_size(
return truncate_prompt_tokens return truncate_prompt_tokens
def show_filtered_argument_or_group_from_help(parser): def show_filtered_argument_or_group_from_help(parser, subcommand_name):
import sys import sys
# Only handle --help=<keyword> for the current subcommand.
# Since subparser_init() runs for all subcommands during CLI setup,
# we skip processing if the subcommand name is not in sys.argv.
if subcommand_name not in sys.argv:
return
for arg in sys.argv: for arg in sys.argv:
if arg.startswith('--help='): if arg.startswith('--help='):
search_keyword = arg.split('=', 1)[1] search_keyword = arg.split('=', 1)[1]
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os import os
import torch import torch
from vllm.logger import init_logger
logger = init_logger(__name__)
# set some common config/environment variables that should be set # set some common config/environment variables that should be set
# for all processes created by vllm and all processes # for all processes created by vllm and all processes
# that interact with vllm workers. # that interact with vllm workers.
# they are executed whenever `import vllm` is called. # they are executed whenever `import vllm` is called.
if not os.path.exists('/dev/nvidia-caps-imex-channels'): if 'NCCL_CUMEM_ENABLE' in os.environ:
# normally, we disable NCCL_CUMEM_ENABLE because it logger.warning(
# will cost 1~2 GiB GPU memory with cudagraph+allreduce, "NCCL_CUMEM_ENABLE is set to %s, skipping override. "
# see https://github.com/NVIDIA/nccl/issues/1234 "This may increase memory overhead with cudagraph+allreduce: "
# for more details. "https://github.com/NVIDIA/nccl/issues/1234",
# However, NCCL requires NCCL_CUMEM_ENABLE to work with os.environ['NCCL_CUMEM_ENABLE'])
elif not os.path.exists('/dev/nvidia-caps-imex-channels'):
# NCCL requires NCCL_CUMEM_ENABLE to work with
# multi-node NVLink, typically on GB200-NVL72 systems. # multi-node NVLink, typically on GB200-NVL72 systems.
# The ultimate way to detect multi-node NVLink is to use # The ultimate way to detect multi-node NVLink is to use
# NVML APIs, which are too expensive to call here. # NVML APIs, which are too expensive to call here.
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import hashlib import hashlib
import os import os
...@@ -15,6 +16,7 @@ if TYPE_CHECKING: ...@@ -15,6 +16,7 @@ if TYPE_CHECKING:
VLLM_NCCL_SO_PATH: Optional[str] = None VLLM_NCCL_SO_PATH: Optional[str] = None
LD_LIBRARY_PATH: Optional[str] = None LD_LIBRARY_PATH: Optional[str] = None
VLLM_USE_TRITON_FLASH_ATTN: bool = False VLLM_USE_TRITON_FLASH_ATTN: bool = False
VLLM_V1_USE_PREFILL_DECODE_ATTENTION: bool = False
VLLM_FLASH_ATTN_VERSION: Optional[int] = None VLLM_FLASH_ATTN_VERSION: Optional[int] = None
LOCAL_RANK: int = 0 LOCAL_RANK: int = 0
CUDA_VISIBLE_DEVICES: Optional[str] = None CUDA_VISIBLE_DEVICES: Optional[str] = None
...@@ -42,6 +44,7 @@ if TYPE_CHECKING: ...@@ -42,6 +44,7 @@ if TYPE_CHECKING:
VLLM_PP_LAYER_PARTITION: Optional[str] = None VLLM_PP_LAYER_PARTITION: Optional[str] = None
VLLM_CPU_KVCACHE_SPACE: int = 0 VLLM_CPU_KVCACHE_SPACE: int = 0
VLLM_CPU_OMP_THREADS_BIND: str = "" VLLM_CPU_OMP_THREADS_BIND: str = ""
VLLM_CPU_NUM_OF_RESERVED_CPU: int = 0
VLLM_CPU_MOE_PREPACK: bool = True VLLM_CPU_MOE_PREPACK: bool = True
VLLM_XLA_CACHE_PATH: str = os.path.join(VLLM_CACHE_ROOT, "xla_cache") VLLM_XLA_CACHE_PATH: str = os.path.join(VLLM_CACHE_ROOT, "xla_cache")
VLLM_XLA_CHECK_RECOMPILATION: bool = False VLLM_XLA_CHECK_RECOMPILATION: bool = False
...@@ -50,6 +53,7 @@ if TYPE_CHECKING: ...@@ -50,6 +53,7 @@ if TYPE_CHECKING:
VLLM_USE_RAY_COMPILED_DAG: bool = False VLLM_USE_RAY_COMPILED_DAG: bool = False
VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE: str = "auto" VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE: str = "auto"
VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM: bool = False VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM: bool = False
VLLM_XLA_USE_SPMD: bool = False
VLLM_WORKER_MULTIPROC_METHOD: str = "spawn" VLLM_WORKER_MULTIPROC_METHOD: str = "spawn"
VLLM_ASSETS_CACHE: str = os.path.join(VLLM_CACHE_ROOT, "assets") VLLM_ASSETS_CACHE: str = os.path.join(VLLM_CACHE_ROOT, "assets")
VLLM_IMAGE_FETCH_TIMEOUT: int = 5 VLLM_IMAGE_FETCH_TIMEOUT: int = 5
...@@ -68,6 +72,7 @@ if TYPE_CHECKING: ...@@ -68,6 +72,7 @@ if TYPE_CHECKING:
VERBOSE: bool = False VERBOSE: bool = False
VLLM_ALLOW_LONG_MAX_MODEL_LEN: bool = False VLLM_ALLOW_LONG_MAX_MODEL_LEN: bool = False
VLLM_RPC_TIMEOUT: int = 10000 # ms VLLM_RPC_TIMEOUT: int = 10000 # ms
VLLM_HTTP_TIMEOUT_KEEP_ALIVE: int = 5 # seconds
VLLM_PLUGINS: Optional[list[str]] = None VLLM_PLUGINS: Optional[list[str]] = None
VLLM_LORA_RESOLVER_CACHE_DIR: Optional[str] = None VLLM_LORA_RESOLVER_CACHE_DIR: Optional[str] = None
VLLM_TORCH_PROFILER_DIR: Optional[str] = None VLLM_TORCH_PROFILER_DIR: Optional[str] = None
...@@ -107,6 +112,7 @@ if TYPE_CHECKING: ...@@ -107,6 +112,7 @@ if TYPE_CHECKING:
VLLM_DP_SIZE: int = 1 VLLM_DP_SIZE: int = 1
VLLM_DP_MASTER_IP: str = "" VLLM_DP_MASTER_IP: str = ""
VLLM_DP_MASTER_PORT: int = 0 VLLM_DP_MASTER_PORT: int = 0
VLLM_RANDOMIZE_DP_DUMMY_INPUTS: bool = False
VLLM_MARLIN_USE_ATOMIC_ADD: bool = False VLLM_MARLIN_USE_ATOMIC_ADD: bool = False
VLLM_V0_USE_OUTLINES_CACHE: bool = False VLLM_V0_USE_OUTLINES_CACHE: bool = False
VLLM_TPU_BUCKET_PADDING_GAP: int = 0 VLLM_TPU_BUCKET_PADDING_GAP: int = 0
...@@ -118,6 +124,9 @@ if TYPE_CHECKING: ...@@ -118,6 +124,9 @@ if TYPE_CHECKING:
VLLM_NIXL_SIDE_CHANNEL_PORT: int = 5557 VLLM_NIXL_SIDE_CHANNEL_PORT: int = 5557
VLLM_ALL2ALL_BACKEND: str = "naive" VLLM_ALL2ALL_BACKEND: str = "naive"
VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: int = 163840 VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: int = 163840
VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS: int = 1
VLLM_SLEEP_WHEN_IDLE: bool = False
VLLM_MQ_MAX_CHUNK_BYTES_MB: int = 16
def get_default_cache_root(): def get_default_cache_root():
...@@ -142,10 +151,10 @@ def maybe_convert_int(value: Optional[str]) -> Optional[int]: ...@@ -142,10 +151,10 @@ def maybe_convert_int(value: Optional[str]) -> Optional[int]:
def get_vllm_port() -> Optional[int]: def get_vllm_port() -> Optional[int]:
"""Get the port from VLLM_PORT environment variable. """Get the port from VLLM_PORT environment variable.
Returns: Returns:
The port number as an integer if VLLM_PORT is set, None otherwise. The port number as an integer if VLLM_PORT is set, None otherwise.
Raises: Raises:
ValueError: If VLLM_PORT is a URI, suggest k8s service discovery issue. ValueError: If VLLM_PORT is a URI, suggest k8s service discovery issue.
""" """
...@@ -158,17 +167,13 @@ def get_vllm_port() -> Optional[int]: ...@@ -158,17 +167,13 @@ def get_vllm_port() -> Optional[int]:
return int(port) return int(port)
except ValueError as err: except ValueError as err:
from urllib.parse import urlparse from urllib.parse import urlparse
try: parsed = urlparse(port)
parsed = urlparse(port) if parsed.scheme:
if parsed.scheme: raise ValueError(
raise ValueError( f"VLLM_PORT '{port}' appears to be a URI. "
f"VLLM_PORT '{port}' appears to be a URI. " "This may be caused by a Kubernetes service discovery issue,"
"This may be caused by a Kubernetes service discovery issue" "check the warning in: https://docs.vllm.ai/en/stable/serving/env_vars.html"
"check the warning in: https://docs.vllm.ai/en/stable/usage/env_vars.html" ) from None
)
except Exception:
pass
raise ValueError( raise ValueError(
f"VLLM_PORT '{port}' must be a valid integer") from err f"VLLM_PORT '{port}' must be a valid integer") from err
...@@ -290,6 +295,13 @@ environment_variables: dict[str, Callable[[], Any]] = { ...@@ -290,6 +295,13 @@ environment_variables: dict[str, Callable[[], Any]] = {
lambda: (os.environ.get("VLLM_USE_TRITON_FLASH_ATTN", "False").lower() in lambda: (os.environ.get("VLLM_USE_TRITON_FLASH_ATTN", "False").lower() in
("true", "1")), ("true", "1")),
# Use separate prefill and decode kernels for V1 attention instead of
# the unified triton kernel.
"VLLM_V1_USE_PREFILL_DECODE_ATTENTION":
lambda:
(os.getenv("VLLM_V1_USE_PREFILL_DECODE_ATTENTION", "False").lower() in
("true", "1")),
# Force vllm to use a specific flash-attention version (2 or 3), only valid # Force vllm to use a specific flash-attention version (2 or 3), only valid
# when using the flash-attention backend. # when using the flash-attention backend.
"VLLM_FLASH_ATTN_VERSION": "VLLM_FLASH_ATTN_VERSION":
...@@ -300,9 +312,11 @@ environment_variables: dict[str, Callable[[], Any]] = { ...@@ -300,9 +312,11 @@ environment_variables: dict[str, Callable[[], Any]] = {
lambda: bool( lambda: bool(
os.environ.get("VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE", "1") != "0"), os.environ.get("VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE", "1") != "0"),
# Internal flag to enable/disable Inductor standalone compile # Feature flag to enable/disable Inductor standalone compile.
"VLLM_TEST_STANDALONE_COMPILE": # In torch <= 2.7 we ignore this flag; in torch >= 2.8 this is
lambda: os.environ.get("VLLM_TEST_STANDALONE_COMPILE", "0") != "0", # enabled by default.
"VLLM_USE_STANDALONE_COMPILE":
lambda: os.environ.get("VLLM_USE_STANDALONE_COMPILE", "1") == "1",
# local rank of the process in the distributed setting, used to determine # local rank of the process in the distributed setting, used to determine
# the GPU device id # the GPU device id
...@@ -323,8 +337,8 @@ environment_variables: dict[str, Callable[[], Any]] = { ...@@ -323,8 +337,8 @@ environment_variables: dict[str, Callable[[], Any]] = {
# Whether to log responses from API Server for debugging # Whether to log responses from API Server for debugging
"VLLM_DEBUG_LOG_API_SERVER_RESPONSE": "VLLM_DEBUG_LOG_API_SERVER_RESPONSE":
lambda: os.environ.get("VLLM_DEBUG_LOG_API_SERVER_RESPONSE", "False"). lambda: os.environ.get("VLLM_DEBUG_LOG_API_SERVER_RESPONSE", "False"
lower() == "true", ).lower() == "true",
# S3 access information, used for tensorizer to load model from S3 # S3 access information, used for tensorizer to load model from S3
"S3_ACCESS_KEY_ID": "S3_ACCESS_KEY_ID":
...@@ -409,7 +423,12 @@ environment_variables: dict[str, Callable[[], Any]] = { ...@@ -409,7 +423,12 @@ environment_variables: dict[str, Callable[[], Any]] = {
# (CPU backend only) CPU core ids bound by OpenMP threads, e.g., "0-31", # (CPU backend only) CPU core ids bound by OpenMP threads, e.g., "0-31",
# "0,1,2", "0-31,33". CPU cores of different ranks are separated by '|'. # "0,1,2", "0-31,33". CPU cores of different ranks are separated by '|'.
"VLLM_CPU_OMP_THREADS_BIND": "VLLM_CPU_OMP_THREADS_BIND":
lambda: os.getenv("VLLM_CPU_OMP_THREADS_BIND", "all"), lambda: os.getenv("VLLM_CPU_OMP_THREADS_BIND", "auto"),
# (CPU backend only) Number of CPU cores reserved, i.e. not used by
# the OMP threads of a rank.
"VLLM_CPU_NUM_OF_RESERVED_CPU":
lambda: int(os.getenv("VLLM_CPU_NUM_OF_RESERVED_CPU", "0")),
# (CPU backend only) whether to use prepack for MoE layer. This will be # (CPU backend only) whether to use prepack for MoE layer. This will be
# passed to ipex.llm.modules.GatedMLPMOE. On unsupported CPUs, you might # passed to ipex.llm.modules.GatedMLPMOE. On unsupported CPUs, you might
...@@ -506,6 +525,10 @@ environment_variables: dict[str, Callable[[], Any]] = { ...@@ -506,6 +525,10 @@ environment_variables: dict[str, Callable[[], Any]] = {
# If set, assert on XLA recompilation after each execution step. # If set, assert on XLA recompilation after each execution step.
"VLLM_XLA_CHECK_RECOMPILATION": "VLLM_XLA_CHECK_RECOMPILATION":
lambda: bool(int(os.getenv("VLLM_XLA_CHECK_RECOMPILATION", "0"))), lambda: bool(int(os.getenv("VLLM_XLA_CHECK_RECOMPILATION", "0"))),
# Enable SPMD mode for TPU backend.
"VLLM_XLA_USE_SPMD":
lambda: bool(int(os.getenv("VLLM_XLA_USE_SPMD", "0"))),
"VLLM_FUSED_MOE_CHUNK_SIZE": "VLLM_FUSED_MOE_CHUNK_SIZE":
lambda: int(os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "32768")), lambda: int(os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "32768")),
...@@ -541,6 +564,10 @@ environment_variables: dict[str, Callable[[], Any]] = { ...@@ -541,6 +564,10 @@ environment_variables: dict[str, Callable[[], Any]] = {
"VLLM_RPC_TIMEOUT": "VLLM_RPC_TIMEOUT":
lambda: int(os.getenv("VLLM_RPC_TIMEOUT", "10000")), lambda: int(os.getenv("VLLM_RPC_TIMEOUT", "10000")),
# Timeout in seconds for keeping HTTP connections alive in API server
"VLLM_HTTP_TIMEOUT_KEEP_ALIVE":
lambda: int(os.environ.get("VLLM_HTTP_TIMEOUT_KEEP_ALIVE", "5")),
# a list of plugin names to load, separated by commas. # a list of plugin names to load, separated by commas.
# if this is not set, it means all plugins will be loaded # if this is not set, it means all plugins will be loaded
# if this is set to an empty string, no plugins will be loaded # if this is set to an empty string, no plugins will be loaded
...@@ -746,6 +773,10 @@ environment_variables: dict[str, Callable[[], Any]] = { ...@@ -746,6 +773,10 @@ environment_variables: dict[str, Callable[[], Any]] = {
"VLLM_DP_MASTER_PORT": "VLLM_DP_MASTER_PORT":
lambda: int(os.getenv("VLLM_DP_MASTER_PORT", "0")), lambda: int(os.getenv("VLLM_DP_MASTER_PORT", "0")),
# Randomize inputs during dummy runs when using Data Parallel
"VLLM_RANDOMIZE_DP_DUMMY_INPUTS":
lambda: os.environ.get("VLLM_RANDOMIZE_DP_DUMMY_INPUTS", "0") == "1",
# Whether to use S3 path for model loading in CI via RunAI Streamer # Whether to use S3 path for model loading in CI via RunAI Streamer
"VLLM_CI_USE_S3": "VLLM_CI_USE_S3":
lambda: os.environ.get("VLLM_CI_USE_S3", "0") == "1", lambda: os.environ.get("VLLM_CI_USE_S3", "0") == "1",
...@@ -813,6 +844,8 @@ environment_variables: dict[str, Callable[[], Any]] = { ...@@ -813,6 +844,8 @@ environment_variables: dict[str, Callable[[], Any]] = {
# Available options: # Available options:
# - "naive": naive all2all implementation using all-reduce # - "naive": naive all2all implementation using all-reduce
# - "pplx": use pplx kernels # - "pplx": use pplx kernels
# - "deepep_high_throughput": use deepep high-throughput kernels
# - "deepep_low_latency": use deepep low-latency kernels
"VLLM_ALL2ALL_BACKEND": "VLLM_ALL2ALL_BACKEND":
lambda: os.getenv("VLLM_ALL2ALL_BACKEND", "naive"), lambda: os.getenv("VLLM_ALL2ALL_BACKEND", "naive"),
...@@ -822,6 +855,21 @@ environment_variables: dict[str, Callable[[], Any]] = { ...@@ -822,6 +855,21 @@ environment_variables: dict[str, Callable[[], Any]] = {
# This is used to prevent the kernel from running out of memory. # This is used to prevent the kernel from running out of memory.
"VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE": "VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE":
lambda: int(os.getenv("VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE", "163840")), lambda: int(os.getenv("VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE", "163840")),
# Regex timeout for use by the vLLM tool parsing plugins.
"VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS":
lambda: int(os.getenv("VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS", "1")),
# Reduce CPU usage when vLLM is idle. Enabling this will incur a small
# latency penalty when a request eventually comes.
"VLLM_SLEEP_WHEN_IDLE":
lambda: bool(int(os.getenv("VLLM_SLEEP_WHEN_IDLE", "0"))),
# Control the max chunk bytes (in MB) for the rpc message queue.
# Objects larger than this threshold will be broadcast to worker
# processes via zmq.
"VLLM_MQ_MAX_CHUNK_BYTES_MB":
lambda: int(os.getenv("VLLM_MQ_MAX_CHUNK_BYTES_MB", "16")),
} }
# --8<-- [end:env-vars-definition] # --8<-- [end:env-vars-definition]
...@@ -884,7 +932,7 @@ def compute_hash() -> str: ...@@ -884,7 +932,7 @@ def compute_hash() -> str:
"VLLM_USE_TRITON_AWQ", "VLLM_USE_TRITON_AWQ",
"VLLM_DP_RANK", "VLLM_DP_RANK",
"VLLM_DP_SIZE", "VLLM_DP_SIZE",
"VLLM_TEST_STANDALONE_COMPILE", "VLLM_USE_STANDALONE_COMPILE",
] ]
for key in environment_variables_to_hash: for key in environment_variables_to_hash:
if key in environment_variables: if key in environment_variables:
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import asyncio import asyncio
import time import time
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import asyncio import asyncio
import os import os
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment