Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
cc7f22a8
Commit
cc7f22a8
authored
Jun 11, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.9.1' into v0.9.1-ori
parents
b9ea0c09
b6553be1
Changes
1000
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
174 additions
and
38 deletions
+174
-38
vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py
vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py
+1
-0
vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py
...entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py
+1
-0
vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py
...ypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py
+1
-0
vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py
vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py
+1
-0
vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py
vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py
+1
-0
vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py
.../entrypoints/openai/tool_parsers/internlm2_tool_parser.py
+1
-0
vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py
vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py
+1
-0
vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py
...points/openai/tool_parsers/llama4_pythonic_tool_parser.py
+15
-1
vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py
vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py
+1
-0
vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py
vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py
+31
-4
vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py
vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py
+3
-2
vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py
vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py
+14
-2
vllm/entrypoints/openai/tool_parsers/utils.py
vllm/entrypoints/openai/tool_parsers/utils.py
+1
-0
vllm/entrypoints/score_utils.py
vllm/entrypoints/score_utils.py
+1
-0
vllm/entrypoints/ssl.py
vllm/entrypoints/ssl.py
+1
-0
vllm/entrypoints/utils.py
vllm/entrypoints/utils.py
+17
-3
vllm/env_override.py
vllm/env_override.py
+13
-6
vllm/envs.py
vllm/envs.py
+68
-20
vllm/executor/executor_base.py
vllm/executor/executor_base.py
+1
-0
vllm/executor/mp_distributed_executor.py
vllm/executor/mp_distributed_executor.py
+1
-0
No files found.
Too many changes to show.
To preserve performance only
1000 of 1000+
files are displayed.
Plain diff
Email patch
vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
os
from
collections.abc
import
Sequence
...
...
vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
collections.abc
import
Sequence
from
typing
import
Union
...
...
vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
json
from
collections.abc
import
Sequence
...
...
vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
json
from
collections.abc
import
Sequence
...
...
vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
json
from
collections.abc
import
Sequence
...
...
vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
json
from
collections.abc
import
Sequence
...
...
vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
json
from
collections.abc
import
Sequence
...
...
vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
ast
import
json
from
collections.abc
import
Sequence
...
...
@@ -7,6 +8,7 @@ from typing import Any, Union
import
regex
as
re
from
transformers
import
PreTrainedTokenizerBase
import
vllm.envs
as
envs
from
vllm.entrypoints.openai.protocol
import
(
ChatCompletionRequest
,
DeltaFunctionCall
,
DeltaMessage
,
DeltaToolCall
,
...
...
@@ -64,7 +66,19 @@ class Llama4PythonicToolParser(ToolParser):
if
model_output
.
startswith
(
"<|python_start|>"
):
model_output
=
model_output
[
len
(
"<|python_start|>"
):]
model_output
=
model_output
.
replace
(
"<|python_end|>"
,
""
)
if
not
(
self
.
TOOL_CALL_REGEX
.
match
(
model_output
)):
is_tool_call_pattern
=
False
try
:
is_tool_call_pattern
=
self
.
TOOL_CALL_REGEX
.
match
(
model_output
,
timeout
=
envs
.
VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS
)
is
not
None
except
TimeoutError
:
logger
.
warning
(
"Regex timeout occurred when matching tool call pattern."
)
logger
.
debug
(
"Regex timeout occurred when matching user input: %s"
,
model_output
)
if
not
is_tool_call_pattern
:
return
ExtractedToolCallInformation
(
tools_called
=
False
,
tool_calls
=
[],
content
=
model_output
)
...
...
vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
json
from
collections.abc
import
Sequence
...
...
vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
json
from
collections.abc
import
Sequence
...
...
@@ -43,11 +44,17 @@ class MistralToolCall(ToolCall):
return
id
.
isalnum
()
and
len
(
id
)
==
9
def
_is_fn_name_regex_support
(
model_tokenizer
:
AnyTokenizer
)
->
bool
:
return
isinstance
(
model_tokenizer
,
MistralTokenizer
)
\
and
model_tokenizer
.
version
>=
11
@
ToolParserManager
.
register_module
(
"mistral"
)
class
MistralToolParser
(
ToolParser
):
"""
Tool call parser for Mistral 7B Instruct v0.3, intended for use with the
examples/tool_chat_template_mistral.jinja template.
Tool call parser for Mistral 7B Instruct v0.3, intended for use with
- [`mistral_common`](https://github.com/mistralai/mistral-common/)
- the examples/tool_chat_template_mistral.jinja template.
Used when --enable-auto-tool-choice --tool-call-parser mistral are all set
"""
...
...
@@ -69,6 +76,12 @@ class MistralToolParser(ToolParser):
self
.
bot_token
=
"[TOOL_CALLS]"
self
.
bot_token_id
=
self
.
vocab
.
get
(
self
.
bot_token
)
self
.
tool_call_regex
=
re
.
compile
(
r
"\[{.*}\]"
,
re
.
DOTALL
)
if
_is_fn_name_regex_support
(
self
.
model_tokenizer
):
self
.
fn_name_regex
=
re
.
compile
(
r
'([a-zA-Z0-9_-]+)(\{.*?\})'
,
re
.
DOTALL
)
else
:
self
.
fn_name_regex
=
None
if
self
.
bot_token_id
is
None
:
raise
RuntimeError
(
"Mistral Tool Parser could not locate the tool call token in "
...
...
@@ -108,11 +121,25 @@ class MistralToolParser(ToolParser):
tool_content
=
model_output
.
replace
(
self
.
bot_token
,
""
).
strip
()
try
:
# we first try to directly load the json as parsing very nested
# jsons is difficult
try
:
function_call_arr
=
json
.
loads
(
tool_content
)
if
self
.
fn_name_regex
:
matches
=
self
.
fn_name_regex
.
findall
(
tool_content
)
function_call_arr
=
[]
for
match
in
matches
:
fn_name
=
match
[
0
]
args
=
match
[
1
]
# fn_name is encoded outside serialized json dump
# only arguments are serialized
function_call_arr
.
append
({
"name"
:
fn_name
,
"arguments"
:
json
.
loads
(
args
)
})
else
:
function_call_arr
=
json
.
loads
(
tool_content
)
except
json
.
JSONDecodeError
:
# use a regex to find the part corresponding to the tool call.
# NOTE: This use case should not happen if the model is trained
...
...
vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
json
from
collections.abc
import
Sequence
...
...
@@ -68,8 +69,8 @@ class Phi4MiniJsonToolParser(ToolParser):
len
(
function_call_arr
))
except
json
.
JSONDecodeError
as
e
:
logger
.
error
(
"Failed to parse function calls from model output
: %s
. "
"Error: %s"
,
model_output
,
str
(
e
))
"Failed to parse function calls from model output. "
"Error: %s"
,
str
(
e
))
tool_calls
:
list
[
ToolCall
]
=
[
ToolCall
(
...
...
vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
ast
import
json
...
...
@@ -8,6 +9,7 @@ from typing import Any, Union
import
regex
as
re
from
transformers
import
PreTrainedTokenizerBase
import
vllm.envs
as
envs
from
vllm.entrypoints.openai.protocol
import
(
ChatCompletionRequest
,
DeltaFunctionCall
,
DeltaMessage
,
DeltaToolCall
,
...
...
@@ -61,8 +63,18 @@ class PythonicToolParser(ToolParser):
"""
Extract the tool calls from a complete model response.
"""
if
not
(
self
.
TOOL_CALL_REGEX
.
match
(
model_output
)):
is_tool_call_pattern
=
False
try
:
is_tool_call_pattern
=
self
.
TOOL_CALL_REGEX
.
match
(
model_output
,
timeout
=
envs
.
VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS
)
is
not
None
except
TimeoutError
:
logger
.
warning
(
"Regex timeout occurred when matching tool call pattern."
)
logger
.
debug
(
"Regex timeout occurred when matching user input: %s"
,
model_output
)
if
not
is_tool_call_pattern
:
return
ExtractedToolCallInformation
(
tools_called
=
False
,
tool_calls
=
[],
content
=
model_output
)
...
...
vllm/entrypoints/openai/tool_parsers/utils.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
json
from
json
import
JSONDecodeError
,
JSONDecoder
...
...
vllm/entrypoints/score_utils.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
typing
import
Union
from
torch.nn
import
CosineSimilarity
...
...
vllm/entrypoints/ssl.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
asyncio
from
ssl
import
SSLContext
...
...
vllm/entrypoints/utils.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
asyncio
import
functools
...
...
@@ -13,8 +14,9 @@ from vllm.logger import init_logger
logger
=
init_logger
(
__name__
)
VLLM_SERVE_PARSER_EPILOG
=
(
"Tip: Use `vllm serve --help=<keyword>` to explore arguments from help.
\n
"
VLLM_SUBCMD_PARSER_EPILOG
=
(
"Tip: Use `vllm [serve|run-batch] --help=<keyword>` "
"to explore arguments from help.
\n
"
" - To view a argument group: --help=ModelConfig
\n
"
" - To view a single argument: --help=max-num-seqs
\n
"
" - To search by keyword: --help=max
\n
"
...
...
@@ -26,6 +28,11 @@ async def listen_for_disconnect(request: Request) -> None:
while
True
:
message
=
await
request
.
receive
()
if
message
[
"type"
]
==
"http.disconnect"
:
if
request
.
app
.
state
.
enable_server_load_tracking
:
# on timeout/cancellation the BackgroundTask in load_aware_call
# cannot decrement the server load metrics.
# Must be decremented by with_cancellation instead.
request
.
app
.
state
.
server_load_metrics
-=
1
break
...
...
@@ -167,8 +174,15 @@ def _validate_truncation_size(
return
truncate_prompt_tokens
def
show_filtered_argument_or_group_from_help
(
parser
):
def
show_filtered_argument_or_group_from_help
(
parser
,
subcommand_name
):
import
sys
# Only handle --help=<keyword> for the current subcommand.
# Since subparser_init() runs for all subcommands during CLI setup,
# we skip processing if the subcommand name is not in sys.argv.
if
subcommand_name
not
in
sys
.
argv
:
return
for
arg
in
sys
.
argv
:
if
arg
.
startswith
(
'--help='
):
search_keyword
=
arg
.
split
(
'='
,
1
)[
1
]
...
...
vllm/env_override.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
os
import
torch
from
vllm.logger
import
init_logger
logger
=
init_logger
(
__name__
)
# set some common config/environment variables that should be set
# for all processes created by vllm and all processes
# that interact with vllm workers.
# they are executed whenever `import vllm` is called.
if
not
os
.
path
.
exists
(
'/dev/nvidia-caps-imex-channels'
):
# normally, we disable NCCL_CUMEM_ENABLE because it
# will cost 1~2 GiB GPU memory with cudagraph+allreduce,
# see https://github.com/NVIDIA/nccl/issues/1234
# for more details.
# However, NCCL requires NCCL_CUMEM_ENABLE to work with
if
'NCCL_CUMEM_ENABLE'
in
os
.
environ
:
logger
.
warning
(
"NCCL_CUMEM_ENABLE is set to %s, skipping override. "
"This may increase memory overhead with cudagraph+allreduce: "
"https://github.com/NVIDIA/nccl/issues/1234"
,
os
.
environ
[
'NCCL_CUMEM_ENABLE'
])
elif
not
os
.
path
.
exists
(
'/dev/nvidia-caps-imex-channels'
):
# NCCL requires NCCL_CUMEM_ENABLE to work with
# multi-node NVLink, typically on GB200-NVL72 systems.
# The ultimate way to detect multi-node NVLink is to use
# NVML APIs, which are too expensive to call here.
...
...
vllm/envs.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
hashlib
import
os
...
...
@@ -15,6 +16,7 @@ if TYPE_CHECKING:
VLLM_NCCL_SO_PATH
:
Optional
[
str
]
=
None
LD_LIBRARY_PATH
:
Optional
[
str
]
=
None
VLLM_USE_TRITON_FLASH_ATTN
:
bool
=
False
VLLM_V1_USE_PREFILL_DECODE_ATTENTION
:
bool
=
False
VLLM_FLASH_ATTN_VERSION
:
Optional
[
int
]
=
None
LOCAL_RANK
:
int
=
0
CUDA_VISIBLE_DEVICES
:
Optional
[
str
]
=
None
...
...
@@ -42,6 +44,7 @@ if TYPE_CHECKING:
VLLM_PP_LAYER_PARTITION
:
Optional
[
str
]
=
None
VLLM_CPU_KVCACHE_SPACE
:
int
=
0
VLLM_CPU_OMP_THREADS_BIND
:
str
=
""
VLLM_CPU_NUM_OF_RESERVED_CPU
:
int
=
0
VLLM_CPU_MOE_PREPACK
:
bool
=
True
VLLM_XLA_CACHE_PATH
:
str
=
os
.
path
.
join
(
VLLM_CACHE_ROOT
,
"xla_cache"
)
VLLM_XLA_CHECK_RECOMPILATION
:
bool
=
False
...
...
@@ -50,6 +53,7 @@ if TYPE_CHECKING:
VLLM_USE_RAY_COMPILED_DAG
:
bool
=
False
VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE
:
str
=
"auto"
VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM
:
bool
=
False
VLLM_XLA_USE_SPMD
:
bool
=
False
VLLM_WORKER_MULTIPROC_METHOD
:
str
=
"spawn"
VLLM_ASSETS_CACHE
:
str
=
os
.
path
.
join
(
VLLM_CACHE_ROOT
,
"assets"
)
VLLM_IMAGE_FETCH_TIMEOUT
:
int
=
5
...
...
@@ -68,6 +72,7 @@ if TYPE_CHECKING:
VERBOSE
:
bool
=
False
VLLM_ALLOW_LONG_MAX_MODEL_LEN
:
bool
=
False
VLLM_RPC_TIMEOUT
:
int
=
10000
# ms
VLLM_HTTP_TIMEOUT_KEEP_ALIVE
:
int
=
5
# seconds
VLLM_PLUGINS
:
Optional
[
list
[
str
]]
=
None
VLLM_LORA_RESOLVER_CACHE_DIR
:
Optional
[
str
]
=
None
VLLM_TORCH_PROFILER_DIR
:
Optional
[
str
]
=
None
...
...
@@ -107,6 +112,7 @@ if TYPE_CHECKING:
VLLM_DP_SIZE
:
int
=
1
VLLM_DP_MASTER_IP
:
str
=
""
VLLM_DP_MASTER_PORT
:
int
=
0
VLLM_RANDOMIZE_DP_DUMMY_INPUTS
:
bool
=
False
VLLM_MARLIN_USE_ATOMIC_ADD
:
bool
=
False
VLLM_V0_USE_OUTLINES_CACHE
:
bool
=
False
VLLM_TPU_BUCKET_PADDING_GAP
:
int
=
0
...
...
@@ -118,6 +124,9 @@ if TYPE_CHECKING:
VLLM_NIXL_SIDE_CHANNEL_PORT
:
int
=
5557
VLLM_ALL2ALL_BACKEND
:
str
=
"naive"
VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE
:
int
=
163840
VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS
:
int
=
1
VLLM_SLEEP_WHEN_IDLE
:
bool
=
False
VLLM_MQ_MAX_CHUNK_BYTES_MB
:
int
=
16
def
get_default_cache_root
():
...
...
@@ -142,10 +151,10 @@ def maybe_convert_int(value: Optional[str]) -> Optional[int]:
def
get_vllm_port
()
->
Optional
[
int
]:
"""Get the port from VLLM_PORT environment variable.
Returns:
The port number as an integer if VLLM_PORT is set, None otherwise.
Raises:
ValueError: If VLLM_PORT is a URI, suggest k8s service discovery issue.
"""
...
...
@@ -158,17 +167,13 @@ def get_vllm_port() -> Optional[int]:
return
int
(
port
)
except
ValueError
as
err
:
from
urllib.parse
import
urlparse
try
:
parsed
=
urlparse
(
port
)
if
parsed
.
scheme
:
raise
ValueError
(
f
"VLLM_PORT '
{
port
}
' appears to be a URI. "
"This may be caused by a Kubernetes service discovery issue"
"check the warning in: https://docs.vllm.ai/en/stable/usage/env_vars.html"
)
except
Exception
:
pass
parsed
=
urlparse
(
port
)
if
parsed
.
scheme
:
raise
ValueError
(
f
"VLLM_PORT '
{
port
}
' appears to be a URI. "
"This may be caused by a Kubernetes service discovery issue,"
"check the warning in: https://docs.vllm.ai/en/stable/serving/env_vars.html"
)
from
None
raise
ValueError
(
f
"VLLM_PORT '
{
port
}
' must be a valid integer"
)
from
err
...
...
@@ -290,6 +295,13 @@ environment_variables: dict[str, Callable[[], Any]] = {
lambda
:
(
os
.
environ
.
get
(
"VLLM_USE_TRITON_FLASH_ATTN"
,
"False"
).
lower
()
in
(
"true"
,
"1"
)),
# Use separate prefill and decode kernels for V1 attention instead of
# the unified triton kernel.
"VLLM_V1_USE_PREFILL_DECODE_ATTENTION"
:
lambda
:
(
os
.
getenv
(
"VLLM_V1_USE_PREFILL_DECODE_ATTENTION"
,
"False"
).
lower
()
in
(
"true"
,
"1"
)),
# Force vllm to use a specific flash-attention version (2 or 3), only valid
# when using the flash-attention backend.
"VLLM_FLASH_ATTN_VERSION"
:
...
...
@@ -300,9 +312,11 @@ environment_variables: dict[str, Callable[[], Any]] = {
lambda
:
bool
(
os
.
environ
.
get
(
"VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"
,
"1"
)
!=
"0"
),
# Internal flag to enable/disable Inductor standalone compile
"VLLM_TEST_STANDALONE_COMPILE"
:
lambda
:
os
.
environ
.
get
(
"VLLM_TEST_STANDALONE_COMPILE"
,
"0"
)
!=
"0"
,
# Feature flag to enable/disable Inductor standalone compile.
# In torch <= 2.7 we ignore this flag; in torch >= 2.8 this is
# enabled by default.
"VLLM_USE_STANDALONE_COMPILE"
:
lambda
:
os
.
environ
.
get
(
"VLLM_USE_STANDALONE_COMPILE"
,
"1"
)
==
"1"
,
# local rank of the process in the distributed setting, used to determine
# the GPU device id
...
...
@@ -323,8 +337,8 @@ environment_variables: dict[str, Callable[[], Any]] = {
# Whether to log responses from API Server for debugging
"VLLM_DEBUG_LOG_API_SERVER_RESPONSE"
:
lambda
:
os
.
environ
.
get
(
"VLLM_DEBUG_LOG_API_SERVER_RESPONSE"
,
"False"
).
lower
()
==
"true"
,
lambda
:
os
.
environ
.
get
(
"VLLM_DEBUG_LOG_API_SERVER_RESPONSE"
,
"False"
).
lower
()
==
"true"
,
# S3 access information, used for tensorizer to load model from S3
"S3_ACCESS_KEY_ID"
:
...
...
@@ -409,7 +423,12 @@ environment_variables: dict[str, Callable[[], Any]] = {
# (CPU backend only) CPU core ids bound by OpenMP threads, e.g., "0-31",
# "0,1,2", "0-31,33". CPU cores of different ranks are separated by '|'.
"VLLM_CPU_OMP_THREADS_BIND"
:
lambda
:
os
.
getenv
(
"VLLM_CPU_OMP_THREADS_BIND"
,
"all"
),
lambda
:
os
.
getenv
(
"VLLM_CPU_OMP_THREADS_BIND"
,
"auto"
),
# (CPU backend only) CPU cores not used by OMP threads .
# Those CPU cores will not be used by OMP threads of a rank.
"VLLM_CPU_NUM_OF_RESERVED_CPU"
:
lambda
:
int
(
os
.
getenv
(
"VLLM_CPU_NUM_OF_RESERVED_CPU"
,
"0"
)),
# (CPU backend only) whether to use prepack for MoE layer. This will be
# passed to ipex.llm.modules.GatedMLPMOE. On unsupported CPUs, you might
...
...
@@ -506,6 +525,10 @@ environment_variables: dict[str, Callable[[], Any]] = {
# If set, assert on XLA recompilation after each execution step.
"VLLM_XLA_CHECK_RECOMPILATION"
:
lambda
:
bool
(
int
(
os
.
getenv
(
"VLLM_XLA_CHECK_RECOMPILATION"
,
"0"
))),
# Enable SPMD mode for TPU backend.
"VLLM_XLA_USE_SPMD"
:
lambda
:
bool
(
int
(
os
.
getenv
(
"VLLM_XLA_USE_SPMD"
,
"0"
))),
"VLLM_FUSED_MOE_CHUNK_SIZE"
:
lambda
:
int
(
os
.
getenv
(
"VLLM_FUSED_MOE_CHUNK_SIZE"
,
"32768"
)),
...
...
@@ -541,6 +564,10 @@ environment_variables: dict[str, Callable[[], Any]] = {
"VLLM_RPC_TIMEOUT"
:
lambda
:
int
(
os
.
getenv
(
"VLLM_RPC_TIMEOUT"
,
"10000"
)),
# Timeout in seconds for keeping HTTP connections alive in API server
"VLLM_HTTP_TIMEOUT_KEEP_ALIVE"
:
lambda
:
int
(
os
.
environ
.
get
(
"VLLM_HTTP_TIMEOUT_KEEP_ALIVE"
,
"5"
)),
# a list of plugin names to load, separated by commas.
# if this is not set, it means all plugins will be loaded
# if this is set to an empty string, no plugins will be loaded
...
...
@@ -746,6 +773,10 @@ environment_variables: dict[str, Callable[[], Any]] = {
"VLLM_DP_MASTER_PORT"
:
lambda
:
int
(
os
.
getenv
(
"VLLM_DP_MASTER_PORT"
,
"0"
)),
# Randomize inputs during dummy runs when using Data Parallel
"VLLM_RANDOMIZE_DP_DUMMY_INPUTS"
:
lambda
:
os
.
environ
.
get
(
"VLLM_RANDOMIZE_DP_DUMMY_INPUTS"
,
"0"
)
==
"1"
,
# Whether to use S3 path for model loading in CI via RunAI Streamer
"VLLM_CI_USE_S3"
:
lambda
:
os
.
environ
.
get
(
"VLLM_CI_USE_S3"
,
"0"
)
==
"1"
,
...
...
@@ -813,6 +844,8 @@ environment_variables: dict[str, Callable[[], Any]] = {
# Available options:
# - "naive": naive all2all implementation using all-reduce
# - "pplx": use pplx kernels
# - "deepep_high_throughput", use deepep high-throughput kernels
# - "deepep_low_latency", use deepep low-latency kernels
"VLLM_ALL2ALL_BACKEND"
:
lambda
:
os
.
getenv
(
"VLLM_ALL2ALL_BACKEND"
,
"naive"
),
...
...
@@ -822,6 +855,21 @@ environment_variables: dict[str, Callable[[], Any]] = {
# This is used to prevent the kernel from running out of memory.
"VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE"
:
lambda
:
int
(
os
.
getenv
(
"VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE"
,
"163840"
)),
# Regex timeout for use by the vLLM tool parsing plugins.
"VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS"
:
lambda
:
int
(
os
.
getenv
(
"VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS"
,
"1"
)),
# Reduce CPU usage when vLLM is idle. Enabling this will incur small
# latency penalty when a request eventually comes.
"VLLM_SLEEP_WHEN_IDLE"
:
lambda
:
bool
(
int
(
os
.
getenv
(
"VLLM_SLEEP_WHEN_IDLE"
,
"0"
))),
# Control the max chunk bytes (in MB) for the rpc message queue.
# Object larger than this threshold will be broadcast to worker
# processes via zmq.
"VLLM_MQ_MAX_CHUNK_BYTES_MB"
:
lambda
:
int
(
os
.
getenv
(
"VLLM_MQ_MAX_CHUNK_BYTES_MB"
,
"16"
)),
}
# --8<-- [end:env-vars-definition]
...
...
@@ -884,7 +932,7 @@ def compute_hash() -> str:
"VLLM_USE_TRITON_AWQ"
,
"VLLM_DP_RANK"
,
"VLLM_DP_SIZE"
,
"VLLM_
TEST
_STANDALONE_COMPILE"
,
"VLLM_
USE
_STANDALONE_COMPILE"
,
]
for
key
in
environment_variables_to_hash
:
if
key
in
environment_variables
:
...
...
vllm/executor/executor_base.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
asyncio
import
time
...
...
vllm/executor/mp_distributed_executor.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
asyncio
import
os
...
...
Prev
1
…
46
47
48
49
50
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment