Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
dcb5624a
Commit
dcb5624a
authored
Apr 29, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.8.5' into v0.8.5-dev
parents
55880ca2
ba41cc90
Changes
554
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1302 additions
and
614 deletions
+1302
-614
vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py
...tributed/kv_transfer/kv_connector/v1/lmcache_connector.py
+131
-0
vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py
...d/kv_transfer/kv_connector/v1/shared_storage_connector.py
+383
-0
vllm/distributed/kv_transfer/kv_connector_agent.py
vllm/distributed/kv_transfer/kv_connector_agent.py
+1
-1
vllm/distributed/kv_transfer/kv_lookup_buffer/mooncake_store.py
...istributed/kv_transfer/kv_lookup_buffer/mooncake_store.py
+1
-1
vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py
vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py
+23
-18
vllm/distributed/kv_transfer/kv_transfer_state.py
vllm/distributed/kv_transfer/kv_transfer_state.py
+70
-0
vllm/distributed/parallel_state.py
vllm/distributed/parallel_state.py
+59
-34
vllm/distributed/utils.py
vllm/distributed/utils.py
+21
-1
vllm/engine/arg_utils.py
vllm/engine/arg_utils.py
+412
-489
vllm/engine/async_llm_engine.py
vllm/engine/async_llm_engine.py
+23
-4
vllm/engine/llm_engine.py
vllm/engine/llm_engine.py
+14
-35
vllm/engine/metrics.py
vllm/engine/metrics.py
+3
-6
vllm/engine/multiprocessing/client.py
vllm/engine/multiprocessing/client.py
+4
-1
vllm/engine/output_processor/multi_step.py
vllm/engine/output_processor/multi_step.py
+1
-1
vllm/engine/protocol.py
vllm/engine/protocol.py
+6
-1
vllm/entrypoints/chat_utils.py
vllm/entrypoints/chat_utils.py
+53
-20
vllm/entrypoints/cli/benchmark/latency.py
vllm/entrypoints/cli/benchmark/latency.py
+29
-0
vllm/entrypoints/cli/benchmark/main.py
vllm/entrypoints/cli/benchmark/main.py
+4
-2
vllm/entrypoints/cli/benchmark/throughput.py
vllm/entrypoints/cli/benchmark/throughput.py
+29
-0
vllm/entrypoints/cli/collect_env.py
vllm/entrypoints/cli/collect_env.py
+35
-0
No files found.
Too many changes to show.
To preserve performance only
554 of 554+
files are displayed.
Plain diff
Email patch
vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py
0 → 100644
View file @
dcb5624a
# SPDX-License-Identifier: Apache-2.0
from
typing
import
TYPE_CHECKING
import
torch
from
lmcache.integration.vllm.vllm_v1_adapter
import
LMCacheConnectorV1Impl
from
vllm.config
import
VllmConfig
from
vllm.distributed.kv_transfer.kv_connector.v1.base
import
(
KVConnectorBase_V1
,
KVConnectorMetadata
,
KVConnectorRole
)
from
vllm.logger
import
init_logger
from
vllm.v1.core.sched.output
import
SchedulerOutput
if
TYPE_CHECKING
:
from
vllm.attention.backends.abstract
import
AttentionMetadata
from
vllm.forward_context
import
ForwardContext
from
vllm.v1.request
import
Request
logger
=
init_logger
(
__name__
)
class
LMCacheConnectorV1
(
KVConnectorBase_V1
):
def
__init__
(
self
,
vllm_config
:
"VllmConfig"
,
role
:
KVConnectorRole
):
super
().
__init__
(
vllm_config
=
vllm_config
,
role
=
role
)
self
.
_lmcache_engine
=
LMCacheConnectorV1Impl
(
vllm_config
,
role
,
self
)
# ==============================
# Worker-side methods
# ==============================
def
start_load_kv
(
self
,
forward_context
:
"ForwardContext"
,
**
kwargs
)
->
None
:
"""
Start loading the KV cache from the connector to vLLM's paged
KV buffer. This is called from the forward context before the
forward pass to enable async loading during model execution.
Args:
forward_context (ForwardContext): the forward context.
**kwargs: additional arguments for the load operation
Note:
The number of elements in kv_caches and layer_names should be
the same.
"""
self
.
_lmcache_engine
.
start_load_kv
(
forward_context
,
**
kwargs
)
def
wait_for_layer_load
(
self
,
layer_name
:
str
)
->
None
:
"""
Block until the KV for a specific layer is loaded into vLLM's
paged buffer. This is called from within attention layer to ensure
async copying from start_load_kv is complete.
This interface will be useful for layer-by-layer pipelining.
Args:
layer_name: the name of that layer
"""
self
.
_lmcache_engine
.
wait_for_layer_load
(
layer_name
)
def
save_kv_layer
(
self
,
layer_name
:
str
,
kv_layer
:
torch
.
Tensor
,
attn_metadata
:
"AttentionMetadata"
,
**
kwargs
)
->
None
:
"""
Start saving the a layer of KV cache from vLLM's paged buffer
to the connector. This is called from within attention layer to
enable async copying during execution.
Args:
layer_name (str): the name of the layer.
kv_layer (torch.Tensor): the paged KV buffer of the current
layer in vLLM.
attn_metadata (AttentionMetadata): the attention metadata.
**kwargs: additional arguments for the save operation.
"""
self
.
_lmcache_engine
.
save_kv_layer
(
layer_name
,
kv_layer
,
attn_metadata
,
**
kwargs
)
def
wait_for_save
(
self
):
"""
Block until all the save operations is done. This is called
as the forward context exits to ensure that the async saving
from save_kv_layer is complete before finishing the forward.
This prevents overwrites of paged KV buffer before saving done.
"""
self
.
_lmcache_engine
.
wait_for_save
()
# ==============================
# Scheduler-side methods
# ==============================
def
get_num_new_matched_tokens
(
self
,
request
:
"Request"
,
num_computed_tokens
:
int
,
)
->
int
:
"""
Get number of new tokens that can be loaded from the
external KV cache beyond the num_computed_tokens.
Args:
request (Request): the request object.
num_computed_tokens (int): the number of locally
computed tokens for this request
Returns:
the number of tokens that can be loaded from the
external KV cache beyond what is already computed.
"""
return
self
.
_lmcache_engine
.
get_num_new_matched_tokens
(
request
,
num_computed_tokens
)
def
update_state_after_alloc
(
self
,
request
:
"Request"
,
num_external_tokens
:
int
):
"""
Update KVConnector state after block allocation.
"""
self
.
_lmcache_engine
.
update_state_after_alloc
(
request
,
num_external_tokens
)
def
build_connector_meta
(
self
,
scheduler_output
:
SchedulerOutput
)
->
KVConnectorMetadata
:
"""
Build the connector metadata for this step.
This function should NOT modify fields in the scheduler_output.
Also, calling this function will reset the state of the connector.
Args:
scheduler_output (SchedulerOutput): the scheduler output object.
"""
return
self
.
_lmcache_engine
.
build_connector_meta
(
scheduler_output
)
vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py
0 → 100644
View file @
dcb5624a
# SPDX-License-Identifier: Apache-2.0
import
hashlib
import
os
from
dataclasses
import
dataclass
from
typing
import
TYPE_CHECKING
import
safetensors
import
torch
from
vllm.config
import
VllmConfig
from
vllm.distributed.kv_transfer.kv_connector.v1.base
import
(
KVConnectorBase_V1
,
KVConnectorMetadata
,
KVConnectorRole
)
from
vllm.logger
import
init_logger
from
vllm.v1.attention.backends.mla.common
import
MLACommonMetadata
from
vllm.v1.core.sched.output
import
SchedulerOutput
if
TYPE_CHECKING
:
from
vllm.attention.backends.abstract
import
AttentionMetadata
from
vllm.forward_context
import
ForwardContext
from
vllm.v1.request
import
Request
logger
=
init_logger
(
__name__
)
@
dataclass
class
ReqMeta
:
# Request tokens
token_ids
:
torch
.
Tensor
# Slot mappings, should have the same length as token_ids
slot_mapping
:
torch
.
Tensor
# Is store or load
is_store
:
bool
@
staticmethod
def
make_meta
(
token_ids
:
list
[
int
],
block_ids
:
list
[
int
],
block_size
:
int
,
is_store
:
bool
)
->
"ReqMeta"
:
valid_num_tokens
=
align_to_block_size
(
len
(
token_ids
),
block_size
)
token_ids_tensor
=
torch
.
tensor
(
token_ids
)[:
valid_num_tokens
]
block_ids_tensor
=
torch
.
tensor
(
block_ids
)
num_blocks
=
block_ids_tensor
.
shape
[
0
]
block_offsets
=
torch
.
arange
(
0
,
block_size
)
slot_mapping
=
block_offsets
.
reshape
((
1
,
block_size
))
+
\
block_ids_tensor
.
reshape
((
num_blocks
,
1
))
*
block_size
slot_mapping
=
slot_mapping
.
flatten
()[:
valid_num_tokens
]
return
ReqMeta
(
token_ids
=
token_ids_tensor
,
slot_mapping
=
slot_mapping
,
is_store
=
is_store
,
)
@
dataclass
class
SharedStorageConnectorMetadata
(
KVConnectorMetadata
):
requests
:
list
[
ReqMeta
]
def
__init__
(
self
):
self
.
requests
=
[]
def
add_request
(
self
,
token_ids
:
list
[
int
],
block_ids
:
list
[
int
],
block_size
:
int
,
is_store
:
bool
,
)
->
None
:
self
.
requests
.
append
(
ReqMeta
.
make_meta
(
token_ids
,
block_ids
,
block_size
,
is_store
))
class
SharedStorageConnector
(
KVConnectorBase_V1
):
# NOTE: This is Simple debug implementation of the KV connector.
# It save / load the KV cache to / from the disk.
# It does extra work which will overwrite the existing prefix-cache in GPU
# - to remove the overhead, need to add some "mask" in the ReqMeta class
def
__init__
(
self
,
vllm_config
:
"VllmConfig"
,
role
:
KVConnectorRole
):
super
().
__init__
(
vllm_config
=
vllm_config
,
role
=
role
)
self
.
_block_size
=
vllm_config
.
cache_config
.
block_size
self
.
_requests_need_load
:
dict
[
str
,
Request
]
=
{}
transfer_config
=
vllm_config
.
kv_transfer_config
self
.
_storage_path
=
transfer_config
.
get_from_extra_config
(
"shared_storage_path"
,
"/tmp"
)
logger
.
info
(
vllm_config
.
kv_transfer_config
)
logger
.
info
(
"Shared storage path is %s"
,
self
.
_storage_path
)
def
start_load_kv
(
self
,
forward_context
:
"ForwardContext"
,
**
kwargs
)
->
None
:
"""Start loading the KV cache from the connector buffer to vLLM's
paged KV buffer.
Args:
forward_context (ForwardContext): the forward context.
**kwargs: additional arguments for the load operation
Note:
The number of elements in kv_caches and layer_names should be
the same.
"""
attn_metadata
=
forward_context
.
attn_metadata
def
inject_kv_into_layer
(
dst_kv_cache_layer
:
torch
.
Tensor
,
src_kv_cache
:
torch
.
Tensor
,
slot_mapping
:
torch
.
Tensor
,
)
->
None
:
"""Inject the KV cache into the layer.
Args:
dst_kv_cache_layer (torch.Tensor): the destination KV cache
layer. In shape [2, num_pages, page_size, xxx] if not
using MLA, [num_pages, page_size, xxx] otherwise.
src_kv_cache (torch.Tensor): the source KV cache. In shape
[2, num_tokens, xxx] if not using MLA, [num_tokens, xxx]
otherwise.
slot_mapping (torch.Tensor): the slot mapping. In shape
[num_tokens].
"""
dst_kv_cache_layer_shape
=
dst_kv_cache_layer
.
shape
if
isinstance
(
attn_metadata
,
MLACommonMetadata
):
num_pages
=
dst_kv_cache_layer_shape
[
0
]
page_size
=
dst_kv_cache_layer_shape
[
1
]
dst_kv_cache_layer
=
dst_kv_cache_layer
.
reshape
(
num_pages
*
page_size
,
-
1
)
dst_kv_cache_layer
[
slot_mapping
,
...]
=
src_kv_cache
dst_kv_cache_layer
.
reshape
(
dst_kv_cache_layer_shape
)
else
:
num_pages
=
dst_kv_cache_layer_shape
[
1
]
page_size
=
dst_kv_cache_layer_shape
[
2
]
dst_kv_cache_layer
=
dst_kv_cache_layer
.
reshape
(
2
,
num_pages
*
page_size
,
-
1
)
dst_kv_cache_layer
[:,
slot_mapping
,
...]
=
src_kv_cache
dst_kv_cache_layer
.
reshape
(
dst_kv_cache_layer_shape
)
# Get the metadata
metadata
:
KVConnectorMetadata
=
\
self
.
_get_connector_metadata
()
assert
isinstance
(
metadata
,
SharedStorageConnectorMetadata
)
if
metadata
is
None
:
logger
.
warning
(
"In connector.start_load_kv, but the connector metadata is None"
)
return
attn_metadata
=
forward_context
.
attn_metadata
if
attn_metadata
is
None
:
logger
.
warning
(
"In connector.start_load_kv, but the attn_metadata is None"
)
return
# Load the KV for each request each layer
for
request
in
metadata
.
requests
:
if
request
.
is_store
:
continue
logger
.
info
(
"Inject KV cache of %d tokens to the paged memory"
,
len
(
request
.
slot_mapping
))
for
layer_name
in
forward_context
.
no_compile_layers
:
attn_layer
=
forward_context
.
no_compile_layers
[
layer_name
]
kv_cache_layer
=
attn_layer
.
kv_cache
[
\
forward_context
.
virtual_engine
]
filename
=
self
.
_generate_filename_debug
(
layer_name
,
request
.
token_ids
)
kv_cache
=
safetensors
.
torch
.
load_file
(
filename
)[
"kv_cache"
].
cuda
()
inject_kv_into_layer
(
kv_cache_layer
,
kv_cache
,
request
.
slot_mapping
)
def
wait_for_layer_load
(
self
,
layer_name
:
str
)
->
None
:
"""Blocking until the KV for a specific layer is loaded into vLLM's
paged buffer.
This interface will be useful for layer-by-layer pipelining.
Args:
layer_name: the name of that layer
"""
return
def
save_kv_layer
(
self
,
layer_name
:
str
,
kv_layer
:
torch
.
Tensor
,
attn_metadata
:
"AttentionMetadata"
,
**
kwargs
)
->
None
:
"""Start saving the KV cache of the layer from vLLM's paged buffer
to the connector.
Args:
layer_name (str): the name of the layer.
kv_layer (torch.Tensor): the paged KV buffer of the current
layer in vLLM.
attn_metadata (AttentionMetadata): the attention metadata.
**kwargs: additional arguments for the save operation.
"""
def
extract_kv_from_layer
(
layer
:
torch
.
Tensor
,
slot_mapping
:
torch
.
Tensor
,
)
->
torch
.
Tensor
:
"""Extract the KV cache from the layer.
Assume the shape of the layer is (2, num_pages, page_size, xxx)
if MLA is not used, and (num_pages, page_size, xxx) otherwise.
"""
if
isinstance
(
attn_metadata
,
MLACommonMetadata
):
num_pages
,
page_size
=
layer
.
shape
[
0
],
layer
.
shape
[
1
]
return
layer
.
reshape
(
num_pages
*
page_size
,
-
1
)[
slot_mapping
,
...]
num_pages
,
page_size
=
layer
.
shape
[
1
],
layer
.
shape
[
2
]
return
layer
.
reshape
(
2
,
num_pages
*
page_size
,
-
1
)[:,
slot_mapping
,
...]
connector_metadata
=
self
.
_get_connector_metadata
()
assert
isinstance
(
connector_metadata
,
SharedStorageConnectorMetadata
)
for
request
in
connector_metadata
.
requests
:
if
request
.
is_store
:
filename
=
self
.
_generate_filename_debug
(
layer_name
,
request
.
token_ids
)
kv_cache
=
extract_kv_from_layer
(
kv_layer
,
request
.
slot_mapping
)
tensors
=
{
"kv_cache"
:
kv_cache
.
detach
().
cpu
()}
safetensors
.
torch
.
save_file
(
tensors
,
filename
)
def
wait_for_save
(
self
):
return
def
get_num_new_matched_tokens
(
self
,
request
:
"Request"
,
num_computed_tokens
:
int
,
)
->
int
:
"""
Get number of new tokens that can be loaded from the
external KV cache beyond the num_computed_tokens.
Args:
request (Request): the request object.
num_computed_tokens (int): the number of locally
computed tokens for this request
Returns:
the number of tokens that can be loaded from the
external KV cache beyond what is already computed.
"""
# NOTE: in this debug implementation, we assume that the prompt is
# cached_prompt + newly_generated_single_token
# Therefore, we use prompt_token_ids[:-1] to determine the folder name
# NOTE: in current v1 scheduler, the num_computed_tokens is aligned
# with the block granularity. And it expects the returned blocks and
# num_computed_tokens to also be aligned with the block granularity.
if
not
self
.
_found_match_for_request
(
request
):
return
0
logger
.
info
(
"External Cache Hit!"
)
# Now, first num_tokens_to_check tokens are hit, we need to prepare
# the metadata for the worker connector to correctly load the KV
num_tokens_to_check
=
align_to_block_size
(
len
(
request
.
prompt_token_ids
)
-
1
,
self
.
_block_size
)
return
num_tokens_to_check
-
num_computed_tokens
def
update_state_after_alloc
(
self
,
request
:
"Request"
,
num_external_tokens
:
int
):
"""
Update KVConnector state after block allocation.
If blocks were allocated, add to _requests_need_load,
such that we load the KVs in the next forward pass.
"""
if
num_external_tokens
>
0
:
self
.
_requests_need_load
[
request
.
request_id
]
=
request
def
build_connector_meta
(
self
,
scheduler_output
:
SchedulerOutput
,
)
->
KVConnectorMetadata
:
"""Build the connector metadata for this step.
This function should NOT modify any fields in the scheduler_output.
Also, calling this function will reset the state of the connector.
Args:
scheduler_output (SchedulerOutput): the scheduler output object.
"""
meta
=
SharedStorageConnectorMetadata
()
total_need_load
=
0
for
new_req
in
scheduler_output
.
scheduled_new_reqs
:
if
new_req
.
req_id
in
self
.
_requests_need_load
:
meta
.
add_request
(
token_ids
=
new_req
.
prompt_token_ids
,
block_ids
=
new_req
.
block_ids
,
block_size
=
self
.
_block_size
,
is_store
=
False
)
total_need_load
+=
1
else
:
# NOTE: here, we set the store and load being exclusive,
# but a single request can have both store and load.
# NOTE(rob): for this debug implementation, we only cache
# the original prompt tokens.
if
not
self
.
_found_match_for_request
(
new_req
):
meta
.
add_request
(
token_ids
=
new_req
.
prompt_token_ids
,
block_ids
=
new_req
.
block_ids
,
block_size
=
self
.
_block_size
,
is_store
=
True
)
for
cached_req
in
scheduler_output
.
scheduled_cached_reqs
:
# NOTE(rob): here we rely on the resumed requests being
# the first N requests in the list scheduled_cache_reqs.
if
not
cached_req
.
resumed_from_preemption
:
break
if
cached_req
.
req_id
in
self
.
_requests_need_load
:
# NOTE(rob): cached_req_data does not have the full
# list of token ids (only new tokens). So we look it
# up in the actual request object.
request
=
self
.
_requests_need_load
[
cached_req
.
req_id
]
total_tokens
=
(
len
(
cached_req
.
new_token_ids
)
+
cached_req
.
num_computed_tokens
)
token_ids
=
request
.
all_token_ids
[:
total_tokens
]
# NOTE(rob): For resumed req, new_block_ids is all
# of the block_ids for the request.
block_ids
=
cached_req
.
new_block_ids
meta
.
add_request
(
token_ids
=
token_ids
,
block_ids
=
block_ids
,
block_size
=
self
.
_block_size
,
is_store
=
False
)
total_need_load
+=
1
assert
total_need_load
==
len
(
self
.
_requests_need_load
)
self
.
_requests_need_load
.
clear
()
return
meta
# ==============================
# Helper functions
# ==============================
def
_found_match_for_request
(
self
,
request
:
"Request"
,
)
->
bool
:
"""Check if the cache is hit for the request.
"""
num_tokens_to_check
=
align_to_block_size
(
len
(
request
.
prompt_token_ids
)
-
1
,
self
.
_block_size
)
foldername
=
self
.
_generate_foldername_debug
(
torch
.
tensor
(
request
.
prompt_token_ids
)[:
num_tokens_to_check
],
create_folder
=
False
)
return
os
.
path
.
exists
(
foldername
)
def
_generate_foldername_debug
(
self
,
input_ids
:
torch
.
Tensor
,
create_folder
=
False
,
)
->
str
:
"""Generate a folder name based on the hash of the bytes of the input
ids.
"""
input_ids_bytes
=
input_ids
.
numpy
().
tobytes
()
input_ids_hash
=
hashlib
.
md5
(
input_ids_bytes
,
usedforsecurity
=
False
).
hexdigest
()
foldername
=
os
.
path
.
join
(
self
.
_storage_path
,
input_ids_hash
)
if
create_folder
:
os
.
makedirs
(
foldername
,
exist_ok
=
True
)
return
foldername
def
_generate_filename_debug
(
self
,
layer_name
:
str
,
input_ids
:
torch
.
Tensor
,
)
->
str
:
"""Generate a file name based on the layer name and the hash
of the bytes of the input ids.
"""
foldername
=
self
.
_generate_foldername_debug
(
input_ids
,
create_folder
=
True
)
return
os
.
path
.
join
(
foldername
,
f
"
{
layer_name
}
.safetensors"
)
def
align_to_block_size
(
num_tokens
:
int
,
block_size
)
->
int
:
"""Align the number of tokens to the block size.
"""
return
(
num_tokens
-
1
)
//
block_size
*
block_size
vllm/distributed/kv_transfer/kv_
transfe
r_agent.py
→
vllm/distributed/kv_transfer/kv_
connecto
r_agent.py
View file @
dcb5624a
...
...
@@ -46,7 +46,7 @@ class KVTransferAgent:
assert
self
.
config
.
kv_transfer_config
.
is_kv_transfer_instance
,
"KV"
\
"TransferAgent should only be used when kv_connector is set."
self
.
connector
=
KVConnectorFactory
.
create_connector
(
self
.
connector
=
KVConnectorFactory
.
create_connector
_v0
(
rank
,
local_rank
,
config
)
def
send_kv_caches_and_hidden_states
(
...
...
vllm/distributed/kv_transfer/kv_lookup_buffer/mooncake_store.py
View file @
dcb5624a
...
...
@@ -70,7 +70,7 @@ class MooncakeStore(KVStoreBufferBase):
):
try
:
from
mooncake
_vllm_adap
tor
import
MooncakeDistributedStore
from
mooncake
.s
tor
e
import
MooncakeDistributedStore
except
ImportError
as
e
:
raise
ImportError
(
"Please install mooncake by following the instructions at "
...
...
vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py
View file @
dcb5624a
...
...
@@ -2,6 +2,7 @@
import
json
import
os
import
struct
from
concurrent.futures
import
ThreadPoolExecutor
from
dataclasses
import
dataclass
from
typing
import
Optional
,
Union
...
...
@@ -57,14 +58,14 @@ class MooncakeTransferEngine:
def
__init__
(
self
,
kv_rank
:
int
,
local_rank
:
int
):
try
:
import
mooncake
_vllm_adaptor
as
mva
from
mooncake
.engine
import
TransferEngine
except
ImportError
as
e
:
raise
ImportError
(
"Please install mooncake by following the instructions at "
"https://github.com/kvcache-ai/Mooncake/blob/main/doc/en/build.md "
# noqa: E501
"to run vLLM with MooncakeConnector."
)
from
e
self
.
engine
=
mva
.
mooncake_vllm_adaptor
()
self
.
engine
=
TransferEngine
()
self
.
local_rank
=
local_rank
try
:
...
...
@@ -115,14 +116,14 @@ class MooncakeTransferEngine:
p_rank_offset
=
int
(
p_port
)
+
8
+
self
.
local_rank
*
2
d_rank_offset
=
int
(
d_port
)
+
8
+
self
.
local_rank
*
2
if
kv_rank
==
0
:
self
.
sender_socket
.
bind
(
f
"tcp://
*
:
{
p_rank_offset
+
1
}
"
)
self
.
sender_socket
.
bind
(
f
"tcp://
{
p_host
}
:
{
p_rank_offset
+
1
}
"
)
self
.
receiver_socket
.
connect
(
f
"tcp://
{
d_host
}
:
{
d_rank_offset
+
1
}
"
)
self
.
sender_ack
.
connect
(
f
"tcp://
{
d_host
}
:
{
d_rank_offset
+
2
}
"
)
self
.
receiver_ack
.
bind
(
f
"tcp://
*
:
{
p_rank_offset
+
2
}
"
)
self
.
receiver_ack
.
bind
(
f
"tcp://
{
p_host
}
:
{
p_rank_offset
+
2
}
"
)
else
:
self
.
receiver_socket
.
connect
(
f
"tcp://
{
p_host
}
:
{
p_rank_offset
+
1
}
"
)
self
.
sender_socket
.
bind
(
f
"tcp://
*
:
{
d_rank_offset
+
1
}
"
)
self
.
receiver_ack
.
bind
(
f
"tcp://
*
:
{
d_rank_offset
+
2
}
"
)
self
.
sender_socket
.
bind
(
f
"tcp://
{
d_host
}
:
{
d_rank_offset
+
1
}
"
)
self
.
receiver_ack
.
bind
(
f
"tcp://
{
d_host
}
:
{
d_rank_offset
+
2
}
"
)
self
.
sender_ack
.
connect
(
f
"tcp://
{
p_host
}
:
{
p_rank_offset
+
2
}
"
)
def
initialize
(
self
,
local_hostname
:
str
,
metadata_server
:
str
,
...
...
@@ -140,12 +141,12 @@ class MooncakeTransferEngine:
"Mooncake Configuration error. `metadata_backend`"
f
" should be one of
{
supported_backend
}
."
)
self
.
engine
.
initialize
E
xt
(
local_hostname
,
metadata_server
,
protocol
,
device_name
,
metadata_backend
)
self
.
engine
.
initialize
_e
xt
(
local_hostname
,
metadata_server
,
protocol
,
device_name
,
metadata_backend
)
def
allocate_managed_buffer
(
self
,
length
:
int
)
->
int
:
"""Allocate a managed buffer of the specified length."""
ret
=
self
.
engine
.
allocate
M
anaged
B
uffer
(
length
)
ret
=
self
.
engine
.
allocate
_m
anaged
_b
uffer
(
length
)
if
ret
<=
0
:
logger
.
error
(
"Allocation Return Error"
)
raise
Exception
(
"Allocation Return Error"
)
...
...
@@ -153,13 +154,13 @@ class MooncakeTransferEngine:
def
free_managed_buffer
(
self
,
buffer
:
int
,
length
:
int
)
->
int
:
"""Free a previously allocated managed buffer."""
return
self
.
engine
.
free
M
anaged
B
uffer
(
buffer
,
length
)
return
self
.
engine
.
free
_m
anaged
_b
uffer
(
buffer
,
length
)
def
transfer_sync
(
self
,
buffer
:
int
,
peer_buffer_address
:
int
,
length
:
int
)
->
int
:
"""Synchronously transfer data to the specified address."""
ret
=
self
.
engine
.
transfer
S
ync
(
self
.
remote_url
,
buffer
,
peer_buffer_address
,
length
)
ret
=
self
.
engine
.
transfer
_s
ync
_read
(
self
.
remote_url
,
buffer
,
peer_buffer_address
,
length
)
if
ret
<
0
:
logger
.
error
(
"Transfer Return Error"
)
raise
Exception
(
"Transfer Return Error"
)
...
...
@@ -168,15 +169,15 @@ class MooncakeTransferEngine:
def
write_bytes_to_buffer
(
self
,
buffer
:
int
,
user_data
:
bytes
,
length
:
int
)
->
int
:
"""Write bytes to the allocated buffer."""
return
self
.
engine
.
write
B
ytes
ToB
uffer
(
buffer
,
user_data
,
length
)
return
self
.
engine
.
write
_b
ytes
_to_b
uffer
(
buffer
,
user_data
,
length
)
def
read_bytes_from_buffer
(
self
,
buffer
:
int
,
length
:
int
)
->
bytes
:
"""Read bytes from the allocated buffer."""
return
self
.
engine
.
read
B
ytes
F
rom
B
uffer
(
buffer
,
length
)
return
self
.
engine
.
read
_b
ytes
_f
rom
_b
uffer
(
buffer
,
length
)
def
wait_for_ack
(
self
,
src_ptr
:
int
,
length
:
int
)
->
None
:
"""Asynchronously wait for ACK from the receiver."""
ack
=
self
.
sender_ack
.
recv
_pyobj
()
ack
=
self
.
sender_ack
.
recv
()
if
ack
!=
b
'ACK'
:
logger
.
error
(
"Failed to receive ACK from the receiver"
)
...
...
@@ -187,18 +188,22 @@ class MooncakeTransferEngine:
length
=
len
(
user_data
)
src_ptr
=
self
.
allocate_managed_buffer
(
length
)
self
.
write_bytes_to_buffer
(
src_ptr
,
user_data
,
length
)
self
.
sender_socket
.
send_pyobj
((
src_ptr
,
length
))
self
.
sender_socket
.
send_multipart
(
[
struct
.
pack
(
"!Q"
,
src_ptr
),
struct
.
pack
(
"!Q"
,
length
)])
self
.
buffer_cleaner
.
submit
(
self
.
wait_for_ack
,
src_ptr
,
length
)
def
recv_bytes
(
self
)
->
bytes
:
"""Receive bytes from the remote process."""
src_ptr
,
length
=
self
.
receiver_socket
.
recv_pyobj
()
data
=
self
.
receiver_socket
.
recv_multipart
()
src_ptr
=
struct
.
unpack
(
"!Q"
,
data
[
0
])[
0
]
length
=
struct
.
unpack
(
"!Q"
,
data
[
1
])[
0
]
dst_ptr
=
self
.
allocate_managed_buffer
(
length
)
self
.
transfer_sync
(
dst_ptr
,
src_ptr
,
length
)
ret
=
self
.
read_bytes_from_buffer
(
dst_ptr
,
length
)
# Buffer cleanup
self
.
receiver_ack
.
send
_pyobj
(
b
'ACK'
)
self
.
receiver_ack
.
send
(
b
'ACK'
)
self
.
free_managed_buffer
(
dst_ptr
,
length
)
return
ret
...
...
vllm/distributed/kv_transfer/kv_transfer_state.py
0 → 100644
View file @
dcb5624a
# SPDX-License-Identifier: Apache-2.0
from
typing
import
TYPE_CHECKING
,
Optional
from
vllm
import
envs
from
vllm.distributed.kv_transfer.kv_connector.base
import
KVConnectorBaseType
from
vllm.distributed.kv_transfer.kv_connector.factory
import
(
KVConnectorFactory
)
from
vllm.distributed.kv_transfer.kv_connector.v1
import
(
KVConnectorBase_V1
,
KVConnectorRole
)
from
vllm.distributed.parallel_state
import
get_world_group
if
TYPE_CHECKING
:
from
vllm.config
import
VllmConfig
_KV_CONNECTOR_AGENT
:
Optional
[
KVConnectorBaseType
]
=
None
def
get_kv_transfer_group
()
->
KVConnectorBaseType
:
assert
_KV_CONNECTOR_AGENT
is
not
None
,
(
"disaggregated KV cache transfer parallel group is not initialized"
)
return
_KV_CONNECTOR_AGENT
def
has_kv_transfer_group
()
->
bool
:
return
_KV_CONNECTOR_AGENT
is
not
None
def
is_v1_kv_transfer_group
(
connector
:
Optional
[
KVConnectorBaseType
]
=
None
)
->
bool
:
"""Check if the KV connector is the v1 connector.
If the argument is None, it will check the global KV connector
Args:
connector: The KV connector to check. If None, it will check the
global KV connector.
Note:
This function will no-longer be needed after the v1 KV connector
becomes the default.
"""
if
connector
is
None
:
connector
=
_KV_CONNECTOR_AGENT
if
connector
is
None
:
return
False
return
isinstance
(
connector
,
KVConnectorBase_V1
)
def
ensure_kv_transfer_initialized
(
vllm_config
:
"VllmConfig"
)
->
None
:
"""
Initialize KV cache transfer parallel group.
"""
global
_KV_CONNECTOR_AGENT
if
vllm_config
.
kv_transfer_config
is
None
:
return
if
(
vllm_config
.
kv_transfer_config
.
is_kv_transfer_instance
and
_KV_CONNECTOR_AGENT
is
None
):
if
envs
.
VLLM_USE_V1
:
_KV_CONNECTOR_AGENT
=
KVConnectorFactory
.
create_connector_v1
(
config
=
vllm_config
,
role
=
KVConnectorRole
.
WORKER
)
else
:
_KV_CONNECTOR_AGENT
=
KVConnectorFactory
.
create_connector_v0
(
rank
=
get_world_group
().
rank
,
local_rank
=
get_world_group
().
local_rank
,
config
=
vllm_config
,
)
vllm/distributed/parallel_state.py
View file @
dcb5624a
...
...
@@ -29,15 +29,13 @@ from collections import namedtuple
from
contextlib
import
contextmanager
,
nullcontext
from
dataclasses
import
dataclass
from
multiprocessing
import
shared_memory
from
typing
import
(
TYPE_CHECKING
,
Any
,
Callable
,
Dict
,
List
,
Optional
,
Tuple
,
Union
)
from
typing
import
Any
,
Callable
,
Dict
,
List
,
Optional
,
Tuple
,
Union
from
unittest.mock
import
patch
import
torch
import
torch.distributed
from
torch.distributed
import
Backend
,
ProcessGroup
import
vllm.distributed.kv_transfer.kv_transfer_agent
as
kv_transfer
import
vllm.envs
as
envs
from
vllm.distributed.device_communicators.base_device_communicator
import
(
DeviceCommunicatorBase
)
...
...
@@ -46,9 +44,6 @@ from vllm.logger import init_logger
from
vllm.utils
import
(
direct_register_custom_op
,
resolve_obj_by_qualname
,
supports_custom_op
)
if
TYPE_CHECKING
:
from
vllm.config
import
VllmConfig
@
dataclass
class
GraphCaptureContext
:
...
...
@@ -118,6 +113,38 @@ def all_reduce_fake(tensor: torch.Tensor, group_name: str) -> torch.Tensor:
return
torch
.
empty_like
(
tensor
)
def
reduce_scatter
(
tensor
:
torch
.
Tensor
,
dim
:
int
,
world_size
:
int
,
group_name
:
str
)
->
torch
.
Tensor
:
assert
group_name
in
_groups
,
f
"Group
{
group_name
}
is not found."
group
=
_groups
[
group_name
]()
if
group
is
None
:
raise
ValueError
(
f
"Group
{
group_name
}
is destroyed."
)
return
group
.
reduce_scatter
(
tensor
,
dim
)
def
reduce_scatter_fake
(
tensor
:
torch
.
Tensor
,
dim
:
int
,
world_size
:
int
,
group_name
:
str
)
->
torch
.
Tensor
:
new_shape
=
list
(
tensor
.
shape
)
new_shape
[
dim
]
=
tensor
.
shape
[
dim
]
//
world_size
return
torch
.
empty
(
new_shape
,
dtype
=
tensor
.
dtype
,
device
=
tensor
.
device
)
def
all_gather
(
tensor
:
torch
.
Tensor
,
dim
:
int
,
world_size
:
int
,
group_name
:
str
)
->
torch
.
Tensor
:
assert
group_name
in
_groups
,
f
"Group
{
group_name
}
is not found."
group
=
_groups
[
group_name
]()
if
group
is
None
:
raise
ValueError
(
f
"Group
{
group_name
}
is destroyed."
)
return
group
.
all_gather
(
tensor
,
dim
)
def
all_gather_fake
(
tensor
:
torch
.
Tensor
,
dim
:
int
,
world_size
:
int
,
group_name
:
str
)
->
torch
.
Tensor
:
new_shape
=
list
(
tensor
.
shape
)
new_shape
[
dim
]
=
tensor
.
shape
[
dim
]
*
world_size
return
torch
.
empty
(
new_shape
,
dtype
=
tensor
.
dtype
,
device
=
tensor
.
device
)
if
supports_custom_op
():
from
vllm.platforms
import
current_platform
direct_register_custom_op
(
...
...
@@ -128,6 +155,20 @@ if supports_custom_op():
dispatch_key
=
current_platform
.
dispatch_key
,
)
direct_register_custom_op
(
op_name
=
"reduce_scatter"
,
op_func
=
reduce_scatter
,
mutates_args
=
[],
fake_impl
=
reduce_scatter_fake
,
)
direct_register_custom_op
(
op_name
=
"all_gather"
,
op_func
=
all_gather
,
mutates_args
=
[],
fake_impl
=
all_gather_fake
,
)
class
GroupCoordinator
:
"""
...
...
@@ -327,6 +368,18 @@ class GroupCoordinator:
return
self
.
device_communicator
.
all_gather
(
input_
,
dim
)
def
reduce_scatter
(
self
,
input_
:
torch
.
Tensor
,
dim
:
int
=
-
1
)
->
torch
.
Tensor
:
world_size
=
self
.
world_size
# Bypass the function if we are using only 1 GPU.
if
world_size
==
1
:
return
input_
assert
-
input_
.
dim
()
<=
dim
<
input_
.
dim
(),
(
f
"Invalid dim (
{
dim
}
) for input tensor with shape
{
input_
.
size
()
}
"
)
return
self
.
device_communicator
.
reduce_scatter
(
input_
,
dim
)
def
gather
(
self
,
input_
:
torch
.
Tensor
,
dst
:
int
=
0
,
...
...
@@ -772,14 +825,6 @@ def get_pp_group() -> GroupCoordinator:
# kept for backward compatibility
get_pipeline_model_parallel_group
=
get_pp_group
_KV_TRANSFER
:
Optional
[
kv_transfer
.
KVTransferAgent
]
=
None
def
get_kv_transfer_group
()
->
kv_transfer
.
KVTransferAgent
:
assert
_KV_TRANSFER
is
not
None
,
(
"disaggregated KV cache transfer parallel group is not initialized"
)
return
_KV_TRANSFER
@
contextmanager
def
graph_capture
(
device
:
torch
.
device
):
...
...
@@ -962,26 +1007,6 @@ def initialize_model_parallel(
_DP
.
rank_in_group
,
_PP
.
rank_in_group
,
_TP
.
rank_in_group
)
def
ensure_kv_transfer_initialized
(
vllm_config
:
"VllmConfig"
)
->
None
:
"""
Initialize KV cache transfer parallel group.
"""
global
_KV_TRANSFER
if
vllm_config
.
kv_transfer_config
is
None
:
return
if
all
([
vllm_config
.
kv_transfer_config
.
is_kv_transfer_instance
,
_KV_TRANSFER
is
None
]):
_KV_TRANSFER
=
kv_transfer
.
KVTransferAgent
(
rank
=
get_world_group
().
rank
,
local_rank
=
get_world_group
().
local_rank
,
config
=
vllm_config
)
def
ensure_model_parallel_initialized
(
tensor_model_parallel_size
:
int
,
pipeline_model_parallel_size
:
int
,
...
...
vllm/distributed/utils.py
View file @
dcb5624a
...
...
@@ -7,6 +7,7 @@
import
dataclasses
import
datetime
import
pickle
import
socket
import
time
from
collections
import
deque
from
typing
import
Any
,
Deque
,
Dict
,
Optional
,
Sequence
,
Tuple
...
...
@@ -123,6 +124,10 @@ class StatelessProcessGroup:
rank
:
int
world_size
:
int
store
:
torch
.
_C
.
_distributed_c10d
.
Store
# stores a reference to the socket so that the file descriptor stays alive
socket
:
Optional
[
socket
.
socket
]
data_expiration_seconds
:
int
=
3600
# 1 hour
# dst rank -> counter
...
...
@@ -234,18 +239,33 @@ class StatelessProcessGroup:
can call `StatelessProcessGroup.create` to form a group, and then process A, B,
C, and D can call `StatelessProcessGroup.create` to form another group.
"""
# noqa
launch_server
=
rank
==
0
if
launch_server
:
# listen on the specified interface (instead of 0.0.0.0)
listen_socket
=
socket
.
socket
(
socket
.
AF_INET
,
socket
.
SOCK_STREAM
)
listen_socket
.
setsockopt
(
socket
.
SOL_SOCKET
,
socket
.
SO_REUSEADDR
,
1
)
listen_socket
.
bind
((
host
,
port
))
listen_socket
.
listen
()
listen_fd
=
listen_socket
.
fileno
()
else
:
listen_socket
=
None
listen_fd
=
None
store
=
TCPStore
(
host_name
=
host
,
port
=
port
,
world_size
=
world_size
,
is_master
=
(
rank
==
0
)
,
is_master
=
launch_server
,
timeout
=
datetime
.
timedelta
(
seconds
=
store_timeout
),
use_libuv
=
False
,
# for now: github.com/pytorch/pytorch/pull/150215
master_listen_fd
=
listen_fd
,
)
return
StatelessProcessGroup
(
rank
=
rank
,
world_size
=
world_size
,
store
=
store
,
socket
=
listen_socket
,
data_expiration_seconds
=
data_expiration_seconds
)
...
...
vllm/engine/arg_utils.py
View file @
dcb5624a
# SPDX-License-Identifier: Apache-2.0
# yapf: disable
import
argparse
import
dataclasses
import
json
import
re
import
threading
from
dataclasses
import
MISSING
,
dataclass
,
fields
from
typing
import
(
TYPE_CHECKING
,
Any
,
Dict
,
List
,
Literal
,
Mapping
,
Optional
,
T
uple
,
T
ype
,
Union
,
cast
,
get_args
,
get_origin
)
from
typing
import
(
Any
,
Callable
,
Dict
,
List
,
Literal
,
Optional
,
Type
,
Type
Var
,
Union
,
cast
,
get_args
,
get_origin
)
import
torch
from
typing_extensions
import
TypeIs
,
deprecated
import
vllm.envs
as
envs
from
vllm
import
version
from
vllm.config
import
(
CacheConfig
,
CompilationConfig
,
ConfigFormat
,
DecodingConfig
,
DeviceConfig
,
HfOverrides
,
from
vllm.config
import
(
BlockSize
,
CacheConfig
,
CacheDType
,
CompilationConfig
,
ConfigFormat
,
ConfigType
,
DecodingConfig
,
Device
,
DeviceConfig
,
DistributedExecutorBackend
,
GuidedDecodingBackendV1
,
HfOverrides
,
KVTransferConfig
,
LoadConfig
,
LoadFormat
,
LoRAConfig
,
ModelConfig
,
ModelImpl
,
ObservabilityConfig
,
ParallelConfig
,
PoolerConfig
,
PromptAdapterConfig
,
SchedulerConfig
,
SpeculativeConfig
,
TaskOption
,
TokenizerPoolConfig
,
VllmConfig
,
get_attr_docs
)
ModelConfig
,
ModelImpl
,
MultiModalConfig
,
ObservabilityConfig
,
ParallelConfig
,
PoolerConfig
,
PrefixCachingHashAlgo
,
PromptAdapterConfig
,
SchedulerConfig
,
SchedulerPolicy
,
SpeculativeConfig
,
TaskOption
,
TokenizerPoolConfig
,
VllmConfig
,
get_attr_docs
,
get_field
)
from
vllm.executor.executor_base
import
ExecutorBase
from
vllm.logger
import
init_logger
from
vllm.model_executor.layers.quantization
import
QUANTIZATION_METHODS
...
...
@@ -28,33 +34,42 @@ from vllm.reasoning import ReasoningParserManager
from
vllm.test_utils
import
MODEL_WEIGHTS_S3_BUCKET
,
MODELS_ON_S3
from
vllm.transformers_utils.utils
import
check_gguf_file
from
vllm.usage.usage_lib
import
UsageContext
from
vllm.utils
import
FlexibleArgumentParser
,
StoreBoolean
,
is_in_ray_actor
from
vllm.utils
import
FlexibleArgumentParser
,
GiB_bytes
,
is_in_ray_actor
if
TYPE_CHECKING
:
from
vllm.transformers_utils.tokenizer_group
import
BaseTokenizerGroup
# yapf: enable
logger
=
init_logger
(
__name__
)
ALLOWED_DETAILED_TRACE_MODULES
=
[
"model"
,
"worker"
,
"all"
]
DEVICE_OPTIONS
=
[
"auto"
,
"cuda"
,
"neuron"
,
"cpu"
,
"tpu"
,
"xpu"
,
"hpu"
,
]
# object is used to allow for special typing forms
T
=
TypeVar
(
"T"
)
TypeHint
=
Union
[
type
[
Any
],
object
]
TypeHintT
=
Union
[
type
[
T
],
object
]
def
nullable_str
(
val
:
str
):
if
not
val
or
val
==
"None"
:
return
None
return
val
def
optional_type
(
return_type
:
Callable
[[
str
],
T
])
->
Callable
[[
str
],
Optional
[
T
]]:
def
_optional_type
(
val
:
str
)
->
Optional
[
T
]:
if
val
==
""
or
val
==
"None"
:
return
None
try
:
if
return_type
is
json
.
loads
and
not
re
.
match
(
"^{.*}$"
,
val
):
return
cast
(
T
,
nullable_kvs
(
val
))
return
return_type
(
val
)
except
ValueError
as
e
:
raise
argparse
.
ArgumentTypeError
(
f
"Value
{
val
}
cannot be converted to
{
return_type
}
."
)
from
e
return
_optional_type
def
nullable_kvs
(
val
:
str
)
->
Optional
[
Mapping
[
str
,
int
]]:
@
deprecated
(
"Passing a JSON argument as a string containing comma separated key=value "
"pairs is deprecated. This will be removed in v0.10.0. Please use a JSON "
"string instead."
)
def
nullable_kvs
(
val
:
str
)
->
dict
[
str
,
int
]:
"""Parses a string containing comma separate key [str] to value [int]
pairs into a dictionary.
...
...
@@ -64,10 +79,7 @@ def nullable_kvs(val: str) -> Optional[Mapping[str, int]]:
Returns:
Dictionary with parsed values.
"""
if
len
(
val
)
==
0
:
return
None
out_dict
:
Dict
[
str
,
int
]
=
{}
out_dict
:
dict
[
str
,
int
]
=
{}
for
item
in
val
.
split
(
","
):
kv_parts
=
[
part
.
lower
().
strip
()
for
part
in
item
.
split
(
"="
)]
if
len
(
kv_parts
)
!=
2
:
...
...
@@ -89,6 +101,105 @@ def nullable_kvs(val: str) -> Optional[Mapping[str, int]]:
return
out_dict
def
is_type
(
type_hint
:
TypeHint
,
type
:
TypeHintT
)
->
TypeIs
[
TypeHintT
]:
"""Check if the type hint is a specific type."""
return
type_hint
is
type
or
get_origin
(
type_hint
)
is
type
def
contains_type
(
type_hints
:
set
[
TypeHint
],
type
:
TypeHintT
)
->
bool
:
"""Check if the type hints contain a specific type."""
return
any
(
is_type
(
type_hint
,
type
)
for
type_hint
in
type_hints
)
def
get_type
(
type_hints
:
set
[
TypeHint
],
type
:
TypeHintT
)
->
TypeHintT
:
"""Get the specific type from the type hints."""
return
next
((
th
for
th
in
type_hints
if
is_type
(
th
,
type
)),
None
)
def
is_not_builtin
(
type_hint
:
TypeHint
)
->
bool
:
"""Check if the class is not a built-in type."""
return
type_hint
.
__module__
!=
"builtins"
def
get_kwargs
(
cls
:
ConfigType
)
->
dict
[
str
,
Any
]:
cls_docs
=
get_attr_docs
(
cls
)
kwargs
=
{}
for
field
in
fields
(
cls
):
# Get the default value of the field
default
=
field
.
default
if
field
.
default_factory
is
not
MISSING
:
default
=
field
.
default_factory
()
# Get the help text for the field
name
=
field
.
name
help
=
cls_docs
[
name
]
# Escape % for argparse
help
=
help
.
replace
(
"%"
,
"%%"
)
# Initialise the kwargs dictionary for the field
kwargs
[
name
]
=
{
"default"
:
default
,
"help"
:
help
}
# Get the set of possible types for the field
type_hints
:
set
[
TypeHint
]
=
set
()
if
get_origin
(
field
.
type
)
is
Union
:
type_hints
.
update
(
get_args
(
field
.
type
))
else
:
type_hints
.
add
(
field
.
type
)
# Set other kwargs based on the type hints
if
contains_type
(
type_hints
,
bool
):
# Creates --no-<name> and --<name> flags
kwargs
[
name
][
"action"
]
=
argparse
.
BooleanOptionalAction
elif
contains_type
(
type_hints
,
Literal
):
# Creates choices from Literal arguments
type_hint
=
get_type
(
type_hints
,
Literal
)
choices
=
sorted
(
get_args
(
type_hint
))
kwargs
[
name
][
"choices"
]
=
choices
choice_type
=
type
(
choices
[
0
])
assert
all
(
type
(
c
)
is
choice_type
for
c
in
choices
),
(
"All choices must be of the same type. "
f
"Got
{
choices
}
with types
{
[
type
(
c
)
for
c
in
choices
]
}
"
)
kwargs
[
name
][
"type"
]
=
choice_type
elif
contains_type
(
type_hints
,
tuple
):
type_hint
=
get_type
(
type_hints
,
tuple
)
types
=
get_args
(
type_hint
)
tuple_type
=
types
[
0
]
assert
all
(
t
is
tuple_type
for
t
in
types
if
t
is
not
Ellipsis
),
(
"All non-Ellipsis tuple elements must be of the same "
f
"type. Got
{
types
}
."
)
kwargs
[
name
][
"type"
]
=
tuple_type
kwargs
[
name
][
"nargs"
]
=
"+"
if
Ellipsis
in
types
else
len
(
types
)
elif
contains_type
(
type_hints
,
list
):
type_hint
=
get_type
(
type_hints
,
list
)
types
=
get_args
(
type_hint
)
assert
len
(
types
)
==
1
,
(
"List type must have exactly one type. Got "
f
"
{
type_hint
}
with types
{
types
}
"
)
kwargs
[
name
][
"type"
]
=
types
[
0
]
kwargs
[
name
][
"nargs"
]
=
"+"
elif
contains_type
(
type_hints
,
int
):
kwargs
[
name
][
"type"
]
=
int
elif
contains_type
(
type_hints
,
float
):
kwargs
[
name
][
"type"
]
=
float
elif
contains_type
(
type_hints
,
dict
):
# Dict arguments will always be optional
kwargs
[
name
][
"type"
]
=
optional_type
(
json
.
loads
)
elif
(
contains_type
(
type_hints
,
str
)
or
any
(
is_not_builtin
(
th
)
for
th
in
type_hints
)):
kwargs
[
name
][
"type"
]
=
str
else
:
raise
ValueError
(
f
"Unsupported type
{
type_hints
}
for argument
{
name
}
."
)
# If None is in type_hints, make the argument optional.
# But not if it's a bool, argparse will handle this better.
if
type
(
None
)
in
type_hints
and
not
contains_type
(
type_hints
,
bool
):
kwargs
[
name
][
"type"
]
=
optional_type
(
kwargs
[
name
][
"type"
])
if
kwargs
[
name
].
get
(
"choices"
):
kwargs
[
name
][
"choices"
].
append
(
"None"
)
return
kwargs
@
dataclass
class
EngineArgs
:
"""Arguments for vLLM engine."""
...
...
@@ -105,14 +216,15 @@ class EngineArgs:
load_format
:
str
=
LoadConfig
.
load_format
config_format
:
ConfigFormat
=
ConfigFormat
.
AUTO
dtype
:
str
=
'auto'
kv_cache_dtype
:
str
=
'auto'
kv_cache_dtype
:
CacheDType
=
CacheConfig
.
cache_dtype
seed
:
Optional
[
int
]
=
None
max_model_len
:
Optional
[
int
]
=
None
# Note: Specifying a custom executor backend by passing a class
# is intended for expert use only. The API may change without
# notice.
distributed_executor_backend
:
Optional
[
Union
[
str
,
Type
[
ExecutorBase
]]]
=
ParallelConfig
.
distributed_executor_backend
DistributedExecutorBackend
,
Type
[
ExecutorBase
]]]
=
ParallelConfig
.
distributed_executor_backend
# number of P/D disaggregation (or other disaggregation) workers
pipeline_parallel_size
:
int
=
ParallelConfig
.
pipeline_parallel_size
tensor_parallel_size
:
int
=
ParallelConfig
.
tensor_parallel_size
...
...
@@ -120,20 +232,23 @@ class EngineArgs:
enable_expert_parallel
:
bool
=
ParallelConfig
.
enable_expert_parallel
max_parallel_loading_workers
:
Optional
[
int
]
=
ParallelConfig
.
max_parallel_loading_workers
block_size
:
Optional
[
int
]
=
None
enable_prefix_caching
:
Optional
[
bool
]
=
None
prefix_caching_hash_algo
:
str
=
"builtin"
block_size
:
Optional
[
BlockSize
]
=
CacheConfig
.
block_size
enable_prefix_caching
:
Optional
[
bool
]
=
CacheConfig
.
enable_prefix_caching
prefix_caching_hash_algo
:
PrefixCachingHashAlgo
=
\
CacheConfig
.
prefix_caching_hash_algo
disable_sliding_window
:
bool
=
False
disable_cascade_attn
:
bool
=
False
use_v2_block_manager
:
bool
=
True
swap_space
:
float
=
4
# GiB
cpu_offload_gb
:
float
=
0
# GiB
gpu_memory_utilization
:
float
=
0.90
max_num_batched_tokens
:
Optional
[
int
]
=
None
max_num_partial_prefills
:
Optional
[
int
]
=
1
max_long_partial_prefills
:
Optional
[
int
]
=
1
long_prefill_token_threshold
:
Optional
[
int
]
=
0
max_num_seqs
:
Optional
[
int
]
=
None
swap_space
:
float
=
CacheConfig
.
swap_space
cpu_offload_gb
:
float
=
CacheConfig
.
cpu_offload_gb
gpu_memory_utilization
:
float
=
CacheConfig
.
gpu_memory_utilization
max_num_batched_tokens
:
Optional
[
int
]
=
SchedulerConfig
.
max_num_batched_tokens
max_num_partial_prefills
:
int
=
SchedulerConfig
.
max_num_partial_prefills
max_long_partial_prefills
:
int
=
SchedulerConfig
.
max_long_partial_prefills
long_prefill_token_threshold
:
int
=
\
SchedulerConfig
.
long_prefill_token_threshold
max_num_seqs
:
Optional
[
int
]
=
SchedulerConfig
.
max_num_seqs
max_logprobs
:
int
=
20
# Default value for OpenAI Chat Completions API
disable_log_stats
:
bool
=
False
revision
:
Optional
[
str
]
=
None
...
...
@@ -147,44 +262,52 @@ class EngineArgs:
enforce_eager
:
Optional
[
bool
]
=
None
max_seq_len_to_capture
:
int
=
8192
disable_custom_all_reduce
:
bool
=
ParallelConfig
.
disable_custom_all_reduce
tokenizer_pool_size
:
int
=
0
# Note: Specifying a tokenizer pool by passing a class
# is intended for expert use only. The API may change without
# notice.
tokenizer_pool_type
:
Union
[
str
,
Type
[
"BaseTokenizerGroup"
]]
=
"ray"
tokenizer_pool_extra_config
:
Optional
[
Dict
[
str
,
Any
]]
=
None
limit_mm_per_prompt
:
Optional
[
Mapping
[
str
,
int
]]
=
None
# The following three fields are deprecated and will be removed in a future
# release. Setting them will have no effect. Please remove them from your
# configurations.
tokenizer_pool_size
:
int
=
TokenizerPoolConfig
.
pool_size
tokenizer_pool_type
:
str
=
TokenizerPoolConfig
.
pool_type
tokenizer_pool_extra_config
:
dict
=
\
get_field
(
TokenizerPoolConfig
,
"extra_config"
)
limit_mm_per_prompt
:
dict
[
str
,
int
]
=
\
get_field
(
MultiModalConfig
,
"limit_per_prompt"
)
mm_processor_kwargs
:
Optional
[
Dict
[
str
,
Any
]]
=
None
disable_mm_preprocessor_cache
:
bool
=
False
# LoRA fields
enable_lora
:
bool
=
False
enable_lora_bias
:
bool
=
False
max_loras
:
int
=
1
max_lora_rank
:
int
=
16
enable_prompt_adapter
:
bool
=
False
max_prompt_adapters
:
int
=
1
max_prompt_adapter_token
:
int
=
0
fully_sharded_loras
:
bool
=
False
lora_extra_vocab_size
:
int
=
256
long_lora_scaling_factors
:
Optional
[
Tuple
[
float
]]
=
None
lora_dtype
:
Optional
[
Union
[
str
,
torch
.
dtype
]]
=
'auto'
max_cpu_loras
:
Optional
[
int
]
=
None
enable_lora_bias
:
bool
=
LoRAConfig
.
bias_enabled
max_loras
:
int
=
LoRAConfig
.
max_loras
max_lora_rank
:
int
=
LoRAConfig
.
max_lora_rank
fully_sharded_loras
:
bool
=
LoRAConfig
.
fully_sharded_loras
max_cpu_loras
:
Optional
[
int
]
=
LoRAConfig
.
max_cpu_loras
merge_lora
:
bool
=
False
lora_target_modules
:
Optional
[
List
[
str
]]
=
None
device
:
str
=
'auto'
num_scheduler_steps
:
int
=
1
multi_step_stream_outputs
:
bool
=
True
lora_target_modules
:
Optional
[
List
[
str
]]
=
LoRAConfig
.
lora_target_modules
lora_dtype
:
Optional
[
Union
[
str
,
torch
.
dtype
]]
=
LoRAConfig
.
lora_dtype
lora_extra_vocab_size
:
int
=
LoRAConfig
.
lora_extra_vocab_size
long_lora_scaling_factors
:
Optional
[
tuple
[
float
,
...]]
=
\
LoRAConfig
.
long_lora_scaling_factors
# PromptAdapter fields
enable_prompt_adapter
:
bool
=
False
max_prompt_adapters
:
int
=
PromptAdapterConfig
.
max_prompt_adapters
max_prompt_adapter_token
:
int
=
\
PromptAdapterConfig
.
max_prompt_adapter_token
device
:
Device
=
DeviceConfig
.
device
num_scheduler_steps
:
int
=
SchedulerConfig
.
num_scheduler_steps
multi_step_stream_outputs
:
bool
=
SchedulerConfig
.
multi_step_stream_outputs
ray_workers_use_nsight
:
bool
=
ParallelConfig
.
ray_workers_use_nsight
num_gpu_blocks_override
:
Optional
[
int
]
=
None
num_lookahead_slots
:
int
=
0
model_loader_extra_config
:
Optional
[
dict
]
=
LoadConfig
.
model_loader_extra_config
num_gpu_blocks_override
:
Optional
[
int
]
=
CacheConfig
.
num_gpu_blocks_override
num_lookahead_slots
:
int
=
SchedulerConfig
.
num_lookahead_slots
model_loader_extra_config
:
dict
=
\
get_field
(
LoadConfig
,
"model_loader_extra_config"
)
ignore_patterns
:
Optional
[
Union
[
str
,
List
[
str
]]]
=
LoadConfig
.
ignore_patterns
preemption_mode
:
Optional
[
str
]
=
Non
e
preemption_mode
:
Optional
[
str
]
=
SchedulerConfig
.
preemption_mod
e
scheduler_delay_factor
:
float
=
0.0
enable_chunked_prefill
:
Optional
[
bool
]
=
None
disable_chunked_mm_input
:
bool
=
False
scheduler_delay_factor
:
float
=
SchedulerConfig
.
delay_factor
enable_chunked_prefill
:
Optional
[
bool
]
=
SchedulerConfig
.
enable_chunked_prefill
disable_chunked_mm_input
:
bool
=
SchedulerConfig
.
disable_chunked_mm_input
guided_decoding_backend
:
str
=
DecodingConfig
.
guided_decoding_backend
logits_processor_pattern
:
Optional
[
str
]
=
None
...
...
@@ -197,8 +320,8 @@ class EngineArgs:
otlp_traces_endpoint
:
Optional
[
str
]
=
None
collect_detailed_traces
:
Optional
[
str
]
=
None
disable_async_output_proc
:
bool
=
False
scheduling_policy
:
Literal
[
"fcfs"
,
"priority"
]
=
"fcfs"
scheduler_cls
:
Union
[
str
,
Type
[
object
]]
=
"vllm.core.scheduler.S
cheduler
"
scheduling_policy
:
SchedulerPolicy
=
SchedulerConfig
.
policy
scheduler_cls
:
Union
[
str
,
Type
[
object
]]
=
SchedulerConfig
.
s
cheduler
_cls
override_neuron_config
:
Optional
[
Dict
[
str
,
Any
]]
=
None
override_pooler_config
:
Optional
[
PoolerConfig
]
=
None
...
...
@@ -213,11 +336,11 @@ class EngineArgs:
enable_sleep_mode
:
bool
=
False
model_impl
:
str
=
"auto"
calculate_kv_scales
:
Optional
[
bool
]
=
None
calculate_kv_scales
:
bool
=
CacheConfig
.
calculate_kv_scales
additional_config
:
Optional
[
Dict
[
str
,
Any
]]
=
None
enable_reasoning
:
Optional
[
bool
]
=
None
reasoning_parser
:
Optional
[
str
]
=
None
reasoning_parser
:
Optional
[
str
]
=
DecodingConfig
.
reasoning_backend
use_tqdm_on_load
:
bool
=
LoadConfig
.
use_tqdm_on_load
...
...
@@ -240,38 +363,6 @@ class EngineArgs:
def
add_cli_args
(
parser
:
FlexibleArgumentParser
)
->
FlexibleArgumentParser
:
"""Shared CLI arguments for vLLM engine."""
def
is_type_in_union
(
cls
:
type
[
Any
],
type
:
type
[
Any
])
->
bool
:
"""Check if the class is a type in a union type."""
return
get_origin
(
cls
)
is
Union
and
type
in
get_args
(
cls
)
def
is_optional
(
cls
:
type
[
Any
])
->
bool
:
"""Check if the class is an optional type."""
return
is_type_in_union
(
cls
,
type
(
None
))
def
get_kwargs
(
cls
:
type
[
Any
])
->
Dict
[
str
,
Any
]:
cls_docs
=
get_attr_docs
(
cls
)
kwargs
=
{}
for
field
in
fields
(
cls
):
name
=
field
.
name
# One of these will always be present
default
=
(
field
.
default_factory
if
field
.
default
is
MISSING
else
field
.
default
)
kwargs
[
name
]
=
{
"default"
:
default
,
"help"
:
cls_docs
[
name
]}
# When using action="store_true"
# add_argument doesn't accept type
if
field
.
type
is
bool
:
continue
# Handle optional fields
if
is_optional
(
field
.
type
):
kwargs
[
name
][
"type"
]
=
nullable_str
continue
# Handle str in union fields
if
is_type_in_union
(
field
.
type
,
str
):
kwargs
[
name
][
"type"
]
=
str
continue
kwargs
[
name
][
"type"
]
=
field
.
type
return
kwargs
# Model arguments
parser
.
add_argument
(
'--model'
,
...
...
@@ -289,13 +380,13 @@ class EngineArgs:
'which task to use.'
)
parser
.
add_argument
(
'--tokenizer'
,
type
=
nullable_
str
,
type
=
optional_type
(
str
)
,
default
=
EngineArgs
.
tokenizer
,
help
=
'Name or path of the huggingface tokenizer to use. '
'If unspecified, model name or path will be used.'
)
parser
.
add_argument
(
"--hf-config-path"
,
type
=
nullable_
str
,
type
=
optional_type
(
str
)
,
default
=
EngineArgs
.
hf_config_path
,
help
=
'Name or path of the huggingface config to use. '
'If unspecified, model name or path will be used.'
)
...
...
@@ -307,21 +398,21 @@ class EngineArgs:
'the input. The generated output will contain token ids.'
)
parser
.
add_argument
(
'--revision'
,
type
=
nullable_
str
,
type
=
optional_type
(
str
)
,
default
=
None
,
help
=
'The specific model version to use. It can be a branch '
'name, a tag name, or a commit id. If unspecified, will use '
'the default version.'
)
parser
.
add_argument
(
'--code-revision'
,
type
=
nullable_
str
,
type
=
optional_type
(
str
)
,
default
=
None
,
help
=
'The specific revision to use for the model code on '
'Hugging Face Hub. It can be a branch name, a tag name, or a '
'commit id. If unspecified, will use the default version.'
)
parser
.
add_argument
(
'--tokenizer-revision'
,
type
=
nullable_
str
,
type
=
optional_type
(
str
)
,
default
=
None
,
help
=
'Revision of the huggingface tokenizer to use. '
'It can be a branch name, a tag name, or a commit id. '
...
...
@@ -361,7 +452,6 @@ class EngineArgs:
load_group
.
add_argument
(
'--model-loader-extra-config'
,
**
load_kwargs
[
"model_loader_extra_config"
])
load_group
.
add_argument
(
'--use-tqdm-on-load'
,
action
=
argparse
.
BooleanOptionalAction
,
**
load_kwargs
[
"use_tqdm_on_load"
])
parser
.
add_argument
(
...
...
@@ -386,14 +476,6 @@ class EngineArgs:
'* "bfloat16" for a balance between precision and range.
\n
'
'* "float" is shorthand for FP32 precision.
\n
'
'* "float32" for FP32 precision.'
)
parser
.
add_argument
(
'--kv-cache-dtype'
,
type
=
str
,
choices
=
[
'auto'
,
'fp8'
,
'fp8_e5m2'
,
'fp8_e4m3'
],
default
=
EngineArgs
.
kv_cache_dtype
,
help
=
'Data type for kv cache storage. If "auto", will use model '
'data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. '
'ROCm (AMD GPU) supports fp8 (=fp8_e4m3)'
)
parser
.
add_argument
(
'--max-model-len'
,
type
=
human_readable_int
,
default
=
EngineArgs
.
max_model_len
,
...
...
@@ -403,21 +485,25 @@ class EngineArgs:
'Examples:
\n
'
'- 1k → 1000
\n
'
'- 1K → 1024
\n
'
)
parser
.
add_argument
(
# Guided decoding arguments
guided_decoding_kwargs
=
get_kwargs
(
DecodingConfig
)
guided_decoding_group
=
parser
.
add_argument_group
(
title
=
"DecodingConfig"
,
description
=
DecodingConfig
.
__doc__
,
)
guided_decoding_group
.
add_argument
(
'--guided-decoding-backend'
,
type
=
str
,
default
=
DecodingConfig
.
guided_decoding_backend
,
help
=
'Which engine will be used for guided decoding'
' (JSON schema / regex etc) by default. Currently support '
'https://github.com/mlc-ai/xgrammar and '
'https://github.com/guidance-ai/llguidance.'
'Valid backend values are "xgrammar", "guidance", and "auto". '
'With "auto", we will make opinionated choices based on request '
'contents and what the backend libraries currently support, so '
'the behavior is subject to change in each release.'
)
**
guided_decoding_kwargs
[
"guided_decoding_backend"
])
guided_decoding_group
.
add_argument
(
"--reasoning-parser"
,
# This choices is a special case because it's not static
choices
=
list
(
ReasoningParserManager
.
reasoning_parsers
),
**
guided_decoding_kwargs
[
"reasoning_backend"
])
parser
.
add_argument
(
'--logits-processor-pattern'
,
type
=
nullable_
str
,
type
=
optional_type
(
str
)
,
default
=
None
,
help
=
'Optional regex pattern specifying valid logits processor '
'qualified names that can be passed with the `logits_processors` '
...
...
@@ -443,7 +529,6 @@ class EngineArgs:
)
parallel_group
.
add_argument
(
'--distributed-executor-backend'
,
choices
=
[
'ray'
,
'mp'
,
'uni'
,
'external_launcher'
],
**
parallel_kwargs
[
"distributed_executor_backend"
])
parallel_group
.
add_argument
(
'--pipeline-parallel-size'
,
'-pp'
,
...
...
@@ -454,46 +539,40 @@ class EngineArgs:
**
parallel_kwargs
[
"data_parallel_size"
])
parallel_group
.
add_argument
(
'--enable-expert-parallel'
,
action
=
'store_true'
,
**
parallel_kwargs
[
"enable_expert_parallel"
])
parallel_group
.
add_argument
(
'--max-parallel-loading-workers'
,
**
parallel_kwargs
[
"max_parallel_loading_workers"
])
parallel_group
.
add_argument
(
'--ray-workers-use-nsight'
,
action
=
'store_true'
,
**
parallel_kwargs
[
"ray_workers_use_nsight"
])
parallel_group
.
add_argument
(
'--disable-custom-all-reduce'
,
action
=
'store_true'
,
**
parallel_kwargs
[
"disable_custom_all_reduce"
])
# KV cache arguments
parser
.
add_argument
(
'--block-size'
,
type
=
int
,
default
=
EngineArgs
.
block_size
,
choices
=
[
8
,
16
,
32
,
64
,
128
],
help
=
'Token block size for contiguous chunks of '
'tokens. This is ignored on neuron devices and '
'set to ``--max-model-len``. On CUDA devices, '
'only block sizes up to 32 are supported. '
'On HPU devices, block size defaults to 128.'
)
parser
.
add_argument
(
"--enable-prefix-caching"
,
action
=
argparse
.
BooleanOptionalAction
,
default
=
EngineArgs
.
enable_prefix_caching
,
help
=
"Enables automatic prefix caching. "
"Use ``--no-enable-prefix-caching`` to disable explicitly."
,
)
parser
.
add_argument
(
"--prefix-caching-hash-algo"
,
type
=
str
,
choices
=
[
"builtin"
,
"sha256"
],
default
=
EngineArgs
.
prefix_caching_hash_algo
,
help
=
"Set the hash algorithm for prefix caching. "
"Options are 'builtin' (Python's built-in hash) or 'sha256' "
"(collision resistant but with certain overheads)."
,
# KV cache arguments
cache_kwargs
=
get_kwargs
(
CacheConfig
)
cache_group
=
parser
.
add_argument_group
(
title
=
"CacheConfig"
,
description
=
CacheConfig
.
__doc__
,
)
cache_group
.
add_argument
(
'--block-size'
,
**
cache_kwargs
[
"block_size"
])
cache_group
.
add_argument
(
'--gpu-memory-utilization'
,
**
cache_kwargs
[
"gpu_memory_utilization"
])
cache_group
.
add_argument
(
'--swap-space'
,
**
cache_kwargs
[
"swap_space"
])
cache_group
.
add_argument
(
'--kv-cache-dtype'
,
**
cache_kwargs
[
"cache_dtype"
])
cache_group
.
add_argument
(
'--num-gpu-blocks-override'
,
**
cache_kwargs
[
"num_gpu_blocks_override"
])
cache_group
.
add_argument
(
"--enable-prefix-caching"
,
**
cache_kwargs
[
"enable_prefix_caching"
])
cache_group
.
add_argument
(
"--prefix-caching-hash-algo"
,
**
cache_kwargs
[
"prefix_caching_hash_algo"
])
cache_group
.
add_argument
(
'--cpu-offload-gb'
,
**
cache_kwargs
[
"cpu_offload_gb"
])
cache_group
.
add_argument
(
'--calculate-kv-scales'
,
**
cache_kwargs
[
"calculate_kv_scales"
])
parser
.
add_argument
(
'--disable-sliding-window'
,
action
=
'store_true'
,
help
=
'Disables sliding window, '
...
...
@@ -506,86 +585,11 @@ class EngineArgs:
'block manager v2) is now the default. '
'Setting this flag to True or False'
' has no effect on vLLM behavior.'
)
parser
.
add_argument
(
'--num-lookahead-slots'
,
type
=
int
,
default
=
EngineArgs
.
num_lookahead_slots
,
help
=
'Experimental scheduling config necessary for '
'speculative decoding. This will be replaced by '
'speculative config in the future; it is present '
'to enable correctness tests until then.'
)
parser
.
add_argument
(
'--seed'
,
type
=
int
,
default
=
EngineArgs
.
seed
,
help
=
'Random seed for operations.'
)
parser
.
add_argument
(
'--swap-space'
,
type
=
float
,
default
=
EngineArgs
.
swap_space
,
help
=
'CPU swap space size (GiB) per GPU.'
)
parser
.
add_argument
(
'--cpu-offload-gb'
,
type
=
float
,
default
=
0
,
help
=
'The space in GiB to offload to CPU, per GPU. '
'Default is 0, which means no offloading. Intuitively, '
'this argument can be seen as a virtual way to increase '
'the GPU memory size. For example, if you have one 24 GB '
'GPU and set this to 10, virtually you can think of it as '
'a 34 GB GPU. Then you can load a 13B model with BF16 weight, '
'which requires at least 26GB GPU memory. Note that this '
'requires fast CPU-GPU interconnect, as part of the model is '
'loaded from CPU memory to GPU memory on the fly in each '
'model forward pass.'
)
parser
.
add_argument
(
'--gpu-memory-utilization'
,
type
=
float
,
default
=
EngineArgs
.
gpu_memory_utilization
,
help
=
'The fraction of GPU memory to be used for the model '
'executor, which can range from 0 to 1. For example, a value of '
'0.5 would imply 50%% GPU memory utilization. If unspecified, '
'will use the default value of 0.9. This is a per-instance '
'limit, and only applies to the current vLLM instance.'
'It does not matter if you have another vLLM instance running '
'on the same GPU. For example, if you have two vLLM instances '
'running on the same GPU, you can set the GPU memory utilization '
'to 0.5 for each instance.'
)
parser
.
add_argument
(
'--num-gpu-blocks-override'
,
type
=
int
,
default
=
None
,
help
=
'If specified, ignore GPU profiling result and use this number'
' of GPU blocks. Used for testing preemption.'
)
parser
.
add_argument
(
'--max-num-batched-tokens'
,
type
=
int
,
default
=
EngineArgs
.
max_num_batched_tokens
,
help
=
'Maximum number of batched tokens per '
'iteration.'
)
parser
.
add_argument
(
"--max-num-partial-prefills"
,
type
=
int
,
default
=
EngineArgs
.
max_num_partial_prefills
,
help
=
"For chunked prefill, the max number of concurrent
\
partial prefills."
)
parser
.
add_argument
(
"--max-long-partial-prefills"
,
type
=
int
,
default
=
EngineArgs
.
max_long_partial_prefills
,
help
=
"For chunked prefill, the maximum number of prompts longer "
"than --long-prefill-token-threshold that will be prefilled "
"concurrently. Setting this less than --max-num-partial-prefills "
"will allow shorter prompts to jump the queue in front of longer "
"prompts in some cases, improving latency."
)
parser
.
add_argument
(
"--long-prefill-token-threshold"
,
type
=
float
,
default
=
EngineArgs
.
long_prefill_token_threshold
,
help
=
"For chunked prefill, a request is considered long if the "
"prompt is longer than this number of tokens."
)
parser
.
add_argument
(
'--max-num-seqs'
,
type
=
int
,
default
=
EngineArgs
.
max_num_seqs
,
help
=
'Maximum number of sequences per iteration.'
)
parser
.
add_argument
(
'--max-logprobs'
,
type
=
int
,
...
...
@@ -598,7 +602,7 @@ class EngineArgs:
# Quantization settings.
parser
.
add_argument
(
'--quantization'
,
'-q'
,
type
=
nullable_
str
,
type
=
optional_type
(
str
)
,
choices
=
[
*
QUANTIZATION_METHODS
,
None
],
default
=
EngineArgs
.
quantization
,
help
=
'Method used to quantize the weights. If '
...
...
@@ -649,162 +653,113 @@ class EngineArgs:
'Additionally for encoder-decoder models, if the '
'sequence length of the encoder input is larger '
'than this, we fall back to the eager mode.'
)
parser
.
add_argument
(
'--tokenizer-pool-size'
,
type
=
int
,
default
=
EngineArgs
.
tokenizer_pool_size
,
help
=
'Size of tokenizer pool to use for '
'asynchronous tokenization. If 0, will '
'use synchronous tokenization.'
)
parser
.
add_argument
(
'--tokenizer-pool-type'
,
type
=
str
,
default
=
EngineArgs
.
tokenizer_pool_type
,
help
=
'Type of tokenizer pool to use for '
'asynchronous tokenization. Ignored '
'if tokenizer_pool_size is 0.'
)
parser
.
add_argument
(
'--tokenizer-pool-extra-config'
,
type
=
nullable_str
,
default
=
EngineArgs
.
tokenizer_pool_extra_config
,
help
=
'Extra config for tokenizer pool. '
'This should be a JSON string that will be '
'parsed into a dictionary. Ignored if '
'tokenizer_pool_size is 0.'
)
# Tokenizer arguments
tokenizer_kwargs
=
get_kwargs
(
TokenizerPoolConfig
)
tokenizer_group
=
parser
.
add_argument_group
(
title
=
"TokenizerPoolConfig"
,
description
=
TokenizerPoolConfig
.
__doc__
,
)
tokenizer_group
.
add_argument
(
'--tokenizer-pool-size'
,
**
tokenizer_kwargs
[
"pool_size"
])
tokenizer_group
.
add_argument
(
'--tokenizer-pool-type'
,
**
tokenizer_kwargs
[
"pool_type"
])
tokenizer_group
.
add_argument
(
'--tokenizer-pool-extra-config'
,
**
tokenizer_kwargs
[
"extra_config"
])
# Multimodal related configs
parser
.
add_argument
(
'--limit-mm-per-prompt'
,
type
=
nullable_kvs
,
default
=
EngineArgs
.
limit_mm_per_prompt
,
# The default value is given in
# MultiModalConfig.get_default_limit_per_prompt
help
=
(
'For each multimodal plugin, limit how many '
'input instances to allow for each prompt. '
'Expects a comma-separated list of items, '
'e.g.: `image=16,video=2` allows a maximum of 16 '
'images and 2 videos per prompt. Defaults to '
'1 (V0) or 999 (V1) for each modality.'
))
multimodal_kwargs
=
get_kwargs
(
MultiModalConfig
)
multimodal_group
=
parser
.
add_argument_group
(
title
=
"MultiModalConfig"
,
description
=
MultiModalConfig
.
__doc__
,
)
multimodal_group
.
add_argument
(
'--limit-mm-per-prompt'
,
**
multimodal_kwargs
[
"limit_per_prompt"
])
parser
.
add_argument
(
'--mm-processor-kwargs'
,
default
=
None
,
type
=
json
.
loads
,
help
=
(
'Overrides for the multimodal input mapping/processing, '
'e.g., image processor. For example: ``{"num_crops": 4}``.'
))
help
=
(
'Overrides for the multi-modal processor obtained from '
'``AutoProcessor.from_pretrained``. The available overrides '
'depend on the model that is being run.'
'For example, for Phi-3-Vision: ``{"num_crops": 4}``.'
))
parser
.
add_argument
(
'--disable-mm-preprocessor-cache'
,
action
=
'store_true'
,
help
=
'If
t
rue,
then
disable
s
caching of the multi-modal '
'
preprocessor/mapper. (not recommended)
'
)
help
=
'If
T
rue, disable caching of the
processed
multi-modal '
'
inputs.
'
)
# LoRA related configs
parser
.
add_argument
(
'--enable-lora'
,
action
=
'store_true'
,
help
=
'If True, enable handling of LoRA adapters.'
)
parser
.
add_argument
(
'--enable-lora-bias'
,
action
=
'store_true'
,
help
=
'If True, enable bias for LoRA adapters.'
)
parser
.
add_argument
(
'--max-loras'
,
type
=
int
,
default
=
EngineArgs
.
max_loras
,
help
=
'Max number of LoRAs in a single batch.'
)
parser
.
add_argument
(
'--max-lora-rank'
,
type
=
int
,
default
=
EngineArgs
.
max_lora_rank
,
help
=
'Max LoRA rank.'
)
parser
.
add_argument
(
'--merge-lora'
,
type
=
bool
,
default
=
False
,
lora_kwargs
=
get_kwargs
(
LoRAConfig
)
lora_group
=
parser
.
add_argument_group
(
title
=
"LoRAConfig"
,
description
=
LoRAConfig
.
__doc__
,
)
lora_group
.
add_argument
(
'--enable-lora'
,
action
=
argparse
.
BooleanOptionalAction
,
help
=
'If True, enable handling of LoRA adapters.'
)
lora_group
.
add_argument
(
'--enable-lora-bias'
,
**
lora_kwargs
[
"bias_enabled"
])
lora_group
.
add_argument
(
'--max-loras'
,
**
lora_kwargs
[
"max_loras"
])
lora_group
.
add_argument
(
'--max-lora-rank'
,
**
lora_kwargs
[
"max_lora_rank"
])
lora_group
.
add_argument
(
'--merge-lora'
,
action
=
argparse
.
BooleanOptionalAction
,
help
=
'If set to True, the weights of the base layer will be merged with the weights of Lora.'
)
parser
.
add_argument
(
'--lora-target-modules'
,
nargs
=
'*'
,
default
=
None
,
help
=
'List of lora module name, If not specified, modules will be chosen according to the model architecture.'
)
parser
.
add_argument
(
'--lora-extra-vocab-size'
,
type
=
int
,
default
=
EngineArgs
.
lora_extra_vocab_size
,
help
=
(
'Maximum size of extra vocabulary that can be '
'present in a LoRA adapter (added to the base '
'model vocabulary).'
))
parser
.
add_argument
(
lora_group
.
add_argument
(
'--lora-target-modules'
,
**
lora_kwargs
[
"lora_target_modules"
])
lora_group
.
add_argument
(
'--lora-extra-vocab-size'
,
**
lora_kwargs
[
"lora_extra_vocab_size"
])
lora_group
.
add_argument
(
'--lora-dtype'
,
type
=
str
,
default
=
EngineArgs
.
lora_dtype
,
choices
=
[
'auto'
,
'float16'
,
'bfloat16'
],
help
=
(
'Data type for LoRA. If auto, will default to '
'base model dtype.'
))
parser
.
add_argument
(
'--long-lora-scaling-factors'
,
type
=
nullable_str
,
default
=
EngineArgs
.
long_lora_scaling_factors
,
help
=
(
'Specify multiple scaling factors (which can '
'be different from base model scaling factor '
'- see eg. Long LoRA) to allow for multiple '
'LoRA adapters trained with those scaling '
'factors to be used at the same time. If not '
'specified, only adapters trained with the '
'base model scaling factor are allowed.'
))
parser
.
add_argument
(
'--max-cpu-loras'
,
type
=
int
,
default
=
EngineArgs
.
max_cpu_loras
,
help
=
(
'Maximum number of LoRAs to store in CPU memory. '
'Must be >= than max_loras.'
))
parser
.
add_argument
(
'--fully-sharded-loras'
,
action
=
'store_true'
,
help
=
(
'By default, only half of the LoRA computation is '
'sharded with tensor parallelism. '
'Enabling this will use the fully sharded layers. '
'At high sequence length, max rank or '
'tensor parallel size, this is likely faster.'
))
parser
.
add_argument
(
'--enable-prompt-adapter'
,
action
=
'store_true'
,
help
=
'If True, enable handling of PromptAdapters.'
)
parser
.
add_argument
(
'--max-prompt-adapters'
,
type
=
int
,
default
=
EngineArgs
.
max_prompt_adapters
,
help
=
'Max number of PromptAdapters in a batch.'
)
parser
.
add_argument
(
'--max-prompt-adapter-token'
,
type
=
int
,
default
=
EngineArgs
.
max_prompt_adapter_token
,
help
=
'Max number of PromptAdapters tokens'
)
parser
.
add_argument
(
"--device"
,
type
=
str
,
default
=
EngineArgs
.
device
,
choices
=
DEVICE_OPTIONS
,
help
=
'Device type for vLLM execution.'
)
parser
.
add_argument
(
'--num-scheduler-steps'
,
type
=
int
,
default
=
1
,
help
=
(
'Maximum number of forward steps per '
'scheduler call.'
))
**
lora_kwargs
[
"lora_dtype"
],
)
lora_group
.
add_argument
(
'--long-lora-scaling-factors'
,
**
lora_kwargs
[
"long_lora_scaling_factors"
])
lora_group
.
add_argument
(
'--max-cpu-loras'
,
**
lora_kwargs
[
"max_cpu_loras"
])
lora_group
.
add_argument
(
'--fully-sharded-loras'
,
**
lora_kwargs
[
"fully_sharded_loras"
])
# PromptAdapter related configs
prompt_adapter_kwargs
=
get_kwargs
(
PromptAdapterConfig
)
prompt_adapter_group
=
parser
.
add_argument_group
(
title
=
"PromptAdapterConfig"
,
description
=
PromptAdapterConfig
.
__doc__
,
)
prompt_adapter_group
.
add_argument
(
'--enable-prompt-adapter'
,
action
=
argparse
.
BooleanOptionalAction
,
help
=
'If True, enable handling of PromptAdapters.'
)
prompt_adapter_group
.
add_argument
(
'--max-prompt-adapters'
,
**
prompt_adapter_kwargs
[
"max_prompt_adapters"
])
prompt_adapter_group
.
add_argument
(
'--max-prompt-adapter-token'
,
**
prompt_adapter_kwargs
[
"max_prompt_adapter_token"
])
# Device arguments
device_kwargs
=
get_kwargs
(
DeviceConfig
)
device_group
=
parser
.
add_argument_group
(
title
=
"DeviceConfig"
,
description
=
DeviceConfig
.
__doc__
,
)
device_group
.
add_argument
(
"--device"
,
**
device_kwargs
[
"device"
])
# Speculative arguments
speculative_group
=
parser
.
add_argument_group
(
title
=
"SpeculativeConfig"
,
description
=
SpeculativeConfig
.
__doc__
,
)
speculative_group
.
add_argument
(
'--speculative-config'
,
type
=
json
.
loads
,
default
=
None
,
help
=
'The configurations for speculative decoding.'
' Should be a JSON string.'
)
parser
.
add_argument
(
'--multi-step-stream-outputs'
,
action
=
StoreBoolean
,
default
=
EngineArgs
.
multi_step_stream_outputs
,
nargs
=
"?"
,
const
=
"True"
,
help
=
'If False, then multi-step will stream outputs at the end '
'of all steps'
)
parser
.
add_argument
(
'--scheduler-delay-factor'
,
type
=
float
,
default
=
EngineArgs
.
scheduler_delay_factor
,
help
=
'Apply a delay (of delay factor multiplied by previous '
'prompt latency) before scheduling next prompt.'
)
parser
.
add_argument
(
'--enable-chunked-prefill'
,
action
=
StoreBoolean
,
default
=
EngineArgs
.
enable_chunked_prefill
,
nargs
=
"?"
,
const
=
"True"
,
help
=
'If set, the prefill requests can be chunked based on the '
'max_num_batched_tokens.'
)
parser
.
add_argument
(
'--speculative-config'
,
type
=
json
.
loads
,
default
=
None
,
help
=
'The configurations for speculative decoding.'
' Should be a JSON string.'
)
parser
.
add_argument
(
'--num-speculative-heads'
,
type
=
int
,
...
...
@@ -819,13 +774,6 @@ class EngineArgs:
help
=
"The pattern(s) to ignore when loading the model."
"Default to `original/**/*` to avoid repeated loading of llama's "
"checkpoints."
)
parser
.
add_argument
(
'--preemption-mode'
,
type
=
str
,
default
=
None
,
help
=
'If
\'
recompute
\'
, the engine performs preemption by '
'recomputing; If
\'
swap
\'
, the engine performs preemption by '
'block swapping.'
)
parser
.
add_argument
(
"--served-model-name"
,
...
...
@@ -881,22 +829,47 @@ class EngineArgs:
help
=
"Disable async output processing. This may result in "
"lower performance."
)
parser
.
add_argument
(
'--scheduling-policy'
,
choices
=
[
'fcfs'
,
'priority'
],
default
=
"fcfs"
,
help
=
'The scheduling policy to use. "fcfs" (first come first served'
', i.e. requests are handled in order of arrival; default) '
'or "priority" (requests are handled based on given '
'priority (lower value means earlier handling) and time of '
'arrival deciding any ties).'
)
parser
.
add_argument
(
'--scheduler-cls'
,
default
=
EngineArgs
.
scheduler_cls
,
help
=
'The scheduler class to use. "vllm.core.scheduler.Scheduler" '
'is the default scheduler. Can be a class directly or the path to '
'a class of form "mod.custom_class".'
)
# Scheduler arguments
scheduler_kwargs
=
get_kwargs
(
SchedulerConfig
)
scheduler_group
=
parser
.
add_argument_group
(
title
=
"SchedulerConfig"
,
description
=
SchedulerConfig
.
__doc__
,
)
scheduler_group
.
add_argument
(
'--max-num-batched-tokens'
,
**
scheduler_kwargs
[
"max_num_batched_tokens"
])
scheduler_group
.
add_argument
(
'--max-num-seqs'
,
**
scheduler_kwargs
[
"max_num_seqs"
])
scheduler_group
.
add_argument
(
"--max-num-partial-prefills"
,
**
scheduler_kwargs
[
"max_num_partial_prefills"
])
scheduler_group
.
add_argument
(
"--max-long-partial-prefills"
,
**
scheduler_kwargs
[
"max_long_partial_prefills"
])
scheduler_group
.
add_argument
(
"--long-prefill-token-threshold"
,
**
scheduler_kwargs
[
"long_prefill_token_threshold"
])
scheduler_group
.
add_argument
(
'--num-lookahead-slots'
,
**
scheduler_kwargs
[
"num_lookahead_slots"
])
scheduler_group
.
add_argument
(
'--scheduler-delay-factor'
,
**
scheduler_kwargs
[
"delay_factor"
])
scheduler_group
.
add_argument
(
'--preemption-mode'
,
**
scheduler_kwargs
[
"preemption_mode"
])
scheduler_group
.
add_argument
(
'--num-scheduler-steps'
,
**
scheduler_kwargs
[
"num_scheduler_steps"
])
scheduler_group
.
add_argument
(
'--multi-step-stream-outputs'
,
**
scheduler_kwargs
[
"multi_step_stream_outputs"
])
scheduler_group
.
add_argument
(
'--scheduling-policy'
,
**
scheduler_kwargs
[
"policy"
])
scheduler_group
.
add_argument
(
'--enable-chunked-prefill'
,
**
scheduler_kwargs
[
"enable_chunked_prefill"
])
scheduler_group
.
add_argument
(
"--disable-chunked-mm-input"
,
**
scheduler_kwargs
[
"disable_chunked_mm_input"
])
parser
.
add_argument
(
'--scheduler-cls'
,
**
scheduler_kwargs
[
"scheduler_cls"
])
parser
.
add_argument
(
'--override-neuron-config'
,
...
...
@@ -923,10 +896,11 @@ class EngineArgs:
'testing only. level 3 is the recommended level '
'for production.
\n
'
'To specify the full compilation config, '
'use a JSON string.
\n
'
'use a JSON string, e.g. ``{"level": 3, '
'"cudagraph_capture_sizes": [1, 2, 4, 8]}``
\n
'
'Following the convention of traditional '
'compilers, using
-O
without space is also '
'supported. -O3 is equivalent to -O 3.'
)
'compilers, using
``-O``
without space is also '
'supported.
``
-O3
``
is equivalent to
``
-O 3
``
.'
)
parser
.
add_argument
(
'--kv-transfer-config'
,
type
=
KVTransferConfig
.
from_cli
,
...
...
@@ -948,7 +922,7 @@ class EngineArgs:
'class without changing the existing functions.'
)
parser
.
add_argument
(
"--generation-config"
,
type
=
nullable_
str
,
type
=
optional_type
(
str
)
,
default
=
"auto"
,
help
=
"The folder path to the generation config. "
"Defaults to 'auto', the generation config will be loaded from "
...
...
@@ -975,15 +949,6 @@ class EngineArgs:
help
=
"Enable sleep mode for the engine. "
"(only cuda platform is supported)"
)
parser
.
add_argument
(
'--calculate-kv-scales'
,
action
=
'store_true'
,
help
=
'This enables dynamic calculation of '
'k_scale and v_scale when kv-cache-dtype is fp8. '
'If calculate-kv-scales is false, the scales will '
'be loaded from the model checkpoint if available. '
'Otherwise, the scales will default to 1.0.'
)
parser
.
add_argument
(
"--additional-config"
,
type
=
json
.
loads
,
...
...
@@ -1001,16 +966,6 @@ class EngineArgs:
"If enabled, the model will be able to generate reasoning content."
)
parser
.
add_argument
(
"--reasoning-parser"
,
type
=
str
,
choices
=
list
(
ReasoningParserManager
.
reasoning_parsers
),
default
=
None
,
help
=
"Select the reasoning parser depending on the model that you're "
"using. This is used to parse the reasoning content into OpenAI "
"API format. Required for ``--enable-reasoning``."
)
parser
.
add_argument
(
"--disable-cascade-attn"
,
action
=
"store_true"
,
...
...
@@ -1021,20 +976,6 @@ class EngineArgs:
"Note that even if this is set to False, cascade attention will be "
"only used when the heuristic tells that it's beneficial."
)
parser
.
add_argument
(
"--disable-chunked-mm-input"
,
action
=
StoreBoolean
,
default
=
EngineArgs
.
disable_chunked_mm_input
,
nargs
=
"?"
,
const
=
"True"
,
help
=
"Disable multimodal input chunking attention for V1. "
"If set to true and chunked prefill is enabled, we do not want to"
" partially schedule a multimodal item. This ensures that if a "
"request has a mixed prompt (like text tokens TTTT followed by "
"image tokens IIIIIIIIII) where only some image tokens can be "
"scheduled (like TTTTIIIII, leaving IIIII), it will be scheduled "
"as TTTT in one step and IIIIIIIIII in the next."
)
return
parser
@
classmethod
...
...
@@ -1228,11 +1169,6 @@ class EngineArgs:
enable_expert_parallel
=
self
.
enable_expert_parallel
,
max_parallel_loading_workers
=
self
.
max_parallel_loading_workers
,
disable_custom_all_reduce
=
self
.
disable_custom_all_reduce
,
tokenizer_pool_config
=
TokenizerPoolConfig
.
create_config
(
self
.
tokenizer_pool_size
,
self
.
tokenizer_pool_type
,
self
.
tokenizer_pool_extra_config
,
),
ray_workers_use_nsight
=
self
.
ray_workers_use_nsight
,
placement_group
=
placement_group
,
distributed_executor_backend
=
self
.
distributed_executor_backend
,
...
...
@@ -1308,8 +1244,6 @@ class EngineArgs:
if
self
.
qlora_adapter_name_or_path
is
not
None
and
\
self
.
qlora_adapter_name_or_path
!=
""
:
if
self
.
model_loader_extra_config
is
None
:
self
.
model_loader_extra_config
=
{}
self
.
model_loader_extra_config
[
"qlora_adapter_name_or_path"
]
=
self
.
qlora_adapter_name_or_path
...
...
@@ -1390,7 +1324,7 @@ class EngineArgs:
recommend_to_remove
=
False
)
return
False
if
self
.
preemption_mode
!=
EngineArgs
.
preemption_mode
:
if
self
.
preemption_mode
!=
SchedulerConfig
.
preemption_mode
:
_raise_or_fallback
(
feature_name
=
"--preemption-mode"
,
recommend_to_remove
=
True
)
return
False
...
...
@@ -1401,34 +1335,28 @@ class EngineArgs:
recommend_to_remove
=
True
)
return
False
if
self
.
scheduling_policy
!=
EngineArgs
.
scheduling_
policy
:
if
self
.
scheduling_policy
!=
SchedulerConfig
.
policy
:
_raise_or_fallback
(
feature_name
=
"--scheduling-policy"
,
recommend_to_remove
=
False
)
return
False
if
self
.
num_scheduler_steps
!=
EngineArgs
.
num_scheduler_steps
:
if
self
.
num_scheduler_steps
!=
SchedulerConfig
.
num_scheduler_steps
:
_raise_or_fallback
(
feature_name
=
"--num-scheduler-steps"
,
recommend_to_remove
=
True
)
return
False
if
self
.
scheduler_delay_factor
!=
EngineArgs
.
s
cheduler
_
delay_factor
:
if
self
.
scheduler_delay_factor
!=
S
cheduler
Config
.
delay_factor
:
_raise_or_fallback
(
feature_name
=
"--scheduler-delay-factor"
,
recommend_to_remove
=
True
)
return
False
if
self
.
additional_config
!=
EngineArgs
.
additional_config
:
_raise_or_fallback
(
feature_name
=
"--additional-config"
,
recommend_to_remove
=
False
)
return
False
# Xgrammar and Guidance are supported.
SUPPORTED_GUIDED_DECODING
=
[
"xgrammar"
,
"xgrammar:disable-any-whitespace"
,
"guidance"
,
"guidance:disable-any-whitespace"
,
"auto"
]
if
self
.
guided_decoding_backend
not
in
SUPPORTED_GUIDED_DECODING
:
_raise_or_fallback
(
feature_name
=
"--guided-decoding-backend"
,
recommend_to_remove
=
False
)
# remove backend options when doing this check
if
self
.
guided_decoding_backend
.
split
(
':'
)[
0
]
\
not
in
get_args
(
GuidedDecodingBackendV1
):
_raise_or_fallback
(
feature_name
=
f
"--guided-decoding-backend=
{
self
.
guided_decoding_backend
}
"
,
recommend_to_remove
=
False
)
return
False
# Need at least Ampere for now (FA support required).
...
...
@@ -1452,7 +1380,7 @@ class EngineArgs:
)
or
envs
.
VLLM_ATTENTION_BACKEND
==
"FLASH_ATTN_VLLM_V1"
supported
=
False
if
fp8_attention
and
will_use_fa
:
from
vllm.
vllm_flash_attn
.fa_utils
import
(
from
vllm.
attention.utils
.fa_utils
import
(
flash_attn_supports_fp8
)
supported
=
flash_attn_supports_fp8
()
if
not
supported
:
...
...
@@ -1495,9 +1423,9 @@ class EngineArgs:
# No Concurrent Partial Prefills so far.
if
(
self
.
max_num_partial_prefills
!=
EngineArgs
.
max_num_partial_prefills
!=
SchedulerConfig
.
max_num_partial_prefills
or
self
.
max_long_partial_prefills
!=
EngineArgs
.
max_long_partial_prefills
):
!=
SchedulerConfig
.
max_long_partial_prefills
):
_raise_or_fallback
(
feature_name
=
"Concurrent Partial Prefill"
,
recommend_to_remove
=
False
)
return
False
...
...
@@ -1517,7 +1445,7 @@ class EngineArgs:
if
speculative_method
:
if
speculative_method
in
(
"ngram"
,
"[ngram]"
):
is_ngram_enabled
=
True
elif
speculative_method
==
"eagle"
:
elif
speculative_method
in
(
"eagle"
,
"eagle3"
)
:
is_eagle_enabled
=
True
else
:
speculative_model
=
self
.
speculative_config
.
get
(
"model"
)
...
...
@@ -1529,16 +1457,17 @@ class EngineArgs:
recommend_to_remove
=
False
)
return
False
# No Disaggregated Prefill so far.
if
self
.
kv_transfer_config
!=
EngineArgs
.
kv_transfer_config
:
_raise_or_fallback
(
feature_name
=
"--kv-transfer-config"
,
recommend_to_remove
=
False
)
return
False
# No FlashInfer or XFormers so far.
# No XFormers so far.
V1_BACKENDS
=
[
"FLASH_ATTN_VLLM_V1"
,
"FLASH_ATTN"
,
"PALLAS"
,
"PALLAS_VLLM_V1"
,
"TRITON_ATTN_VLLM_V1"
,
"TRITON_MLA"
,
"FLASHMLA"
"FLASH_ATTN_VLLM_V1"
,
"FLASH_ATTN"
,
"PALLAS"
,
"PALLAS_VLLM_V1"
,
"TRITON_ATTN_VLLM_V1"
,
"TRITON_MLA"
,
"FLASHMLA"
,
"FLASHINFER"
,
"FLASHINFER_VLLM_V1"
,
]
if
(
envs
.
is_set
(
"VLLM_ATTENTION_BACKEND"
)
and
envs
.
VLLM_ATTENTION_BACKEND
not
in
V1_BACKENDS
):
...
...
@@ -1640,9 +1569,7 @@ class EngineArgs:
self
.
enable_prefix_caching
=
False
# VLLM_V0 only supports builtin hash algo for prefix caching.
if
self
.
prefix_caching_hash_algo
is
None
:
self
.
prefix_caching_hash_algo
=
"builtin"
elif
self
.
prefix_caching_hash_algo
==
"sha256"
:
if
self
.
prefix_caching_hash_algo
==
"sha256"
:
raise
ValueError
(
"sha256 is not supported for prefix caching in V0 engine. "
"Please use 'builtin'."
)
...
...
@@ -1661,10 +1588,6 @@ class EngineArgs:
if
self
.
enable_prefix_caching
is
None
:
self
.
enable_prefix_caching
=
True
# if using prefix caching, we must set a hash algo
if
self
.
enable_prefix_caching
and
self
.
prefix_caching_hash_algo
is
None
:
self
.
prefix_caching_hash_algo
=
"builtin"
# V1 should use the new scheduler by default.
# Swap it only if this arg is set to the original V0 default
if
self
.
scheduler_cls
==
EngineArgs
.
scheduler_cls
:
...
...
@@ -1681,13 +1604,13 @@ class EngineArgs:
# values for non-H100/H200 GPUs.
try
:
from
vllm.platforms
import
current_platform
device_
na
me
=
current_platform
.
get_device_
name
().
lower
()
device_me
mory
=
current_platform
.
get_device_
total_memory
()
except
Exception
:
# This is only used to set default_max_num_batched_tokens
device_
name
=
"no-device"
device_
memory
=
0
if
"h100"
in
device_
na
me
or
"h200"
in
device_name
:
# For H100 and
H200, we
use larger default values.
if
device_me
m
or
y
>=
70
*
GiB_bytes
:
# For
GPUs like
H100 and
MI300x,
use larger default values.
default_max_num_batched_tokens
=
{
UsageContext
.
LLM_CLASS
:
16384
,
UsageContext
.
OPENAI_API_SERVER
:
8192
,
...
...
vllm/engine/async_llm_engine.py
View file @
dcb5624a
...
...
@@ -493,12 +493,11 @@ class _AsyncLLMEngine(LLMEngine):
tokenizer
=
await
self
.
get_tokenizer_async
(
lora_request
)
self
.
_validate_token_prompt
(
prompt
,
tokenizer
=
tokenizer
)
pre
processed_inputs
=
await
self
.
input_preprocessor
.
preprocess_async
(
processed_inputs
=
await
self
.
input_preprocessor
.
preprocess_async
(
prompt
,
lora_request
=
lora_request
,
prompt_adapter_request
=
prompt_adapter_request
,
)
processed_inputs
=
self
.
input_processor
(
preprocessed_inputs
)
if
isinstance
(
params
,
SamplingParams
)
and
\
params
.
guided_decoding
is
not
None
:
...
...
@@ -526,10 +525,15 @@ class _AsyncLLMEngine(LLMEngine):
)
async
def
check_health_async
(
self
)
->
None
:
if
self
.
tokenizer
:
self
.
tokenizer
.
check_health
()
self
.
model_executor
.
check_health
()
async
def
collective_rpc_async
(
self
,
method
:
str
,
timeout
:
Optional
[
float
]
=
None
,
args
:
tuple
=
(),
kwargs
:
Optional
[
dict
]
=
None
):
raise
NotImplementedError
async
def
build_guided_decoding_logits_processor_async
(
sampling_params
:
SamplingParams
,
tokenizer
:
AnyTokenizer
,
...
...
@@ -1167,6 +1171,10 @@ class AsyncLLMEngine(EngineClient):
exception
=
asyncio
.
CancelledError
,
verbose
=
self
.
log_requests
)
async
def
get_vllm_config
(
self
)
->
VllmConfig
:
"""Get the vllm configuration of the vLLM engine."""
return
self
.
engine
.
get_vllm_config
()
async
def
get_model_config
(
self
)
->
ModelConfig
:
"""Get the model configuration of the vLLM engine."""
return
self
.
engine
.
get_model_config
()
...
...
@@ -1234,6 +1242,17 @@ class AsyncLLMEngine(EngineClient):
async
def
add_lora
(
self
,
lora_request
:
LoRARequest
)
->
None
:
self
.
engine
.
add_lora
(
lora_request
)
async
def
collective_rpc
(
self
,
method
:
str
,
timeout
:
Optional
[
float
]
=
None
,
args
:
tuple
=
(),
kwargs
:
Optional
[
dict
]
=
None
):
"""
Perform a collective RPC call to the given path.
"""
return
await
self
.
engine
.
collective_rpc_async
(
method
,
timeout
,
args
,
kwargs
)
# TODO(v1): Remove this class proxy when V1 goes default.
if
envs
.
is_set
(
"VLLM_USE_V1"
)
and
envs
.
VLLM_USE_V1
:
...
...
vllm/engine/llm_engine.py
View file @
dcb5624a
...
...
@@ -30,8 +30,7 @@ from vllm.engine.output_processor.util import create_output_by_sequence_group
from
vllm.entrypoints.openai.logits_processors
import
(
get_logits_processors
as
get_openai_logits_processors
)
from
vllm.executor.executor_base
import
ExecutorBase
from
vllm.inputs
import
(
INPUT_REGISTRY
,
InputRegistry
,
ProcessorInputs
,
PromptType
,
SingletonInputs
)
from
vllm.inputs
import
ProcessorInputs
,
PromptType
,
SingletonInputs
from
vllm.inputs.parse
import
is_token_prompt
,
split_enc_dec_inputs
from
vllm.inputs.preprocess
import
InputPreprocessor
from
vllm.logger
import
init_logger
...
...
@@ -56,7 +55,7 @@ from vllm.tracing import (SpanAttributes, SpanKind, extract_trace_context,
from
vllm.transformers_utils.detokenizer
import
Detokenizer
from
vllm.transformers_utils.tokenizer
import
AnyTokenizer
from
vllm.transformers_utils.tokenizer_group
import
(
Base
TokenizerGroup
,
init_tokenizer_from_configs
)
TokenizerGroup
,
init_tokenizer_from_configs
)
from
vllm.usage.usage_lib
import
(
UsageContext
,
is_usage_stats_enabled
,
usage_message
)
from
vllm.utils
import
(
Counter
,
Device
,
deprecate_kwargs
,
...
...
@@ -67,7 +66,6 @@ from vllm.worker.model_runner_base import InputProcessingError
logger
=
init_logger
(
__name__
)
_LOCAL_LOGGING_INTERVAL_SEC
=
5
_G
=
TypeVar
(
"_G"
,
bound
=
BaseTokenizerGroup
,
default
=
BaseTokenizerGroup
)
_O
=
TypeVar
(
"_O"
,
RequestOutput
,
PoolingRequestOutput
)
_R
=
TypeVar
(
"_R"
,
default
=
Any
)
...
...
@@ -206,7 +204,7 @@ class LLMEngine:
return
outputs_
tokenizer
:
Optional
[
Base
TokenizerGroup
]
tokenizer
:
Optional
[
TokenizerGroup
]
def
__init__
(
self
,
...
...
@@ -215,7 +213,6 @@ class LLMEngine:
log_stats
:
bool
,
usage_context
:
UsageContext
=
UsageContext
.
ENGINE_CONTEXT
,
stat_loggers
:
Optional
[
Dict
[
str
,
StatLoggerBase
]]
=
None
,
input_registry
:
InputRegistry
=
INPUT_REGISTRY
,
mm_registry
:
MultiModalRegistry
=
MULTIMODAL_REGISTRY
,
use_cached_outputs
:
bool
=
False
,
)
->
None
:
...
...
@@ -276,11 +273,7 @@ class LLMEngine:
self
.
tokenizer
,
mm_registry
)
self
.
input_registry
=
input_registry
self
.
input_processor
=
input_registry
.
create_input_processor
(
self
.
model_config
)
self
.
model_executor
=
executor_class
(
vllm_config
=
vllm_config
,
)
self
.
model_executor
=
executor_class
(
vllm_config
=
vllm_config
)
if
self
.
model_config
.
runner_type
!=
"pooling"
:
self
.
_initialize_kv_caches
()
...
...
@@ -322,11 +315,6 @@ class LLMEngine:
self
.
parallel_config
.
disable_custom_all_reduce
,
})
if
self
.
tokenizer
:
# Ping the tokenizer to ensure liveness if it runs in a
# different process.
self
.
tokenizer
.
ping
()
self
.
cached_scheduler_outputs
=
[
SchedulerOutputState
()
for
_
in
range
(
self
.
parallel_config
.
pipeline_parallel_size
)
...
...
@@ -540,21 +528,12 @@ class LLMEngine:
if
model_executor
:
=
getattr
(
self
,
"model_executor"
,
None
):
model_executor
.
shutdown
()
def
get_tokenizer_group
(
self
,
group_type
:
Type
[
_G
]
=
BaseTokenizerGroup
,
)
->
_G
:
tokenizer_group
=
self
.
tokenizer
if
tokenizer_group
is
None
:
def
get_tokenizer_group
(
self
)
->
TokenizerGroup
:
if
self
.
tokenizer
is
None
:
raise
ValueError
(
"Unable to get tokenizer because "
"skip_tokenizer_init is True"
)
if
not
isinstance
(
tokenizer_group
,
group_type
):
raise
TypeError
(
"Invalid type of tokenizer group. "
f
"Expected type:
{
group_type
}
, but "
f
"found type:
{
type
(
tokenizer_group
)
}
"
)
return
tokenizer
_group
return
self
.
tokenizer
def
get_tokenizer
(
self
,
...
...
@@ -562,11 +541,10 @@ class LLMEngine:
)
->
AnyTokenizer
:
return
self
.
get_tokenizer_group
().
get_lora_tokenizer
(
lora_request
)
def
_init_tokenizer
(
self
)
->
Base
TokenizerGroup
:
def
_init_tokenizer
(
self
)
->
TokenizerGroup
:
return
init_tokenizer_from_configs
(
model_config
=
self
.
model_config
,
scheduler_config
=
self
.
scheduler_config
,
parallel_config
=
self
.
parallel_config
,
lora_config
=
self
.
lora_config
)
def
_verify_args
(
self
)
->
None
:
...
...
@@ -781,12 +759,11 @@ class LLMEngine:
prompt
,
tokenizer
=
self
.
get_tokenizer
(
lora_request
=
lora_request
))
pre
processed_inputs
=
self
.
input_preprocessor
.
preprocess
(
processed_inputs
=
self
.
input_preprocessor
.
preprocess
(
prompt
,
lora_request
=
lora_request
,
prompt_adapter_request
=
prompt_adapter_request
,
)
processed_inputs
=
self
.
input_processor
(
preprocessed_inputs
)
self
.
_add_processed_request
(
request_id
=
request_id
,
...
...
@@ -917,6 +894,10 @@ class LLMEngine:
scheduler
.
abort_seq_group
(
request_id
,
seq_id_to_seq_group
=
self
.
seq_id_to_seq_group
)
def
get_vllm_config
(
self
)
->
VllmConfig
:
"""Gets the vllm configuration."""
return
self
.
vllm_config
def
get_model_config
(
self
)
->
ModelConfig
:
"""Gets the model configuration."""
return
self
.
model_config
...
...
@@ -1965,8 +1946,6 @@ class LLMEngine:
return
self
.
model_executor
.
is_sleeping
def
check_health
(
self
)
->
None
:
if
self
.
tokenizer
:
self
.
tokenizer
.
check_health
()
self
.
model_executor
.
check_health
()
def
is_tracing_enabled
(
self
)
->
bool
:
...
...
@@ -2075,7 +2054,7 @@ class LLMEngine:
raise
ValueError
(
f
"The
{
prompt_type
}
prompt cannot be empty"
)
max_prompt_len
=
self
.
model_config
.
max_model_len
if
len
(
prompt_ids
)
>
=
max_prompt_len
:
if
len
(
prompt_ids
)
>
max_prompt_len
:
if
prompt_type
==
"encoder"
and
model_config
.
is_multimodal_model
:
mm_registry
=
self
.
input_preprocessor
.
mm_registry
mm_processor
=
mm_registry
.
create_processor
(
...
...
vllm/engine/metrics.py
View file @
dcb5624a
...
...
@@ -140,16 +140,13 @@ class Metrics:
name
=
"vllm:generation_tokens_total"
,
documentation
=
"Number of generation tokens processed."
,
labelnames
=
labelnames
)
buckets
=
[
1
,
8
,
16
,
32
,
64
,
128
,
256
,
512
,
1024
,
2048
,
4096
,
8096
]
if
not
vllm_config
.
model_config
.
enforce_eager
:
buckets
=
vllm_config
.
compilation_config
.
\
cudagraph_capture_sizes
.
copy
()
buckets
.
sort
()
self
.
histogram_iteration_tokens
=
self
.
_histogram_cls
(
name
=
"vllm:iteration_tokens_total"
,
documentation
=
"Histogram of number of tokens per engine_step."
,
labelnames
=
labelnames
,
buckets
=
buckets
)
buckets
=
[
1
,
8
,
16
,
32
,
64
,
128
,
256
,
512
,
1024
,
2048
,
4096
,
8192
,
16384
])
self
.
histogram_time_to_first_token
=
self
.
_histogram_cls
(
name
=
"vllm:time_to_first_token_seconds"
,
documentation
=
"Histogram of time to first token in seconds."
,
...
...
vllm/engine/multiprocessing/client.py
View file @
dcb5624a
...
...
@@ -93,6 +93,7 @@ class MQLLMEngineClient(EngineClient):
self
.
_errored_with
:
Optional
[
BaseException
]
=
None
# Get the configs.
self
.
vllm_config
=
engine_config
self
.
model_config
=
engine_config
.
model_config
self
.
decoding_config
=
engine_config
.
decoding_config
...
...
@@ -100,7 +101,6 @@ class MQLLMEngineClient(EngineClient):
self
.
tokenizer
=
init_tokenizer_from_configs
(
model_config
=
self
.
model_config
,
scheduler_config
=
engine_config
.
scheduler_config
,
parallel_config
=
engine_config
.
parallel_config
,
lora_config
=
engine_config
.
lora_config
)
self
.
input_preprocessor
=
InputPreprocessor
(
self
.
model_config
,
self
.
tokenizer
)
...
...
@@ -377,6 +377,9 @@ class MQLLMEngineClient(EngineClient):
async
def
get_tokenizer
(
self
,
lora_request
:
Optional
[
LoRARequest
]
=
None
):
return
await
self
.
tokenizer
.
get_lora_tokenizer_async
(
lora_request
)
async
def
get_vllm_config
(
self
)
->
VllmConfig
:
return
self
.
vllm_config
async
def
get_decoding_config
(
self
)
->
DecodingConfig
:
return
self
.
decoding_config
...
...
vllm/engine/output_processor/multi_step.py
View file @
dcb5624a
...
...
@@ -178,7 +178,7 @@ class MultiStepOutputProcessor(SequenceGroupOutputProcessor):
# generates a fixed number of tokens without evaluating stopping
# conditions within the block. This can cause an eos token to be
# unintentionally ignored.
if
not
sampling_params
.
ignore_eos
:
if
not
sampling_params
.
ignore_eos
and
self
.
detokenizer
:
eos_token_id
=
self
.
get_tokenizer_for_seq
(
seq
).
eos_token_id
# Avoiding .index calls as exception throwing in the happy path
# is expensive.
...
...
vllm/engine/protocol.py
View file @
dcb5624a
...
...
@@ -5,7 +5,7 @@ from abc import ABC, abstractmethod
from
typing
import
AsyncGenerator
,
List
,
Mapping
,
Optional
from
vllm.beam_search
import
BeamSearchSequence
,
create_sort_beams_key_function
from
vllm.config
import
DecodingConfig
,
ModelConfig
from
vllm.config
import
DecodingConfig
,
ModelConfig
,
VllmConfig
from
vllm.core.scheduler
import
SchedulerOutputs
from
vllm.inputs.data
import
PromptType
,
TokensPrompt
from
vllm.inputs.parse
import
is_explicit_encoder_decoder_prompt
...
...
@@ -220,6 +220,11 @@ class EngineClient(ABC):
"""
...
@
abstractmethod
async
def
get_vllm_config
(
self
)
->
VllmConfig
:
"""Get the vllm configuration of the vLLM engine."""
...
@
abstractmethod
async
def
get_model_config
(
self
)
->
ModelConfig
:
"""Get the model configuration of the vLLM engine."""
...
...
vllm/entrypoints/chat_utils.py
View file @
dcb5624a
...
...
@@ -27,10 +27,11 @@ from openai.types.chat import (ChatCompletionMessageToolCallParam,
ChatCompletionToolMessageParam
)
from
openai.types.chat.chat_completion_content_part_input_audio_param
import
(
InputAudio
)
from
pydantic
import
TypeAdapter
# yapf: enable
# pydantic needs the TypedDict from typing_extensions
from
transformers
import
(
PreTrainedTokenizer
,
PreTrainedTokenizerFast
,
ProcessorMixin
)
# pydantic needs the TypedDict from typing_extensions
from
typing_extensions
import
Required
,
TypeAlias
,
TypedDict
from
vllm.config
import
ModelConfig
...
...
@@ -482,11 +483,8 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
if
modality
in
(
"image"
,
"image_embeds"
):
if
model_type
==
"chatglm"
:
return
"<|begin_of_image|><|endoftext|><|end_of_image|>"
if
model_type
==
"phi3_v"
:
# Workaround since this token is not defined in the tokenizer
if
model_type
in
(
"phi3_v"
,
"phi4mm"
):
return
f
"<|image_
{
current_count
}
|>"
if
model_type
==
"phi4mm"
:
return
"<|endoftext10|>"
# 200010 (see vocab.json in hf model)
if
model_type
in
(
"minicpmo"
,
"minicpmv"
):
return
"(<image>./</image>)"
if
model_type
in
(
"blip-2"
,
"florence2"
,
"fuyu"
,
"paligemma"
,
...
...
@@ -506,20 +504,24 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
return
"<|image|>"
if
model_type
in
(
"qwen2_vl"
,
"qwen2_5_vl"
):
return
"<|vision_start|><|image_pad|><|vision_end|>"
if
model_type
==
"qwen2_5_omni"
:
return
"<|vision_start|><|IMAGE|><|vision_end|>"
if
model_type
==
"molmo"
:
return
""
if
model_type
==
"aria"
:
return
"<|fim_prefix|><|img|><|fim_suffix|>"
if
model_type
==
"gemma3"
:
return
"<start_of_image>"
if
model_type
==
"kimi_vl"
:
return
"<|media_start|>image<|media_content|><|media_pad|><|media_end|>"
# noqa: E501
raise
TypeError
(
f
"Unknown
{
modality
}
model type:
{
model_type
}
"
)
elif
modality
==
"audio"
:
if
model_type
==
"ultravox"
:
if
model_type
in
(
"ultravox"
,
"granite_speech"
)
:
return
"<|audio|>"
if
model_type
==
"phi4mm"
:
return
"<|
endoftext11|>"
# 200011 (see vocab.json in hf model)
if
model_type
==
"qwen2_audio"
:
return
f
"<|
audio_
{
current_count
}
|>"
if
model_type
in
(
"qwen2_audio"
,
"qwen2_5_omni"
)
:
return
(
f
"Audio
{
current_count
}
: "
f
"<|audio_bos|><|AUDIO|><|audio_eos|>"
)
if
model_type
==
"minicpmo"
:
...
...
@@ -528,6 +530,8 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
elif
modality
==
"video"
:
if
model_type
in
(
"qwen2_vl"
,
"qwen2_5_vl"
):
return
"<|vision_start|><|video_pad|><|vision_end|>"
if
model_type
==
"qwen2_5_omni"
:
return
"<|vision_start|><|VIDEO|><|vision_end|>"
if
model_type
in
(
"minicpmo"
,
"minicpmv"
):
return
"(<video>./</video>)"
if
model_type
.
startswith
(
"llava"
):
...
...
@@ -876,12 +880,13 @@ def _get_full_multimodal_text_prompt(placeholder_counts: dict[str, int],
# No need to validate using Pydantic again
_TextParser
=
partial
(
cast
,
ChatCompletionContentPartTextParam
)
_ImageParser
=
partial
(
cast
,
ChatCompletionContentPartImageParam
)
_ImageEmbedsParser
=
partial
(
cast
,
ChatCompletionContentPartImageEmbedsParam
)
_AudioParser
=
partial
(
cast
,
ChatCompletionContentPartAudioParam
)
_InputAudioParser
=
partial
(
cast
,
ChatCompletionContentPartInputAudioParam
)
_RefusalParser
=
partial
(
cast
,
ChatCompletionContentPartRefusalParam
)
_VideoParser
=
partial
(
cast
,
ChatCompletionContentPartVideoParam
)
# Need to validate url objects
_ImageParser
=
TypeAdapter
(
ChatCompletionContentPartImageParam
).
validate_python
_AudioParser
=
TypeAdapter
(
ChatCompletionContentPartAudioParam
).
validate_python
_VideoParser
=
TypeAdapter
(
ChatCompletionContentPartVideoParam
).
validate_python
_ContentPart
:
TypeAlias
=
Union
[
str
,
dict
[
str
,
str
],
InputAudio
]
...
...
@@ -1092,7 +1097,11 @@ def _parse_chat_message_content(
if
role
==
'assistant'
:
parsed_msg
=
_AssistantParser
(
message
)
if
"tool_calls"
in
parsed_msg
:
# The 'tool_calls' is not None check ensures compatibility.
# It's needed only if downstream code doesn't strictly
# follow the OpenAI spec.
if
(
"tool_calls"
in
parsed_msg
and
parsed_msg
[
"tool_calls"
]
is
not
None
):
result_msg
[
"tool_calls"
]
=
list
(
parsed_msg
[
"tool_calls"
])
elif
role
==
"tool"
:
parsed_msg
=
_ToolParser
(
message
)
...
...
@@ -1189,14 +1198,25 @@ def apply_hf_chat_template(
"allowed, so you must provide a chat template if the tokenizer "
"does not define one."
)
return
tokenizer
.
apply_chat_template
(
conversation
=
conversation
,
# type: ignore[arg-type]
tools
=
tools
,
# type: ignore[arg-type]
chat_template
=
hf_chat_template
,
tokenize
=
tokenize
,
**
kwargs
,
)
try
:
return
tokenizer
.
apply_chat_template
(
conversation
=
conversation
,
# type: ignore[arg-type]
tools
=
tools
,
# type: ignore[arg-type]
chat_template
=
hf_chat_template
,
tokenize
=
tokenize
,
**
kwargs
,
)
# External library exceptions can sometimes occur despite the framework's
# internal exception management capabilities.
except
Exception
as
e
:
# Log and report any library-related exceptions for further
# investigation.
logger
.
exception
(
"An error occurred in `transformers` while applying chat template"
)
raise
ValueError
from
e
def
apply_mistral_chat_template
(
tokenizer
:
MistralTokenizer
,
...
...
@@ -1205,6 +1225,8 @@ def apply_mistral_chat_template(
tools
:
Optional
[
list
[
dict
[
str
,
Any
]]],
**
kwargs
:
Any
,
)
->
list
[
int
]:
from
mistral_common.exceptions
import
MistralCommonException
# The return value of resolve_mistral_chat_template is always None,
# and we won't use it.
resolve_mistral_chat_template
(
...
...
@@ -1222,5 +1244,16 @@ def apply_mistral_chat_template(
# if input does not comply with the expected format.
# We convert those assertion errors to ValueErrors so they can be
# are properly caught in the preprocessing_input step
except
AssertionError
as
e
:
except
(
AssertionError
,
MistralCommonException
)
as
e
:
raise
ValueError
from
e
# External library exceptions can sometimes occur despite the framework's
# internal exception management capabilities.
except
Exception
as
e
:
# Log and report any library-related exceptions for further
# investigation.
logger
.
exception
(
"An error occurred in `mistral_common` while applying chat "
"template"
)
raise
ValueError
from
e
vllm/entrypoints/cli/benchmark/latency.py
0 → 100644
View file @
dcb5624a
# SPDX-License-Identifier: Apache-2.0
import
argparse
from
vllm.benchmarks.latency
import
add_cli_args
,
main
from
vllm.entrypoints.cli.benchmark.base
import
BenchmarkSubcommandBase
from
vllm.entrypoints.cli.types
import
CLISubcommand
class
BenchmarkLatencySubcommand
(
BenchmarkSubcommandBase
):
""" The `latency` subcommand for vllm bench. """
def
__init__
(
self
):
self
.
name
=
"latency"
super
().
__init__
()
@
property
def
help
(
self
)
->
str
:
return
"Benchmark the latency of a single batch of requests."
def
add_cli_args
(
self
,
parser
:
argparse
.
ArgumentParser
)
->
None
:
add_cli_args
(
parser
)
@
staticmethod
def
cmd
(
args
:
argparse
.
Namespace
)
->
None
:
main
(
args
)
def
cmd_init
()
->
list
[
CLISubcommand
]:
return
[
BenchmarkLatencySubcommand
()]
vllm/entrypoints/cli/benchmark/main.py
View file @
dcb5624a
# SPDX-License-Identifier: Apache-2.0
import
argparse
import
vllm.entrypoints.cli.benchmark.latency
import
vllm.entrypoints.cli.benchmark.serve
import
vllm.entrypoints.cli.benchmark.throughput
from
vllm.entrypoints.cli.types
import
CLISubcommand
from
vllm.utils
import
FlexibleArgumentParser
# TODO: Add the rest of the benchmark subcommands here,
# e.g., throughput, latency, etc.
BENCHMARK_CMD_MODULES
=
[
vllm
.
entrypoints
.
cli
.
benchmark
.
latency
,
vllm
.
entrypoints
.
cli
.
benchmark
.
serve
,
vllm
.
entrypoints
.
cli
.
benchmark
.
throughput
,
]
...
...
vllm/entrypoints/cli/benchmark/throughput.py
0 → 100644
View file @
dcb5624a
# SPDX-License-Identifier: Apache-2.0
import
argparse
from
vllm.benchmarks.throughput
import
add_cli_args
,
main
from
vllm.entrypoints.cli.benchmark.base
import
BenchmarkSubcommandBase
from
vllm.entrypoints.cli.types
import
CLISubcommand
class
BenchmarkThroughputSubcommand
(
BenchmarkSubcommandBase
):
""" The `throughput` subcommand for vllm bench. """
def
__init__
(
self
):
self
.
name
=
"throughput"
super
().
__init__
()
@
property
def
help
(
self
)
->
str
:
return
"Benchmark offline inference throughput."
def
add_cli_args
(
self
,
parser
:
argparse
.
ArgumentParser
)
->
None
:
add_cli_args
(
parser
)
@
staticmethod
def
cmd
(
args
:
argparse
.
Namespace
)
->
None
:
main
(
args
)
def
cmd_init
()
->
list
[
CLISubcommand
]:
return
[
BenchmarkThroughputSubcommand
()]
vllm/entrypoints/cli/collect_env.py
0 → 100644
View file @
dcb5624a
# SPDX-License-Identifier: Apache-2.0
import
argparse
from
vllm.collect_env
import
main
as
collect_env_main
from
vllm.entrypoints.cli.types
import
CLISubcommand
from
vllm.entrypoints.openai.cli_args
import
make_arg_parser
from
vllm.utils
import
FlexibleArgumentParser
class
CollectEnvSubcommand
(
CLISubcommand
):
"""The `serve` subcommand for the vLLM CLI. """
def
__init__
(
self
):
self
.
name
=
"collect-env"
super
().
__init__
()
@
staticmethod
def
cmd
(
args
:
argparse
.
Namespace
)
->
None
:
"""Collect information about the environment."""
collect_env_main
()
def
subparser_init
(
self
,
subparsers
:
argparse
.
_SubParsersAction
)
->
FlexibleArgumentParser
:
serve_parser
=
subparsers
.
add_parser
(
"collect-env"
,
help
=
"Start collecting environment information."
,
description
=
"Start collecting environment information."
,
usage
=
"vllm collect-env"
)
return
make_arg_parser
(
serve_parser
)
def
cmd_init
()
->
list
[
CLISubcommand
]:
return
[
CollectEnvSubcommand
()]
Prev
1
…
17
18
19
20
21
22
23
24
25
…
28
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment