Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
af7f4372
Commit
af7f4372
authored
Sep 03, 2024
by
zhuwenwen
Browse files
Merge tag 'v0.5.5' into v0.5.5-dtk24.04.1
parents
5e19cdef
09c77926
Changes
448
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
719 additions
and
193 deletions
+719
-193
vllm/config.py
vllm/config.py
+176
-45
vllm/core/block/block_table.py
vllm/core/block/block_table.py
+6
-0
vllm/core/block/common.py
vllm/core/block/common.py
+53
-0
vllm/core/block/cpu_gpu_block_allocator.py
vllm/core/block/cpu_gpu_block_allocator.py
+5
-0
vllm/core/block/interfaces.py
vllm/core/block/interfaces.py
+10
-0
vllm/core/block/naive_block.py
vllm/core/block/naive_block.py
+5
-3
vllm/core/block/prefix_caching_block.py
vllm/core/block/prefix_caching_block.py
+15
-6
vllm/core/block/utils.py
vllm/core/block/utils.py
+2
-10
vllm/core/block_manager_v1.py
vllm/core/block_manager_v1.py
+42
-16
vllm/core/block_manager_v2.py
vllm/core/block_manager_v2.py
+3
-0
vllm/core/embedding_model_block_manager.py
vllm/core/embedding_model_block_manager.py
+4
-0
vllm/core/evictor_v2.py
vllm/core/evictor_v2.py
+11
-8
vllm/core/interfaces.py
vllm/core/interfaces.py
+6
-0
vllm/core/scheduler.py
vllm/core/scheduler.py
+210
-67
vllm/distributed/communication_op.py
vllm/distributed/communication_op.py
+1
-1
vllm/distributed/device_communicators/cuda_wrapper.py
vllm/distributed/device_communicators/cuda_wrapper.py
+7
-2
vllm/distributed/device_communicators/custom_all_reduce.py
vllm/distributed/device_communicators/custom_all_reduce.py
+8
-3
vllm/distributed/device_communicators/tpu_communicator.py
vllm/distributed/device_communicators/tpu_communicator.py
+10
-3
vllm/distributed/parallel_state.py
vllm/distributed/parallel_state.py
+1
-1
vllm/engine/arg_utils.py
vllm/engine/arg_utils.py
+144
-28
No files found.
Too many changes to show.
To preserve performance only
448 of 448+
files are displayed.
Plain diff
Email patch
vllm/config.py
View file @
af7f4372
import
enum
import
json
from
dataclasses
import
dataclass
,
field
,
fields
from
typing
import
TYPE_CHECKING
,
ClassVar
,
List
,
Optional
,
Tuple
,
Type
,
Union
from
typing
import
(
TYPE_CHECKING
,
ClassVar
,
List
,
Mapping
,
Optional
,
Tuple
,
Type
,
Union
)
import
torch
from
transformers
import
PretrainedConfig
...
...
@@ -10,10 +11,14 @@ import vllm.envs as envs
from
vllm.logger
import
init_logger
from
vllm.model_executor.layers.quantization
import
QUANTIZATION_METHODS
from
vllm.model_executor.models
import
ModelRegistry
from
vllm.tracing
import
is_otel_installed
from
vllm.transformers_utils.config
import
get_config
,
get_hf_text_config
from
vllm.utils
import
(
cuda_device_count_stateless
,
get_cpu_memory
,
is_cpu
,
is_hip
,
is_neuron
,
is_openvino
,
is_tpu
,
is_xpu
,
from
vllm.platforms
import
current_platform
from
vllm.tracing
import
is_otel_available
,
otel_import_error_traceback
from
vllm.transformers_utils.config
import
(
get_config
,
get_hf_image_processor_config
,
get_hf_text_config
)
from
vllm.utils
import
(
STR_NOT_IMPL_ENC_DEC_CUDAGRAPH
,
GiB_bytes
,
cuda_device_count_stateless
,
get_cpu_memory
,
is_cpu
,
is_hip
,
is_neuron
,
is_openvino
,
is_xpu
,
print_warning_once
)
if
TYPE_CHECKING
:
...
...
@@ -26,7 +31,6 @@ if TYPE_CHECKING:
logger
=
init_logger
(
__name__
)
_GB
=
1
<<
30
_EMBEDDING_MODEL_MAX_NUM_BATCHED_TOKENS
=
32768
_PP_SUPPORTED_MODELS
=
[
...
...
@@ -34,6 +38,7 @@ _PP_SUPPORTED_MODELS = [
"AquilaForCausalLM"
,
"DeepseekV2ForCausalLM"
,
"InternLMForCausalLM"
,
"JAISLMHeadModel"
,
"LlamaForCausalLM"
,
"LLaMAForCausalLM"
,
"MistralForCausalLM"
,
...
...
@@ -87,6 +92,9 @@ class ModelConfig:
enforce_eager: Whether to enforce eager execution. If True, we will
disable CUDA graph and always execute the model in eager mode.
If False, we will use CUDA graph and eager execution in hybrid.
If None, the user did not specify, so default to False -
except for encoder/decoder models, which currently require
eager mode.
max_context_len_to_capture: Maximum context len covered by CUDA graphs.
When a sequence has context length larger than this, we fall back
to eager mode (DEPRECATED. Use max_seq_len_to_capture instead).
...
...
@@ -103,6 +111,8 @@ class ModelConfig:
matches the model name exposed via the APIs. If multiple model
names provided, the first name will be used. If not specified,
the model name will be the same as `model`.
limit_mm_per_prompt: Maximum number of data instances per modality
per prompt. Only applicable for multimodal models.
"""
def
__init__
(
...
...
@@ -119,16 +129,17 @@ class ModelConfig:
rope_theta
:
Optional
[
float
]
=
None
,
tokenizer_revision
:
Optional
[
str
]
=
None
,
max_model_len
:
Optional
[
int
]
=
None
,
spec_target_max_model_len
:
Optional
[
int
]
=
None
,
quantization
:
Optional
[
str
]
=
None
,
quantization_param_path
:
Optional
[
str
]
=
None
,
enforce_eager
:
bool
=
Fals
e
,
enforce_eager
:
Optional
[
bool
]
=
Non
e
,
max_context_len_to_capture
:
Optional
[
int
]
=
None
,
max_seq_len_to_capture
:
Optional
[
int
]
=
None
,
max_logprobs
:
int
=
20
,
disable_sliding_window
:
bool
=
False
,
skip_tokenizer_init
:
bool
=
False
,
served_model_name
:
Optional
[
Union
[
str
,
List
[
str
]]]
=
None
,
multimodal_config
:
Optional
[
"MultiModalConfig"
]
=
None
,
limit_mm_per_prompt
:
Optional
[
Mapping
[
str
,
int
]
]
=
None
,
)
->
None
:
self
.
model
=
model
self
.
tokenizer
=
tokenizer
...
...
@@ -158,8 +169,38 @@ class ModelConfig:
self
.
hf_config
=
get_config
(
self
.
model
,
trust_remote_code
,
revision
,
code_revision
,
rope_scaling
,
rope_theta
)
self
.
hf_text_config
=
get_hf_text_config
(
self
.
hf_config
)
self
.
hf_image_processor_config
=
get_hf_image_processor_config
(
self
.
model
,
revision
)
self
.
dtype
=
_get_and_verify_dtype
(
self
.
hf_text_config
,
dtype
)
# Choose a default enforce_eager value if the user did not specify
# a value (enforce_eager is None)
if
getattr
(
self
.
hf_config
,
'is_encoder_decoder'
,
False
):
if
self
.
enforce_eager
is
None
:
# *Only for encoder/decoder models* and
# *only if enforce_eager is unset*, override
# to enforce_eager=True
#
# Add a logger message since it is *somewhat* non-intuitive that
# enforce_eager is True when the user has not specified its
# value.
logger
.
info
(
"Forcing enforce_eager == True because "
"enforce_eager setting was unspecified and "
"CUDAGraph is not supported with encoder/ "
"decoder models."
)
self
.
enforce_eager
=
True
if
not
self
.
enforce_eager
:
# Eager mode explicitly disabled by user for an encoder/
# decoder model; however CUDAGRAPH + encoder/decoder is
# not currently supported
raise
ValueError
(
STR_NOT_IMPL_ENC_DEC_CUDAGRAPH
)
elif
self
.
enforce_eager
is
None
:
# *Only for decoder-only models*, enforce_eager
# defaults to False if unset. This is intuitive
# so no logging message needed.
self
.
enforce_eager
=
False
if
(
not
self
.
disable_sliding_window
and
self
.
hf_text_config
.
model_type
==
"gemma2"
and
self
.
hf_text_config
.
sliding_window
is
not
None
):
...
...
@@ -174,17 +215,33 @@ class ModelConfig:
hf_config
=
self
.
hf_text_config
,
max_model_len
=
max_model_len
,
disable_sliding_window
=
self
.
disable_sliding_window
,
sliding_window_len
=
self
.
get_hf_config_sliding_window
())
sliding_window_len
=
self
.
get_hf_config_sliding_window
(),
spec_target_max_model_len
=
spec_target_max_model_len
)
self
.
served_model_name
=
get_served_model_name
(
model
,
served_model_name
)
self
.
multimodal_config
=
multimodal_config
self
.
multimodal_config
=
self
.
_init_
multimodal_config
(
limit_mm_per_prompt
)
if
not
self
.
skip_tokenizer_init
:
self
.
_verify_tokenizer_mode
()
self
.
_verify_embedding_mode
()
self
.
_verify_quantization
()
self
.
_verify_cuda_graph
()
def
_init_multimodal_config
(
self
,
limit_mm_per_prompt
:
Optional
[
Mapping
[
str
,
int
]]
)
->
Optional
[
"MultiModalConfig"
]:
architectures
=
getattr
(
self
.
hf_config
,
"architectures"
,
[])
if
any
(
ModelRegistry
.
is_multimodal_model
(
arch
)
for
arch
in
architectures
):
return
MultiModalConfig
(
limit_per_prompt
=
limit_mm_per_prompt
or
{})
else
:
if
limit_mm_per_prompt
:
raise
ValueError
(
"limit_mm_per_prompt is only supported for multimodal "
"models."
)
return
None
def
_verify_tokenizer_mode
(
self
)
->
None
:
tokenizer_mode
=
self
.
tokenizer_mode
.
lower
()
if
tokenizer_mode
not
in
[
"auto"
,
"slow"
]:
...
...
@@ -207,11 +264,13 @@ class ModelConfig:
def
_verify_quantization
(
self
)
->
None
:
supported_quantization
=
[
*
QUANTIZATION_METHODS
]
rocm_supported_quantization
=
[
"gptq"
,
"squeezellm"
,
"awq"
]
rocm_supported_quantization
=
[
"gptq"
,
"squeezellm"
,
"awq"
]
# "fp8"
optimized_quantization_methods
=
[
"fp8"
,
"marlin"
,
"gptq_marlin_24"
,
"gptq_marlin"
,
"awq_marlin"
,
"fbgemm_fp8"
,
"compressed_tensors"
,
"compressed-tensors"
"fbgemm_fp8"
,
"compressed_tensors"
,
"compressed-tensors"
,
"experts_int8"
]
tpu_supported_quantization
=
[
"tpu_int8"
]
if
self
.
quantization
is
not
None
:
self
.
quantization
=
self
.
quantization
.
lower
()
...
...
@@ -250,6 +309,11 @@ class ModelConfig:
raise
ValueError
(
f
"
{
self
.
quantization
}
quantization is currently not "
f
"supported in ROCm."
)
if
current_platform
.
is_tpu
(
)
and
self
.
quantization
not
in
tpu_supported_quantization
:
raise
ValueError
(
f
"
{
self
.
quantization
}
quantization is currently not "
f
"supported in TPU Backend."
)
if
self
.
quantization
not
in
optimized_quantization_methods
:
logger
.
warning
(
"%s quantization is not fully "
...
...
@@ -290,8 +354,9 @@ class ModelConfig:
"BitAndBytes quantization with TP or PP is not supported yet."
)
if
self
.
quantization
==
"bitsandbytes"
and
self
.
enforce_eager
is
False
:
raise
ValueError
(
"BitAndBytes with enforce_eager = False is not supported yet."
)
logger
.
warning
(
"CUDA graph is not supported on BitAndBytes yet, "
"fallback to the eager mode."
)
self
.
enforce_eager
=
True
def
get_hf_config_sliding_window
(
self
)
->
Optional
[
int
]:
"""Get the sliding window size, or None if disabled."""
...
...
@@ -425,6 +490,28 @@ class ModelConfig:
if
t
!=
"attention"
])
def
get_multimodal_config
(
self
)
->
"MultiModalConfig"
:
"""
Get the multimodal configuration of the model.
Raises:
ValueError: If the model is not multimodal.
"""
if
self
.
multimodal_config
is
None
:
raise
ValueError
(
"The model is not multimodal."
)
return
self
.
multimodal_config
@
property
def
is_encoder_decoder_model
(
self
)
->
bool
:
"""Extract the HF encoder/decoder model flag."""
return
getattr
(
self
.
hf_config
,
"is_encoder_decoder"
,
False
)
@
property
def
is_embedding_model
(
self
)
->
bool
:
"""Extract the embedding model flag."""
return
self
.
embedding_mode
class
CacheConfig
:
"""Configuration for the KV cache.
...
...
@@ -443,7 +530,7 @@ class CacheConfig:
self
,
block_size
:
int
,
gpu_memory_utilization
:
float
,
swap_space
:
in
t
,
swap_space
:
floa
t
,
cache_dtype
:
str
,
num_gpu_blocks_override
:
Optional
[
int
]
=
None
,
sliding_window
:
Optional
[
int
]
=
None
,
...
...
@@ -452,7 +539,7 @@ class CacheConfig:
)
->
None
:
self
.
block_size
=
block_size
self
.
gpu_memory_utilization
=
gpu_memory_utilization
self
.
swap_space_bytes
=
swap_space
*
_GB
self
.
swap_space_bytes
=
swap_space
*
GiB_bytes
self
.
num_gpu_blocks_override
=
num_gpu_blocks_override
self
.
cache_dtype
=
cache_dtype
self
.
sliding_window
=
sliding_window
...
...
@@ -497,10 +584,6 @@ class CacheConfig:
raise
NotImplementedError
(
"Prefix caching is not supported with sliding window. "
"Run with --disable-sliding-window to use prefix caching."
)
if
self
.
cache_dtype
==
"fp8"
:
raise
NotImplementedError
(
"Prefix caching is not supported for fp8 cache_dtype. "
"Run with --kv-cache-dtype auto to use prefix caching."
)
def
verify_with_parallel_config
(
self
,
...
...
@@ -512,9 +595,9 @@ class CacheConfig:
num_gpus_per_node
=
parallel_config
.
tensor_parallel_size
cpu_memory_usage
=
self
.
swap_space_bytes
*
num_gpus_per_node
msg
=
(
f
"
{
cpu_memory_usage
/
_GB
:.
2
f
}
GiB out of "
f
"
the
{
total_cpu_memory
/
_GB
:.
2
f
}
GiB total CPU memory
is
"
"allocated for the swap space."
)
msg
=
(
f
"
{
cpu_memory_usage
/
GiB_bytes
:.
2
f
}
GiB out of
the
"
f
"
{
total_cpu_memory
/
GiB_bytes
:.
2
f
}
GiB total CPU memory "
"
is
allocated for the swap space."
)
if
cpu_memory_usage
>
0.7
*
total_cpu_memory
:
raise
ValueError
(
"Too large swap space. "
+
msg
)
elif
cpu_memory_usage
>
0.4
*
total_cpu_memory
:
...
...
@@ -582,6 +665,7 @@ class LoadFormat(str, enum.Enum):
DUMMY
=
"dummy"
TENSORIZER
=
"tensorizer"
SHARDED_STATE
=
"sharded_state"
GGUF
=
"gguf"
BITSANDBYTES
=
"bitsandbytes"
...
...
@@ -692,8 +776,8 @@ class ParallelConfig:
self
.
tokenizer_pool_config
=
tokenizer_pool_config
self
.
ray_workers_use_nsight
=
ray_workers_use_nsight
self
.
placement_group
=
placement_group
self
.
world_size
=
pipeline_parallel_size
*
self
.
tensor_parallel_size
if
worker_use_ray
:
if
self
.
distributed_executor_backend
is
None
:
self
.
distributed_executor_backend
=
"ray"
...
...
@@ -789,6 +873,11 @@ class SchedulerConfig:
swapping. However, when the sequence group has multiple sequences
(e.g., beam search), recomputation is not currently supported. In
such a case, we use swapping instead.
send_delta_data: Private API. If used, scheduler sends delta data to
workers instead of an entire data. It should be enabled only
when SPMD worker architecture is enabled. I.e.,
VLLM_USE_RAY_SPMD_WORKER=1
"""
def
__init__
(
self
,
...
...
@@ -800,7 +889,9 @@ class SchedulerConfig:
delay_factor
:
float
=
0.0
,
enable_chunked_prefill
:
bool
=
False
,
embedding_mode
:
Optional
[
bool
]
=
False
,
preemption_mode
:
Optional
[
str
]
=
None
)
->
None
:
preemption_mode
:
Optional
[
str
]
=
None
,
num_scheduler_steps
:
int
=
1
,
send_delta_data
:
bool
=
False
)
->
None
:
if
max_num_batched_tokens
is
not
None
:
self
.
max_num_batched_tokens
=
max_num_batched_tokens
else
:
...
...
@@ -829,6 +920,8 @@ class SchedulerConfig:
self
.
chunked_prefill_enabled
=
enable_chunked_prefill
self
.
embedding_mode
=
embedding_mode
self
.
preemption_mode
=
preemption_mode
self
.
num_scheduler_steps
=
num_scheduler_steps
self
.
send_delta_data
=
send_delta_data
self
.
_verify_args
()
def
_verify_args
(
self
)
->
None
:
...
...
@@ -854,6 +947,16 @@ class SchedulerConfig:
f
"(
{
self
.
num_lookahead_slots
}
) must be greater than or "
"equal to 0."
)
if
self
.
num_scheduler_steps
<
1
:
raise
ValueError
(
"num_scheduler_steps "
f
"(
{
self
.
num_scheduler_steps
}
) must be greater than or "
"equal to 1."
)
@
property
def
is_multi_step
(
self
)
->
bool
:
return
self
.
num_scheduler_steps
>
1
class
DeviceConfig
:
device
:
Optional
[
torch
.
device
]
...
...
@@ -865,7 +968,7 @@ class DeviceConfig:
self
.
device_type
=
"neuron"
elif
is_openvino
():
self
.
device_type
=
"openvino"
elif
is_tpu
():
elif
current_platform
.
is_tpu
():
self
.
device_type
=
"tpu"
elif
is_cpu
():
self
.
device_type
=
"cpu"
...
...
@@ -902,6 +1005,7 @@ class SpeculativeConfig:
target_parallel_config
:
ParallelConfig
,
target_dtype
:
str
,
speculative_model
:
Optional
[
str
],
speculative_model_quantization
:
Optional
[
str
],
speculative_draft_tensor_parallel_size
:
Optional
[
int
],
num_speculative_tokens
:
Optional
[
int
],
speculative_max_model_len
:
Optional
[
int
],
...
...
@@ -930,6 +1034,9 @@ class SpeculativeConfig:
target_dtype (str): The data type used for the target model.
speculative_model (Optional[str]): The name of the speculative
model, if provided.
speculative_model_quantization (Optional[str]): Quantization method
that was used to quantize the speculative model weights. If
None, we assume the model weights are not quantized.
speculative_draft_tensor_parallel_size (Optional[int]): The degree
of the tensor parallelism for the draft model.
num_speculative_tokens (Optional[int]): The number of speculative
...
...
@@ -997,11 +1104,11 @@ class SpeculativeConfig:
"Speculative decoding requires usage of the V2 "
"block manager. Enable it with --use-v2-block-manager."
)
# TODO: The user should be able to specify revision/
quantization/max
#
model len
for the draft model. It is not currently supported.
# TODO: The user should be able to specify revision/
max model len
# for the draft model. It is not currently supported.
draft_revision
=
None
draft_code_revision
=
None
draft_quantization
=
None
draft_quantization
=
speculative_model_quantization
if
speculative_model
==
"[ngram]"
:
if
ngram_prompt_lookup_min
is
None
:
...
...
@@ -1033,6 +1140,7 @@ class SpeculativeConfig:
code_revision
=
draft_code_revision
,
tokenizer_revision
=
target_model_config
.
tokenizer_revision
,
max_model_len
=
None
,
spec_target_max_model_len
=
target_model_config
.
max_model_len
,
quantization
=
draft_quantization
,
enforce_eager
=
target_model_config
.
enforce_eager
,
max_seq_len_to_capture
=
target_model_config
.
...
...
@@ -1158,7 +1266,7 @@ class SpeculativeConfig:
elif
speculative_draft_tensor_parallel_size
!=
1
:
# TODO(wooyeon): allow tp values larger than 1
raise
ValueError
(
f
"
{
speculative_draft_tensor_parallel_size
=
}
cannot be"
f
"
{
speculative_draft_tensor_parallel_size
=
}
cannot be
"
f
"other value than 1"
)
draft_parallel_config
=
ParallelConfig
(
...
...
@@ -1310,8 +1418,9 @@ class LoRAConfig:
long_lora_scaling_factors
:
Optional
[
Tuple
[
float
]]
=
None
def
__post_init__
(
self
):
# TODO: Increase the range of rank
possible_max_ranks
=
(
8
,
16
,
32
,
64
)
# Setting the maximum rank to 256 should be able to satisfy the vast
# majority of applications.
possible_max_ranks
=
(
8
,
16
,
32
,
64
,
128
,
256
)
possible_lora_extra_vocab_size
=
(
0
,
256
,
512
)
if
self
.
max_lora_rank
not
in
possible_max_ranks
:
raise
ValueError
(
...
...
@@ -1343,11 +1452,6 @@ class LoRAConfig:
model_config
.
quantization
)
def
verify_with_scheduler_config
(
self
,
scheduler_config
:
SchedulerConfig
):
if
scheduler_config
.
max_num_batched_tokens
>
65528
:
raise
ValueError
(
"Due to limitations of the custom LoRA CUDA kernel, "
"max_num_batched_tokens must be <= 65528 when "
"LoRA is enabled."
)
if
scheduler_config
.
chunked_prefill_enabled
:
raise
ValueError
(
"LoRA is not supported with chunked prefill yet."
)
...
...
@@ -1387,10 +1491,15 @@ class PromptAdapterConfig:
@
dataclass
class
MultiModalConfig
:
"""Configs the input data format and how models should run for
multimodal models."""
"""Controls the behavior of multimodal models."""
limit_per_prompt
:
Mapping
[
str
,
int
]
=
field
(
default_factory
=
dict
)
"""
The maximum number of multi-modal input instances allowed per prompt
for each :class:`~vllm.multimodal.MultiModalPlugin`.
"""
# TODO: Add configs to init vision tower or not.
pass
_STR_DTYPE_TO_TORCH_DTYPE
=
{
...
...
@@ -1461,6 +1570,7 @@ def _get_and_verify_max_len(
max_model_len
:
Optional
[
int
],
disable_sliding_window
:
bool
,
sliding_window_len
:
Optional
[
int
],
spec_target_max_model_len
:
Optional
[
int
]
=
None
,
)
->
int
:
"""Get and verify the model's maximum length."""
derived_max_model_len
=
float
(
"inf"
)
...
...
@@ -1503,6 +1613,11 @@ def _get_and_verify_max_len(
# If max_model_len is specified, we use it.
return
max_model_len
if
spec_target_max_model_len
is
not
None
:
# If this is a speculative draft model, we use the max model len
# from the target model.
return
spec_target_max_model_len
default_max_len
=
2048
logger
.
warning
(
"The model's config.json does not contain any of the following "
...
...
@@ -1610,10 +1725,27 @@ class ObservabilityConfig:
"""Configuration for observability."""
otlp_traces_endpoint
:
Optional
[
str
]
=
None
# Collecting detailed timing information for each request can be expensive.
# If set, collects the model forward time for the request.
collect_model_forward_time
:
bool
=
False
# If set, collects the model execute time for the request.
collect_model_execute_time
:
bool
=
False
def
__post_init__
(
self
):
if
not
is_otel_installed
()
and
self
.
otlp_traces_endpoint
is
not
None
:
raise
ValueError
(
"OpenTelemetry packages must be installed before "
"configuring 'otlp_traces_endpoint'"
)
if
not
is_otel_available
()
and
self
.
otlp_traces_endpoint
is
not
None
:
raise
ValueError
(
"OpenTelemetry is not available. Unable to configure "
"'otlp_traces_endpoint'. Ensure OpenTelemetry packages are "
f
"installed. Original error:
\n
{
otel_import_error_traceback
}
"
)
if
((
self
.
collect_model_forward_time
or
self
.
collect_model_execute_time
)
and
self
.
otlp_traces_endpoint
is
None
):
raise
ValueError
(
"collect_model_forward_time or collect_model_execute_time "
"requires --otlp-traces-endpoint to be set."
)
@
dataclass
(
frozen
=
True
)
...
...
@@ -1629,7 +1761,6 @@ class EngineConfig:
device_config
:
DeviceConfig
load_config
:
LoadConfig
lora_config
:
Optional
[
LoRAConfig
]
multimodal_config
:
Optional
[
MultiModalConfig
]
speculative_config
:
Optional
[
SpeculativeConfig
]
decoding_config
:
Optional
[
DecodingConfig
]
observability_config
:
Optional
[
ObservabilityConfig
]
...
...
vllm/core/block/block_table.py
View file @
af7f4372
...
...
@@ -356,7 +356,13 @@ class BlockTable:
appended to blocks. The first such "token block" may have less token ids
than the block size, since the last allocated block may be partially
full.
If no token ids are provided, then no chunks are returned.
"""
if
not
token_ids
:
return
[]
first_chunk_size
=
self
.
_block_size
-
(
self
.
_num_full_slots
%
self
.
_block_size
)
token_blocks
=
[
token_ids
[:
first_chunk_size
]]
...
...
vllm/core/block/common.py
View file @
af7f4372
from
collections
import
deque
from
dataclasses
import
dataclass
from
typing
import
Deque
,
Dict
,
Iterable
,
List
,
Optional
,
Protocol
,
Tuple
from
vllm.core.block.interfaces
import
Block
,
BlockAllocator
...
...
@@ -282,6 +283,58 @@ class BlockList:
return
self
.
_block_ids
@
dataclass
class
CacheMetricData
:
"""A utility dataclass to maintain cache metric.
To avoid overflow, we maintain the hit rate in block granularity, so that
we can maintain a single hit rate for n_completed_block x block_size,
and calculate the real time hit rate by the following:
BS = The number of queries per block.
nB = The number of completed blocks.
HR = hit rate of (nB x BS) queries.
Q = current number of queries (< BS).
H = current number of hits (< BS).
hit rate = ((HR x nB) + (H / Q) x (Q / BS)) / (nB + Q / BS)
"""
num_completed_blocks
:
int
=
0
completed_block_cache_hit_rate
:
float
=
0.0
num_incompleted_block_queries
:
int
=
0
num_incompleted_block_hit
:
int
=
0
block_size
:
int
=
1000
def
query
(
self
,
hit
:
bool
):
self
.
num_incompleted_block_queries
+=
1
self
.
num_incompleted_block_hit
+=
1
if
hit
else
0
# When a block is completed, update the cache hit rate
# and reset the incomplete numbers.
if
self
.
num_incompleted_block_queries
==
self
.
block_size
:
hit_rate
=
(
self
.
num_incompleted_block_hit
/
self
.
num_incompleted_block_queries
)
self
.
completed_block_cache_hit_rate
=
(
self
.
completed_block_cache_hit_rate
*
self
.
num_completed_blocks
+
hit_rate
)
/
(
self
.
num_completed_blocks
+
1
)
self
.
num_incompleted_block_queries
=
0
self
.
num_incompleted_block_hit
=
0
self
.
num_completed_blocks
+=
1
def
get_hit_rate
(
self
):
incomplete_ratio
=
self
.
num_incompleted_block_queries
/
self
.
block_size
total_blocks
=
self
.
num_completed_blocks
+
incomplete_ratio
if
total_blocks
==
0
:
return
0.0
completed_block_hit
,
incompleted_block_hit
=
0.0
,
0.0
if
self
.
num_completed_blocks
>
0
:
completed_block_hit
=
(
self
.
completed_block_cache_hit_rate
*
self
.
num_completed_blocks
)
if
self
.
num_incompleted_block_queries
>
0
:
incompleted_hit_rate
=
(
self
.
num_incompleted_block_hit
/
self
.
num_incompleted_block_queries
)
incompleted_block_hit
=
(
incompleted_hit_rate
*
incomplete_ratio
)
return
(
completed_block_hit
+
incompleted_block_hit
)
/
total_blocks
def
get_all_blocks_recursively
(
last_block
:
Block
)
->
List
[
Block
]:
"""Retrieves all the blocks in a sequence starting from the last block.
...
...
vllm/core/block/cpu_gpu_block_allocator.py
View file @
af7f4372
...
...
@@ -323,6 +323,11 @@ class CpuGpuBlockAllocator(DeviceAwareBlockAllocator):
def
all_block_ids
(
self
)
->
FrozenSet
[
int
]:
return
frozenset
(
self
.
_block_ids_to_allocator
.
keys
())
def
get_prefix_cache_hit_rate
(
self
,
device
:
Device
)
->
float
:
"""Prefix cache hit rate. -1 means not supported or disabled."""
assert
device
in
self
.
_allocators
return
self
.
_allocators
[
device
].
get_prefix_cache_hit_rate
()
def
get_and_reset_swaps
(
self
)
->
List
[
Tuple
[
int
,
int
]]:
"""Returns and clears the mapping of source to destination block IDs.
Will be called after every swapping operations for now, and after every
...
...
vllm/core/block/interfaces.py
View file @
af7f4372
...
...
@@ -186,6 +186,11 @@ class BlockAllocator(ABC):
num_lookahead_slots
:
int
=
0
)
->
int
:
pass
@
abstractmethod
def
get_prefix_cache_hit_rate
(
self
)
->
float
:
"""Prefix cache hit rate. -1 means not supported or disabled."""
pass
class
NoFreeBlocksError
(
ValueError
):
pass
...
...
@@ -278,3 +283,8 @@ class DeviceAwareBlockAllocator(ABC):
There is at most one null block per allocator.
"""
pass
@
abstractmethod
def
get_prefix_cache_hit_rate
(
self
,
device
:
Device
)
->
float
:
"""Prefix cache hit rate. -1 means not supported or disabled."""
pass
vllm/core/block/naive_block.py
View file @
af7f4372
...
...
@@ -307,9 +307,8 @@ class NaiveBlockAllocator(BlockAllocator):
# TODO(cade): make sure the logic is correct and clean it up.
for
block
in
blocks
:
if
not
block
.
is_full
and
num_lookahead_slots
!=
0
:
if
block
.
num_empty_slots
>=
num_lookahead_slots
:
new_block_count
+=
1
else
:
new_block_count
+=
1
if
num_lookahead_slots
>
block
.
num_empty_slots
:
new_block_count
+=
cdiv
(
num_lookahead_slots
-
block
.
num_empty_slots
,
self
.
_block_size
)
...
...
@@ -342,6 +341,9 @@ class NaiveBlockAllocator(BlockAllocator):
block
.
block_id
=
block_id
# Assign block_id
def
get_prefix_cache_hit_rate
(
self
)
->
float
:
return
-
1
class
NaiveBlock
(
Block
):
"""An implementation of the Block class that does not support prefix
...
...
vllm/core/block/prefix_caching_block.py
View file @
af7f4372
"""Token blocks."""
from
os.path
import
commonprefix
from
typing
import
Dict
,
FrozenSet
,
Iterable
,
List
,
Optional
,
Tuple
from
vllm.core.block.common
import
(
CopyOnWriteTracker
,
from
vllm.core.block.common
import
(
CacheMetricData
,
CopyOnWriteTracker
,
get_all_blocks_recursively
)
from
vllm.core.block.interfaces
import
Block
,
BlockAllocator
,
BlockId
,
Device
from
vllm.core.block.naive_block
import
(
BlockPool
,
NaiveBlock
,
...
...
@@ -107,6 +106,8 @@ class PrefixCachingBlockAllocator(BlockAllocator):
self
.
_cow_tracker
=
CopyOnWriteTracker
(
refcounter
=
self
.
_refcounter
.
as_readonly
())
self
.
metric_data
=
CacheMetricData
()
# Implements Block.Factory.
def
_create_block
(
self
,
...
...
@@ -155,9 +156,11 @@ class PrefixCachingBlockAllocator(BlockAllocator):
cached_block_id
=
self
.
_cached_blocks
.
get
(
block
.
content_hash
,
None
)
if
cached_block_id
is
not
None
:
self
.
metric_data
.
query
(
hit
=
True
)
block
.
block_id
=
cached_block_id
self
.
_incr_refcount_cached_block
(
block
)
return
block
self
.
metric_data
.
query
(
hit
=
False
)
self
.
_block_pool
.
free_block
(
block
)
# No cached block => Allocate a new block
...
...
@@ -404,6 +407,9 @@ class PrefixCachingBlockAllocator(BlockAllocator):
def
all_block_ids
(
self
)
->
FrozenSet
[
int
]:
return
self
.
_hashless_allocator
.
all_block_ids
def
get_prefix_cache_hit_rate
(
self
)
->
float
:
return
self
.
metric_data
.
get_hit_rate
()
def
is_block_cached
(
self
,
block
:
Block
)
->
bool
:
assert
block
.
content_hash
is
not
None
if
block
.
content_hash
in
self
.
_cached_blocks
:
...
...
@@ -579,14 +585,17 @@ class PrefixCachingBlockAllocator(BlockAllocator):
num_touched_blocks
=
0
for
block
in
blocks
:
if
not
block
.
is_full
:
if
block
.
num_empty_slots
>=
num_lookahead_slots
:
num_touched_blocks
+=
1
else
:
num_touched_blocks
+=
1
if
num_lookahead_slots
>
block
.
num_empty_slots
:
num_touched_blocks
+=
cdiv
(
num_lookahead_slots
-
block
.
num_empty_slots
,
self
.
_block_size
)
else
:
if
not
self
.
is_block_cached
(
block
):
# If the block has a match in the cache and the cached block
# is not referenced, then we still count it as a touched block
if
not
self
.
is_block_cached
(
block
)
or
\
(
block
.
content_hash
is
not
None
and
\
self
.
_cached_blocks
[
block
.
content_hash
]
in
self
.
evictor
):
num_touched_blocks
+=
1
return
num_touched_blocks
...
...
vllm/core/block/utils.py
View file @
af7f4372
"""Block manager utils."""
from
vllm.sequence
import
SequenceGroup
# Exception strings for non-implemented block manager enc/dec scenarios
STR_NOT_IMPL_ENC_DEC_SWA
=
\
"Sliding window attention for encoder/decoder models "
+
\
"is not currently supported."
STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE
=
\
"Prefix caching for encoder/decoder models "
+
\
"is not currently supported."
from
vllm.utils
import
(
STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE
,
STR_NOT_IMPL_ENC_DEC_SWA
)
def
_get_block_mgr_sliding_window_attr
(
block_mgr
):
...
...
vllm/core/block_manager_v1.py
View file @
af7f4372
...
...
@@ -8,6 +8,7 @@ from typing import Sequence as GenericSequence
from
typing
import
Set
,
Tuple
from
vllm.block
import
BlockTable
,
PhysicalTokenBlock
from
vllm.core.block.common
import
CacheMetricData
from
vllm.core.block.utils
import
check_no_caching_or_swa_for_blockmgr_encdec
from
vllm.core.evictor_v1
import
EvictionPolicy
,
Evictor
,
make_evictor
from
vllm.core.interfaces
import
AllocStatus
,
BlockSpaceManager
...
...
@@ -60,6 +61,11 @@ class BlockAllocatorBase(ABC):
def
update_hash
(
self
,
block_hash
:
int
,
block
:
PhysicalTokenBlock
):
pass
@
abstractmethod
def
get_prefix_cache_hit_rate
(
self
)
->
float
:
"""Prefix cache hit rate. -1 means not supported or disabled."""
pass
class
CachedBlockAllocator
(
BlockAllocatorBase
):
"""Manages free physical token blocks for a device.
...
...
@@ -85,6 +91,8 @@ class CachedBlockAllocator(BlockAllocatorBase):
self
.
default_hash_ctr
=
count
()
self
.
cache_metric_data
=
CacheMetricData
()
def
allocate_block
(
self
,
block_hash
:
int
,
num_hashed_tokens
:
int
)
->
PhysicalTokenBlock
:
if
self
.
current_num_blocks
==
self
.
num_blocks
:
...
...
@@ -105,15 +113,17 @@ class CachedBlockAllocator(BlockAllocatorBase):
num_hashed_tokens
:
int
=
0
)
->
PhysicalTokenBlock
:
if
block_hash
is
None
:
block_hash
=
next
(
self
.
default_hash_ctr
)
if
block_hash
in
self
.
evictor
:
assert
block_hash
not
in
self
.
cached_blocks
block
=
self
.
evictor
.
remove
(
block_hash
)
assert
block
.
ref_count
==
0
self
.
cached_blocks
[
block_hash
]
=
block
block
.
ref_count
+=
1
assert
block
.
block_hash
==
block_hash
return
block
if
block_hash
not
in
self
.
cached_blocks
:
if
block_hash
in
self
.
cached_blocks
:
self
.
cache_metric_data
.
query
(
hit
=
True
)
else
:
self
.
cache_metric_data
.
query
(
hit
=
False
)
self
.
cached_blocks
[
block_hash
]
=
self
.
allocate_block
(
block_hash
,
num_hashed_tokens
)
block
=
self
.
cached_blocks
[
block_hash
]
...
...
@@ -150,6 +160,9 @@ class CachedBlockAllocator(BlockAllocatorBase):
del
self
.
cached_blocks
[
old_hash
]
self
.
cached_blocks
[
block_hash
]
=
block
def
get_prefix_cache_hit_rate
(
self
)
->
float
:
return
self
.
cache_metric_data
.
get_hit_rate
()
class
UncachedBlockAllocator
(
BlockAllocatorBase
):
"""Manages free physical token blocks for a device.
...
...
@@ -170,7 +183,7 @@ class UncachedBlockAllocator(BlockAllocatorBase):
self
.
num_blocks
=
num_blocks
# Initialize the free blocks.
self
.
free_blocks
:
BlockTable
=
[]
self
.
free_blocks
:
List
[
PhysicalTokenBlock
]
=
[]
for
i
in
range
(
num_blocks
):
block
=
PhysicalTokenBlock
(
device
=
device
,
block_number
=
i
,
...
...
@@ -209,6 +222,9 @@ class UncachedBlockAllocator(BlockAllocatorBase):
raise
NotImplementedError
(
"Invalid codepath for uncached block allocator."
)
def
get_prefix_cache_hit_rate
(
self
)
->
float
:
return
-
1
class
BlockSpaceManagerV1
(
BlockSpaceManager
):
"""Manages the mapping between logical and physical token blocks."""
...
...
@@ -256,6 +272,7 @@ class BlockSpaceManagerV1(BlockSpaceManager):
Device
.
CPU
,
block_size
,
num_cpu_blocks
)
# Mapping: seq_id -> BlockTable.
self
.
block_tables
:
Dict
[
int
,
BlockTable
]
=
{}
# Mapping: req_id -> BlockTable
# Note that each SequenceGroup has a unique
# request ID
...
...
@@ -299,7 +316,7 @@ class BlockSpaceManagerV1(BlockSpaceManager):
# Allocate new physical token blocks that will store the prompt tokens.
num_prompt_blocks
=
seq
.
n_blocks
block_table
:
BlockTable
=
[]
block_table
:
BlockTable
=
BlockTable
()
for
logical_idx
in
range
(
num_prompt_blocks
):
if
(
self
.
block_sliding_window
is
not
None
and
logical_idx
>=
self
.
block_sliding_window
):
...
...
@@ -326,15 +343,19 @@ class BlockSpaceManagerV1(BlockSpaceManager):
#
# NOTE: Here we assume that all sequences in the group have the same
# decoder prompt.
seq
=
seq_group
.
get_seqs
(
status
=
SequenceStatus
.
WAITING
)[
0
]
wait_seqs
=
seq_group
.
get_seqs
(
status
=
SequenceStatus
.
WAITING
)
seq
=
wait_seqs
[
0
]
block_table
:
BlockTable
=
\
self
.
_allocate_sequence
(
seq
,
seq_group
.
num_seqs
(),
is_encoder_decoder
)
# Assign the self-attention block tables for each sequence.
for
seq
in
seq_group
.
get_seqs
(
status
=
SequenceStatus
.
WAITING
):
self
.
block_tables
[
seq
.
seq_id
]
=
block_table
.
copy
()
if
len
(
wait_seqs
)
==
1
:
self
.
block_tables
[
seq
.
seq_id
]
=
block_table
else
:
for
seq
in
wait_seqs
:
self
.
block_tables
[
seq
.
seq_id
]
=
block_table
.
copy
()
# Allocate encoder sequence
if
is_encoder_decoder
:
...
...
@@ -476,6 +497,7 @@ class BlockSpaceManagerV1(BlockSpaceManager):
return
src_block_table
=
self
.
block_tables
[
parent_seq
.
seq_id
]
self
.
block_tables
[
child_seq
.
seq_id
]
=
src_block_table
.
copy
()
# When using a sliding window, blocks will be eventually reused.
# In this case the block tables will contain repeated blocks.
# When forking, we must make sure that each block's `ref_count`
...
...
@@ -527,7 +549,7 @@ class BlockSpaceManagerV1(BlockSpaceManager):
dest_allocator
:
BlockAllocatorBase
,
mapping
:
Dict
[
PhysicalTokenBlock
,
PhysicalTokenBlock
])
->
BlockTable
:
new_block_table
=
[]
new_block_table
:
BlockTable
=
BlockTable
()
for
from_block
in
block_table
:
if
from_block
in
mapping
:
...
...
@@ -553,8 +575,7 @@ class BlockSpaceManagerV1(BlockSpaceManager):
for
seq
in
seq_group
.
get_seqs
(
status
=
SequenceStatus
.
SWAPPED
):
self
.
block_tables
[
seq
.
seq_id
]
=
\
self
.
_swap_block_table
(
self
.
block_tables
[
seq
.
seq_id
],
self
.
cpu_allocator
,
self
.
gpu_allocator
,
self
.
cpu_allocator
,
self
.
gpu_allocator
,
mapping
)
if
seq_group
.
is_encoder_decoder
():
...
...
@@ -580,8 +601,7 @@ class BlockSpaceManagerV1(BlockSpaceManager):
for
seq
in
seq_group
.
get_seqs
(
status
=
SequenceStatus
.
RUNNING
):
self
.
block_tables
[
seq
.
seq_id
]
=
\
self
.
_swap_block_table
(
self
.
block_tables
[
seq
.
seq_id
],
self
.
gpu_allocator
,
self
.
cpu_allocator
,
self
.
gpu_allocator
,
self
.
cpu_allocator
,
mapping
)
if
seq_group
.
is_encoder_decoder
():
...
...
@@ -636,8 +656,7 @@ class BlockSpaceManagerV1(BlockSpaceManager):
self
.
cross_block_tables
.
clear
()
def
get_block_table
(
self
,
seq
:
Sequence
)
->
List
[
int
]:
block_table
=
self
.
block_tables
[
seq
.
seq_id
]
return
[
block
.
block_number
for
block
in
block_table
]
return
self
.
block_tables
[
seq
.
seq_id
].
ids
()
def
get_cross_block_table
(
self
,
seq_group
:
SequenceGroup
)
->
List
[
int
]:
block_table
=
self
.
cross_block_tables
[
seq_group
.
request_id
]
...
...
@@ -702,3 +721,10 @@ class BlockSpaceManagerV1(BlockSpaceManager):
if
self
.
enable_caching
:
for
seq
in
seq_group
.
get_seqs
():
self
.
compute_full_blocks_in_seq
(
seq
)
def get_prefix_cache_hit_rate(self, device: Device) -> float:
    """Return the prefix cache hit rate of the allocator for `device`.

    Args:
        device: Device.GPU selects the GPU allocator, Device.CPU the CPU
            allocator.

    Returns:
        Whatever the selected allocator's get_prefix_cache_hit_rate()
        returns.

    Raises:
        ValueError: If `device` is neither Device.GPU nor Device.CPU.
    """
    if device == Device.GPU:
        allocator = self.gpu_allocator
    elif device == Device.CPU:
        allocator = self.cpu_allocator
    else:
        raise ValueError(f"Invalid device: {device}")
    return allocator.get_prefix_cache_hit_rate()
vllm/core/block_manager_v2.py
View file @
af7f4372
...
...
@@ -441,6 +441,9 @@ class BlockSpaceManagerV2(BlockSpaceManager):
def
get_num_free_cpu_blocks
(
self
)
->
int
:
return
self
.
block_allocator
.
get_num_free_blocks
(
Device
.
CPU
)
def get_prefix_cache_hit_rate(self, device: Device) -> float:
    """Forward the hit-rate query for `device` to the block allocator."""
    allocator = self.block_allocator
    return allocator.get_prefix_cache_hit_rate(device)
def
_can_swap
(
self
,
seq_group
:
SequenceGroup
,
device
:
Device
,
...
...
vllm/core/embedding_model_block_manager.py
View file @
af7f4372
...
...
@@ -2,6 +2,7 @@ from typing import List, Tuple
from
vllm.core.interfaces
import
AllocStatus
,
BlockSpaceManager
from
vllm.sequence
import
Sequence
,
SequenceGroup
from
vllm.utils
import
Device
class
EmbeddingModelBlockSpaceManager
(
BlockSpaceManager
):
...
...
@@ -81,3 +82,6 @@ class EmbeddingModelBlockSpaceManager(BlockSpaceManager):
def
mark_blocks_as_computed
(
self
,
seq_group
:
SequenceGroup
):
pass
def get_prefix_cache_hit_rate(self, device: Device) -> float:
    # Always returns -1 and never consults `device`. The BlockSpaceManager
    # interface docstring defines -1 as "not supported or disabled".
    return -1
vllm/core/evictor_v2.py
View file @
af7f4372
...
...
@@ -85,18 +85,21 @@ class LRUEvictor(Evictor):
if
len
(
self
.
free_table
)
==
0
:
raise
ValueError
(
"No usable cache memory left"
)
evicted_block
=
next
(
iter
(
self
.
free_table
.
values
()))
evicted_block_id
=
next
(
iter
(
self
.
free_table
.
keys
()))
evicted_block
,
evicted_block_id
=
None
,
None
# The blocks with the lowest timestamps should be placed consecutively
# at the start of OrderedDict. Loop through all these blocks to
# find the one with maximum number of hashed tokens.
for
_id
,
block
in
self
.
free_table
.
items
():
if
evicted_block
.
last_accessed
>
block
.
last_accessed
or
(
evicted_block
.
last_accessed
==
block
.
last_accessed
and
evicted_block
.
num_hashed_tokens
<
block
.
num_hashed_tokens
):
evicted_block
=
block
evicted_block_id
=
_id
if
evicted_block
is
None
:
evicted_block
,
evicted_block_id
=
block
,
_id
continue
if
evicted_block
.
last_accessed
<
block
.
last_accessed
:
break
if
evicted_block
.
num_hashed_tokens
<
block
.
num_hashed_tokens
:
evicted_block
,
evicted_block_id
=
block
,
_id
assert
evicted_block
is
not
None
assert
evicted_block_id
is
not
None
self
.
free_table
.
pop
(
evicted_block_id
)
return
evicted_block_id
,
evicted_block
.
content_hash
...
...
vllm/core/interfaces.py
View file @
af7f4372
...
...
@@ -5,6 +5,7 @@ from typing import Sequence as GenericSequence
from
typing
import
Tuple
from
vllm.sequence
import
Sequence
,
SequenceGroup
from
vllm.utils
import
Device
class
AllocStatus
(
enum
.
Enum
):
...
...
@@ -116,3 +117,8 @@ class BlockSpaceManager(ABC):
@
abstractmethod
def
mark_blocks_as_computed
(
self
,
seq_group
:
SequenceGroup
):
pass
@abstractmethod
def get_prefix_cache_hit_rate(self, device: Device) -> float:
    """Return the prefix cache hit rate for the given device.

    A return value of -1 means not supported or disabled.
    """
    pass
vllm/core/scheduler.py
View file @
af7f4372
...
...
@@ -12,7 +12,9 @@ from vllm.logger import init_logger
from
vllm.lora.request
import
LoRARequest
from
vllm.prompt_adapter.request
import
PromptAdapterRequest
from
vllm.sequence
import
(
Sequence
,
SequenceData
,
SequenceGroup
,
SequenceGroupMetadata
,
SequenceStatus
)
SequenceGroupMetadata
,
SequenceGroupMetadataDelta
,
SequenceStatus
)
from
vllm.utils
import
Device
,
PyObjectCache
logger
=
init_logger
(
__name__
)
...
...
@@ -176,10 +178,10 @@ class SchedulerRunningOutputs:
enough memory, it can be preempted (for recompute) or swapped out.
"""
# Selected sequences that are running and in a decoding phase.
decode_seq_groups
:
List
[
SequenceGroup
]
decode_seq_groups
:
List
[
Scheduled
SequenceGroup
]
# Selected sequences that are running and in a prefill phase.
# I.e., it means the prefill has been chunked.
prefill_seq_groups
:
List
[
SequenceGroup
]
prefill_seq_groups
:
List
[
Scheduled
SequenceGroup
]
# The preempted sequences.
preempted
:
List
[
SequenceGroup
]
# Sequences that are swapped out.
...
...
@@ -191,6 +193,10 @@ class SchedulerRunningOutputs:
# The number of slots for lookahead decoding.
num_lookahead_slots
:
int
# Optimization for fast-access to seq_group lists
decode_seq_groups_list
:
List
[
SequenceGroup
]
prefill_seq_groups_list
:
List
[
SequenceGroup
]
@
classmethod
def
create_empty
(
cls
)
->
"SchedulerRunningOutputs"
:
return
SchedulerRunningOutputs
(
...
...
@@ -201,6 +207,8 @@ class SchedulerRunningOutputs:
blocks_to_swap_out
=
[],
blocks_to_copy
=
[],
num_lookahead_slots
=
0
,
decode_seq_groups_list
=
[],
prefill_seq_groups_list
=
[],
)
...
...
@@ -259,6 +267,30 @@ class SchedulerPrefillOutputs:
)
def seq_group_metadata_builder():
    """Create a SequenceGroupMetadata filled with placeholder values.

    The request id is empty, `is_prompt` is False, there are no sampling
    params, and the sequence data and block tables are fresh empty dicts.
    NOTE(review): presumably meant as a PyObjectCache builder like the
    sibling builders -- confirm against the call site.
    """
    placeholder = SequenceGroupMetadata(request_id="",
                                        is_prompt=False,
                                        seq_data={},
                                        sampling_params=None,
                                        block_tables={})
    return placeholder
def scheduler_running_outputs_builder():
    """Create an empty SchedulerRunningOutputs.

    Every list field is a fresh empty list and `num_lookahead_slots` is 0.
    Scheduler.__init__ passes this function to PyObjectCache as the object
    builder.
    """
    empty_outputs = SchedulerRunningOutputs(decode_seq_groups=[],
                                            prefill_seq_groups=[],
                                            preempted=[],
                                            swapped_out=[],
                                            blocks_to_swap_out=[],
                                            blocks_to_copy=[],
                                            num_lookahead_slots=0,
                                            prefill_seq_groups_list=[],
                                            decode_seq_groups_list=[])
    return empty_outputs
def scheduled_seq_group_builder():
    """Create a ScheduledSequenceGroup with no seq_group and chunk size 0.

    Scheduler.__init__ passes this function to PyObjectCache as the object
    builder.
    """
    blank = ScheduledSequenceGroup(seq_group=None, token_chunk_size=0)
    return blank
class
Scheduler
:
def
__init__
(
...
...
@@ -331,6 +363,12 @@ class Scheduler:
else
0
)
self
.
num_cumulative_preemption
:
int
=
0
# Used to cache python objects
self
.
_scheduler_running_outputs_cache
:
PyObjectCache
=
PyObjectCache
(
scheduler_running_outputs_builder
)
self
.
_scheduled_seq_group_cache
:
PyObjectCache
=
PyObjectCache
(
scheduled_seq_group_builder
)
@
property
def
lora_enabled
(
self
)
->
bool
:
return
bool
(
self
.
lora_config
)
...
...
@@ -392,10 +430,26 @@ class Scheduler:
seq
.
status
=
SequenceStatus
.
FINISHED_ABORTED
self
.
free_seq
(
seq
)
self
.
_free_seq_group_cross_attn_blocks
(
aborted_group
)
def
_free_seq_group_cross_attn_blocks
(
self
,
seq_group
:
SequenceGroup
,
)
->
None
:
"""
Free a sequence group from a cross-attention block table.
Has no effect on decoder-only models.
"""
if
seq_group
.
is_encoder_decoder
():
self
.
block_manager
.
free_cross
(
seq_group
)
def has_unfinished_seqs(self) -> bool:
    """Return True if the waiting, running or swapped queue is non-empty."""
    # Queues are checked in the order waiting -> running -> swapped and the
    # check stops at the first non-empty one.
    if len(self.waiting) != 0:
        return True
    if len(self.running) != 0:
        return True
    return len(self.swapped) != 0
def get_prefix_cache_hit_rate(self, device: Device) -> float:
    """Forward the hit-rate query for `device` to the block manager."""
    manager = self.block_manager
    return manager.get_prefix_cache_hit_rate(device)
def get_num_unfinished_seq_groups(self) -> int:
    """Return the combined length of the waiting, running and swapped queues."""
    queues = (self.waiting, self.running, self.swapped)
    return sum(map(len, queues))
...
...
@@ -428,14 +482,30 @@ class Scheduler:
Returns:
SchedulerRunningOutputs.
"""
ret
:
SchedulerRunningOutputs
=
\
self
.
_scheduler_running_outputs_cache
.
get_object
()
ret
.
blocks_to_swap_out
.
clear
()
ret
.
blocks_to_copy
.
clear
()
ret
.
decode_seq_groups
.
clear
()
ret
.
prefill_seq_groups
.
clear
()
ret
.
preempted
.
clear
()
ret
.
swapped_out
.
clear
()
ret
.
num_lookahead_slots
=
self
.
_get_num_lookahead_slots
(
is_prefill
=
False
)
ret
.
decode_seq_groups_list
.
clear
()
ret
.
prefill_seq_groups_list
.
clear
()
# Blocks that need to be swapped or copied before model execution.
blocks_to_swap_out
:
List
[
Tuple
[
int
,
int
]]
=
[]
blocks_to_copy
:
List
[
Tuple
[
int
,
int
]]
=
[]
blocks_to_swap_out
:
List
[
Tuple
[
int
,
int
]]
=
ret
.
blocks_to_swap_out
blocks_to_copy
:
List
[
Tuple
[
int
,
int
]]
=
ret
.
blocks_to_copy
decode_seq_groups
:
List
[
ScheduledSequenceGroup
]
=
[]
prefill_seq_groups
:
List
[
ScheduledSequenceGroup
]
=
[]
preempted
:
List
[
SequenceGroup
]
=
[]
swapped_out
:
List
[
SequenceGroup
]
=
[]
decode_seq_groups
:
List
[
ScheduledSequenceGroup
]
=
ret
.
decode_seq_groups
prefill_seq_groups
:
List
[
ScheduledSequenceGroup
]
=
ret
.
prefill_seq_groups
preempted
:
List
[
SequenceGroup
]
=
ret
.
preempted
swapped_out
:
List
[
SequenceGroup
]
=
ret
.
swapped_out
# NOTE(woosuk): Preemption happens only when there is no available slot
# to keep all the sequence groups in the RUNNING state.
...
...
@@ -484,15 +554,19 @@ class Scheduler:
else
:
self
.
_append_slots
(
seq_group
,
blocks_to_copy
)
is_prefill
=
seq_group
.
is_prefill
()
scheduled_seq_group
:
ScheduledSequenceGroup
=
\
self
.
_scheduled_seq_group_cache
.
get_object
()
scheduled_seq_group
.
seq_group
=
seq_group
if
is_prefill
:
prefill_seq_groups
.
append
(
ScheduledSequenceGroup
(
seq_group
=
seq_group
,
token_chunk_size
=
num_running_tokens
))
scheduled_seq_group
.
token_chunk_size
=
num_running_tokens
prefill_seq_groups
.
append
(
scheduled_seq_group
)
ret
.
prefill_seq_groups_list
.
append
(
seq_group
)
else
:
decode_seq_groups
.
append
(
ScheduledSequenceGroup
(
seq_group
=
seq_group
,
token_chunk_size
=
1
))
scheduled_seq_group
.
token_chunk_size
=
1
decode_seq_groups
.
append
(
scheduled_seq_group
)
ret
.
decode_seq_groups_list
.
append
(
seq_group
)
budget
.
add_num_batched_tokens
(
seq_group
.
request_id
,
num_running_tokens
)
# OPTIMIZATION: Note that get_max_num_running_seqs is
...
...
@@ -505,15 +579,10 @@ class Scheduler:
if
curr_loras
is
not
None
and
seq_group
.
lora_int_id
>
0
:
curr_loras
.
add
(
seq_group
.
lora_int_id
)
return
SchedulerRunningOutputs
(
decode_seq_groups
=
decode_seq_groups
,
prefill_seq_groups
=
prefill_seq_groups
,
preempted
=
preempted
,
swapped_out
=
swapped_out
,
blocks_to_swap_out
=
blocks_to_swap_out
,
blocks_to_copy
=
blocks_to_copy
,
num_lookahead_slots
=
self
.
_get_num_lookahead_slots
(
is_prefill
=
False
))
self
.
_scheduler_running_outputs_cache
.
reset
()
self
.
_scheduled_seq_group_cache
.
reset
()
return
ret
def
_schedule_swapped
(
self
,
...
...
@@ -665,7 +734,7 @@ class Scheduler:
all tokens.
Returns:
Scheduler
SwappedIn
Outputs.
Scheduler
Prefill
Outputs.
"""
ignored_seq_groups
:
List
[
SequenceGroup
]
=
[]
seq_groups
:
List
[
SequenceGroup
]
=
[]
...
...
@@ -738,6 +807,9 @@ class Scheduler:
curr_loras
.
add
(
lora_int_id
)
waiting_queue
.
popleft
()
self
.
_allocate_and_set_running
(
seq_group
)
seq_group
.
init_multi_step
(
num_scheduler_steps
=
self
.
_get_num_lookahead_slots
(
is_prefill
=
True
)
+
1
)
seq_groups
.
append
(
ScheduledSequenceGroup
(
seq_group
=
seq_group
,
token_chunk_size
=
num_new_tokens
))
...
...
@@ -807,11 +879,15 @@ class Scheduler:
# Update waiting requests.
self
.
waiting
.
extendleft
(
running_scheduled
.
preempted
)
# Update new running requests.
self
.
running
.
extend
([
s
.
seq_group
for
s
in
prefills
.
seq_groups
])
self
.
running
.
extend
(
[
s
.
seq_group
for
s
in
running_scheduled
.
decode_seq_groups
])
self
.
running
.
extend
(
[
s
.
seq_group
for
s
in
swapped_in
.
decode_seq_groups
])
if
len
(
prefills
.
seq_groups
)
>
0
:
self
.
running
.
extend
([
s
.
seq_group
for
s
in
prefills
.
seq_groups
])
self
.
running
.
extend
(
running_scheduled
.
decode_seq_groups_list
)
if
len
(
swapped_in
.
decode_seq_groups
)
>
0
:
self
.
running
.
extend
(
[
s
.
seq_group
for
s
in
swapped_in
.
decode_seq_groups
])
# Update swapped requests.
self
.
swapped
.
extend
(
running_scheduled
.
swapped_out
)
preempted
=
(
len
(
running_scheduled
.
preempted
)
+
...
...
@@ -821,24 +897,36 @@ class Scheduler:
# doesn't allow chunked prefills.
assert
len
(
running_scheduled
.
prefill_seq_groups
)
==
0
assert
len
(
swapped_in
.
prefill_seq_groups
)
==
0
# Merge lists
num_prefill_groups
=
len
(
prefills
.
seq_groups
)
if
num_prefill_groups
>
0
:
scheduled_seq_groups
=
prefills
.
seq_groups
scheduled_seq_groups
.
extend
(
running_scheduled
.
decode_seq_groups
)
else
:
scheduled_seq_groups
=
running_scheduled
.
decode_seq_groups
scheduled_seq_groups
.
extend
(
swapped_in
.
decode_seq_groups
)
blocks_to_copy
=
running_scheduled
.
blocks_to_copy
blocks_to_copy
.
extend
(
swapped_in
.
blocks_to_copy
)
ignored_seq_groups
=
prefills
.
ignored_seq_groups
ignored_seq_groups
.
extend
(
swapped_in
.
infeasible_seq_groups
)
return
SchedulerOutputs
(
scheduled_seq_groups
=
(
prefills
.
seq_groups
+
running_scheduled
.
decode_seq_groups
+
swapped_in
.
decode_seq_groups
),
num_prefill_groups
=
len
(
prefills
.
seq_groups
),
scheduled_seq_groups
=
scheduled_seq_groups
,
num_prefill_groups
=
num_prefill_groups
,
num_batched_tokens
=
budget
.
num_batched_tokens
,
blocks_to_swap_in
=
swapped_in
.
blocks_to_swap_in
,
blocks_to_swap_out
=
running_scheduled
.
blocks_to_swap_out
,
blocks_to_copy
=
running_scheduled
.
blocks_to_copy
+
swapped_in
.
blocks_to_copy
,
ignored_seq_groups
=
prefills
.
ignored_seq_groups
+
swapped_in
.
infeasible_seq_groups
,
blocks_to_copy
=
blocks_to_copy
,
ignored_seq_groups
=
ignored_seq_groups
,
num_lookahead_slots
=
running_scheduled
.
num_lookahead_slots
,
running_queue_size
=
len
(
self
.
running
),
preempted
=
preempted
,
)
def
_schedule_chunked_prefill
(
self
):
def
_schedule_chunked_prefill
(
self
)
->
SchedulerOutputs
:
"""Schedule queued requests.
Chunked prefill allows to chunk prefill requests, batch them together
...
...
@@ -849,7 +937,7 @@ class Scheduler:
The policy can sustain the high GPU utilization because it can put
prefill and decodes requests to the same batch, while it improves
inter token latency because decodes requests don't need to blocked
inter token latency because decodes requests don't need to
be
blocked
by prefill requests.
"""
budget
=
SchedulingBudget
(
...
...
@@ -947,9 +1035,13 @@ class Scheduler:
# Schedule sequence groups.
# This function call changes the internal states of the scheduler
# such as self.running, self.swapped, and self.waiting.
scheduler_start_time
=
time
.
perf_counter
()
scheduler_outputs
=
self
.
_schedule
()
now
=
time
.
time
()
if
not
self
.
cache_config
.
enable_prefix_caching
:
common_computed_block_nums
=
[]
# Create input data structures.
seq_group_metadata_list
:
List
[
SequenceGroupMetadata
]
=
[]
for
i
,
scheduled_seq_group
in
enumerate
(
...
...
@@ -963,52 +1055,88 @@ class Scheduler:
# seq_id -> physical block numbers
block_tables
:
Dict
[
int
,
List
[
int
]]
=
{}
if
seq_group
.
is_encoder_decoder
():
# Encoder associated with SequenceGroup
encoder_seq_data
=
seq_group
.
get_encoder_seq
().
data
# Block table for cross-attention
# Also managed at SequenceGroup level
cross_block_table
=
self
.
block_manager
.
get_cross_block_table
(
seq_group
)
else
:
encoder_seq_data
=
None
cross_block_table
=
None
for
seq
in
seq_group
.
get_seqs
(
status
=
SequenceStatus
.
RUNNING
):
seq_id
=
seq
.
seq_id
seq_data
[
seq_id
]
=
seq
.
data
block_tables
[
seq_id
]
=
self
.
block_manager
.
get_block_table
(
seq
)
self
.
block_manager
.
access_all_blocks_in_seq
(
seq
,
now
)
common_computed_block_nums
=
(
self
.
block_manager
.
get_common_computed_block_ids
(
seq_group
.
get_seqs
(
status
=
SequenceStatus
.
RUNNING
)))
if
self
.
cache_config
.
enable_prefix_caching
:
common_computed_block_nums
=
(
self
.
block_manager
.
get_common_computed_block_ids
(
seq_group
.
get_seqs
(
status
=
SequenceStatus
.
RUNNING
)))
do_sample
=
True
if
seq_group
.
is_prefill
():
is_prompt
=
seq_group
.
is_prefill
()
# We should send the metadata to workers when the first prefill
# is sent. Subsequent requests could be chunked prefill or decode.
is_first_prefill
=
False
if
is_prompt
:
seqs
=
seq_group
.
get_seqs
()
# Prefill has only 1 sequence.
assert
len
(
seqs
)
==
1
num_computed_tokens
=
seqs
[
0
].
data
.
get_num_computed_tokens
()
is_first_prefill
=
num_computed_tokens
==
0
# In the next iteration, all prompt tokens are not computed.
# It means the prefill is chunked, and we don't need sampling.
# NOTE: We use get_len instead of get_prompt_len because when
# a sequence is preempted, prefill includes previous generated
# output tokens.
if
(
token_chunk_size
+
seqs
[
0
].
data
.
get_
num_computed_tokens
()
<
if
(
token_chunk_size
+
num_computed_tokens
<
seqs
[
0
].
data
.
get_len
()):
do_sample
=
False
# It assumes the scheduled_seq_groups is ordered by
# prefill < decoding.
is_prompt
=
seq_group
.
is_prefill
()
seq_group_metadata
=
SequenceGroupMetadata
(
request_id
=
seq_group
.
request_id
,
is_prompt
=
is_prompt
,
seq_data
=
seq_data
,
sampling_params
=
seq_group
.
sampling_params
,
block_tables
=
block_tables
,
do_sample
=
do_sample
,
pooling_params
=
seq_group
.
pooling_params
,
token_chunk_size
=
token_chunk_size
,
lora_request
=
seq_group
.
lora_request
,
computed_block_nums
=
common_computed_block_nums
,
# `multi_modal_data` will only be present for the 1st comm
# between engine and worker.
# the subsequent comms can still use delta, but
# `multi_modal_data` will be None.
multi_modal_data
=
seq_group
.
multi_modal_data
if
scheduler_outputs
.
num_prefill_groups
>
0
else
None
,
prompt_adapter_request
=
seq_group
.
prompt_adapter_request
,
)
if
is_first_prefill
or
not
self
.
scheduler_config
.
send_delta_data
:
seq_group_metadata
=
SequenceGroupMetadata
(
request_id
=
seq_group
.
request_id
,
is_prompt
=
is_prompt
,
seq_data
=
seq_data
,
sampling_params
=
seq_group
.
sampling_params
,
block_tables
=
block_tables
,
do_sample
=
do_sample
,
pooling_params
=
seq_group
.
pooling_params
,
token_chunk_size
=
token_chunk_size
,
lora_request
=
seq_group
.
lora_request
,
computed_block_nums
=
common_computed_block_nums
,
encoder_seq_data
=
encoder_seq_data
,
cross_block_table
=
cross_block_table
,
state
=
seq_group
.
state
,
# `multi_modal_data` will only be present for the 1st comm
# between engine and worker.
# the subsequent comms can still use delta, but
# `multi_modal_data` will be None.
multi_modal_data
=
seq_group
.
multi_modal_data
if
scheduler_outputs
.
num_prefill_groups
>
0
else
None
,
prompt_adapter_request
=
seq_group
.
prompt_adapter_request
,
)
else
:
# When SPMD mode is enabled, we only send delta data except for
# the first request to reduce serialization cost.
seq_data_delta
=
{}
for
id
,
data
in
seq_data
.
items
():
seq_data_delta
[
id
]
=
data
.
get_delta_and_reset
()
seq_group_metadata
=
SequenceGroupMetadataDelta
(
seq_data_delta
,
seq_group
.
request_id
,
block_tables
,
is_prompt
,
do_sample
=
do_sample
,
token_chunk_size
=
token_chunk_size
,
computed_block_nums
=
common_computed_block_nums
,
)
seq_group_metadata_list
.
append
(
seq_group_metadata
)
# Now that the batch has been created, we can assume all blocks in the
...
...
@@ -1019,6 +1147,17 @@ class Scheduler:
self
.
block_manager
.
mark_blocks_as_computed
(
scheduled_seq_group
.
seq_group
)
scheduler_time
=
time
.
perf_counter
()
-
scheduler_start_time
# Add this to scheduler time to all the sequences that are currently
# running. This will help estimate if the scheduler is a significant
# component in the e2e latency.
for
seq_group
in
self
.
running
:
if
seq_group
is
not
None
and
seq_group
.
metrics
is
not
None
:
if
seq_group
.
metrics
.
scheduler_time
is
not
None
:
seq_group
.
metrics
.
scheduler_time
+=
scheduler_time
else
:
seq_group
.
metrics
.
scheduler_time
=
scheduler_time
return
seq_group_metadata_list
,
scheduler_outputs
def
fork_seq
(
self
,
parent_seq
:
Sequence
,
child_seq
:
Sequence
)
->
None
:
...
...
@@ -1032,6 +1171,8 @@ class Scheduler:
remaining
:
Deque
[
SequenceGroup
]
=
deque
()
for
seq_group
in
self
.
running
:
if
seq_group
.
is_finished
():
# Free cross-attention block table, if it exists
self
.
_free_seq_group_cross_attn_blocks
(
seq_group
)
# Add the finished requests to the finished requests list.
# This list will be used to update the Mamba cache in the
# next step.
...
...
@@ -1062,10 +1203,12 @@ class Scheduler:
slots.
"""
num_lookahead_slots
=
self
.
_get_num_lookahead_slots
(
is_prefill
=
False
)
seq_group
.
init_multi_step
(
num_scheduler_steps
=
num_lookahead_slots
+
1
)
for
seq
in
seq_group
.
get_seqs
(
status
=
SequenceStatus
.
RUNNING
):
cows
=
self
.
block_manager
.
append_slots
(
seq
,
num_lookahead_slots
)
blocks_to_copy
.
extend
(
cows
)
if
len
(
cows
)
>
0
:
blocks_to_copy
.
extend
(
cows
)
def
_preempt
(
self
,
...
...
vllm/distributed/communication_op.py
View file @
af7f4372
...
...
@@ -19,7 +19,7 @@ def tensor_model_parallel_all_gather(input_: torch.Tensor,
def
tensor_model_parallel_gather
(
input_
:
torch
.
Tensor
,
dst
:
int
=
0
,
dim
:
int
=
-
1
)
->
torch
.
Tensor
:
dim
:
int
=
-
1
)
->
Optional
[
torch
.
Tensor
]
:
"""Gather the input tensor across model parallel group."""
return
get_tp_group
().
gather
(
input_
,
dst
,
dim
)
...
...
vllm/distributed/device_communicators/cuda_wrapper.py
View file @
af7f4372
...
...
@@ -49,8 +49,13 @@ def find_loaded_library(lib_name) -> Optional[str]:
if
not
found
:
# the library is not loaded in the current process
return
None
# if lib_name is libcudart, we need to match a line with:
# address /path/to/libcudart-hash.so.11.0
start
=
line
.
index
(
"/"
)
path
=
line
[
start
:].
strip
()
filename
=
path
.
split
(
"/"
)[
-
1
]
assert
filename
.
rpartition
(
".so"
)[
0
].
startswith
(
lib_name
),
\
f
"Unexpected filename:
{
filename
}
for library
{
lib_name
}
"
return
path
...
...
@@ -98,9 +103,9 @@ class CudaRTLibrary:
def
__init__
(
self
,
so_file
:
Optional
[
str
]
=
None
):
if
so_file
is
None
:
so_file
=
find_loaded_library
(
"libcudart
.so
"
)
so_file
=
find_loaded_library
(
"libcudart"
)
assert
so_file
is
not
None
,
\
"libcudart
.so
is not loaded in the current process"
"libcudart is not loaded in the current process"
if
so_file
not
in
CudaRTLibrary
.
path_to_library_cache
:
lib
=
ctypes
.
CDLL
(
so_file
)
CudaRTLibrary
.
path_to_library_cache
[
so_file
]
=
lib
...
...
vllm/distributed/device_communicators/custom_all_reduce.py
View file @
af7f4372
...
...
@@ -11,15 +11,17 @@ from vllm.distributed.device_communicators.custom_all_reduce_utils import (
gpu_p2p_access_check
)
from
vllm.distributed.parallel_state
import
in_the_same_node_as
from
vllm.logger
import
init_logger
from
vllm.utils
import
cuda_device_count_stateless
,
is_full_nvlink
from
vllm.platforms
import
current_platform
from
vllm.utils
import
cuda_device_count_stateless
from
vllm.utils
import
is_hip
try
:
if
(
not
is_hip
()):
assert
ops
.
is_custom_op_supported
(
"_C_custom_ar::
meta_size
"
)
ops
.
meta_size
(
)
custom_ar
=
True
else
:
custom_ar
=
False
except
Exception
:
# For AMD GPUs and CPUs
custom_ar
=
False
...
...
@@ -117,7 +119,10 @@ class CustomAllreduce:
# test nvlink first, this will filter out most of the cases
# where custom allreduce is not supported
# this checks hardware and driver support for NVLink
full_nvlink
=
is_full_nvlink
(
physical_device_ids
)
assert
current_platform
.
is_cuda
()
from
vllm.platforms.cuda
import
CudaPlatform
cuda_platform
:
CudaPlatform
=
current_platform
full_nvlink
=
cuda_platform
.
is_full_nvlink
(
physical_device_ids
)
if
world_size
>
2
and
not
full_nvlink
:
logger
.
warning
(
"Custom allreduce is disabled because it's not supported on"
...
...
vllm/distributed/device_communicators/tpu_communicator.py
View file @
af7f4372
...
...
@@ -5,6 +5,7 @@ from torch.distributed import ProcessGroup
from
vllm.platforms
import
current_platform
if
current_platform
.
is_tpu
():
import
ray
import
torch_xla.core.xla_model
as
xm
import
torch_xla.runtime
as
xr
from
torch_xla._internal
import
pjrt
...
...
@@ -18,9 +19,15 @@ class TpuCommunicator:
return
self
.
disabled
=
False
local_rank
=
dist
.
get_rank
(
group
)
world_size
=
dist
.
get_world_size
(
group
)
pjrt
.
initialize_multiprocess
(
local_rank
,
world_size
)
# NOTE(woosuk): When using TP > 1 on TPUs, every TPU on the same node
# must be used together. Therefore, the local rank and world size can
# be simply calculated as follows.
global_rank
=
dist
.
get_rank
(
group
)
global_world_size
=
dist
.
get_world_size
(
group
)
num_nodes
=
len
(
ray
.
nodes
())
local_world_size
=
global_world_size
//
num_nodes
local_rank
=
global_rank
%
local_world_size
pjrt
.
initialize_multiprocess
(
local_rank
,
local_world_size
)
xr
.
_init_world_size_ordinal
()
def
all_reduce
(
self
,
x
:
torch
.
Tensor
)
->
torch
.
Tensor
:
...
...
vllm/distributed/parallel_state.py
View file @
af7f4372
...
...
@@ -329,7 +329,7 @@ class GroupCoordinator:
def
gather
(
self
,
input_
:
torch
.
Tensor
,
dst
:
int
=
0
,
dim
:
int
=
-
1
)
->
torch
.
Tensor
:
dim
:
int
=
-
1
)
->
Optional
[
torch
.
Tensor
]
:
"""
NOTE: We assume that the input tensor is on the same device across
all the ranks.
...
...
vllm/engine/arg_utils.py
View file @
af7f4372
...
...
@@ -2,11 +2,15 @@ import argparse
import
dataclasses
import
json
from
dataclasses
import
dataclass
from
typing
import
TYPE_CHECKING
,
List
,
Optional
,
Tuple
,
Type
,
Union
from
typing
import
(
TYPE_CHECKING
,
Dict
,
List
,
Mapping
,
Optional
,
Tuple
,
Type
,
Union
)
import
torch
import
vllm.envs
as
envs
from
vllm.config
import
(
CacheConfig
,
DecodingConfig
,
DeviceConfig
,
EngineConfig
,
LoadConfig
,
Lo
RAConfig
,
Model
Config
,
Multi
Mod
a
lConfig
,
ObservabilityConfig
,
ParallelConfig
,
EngineConfig
,
LoadConfig
,
Lo
adFormat
,
LoRA
Config
,
Mod
e
lConfig
,
ObservabilityConfig
,
ParallelConfig
,
PromptAdapterConfig
,
SchedulerConfig
,
SpeculativeConfig
,
TokenizerPoolConfig
)
from
vllm.executor.executor_base
import
ExecutorBase
...
...
@@ -15,11 +19,12 @@ from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
from
vllm.utils
import
FlexibleArgumentParser
if
TYPE_CHECKING
:
from
vllm.transformers_utils.tokenizer_group.base_tokenizer_group
import
(
BaseTokenizerGroup
)
from
vllm.transformers_utils.tokenizer_group
import
BaseTokenizerGroup
logger
=
init_logger
(
__name__
)
ALLOWED_DETAILED_TRACE_MODULES
=
[
"model"
,
"worker"
,
"all"
]
def
nullable_str
(
val
:
str
):
if
not
val
or
val
==
"None"
:
...
...
@@ -27,11 +32,32 @@ def nullable_str(val: str):
return
val
def nullable_kvs(val: str) -> Optional[Mapping[str, int]]:
    """Parse a comma-separated list of KEY=VALUE pairs into a dict.

    Args:
        val: String such as "image=16,video=2". Each value must parse as
            an int.

    Returns:
        None if `val` is empty, otherwise a dict mapping each key to its
        integer value.

    Raises:
        ValueError: If an item is not of the form KEY=VALUE, or if a value
            cannot be converted to int.
    """
    if not val:
        return None

    out_dict: Dict[str, int] = {}
    for item in val.split(","):
        try:
            key, value = item.split("=")
        except ValueError as exc:
            # Unpacking the wrong number of parts raises ValueError, not
            # TypeError, so this is the exception to translate.
            msg = "Each item should be in the form KEY=VALUE"
            raise ValueError(msg) from exc

        try:
            out_dict[key] = int(value)
        except ValueError as exc:
            msg = f"Failed to parse value of item {key}={value}"
            raise ValueError(msg) from exc

    return out_dict
@
dataclass
class
EngineArgs
:
"""Arguments for vLLM engine."""
model
:
str
served_model_name
:
Optional
[
Union
[
List
[
str
]]]
=
None
model
:
str
=
'facebook/opt-125m'
served_model_name
:
Optional
[
Union
[
str
,
List
[
str
]]]
=
None
tokenizer
:
Optional
[
str
]
=
None
skip_tokenizer_init
:
bool
=
False
tokenizer_mode
:
str
=
'auto'
...
...
@@ -56,8 +82,8 @@ class EngineArgs:
enable_prefix_caching
:
bool
=
False
disable_sliding_window
:
bool
=
False
use_v2_block_manager
:
bool
=
False
swap_space
:
in
t
=
4
# GiB
cpu_offload_gb
:
in
t
=
0
# GiB
swap_space
:
floa
t
=
4
# GiB
cpu_offload_gb
:
floa
t
=
0
# GiB
gpu_memory_utilization
:
float
=
0.90
max_num_batched_tokens
:
Optional
[
int
]
=
None
max_num_seqs
:
int
=
256
...
...
@@ -69,7 +95,7 @@ class EngineArgs:
rope_theta
:
Optional
[
float
]
=
None
tokenizer_revision
:
Optional
[
str
]
=
None
quantization
:
Optional
[
str
]
=
None
enforce_eager
:
bool
=
Fals
e
enforce_eager
:
Optional
[
bool
]
=
Non
e
max_context_len_to_capture
:
Optional
[
int
]
=
None
max_seq_len_to_capture
:
int
=
8192
disable_custom_all_reduce
:
bool
=
False
...
...
@@ -79,6 +105,7 @@ class EngineArgs:
# notice.
tokenizer_pool_type
:
Union
[
str
,
Type
[
"BaseTokenizerGroup"
]]
=
"ray"
tokenizer_pool_extra_config
:
Optional
[
dict
]
=
None
limit_mm_per_prompt
:
Optional
[
Mapping
[
str
,
int
]]
=
None
enable_lora
:
bool
=
False
max_loras
:
int
=
1
max_lora_rank
:
int
=
16
...
...
@@ -88,9 +115,10 @@ class EngineArgs:
fully_sharded_loras
:
bool
=
False
lora_extra_vocab_size
:
int
=
256
long_lora_scaling_factors
:
Optional
[
Tuple
[
float
]]
=
None
lora_dtype
:
str
=
'auto'
lora_dtype
:
Optional
[
Union
[
str
,
torch
.
dtype
]]
=
'auto'
max_cpu_loras
:
Optional
[
int
]
=
None
device
:
str
=
'auto'
num_scheduler_steps
:
int
=
1
ray_workers_use_nsight
:
bool
=
False
num_gpu_blocks_override
:
Optional
[
int
]
=
None
num_lookahead_slots
:
int
=
0
...
...
@@ -104,6 +132,7 @@ class EngineArgs:
guided_decoding_backend
:
str
=
'outlines'
# Speculative decoding configuration.
speculative_model
:
Optional
[
str
]
=
None
speculative_model_quantization
:
Optional
[
str
]
=
None
speculative_draft_tensor_parallel_size
:
Optional
[
int
]
=
None
num_speculative_tokens
:
Optional
[
int
]
=
None
speculative_max_model_len
:
Optional
[
int
]
=
None
...
...
@@ -117,6 +146,7 @@ class EngineArgs:
disable_logprobs_during_spec_decoding
:
Optional
[
bool
]
=
None
otlp_traces_endpoint
:
Optional
[
str
]
=
None
collect_detailed_traces
:
Optional
[
str
]
=
None
def
__post_init__
(
self
):
if
self
.
tokenizer
is
None
:
...
...
@@ -130,7 +160,7 @@ class EngineArgs:
parser
.
add_argument
(
'--model'
,
type
=
str
,
default
=
'facebook/opt-125m'
,
default
=
EngineArgs
.
model
,
help
=
'Name or path of the huggingface model to use.'
)
parser
.
add_argument
(
'--tokenizer'
,
...
...
@@ -184,10 +214,7 @@ class EngineArgs:
'--load-format'
,
type
=
str
,
default
=
EngineArgs
.
load_format
,
choices
=
[
'auto'
,
'pt'
,
'safetensors'
,
'npcache'
,
'dummy'
,
'tensorizer'
,
'bitsandbytes'
],
choices
=
[
f
.
value
for
f
in
LoadFormat
],
help
=
'The format of the model weights to load.
\n\n
'
'* "auto" will try to load the weights in the safetensors format '
'and fall back to the pytorch bin format if safetensors format '
...
...
@@ -290,7 +317,7 @@ class EngineArgs:
parser
.
add_argument
(
'--block-size'
,
type
=
int
,
default
=
EngineArgs
.
block_size
,
choices
=
[
8
,
16
,
32
],
choices
=
[
8
,
16
,
32
,
128
,
256
,
512
,
1024
,
2048
],
help
=
'Token block size for contiguous chunks of '
'tokens.'
)
...
...
@@ -318,7 +345,7 @@ class EngineArgs:
default
=
EngineArgs
.
seed
,
help
=
'Random seed for operations.'
)
parser
.
add_argument
(
'--swap-space'
,
type
=
in
t
,
type
=
floa
t
,
default
=
EngineArgs
.
swap_space
,
help
=
'CPU swap space size (GiB) per GPU.'
)
parser
.
add_argument
(
...
...
@@ -432,6 +459,21 @@ class EngineArgs:
'This should be a JSON string that will be '
'parsed into a dictionary. Ignored if '
'tokenizer_pool_size is 0.'
)
# Multimodal related configs
parser
.
add_argument
(
'--limit-mm-per-prompt'
,
type
=
nullable_kvs
,
default
=
EngineArgs
.
limit_mm_per_prompt
,
# The default value is given in
# MultiModalRegistry.init_mm_limits_per_prompt
help
=
(
'For each multimodal plugin, limit how many '
'input instances to allow for each prompt. '
'Expects a comma-separated list of items, '
'e.g.: `image=16,video=2` allows a maximum of 16 '
'images and 2 videos per prompt. Defaults to 1 for '
'each modality.'
))
# LoRA related configs
parser
.
add_argument
(
'--enable-lora'
,
action
=
'store_true'
,
...
...
@@ -503,6 +545,11 @@ class EngineArgs:
"tpu"
,
"xpu"
],
help
=
'Device type for vLLM execution.'
)
parser
.
add_argument
(
'--num-scheduler-steps'
,
type
=
int
,
default
=
1
,
help
=
(
'Maximum number of forward steps per '
'scheduler call.'
))
parser
.
add_argument
(
'--scheduler-delay-factor'
,
...
...
@@ -525,6 +572,18 @@ class EngineArgs:
default
=
EngineArgs
.
speculative_model
,
help
=
'The name of the draft model to be used in speculative decoding.'
)
# Quantization settings for speculative model.
parser
.
add_argument
(
'--speculative-model-quantization'
,
type
=
nullable_str
,
choices
=
[
*
QUANTIZATION_METHODS
,
None
],
default
=
EngineArgs
.
speculative_model_quantization
,
help
=
'Method used to quantize the weights of speculative model.'
'If None, we first check the `quantization_config` '
'attribute in the model config file. If that is '
'None, we assume the model weights are not '
'quantized and use `dtype` to determine the data '
'type of the weights.'
)
parser
.
add_argument
(
'--num-speculative-tokens'
,
type
=
int
,
...
...
@@ -602,8 +661,10 @@ class EngineArgs:
parser
.
add_argument
(
'--disable-logprobs-during-spec-decoding'
,
type
=
bool
,
action
=
StoreBoolean
,
default
=
EngineArgs
.
disable_logprobs_during_spec_decoding
,
nargs
=
"?"
,
const
=
"True"
,
help
=
'If set to True, token log probabilities are not returned '
'during speculative decoding. If set to False, log probabilities '
'are returned according to the settings in SamplingParams. If '
...
...
@@ -660,6 +721,16 @@ class EngineArgs:
type
=
str
,
default
=
None
,
help
=
'Target URL to which OpenTelemetry traces will be sent.'
)
parser
.
add_argument
(
'--collect-detailed-traces'
,
type
=
str
,
default
=
None
,
help
=
"Valid choices are "
+
","
.
join
(
ALLOWED_DETAILED_TRACE_MODULES
)
+
". It makes sense to set this only if --otlp-traces-endpoint is"
" set. If set, it will collect detailed traces for the specified "
"modules. This involves use of possibly costly and or blocking "
"operations and hence might have a performance impact."
)
return
parser
...
...
@@ -672,6 +743,9 @@ class EngineArgs:
return
engine_args
def
create_engine_config
(
self
,
)
->
EngineConfig
:
# gguf file needs a specific model loader and doesn't use hf_repo
if
self
.
model
.
endswith
(
".gguf"
):
self
.
quantization
=
self
.
load_format
=
"gguf"
# bitsandbytes quantization needs a specific model loader
# so we make sure the quant method and the load format are consistent
...
...
@@ -693,8 +767,6 @@ class EngineArgs:
"CPU offload space must be non-negative"
f
", but got
{
self
.
cpu_offload_gb
}
"
)
multimodal_config
=
MultiModalConfig
()
device_config
=
DeviceConfig
(
device
=
self
.
device
)
model_config
=
ModelConfig
(
model
=
self
.
model
,
...
...
@@ -718,7 +790,8 @@ class EngineArgs:
disable_sliding_window
=
self
.
disable_sliding_window
,
skip_tokenizer_init
=
self
.
skip_tokenizer_init
,
served_model_name
=
self
.
served_model_name
,
multimodal_config
=
multimodal_config
)
limit_mm_per_prompt
=
self
.
limit_mm_per_prompt
,
)
cache_config
=
CacheConfig
(
block_size
=
self
.
block_size
,
gpu_memory_utilization
=
self
.
gpu_memory_utilization
,
...
...
@@ -779,11 +852,19 @@ class EngineArgs:
"in low performance due to small KV cache space. Consider "
"setting --max-model-len to a smaller value."
,
max_model_len
)
if
self
.
num_scheduler_steps
>
1
and
not
self
.
use_v2_block_manager
:
self
.
use_v2_block_manager
=
True
logger
.
warning
(
"Enabled BlockSpaceManagerV2 because it is "
"required for multi-step (--num-scheduler-steps > 1)"
)
speculative_config
=
SpeculativeConfig
.
maybe_create_spec_config
(
target_model_config
=
model_config
,
target_parallel_config
=
parallel_config
,
target_dtype
=
self
.
dtype
,
speculative_model
=
self
.
speculative_model
,
speculative_model_quantization
=
\
self
.
speculative_model_quantization
,
speculative_draft_tensor_parallel_size
=
\
self
.
speculative_draft_tensor_parallel_size
,
num_speculative_tokens
=
self
.
num_speculative_tokens
,
...
...
@@ -804,18 +885,35 @@ class EngineArgs:
disable_logprobs
=
self
.
disable_logprobs_during_spec_decoding
,
)
if
self
.
num_scheduler_steps
>
1
:
if
speculative_config
is
not
None
:
raise
ValueError
(
"Speculative decoding is not supported with "
"multi-step (--num-scheduler-steps > 1)"
)
if
self
.
enable_chunked_prefill
:
raise
ValueError
(
"Chunked prefill is not supported with "
"multi-step (--num-scheduler-steps > 1)"
)
# make sure num_lookahead_slots is set the higher value depending on
# if we are using speculative decoding or multi-step
num_lookahead_slots
=
max
(
self
.
num_lookahead_slots
,
self
.
num_scheduler_steps
-
1
)
num_lookahead_slots
=
num_lookahead_slots
\
if
speculative_config
is
None
\
else
speculative_config
.
num_lookahead_slots
scheduler_config
=
SchedulerConfig
(
max_num_batched_tokens
=
self
.
max_num_batched_tokens
,
max_num_seqs
=
self
.
max_num_seqs
,
max_model_len
=
model_config
.
max_model_len
,
use_v2_block_manager
=
self
.
use_v2_block_manager
,
num_lookahead_slots
=
(
self
.
num_lookahead_slots
if
speculative_config
is
None
else
speculative_config
.
num_lookahead_slots
),
num_lookahead_slots
=
num_lookahead_slots
,
delay_factor
=
self
.
scheduler_delay_factor
,
enable_chunked_prefill
=
self
.
enable_chunked_prefill
,
embedding_mode
=
model_config
.
embedding_mode
,
preemption_mode
=
self
.
preemption_mode
,
num_scheduler_steps
=
self
.
num_scheduler_steps
,
send_delta_data
=
(
envs
.
VLLM_USE_RAY_SPMD_WORKER
and
parallel_config
.
use_ray
),
)
lora_config
=
LoRAConfig
(
max_lora_rank
=
self
.
max_lora_rank
,
...
...
@@ -849,8 +947,21 @@ class EngineArgs:
decoding_config
=
DecodingConfig
(
guided_decoding_backend
=
self
.
guided_decoding_backend
)
detailed_trace_modules
=
[]
if
self
.
collect_detailed_traces
is
not
None
:
detailed_trace_modules
=
self
.
collect_detailed_traces
.
split
(
","
)
for
m
in
detailed_trace_modules
:
if
m
not
in
ALLOWED_DETAILED_TRACE_MODULES
:
raise
ValueError
(
f
"Invalid module
{
m
}
in collect_detailed_traces. "
f
"Valid modules are
{
ALLOWED_DETAILED_TRACE_MODULES
}
"
)
observability_config
=
ObservabilityConfig
(
otlp_traces_endpoint
=
self
.
otlp_traces_endpoint
)
otlp_traces_endpoint
=
self
.
otlp_traces_endpoint
,
collect_model_forward_time
=
"model"
in
detailed_trace_modules
or
"all"
in
detailed_trace_modules
,
collect_model_execute_time
=
"worker"
in
detailed_trace_modules
or
"all"
in
detailed_trace_modules
,
)
if
(
model_config
.
get_sliding_window
()
is
not
None
and
scheduler_config
.
chunked_prefill_enabled
...
...
@@ -866,7 +977,6 @@ class EngineArgs:
scheduler_config
=
scheduler_config
,
device_config
=
device_config
,
lora_config
=
lora_config
,
multimodal_config
=
multimodal_config
,
speculative_config
=
speculative_config
,
load_config
=
load_config
,
decoding_config
=
decoding_config
,
...
...
@@ -889,7 +999,13 @@ class AsyncEngineArgs(EngineArgs):
parser
.
add_argument
(
'--engine-use-ray'
,
action
=
'store_true'
,
help
=
'Use Ray to start the LLM engine in a '
'separate process as the server process.'
)
'separate process as the server process.'
'(DEPRECATED. This argument is deprecated '
'and will be removed in a future update. '
'Set `VLLM_ALLOW_ENGINE_USE_RAY=1` to force '
'use it. See '
'https://github.com/vllm-project/vllm/issues/7045.'
')'
)
parser
.
add_argument
(
'--disable-log-requests'
,
action
=
'store_true'
,
help
=
'Disable logging requests.'
)
...
...
Prev
1
…
11
12
13
14
15
16
17
18
19
…
23
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment