Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
539aa992
Commit
539aa992
authored
Sep 27, 2024
by
zhuwenwen
Browse files
Merge tag 'v0.6.2' into v0.6.2-dev
parents
93872128
7193774b
Changes
383
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1266 additions
and
397 deletions
+1266
-397
vllm/transformers_utils/configs/mllama.py
vllm/transformers_utils/configs/mllama.py
+28
-0
vllm/transformers_utils/configs/solar.py
vllm/transformers_utils/configs/solar.py
+245
-0
vllm/transformers_utils/detokenizer.py
vllm/transformers_utils/detokenizer.py
+7
-9
vllm/transformers_utils/image_processor.py
vllm/transformers_utils/image_processor.py
+0
-64
vllm/transformers_utils/processor.py
vllm/transformers_utils/processor.py
+61
-4
vllm/transformers_utils/tokenizer.py
vllm/transformers_utils/tokenizer.py
+0
-1
vllm/transformers_utils/tokenizers/mistral.py
vllm/transformers_utils/tokenizers/mistral.py
+31
-5
vllm/triton_utils/libentry.py
vllm/triton_utils/libentry.py
+2
-2
vllm/triton_utils/sample.py
vllm/triton_utils/sample.py
+0
-13
vllm/usage/usage_lib.py
vllm/usage/usage_lib.py
+2
-1
vllm/utils.py
vllm/utils.py
+114
-36
vllm/version.py
vllm/version.py
+5
-7
vllm/vllm_flash_attn/.gitkeep
vllm/vllm_flash_attn/.gitkeep
+0
-0
vllm/worker/cpu_model_runner.py
vllm/worker/cpu_model_runner.py
+276
-116
vllm/worker/enc_dec_model_runner.py
vllm/worker/enc_dec_model_runner.py
+72
-23
vllm/worker/model_runner.py
vllm/worker/model_runner.py
+80
-24
vllm/worker/model_runner_base.py
vllm/worker/model_runner_base.py
+29
-3
vllm/worker/multi_step_model_runner.py
vllm/worker/multi_step_model_runner.py
+53
-21
vllm/worker/multi_step_tpu_worker.py
vllm/worker/multi_step_tpu_worker.py
+105
-0
vllm/worker/tpu_model_runner.py
vllm/worker/tpu_model_runner.py
+156
-68
No files found.
vllm/transformers_utils/configs/mllama.py
0 → 100644
View file @
539aa992
from
transformers.models.mllama
import
configuration_mllama
as
mllama_hf_config
class MllamaTextConfig(mllama_hf_config.MllamaTextConfig):
    """Text config for mllama that always reports itself as encoder/decoder.

    Why this subclass exists:
    - transformers regards mllama as is_encoder_decoder=False
    - vllm needs is_encoder_decoder=True to enable cross-attention
    """

    def __init__(self, **kwargs):
        # Let the upstream config consume every keyword argument first ...
        super().__init__(**kwargs)
        # ... then force the flag, so this assignment is the final value
        # regardless of what was passed in.
        self.is_encoder_decoder = True
class MllamaConfig(mllama_hf_config.MllamaConfig):
    """Mllama config whose text sub-config is built as `MllamaTextConfig`.

    A `text_config` given as a plain dict is expanded into a
    `MllamaTextConfig`; any other value (including `None`) is handed to the
    parent constructor untouched.
    """

    def __init__(self, text_config=None, **kwargs):
        resolved_text_config = (MllamaTextConfig(**text_config) if isinstance(
            text_config, dict) else text_config)
        super().__init__(text_config=resolved_text_config, **kwargs)
vllm/transformers_utils/configs/solar.py
0 → 100644
View file @
539aa992
# coding=utf-8
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Solar model configuration"""
from
transformers
import
PretrainedConfig
from
transformers.utils
import
logging
logger
=
logging
.
get_logger
(
__name__
)
class SolarConfig(PretrainedConfig):
    r"""
    This is the configuration class to store
    the configuration of a [`SolarModel`].
    It is used to instantiate an LLaMA model
    according to the specified arguments,
    defining the model architecture.
    Instantiating a configuration with the
    defaults will yield a similar
    configuration to that of the LLaMA-7B.
    Configuration objects inherit from [`PretrainedConfig`]
    and can be used to control the model outputs.
    Read the documentation from [`PretrainedConfig`] for more information.
    Args:
        vocab_size (`int`, *optional*, defaults to 32000):
            Vocabulary size of the LLaMA model.
            Defines the number of different tokens
            that can be represented by the `inputs_ids`
            passed when calling [`SolarModel`]
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 11008):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer decoder.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer
            in the Transformer decoder.
        num_key_value_heads (`int`, *optional*):
            This is the number of key_value heads that
            should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`,
            the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model
            will use Multi Query Attention (MQA)
            otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint,
            each group key and value head should be constructed
            by meanpooling all the original heads within that group.
            For more details checkout [this paper]
            (https://arxiv.org/pdf/2305.13245.pdf).
            If it is not specified, will default to
            `num_attention_heads`.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string)
            in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 2048):
            The maximum sequence length that this model might ever be used with.
            Solar 1 supports up to 2048 tokens,
            Solar 2 up to 4096, CodeSolar up to 16384.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of
            the truncated_normal_initializer for initializing
            all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return
            the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*):
            Padding token id.
        bos_token_id (`int`, *optional*, defaults to 1):
            Beginning of stream token id.
        eos_token_id (`int`, *optional*, defaults to 2):
            End of stream token id.
        pretraining_tp (`int`, *optional*, defaults to 1):
            Experimental feature. Tensor parallelism rank
            used during pretraining.
            Please refer to [this
            document](https://huggingface.co/docs/
            transformers/main/
            perf_train_gpu_many#tensor-parallelism)
            to understand more about it. This value is
            necessary to ensure exact reproducibility
            of the pretraining results.
            Please refer to [this
            issue](https://github.com/pytorch/pytorch/issues/76232).
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether to tie weight embeddings
        rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings.
        rope_scaling (`Dict`, *optional*):
            Dictionary containing the scaling configuration for
            the RoPE embeddings.
            Currently supports two scaling
            strategies: linear and dynamic.
            Their scaling factor must be a number greater than 1
            (an integer such as `2` is accepted as well as a float).
            The expected format is
            `{"type": strategy name, "factor": scaling factor}`.
            When using this flag, don't update
            `max_position_embeddings` to the expected new maximum.
            See the following thread for more information on how
            these scaling strategies behave:
            https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/
            dynamically_scaled_rope_further_increases/. This is an
            experimental feature, subject to breaking
            API changes in future versions.
        attention_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value
            and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        mlp_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in up_proj, down_proj and gate_proj
            layers in the MLP layers.
        sliding_window (`int`, *optional*, defaults to 2047):
            Sliding window attention window size. If not specified,
            will default to `2047`.
        bskcn_1 (`list`, *optional*, defaults to `[12, 20, 32, 44]`):
            Solar-specific setting. This class only stores the value;
            its meaning is defined by the model implementation.
        bskcn_2 (`list`, *optional*, defaults to `[20, 32]`):
            Solar-specific setting, stored as-is (see `bskcn_1`).
        bskcn_3 (`list`, *optional*, defaults to `[16, 24, 36, 48]`):
            Solar-specific setting, stored as-is (see `bskcn_1`).
        bskcn_4 (`list`, *optional*, defaults to `[28, 40]`):
            Solar-specific setting, stored as-is (see `bskcn_1`).
        bskcn_tv (`list`, *optional*, defaults to `[0.9, 0.8]`):
            Solar-specific setting, stored as-is (see `bskcn_1`).
    ```python
    >>> from transformers import SolarModel, SolarConfig
    >>> # Initializing a Solar-pro style configuration
    >>> configuration = SolarConfig()
    >>> # Initializing a model from the Solar-pro style configuration
    >>> model = SolarModel(configuration)
    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "solar"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=32000,
        hidden_size=4096,
        intermediate_size=11008,
        num_hidden_layers=32,
        num_attention_heads=32,
        num_key_value_heads=None,
        hidden_act="silu",
        max_position_embeddings=2048,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        pad_token_id=None,
        bos_token_id=1,
        eos_token_id=2,
        pretraining_tp=1,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        rope_scaling=None,
        attention_bias=False,
        attention_dropout=0.0,
        mlp_bias=False,
        sliding_window=2047,
        bskcn_1=None,
        bskcn_2=None,
        bskcn_3=None,
        bskcn_4=None,
        bskcn_tv=None,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads

        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.pretraining_tp = pretraining_tp
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        # Fail fast on a malformed `rope_scaling` before anything else is set.
        self._rope_scaling_validation()
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout
        self.mlp_bias = mlp_bias
        self.sliding_window = sliding_window
        # `None` selects the built-in defaults; a fresh list is created per
        # instance so the defaults are never shared between configs.
        self.bskcn_1 = bskcn_1 if bskcn_1 is not None else [12, 20, 32, 44]
        self.bskcn_2 = bskcn_2 if bskcn_2 is not None else [20, 32]
        self.bskcn_3 = bskcn_3 if bskcn_3 is not None else [16, 24, 36, 48]
        self.bskcn_4 = bskcn_4 if bskcn_4 is not None else [28, 40]
        self.bskcn_tv = bskcn_tv if bskcn_tv is not None else [0.9, 0.8]

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )

    def _rope_scaling_validation(self):
        """
        Validate the `rope_scaling` configuration.

        `None` is accepted as "no scaling". Otherwise the value must be a
        two-entry dict whose `type` is `"linear"` or `"dynamic"` and whose
        `factor` is a real number strictly greater than 1.

        Raises:
            ValueError: if `rope_scaling` does not have that shape.
        """
        if self.rope_scaling is None:
            return

        if (not isinstance(self.rope_scaling, dict)
                or len(self.rope_scaling) != 2):
            raise ValueError(
                "`rope_scaling` must be a dictionary with two fields,"
                " `type` and `factor`, "
                f"got {self.rope_scaling}")
        rope_scaling_type = self.rope_scaling.get("type", None)
        rope_scaling_factor = self.rope_scaling.get("factor", None)
        if rope_scaling_type is None or rope_scaling_type not in [
                "linear",
                "dynamic",
        ]:
            raise ValueError(
                f"`rope_scaling`'s type field must be one of "
                f"['linear', 'dynamic'], got {rope_scaling_type}")
        # Accept integers too: a JSON config with `"factor": 2` is parsed as
        # an int and used to be rejected. `bool` is an `int` subclass, so it
        # is excluded explicitly and stays invalid as before.
        factor_is_number = (isinstance(rope_scaling_factor, (int, float))
                            and not isinstance(rope_scaling_factor, bool))
        if not factor_is_number or rope_scaling_factor <= 1.0:
            raise ValueError(
                f"`rope_scaling`'s factor field must be a float > 1,"
                f" got {rope_scaling_factor}")
vllm/transformers_utils/detokenizer.py
View file @
539aa992
from
typing
import
Dict
,
List
,
Optional
,
Tuple
from
vllm.sequence
import
Logprob
,
SamplingParams
,
Sequence
,
SequenceGroup
from
vllm.sequence
import
(
VLLM_INVALID_TOKEN_ID
,
Logprob
,
SamplingParams
,
Sequence
,
SequenceGroup
)
from
.tokenizer
import
AnyTokenizer
from
.tokenizer_group
import
BaseTokenizerGroup
# Used eg. for marking rejected tokens in spec decoding.
INVALID_TOKEN_ID
=
-
1
class
Detokenizer
:
"""Provides methods to decode the output of a model into text."""
...
...
@@ -61,7 +59,7 @@ class Detokenizer:
continue
for
token_id
,
sample_logprob
in
prompt_logprobs_for_token
.
items
():
if
(
sample_logprob
.
decoded_token
is
None
and
token_id
!=
INVALID_TOKEN_ID
):
and
token_id
!=
VLLM_
INVALID_TOKEN_ID
):
prompt_token_ids_with_token
=
(
prompt_token_ids
[:
token_position
]
+
[
token_id
])
(
new_tokens
,
new_text
,
new_prefix_offset
,
...
...
@@ -143,7 +141,7 @@ class Detokenizer:
continue
if
(
sample_logprob
.
decoded_token
is
None
and
token_id
!=
INVALID_TOKEN_ID
):
and
token_id
!=
VLLM_
INVALID_TOKEN_ID
):
all_input_ids_with_logprob
=
previous_tokens
+
[
token_id
]
(
_
,
new_text
,
_
,
_
)
=
detokenize_incrementally
(
tokenizer
=
tokenizer
,
...
...
@@ -282,14 +280,14 @@ def detokenize_incrementally(
assert
prev_tokens
is
not
None
# If the new token id is out of bounds, return an empty string.
if
new_token_id
>=
len
(
tokenizer
):
new_tokens
=
[
""
]
else
:
if
0
<=
new_token_id
<
len
(
tokenizer
):
# Put new_token_id in a list so skip_special_tokens is respected
new_tokens
=
tokenizer
.
convert_ids_to_tokens
(
[
new_token_id
],
skip_special_tokens
=
skip_special_tokens
)
if
isinstance
(
new_tokens
,
str
):
new_tokens
=
[
new_tokens
]
else
:
new_tokens
=
[
""
]
output_tokens
=
prev_tokens
+
new_tokens
# If this is the first iteration, return all tokens.
...
...
vllm/transformers_utils/image_processor.py
deleted
100644 → 0
View file @
93872128
from
typing
import
cast
def get_video_processor(
    processor_name: str,
    trust_remote_code: bool = False,
):
    """
    Gets a processor for the given model name via HuggingFace.

    Args:
        processor_name: Name passed to `AutoProcessor.from_pretrained`.
        trust_remote_code: Forwarded to `AutoProcessor.from_pretrained`.

    Returns:
        The `video_processor` attribute of the loaded processor.

    Raises:
        RuntimeError: if loading raises `ValueError` while
            `trust_remote_code` is False; the message suggests enabling it.
        ValueError: re-raised unchanged when `trust_remote_code` is True.
    """
    from transformers import AutoProcessor

    try:
        # Forward the flag: the hint in the error message below is only
        # actionable if setting `trust_remote_code=True` reaches the loader.
        processor = AutoProcessor.from_pretrained(
            processor_name, trust_remote_code=trust_remote_code)
        video_processor = processor.video_processor
    except ValueError as e:
        if not trust_remote_code:
            err_msg = (
                "Failed to load the processor. If the processor is "
                "a custom processor not yet available in the HuggingFace "
                "transformers library, consider setting "
                "`trust_remote_code=True` in LLM or using the "
                "`--trust-remote-code` flag in the CLI.")
            raise RuntimeError(err_msg) from e
        else:
            raise e
    return video_processor
def get_image_processor(
    processor_name: str,
    *args,
    trust_remote_code: bool = False,
    **kwargs,
):
    """Gets an image processor for the given model name via HuggingFace.

    Extra positional and keyword arguments are passed straight through to
    `AutoImageProcessor.from_pretrained`. A `ValueError` from the loader
    becomes a `RuntimeError` with a `--trust-remote-code` hint unless
    `trust_remote_code` is already set, in which case it is re-raised.
    """
    # don't put this import at the top level
    # it will call torch.cuda.device_count()
    from transformers import AutoImageProcessor
    from transformers.image_processing_utils import BaseImageProcessor

    try:
        loaded = AutoImageProcessor.from_pretrained(
            processor_name,
            *args,
            trust_remote_code=trust_remote_code,
            **kwargs)
    except ValueError as exc:
        # If the error pertains to the processor class not existing or not
        # currently being imported, suggest using the --trust-remote-code flag.
        # Unlike AutoTokenizer, AutoImageProcessor does not separate such errors
        if trust_remote_code:
            raise exc
        hint = (
            "Failed to load the image processor. If the image processor is "
            "a custom processor not yet available in the HuggingFace "
            "transformers library, consider setting "
            "`trust_remote_code=True` in LLM or using the "
            "`--trust-remote-code` flag in the CLI.")
        raise RuntimeError(hint) from exc

    return cast(BaseImageProcessor, loaded)
vllm/transformers_utils/processor.py
View file @
539aa992
from
typing
import
cast
from
typing
import
Any
,
cast
def
get_processor
(
processor_name
:
str
,
*
args
,
*
args
:
Any
,
trust_remote_code
:
bool
=
False
,
**
kwargs
,
**
kwargs
:
Any
,
):
"""
Gets
a processor for the given model name via HuggingFace."""
"""
Load
a processor for the given model name via HuggingFace."""
# don't put this import at the top level
# it will call torch.cuda.device_count()
from
transformers
import
AutoProcessor
...
...
@@ -35,3 +35,60 @@ def get_processor(
raise
e
return
cast
(
ProcessorMixin
,
processor
)
def get_image_processor(
    processor_name: str,
    *args: Any,
    trust_remote_code: bool = False,
    **kwargs: Any,
):
    """Load an image processor for the given model name via HuggingFace.

    All extra arguments go unchanged to
    `AutoImageProcessor.from_pretrained`. When that call raises
    `ValueError` and `trust_remote_code` is off, a `RuntimeError` carrying
    a `--trust-remote-code` hint is raised instead; otherwise the original
    error propagates.
    """
    # don't put this import at the top level
    # it will call torch.cuda.device_count()
    from transformers import AutoImageProcessor
    from transformers.image_processing_utils import BaseImageProcessor

    try:
        image_processor = AutoImageProcessor.from_pretrained(
            processor_name,
            *args,
            trust_remote_code=trust_remote_code,
            **kwargs)
    except ValueError as exc:
        # If the error pertains to the processor class not existing or not
        # currently being imported, suggest using the --trust-remote-code flag.
        # Unlike AutoTokenizer, AutoImageProcessor does not separate such errors
        if trust_remote_code:
            raise exc
        message = (
            "Failed to load the image processor. If the image processor is "
            "a custom processor not yet available in the HuggingFace "
            "transformers library, consider setting "
            "`trust_remote_code=True` in LLM or using the "
            "`--trust-remote-code` flag in the CLI.")
        raise RuntimeError(message) from exc

    return cast(BaseImageProcessor, image_processor)
def get_video_processor(
    processor_name: str,
    *args: Any,
    trust_remote_code: bool = False,
    **kwargs: Any,
):
    """Load a video processor for the given model name via HuggingFace.

    The full processor is obtained through `get_processor` (all arguments
    are forwarded) and its `video_processor` attribute is returned.
    """
    # don't put this import at the top level
    # it will call torch.cuda.device_count()
    from transformers.image_processing_utils import BaseImageProcessor

    full_processor = get_processor(
        processor_name,
        *args,
        trust_remote_code=trust_remote_code,
        **kwargs,
    )
    video_part = full_processor.video_processor

    return cast(BaseImageProcessor, video_part)
vllm/transformers_utils/tokenizer.py
View file @
539aa992
...
...
@@ -111,7 +111,6 @@ def get_tokenizer(
'encoding and decoding.'
,
FutureWarning
,
stacklevel
=
2
)
if
tokenizer_mode
==
"mistral"
:
tokenizer
=
MistralTokenizer
.
from_pretrained
(
str
(
tokenizer_name
),
revision
=
revision
)
...
...
vllm/transformers_utils/tokenizers/mistral.py
View file @
539aa992
...
...
@@ -165,10 +165,9 @@ class MistralTokenizer:
messages
:
List
[
"ChatCompletionMessageParam"
],
tools
:
Optional
[
Dict
[
str
,
Any
]]
=
None
,
**
kwargs
)
->
List
[
int
]:
assert
tools
is
None
,
"`tools` are not yet supported."
request
=
ChatCompletionRequest
(
messages
=
message
s
)
# type: ignore[type-var]
request
=
ChatCompletionRequest
(
messages
=
messages
,
tools
=
tool
s
)
# type: ignore[type-var]
encoded
=
self
.
mistral
.
encode_chat_completion
(
request
)
# encode-decode to get clean prompt
...
...
@@ -176,9 +175,29 @@ class MistralTokenizer:
def
convert_tokens_to_string
(
self
,
tokens
:
List
[
str
])
->
str
:
if
isinstance
(
self
.
tokenizer
,
Tekkenizer
):
return
""
.
join
(
tokens
)
tokens
=
[
t
for
t
in
tokens
if
t
not
in
self
.
tokenizer
.
_all_special_tokens
]
if
any
(
isinstance
(
t
,
bytes
)
for
t
in
tokens
):
# we need to encode and decode all tokens again
shift
=
self
.
tokenizer
.
num_special_tokens
byte_tokens
=
[
t
.
encode
(
"utf-8"
)
if
not
isinstance
(
t
,
bytes
)
else
t
for
t
in
tokens
]
ids
=
[
self
.
tokenizer
.
_tekken_token2id_nospecial
[
t
]
+
shift
for
t
in
byte_tokens
]
decoded
=
self
.
tokenizer
.
decode
(
ids
)
else
:
decoded
=
""
.
join
(
tokens
)
else
:
return
self
.
tokenizer
.
decode
(
tokens
)
# type: ignore[arg-type]
decoded
=
self
.
tokenizer
.
decode
(
tokens
)
# type: ignore[arg-type]
return
decoded
def
decode
(
self
,
ids
:
Union
[
List
[
int
],
int
])
->
str
:
if
isinstance
(
ids
,
int
):
...
...
@@ -200,4 +219,11 @@ class MistralTokenizer:
self
.
tokenizer
)
tokens
=
[
self
.
tokenizer
.
id_to_piece
(
id
)
for
id
in
ids
]
if
any
(
t
.
strip
()
==
"�"
for
t
in
tokens
):
# if any stripped decoded token is undefined
# because it's invalid unicode then pass bytes
# See: https://github.com/vllm-project/vllm/pull/8640
tokens
=
[
self
.
tokenizer
.
id_to_byte_piece
(
id
)
for
id
in
ids
]
return
tokens
vllm/triton_utils/libentry.py
View file @
539aa992
...
...
@@ -35,8 +35,8 @@ class LibEntry(triton.KernelInterface):
dns_key
=
[
arg
.
dtype
if
hasattr
(
arg
,
"data_ptr"
)
else
type
(
arg
)
if
not
isinstance
(
arg
,
int
)
else
"i32"
if
-
(
2
**
31
)
<=
arg
and
arg
<=
2
**
31
-
1
else
"u64"
if
2
**
63
<=
arg
and
arg
<=
2
**
64
-
1
else
"i64"
else
"i32"
if
arg
>=
-
(
2
**
31
)
and
arg
<=
2
**
31
-
1
else
"u64"
if
arg
>=
2
**
63
and
arg
<=
2
**
64
-
1
else
"i64"
for
arg
in
dns_args
]
# const args passed by position
...
...
vllm/triton_utils/sample.py
deleted
100644 → 0
View file @
93872128
import
math
# Hardcoded limit in Triton (max block size), expressed in columns.
MAX_TRITON_N_COLS = 131072


def get_num_triton_sampler_splits(n_cols: int) -> int:
    """Return how many splits Triton sampling needs for `n_cols` columns.

    Triton can only handle up to `MAX_TRITON_N_COLS` columns at once, so a
    wider tensor must be split and the kernel called once per split. The
    result is `n_cols / MAX_TRITON_N_COLS` rounded up.
    """
    columns_per_split = MAX_TRITON_N_COLS
    split_ratio = n_cols / columns_per_split
    return math.ceil(split_ratio)
vllm/usage/usage_lib.py
View file @
539aa992
...
...
@@ -17,6 +17,7 @@ import torch
import
vllm.envs
as
envs
from
vllm.connections
import
global_http_connection
from
vllm.platforms
import
current_platform
from
vllm.version
import
__version__
as
VLLM_VERSION
_config_home
=
envs
.
VLLM_CONFIG_ROOT
...
...
@@ -151,7 +152,7 @@ class UsageMessage:
usage_context
:
UsageContext
,
extra_kvs
:
Dict
[
str
,
Any
])
->
None
:
# Platform information
if
torch
.
cuda
.
is_availabl
e
():
if
current_platform
.
is_cuda_alik
e
():
device_property
=
torch
.
cuda
.
get_device_properties
(
0
)
self
.
gpu_count
=
torch
.
cuda
.
device_count
()
self
.
gpu_type
=
device_property
.
name
...
...
vllm/utils.py
View file @
539aa992
...
...
@@ -4,7 +4,10 @@ import contextlib
import
datetime
import
enum
import
gc
import
inspect
import
ipaddress
import
os
import
random
import
socket
import
subprocess
import
sys
...
...
@@ -12,6 +15,7 @@ import tempfile
import
threading
import
uuid
import
warnings
import
weakref
from
asyncio
import
FIRST_COMPLETED
,
ensure_future
from
functools
import
lru_cache
,
partial
,
wraps
from
platform
import
uname
...
...
@@ -31,6 +35,7 @@ from typing_extensions import ParamSpec, TypeIs, assert_never
import
vllm.envs
as
envs
from
vllm.logger
import
enable_trace_function_call
,
init_logger
from
vllm.platforms
import
current_platform
logger
=
init_logger
(
__name__
)
...
...
@@ -70,10 +75,6 @@ STR_NOT_IMPL_ENC_DEC_SPEC_DEC = ("Speculative decoding is not "
"currently supported with encoder/"
"decoder models."
)
STR_NOT_IMPL_ENC_DEC_CUDAGRAPH
=
(
"CUDAGraph is not "
"currently supported with encoder/"
"decoder models."
)
STR_NOT_IMPL_ENC_DEC_BACKEND
=
(
"XFormers is the only backend "
"currently supported with encoder/"
"decoder models."
)
...
...
@@ -97,7 +98,6 @@ STR_NOT_IMPL_ENC_DEC_ERR_STRS = {
"STR_NOT_IMPL_ENC_DEC_PP"
:
STR_NOT_IMPL_ENC_DEC_PP
,
"STR_NOT_IMPL_ENC_DEC_MM"
:
STR_NOT_IMPL_ENC_DEC_MM
,
"STR_NOT_IMPL_ENC_DEC_SPEC_DEC"
:
STR_NOT_IMPL_ENC_DEC_SPEC_DEC
,
"STR_NOT_IMPL_ENC_DEC_CUDA_GRAPH"
:
STR_NOT_IMPL_ENC_DEC_CUDAGRAPH
,
"STR_NOT_IMPL_ENC_DEC_BACKEND"
:
STR_NOT_IMPL_ENC_DEC_BACKEND
,
"STR_NOT_IMPL_ENC_DEC_PROMPT_ADAPTER"
:
STR_NOT_IMPL_ENC_DEC_PROMPT_ADAPTER
,
"STR_NOT_IMPL_ENC_DEC_CPU"
:
STR_NOT_IMPL_ENC_DEC_CPU
...
...
@@ -269,7 +269,7 @@ class LRUCache(Generic[T]):
class
PyObjectCache
:
"""Used to cache python objects to avoid object allocations
"""Used to cache python objects to avoid object allocations
across scheduler iterations.
"""
...
...
@@ -288,7 +288,7 @@ class PyObjectCache:
self
.
_obj_cache
.
append
(
self
.
_obj_builder
())
def
get_object
(
self
):
"""Returns a pre-allocated cached object. If there is not enough
"""Returns a pre-allocated cached object. If there is not enough
objects, then the cache size will double.
"""
if
self
.
_index
>=
len
(
self
.
_obj_cache
):
...
...
@@ -377,6 +377,22 @@ def get_cpu_memory() -> int:
return
psutil
.
virtual_memory
().
total
def seed_everything(seed: int) -> None:
    """
    Set the seed of each random module.

    Seeds Python's `random`, NumPy's global generator and torch's default
    generator, plus the CUDA-like / XPU device generators when the
    corresponding platform check passes.

    Loosely based on: https://github.com/Lightning-AI/pytorch-lightning/blob/2.4.0/src/lightning/fabric/utilities/seed.py#L20

    Args:
        seed: Seed value handed to every generator.
    """
    random.seed(seed)
    np.random.seed(seed)
    # Seed torch's default generator as well. Without this, tensors created
    # on a platform that is neither CUDA-like nor XPU are not reproducible,
    # although callers rely on this function for determinism.
    torch.manual_seed(seed)

    if current_platform.is_cuda_alike():
        torch.cuda.manual_seed_all(seed)

    if is_xpu():
        torch.xpu.manual_seed_all(seed)
def random_uuid() -> str:
    """Return the hex form of a freshly generated random (version 4) UUID."""
    fresh_id = uuid.uuid4()
    return str(fresh_id.hex)
...
...
@@ -518,6 +534,14 @@ def get_ip() -> str:
return
"0.0.0.0"
def is_valid_ipv6_address(address: str) -> bool:
    """Tell whether `address` parses as an IPv6 address.

    Returns True when `ipaddress.IPv6Address` accepts the string and False
    when it raises `ValueError`.
    """
    try:
        ipaddress.IPv6Address(address)
    except ValueError:
        return False
    return True
def
get_distributed_init_method
(
ip
:
str
,
port
:
int
)
->
str
:
# Brackets are not permitted in ipv4 addresses,
# see https://github.com/python/cpython/issues/103848
...
...
@@ -638,9 +662,7 @@ def create_kv_caches_with_random_flash(
seed
:
int
=
0
,
device
:
Optional
[
str
]
=
"cuda"
,
)
->
Tuple
[
List
[
torch
.
Tensor
],
List
[
torch
.
Tensor
]]:
torch
.
random
.
manual_seed
(
seed
)
if
torch
.
cuda
.
is_available
():
torch
.
cuda
.
manual_seed
(
seed
)
seed_everything
(
seed
)
torch_dtype
=
get_kv_cache_torch_dtype
(
cache_dtype
,
model_dtype
)
key_value_cache_shape
=
(
num_blocks
,
2
,
block_size
,
num_heads
,
head_size
)
...
...
@@ -682,9 +704,7 @@ def create_kv_caches_with_random(
f
"Does not support key cache of type fp8 with head_size
{
head_size
}
"
)
torch
.
random
.
manual_seed
(
seed
)
if
torch
.
cuda
.
is_available
():
torch
.
cuda
.
manual_seed
(
seed
)
seed_everything
(
seed
)
torch_dtype
=
get_kv_cache_torch_dtype
(
cache_dtype
,
model_dtype
)
...
...
@@ -747,14 +767,14 @@ def is_pin_memory_available() -> bool:
return
True
class
Cuda
MemoryProfiler
:
class
Device
MemoryProfiler
:
def
__init__
(
self
,
device
:
Optional
[
torch
.
types
.
Device
]
=
None
):
self
.
device
=
device
def
current_memory_usage
(
self
)
->
float
:
# Return the memory usage in bytes.
if
torch
.
cuda
.
is_availabl
e
():
if
current_platform
.
is_cuda_alik
e
():
torch
.
cuda
.
reset_peak_memory_stats
(
self
.
device
)
mem
=
torch
.
cuda
.
max_memory_allocated
(
self
.
device
)
elif
is_xpu
():
...
...
@@ -836,15 +856,6 @@ def async_tensor_h2d(
return
t
.
to
(
device
=
target_device
,
non_blocking
=
True
)
def maybe_expand_dim(tensor: torch.Tensor,
                     target_dims: int,
                     size: int = 1) -> torch.Tensor:
    """Expand the tensor to the target_dims.

    A tensor that already has at least `target_dims` dimensions is returned
    unchanged. Otherwise it is viewed as `(-1, size, ..., size)`, with one
    `size` entry per missing dimension.
    """
    missing_dims = target_dims - tensor.ndim
    if missing_dims <= 0:
        return tensor
    trailing_shape = [size] * missing_dims
    return tensor.view(-1, *trailing_shape)
def get_dtype_size(dtype: torch.dtype) -> int:
    """Get the size of the data type in bytes.

    The size is read from an empty tensor created with the given dtype.
    """
    probe = torch.tensor([], dtype=dtype)
    return probe.element_size()
...
...
@@ -1069,7 +1080,7 @@ def _cuda_device_count_stateless(
def
cuda_device_count_stateless
()
->
int
:
"""Get number of CUDA devices, caching based on the value of
CUDA_VISIBLE_DEVICES at the time of call.
This should be used instead of torch.cuda.device_count()
unless CUDA_VISIBLE_DEVICES has already been set to the desired
value."""
...
...
@@ -1079,6 +1090,20 @@ def cuda_device_count_stateless() -> int:
return
_cuda_device_count_stateless
(
envs
.
CUDA_VISIBLE_DEVICES
)
def weak_bind(bound_method: Callable[..., Any], ) -> Callable[..., None]:
    """Make an instance method that weakly references
    its associated instance and no-ops once that
    instance is collected.

    Args:
        bound_method: A bound method; its `__self__` is held only through a
            weak reference and its `__func__` is called directly.

    Returns:
        A callable that forwards its arguments to the method while the
        instance is alive and always returns None.
    """
    ref = weakref.ref(bound_method.__self__)  # type: ignore[attr-defined]
    unbound = bound_method.__func__  # type: ignore[attr-defined]

    def weak_bound(*args, **kwargs) -> None:
        inst = ref()
        # Compare against None rather than testing truthiness: a live
        # instance may be falsy (e.g. it defines `__len__` returning 0) and
        # must still receive the call.
        if inst is not None:
            unbound(inst, *args, **kwargs)

    return weak_bound
#From: https://stackoverflow.com/a/4104188/2749989
def
run_once
(
f
:
Callable
[
P
,
None
])
->
Callable
[
P
,
None
]:
...
...
@@ -1121,10 +1146,10 @@ class FlexibleArgumentParser(argparse.ArgumentParser):
def
_pull_args_from_config
(
args
:
List
[
str
])
->
List
[
str
]:
"""Method to pull arguments specified in the config file
into the command-line args variable.
The arguments in config file will be inserted between
The arguments in config file will be inserted between
the argument list.
example:
```yaml
port: 12323
...
...
@@ -1135,21 +1160,21 @@ class FlexibleArgumentParser(argparse.ArgumentParser):
--config config.yaml -tp 2
$: args = [
"serve,chat,complete",
"facebook/opt-12B",
'--config', 'config.yaml',
"facebook/opt-12B",
'--config', 'config.yaml',
'-tp', '2'
]
$: args = [
"serve,chat,complete",
"facebook/opt-12B",
'--port', '12323',
'--tensor-parallel-size', '4',
"facebook/opt-12B",
'--port', '12323',
'--tensor-parallel-size', '4',
'-tp', '2'
]
```
Please note how the config args are inserted after the sub command.
this way the order of priorities is maintained when these are args
this way the order of priorities is maintained when these are args
parsed by super().
"""
assert
args
.
count
(
...
...
@@ -1175,7 +1200,7 @@ class FlexibleArgumentParser(argparse.ArgumentParser):
@
staticmethod
def
_load_config_file
(
file_path
:
str
)
->
List
[
str
]:
"""Loads a yaml file and returns the key value pairs as a
"""Loads a yaml file and returns the key value pairs as a
flattened list with argparse like pattern
```yaml
port: 12323
...
...
@@ -1186,7 +1211,7 @@ class FlexibleArgumentParser(argparse.ArgumentParser):
'--port': '12323',
'--tensor-parallel-size': '4'
]
"""
extension
:
str
=
file_path
.
split
(
'.'
)[
-
1
]
...
...
@@ -1222,6 +1247,53 @@ async def _run_task_with_lock(task: Callable, lock: asyncio.Lock, *args,
return
await
task
(
*
args
,
**
kwargs
)
def get_allowed_kwarg_only_overrides(
    callable: Callable[..., object],
    overrides: Optional[Dict[str, Any]],
) -> Dict[str, Any]:
    """
    Given a callable which has one or more keyword only params and a dict
    mapping param names to values, drop values that cannot be kwarg
    expanded to overwrite one or more keyword-only args. This is used in a
    few places to handle custom processor overrides for multimodal models,
    e.g., for profiling when processor options provided by the user
    may affect the number of mm tokens per instance.

    Args:
        callable: Callable which takes 0 or more keyword only arguments.
        overrides: Potential overrides to be used when invoking the callable.

    Returns:
        Dictionary containing the kwargs to be leveraged which may be used
        to overwrite one or more keyword only arguments when invoking the
        callable. Empty when `overrides` is None or empty.
    """
    if not overrides:
        return {}

    # A set gives constant-time membership tests in the filter below.
    allowed_override_names = {
        name
        for name, param in inspect.signature(callable).parameters.items()
        if param.kind == inspect.Parameter.KEYWORD_ONLY
    }

    # Drop any mm_processor_kwargs provided by the user that are
    # not kwarg names accepted by the provided input processor.
    filtered_overrides = {
        kwarg_name: val
        for kwarg_name, val in overrides.items()
        if kwarg_name in allowed_override_names
    }

    # If anything is dropped, log a warning
    dropped_keys = overrides.keys() - filtered_overrides.keys()
    if dropped_keys:
        logger.warning(
            "The following intended overrides are not keyword-only args "
            "and will be dropped: %s", dropped_keys)

    return filtered_overrides
# Using dynamo with vLLM doesn't really work well with PyTorch versions < 2.4.0.
# In particular, the FakeScalarType is not supported for earlier versions of
# PyTorch which breaks dynamo for any ops registered using ScalarType.
...
...
@@ -1230,6 +1302,12 @@ def supports_dynamo() -> bool:
return
base_torch_version
>=
Version
(
"2.4.0"
)
# Some backends use pytorch version < 2.4.0 which doesn't
# support `torch.library.custom_op`.
def supports_custom_op() -> bool:
    """Return True when the installed torch exposes ``torch.library.custom_op``."""
    library = torch.library
    return hasattr(library, "custom_op")
class
AtomicCounter
:
"""An atomic, thread-safe counter"""
...
...
vllm/version.py
View file @
539aa992
import
warnings
try
:
import
vllm.commit_id
__commit__
=
vllm
.
commit_id
.
__commit__
from
._version
import
__version__
,
__version_tuple__
except
Exception
as
e
:
import
warnings
warnings
.
warn
(
f
"Failed to read commit hash:
\n
{
e
}
"
,
RuntimeWarning
,
stacklevel
=
2
)
__commit__
=
"COMMIT_HASH_PLACEHOLDER"
__version__
=
"0.6.1.post2"
__version__
=
"dev"
__version_tuple__
=
(
0
,
0
,
__version__
)
vllm/vllm_flash_attn/.gitkeep
0 → 100644
View file @
539aa992
vllm/worker/cpu_model_runner.py
View file @
539aa992
import
dataclasses
import
weakref
from
dataclasses
import
dataclass
from
typing
import
TYPE_CHECKING
,
Any
,
Dict
,
List
,
Optional
,
Tuple
,
Type
,
Union
...
...
@@ -10,14 +12,16 @@ from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
SchedulerConfig
)
from
vllm.logger
import
init_logger
from
vllm.model_executor
import
SamplingMetadata
from
vllm.model_executor.layers.rotary_embedding
import
MRotaryEmbedding
from
vllm.model_executor.layers.sampler
import
SamplerOutput
from
vllm.model_executor.model_loader
import
get_model
from
vllm.multimodal
import
(
MULTIMODAL_REGISTRY
,
BatchedTensorInputs
,
MultiModalInputs
)
from
vllm.sequence
import
IntermediateTensors
,
SequenceGroupMetadata
from
vllm.sequence
import
(
IntermediateTensors
,
SequenceData
,
SequenceGroupMetadata
)
from
vllm.utils
import
STR_NOT_IMPL_ENC_DEC_ERR_STRS
,
make_tensor_with_pad
from
vllm.worker.model_runner_base
import
(
ModelRunnerBase
,
ModelRunnerInputBase
,
ModelRunnerBase
,
ModelRunnerInputBase
,
ModelRunnerInputBuilderBase
,
_add_attn_metadata_broadcastable_dict
,
_add_sampling_metadata_broadcastable_dict
,
_init_attn_metadata_from_tensor_dict
,
...
...
@@ -32,16 +36,17 @@ _PAD_SLOT_ID = -1
@
dataclass
(
frozen
=
True
)
class
CPU
ModelInput
(
ModelRunnerInputBase
):
class
ModelInput
ForCPU
(
ModelRunnerInputBase
):
"""
Used by the CPUModelRunner.
Base class contains metadata needed for the base model forward pass on CPU
"""
input_tokens
:
Optional
[
torch
.
Tensor
]
=
None
input_positions
:
Optional
[
torch
.
Tensor
]
=
None
attn_metadata
:
Optional
[
"AttentionMetadata"
]
=
None
sampling_metadata
:
Optional
[
"SamplingMetadata"
]
=
None
multi_modal_kwargs
:
Optional
[
BatchedTensorInputs
]
=
None
virtual_engine
:
Optional
[
int
]
=
None
seq_lens
:
Optional
[
List
[
int
]]
=
None
query_lens
:
Optional
[
List
[
int
]]
=
None
def
as_broadcastable_tensor_dict
(
self
)
->
Dict
[
str
,
Union
[
int
,
torch
.
Tensor
]]:
...
...
@@ -51,16 +56,44 @@ class CPUModelInput(ModelRunnerInputBase):
"multi_modal_kwargs"
:
self
.
multi_modal_kwargs
,
}
_add_attn_metadata_broadcastable_dict
(
tensor_dict
,
self
.
attn_metadata
)
return
tensor_dict
@classmethod
def from_broadcasted_tensor_dict(
    cls: Type["ModelInputForCPU"],
    tensor_dict: Dict[str, Any],
    attn_backend: Optional["AttentionBackend"] = None
) -> "ModelInputForCPU":
    """Construct ``cls`` from a broadcasted tensor dict.

    Args:
        tensor_dict: Mapping expanded as keyword arguments to ``cls``.
        attn_backend: When given, ``tensor_dict`` is first passed through
            ``_init_attn_metadata_from_tensor_dict`` together with this
            backend, and the returned mapping is used instead.

    Returns:
        The instance produced by ``cls(**...)``.
    """
    # No backend: the dict is used exactly as received.
    if attn_backend is None:
        return cls(**tensor_dict)

    init_kwargs = _init_attn_metadata_from_tensor_dict(
        attn_backend, tensor_dict)
    return cls(**init_kwargs)
@
dataclass
(
frozen
=
True
)
class
ModelInputForCPUWithSamplingMetadata
(
ModelInputForCPU
):
"""
Used by the ModelRunner.
"""
sampling_metadata
:
Optional
[
"SamplingMetadata"
]
=
None
def as_broadcastable_tensor_dict(self) -> Dict[str, Any]:
    """Collect this input's fields into a dict intended for broadcasting.

    The dict starts with ``input_tokens`` and ``input_positions`` and is
    then handed, together with ``self.attn_metadata`` and
    ``self.sampling_metadata`` respectively, to the two
    ``_add_*_broadcastable_dict`` helpers before being returned.
    """
    tensor_dict = dict(
        input_tokens=self.input_tokens,
        input_positions=self.input_positions,
    )
    # presumably the helpers add their entries to tensor_dict in place —
    # their return values are not used here; verify against the helpers.
    _add_attn_metadata_broadcastable_dict(tensor_dict, self.attn_metadata)
    _add_sampling_metadata_broadcastable_dict(tensor_dict,
                                              self.sampling_metadata)
    return tensor_dict
@
classmethod
def
from_broadcasted_tensor_dict
(
cls
:
Type
[
"CPUModelInput"
]
,
tensor_dict
:
Dict
[
str
,
Any
],
attn_backend
:
Optional
[
"AttentionBackend"
]
=
None
)
->
"
CPU
ModelInput"
:
cls
,
tensor_dict
:
Dict
[
str
,
Any
],
attn_backend
:
Optional
[
"AttentionBackend"
]
=
None
,
)
->
"ModelInput
ForCPUWithSamplingMetadata
"
:
tensor_dict
=
_init_sampling_metadata_from_tensor_dict
(
tensor_dict
)
if
attn_backend
is
not
None
:
tensor_dict
=
_init_attn_metadata_from_tensor_dict
(
...
...
@@ -68,71 +101,83 @@ class CPUModelInput(ModelRunnerInputBase):
return
cls
(
**
tensor_dict
)
class
CPU
Model
Runn
er
(
ModelRunnerBase
[
CPU
ModelInput
]):
class
Model
InputForCPUBuild
er
(
ModelRunner
InputBuilder
Base
[
ModelInput
ForCPU
]):
def
__init__
(
self
,
model_config
:
ModelConfig
,
parallel_config
:
ParallelConfig
,
scheduler_config
:
SchedulerConfig
,
device_config
:
DeviceConfig
,
cache_config
:
CacheConfig
,
load_config
:
LoadConfig
,
lora_config
:
Optional
[
LoRAConfig
],
kv_cache_dtype
:
Optional
[
str
]
=
"auto"
,
prompt_adapter_config
:
Optional
[
PromptAdapterConfig
]
=
None
,
is_driver_worker
:
bool
=
False
,
*
args
,
**
kwargs
,
):
self
.
model_config
=
model_config
self
.
parallel_config
=
parallel_config
self
.
scheduler_config
=
scheduler_config
# Currently, CPU worker doesn't support chunked prefill.
assert
self
.
scheduler_config
.
chunked_prefill_enabled
is
False
self
.
device_config
=
device_config
self
.
cache_config
=
cache_config
self
.
lora_config
=
lora_config
self
.
prompt_adapter_config
=
prompt_adapter_config
self
.
load_config
=
load_config
self
.
is_driver_worker
=
is_driver_worker
def
__init__
(
self
,
runner
:
"CPUModelRunner"
,
finished_requests_ids
:
Optional
[
List
[
str
]]
=
None
)
->
None
:
super
().
__init__
()
self
.
seq_group_metadata_list
:
List
[
SequenceGroupMetadata
]
=
[]
self
.
runner
=
runner
self
.
model_input_cls
=
self
.
runner
.
_model_input_cls
self
.
attn_backend
=
self
.
runner
.
attn_backend
self
.
sliding_window
=
self
.
runner
.
sliding_window
self
.
block_size
=
self
.
runner
.
block_size
self
.
device
=
self
.
runner
.
device
self
.
multi_modal_input_mapper
=
self
.
runner
.
multi_modal_input_mapper
self
.
device
=
self
.
device_config
.
device
def add_seq_group(self, seq_group_metadata: SequenceGroupMetadata):
    """Append ``seq_group_metadata`` to ``self.seq_group_metadata_list``."""
    pending_groups = self.seq_group_metadata_list
    pending_groups.append(seq_group_metadata)
self
.
kv_cache_dtype
=
kv_cache_dtype
self
.
sliding_window
=
model_config
.
get_sliding_window
()
self
.
block_size
=
cache_config
.
block_size
self
.
attn_backend
=
get_attn_backend
(
self
.
model_config
.
get_num_attention_heads
(
self
.
parallel_config
),
self
.
model_config
.
get_head_size
(),
self
.
model_config
.
get_num_kv_heads
(
self
.
parallel_config
),
self
.
model_config
.
get_sliding_window
(),
self
.
model_config
.
dtype
,
self
.
kv_cache_dtype
,
self
.
block_size
,
)
# Multi-modal data support
self
.
mm_registry
=
MULTIMODAL_REGISTRY
self
.
multi_modal_input_mapper
=
self
.
mm_registry
\
.
create_input_mapper
(
self
.
model_config
)
self
.
mm_registry
.
init_mm_limits_per_prompt
(
self
.
model_config
)
# Lazy initialization.
self
.
model
:
nn
.
Module
# Set after init_Model
def
build
(
self
)
->
ModelInputForCPU
:
multi_modal_kwargs
=
None
# NOTE: We assume that all sequences in the group are all prompts or
# all decodes.
is_prompt
=
self
.
seq_group_metadata_list
[
0
].
is_prompt
# Prepare input tensors.
if
is_prompt
:
(
input_tokens
,
input_positions
,
attn_metadata
,
seq_lens
,
multi_modal_kwargs
)
=
self
.
_prepare_prompt
(
self
.
seq_group_metadata_list
)
else
:
(
input_tokens
,
input_positions
,
attn_metadata
)
=
self
.
_prepare_decode
(
self
.
seq_group_metadata_list
)
seq_lens
=
[]
if
self
.
model_config
.
is_encoder_decoder_model
:
raise
NotImplementedError
(
STR_NOT_IMPL_ENC_DEC_ERR_STRS
[
'STR_NOT_IMPL_ENC_DEC_CPU'
])
return
self
.
model_input_cls
(
input_tokens
=
input_tokens
,
input_positions
=
input_positions
,
attn_metadata
=
attn_metadata
,
multi_modal_kwargs
=
multi_modal_kwargs
,
# query_lens is not needed if chunked prefill is not
# supported. Since CPU worker doesn't support chunked prefill
# just use seq_lens instead.
seq_lens
=
seq_lens
,
query_lens
=
seq_lens
,
)
def load_model(self) -> None:
    """Create the model with ``get_model`` and store it on ``self.model``.

    Every config object held by this instance is forwarded as a keyword
    argument.
    """
    loader_kwargs = dict(
        model_config=self.model_config,
        load_config=self.load_config,
        device_config=self.device_config,
        lora_config=self.lora_config,
        parallel_config=self.parallel_config,
        scheduler_config=self.scheduler_config,
        cache_config=self.cache_config,
    )
    self.model = get_model(**loader_kwargs)
def _compute_multi_modal_input(self, seq_data: SequenceData, mm_data,
                               computed_len: int):
    """Map multi-modal data to model kwargs and, for mrope models, positions.

    Args:
        seq_data: Sequence whose token ids are read; for mrope models its
            ``mrope_position_delta`` attribute is overwritten.
        mm_data: Passed unchanged to ``self.multi_modal_input_mapper``.
        computed_len: Forwarded as ``context_len`` to
            ``MRotaryEmbedding.get_input_positions``.

    Returns:
        ``(mm_kwargs, mrope_positions)``; ``mrope_positions`` is ``None``
        when ``self.runner.model_is_mrope`` is falsy.
    """
    mm_kwargs = self.multi_modal_input_mapper(mm_data)

    # Nothing more to do unless the model uses mrope.
    if not self.runner.model_is_mrope:
        return mm_kwargs, None

    # special processing for mrope position deltas.
    image_grid = mm_kwargs.get("image_grid_thw", None)
    video_grid = mm_kwargs.get("video_grid_thw", None)
    assert not (image_grid is None and video_grid is None), (
        "mrope embedding type requires multi-modal input mapper "
        "returns 'image_grid_thw' or 'video_grid_thw'.")

    hf_config = self.runner.model_config.hf_config
    token_ids = seq_data.get_token_ids()

    positions_and_delta = MRotaryEmbedding.get_input_positions(
        token_ids,
        image_grid_thw=image_grid,
        video_grid_thw=video_grid,
        image_token_id=hf_config.image_token_id,
        video_token_id=hf_config.video_token_id,
        vision_start_token_id=hf_config.vision_start_token_id,
        vision_end_token_id=hf_config.vision_end_token_id,
        spatial_merge_size=hf_config.vision_config.spatial_merge_size,
        context_len=computed_len,
    )
    mrope_positions, position_delta = positions_and_delta

    # Stored on the sequence; the decode path in this file reads
    # seq_data.mrope_position_delta.
    seq_data.mrope_position_delta = position_delta
    return mm_kwargs, mrope_positions
def
_prepare_prompt
(
self
,
...
...
@@ -142,6 +187,8 @@ class CPUModelRunner(ModelRunnerBase[CPUModelInput]):
assert
len
(
seq_group_metadata_list
)
>
0
input_tokens
:
List
[
int
]
=
[]
input_positions
:
List
[
int
]
=
[]
input_mrope_positions
:
List
[
List
[
int
]]
=
[[]
for
_
in
range
(
3
)]
slot_mapping
:
List
[
int
]
=
[]
seq_lens
:
List
[
int
]
=
[]
multi_modal_inputs_list
:
List
[
MultiModalInputs
]
=
[]
...
...
@@ -160,15 +207,20 @@ class CPUModelRunner(ModelRunnerBase[CPUModelInput]):
seq_lens
.
append
(
seq_len
)
# Prompt token num
input_tokens
.
extend
(
prompt_tokens
)
# Token ids
mrope_positions
=
None
if
(
mm_data
:
=
seq_group_metadata
.
multi_modal_data
):
mm_kwargs
,
mrope_positions
=
self
.
_compute_multi_modal_input
(
seq_data
,
mm_data
,
computed_len
)
multi_modal_inputs_list
.
append
(
mm_kwargs
)
# Token position ids
# NOTE(woosuk): Here we assume that the first token in the prompt
# is always the first token in the sequence.
input_positions
.
extend
(
list
(
range
(
computed_len
,
seq_len
)))
mm_data
=
seq_group_metadata
.
multi_modal_data
if
mm_data
:
mm_kwargs
=
self
.
multi_modal_input_mapper
(
mm_data
)
multi_modal_inputs_list
.
append
(
mm_kwargs
)
if
mrope_positions
:
for
idx
in
range
(
3
):
input_mrope_positions
[
idx
].
extend
(
mrope_positions
[
idx
])
else
:
input_positions
.
extend
(
list
(
range
(
computed_len
,
seq_len
)))
# Compute the slot mapping.
block_table
=
seq_group_metadata
.
block_tables
[
seq_id
]
...
...
@@ -192,12 +244,18 @@ class CPUModelRunner(ModelRunnerBase[CPUModelInput]):
slot
=
block_number
*
self
.
block_size
+
block_offset
slot_mapping
.
append
(
slot
)
if
any
(
input_mrope_positions
):
input_positions
=
None
# type: ignore
else
:
input_mrope_positions
=
None
# type: ignore
num_prompt_tokens
=
len
(
input_tokens
)
input_tokens
=
torch
.
tensor
(
input_tokens
,
dtype
=
torch
.
long
,
device
=
self
.
device
)
# type: ignore
input_positions
=
torch
.
tensor
(
input_positions
,
input_positions
=
torch
.
tensor
(
input_positions
or
input_mrope_positions
,
dtype
=
torch
.
long
,
device
=
self
.
device
)
# type: ignore
slot_mapping
=
torch
.
tensor
(
slot_mapping
,
...
...
@@ -228,6 +286,7 @@ class CPUModelRunner(ModelRunnerBase[CPUModelInput]):
assert
len
(
seq_group_metadata_list
)
>
0
input_tokens
:
List
[
int
]
=
[]
input_positions
:
List
[
int
]
=
[]
input_mrope_positions
:
List
[
List
[
int
]]
=
[[]
for
_
in
range
(
3
)]
slot_mapping
:
List
[
int
]
=
[]
seq_lens
:
List
[
int
]
=
[]
block_tables
:
List
[
List
[
int
]]
=
[]
...
...
@@ -245,7 +304,17 @@ class CPUModelRunner(ModelRunnerBase[CPUModelInput]):
seq_len
=
seq_data
.
get_len
()
position
=
seq_len
-
1
input_positions
.
append
(
position
)
if
seq_data
.
mrope_position_delta
is
not
None
:
context_len
=
seq_data
.
get_num_computed_tokens
()
next_pos
=
MRotaryEmbedding
.
get_next_input_positions
(
seq_data
.
mrope_position_delta
,
context_len
,
seq_len
,
)
for
idx
in
range
(
3
):
input_mrope_positions
[
idx
].
extend
(
next_pos
[
idx
])
else
:
input_positions
.
append
(
position
)
seq_len
=
seq_len
if
self
.
sliding_window
is
None
else
min
(
seq_len
,
self
.
sliding_window
)
...
...
@@ -263,12 +332,18 @@ class CPUModelRunner(ModelRunnerBase[CPUModelInput]):
block_table
=
block_table
[
-
sliding_window_blocks
:]
block_tables
.
append
(
block_table
)
if
any
(
input_mrope_positions
):
input_positions
=
None
# type: ignore
else
:
input_mrope_positions
=
None
# type: ignore
max_decode_seq_len
=
max
(
seq_lens
)
input_tokens
=
torch
.
tensor
(
input_tokens
,
dtype
=
torch
.
long
,
device
=
self
.
device
)
input_positions
=
torch
.
tensor
(
input_positions
,
input_positions
=
torch
.
tensor
(
input_positions
or
input_mrope_positions
,
dtype
=
torch
.
long
,
device
=
self
.
device
)
slot_mapping
=
torch
.
tensor
(
slot_mapping
,
...
...
@@ -302,56 +377,139 @@ class CPUModelRunner(ModelRunnerBase[CPUModelInput]):
attn_metadata
,
)
class
CPUModelRunner
(
ModelRunnerBase
[
ModelInputForCPU
]):
_model_input_cls
:
Type
[
ModelInputForCPUWithSamplingMetadata
]
=
(
ModelInputForCPUWithSamplingMetadata
)
_builder_cls
:
Type
[
ModelInputForCPUBuilder
]
=
ModelInputForCPUBuilder
def __init__(
    self,
    model_config: ModelConfig,
    parallel_config: ParallelConfig,
    scheduler_config: SchedulerConfig,
    device_config: DeviceConfig,
    cache_config: CacheConfig,
    load_config: LoadConfig,
    lora_config: Optional[LoRAConfig],
    kv_cache_dtype: Optional[str] = "auto",
    prompt_adapter_config: Optional[PromptAdapterConfig] = None,
    is_driver_worker: bool = False,
    *args,
    **kwargs,
):
    """Store the configs and set up attention backend and multi-modal mapper.

    The config objects are kept as attributes of the same name. Extra
    positional and keyword arguments (``*args``, ``**kwargs``) are accepted
    but not used in this body.

    Raises:
        AssertionError: if ``scheduler_config.chunked_prefill_enabled`` is
            not ``False``.
        NotImplementedError: if ``model_config.is_encoder_decoder_model``
            is truthy (checked last, after all attributes are set).
    """
    self.model_config = model_config
    self.parallel_config = parallel_config
    self.scheduler_config = scheduler_config
    # Currently, CPU worker doesn't support chunked prefill.
    assert self.scheduler_config.chunked_prefill_enabled is False
    self.device_config = device_config
    self.cache_config = cache_config
    self.lora_config = lora_config
    self.prompt_adapter_config = prompt_adapter_config
    self.load_config = load_config
    self.is_driver_worker = is_driver_worker

    self.device = self.device_config.device

    self.kv_cache_dtype = kv_cache_dtype
    self.sliding_window = model_config.get_sliding_window()
    self.block_size = cache_config.block_size
    # The backend is selected from head counts/size, sliding window,
    # model dtype, KV-cache dtype and block size.
    self.attn_backend = get_attn_backend(
        self.model_config.get_num_attention_heads(self.parallel_config),
        self.model_config.get_head_size(),
        self.model_config.get_num_kv_heads(self.parallel_config),
        self.model_config.get_sliding_window(),
        self.model_config.dtype,
        self.kv_cache_dtype,
        self.block_size,
    )

    # Multi-modal data support
    self.mm_registry = MULTIMODAL_REGISTRY
    self.multi_modal_input_mapper = self.mm_registry \
        .create_input_mapper(self.model_config)
    self.mm_registry.init_mm_limits_per_prompt(self.model_config)

    # Lazy initialization.
    self.model: nn.Module  # Set after init_Model

    # Encoder-decoder models are rejected on this runner.
    if self.model_config.is_encoder_decoder_model:
        raise NotImplementedError(
            STR_NOT_IMPL_ENC_DEC_ERR_STRS['STR_NOT_IMPL_ENC_DEC_CPU'])
@property
def model_is_mrope(self) -> bool:
    """Tell whether the HF config declares the "mrope" rope_scaling type.

    mrope requires keeping "rope_deltas" between the prompt and decoding
    phases. A missing ``rope_scaling`` attribute is treated as an empty
    dict, and an explicit ``None`` yields ``False``.
    """
    hf_config = self.model_config.hf_config
    scaling = getattr(hf_config, "rope_scaling", {})
    return scaling is not None and scaling.get("type", None) == "mrope"
def load_model(self) -> None:
    """Build the model through ``get_model`` and assign it to ``self.model``.

    All config objects stored on this runner are passed by keyword.
    """
    model = get_model(
        model_config=self.model_config,
        load_config=self.load_config,
        device_config=self.device_config,
        lora_config=self.lora_config,
        parallel_config=self.parallel_config,
        scheduler_config=self.scheduler_config,
        cache_config=self.cache_config,
    )
    self.model = model
def
make_model_input_from_broadcasted_tensor_dict
(
self
,
tensor_dict
:
Dict
[
str
,
Any
],
)
->
CPU
ModelInput
:
return
CPU
ModelInput
.
from_broadcasted_tensor_dict
(
)
->
ModelInput
ForCPU
:
return
ModelInput
ForCPU
.
from_broadcasted_tensor_dict
(
tensor_dict
,
attn_backend
=
self
.
attn_backend
,
)
def _prepare_model_input_tensors(
    self,
    seq_group_metadata_list: List[SequenceGroupMetadata],
    finished_requests_ids: Optional[List[str]] = None
) -> ModelInputForCPUWithSamplingMetadata:
    """Helper method to prepare the model input for a list of sequence
    groups. Prepares metadata needed for the base model forward pass but
    not metadata for possible additional steps, e.g., sampling.

    A fresh ``self._builder_cls`` instance is created with a weak proxy to
    this runner, fed every sequence group, and asked to ``build()``.
    """
    builder_cls = self._builder_cls
    # The builder receives a weakref proxy rather than the runner itself.
    input_builder = builder_cls(weakref.proxy(self), finished_requests_ids)
    for group_metadata in seq_group_metadata_list:
        input_builder.add_seq_group(group_metadata)

    return input_builder.build()  # type: ignore
def
prepare_model_input
(
self
,
seq_group_metadata_list
:
List
[
SequenceGroupMetadata
],
virtual_engine
:
int
=
0
,
finished_requests_ids
:
Optional
[
List
[
str
]]
=
None
)
->
CPUModelInput
:
multi_modal_kwargs
=
None
# NOTE: We assume that all sequences in the group are all prompts or
# all decodes.
is_prompt
=
seq_group_metadata_list
[
0
].
is_prompt
# Prepare input tensors.
if
is_prompt
:
(
input_tokens
,
input_positions
,
attn_metadata
,
seq_lens
,
multi_modal_kwargs
)
=
self
.
_prepare_prompt
(
seq_group_metadata_list
)
else
:
(
input_tokens
,
input_positions
,
attn_metadata
)
=
self
.
_prepare_decode
(
seq_group_metadata_list
)
seq_lens
=
[]
sampling_metadata
=
SamplingMetadata
.
prepare
(
seq_group_metadata_list
,
seq_lens
,
# query_lens is not needed if chunked prefill is not
# supported. Since CPU worker doesn't support chunked prefill
# just use seq_lens instead.
seq_lens
,
self
.
device
,
pin_memory
=
False
,
generators
=
self
.
get_generators
(
finished_requests_ids
))
return
CPUModelInput
(
input_tokens
=
input_tokens
,
input_positions
=
input_positions
,
attn_metadata
=
attn_metadata
,
sampling_metadata
=
sampling_metadata
,
multi_modal_kwargs
=
multi_modal_kwargs
,
)
self
,
seq_group_metadata_list
:
List
[
SequenceGroupMetadata
],
virtual_engine
:
int
=
0
,
finished_requests_ids
:
Optional
[
List
[
str
]]
=
None
)
->
ModelInputForCPUWithSamplingMetadata
:
"""Prepare the model input based on a given sequence group, including
metadata for the sampling step.
"""
model_input
=
self
.
_prepare_model_input_tensors
(
seq_group_metadata_list
,
finished_requests_ids
)
# Sampling metadata is only required for the final pp group
generators
=
self
.
get_generators
(
finished_requests_ids
)
sampling_metadata
=
SamplingMetadata
.
prepare
(
seq_group_metadata_list
,
model_input
.
seq_lens
,
model_input
.
query_lens
,
self
.
device
,
pin_memory
=
False
,
generators
=
generators
)
return
dataclasses
.
replace
(
model_input
,
sampling_metadata
=
sampling_metadata
,
virtual_engine
=
virtual_engine
)
@
torch
.
no_grad
()
def
execute_model
(
self
,
model_input
:
CPU
ModelInput
,
model_input
:
ModelInput
ForCPUWithSamplingMetadata
,
kv_caches
:
List
[
torch
.
Tensor
],
intermediate_tensors
:
Optional
[
IntermediateTensors
]
=
None
,
num_steps
:
int
=
1
,
...
...
@@ -372,6 +530,8 @@ class CPUModelRunner(ModelRunnerBase[CPUModelInput]):
model_input
.
attn_metadata
,
**
MultiModalInputs
.
as_kwargs
(
model_input
.
multi_modal_kwargs
or
{},
device
=
self
.
device
),
"intermediate_tensors"
:
intermediate_tensors
,
}
hidden_states
=
model_executable
(
**
execute_model_kwargs
)
...
...
vllm/worker/enc_dec_model_runner.py
View file @
539aa992
import
dataclasses
import
itertools
from
typing
import
Any
,
Dict
,
List
,
Optional
,
Tuple
,
Type
,
cast
import
torch
...
...
@@ -17,14 +18,16 @@ from vllm.inputs import INPUT_REGISTRY, InputRegistry
from
vllm.logger
import
init_logger
from
vllm.model_executor
import
SamplingMetadata
from
vllm.model_executor.layers.sampler
import
SamplerOutput
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
,
MultiModalRegistry
from
vllm.multimodal
import
(
MULTIMODAL_REGISTRY
,
MultiModalInputs
,
MultiModalRegistry
)
from
vllm.sampling_params
import
SamplingParams
from
vllm.sequence
import
(
IntermediateTensors
,
PoolerOutput
,
SequenceGroupMetadata
)
from
vllm.utils
import
STR_NOT_IMPL_ENC_DEC_BACKEND
,
make_tensor_with_pad
from
vllm.worker.model_runner
import
(
GPUModelRunnerBase
,
ModelInputForGPUBuilder
,
ModelInputForGPUWithSamplingMetadata
)
ModelInputForGPUWithSamplingMetadata
,
_get_graph_batch_size
)
from
vllm.worker.model_runner_base
import
(
_add_attn_metadata_broadcastable_dict
,
_add_sampling_metadata_broadcastable_dict
)
...
...
@@ -50,6 +53,7 @@ class EncoderDecoderModelInput(ModelInputForGPUWithSamplingMetadata):
"virtual_engine"
:
self
.
virtual_engine
,
"request_ids_to_seq_ids"
:
self
.
request_ids_to_seq_ids
,
"finished_requests_ids"
:
self
.
finished_requests_ids
,
"multi_modal_kwargs"
:
self
.
multi_modal_kwargs
,
}
_add_attn_metadata_broadcastable_dict
(
tensor_dict
,
self
.
attn_metadata
)
_add_sampling_metadata_broadcastable_dict
(
tensor_dict
,
...
...
@@ -178,12 +182,22 @@ class EncoderDecoderModelRunner(GPUModelRunnerBase[EncoderDecoderModelInput]):
raise
ValueError
(
"num_steps > 1 is not supported in "
"EncoderDecoderModelRunner"
)
model_executable
=
self
.
model
if
(
model_input
.
attn_metadata
is
not
None
and
model_input
.
attn_metadata
.
prefill_metadata
is
None
and
model_input
.
attn_metadata
.
decode_metadata
.
use_cuda_graph
):
assert
model_input
.
input_tokens
is
not
None
graph_batch_size
=
model_input
.
input_tokens
.
shape
[
0
]
model_executable
=
self
.
graph_runners
[
model_input
.
virtual_engine
][
graph_batch_size
]
else
:
model_executable
=
self
.
model
seqlen_agnostic_kwargs
=
{
"finished_requests_ids"
:
model_input
.
finished_requests_ids
,
"request_ids_to_seq_ids"
:
model_input
.
request_ids_to_seq_ids
,
}
if
self
.
has_seqlen_agnostic
else
{}
multi_modal_kwargs
=
model_input
.
multi_modal_kwargs
or
{}
hidden_or_intermediate_states
=
model_executable
(
input_ids
=
model_input
.
input_tokens
,
positions
=
model_input
.
input_positions
,
...
...
@@ -192,6 +206,8 @@ class EncoderDecoderModelRunner(GPUModelRunnerBase[EncoderDecoderModelInput]):
kv_caches
=
kv_caches
,
attn_metadata
=
model_input
.
attn_metadata
,
intermediate_tensors
=
intermediate_tensors
,
**
MultiModalInputs
.
as_kwargs
(
multi_modal_kwargs
,
device
=
self
.
device
),
**
seqlen_agnostic_kwargs
)
logits
=
self
.
model
.
compute_logits
(
hidden_or_intermediate_states
,
...
...
@@ -200,6 +216,9 @@ class EncoderDecoderModelRunner(GPUModelRunnerBase[EncoderDecoderModelInput]):
if
not
self
.
is_driver_worker
:
return
[]
if
model_input
.
async_callback
is
not
None
:
model_input
.
async_callback
()
# Sample the next token.
output
:
SamplerOutput
=
self
.
model
.
sample
(
logits
=
logits
,
...
...
@@ -231,14 +250,12 @@ class EncoderDecoderModelRunner(GPUModelRunnerBase[EncoderDecoderModelInput]):
"""
model_input
=
self
.
_prepare_model_input_tensors
(
seq_group_metadata_list
,
finished_requests_ids
)
(
attn_metadata
,
encoder_input_tokens_tensor
,
encoder_input_positions_tensor
,
)
=
(
self
.
_prepare_encoder_model_input_tensors
(
seq_group_metadata_list
,
model_input
))
# Inject attn_metadata encoder/cross-attention fields &
# encoder input tokens/positions into model_input.
# Frozen dataclass fields cannot be modified, so use
...
...
@@ -277,8 +294,7 @@ class EncoderDecoderModelRunner(GPUModelRunnerBase[EncoderDecoderModelInput]):
max_mm_tokens
=
self
.
mm_registry
.
get_max_multimodal_tokens
(
self
.
model_config
)
if
max_mm_tokens
>
0
:
raise
NotImplementedError
(
"Multi-modal encoder-decoder models are not supported yet"
)
logger
.
info
(
"Starting profile run for multi-modal models."
)
batch_size
=
0
for
group_id
in
range
(
max_num_seqs
):
...
...
@@ -286,24 +302,39 @@ class EncoderDecoderModelRunner(GPUModelRunnerBase[EncoderDecoderModelInput]):
(
group_id
<
max_num_batched_tokens
%
max_num_seqs
))
batch_size
+=
seq_len
seq_data
,
_
=
self
.
input_registry
\
.
dummy_data_for_profiling
(
self
.
model_config
,
decoder_seq_data
,
decoder_dummy_multi_modal_data
\
=
self
.
input_registry
.
dummy_data_for_profiling
(
self
.
model_config
,
seq_len
,
self
.
mm_registry
)
self
.
mm_registry
,
is_encoder_data
=
False
)
encoder_seq_data
,
encoder_dummy_multi_modal_data
\
=
self
.
input_registry
.
dummy_data_for_profiling
(
self
.
model_config
,
seq_len
,
self
.
mm_registry
,
is_encoder_data
=
True
)
# Having more tokens is over-conservative but otherwise fine
assert
len
(
seq_data
.
prompt_token_ids
)
>=
seq_len
,
(
assert
len
(
decoder_
seq_data
.
prompt_token_ids
)
>=
seq_len
,
(
f
"Expected at least
{
seq_len
}
dummy tokens for profiling, "
f
"but got:
{
len
(
seq_data
.
prompt_token_ids
)
}
"
)
f
"but got:
{
len
(
decoder_seq_data
.
prompt_token_ids
)
}
"
)
assert
decoder_dummy_multi_modal_data
is
None
or
\
encoder_dummy_multi_modal_data
is
None
,
(
"Multi-modal data can't be provided in both encoder and decoder"
)
seq
=
SequenceGroupMetadata
(
request_id
=
str
(
group_id
),
is_prompt
=
True
,
seq_data
=
{
group_id
:
seq_data
},
seq_data
=
{
group_id
:
decoder_
seq_data
},
sampling_params
=
sampling_params
,
block_tables
=
None
,
encoder_seq_data
=
seq_data
,
encoder_seq_data
=
encoder_
seq_data
,
cross_block_table
=
None
,
multi_modal_data
=
decoder_dummy_multi_modal_data
or
encoder_dummy_multi_modal_data
,
)
seqs
.
append
(
seq
)
...
...
@@ -424,24 +455,42 @@ class EncoderDecoderModelRunner(GPUModelRunnerBase[EncoderDecoderModelInput]):
encoder_input_tokens_tensor
=
self
.
_empty_long_tensor
()
encoder_input_positions_tensor
=
self
.
_empty_long_tensor
()
cross_slot_mapping_tensor
=
self
.
_empty_long_tensor
()
# Extract cross-attention block tables &
# seq len from each sequence group metadata.
# Cross-attention block tables are empty
# during vLLM memory profiling.
cross_block_tables
=
[]
for
seq_group_metadata
in
seq_group_metadata_list
:
encoder_seq_lens
.
append
(
seq_group_metadata
.
encoder_seq_data
.
get_len
())
cross_block_table
=
seq_group_metadata
.
cross_block_table
cross_block_tables
.
append
([]
if
(
cross_block_table
is
None
)
else
cross_block_table
)
for
_
in
range
(
len
(
seq_group_metadata
.
seq_data
)):
encoder_seq_lens
.
append
(
seq_group_metadata
.
encoder_seq_data
.
get_len
())
cross_block_table
=
seq_group_metadata
.
cross_block_table
cross_block_tables
.
append
([]
if
(
cross_block_table
is
None
)
else
cross_block_table
)
if
(
model_input
.
attn_metadata
is
not
None
and
model_input
.
attn_metadata
.
use_cuda_graph
):
# We will be using CUDA graph replay for this decode.
max_len_of_block_table
=
self
.
get_max_block_per_batch
()
batch_size
=
len
(
encoder_seq_lens
)
graph_batch_size
=
_get_graph_batch_size
(
batch_size
)
assert
graph_batch_size
>=
batch_size
cuda_graph_pad_size
=
graph_batch_size
-
batch_size
# extend the cross_block_tables and encoder_seq_lens to match
# the graph_batch_size.
cross_block_tables
.
extend
([[]
for
_
in
range
(
cuda_graph_pad_size
)
])
encoder_seq_lens
.
extend
(
itertools
.
repeat
(
1
,
cuda_graph_pad_size
))
else
:
max_len_of_block_table
=
max
(
len
(
block_table
)
for
block_table
in
cross_block_tables
)
# Convert cross-attention block tables to encoder input tensor
cross_block_tables
=
make_tensor_with_pad
(
cross_block_tables
,
max_len
=
max
(
len
(
block_table
)
for
block_table
in
cross_block_tables
),
max_len
=
max_len_of_block_table
,
pad
=
0
,
dtype
=
torch
.
int32
,
device
=
self
.
device
,
...
...
vllm/worker/model_runner.py
View file @
539aa992
...
...
@@ -45,7 +45,7 @@ from vllm.prompt_adapter.worker_manager import (
LRUCacheWorkerPromptAdapterManager
)
from
vllm.sampling_params
import
SamplingParams
from
vllm.sequence
import
IntermediateTensors
,
SequenceGroupMetadata
from
vllm.utils
import
(
Cuda
MemoryProfiler
,
PyObjectCache
,
async_tensor_h2d
,
from
vllm.utils
import
(
Device
MemoryProfiler
,
PyObjectCache
,
async_tensor_h2d
,
flatten_2d_lists
,
is_hip
,
is_pin_memory_available
,
supports_dynamo
)
from
vllm.worker.model_runner_base
import
(
...
...
@@ -243,6 +243,7 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
prefix_cache_hit
:
bool
=
False
,
reinit
:
bool
=
False
,
reinit_use_defaults
:
bool
=
False
,
encoder_seq_len
:
int
=
0
,
):
if
reinit
:
assert
len
(
self
.
seq_ids
)
==
len
(
seq_ids
)
# type: ignore
...
...
@@ -256,6 +257,7 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
self
.
block_tables
=
block_tables
self
.
computed_block_nums
=
computed_block_nums
self
.
n_seqs
=
n_seqs
self
.
encoder_seq_len
=
encoder_seq_len
if
reinit
:
if
len
(
self
.
seq_ids
)
==
1
and
reinit_use_defaults
:
...
...
@@ -702,6 +704,11 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
assert
n_seqs
==
1
self
.
decode_only
=
False
encoder_seq_len
=
0
if
self
.
runner
.
model_config
.
is_encoder_decoder_model
:
encoder_seq_len
=
seq_group_metadata
.
encoder_seq_data
.
get_len
()
inter_data
=
self
.
init_cached_inter_data
(
request_id
=
seq_group_metadata
.
request_id
,
seq_ids
=
seq_ids
,
...
...
@@ -709,7 +716,8 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
block_tables
=
seq_group_metadata
.
block_tables
,
computed_block_nums
=
seq_group_metadata
.
computed_block_nums
,
reinit
=
True
,
reinit_use_defaults
=
True
)
reinit_use_defaults
=
True
,
encoder_seq_len
=
encoder_seq_len
)
self
.
inter_data_list
.
append
(
inter_data
)
...
...
@@ -719,11 +727,15 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
for
per_seq_group_fn
in
self
.
per_seq_group_compute_fns
:
per_seq_group_fn
(
inter_data
,
seq_group_metadata
)
def
_use_captured_graph
(
self
,
batch_size
:
int
,
max_decode_seq_len
:
int
)
->
bool
:
def
_use_captured_graph
(
self
,
batch_size
:
int
,
max_decode_seq_len
:
int
,
max_encoder_seq_len
:
int
=
0
)
->
bool
:
return
(
self
.
decode_only
and
not
self
.
runner
.
model_config
.
enforce_eager
and
batch_size
<=
self
.
runner
.
max_batchsize_to_capture
and
max_decode_seq_len
<=
self
.
runner
.
max_seq_len_to_capture
)
and
batch_size
<=
_BATCH_SIZES_TO_CAPTURE
[
-
1
]
and
max_decode_seq_len
<=
self
.
runner
.
max_seq_len_to_capture
and
max_encoder_seq_len
<=
self
.
runner
.
max_seq_len_to_capture
and
batch_size
<=
self
.
runner
.
max_batchsize_to_capture
)
def
build
(
self
)
->
ModelInputForGPU
:
"""Finalize the builder intermediate data and
...
...
@@ -763,15 +775,18 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
input_positions
.
extend
(
cur_input_positions
)
seq_lens
=
[]
query_lens
=
[]
max_decode_seq_len
=
0
max_encoder_seq_len
=
0
for
inter_data
in
self
.
inter_data_list
:
seq_lens
.
extend
(
inter_data
.
seq_lens
)
query_lens
.
extend
(
inter_data
.
query_lens
)
if
not
inter_data
.
is_prompt
:
max_decode_seq_len
=
max
(
max_decode_seq_len
,
max
(
inter_data
.
seq_lens
))
query_lens
=
[]
for
inter_data
in
self
.
inter_data_list
:
query_lens
.
extend
(
inter_data
.
query
_len
s
)
if
self
.
runner
.
model_config
.
is_encoder_decoder_model
:
max_encoder_seq_len
=
max
(
max_encoder_seq_len
,
inter_data
.
encoder_seq
_len
)
# Mapping from request IDs to sequence IDs. Used for Jamba models
# that manage the cache by themselves.
...
...
@@ -781,8 +796,10 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
}
batch_size
=
len
(
input_tokens
)
use_captured_graph
=
self
.
_use_captured_graph
(
batch_size
,
max_decode_seq_len
)
use_captured_graph
=
self
.
_use_captured_graph
(
batch_size
,
max_decode_seq_len
,
max_encoder_seq_len
=
max_encoder_seq_len
)
# If cuda graph can be used, pad tensors accordingly.
# See `capture_model` API for more details.
...
...
@@ -995,7 +1012,7 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
def
load_model
(
self
)
->
None
:
logger
.
info
(
"Starting to load model %s..."
,
self
.
model_config
.
model
)
with
Cuda
MemoryProfiler
()
as
m
:
with
Device
MemoryProfiler
()
as
m
:
self
.
model
=
get_model
(
model_config
=
self
.
model_config
,
device_config
=
self
.
device_config
,
load_config
=
self
.
load_config
,
...
...
@@ -1064,8 +1081,9 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
"This may lead to less accurate results!"
)
if
envs
.
VLLM_TEST_DYNAMO_GRAPH_CAPTURE
and
supports_dynamo
():
from
vllm.compilation.backends
import
vllm_backend
from
vllm.plugins
import
get_torch_compile_backend
backend
=
get_torch_compile_backend
()
or
"eager"
backend
=
get_torch_compile_backend
()
or
vllm_backend
self
.
model
=
torch
.
compile
(
self
.
model
,
fullgraph
=
envs
.
VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE
,
...
...
@@ -1363,7 +1381,9 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
for
batch_size
in
reversed
(
batch_size_capture_list
):
attn_metadata
=
(
self
.
attn_state
.
graph_capture_get_metadata_for_batch
(
batch_size
))
batch_size
,
is_encoder_decoder_model
=
self
.
model_config
.
is_encoder_decoder_model
))
if
self
.
lora_config
:
lora_mapping
=
LoRAMapping
(
...
...
@@ -1379,10 +1399,10 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
)
self
.
set_active_prompt_adapters
(
set
(),
prompt_adapter_mapping
)
graph_runner
=
CUDAGraphRunner
(
self
.
model
,
self
.
attn_backend
.
get_name
(),
self
.
attn_state
.
graph_clone
(
batch_size
))
self
.
attn_state
.
graph_clone
(
batch_size
),
self
.
model_config
.
is_encoder_decoder_model
)
capture_inputs
=
{
"input_ids"
:
...
...
@@ -1419,6 +1439,12 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
self
.
model
.
get_seqlen_agnostic_capture_inputs
(
batch_size
)
})
if
self
.
model_config
.
is_encoder_decoder_model
:
# add the additional inputs to capture for
# encoder-decoder models.
self
.
_update_inputs_to_capture_for_enc_dec_model
(
capture_inputs
)
graph_runner
.
capture
(
**
capture_inputs
)
self
.
graph_memory_pool
=
graph_runner
.
graph
.
pool
()
self
.
graph_runners
[
virtual_engine
][
batch_size
]
=
(
...
...
@@ -1429,6 +1455,24 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
# This usually takes < 10 seconds.
logger
.
info
(
"Graph capturing finished in %.0f secs."
,
elapsed_time
)
def _update_inputs_to_capture_for_enc_dec_model(self,
                                                capture_inputs: Dict[str,
                                                                     Any]):
    """
    Add the encoder-side inputs required for CUDA graph capture of an
    encoder-decoder model.

    The `capture_inputs` dictionary is modified in place: an empty
    `torch.long` tensor moved to the CUDA device is stored under each of
    the keys `"encoder_input_ids"` and `"encoder_positions"`. Nothing is
    returned.
    """
    # encoder_input_ids and encoder_positions are unset during the decode
    # phase, so graph capture mirrors that with empty tensors. Each key
    # gets its own freshly created tensor.
    for encoder_key in ("encoder_input_ids", "encoder_positions"):
        capture_inputs[encoder_key] = torch.tensor([],
                                                   dtype=torch.long).cuda()
@
property
def
vocab_size
(
self
)
->
int
:
return
self
.
model_config
.
get_vocab_size
()
...
...
@@ -1628,7 +1672,7 @@ class ModelRunner(GPUModelRunnerBase[ModelInputForGPUWithSamplingMetadata]):
class
CUDAGraphRunner
:
def
__init__
(
self
,
model
:
nn
.
Module
,
backend_name
:
str
,
attn_state
:
AttentionState
):
attn_state
:
AttentionState
,
is_encoder_decoder_model
:
bool
):
self
.
model
=
model
self
.
backend_name
=
backend_name
self
.
attn_state
=
attn_state
...
...
@@ -1637,6 +1681,7 @@ class CUDAGraphRunner:
self
.
output_buffers
:
Dict
[
str
,
torch
.
Tensor
]
=
{}
self
.
_graph
:
Optional
[
torch
.
cuda
.
CUDAGraph
]
=
None
self
.
_is_encoder_decoder_model
=
is_encoder_decoder_model
@
property
def
graph
(
self
):
...
...
@@ -1670,8 +1715,9 @@ class CUDAGraphRunner:
intermediate_tensors
=
intermediate_inputs
,
**
kwargs
,
)
# Wait for the warm up operations to finish before proceeding with
# Graph Capture.
torch
.
cuda
.
synchronize
()
# Capture the graph.
self
.
_graph
=
torch
.
cuda
.
CUDAGraph
()
with
torch
.
cuda
.
graph
(
self
.
_graph
,
pool
=
memory_pool
,
stream
=
stream
):
...
...
@@ -1703,10 +1749,14 @@ class CUDAGraphRunner:
# Save the input and output buffers.
self
.
input_buffers
=
{
"input_ids"
:
input_ids
,
"positions"
:
positions
,
"kv_caches"
:
kv_caches
,
**
self
.
attn_state
.
get_graph_input_buffers
(
attn_metadata
),
"input_ids"
:
input_ids
,
"positions"
:
positions
,
"kv_caches"
:
kv_caches
,
**
self
.
attn_state
.
get_graph_input_buffers
(
attn_metadata
,
self
.
_is_encoder_decoder_model
),
**
kwargs
,
}
if
intermediate_inputs
is
not
None
:
...
...
@@ -1736,8 +1786,8 @@ class CUDAGraphRunner:
self
.
input_buffers
[
"positions"
].
copy_
(
positions
,
non_blocking
=
True
)
self
.
input_buffers
[
"slot_mapping"
].
copy_
(
attn_metadata
.
slot_mapping
,
non_blocking
=
True
)
self
.
attn_state
.
prepare_graph_input_buffers
(
self
.
input_buffers
,
attn_metadata
)
self
.
attn_state
.
prepare_graph_input_buffers
(
self
.
input_buffers
,
attn_metadata
,
self
.
_is_encoder_decoder_model
)
if
"seqlen_agnostic_capture_inputs"
in
self
.
input_buffers
:
self
.
model
.
copy_inputs_before_cuda_graphs
(
self
.
input_buffers
,
**
kwargs
)
...
...
@@ -1751,6 +1801,12 @@ class CUDAGraphRunner:
if
key
!=
"model_execute_time"
and
key
!=
"model_forward_time"
:
self
.
input_buffers
[
key
].
copy_
(
intermediate_tensors
[
key
],
non_blocking
=
True
)
if
self
.
_is_encoder_decoder_model
:
self
.
input_buffers
[
"encoder_input_ids"
].
copy_
(
kwargs
[
'encoder_input_ids'
],
non_blocking
=
True
)
self
.
input_buffers
[
"encoder_positions"
].
copy_
(
kwargs
[
'encoder_positions'
],
non_blocking
=
True
)
# Run the graph.
self
.
graph
.
replay
()
# Return the output tensor.
...
...
vllm/worker/model_runner_base.py
View file @
539aa992
...
...
@@ -3,11 +3,13 @@ import pickle
from
abc
import
ABC
,
abstractmethod
from
datetime
import
datetime
from
functools
import
wraps
from
typing
import
(
TYPE_CHECKING
,
Any
,
Dict
,
Generic
,
List
,
Optional
,
Type
,
TypeVar
)
from
typing
import
(
TYPE_CHECKING
,
Any
,
Dict
,
Generic
,
Iterable
,
List
,
Optional
,
Type
,
TypeVar
)
import
torch
from
torch
import
is_tensor
from
vllm.logger
import
init_logger
from
vllm.model_executor.layers.sampler
import
SamplerOutput
from
vllm.platforms
import
current_platform
from
vllm.sequence
import
IntermediateTensors
,
SequenceGroupMetadata
...
...
@@ -17,6 +19,8 @@ if TYPE_CHECKING:
from
vllm.attention.backends.abstract
import
AttentionBackend
from
vllm.model_executor
import
SamplingMetadata
logger
=
init_logger
(
__name__
)
T
=
TypeVar
(
'T'
,
bound
=
"BroadcastableModelInput"
)
...
...
@@ -113,6 +117,8 @@ def dump_input_when_exception(exclude_args: Optional[List[int]] = None,
except
Exception
as
err
:
timestamp
=
datetime
.
now
().
strftime
(
"%Y%m%d-%H%M%S"
)
filename
=
f
"/tmp/err_
{
func
.
__name__
}
_input_
{
timestamp
}
.pkl"
logger
.
info
(
"Writing input of failed execution to %s..."
,
filename
)
with
open
(
filename
,
"wb"
)
as
filep
:
dumped_inputs
=
{
k
:
v
...
...
@@ -122,7 +128,27 @@ def dump_input_when_exception(exclude_args: Optional[List[int]] = None,
for
i
,
arg
in
enumerate
(
args
):
if
i
not
in
(
exclude_args
or
[]):
dumped_inputs
[
f
"arg_
{
i
}
"
]
=
arg
pickle
.
dump
(
dumped_inputs
,
filep
)
# Only persist dtype and shape for kvcache tensors
# (can be way too big otherwise)
if
(
kv_caches
:
=
dumped_inputs
.
get
(
"kv_caches"
))
\
and
isinstance
(
kv_caches
,
Iterable
):
dumped_inputs
[
"kv_caches"
]
=
[(
t
.
dtype
,
t
.
shape
)
for
t
in
kv_caches
if
is_tensor
(
t
)]
try
:
pickle
.
dump
(
dumped_inputs
,
filep
)
except
Exception
as
pickle_err
:
logger
.
warning
(
"Failed to pickle inputs of failed execution: %s"
,
str
(
pickle_err
))
raise
type
(
err
)(
f
"Error in model execution: "
f
"
{
str
(
err
)
}
"
)
from
err
logger
.
info
(
"Completed writing input of failed execution to %s."
,
filename
)
raise
type
(
err
)(
f
"Error in model execution (input dumped to
{
filename
}
): "
f
"
{
str
(
err
)
}
"
)
from
err
...
...
vllm/worker/multi_step_model_runner.py
View file @
539aa992
...
...
@@ -29,7 +29,7 @@ if TYPE_CHECKING:
logger
=
init_logger
(
__name__
)
MULTI_STEP_ATTENTION_BACKENDS
=
[
"flash-attn"
,
"flashinfer"
]
MULTI_STEP_ATTENTION_BACKENDS
=
[
"flash-attn"
,
"rocm-flash-attn"
,
"flashinfer"
]
def
seq_output_builder
():
...
...
@@ -614,34 +614,66 @@ def _pythonize_sampler_output(
frozen_model_input
=
model_input
.
frozen_model_input
assert
frozen_model_input
.
sampling_metadata
is
not
None
sampling_metadata
=
frozen_model_input
.
sampling_metadata
# samples generation should have been skipped
assert
not
output
.
outputs
pinned_buffer
=
pinned_sampled_token_buffer
[:
model_input
.
num_queries
]
# CPU GPU sync
pinned_buffer
=
pinned_buffer
.
copy_
(
sampled_token_ids
,
non_blocking
=
False
)
# We guarantee output tensors are ready, so it is safe to
# pythonize the sampler output & obtain CPU-side logprobs.
#
# However we should check whether logprobs pythonization may
# be skipped entirely, i.e. because no logprobs were requested
# or pythonization was not deferred. To that end,
#
# * `prompt_logprobs_are_requested_for_prefill` signals that
# there are *any* prefill-phase requests which specify that
# prompt logprobs should be returned.
#
# * `any_logprobs_are_requested` signals that there are any
# requests which (1) specify that sample logprobs should be
# returned, or (2) are in the prefill phase AND specify that
# prompt logprobs should be returned.
#
# Later on, these flags cause adjustments to the pythonization
# process to accommodate logprobs.
seq_groups
=
sampling_metadata
.
seq_groups
prompt_logprobs_are_requested_for_prefill
=
any
([
sg
.
sampling_params
.
prompt_logprobs
is
not
None
and
sg
.
is_prompt
for
sg
in
seq_groups
])
any_logprobs_are_requested
=
(
prompt_logprobs_are_requested_for_prefill
or
any
([
sg
.
sampling_params
.
logprobs
is
not
None
for
sg
in
seq_groups
]))
if
prompt_logprobs_are_requested_for_prefill
:
# CPU GPU sync, after gathering *only* sampled tokens (since
# requesting prompt logprobs leads `sampled_token_ids` to
# include prompt token ids in addition to sampled token ids.)
sample_idx_tensor
=
torch
.
tensor
(
[
sdx
for
sg
in
seq_groups
for
sdx
in
sg
.
sample_indices
])
pinned_buffer
=
pinned_buffer
.
copy_
(
sampled_token_ids
[
sample_idx_tensor
,
:],
non_blocking
=
False
)
else
:
# CPU GPU sync
pinned_buffer
=
pinned_buffer
.
copy_
(
sampled_token_ids
,
non_blocking
=
False
)
# this will not block as the tensors are already on CPU
samples_list
=
pinned_buffer
.
tolist
()
sampling_metadata
=
frozen_model_input
.
sampling_metadata
skip_sampler_cpu_output
=
(
frozen_model_input
.
sampling_metadata
.
skip_sampler_cpu_output
)
# We are guaranteed output tensors are ready, so it is safe to
# pythonize the sampler output & obtain CPU-side logprobs.
#
# However this computation may be skipped entirely
# if no pythonization was deferred.
seq_groups
=
sampling_metadata
.
seq_groups
logprobs_are_requested
=
any
([
sg
.
sampling_params
.
logprobs
is
not
None
or
sg
.
sampling_params
.
prompt_logprobs
is
not
None
for
sg
in
seq_groups
])
# *Don't* skip logprobs pythonization *if*:
# * Any requests require logprobs to be returned in this
# iteration AND
# * These requests are being scheduled in a fashion which
# defers pythonization (i.e. multi-step scheduling.)
do_pythonize_logprobs
=
(
skip_sampler_cpu_output
and
logprobs_are_requested
)
and
any_
logprobs_are_requested
)
(
prompt_logprobs
,
sample_logprobs
,
...
...
@@ -666,7 +698,7 @@ def _pythonize_sampler_output(
prompt_logprobs
[
sgdx
],
sample_logprobs
[
sgdx
],
)
elif
logprobs_are_requested
:
elif
any_
logprobs_are_requested
:
(
group_prompt_logprobs
,
group_sample_logprobs
,
...
...
@@ -696,7 +728,7 @@ def _pythonize_sampler_output(
seq_output
.
parent_seq_id
=
seq_ids
[
parent_id
]
seq_output
.
output_token
=
next_token_id
if
logprobs_are_requested
:
if
any_
logprobs_are_requested
:
seq_output
.
logprobs
=
group_sample_logprobs
[
tdx
]
else
:
logprobs
=
next
(
iter
(
seq_output
.
logprobs
.
values
()))
...
...
@@ -714,7 +746,7 @@ def _pythonize_sampler_output(
seq_outputs
.
append
(
SequenceOutput
(
seq_ids
[
parent_id
],
next_token_id
,
(
group_sample_logprobs
[
tdx
]
if
logprobs_are_requested
else
{
if
any_
logprobs_are_requested
else
{
next_token_id
:
Logprob
(
logprob
=
float
(
'inf'
),
rank
=
None
,
...
...
@@ -722,12 +754,12 @@ def _pythonize_sampler_output(
})))
if
cache
is
not
None
:
completion_seq_group_output
.
prompt_logprobs
=
\
group_prompt_logprobs
if
logprobs_are_requested
else
None
group_prompt_logprobs
if
any_
logprobs_are_requested
else
None
output
.
outputs
.
append
(
completion_seq_group_output
)
else
:
output
.
outputs
.
append
(
CompletionSequenceGroupOutput
(
seq_outputs
,
(
group_prompt_logprobs
if
logprobs_are_requested
else
None
)))
if
any_
logprobs_are_requested
else
None
)))
assert
len
(
output
.
outputs
)
>
0
vllm/worker/multi_step_tpu_worker.py
0 → 100644
View file @
539aa992
import
dataclasses
from
typing
import
Dict
,
Optional
,
Tuple
import
torch
from
vllm.distributed
import
broadcast_tensor_dict
from
vllm.sequence
import
ExecuteModelRequest
from
vllm.worker.tpu_model_runner
import
ModelInputForTPU
from
vllm.worker.tpu_worker
import
TPUWorker
from
vllm.worker.worker_base
import
WorkerInput
class MultiStepTPUWorker(TPUWorker):
    """TPU worker that reuses one model input across a multi-step run.

    The model input prepared on the first step is stored in
    `cached_model_input`. On later steps it is reused with refreshed
    `is_first_multi_step` / `is_last_step` flags, and only those two flags
    are broadcast to the other workers.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Set by `prepare_input`; stays None until a first step has run.
        self.cached_model_input: Optional[ModelInputForTPU] = None

    def _get_driver_input_and_broadcast(
        self, execute_model_req: ExecuteModelRequest
    ) -> Tuple[ModelInputForTPU, WorkerInput, Dict[str, torch.Tensor]]:
        """Build the driver-side inputs for one step and broadcast them.

        On the first step of a multi-step run the worker and model inputs
        are prepared from `execute_model_req`; on later steps the cached
        model input and a default `WorkerInput()` are used instead. When
        `self.do_metadata_broadcast` is set, the first step broadcasts the
        full tensor dicts and later steps broadcast only the two step flags.

        Returns:
            `(model_input, worker_input, {})`.
        """
        assert self.is_driver_worker
        assert execute_model_req.virtual_engine == 0

        first_step = execute_model_req.is_first_multi_step
        last_step = execute_model_req.is_last_step

        if first_step:
            base_worker_input: WorkerInput = self.prepare_worker_input(
                execute_model_req=execute_model_req)
            # One step for the current token plus one per lookahead slot.
            worker_input = dataclasses.replace(
                base_worker_input,
                num_steps=execute_model_req.num_lookahead_slots + 1)
            model_input: ModelInputForTPU = (
                self.model_runner.prepare_model_input(
                    execute_model_req.seq_group_metadata_list,
                    execute_model_req.virtual_engine,
                    execute_model_req.finished_requests_ids))

            callback = execute_model_req.async_callback
            if callback:
                model_input = dataclasses.replace(model_input,
                                                  async_callback=callback)
        else:
            assert self.cached_model_input is not None
            model_input = self.cached_model_input
            worker_input = WorkerInput()

        # Stamp the current step flags onto whichever input was selected.
        model_input = dataclasses.replace(model_input,
                                          is_first_multi_step=first_step,
                                          is_last_step=last_step)

        if self.do_metadata_broadcast:
            if first_step:
                payload = worker_input.as_broadcastable_tensor_dict()
                payload.update(model_input.as_broadcastable_tensor_dict())
            else:
                # Receivers recognise this two-entry dict by its length
                # (see `prepare_input`).
                payload = {
                    "is_first_multi_step": first_step,
                    "is_last_step": last_step,
                }
            broadcast_tensor_dict(payload, src=0)

        # Returning an empty dict here to keep this compatible with
        # `LocalOrDistributedWorkerBase._get_driver_input_and_broadcast`
        return model_input, worker_input, {}

    def prepare_input(
        self,
        execute_model_req: Optional[ExecuteModelRequest] = None,
    ) -> Optional[Tuple[ModelInputForTPU, WorkerInput, Dict[str,
                                                            torch.Tensor]]]:
        """Produce the inputs for the next step on this worker.

        The driver worker builds (and possibly broadcasts) the inputs; any
        other worker receives them via `broadcast_tensor_dict(src=0)`.

        Returns:
            `(model_input, worker_input, {})`, or None when the driver was
            given no request or a non-driver received an empty broadcast.
        """
        if self.is_driver_worker:
            if execute_model_req is None:
                # An empty dict is broadcast; receivers return None on it.
                if self.do_metadata_broadcast:
                    broadcast_tensor_dict({}, src=0)
                return None

            model_input, worker_input, _ = (
                self._get_driver_input_and_broadcast(execute_model_req))
            if model_input.is_first_multi_step:
                self.cached_model_input = model_input
            return model_input, worker_input, {}

        received = broadcast_tensor_dict(src=0)
        if not received:
            return None

        if len(received) != 2:
            # Anything other than the two-flag dict is treated as a full
            # payload: rebuild both inputs and refresh the cache.
            worker_input = WorkerInput.from_broadcasted_tensor_dict(received)
            model_input = (
                self.model_runner.
                make_model_input_from_broadcasted_tensor_dict(received))
            self.cached_model_input = model_input
            return model_input, worker_input, {}

        # Exactly two entries: only the step flags were sent, so update the
        # cached model input with them.
        assert self.cached_model_input is not None
        self.cached_model_input = dataclasses.replace(
            self.cached_model_input,
            is_first_multi_step=received["is_first_multi_step"],
            is_last_step=received["is_last_step"])
        return self.cached_model_input, WorkerInput(), {}
vllm/worker/tpu_model_runner.py
View file @
539aa992
...
...
@@ -51,6 +51,8 @@ class ModelInputForTPU(ModelRunnerInputBase):
num_samples
:
int
best_of
:
List
[
int
]
seq_groups
:
List
[
List
[
int
]]
is_first_multi_step
:
bool
=
True
is_last_step
:
bool
=
True
virtual_engine
:
int
=
0
async_callback
:
Optional
[
Callable
]
=
None
...
...
@@ -65,6 +67,8 @@ class ModelInputForTPU(ModelRunnerInputBase):
"num_samples"
:
self
.
num_samples
,
"best_of"
:
self
.
best_of
,
"seq_groups"
:
self
.
seq_groups
,
"is_first_multi_step"
:
self
.
is_first_multi_step
,
"is_last_step"
:
self
.
is_last_step
,
"virtual_engine"
:
self
.
virtual_engine
,
}
_add_attn_metadata_broadcastable_dict
(
tensor_dict
,
self
.
attn_metadata
)
...
...
@@ -118,6 +122,7 @@ class TPUModelRunner(ModelRunnerBase[ModelInputForTPU]):
self
.
block_size
,
False
,
)
self
.
cached_step_outputs
:
List
[
torch
.
Tensor
]
=
[]
def
load_model
(
self
)
->
None
:
self
.
device
=
self
.
device_config
.
device
...
...
@@ -518,97 +523,159 @@ class TPUModelRunner(ModelRunnerBase[ModelInputForTPU]):
num_steps
:
int
=
1
,
)
->
List
[
SamplerOutput
]:
assert
intermediate_tensors
is
None
if
num_steps
>
1
:
raise
ValueError
(
"TPUModelRunner does not support multi-step execution."
)
def
_execute_model
(
*
args
):
"""Move input args from CPU to device and execute the model."""
new_args
=
[]
for
arg
in
args
:
if
isinstance
(
arg
,
torch
.
Tensor
):
arg
=
arg
.
to
(
self
.
device
)
elif
isinstance
(
arg
,
AttentionMetadata
):
arg
.
slot_mapping
=
arg
.
slot_mapping
.
to
(
self
.
device
)
if
getattr
(
arg
,
"block_tables"
,
None
)
is
not
None
:
arg
.
block_tables
=
arg
.
block_tables
.
to
(
self
.
device
)
if
getattr
(
arg
,
"context_lens"
,
None
)
is
not
None
:
arg
.
context_lens
=
arg
.
context_lens
.
to
(
self
.
device
)
new_args
.
append
(
arg
)
return
self
.
model
(
*
new_args
,
is_prompt
=
is_prompt
)
num_prefills
=
model_input
.
attn_metadata
.
num_prefills
is_prompt
=
num_prefills
>
0
if
not
model_input
.
is_first_multi_step
:
if
not
model_input
.
is_last_step
:
return
[]
use_async_out_proc
=
model_input
.
async_callback
is
not
None
sampler_outputs
=
[]
num_outputs
=
len
(
self
.
cached_step_outputs
)
for
i
in
range
(
num_outputs
):
next_token_ids
=
self
.
cached_step_outputs
.
pop
(
0
)
next_token_ids
=
next_token_ids
.
cpu
().
tolist
()
sampler_output
=
_make_decode_output
(
next_token_ids
,
model_input
.
seq_groups
)
sampler_outputs
.
append
(
sampler_output
)
if
i
<
num_outputs
-
1
and
use_async_out_proc
:
assert
model_input
.
async_callback
is
not
None
ctx
=
model_input
.
async_callback
.
keywords
[
# type: ignore
"ctx"
]
ctx
.
append_output
(
outputs
=
[
sampler_output
],
seq_group_metadata_list
=
ctx
.
seq_group_metadata_list
,
scheduler_outputs
=
ctx
.
scheduler_outputs
,
is_async
=
False
,
is_last_step
=
False
)
model_input
.
async_callback
()
if
use_async_out_proc
:
return
[
sampler_outputs
[
-
1
]]
else
:
return
sampler_outputs
is_prompt
=
model_input
.
attn_metadata
.
num_prefills
>
0
if
is_prompt
:
assert
num_steps
==
1
# NOTE(woosuk): Since the FlashAttention kernel does not support
# ragged inputs, we split the prompts into different batches and
# process them separately. This is a temporary hack that should be
# optimized by using SplashAttention.
next_token_ids
=
[]
orig_slot_mapping
=
model_input
.
attn_metadata
.
slot_mapping
batch_size
=
model_input
.
input_lens
.
shape
[
0
]
start_idx
=
0
next_token_ids
=
[]
for
i
in
range
(
batch_size
):
# Get the actual prefill_len.
prefill_len
=
model_input
.
input_lens
[
i
:
i
+
1
].
item
()
prefill_len
=
_get_padded_prefill_len
(
prefill_len
)
end_idx
=
start_idx
+
prefill_len
model_input
.
attn_metadata
.
slot_mapping
=
orig_slot_mapping
[
None
,
start_idx
:
end_idx
]
model_input
.
attn_metadata
.
num_prefills
=
1
output_token_ids
=
_execute_model
(
model_input
.
token_ids
[
None
,
start_idx
:
end_idx
],
model_input
.
position_ids
[
None
,
start_idx
:
end_idx
],
model_input
.
attn_metadata
,
model_input
.
input_lens
[
i
:
i
+
1
],
model_input
.
t
[
i
:
i
+
1
],
model_input
.
p
[
i
:
i
+
1
],
model_input
.
num_samples
,
kv_caches
)
if
i
==
0
and
model_input
.
async_callback
is
not
None
:
model_input
.
async_callback
()
# Retrieve the outputs to CPU.
next_token_ids
+=
output_token_ids
.
cpu
().
tolist
()
token_ids
=
model_input
.
token_ids
[
None
,
start_idx
:
end_idx
].
to
(
self
.
device
)
position_ids
=
model_input
.
position_ids
[
None
,
start_idx
:
end_idx
].
to
(
self
.
device
)
attn_metadata
=
model_input
.
attn_metadata
attn_metadata
.
num_prefills
=
1
attn_metadata
.
slot_mapping
=
orig_slot_mapping
[
None
,
start_idx
:
end_idx
].
to
(
self
.
device
)
input_lens
=
model_input
.
input_lens
[
i
:
i
+
1
].
to
(
self
.
device
)
t
=
model_input
.
t
[
i
:
i
+
1
].
to
(
self
.
device
)
p
=
model_input
.
p
[
i
:
i
+
1
].
to
(
self
.
device
)
output_token_ids
=
self
.
model
(
token_ids
,
position_ids
,
attn_metadata
,
input_lens
,
t
,
p
,
model_input
.
num_samples
,
kv_caches
,
is_prompt
=
True
)
next_token_ids
.
append
(
output_token_ids
[
0
])
start_idx
=
end_idx
else
:
# Execute the model.
output_token_ids
=
_execute_model
(
model_input
.
token_ids
,
model_input
.
position_ids
,
model_input
.
attn_metadata
,
model_input
.
input_lens
,
model_input
.
t
,
model_input
.
p
,
model_input
.
num_samples
,
kv_caches
)
if
model_input
.
async_callback
is
not
None
:
model_input
.
async_callback
()
# Retrieve the outputs to CPU.
next_token_ids
=
output_token_ids
.
cpu
().
tolist
()
# NOTE(woosuk): Minimal code to construct the sampler outputs.
# The TPU backend does not reuse the sampler, since the TPU backend
# does not support the advanced sampling parameters such as logprobs.
zero_logprob
=
Logprob
(
0.0
)
batch_idx
=
0
sampler_outputs
=
[]
for
seq_group
in
model_input
.
seq_groups
:
s
eq_ids
=
seq_group
seq_outputs
=
[]
if
is_prompt
:
next_token_ids
=
[
output_token_ids
.
cpu
().
tolist
()
for
output_token_ids
in
next_token_ids
]
# NOTE(woosuk): Minimal code to construct the sampler outputs.
# The TPU backend does not reuse the sampler, since the TPU backend
# does not support advanced sampling parameters such as logprobs.
zero_logprob
=
Logprob
(
0.0
)
s
ampler_outputs
=
[]
for
i
,
seq_group
in
enumerate
(
model_input
.
seq_groups
):
seq_ids
=
seq_group
assert
len
(
seq_ids
)
==
1
seq_id
=
seq_ids
[
0
]
for
i
in
range
(
model_input
.
best_of
[
batch_idx
]):
next_token_id
=
next_token_ids
[
batch_idx
][
i
]
seq_outputs
=
[]
for
j
in
range
(
model_input
.
best_of
[
i
]):
next_token_id
=
next_token_ids
[
i
][
j
]
seq_outputs
.
append
(
SequenceOutput
(
seq_id
,
next_token_id
,
{
next_token_id
:
zero_logprob
}))
batch_idx
+=
1
else
:
for
seq_id
in
seq_ids
:
next_token_id
=
next_token_ids
[
batch_idx
]
seq_outputs
.
append
(
SequenceOutput
(
seq_id
,
next_token_id
,
{
next_token_id
:
zero_logprob
}))
batch_idx
+=
1
sampler_outputs
.
append
(
CompletionSequenceGroupOutput
(
seq_outputs
,
None
))
return
[
SamplerOutput
(
sampler_outputs
)]
sampler_outputs
.
append
(
CompletionSequenceGroupOutput
(
seq_outputs
,
None
))
return
[
SamplerOutput
(
sampler_outputs
)]
else
:
token_ids
=
model_input
.
token_ids
.
to
(
self
.
device
)
position_ids
=
model_input
.
position_ids
.
to
(
self
.
device
)
attn_metadata
=
model_input
.
attn_metadata
attn_metadata
.
slot_mapping
=
attn_metadata
.
slot_mapping
.
to
(
self
.
device
)
attn_metadata
.
block_tables
=
attn_metadata
.
block_tables
.
to
(
self
.
device
)
attn_metadata
.
context_lens
=
attn_metadata
.
context_lens
.
to
(
self
.
device
)
t
=
model_input
.
t
.
to
(
self
.
device
)
p
=
model_input
.
p
.
to
(
self
.
device
)
input_lens
=
model_input
.
input_lens
.
to
(
self
.
device
)
for
i
in
range
(
num_steps
):
slot_mapping
=
attn_metadata
.
slot_mapping
output_token_ids
=
self
.
model
(
token_ids
,
position_ids
,
attn_metadata
,
input_lens
,
t
,
p
,
model_input
.
num_samples
,
kv_caches
,
is_prompt
=
False
)
self
.
cached_step_outputs
.
append
(
output_token_ids
)
if
i
<
num_steps
-
1
:
# Prepare the inputs for the next step.
token_ids
=
output_token_ids
.
unsqueeze
(
dim
=
1
).
int
()
position_ids
=
position_ids
+
1
attn_metadata
.
context_lens
=
attn_metadata
.
context_lens
+
1
block_tables
=
attn_metadata
.
block_tables
block_number
=
block_tables
.
gather
(
1
,
position_ids
.
long
()
//
self
.
block_size
)
block_offset
=
position_ids
%
self
.
block_size
is_padding
=
slot_mapping
==
_PAD_SLOT_ID
slot_mapping
=
block_number
*
self
.
block_size
+
block_offset
slot_mapping
=
slot_mapping
.
long
()
slot_mapping
=
torch
.
where
(
is_padding
,
_PAD_SLOT_ID
,
slot_mapping
)
attn_metadata
.
slot_mapping
=
slot_mapping
if
model_input
.
async_callback
is
not
None
:
model_input
.
async_callback
()
if
num_steps
>
1
:
return
[]
# Retrieve the outputs to CPU.
next_token_ids
=
self
.
cached_step_outputs
.
pop
(
0
)
next_token_ids
=
next_token_ids
.
cpu
().
tolist
()
sampler_output
=
_make_decode_output
(
next_token_ids
,
model_input
.
seq_groups
)
return
[
sampler_output
]
class
ModelWrapper
(
TorchCompileWrapperWithCustomDispatcher
):
...
...
@@ -756,3 +823,24 @@ def _apply_top_p(logits: torch.Tensor, p: torch.Tensor) -> torch.Tensor:
cutoff_logit
=
torch
.
gather
(
logits_sorted
,
-
1
,
cutoff_index
)
logits
=
logits
.
masked_fill_
(
logits
<
cutoff_logit
,
-
float
(
"inf"
))
return
logits
def _make_decode_output(
    next_token_ids: List[int],
    seq_groups: List[List[int]],
) -> SamplerOutput:
    """Wrap flat decode-step token ids into a `SamplerOutput`.

    `next_token_ids` is consumed in order: the k-th sequence id encountered
    while walking `seq_groups` group by group is paired with
    `next_token_ids[k]`. Each sequence group becomes one
    `CompletionSequenceGroupOutput` whose second argument is None.

    Args:
        next_token_ids: One token id per sequence, flattened across groups.
        seq_groups: For each sequence group, the list of its sequence ids.

    Returns:
        A `SamplerOutput` holding one entry per sequence group.
    """
    # A single placeholder Logprob(0.0) instance is shared by every output.
    placeholder_logprob = Logprob(0.0)
    group_outputs = []
    cursor = 0  # Index of the current group's first token in next_token_ids.
    for group_seq_ids in seq_groups:
        per_seq_outputs = []
        for offset, seq_id in enumerate(group_seq_ids):
            token_id = next_token_ids[cursor + offset]
            per_seq_outputs.append(
                SequenceOutput(seq_id, token_id,
                               {token_id: placeholder_logprob}))
        cursor += len(group_seq_ids)
        group_outputs.append(
            CompletionSequenceGroupOutput(per_seq_outputs, None))
    return SamplerOutput(group_outputs)
Prev
1
…
15
16
17
18
19
20
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment