Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
539aa992
Commit
539aa992
authored
Sep 27, 2024
by
zhuwenwen
Browse files
Merge tag 'v0.6.2' into v0.6.2-dev
parents
93872128
7193774b
Changes
383
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1266 additions
and
397 deletions
+1266
-397
vllm/transformers_utils/configs/mllama.py
vllm/transformers_utils/configs/mllama.py
+28
-0
vllm/transformers_utils/configs/solar.py
vllm/transformers_utils/configs/solar.py
+245
-0
vllm/transformers_utils/detokenizer.py
vllm/transformers_utils/detokenizer.py
+7
-9
vllm/transformers_utils/image_processor.py
vllm/transformers_utils/image_processor.py
+0
-64
vllm/transformers_utils/processor.py
vllm/transformers_utils/processor.py
+61
-4
vllm/transformers_utils/tokenizer.py
vllm/transformers_utils/tokenizer.py
+0
-1
vllm/transformers_utils/tokenizers/mistral.py
vllm/transformers_utils/tokenizers/mistral.py
+31
-5
vllm/triton_utils/libentry.py
vllm/triton_utils/libentry.py
+2
-2
vllm/triton_utils/sample.py
vllm/triton_utils/sample.py
+0
-13
vllm/usage/usage_lib.py
vllm/usage/usage_lib.py
+2
-1
vllm/utils.py
vllm/utils.py
+114
-36
vllm/version.py
vllm/version.py
+5
-7
vllm/vllm_flash_attn/.gitkeep
vllm/vllm_flash_attn/.gitkeep
+0
-0
vllm/worker/cpu_model_runner.py
vllm/worker/cpu_model_runner.py
+276
-116
vllm/worker/enc_dec_model_runner.py
vllm/worker/enc_dec_model_runner.py
+72
-23
vllm/worker/model_runner.py
vllm/worker/model_runner.py
+80
-24
vllm/worker/model_runner_base.py
vllm/worker/model_runner_base.py
+29
-3
vllm/worker/multi_step_model_runner.py
vllm/worker/multi_step_model_runner.py
+53
-21
vllm/worker/multi_step_tpu_worker.py
vllm/worker/multi_step_tpu_worker.py
+105
-0
vllm/worker/tpu_model_runner.py
vllm/worker/tpu_model_runner.py
+156
-68
No files found.
vllm/transformers_utils/configs/mllama.py
0 → 100644
View file @
539aa992
from
transformers.models.mllama
import
configuration_mllama
as
mllama_hf_config
class MllamaTextConfig(mllama_hf_config.MllamaTextConfig):
    """Mllama text config that always reports ``is_encoder_decoder=True``.

    - transformers regards mllama as is_encoder_decoder=False
    - vllm needs is_encoder_decoder=True to enable cross-attention
    """

    def __init__(self, **kwargs):
        # Let the upstream config consume every keyword first, then force the
        # flag so that whatever the parent stored is overwritten.
        super().__init__(**kwargs)
        self.is_encoder_decoder = True
class MllamaConfig(mllama_hf_config.MllamaConfig):
    """Mllama config whose dict-form ``text_config`` becomes a
    ``MllamaTextConfig`` before being handed to the upstream constructor."""

    def __init__(self, text_config=None, **kwargs):
        # Only a plain dict is converted; ``None`` or an already-built config
        # object is forwarded untouched.
        resolved_text_config = (MllamaTextConfig(**text_config) if isinstance(
            text_config, dict) else text_config)
        super().__init__(text_config=resolved_text_config, **kwargs)
vllm/transformers_utils/configs/solar.py
0 → 100644
View file @
539aa992
# coding=utf-8
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Solar model configuration"""
from
transformers
import
PretrainedConfig
from
transformers.utils
import
logging
logger
=
logging
.
get_logger
(
__name__
)
class SolarConfig(PretrainedConfig):
    r"""
    Configuration class for a [`SolarModel`].

    It stores the hyper-parameters that define the model architecture.
    Configuration objects inherit from [`PretrainedConfig`]; read the
    documentation of [`PretrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 32000):
            Vocabulary size of the model. Defines the number of different
            tokens that can be represented by the `inputs_ids` passed when
            calling [`SolarModel`].
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 11008):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer decoder.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the
            Transformer decoder.
        num_key_value_heads (`int`, *optional*):
            Number of key/value heads used to implement Grouped Query
            Attention. If `num_key_value_heads=num_attention_heads` the model
            uses Multi Head Attention (MHA), if `num_key_value_heads=1` it
            uses Multi Query Attention (MQA), otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each
            group key and value head should be constructed by meanpooling
            all the original heads within that group. For more details see
            [this paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not
            specified, it defaults to `num_attention_heads`.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the
            decoder.
        max_position_embeddings (`int`, *optional*, defaults to 2048):
            The maximum sequence length that this model might ever be used
            with. Solar 1 supports up to 2048 tokens, Solar 2 up to 4096,
            CodeSolar up to 16384.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for
            initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values
            attentions (not used by all models). Only relevant if
            `config.is_decoder=True`.
        pad_token_id (`int`, *optional*):
            Padding token id.
        bos_token_id (`int`, *optional*, defaults to 1):
            Beginning of stream token id.
        eos_token_id (`int`, *optional*, defaults to 2):
            End of stream token id.
        pretraining_tp (`int`, *optional*, defaults to 1):
            Experimental feature. Tensor parallelism rank used during
            pretraining. Please refer to [this
            document](https://huggingface.co/docs/
            transformers/main/
            perf_train_gpu_many#tensor-parallelism)
            to understand more about it. This value is necessary to ensure
            exact reproducibility of the pretraining results. Please refer
            to [this
            issue](https://github.com/pytorch/pytorch/issues/76232).
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether to tie weight embeddings.
        rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings.
        rope_scaling (`Dict`, *optional*):
            Dictionary containing the scaling configuration for the RoPE
            embeddings. Currently supports two scaling strategies: linear
            and dynamic. Their scaling factor must be a float greater than
            1. The expected format is
            `{"type": strategy name, "factor": scaling factor}`.
            When using this flag, don't update `max_position_embeddings`
            to the expected new maximum. See the following thread for more
            information on how these scaling strategies behave:
            https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/
            dynamically_scaled_rope_further_increases/. This is an
            experimental feature, subject to breaking API changes in
            future versions.
        attention_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output
            projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        mlp_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in up_proj, down_proj and gate_proj
            layers in the MLP layers.
        sliding_window (`int`, *optional*, defaults to 2047):
            Sliding window attention window size. If not specified,
            will default to `2047`.
        bskcn_1, bskcn_2, bskcn_3, bskcn_4, bskcn_tv (*optional*):
            Stored as given; when left as `None` they default to
            `[12, 20, 32, 44]`, `[20, 32]`, `[16, 24, 36, 48]`, `[28, 40]`
            and `[0.9, 0.8]` respectively.
            NOTE(review): their meaning is not documented in this file --
            confirm against the Solar model implementation.

    ```python
    >>> from transformers import SolarModel, SolarConfig
    >>> # Initializing a Solar-pro style configuration
    >>> configuration = SolarConfig()
    >>> # Initializing a model from the Solar-pro style configuration
    >>> model = SolarModel(configuration)
    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "solar"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=32000,
        hidden_size=4096,
        intermediate_size=11008,
        num_hidden_layers=32,
        num_attention_heads=32,
        num_key_value_heads=None,
        hidden_act="silu",
        max_position_embeddings=2048,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        pad_token_id=None,
        bos_token_id=1,
        eos_token_id=2,
        pretraining_tp=1,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        rope_scaling=None,
        attention_bias=False,
        attention_dropout=0.0,
        mlp_bias=False,
        sliding_window=2047,
        bskcn_1=None,
        bskcn_2=None,
        bskcn_3=None,
        bskcn_4=None,
        bskcn_tv=None,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads

        # for backward compatibility: an unspecified KV-head count means
        # one KV head per attention head.
        self.num_key_value_heads = (num_attention_heads
                                    if num_key_value_heads is None else
                                    num_key_value_heads)

        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.pretraining_tp = pretraining_tp
        self.use_cache = use_cache
        self.rope_theta = rope_theta

        # Validation reads self.rope_scaling, so assign before validating.
        self.rope_scaling = rope_scaling
        self._rope_scaling_validation()

        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout
        self.mlp_bias = mlp_bias
        self.sliding_window = sliding_window

        # A fresh list is created per instance whenever the caller passes
        # None, so defaults are never shared between configs.
        self.bskcn_1 = [12, 20, 32, 44] if bskcn_1 is None else bskcn_1
        self.bskcn_2 = [20, 32] if bskcn_2 is None else bskcn_2
        self.bskcn_3 = [16, 24, 36, 48] if bskcn_3 is None else bskcn_3
        self.bskcn_4 = [28, 40] if bskcn_4 is None else bskcn_4
        self.bskcn_tv = [0.9, 0.8] if bskcn_tv is None else bskcn_tv

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )

    def _rope_scaling_validation(self):
        """
        Validate the `rope_scaling` configuration.

        Raises:
            ValueError: if `rope_scaling` is set but is not a two-entry dict,
                its `type` is not `"linear"` or `"dynamic"`, or its `factor`
                is not a float greater than 1.
        """
        scaling = self.rope_scaling
        if scaling is None:
            return

        has_expected_shape = isinstance(scaling, dict) and len(scaling) == 2
        if not has_expected_shape:
            raise ValueError(
                "`rope_scaling` must be a dictionary with two fields,"
                " `type` and `factor`, "
                f"got {self.rope_scaling}")

        rope_scaling_type = scaling.get("type", None)
        rope_scaling_factor = scaling.get("factor", None)

        # A missing type is None, which is never a member of the tuple.
        if rope_scaling_type not in ("linear", "dynamic"):
            raise ValueError(f"`rope_scaling`'s type field must be one of "
                             f"['linear', 'dynamic'], got {rope_scaling_type}")

        # A missing factor is None, which fails the isinstance check.
        if (not isinstance(rope_scaling_factor, float)
                or rope_scaling_factor <= 1.0):
            raise ValueError(
                f"`rope_scaling`'s factor field must be a float > 1,"
                f" got {rope_scaling_factor}")
vllm/transformers_utils/detokenizer.py
View file @
539aa992
from
typing
import
Dict
,
List
,
Optional
,
Tuple
from
vllm.sequence
import
Logprob
,
SamplingParams
,
Sequence
,
SequenceGroup
from
vllm.sequence
import
(
VLLM_INVALID_TOKEN_ID
,
Logprob
,
SamplingParams
,
Sequence
,
SequenceGroup
)
from
.tokenizer
import
AnyTokenizer
from
.tokenizer_group
import
BaseTokenizerGroup
# Used eg. for marking rejected tokens in spec decoding.
INVALID_TOKEN_ID
=
-
1
class
Detokenizer
:
"""Provides methods to decode the output of a model into text."""
...
...
@@ -61,7 +59,7 @@ class Detokenizer:
continue
for
token_id
,
sample_logprob
in
prompt_logprobs_for_token
.
items
():
if
(
sample_logprob
.
decoded_token
is
None
and
token_id
!=
INVALID_TOKEN_ID
):
and
token_id
!=
VLLM_
INVALID_TOKEN_ID
):
prompt_token_ids_with_token
=
(
prompt_token_ids
[:
token_position
]
+
[
token_id
])
(
new_tokens
,
new_text
,
new_prefix_offset
,
...
...
@@ -143,7 +141,7 @@ class Detokenizer:
continue
if
(
sample_logprob
.
decoded_token
is
None
and
token_id
!=
INVALID_TOKEN_ID
):
and
token_id
!=
VLLM_
INVALID_TOKEN_ID
):
all_input_ids_with_logprob
=
previous_tokens
+
[
token_id
]
(
_
,
new_text
,
_
,
_
)
=
detokenize_incrementally
(
tokenizer
=
tokenizer
,
...
...
@@ -282,14 +280,14 @@ def detokenize_incrementally(
assert
prev_tokens
is
not
None
# If the new token id is out of bounds, return an empty string.
if
new_token_id
>=
len
(
tokenizer
):
new_tokens
=
[
""
]
else
:
if
0
<=
new_token_id
<
len
(
tokenizer
):
# Put new_token_id in a list so skip_special_tokens is respected
new_tokens
=
tokenizer
.
convert_ids_to_tokens
(
[
new_token_id
],
skip_special_tokens
=
skip_special_tokens
)
if
isinstance
(
new_tokens
,
str
):
new_tokens
=
[
new_tokens
]
else
:
new_tokens
=
[
""
]
output_tokens
=
prev_tokens
+
new_tokens
# If this is the first iteration, return all tokens.
...
...
vllm/transformers_utils/image_processor.py
deleted
100644 → 0
View file @
93872128
from
typing
import
cast
def get_video_processor(
    processor_name: str,
    trust_remote_code: bool = False,
):
    """
    Gets a processor for the given model name via HuggingFace.

    Args:
        processor_name: Name or path handed to
            ``AutoProcessor.from_pretrained``.
        trust_remote_code: Forwarded to ``AutoProcessor.from_pretrained``.

    Returns:
        The ``video_processor`` attribute of the loaded processor.

    Raises:
        RuntimeError: if loading raises ``ValueError`` while
            ``trust_remote_code`` is false; the message suggests enabling it.
        ValueError: re-raised unchanged when ``trust_remote_code`` is true.
    """
    from transformers import AutoProcessor

    try:
        # Forward the flag: the error message below tells users to enable
        # `trust_remote_code`, which only helps if it reaches the loader.
        processor = AutoProcessor.from_pretrained(
            processor_name, trust_remote_code=trust_remote_code)
        video_processor = processor.video_processor
    except ValueError as e:
        if not trust_remote_code:
            err_msg = (
                "Failed to load the processor. If the processor is "
                "a custom processor not yet available in the HuggingFace "
                "transformers library, consider setting "
                "`trust_remote_code=True` in LLM or using the "
                "`--trust-remote-code` flag in the CLI.")
            raise RuntimeError(err_msg) from e
        raise
    return video_processor
def get_image_processor(
    processor_name: str,
    *args,
    trust_remote_code: bool = False,
    **kwargs,
):
    """Gets an image processor for the given model name via HuggingFace.

    Positional and keyword extras are passed straight through to
    ``AutoImageProcessor.from_pretrained``. A ``ValueError`` from the loader
    becomes a ``RuntimeError`` with a hint when ``trust_remote_code`` is
    false, and is re-raised unchanged otherwise.
    """
    # don't put this import at the top level
    # it will call torch.cuda.device_count()
    from transformers import AutoImageProcessor
    from transformers.image_processing_utils import BaseImageProcessor

    try:
        loaded = AutoImageProcessor.from_pretrained(
            processor_name,
            *args,
            trust_remote_code=trust_remote_code,
            **kwargs)
    except ValueError as e:
        # With remote code already trusted there is nothing to suggest.
        if trust_remote_code:
            raise e

        # If the error pertains to the processor class not existing or not
        # currently being imported, suggest using the --trust-remote-code flag.
        # Unlike AutoTokenizer, AutoImageProcessor does not separate such errors
        err_msg = (
            "Failed to load the image processor. If the image processor is "
            "a custom processor not yet available in the HuggingFace "
            "transformers library, consider setting "
            "`trust_remote_code=True` in LLM or using the "
            "`--trust-remote-code` flag in the CLI.")
        raise RuntimeError(err_msg) from e

    return cast(BaseImageProcessor, loaded)
vllm/transformers_utils/processor.py
View file @
539aa992
from
typing
import
cast
from
typing
import
Any
,
cast
def
get_processor
(
processor_name
:
str
,
*
args
,
*
args
:
Any
,
trust_remote_code
:
bool
=
False
,
**
kwargs
,
**
kwargs
:
Any
,
):
"""
Gets
a processor for the given model name via HuggingFace."""
"""
Load
a processor for the given model name via HuggingFace."""
# don't put this import at the top level
# it will call torch.cuda.device_count()
from
transformers
import
AutoProcessor
...
...
@@ -35,3 +35,60 @@ def get_processor(
raise
e
return
cast
(
ProcessorMixin
,
processor
)
def get_image_processor(
    processor_name: str,
    *args: Any,
    trust_remote_code: bool = False,
    **kwargs: Any,
):
    """Load an image processor for the given model name via HuggingFace.

    Extra positional and keyword arguments go unchanged to
    ``AutoImageProcessor.from_pretrained``. If the loader raises
    ``ValueError`` and ``trust_remote_code`` is false, a ``RuntimeError``
    carrying a hint is raised from it; otherwise the ``ValueError``
    propagates.
    """
    # don't put this import at the top level
    # it will call torch.cuda.device_count()
    from transformers import AutoImageProcessor
    from transformers.image_processing_utils import BaseImageProcessor

    try:
        loaded = AutoImageProcessor.from_pretrained(
            processor_name,
            *args,
            trust_remote_code=trust_remote_code,
            **kwargs)
    except ValueError as e:
        # Remote code was already allowed, so the hint would not help.
        if trust_remote_code:
            raise e

        # If the error pertains to the processor class not existing or not
        # currently being imported, suggest using the --trust-remote-code flag.
        # Unlike AutoTokenizer, AutoImageProcessor does not separate such errors
        err_msg = (
            "Failed to load the image processor. If the image processor is "
            "a custom processor not yet available in the HuggingFace "
            "transformers library, consider setting "
            "`trust_remote_code=True` in LLM or using the "
            "`--trust-remote-code` flag in the CLI.")
        raise RuntimeError(err_msg) from e

    return cast(BaseImageProcessor, loaded)
def get_video_processor(
    processor_name: str,
    *args: Any,
    trust_remote_code: bool = False,
    **kwargs: Any,
):
    """Load a video processor for the given model name via HuggingFace.

    Delegates loading to ``get_processor`` with the same arguments and
    returns that processor's ``video_processor`` attribute.
    """
    # don't put this import at the top level
    # it will call torch.cuda.device_count()
    from transformers.image_processing_utils import BaseImageProcessor

    full_processor = get_processor(
        processor_name,
        *args,
        trust_remote_code=trust_remote_code,
        **kwargs,
    )
    video_processor = full_processor.video_processor

    return cast(BaseImageProcessor, video_processor)
vllm/transformers_utils/tokenizer.py
View file @
539aa992
...
...
@@ -111,7 +111,6 @@ def get_tokenizer(
'encoding and decoding.'
,
FutureWarning
,
stacklevel
=
2
)
if
tokenizer_mode
==
"mistral"
:
tokenizer
=
MistralTokenizer
.
from_pretrained
(
str
(
tokenizer_name
),
revision
=
revision
)
...
...
vllm/transformers_utils/tokenizers/mistral.py
View file @
539aa992
...
...
@@ -165,10 +165,9 @@ class MistralTokenizer:
messages
:
List
[
"ChatCompletionMessageParam"
],
tools
:
Optional
[
Dict
[
str
,
Any
]]
=
None
,
**
kwargs
)
->
List
[
int
]:
assert
tools
is
None
,
"`tools` are not yet supported."
request
=
ChatCompletionRequest
(
messages
=
message
s
)
# type: ignore[type-var]
request
=
ChatCompletionRequest
(
messages
=
messages
,
tools
=
tool
s
)
# type: ignore[type-var]
encoded
=
self
.
mistral
.
encode_chat_completion
(
request
)
# encode-decode to get clean prompt
...
...
@@ -176,9 +175,29 @@ class MistralTokenizer:
def
convert_tokens_to_string
(
self
,
tokens
:
List
[
str
])
->
str
:
if
isinstance
(
self
.
tokenizer
,
Tekkenizer
):
return
""
.
join
(
tokens
)
tokens
=
[
t
for
t
in
tokens
if
t
not
in
self
.
tokenizer
.
_all_special_tokens
]
if
any
(
isinstance
(
t
,
bytes
)
for
t
in
tokens
):
# we need to encode and decode all tokens again
shift
=
self
.
tokenizer
.
num_special_tokens
byte_tokens
=
[
t
.
encode
(
"utf-8"
)
if
not
isinstance
(
t
,
bytes
)
else
t
for
t
in
tokens
]
ids
=
[
self
.
tokenizer
.
_tekken_token2id_nospecial
[
t
]
+
shift
for
t
in
byte_tokens
]
decoded
=
self
.
tokenizer
.
decode
(
ids
)
else
:
return
self
.
tokenizer
.
decode
(
tokens
)
# type: ignore[arg-type]
decoded
=
""
.
join
(
tokens
)
else
:
decoded
=
self
.
tokenizer
.
decode
(
tokens
)
# type: ignore[arg-type]
return
decoded
def
decode
(
self
,
ids
:
Union
[
List
[
int
],
int
])
->
str
:
if
isinstance
(
ids
,
int
):
...
...
@@ -200,4 +219,11 @@ class MistralTokenizer:
self
.
tokenizer
)
tokens
=
[
self
.
tokenizer
.
id_to_piece
(
id
)
for
id
in
ids
]
if
any
(
t
.
strip
()
==
"�"
for
t
in
tokens
):
# if any stripped decoded token is undefined
# because it's invalid unicode then pass bytes
# See: https://github.com/vllm-project/vllm/pull/8640
tokens
=
[
self
.
tokenizer
.
id_to_byte_piece
(
id
)
for
id
in
ids
]
return
tokens
vllm/triton_utils/libentry.py
View file @
539aa992
...
...
@@ -35,8 +35,8 @@ class LibEntry(triton.KernelInterface):
dns_key
=
[
arg
.
dtype
if
hasattr
(
arg
,
"data_ptr"
)
else
type
(
arg
)
if
not
isinstance
(
arg
,
int
)
else
"i32"
if
-
(
2
**
31
)
<=
arg
and
arg
<=
2
**
31
-
1
else
"u64"
if
2
**
63
<=
arg
and
arg
<=
2
**
64
-
1
else
"i64"
else
"i32"
if
arg
>=
-
(
2
**
31
)
and
arg
<=
2
**
31
-
1
else
"u64"
if
arg
>=
2
**
63
and
arg
<=
2
**
64
-
1
else
"i64"
for
arg
in
dns_args
]
# const args passed by position
...
...
vllm/triton_utils/sample.py
deleted
100644 → 0
View file @
93872128
import
math
# This is a hardcoded limit in Triton (max block size).
MAX_TRITON_N_COLS = 131072


def get_num_triton_sampler_splits(n_cols: int) -> int:
    """Return how many chunks of at most ``MAX_TRITON_N_COLS`` columns are
    needed to cover ``n_cols`` columns.

    Triton has a limit on the number of columns it can handle, so a tensor
    that is too wide has to be split and the kernel called once per split.
    """
    fraction_of_limit = n_cols / MAX_TRITON_N_COLS
    return math.ceil(fraction_of_limit)
vllm/usage/usage_lib.py
View file @
539aa992
...
...
@@ -17,6 +17,7 @@ import torch
import
vllm.envs
as
envs
from
vllm.connections
import
global_http_connection
from
vllm.platforms
import
current_platform
from
vllm.version
import
__version__
as
VLLM_VERSION
_config_home
=
envs
.
VLLM_CONFIG_ROOT
...
...
@@ -151,7 +152,7 @@ class UsageMessage:
usage_context
:
UsageContext
,
extra_kvs
:
Dict
[
str
,
Any
])
->
None
:
# Platform information
if
torch
.
cuda
.
is_availabl
e
():
if
current_platform
.
is_cuda_alik
e
():
device_property
=
torch
.
cuda
.
get_device_properties
(
0
)
self
.
gpu_count
=
torch
.
cuda
.
device_count
()
self
.
gpu_type
=
device_property
.
name
...
...
vllm/utils.py
View file @
539aa992
...
...
@@ -4,7 +4,10 @@ import contextlib
import
datetime
import
enum
import
gc
import
inspect
import
ipaddress
import
os
import
random
import
socket
import
subprocess
import
sys
...
...
@@ -12,6 +15,7 @@ import tempfile
import
threading
import
uuid
import
warnings
import
weakref
from
asyncio
import
FIRST_COMPLETED
,
ensure_future
from
functools
import
lru_cache
,
partial
,
wraps
from
platform
import
uname
...
...
@@ -31,6 +35,7 @@ from typing_extensions import ParamSpec, TypeIs, assert_never
import
vllm.envs
as
envs
from
vllm.logger
import
enable_trace_function_call
,
init_logger
from
vllm.platforms
import
current_platform
logger
=
init_logger
(
__name__
)
...
...
@@ -70,10 +75,6 @@ STR_NOT_IMPL_ENC_DEC_SPEC_DEC = ("Speculative decoding is not "
"currently supported with encoder/"
"decoder models."
)
STR_NOT_IMPL_ENC_DEC_CUDAGRAPH
=
(
"CUDAGraph is not "
"currently supported with encoder/"
"decoder models."
)
STR_NOT_IMPL_ENC_DEC_BACKEND
=
(
"XFormers is the only backend "
"currently supported with encoder/"
"decoder models."
)
...
...
@@ -97,7 +98,6 @@ STR_NOT_IMPL_ENC_DEC_ERR_STRS = {
"STR_NOT_IMPL_ENC_DEC_PP"
:
STR_NOT_IMPL_ENC_DEC_PP
,
"STR_NOT_IMPL_ENC_DEC_MM"
:
STR_NOT_IMPL_ENC_DEC_MM
,
"STR_NOT_IMPL_ENC_DEC_SPEC_DEC"
:
STR_NOT_IMPL_ENC_DEC_SPEC_DEC
,
"STR_NOT_IMPL_ENC_DEC_CUDA_GRAPH"
:
STR_NOT_IMPL_ENC_DEC_CUDAGRAPH
,
"STR_NOT_IMPL_ENC_DEC_BACKEND"
:
STR_NOT_IMPL_ENC_DEC_BACKEND
,
"STR_NOT_IMPL_ENC_DEC_PROMPT_ADAPTER"
:
STR_NOT_IMPL_ENC_DEC_PROMPT_ADAPTER
,
"STR_NOT_IMPL_ENC_DEC_CPU"
:
STR_NOT_IMPL_ENC_DEC_CPU
...
...
@@ -377,6 +377,22 @@ def get_cpu_memory() -> int:
return
psutil
.
virtual_memory
().
total
def seed_everything(seed: int) -> None:
    """
    Set the seed of each random module.

    Seeds Python's ``random``, NumPy's global generator and torch's default
    generator, then the CUDA-like and XPU generators when those platforms
    are detected.

    Loosely based on: https://github.com/Lightning-AI/pytorch-lightning/blob/2.4.0/src/lightning/fabric/utilities/seed.py#L20

    Args:
        seed: Value passed to every seeding call.
    """
    random.seed(seed)
    np.random.seed(seed)

    # Seed torch's default generator as well. The code paths this helper
    # replaces called `torch.random.manual_seed(seed)`; without this line,
    # tensors created on a non-accelerator device are not reproducible.
    torch.manual_seed(seed)

    if current_platform.is_cuda_alike():
        torch.cuda.manual_seed_all(seed)

    if is_xpu():
        torch.xpu.manual_seed_all(seed)
def random_uuid() -> str:
    """Return the hexadecimal form of a newly generated ``uuid.uuid4()``."""
    fresh = uuid.uuid4()
    # `.hex` is already a `str`, so no further conversion is needed.
    return fresh.hex
...
...
@@ -518,6 +534,14 @@ def get_ip() -> str:
return
"0.0.0.0"
def is_valid_ipv6_address(address: str) -> bool:
    """Report whether ``address`` is accepted by ``ipaddress.IPv6Address``."""
    try:
        ipaddress.IPv6Address(address)
    except ValueError:
        # Raised by the constructor for anything that is not valid IPv6.
        return False
    else:
        return True
def
get_distributed_init_method
(
ip
:
str
,
port
:
int
)
->
str
:
# Brackets are not permitted in ipv4 addresses,
# see https://github.com/python/cpython/issues/103848
...
...
@@ -638,9 +662,7 @@ def create_kv_caches_with_random_flash(
seed
:
int
=
0
,
device
:
Optional
[
str
]
=
"cuda"
,
)
->
Tuple
[
List
[
torch
.
Tensor
],
List
[
torch
.
Tensor
]]:
torch
.
random
.
manual_seed
(
seed
)
if
torch
.
cuda
.
is_available
():
torch
.
cuda
.
manual_seed
(
seed
)
seed_everything
(
seed
)
torch_dtype
=
get_kv_cache_torch_dtype
(
cache_dtype
,
model_dtype
)
key_value_cache_shape
=
(
num_blocks
,
2
,
block_size
,
num_heads
,
head_size
)
...
...
@@ -682,9 +704,7 @@ def create_kv_caches_with_random(
f
"Does not support key cache of type fp8 with head_size
{
head_size
}
"
)
torch
.
random
.
manual_seed
(
seed
)
if
torch
.
cuda
.
is_available
():
torch
.
cuda
.
manual_seed
(
seed
)
seed_everything
(
seed
)
torch_dtype
=
get_kv_cache_torch_dtype
(
cache_dtype
,
model_dtype
)
...
...
@@ -747,14 +767,14 @@ def is_pin_memory_available() -> bool:
return
True
class
Cuda
MemoryProfiler
:
class
Device
MemoryProfiler
:
def
__init__
(
self
,
device
:
Optional
[
torch
.
types
.
Device
]
=
None
):
self
.
device
=
device
def
current_memory_usage
(
self
)
->
float
:
# Return the memory usage in bytes.
if
torch
.
cuda
.
is_availabl
e
():
if
current_platform
.
is_cuda_alik
e
():
torch
.
cuda
.
reset_peak_memory_stats
(
self
.
device
)
mem
=
torch
.
cuda
.
max_memory_allocated
(
self
.
device
)
elif
is_xpu
():
...
...
@@ -836,15 +856,6 @@ def async_tensor_h2d(
return
t
.
to
(
device
=
target_device
,
non_blocking
=
True
)
def maybe_expand_dim(tensor: torch.Tensor,
                     target_dims: int,
                     size: int = 1) -> torch.Tensor:
    """Expand the tensor to the target_dims.

    When ``tensor`` already has at least ``target_dims`` dimensions it is
    returned as is. Otherwise it is reshaped with
    ``tensor.view(-1, size, ..., size)``, using one trailing ``size`` entry
    per missing dimension.
    """
    missing_dims = target_dims - tensor.ndim
    if missing_dims <= 0:
        return tensor

    trailing_shape = [size] * missing_dims
    return tensor.view(-1, *trailing_shape)
def get_dtype_size(dtype: torch.dtype) -> int:
    """Get the size of the data type in bytes.

    The size is read from ``element_size()`` of an empty tensor created
    with the requested dtype.
    """
    probe = torch.tensor([], dtype=dtype)
    return probe.element_size()
...
...
@@ -1079,6 +1090,20 @@ def cuda_device_count_stateless() -> int:
return
_cuda_device_count_stateless
(
envs
.
CUDA_VISIBLE_DEVICES
)
def weak_bind(bound_method: Callable[..., Any], ) -> Callable[..., None]:
    """Make an instance method that weakly references
    its associated instance and no-ops once that
    instance is collected.

    Args:
        bound_method: A bound method; its ``__self__`` is held only through
            a weak reference and its ``__func__`` is called on each use.

    Returns:
        A callable that forwards its arguments to the method while the
        instance is alive. It always returns ``None``; the method's own
        return value is discarded.
    """
    ref = weakref.ref(bound_method.__self__)  # type: ignore[attr-defined]
    unbound = bound_method.__func__  # type: ignore[attr-defined]

    def weak_bound(*args, **kwargs) -> None:
        inst = ref()
        # Compare against None explicitly. A truthiness test would also skip
        # live instances that are falsy (e.g. `__len__` returning 0 or
        # `__bool__` returning False).
        if inst is not None:
            unbound(inst, *args, **kwargs)

    return weak_bound
#From: https://stackoverflow.com/a/4104188/2749989
def
run_once
(
f
:
Callable
[
P
,
None
])
->
Callable
[
P
,
None
]:
...
...
@@ -1222,6 +1247,53 @@ async def _run_task_with_lock(task: Callable, lock: asyncio.Lock, *args,
return
await
task
(
*
args
,
**
kwargs
)
def get_allowed_kwarg_only_overrides(
    callable: Callable[..., object],
    overrides: Optional[Dict[str, Any]],
) -> Dict[str, Any]:
    """
    Given a callable which has one or more keyword only params and a dict
    mapping param names to values, drop values that cannot be kwarg
    expanded to overwrite one or more keyword-only args. This is used in a
    few places to handle custom processor overrides for multimodal models,
    e.g., for profiling when processor options provided by the user
    may affect the number of mm tokens per instance.

    Args:
        callable: Callable which takes 0 or more keyword only arguments.
        overrides: Potential overrides to be used when invoking the callable.
            ``None`` or an empty dict yields an empty result.

    Returns:
        Dictionary containing the kwargs to be leveraged which may be used
        to overwrite one or more keyword only arguments when invoking the
        callable.
    """
    if not overrides:
        return {}

    # A set gives O(1) membership checks in the filter below.
    allowed_override_names = {
        name
        for name, param in inspect.signature(callable).parameters.items()
        if param.kind == inspect.Parameter.KEYWORD_ONLY
    }

    # Drop any mm_processor_kwargs provided by the user that are
    # not kwarg names accepted by the provided input processor.
    filtered_overrides = {
        kwarg_name: val
        for kwarg_name, val in overrides.items()
        if kwarg_name in allowed_override_names
    }

    # If anything is dropped, log a warning
    dropped_keys = overrides.keys() - filtered_overrides.keys()
    if dropped_keys:
        logger.warning(
            "The following intended overrides are not keyword-only args "
            "and will be dropped: %s", dropped_keys)

    return filtered_overrides
# Using dynamo with vLLM doesn't really work well with PyTorch versions < 2.4.0.
# In particular, the FakeScalarType is not supported for earlier versions of
# PyTorch which breaks dynamo for any ops registered using ScalarType.
...
...
@@ -1230,6 +1302,12 @@ def supports_dynamo() -> bool:
return
base_torch_version
>=
Version
(
"2.4.0"
)
# Some backends use pytorch version < 2.4.0 which doesn't
# support `torch.library.custom_op`.
def supports_custom_op() -> bool:
    """Report whether ``torch.library`` has a ``custom_op`` attribute."""
    library = torch.library
    return hasattr(library, "custom_op")
class
AtomicCounter
:
"""An atomic, thread-safe counter"""
...
...
vllm/version.py
View file @
539aa992
import
warnings
try
:
import
vllm.commit_id
__commit__
=
vllm
.
commit_id
.
__commit__
from
._version
import
__version__
,
__version_tuple__
except
Exception
as
e
:
import
warnings
warnings
.
warn
(
f
"Failed to read commit hash:
\n
{
e
}
"
,
RuntimeWarning
,
stacklevel
=
2
)
__commit__
=
"COMMIT_HASH_PLACEHOLDER"
__version__
=
"0.6.1.post2"
__version__
=
"dev"
__version_tuple__
=
(
0
,
0
,
__version__
)
vllm/vllm_flash_attn/.gitkeep
0 → 100644
View file @
539aa992
vllm/worker/cpu_model_runner.py
View file @
539aa992
import
dataclasses
import
weakref
from
dataclasses
import
dataclass
from
typing
import
TYPE_CHECKING
,
Any
,
Dict
,
List
,
Optional
,
Tuple
,
Type
,
Union
...
...
@@ -10,14 +12,16 @@ from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
SchedulerConfig
)
from
vllm.logger
import
init_logger
from
vllm.model_executor
import
SamplingMetadata
from
vllm.model_executor.layers.rotary_embedding
import
MRotaryEmbedding
from
vllm.model_executor.layers.sampler
import
SamplerOutput
from
vllm.model_executor.model_loader
import
get_model
from
vllm.multimodal
import
(
MULTIMODAL_REGISTRY
,
BatchedTensorInputs
,
MultiModalInputs
)
from
vllm.sequence
import
IntermediateTensors
,
SequenceGroupMetadata
from
vllm.sequence
import
(
IntermediateTensors
,
SequenceData
,
SequenceGroupMetadata
)
from
vllm.utils
import
STR_NOT_IMPL_ENC_DEC_ERR_STRS
,
make_tensor_with_pad
from
vllm.worker.model_runner_base
import
(
ModelRunnerBase
,
ModelRunnerInputBase
,
ModelRunnerBase
,
ModelRunnerInputBase
,
ModelRunnerInputBuilderBase
,
_add_attn_metadata_broadcastable_dict
,
_add_sampling_metadata_broadcastable_dict
,
_init_attn_metadata_from_tensor_dict
,
...
...
@@ -32,16 +36,17 @@ _PAD_SLOT_ID = -1
@
dataclass
(
frozen
=
True
)
class
CPU
ModelInput
(
ModelRunnerInputBase
):
class
ModelInput
ForCPU
(
ModelRunnerInputBase
):
"""
Used by the CPUModelRunner.
Base class contains metadata needed for the base model forward pass on CPU
"""
input_tokens
:
Optional
[
torch
.
Tensor
]
=
None
input_positions
:
Optional
[
torch
.
Tensor
]
=
None
attn_metadata
:
Optional
[
"AttentionMetadata"
]
=
None
sampling_metadata
:
Optional
[
"SamplingMetadata"
]
=
None
multi_modal_kwargs
:
Optional
[
BatchedTensorInputs
]
=
None
virtual_engine
:
Optional
[
int
]
=
None
seq_lens
:
Optional
[
List
[
int
]]
=
None
query_lens
:
Optional
[
List
[
int
]]
=
None
def
as_broadcastable_tensor_dict
(
self
)
->
Dict
[
str
,
Union
[
int
,
torch
.
Tensor
]]:
...
...
@@ -51,16 +56,44 @@ class CPUModelInput(ModelRunnerInputBase):
"multi_modal_kwargs"
:
self
.
multi_modal_kwargs
,
}
_add_attn_metadata_broadcastable_dict
(
tensor_dict
,
self
.
attn_metadata
)
return
tensor_dict
@
classmethod
def
from_broadcasted_tensor_dict
(
cls
:
Type
[
"ModelInputForCPU"
],
tensor_dict
:
Dict
[
str
,
Any
],
attn_backend
:
Optional
[
"AttentionBackend"
]
=
None
)
->
"ModelInputForCPU"
:
if
attn_backend
is
not
None
:
tensor_dict
=
_init_attn_metadata_from_tensor_dict
(
attn_backend
,
tensor_dict
)
return
cls
(
**
tensor_dict
)
@
dataclass
(
frozen
=
True
)
class
ModelInputForCPUWithSamplingMetadata
(
ModelInputForCPU
):
"""
Used by the ModelRunner.
"""
sampling_metadata
:
Optional
[
"SamplingMetadata"
]
=
None
def
as_broadcastable_tensor_dict
(
self
)
->
Dict
[
str
,
Any
]:
tensor_dict
=
{
"input_tokens"
:
self
.
input_tokens
,
"input_positions"
:
self
.
input_positions
,
}
_add_attn_metadata_broadcastable_dict
(
tensor_dict
,
self
.
attn_metadata
)
_add_sampling_metadata_broadcastable_dict
(
tensor_dict
,
self
.
sampling_metadata
)
return
tensor_dict
@
classmethod
def
from_broadcasted_tensor_dict
(
cls
:
Type
[
"CPUModelInput"
]
,
cls
,
tensor_dict
:
Dict
[
str
,
Any
],
attn_backend
:
Optional
[
"AttentionBackend"
]
=
None
)
->
"
CPU
ModelInput"
:
attn_backend
:
Optional
[
"AttentionBackend"
]
=
None
,
)
->
"ModelInput
ForCPUWithSamplingMetadata
"
:
tensor_dict
=
_init_sampling_metadata_from_tensor_dict
(
tensor_dict
)
if
attn_backend
is
not
None
:
tensor_dict
=
_init_attn_metadata_from_tensor_dict
(
...
...
@@ -68,71 +101,83 @@ class CPUModelInput(ModelRunnerInputBase):
return
cls
(
**
tensor_dict
)
class
CPU
Model
Runn
er
(
ModelRunnerBase
[
CPU
ModelInput
]):
class
Model
InputForCPUBuild
er
(
ModelRunner
InputBuilder
Base
[
ModelInput
ForCPU
]):
def
__init__
(
self
,
model_config
:
ModelConfig
,
parallel_config
:
ParallelConfig
,
scheduler_config
:
SchedulerConfig
,
device_config
:
DeviceConfig
,
cache_config
:
CacheConfig
,
load_config
:
LoadConfig
,
lora_config
:
Optional
[
LoRAConfig
],
kv_cache_dtype
:
Optional
[
str
]
=
"auto"
,
prompt_adapter_config
:
Optional
[
PromptAdapterConfig
]
=
None
,
is_driver_worker
:
bool
=
False
,
*
args
,
**
kwargs
,
):
self
.
model_config
=
model_config
self
.
parallel_config
=
parallel_config
self
.
scheduler_config
=
scheduler_config
# Currently, CPU worker doesn't support chunked prefill.
assert
self
.
scheduler_config
.
chunked_prefill_enabled
is
False
self
.
device_config
=
device_config
self
.
cache_config
=
cache_config
self
.
lora_config
=
lora_config
self
.
prompt_adapter_config
=
prompt_adapter_config
self
.
load_config
=
load_config
self
.
is_driver_worker
=
is_driver_worker
self
.
device
=
self
.
device_config
.
device
def
__init__
(
self
,
runner
:
"CPUModelRunner"
,
finished_requests_ids
:
Optional
[
List
[
str
]]
=
None
)
->
None
:
super
().
__init__
()
self
.
seq_group_metadata_list
:
List
[
SequenceGroupMetadata
]
=
[]
self
.
runner
=
runner
self
.
model_input_cls
=
self
.
runner
.
_model_input_cls
self
.
attn_backend
=
self
.
runner
.
attn_backend
self
.
sliding_window
=
self
.
runner
.
sliding_window
self
.
block_size
=
self
.
runner
.
block_size
self
.
device
=
self
.
runner
.
device
self
.
multi_modal_input_mapper
=
self
.
runner
.
multi_modal_input_mapper
self
.
kv_cache_dtype
=
kv_cache_dtype
self
.
sliding_window
=
model_config
.
get_sliding_window
()
self
.
block_size
=
cache_config
.
block_size
self
.
attn_backend
=
get_attn_backend
(
self
.
model_config
.
get_num_attention_heads
(
self
.
parallel_config
),
self
.
model_config
.
get_head_size
(),
self
.
model_config
.
get_num_kv_heads
(
self
.
parallel_config
),
self
.
model_config
.
get_sliding_window
(),
self
.
model_config
.
dtype
,
self
.
kv_cache_dtype
,
self
.
block_size
,
)
def
add_seq_group
(
self
,
seq_group_metadata
:
SequenceGroupMetadata
):
self
.
seq_group_metadata_list
.
append
(
seq_group_metadata
)
# Multi-modal data support
self
.
mm_registry
=
MULTIMODAL_REGISTRY
self
.
multi_modal_input_mapper
=
self
.
mm_registry
\
.
create_input_mapper
(
self
.
model_config
)
self
.
mm_registry
.
init_mm_limits_per_prompt
(
self
.
model_config
)
def
build
(
self
)
->
ModelInputForCPU
:
multi_modal_kwargs
=
None
# NOTE: We assume that all sequences in the group are all prompts or
# all decodes.
is_prompt
=
self
.
seq_group_metadata_list
[
0
].
is_prompt
# Prepare input tensors.
if
is_prompt
:
(
input_tokens
,
input_positions
,
attn_metadata
,
seq_lens
,
multi_modal_kwargs
)
=
self
.
_prepare_prompt
(
self
.
seq_group_metadata_list
)
else
:
(
input_tokens
,
input_positions
,
attn_metadata
)
=
self
.
_prepare_decode
(
self
.
seq_group_metadata_list
)
seq_lens
=
[]
# Lazy initialization.
self
.
model
:
nn
.
Module
# Set after init_Model
return
self
.
model_input_cls
(
input_tokens
=
input_tokens
,
input_positions
=
input_positions
,
attn_metadata
=
attn_metadata
,
multi_modal_kwargs
=
multi_modal_kwargs
,
# query_lens is not needed if chunked prefill is not
# supported. Since CPU worker doesn't support chunked prefill
# just use seq_lens instead.
seq_lens
=
seq_lens
,
query_lens
=
seq_lens
,
)
if
self
.
mod
e
l_
config
.
is_encoder_decoder_model
:
raise
NotImplementedError
(
STR_NOT_IMPL_ENC_DEC_ERR_STRS
[
'STR_NOT_IMPL_ENC_DEC_CPU'
]
)
def
_compute_multi_
mod
a
l_
input
(
self
,
seq_data
:
SequenceData
,
mm_data
,
computed_len
:
int
):
mm_kwargs
=
self
.
multi_modal_input_mapper
(
mm_data
)
def
load_model
(
self
)
->
None
:
self
.
model
=
get_model
(
model_config
=
self
.
model_config
,
load_config
=
self
.
load_config
,
device_config
=
self
.
device_config
,
lora_config
=
self
.
lora_config
,
parallel_config
=
self
.
parallel_config
,
scheduler_config
=
self
.
scheduler_config
,
cache_config
=
self
.
cache_config
)
# special processing for mrope position deltas.
mrope_positions
=
None
if
self
.
runner
.
model_is_mrope
:
image_grid_thw
=
mm_kwargs
.
get
(
"image_grid_thw"
,
None
)
video_grid_thw
=
mm_kwargs
.
get
(
"video_grid_thw"
,
None
)
assert
image_grid_thw
is
not
None
or
video_grid_thw
is
not
None
,
(
"mrope embedding type requires multi-modal input mapper "
"returns 'image_grid_thw' or 'video_grid_thw'."
)
hf_config
=
self
.
runner
.
model_config
.
hf_config
token_ids
=
seq_data
.
get_token_ids
()
mrope_positions
,
mrope_position_delta
=
\
MRotaryEmbedding
.
get_input_positions
(
token_ids
,
image_grid_thw
=
image_grid_thw
,
video_grid_thw
=
video_grid_thw
,
image_token_id
=
hf_config
.
image_token_id
,
video_token_id
=
hf_config
.
video_token_id
,
vision_start_token_id
=
hf_config
.
vision_start_token_id
,
vision_end_token_id
=
hf_config
.
vision_end_token_id
,
spatial_merge_size
=
hf_config
.
vision_config
.
spatial_merge_size
,
context_len
=
computed_len
,
)
seq_data
.
mrope_position_delta
=
mrope_position_delta
return
mm_kwargs
,
mrope_positions
def
_prepare_prompt
(
self
,
...
...
@@ -142,6 +187,8 @@ class CPUModelRunner(ModelRunnerBase[CPUModelInput]):
assert
len
(
seq_group_metadata_list
)
>
0
input_tokens
:
List
[
int
]
=
[]
input_positions
:
List
[
int
]
=
[]
input_mrope_positions
:
List
[
List
[
int
]]
=
[[]
for
_
in
range
(
3
)]
slot_mapping
:
List
[
int
]
=
[]
seq_lens
:
List
[
int
]
=
[]
multi_modal_inputs_list
:
List
[
MultiModalInputs
]
=
[]
...
...
@@ -160,16 +207,21 @@ class CPUModelRunner(ModelRunnerBase[CPUModelInput]):
seq_lens
.
append
(
seq_len
)
# Prompt token num
input_tokens
.
extend
(
prompt_tokens
)
# Token ids
mrope_positions
=
None
if
(
mm_data
:
=
seq_group_metadata
.
multi_modal_data
):
mm_kwargs
,
mrope_positions
=
self
.
_compute_multi_modal_input
(
seq_data
,
mm_data
,
computed_len
)
multi_modal_inputs_list
.
append
(
mm_kwargs
)
# Token position ids
# NOTE(woosuk): Here we assume that the first token in the prompt
# is always the first token in the sequence.
if
mrope_positions
:
for
idx
in
range
(
3
):
input_mrope_positions
[
idx
].
extend
(
mrope_positions
[
idx
])
else
:
input_positions
.
extend
(
list
(
range
(
computed_len
,
seq_len
)))
mm_data
=
seq_group_metadata
.
multi_modal_data
if
mm_data
:
mm_kwargs
=
self
.
multi_modal_input_mapper
(
mm_data
)
multi_modal_inputs_list
.
append
(
mm_kwargs
)
# Compute the slot mapping.
block_table
=
seq_group_metadata
.
block_tables
[
seq_id
]
# Mask the [0, start_idx) tokens of the prompt with _PAD_SLOT_ID,
...
...
@@ -192,12 +244,18 @@ class CPUModelRunner(ModelRunnerBase[CPUModelInput]):
slot
=
block_number
*
self
.
block_size
+
block_offset
slot_mapping
.
append
(
slot
)
if
any
(
input_mrope_positions
):
input_positions
=
None
# type: ignore
else
:
input_mrope_positions
=
None
# type: ignore
num_prompt_tokens
=
len
(
input_tokens
)
input_tokens
=
torch
.
tensor
(
input_tokens
,
dtype
=
torch
.
long
,
device
=
self
.
device
)
# type: ignore
input_positions
=
torch
.
tensor
(
input_positions
,
input_positions
=
torch
.
tensor
(
input_positions
or
input_mrope_positions
,
dtype
=
torch
.
long
,
device
=
self
.
device
)
# type: ignore
slot_mapping
=
torch
.
tensor
(
slot_mapping
,
...
...
@@ -228,6 +286,7 @@ class CPUModelRunner(ModelRunnerBase[CPUModelInput]):
assert
len
(
seq_group_metadata_list
)
>
0
input_tokens
:
List
[
int
]
=
[]
input_positions
:
List
[
int
]
=
[]
input_mrope_positions
:
List
[
List
[
int
]]
=
[[]
for
_
in
range
(
3
)]
slot_mapping
:
List
[
int
]
=
[]
seq_lens
:
List
[
int
]
=
[]
block_tables
:
List
[
List
[
int
]]
=
[]
...
...
@@ -245,6 +304,16 @@ class CPUModelRunner(ModelRunnerBase[CPUModelInput]):
seq_len
=
seq_data
.
get_len
()
position
=
seq_len
-
1
if
seq_data
.
mrope_position_delta
is
not
None
:
context_len
=
seq_data
.
get_num_computed_tokens
()
next_pos
=
MRotaryEmbedding
.
get_next_input_positions
(
seq_data
.
mrope_position_delta
,
context_len
,
seq_len
,
)
for
idx
in
range
(
3
):
input_mrope_positions
[
idx
].
extend
(
next_pos
[
idx
])
else
:
input_positions
.
append
(
position
)
seq_len
=
seq_len
if
self
.
sliding_window
is
None
else
min
(
...
...
@@ -263,12 +332,18 @@ class CPUModelRunner(ModelRunnerBase[CPUModelInput]):
block_table
=
block_table
[
-
sliding_window_blocks
:]
block_tables
.
append
(
block_table
)
if
any
(
input_mrope_positions
):
input_positions
=
None
# type: ignore
else
:
input_mrope_positions
=
None
# type: ignore
max_decode_seq_len
=
max
(
seq_lens
)
input_tokens
=
torch
.
tensor
(
input_tokens
,
dtype
=
torch
.
long
,
device
=
self
.
device
)
input_positions
=
torch
.
tensor
(
input_positions
,
input_positions
=
torch
.
tensor
(
input_positions
or
input_mrope_positions
,
dtype
=
torch
.
long
,
device
=
self
.
device
)
slot_mapping
=
torch
.
tensor
(
slot_mapping
,
...
...
@@ -302,56 +377,139 @@ class CPUModelRunner(ModelRunnerBase[CPUModelInput]):
attn_metadata
,
)
class
CPUModelRunner
(
ModelRunnerBase
[
ModelInputForCPU
]):
_model_input_cls
:
Type
[
ModelInputForCPUWithSamplingMetadata
]
=
(
ModelInputForCPUWithSamplingMetadata
)
_builder_cls
:
Type
[
ModelInputForCPUBuilder
]
=
ModelInputForCPUBuilder
def
__init__
(
self
,
model_config
:
ModelConfig
,
parallel_config
:
ParallelConfig
,
scheduler_config
:
SchedulerConfig
,
device_config
:
DeviceConfig
,
cache_config
:
CacheConfig
,
load_config
:
LoadConfig
,
lora_config
:
Optional
[
LoRAConfig
],
kv_cache_dtype
:
Optional
[
str
]
=
"auto"
,
prompt_adapter_config
:
Optional
[
PromptAdapterConfig
]
=
None
,
is_driver_worker
:
bool
=
False
,
*
args
,
**
kwargs
,
):
self
.
model_config
=
model_config
self
.
parallel_config
=
parallel_config
self
.
scheduler_config
=
scheduler_config
# Currently, CPU worker doesn't support chunked prefill.
assert
self
.
scheduler_config
.
chunked_prefill_enabled
is
False
self
.
device_config
=
device_config
self
.
cache_config
=
cache_config
self
.
lora_config
=
lora_config
self
.
prompt_adapter_config
=
prompt_adapter_config
self
.
load_config
=
load_config
self
.
is_driver_worker
=
is_driver_worker
self
.
device
=
self
.
device_config
.
device
self
.
kv_cache_dtype
=
kv_cache_dtype
self
.
sliding_window
=
model_config
.
get_sliding_window
()
self
.
block_size
=
cache_config
.
block_size
self
.
attn_backend
=
get_attn_backend
(
self
.
model_config
.
get_num_attention_heads
(
self
.
parallel_config
),
self
.
model_config
.
get_head_size
(),
self
.
model_config
.
get_num_kv_heads
(
self
.
parallel_config
),
self
.
model_config
.
get_sliding_window
(),
self
.
model_config
.
dtype
,
self
.
kv_cache_dtype
,
self
.
block_size
,
)
# Multi-modal data support
self
.
mm_registry
=
MULTIMODAL_REGISTRY
self
.
multi_modal_input_mapper
=
self
.
mm_registry
\
.
create_input_mapper
(
self
.
model_config
)
self
.
mm_registry
.
init_mm_limits_per_prompt
(
self
.
model_config
)
# Lazy initialization.
self
.
model
:
nn
.
Module
# Set after init_Model
if
self
.
model_config
.
is_encoder_decoder_model
:
raise
NotImplementedError
(
STR_NOT_IMPL_ENC_DEC_ERR_STRS
[
'STR_NOT_IMPL_ENC_DEC_CPU'
])
@
property
def
model_is_mrope
(
self
)
->
bool
:
"""Detect if the model has "mrope" rope_scaling type.
mrope requires keep "rope_deltas" between prompt and decoding phases."""
rope_scaling
=
getattr
(
self
.
model_config
.
hf_config
,
"rope_scaling"
,
{})
if
rope_scaling
is
None
:
return
False
return
rope_scaling
.
get
(
"type"
,
None
)
==
"mrope"
def
load_model
(
self
)
->
None
:
self
.
model
=
get_model
(
model_config
=
self
.
model_config
,
load_config
=
self
.
load_config
,
device_config
=
self
.
device_config
,
lora_config
=
self
.
lora_config
,
parallel_config
=
self
.
parallel_config
,
scheduler_config
=
self
.
scheduler_config
,
cache_config
=
self
.
cache_config
)
def
make_model_input_from_broadcasted_tensor_dict
(
self
,
tensor_dict
:
Dict
[
str
,
Any
],
)
->
CPU
ModelInput
:
return
CPU
ModelInput
.
from_broadcasted_tensor_dict
(
)
->
ModelInput
ForCPU
:
return
ModelInput
ForCPU
.
from_broadcasted_tensor_dict
(
tensor_dict
,
attn_backend
=
self
.
attn_backend
,
)
def
_prepare_model_input_tensors
(
self
,
seq_group_metadata_list
:
List
[
SequenceGroupMetadata
],
finished_requests_ids
:
Optional
[
List
[
str
]]
=
None
)
->
ModelInputForCPUWithSamplingMetadata
:
"""Helper method to prepare the model input based on a given sequence
group. Prepares metadata needed for the base model forward pass but not
metadata for possible additional steps, e.g., sampling.
"""
builder
=
self
.
_builder_cls
(
weakref
.
proxy
(
self
),
finished_requests_ids
)
for
seq_group_metadata
in
seq_group_metadata_list
:
builder
.
add_seq_group
(
seq_group_metadata
)
return
builder
.
build
()
# type: ignore
def
prepare_model_input
(
self
,
seq_group_metadata_list
:
List
[
SequenceGroupMetadata
],
virtual_engine
:
int
=
0
,
finished_requests_ids
:
Optional
[
List
[
str
]]
=
None
)
->
CPUModelInput
:
multi_modal_kwargs
=
None
# NOTE: We assume that all sequences in the group are all prompts or
# all decodes.
is_prompt
=
seq_group_metadata_list
[
0
].
is_prompt
# Prepare input tensors.
if
is_prompt
:
(
input_tokens
,
input_positions
,
attn_metadata
,
seq_lens
,
multi_modal_kwargs
)
=
self
.
_prepare_prompt
(
seq_group_metadata_list
)
else
:
(
input_tokens
,
input_positions
,
attn_metadata
)
=
self
.
_prepare_decode
(
seq_group_metadata_list
)
seq_lens
=
[]
sampling_metadata
=
SamplingMetadata
.
prepare
(
seq_group_metadata_list
,
seq_lens
,
# query_lens is not needed if chunked prefill is not
# supported. Since CPU worker doesn't support chunked prefill
# just use seq_lens instead.
seq_lens
,
)
->
ModelInputForCPUWithSamplingMetadata
:
"""Prepare the model input based on a given sequence group, including
metadata for the sampling step.
"""
model_input
=
self
.
_prepare_model_input_tensors
(
seq_group_metadata_list
,
finished_requests_ids
)
# Sampling metadata is only required for the final pp group
generators
=
self
.
get_generators
(
finished_requests_ids
)
sampling_metadata
=
SamplingMetadata
.
prepare
(
seq_group_metadata_list
,
model_input
.
seq_lens
,
model_input
.
query_lens
,
self
.
device
,
pin_memory
=
False
,
generators
=
self
.
get_generators
(
finished_requests_ids
))
return
CPUModelInput
(
input_tokens
=
input_tokens
,
input_positions
=
input_positions
,
attn_metadata
=
attn_metadata
,
generators
=
generators
)
return
dataclasses
.
replace
(
model_input
,
sampling_metadata
=
sampling_metadata
,
multi_modal_kwargs
=
multi_modal_kwargs
,
)
virtual_engine
=
virtual_engine
)
@
torch
.
no_grad
()
def
execute_model
(
self
,
model_input
:
CPU
ModelInput
,
model_input
:
ModelInput
ForCPUWithSamplingMetadata
,
kv_caches
:
List
[
torch
.
Tensor
],
intermediate_tensors
:
Optional
[
IntermediateTensors
]
=
None
,
num_steps
:
int
=
1
,
...
...
@@ -372,6 +530,8 @@ class CPUModelRunner(ModelRunnerBase[CPUModelInput]):
model_input
.
attn_metadata
,
**
MultiModalInputs
.
as_kwargs
(
model_input
.
multi_modal_kwargs
or
{},
device
=
self
.
device
),
"intermediate_tensors"
:
intermediate_tensors
,
}
hidden_states
=
model_executable
(
**
execute_model_kwargs
)
...
...
vllm/worker/enc_dec_model_runner.py
View file @
539aa992
import
dataclasses
import
itertools
from
typing
import
Any
,
Dict
,
List
,
Optional
,
Tuple
,
Type
,
cast
import
torch
...
...
@@ -17,14 +18,16 @@ from vllm.inputs import INPUT_REGISTRY, InputRegistry
from
vllm.logger
import
init_logger
from
vllm.model_executor
import
SamplingMetadata
from
vllm.model_executor.layers.sampler
import
SamplerOutput
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
,
MultiModalRegistry
from
vllm.multimodal
import
(
MULTIMODAL_REGISTRY
,
MultiModalInputs
,
MultiModalRegistry
)
from
vllm.sampling_params
import
SamplingParams
from
vllm.sequence
import
(
IntermediateTensors
,
PoolerOutput
,
SequenceGroupMetadata
)
from
vllm.utils
import
STR_NOT_IMPL_ENC_DEC_BACKEND
,
make_tensor_with_pad
from
vllm.worker.model_runner
import
(
GPUModelRunnerBase
,
ModelInputForGPUBuilder
,
ModelInputForGPUWithSamplingMetadata
)
ModelInputForGPUWithSamplingMetadata
,
_get_graph_batch_size
)
from
vllm.worker.model_runner_base
import
(
_add_attn_metadata_broadcastable_dict
,
_add_sampling_metadata_broadcastable_dict
)
...
...
@@ -50,6 +53,7 @@ class EncoderDecoderModelInput(ModelInputForGPUWithSamplingMetadata):
"virtual_engine"
:
self
.
virtual_engine
,
"request_ids_to_seq_ids"
:
self
.
request_ids_to_seq_ids
,
"finished_requests_ids"
:
self
.
finished_requests_ids
,
"multi_modal_kwargs"
:
self
.
multi_modal_kwargs
,
}
_add_attn_metadata_broadcastable_dict
(
tensor_dict
,
self
.
attn_metadata
)
_add_sampling_metadata_broadcastable_dict
(
tensor_dict
,
...
...
@@ -178,12 +182,22 @@ class EncoderDecoderModelRunner(GPUModelRunnerBase[EncoderDecoderModelInput]):
raise
ValueError
(
"num_steps > 1 is not supported in "
"EncoderDecoderModelRunner"
)
if
(
model_input
.
attn_metadata
is
not
None
and
model_input
.
attn_metadata
.
prefill_metadata
is
None
and
model_input
.
attn_metadata
.
decode_metadata
.
use_cuda_graph
):
assert
model_input
.
input_tokens
is
not
None
graph_batch_size
=
model_input
.
input_tokens
.
shape
[
0
]
model_executable
=
self
.
graph_runners
[
model_input
.
virtual_engine
][
graph_batch_size
]
else
:
model_executable
=
self
.
model
seqlen_agnostic_kwargs
=
{
"finished_requests_ids"
:
model_input
.
finished_requests_ids
,
"request_ids_to_seq_ids"
:
model_input
.
request_ids_to_seq_ids
,
}
if
self
.
has_seqlen_agnostic
else
{}
multi_modal_kwargs
=
model_input
.
multi_modal_kwargs
or
{}
hidden_or_intermediate_states
=
model_executable
(
input_ids
=
model_input
.
input_tokens
,
positions
=
model_input
.
input_positions
,
...
...
@@ -192,6 +206,8 @@ class EncoderDecoderModelRunner(GPUModelRunnerBase[EncoderDecoderModelInput]):
kv_caches
=
kv_caches
,
attn_metadata
=
model_input
.
attn_metadata
,
intermediate_tensors
=
intermediate_tensors
,
**
MultiModalInputs
.
as_kwargs
(
multi_modal_kwargs
,
device
=
self
.
device
),
**
seqlen_agnostic_kwargs
)
logits
=
self
.
model
.
compute_logits
(
hidden_or_intermediate_states
,
...
...
@@ -200,6 +216,9 @@ class EncoderDecoderModelRunner(GPUModelRunnerBase[EncoderDecoderModelInput]):
if
not
self
.
is_driver_worker
:
return
[]
if
model_input
.
async_callback
is
not
None
:
model_input
.
async_callback
()
# Sample the next token.
output
:
SamplerOutput
=
self
.
model
.
sample
(
logits
=
logits
,
...
...
@@ -231,14 +250,12 @@ class EncoderDecoderModelRunner(GPUModelRunnerBase[EncoderDecoderModelInput]):
"""
model_input
=
self
.
_prepare_model_input_tensors
(
seq_group_metadata_list
,
finished_requests_ids
)
(
attn_metadata
,
encoder_input_tokens_tensor
,
encoder_input_positions_tensor
,
)
=
(
self
.
_prepare_encoder_model_input_tensors
(
seq_group_metadata_list
,
model_input
))
# Inject attn_metadata encoder/cross-attention fields &
# encoder input tokens/positions into model_input.
# Frozen dataclass fields cannot be modified, so use
...
...
@@ -277,8 +294,7 @@ class EncoderDecoderModelRunner(GPUModelRunnerBase[EncoderDecoderModelInput]):
max_mm_tokens
=
self
.
mm_registry
.
get_max_multimodal_tokens
(
self
.
model_config
)
if
max_mm_tokens
>
0
:
raise
NotImplementedError
(
"Multi-modal encoder-decoder models are not supported yet"
)
logger
.
info
(
"Starting profile run for multi-modal models."
)
batch_size
=
0
for
group_id
in
range
(
max_num_seqs
):
...
...
@@ -286,24 +302,39 @@ class EncoderDecoderModelRunner(GPUModelRunnerBase[EncoderDecoderModelInput]):
(
group_id
<
max_num_batched_tokens
%
max_num_seqs
))
batch_size
+=
seq_len
seq_data
,
_
=
self
.
input_registry
\
.
dummy_data_for_profiling
(
self
.
model_config
,
decoder_seq_data
,
decoder_dummy_multi_modal_data
\
=
self
.
input_registry
.
dummy_data_for_profiling
(
self
.
model_config
,
seq_len
,
self
.
mm_registry
,
is_encoder_data
=
False
)
encoder_seq_data
,
encoder_dummy_multi_modal_data
\
=
self
.
input_registry
.
dummy_data_for_profiling
(
self
.
model_config
,
seq_len
,
self
.
mm_registry
)
self
.
mm_registry
,
is_encoder_data
=
True
)
# Having more tokens is over-conservative but otherwise fine
assert
len
(
seq_data
.
prompt_token_ids
)
>=
seq_len
,
(
assert
len
(
decoder_
seq_data
.
prompt_token_ids
)
>=
seq_len
,
(
f
"Expected at least
{
seq_len
}
dummy tokens for profiling, "
f
"but got:
{
len
(
seq_data
.
prompt_token_ids
)
}
"
)
f
"but got:
{
len
(
decoder_seq_data
.
prompt_token_ids
)
}
"
)
assert
decoder_dummy_multi_modal_data
is
None
or
\
encoder_dummy_multi_modal_data
is
None
,
(
"Multi-modal data can't be provided in both encoder and decoder"
)
seq
=
SequenceGroupMetadata
(
request_id
=
str
(
group_id
),
is_prompt
=
True
,
seq_data
=
{
group_id
:
seq_data
},
seq_data
=
{
group_id
:
decoder_
seq_data
},
sampling_params
=
sampling_params
,
block_tables
=
None
,
encoder_seq_data
=
seq_data
,
encoder_seq_data
=
encoder_
seq_data
,
cross_block_table
=
None
,
multi_modal_data
=
decoder_dummy_multi_modal_data
or
encoder_dummy_multi_modal_data
,
)
seqs
.
append
(
seq
)
...
...
@@ -424,24 +455,42 @@ class EncoderDecoderModelRunner(GPUModelRunnerBase[EncoderDecoderModelInput]):
encoder_input_tokens_tensor
=
self
.
_empty_long_tensor
()
encoder_input_positions_tensor
=
self
.
_empty_long_tensor
()
cross_slot_mapping_tensor
=
self
.
_empty_long_tensor
()
# Extract cross-attention block tables &
# seq len from each sequence group metadata.
# Cross-attention block tables are empty
# during vLLM memory profiling.
cross_block_tables
=
[]
for
seq_group_metadata
in
seq_group_metadata_list
:
for
_
in
range
(
len
(
seq_group_metadata
.
seq_data
)):
encoder_seq_lens
.
append
(
seq_group_metadata
.
encoder_seq_data
.
get_len
())
cross_block_table
=
seq_group_metadata
.
cross_block_table
cross_block_tables
.
append
([]
if
(
cross_block_table
is
None
)
else
cross_block_table
)
# Convert cross-attention block tables to encoder input tensor
if
(
model_input
.
attn_metadata
is
not
None
and
model_input
.
attn_metadata
.
use_cuda_graph
):
# We will be using CUDA graph replay for this decode.
max_len_of_block_table
=
self
.
get_max_block_per_batch
()
batch_size
=
len
(
encoder_seq_lens
)
graph_batch_size
=
_get_graph_batch_size
(
batch_size
)
assert
graph_batch_size
>=
batch_size
cuda_graph_pad_size
=
graph_batch_size
-
batch_size
# extend the cross_block_tables and encoder_seq_lens to match
# the graph_batch_size.
cross_block_tables
.
extend
([[]
for
_
in
range
(
cuda_graph_pad_size
)
])
encoder_seq_lens
.
extend
(
itertools
.
repeat
(
1
,
cuda_graph_pad_size
))
else
:
max_len_of_block_table
=
max
(
len
(
block_table
)
for
block_table
in
cross_block_tables
)
cross_block_tables
=
make_tensor_with_pad
(
cross_block_tables
,
max_len
=
max
(
len
(
block_table
)
for
block_table
in
cross_block_tables
),
max_len
=
max_len_of_block_table
,
pad
=
0
,
dtype
=
torch
.
int32
,
device
=
self
.
device
,
...
...
vllm/worker/model_runner.py
View file @
539aa992
...
...
@@ -45,7 +45,7 @@ from vllm.prompt_adapter.worker_manager import (
LRUCacheWorkerPromptAdapterManager
)
from
vllm.sampling_params
import
SamplingParams
from
vllm.sequence
import
IntermediateTensors
,
SequenceGroupMetadata
from
vllm.utils
import
(
Cuda
MemoryProfiler
,
PyObjectCache
,
async_tensor_h2d
,
from
vllm.utils
import
(
Device
MemoryProfiler
,
PyObjectCache
,
async_tensor_h2d
,
flatten_2d_lists
,
is_hip
,
is_pin_memory_available
,
supports_dynamo
)
from
vllm.worker.model_runner_base
import
(
...
...
@@ -243,6 +243,7 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
prefix_cache_hit
:
bool
=
False
,
reinit
:
bool
=
False
,
reinit_use_defaults
:
bool
=
False
,
encoder_seq_len
:
int
=
0
,
):
if
reinit
:
assert
len
(
self
.
seq_ids
)
==
len
(
seq_ids
)
# type: ignore
...
...
@@ -256,6 +257,7 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
self
.
block_tables
=
block_tables
self
.
computed_block_nums
=
computed_block_nums
self
.
n_seqs
=
n_seqs
self
.
encoder_seq_len
=
encoder_seq_len
if
reinit
:
if
len
(
self
.
seq_ids
)
==
1
and
reinit_use_defaults
:
...
...
@@ -702,6 +704,11 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
assert
n_seqs
==
1
self
.
decode_only
=
False
encoder_seq_len
=
0
if
self
.
runner
.
model_config
.
is_encoder_decoder_model
:
encoder_seq_len
=
seq_group_metadata
.
encoder_seq_data
.
get_len
()
inter_data
=
self
.
init_cached_inter_data
(
request_id
=
seq_group_metadata
.
request_id
,
seq_ids
=
seq_ids
,
...
...
@@ -709,7 +716,8 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
block_tables
=
seq_group_metadata
.
block_tables
,
computed_block_nums
=
seq_group_metadata
.
computed_block_nums
,
reinit
=
True
,
reinit_use_defaults
=
True
)
reinit_use_defaults
=
True
,
encoder_seq_len
=
encoder_seq_len
)
self
.
inter_data_list
.
append
(
inter_data
)
...
...
@@ -719,11 +727,15 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
for
per_seq_group_fn
in
self
.
per_seq_group_compute_fns
:
per_seq_group_fn
(
inter_data
,
seq_group_metadata
)
def
_use_captured_graph
(
self
,
batch_size
:
int
,
max_decode_seq_len
:
int
)
->
bool
:
def
_use_captured_graph
(
self
,
batch_size
:
int
,
max_decode_seq_len
:
int
,
max_encoder_seq_len
:
int
=
0
)
->
bool
:
return
(
self
.
decode_only
and
not
self
.
runner
.
model_config
.
enforce_eager
and
batch_size
<=
self
.
runner
.
max_batchsize_to_capture
and
max_decode_seq_len
<=
self
.
runner
.
max_seq_len_to_capture
)
and
batch_size
<=
_BATCH_SIZES_TO_CAPTURE
[
-
1
]
and
max_decode_seq_len
<=
self
.
runner
.
max_seq_len_to_capture
and
max_encoder_seq_len
<=
self
.
runner
.
max_seq_len_to_capture
and
batch_size
<=
self
.
runner
.
max_batchsize_to_capture
)
def
build
(
self
)
->
ModelInputForGPU
:
"""Finalize the builder intermediate data and
...
...
@@ -763,15 +775,18 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
input_positions
.
extend
(
cur_input_positions
)
seq_lens
=
[]
query_lens
=
[]
max_decode_seq_len
=
0
max_encoder_seq_len
=
0
for
inter_data
in
self
.
inter_data_list
:
seq_lens
.
extend
(
inter_data
.
seq_lens
)
query_lens
.
extend
(
inter_data
.
query_lens
)
if
not
inter_data
.
is_prompt
:
max_decode_seq_len
=
max
(
max_decode_seq_len
,
max
(
inter_data
.
seq_lens
))
query_lens
=
[]
for
inter_data
in
self
.
inter_data_list
:
query_lens
.
extend
(
inter_data
.
query
_len
s
)
if
self
.
runner
.
model_config
.
is_encoder_decoder_model
:
max_encoder_seq_len
=
max
(
max_encoder_seq_len
,
inter_data
.
encoder_seq
_len
)
# Mapping from request IDs to sequence IDs. Used for Jamba models
# that manages the cache by itself.
...
...
@@ -781,8 +796,10 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
}
batch_size
=
len
(
input_tokens
)
use_captured_graph
=
self
.
_use_captured_graph
(
batch_size
,
max_decode_seq_len
)
use_captured_graph
=
self
.
_use_captured_graph
(
batch_size
,
max_decode_seq_len
,
max_encoder_seq_len
=
max_encoder_seq_len
)
# If cuda graph can be used, pad tensors accordingly.
# See `capture_model` API for more details.
...
...
@@ -995,7 +1012,7 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
def
load_model
(
self
)
->
None
:
logger
.
info
(
"Starting to load model %s..."
,
self
.
model_config
.
model
)
with
Cuda
MemoryProfiler
()
as
m
:
with
Device
MemoryProfiler
()
as
m
:
self
.
model
=
get_model
(
model_config
=
self
.
model_config
,
device_config
=
self
.
device_config
,
load_config
=
self
.
load_config
,
...
...
@@ -1064,8 +1081,9 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
"This may lead to less accurate results!"
)
if
envs
.
VLLM_TEST_DYNAMO_GRAPH_CAPTURE
and
supports_dynamo
():
from
vllm.compilation.backends
import
vllm_backend
from
vllm.plugins
import
get_torch_compile_backend
backend
=
get_torch_compile_backend
()
or
"eager"
backend
=
get_torch_compile_backend
()
or
vllm_backend
self
.
model
=
torch
.
compile
(
self
.
model
,
fullgraph
=
envs
.
VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE
,
...
...
@@ -1363,7 +1381,9 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
for
batch_size
in
reversed
(
batch_size_capture_list
):
attn_metadata
=
(
self
.
attn_state
.
graph_capture_get_metadata_for_batch
(
batch_size
))
batch_size
,
is_encoder_decoder_model
=
self
.
model_config
.
is_encoder_decoder_model
))
if
self
.
lora_config
:
lora_mapping
=
LoRAMapping
(
...
...
@@ -1379,10 +1399,10 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
)
self
.
set_active_prompt_adapters
(
set
(),
prompt_adapter_mapping
)
graph_runner
=
CUDAGraphRunner
(
self
.
model
,
self
.
attn_backend
.
get_name
(),
self
.
attn_state
.
graph_clone
(
batch_size
))
self
.
attn_state
.
graph_clone
(
batch_size
),
self
.
model_config
.
is_encoder_decoder_model
)
capture_inputs
=
{
"input_ids"
:
...
...
@@ -1419,6 +1439,12 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
self
.
model
.
get_seqlen_agnostic_capture_inputs
(
batch_size
)
})
if
self
.
model_config
.
is_encoder_decoder_model
:
# add the additional inputs to capture for
# encoder-decoder models.
self
.
_update_inputs_to_capture_for_enc_dec_model
(
capture_inputs
)
graph_runner
.
capture
(
**
capture_inputs
)
self
.
graph_memory_pool
=
graph_runner
.
graph
.
pool
()
self
.
graph_runners
[
virtual_engine
][
batch_size
]
=
(
...
...
@@ -1429,6 +1455,24 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
# This usually takes < 10 seconds.
logger
.
info
(
"Graph capturing finished in %.0f secs."
,
elapsed_time
)
def
_update_inputs_to_capture_for_enc_dec_model
(
self
,
capture_inputs
:
Dict
[
str
,
Any
]):
"""
Updates the set of input tensors needed for CUDA graph capture in an
encoder-decoder model.
This method modifies the provided `capture_inputs` dictionary by
adding tensors specific to encoder-decoder specific models that
need to be captured for CUDA Graph replay.
"""
# During the decode phase encoder_input_ids and encoder_positions are
# unset. Do the same thing for graph capture.
capture_inputs
[
"encoder_input_ids"
]
=
torch
.
tensor
(
[],
dtype
=
torch
.
long
).
cuda
()
capture_inputs
[
"encoder_positions"
]
=
torch
.
tensor
(
[],
dtype
=
torch
.
long
).
cuda
()
@
property
def
vocab_size
(
self
)
->
int
:
return
self
.
model_config
.
get_vocab_size
()
...
...
@@ -1628,7 +1672,7 @@ class ModelRunner(GPUModelRunnerBase[ModelInputForGPUWithSamplingMetadata]):
class
CUDAGraphRunner
:
def
__init__
(
self
,
model
:
nn
.
Module
,
backend_name
:
str
,
attn_state
:
AttentionState
):
attn_state
:
AttentionState
,
is_encoder_decoder_model
:
bool
):
self
.
model
=
model
self
.
backend_name
=
backend_name
self
.
attn_state
=
attn_state
...
...
@@ -1637,6 +1681,7 @@ class CUDAGraphRunner:
self
.
output_buffers
:
Dict
[
str
,
torch
.
Tensor
]
=
{}
self
.
_graph
:
Optional
[
torch
.
cuda
.
CUDAGraph
]
=
None
self
.
_is_encoder_decoder_model
=
is_encoder_decoder_model
@
property
def
graph
(
self
):
...
...
@@ -1670,8 +1715,9 @@ class CUDAGraphRunner:
intermediate_tensors
=
intermediate_inputs
,
**
kwargs
,
)
# Wait for the warm up operations to finish before proceeding with
# Graph Capture.
torch
.
cuda
.
synchronize
()
# Capture the graph.
self
.
_graph
=
torch
.
cuda
.
CUDAGraph
()
with
torch
.
cuda
.
graph
(
self
.
_graph
,
pool
=
memory_pool
,
stream
=
stream
):
...
...
@@ -1703,10 +1749,14 @@ class CUDAGraphRunner:
# Save the input and output buffers.
self
.
input_buffers
=
{
"input_ids"
:
input_ids
,
"positions"
:
positions
,
"kv_caches"
:
kv_caches
,
**
self
.
attn_state
.
get_graph_input_buffers
(
attn_metadata
),
"input_ids"
:
input_ids
,
"positions"
:
positions
,
"kv_caches"
:
kv_caches
,
**
self
.
attn_state
.
get_graph_input_buffers
(
attn_metadata
,
self
.
_is_encoder_decoder_model
),
**
kwargs
,
}
if
intermediate_inputs
is
not
None
:
...
...
@@ -1736,8 +1786,8 @@ class CUDAGraphRunner:
self
.
input_buffers
[
"positions"
].
copy_
(
positions
,
non_blocking
=
True
)
self
.
input_buffers
[
"slot_mapping"
].
copy_
(
attn_metadata
.
slot_mapping
,
non_blocking
=
True
)
self
.
attn_state
.
prepare_graph_input_buffers
(
self
.
input_buffers
,
attn_metadata
)
self
.
attn_state
.
prepare_graph_input_buffers
(
self
.
input_buffers
,
attn_metadata
,
self
.
_is_encoder_decoder_model
)
if
"seqlen_agnostic_capture_inputs"
in
self
.
input_buffers
:
self
.
model
.
copy_inputs_before_cuda_graphs
(
self
.
input_buffers
,
**
kwargs
)
...
...
@@ -1751,6 +1801,12 @@ class CUDAGraphRunner:
if
key
!=
"model_execute_time"
and
key
!=
"model_forward_time"
:
self
.
input_buffers
[
key
].
copy_
(
intermediate_tensors
[
key
],
non_blocking
=
True
)
if
self
.
_is_encoder_decoder_model
:
self
.
input_buffers
[
"encoder_input_ids"
].
copy_
(
kwargs
[
'encoder_input_ids'
],
non_blocking
=
True
)
self
.
input_buffers
[
"encoder_positions"
].
copy_
(
kwargs
[
'encoder_positions'
],
non_blocking
=
True
)
# Run the graph.
self
.
graph
.
replay
()
# Return the output tensor.
...
...
vllm/worker/model_runner_base.py
View file @
539aa992
...
...
@@ -3,11 +3,13 @@ import pickle
from
abc
import
ABC
,
abstractmethod
from
datetime
import
datetime
from
functools
import
wraps
from
typing
import
(
TYPE_CHECKING
,
Any
,
Dict
,
Generic
,
List
,
Optional
,
Type
,
TypeVar
)
from
typing
import
(
TYPE_CHECKING
,
Any
,
Dict
,
Generic
,
Iterable
,
List
,
Optional
,
Type
,
TypeVar
)
import
torch
from
torch
import
is_tensor
from
vllm.logger
import
init_logger
from
vllm.model_executor.layers.sampler
import
SamplerOutput
from
vllm.platforms
import
current_platform
from
vllm.sequence
import
IntermediateTensors
,
SequenceGroupMetadata
...
...
@@ -17,6 +19,8 @@ if TYPE_CHECKING:
from
vllm.attention.backends.abstract
import
AttentionBackend
from
vllm.model_executor
import
SamplingMetadata
logger
=
init_logger
(
__name__
)
T
=
TypeVar
(
'T'
,
bound
=
"BroadcastableModelInput"
)
...
...
@@ -113,6 +117,8 @@ def dump_input_when_exception(exclude_args: Optional[List[int]] = None,
except
Exception
as
err
:
timestamp
=
datetime
.
now
().
strftime
(
"%Y%m%d-%H%M%S"
)
filename
=
f
"/tmp/err_
{
func
.
__name__
}
_input_
{
timestamp
}
.pkl"
logger
.
info
(
"Writing input of failed execution to %s..."
,
filename
)
with
open
(
filename
,
"wb"
)
as
filep
:
dumped_inputs
=
{
k
:
v
...
...
@@ -122,7 +128,27 @@ def dump_input_when_exception(exclude_args: Optional[List[int]] = None,
for
i
,
arg
in
enumerate
(
args
):
if
i
not
in
(
exclude_args
or
[]):
dumped_inputs
[
f
"arg_
{
i
}
"
]
=
arg
# Only persist dtype and shape for kvcache tensors
# (can be way too big otherwise)
if
(
kv_caches
:
=
dumped_inputs
.
get
(
"kv_caches"
))
\
and
isinstance
(
kv_caches
,
Iterable
):
dumped_inputs
[
"kv_caches"
]
=
[(
t
.
dtype
,
t
.
shape
)
for
t
in
kv_caches
if
is_tensor
(
t
)]
try
:
pickle
.
dump
(
dumped_inputs
,
filep
)
except
Exception
as
pickle_err
:
logger
.
warning
(
"Failed to pickle inputs of failed execution: %s"
,
str
(
pickle_err
))
raise
type
(
err
)(
f
"Error in model execution: "
f
"
{
str
(
err
)
}
"
)
from
err
logger
.
info
(
"Completed writing input of failed execution to %s."
,
filename
)
raise
type
(
err
)(
f
"Error in model execution (input dumped to
{
filename
}
): "
f
"
{
str
(
err
)
}
"
)
from
err
...
...
vllm/worker/multi_step_model_runner.py
View file @
539aa992
...
...
@@ -29,7 +29,7 @@ if TYPE_CHECKING:
logger
=
init_logger
(
__name__
)
MULTI_STEP_ATTENTION_BACKENDS
=
[
"flash-attn"
,
"flashinfer"
]
MULTI_STEP_ATTENTION_BACKENDS
=
[
"flash-attn"
,
"rocm-flash-attn"
,
"flashinfer"
]
def
seq_output_builder
():
...
...
@@ -614,34 +614,66 @@ def _pythonize_sampler_output(
frozen_model_input
=
model_input
.
frozen_model_input
assert
frozen_model_input
.
sampling_metadata
is
not
None
sampling_metadata
=
frozen_model_input
.
sampling_metadata
# samples generation should have been skipped
assert
not
output
.
outputs
pinned_buffer
=
pinned_sampled_token_buffer
[:
model_input
.
num_queries
]
# We guarantee output tensors are ready, so it is safe to
# pythonize the sampler output & obtain CPU-side logprobs.
#
# However we should check whether logprobs pythonization may
# be skipped entirely, i.e. because no logprobs were requested
# or pythonization was not deferred. To that end,
#
# * `prompt_logprobs_are_requested_for_prefill` signals that
# there are *any* prefill-phase requests which specify that
# prompt logprobs should be returned.
#
# * `any_logprobs_are_requested` signals that there are any
# requests which (1) specify that sample logprobs should be
# returned, or (2) are in the prefill phase AND specify that
# prompt logprobs should be returned.
#
# Later on, these flags cause adjustments to the pythonization
# process to accommodate logprobs.
seq_groups
=
sampling_metadata
.
seq_groups
prompt_logprobs_are_requested_for_prefill
=
any
([
sg
.
sampling_params
.
prompt_logprobs
is
not
None
and
sg
.
is_prompt
for
sg
in
seq_groups
])
any_logprobs_are_requested
=
(
prompt_logprobs_are_requested_for_prefill
or
any
([
sg
.
sampling_params
.
logprobs
is
not
None
for
sg
in
seq_groups
]))
if
prompt_logprobs_are_requested_for_prefill
:
# CPU GPU sync, after gathering *only* sampled tokens (since
# requesting prompt logprobs leads `sampled_token_ids` to
# include prompt token ids in addition to sampled token ids.)
sample_idx_tensor
=
torch
.
tensor
(
[
sdx
for
sg
in
seq_groups
for
sdx
in
sg
.
sample_indices
])
pinned_buffer
=
pinned_buffer
.
copy_
(
sampled_token_ids
[
sample_idx_tensor
,
:],
non_blocking
=
False
)
else
:
# CPU GPU sync
pinned_buffer
=
pinned_buffer
.
copy_
(
sampled_token_ids
,
non_blocking
=
False
)
pinned_buffer
=
pinned_buffer
.
copy_
(
sampled_token_ids
,
non_blocking
=
False
)
# this will not block as the tensors are already on CPU
samples_list
=
pinned_buffer
.
tolist
()
sampling_metadata
=
frozen_model_input
.
sampling_metadata
skip_sampler_cpu_output
=
(
frozen_model_input
.
sampling_metadata
.
skip_sampler_cpu_output
)
# We are guaranteed output tensors are ready, so it is safe to
# pythonize the sampler output & obtain CPU-side logprobs.
#
# However this computation may be skipped entirely
# if no pythonization was deferred.
seq_groups
=
sampling_metadata
.
seq_groups
logprobs_are_requested
=
any
([
sg
.
sampling_params
.
logprobs
is
not
None
or
sg
.
sampling_params
.
prompt_logprobs
is
not
None
for
sg
in
seq_groups
])
# *Don't* skip logprobs pythonization *if*:
# * Any requests require logprobs to be returned in this
# iteration AND
# * These requests are being scheduled in a fashion which
# defers pythonization (i.e. multi-step scheduling.)
do_pythonize_logprobs
=
(
skip_sampler_cpu_output
and
logprobs_are_requested
)
and
any_
logprobs_are_requested
)
(
prompt_logprobs
,
sample_logprobs
,
...
...
@@ -666,7 +698,7 @@ def _pythonize_sampler_output(
prompt_logprobs
[
sgdx
],
sample_logprobs
[
sgdx
],
)
elif
logprobs_are_requested
:
elif
any_
logprobs_are_requested
:
(
group_prompt_logprobs
,
group_sample_logprobs
,
...
...
@@ -696,7 +728,7 @@ def _pythonize_sampler_output(
seq_output
.
parent_seq_id
=
seq_ids
[
parent_id
]
seq_output
.
output_token
=
next_token_id
if
logprobs_are_requested
:
if
any_
logprobs_are_requested
:
seq_output
.
logprobs
=
group_sample_logprobs
[
tdx
]
else
:
logprobs
=
next
(
iter
(
seq_output
.
logprobs
.
values
()))
...
...
@@ -714,7 +746,7 @@ def _pythonize_sampler_output(
seq_outputs
.
append
(
SequenceOutput
(
seq_ids
[
parent_id
],
next_token_id
,
(
group_sample_logprobs
[
tdx
]
if
logprobs_are_requested
else
{
if
any_
logprobs_are_requested
else
{
next_token_id
:
Logprob
(
logprob
=
float
(
'inf'
),
rank
=
None
,
...
...
@@ -722,12 +754,12 @@ def _pythonize_sampler_output(
})))
if
cache
is
not
None
:
completion_seq_group_output
.
prompt_logprobs
=
\
group_prompt_logprobs
if
logprobs_are_requested
else
None
group_prompt_logprobs
if
any_
logprobs_are_requested
else
None
output
.
outputs
.
append
(
completion_seq_group_output
)
else
:
output
.
outputs
.
append
(
CompletionSequenceGroupOutput
(
seq_outputs
,
(
group_prompt_logprobs
if
logprobs_are_requested
else
None
)))
if
any_
logprobs_are_requested
else
None
)))
assert
len
(
output
.
outputs
)
>
0
vllm/worker/multi_step_tpu_worker.py
0 → 100644
View file @
539aa992
import
dataclasses
from
typing
import
Dict
,
Optional
,
Tuple
import
torch
from
vllm.distributed
import
broadcast_tensor_dict
from
vllm.sequence
import
ExecuteModelRequest
from
vllm.worker.tpu_model_runner
import
ModelInputForTPU
from
vllm.worker.tpu_worker
import
TPUWorker
from
vllm.worker.worker_base
import
WorkerInput
class MultiStepTPUWorker(TPUWorker):
    """TPU worker that prepares inputs for multi-step execution.

    The model input built on the first step of a multi-step round is kept in
    `cached_model_input`; later steps reuse it and only refresh the
    `is_first_multi_step` / `is_last_step` flags.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Model input of the current multi-step round. Stays None until a
        # first step has been prepared (driver) or received (non-driver).
        self.cached_model_input: Optional[ModelInputForTPU] = None

    def _get_driver_input_and_broadcast(
        self, execute_model_req: ExecuteModelRequest
    ) -> Tuple[ModelInputForTPU, WorkerInput, Dict[str, torch.Tensor]]:
        """Build the driver's inputs for one step and broadcast them.

        On the first step the worker input and model input are prepared from
        `execute_model_req`; on any other step the cached model input is
        reused together with a default-constructed `WorkerInput`. When
        `do_metadata_broadcast` is set, the first step broadcasts the full
        worker/model input dicts and every other step broadcasts only the
        two step flags.

        Returns:
            `(model_input, worker_input, {})`.
        """
        assert self.is_driver_worker
        assert execute_model_req.virtual_engine == 0

        first_step = execute_model_req.is_first_multi_step
        last_step = execute_model_req.is_last_step

        model_input: ModelInputForTPU
        worker_input: WorkerInput
        if not first_step:
            # Later steps of the round: reuse what the first step produced.
            assert self.cached_model_input is not None
            model_input = self.cached_model_input
            worker_input = WorkerInput()
        else:
            worker_input = dataclasses.replace(
                self.prepare_worker_input(
                    execute_model_req=execute_model_req),
                num_steps=execute_model_req.num_lookahead_slots + 1)
            model_input = self.model_runner.prepare_model_input(
                execute_model_req.seq_group_metadata_list,
                execute_model_req.virtual_engine,
                execute_model_req.finished_requests_ids)

            callback = execute_model_req.async_callback
            if callback:
                model_input = dataclasses.replace(model_input,
                                                  async_callback=callback)

        # Stamp the current step flags onto the (possibly cached) input.
        model_input = dataclasses.replace(model_input,
                                          is_first_multi_step=first_step,
                                          is_last_step=last_step)

        if self.do_metadata_broadcast:
            if first_step:
                payload = worker_input.as_broadcastable_tensor_dict()
                payload.update(model_input.as_broadcastable_tensor_dict())
            else:
                # Exactly two keys: `prepare_input` on the receiving side
                # recognizes a non-first step by this dict's length.
                payload = {
                    "is_first_multi_step": first_step,
                    "is_last_step": last_step,
                }
            broadcast_tensor_dict(payload, src=0)

        # Returning an empty dict here to keep this compatible with
        # `LocalOrDistributedWorkerBase._get_driver_input_and_broadcast`
        return model_input, worker_input, {}

    def prepare_input(
        self,
        execute_model_req: Optional[ExecuteModelRequest] = None,
    ) -> Optional[Tuple[ModelInputForTPU, WorkerInput, Dict[str,
                                                            torch.Tensor]]]:
        """Produce the inputs for one step on this worker.

        Non-driver workers take their inputs from the broadcast with
        `src=0`; the driver builds them from `execute_model_req`.

        Returns:
            `(model_input, worker_input, {})`, or None when the driver is
            called without a request or a non-driver receives an empty
            broadcast.
        """
        if not self.is_driver_worker:
            received = broadcast_tensor_dict(src=0)
            if not received:
                return None

            if len(received) != 2:
                # Full payload: rebuild both inputs and start a new cache.
                worker_input = WorkerInput.from_broadcasted_tensor_dict(
                    received)
                model_input = (
                    self.model_runner.
                    make_model_input_from_broadcasted_tensor_dict(received))
                self.cached_model_input = model_input
                return model_input, worker_input, {}

            # Two-entry payload: only the step flags were sent, so refresh
            # them on the cached model input.
            assert self.cached_model_input is not None
            self.cached_model_input = dataclasses.replace(
                self.cached_model_input,
                is_first_multi_step=received["is_first_multi_step"],
                is_last_step=received["is_last_step"])
            return self.cached_model_input, WorkerInput(), {}

        if execute_model_req is None:
            if self.do_metadata_broadcast:
                # Empty broadcast; receivers return None when they get it.
                broadcast_tensor_dict({}, src=0)
            return None

        model_input, worker_input, _ = self._get_driver_input_and_broadcast(
            execute_model_req)
        if model_input.is_first_multi_step:
            self.cached_model_input = model_input
        return model_input, worker_input, {}
vllm/worker/tpu_model_runner.py
View file @
539aa992
...
...
@@ -51,6 +51,8 @@ class ModelInputForTPU(ModelRunnerInputBase):
num_samples
:
int
best_of
:
List
[
int
]
seq_groups
:
List
[
List
[
int
]]
is_first_multi_step
:
bool
=
True
is_last_step
:
bool
=
True
virtual_engine
:
int
=
0
async_callback
:
Optional
[
Callable
]
=
None
...
...
@@ -65,6 +67,8 @@ class ModelInputForTPU(ModelRunnerInputBase):
"num_samples"
:
self
.
num_samples
,
"best_of"
:
self
.
best_of
,
"seq_groups"
:
self
.
seq_groups
,
"is_first_multi_step"
:
self
.
is_first_multi_step
,
"is_last_step"
:
self
.
is_last_step
,
"virtual_engine"
:
self
.
virtual_engine
,
}
_add_attn_metadata_broadcastable_dict
(
tensor_dict
,
self
.
attn_metadata
)
...
...
@@ -118,6 +122,7 @@ class TPUModelRunner(ModelRunnerBase[ModelInputForTPU]):
self
.
block_size
,
False
,
)
self
.
cached_step_outputs
:
List
[
torch
.
Tensor
]
=
[]
def
load_model
(
self
)
->
None
:
self
.
device
=
self
.
device_config
.
device
...
...
@@ -518,97 +523,159 @@ class TPUModelRunner(ModelRunnerBase[ModelInputForTPU]):
num_steps
:
int
=
1
,
)
->
List
[
SamplerOutput
]:
assert
intermediate_tensors
is
None
if
num_steps
>
1
:
raise
ValueError
(
"TPUModelRunner does not support multi-step execution."
)
def
_execute_model
(
*
args
):
"""Move input args from CPU to device and execute the model."""
new_args
=
[]
for
arg
in
args
:
if
isinstance
(
arg
,
torch
.
Tensor
):
arg
=
arg
.
to
(
self
.
device
)
elif
isinstance
(
arg
,
AttentionMetadata
):
arg
.
slot_mapping
=
arg
.
slot_mapping
.
to
(
self
.
device
)
if
getattr
(
arg
,
"block_tables"
,
None
)
is
not
None
:
arg
.
block_tables
=
arg
.
block_tables
.
to
(
self
.
device
)
if
getattr
(
arg
,
"context_lens"
,
None
)
is
not
None
:
arg
.
context_lens
=
arg
.
context_lens
.
to
(
self
.
device
)
new_args
.
append
(
arg
)
return
self
.
model
(
*
new_args
,
is_prompt
=
is_prompt
)
num_prefills
=
model_input
.
attn_metadata
.
num_prefills
is_prompt
=
num_prefills
>
0
if
not
model_input
.
is_first_multi_step
:
if
not
model_input
.
is_last_step
:
return
[]
use_async_out_proc
=
model_input
.
async_callback
is
not
None
sampler_outputs
=
[]
num_outputs
=
len
(
self
.
cached_step_outputs
)
for
i
in
range
(
num_outputs
):
next_token_ids
=
self
.
cached_step_outputs
.
pop
(
0
)
next_token_ids
=
next_token_ids
.
cpu
().
tolist
()
sampler_output
=
_make_decode_output
(
next_token_ids
,
model_input
.
seq_groups
)
sampler_outputs
.
append
(
sampler_output
)
if
i
<
num_outputs
-
1
and
use_async_out_proc
:
assert
model_input
.
async_callback
is
not
None
ctx
=
model_input
.
async_callback
.
keywords
[
# type: ignore
"ctx"
]
ctx
.
append_output
(
outputs
=
[
sampler_output
],
seq_group_metadata_list
=
ctx
.
seq_group_metadata_list
,
scheduler_outputs
=
ctx
.
scheduler_outputs
,
is_async
=
False
,
is_last_step
=
False
)
model_input
.
async_callback
()
if
use_async_out_proc
:
return
[
sampler_outputs
[
-
1
]]
else
:
return
sampler_outputs
is_prompt
=
model_input
.
attn_metadata
.
num_prefills
>
0
if
is_prompt
:
assert
num_steps
==
1
# NOTE(woosuk): Since the FlashAttention kernel does not support
# ragged inputs, we split the prompts into different batches and
# process them separately. This is a temporary hack that should be
# optimized by using SplashAttention.
next_token_ids
=
[]
orig_slot_mapping
=
model_input
.
attn_metadata
.
slot_mapping
batch_size
=
model_input
.
input_lens
.
shape
[
0
]
start_idx
=
0
next_token_ids
=
[]
for
i
in
range
(
batch_size
):
# Get the actual prefill_len.
prefill_len
=
model_input
.
input_lens
[
i
:
i
+
1
].
item
()
prefill_len
=
_get_padded_prefill_len
(
prefill_len
)
end_idx
=
start_idx
+
prefill_len
model_input
.
attn_metadata
.
slot_mapping
=
orig_slot_mapping
[
None
,
start_idx
:
end_idx
]
model_input
.
attn_metadata
.
num_prefills
=
1
output_token_ids
=
_execute_model
(
model_input
.
token_ids
[
None
,
start_idx
:
end_idx
],
model_input
.
position_ids
[
None
,
start_idx
:
end_idx
],
model_input
.
attn_metadata
,
model_input
.
input_lens
[
i
:
i
+
1
],
model_input
.
t
[
i
:
i
+
1
],
model_input
.
p
[
i
:
i
+
1
],
model_input
.
num_samples
,
kv_caches
)
if
i
==
0
and
model_input
.
async_callback
is
not
None
:
model_input
.
async_callback
()
# Retrieve the outputs to CPU.
next_token_ids
+=
output_token_ids
.
cpu
().
tolist
()
token_ids
=
model_input
.
token_ids
[
None
,
start_idx
:
end_idx
].
to
(
self
.
device
)
position_ids
=
model_input
.
position_ids
[
None
,
start_idx
:
end_idx
].
to
(
self
.
device
)
attn_metadata
=
model_input
.
attn_metadata
attn_metadata
.
num_prefills
=
1
attn_metadata
.
slot_mapping
=
orig_slot_mapping
[
None
,
start_idx
:
end_idx
].
to
(
self
.
device
)
input_lens
=
model_input
.
input_lens
[
i
:
i
+
1
].
to
(
self
.
device
)
t
=
model_input
.
t
[
i
:
i
+
1
].
to
(
self
.
device
)
p
=
model_input
.
p
[
i
:
i
+
1
].
to
(
self
.
device
)
output_token_ids
=
self
.
model
(
token_ids
,
position_ids
,
attn_metadata
,
input_lens
,
t
,
p
,
model_input
.
num_samples
,
kv_caches
,
is_prompt
=
True
)
next_token_ids
.
append
(
output_token_ids
[
0
])
start_idx
=
end_idx
else
:
# Execute the model.
output_token_ids
=
_execute_model
(
model_input
.
token_ids
,
model_input
.
position_ids
,
model_input
.
attn_metadata
,
model_input
.
input_lens
,
model_input
.
t
,
model_input
.
p
,
model_input
.
num_samples
,
kv_caches
)
if
model_input
.
async_callback
is
not
None
:
model_input
.
async_callback
()
# Retrieve the outputs to CPU.
next_token_ids
=
output_token_ids
.
cpu
().
tolist
()
next_token_ids
=
[
output_token_ids
.
cpu
().
tolist
()
for
output_token_ids
in
next_token_ids
]
# NOTE(woosuk): Minimal code to construct the sampler outputs.
# The TPU backend does not reuse the sampler, since the TPU backend
# does not support
the
advanced sampling parameters such as logprobs.
# does not support advanced sampling parameters such as logprobs.
zero_logprob
=
Logprob
(
0.0
)
batch_idx
=
0
sampler_outputs
=
[]
for
seq_group
in
model_input
.
seq_groups
:
for
i
,
seq_group
in
enumerate
(
model_input
.
seq_groups
)
:
seq_ids
=
seq_group
seq_outputs
=
[]
if
is_prompt
:
assert
len
(
seq_ids
)
==
1
seq_id
=
seq_ids
[
0
]
for
i
in
range
(
model_input
.
best_of
[
batch_idx
]):
next_token_id
=
next_token_ids
[
batch_idx
][
i
]
seq_outputs
.
append
(
SequenceOutput
(
seq_id
,
next_token_id
,
{
next_token_id
:
zero_logprob
}))
batch_idx
+=
1
else
:
for
seq_id
in
seq_ids
:
next_token_id
=
next_token_ids
[
batch_idx
]
seq_outputs
=
[]
for
j
in
range
(
model_input
.
best_of
[
i
]):
next_token_id
=
next_token_ids
[
i
][
j
]
seq_outputs
.
append
(
SequenceOutput
(
seq_id
,
next_token_id
,
{
next_token_id
:
zero_logprob
}))
batch_idx
+=
1
sampler_outputs
.
append
(
CompletionSequenceGroupOutput
(
seq_outputs
,
None
))
return
[
SamplerOutput
(
sampler_outputs
)]
else
:
token_ids
=
model_input
.
token_ids
.
to
(
self
.
device
)
position_ids
=
model_input
.
position_ids
.
to
(
self
.
device
)
attn_metadata
=
model_input
.
attn_metadata
attn_metadata
.
slot_mapping
=
attn_metadata
.
slot_mapping
.
to
(
self
.
device
)
attn_metadata
.
block_tables
=
attn_metadata
.
block_tables
.
to
(
self
.
device
)
attn_metadata
.
context_lens
=
attn_metadata
.
context_lens
.
to
(
self
.
device
)
t
=
model_input
.
t
.
to
(
self
.
device
)
p
=
model_input
.
p
.
to
(
self
.
device
)
input_lens
=
model_input
.
input_lens
.
to
(
self
.
device
)
for
i
in
range
(
num_steps
):
slot_mapping
=
attn_metadata
.
slot_mapping
output_token_ids
=
self
.
model
(
token_ids
,
position_ids
,
attn_metadata
,
input_lens
,
t
,
p
,
model_input
.
num_samples
,
kv_caches
,
is_prompt
=
False
)
self
.
cached_step_outputs
.
append
(
output_token_ids
)
if
i
<
num_steps
-
1
:
# Prepare the inputs for the next step.
token_ids
=
output_token_ids
.
unsqueeze
(
dim
=
1
).
int
()
position_ids
=
position_ids
+
1
attn_metadata
.
context_lens
=
attn_metadata
.
context_lens
+
1
block_tables
=
attn_metadata
.
block_tables
block_number
=
block_tables
.
gather
(
1
,
position_ids
.
long
()
//
self
.
block_size
)
block_offset
=
position_ids
%
self
.
block_size
is_padding
=
slot_mapping
==
_PAD_SLOT_ID
slot_mapping
=
block_number
*
self
.
block_size
+
block_offset
slot_mapping
=
slot_mapping
.
long
()
slot_mapping
=
torch
.
where
(
is_padding
,
_PAD_SLOT_ID
,
slot_mapping
)
attn_metadata
.
slot_mapping
=
slot_mapping
if
model_input
.
async_callback
is
not
None
:
model_input
.
async_callback
()
if
num_steps
>
1
:
return
[]
# Retrieve the outputs to CPU.
next_token_ids
=
self
.
cached_step_outputs
.
pop
(
0
)
next_token_ids
=
next_token_ids
.
cpu
().
tolist
()
sampler_output
=
_make_decode_output
(
next_token_ids
,
model_input
.
seq_groups
)
return
[
sampler_output
]
class
ModelWrapper
(
TorchCompileWrapperWithCustomDispatcher
):
...
...
@@ -756,3 +823,24 @@ def _apply_top_p(logits: torch.Tensor, p: torch.Tensor) -> torch.Tensor:
cutoff_logit
=
torch
.
gather
(
logits_sorted
,
-
1
,
cutoff_index
)
logits
=
logits
.
masked_fill_
(
logits
<
cutoff_logit
,
-
float
(
"inf"
))
return
logits
def _make_decode_output(
    next_token_ids: List[int],
    seq_groups: List[List[int]],
) -> SamplerOutput:
    """Wrap one decode step's flat token ids into a `SamplerOutput`.

    Sequence ids are visited group by group, in order; the k-th sequence id
    visited is paired with `next_token_ids[k]`. Each resulting
    `SequenceOutput` carries a single logprob entry mapping its token id to
    a shared `Logprob(0.0)`, and each group is emitted as a
    `CompletionSequenceGroupOutput` whose second argument is None.
    """
    zero_logprob = Logprob(0.0)
    group_outputs = []
    # Index into `next_token_ids` of the first sequence of the current group.
    offset = 0
    for seq_ids in seq_groups:
        seq_outputs = []
        for pos, seq_id in enumerate(seq_ids, start=offset):
            token_id = next_token_ids[pos]
            seq_outputs.append(
                SequenceOutput(seq_id, token_id, {token_id: zero_logprob}))
        offset += len(seq_outputs)
        group_outputs.append(CompletionSequenceGroupOutput(seq_outputs, None))
    return SamplerOutput(group_outputs)
Prev
1
…
15
16
17
18
19
20
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment