Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
7c4f76e3
Commit
7c4f76e3
authored
Apr 15, 2024
by
zhuwenwen
Browse files
merge v0.4.0
parents
2da0dd3e
51c31bc1
Changes
332
Hide whitespace changes
Inline
Side-by-side
Showing
12 changed files
with
1476 additions
and
644 deletions
+1476
-644
vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py
...ransformers_utils/tokenizer_group/base_tokenizer_group.py
+55
-0
vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py
...transformers_utils/tokenizer_group/ray_tokenizer_group.py
+166
-0
vllm/transformers_utils/tokenizer_group/tokenizer_group.py
vllm/transformers_utils/tokenizer_group/tokenizer_group.py
+78
-0
vllm/transformers_utils/tokenizers/baichuan.py
vllm/transformers_utils/tokenizers/baichuan.py
+42
-50
vllm/usage/__init__.py
vllm/usage/__init__.py
+0
-0
vllm/usage/usage_lib.py
vllm/usage/usage_lib.py
+207
-0
vllm/utils.py
vllm/utils.py
+159
-30
vllm/worker/cache_engine.py
vllm/worker/cache_engine.py
+32
-104
vllm/worker/model_runner.py
vllm/worker/model_runner.py
+381
-262
vllm/worker/neuron_model_runner.py
vllm/worker/neuron_model_runner.py
+285
-0
vllm/worker/neuron_worker.py
vllm/worker/neuron_worker.py
+12
-151
vllm/worker/worker.py
vllm/worker/worker.py
+59
-47
No files found.
vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py
0 → 100644
View file @
7c4f76e3
from
abc
import
ABC
,
abstractmethod
from
typing
import
List
,
Optional
from
transformers
import
PreTrainedTokenizer
from
vllm.lora.request
import
LoRARequest
class BaseTokenizerGroup(ABC):
    """Abstract interface for a group of tokenizers usable with LoRA adapters.

    Implementations provide a liveness check, prompt encoding and tokenizer
    lookup. Encoding and tokenizer lookup each come in a blocking and an
    ``async`` flavour, and every lookup accepts an optional LoRA request.
    """

    @abstractmethod
    def ping(self) -> bool:
        """Check if the tokenizer group is alive."""

    @abstractmethod
    def get_max_input_len(
            self,
            lora_request: Optional[LoRARequest] = None) -> Optional[int]:
        """Get the maximum input length for the LoRA request."""

    @abstractmethod
    def encode(self,
               prompt: str,
               request_id: Optional[str] = None,
               lora_request: Optional[LoRARequest] = None) -> List[int]:
        """Encode a prompt into token ids using the tokenizer group."""

    @abstractmethod
    async def encode_async(
            self,
            prompt: str,
            request_id: Optional[str] = None,
            lora_request: Optional[LoRARequest] = None) -> List[int]:
        """Coroutine variant of :meth:`encode`."""

    @abstractmethod
    def get_lora_tokenizer(
            self,
            lora_request: Optional[LoRARequest] = None
    ) -> "PreTrainedTokenizer":
        """Get a tokenizer for a LoRA request."""

    @abstractmethod
    async def get_lora_tokenizer_async(
            self,
            lora_request: Optional[LoRARequest] = None
    ) -> "PreTrainedTokenizer":
        """Coroutine variant of :meth:`get_lora_tokenizer`."""
vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py
0 → 100644
View file @
7c4f76e3
import
asyncio
import
os
from
typing
import
List
,
Optional
from
ray.util.scheduling_strategies
import
NodeAffinitySchedulingStrategy
from
transformers
import
PreTrainedTokenizer
from
vllm.config
import
TokenizerPoolConfig
from
vllm.engine.ray_utils
import
ray
from
vllm.lora.request
import
LoRARequest
from
vllm.transformers_utils.tokenizer_group.base_tokenizer_group
import
(
BaseTokenizerGroup
)
from
vllm.transformers_utils.tokenizer_group.tokenizer_group
import
(
TokenizerGroup
)
class RayTokenizerGroupPool(BaseTokenizerGroup):
    """A Ray-based pool of TokenizerGroups for async tokenization.

    The pool owns ``num_actors`` Ray actors built from ``_worker_cls`` plus
    one in-process instance of the same class. Encoding is dispatched to the
    actors; max-input-length and tokenizer lookups are answered by the
    in-process instance.
    """

    # Class to use for workers making up the pool.
    _worker_cls = TokenizerGroup

    @classmethod
    def from_config(cls, tokenizer_pool_config: TokenizerPoolConfig,
                    **init_kwargs) -> "RayTokenizerGroupPool":
        """Build a pool from a ``TokenizerPoolConfig``.

        Args:
            tokenizer_pool_config: Supplies ``pool_size`` (number of actors)
                and ``extra_config`` (Ray actor options; ``{"num_cpus": 0}``
                is used when it is empty or ``None``).
            **init_kwargs: Remaining keyword arguments for ``__init__``.

        Returns:
            The constructed pool.
        """
        # Work on a copy so the defaults added below do not leak back into
        # the caller's config object.
        ray_actor_options = dict(tokenizer_pool_config.extra_config
                                 or {"num_cpus": 0})
        ray_actor_options.setdefault(
            "scheduling_strategy",
            NodeAffinitySchedulingStrategy(
                node_id=ray.get_runtime_context().get_node_id(), soft=True))

        # Carry over the env vars to the actors.
        # This is necessary for API keys and such.
        # The runtime_env is copied as well because the helper rewrites its
        # "env_vars" entry in place.
        runtime_env = dict(ray_actor_options.get("runtime_env") or {})
        _carry_over_env_vars_to_runtime_env(runtime_env)
        ray_actor_options["runtime_env"] = runtime_env

        init_kwargs["num_actors"] = tokenizer_pool_config.pool_size
        init_kwargs["ray_actor_options"] = ray_actor_options

        return cls(**init_kwargs)

    def __init__(self, tokenizer_id: str, enable_lora: bool, max_num_seqs: int,
                 max_input_length: Optional[int], num_actors: int,
                 ray_actor_options: dict, **tokenizer_config):
        """Create the local tokenizer group and the remote actors.

        Args:
            tokenizer_id: Forwarded to ``_worker_cls``.
            enable_lora: Forwarded to ``_worker_cls``.
            max_num_seqs: Forwarded to ``_worker_cls``.
            max_input_length: Forwarded to ``_worker_cls``.
            num_actors: Number of Ray actors to start.
            ray_actor_options: Keyword options applied to the Ray actor class.
            **tokenizer_config: Extra tokenizer options, forwarded to both the
                local instance and every actor.
        """
        # Store a local copy of the TokenizerGroup for quick access
        # to underlying HF tokenizers.
        # It receives the same tokenizer_config as the actors so that local
        # lookups and remote encoding are configured identically.
        self._local_tokenizer_group = self._worker_cls(
            tokenizer_id=tokenizer_id,
            enable_lora=enable_lora,
            max_num_seqs=max_num_seqs,
            max_input_length=max_input_length,
            **tokenizer_config,
        )

        ray_tokenizer_group_cls = ray.remote(
            self._worker_cls).options(**ray_actor_options)
        self.tokenizer_actors = [
            ray_tokenizer_group_cls.remote(tokenizer_id, enable_lora,
                                           max_num_seqs, max_input_length,
                                           **tokenizer_config)
            for _ in range(num_actors)
        ]
        # Created on first use by _ensure_queue_initialized().
        self._idle_actors: Optional[asyncio.Queue] = None

    @property
    def pool_size(self) -> int:
        """Number of tokenizer actors in the pool."""
        return len(self.tokenizer_actors)

    def ping(self):
        """Ping every actor and return the list of their replies.

        Blocks until all actors have answered.
        """
        return ray.get(
            [actor.ping.remote() for actor in self.tokenizer_actors])

    def _ensure_queue_initialized(self):
        """Create the idle-actor queue on first use and fill it."""
        if self._idle_actors is None:
            self._idle_actors = asyncio.Queue()
            for actor in self.tokenizer_actors:
                self._idle_actors.put_nowait(actor)

    def encode(self,
               prompt: str,
               request_id: Optional[str] = None,
               lora_request: Optional[LoRARequest] = None) -> List[int]:
        """Encode a prompt using the tokenizer group.

        We pick an idle actor and use it to encode the prompt.
        The actor is then put back in the queue for future use.
        This is blocking.

        Raises:
            RuntimeError: If no actor is idle.
        """
        self._ensure_queue_initialized()

        if self._idle_actors.empty():
            raise RuntimeError("No idle actors available.")
        actor = self._idle_actors.get_nowait()
        try:
            ret = ray.get(
                actor.encode.remote(request_id=request_id,
                                    prompt=prompt,
                                    lora_request=lora_request))
        finally:
            # Put the actor back in the queue.
            # This is done in a finally block to ensure that the actor is
            # always put back in the queue, even if an exception/cancellation
            # is raised.
            self._idle_actors.put_nowait(actor)
        return ret

    async def encode_async(
            self,
            prompt: str,
            request_id: Optional[str] = None,
            lora_request: Optional[LoRARequest] = None) -> List[int]:
        """Encode a prompt using the tokenizer group.

        We pick an idle actor and use it to encode the prompt.
        If there are no idle actors, we wait until one becomes
        available.
        The actor is then put back in the queue for future use.
        This is non-blocking.
        """
        self._ensure_queue_initialized()

        actor = await self._idle_actors.get()
        try:
            ret = await actor.encode.remote(request_id=request_id,
                                            prompt=prompt,
                                            lora_request=lora_request)
        finally:
            # Put the actor back in the queue.
            # This is done in a finally block to ensure that the actor is
            # always put back in the queue, even if an exception/cancellation
            # is raised.
            self._idle_actors.put_nowait(actor)
        return ret

    def get_max_input_len(self,
                          lora_request: Optional[LoRARequest] = None
                          ) -> Optional[int]:
        """Get the maximum input length for the LoRA request."""
        return self._local_tokenizer_group.get_max_input_len(lora_request)

    def get_lora_tokenizer(
            self,
            lora_request: Optional[LoRARequest] = None
    ) -> "PreTrainedTokenizer":
        """Return the tokenizer for ``lora_request`` from the local group."""
        return self._local_tokenizer_group.get_lora_tokenizer(lora_request)

    async def get_lora_tokenizer_async(
            self,
            lora_request: Optional[LoRARequest] = None
    ) -> "PreTrainedTokenizer":
        """Coroutine variant of :meth:`get_lora_tokenizer`."""
        return await self._local_tokenizer_group.get_lora_tokenizer_async(
            lora_request)
def _carry_over_env_vars_to_runtime_env(runtime_env: dict) -> None:
    """Merge the current process environment into ``runtime_env``.

    After the call ``runtime_env["env_vars"]`` holds a copy of
    ``os.environ`` overlaid with whatever ``env_vars`` the caller had
    already set, so the caller's entries win on conflict.

    ``runtime_env`` is modified in place.
    """
    caller_overrides = runtime_env.setdefault("env_vars", {})
    merged = os.environ.copy()
    merged.update(caller_overrides)
    runtime_env["env_vars"] = merged
vllm/transformers_utils/tokenizer_group/tokenizer_group.py
0 → 100644
View file @
7c4f76e3
from
typing
import
List
,
Optional
from
transformers
import
PreTrainedTokenizer
from
vllm.lora.request
import
LoRARequest
from
vllm.transformers_utils.tokenizer
import
(
get_lora_tokenizer
,
get_lora_tokenizer_async
,
get_tokenizer
)
from
vllm.transformers_utils.tokenizer_group.base_tokenizer_group
import
(
BaseTokenizerGroup
)
from
vllm.utils
import
LRUCache
class TokenizerGroup(BaseTokenizerGroup):
    """In-process tokenizer group: one base tokenizer plus cached LoRA ones.

    When LoRA is enabled, per-adapter tokenizers are kept in an ``LRUCache``
    keyed by ``lora_int_id`` with capacity ``max_num_seqs``.
    """

    def __init__(self, tokenizer_id: str, enable_lora: bool, max_num_seqs: int,
                 max_input_length: Optional[int], **tokenizer_config):
        """Load the base tokenizer and set up the LoRA tokenizer cache.

        Args:
            tokenizer_id: Passed to ``get_tokenizer``.
            enable_lora: Whether per-adapter tokenizers are looked up at all.
            max_num_seqs: Capacity of the LoRA tokenizer cache.
            max_input_length: Value reported by ``get_max_input_len``.
            **tokenizer_config: Extra keyword arguments for tokenizer loading.
        """
        self.tokenizer_id = tokenizer_id
        self.tokenizer_config = tokenizer_config
        self.enable_lora = enable_lora
        self.max_input_length = max_input_length
        self.tokenizer = get_tokenizer(self.tokenizer_id, **tokenizer_config)
        if enable_lora:
            self.lora_tokenizers = LRUCache[PreTrainedTokenizer](
                capacity=max_num_seqs)
        else:
            self.lora_tokenizers = None

    def ping(self) -> bool:
        """Check if the tokenizer group is alive (always ``True`` here)."""
        return True

    def get_max_input_len(self,
                          lora_request: Optional[LoRARequest] = None
                          ) -> Optional[int]:
        """Return the configured maximum input length.

        ``lora_request`` is accepted for interface compatibility and is not
        consulted.
        """
        return self.max_input_length

    def encode(self,
               prompt: str,
               request_id: Optional[str] = None,
               lora_request: Optional[LoRARequest] = None) -> List[int]:
        """Encode ``prompt`` with the tokenizer selected for ``lora_request``.

        ``request_id`` is accepted for interface compatibility and unused.
        """
        return self.get_lora_tokenizer(lora_request).encode(prompt)

    async def encode_async(
            self,
            prompt: str,
            request_id: Optional[str] = None,
            lora_request: Optional[LoRARequest] = None) -> List[int]:
        """Coroutine variant of :meth:`encode`."""
        selected = await self.get_lora_tokenizer_async(lora_request)
        return selected.encode(prompt)

    def get_lora_tokenizer(
            self,
            lora_request: Optional[LoRARequest] = None
    ) -> "PreTrainedTokenizer":
        """Return the tokenizer to use for ``lora_request``.

        Without a request, or with LoRA disabled, this is the base tokenizer.
        Otherwise the adapter's tokenizer is served from the cache, loading
        and caching it on first use; if loading yields nothing, the base
        tokenizer is cached for that adapter instead.
        """
        if not (lora_request and self.enable_lora):
            return self.tokenizer

        lora_id = lora_request.lora_int_id
        if lora_id in self.lora_tokenizers:
            return self.lora_tokenizers.get(lora_id)

        loaded = get_lora_tokenizer(lora_request, **self.tokenizer_config)
        tokenizer = loaded or self.tokenizer
        self.lora_tokenizers.put(lora_id, tokenizer)
        return tokenizer

    async def get_lora_tokenizer_async(
            self,
            lora_request: Optional[LoRARequest] = None
    ) -> "PreTrainedTokenizer":
        """Coroutine variant of :meth:`get_lora_tokenizer`."""
        if not (lora_request and self.enable_lora):
            return self.tokenizer

        lora_id = lora_request.lora_int_id
        if lora_id in self.lora_tokenizers:
            return self.lora_tokenizers.get(lora_id)

        loaded = await get_lora_tokenizer_async(lora_request,
                                                **self.tokenizer_config)
        tokenizer = loaded or self.tokenizer
        self.lora_tokenizers.put(lora_id, tokenizer)
        return tokenizer
vllm/transformers_utils/tokenizers/baichuan.py
View file @
7c4f76e3
# yapf: disable
# Adapted from
# https://huggingface.co/baichuan-inc/Baichuan2-13B-Chat/blob/8f6e343d545c503b91429582231d1d354dac2740/tokenization_baichuan.py
# This includes a fix suggested in
...
...
@@ -13,7 +12,6 @@ import sentencepiece as spm
from
transformers.tokenization_utils
import
AddedToken
,
PreTrainedTokenizer
from
transformers.utils
import
logging
logger
=
logging
.
get_logger
(
__name__
)
VOCAB_FILES_NAMES
=
{
"vocab_file"
:
"tokenizer.model"
}
...
...
@@ -52,27 +50,16 @@ class BaichuanTokenizer(PreTrainedTokenizer):
clean_up_tokenization_spaces
=
False
,
**
kwargs
,
):
self
.
sp_model_kwargs
=
{}
if
sp_model_kwargs
is
None
else
sp_model_kwargs
bos_token
=
(
AddedToken
(
bos_token
,
lstrip
=
False
,
rstrip
=
False
)
if
isinstance
(
bos_token
,
str
)
else
bos_token
)
eos_token
=
(
AddedToken
(
eos_token
,
lstrip
=
False
,
rstrip
=
False
)
if
isinstance
(
eos_token
,
str
)
else
eos_token
)
unk_token
=
(
AddedToken
(
unk_token
,
lstrip
=
False
,
rstrip
=
False
)
if
isinstance
(
unk_token
,
str
)
else
unk_token
)
pad_token
=
(
AddedToken
(
pad_token
,
lstrip
=
False
,
rstrip
=
False
)
if
isinstance
(
pad_token
,
str
)
else
pad_token
)
self
.
sp_model_kwargs
=
({}
if
sp_model_kwargs
is
None
else
sp_model_kwargs
)
bos_token
=
(
AddedToken
(
bos_token
,
lstrip
=
False
,
rstrip
=
False
)
if
isinstance
(
bos_token
,
str
)
else
bos_token
)
eos_token
=
(
AddedToken
(
eos_token
,
lstrip
=
False
,
rstrip
=
False
)
if
isinstance
(
eos_token
,
str
)
else
eos_token
)
unk_token
=
(
AddedToken
(
unk_token
,
lstrip
=
False
,
rstrip
=
False
)
if
isinstance
(
unk_token
,
str
)
else
unk_token
)
pad_token
=
(
AddedToken
(
pad_token
,
lstrip
=
False
,
rstrip
=
False
)
if
isinstance
(
pad_token
,
str
)
else
pad_token
)
self
.
vocab_file
=
vocab_file
self
.
add_bos_token
=
add_bos_token
self
.
add_eos_token
=
add_eos_token
...
...
@@ -107,7 +94,10 @@ class BaichuanTokenizer(PreTrainedTokenizer):
def
get_vocab
(
self
):
"""Returns vocab as a dict"""
vocab
=
{
self
.
convert_ids_to_tokens
(
i
):
i
for
i
in
range
(
self
.
vocab_size
)}
vocab
=
{
self
.
convert_ids_to_tokens
(
i
):
i
for
i
in
range
(
self
.
vocab_size
)
}
vocab
.
update
(
self
.
added_tokens_encoder
)
return
vocab
...
...
@@ -130,7 +120,8 @@ class BaichuanTokenizer(PreTrainedTokenizer):
out_string
=
""
prev_is_special
=
False
for
i
,
token
in
enumerate
(
tokens
):
# make sure that special tokens are not decoded using sentencepiece model
# make sure that special tokens are not decoded using
# sentencepiece model
if
token
in
self
.
all_special_tokens
:
if
not
prev_is_special
and
i
!=
0
:
out_string
+=
" "
...
...
@@ -143,9 +134,9 @@ class BaichuanTokenizer(PreTrainedTokenizer):
out_string
+=
self
.
sp_model
.
decode
(
current_sub_tokens
)
return
out_string
def
save_vocabulary
(
self
,
save_directory
,
filename_prefix
:
Optional
[
str
]
=
None
)
->
Tuple
[
str
]:
def
save_vocabulary
(
self
,
save_directory
,
filename_prefix
:
Optional
[
str
]
=
None
)
->
Tuple
[
str
]:
"""
Save the vocabulary and special tokens file to a directory.
...
...
@@ -157,24 +148,24 @@ class BaichuanTokenizer(PreTrainedTokenizer):
`Tuple(str)`: Paths to the files saved.
"""
if
not
os
.
path
.
isdir
(
save_directory
):
logger
.
error
(
f
"Vocabulary path (
{
save_directory
}
) should be a directory"
)
logger
.
error
(
f
"Vocabulary path (
{
save_directory
}
) "
"should be a directory"
)
return
out_vocab_file
=
os
.
path
.
join
(
save_directory
,
(
filename_prefix
+
"-"
if
filename_prefix
else
""
)
+
VOCAB_FILES_NAMES
[
"vocab_file"
],
(
filename_prefix
+
"-"
if
filename_prefix
else
""
)
+
VOCAB_FILES_NAMES
[
"vocab_file"
],
)
if
os
.
path
.
abspath
(
self
.
vocab_file
)
!=
os
.
path
.
abspath
(
out_vocab_file
)
and
os
.
path
.
isfile
(
self
.
vocab_file
):
out_vocab_file
)
and
os
.
path
.
isfile
(
self
.
vocab_file
):
copyfile
(
self
.
vocab_file
,
out_vocab_file
)
elif
not
os
.
path
.
isfile
(
self
.
vocab_file
):
with
open
(
out_vocab_file
,
"wb"
)
as
fi
:
content_spiece_model
=
self
.
sp_model
.
serialized_model_proto
()
fi
.
write
(
content_spiece_model
)
return
(
out_vocab_file
,)
return
(
out_vocab_file
,
)
def
build_inputs_with_special_tokens
(
self
,
token_ids_0
,
token_ids_1
=
None
):
bos_token_id
=
[
self
.
bos_token_id
]
if
self
.
add_bos_token
else
[]
...
...
@@ -194,7 +185,8 @@ class BaichuanTokenizer(PreTrainedTokenizer):
already_has_special_tokens
:
bool
=
False
,
)
->
List
[
int
]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
Retrieve sequence ids from a token list that has no special tokens
added. This method is called when adding
special tokens using the tokenizer `prepare_for_model` method.
Args:
...
...
@@ -202,11 +194,14 @@ class BaichuanTokenizer(PreTrainedTokenizer):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
already_has_special_tokens (`bool`, *optional*, defaults to
`False`):
Whether or not the token list is already formatted with
special tokens for the model.
Returns:
`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
`List[int]`: A list of integers in the range [0, 1]:
1 for a special token, 0 for a sequence token.
"""
if
already_has_special_tokens
:
return
super
().
get_special_tokens_mask
(
...
...
@@ -220,20 +215,16 @@ class BaichuanTokenizer(PreTrainedTokenizer):
if
token_ids_1
is
None
:
return
bos_token_id
+
([
0
]
*
len
(
token_ids_0
))
+
eos_token_id
return
(
bos_token_id
+
([
0
]
*
len
(
token_ids_0
))
+
eos_token_id
+
bos_token_id
+
([
0
]
*
len
(
token_ids_1
))
+
eos_token_id
)
return
(
bos_token_id
+
([
0
]
*
len
(
token_ids_0
))
+
eos_token_id
+
bos_token_id
+
([
0
]
*
len
(
token_ids_1
))
+
eos_token_id
)
def
create_token_type_ids_from_sequences
(
self
,
token_ids_0
:
List
[
int
],
token_ids_1
:
Optional
[
List
[
int
]]
=
None
)
->
List
[
int
]:
self
,
token_ids_0
:
List
[
int
],
token_ids_1
:
Optional
[
List
[
int
]]
=
None
)
->
List
[
int
]:
"""
Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An ALBERT
Creates a mask from the two sequences passed to be used in a
sequence-pair classification task. An ALBERT
sequence pair mask has the following format:
```
...
...
@@ -250,7 +241,8 @@ class BaichuanTokenizer(PreTrainedTokenizer):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
`List[int]`: List of [token type IDs](../glossary#token-type-ids)
according to the given sequence(s).
"""
bos_token_id
=
[
self
.
bos_token_id
]
if
self
.
add_bos_token
else
[]
eos_token_id
=
[
self
.
eos_token_id
]
if
self
.
add_eos_token
else
[]
...
...
vllm/usage/__init__.py
0 → 100644
View file @
7c4f76e3
vllm/usage/usage_lib.py
0 → 100644
View file @
7c4f76e3
import
datetime
import
json
import
logging
import
os
import
platform
import
time
from
enum
import
Enum
from
pathlib
import
Path
from
threading
import
Thread
from
typing
import
Dict
,
Optional
from
uuid
import
uuid4
import
cpuinfo
import
psutil
import
requests
import
torch
# Per-user configuration root: XDG_CONFIG_HOME when set, else ~/.config.
_config_home = os.environ.get("XDG_CONFIG_HOME",
                              os.path.expanduser("~/.config"))
# Local file that every usage record is appended to.
_USAGE_STATS_JSON_PATH = os.path.join(_config_home, "vllm/usage_stats.json")
# Marker file whose mere existence disables usage reporting.
_USAGE_STATS_DO_NOT_TRACK_PATH = os.path.join(_config_home,
                                              "vllm/do_not_track")
# Cached opt-in decision; None until is_usage_stats_enabled() computes it.
_USAGE_STATS_ENABLED = None
# Endpoint that receives usage records; overridable via the environment.
_USAGE_STATS_SERVER = os.getenv("VLLM_USAGE_STATS_SERVER",
                                "https://stats.vllm.ai")
def is_usage_stats_enabled():
    """Determine whether or not we can send usage stats to the server.

    Reporting is on by default and is switched off by any one of:
    - the environment variable DO_NOT_TRACK=1
    - the environment variable VLLM_NO_USAGE_STATS=1
    - the existence of the marker file at _USAGE_STATS_DO_NOT_TRACK_PATH
      (by default $HOME/.config/vllm/do_not_track)

    The answer is computed on the first call and cached in the module-level
    _USAGE_STATS_ENABLED for every later call.
    """
    global _USAGE_STATS_ENABLED
    if _USAGE_STATS_ENABLED is not None:
        return _USAGE_STATS_ENABLED

    opt_outs = (
        os.environ.get("DO_NOT_TRACK", "0") == "1",
        os.environ.get("VLLM_NO_USAGE_STATS", "0") == "1",
        os.path.exists(_USAGE_STATS_DO_NOT_TRACK_PATH),
    )
    _USAGE_STATS_ENABLED = not any(opt_outs)
    return _USAGE_STATS_ENABLED
def _get_current_timestamp_ns() -> int:
    """Return the current time as integer nanoseconds since the Unix epoch."""
    # time.time_ns() stays in integer arithmetic. Multiplying a float
    # seconds value by 1e9 cannot represent present-day epoch values to
    # nanosecond precision, so the low digits would be rounding noise.
    return time.time_ns()
def _detect_cloud_provider() -> str:
    """Best-effort guess of the cloud provider this process runs on.

    The DMI vendor files are scanned first, in order, for a known
    identifier substring (case-insensitive); then provider-specific
    environment variables are checked.

    Returns:
        One of "AWS", "AZURE", "GCP", "OCI", "RUNPOD", or "UNKNOWN" when
        nothing matches.
    """
    # Try detecting through vendor file
    vendor_files = [
        "/sys/class/dmi/id/product_version", "/sys/class/dmi/id/bios_vendor",
        "/sys/class/dmi/id/product_name",
        "/sys/class/dmi/id/chassis_asset_tag", "/sys/class/dmi/id/sys_vendor"
    ]
    # Mapping of identifiable strings to cloud providers
    cloud_identifiers = {
        "amazon": "AWS",
        "microsoft corporation": "AZURE",
        "google": "GCP",
        "oraclecloud": "OCI",
    }

    for vendor_file in vendor_files:
        try:
            file_content = Path(vendor_file).read_text().lower()
        except (OSError, UnicodeDecodeError):
            # Missing, not a regular file, unreadable or undecodable:
            # detection is best-effort, so move on to the next candidate
            # instead of failing the whole report.
            continue
        for identifier, provider in cloud_identifiers.items():
            if identifier in file_content:
                return provider

    # Try detecting through environment variables
    env_to_cloud_provider = {
        "RUNPOD_DC_ID": "RUNPOD",
    }
    for env_var, provider in env_to_cloud_provider.items():
        if os.environ.get(env_var):
            return provider

    return "UNKNOWN"
class UsageContext(str, Enum):
    """Names the entry point a usage report originates from.

    Members are also plain ``str`` values, and each value equals its
    member name.
    """

    # Used when the caller does not identify itself.
    UNKNOWN_CONTEXT = "UNKNOWN_CONTEXT"
    LLM_CLASS = "LLM_CLASS"
    API_SERVER = "API_SERVER"
    OPENAI_API_SERVER = "OPENAI_API_SERVER"
    ENGINE_CONTEXT = "ENGINE_CONTEXT"
class UsageMessage:
    """Collect platform information and send it to the usage stats server.

    Instance attributes are the payload: the first report serialises every
    attribute of the instance, so only flat, JSON-serialisable values
    belong on ``self``.
    """

    # Seconds to wait on the stats server before abandoning one report.
    # Kept on the class so it is not part of the vars(self) payload.
    _REQUEST_TIMEOUT_S = 10

    def __init__(self) -> None:
        # NOTE: vLLM's server _only_ support flat KV pair.
        # Do not use nested fields.

        self.uuid = str(uuid4())

        # Environment Information
        self.provider: Optional[str] = None
        self.num_cpu: Optional[int] = None
        self.cpu_type: Optional[str] = None
        self.cpu_family_model_stepping: Optional[str] = None
        self.total_memory: Optional[int] = None
        self.architecture: Optional[str] = None
        self.platform: Optional[str] = None
        self.gpu_count: Optional[int] = None
        self.gpu_type: Optional[str] = None
        self.gpu_memory_per_device: Optional[int] = None

        # vLLM Information
        self.model_architecture: Optional[str] = None
        self.vllm_version: Optional[str] = None
        self.context: Optional[str] = None

        # Metadata
        self.log_time: Optional[int] = None
        self.source: Optional[str] = None

    def report_usage(self,
                     model_architecture: str,
                     usage_context: UsageContext,
                     extra_kvs: Optional[Dict[str, any]] = None) -> None:
        """Start usage reporting on a daemon thread and return immediately.

        Args:
            model_architecture: Recorded as ``model_architecture``.
            usage_context: Its ``value`` is recorded as ``context``.
            extra_kvs: Optional extra flat key/value pairs merged into the
                first report; ``None`` is treated as empty.
        """
        t = Thread(target=self._report_usage_worker,
                   args=(model_architecture, usage_context, extra_kvs or {}),
                   daemon=True)
        t.start()

    def _report_usage_worker(self, model_architecture: str,
                             usage_context: UsageContext,
                             extra_kvs: Dict[str, any]) -> None:
        """Thread body: send the full report once, then heartbeat forever."""
        self._report_usage_once(model_architecture, usage_context, extra_kvs)
        self._report_continous_usage()

    def _report_usage_once(self, model_architecture: str,
                           usage_context: UsageContext,
                           extra_kvs: Dict[str, any]) -> None:
        """Fill in all fields, then write and send one full report."""
        # Platform information
        if torch.cuda.is_available():
            # Name and memory are taken from device 0 only.
            device_property = torch.cuda.get_device_properties(0)
            self.gpu_count = torch.cuda.device_count()
            self.gpu_type = device_property.name
            self.gpu_memory_per_device = device_property.total_memory
        self.provider = _detect_cloud_provider()
        self.architecture = platform.machine()
        self.platform = platform.platform()
        self.total_memory = psutil.virtual_memory().total

        info = cpuinfo.get_cpu_info()
        self.num_cpu = info.get("count", None)
        self.cpu_type = info.get("brand_raw", "")
        self.cpu_family_model_stepping = ",".join([
            str(info.get("family", "")),
            str(info.get("model", "")),
            str(info.get("stepping", ""))
        ])

        # vLLM information
        import vllm  # delayed import to prevent circular import
        self.context = usage_context.value
        self.vllm_version = vllm.__version__
        self.model_architecture = model_architecture

        # Metadata
        self.log_time = _get_current_timestamp_ns()
        self.source = os.environ.get("VLLM_USAGE_SOURCE", "production")

        # Snapshot the attributes: vars(self) is the live instance __dict__,
        # so updating it directly would turn extra_kvs into attributes of
        # this object.
        data = dict(vars(self))
        if extra_kvs:
            data.update(extra_kvs)

        self._write_to_file(data)
        self._send_to_server(data)

    def _report_continous_usage(self):
        """Report usage every 10 minutes.

        This helps us to collect more data points for uptime of vLLM usages.
        This function can also help send over performance metrics over time.
        """
        while True:
            time.sleep(600)
            data = {"uuid": self.uuid, "log_time": _get_current_timestamp_ns()}

            self._write_to_file(data)
            self._send_to_server(data)

    def _send_to_server(self, data):
        """POST ``data`` as JSON to the stats server, ignoring failures."""
        try:
            # Bounded wait: without a timeout an unresponsive server would
            # block the reporting thread indefinitely. Timeouts are
            # RequestException subclasses and are swallowed below.
            requests.post(_USAGE_STATS_SERVER,
                          json=data,
                          timeout=self._REQUEST_TIMEOUT_S)
        except requests.exceptions.RequestException:
            # silently ignore unless we are using debug log
            logging.debug("Failed to send usage data to server")

    def _write_to_file(self, data):
        """Append ``data`` as one JSON line to the local usage-stats file."""
        os.makedirs(os.path.dirname(_USAGE_STATS_JSON_PATH), exist_ok=True)
        Path(_USAGE_STATS_JSON_PATH).touch(exist_ok=True)
        with open(_USAGE_STATS_JSON_PATH, "a") as f:
            json.dump(data, f)
            f.write("\n")


# Module-level instance shared by importers of this module.
usage_message = UsageMessage()
vllm/utils.py
View file @
7c4f76e3
import
asyncio
import
enum
import
gc
import
os
import
socket
import
subprocess
import
uuid
import
warnings
from
collections
import
OrderedDict
from
functools
import
lru_cache
,
partial
from
platform
import
uname
from
typing
import
List
,
Tuple
,
Union
from
packaging.version
import
parse
,
Vers
ion
from
typing
import
(
Any
,
Awaitable
,
Callable
,
Generic
,
Hashable
,
List
,
Optional
,
Tuple
,
TypeVar
,
Un
ion
)
import
psutil
import
torch
import
asyncio
from
functools
import
partial
from
typing
import
(
Awaitable
,
Callable
,
TypeVar
,
)
from
collections
import
OrderedDict
from
typing
import
Any
,
Hashable
,
Optional
from
packaging.version
import
Version
,
parse
from
vllm.logger
import
init_logger
...
...
@@ -51,10 +48,10 @@ class Counter:
self
.
counter
=
0
class
LRUCache
:
class
LRUCache
(
Generic
[
T
])
:
def
__init__
(
self
,
capacity
:
int
):
self
.
cache
=
OrderedDict
()
self
.
cache
=
OrderedDict
[
Hashable
,
T
]
()
self
.
capacity
=
capacity
def
__contains__
(
self
,
key
:
Hashable
)
->
bool
:
...
...
@@ -63,10 +60,10 @@ class LRUCache:
def
__len__
(
self
)
->
int
:
return
len
(
self
.
cache
)
def
__getitem__
(
self
,
key
:
Hashable
)
->
Any
:
def
__getitem__
(
self
,
key
:
Hashable
)
->
T
:
return
self
.
get
(
key
)
def
__setitem__
(
self
,
key
:
Hashable
,
value
:
Any
)
->
None
:
def
__setitem__
(
self
,
key
:
Hashable
,
value
:
T
)
->
None
:
self
.
put
(
key
,
value
)
def
__delitem__
(
self
,
key
:
Hashable
)
->
None
:
...
...
@@ -75,7 +72,9 @@ class LRUCache:
def
touch
(
self
,
key
:
Hashable
)
->
None
:
self
.
cache
.
move_to_end
(
key
)
def
get
(
self
,
key
:
Hashable
,
default_value
:
Optional
[
Any
]
=
None
)
->
int
:
def
get
(
self
,
key
:
Hashable
,
default_value
:
Optional
[
T
]
=
None
)
->
Optional
[
T
]:
if
key
in
self
.
cache
:
value
=
self
.
cache
[
key
]
self
.
cache
.
move_to_end
(
key
)
...
...
@@ -83,12 +82,12 @@ class LRUCache:
value
=
default_value
return
value
def
put
(
self
,
key
:
Hashable
,
value
:
Any
)
->
None
:
def
put
(
self
,
key
:
Hashable
,
value
:
T
)
->
None
:
self
.
cache
[
key
]
=
value
self
.
cache
.
move_to_end
(
key
)
self
.
_remove_old_if_needed
()
def
_on_remove
(
self
,
key
:
Hashable
,
value
:
Any
):
def
_on_remove
(
self
,
key
:
Hashable
,
value
:
T
):
pass
def
remove_oldest
(
self
):
...
...
@@ -101,7 +100,7 @@ class LRUCache:
while
len
(
self
.
cache
)
>
self
.
capacity
:
self
.
remove_oldest
()
def
pop
(
self
,
key
:
int
,
default_value
:
Optional
[
Any
]
=
None
)
->
Any
:
def
pop
(
self
,
key
:
Hashable
,
default_value
:
Optional
[
Any
]
=
None
)
->
T
:
run_on_remove
=
key
in
self
.
cache
value
=
self
.
cache
.
pop
(
key
,
default_value
)
if
run_on_remove
:
...
...
@@ -118,6 +117,7 @@ def is_hip() -> bool:
return
torch
.
version
.
hip
is
not
None
@
lru_cache
(
maxsize
=
None
)
def
is_neuron
()
->
bool
:
try
:
import
transformers_neuronx
...
...
@@ -126,15 +126,17 @@ def is_neuron() -> bool:
return
transformers_neuronx
is
not
None
@
lru_cache
(
maxsize
=
None
)
def
get_max_shared_memory_bytes
(
gpu
:
int
=
0
)
->
int
:
"""Returns the maximum shared memory per thread block in bytes."""
# NOTE: This import statement should be executed lazily since
# the Neuron-X backend does not have the `cuda_utils` module.
from
vllm._C
import
cuda_utils
max_shared_mem
=
cuda_utils
.
get_max_shared_memory_per_block_device_attribute
(
gpu
)
# value 0 will cause MAX_SEQ_LEN become negative and test_attention.py will fail
max_shared_mem
=
(
cuda_utils
.
get_max_shared_memory_per_block_device_attribute
(
gpu
))
# value 0 will cause MAX_SEQ_LEN become negative and test_attention.py
# will fail
assert
max_shared_mem
>
0
,
"max_shared_mem can not be zero"
return
int
(
max_shared_mem
)
...
...
@@ -148,6 +150,7 @@ def random_uuid() -> str:
return
str
(
uuid
.
uuid4
().
hex
)
@
lru_cache
(
maxsize
=
None
)
def
in_wsl
()
->
bool
:
# Reference: https://github.com/microsoft/WSL/issues/4071
return
"microsoft"
in
" "
.
join
(
uname
()).
lower
()
...
...
@@ -170,20 +173,41 @@ def make_async(func: Callable[..., T]) -> Callable[..., Awaitable[T]]:
def get_ip() -> str:
    """Return the IP address this host should advertise.

    Resolution order:
    1. The ``HOST_IP`` environment variable, when set and non-empty.
    2. The local address of a UDP socket connected to a public IPv4 address.
    3. The same probe over IPv6.
    4. ``"0.0.0.0"``, after emitting a warning.
    """
    host_ip = os.environ.get("HOST_IP")
    if host_ip:
        return host_ip

    # IP is not set, try to get it from the network interface

    # try ipv4
    try:
        # The context manager closes the probe socket on every path.
        with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s:
            s.connect(("8.8.8.8", 80))  # Doesn't need to be reachable
            return s.getsockname()[0]
    except Exception:
        # Deliberately broad: any failure just falls through to the
        # next strategy.
        pass

    # try ipv6
    try:
        with socket.socket(socket.AF_INET6, socket.SOCK_DGRAM) as s:
            # Google's public DNS server, see
            # https://developers.google.com/speed/public-dns/docs/using#addresses
            s.connect(("2001:4860:4860::8888", 80))  # Doesn't need to be reachable
            return s.getsockname()[0]
    except Exception:
        pass

    warnings.warn(
        "Failed to get the IP address, using 0.0.0.0 by default. "
        "The value can be set by the environment variable HOST_IP.",
        stacklevel=2)
    return "0.0.0.0"
def get_distributed_init_method(ip: str, port: int) -> str:
    """Build the ``tcp://`` init-method URL for ``ip`` and ``port``.

    An address containing ``":"`` is treated as IPv6 and wrapped in square
    brackets so the port separator stays unambiguous.
    """
    # Brackets are not permitted in ipv4 addresses,
    # see https://github.com/python/cpython/issues/103848
    if ":" in ip:
        return f"tcp://[{ip}]:{port}"
    return f"tcp://{ip}:{port}"
def
get_open_port
()
->
int
:
...
...
@@ -203,14 +227,24 @@ def set_cuda_visible_devices(device_ids: List[int]) -> None:
os
.
environ
[
"CUDA_VISIBLE_DEVICES"
]
=
","
.
join
(
map
(
str
,
device_ids
))
def chunk_list(lst, chunk_size):
    """Split ``lst`` into consecutive slices of at most ``chunk_size`` items.

    Returns a list of the slices in order; the last one may be shorter.
    """
    chunks = []
    for start in range(0, len(lst), chunk_size):
        chunks.append(lst[start:start + chunk_size])
    return chunks
def cdiv(a: int, b: int) -> int:
    """Ceiling division: the smallest integer not less than ``a / b``."""
    quotient, remainder = divmod(a, b)
    # Floor division rounds down, so any non-zero remainder means the true
    # quotient lies strictly above ``quotient``.
    return quotient + bool(remainder)
@
lru_cache
(
maxsize
=
None
)
def
get_nvcc_cuda_version
()
->
Optional
[
Version
]:
cuda_home
=
os
.
environ
.
get
(
'CUDA_HOME'
)
if
not
cuda_home
:
cuda_home
=
'/usr/local/cuda'
if
os
.
path
.
isfile
(
cuda_home
+
'/bin/nvcc'
):
logger
.
info
(
f
'CUDA_HOME is not found in the environment. Using
{
cuda_home
}
as CUDA_HOME.'
)
logger
.
info
(
f
'CUDA_HOME is not found in the environment. '
f
'Using
{
cuda_home
}
as CUDA_HOME.'
)
else
:
logger
.
warning
(
f
'Not found nvcc in
{
cuda_home
}
. Skip cuda version check!'
)
...
...
@@ -309,3 +343,98 @@ def create_kv_caches_with_random(
f
"Does not support value cache of type
{
cache_dtype
}
"
)
value_caches
.
append
(
value_cache
)
return
key_caches
,
value_caches
@lru_cache(maxsize=128)
def print_warning_once(msg: str) -> None:
    """Log ``msg`` at warning level.

    The call is memoised on ``msg``, so repeating a message that is still
    in the cache does not log it again.
    """
    logger.warning(msg)
@lru_cache(maxsize=None)
def is_pin_memory_available() -> bool:
    """Report whether pinned host memory may be used on this platform.

    Returns False, after warning once, when running under WSL or on Neuron;
    returns True otherwise. The answer is cached after the first call.
    """
    unsupported_reason = None
    if in_wsl():
        # Pinning memory in WSL is not supported.
        # https://docs.nvidia.com/cuda/wsl-user-guide/index.html#known-limitations-for-linux-cuda-applications
        unsupported_reason = ("Using 'pin_memory=False' as WSL is detected. "
                              "This may slow down the performance.")
    elif is_neuron():
        unsupported_reason = "Pin memory is not supported on Neuron."

    if unsupported_reason is None:
        return True
    print_warning_once(unsupported_reason)
    return False
class CudaMemoryProfiler:
    """Context manager recording the CUDA memory delta across its body.

    After the ``with`` block exits, ``initial_memory``, ``final_memory`` and
    ``consumed_memory`` (final minus initial) are available on the instance.
    """

    def __init__(self, device=None):
        # Passed straight through to the torch.cuda memory-stat calls.
        self.device = device

    def current_memory_usage(self) -> float:
        """Return the memory usage in bytes for ``self.device``."""
        target = self.device
        # The peak counter is reset immediately before it is read, so the
        # reading reflects usage at this moment, not an earlier high mark.
        torch.cuda.reset_peak_memory_stats(target)
        return torch.cuda.max_memory_allocated(target)

    def __enter__(self):
        baseline = self.current_memory_usage()
        self.initial_memory = baseline
        # Returning the instance lets callers read the results afterwards.
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        usage_now = self.current_memory_usage()
        self.final_memory = usage_now
        self.consumed_memory = usage_now - self.initial_memory

        # Force garbage collection
        gc.collect()
def str_to_int_tuple(s: str) -> Tuple[int, ...]:
    """Convert a comma-separated string to a tuple of integers.

    Args:
        s: Text such as ``"1,2,3"``; whitespace around each item is accepted
            because ``int`` ignores it.

    Returns:
        A tuple holding one int per comma-separated item, in order. The
        length varies with the input, hence ``Tuple[int, ...]``.

    Raises:
        ValueError: If any item cannot be parsed as an integer. The original
            parsing error is chained as the cause.
    """
    try:
        return tuple(int(item) for item in s.split(","))
    except ValueError as e:
        raise ValueError(
            "String must be a series of integers separated by commas "
            f"(e.g., 1, 2, 3). Given input: {s}") from e
def pad_to_max_length(x: List[int], max_len: int, pad: int) -> List[int]:
    """Return a new list: x followed by enough pad values to reach max_len.

    x itself is left unmodified. An AssertionError is raised when x is
    already longer than max_len.
    """
    missing = max_len - len(x)
    assert missing >= 0
    filler = [pad] * missing
    return x + filler
def make_tensor_with_pad(
    x: List[List[int]],
    max_len: int,
    pad: int,
    dtype: torch.dtype,
    device: Optional[Union[str, torch.device]],
) -> torch.Tensor:
    """Build a tensor from a list of integer lists, padding each inner list.

    Every inner list is extended at its end with `pad` until it is `max_len`
    long; the padded rows are then turned into one tensor with the requested
    dtype on the requested device.
    """
    rows = []
    for row in x:
        rows.append(pad_to_max_length(row, max_len, pad))
    return torch.tensor(rows, dtype=dtype, device=device)
def async_tensor_h2d(
    data: list,
    dtype: torch.dtype,
    target_device: Union[str, torch.device],
    pin_memory: bool,
) -> torch.Tensor:
    """Create a CPU tensor from data and start a non-blocking copy of it.

    The tensor is first materialised on the CPU (optionally in pinned
    memory) and then moved to target_device with non_blocking=True.
    """
    return torch.tensor(
        data,
        dtype=dtype,
        pin_memory=pin_memory,
        device="cpu",
    ).to(device=target_device, non_blocking=True)
def maybe_expand_dim(tensor: torch.Tensor,
                     target_dims: int,
                     size: int = 1) -> torch.Tensor:
    """Reshape tensor when it has fewer than target_dims dimensions.

    A tensor that already has at least target_dims dimensions is returned
    unchanged. Otherwise it is viewed as (-1, size, ..., size) with one
    trailing `size` per missing dimension.
    """
    missing = target_dims - tensor.ndim
    if missing <= 0:
        return tensor
    trailing = [size] * missing
    return tensor.view(-1, *trailing)
vllm/worker/cache_engine.py
View file @
7c4f76e3
"""CacheEngine class for managing the KV cache."""
from
typing
import
Dict
,
List
,
Tuple
from
typing
import
Dict
,
List
import
torch
from
vllm.attention
import
get_attn_backend
from
vllm.config
import
CacheConfig
,
ModelConfig
,
ParallelConfig
from
vllm.logger
import
init_logger
from
vllm.utils
import
in_wsl
,
is_neuron
,
STR_DTYPE_TO_TORCH_DTYPE
from
vllm.utils
import
STR_DTYPE_TO_TORCH_DTYPE
,
is_pin_memory_available
logger
=
init_logger
(
__name__
)
KVCache
=
Tuple
[
torch
.
Tensor
,
torch
.
Tensor
]
class
CacheEngine
:
"""Manages the KV cache.
...
...
@@ -38,119 +37,48 @@ class CacheEngine:
self
.
num_gpu_blocks
=
cache_config
.
num_gpu_blocks
self
.
num_cpu_blocks
=
cache_config
.
num_cpu_blocks
# Skip initializing CUDA stream and buffer for Neuron backend.
if
is_neuron
():
return
if
cache_config
.
cache_dtype
==
"auto"
:
self
.
dtype
=
model_config
.
dtype
else
:
self
.
dtype
=
STR_DTYPE_TO_TORCH_DTYPE
[
cache_config
.
cache_dtype
]
# Get attention backend.
self
.
attn_backend
=
get_attn_backend
(
model_config
.
dtype
)
# Initialize the cache.
self
.
gpu_cache
=
self
.
allocate_gpu_cache
()
self
.
cpu_cache
=
self
.
allocate_cpu_cache
()
# Initialize the stream for caching operations.
self
.
cache_stream
=
torch
.
cuda
.
Stream
()
assert
self
.
cache_stream
!=
torch
.
cuda
.
current_stream
()
# Initialize the events for stream synchronization.
self
.
events
=
[
torch
.
cuda
.
Event
()
for
_
in
range
(
self
.
num_layers
)]
def
get_key_block_shape
(
self
)
->
Tuple
[
int
,
int
,
int
,
int
]:
element_size
=
torch
.
tensor
([],
dtype
=
self
.
dtype
).
element_size
()
x
=
16
//
element_size
return
(
self
.
num_heads
,
self
.
head_size
//
x
,
self
.
block_size
,
x
,
)
def
get_value_block_shape
(
self
)
->
Tuple
[
int
,
int
,
int
]:
return
(
self
.
num_heads
,
self
.
head_size
,
self
.
block_size
,
)
def
allocate_gpu_cache
(
self
)
->
List
[
KVCache
]:
gpu_cache
:
List
[
KVCache
]
=
[]
key_block_shape
=
self
.
get_key_block_shape
()
value_block_shape
=
self
.
get_value_block_shape
()
for
_
in
range
(
self
.
num_layers
):
key_blocks
=
torch
.
empty
(
size
=
(
self
.
num_gpu_blocks
,
*
key_block_shape
),
dtype
=
self
.
dtype
,
device
=
"cuda"
,
)
value_blocks
=
torch
.
empty
(
size
=
(
self
.
num_gpu_blocks
,
*
value_block_shape
),
dtype
=
self
.
dtype
,
device
=
"cuda"
,
)
gpu_cache
.
append
((
key_blocks
,
value_blocks
))
return
gpu_cache
def
allocate_cpu_cache
(
self
)
->
List
[
KVCache
]:
cpu_cache
:
List
[
KVCache
]
=
[]
key_block_shape
=
self
.
get_key_block_shape
()
value_block_shape
=
self
.
get_value_block_shape
()
pin_memory
=
not
in_wsl
()
if
not
pin_memory
:
# Pinning memory in WSL is not supported.
# https://docs.nvidia.com/cuda/wsl-user-guide/index.html#known-limitations-for-linux-cuda-applications
logger
.
warning
(
"Using 'pin_memory=False' as WSL is detected. "
"This may slow down the performance."
)
for
_
in
range
(
self
.
num_layers
):
key_blocks
=
torch
.
empty
(
size
=
(
self
.
num_cpu_blocks
,
*
key_block_shape
),
dtype
=
self
.
dtype
,
pin_memory
=
pin_memory
,
device
=
"cpu"
,
)
value_blocks
=
torch
.
empty
(
size
=
(
self
.
num_cpu_blocks
,
*
value_block_shape
),
dtype
=
self
.
dtype
,
pin_memory
=
pin_memory
,
device
=
"cpu"
,
)
cpu_cache
.
append
((
key_blocks
,
value_blocks
))
return
cpu_cache
def
_swap
(
self
.
gpu_cache
=
self
.
_allocate_kv_cache
(
self
.
num_gpu_blocks
,
"cuda"
)
self
.
cpu_cache
=
self
.
_allocate_kv_cache
(
self
.
num_cpu_blocks
,
"cpu"
)
def
_allocate_kv_cache
(
self
,
src
:
List
[
KVCache
],
dst
:
List
[
KVCache
],
src_to_dst
:
Dict
[
int
,
int
],
)
->
None
:
from
vllm._C
import
cache_ops
with
torch
.
cuda
.
stream
(
self
.
cache_stream
):
for
i
in
range
(
self
.
num_layers
):
src_key_cache
,
src_value_cache
=
src
[
i
]
dst_key_cache
,
dst_value_cache
=
dst
[
i
]
# Copy the key blocks.
cache_ops
.
swap_blocks
(
src_key_cache
,
dst_key_cache
,
src_to_dst
)
# Copy the value blocks.
cache_ops
.
swap_blocks
(
src_value_cache
,
dst_value_cache
,
src_to_dst
)
event
=
self
.
events
[
i
]
event
.
record
(
stream
=
self
.
cache_stream
)
num_blocks
:
int
,
device
:
str
,
)
->
List
[
torch
.
Tensor
]:
"""Allocates KV cache on the specified device."""
kv_cache_shape
=
self
.
attn_backend
.
get_kv_cache_shape
(
num_blocks
,
self
.
block_size
,
self
.
num_heads
,
self
.
head_size
)
pin_memory
=
is_pin_memory_available
()
if
device
==
"cpu"
else
False
kv_cache
:
List
[
torch
.
Tensor
]
=
[]
for
_
in
range
(
self
.
num_layers
):
kv_cache
.
append
(
torch
.
empty
(
kv_cache_shape
,
dtype
=
self
.
dtype
,
pin_memory
=
pin_memory
,
device
=
device
))
return
kv_cache
def
swap_in
(
self
,
src_to_dst
:
Dict
[
int
,
int
])
->
None
:
self
.
_swap
(
self
.
cpu_cache
,
self
.
gpu_cache
,
src_to_dst
)
for
i
in
range
(
self
.
num_layers
):
self
.
attn_backend
.
swap_blocks
(
self
.
cpu_cache
[
i
],
self
.
gpu_cache
[
i
],
src_to_dst
)
def
swap_out
(
self
,
src_to_dst
:
Dict
[
int
,
int
])
->
None
:
self
.
_swap
(
self
.
gpu_cache
,
self
.
cpu_cache
,
src_to_dst
)
for
i
in
range
(
self
.
num_layers
):
self
.
attn_backend
.
swap_blocks
(
self
.
gpu_cache
[
i
],
self
.
cpu_cache
[
i
],
src_to_dst
)
def
copy
(
self
,
src_to_dsts
:
Dict
[
int
,
List
[
int
]])
->
None
:
from
vllm._C
import
cache_ops
key_caches
=
[
key_cache
for
key_cache
,
_
in
self
.
gpu_cache
]
value_caches
=
[
value_cache
for
_
,
value_cache
in
self
.
gpu_cache
]
# NOTE(woosuk): This operation implicitly synchronizes the CPU and GPU.
cache_ops
.
copy_blocks
(
key_caches
,
value_caches
,
src_to_dsts
)
self
.
attn_backend
.
copy_blocks
(
self
.
gpu_cache
,
src_to_dsts
)
@
staticmethod
def
get_cache_block_size
(
...
...
vllm/worker/model_runner.py
View file @
7c4f76e3
import
contextlib
import
time
from
typing
import
Dict
,
List
,
Optional
,
Tuple
,
Set
,
Union
from
typing
import
Dict
,
List
,
Optional
,
Set
,
Tuple
import
numpy
as
np
import
torch
import
torch.nn
as
nn
from
vllm.config
import
(
DeviceConfig
,
ModelConfig
,
LoRAConfig
,
ParallelConfig
,
SchedulerConfig
)
from
vllm.attention
import
AttentionMetadata
,
get_attn_backend
from
vllm.config
import
(
DeviceConfig
,
LoRAConfig
,
ModelConfig
,
ParallelConfig
,
SchedulerConfig
,
VisionLanguageConfig
)
from
vllm.logger
import
init_logger
from
vllm.model_executor
import
get_model
,
InputMetadata
,
SamplingMetadata
from
vllm.model_executor.parallel_utils
import
cupy_utils
from
vllm.lora.layers
import
LoRAMapping
from
vllm.lora.request
import
LoRARequest
from
vllm.lora.worker_manager
import
LRUCacheWorkerLoRAManager
from
vllm.model_executor
import
SamplingMetadata
from
vllm.model_executor.model_loader
import
get_model
from
vllm.model_executor.parallel_utils
import
custom_all_reduce
,
pynccl_utils
from
vllm.model_executor.parallel_utils.communication_op
import
(
broadcast_tensor_dict
)
from
vllm.model_executor.parallel_utils.parallel_state
import
(
with_cupy_nccl_for_all_reduce
)
from
vllm.model_executor.parallel_utils
import
custom_all_reduce
with_pynccl_for_all_reduce
)
from
vllm.sampling_params
import
SamplingParams
,
SamplingType
from
vllm.sequence
import
SamplerOutput
,
SequenceData
,
SequenceGroupMetadata
from
vllm.lora.worker_manager
import
LRUCacheWorkerLoRAManager
from
vllm.
lora.layers
import
LoRAMapping
from
vllm.lora.request
import
LoRARequest
from
vllm.utils
import
in_wsl
from
vllm.sequence
import
(
MultiModalData
,
SamplerOutput
,
SequenceData
,
SequenceGroupMetadata
)
from
vllm.
utils
import
(
CudaMemoryProfiler
,
async_tensor_h2d
,
is_pin_memory_available
,
make_tensor_with_pad
,
maybe_expand_dim
)
logger
=
init_logger
(
__name__
)
KVCache
=
Tuple
[
torch
.
Tensor
,
torch
.
Tensor
]
_PAD_SLOT_ID
=
-
1
LORA_WARMUP_RANK
=
8
# Capture graphs for batch size 1, 2, 4, 8, 16, 24, 32, 40, ..., 256.
_BATCH_SIZE_ALIGNMENT
=
8
# Capture graphs for token size 1, 2, 4, 8, 16, 24, 32, 40, ..., 256.
# NOTE: _get_graph_batch_size needs to be updated if this list is changed.
_BATCH_SIZES_TO_CAPTURE
=
[
1
,
2
,
4
]
+
[
8
*
i
for
i
in
range
(
1
,
33
)]
_BATCH_SIZES_TO_CAPTURE
=
[
1
,
2
,
4
]
+
[
_BATCH_SIZE_ALIGNMENT
*
i
for
i
in
range
(
1
,
33
)
]
class
ModelRunner
:
...
...
@@ -44,6 +50,7 @@ class ModelRunner:
lora_config
:
Optional
[
LoRAConfig
],
kv_cache_dtype
:
Optional
[
str
]
=
"auto"
,
is_driver_worker
:
bool
=
False
,
vision_language_config
:
Optional
[
VisionLanguageConfig
]
=
None
,
):
self
.
model_config
=
model_config
self
.
parallel_config
=
parallel_config
...
...
@@ -76,27 +83,31 @@ class ModelRunner:
# The shape of the cached block table will be
# (max batch size to capture, max context len to capture / block size).
self
.
graph_block_tables
=
None
# Set after initial profiling.
# cache in_wsl result
self
.
in_wsl
=
in_wsl
()
self
.
pin_memory
=
is_pin_memory_available
()
self
.
kv_cache_dtype
=
kv_cache_dtype
self
.
vision_language_config
=
vision_language_config
# Set enforce_eager to True for Neuron backend, to avoid capturing graph
if
self
.
device_config
.
is_neuron
:
self
.
model_config
.
enforce_eager
=
True
self
.
attn_backend
=
get_attn_backend
(
self
.
model_config
.
dtype
if
model_config
is
not
None
else
None
)
def
load_model
(
self
)
->
None
:
self
.
model
=
get_model
(
self
.
model_config
,
self
.
device_config
,
lora_config
=
self
.
lora_config
,
parallel_config
=
self
.
parallel_config
,
scheduler_config
=
self
.
scheduler_config
)
vocab_size
=
self
.
model
.
config
.
vocab_size
with
CudaMemoryProfiler
()
as
m
:
self
.
model
=
get_model
(
self
.
model_config
,
self
.
device_config
,
lora_config
=
self
.
lora_config
,
vision_language_config
=
self
.
vision_language_config
,
parallel_config
=
self
.
parallel_config
,
scheduler_config
=
self
.
scheduler_config
)
self
.
model_memory_usage
=
m
.
consumed_memory
logger
.
info
(
f
"Loading model weights took "
f
"
{
self
.
model_memory_usage
/
float
(
2
**
30
):.
4
f
}
GB"
)
if
self
.
lora_config
:
assert
hasattr
(
self
.
model
,
"
supported_lora_modules
"
)
and
self
.
model
.
supported_lora_modules
,
"Model does not support LoRA"
assert
hasattr
(
self
.
model
,
"supported_lora_modules"
)
and
self
.
model
.
supported_lora_modules
,
(
"Model does not support LoRA"
)
assert
hasattr
(
self
.
model
,
"embedding_modules"
),
"Model does not have embedding_modules"
...
...
@@ -104,8 +115,7 @@ class ModelRunner:
),
"Model does not have embedding_padding_modules"
self
.
lora_manager
=
LRUCacheWorkerLoRAManager
(
self
.
scheduler_config
.
max_num_seqs
,
self
.
scheduler_config
.
max_num_batched_tokens
+
self
.
scheduler_config
.
max_paddings
,
vocab_size
,
self
.
scheduler_config
.
max_num_batched_tokens
,
self
.
vocab_size
,
self
.
lora_config
,
self
.
device
,
self
.
model
.
embedding_modules
,
self
.
model
.
embedding_padding_modules
)
self
.
model
=
self
.
lora_manager
.
create_lora_manager
(
self
.
model
)
...
...
@@ -113,20 +123,24 @@ class ModelRunner:
def
set_block_size
(
self
,
block_size
:
int
)
->
None
:
self
.
block_size
=
block_size
max_num_blocks
=
(
self
.
max_context_len_to_capture
+
block_size
-
1
)
//
block_size
self
.
graph_block_tables
=
np
.
zeros
(
(
max
(
_BATCH_SIZES_TO_CAPTURE
),
max_num_blocks
),
dtype
=
np
.
int32
)
(
max
(
_BATCH_SIZES_TO_CAPTURE
),
self
.
get_max_block_per_batch
()),
dtype
=
np
.
int32
)
def
get_max_block_per_batch
(
self
)
->
int
:
block_size
=
self
.
block_size
return
(
self
.
max_context_len_to_capture
+
block_size
-
1
)
//
block_size
def
_prepare_prompt
(
self
,
seq_group_metadata_list
:
List
[
SequenceGroupMetadata
],
)
->
Tuple
[
torch
.
Tensor
,
torch
.
Tensor
,
InputMetadata
,
List
[
int
],
List
[
int
],
List
[
int
],
List
[
int
],
Set
[
LoRARequest
]]:
)
->
Tuple
[
torch
.
Tensor
,
torch
.
Tensor
,
AttentionMetadata
,
List
[
int
],
List
[
int
],
List
[
int
],
List
[
int
],
Set
[
LoRARequest
],
torch
.
Tensor
]:
assert
len
(
seq_group_metadata_list
)
>
0
input_tokens
:
List
[
List
[
int
]
]
=
[]
input_positions
:
List
[
List
[
int
]
]
=
[]
slot_mapping
:
List
[
List
[
int
]
]
=
[]
input_tokens
:
List
[
int
]
=
[]
input_positions
:
List
[
int
]
=
[]
slot_mapping
:
List
[
int
]
=
[]
lora_index_mapping
:
List
[
int
]
=
[]
lora_prompt_mapping
:
List
[
int
]
=
[]
lora_requests
:
Set
[
LoRARequest
]
=
set
()
...
...
@@ -135,53 +149,82 @@ class ModelRunner:
context_lens
:
List
[
int
]
=
[]
subquery_lens
:
List
[
int
]
=
[]
prefix_block_tables
:
List
[
List
[
int
]]
=
[]
multi_modal_input_list
:
List
[
torch
.
Tensor
]
=
[]
for
seq_group_metadata
in
seq_group_metadata_list
:
assert
seq_group_metadata
.
is_prompt
seq_ids
=
list
(
seq_group_metadata
.
seq_data
.
keys
())
assert
len
(
seq_ids
)
==
1
seq_id
=
seq_ids
[
0
]
computed_block_nums
=
seq_group_metadata
.
computed_block_nums
if
(
self
.
scheduler_config
is
not
None
and
self
.
scheduler_config
.
chunked_prefill_enabled
and
computed_block_nums
is
not
None
):
raise
RuntimeError
(
"chunked prefill cannot be used with prefix caching "
"now."
)
token_chunk_size
=
seq_group_metadata
.
token_chunk_size
seq_data
=
seq_group_metadata
.
seq_data
[
seq_id
]
prompt_tokens
=
seq_data
.
get_token_ids
()
computed_len
=
seq_data
.
get_num_computed_tokens
()
# We should use get_len here because in case of preemption
# it contains output tokens.
prefill_end
=
min
(
seq_data
.
get_len
(),
computed_len
+
token_chunk_size
)
# TODO(sang): Rename it after chunked prefill is introduced.
prompt_tokens
=
seq_data
.
get_token_ids
()[
computed_len
:
prefill_end
]
prompt_len
=
len
(
prompt_tokens
)
# Right now, the prefill_end is always same as the length of
# sequence. However, once chunked prefill is introduced, this
# assumption can be changed.
assert
prefill_end
==
seq_data
.
get_len
()
prompt_lens
.
append
(
prompt_len
)
prefix_len
=
0
prefix
=
seq_group_metadata
.
prefix
if
prefix
is
not
None
and
prefix
.
computed
:
prefix_len
=
prefix
.
get_length
()
prompt_tokens
=
prompt_tokens
[
prefix_len
:]
prefix_block_tables
.
append
(
prefix
.
get_block_numbers
())
# NOTE: This only works for oooooooxxx style attention.
if
computed_block_nums
is
not
None
and
len
(
computed_block_nums
)
>
0
and
self
.
sliding_window
is
None
:
# Prefix is not supported with sliding_window
computed_len
=
len
(
computed_block_nums
)
*
self
.
block_size
prompt_tokens
=
prompt_tokens
[
computed_len
:]
prefix_block_tables
.
append
(
computed_block_nums
)
else
:
prefix_block_tables
.
append
([])
# Right now, prefill start is always 0. However, this
# assumption can be changed once chunked prefill is introduced.
assert
computed_len
==
0
# actual prompt lens
context_lens
.
append
(
prefix
_len
)
subquery_lens
.
append
(
prompt_len
-
prefix
_len
)
context_lens
.
append
(
computed
_len
)
subquery_lens
.
append
(
prompt_len
-
computed
_len
)
input_tokens
.
app
end
(
prompt_tokens
)
input_tokens
.
ext
end
(
prompt_tokens
)
# NOTE(woosuk): Here we assume that the first token in the prompt
# is always the first token in the sequence.
input_positions
.
append
(
list
(
range
(
prefix_len
,
prefix_len
+
len
(
prompt_tokens
))))
input_positions
.
extend
(
list
(
range
(
computed_len
,
prefill_end
)))
lora_id
=
seq_group_metadata
.
lora_int_id
if
lora_id
>
0
:
lora_requests
.
add
(
seq_group_metadata
.
lora_request
)
lora_index_mapping
.
append
(
[
lora_id
]
*
(
prompt_len
-
prefix
_len
)
)
lora_index_mapping
+=
[
lora_id
]
*
(
prompt_len
-
computed
_len
)
lora_prompt_mapping
.
extend
(
[
lora_id
]
*
(
prompt_len
-
prefix
_len
(
prompt_len
-
computed
_len
if
seq_group_metadata
.
sampling_params
.
prompt_logprobs
else
1
))
if
seq_group_metadata
.
multi_modal_data
:
multi_modal_input_list
.
append
(
seq_group_metadata
.
multi_modal_data
.
data
)
if
seq_group_metadata
.
block_tables
is
None
:
# During memory profiling, the block tables are not initialized
# yet. In this case, we just use a dummy slot mapping.
slot_mapping
.
app
end
([
_PAD_SLOT_ID
]
*
prompt_len
)
slot_mapping
.
ext
end
([
_PAD_SLOT_ID
]
*
prompt_len
)
continue
# Compute the slot mapping.
slot_mapping
.
append
([])
block_table
=
seq_group_metadata
.
block_tables
[
seq_id
]
# Mask the [0, start_idx) tokens of the prompt with _PAD_SLOT_ID,
# where start_idx is max(0, prompt_len - sliding_window).
...
...
@@ -190,86 +233,116 @@ class ModelRunner:
# mapping will be [-1, -1, 2, 3, 4, 5, 6, 7, 0, 1].
start_idx
=
0
if
self
.
sliding_window
is
not
None
:
assert
prefix
_len
==
0
,
(
assert
computed
_len
==
0
,
(
"Prefix caching is currently not supported with "
"sliding window attention"
)
start_idx
=
max
(
0
,
prompt_len
-
self
.
sliding_window
)
for
i
in
range
(
prefix_len
,
prompt_len
):
for
i
in
range
(
computed_len
,
prefill_end
):
if
i
<
start_idx
:
slot_mapping
[
-
1
]
.
append
(
_PAD_SLOT_ID
)
slot_mapping
.
append
(
_PAD_SLOT_ID
)
continue
block_number
=
block_table
[
i
//
self
.
block_size
]
block_offset
=
i
%
self
.
block_size
slot
=
block_number
*
self
.
block_size
+
block_offset
slot_mapping
[
-
1
].
append
(
slot
)
max_prompt_len
=
max
(
subquery_lens
)
input_tokens
=
_make_tensor_with_pad
(
input_tokens
,
max_prompt_len
,
pad
=
0
,
dtype
=
torch
.
long
,
device
=
self
.
device
)
input_positions
=
_make_tensor_with_pad
(
input_positions
,
max_prompt_len
,
pad
=
0
,
dtype
=
torch
.
long
,
device
=
self
.
device
)
slot_mapping
=
_make_tensor_with_pad
(
slot_mapping
,
max_prompt_len
,
pad
=
_PAD_SLOT_ID
,
dtype
=
torch
.
long
,
device
=
self
.
device
)
lora_index_mapping
=
[
_pad_to_max
(
mapping
,
max_prompt_len
,
pad
=
0
)
for
mapping
in
lora_index_mapping
]
slot_mapping
.
append
(
slot
)
max_subquery_len
=
max
(
subquery_lens
)
max_prompt_len
=
max
(
prompt_lens
)
num_prompt_tokens
=
len
(
input_tokens
)
assert
max_subquery_len
>
0
input_tokens
=
torch
.
tensor
(
input_tokens
,
dtype
=
torch
.
long
,
device
=
self
.
device
)
input_positions
=
torch
.
tensor
(
input_positions
,
dtype
=
torch
.
long
,
device
=
self
.
device
)
slot_mapping
=
torch
.
tensor
(
slot_mapping
,
dtype
=
torch
.
long
,
device
=
self
.
device
)
lora_index_mapping
=
lora_index_mapping
context_lens_tensor
=
torch
.
tensor
(
context_lens
,
dtype
=
torch
.
int
,
device
=
self
.
device
)
if
multi_modal_input_list
:
assert
self
.
vision_language_config
,
(
"Multi-modal inputs are only supported by "
"vision language models."
)
multi_modal_input
=
torch
.
cat
(
multi_modal_input_list
,
dim
=
0
).
to
(
self
.
device
)
else
:
multi_modal_input
=
None
# Prepare prefix block tables
max_prompt_block_table_len
=
max
(
len
(
t
)
for
t
in
prefix_block_tables
)
block_tables
=
_
make_tensor_with_pad
(
block_tables
=
make_tensor_with_pad
(
prefix_block_tables
,
max_len
=
max_prompt_block_table_len
,
pad
=
0
,
dtype
=
torch
.
int
,
device
=
self
.
device
,
)
start_loc_tensor
=
torch
.
arange
(
0
,
len
(
prompt_lens
)
*
max_prompt_len
,
max_prompt_len
,
dtype
=
torch
.
long
,
device
=
self
.
device
)
# Query length can be shorter than key (i.e., prompt) when prefill
# is chunked or prefix cached.
subquery_lens_tensor
=
torch
.
tensor
(
subquery_lens
,
dtype
=
torch
.
long
,
device
=
self
.
device
)
subquery_start_loc
=
torch
.
zeros
(
subquery_lens_tensor
.
shape
[
0
]
+
1
,
dtype
=
torch
.
int32
,
device
=
self
.
device
)
prompt_lens_tensor
=
torch
.
tensor
(
prompt_lens
,
dtype
=
torch
.
long
,
device
=
self
.
device
)
seq_start_loc
=
torch
.
zeros
(
prompt_lens_tensor
.
shape
[
0
]
+
1
,
dtype
=
torch
.
int32
,
device
=
self
.
device
)
torch
.
cumsum
(
subquery_lens_tensor
,
dim
=
0
,
dtype
=
subquery_start_loc
.
dtype
,
out
=
subquery_start_loc
[
1
:])
input_metadata
=
InputMetadata
(
torch
.
cumsum
(
prompt_lens_tensor
,
dim
=
0
,
dtype
=
seq_start_loc
.
dtype
,
out
=
seq_start_loc
[
1
:])
attn_metadata
=
self
.
attn_backend
.
make_metadata
(
is_prompt
=
True
,
slot_mapping
=
slot_mapping
,
prompt_lens
=
prompt_lens_tensor
,
max_seq_len
=
max_prompt_len
,
start_loc
=
start_loc_tensor
,
prompt_lens
=
prompt_lens
,
prompt_lens_tensor
=
prompt_lens_tensor
,
num_prompt_tokens
=
num_prompt_tokens
,
num_generation_tokens
=
0
,
max_subquery_len
=
max_subquery_len
,
max_context_len
=
None
,
max_prompt_len
=
max_prompt_len
,
subquery_start_loc
=
subquery_start_loc
,
seq_start_loc
=
seq_start_loc
,
context_lens
=
context_lens_tensor
,
block_tables
=
block_tables
,
use_cuda_graph
=
False
,
kv_cache_dtype
=
self
.
kv_cache_dtype
,
)
return
(
input_tokens
,
input_positions
,
input
_metadata
,
prompt_lens
,
return
(
input_tokens
,
input_positions
,
attn
_metadata
,
prompt_lens
,
subquery_lens
,
lora_index_mapping
,
lora_prompt_mapping
,
lora_requests
)
lora_requests
,
multi_modal_input
)
def
_prepare_decode
(
self
,
seq_group_metadata_list
:
List
[
SequenceGroupMetadata
],
)
->
Tuple
[
torch
.
Tensor
,
torch
.
Tensor
,
InputMetadata
,
List
[
int
]
,
List
[
int
],
Set
[
LoRARequest
]]:
)
->
Tuple
[
torch
.
Tensor
,
torch
.
Tensor
,
AttentionMetadata
,
List
[
int
],
List
[
int
],
Set
[
LoRARequest
]]:
assert
len
(
seq_group_metadata_list
)
>
0
input_tokens
:
List
[
List
[
int
]
]
=
[]
input_positions
:
List
[
List
[
int
]
]
=
[]
slot_mapping
:
List
[
List
[
int
]
]
=
[]
input_tokens
:
List
[
int
]
=
[]
input_positions
:
List
[
int
]
=
[]
slot_mapping
:
List
[
int
]
=
[]
context_lens
:
List
[
int
]
=
[]
block_tables
:
List
[
List
[
int
]]
=
[]
lora_index_mapping
:
List
[
int
]
=
[]
...
...
@@ -278,6 +351,7 @@ class ModelRunner:
for
seq_group_metadata
in
seq_group_metadata_list
:
assert
not
seq_group_metadata
.
is_prompt
assert
seq_group_metadata
.
token_chunk_size
==
1
seq_ids
=
list
(
seq_group_metadata
.
seq_data
.
keys
())
lora_id
=
seq_group_metadata
.
lora_int_id
...
...
@@ -288,11 +362,11 @@ class ModelRunner:
for
seq_id
in
seq_ids
:
seq_data
=
seq_group_metadata
.
seq_data
[
seq_id
]
generation_token
=
seq_data
.
get_last_token_id
()
input_tokens
.
append
(
[
generation_token
]
)
input_tokens
.
append
(
generation_token
)
seq_len
=
seq_data
.
get_len
()
position
=
seq_len
-
1
input_positions
.
append
(
[
position
]
)
input_positions
.
append
(
position
)
context_len
=
seq_len
if
self
.
sliding_window
is
None
else
min
(
seq_len
,
self
.
sliding_window
)
...
...
@@ -302,8 +376,8 @@ class ModelRunner:
block_number
=
block_table
[
position
//
self
.
block_size
]
block_offset
=
position
%
self
.
block_size
slot
=
block_number
*
self
.
block_size
+
block_offset
slot_mapping
.
append
(
[
slot
]
)
lora_index_mapping
.
append
(
[
lora_id
]
)
slot_mapping
.
append
(
slot
)
lora_index_mapping
.
append
(
lora_id
)
lora_prompt_mapping
.
append
(
lora_id
)
if
self
.
sliding_window
is
not
None
:
...
...
@@ -312,6 +386,9 @@ class ModelRunner:
block_table
=
block_table
[
-
sliding_window_blocks
:]
block_tables
.
append
(
block_table
)
# vLLM uses cuda graph only for decoding requests.
# See `capture_model` API for more details.
# For decoding requests, batch_size == input_tokens.
batch_size
=
len
(
input_tokens
)
max_context_len
=
max
(
context_lens
)
use_captured_graph
=
(
...
...
@@ -319,38 +396,37 @@ class ModelRunner:
and
batch_size
<=
_BATCH_SIZES_TO_CAPTURE
[
-
1
]
and
max_context_len
<=
self
.
max_context_len_to_capture
)
if
use_captured_graph
:
# Pad the input tokens, positions, and slot mapping to match the
# batch size of the captured graph.
graph_batch_size
=
_get_graph_batch_size
(
batch_size
)
assert
graph_batch_size
>=
batch_size
for
_
in
range
(
graph_batch_size
-
batch_size
):
input_tokens
.
append
(
[]
)
input_positions
.
append
(
[]
)
slot_mapping
.
append
(
[]
)
input_tokens
.
append
(
0
)
input_positions
.
append
(
0
)
slot_mapping
.
append
(
_PAD_SLOT_ID
)
context_lens
.
append
(
1
)
block_tables
.
append
([])
lora_index_mapping
.
append
(
0
)
batch_size
=
graph_batch_size
input_tokens
=
_make_tensor_with_pad
(
input_tokens
,
max_len
=
1
,
pad
=
0
,
dtype
=
torch
.
long
,
device
=
self
.
device
)
input_positions
=
_make_tensor_with_pad
(
input_positions
,
max_len
=
1
,
pad
=
0
,
dtype
=
torch
.
long
,
device
=
self
.
device
)
slot_mapping
=
_make_tensor_with_pad
(
slot_mapping
,
max_len
=
1
,
pad
=
_PAD_SLOT_ID
,
dtype
=
torch
.
long
,
device
=
self
.
device
)
input_tokens
=
torch
.
tensor
(
input_tokens
,
dtype
=
torch
.
long
,
device
=
self
.
device
)
input_positions
=
torch
.
tensor
(
input_positions
,
dtype
=
torch
.
long
,
device
=
self
.
device
)
slot_mapping
=
torch
.
tensor
(
slot_mapping
,
dtype
=
torch
.
long
,
device
=
self
.
device
)
context_lens
=
torch
.
tensor
(
context_lens
,
dtype
=
torch
.
int
,
device
=
self
.
device
)
if
use_captured_graph
:
# When using cuda-graph all these tensors should be
# padded.
assert
context_lens
.
shape
[
0
]
==
input_tokens
.
shape
[
0
]
assert
context_lens
.
shape
[
0
]
==
input_positions
.
shape
[
0
]
assert
context_lens
.
shape
[
0
]
==
slot_mapping
.
shape
[
0
]
# The shape of graph_block_tables is
# [max batch size, max context len // block size].
input_block_tables
=
self
.
graph_block_tables
[:
batch_size
]
...
...
@@ -361,7 +437,7 @@ class ModelRunner:
else
:
max_block_table_len
=
max
(
len
(
block_table
)
for
block_table
in
block_tables
)
block_tables
=
_
make_tensor_with_pad
(
block_tables
=
make_tensor_with_pad
(
block_tables
,
max_len
=
max_block_table_len
,
pad
=
0
,
...
...
@@ -369,23 +445,24 @@ class ModelRunner:
device
=
self
.
device
,
)
lora_index_mapping
=
[
_pad_to_max
(
mapping
,
1
,
pad
=
0
)
for
mapping
in
lora_index_mapping
]
input_metadata
=
InputMetadata
(
attn_metadata
=
self
.
attn_backend
.
make_metadata
(
is_prompt
=
False
,
slot_mapping
=
slot_mapping
,
prompt_lens
=
None
,
max_seq_len
=
None
,
start_loc
=
None
,
prompt_lens_tensor
=
None
,
num_prompt_tokens
=
0
,
num_generation_tokens
=
len
(
input_tokens
),
max_subquery_len
=
None
,
max_context_len
=
max_context_len
,
max_prompt_len
=
None
,
subquery_start_loc
=
None
,
seq_start_loc
=
None
,
context_lens
=
context_lens
,
block_tables
=
block_tables
,
use_cuda_graph
=
use_captured_graph
,
kv_cache_dtype
=
self
.
kv_cache_dtype
,
)
return
(
input_tokens
,
input_positions
,
input
_metadata
,
return
(
input_tokens
,
input_positions
,
attn
_metadata
,
lora_index_mapping
,
lora_prompt_mapping
,
lora_requests
)
def
_prepare_sample
(
...
...
@@ -400,9 +477,8 @@ class ModelRunner:
selected_token_start_idx
=
0
categorized_sample_indices
=
{
t
:
[]
for
t
in
SamplingType
}
categorized_sample_indices_start_idx
=
0
pin_memory
=
not
self
.
in_wsl
and
not
self
.
device_config
.
is_neuron
categorized_sampled_token_indices_start_idx
=
0
max_subquery_len
=
max
(
subquery_lens
)
if
subquery_lens
else
1
for
i
,
seq_group_metadata
in
enumerate
(
seq_group_metadata_list
):
seq_ids
=
list
(
seq_group_metadata
.
seq_data
.
keys
())
sampling_params
=
seq_group_metadata
.
sampling_params
...
...
@@ -417,9 +493,12 @@ class ModelRunner:
categorized_sample_indices_start_idx
+=
subquery_len
-
1
categorized_sample_indices
[
sampling_params
.
sampling_type
].
append
(
categorized_sample_indices_start_idx
)
sampling_params
.
sampling_type
].
append
([
categorized_sample_indices_start_idx
,
categorized_sampled_token_indices_start_idx
])
categorized_sample_indices_start_idx
+=
1
categorized_sampled_token_indices_start_idx
+=
1
if
sampling_params
.
prompt_logprobs
is
not
None
:
selected_token_indices
.
extend
(
...
...
@@ -427,11 +506,11 @@ class ModelRunner:
selected_token_start_idx
+
subquery_len
-
1
))
selected_token_indices
.
append
(
selected_token_start_idx
+
subquery_len
-
1
)
selected_token_start_idx
+=
max_
subquery_len
selected_token_start_idx
+=
subquery_len
if
sampling_params
.
seed
is
not
None
:
seq_group_metadata
.
state
.
generator
=
torch
.
Generator
(
device
=
"cuda"
).
manual_seed
(
sampling_params
.
seed
)
device
=
self
.
device
).
manual_seed
(
sampling_params
.
seed
)
else
:
num_seqs
=
len
(
seq_ids
)
selected_token_indices
.
extend
(
...
...
@@ -441,22 +520,32 @@ class ModelRunner:
categorized_sample_indices
[
sampling_params
.
sampling_type
].
extend
(
range
(
categorized_sample_indices_start_idx
,
categorized_sample_indices_start_idx
+
num_seqs
))
zip
(
range
(
categorized_sample_indices_start_idx
,
categorized_sample_indices_start_idx
+
num_seqs
),
range
(
categorized_sampled_token_indices_start_idx
,
categorized_sampled_token_indices_start_idx
+
num_seqs
)))
categorized_sample_indices_start_idx
+=
num_seqs
categorized_sampled_token_indices_start_idx
+=
num_seqs
if
sampling_params
.
seed
is
not
None
:
generators
.
append
(
seq_group_metadata
.
state
.
generator
)
selected_token_indices
=
_async_h2d
(
selected_token_indices
,
dtype
=
torch
.
long
,
target_device
=
self
.
device
,
pin_memory
=
pin_memory
)
selected_token_indices
=
async_tensor_h2d
(
selected_token_indices
,
dtype
=
torch
.
long
,
target_device
=
self
.
device
,
pin_memory
=
self
.
pin_memory
)
categorized_sample_indices
=
{
t
:
_async_h2d
(
seq_ids
,
dtype
=
torch
.
int
,
target_device
=
self
.
device
,
pin_memory
=
pin_memory
)
t
:
maybe_expand_dim
(
async_tensor_h2d
(
seq_ids
,
dtype
=
torch
.
int
,
target_device
=
self
.
device
,
pin_memory
=
self
.
pin_memory
),
2
,
2
)
for
t
,
seq_ids
in
categorized_sample_indices
.
items
()
}
...
...
@@ -477,33 +566,32 @@ class ModelRunner:
def
prepare_input_tensors
(
self
,
seq_group_metadata_list
:
Optional
[
List
[
SequenceGroupMetadata
]],
)
->
Tuple
[
torch
.
Tensor
,
torch
.
Tensor
,
Input
Metadata
,
SamplingMetadata
,
Set
[
int
],
LoRAMapping
]:
)
->
Tuple
[
torch
.
Tensor
,
torch
.
Tensor
,
Attention
Metadata
,
SamplingMetadata
,
Set
[
int
],
LoRAMapping
,
torch
.
Tensor
]:
if
self
.
is_driver_worker
:
# NOTE: We assume that all sequences in the group are all prompts or
# all decodes.
is_prompt
=
seq_group_metadata_list
[
0
].
is_prompt
# Prepare input tensors.
if
is_prompt
:
(
input_tokens
,
input_positions
,
input
_metadata
,
prompt_lens
,
(
input_tokens
,
input_positions
,
attn
_metadata
,
prompt_lens
,
subquery_lens
,
lora_index_mapping
,
lora_prompt_mapping
,
lora_requests
)
=
self
.
_prepare_prompt
(
seq_group_metadata_list
)
lora_requests
,
multi_modal_input
)
=
self
.
_prepare_prompt
(
seq_group_metadata_list
)
else
:
(
input_tokens
,
input_positions
,
input
_metadata
,
(
input_tokens
,
input_positions
,
attn
_metadata
,
lora_index_mapping
,
lora_prompt_mapping
,
lora_requests
)
=
self
.
_prepare_decode
(
seq_group_metadata_list
)
prompt_lens
=
[]
subquery_lens
=
None
multi_modal_input
=
None
sampling_metadata
=
self
.
_prepare_sample
(
seq_group_metadata_list
,
prompt_lens
,
subquery_lens
)
if
self
.
lora_config
:
flat_lora_index_mapping
=
[
item
for
sublist
in
lora_index_mapping
for
item
in
sublist
]
lora_mapping
=
LoRAMapping
(
flat_
lora_index_mapping
,
lora_index_mapping
,
lora_prompt_mapping
,
)
else
:
...
...
@@ -513,82 +601,77 @@ class ModelRunner:
metadata_dict
=
{
"input_tokens"
:
input_tokens
,
"input_positions"
:
input_positions
,
"is_prompt"
:
input_metadata
.
is_prompt
,
"slot_mapping"
:
input_metadata
.
slot_mapping
,
"prompt_lens"
:
input_metadata
.
prompt_lens
,
"max_seq_len"
:
input_metadata
.
max_seq_len
,
"start_loc"
:
input_metadata
.
start_loc
,
"max_context_len"
:
input_metadata
.
max_context_len
,
"context_lens"
:
input_metadata
.
context_lens
,
"block_tables"
:
input_metadata
.
block_tables
,
"use_cuda_graph"
:
input_metadata
.
use_cuda_graph
,
"kv_cache_dtype"
:
input_metadata
.
kv_cache_dtype
,
"selected_token_indices"
:
sampling_metadata
.
selected_token_indices
,
"lora_requests"
:
lora_requests
,
"lora_mapping"
:
lora_mapping
,
"multi_modal_input"
:
multi_modal_input
,
}
metadata_dict
.
update
(
attn_metadata
.
asdict_zerocopy
())
broadcast_tensor_dict
(
metadata_dict
,
src
=
0
)
else
:
metadata_dict
=
broadcast_tensor_dict
(
src
=
0
)
input_tokens
=
metadata_dict
[
"input_tokens"
]
input_positions
=
metadata_dict
[
"input_positions"
]
lora_mapping
=
metadata_dict
[
"lora_mapping"
]
lora_requests
=
metadata_dict
[
"lora_requests"
]
input_metadata
=
InputMetadata
(
is_prompt
=
metadata_dict
[
"is_prompt"
],
slot_mapping
=
metadata_dict
[
"slot_mapping"
],
prompt_lens
=
metadata_dict
[
"prompt_lens"
],
max_seq_len
=
metadata_dict
[
"max_seq_len"
],
start_loc
=
metadata_dict
[
"start_loc"
],
max_context_len
=
metadata_dict
[
"max_context_len"
],
context_lens
=
metadata_dict
[
"context_lens"
],
block_tables
=
metadata_dict
[
"block_tables"
],
use_cuda_graph
=
metadata_dict
[
"use_cuda_graph"
],
kv_cache_dtype
=
metadata_dict
[
"kv_cache_dtype"
],
)
input_tokens
=
metadata_dict
.
pop
(
"input_tokens"
)
input_positions
=
metadata_dict
.
pop
(
"input_positions"
)
selected_token_indices
=
metadata_dict
.
pop
(
"selected_token_indices"
)
lora_mapping
=
metadata_dict
.
pop
(
"lora_mapping"
)
lora_requests
=
metadata_dict
.
pop
(
"lora_requests"
)
multi_modal_input
=
metadata_dict
.
pop
(
"multi_modal_input"
)
attn_metadata
=
self
.
attn_backend
.
make_metadata
(
**
metadata_dict
)
sampling_metadata
=
SamplingMetadata
(
seq_groups
=
None
,
seq_data
=
None
,
prompt_lens
=
None
,
selected_token_indices
=
metadata_dict
[
"
selected_token_indices
"
]
,
selected_token_indices
=
selected_token_indices
,
categorized_sample_indices
=
None
,
generators
=
None
,
perform_sampling
=
False
,
)
return
(
input_tokens
,
input_positions
,
input_metadata
,
sampling_metadata
,
lora_requests
,
lora_mapping
)
return
(
input_tokens
,
input_positions
,
attn_metadata
,
sampling_metadata
,
lora_requests
,
lora_mapping
,
multi_modal_input
)
@
torch
.
inference_mode
()
def
execute_model
(
self
,
seq_group_metadata_list
:
Optional
[
List
[
SequenceGroupMetadata
]],
kv_caches
:
List
[
Tuple
[
torch
.
Tensor
,
torch
.
Tensor
]
],
kv_caches
:
List
[
torch
.
Tensor
],
)
->
Optional
[
SamplerOutput
]:
(
input_tokens
,
input_positions
,
input
_metadata
,
sampling_metadata
,
lora_requests
,
lora_mapping
)
=
self
.
prepare_input_tensors
(
seq_group_metadata_list
)
(
input_tokens
,
input_positions
,
attn
_metadata
,
sampling_metadata
,
lora_requests
,
lora_mapping
,
multi_modal_input
)
=
self
.
prepare_input_tensors
(
seq_group_metadata_list
)
if
self
.
lora_config
:
self
.
set_active_loras
(
lora_requests
,
lora_mapping
)
# Execute the model.
if
input
_metadata
.
use_cuda_graph
:
if
attn
_metadata
.
use_cuda_graph
:
graph_batch_size
=
input_tokens
.
shape
[
0
]
model_executable
=
self
.
graph_runners
[
graph_batch_size
]
else
:
model_executable
=
self
.
model
hidden_states
=
model_executable
(
input_ids
=
input_tokens
,
positions
=
input_positions
,
kv_caches
=
kv_caches
,
input_metadata
=
input_metadata
,
)
execute_model_kwargs
=
{
"input_ids"
:
input_tokens
,
"positions"
:
input_positions
,
"kv_caches"
:
kv_caches
,
"attn_metadata"
:
attn_metadata
,
}
if
self
.
vision_language_config
:
execute_model_kwargs
.
update
({
"image_input"
:
multi_modal_input
})
hidden_states
=
model_executable
(
**
execute_model_kwargs
)
# Compute the logits.
logits
=
self
.
model
.
compute_logits
(
hidden_states
,
sampling_metadata
)
# Only perform sampling in the driver worker.
if
not
sampling_metadata
.
perform_sampling
:
return
None
# Sample the next token.
output
=
self
.
model
.
sample
(
hidden_states
=
hidden_state
s
,
logits
=
logit
s
,
sampling_metadata
=
sampling_metadata
,
)
return
output
...
...
@@ -596,8 +679,7 @@ class ModelRunner:
@
torch
.
inference_mode
()
def
profile_run
(
self
)
->
None
:
# Enable top-k sampling to reflect the accurate memory usage.
vocab_size
=
self
.
model_config
.
get_vocab_size
()
sampling_params
=
SamplingParams
(
top_p
=
0.99
,
top_k
=
vocab_size
-
1
)
sampling_params
=
SamplingParams
(
top_p
=
0.99
,
top_k
=
self
.
vocab_size
-
1
)
max_num_batched_tokens
=
self
.
scheduler_config
.
max_num_batched_tokens
max_num_seqs
=
self
.
scheduler_config
.
max_num_seqs
...
...
@@ -626,10 +708,22 @@ class ModelRunner:
# Profile memory usage with max_num_sequences sequences and the total
# number of tokens equal to max_num_batched_tokens.
seqs
:
List
[
SequenceGroupMetadata
]
=
[]
# Additional GPU memory may be needed for vision encoding, which needs
# to be accounted for when calculating the GPU blocks for
# vLLM block manager.
# To exercise the worst scenario for GPU memory consumption,
# the number of seqs (batch_size) is chosen to maximize the number
# of images processed.
if
self
.
vision_language_config
:
max_num_seqs
=
min
(
max_num_seqs
,
int
(
max_num_batched_tokens
/
self
.
vision_language_config
.
image_feature_size
))
for
group_id
in
range
(
max_num_seqs
):
seq_len
=
(
max_num_batched_tokens
//
max_num_seqs
+
(
group_id
<
max_num_batched_tokens
%
max_num_seqs
))
seq_data
=
SequenceData
([
0
]
*
seq_len
)
seq_data
,
fake_multi_modal_input
=
_prepare_fake_inputs
(
seq_len
,
self
.
vision_language_config
)
seq
=
SequenceGroupMetadata
(
request_id
=
str
(
group_id
),
is_prompt
=
True
,
...
...
@@ -638,12 +732,13 @@ class ModelRunner:
block_tables
=
None
,
lora_request
=
dummy_lora_requests_per_seq
[
group_id
]
if
dummy_lora_requests_per_seq
else
None
,
multi_modal_data
=
fake_multi_modal_input
,
)
seqs
.
append
(
seq
)
# Run the model with the dummy inputs.
num_layers
=
self
.
model_config
.
get_num_layers
(
self
.
parallel_config
)
kv_caches
=
[
(
None
,
None
)
]
*
num_layers
kv_caches
=
[
None
]
*
num_layers
self
.
execute_model
(
seqs
,
kv_caches
)
torch
.
cuda
.
synchronize
()
return
...
...
@@ -675,10 +770,22 @@ class ModelRunner:
return
self
.
lora_manager
.
list_loras
()
@
torch
.
inference_mode
()
def
capture_model
(
self
,
kv_caches
:
List
[
KVCache
])
->
None
:
def
capture_model
(
self
,
kv_caches
:
List
[
torch
.
Tensor
])
->
None
:
"""Cuda graph capture a model.
Note that CUDA graph's performance gain is negligible if number
of batched tokens are larger than 200. And since CUDA graph
requires fixed sized tensors, supporting large/variable batch
size requires high GPU memory overhead. Thus, vLLM only captures
decoding requests. Mixed batch (chunked prefill + decoding) or
prefill requests are not captured.
Since it is used for decoding-only, it assumes there's only 1 token
per sequence in the batch.
"""
# NOTE(woosuk): This is a hack to ensure that the NCCL backend is never
# deleted before the CUDA graphs.
self
.
cu
py
_
nccl_backend
=
cu
py_utils
.
get_nccl_backend
()
self
.
pynccl_backend
=
py
nccl
_utils
.
get_nccl_backend
()
assert
not
self
.
model_config
.
enforce_eager
logger
.
info
(
"Capturing the model for CUDA graphs. This may lead to "
...
...
@@ -694,10 +801,9 @@ class ModelRunner:
# Prepare dummy inputs. These will be reused for all batch sizes.
max_batch_size
=
max
(
_BATCH_SIZES_TO_CAPTURE
)
input_tokens
=
torch
.
zeros
(
max_batch_size
,
1
,
dtype
=
torch
.
long
).
cuda
()
input_positions
=
torch
.
zeros
(
max_batch_size
,
1
,
dtype
=
torch
.
long
).
cuda
()
slot_mapping
=
torch
.
empty
(
max_batch_size
,
1
,
dtype
=
torch
.
long
).
cuda
()
input_tokens
=
torch
.
zeros
(
max_batch_size
,
dtype
=
torch
.
long
).
cuda
()
input_positions
=
torch
.
zeros
(
max_batch_size
,
dtype
=
torch
.
long
).
cuda
()
slot_mapping
=
torch
.
empty
(
max_batch_size
,
dtype
=
torch
.
long
).
cuda
()
slot_mapping
.
fill_
(
_PAD_SLOT_ID
)
context_lens
=
torch
.
ones
(
max_batch_size
,
dtype
=
torch
.
int32
).
cuda
()
block_tables
=
torch
.
from_numpy
(
self
.
graph_block_tables
).
cuda
()
...
...
@@ -709,23 +815,28 @@ class ModelRunner:
]
# NOTE(woosuk): There are 3 backends for all-reduce: custom all-reduce
# kernel,
CuPy NCCL
, and PyTorch NCCL. When using CUDA graph, we use
# either custom all-reduce kernel or
CuPy NCCL
. When not using CUDA
# kernel,
pynccl
, and PyTorch NCCL. When using CUDA graph, we use
# either custom all-reduce kernel or
pynccl
. When not using CUDA
# graph, we use either custom all-reduce kernel or PyTorch NCCL.
# We always prioritize using custom all-reduce kernel but fall back
# to PyTorch or
CuPy NCCL
if it is disabled or not supported.
# to PyTorch or
pynccl
if it is disabled or not supported.
with
custom_all_reduce
.
capture
():
# NOTE: Capturing the largest batch size first may help reduce the
# memory usage of CUDA graph.
for
batch_size
in
reversed
(
batch_size_capture_list
):
# Create dummy
input
_metadata.
input
_metadata
=
InputM
etadata
(
# Create dummy
attn
_metadata.
attn
_metadata
=
self
.
attn_backend
.
make_m
etadata
(
is_prompt
=
False
,
slot_mapping
=
slot_mapping
[:
batch_size
],
prompt_lens
=
None
,
max_seq_len
=
None
,
start_loc
=
None
,
prompt_lens_tensor
=
None
,
num_prompt_tokens
=
0
,
num_generation_tokens
=
batch_size
,
max_subquery_len
=
None
,
max_context_len
=
self
.
max_context_len_to_capture
,
max_prompt_len
=
None
,
subquery_start_loc
=
None
,
seq_start_loc
=
None
,
context_lens
=
context_lens
[:
batch_size
],
block_tables
=
block_tables
[:
batch_size
],
use_cuda_graph
=
True
,
...
...
@@ -744,7 +855,7 @@ class ModelRunner:
input_tokens
[:
batch_size
],
input_positions
[:
batch_size
],
kv_caches
,
input
_metadata
,
attn
_metadata
,
memory_pool
=
self
.
graph_memory_pool
,
)
self
.
graph_memory_pool
=
graph_runner
.
graph
.
pool
()
...
...
@@ -756,12 +867,18 @@ class ModelRunner:
logger
.
info
(
f
"Graph capturing finished in
{
elapsed_time
:.
0
f
}
secs."
)
def
__del__
(
self
)
->
None
:
# Delete the CUDA graphs before deleting the
CuPy NCCL
communicator.
# Delete the CUDA graphs before deleting the
pynccl
communicator.
# NOTE(woosuk): This is necessary because otherwise deadlocks can
# happen.
# FIXME(woosuk): This is a bit hacky. Find a more robust solution.
# TODO(youkaichao): when we get enough user feedback that pynccl is
# more stable than cupy, we can remove this, e.g. in v0.4.1.
self
.
graph_runners
.
clear
()
self
.
cupy_nccl_backend
=
None
self
.
pynccl_backend
=
None
@property
def vocab_size(self) -> int:
    """Return the vocabulary size reported by the model configuration.

    This is a thin read-only accessor around
    ``self.model_config.get_vocab_size()``; nothing is cached here.
    """
    return self.model_config.get_vocab_size()
class
CUDAGraphRunner
:
...
...
@@ -776,20 +893,22 @@ class CUDAGraphRunner:
self
,
input_ids
:
torch
.
Tensor
,
positions
:
torch
.
Tensor
,
kv_caches
:
List
[
KVCache
],
input
_metadata
:
Input
Metadata
,
kv_caches
:
List
[
torch
.
Tensor
],
attn
_metadata
:
Attention
Metadata
,
memory_pool
,
**
kwargs
,
)
->
None
:
assert
self
.
graph
is
None
# Run the model once without capturing the graph.
# This is to make sure that the captured graph does not include the
# kernel launches for initial benchmarking (e.g., Triton autotune).
with
_maybe_
cu
py
_
nccl
():
with
_maybe_pynccl
():
self
.
model
(
input_ids
,
positions
,
kv_caches
,
input_metadata
,
attn_metadata
,
**
kwargs
,
)
torch
.
cuda
.
synchronize
()
...
...
@@ -798,12 +917,13 @@ class CUDAGraphRunner:
# https://stackoverflow.com/questions/31039022/python-multi-line-with-statement
self
.
graph
=
torch
.
cuda
.
CUDAGraph
()
with
torch
.
cuda
.
graph
(
self
.
graph
,
pool
=
memory_pool
):
# noqa: SIM117
with
_maybe_
cu
py
_
nccl
():
with
_maybe_pynccl
():
hidden_states
=
self
.
model
(
input_ids
,
positions
,
kv_caches
,
input_metadata
,
attn_metadata
,
**
kwargs
,
)
torch
.
cuda
.
synchronize
()
...
...
@@ -812,9 +932,9 @@ class CUDAGraphRunner:
"input_ids"
:
input_ids
,
"positions"
:
positions
,
"kv_caches"
:
kv_caches
,
"slot_mapping"
:
input
_metadata
.
slot_mapping
,
"context_lens"
:
input
_metadata
.
context_lens
,
"block_tables"
:
input
_metadata
.
block_tables
,
"slot_mapping"
:
attn
_metadata
.
slot_mapping
,
"context_lens"
:
attn
_metadata
.
context_lens
,
"block_tables"
:
attn
_metadata
.
block_tables
,
}
self
.
output_buffers
=
{
"hidden_states"
:
hidden_states
}
return
...
...
@@ -823,8 +943,9 @@ class CUDAGraphRunner:
self
,
input_ids
:
torch
.
Tensor
,
positions
:
torch
.
Tensor
,
kv_caches
:
List
[
Tuple
[
torch
.
Tensor
,
torch
.
Tensor
]],
input_metadata
:
InputMetadata
,
kv_caches
:
List
[
torch
.
Tensor
],
attn_metadata
:
AttentionMetadata
,
**
kwargs
,
)
->
torch
.
Tensor
:
# KV caches are fixed tensors, so we don't need to copy them.
del
kv_caches
...
...
@@ -832,13 +953,12 @@ class CUDAGraphRunner:
# Copy the input tensors to the input buffers.
self
.
input_buffers
[
"input_ids"
].
copy_
(
input_ids
,
non_blocking
=
True
)
self
.
input_buffers
[
"positions"
].
copy_
(
positions
,
non_blocking
=
True
)
self
.
input_buffers
[
"slot_mapping"
].
copy_
(
input
_metadata
.
slot_mapping
,
self
.
input_buffers
[
"slot_mapping"
].
copy_
(
attn
_metadata
.
slot_mapping
,
non_blocking
=
True
)
self
.
input_buffers
[
"context_lens"
].
copy_
(
input
_metadata
.
context_lens
,
self
.
input_buffers
[
"context_lens"
].
copy_
(
attn
_metadata
.
context_lens
,
non_blocking
=
True
)
self
.
input_buffers
[
"block_tables"
].
copy_
(
input
_metadata
.
block_tables
,
self
.
input_buffers
[
"block_tables"
].
copy_
(
attn
_metadata
.
block_tables
,
non_blocking
=
True
)
# Run the graph.
self
.
graph
.
replay
()
...
...
@@ -850,44 +970,43 @@ class CUDAGraphRunner:
@
contextlib
.
contextmanager
def
_maybe_cupy_nccl
():
if
cupy_utils
.
is_initialized
()
and
not
custom_all_reduce
.
is_initialized
():
with
with_cupy_nccl_for_all_reduce
():
def
_maybe_pynccl
():
if
pynccl_utils
.
is_initialized
(
)
and
not
custom_all_reduce
.
is_initialized
():
with
with_pynccl_for_all_reduce
():
yield
else
:
yield
def
_pad_to_max
(
x
:
List
[
int
],
max_len
:
int
,
pad
:
int
)
->
List
[
int
]:
assert
len
(
x
)
<=
max_len
return
x
+
[
pad
]
*
(
max_len
-
len
(
x
))
def _make_tensor_with_pad(
    x: List[List[int]],
    max_len: int,
    pad: int,
    dtype: torch.dtype,
    device: Optional[Union[str, torch.device]],
) -> torch.Tensor:
    """Build a tensor from ragged integer rows by right-padding each row.

    Every row of ``x`` is padded to ``max_len`` with ``pad`` via
    ``_pad_to_max`` and the padded rows are handed to ``torch.tensor``.

    Args:
        x: Rows of integers; each row must have at most ``max_len`` items
            (``_pad_to_max`` asserts this).
        max_len: Length every row is padded to.
        pad: Padding value.
        dtype: dtype passed to ``torch.tensor``.
        device: Device passed to ``torch.tensor``.

    Returns:
        The tensor created from the padded rows.
    """
    padded_x = [_pad_to_max(x_i, max_len, pad) for x_i in x]
    return torch.tensor(padded_x, dtype=dtype, device=device)
def
_get_graph_batch_size
(
batch_size
:
int
)
->
int
:
"""Returns the padded batch size given actual batch size.
Batch sizes are 1, 2, 4, _BATCH_SIZE_ALIGNMENT,
2*_BATCH_SIZE_ALIGNMENT, 3*_BATCH_SIZE_ALIGNMENT...
"""
if
batch_size
<=
2
:
return
batch_size
elif
batch_size
<=
4
:
return
4
else
:
return
(
batch_size
+
7
)
//
8
*
8
def
_async_h2d
(
data
:
list
,
dtype
:
torch
.
dtype
,
target_device
:
Union
[
str
,
torch
.
device
],
pin_memory
:
bool
,
)
->
torch
.
Tensor
:
t
=
torch
.
tensor
(
data
,
dtype
=
dtype
,
pin_memory
=
pin_memory
,
device
=
"cpu"
)
return
t
.
to
(
device
=
target_device
,
non_blocking
=
True
)
return
((
batch_size
+
_BATCH_SIZE_ALIGNMENT
-
1
)
//
_BATCH_SIZE_ALIGNMENT
*
_BATCH_SIZE_ALIGNMENT
)
def _prepare_fake_inputs(
        seq_len: int, vision_language_config: Optional[VisionLanguageConfig]):
    """Prepare fake inputs for profile run.

    Args:
        seq_len: Number of prompt tokens requested for the dummy sequence.
        vision_language_config: When truthy, the prompt starts with
            ``image_feature_size`` copies of ``image_token_id`` and a
            zero-filled image tensor of ``image_input_shape`` is attached.

    Returns:
        A ``(SequenceData, multi_modal_data)`` pair; the second item is
        ``None`` when no vision-language config is given.
    """
    if vision_language_config:
        # Image placeholder tokens first, then zeros for the remainder.
        # NOTE(review): if seq_len < image_feature_size the zero run is
        # empty and the prompt is longer than seq_len - confirm callers
        # never request that.
        prompt_tokens = [
            vision_language_config.image_token_id
        ] * vision_language_config.image_feature_size + [0] * (
            seq_len - vision_language_config.image_feature_size)
        fake_image_input = MultiModalData(
            type=MultiModalData.Type.IMAGE,
            data=torch.zeros(vision_language_config.image_input_shape,
                             dtype=torch.float16))
    else:
        prompt_tokens = [0] * seq_len
        fake_image_input = None
    return SequenceData(prompt_tokens), fake_image_input
vllm/worker/neuron_model_runner.py
0 → 100644
View file @
7c4f76e3
from
typing
import
Dict
,
List
,
Optional
,
Tuple
import
torch
from
vllm.config
import
(
DeviceConfig
,
ModelConfig
,
ParallelConfig
,
SchedulerConfig
)
from
vllm.logger
import
init_logger
from
vllm.model_executor
import
SamplingMetadata
from
vllm.model_executor.neuron_model_loader
import
get_neuron_model
from
vllm.sampling_params
import
SamplingParams
,
SamplingType
from
vllm.sequence
import
SamplerOutput
,
SequenceData
,
SequenceGroupMetadata
from
vllm.utils
import
(
async_tensor_h2d
,
is_pin_memory_available
,
make_tensor_with_pad
,
maybe_expand_dim
)
logger
=
init_logger
(
__name__
)
class NeuronModelRunner:
    """Prepare inputs for, and run, a model loaded via ``get_neuron_model``.

    The runner turns ``SequenceGroupMetadata`` lists into token, position
    and block-id tensors plus a ``SamplingMetadata`` object, calls the
    model, and samples the next tokens.
    """

    def __init__(
        self,
        model_config: ModelConfig,
        parallel_config: ParallelConfig,
        scheduler_config: SchedulerConfig,
        device_config: DeviceConfig,
    ):
        """Store the configs and derive device / pinned-memory settings.

        A warning is logged when ``model_config`` reports a sliding
        window. If ``device_config`` is ``None`` a default
        ``DeviceConfig()`` is used. ``self.model`` stays ``None`` until
        ``load_model`` is called.
        """
        self.model_config = model_config
        self.parallel_config = parallel_config
        self.scheduler_config = scheduler_config

        if model_config is not None and model_config.get_sliding_window():
            logger.warning("Sliding window is not supported on Neuron. "
                           "The model will run without sliding window.")
        self.device_config = (device_config
                              if device_config is not None else DeviceConfig())
        self.device = self.device_config.device
        self.model = None
        self.pin_memory = is_pin_memory_available()

    def load_model(self) -> None:
        """Load the model with ``get_neuron_model`` into ``self.model``."""
        self.model = get_neuron_model(self.model_config,
                                      parallel_config=self.parallel_config,
                                      scheduler_config=self.scheduler_config)

    def _prepare_prompt(
        self,
        seq_group_metadata_list: List[SequenceGroupMetadata],
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, List[int]]:
        """Build the model inputs for a batch of prompt sequence groups.

        Every group must be a prompt with exactly one sequence whose block
        table has exactly one entry (all checked with assertions).

        Returns:
            ``(input_tokens, input_positions, input_block_ids,
            prompt_lens)``. Tokens and positions are padded with 0 to the
            longest prompt; ``prompt_lens`` holds the unpadded lengths.
        """
        assert len(seq_group_metadata_list) > 0
        input_tokens: List[List[int]] = []
        input_positions: List[List[int]] = []
        input_block_ids: List[int] = []

        prompt_lens: List[int] = []
        for seq_group_metadata in seq_group_metadata_list:
            assert seq_group_metadata.is_prompt
            seq_ids = list(seq_group_metadata.seq_data.keys())
            assert len(seq_ids) == 1
            seq_id = seq_ids[0]

            seq_data = seq_group_metadata.seq_data[seq_id]
            prompt_tokens = seq_data.get_token_ids()
            prompt_len = len(prompt_tokens)
            prompt_lens.append(prompt_len)

            input_tokens.append(prompt_tokens)
            input_positions.append(list(range(prompt_len)))

            assert seq_group_metadata.block_tables is not None
            block_table = seq_group_metadata.block_tables[seq_id]
            assert len(block_table) == 1
            input_block_ids.append(block_table[0])

        max_prompt_len = max(prompt_lens)
        assert max_prompt_len > 0
        input_tokens = make_tensor_with_pad(input_tokens,
                                            max_prompt_len,
                                            pad=0,
                                            dtype=torch.long,
                                            device=self.device)
        input_positions = make_tensor_with_pad(input_positions,
                                               max_prompt_len,
                                               pad=0,
                                               dtype=torch.long,
                                               device=self.device)
        input_block_ids = torch.tensor(input_block_ids,
                                       dtype=torch.long,
                                       device=self.device)

        return input_tokens, input_positions, input_block_ids, prompt_lens

    def _prepare_decode(
        self,
        seq_group_metadata_list: List[SequenceGroupMetadata],
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Build the model inputs for a batch of decode sequence groups.

        For every sequence of every group the last token id and its
        position (``seq_len - 1``) are emitted as one-element rows. Each
        sequence's block table must have exactly one entry.

        Returns:
            ``(input_tokens, input_positions, input_block_ids)``.
        """
        assert len(seq_group_metadata_list) > 0
        input_tokens: List[List[int]] = []
        input_positions: List[List[int]] = []
        input_block_ids: List[int] = []

        for seq_group_metadata in seq_group_metadata_list:
            assert not seq_group_metadata.is_prompt

            seq_ids = list(seq_group_metadata.seq_data.keys())

            for seq_id in seq_ids:
                seq_data = seq_group_metadata.seq_data[seq_id]
                generation_token = seq_data.get_last_token_id()
                input_tokens.append([generation_token])

                # The token being fed sits at the last position of the
                # sequence so far. (A per-sequence context length used to
                # be collected here too, but it was never returned or
                # read, so it is no longer built.)
                position = seq_data.get_len() - 1
                input_positions.append([position])

                assert seq_group_metadata.block_tables is not None
                block_table = seq_group_metadata.block_tables[seq_id]
                assert len(block_table) == 1
                input_block_ids.append(block_table[0])

        input_tokens = make_tensor_with_pad(input_tokens,
                                            max_len=1,
                                            pad=0,
                                            dtype=torch.long,
                                            device=self.device)
        input_positions = make_tensor_with_pad(input_positions,
                                               max_len=1,
                                               pad=0,
                                               dtype=torch.long,
                                               device=self.device)
        input_block_ids = torch.tensor(input_block_ids,
                                       dtype=torch.long,
                                       device=self.device)

        return input_tokens, input_positions, input_block_ids

    def _prepare_sample(
        self,
        seq_group_metadata_list: List[SequenceGroupMetadata],
        prompt_lens: List[int],
    ) -> SamplingMetadata:
        """Assemble the ``SamplingMetadata`` for the current batch.

        Args:
            seq_group_metadata_list: The scheduled sequence groups.
            prompt_lens: Per-group prompt lengths, indexed in step with
                ``seq_group_metadata_list``; only read for prompt groups.

        Returns:
            A ``SamplingMetadata`` carrying the sequence groups, merged
            sequence data, selected token indices, per-sampling-type
            index pairs and any seeded generators.
        """
        seq_groups: List[Tuple[List[int], SamplingParams]] = []
        selected_token_indices: List[int] = []
        generators: List[torch.Generator] = []
        selected_token_start_idx = 0
        categorized_sample_indices = {t: [] for t in SamplingType}
        # Two running counters; each categorized entry is a pair
        # (sample index, sampled-token index).
        categorized_sample_indices_start_idx = 0
        categorized_sampled_token_indices_start_idx = 0

        for i, seq_group_metadata in enumerate(seq_group_metadata_list):
            seq_ids = list(seq_group_metadata.seq_data.keys())
            sampling_params = seq_group_metadata.sampling_params
            seq_groups.append((seq_ids, sampling_params))

            if seq_group_metadata.is_prompt:
                assert len(seq_ids) == 1
                assert prompt_lens is not None
                prompt_len = prompt_lens[i]
                if sampling_params.prompt_logprobs is not None:
                    # NOTE: prompt token positions do not need sample, skip
                    categorized_sample_indices_start_idx += prompt_len - 1

                categorized_sample_indices[
                    sampling_params.sampling_type].append([
                        categorized_sample_indices_start_idx,
                        categorized_sampled_token_indices_start_idx
                    ])
                categorized_sample_indices_start_idx += 1
                categorized_sampled_token_indices_start_idx += 1

                if sampling_params.prompt_logprobs is not None:
                    # Select every prompt position except the last one
                    # here; the last one is appended just below.
                    selected_token_indices.extend(
                        range(selected_token_start_idx,
                              selected_token_start_idx + prompt_len - 1))
                selected_token_indices.append(selected_token_start_idx +
                                              prompt_len - 1)
                selected_token_start_idx += prompt_len

                if sampling_params.seed is not None:
                    # The seeded generator is stored on the group's state
                    # object and collected into `generators` below.
                    seq_group_metadata.state.generator = torch.Generator(
                        device=self.device).manual_seed(sampling_params.seed)
            else:
                num_seqs = len(seq_ids)
                selected_token_indices.extend(
                    range(selected_token_start_idx,
                          selected_token_start_idx + num_seqs))
                selected_token_start_idx += num_seqs

                categorized_sample_indices[
                    sampling_params.sampling_type].extend(
                        zip(
                            range(
                                categorized_sample_indices_start_idx,
                                categorized_sample_indices_start_idx +
                                num_seqs),
                            range(
                                categorized_sampled_token_indices_start_idx,
                                categorized_sampled_token_indices_start_idx +
                                num_seqs)))
                categorized_sample_indices_start_idx += num_seqs
                categorized_sampled_token_indices_start_idx += num_seqs

            if sampling_params.seed is not None:
                generators.append(seq_group_metadata.state.generator)

        selected_token_indices = async_tensor_h2d(selected_token_indices,
                                                  dtype=torch.long,
                                                  target_device=self.device,
                                                  pin_memory=self.pin_memory)

        categorized_sample_indices = {
            t: maybe_expand_dim(
                async_tensor_h2d(indices,
                                 dtype=torch.int,
                                 target_device=self.device,
                                 pin_memory=self.pin_memory), 2, 2)
            for t, indices in categorized_sample_indices.items()
        }

        seq_data: Dict[int, SequenceData] = {}
        for seq_group_metadata in seq_group_metadata_list:
            seq_data.update(seq_group_metadata.seq_data)

        sampling_metadata = SamplingMetadata(
            seq_groups=seq_groups,
            seq_data=seq_data,
            prompt_lens=prompt_lens,
            selected_token_indices=selected_token_indices,
            categorized_sample_indices=categorized_sample_indices,
            generators=generators,
        )
        return sampling_metadata

    def prepare_input_tensors(
        self,
        seq_group_metadata_list: Optional[List[SequenceGroupMetadata]],
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, SamplingMetadata]:
        """Prepare model inputs and sampling metadata for one step.

        The first group's ``is_prompt`` flag selects prompt or decode
        preparation for the whole batch. ``seq_group_metadata_list`` is
        indexed unconditionally, so it must be a non-empty list even
        though the annotation allows ``None``.

        Returns:
            ``(input_tokens, input_positions, input_block_ids,
            sampling_metadata)``.
        """
        # NOTE: We assume that all sequences in the group are all prompts or
        # all decodes.
        is_prompt = seq_group_metadata_list[0].is_prompt
        # Prepare input tensors.
        if is_prompt:
            (input_tokens, input_positions, input_block_ids,
             prompt_lens) = self._prepare_prompt(seq_group_metadata_list)
        else:
            (input_tokens, input_positions,
             input_block_ids) = self._prepare_decode(seq_group_metadata_list)
            # Decode batches carry no prompt lengths.
            prompt_lens = []
        sampling_metadata = self._prepare_sample(seq_group_metadata_list,
                                                 prompt_lens)

        return (input_tokens, input_positions, input_block_ids,
                sampling_metadata)

    @torch.inference_mode()
    def execute_model(
        self,
        seq_group_metadata_list: Optional[List[SequenceGroupMetadata]],
    ) -> Optional[SamplerOutput]:
        """Run one forward pass and sample the next tokens.

        Calls ``self.model`` with the prepared tensors, converts the
        result to logits with ``self.model.compute_logits`` and returns
        whatever ``self.model.sample`` produces.
        """
        (input_tokens, input_positions, input_block_ids, sampling_metadata
         ) = self.prepare_input_tensors(seq_group_metadata_list)

        hidden_states = self.model(
            input_ids=input_tokens,
            positions=input_positions,
            input_block_ids=input_block_ids,
        )

        # Compute the logits.
        logits = self.model.compute_logits(hidden_states, sampling_metadata)

        # Sample the next token.
        output = self.model.sample(
            logits=logits,
            sampling_metadata=sampling_metadata,
        )
        return output

    @property
    def vocab_size(self) -> int:
        """Vocabulary size from ``self.model_config.get_vocab_size()``."""
        return self.model_config.get_vocab_size()
vllm/worker/neuron_worker.py
View file @
7c4f76e3
"""A Neuron worker class."""
from
typing
import
Dict
,
List
,
Optional
,
Tuple
from
typing
import
List
,
Optional
import
torch
import
torch.distributed
from
vllm.config
import
(
CacheConfig
,
DeviceConfig
,
ModelConfig
,
ParallelConfig
,
SchedulerConfig
,
LoRA
Config
)
from
vllm.config
import
(
DeviceConfig
,
ModelConfig
,
ParallelConfig
,
Scheduler
Config
)
from
vllm.model_executor
import
set_random_seed
from
vllm.model_executor.parallel_utils.communication_op
import
(
broadcast_tensor_dict
)
from
vllm.model_executor.parallel_utils.parallel_state
import
(
ensure_model_parallel_initialized
)
from
vllm.sequence
import
SamplerOutput
,
SequenceGroupMetadata
from
vllm.worker.cache_engine
import
CacheEngine
from
vllm.worker.model_runner
import
ModelRunner
from
vllm.worker.neuron_model_runner
import
NeuronModelRunner
class
Worker
:
class
Neuron
Worker
:
"""A worker class that executes the model on a group of neuron cores.
"""
...
...
@@ -26,166 +21,32 @@ class Worker:
parallel_config
:
ParallelConfig
,
scheduler_config
:
SchedulerConfig
,
device_config
:
DeviceConfig
,
local_rank
:
int
,
rank
:
int
,
distributed_init_method
:
str
,
lora_config
:
Optional
[
LoRAConfig
]
=
None
,
kv_cache_dtype
:
Optional
[
str
]
=
"auto"
,
is_driver_worker
:
bool
=
False
,
)
->
None
:
self
.
model_config
=
model_config
self
.
parallel_config
=
parallel_config
self
.
scheduler_config
=
scheduler_config
self
.
device_config
=
device_config
self
.
local_rank
=
local_rank
self
.
rank
=
rank
self
.
distributed_init_method
=
distributed_init_method
self
.
lora_config
=
lora_config
self
.
is_driver_worker
=
is_driver_worker
if
self
.
is_driver_worker
:
assert
self
.
rank
==
0
,
"The driver worker must have rank 0."
self
.
model_runner
=
ModelRunner
(
model_config
,
parallel_config
,
scheduler_config
,
device_config
,
lora_config
=
self
.
lora_config
,
is_driver_worker
=
is_driver_worker
)
# Uninitialized cache engine. Will be initialized by
# self.init_cache_engine().
self
.
cache_config
=
None
self
.
cache_engine
=
None
self
.
cache_events
=
None
self
.
gpu_cache
=
None
self
.
model_runner
=
NeuronModelRunner
(
model_config
,
parallel_config
,
scheduler_config
,
device_config
)
def
init_model
(
self
)
->
None
:
# Initialize the distributed environment.
_init_distributed_environment
(
self
.
parallel_config
,
self
.
rank
,
self
.
distributed_init_method
,
distributed_backend
=
"gloo"
)
# Initialize the model.
def
init_device
(
self
)
->
None
:
# Set random seed.
set_random_seed
(
self
.
model_config
.
seed
)
def
load_model
(
self
):
self
.
model_runner
.
load_model
()
@
torch
.
inference_mode
()
def
profile_num_available_blocks
(
self
,
block_size
:
int
=
128
,
gpu_memory_utilization
:
float
=
0.9
,
cpu_swap_space
:
int
=
0
,
cache_dtype
:
str
=
"float16"
,
)
->
Tuple
[
int
,
int
]:
"""Simply returns max_num_seqs as num_gpu_blocks, 0 as num_cpu_blocks."""
num_gpu_blocks
=
self
.
scheduler_config
.
max_num_seqs
num_cpu_blocks
=
0
return
num_gpu_blocks
,
num_cpu_blocks
def
init_cache_engine
(
self
,
cache_config
:
CacheConfig
)
->
None
:
self
.
cache_config
=
cache_config
self
.
cache_engine
=
CacheEngine
(
self
.
cache_config
,
self
.
model_config
,
self
.
parallel_config
)
self
.
model_runner
.
set_block_size
(
self
.
cache_engine
.
block_size
)
def
warm_up_model
(
self
)
->
None
:
# Warm up is maintained in transformers-neuronx
pass
def
cache_swap
(
self
,
blocks_to_swap_in
:
Dict
[
int
,
int
],
blocks_to_swap_out
:
Dict
[
int
,
int
],
blocks_to_copy
:
Dict
[
int
,
List
[
int
]],
)
->
None
:
# Issue cache operations.
issued_cache_op
=
False
if
blocks_to_swap_in
:
self
.
cache_engine
.
swap_in
(
blocks_to_swap_in
)
issued_cache_op
=
True
if
blocks_to_swap_out
:
self
.
cache_engine
.
swap_out
(
blocks_to_swap_out
)
issued_cache_op
=
True
if
blocks_to_copy
:
self
.
cache_engine
.
copy
(
blocks_to_copy
)
issued_cache_op
=
True
cache_events
=
self
.
cache_events
if
issued_cache_op
else
None
# Wait for cache operations to finish.
if
cache_events
is
not
None
:
raise
NotImplementedError
(
"cache operations are not implemented for neuron backend."
)
@
torch
.
inference_mode
()
def
execute_model
(
self
,
seq_group_metadata_list
:
Optional
[
List
[
SequenceGroupMetadata
]]
=
None
,
blocks_to_swap_in
:
Optional
[
Dict
[
int
,
int
]]
=
None
,
blocks_to_swap_out
:
Optional
[
Dict
[
int
,
int
]]
=
None
,
blocks_to_copy
:
Optional
[
Dict
[
int
,
List
[
int
]]]
=
None
,
seq_group_metadata_list
:
List
[
SequenceGroupMetadata
],
)
->
Optional
[
SamplerOutput
]:
if
self
.
is_driver_worker
:
assert
seq_group_metadata_list
is
not
None
num_seq_groups
=
len
(
seq_group_metadata_list
)
assert
blocks_to_swap_in
is
not
None
assert
blocks_to_swap_out
is
not
None
assert
blocks_to_copy
is
not
None
data
=
{
"num_seq_groups"
:
num_seq_groups
,
"blocks_to_swap_in"
:
blocks_to_swap_in
,
"blocks_to_swap_out"
:
blocks_to_swap_out
,
"blocks_to_copy"
:
blocks_to_copy
,
}
broadcast_tensor_dict
(
data
,
src
=
0
)
else
:
data
=
broadcast_tensor_dict
(
src
=
0
)
num_seq_groups
=
data
[
"num_seq_groups"
]
blocks_to_swap_in
=
data
[
"blocks_to_swap_in"
]
blocks_to_swap_out
=
data
[
"blocks_to_swap_out"
]
blocks_to_copy
=
data
[
"blocks_to_copy"
]
self
.
cache_swap
(
blocks_to_swap_in
,
blocks_to_swap_out
,
blocks_to_copy
)
num_seq_groups
=
len
(
seq_group_metadata_list
)
# If there is no input, we don't need to execute the model.
if
num_seq_groups
==
0
:
return
{}
output
=
self
.
model_runner
.
execute_model
(
seq_group_metadata_list
,
self
.
gpu_cache
)
output
=
self
.
model_runner
.
execute_model
(
seq_group_metadata_list
)
return
output
def _init_distributed_environment(
    parallel_config: ParallelConfig,
    rank: int,
    distributed_init_method: Optional[str] = None,
    distributed_backend: Optional[str] = None,
) -> None:
    """Initialize the distributed environment.

    When ``torch.distributed`` is already initialized, only its world size
    is checked against ``parallel_config.world_size``. Otherwise a process
    group is created with the given init method, using ``"nccl"`` when no
    backend is supplied. In both cases a one-element all-reduce is run as a
    warm-up and ``ensure_model_parallel_initialized`` is called with the
    tensor and pipeline parallel sizes.

    Raises:
        RuntimeError: ``torch.distributed`` is already initialized with a
            world size different from ``parallel_config.world_size``.
        ValueError: ``torch.distributed`` is not initialized and
            ``distributed_init_method`` is empty or ``None``.
    """
    if not torch.distributed.is_initialized():
        if not distributed_init_method:
            raise ValueError(
                "distributed_init_method must be set if torch.distributed "
                "is not already initialized")
        torch.distributed.init_process_group(
            backend=distributed_backend or "nccl",
            world_size=parallel_config.world_size,
            rank=rank,
            init_method=distributed_init_method,
        )
    else:
        torch_world_size = torch.distributed.get_world_size()
        if torch_world_size != parallel_config.world_size:
            raise RuntimeError(
                "torch.distributed is already initialized but the torch world "
                "size does not match parallel_config.world_size "
                f"({torch_world_size} vs. {parallel_config.world_size}).")

    # A small all_reduce for warmup.
    warmup_tensor = torch.zeros(1)
    torch.distributed.all_reduce(warmup_tensor)
    ensure_model_parallel_initialized(
        parallel_config.tensor_parallel_size,
        parallel_config.pipeline_parallel_size,
    )
vllm/worker/worker.py
View file @
7c4f76e3
"""A GPU worker class."""
import
gc
import
os
from
typing
import
Dict
,
List
,
Tuple
,
Set
,
Optional
from
typing
import
Dict
,
List
,
Optional
,
Set
,
Tuple
import
torch
import
torch.distributed
from
vllm.config
import
(
CacheConfig
,
DeviceConfig
,
ModelConfig
,
ParallelConfig
,
SchedulerConfig
,
LoRAConfig
)
from
vllm.config
import
(
CacheConfig
,
DeviceConfig
,
LoRAConfig
,
ModelConfig
,
ParallelConfig
,
SchedulerConfig
,
VisionLanguageConfig
)
from
vllm.lora.request
import
LoRARequest
from
vllm.model_executor
import
set_random_seed
from
vllm.model_executor.parallel_utils
import
cu
py_utils
from
vllm.model_executor.parallel_utils
import
py
nccl
_utils
from
vllm.model_executor.parallel_utils.communication_op
import
(
broadcast_tensor_dict
)
from
vllm.model_executor.parallel_utils.custom_all_reduce
import
init_custom_ar
...
...
@@ -18,8 +19,6 @@ from vllm.model_executor.parallel_utils.parallel_state import (
from
vllm.sequence
import
SamplerOutput
,
SequenceGroupMetadata
from
vllm.worker.cache_engine
import
CacheEngine
from
vllm.worker.model_runner
import
ModelRunner
from
vllm.lora.request
import
LoRARequest
from
vllm.utils
import
is_hip
class
Worker
:
...
...
@@ -40,6 +39,7 @@ class Worker:
rank
:
int
,
distributed_init_method
:
str
,
lora_config
:
Optional
[
LoRAConfig
]
=
None
,
vision_language_config
:
Optional
[
VisionLanguageConfig
]
=
None
,
kv_cache_dtype
:
Optional
[
str
]
=
"auto"
,
is_driver_worker
:
bool
=
False
,
)
->
None
:
...
...
@@ -55,21 +55,27 @@ class Worker:
if
self
.
is_driver_worker
:
assert
self
.
rank
==
0
,
"The driver worker must have rank 0."
self
.
model_runner
=
ModelRunner
(
model_config
,
parallel_config
,
scheduler_config
,
device_config
,
lora_config
=
self
.
lora_config
,
kv_cache_dtype
=
kv_cache_dtype
,
is_driver_worker
=
is_driver_worker
)
self
.
vision_language_config
=
vision_language_config
if
self
.
vision_language_config
:
assert
not
self
.
lora_config
,
(
"To be tested: vision language model with LoRA settings."
)
self
.
model_runner
=
ModelRunner
(
model_config
,
parallel_config
,
scheduler_config
,
device_config
,
lora_config
=
self
.
lora_config
,
kv_cache_dtype
=
kv_cache_dtype
,
is_driver_worker
=
is_driver_worker
,
vision_language_config
=
vision_language_config
)
# Uninitialized cache engine. Will be initialized by
# self.init_cache_engine().
self
.
cache_config
=
None
self
.
cache_engine
=
None
self
.
cache_events
=
None
self
.
gpu_cache
=
None
def
init_
model
(
self
,
cupy_port
:
Optional
[
int
]
=
None
)
->
None
:
def
init_
device
(
self
)
->
None
:
if
self
.
device_config
.
device
.
type
==
"cuda"
:
# torch.distributed.all_reduce does not free the input tensor until
# the synchronization point. This causes the memory usage to grow
...
...
@@ -92,8 +98,9 @@ class Worker:
f
"Not support device type:
{
self
.
device_config
.
device
}
"
)
# Initialize the distributed environment.
init_distributed_environment
(
self
.
parallel_config
,
self
.
rank
,
cupy_port
,
self
.
distributed_init_method
)
# Initialize the model.
self
.
distributed_init_method
,
self
.
local_rank
)
# Set random seed.
set_random_seed
(
self
.
model_config
.
seed
)
def
load_model
(
self
):
...
...
@@ -130,9 +137,12 @@ class Worker:
# NOTE(woosuk): Here we assume that the other processes using the same
# GPU did not change their memory usage during the profiling.
peak_memory
=
self
.
init_gpu_memory
-
free_gpu_memory
assert
peak_memory
>
0
,
(
"Error in memory profiling. This happens when the GPU memory was "
"not properly cleaned up before initializing the vLLM instance."
)
cache_block_size
=
CacheEngine
.
get_cache_block_size
(
block_size
,
cache_dtype
,
self
.
model_config
,
self
.
parallel_config
)
cache_block_size
=
self
.
get_cache_block_size
_bytes
(
block_size
,
cache_dtype
)
num_gpu_blocks
=
int
(
(
total_gpu_memory
*
gpu_memory_utilization
-
peak_memory
)
//
cache_block_size
)
...
...
@@ -149,7 +159,6 @@ class Worker:
self
.
cache_config
=
cache_config
self
.
cache_engine
=
CacheEngine
(
self
.
cache_config
,
self
.
model_config
,
self
.
parallel_config
)
self
.
cache_events
=
self
.
cache_engine
.
events
self
.
gpu_cache
=
self
.
cache_engine
.
gpu_cache
self
.
model_runner
.
set_block_size
(
self
.
cache_engine
.
block_size
)
...
...
@@ -167,24 +176,13 @@ class Worker:
blocks_to_copy
:
Dict
[
int
,
List
[
int
]],
)
->
None
:
# Issue cache operations.
issued_cache_op
=
False
# TODO(woosuk): Profile swapping overhead and optimize if needed.
if
blocks_to_swap_in
:
self
.
cache_engine
.
swap_in
(
blocks_to_swap_in
)
issued_cache_op
=
True
if
blocks_to_swap_out
:
self
.
cache_engine
.
swap_out
(
blocks_to_swap_out
)
issued_cache_op
=
True
if
blocks_to_copy
:
self
.
cache_engine
.
copy
(
blocks_to_copy
)
issued_cache_op
=
True
cache_events
=
self
.
cache_events
if
issued_cache_op
else
None
# Wait for cache operations to finish.
# TODO(woosuk): Profile swapping overhead and optimize if needed.
if
cache_events
is
not
None
:
for
event
in
cache_events
:
event
.
wait
()
@
torch
.
inference_mode
()
def
execute_model
(
...
...
@@ -233,12 +231,28 @@ class Worker:
def list_loras(self) -> Set[int]:
    """Return whatever the model runner reports from ``list_loras()``."""
    reported = self.model_runner.list_loras()
    return reported
@property
def max_model_len(self) -> int:
    """Value of ``max_model_len`` read from the model config."""
    config = self.model_config
    return config.max_model_len
@property
def vocab_size(self) -> int:
    """Value of ``vocab_size`` read from the model runner."""
    runner = self.model_runner
    return runner.vocab_size
def get_cache_block_size_bytes(self, block_size: int,
                               cache_dtype: str) -> int:
    """Return the size in bytes of one KV cache block.

    Thin wrapper over ``CacheEngine.get_cache_block_size`` that supplies
    this worker's model and parallel configs alongside the two arguments.
    """
    size_in_bytes = CacheEngine.get_cache_block_size(
        block_size,
        cache_dtype,
        self.model_config,
        self.parallel_config,
    )
    return size_in_bytes
def
init_distributed_environment
(
parallel_config
:
ParallelConfig
,
rank
:
int
,
cupy_port
:
Optional
[
int
],
distributed_init_method
:
Optional
[
str
]
=
None
,
local_rank
:
int
=
-
1
,
)
->
None
:
"""Initialize the distributed environment."""
if
torch
.
distributed
.
is_initialized
():
...
...
@@ -260,29 +274,27 @@ def init_distributed_environment(
init_method
=
distributed_init_method
,
)
if
cu
py_utils
.
is_initialized
():
cu
py_world_size
=
cu
py_utils
.
get_world_size
()
if
cu
py_world_size
!=
parallel_config
.
world_size
:
if
py
nccl
_utils
.
is_initialized
():
py
nccl
_world_size
=
py
nccl
_utils
.
get_world_size
()
if
py
nccl
_world_size
!=
parallel_config
.
world_size
:
raise
RuntimeError
(
"
cupy.distributed
is already initialized but the
cu
py world "
"
pynccl
is already initialized but the py
nccl
world "
"size does not match parallel_config.world_size "
f
"(
{
cupy_world_size
}
vs.
{
parallel_config
.
world_size
}
)."
)
elif
(
parallel_config
.
world_size
>
1
and
cupy_port
is
not
None
and
not
is_hip
()):
# NOTE(woosuk): We don't initialize CuPy process group when world size
f
"(
{
pynccl_world_size
}
vs.
{
parallel_config
.
world_size
}
)."
)
elif
parallel_config
.
world_size
>
1
:
# NOTE(woosuk): We don't initialize pynccl process group when world size
# is 1.
# TODO(woosuk): Support multi-node connection.
cupy_utils
.
init_process_group
(
pynccl_utils
.
init_process_group
(
world_size
=
parallel_config
.
world_size
,
local_rank
=
local_rank
,
rank
=
rank
,
host
=
"localhost"
,
port
=
cupy_port
,
init_method
=
distributed_init_method
,
)
# A small all_reduce for warmup.
torch
.
distributed
.
all_reduce
(
torch
.
zeros
(
1
).
cuda
())
if
cu
py_utils
.
is_initialized
():
cu
py_utils
.
all_reduce
(
torch
.
zeros
(
1
).
cuda
())
if
py
nccl
_utils
.
is_initialized
():
py
nccl
_utils
.
all_reduce
(
torch
.
zeros
(
1
).
cuda
())
ensure_model_parallel_initialized
(
parallel_config
.
tensor_parallel_size
,
parallel_config
.
pipeline_parallel_size
)
...
...
Prev
1
…
13
14
15
16
17
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment