Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
4851c202
Commit
4851c202
authored
Sep 13, 2024
by
zhuwenwen
Browse files
Merge tag 'v0.6.1' into v0.6.1-dev
parents
9b902f9e
3fd2b0d2
Changes
203
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
674 additions
and
106 deletions
+674
-106
vllm/multimodal/registry.py
vllm/multimodal/registry.py
+2
-1
vllm/multimodal/utils.py
vllm/multimodal/utils.py
+42
-0
vllm/multimodal/video.py
vllm/multimodal/video.py
+71
-0
vllm/platforms/__init__.py
vllm/platforms/__init__.py
+10
-0
vllm/platforms/cpu.py
vllm/platforms/cpu.py
+15
-0
vllm/platforms/interface.py
vllm/platforms/interface.py
+7
-3
vllm/platforms/tpu.py
vllm/platforms/tpu.py
+0
-6
vllm/prompt_adapter/models.py
vllm/prompt_adapter/models.py
+1
-1
vllm/prompt_adapter/utils.py
vllm/prompt_adapter/utils.py
+93
-0
vllm/sequence.py
vllm/sequence.py
+11
-0
vllm/spec_decode/draft_model_runner.py
vllm/spec_decode/draft_model_runner.py
+3
-13
vllm/transformers_utils/config.py
vllm/transformers_utils/config.py
+154
-32
vllm/transformers_utils/configs/__init__.py
vllm/transformers_utils/configs/__init__.py
+4
-0
vllm/transformers_utils/image_processor.py
vllm/transformers_utils/image_processor.py
+27
-0
vllm/transformers_utils/processor.py
vllm/transformers_utils/processor.py
+37
-0
vllm/transformers_utils/tokenizers/mistral.py
vllm/transformers_utils/tokenizers/mistral.py
+62
-34
vllm/utils.py
vllm/utils.py
+25
-0
vllm/version.py
vllm/version.py
+1
-1
vllm/worker/cpu_worker.py
vllm/worker/cpu_worker.py
+2
-1
vllm/worker/model_runner.py
vllm/worker/model_runner.py
+107
-14
No files found.
vllm/multimodal/registry.py
View file @
4851c202
...
...
@@ -9,6 +9,7 @@ from .audio import AudioPlugin
from
.base
import
(
MultiModalDataDict
,
MultiModalInputMapper
,
MultiModalInputs
,
MultiModalPlugin
,
MultiModalTokensCalc
,
NestedTensors
)
from
.image
import
ImagePlugin
from
.video
import
VideoPlugin
logger
=
init_logger
(
__name__
)
...
...
@@ -34,7 +35,7 @@ class MultiModalRegistry:
:class:`~vllm.multimodal.MultiModalPlugin` for each modality.
"""
DEFAULT_PLUGINS
=
(
ImagePlugin
(),
AudioPlugin
())
DEFAULT_PLUGINS
=
(
ImagePlugin
(),
AudioPlugin
()
,
VideoPlugin
()
)
def
__init__
(
self
,
...
...
vllm/multimodal/utils.py
View file @
4851c202
...
...
@@ -4,6 +4,7 @@ from io import BytesIO
from
typing
import
Any
,
List
,
Optional
,
Tuple
,
TypeVar
,
Union
import
numpy
as
np
import
numpy.typing
as
npt
from
PIL
import
Image
from
vllm.connections
import
global_http_connection
...
...
@@ -187,6 +188,47 @@ def rescale_image_size(image: Image.Image,
return
image
def try_import_video_packages() -> Any:
    """Import the optional video dependency and hand the module back.

    Returns:
        The imported ``cv2`` module.

    Raises:
        ImportError: If ``cv2`` cannot be imported. The message tells the
            user which extra to install.
    """
    try:
        import cv2 as video_backend
    except ImportError:
        # `from None` suppresses the original import traceback so only the
        # installation hint is shown.
        raise ImportError(
            "Please install vllm[video] for video support.") from None
    else:
        return video_backend
def resize_video(frames: npt.NDArray, size: Tuple[int, int]) -> npt.NDArray:
    """Resize every frame of a video to one common spatial size.

    Args:
        frames: Array whose shape unpacks into four axes, read here as
            ``(num_frames, height, width, channels)``.
        size: Target size given as ``(height, width)``.

    Returns:
        A newly allocated array of shape
        ``(num_frames, size[0], size[1], channels)`` with the dtype of
        ``frames``.

    Raises:
        ImportError: If the video dependency (``cv2``) is not installed.
    """
    cv2 = try_import_video_packages()

    num_frames, _, _, channels = frames.shape
    new_height, new_width = size
    frame_shape = (new_height, new_width, channels)
    resized_frames = np.empty((num_frames, *frame_shape), dtype=frames.dtype)
    for i, frame in enumerate(frames):
        # cv2.resize takes the target size as (width, height).
        resized_frame = cv2.resize(frame, (new_width, new_height))
        # For a single-channel frame cv2.resize returns a 2-D array; restore
        # the channel axis so the assignment matches the destination slot.
        # For multi-channel frames this reshape is a no-op.
        resized_frames[i] = np.asarray(resized_frame).reshape(frame_shape)
    return resized_frames
def rescale_video_size(frames: npt.NDArray,
                       size_factor: float) -> npt.NDArray:
    """Scale the height and width of a video by a common factor.

    Args:
        frames: Array whose shape unpacks into four axes; the second and
            third are used as height and width.
        size_factor: Multiplier applied to both height and width. The
            products are truncated with ``int()``.

    Returns:
        Whatever ``resize_video`` returns for the scaled ``(height, width)``.
    """
    _, source_height, source_width, _ = frames.shape
    target_size = (
        int(source_height * size_factor),
        int(source_width * size_factor),
    )
    return resize_video(frames, target_size)
def sample_frames_from_video(frames: npt.NDArray,
                             num_frames: int) -> npt.NDArray:
    """Pick ``num_frames`` evenly spaced frames along the first axis.

    Args:
        frames: Array whose first axis indexes frames.
        num_frames: How many frames to keep; ``-1`` keeps the input as is.

    Returns:
        ``frames`` itself when ``num_frames`` is ``-1``, otherwise the frames
        at the integer indices of ``np.linspace(0, total - 1, num_frames)``.
    """
    frame_count = frames.shape[0]

    # Sentinel value: no sampling requested.
    if num_frames == -1:
        return frames

    picks = np.linspace(0, frame_count - 1, num_frames, dtype=int)
    return frames[picks, ...]
# Utilities for input processors
_T
=
TypeVar
(
"_T"
,
str
,
int
)
...
...
vllm/multimodal/video.py
0 → 100644
View file @
4851c202
from
functools
import
lru_cache
from
typing
import
List
,
Union
import
numpy
as
np
from
vllm.config
import
ModelConfig
from
vllm.inputs.registry
import
InputContext
from
vllm.logger
import
init_logger
from
vllm.transformers_utils.image_processor
import
get_video_processor
from
vllm.transformers_utils.tokenizer
import
get_tokenizer
from
vllm.utils
import
is_list_of
from
.base
import
MultiModalData
,
MultiModalInputs
from
.image
import
ImagePlugin
logger
=
init_logger
(
__name__
)
cached_get_video_processor
=
lru_cache
(
get_video_processor
)
cached_get_tokenizer
=
lru_cache
(
get_tokenizer
)
VideoInput
=
Union
[
"np.ndarray"
,
# single video input
List
[
"np.ndarray"
],
# TODO: support more types
# List[Image.Image], List[List[Image.Image]],
# "torch.Tensor",
# List["torch.Tensor"],
# List[List["np.ndarrray"]],
# List[List["torch.Tensor"]],
]
class VideoPlugin(ImagePlugin):
    """Plugin for video data."""

    def get_data_key(self) -> str:
        """Return the key under which this plugin's data is registered."""
        return "video"

    def _get_hf_video_processor(self, model_config: ModelConfig):
        """Fetch the (cached) HuggingFace video processor for the model."""
        return cached_get_video_processor(
            model_config.model,
            trust_remote_code=model_config.trust_remote_code)

    def _default_input_mapper(
        self,
        ctx: InputContext,
        data: MultiModalData[object],
    ) -> MultiModalInputs:
        """Convert raw video data into model inputs.

        Args:
            ctx: Input context; only ``ctx.model_config`` is read.
            data: A single video as ``np.ndarray``. A list of arrays is
                recognised but not supported yet.

        Returns:
            ``MultiModalInputs`` built from the ``.data`` of the processor
            output.

        Raises:
            RuntimeError: If no video processor is available.
            NotImplementedError: If ``data`` is a list of ``np.ndarray``.
            TypeError: If ``data`` is of any other type.
        """
        model_config = ctx.model_config

        # single video input as np.ndarray
        if isinstance(data, np.ndarray):
            video_processor = self._get_hf_video_processor(model_config)
            if video_processor is None:
                raise RuntimeError("No HuggingFace processor is available "
                                   "to process the video object")
            try:
                batch_data = video_processor(data, return_tensors="pt").data
            except Exception:
                # Log for context, then let the original error propagate.
                logger.error("Failed to process video (%s)", data)
                raise

            return MultiModalInputs(batch_data)
        elif is_list_of(data, np.ndarray):
            raise NotImplementedError(
                "Multi video for a prompt is not supported yet")

        raise TypeError(f"Invalid video type: {type(data)}")

    def _default_max_multimodal_tokens(self, ctx: InputContext) -> int:
        """Return a fixed token budget of 4096; ``ctx`` is not consulted."""
        return 4096
vllm/platforms/__init__.py
View file @
4851c202
...
...
@@ -42,6 +42,13 @@ try:
except
Exception
:
pass
is_cpu
=
False
try
:
from
importlib.metadata
import
version
is_cpu
=
"cpu"
in
version
(
"vllm"
)
except
Exception
:
pass
if
is_tpu
:
# people might install pytorch built with cuda but run on tpu
# so we need to check tpu first
...
...
@@ -53,6 +60,9 @@ elif is_cuda:
elif
is_rocm
:
from
.rocm
import
RocmPlatform
current_platform
=
RocmPlatform
()
elif
is_cpu
:
from
.cpu
import
CpuPlatform
current_platform
=
CpuPlatform
()
else
:
current_platform
=
UnspecifiedPlatform
()
...
...
vllm/platforms/cpu.py
0 → 100644
View file @
4851c202
import
torch
from
.interface
import
Platform
,
PlatformEnum
class CpuPlatform(Platform):
    """Platform implementation tagged with ``PlatformEnum.CPU``."""

    # Enum tag identifying this platform.
    _enum = PlatformEnum.CPU

    @staticmethod
    def get_device_name(device_id: int = 0) -> str:
        """Return the constant name ``"cpu"``; ``device_id`` is not used."""
        return "cpu"

    @staticmethod
    def inference_mode():
        """Return a new ``torch.no_grad()`` context manager."""
        return torch.no_grad()
vllm/platforms/interface.py
View file @
4851c202
import
enum
from
typing
import
Tuple
from
typing
import
Optional
,
Tuple
import
torch
...
...
@@ -8,6 +8,7 @@ class PlatformEnum(enum.Enum):
CUDA
=
enum
.
auto
()
ROCM
=
enum
.
auto
()
TPU
=
enum
.
auto
()
CPU
=
enum
.
auto
()
UNSPECIFIED
=
enum
.
auto
()
...
...
@@ -23,9 +24,12 @@ class Platform:
def
is_tpu
(
self
)
->
bool
:
return
self
.
_enum
==
PlatformEnum
.
TPU
def
is_cpu
(
self
)
->
bool
:
return
self
.
_enum
==
PlatformEnum
.
CPU
@
staticmethod
def
get_device_capability
(
device_id
:
int
=
0
)
->
Tuple
[
int
,
int
]:
r
aise
NotImplementedError
def
get_device_capability
(
device_id
:
int
=
0
)
->
Optional
[
Tuple
[
int
,
int
]
]
:
r
eturn
None
@
staticmethod
def
get_device_name
(
device_id
:
int
=
0
)
->
str
:
...
...
vllm/platforms/tpu.py
View file @
4851c202
from
typing
import
Tuple
import
torch
from
.interface
import
Platform
,
PlatformEnum
...
...
@@ -8,10 +6,6 @@ from .interface import Platform, PlatformEnum
class
TpuPlatform
(
Platform
):
_enum
=
PlatformEnum
.
TPU
@
staticmethod
def
get_device_capability
(
device_id
:
int
=
0
)
->
Tuple
[
int
,
int
]:
raise
RuntimeError
(
"TPU does not have device capability."
)
@
staticmethod
def
inference_mode
():
return
torch
.
no_grad
()
vllm/prompt_adapter/models.py
View file @
4851c202
...
...
@@ -14,6 +14,7 @@ from vllm.config import PromptAdapterConfig
from
vllm.prompt_adapter.layers
import
(
VocabParallelEmbeddingWithPromptAdapter
)
# yapf: disable
from
vllm.prompt_adapter.layers
import
PromptAdapterMapping
from
vllm.prompt_adapter.utils
import
load_peft_weights
logger
=
logging
.
getLogger
(
__name__
)
...
...
@@ -90,7 +91,6 @@ class PromptAdapterModel(AdapterModel):
config
:
PromptAdapterConfig
,
device
:
str
=
"cuda"
,
)
->
"PromptAdapterModel"
:
from
peft.utils
import
load_peft_weights
if
num_virtual_tokens
>
config
.
max_prompt_adapter_token
:
raise
ValueError
(
...
...
vllm/prompt_adapter/utils.py
0 → 100644
View file @
4851c202
# code borrowed from: https://github.com/huggingface/peft/blob/v0.12.0/src/peft/utils/save_and_load.py#L420
import
os
from
typing
import
Optional
import
torch
from
huggingface_hub
import
file_exists
,
hf_hub_download
from
huggingface_hub.utils
import
EntryNotFoundError
from
safetensors.torch
import
load_file
as
safe_load_file
WEIGHTS_NAME
=
"adapter_model.bin"
SAFETENSORS_WEIGHTS_NAME
=
"adapter_model.safetensors"
def infer_device() -> str:
    """Get the current device name based on available devices.

    Returns:
        ``"cuda"`` when ``torch.cuda.is_available()`` is true, otherwise
        ``"cpu"``.
    """
    return "cuda" if torch.cuda.is_available() else "cpu"
def load_peft_weights(model_id: str,
                      device: Optional[str] = None,
                      **hf_hub_download_kwargs) -> dict:
    r"""
    A helper method to load the PEFT weights from the HuggingFace Hub or locally

    Lookup order: a local safetensors file, a local ``.bin`` file, a
    safetensors file on the Hub, and finally a ``.bin`` file on the Hub.

    Args:
        model_id (`str`):
            The local path to the adapter weights or the name of the adapter to
            load from the HuggingFace Hub.
        device (`str`):
            The device to load the weights onto. When `None`, the result of
            `infer_device()` is used.
        hf_hub_download_kwargs (`dict`):
            Additional arguments to pass to the `hf_hub_download` method when
            loading from the HuggingFace Hub. The keys `subfolder`, `token`,
            `use_auth_token`, `revision` and `repo_type` are also read here.

    Returns:
        The loaded adapter weights.

    Raises:
        ValueError: If no weights file is found locally or on the Hub.
    """
    subfolder = hf_hub_download_kwargs.get("subfolder", None)
    path = (os.path.join(model_id, subfolder)
            if subfolder is not None else model_id)

    if device is None:
        device = infer_device()

    local_safetensors = os.path.join(path, SAFETENSORS_WEIGHTS_NAME)
    local_bin = os.path.join(path, WEIGHTS_NAME)

    if os.path.exists(local_safetensors):
        filename = local_safetensors
        use_safetensors = True
    elif os.path.exists(local_bin):
        filename = local_bin
        use_safetensors = False
    else:
        # Nothing on disk: fall back to the Hub.
        token = hf_hub_download_kwargs.get("token", None)
        if token is None:
            token = hf_hub_download_kwargs.get("use_auth_token", None)

        hub_filename = (os.path.join(subfolder, SAFETENSORS_WEIGHTS_NAME)
                        if subfolder is not None else
                        SAFETENSORS_WEIGHTS_NAME)
        has_remote_safetensors_file = file_exists(
            repo_id=model_id,
            filename=hub_filename,
            revision=hf_hub_download_kwargs.get("revision", None),
            repo_type=hf_hub_download_kwargs.get("repo_type", None),
            token=token,
        )
        use_safetensors = has_remote_safetensors_file

        if has_remote_safetensors_file:
            # Priority 1: load safetensors weights
            filename = hf_hub_download(
                model_id,
                SAFETENSORS_WEIGHTS_NAME,
                **hf_hub_download_kwargs,
            )
        else:
            try:
                filename = hf_hub_download(model_id, WEIGHTS_NAME,
                                           **hf_hub_download_kwargs)
            except EntryNotFoundError as err:
                # Adjacent literals are used instead of backslash
                # continuations, which embedded the source indentation in
                # the message.
                raise ValueError(
                    f"Can't find weights for {model_id} in {model_id} or "
                    "in the Hugging Face Hub. "
                    f"Please check that the file {WEIGHTS_NAME} or "
                    f"{SAFETENSORS_WEIGHTS_NAME} is present at {model_id}."
                ) from err

    if use_safetensors:
        adapters_weights = safe_load_file(filename, device=device)
    else:
        # NOTE(review): torch.load unpickles the file; only load adapter
        # checkpoints from sources you trust.
        adapters_weights = torch.load(filename,
                                      map_location=torch.device(device))

    return adapters_weights
vllm/sequence.py
View file @
4851c202
...
...
@@ -165,6 +165,9 @@ class SequenceData(msgspec.Struct,
# is called.
_new_appended_tokens
:
List
[
int
]
=
msgspec
.
field
(
default_factory
=
list
)
# It is used to compute mrope_position_ids.
_mrope_position_delta
:
Optional
[
int
]
=
None
def
__post_init__
(
self
)
->
None
:
assert
self
.
_prompt_token_ids
.
typecode
==
"l"
assert
self
.
_output_token_ids
.
typecode
==
"l"
...
...
@@ -219,6 +222,14 @@ class SequenceData(msgspec.Struct,
assert
isinstance
(
self
.
_output_token_ids
,
array
)
return
self
.
_output_token_ids
@
property
def
mrope_position_delta
(
self
)
->
Optional
[
int
]:
return
self
.
_mrope_position_delta
@
mrope_position_delta
.
setter
def
mrope_position_delta
(
self
,
new_mrope_position_delta
):
self
.
_mrope_position_delta
=
new_mrope_position_delta
def
append_token_id
(
self
,
token_id
:
int
,
logprob
:
float
)
->
None
:
self
.
_output_token_ids
.
append
(
token_id
)
self
.
_new_appended_tokens
.
append
(
token_id
)
...
...
vllm/spec_decode/draft_model_runner.py
View file @
4851c202
...
...
@@ -2,7 +2,6 @@ from typing import List, Optional
import
torch
from
vllm
import
_custom_ops
as
ops
from
vllm.model_executor.layers.sampler
import
SamplerOutput
try
:
...
...
@@ -116,18 +115,9 @@ class TP1DraftModelRunner(ModelRunner):
# Update attn_metadata
attn_metadata
=
model_input
.
attn_metadata
assert
isinstance
(
attn_metadata
,
FlashAttentionMetadata
)
attn_metadata
.
advance_step
(
num_seqs
,
num_queries
)
# Update GPU tensors
ops
.
advance_step
(
num_seqs
=
num_seqs
,
num_queries
=
num_queries
,
block_size
=
self
.
block_size
,
input_tokens
=
model_input
.
input_tokens
,
sampled_token_ids
=
sampled_token_ids
,
input_positions
=
model_input
.
input_positions
,
seq_lens
=
attn_metadata
.
seq_lens_tensor
,
slot_mapping
=
attn_metadata
.
slot_mapping
,
block_tables
=
attn_metadata
.
block_tables
)
attn_metadata
.
advance_step
(
model_input
,
sampled_token_ids
,
self
.
block_size
,
num_seqs
,
num_queries
)
# Update sampling_metadata
sampling_metadata
=
model_input
.
sampling_metadata
...
...
vllm/transformers_utils/config.py
View file @
4851c202
import
contextlib
import
enum
import
json
from
pathlib
import
Path
from
typing
import
Any
,
Dict
,
Optional
,
Type
,
Union
from
huggingface_hub
import
file_exists
,
hf_hub_download
from
transformers
import
GenerationConfig
,
PretrainedConfig
from
transformers.models.auto.image_processing_auto
import
(
get_image_processor_config
)
from
transformers.models.auto.modeling_auto
import
(
MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
)
from
transformers.utils
import
CONFIG_NAME
as
HF_CONFIG_NAME
from
vllm.envs
import
VLLM_USE_MODELSCOPE
from
vllm.logger
import
init_logger
# yapf conflicts with isort for this block
# yapf: disable
from
vllm.transformers_utils.configs
import
(
ChatGLMConfig
,
DbrxConfig
,
EAGLEConfig
,
ExaoneConfig
,
InternVLChatConfig
,
JAISConfig
,
MedusaConfig
,
MLPSpeculatorConfig
,
MPTConfig
,
NemotronConfig
,
RWConfig
,
UltravoxConfig
)
GraniteConfig
,
InternVLChatConfig
,
JAISConfig
,
MedusaConfig
,
MLPSpeculatorConfig
,
MPTConfig
,
NemotronConfig
,
RWConfig
,
UltravoxConfig
)
# yapf: enable
from
vllm.transformers_utils.utils
import
check_gguf_file
if
VLLM_USE_MODELSCOPE
:
...
...
@@ -23,6 +31,8 @@ if VLLM_USE_MODELSCOPE:
else
:
from
transformers
import
AutoConfig
MISTRAL_CONFIG_NAME
=
"params.json"
logger
=
init_logger
(
__name__
)
_CONFIG_REGISTRY
:
Dict
[
str
,
Type
[
PretrainedConfig
]]
=
{
...
...
@@ -39,6 +49,9 @@ _CONFIG_REGISTRY: Dict[str, Type[PretrainedConfig]] = {
"internvl_chat"
:
InternVLChatConfig
,
"nemotron"
:
NemotronConfig
,
"ultravox"
:
UltravoxConfig
,
# Granite can be removed from here once we have upgraded to
# transformers 4.45+
"granite"
:
GraniteConfig
,
}
for
name
,
cls
in
_CONFIG_REGISTRY
.
items
():
...
...
@@ -46,6 +59,20 @@ for name, cls in _CONFIG_REGISTRY.items():
AutoConfig
.
register
(
name
,
cls
)
class ConfigFormat(str, enum.Enum):
    """String-valued enum naming the supported model config formats."""

    # Members are also ``str`` instances, so they compare equal to their
    # plain string values.
    AUTO = "auto"
    HF = "hf"
    MISTRAL = "mistral"
def file_or_path_exists(model: Union[str, Path], config_name, revision,
                        token) -> bool:
    """Check whether ``config_name`` exists for ``model``.

    Args:
        model: A local path or a HuggingFace Hub repo id.
        config_name: File name to look for.
        revision: Passed to ``file_exists`` for the Hub lookup.
        token: Passed to ``file_exists`` for the Hub lookup.

    Returns:
        If ``model`` exists on the local filesystem, whether
        ``model / config_name`` is a file. Otherwise the result of
        ``file_exists`` for the Hub repo.
    """
    local_root = Path(model)
    if not local_root.exists():
        # Not a local path: ask the Hub instead.
        return file_exists(model, config_name, revision=revision, token=token)
    return (local_root / config_name).is_file()
def
get_config
(
model
:
Union
[
str
,
Path
],
trust_remote_code
:
bool
,
...
...
@@ -53,38 +80,68 @@ def get_config(
code_revision
:
Optional
[
str
]
=
None
,
rope_scaling
:
Optional
[
dict
]
=
None
,
rope_theta
:
Optional
[
float
]
=
None
,
config_format
:
ConfigFormat
=
ConfigFormat
.
AUTO
,
**
kwargs
,
)
->
PretrainedConfig
:
# Separate model folder from file path for GGUF models
is_gguf
=
check_gguf_file
(
model
)
if
is_gguf
:
kwargs
[
"gguf_file"
]
=
Path
(
model
).
name
model
=
Path
(
model
).
parent
try
:
config
=
AutoConfig
.
from_pretrained
(
model
,
trust_remote_code
=
trust_remote_code
,
revision
=
revision
,
code_revision
=
code_revision
,
**
kwargs
)
except
ValueError
as
e
:
if
(
not
trust_remote_code
and
"requires you to execute the configuration file"
in
str
(
e
)):
err_msg
=
(
"Failed to load the model config. If the model is a custom "
"model not yet available in the HuggingFace transformers "
"library, consider setting `trust_remote_code=True` in LLM "
"or using the `--trust-remote-code` flag in the CLI."
)
raise
RuntimeError
(
err_msg
)
from
e
if
config_format
==
ConfigFormat
.
AUTO
:
if
is_gguf
or
file_or_path_exists
(
model
,
HF_CONFIG_NAME
,
revision
=
revision
,
token
=
kwargs
.
get
(
"token"
)):
config_format
=
ConfigFormat
.
HF
elif
file_or_path_exists
(
model
,
MISTRAL_CONFIG_NAME
,
revision
=
revision
,
token
=
kwargs
.
get
(
"token"
)):
config_format
=
ConfigFormat
.
MISTRAL
else
:
raise
ValueError
(
f
"No supported config format found in
{
model
}
"
)
if
config_format
==
ConfigFormat
.
HF
:
config_dict
,
_
=
PretrainedConfig
.
get_config_dict
(
model
,
revision
=
revision
,
code_revision
=
code_revision
,
**
kwargs
)
# Use custom model class if it's in our registry
model_type
=
config_dict
.
get
(
"model_type"
)
if
model_type
in
_CONFIG_REGISTRY
:
config_class
=
_CONFIG_REGISTRY
[
model_type
]
config
=
config_class
.
from_pretrained
(
model
,
revision
=
revision
,
code_revision
=
code_revision
)
else
:
raise
e
if
config
.
model_type
in
_CONFIG_REGISTRY
:
config_class
=
_CONFIG_REGISTRY
[
config
.
model_type
]
config
=
config_class
.
from_pretrained
(
model
,
revision
=
revision
,
code_revision
=
code_revision
)
try
:
config
=
AutoConfig
.
from_pretrained
(
model
,
trust_remote_code
=
trust_remote_code
,
revision
=
revision
,
code_revision
=
code_revision
,
**
kwargs
,
)
except
ValueError
as
e
:
if
(
not
trust_remote_code
and
"requires you to execute the configuration file"
in
str
(
e
)):
err_msg
=
(
"Failed to load the model config. If the model "
"is a custom model not yet available in the "
"HuggingFace transformers library, consider setting "
"`trust_remote_code=True` in LLM or using the "
"`--trust-remote-code` flag in the CLI."
)
raise
RuntimeError
(
err_msg
)
from
e
else
:
raise
e
elif
config_format
==
ConfigFormat
.
MISTRAL
:
config
=
load_params_config
(
model
,
revision
)
else
:
raise
ValueError
(
f
"Unsupported config format:
{
config_format
}
"
)
# Special architecture mapping check for GGUF models
if
is_gguf
:
...
...
@@ -94,16 +151,81 @@ def get_config(
model_type
=
MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
[
config
.
model_type
]
config
.
update
({
"architectures"
:
[
model_type
]})
for
key
,
value
in
[(
"rope_scaling"
,
rope_scaling
),
(
"rope_theta"
,
rope_theta
)]:
for
key
,
value
in
[
(
"rope_scaling"
,
rope_scaling
),
(
"rope_theta"
,
rope_theta
),
]:
if
value
is
not
None
:
logger
.
info
(
"Updating %s from %r to %r"
,
key
,
getattr
(
config
,
key
,
None
),
value
)
logger
.
info
(
"Updating %s from %r to %r"
,
key
,
getattr
(
config
,
key
,
None
),
value
,
)
config
.
update
({
key
:
value
})
return
config
def load_params_config(model, revision) -> PretrainedConfig:
    """Load a ``params.json`` config for models stored in mistral format.

    The file is read from ``model`` when it exists there as a local file and
    is otherwise downloaded with ``hf_hub_download``. Mistral-style keys are
    renamed to their HuggingFace names and every nested dict is converted
    into a ``PretrainedConfig``.

    Args:
        model: Local directory or Hub repo id holding ``params.json``.
        revision: Revision passed to ``hf_hub_download``.

    Returns:
        The resulting ``PretrainedConfig``. When the file has a
        ``vision_encoder`` entry, the text settings go under ``text_config``
        and the vision settings under ``vision_config``.
    """
    config_file_name = "params.json"

    config_path = Path(model) / config_file_name

    if not config_path.is_file():
        config_path = Path(
            hf_hub_download(model, config_file_name, revision=revision))

    # Read as UTF-8 explicitly instead of relying on the locale's default
    # encoding.
    with open(config_path, encoding="utf-8") as file:
        config_dict = json.load(file)

    # Mistral key -> HuggingFace key.
    config_mapping = {
        "dim": "hidden_size",
        "norm_eps": "rms_norm_eps",
        "n_kv_heads": "num_key_value_heads",
        "n_layers": "num_hidden_layers",
        "n_heads": "num_attention_heads",
        "hidden_dim": "intermediate_size",
    }

    def recurse_elems(elem: Any):
        """Rename keys and wrap every dict, at any depth, in a config."""
        if isinstance(elem, dict):
            converted = {
                config_mapping.get(key, key): recurse_elems(value)
                for key, value in elem.items()
            }
            return PretrainedConfig(**converted)
        return elem

    # Fill in defaults for fields the file may not provide.
    config_dict["model_type"] = config_dict.get("model_type", "transformer")
    config_dict["hidden_act"] = config_dict.get("activation", "silu")
    config_dict["tie_word_embeddings"] = config_dict.get(
        "tie_embeddings", False)
    config_dict["max_seq_len"] = config_dict.get("max_seq_len", 128_000)

    if config_dict.get("moe") is not None:
        config_dict["architectures"] = ["MixtralForCausalLM"]
    else:
        config_dict["architectures"] = ["MistralForCausalLM"]

    if config_dict.get("vision_encoder") is not None:
        # Split into a text sub-config and a vision sub-config.
        multimodal_config = config_dict.pop("vision_encoder")

        config_dict = {
            "text_config": config_dict,
            "vision_config": multimodal_config
        }
        config_dict["architectures"] = ["PixtralForConditionalGeneration"]
        config_dict["model_type"] = "pixtral"

    config = recurse_elems(config_dict)
    return config
def
get_hf_image_processor_config
(
model
:
Union
[
str
,
Path
],
revision
:
Optional
[
str
]
=
None
,
...
...
@@ -120,7 +242,7 @@ def get_hf_image_processor_config(
def
get_hf_text_config
(
config
:
PretrainedConfig
):
"""Get the "sub" config relevant to llm for multi modal models.
No op for pure text models.
No op for pure text models.
"""
if
hasattr
(
config
,
"text_config"
):
# The code operates under the assumption that text_config should have
...
...
vllm/transformers_utils/configs/__init__.py
View file @
4851c202
...
...
@@ -6,6 +6,7 @@ from vllm.transformers_utils.configs.exaone import ExaoneConfig
# tiiuae/falcon-7b(-instruct) models. Newer Falcon models will use the
# `FalconConfig` class from the official HuggingFace transformers library.
from
vllm.transformers_utils.configs.falcon
import
RWConfig
from
vllm.transformers_utils.configs.granite
import
GraniteConfig
from
vllm.transformers_utils.configs.internvl
import
InternVLChatConfig
from
vllm.transformers_utils.configs.jais
import
JAISConfig
from
vllm.transformers_utils.configs.medusa
import
MedusaConfig
...
...
@@ -27,4 +28,7 @@ __all__ = [
"MLPSpeculatorConfig"
,
"NemotronConfig"
,
"UltravoxConfig"
,
# Granite can be removed from here once we have upgraded to
# transformers 4.45+
"GraniteConfig"
,
]
vllm/transformers_utils/image_processor.py
View file @
4851c202
from
typing
import
cast
def get_video_processor(
    processor_name: str,
    trust_remote_code: bool = False,
):
    """
    Gets a video processor for the given model name via HuggingFace.

    Args:
        processor_name: Model name or path given to
            ``AutoProcessor.from_pretrained``.
        trust_remote_code: Forwarded to ``AutoProcessor.from_pretrained``.

    Returns:
        The ``video_processor`` attribute of the loaded processor.

    Raises:
        RuntimeError: If loading raises ``ValueError`` while
            ``trust_remote_code`` is false; the message suggests enabling it.
        ValueError: Re-raised unchanged when ``trust_remote_code`` is true.
    """
    from transformers import AutoProcessor

    try:
        # Forward the flag: previously it was accepted but never passed on,
        # so following the hint in the error below had no effect.
        processor = AutoProcessor.from_pretrained(
            processor_name, trust_remote_code=trust_remote_code)
        video_processor = processor.video_processor
    except ValueError as e:
        if not trust_remote_code:
            err_msg = (
                "Failed to load the processor. If the processor is "
                "a custom processor not yet available in the HuggingFace "
                "transformers library, consider setting "
                "`trust_remote_code=True` in LLM or using the "
                "`--trust-remote-code` flag in the CLI.")
            raise RuntimeError(err_msg) from e
        else:
            raise e
    return video_processor
def
get_image_processor
(
processor_name
:
str
,
*
args
,
...
...
vllm/transformers_utils/processor.py
0 → 100644
View file @
4851c202
from
typing
import
cast
def get_processor(
    processor_name: str,
    *args,
    trust_remote_code: bool = False,
    **kwargs,
):
    """Gets a processor for the given model name via HuggingFace.

    Positional and keyword arguments are forwarded to
    ``AutoProcessor.from_pretrained``. A ``ValueError`` raised while loading
    becomes a ``RuntimeError`` with a hint when ``trust_remote_code`` is
    false, and is re-raised as is when it is true.
    """
    # don't put this import at the top level
    # it will call torch.cuda.device_count()
    from transformers import AutoProcessor
    from transformers.processing_utils import ProcessorMixin

    try:
        loaded = AutoProcessor.from_pretrained(
            processor_name,
            *args,
            trust_remote_code=trust_remote_code,
            **kwargs)
    except ValueError as load_error:
        # Remote code was already allowed, so there is nothing to suggest.
        if trust_remote_code:
            raise load_error

        # If the error pertains to the processor class not existing or not
        # currently being imported, suggest using the --trust-remote-code flag.
        # Unlike AutoTokenizer, AutoProcessor does not separate such errors
        hint = ("Failed to load the processor. If the processor is "
                "a custom processor not yet available in the HuggingFace "
                "transformers library, consider setting "
                "`trust_remote_code=True` in LLM or using the "
                "`--trust-remote-code` flag in the CLI.")
        raise RuntimeError(hint) from load_error

    return cast(ProcessorMixin, loaded)
vllm/transformers_utils/tokenizers/mistral.py
View file @
4851c202
...
...
@@ -16,7 +16,7 @@ from mistral_common.tokens.tokenizers.tekken import (SpecialTokenPolicy,
Tekkenizer
)
if
TYPE_CHECKING
:
from
vllm.entrypoints.chat_utils
import
C
onversa
tionMessage
from
vllm.entrypoints.chat_utils
import
C
hatComple
tionMessage
Param
@
dataclass
...
...
@@ -45,26 +45,25 @@ class MistralTokenizer:
def
__init__
(
self
,
tokenizer
:
PublicMistralTokenizer
)
->
None
:
self
.
mistral
=
tokenizer
self
.
instruct
=
tokenizer
.
instruct_tokenizer
self
.
tokenizer
=
tokenizer
.
instruct_tokenizer
.
tokenizer
self
.
vocab_size
=
len
(
self
.
tokenizer
.
vocab
())
assert
isinstance
(
self
.
tokenizer
,
(
Tekkenizer
,
SentencePieceTokenizer
)),
type
(
self
.
tokenizer
)
if
(
is_tekken
:
=
isinstance
(
self
.
tokenizer
,
Tekkenizer
)):
tokenizer_
=
tokenizer
.
instruct_tokenizer
.
tokenizer
if
isinstance
(
tokenizer_
,
Tekkenizer
):
# Make sure special tokens will not raise
self
.
tokenizer
.
special_token_policy
=
SpecialTokenPolicy
.
IGNORE
self
.
_is_tekken
=
is_tekken
tokenizer_
.
special_token_policy
=
SpecialTokenPolicy
.
IGNORE
self
.
_vocab
=
{
token
:
idx
for
idx
,
token
in
enumerate
(
tokenizer_
.
vocab
())
}
elif
isinstance
(
tokenizer_
,
SentencePieceTokenizer
):
self
.
_vocab
=
{
token
:
idx
for
idx
,
token
in
enumerate
(
tokenizer_
.
vocab
())
}
else
:
raise
TypeError
(
f
"Unsupported tokenizer:
{
type
(
tokenizer_
)
}
"
)
# the following attributes are set to fit VLLM's design
self
.
is_fast
=
True
self
.
chat_template
=
True
self
.
all_special_ids
:
List
[
Any
]
=
[]
self
.
all_special_tokens
:
List
[
Any
]
=
[]
self
.
all_special_tokens_extended
:
List
[
Any
]
=
[]
self
.
tokenizer
=
tokenizer_
@
classmethod
def
from_pretrained
(
cls
,
...
...
@@ -102,6 +101,38 @@ class MistralTokenizer:
revision
=
revision
)
return
tokenizer_file
# the following attributes are set to fit VLLM's design
@
property
def
all_special_tokens_extended
(
self
)
->
List
[
str
]:
return
[]
@
property
def
all_special_tokens
(
self
)
->
List
[
str
]:
return
[]
@
property
def
all_special_ids
(
self
)
->
List
[
int
]:
return
[]
@
property
def
bos_token_id
(
self
)
->
int
:
return
self
.
tokenizer
.
bos_id
@
property
def
eos_token_id
(
self
)
->
int
:
return
self
.
tokenizer
.
eos_id
@
property
def
is_fast
(
self
)
->
bool
:
return
True
@
property
def
vocab_size
(
self
)
->
int
:
return
len
(
self
.
_vocab
)
def
__len__
(
self
)
->
int
:
return
self
.
vocab_size
def
__call__
(
self
,
prompt
:
str
,
...
...
@@ -117,31 +148,34 @@ class MistralTokenizer:
return
Encoding
(
input_ids
=
input_ids
)
def
get_added_vocab
(
self
)
->
List
[
str
]:
def
get_vocab
(
self
)
->
Dict
[
str
,
int
]:
return
self
.
_vocab
def
get_added_vocab
(
self
)
->
Dict
[
str
,
int
]:
# Mistral tokenizers have no added vocabulary
return
[]
return
{}
def
encode
(
self
,
prompt
:
str
)
->
List
[
int
]:
# `encode
` should only be used for prompt completion
# `encode` should only be used for prompt completion
# it should never be used for chat_completion.
# For chat completion use `apply_chat_template`
return
self
.
tokenizer
.
encode
(
prompt
,
bos
=
True
,
eos
=
False
)
def
apply_chat_template
(
self
,
conversation
:
List
[
"Conversa
tionMessage"
],
messages
:
List
[
"ChatComple
tionMessage
Param
"
],
tools
:
Optional
[
Dict
[
str
,
Any
]]
=
None
,
**
kwargs
)
->
List
[
int
]:
assert
tools
is
None
,
"`tools` are not yet supported."
request
=
ChatCompletionRequest
(
messages
=
conversation
)
# type: ignore[type-var]
messages
=
messages
)
# type: ignore[type-var]
encoded
=
self
.
mistral
.
encode_chat_completion
(
request
)
# encode-decode to get clean prompt
return
encoded
.
tokens
def
convert_tokens_to_string
(
self
,
tokens
:
List
[
str
])
->
str
:
if
self
.
_is_tekken
:
if
isinstance
(
self
.
tokenizer
,
Tekkenizer
)
:
return
""
.
join
(
tokens
)
else
:
return
self
.
tokenizer
.
decode
(
tokens
)
# type: ignore[arg-type]
...
...
@@ -151,14 +185,11 @@ class MistralTokenizer:
ids
=
[
ids
]
return
self
.
tokenizer
.
decode
(
ids
)
@
property
def
eos_token_id
(
self
):
return
self
.
tokenizer
.
eos_id
def
convert_ids_to_tokens
(
self
,
ids
:
List
[
int
],
skip_special_tokens
:
Optional
[
bool
]
=
True
)
->
List
[
str
]:
self
,
ids
:
List
[
int
],
skip_special_tokens
:
bool
=
True
,
)
->
List
[
str
]:
# TODO(Patrick) - potentially allow special tokens to not be skipped
assert
(
skip_special_tokens
...
...
@@ -170,6 +201,3 @@ class MistralTokenizer:
tokens
=
[
self
.
tokenizer
.
id_to_piece
(
id
)
for
id
in
ids
]
return
tokens
def
__len__
(
self
):
return
self
.
vocab_size
vllm/utils.py
View file @
4851c202
...
...
@@ -1224,3 +1224,28 @@ async def _run_task_with_lock(task: Callable, lock: asyncio.Lock, *args,
def supports_dynamo() -> bool:
    """Tell whether the installed torch is at least version 2.4.0.

    The comparison uses the ``base_version`` of the parsed
    ``torch.__version__`` string.
    """
    minimum_version = Version("2.4.0")
    installed_version = Version(torch.__version__)
    return Version(installed_version.base_version) >= minimum_version
class AtomicCounter:
    """A counter whose increments and decrements are guarded by a lock."""

    def __init__(self, initial=0):
        """Create the counter starting at ``initial``."""
        self._lock = threading.Lock()
        self._value = initial

    def inc(self, num=1):
        """Add ``num`` while holding the lock and return the new value."""
        with self._lock:
            self._value += num
            updated = self._value
        return updated

    def dec(self, num=1):
        """Subtract ``num`` while holding the lock and return the new value."""
        with self._lock:
            self._value -= num
            updated = self._value
        return updated

    @property
    def value(self):
        """Current value, read without taking the lock."""
        return self._value
vllm/version.py
View file @
4851c202
...
...
@@ -9,4 +9,4 @@ except Exception as e:
stacklevel
=
2
)
__commit__
=
"COMMIT_HASH_PLACEHOLDER"
__version__
=
"0.6.
0
"
__version__
=
"0.6.
1
"
vllm/worker/cpu_worker.py
View file @
4851c202
...
...
@@ -207,7 +207,8 @@ class CPUWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase):
def
init_device
(
self
)
->
None
:
if
self
.
local_omp_cpuid
!=
"all"
:
torch
.
ops
.
_C_utils
.
init_cpu_threads_env
(
self
.
local_omp_cpuid
)
ret
=
torch
.
ops
.
_C_utils
.
init_cpu_threads_env
(
self
.
local_omp_cpuid
)
logger
.
info
(
ret
)
self
.
init_distributed_environment
()
# Set random seed.
...
...
vllm/worker/model_runner.py
View file @
4851c202
...
...
@@ -30,6 +30,7 @@ from vllm.lora.layers import LoRAMapping
from
vllm.lora.request
import
LoRARequest
from
vllm.lora.worker_manager
import
LRUCacheWorkerLoRAManager
from
vllm.model_executor
import
SamplingMetadata
,
SamplingMetadataCache
from
vllm.model_executor.layers.rotary_embedding
import
MRotaryEmbedding
from
vllm.model_executor.layers.sampler
import
SamplerOutput
from
vllm.model_executor.model_loader
import
get_model
from
vllm.model_executor.model_loader.tensorizer
import
TensorizerConfig
...
...
@@ -74,6 +75,10 @@ _NUM_WARMUP_ITERS = 2
TModelInputForGPU
=
TypeVar
(
'TModelInputForGPU'
,
bound
=
"ModelInputForGPU"
)
# For now, bump up cache limits for recompilations during CUDA graph warmups.
torch
.
_dynamo
.
config
.
cache_size_limit
=
128
torch
.
_dynamo
.
config
.
accumulated_cache_size_limit
=
128
@
dataclass
(
frozen
=
True
)
class
ModelInputForGPU
(
ModelRunnerInputBase
):
...
...
@@ -181,6 +186,7 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
def
simple_reinit
(
self
):
self
.
input_tokens
[
0
].
clear
()
# type: ignore
self
.
input_positions
[
0
].
clear
()
# type: ignore
self
.
mrope_input_positions
=
None
# type: ignore
self
.
seq_lens
[
0
]
=
0
# type: ignore
self
.
orig_seq_lens
[
0
]
=
0
# type: ignore
self
.
query_lens
[
0
]
=
0
# type: ignore
...
...
@@ -206,6 +212,7 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
# Input tokens and positions.
input_tokens
:
Optional
[
List
[
List
[
int
]]]
=
None
,
input_positions
:
Optional
[
List
[
List
[
int
]]]
=
None
,
mrope_input_positions
:
Optional
[
List
[
List
[
List
[
int
]]]]
=
None
,
# The sequence length (may be capped to the sliding window).
seq_lens
:
Optional
[
List
[
int
]]
=
None
,
...
...
@@ -266,6 +273,8 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
for
seq_id
in
range
(
len
(
self
.
seq_ids
)):
self
.
input_positions
[
seq_id
].
clear
()
self
.
mrope_input_positions
=
None
if
seq_lens
:
self
.
seq_lens
=
seq_lens
else
:
...
...
@@ -327,6 +336,7 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
else
:
self
.
input_tokens
=
input_tokens
or
[]
self
.
input_positions
=
input_positions
or
[]
self
.
mrope_input_positions
=
mrope_input_positions
or
None
self
.
seq_lens
=
seq_lens
or
[]
self
.
orig_seq_lens
=
orig_seq_lens
or
[]
self
.
query_lens
=
query_lens
or
[]
...
...
@@ -357,6 +367,7 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
self
.
input_tokens
=
[[]
for
_
in
range
(
self
.
n_seqs
)]
self
.
input_positions
=
[[]
for
_
in
range
(
self
.
n_seqs
)]
self
.
mrope_input_positions
=
None
self
.
seq_lens
=
[
0
]
*
self
.
n_seqs
self
.
orig_seq_lens
=
[
0
]
*
self
.
n_seqs
self
.
query_lens
=
[
0
]
*
self
.
n_seqs
...
...
@@ -493,6 +504,17 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
inter_data
.
query_lens
[
seq_idx
]
=
seq_len
-
context_len
if
inter_data
.
is_prompt
else
1
if
seq_data
.
mrope_position_delta
is
not
None
:
if
inter_data
.
mrope_input_positions
is
None
:
inter_data
.
mrope_input_positions
=
[
None
]
*
inter_data
.
n_seqs
inter_data
.
mrope_input_positions
[
seq_idx
]
=
MRotaryEmbedding
.
get_next_input_positions
(
seq_data
.
mrope_position_delta
,
context_len
,
seq_len
,
)
def
_compute_for_prefix_cache_hit
(
self
,
inter_data
:
InterDataForSeqGroup
,
seq_idx
:
int
,
seq_group_metadata
:
SequenceGroupMetadata
):
...
...
@@ -636,6 +658,40 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
mm_kwargs
=
self
.
multi_modal_input_mapper
(
mm_data
)
inter_data
.
multi_modal_inputs
=
mm_kwargs
# special processing for mrope position deltas.
if
self
.
runner
.
model_is_mrope
:
image_grid_thw
=
mm_kwargs
.
get
(
"image_grid_thw"
,
None
)
video_grid_thw
=
mm_kwargs
.
get
(
"video_grid_thw"
,
None
)
assert
image_grid_thw
is
not
None
or
video_grid_thw
is
not
None
,
(
"mrope embedding type requires multi-modal input mapper "
"returns 'image_grid_thw' or 'video_grid_thw'."
)
hf_config
=
self
.
runner
.
model_config
.
hf_config
inter_data
.
mrope_input_positions
=
[
None
]
*
inter_data
.
n_seqs
for
seq_idx
in
range
(
inter_data
.
n_seqs
):
seq_data
=
seq_group_metadata
.
seq_data
[
inter_data
.
seq_ids
[
seq_idx
]]
token_ids
=
seq_data
.
get_token_ids
()
mrope_input_positions
,
mrope_position_delta
=
\
MRotaryEmbedding
.
get_input_positions
(
token_ids
,
image_grid_thw
=
image_grid_thw
,
video_grid_thw
=
video_grid_thw
,
image_token_id
=
hf_config
.
image_token_id
,
video_token_id
=
hf_config
.
video_token_id
,
vision_start_token_id
=
hf_config
.
vision_start_token_id
,
vision_end_token_id
=
hf_config
.
vision_end_token_id
,
spatial_merge_size
=
hf_config
.
vision_config
.
spatial_merge_size
,
context_len
=
inter_data
.
context_lens
[
seq_idx
],
)
seq_data
.
mrope_position_delta
=
mrope_position_delta
inter_data
.
mrope_input_positions
[
seq_idx
]
=
mrope_input_positions
def
add_seq_group
(
self
,
seq_group_metadata
:
SequenceGroupMetadata
):
"""Add a sequence group to the builder."""
seq_ids
=
seq_group_metadata
.
seq_data
.
keys
()
...
...
@@ -684,10 +740,27 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
# prefix caching and there is no decode request.
return
self
.
model_input_cls
()
input_positions
=
[]
for
inter_data
in
self
.
inter_data_list
:
for
cur_input_positions
in
inter_data
.
input_positions
:
input_positions
.
extend
(
cur_input_positions
)
mrope_input_positions
:
Optional
[
List
[
List
[
int
]]]
=
None
if
any
(
inter_data
.
mrope_input_positions
is
not
None
for
inter_data
in
self
.
inter_data_list
):
mrope_input_positions
=
[[]
for
_
in
range
(
3
)]
for
idx
in
range
(
3
):
for
inter_data
in
self
.
inter_data_list
:
msections
=
inter_data
.
mrope_input_positions
if
msections
is
None
:
for
_seq_input_positions
in
inter_data
.
input_positions
:
mrope_input_positions
[
idx
].
extend
(
_seq_input_positions
)
else
:
for
_seq_mrope_input_positions
in
msections
:
mrope_input_positions
[
idx
].
extend
(
_seq_mrope_input_positions
[
idx
])
input_positions
=
None
else
:
input_positions
=
[]
for
inter_data
in
self
.
inter_data_list
:
for
cur_input_positions
in
inter_data
.
input_positions
:
input_positions
.
extend
(
cur_input_positions
)
seq_lens
=
[]
max_decode_seq_len
=
0
...
...
@@ -724,15 +797,24 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
# Tokens and positions.
if
cuda_graph_pad_size
:
input_tokens
.
extend
(
itertools
.
repeat
(
0
,
cuda_graph_pad_size
))
input_positions
.
extend
(
itertools
.
repeat
(
0
,
cuda_graph_pad_size
))
assert
self
.
runner
.
device
is
not
None
input_tokens_tensor
=
async_tensor_h2d
(
input_tokens
,
torch
.
long
,
self
.
runner
.
device
,
self
.
runner
.
pin_memory
)
input_positions_tensor
=
async_tensor_h2d
(
input_positions
,
torch
.
long
,
self
.
runner
.
device
,
self
.
runner
.
pin_memory
)
if
mrope_input_positions
is
not
None
:
for
idx
in
range
(
3
):
mrope_input_positions
[
idx
].
extend
(
itertools
.
repeat
(
0
,
cuda_graph_pad_size
))
input_positions_tensor
=
async_tensor_h2d
(
mrope_input_positions
,
torch
.
long
,
self
.
runner
.
device
,
self
.
runner
.
pin_memory
)
else
:
input_positions
.
extend
(
itertools
.
repeat
(
0
,
cuda_graph_pad_size
))
input_positions_tensor
=
async_tensor_h2d
(
input_positions
,
torch
.
long
,
self
.
runner
.
device
,
self
.
runner
.
pin_memory
)
# Sequence and query lengths.
if
cuda_graph_pad_size
:
seq_lens
.
extend
(
itertools
.
repeat
(
1
,
cuda_graph_pad_size
))
...
...
@@ -982,9 +1064,10 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
"This may lead to less accurate results!"
)
if
envs
.
VLLM_TEST_DYNAMO_GRAPH_CAPTURE
and
supports_dynamo
():
self
.
model
=
torch
.
compile
(
self
.
model
,
fullgraph
=
True
,
backend
=
"eager"
)
self
.
model
=
torch
.
compile
(
self
.
model
,
fullgraph
=
envs
.
VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE
,
backend
=
"eager"
)
def
save_sharded_state
(
self
,
...
...
@@ -1226,6 +1309,15 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
raise
RuntimeError
(
"PromptAdapter is not enabled."
)
return
self
.
prompt_adapter_manager
.
list_adapters
()
@property
def model_is_mrope(self) -> bool:
    """Tell whether the model's ``rope_scaling`` type is "mrope".

    mrope requires keeping "rope_deltas" between the prompt and decoding
    phases, so callers use this flag to enable that extra bookkeeping.

    Returns:
        True only when the HF config's ``rope_scaling`` mapping has
        ``"type"`` equal to ``"mrope"``. A config with no ``rope_scaling``
        attribute, or with ``rope_scaling`` set to ``None``, yields False.
    """
    hf_config = self.model_config.hf_config
    # A missing attribute falls back to an empty mapping; an explicit None
    # is handled by the guard below.
    scaling = getattr(hf_config, "rope_scaling", {})
    return scaling is not None and scaling.get("type", None) == "mrope"
@
torch
.
inference_mode
()
def
capture_model
(
self
,
kv_caches
:
List
[
List
[
torch
.
Tensor
]])
->
None
:
"""Cuda graph capture a model.
...
...
@@ -1256,7 +1348,8 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
max_batch_size
=
self
.
max_batchsize_to_capture
input_tokens
=
torch
.
zeros
(
max_batch_size
,
dtype
=
torch
.
long
).
cuda
()
input_positions
=
torch
.
zeros
(
max_batch_size
,
dtype
=
torch
.
long
).
cuda
()
if
self
.
model_is_mrope
:
input_positions
=
torch
.
tile
(
input_positions
,
(
3
,
1
))
# Prepare dummy previous_hidden_states only if needed by the model.
# This is used by draft models such as EAGLE.
previous_hidden_states
=
None
...
...
@@ -1320,7 +1413,7 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
"input_ids"
:
input_tokens
[:
batch_size
],
"positions"
:
input_positions
[:
batch_size
],
input_positions
[
...,
:
batch_size
],
"hidden_or_intermediate_states"
:
hidden_or_intermediate_states
[
virtual_engine
]
# type: ignore
...
...
Prev
1
…
6
7
8
9
10
11
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment