Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
4851c202
Commit
4851c202
authored
Sep 13, 2024
by
zhuwenwen
Browse files
Merge tag 'v0.6.1' into v0.6.1-dev
parents
9b902f9e
3fd2b0d2
Changes
203
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
674 additions
and
106 deletions
+674
-106
vllm/multimodal/registry.py
vllm/multimodal/registry.py
+2
-1
vllm/multimodal/utils.py
vllm/multimodal/utils.py
+42
-0
vllm/multimodal/video.py
vllm/multimodal/video.py
+71
-0
vllm/platforms/__init__.py
vllm/platforms/__init__.py
+10
-0
vllm/platforms/cpu.py
vllm/platforms/cpu.py
+15
-0
vllm/platforms/interface.py
vllm/platforms/interface.py
+7
-3
vllm/platforms/tpu.py
vllm/platforms/tpu.py
+0
-6
vllm/prompt_adapter/models.py
vllm/prompt_adapter/models.py
+1
-1
vllm/prompt_adapter/utils.py
vllm/prompt_adapter/utils.py
+93
-0
vllm/sequence.py
vllm/sequence.py
+11
-0
vllm/spec_decode/draft_model_runner.py
vllm/spec_decode/draft_model_runner.py
+3
-13
vllm/transformers_utils/config.py
vllm/transformers_utils/config.py
+154
-32
vllm/transformers_utils/configs/__init__.py
vllm/transformers_utils/configs/__init__.py
+4
-0
vllm/transformers_utils/image_processor.py
vllm/transformers_utils/image_processor.py
+27
-0
vllm/transformers_utils/processor.py
vllm/transformers_utils/processor.py
+37
-0
vllm/transformers_utils/tokenizers/mistral.py
vllm/transformers_utils/tokenizers/mistral.py
+62
-34
vllm/utils.py
vllm/utils.py
+25
-0
vllm/version.py
vllm/version.py
+1
-1
vllm/worker/cpu_worker.py
vllm/worker/cpu_worker.py
+2
-1
vllm/worker/model_runner.py
vllm/worker/model_runner.py
+107
-14
No files found.
vllm/multimodal/registry.py
View file @
4851c202
...
@@ -9,6 +9,7 @@ from .audio import AudioPlugin
...
@@ -9,6 +9,7 @@ from .audio import AudioPlugin
from
.base
import
(
MultiModalDataDict
,
MultiModalInputMapper
,
MultiModalInputs
,
from
.base
import
(
MultiModalDataDict
,
MultiModalInputMapper
,
MultiModalInputs
,
MultiModalPlugin
,
MultiModalTokensCalc
,
NestedTensors
)
MultiModalPlugin
,
MultiModalTokensCalc
,
NestedTensors
)
from
.image
import
ImagePlugin
from
.image
import
ImagePlugin
from
.video
import
VideoPlugin
logger
=
init_logger
(
__name__
)
logger
=
init_logger
(
__name__
)
...
@@ -34,7 +35,7 @@ class MultiModalRegistry:
...
@@ -34,7 +35,7 @@ class MultiModalRegistry:
:class:`~vllm.multimodal.MultiModalPlugin` for each modality.
:class:`~vllm.multimodal.MultiModalPlugin` for each modality.
"""
"""
DEFAULT_PLUGINS
=
(
ImagePlugin
(),
AudioPlugin
())
DEFAULT_PLUGINS
=
(
ImagePlugin
(),
AudioPlugin
()
,
VideoPlugin
()
)
def
__init__
(
def
__init__
(
self
,
self
,
...
...
vllm/multimodal/utils.py
View file @
4851c202
...
@@ -4,6 +4,7 @@ from io import BytesIO
...
@@ -4,6 +4,7 @@ from io import BytesIO
from
typing
import
Any
,
List
,
Optional
,
Tuple
,
TypeVar
,
Union
from
typing
import
Any
,
List
,
Optional
,
Tuple
,
TypeVar
,
Union
import
numpy
as
np
import
numpy
as
np
import
numpy.typing
as
npt
from
PIL
import
Image
from
PIL
import
Image
from
vllm.connections
import
global_http_connection
from
vllm.connections
import
global_http_connection
...
@@ -187,6 +188,47 @@ def rescale_image_size(image: Image.Image,
...
@@ -187,6 +188,47 @@ def rescale_image_size(image: Image.Image,
return
image
return
image
def try_import_video_packages() -> Any:
    """Import ``cv2`` lazily and hand the module back to the caller.

    Returns:
        The imported ``cv2`` module.

    Raises:
        ImportError: if ``cv2`` cannot be imported. The underlying import
            error is suppressed (``from None``) so only the install hint
            is shown.
    """
    try:
        import cv2 as video_backend
    except ImportError:
        install_hint = "Please install vllm[video] for video support."
        raise ImportError(install_hint) from None
    else:
        return video_backend
def resize_video(frames: npt.NDArray, size: Tuple[int, int]) -> npt.NDArray:
    """Resize every frame of ``frames`` to ``size`` using ``cv2.resize``.

    Args:
        frames: Array whose shape unpacks into exactly four values; the
            first is used as the frame count and the last as the channel
            count.
        size: ``(new_height, new_width)`` of the output frames.

    Returns:
        A newly allocated array of shape
        ``(num_frames, new_height, new_width, channels)`` with the same
        dtype as ``frames``.
    """
    cv2 = try_import_video_packages()

    frame_count, _, _, depth = frames.shape
    target_h, target_w = size
    out = np.empty((frame_count, target_h, target_w, depth),
                   dtype=frames.dtype)
    for idx in range(frame_count):
        # The target size is handed to cv2.resize as (width, height).
        out[idx] = cv2.resize(frames[idx], (target_w, target_h))
    return out
def rescale_video_size(frames: npt.NDArray,
                       size_factor: float) -> npt.NDArray:
    """Scale the height and width of ``frames`` by ``size_factor``.

    The scaled dimensions are truncated with ``int()`` and the actual
    resizing is delegated to ``resize_video``.

    Args:
        frames: Array whose shape unpacks into four values, the second
            and third being read as height and width.
        size_factor: Multiplier applied to both height and width.

    Returns:
        Whatever ``resize_video`` returns for the scaled size.
    """
    _, src_h, src_w, _ = frames.shape
    scaled_size = (int(src_h * size_factor), int(src_w * size_factor))
    return resize_video(frames, scaled_size)
def sample_frames_from_video(frames: npt.NDArray,
                             num_frames: int) -> npt.NDArray:
    """Pick ``num_frames`` evenly spaced frames along the first axis.

    Args:
        frames: Array indexed by frame along axis 0.
        num_frames: Number of frames to keep; ``-1`` returns ``frames``
            itself, unchanged.

    Returns:
        ``frames`` when ``num_frames == -1``; otherwise the frames at the
        indices ``np.linspace(0, total - 1, num_frames, dtype=int)``.
    """
    total = frames.shape[0]
    if num_frames == -1:
        return frames

    picks = np.linspace(0, total - 1, num_frames, dtype=int)
    return frames[picks, ...]
# Utilities for input processors
# Utilities for input processors
_T
=
TypeVar
(
"_T"
,
str
,
int
)
_T
=
TypeVar
(
"_T"
,
str
,
int
)
...
...
vllm/multimodal/video.py
0 → 100644
View file @
4851c202
from
functools
import
lru_cache
from
typing
import
List
,
Union
import
numpy
as
np
from
vllm.config
import
ModelConfig
from
vllm.inputs.registry
import
InputContext
from
vllm.logger
import
init_logger
from
vllm.transformers_utils.image_processor
import
get_video_processor
from
vllm.transformers_utils.tokenizer
import
get_tokenizer
from
vllm.utils
import
is_list_of
from
.base
import
MultiModalData
,
MultiModalInputs
from
.image
import
ImagePlugin
logger
=
init_logger
(
__name__
)
cached_get_video_processor
=
lru_cache
(
get_video_processor
)
cached_get_tokenizer
=
lru_cache
(
get_tokenizer
)
VideoInput
=
Union
[
"np.ndarray"
,
# single video input
List
[
"np.ndarray"
],
# TODO: support more types
# List[Image.Image], List[List[Image.Image]],
# "torch.Tensor",
# List["torch.Tensor"],
# List[List["np.ndarray"]],
# List[List["torch.Tensor"]],
]
class VideoPlugin(ImagePlugin):
    """Plugin for video data."""

    def get_data_key(self) -> str:
        """Return the key under which video data is registered."""
        return "video"

    def _get_hf_video_processor(self, model_config: ModelConfig):
        """Fetch the (cached) HuggingFace video processor for the model."""
        return cached_get_video_processor(
            model_config.model,
            trust_remote_code=model_config.trust_remote_code)

    def _default_input_mapper(
        self,
        ctx: InputContext,
        data: MultiModalData[object],
    ) -> MultiModalInputs:
        """Map raw video data to model inputs via the HF video processor.

        Args:
            ctx: Input context; only ``ctx.model_config`` is read.
            data: A single video as ``np.ndarray``. A list of arrays is
                recognised but not supported yet.

        Returns:
            ``MultiModalInputs`` wrapping the processor's ``.data`` output.

        Raises:
            RuntimeError: if no video processor is available.
            NotImplementedError: if ``data`` is a list of ``np.ndarray``.
            TypeError: for any other input type.
        """
        model_config = ctx.model_config

        # single video input as np.ndarray
        if isinstance(data, np.ndarray):
            video_processor = self._get_hf_video_processor(model_config)
            if video_processor is None:
                raise RuntimeError("No HuggingFace processor is available "
                                   "to process the video object")
            try:
                batch_data = video_processor(data, return_tensors="pt").data
            except Exception:
                # Log which input failed, then let the original error
                # propagate unchanged.
                logger.error("Failed to process video (%s)", data)
                raise

            return MultiModalInputs(batch_data)
        elif is_list_of(data, np.ndarray):
            raise NotImplementedError(
                "Multi video for a prompt is not supported yet")

        raise TypeError(f"Invalid video type: {type(data)}")

    def _default_max_multimodal_tokens(self, ctx: InputContext) -> int:
        """Return the fixed default token budget (4096); ``ctx`` is unused."""
        return 4096
vllm/platforms/__init__.py
View file @
4851c202
...
@@ -42,6 +42,13 @@ try:
...
@@ -42,6 +42,13 @@ try:
except
Exception
:
except
Exception
:
pass
pass
# Detect a CPU build: is_cpu becomes True when the version string of the
# installed "vllm" distribution contains "cpu".
is_cpu = False
try:
    from importlib.metadata import version
    is_cpu = "cpu" in version("vllm")
except Exception:
    # Best-effort probe: on any failure is_cpu simply stays False.
    pass
if
is_tpu
:
if
is_tpu
:
# people might install pytorch built with cuda but run on tpu
# people might install pytorch built with cuda but run on tpu
# so we need to check tpu first
# so we need to check tpu first
...
@@ -53,6 +60,9 @@ elif is_cuda:
...
@@ -53,6 +60,9 @@ elif is_cuda:
elif
is_rocm
:
elif
is_rocm
:
from
.rocm
import
RocmPlatform
from
.rocm
import
RocmPlatform
current_platform
=
RocmPlatform
()
current_platform
=
RocmPlatform
()
elif
is_cpu
:
from
.cpu
import
CpuPlatform
current_platform
=
CpuPlatform
()
else
:
else
:
current_platform
=
UnspecifiedPlatform
()
current_platform
=
UnspecifiedPlatform
()
...
...
vllm/platforms/cpu.py
0 → 100644
View file @
4851c202
import
torch
from
.interface
import
Platform
,
PlatformEnum
class CpuPlatform(Platform):
    """Platform implementation tagged with ``PlatformEnum.CPU``."""
    _enum = PlatformEnum.CPU

    @staticmethod
    def get_device_name(device_id: int = 0) -> str:
        """Return the constant name ``"cpu"``; ``device_id`` is not used."""
        return "cpu"

    @staticmethod
    def inference_mode():
        """Return a ``torch.no_grad()`` context manager."""
        return torch.no_grad()
vllm/platforms/interface.py
View file @
4851c202
import
enum
import
enum
from
typing
import
Tuple
from
typing
import
Optional
,
Tuple
import
torch
import
torch
...
@@ -8,6 +8,7 @@ class PlatformEnum(enum.Enum):
...
@@ -8,6 +8,7 @@ class PlatformEnum(enum.Enum):
CUDA
=
enum
.
auto
()
CUDA
=
enum
.
auto
()
ROCM
=
enum
.
auto
()
ROCM
=
enum
.
auto
()
TPU
=
enum
.
auto
()
TPU
=
enum
.
auto
()
CPU
=
enum
.
auto
()
UNSPECIFIED
=
enum
.
auto
()
UNSPECIFIED
=
enum
.
auto
()
...
@@ -23,9 +24,12 @@ class Platform:
...
@@ -23,9 +24,12 @@ class Platform:
def
is_tpu
(
self
)
->
bool
:
def
is_tpu
(
self
)
->
bool
:
return
self
.
_enum
==
PlatformEnum
.
TPU
return
self
.
_enum
==
PlatformEnum
.
TPU
def is_cpu(self) -> bool:
    """Return True when this platform's ``_enum`` is ``PlatformEnum.CPU``."""
    return self._enum == PlatformEnum.CPU
@
staticmethod
@
staticmethod
def
get_device_capability
(
device_id
:
int
=
0
)
->
Tuple
[
int
,
int
]:
def
get_device_capability
(
device_id
:
int
=
0
)
->
Optional
[
Tuple
[
int
,
int
]
]
:
r
aise
NotImplementedError
r
eturn
None
@
staticmethod
@
staticmethod
def
get_device_name
(
device_id
:
int
=
0
)
->
str
:
def
get_device_name
(
device_id
:
int
=
0
)
->
str
:
...
...
vllm/platforms/tpu.py
View file @
4851c202
from
typing
import
Tuple
import
torch
import
torch
from
.interface
import
Platform
,
PlatformEnum
from
.interface
import
Platform
,
PlatformEnum
...
@@ -8,10 +6,6 @@ from .interface import Platform, PlatformEnum
...
@@ -8,10 +6,6 @@ from .interface import Platform, PlatformEnum
class
TpuPlatform
(
Platform
):
class
TpuPlatform
(
Platform
):
_enum
=
PlatformEnum
.
TPU
_enum
=
PlatformEnum
.
TPU
@
staticmethod
def
get_device_capability
(
device_id
:
int
=
0
)
->
Tuple
[
int
,
int
]:
raise
RuntimeError
(
"TPU does not have device capability."
)
@
staticmethod
@
staticmethod
def
inference_mode
():
def
inference_mode
():
return
torch
.
no_grad
()
return
torch
.
no_grad
()
vllm/prompt_adapter/models.py
View file @
4851c202
...
@@ -14,6 +14,7 @@ from vllm.config import PromptAdapterConfig
...
@@ -14,6 +14,7 @@ from vllm.config import PromptAdapterConfig
from
vllm.prompt_adapter.layers
import
(
from
vllm.prompt_adapter.layers
import
(
VocabParallelEmbeddingWithPromptAdapter
)
# yapf: disable
VocabParallelEmbeddingWithPromptAdapter
)
# yapf: disable
from
vllm.prompt_adapter.layers
import
PromptAdapterMapping
from
vllm.prompt_adapter.layers
import
PromptAdapterMapping
from
vllm.prompt_adapter.utils
import
load_peft_weights
logger
=
logging
.
getLogger
(
__name__
)
logger
=
logging
.
getLogger
(
__name__
)
...
@@ -90,7 +91,6 @@ class PromptAdapterModel(AdapterModel):
...
@@ -90,7 +91,6 @@ class PromptAdapterModel(AdapterModel):
config
:
PromptAdapterConfig
,
config
:
PromptAdapterConfig
,
device
:
str
=
"cuda"
,
device
:
str
=
"cuda"
,
)
->
"PromptAdapterModel"
:
)
->
"PromptAdapterModel"
:
from
peft.utils
import
load_peft_weights
if
num_virtual_tokens
>
config
.
max_prompt_adapter_token
:
if
num_virtual_tokens
>
config
.
max_prompt_adapter_token
:
raise
ValueError
(
raise
ValueError
(
...
...
vllm/prompt_adapter/utils.py
0 → 100644
View file @
4851c202
# code borrowed from: https://github.com/huggingface/peft/blob/v0.12.0/src/peft/utils/save_and_load.py#L420
import
os
from
typing
import
Optional
import
torch
from
huggingface_hub
import
file_exists
,
hf_hub_download
from
huggingface_hub.utils
import
EntryNotFoundError
from
safetensors.torch
import
load_file
as
safe_load_file
WEIGHTS_NAME
=
"adapter_model.bin"
SAFETENSORS_WEIGHTS_NAME
=
"adapter_model.safetensors"
# Get current device name based on available devices
def infer_device() -> str:
    """Return ``"cuda"`` when ``torch.cuda.is_available()``, else ``"cpu"``."""
    return "cuda" if torch.cuda.is_available() else "cpu"
def load_peft_weights(model_id: str,
                      device: Optional[str] = None,
                      **hf_hub_download_kwargs) -> dict:
    r"""
    A helper method to load the PEFT weights from the HuggingFace Hub or locally

    Lookup order: a local ``adapter_model.safetensors`` under ``model_id``
    (plus ``subfolder`` if given), then a local ``adapter_model.bin``, then
    the same two files on the Hub (safetensors preferred).

    Args:
        model_id (`str`):
            The local path to the adapter weights or the name of the adapter to
            load from the HuggingFace Hub.
        device (`str`):
            The device to load the weights onto. When `None`, the result of
            `infer_device()` is used.
        hf_hub_download_kwargs (`dict`):
            Additional arguments to pass to the `hf_hub_download` method when
            loading from the HuggingFace Hub.

    Returns:
        The loaded adapter weights.

    Raises:
        ValueError: if neither weight file is found locally or on the Hub.
    """
    subfolder = hf_hub_download_kwargs.get("subfolder", None)
    path = (os.path.join(model_id, subfolder)
            if subfolder is not None else model_id)

    if device is None:
        device = infer_device()

    # Build each candidate local path once and reuse it.
    local_safetensors = os.path.join(path, SAFETENSORS_WEIGHTS_NAME)
    local_bin = os.path.join(path, WEIGHTS_NAME)

    if os.path.exists(local_safetensors):
        filename = local_safetensors
        use_safetensors = True
    elif os.path.exists(local_bin):
        filename = local_bin
        use_safetensors = False
    else:
        # Fall back to `use_auth_token` when no `token` was supplied.
        token = hf_hub_download_kwargs.get("token", None)
        if token is None:
            token = hf_hub_download_kwargs.get("use_auth_token", None)

        hub_filename = (os.path.join(subfolder, SAFETENSORS_WEIGHTS_NAME)
                        if subfolder is not None else
                        SAFETENSORS_WEIGHTS_NAME)
        has_remote_safetensors_file = file_exists(
            repo_id=model_id,
            filename=hub_filename,
            revision=hf_hub_download_kwargs.get("revision", None),
            repo_type=hf_hub_download_kwargs.get("repo_type", None),
            token=token,
        )
        use_safetensors = has_remote_safetensors_file

        if has_remote_safetensors_file:
            # Priority 1: load safetensors weights
            filename = hf_hub_download(
                model_id,
                SAFETENSORS_WEIGHTS_NAME,
                **hf_hub_download_kwargs,
            )
        else:
            try:
                filename = hf_hub_download(model_id, WEIGHTS_NAME,
                                           **hf_hub_download_kwargs)
            except EntryNotFoundError as err:
                # Adjacent literals keep source indentation out of the
                # message; chaining keeps the Hub error as the cause.
                raise ValueError(
                    f"Can't find weights for {model_id} in {model_id} or "
                    f"in the Hugging Face Hub. "
                    f"Please check that the file {WEIGHTS_NAME} or "
                    f"{SAFETENSORS_WEIGHTS_NAME} is present at {model_id}."
                ) from err

    if use_safetensors:
        adapters_weights = safe_load_file(filename, device=device)
    else:
        # NOTE(review): torch.load unpickles the checkpoint; only use this
        # path with adapter files from a trusted source.
        adapters_weights = torch.load(filename,
                                      map_location=torch.device(device))

    return adapters_weights
vllm/sequence.py
View file @
4851c202
...
@@ -165,6 +165,9 @@ class SequenceData(msgspec.Struct,
...
@@ -165,6 +165,9 @@ class SequenceData(msgspec.Struct,
# is called.
# is called.
_new_appended_tokens
:
List
[
int
]
=
msgspec
.
field
(
default_factory
=
list
)
_new_appended_tokens
:
List
[
int
]
=
msgspec
.
field
(
default_factory
=
list
)
# It is used to compute mrope_position_ids.
_mrope_position_delta
:
Optional
[
int
]
=
None
def
__post_init__
(
self
)
->
None
:
def
__post_init__
(
self
)
->
None
:
assert
self
.
_prompt_token_ids
.
typecode
==
"l"
assert
self
.
_prompt_token_ids
.
typecode
==
"l"
assert
self
.
_output_token_ids
.
typecode
==
"l"
assert
self
.
_output_token_ids
.
typecode
==
"l"
...
@@ -219,6 +222,14 @@ class SequenceData(msgspec.Struct,
...
@@ -219,6 +222,14 @@ class SequenceData(msgspec.Struct,
assert
isinstance
(
self
.
_output_token_ids
,
array
)
assert
isinstance
(
self
.
_output_token_ids
,
array
)
return
self
.
_output_token_ids
return
self
.
_output_token_ids
@property
def mrope_position_delta(self) -> Optional[int]:
    """Return the stored ``_mrope_position_delta`` value."""
    return self._mrope_position_delta

@mrope_position_delta.setter
def mrope_position_delta(self, new_mrope_position_delta):
    """Store ``new_mrope_position_delta`` in ``_mrope_position_delta``."""
    self._mrope_position_delta = new_mrope_position_delta
def
append_token_id
(
self
,
token_id
:
int
,
logprob
:
float
)
->
None
:
def
append_token_id
(
self
,
token_id
:
int
,
logprob
:
float
)
->
None
:
self
.
_output_token_ids
.
append
(
token_id
)
self
.
_output_token_ids
.
append
(
token_id
)
self
.
_new_appended_tokens
.
append
(
token_id
)
self
.
_new_appended_tokens
.
append
(
token_id
)
...
...
vllm/spec_decode/draft_model_runner.py
View file @
4851c202
...
@@ -2,7 +2,6 @@ from typing import List, Optional
...
@@ -2,7 +2,6 @@ from typing import List, Optional
import
torch
import
torch
from
vllm
import
_custom_ops
as
ops
from
vllm.model_executor.layers.sampler
import
SamplerOutput
from
vllm.model_executor.layers.sampler
import
SamplerOutput
try
:
try
:
...
@@ -116,18 +115,9 @@ class TP1DraftModelRunner(ModelRunner):
...
@@ -116,18 +115,9 @@ class TP1DraftModelRunner(ModelRunner):
# Update attn_metadata
# Update attn_metadata
attn_metadata
=
model_input
.
attn_metadata
attn_metadata
=
model_input
.
attn_metadata
assert
isinstance
(
attn_metadata
,
FlashAttentionMetadata
)
assert
isinstance
(
attn_metadata
,
FlashAttentionMetadata
)
attn_metadata
.
advance_step
(
num_seqs
,
num_queries
)
attn_metadata
.
advance_step
(
model_input
,
sampled_token_ids
,
# Update GPU tensors
self
.
block_size
,
num_seqs
,
num_queries
)
ops
.
advance_step
(
num_seqs
=
num_seqs
,
num_queries
=
num_queries
,
block_size
=
self
.
block_size
,
input_tokens
=
model_input
.
input_tokens
,
sampled_token_ids
=
sampled_token_ids
,
input_positions
=
model_input
.
input_positions
,
seq_lens
=
attn_metadata
.
seq_lens_tensor
,
slot_mapping
=
attn_metadata
.
slot_mapping
,
block_tables
=
attn_metadata
.
block_tables
)
# Update sampling_metadata
# Update sampling_metadata
sampling_metadata
=
model_input
.
sampling_metadata
sampling_metadata
=
model_input
.
sampling_metadata
...
...
vllm/transformers_utils/config.py
View file @
4851c202
import
contextlib
import
contextlib
import
enum
import
json
from
pathlib
import
Path
from
pathlib
import
Path
from
typing
import
Any
,
Dict
,
Optional
,
Type
,
Union
from
typing
import
Any
,
Dict
,
Optional
,
Type
,
Union
from
huggingface_hub
import
file_exists
,
hf_hub_download
from
transformers
import
GenerationConfig
,
PretrainedConfig
from
transformers
import
GenerationConfig
,
PretrainedConfig
from
transformers.models.auto.image_processing_auto
import
(
from
transformers.models.auto.image_processing_auto
import
(
get_image_processor_config
)
get_image_processor_config
)
from
transformers.models.auto.modeling_auto
import
(
from
transformers.models.auto.modeling_auto
import
(
MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
)
MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
)
from
transformers.utils
import
CONFIG_NAME
as
HF_CONFIG_NAME
from
vllm.envs
import
VLLM_USE_MODELSCOPE
from
vllm.envs
import
VLLM_USE_MODELSCOPE
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
# yapf conflicts with isort for this block
# yapf: disable
from
vllm.transformers_utils.configs
import
(
ChatGLMConfig
,
DbrxConfig
,
from
vllm.transformers_utils.configs
import
(
ChatGLMConfig
,
DbrxConfig
,
EAGLEConfig
,
ExaoneConfig
,
EAGLEConfig
,
ExaoneConfig
,
InternVLChatConfig
,
JAISConfig
,
GraniteConfig
,
InternVLChatConfig
,
MedusaConfig
,
MLPSpeculatorConfig
,
JAISConfig
,
MedusaConfig
,
MPTConfig
,
NemotronConfig
,
MLPSpeculatorConfig
,
MPTConfig
,
RWConfig
,
UltravoxConfig
)
NemotronConfig
,
RWConfig
,
UltravoxConfig
)
# yapf: enable
from
vllm.transformers_utils.utils
import
check_gguf_file
from
vllm.transformers_utils.utils
import
check_gguf_file
if
VLLM_USE_MODELSCOPE
:
if
VLLM_USE_MODELSCOPE
:
...
@@ -23,6 +31,8 @@ if VLLM_USE_MODELSCOPE:
...
@@ -23,6 +31,8 @@ if VLLM_USE_MODELSCOPE:
else
:
else
:
from
transformers
import
AutoConfig
from
transformers
import
AutoConfig
MISTRAL_CONFIG_NAME
=
"params.json"
logger
=
init_logger
(
__name__
)
logger
=
init_logger
(
__name__
)
_CONFIG_REGISTRY
:
Dict
[
str
,
Type
[
PretrainedConfig
]]
=
{
_CONFIG_REGISTRY
:
Dict
[
str
,
Type
[
PretrainedConfig
]]
=
{
...
@@ -39,6 +49,9 @@ _CONFIG_REGISTRY: Dict[str, Type[PretrainedConfig]] = {
...
@@ -39,6 +49,9 @@ _CONFIG_REGISTRY: Dict[str, Type[PretrainedConfig]] = {
"internvl_chat"
:
InternVLChatConfig
,
"internvl_chat"
:
InternVLChatConfig
,
"nemotron"
:
NemotronConfig
,
"nemotron"
:
NemotronConfig
,
"ultravox"
:
UltravoxConfig
,
"ultravox"
:
UltravoxConfig
,
# Granite can be removed from here once we have upgraded to
# transformers 4.45+
"granite"
:
GraniteConfig
,
}
}
for
name
,
cls
in
_CONFIG_REGISTRY
.
items
():
for
name
,
cls
in
_CONFIG_REGISTRY
.
items
():
...
@@ -46,6 +59,20 @@ for name, cls in _CONFIG_REGISTRY.items():
...
@@ -46,6 +59,20 @@ for name, cls in _CONFIG_REGISTRY.items():
AutoConfig
.
register
(
name
,
cls
)
AutoConfig
.
register
(
name
,
cls
)
class ConfigFormat(str, enum.Enum):
    """String-valued enum naming the supported model config formats."""
    # Choose the format from whichever config file is found.
    AUTO = "auto"
    # HuggingFace-style config.
    HF = "hf"
    # Mistral-style ``params.json`` config.
    MISTRAL = "mistral"
def file_or_path_exists(model: Union[str, Path], config_name, revision,
                        token) -> bool:
    """Report whether ``config_name`` exists for ``model``.

    If ``model`` exists on the local filesystem, the answer is whether
    ``model / config_name`` is a regular file. Otherwise the question is
    forwarded to ``file_exists`` with ``revision`` and ``token``.
    """
    local_root = Path(model)
    if not local_root.exists():
        return file_exists(model, config_name, revision=revision, token=token)
    return local_root.joinpath(config_name).is_file()
def
get_config
(
def
get_config
(
model
:
Union
[
str
,
Path
],
model
:
Union
[
str
,
Path
],
trust_remote_code
:
bool
,
trust_remote_code
:
bool
,
...
@@ -53,38 +80,68 @@ def get_config(
...
@@ -53,38 +80,68 @@ def get_config(
code_revision
:
Optional
[
str
]
=
None
,
code_revision
:
Optional
[
str
]
=
None
,
rope_scaling
:
Optional
[
dict
]
=
None
,
rope_scaling
:
Optional
[
dict
]
=
None
,
rope_theta
:
Optional
[
float
]
=
None
,
rope_theta
:
Optional
[
float
]
=
None
,
config_format
:
ConfigFormat
=
ConfigFormat
.
AUTO
,
**
kwargs
,
**
kwargs
,
)
->
PretrainedConfig
:
)
->
PretrainedConfig
:
# Separate model folder from file path for GGUF models
# Separate model folder from file path for GGUF models
is_gguf
=
check_gguf_file
(
model
)
is_gguf
=
check_gguf_file
(
model
)
if
is_gguf
:
if
is_gguf
:
kwargs
[
"gguf_file"
]
=
Path
(
model
).
name
kwargs
[
"gguf_file"
]
=
Path
(
model
).
name
model
=
Path
(
model
).
parent
model
=
Path
(
model
).
parent
try
:
if
config_format
==
ConfigFormat
.
AUTO
:
config
=
AutoConfig
.
from_pretrained
(
if
is_gguf
or
file_or_path_exists
(
model
,
model
,
HF_CONFIG_NAME
,
trust_remote_code
=
trust_remote_code
,
revision
=
revision
,
revision
=
revision
,
token
=
kwargs
.
get
(
"token"
)):
code_revision
=
code_revision
,
config_format
=
ConfigFormat
.
HF
**
kwargs
)
elif
file_or_path_exists
(
model
,
except
ValueError
as
e
:
MISTRAL_CONFIG_NAME
,
if
(
not
trust_remote_code
and
revision
=
revision
,
"requires you to execute the configuration file"
in
str
(
e
)):
token
=
kwargs
.
get
(
"token"
)):
err_msg
=
(
config_format
=
ConfigFormat
.
MISTRAL
"Failed to load the model config. If the model is a custom "
else
:
"model not yet available in the HuggingFace transformers "
raise
ValueError
(
f
"No supported config format found in
{
model
}
"
)
"library, consider setting `trust_remote_code=True` in LLM "
"or using the `--trust-remote-code` flag in the CLI."
)
if
config_format
==
ConfigFormat
.
HF
:
raise
RuntimeError
(
err_msg
)
from
e
config_dict
,
_
=
PretrainedConfig
.
get_config_dict
(
model
,
revision
=
revision
,
code_revision
=
code_revision
,
**
kwargs
)
# Use custom model class if it's in our registry
model_type
=
config_dict
.
get
(
"model_type"
)
if
model_type
in
_CONFIG_REGISTRY
:
config_class
=
_CONFIG_REGISTRY
[
model_type
]
config
=
config_class
.
from_pretrained
(
model
,
revision
=
revision
,
code_revision
=
code_revision
)
else
:
else
:
raise
e
try
:
if
config
.
model_type
in
_CONFIG_REGISTRY
:
config
=
AutoConfig
.
from_pretrained
(
config_class
=
_CONFIG_REGISTRY
[
config
.
model_type
]
model
,
config
=
config_class
.
from_pretrained
(
model
,
trust_remote_code
=
trust_remote_code
,
revision
=
revision
,
revision
=
revision
,
code_revision
=
code_revision
)
code_revision
=
code_revision
,
**
kwargs
,
)
except
ValueError
as
e
:
if
(
not
trust_remote_code
and
"requires you to execute the configuration file"
in
str
(
e
)):
err_msg
=
(
"Failed to load the model config. If the model "
"is a custom model not yet available in the "
"HuggingFace transformers library, consider setting "
"`trust_remote_code=True` in LLM or using the "
"`--trust-remote-code` flag in the CLI."
)
raise
RuntimeError
(
err_msg
)
from
e
else
:
raise
e
elif
config_format
==
ConfigFormat
.
MISTRAL
:
config
=
load_params_config
(
model
,
revision
)
else
:
raise
ValueError
(
f
"Unsupported config format:
{
config_format
}
"
)
# Special architecture mapping check for GGUF models
# Special architecture mapping check for GGUF models
if
is_gguf
:
if
is_gguf
:
...
@@ -94,16 +151,81 @@ def get_config(
...
@@ -94,16 +151,81 @@ def get_config(
model_type
=
MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
[
config
.
model_type
]
model_type
=
MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
[
config
.
model_type
]
config
.
update
({
"architectures"
:
[
model_type
]})
config
.
update
({
"architectures"
:
[
model_type
]})
for
key
,
value
in
[(
"rope_scaling"
,
rope_scaling
),
for
key
,
value
in
[
(
"rope_theta"
,
rope_theta
)]:
(
"rope_scaling"
,
rope_scaling
),
(
"rope_theta"
,
rope_theta
),
]:
if
value
is
not
None
:
if
value
is
not
None
:
logger
.
info
(
"Updating %s from %r to %r"
,
key
,
logger
.
info
(
getattr
(
config
,
key
,
None
),
value
)
"Updating %s from %r to %r"
,
key
,
getattr
(
config
,
key
,
None
),
value
,
)
config
.
update
({
key
:
value
})
config
.
update
({
key
:
value
})
return
config
return
config
def load_params_config(model, revision) -> PretrainedConfig:
    """Load a Mistral-format ``params.json`` as a ``PretrainedConfig``.

    Args:
        model: Local directory containing ``params.json``, or a repo id
            passed to ``hf_hub_download`` when no such local file exists.
        revision: Revision forwarded to ``hf_hub_download``.

    Returns:
        A ``PretrainedConfig`` built from the file, with Mistral key names
        renamed per ``config_mapping`` and nested dicts converted to
        ``PretrainedConfig`` objects.
    """
    # This function loads a params.json config which
    # should be used when loading models in mistral format

    config_file_name = "params.json"

    config_path = Path(model) / config_file_name

    if not config_path.is_file():
        config_path = Path(
            hf_hub_download(model, config_file_name, revision=revision))

    # JSON is UTF-8; do not depend on the platform's locale encoding.
    with open(config_path, encoding="utf-8") as file:
        config_dict = json.load(file)

    config_mapping = {
        "dim": "hidden_size",
        "norm_eps": "rms_norm_eps",
        "n_kv_heads": "num_key_value_heads",
        "n_layers": "num_hidden_layers",
        "n_heads": "num_attention_heads",
        "hidden_dim": "intermediate_size",
    }

    def recurse_elems(elem: Any):
        # Rename keys and wrap every dict level in a PretrainedConfig;
        # non-dict values pass through unchanged.
        if isinstance(elem, dict):
            renamed = {
                config_mapping.get(key, key): recurse_elems(value)
                for key, value in elem.items()
            }
            return PretrainedConfig(**renamed)
        return elem

    # Defaults for fields the params file may omit.
    config_dict["model_type"] = config_dict.get("model_type", "transformer")
    config_dict["hidden_act"] = config_dict.get("activation", "silu")
    config_dict["tie_word_embeddings"] = config_dict.get(
        "tie_embeddings", False)
    config_dict["max_seq_len"] = config_dict.get("max_seq_len", 128_000)

    if config_dict.get("moe") is not None:
        config_dict["architectures"] = ["MixtralForCausalLM"]
    else:
        config_dict["architectures"] = ["MistralForCausalLM"]

    if config_dict.get("vision_encoder") is not None:
        # Split into text and vision sub-configs under a new top-level dict.
        multimodal_config = config_dict.pop("vision_encoder")

        config_dict = {
            "text_config": config_dict,
            "vision_config": multimodal_config
        }
        config_dict["architectures"] = ["PixtralForConditionalGeneration"]
        config_dict["model_type"] = "pixtral"

    config = recurse_elems(config_dict)
    return config
def
get_hf_image_processor_config
(
def
get_hf_image_processor_config
(
model
:
Union
[
str
,
Path
],
model
:
Union
[
str
,
Path
],
revision
:
Optional
[
str
]
=
None
,
revision
:
Optional
[
str
]
=
None
,
...
@@ -120,7 +242,7 @@ def get_hf_image_processor_config(
...
@@ -120,7 +242,7 @@ def get_hf_image_processor_config(
def
get_hf_text_config
(
config
:
PretrainedConfig
):
def
get_hf_text_config
(
config
:
PretrainedConfig
):
"""Get the "sub" config relevant to llm for multi modal models.
"""Get the "sub" config relevant to llm for multi modal models.
No op for pure text models.
No op for pure text models.
"""
"""
if
hasattr
(
config
,
"text_config"
):
if
hasattr
(
config
,
"text_config"
):
# The code operates under the assumption that text_config should have
# The code operates under the assumption that text_config should have
...
...
vllm/transformers_utils/configs/__init__.py
View file @
4851c202
...
@@ -6,6 +6,7 @@ from vllm.transformers_utils.configs.exaone import ExaoneConfig
...
@@ -6,6 +6,7 @@ from vllm.transformers_utils.configs.exaone import ExaoneConfig
# tiiuae/falcon-7b(-instruct) models. Newer Falcon models will use the
# tiiuae/falcon-7b(-instruct) models. Newer Falcon models will use the
# `FalconConfig` class from the official HuggingFace transformers library.
# `FalconConfig` class from the official HuggingFace transformers library.
from
vllm.transformers_utils.configs.falcon
import
RWConfig
from
vllm.transformers_utils.configs.falcon
import
RWConfig
from
vllm.transformers_utils.configs.granite
import
GraniteConfig
from
vllm.transformers_utils.configs.internvl
import
InternVLChatConfig
from
vllm.transformers_utils.configs.internvl
import
InternVLChatConfig
from
vllm.transformers_utils.configs.jais
import
JAISConfig
from
vllm.transformers_utils.configs.jais
import
JAISConfig
from
vllm.transformers_utils.configs.medusa
import
MedusaConfig
from
vllm.transformers_utils.configs.medusa
import
MedusaConfig
...
@@ -27,4 +28,7 @@ __all__ = [
...
@@ -27,4 +28,7 @@ __all__ = [
"MLPSpeculatorConfig"
,
"MLPSpeculatorConfig"
,
"NemotronConfig"
,
"NemotronConfig"
,
"UltravoxConfig"
,
"UltravoxConfig"
,
# Granite can be removed from here once we have upgraded to
# transformers 4.45+
"GraniteConfig"
,
]
]
vllm/transformers_utils/image_processor.py
View file @
4851c202
from
typing
import
cast
from
typing
import
cast
def get_video_processor(
    processor_name: str,
    trust_remote_code: bool = False,
):
    """
    Gets a processor for the given model name via HuggingFace.

    Args:
        processor_name: Model name or path given to
            ``AutoProcessor.from_pretrained``.
        trust_remote_code: Forwarded to ``AutoProcessor.from_pretrained``;
            also selects which error is raised on failure.

    Returns:
        The ``video_processor`` attribute of the loaded processor.

    Raises:
        RuntimeError: if loading raises ``ValueError`` and
            ``trust_remote_code`` is False (the message suggests enabling
            it).
        ValueError: re-raised unchanged when ``trust_remote_code`` is True.
    """
    from transformers import AutoProcessor

    try:
        # Forward the flag so that trust_remote_code=True takes effect.
        processor = AutoProcessor.from_pretrained(
            processor_name, trust_remote_code=trust_remote_code)
        video_processor = processor.video_processor
    except ValueError as e:
        if not trust_remote_code:
            err_msg = (
                "Failed to load the processor. If the processor is "
                "a custom processor not yet available in the HuggingFace "
                "transformers library, consider setting "
                "`trust_remote_code=True` in LLM or using the "
                "`--trust-remote-code` flag in the CLI.")
            raise RuntimeError(err_msg) from e
        else:
            raise
    return video_processor
def
get_image_processor
(
def
get_image_processor
(
processor_name
:
str
,
processor_name
:
str
,
*
args
,
*
args
,
...
...
vllm/transformers_utils/processor.py
0 → 100644
View file @
4851c202
from
typing
import
cast
def get_processor(
    processor_name: str,
    *args,
    trust_remote_code: bool = False,
    **kwargs,
):
    """Load the HuggingFace processor registered for ``processor_name``.

    Positional and keyword extras are passed straight through to
    ``AutoProcessor.from_pretrained``. A ``ValueError`` from loading is
    converted into a ``RuntimeError`` with a hint when
    ``trust_remote_code`` is False, and re-raised as is otherwise.
    """
    # Keep these imports local: importing transformers at module level
    # will call torch.cuda.device_count()
    from transformers import AutoProcessor
    from transformers.processing_utils import ProcessorMixin

    try:
        loaded = AutoProcessor.from_pretrained(
            processor_name,
            *args,
            trust_remote_code=trust_remote_code,
            **kwargs)
    except ValueError as exc:
        # AutoProcessor (unlike AutoTokenizer) does not separate
        # "processor class missing / not imported" errors from others, so
        # every ValueError gets the --trust-remote-code suggestion.
        if trust_remote_code:
            raise exc
        hint = (
            "Failed to load the processor. If the processor is "
            "a custom processor not yet available in the HuggingFace "
            "transformers library, consider setting "
            "`trust_remote_code=True` in LLM or using the "
            "`--trust-remote-code` flag in the CLI.")
        raise RuntimeError(hint) from exc

    return cast(ProcessorMixin, loaded)
vllm/transformers_utils/tokenizers/mistral.py
View file @
4851c202
...
@@ -16,7 +16,7 @@ from mistral_common.tokens.tokenizers.tekken import (SpecialTokenPolicy,
...
@@ -16,7 +16,7 @@ from mistral_common.tokens.tokenizers.tekken import (SpecialTokenPolicy,
Tekkenizer
)
Tekkenizer
)
if
TYPE_CHECKING
:
if
TYPE_CHECKING
:
from
vllm.entrypoints.chat_utils
import
C
onversa
tionMessage
from
vllm.entrypoints.chat_utils
import
C
hatComple
tionMessage
Param
@
dataclass
@
dataclass
...
@@ -45,26 +45,25 @@ class MistralTokenizer:
...
@@ -45,26 +45,25 @@ class MistralTokenizer:
def
__init__
(
self
,
tokenizer
:
PublicMistralTokenizer
)
->
None
:
def
__init__
(
self
,
tokenizer
:
PublicMistralTokenizer
)
->
None
:
self
.
mistral
=
tokenizer
self
.
mistral
=
tokenizer
self
.
instruct
=
tokenizer
.
instruct_tokenizer
self
.
instruct
=
tokenizer
.
instruct_tokenizer
self
.
tokenizer
=
tokenizer
.
instruct_tokenizer
.
tokenizer
self
.
vocab_size
=
len
(
self
.
tokenizer
.
vocab
())
tokenizer_
=
tokenizer
.
instruct_tokenizer
.
tokenizer
if
isinstance
(
tokenizer_
,
Tekkenizer
):
assert
isinstance
(
self
.
tokenizer
,
(
Tekkenizer
,
SentencePieceTokenizer
)),
type
(
self
.
tokenizer
)
if
(
is_tekken
:
=
isinstance
(
self
.
tokenizer
,
Tekkenizer
)):
# Make sure special tokens will not raise
# Make sure special tokens will not raise
self
.
tokenizer
.
special_token_policy
=
SpecialTokenPolicy
.
IGNORE
tokenizer_
.
special_token_policy
=
SpecialTokenPolicy
.
IGNORE
self
.
_is_tekken
=
is_tekken
self
.
_vocab
=
{
token
:
idx
for
idx
,
token
in
enumerate
(
tokenizer_
.
vocab
())
}
elif
isinstance
(
tokenizer_
,
SentencePieceTokenizer
):
self
.
_vocab
=
{
token
:
idx
for
idx
,
token
in
enumerate
(
tokenizer_
.
vocab
())
}
else
:
raise
TypeError
(
f
"Unsupported tokenizer:
{
type
(
tokenizer_
)
}
"
)
# the following attributes are set to fit VLLM's design
self
.
tokenizer
=
tokenizer_
self
.
is_fast
=
True
self
.
chat_template
=
True
self
.
all_special_ids
:
List
[
Any
]
=
[]
self
.
all_special_tokens
:
List
[
Any
]
=
[]
self
.
all_special_tokens_extended
:
List
[
Any
]
=
[]
@
classmethod
@
classmethod
def
from_pretrained
(
cls
,
def
from_pretrained
(
cls
,
...
@@ -102,6 +101,38 @@ class MistralTokenizer:
...
@@ -102,6 +101,38 @@ class MistralTokenizer:
revision
=
revision
)
revision
=
revision
)
return
tokenizer_file
return
tokenizer_file
# the following attributes are set to fit VLLM's design
@
property
def
all_special_tokens_extended
(
self
)
->
List
[
str
]:
return
[]
@
property
def
all_special_tokens
(
self
)
->
List
[
str
]:
return
[]
@
property
def
all_special_ids
(
self
)
->
List
[
int
]:
return
[]
@
property
def
bos_token_id
(
self
)
->
int
:
return
self
.
tokenizer
.
bos_id
@
property
def
eos_token_id
(
self
)
->
int
:
return
self
.
tokenizer
.
eos_id
@
property
def
is_fast
(
self
)
->
bool
:
return
True
@
property
def
vocab_size
(
self
)
->
int
:
return
len
(
self
.
_vocab
)
def
__len__
(
self
)
->
int
:
return
self
.
vocab_size
def
__call__
(
def
__call__
(
self
,
self
,
prompt
:
str
,
prompt
:
str
,
...
@@ -117,31 +148,34 @@ class MistralTokenizer:
...
@@ -117,31 +148,34 @@ class MistralTokenizer:
return
Encoding
(
input_ids
=
input_ids
)
return
Encoding
(
input_ids
=
input_ids
)
def
get_added_vocab
(
self
)
->
List
[
str
]:
def
get_vocab
(
self
)
->
Dict
[
str
,
int
]:
return
self
.
_vocab
def
get_added_vocab
(
self
)
->
Dict
[
str
,
int
]:
# Mistral tokenizers have no added vocabulary
# Mistral tokenizers have no added vocabulary
return
[]
return
{}
def
encode
(
self
,
prompt
:
str
)
->
List
[
int
]:
def
encode
(
self
,
prompt
:
str
)
->
List
[
int
]:
# `encode
` should only be used for prompt completion
# `encode` should only be used for prompt completion
# it should never be used for chat_completion.
# it should never be used for chat_completion.
# For chat completion use `apply_chat_template`
# For chat completion use `apply_chat_template`
return
self
.
tokenizer
.
encode
(
prompt
,
bos
=
True
,
eos
=
False
)
return
self
.
tokenizer
.
encode
(
prompt
,
bos
=
True
,
eos
=
False
)
def
apply_chat_template
(
self
,
def
apply_chat_template
(
self
,
conversation
:
List
[
"Conversa
tionMessage"
],
messages
:
List
[
"ChatComple
tionMessage
Param
"
],
tools
:
Optional
[
Dict
[
str
,
Any
]]
=
None
,
tools
:
Optional
[
Dict
[
str
,
Any
]]
=
None
,
**
kwargs
)
->
List
[
int
]:
**
kwargs
)
->
List
[
int
]:
assert
tools
is
None
,
"`tools` are not yet supported."
assert
tools
is
None
,
"`tools` are not yet supported."
request
=
ChatCompletionRequest
(
request
=
ChatCompletionRequest
(
messages
=
conversation
)
# type: ignore[type-var]
messages
=
messages
)
# type: ignore[type-var]
encoded
=
self
.
mistral
.
encode_chat_completion
(
request
)
encoded
=
self
.
mistral
.
encode_chat_completion
(
request
)
# encode-decode to get clean prompt
# encode-decode to get clean prompt
return
encoded
.
tokens
return
encoded
.
tokens
def
convert_tokens_to_string
(
self
,
tokens
:
List
[
str
])
->
str
:
def
convert_tokens_to_string
(
self
,
tokens
:
List
[
str
])
->
str
:
if
self
.
_is_tekken
:
if
isinstance
(
self
.
tokenizer
,
Tekkenizer
)
:
return
""
.
join
(
tokens
)
return
""
.
join
(
tokens
)
else
:
else
:
return
self
.
tokenizer
.
decode
(
tokens
)
# type: ignore[arg-type]
return
self
.
tokenizer
.
decode
(
tokens
)
# type: ignore[arg-type]
...
@@ -151,14 +185,11 @@ class MistralTokenizer:
...
@@ -151,14 +185,11 @@ class MistralTokenizer:
ids
=
[
ids
]
ids
=
[
ids
]
return
self
.
tokenizer
.
decode
(
ids
)
return
self
.
tokenizer
.
decode
(
ids
)
@
property
def
eos_token_id
(
self
):
return
self
.
tokenizer
.
eos_id
def
convert_ids_to_tokens
(
def
convert_ids_to_tokens
(
self
,
self
,
ids
:
List
[
int
],
ids
:
List
[
int
],
skip_special_tokens
:
Optional
[
bool
]
=
True
)
->
List
[
str
]:
skip_special_tokens
:
bool
=
True
,
)
->
List
[
str
]:
# TODO(Patrick) - potentially allow special tokens to not be skipped
# TODO(Patrick) - potentially allow special tokens to not be skipped
assert
(
assert
(
skip_special_tokens
skip_special_tokens
...
@@ -170,6 +201,3 @@ class MistralTokenizer:
...
@@ -170,6 +201,3 @@ class MistralTokenizer:
tokens
=
[
self
.
tokenizer
.
id_to_piece
(
id
)
for
id
in
ids
]
tokens
=
[
self
.
tokenizer
.
id_to_piece
(
id
)
for
id
in
ids
]
return
tokens
return
tokens
def
__len__
(
self
):
return
self
.
vocab_size
vllm/utils.py
View file @
4851c202
...
@@ -1224,3 +1224,28 @@ async def _run_task_with_lock(task: Callable, lock: asyncio.Lock, *args,
...
@@ -1224,3 +1224,28 @@ async def _run_task_with_lock(task: Callable, lock: asyncio.Lock, *args,
def
supports_dynamo
()
->
bool
:
def
supports_dynamo
()
->
bool
:
base_torch_version
=
Version
(
Version
(
torch
.
__version__
).
base_version
)
base_torch_version
=
Version
(
Version
(
torch
.
__version__
).
base_version
)
return
base_torch_version
>=
Version
(
"2.4.0"
)
return
base_torch_version
>=
Version
(
"2.4.0"
)
class AtomicCounter:
    """A counter whose increments and decrements are guarded by a lock."""

    def __init__(self, initial=0):
        """Create the counter starting at ``initial`` with its own lock."""
        self._lock = threading.Lock()
        self._value = initial

    def inc(self, num=1):
        """Add ``num`` to the counter under the lock; return the new value."""
        with self._lock:
            updated = self._value + num
            self._value = updated
            return updated

    def dec(self, num=1):
        """Subtract ``num`` under the lock; return the new value."""
        with self._lock:
            updated = self._value - num
            self._value = updated
            return updated

    @property
    def value(self):
        """Current value of the counter (read without taking the lock)."""
        return self._value
vllm/version.py
View file @
4851c202
...
@@ -9,4 +9,4 @@ except Exception as e:
...
@@ -9,4 +9,4 @@ except Exception as e:
stacklevel
=
2
)
stacklevel
=
2
)
__commit__
=
"COMMIT_HASH_PLACEHOLDER"
__commit__
=
"COMMIT_HASH_PLACEHOLDER"
__version__
=
"0.6.
0
"
__version__
=
"0.6.
1
"
vllm/worker/cpu_worker.py
View file @
4851c202
...
@@ -207,7 +207,8 @@ class CPUWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase):
...
@@ -207,7 +207,8 @@ class CPUWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase):
def
init_device
(
self
)
->
None
:
def
init_device
(
self
)
->
None
:
if
self
.
local_omp_cpuid
!=
"all"
:
if
self
.
local_omp_cpuid
!=
"all"
:
torch
.
ops
.
_C_utils
.
init_cpu_threads_env
(
self
.
local_omp_cpuid
)
ret
=
torch
.
ops
.
_C_utils
.
init_cpu_threads_env
(
self
.
local_omp_cpuid
)
logger
.
info
(
ret
)
self
.
init_distributed_environment
()
self
.
init_distributed_environment
()
# Set random seed.
# Set random seed.
...
...
vllm/worker/model_runner.py
View file @
4851c202
...
@@ -30,6 +30,7 @@ from vllm.lora.layers import LoRAMapping
...
@@ -30,6 +30,7 @@ from vllm.lora.layers import LoRAMapping
from
vllm.lora.request
import
LoRARequest
from
vllm.lora.request
import
LoRARequest
from
vllm.lora.worker_manager
import
LRUCacheWorkerLoRAManager
from
vllm.lora.worker_manager
import
LRUCacheWorkerLoRAManager
from
vllm.model_executor
import
SamplingMetadata
,
SamplingMetadataCache
from
vllm.model_executor
import
SamplingMetadata
,
SamplingMetadataCache
from
vllm.model_executor.layers.rotary_embedding
import
MRotaryEmbedding
from
vllm.model_executor.layers.sampler
import
SamplerOutput
from
vllm.model_executor.layers.sampler
import
SamplerOutput
from
vllm.model_executor.model_loader
import
get_model
from
vllm.model_executor.model_loader
import
get_model
from
vllm.model_executor.model_loader.tensorizer
import
TensorizerConfig
from
vllm.model_executor.model_loader.tensorizer
import
TensorizerConfig
...
@@ -74,6 +75,10 @@ _NUM_WARMUP_ITERS = 2
...
@@ -74,6 +75,10 @@ _NUM_WARMUP_ITERS = 2
TModelInputForGPU
=
TypeVar
(
'TModelInputForGPU'
,
bound
=
"ModelInputForGPU"
)
TModelInputForGPU
=
TypeVar
(
'TModelInputForGPU'
,
bound
=
"ModelInputForGPU"
)
# For now, bump up cache limits for recompilations during CUDA graph warmups.
torch
.
_dynamo
.
config
.
cache_size_limit
=
128
torch
.
_dynamo
.
config
.
accumulated_cache_size_limit
=
128
@
dataclass
(
frozen
=
True
)
@
dataclass
(
frozen
=
True
)
class
ModelInputForGPU
(
ModelRunnerInputBase
):
class
ModelInputForGPU
(
ModelRunnerInputBase
):
...
@@ -181,6 +186,7 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
...
@@ -181,6 +186,7 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
def
simple_reinit
(
self
):
def
simple_reinit
(
self
):
self
.
input_tokens
[
0
].
clear
()
# type: ignore
self
.
input_tokens
[
0
].
clear
()
# type: ignore
self
.
input_positions
[
0
].
clear
()
# type: ignore
self
.
input_positions
[
0
].
clear
()
# type: ignore
self
.
mrope_input_positions
=
None
# type: ignore
self
.
seq_lens
[
0
]
=
0
# type: ignore
self
.
seq_lens
[
0
]
=
0
# type: ignore
self
.
orig_seq_lens
[
0
]
=
0
# type: ignore
self
.
orig_seq_lens
[
0
]
=
0
# type: ignore
self
.
query_lens
[
0
]
=
0
# type: ignore
self
.
query_lens
[
0
]
=
0
# type: ignore
...
@@ -206,6 +212,7 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
...
@@ -206,6 +212,7 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
# Input tokens and positions.
# Input tokens and positions.
input_tokens
:
Optional
[
List
[
List
[
int
]]]
=
None
,
input_tokens
:
Optional
[
List
[
List
[
int
]]]
=
None
,
input_positions
:
Optional
[
List
[
List
[
int
]]]
=
None
,
input_positions
:
Optional
[
List
[
List
[
int
]]]
=
None
,
mrope_input_positions
:
Optional
[
List
[
List
[
List
[
int
]]]]
=
None
,
# The sequence length (may be capped to the sliding window).
# The sequence length (may be capped to the sliding window).
seq_lens
:
Optional
[
List
[
int
]]
=
None
,
seq_lens
:
Optional
[
List
[
int
]]
=
None
,
...
@@ -266,6 +273,8 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
...
@@ -266,6 +273,8 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
for
seq_id
in
range
(
len
(
self
.
seq_ids
)):
for
seq_id
in
range
(
len
(
self
.
seq_ids
)):
self
.
input_positions
[
seq_id
].
clear
()
self
.
input_positions
[
seq_id
].
clear
()
self
.
mrope_input_positions
=
None
if
seq_lens
:
if
seq_lens
:
self
.
seq_lens
=
seq_lens
self
.
seq_lens
=
seq_lens
else
:
else
:
...
@@ -327,6 +336,7 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
...
@@ -327,6 +336,7 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
else
:
else
:
self
.
input_tokens
=
input_tokens
or
[]
self
.
input_tokens
=
input_tokens
or
[]
self
.
input_positions
=
input_positions
or
[]
self
.
input_positions
=
input_positions
or
[]
self
.
mrope_input_positions
=
mrope_input_positions
or
None
self
.
seq_lens
=
seq_lens
or
[]
self
.
seq_lens
=
seq_lens
or
[]
self
.
orig_seq_lens
=
orig_seq_lens
or
[]
self
.
orig_seq_lens
=
orig_seq_lens
or
[]
self
.
query_lens
=
query_lens
or
[]
self
.
query_lens
=
query_lens
or
[]
...
@@ -357,6 +367,7 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
...
@@ -357,6 +367,7 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
self
.
input_tokens
=
[[]
for
_
in
range
(
self
.
n_seqs
)]
self
.
input_tokens
=
[[]
for
_
in
range
(
self
.
n_seqs
)]
self
.
input_positions
=
[[]
for
_
in
range
(
self
.
n_seqs
)]
self
.
input_positions
=
[[]
for
_
in
range
(
self
.
n_seqs
)]
self
.
mrope_input_positions
=
None
self
.
seq_lens
=
[
0
]
*
self
.
n_seqs
self
.
seq_lens
=
[
0
]
*
self
.
n_seqs
self
.
orig_seq_lens
=
[
0
]
*
self
.
n_seqs
self
.
orig_seq_lens
=
[
0
]
*
self
.
n_seqs
self
.
query_lens
=
[
0
]
*
self
.
n_seqs
self
.
query_lens
=
[
0
]
*
self
.
n_seqs
...
@@ -493,6 +504,17 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
...
@@ -493,6 +504,17 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
inter_data
.
query_lens
[
inter_data
.
query_lens
[
seq_idx
]
=
seq_len
-
context_len
if
inter_data
.
is_prompt
else
1
seq_idx
]
=
seq_len
-
context_len
if
inter_data
.
is_prompt
else
1
if
seq_data
.
mrope_position_delta
is
not
None
:
if
inter_data
.
mrope_input_positions
is
None
:
inter_data
.
mrope_input_positions
=
[
None
]
*
inter_data
.
n_seqs
inter_data
.
mrope_input_positions
[
seq_idx
]
=
MRotaryEmbedding
.
get_next_input_positions
(
seq_data
.
mrope_position_delta
,
context_len
,
seq_len
,
)
def
_compute_for_prefix_cache_hit
(
def
_compute_for_prefix_cache_hit
(
self
,
inter_data
:
InterDataForSeqGroup
,
seq_idx
:
int
,
self
,
inter_data
:
InterDataForSeqGroup
,
seq_idx
:
int
,
seq_group_metadata
:
SequenceGroupMetadata
):
seq_group_metadata
:
SequenceGroupMetadata
):
...
@@ -636,6 +658,40 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
...
@@ -636,6 +658,40 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
mm_kwargs
=
self
.
multi_modal_input_mapper
(
mm_data
)
mm_kwargs
=
self
.
multi_modal_input_mapper
(
mm_data
)
inter_data
.
multi_modal_inputs
=
mm_kwargs
inter_data
.
multi_modal_inputs
=
mm_kwargs
# special processing for mrope position deltas.
if
self
.
runner
.
model_is_mrope
:
image_grid_thw
=
mm_kwargs
.
get
(
"image_grid_thw"
,
None
)
video_grid_thw
=
mm_kwargs
.
get
(
"video_grid_thw"
,
None
)
assert
image_grid_thw
is
not
None
or
video_grid_thw
is
not
None
,
(
"mrope embedding type requires multi-modal input mapper "
"returns 'image_grid_thw' or 'video_grid_thw'."
)
hf_config
=
self
.
runner
.
model_config
.
hf_config
inter_data
.
mrope_input_positions
=
[
None
]
*
inter_data
.
n_seqs
for
seq_idx
in
range
(
inter_data
.
n_seqs
):
seq_data
=
seq_group_metadata
.
seq_data
[
inter_data
.
seq_ids
[
seq_idx
]]
token_ids
=
seq_data
.
get_token_ids
()
mrope_input_positions
,
mrope_position_delta
=
\
MRotaryEmbedding
.
get_input_positions
(
token_ids
,
image_grid_thw
=
image_grid_thw
,
video_grid_thw
=
video_grid_thw
,
image_token_id
=
hf_config
.
image_token_id
,
video_token_id
=
hf_config
.
video_token_id
,
vision_start_token_id
=
hf_config
.
vision_start_token_id
,
vision_end_token_id
=
hf_config
.
vision_end_token_id
,
spatial_merge_size
=
hf_config
.
vision_config
.
spatial_merge_size
,
context_len
=
inter_data
.
context_lens
[
seq_idx
],
)
seq_data
.
mrope_position_delta
=
mrope_position_delta
inter_data
.
mrope_input_positions
[
seq_idx
]
=
mrope_input_positions
def
add_seq_group
(
self
,
seq_group_metadata
:
SequenceGroupMetadata
):
def
add_seq_group
(
self
,
seq_group_metadata
:
SequenceGroupMetadata
):
"""Add a sequence group to the builder."""
"""Add a sequence group to the builder."""
seq_ids
=
seq_group_metadata
.
seq_data
.
keys
()
seq_ids
=
seq_group_metadata
.
seq_data
.
keys
()
...
@@ -684,10 +740,27 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
...
@@ -684,10 +740,27 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
# prefix caching and there is no decode request.
# prefix caching and there is no decode request.
return
self
.
model_input_cls
()
return
self
.
model_input_cls
()
input_positions
=
[]
mrope_input_positions
:
Optional
[
List
[
List
[
int
]]]
=
None
for
inter_data
in
self
.
inter_data_list
:
if
any
(
inter_data
.
mrope_input_positions
is
not
None
for
cur_input_positions
in
inter_data
.
input_positions
:
for
inter_data
in
self
.
inter_data_list
):
input_positions
.
extend
(
cur_input_positions
)
mrope_input_positions
=
[[]
for
_
in
range
(
3
)]
for
idx
in
range
(
3
):
for
inter_data
in
self
.
inter_data_list
:
msections
=
inter_data
.
mrope_input_positions
if
msections
is
None
:
for
_seq_input_positions
in
inter_data
.
input_positions
:
mrope_input_positions
[
idx
].
extend
(
_seq_input_positions
)
else
:
for
_seq_mrope_input_positions
in
msections
:
mrope_input_positions
[
idx
].
extend
(
_seq_mrope_input_positions
[
idx
])
input_positions
=
None
else
:
input_positions
=
[]
for
inter_data
in
self
.
inter_data_list
:
for
cur_input_positions
in
inter_data
.
input_positions
:
input_positions
.
extend
(
cur_input_positions
)
seq_lens
=
[]
seq_lens
=
[]
max_decode_seq_len
=
0
max_decode_seq_len
=
0
...
@@ -724,15 +797,24 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
...
@@ -724,15 +797,24 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
# Tokens and positions.
# Tokens and positions.
if
cuda_graph_pad_size
:
if
cuda_graph_pad_size
:
input_tokens
.
extend
(
itertools
.
repeat
(
0
,
cuda_graph_pad_size
))
input_tokens
.
extend
(
itertools
.
repeat
(
0
,
cuda_graph_pad_size
))
input_positions
.
extend
(
itertools
.
repeat
(
0
,
cuda_graph_pad_size
))
assert
self
.
runner
.
device
is
not
None
assert
self
.
runner
.
device
is
not
None
input_tokens_tensor
=
async_tensor_h2d
(
input_tokens
,
torch
.
long
,
input_tokens_tensor
=
async_tensor_h2d
(
input_tokens
,
torch
.
long
,
self
.
runner
.
device
,
self
.
runner
.
device
,
self
.
runner
.
pin_memory
)
self
.
runner
.
pin_memory
)
input_positions_tensor
=
async_tensor_h2d
(
input_positions
,
torch
.
long
,
if
mrope_input_positions
is
not
None
:
self
.
runner
.
device
,
for
idx
in
range
(
3
):
self
.
runner
.
pin_memory
)
mrope_input_positions
[
idx
].
extend
(
itertools
.
repeat
(
0
,
cuda_graph_pad_size
))
input_positions_tensor
=
async_tensor_h2d
(
mrope_input_positions
,
torch
.
long
,
self
.
runner
.
device
,
self
.
runner
.
pin_memory
)
else
:
input_positions
.
extend
(
itertools
.
repeat
(
0
,
cuda_graph_pad_size
))
input_positions_tensor
=
async_tensor_h2d
(
input_positions
,
torch
.
long
,
self
.
runner
.
device
,
self
.
runner
.
pin_memory
)
# Sequence and query lengths.
# Sequence and query lengths.
if
cuda_graph_pad_size
:
if
cuda_graph_pad_size
:
seq_lens
.
extend
(
itertools
.
repeat
(
1
,
cuda_graph_pad_size
))
seq_lens
.
extend
(
itertools
.
repeat
(
1
,
cuda_graph_pad_size
))
...
@@ -982,9 +1064,10 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
...
@@ -982,9 +1064,10 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
"This may lead to less accurate results!"
)
"This may lead to less accurate results!"
)
if
envs
.
VLLM_TEST_DYNAMO_GRAPH_CAPTURE
and
supports_dynamo
():
if
envs
.
VLLM_TEST_DYNAMO_GRAPH_CAPTURE
and
supports_dynamo
():
self
.
model
=
torch
.
compile
(
self
.
model
,
self
.
model
=
torch
.
compile
(
fullgraph
=
True
,
self
.
model
,
backend
=
"eager"
)
fullgraph
=
envs
.
VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE
,
backend
=
"eager"
)
def
save_sharded_state
(
def
save_sharded_state
(
self
,
self
,
...
@@ -1226,6 +1309,15 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
...
@@ -1226,6 +1309,15 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
raise
RuntimeError
(
"PromptAdapter is not enabled."
)
raise
RuntimeError
(
"PromptAdapter is not enabled."
)
return
self
.
prompt_adapter_manager
.
list_adapters
()
return
self
.
prompt_adapter_manager
.
list_adapters
()
@
property
def
model_is_mrope
(
self
)
->
bool
:
"""Detect if the model has "mrope" rope_scaling type.
mrope requires keep "rope_deltas" between prompt and decoding phases."""
rope_scaling
=
getattr
(
self
.
model_config
.
hf_config
,
"rope_scaling"
,
{})
if
rope_scaling
is
None
:
return
False
return
rope_scaling
.
get
(
"type"
,
None
)
==
"mrope"
@
torch
.
inference_mode
()
@
torch
.
inference_mode
()
def
capture_model
(
self
,
kv_caches
:
List
[
List
[
torch
.
Tensor
]])
->
None
:
def
capture_model
(
self
,
kv_caches
:
List
[
List
[
torch
.
Tensor
]])
->
None
:
"""Cuda graph capture a model.
"""Cuda graph capture a model.
...
@@ -1256,7 +1348,8 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
...
@@ -1256,7 +1348,8 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
max_batch_size
=
self
.
max_batchsize_to_capture
max_batch_size
=
self
.
max_batchsize_to_capture
input_tokens
=
torch
.
zeros
(
max_batch_size
,
dtype
=
torch
.
long
).
cuda
()
input_tokens
=
torch
.
zeros
(
max_batch_size
,
dtype
=
torch
.
long
).
cuda
()
input_positions
=
torch
.
zeros
(
max_batch_size
,
dtype
=
torch
.
long
).
cuda
()
input_positions
=
torch
.
zeros
(
max_batch_size
,
dtype
=
torch
.
long
).
cuda
()
if
self
.
model_is_mrope
:
input_positions
=
torch
.
tile
(
input_positions
,
(
3
,
1
))
# Prepare dummy previous_hidden_states only if needed by the model.
# Prepare dummy previous_hidden_states only if needed by the model.
# This is used by draft models such as EAGLE.
# This is used by draft models such as EAGLE.
previous_hidden_states
=
None
previous_hidden_states
=
None
...
@@ -1320,7 +1413,7 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
...
@@ -1320,7 +1413,7 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
"input_ids"
:
"input_ids"
:
input_tokens
[:
batch_size
],
input_tokens
[:
batch_size
],
"positions"
:
"positions"
:
input_positions
[:
batch_size
],
input_positions
[
...,
:
batch_size
],
"hidden_or_intermediate_states"
:
"hidden_or_intermediate_states"
:
hidden_or_intermediate_states
[
hidden_or_intermediate_states
[
virtual_engine
]
# type: ignore
virtual_engine
]
# type: ignore
...
...
Prev
1
…
6
7
8
9
10
11
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment