Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
bbf55c48
Unverified
Commit
bbf55c48
authored
Aug 17, 2024
by
Roger Wang
Committed by
GitHub
Aug 17, 2024
Browse files
[VLM] Refactor `MultiModalConfig` initialization and profiling (#7530)
parent
1ef13cf9
Changes
29
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
109 additions
and
129 deletions
+109
-129
tests/entrypoints/openai/test_audio.py
tests/entrypoints/openai/test_audio.py
+6
-2
tests/multimodal/test_mapper.py
tests/multimodal/test_mapper.py
+9
-9
vllm/config.py
vllm/config.py
+33
-5
vllm/engine/arg_utils.py
vllm/engine/arg_utils.py
+3
-6
vllm/engine/llm_engine.py
vllm/engine/llm_engine.py
+1
-6
vllm/executor/cpu_executor.py
vllm/executor/cpu_executor.py
+0
-1
vllm/executor/executor_base.py
vllm/executor/executor_base.py
+2
-4
vllm/executor/gpu_executor.py
vllm/executor/gpu_executor.py
+0
-1
vllm/executor/openvino_executor.py
vllm/executor/openvino_executor.py
+0
-1
vllm/executor/ray_xpu_executor.py
vllm/executor/ray_xpu_executor.py
+2
-6
vllm/executor/tpu_executor.py
vllm/executor/tpu_executor.py
+0
-1
vllm/executor/xpu_executor.py
vllm/executor/xpu_executor.py
+2
-5
vllm/inputs/registry.py
vllm/inputs/registry.py
+1
-15
vllm/model_executor/model_loader/__init__.py
vllm/model_executor/model_loader/__init__.py
+1
-4
vllm/model_executor/model_loader/loader.py
vllm/model_executor/model_loader/loader.py
+14
-32
vllm/model_executor/models/__init__.py
vllm/model_executor/models/__init__.py
+27
-16
vllm/multimodal/registry.py
vllm/multimodal/registry.py
+2
-2
vllm/spec_decode/draft_model_runner.py
vllm/spec_decode/draft_model_runner.py
+2
-4
vllm/spec_decode/target_model_runner.py
vllm/spec_decode/target_model_runner.py
+2
-4
vllm/worker/cpu_model_runner.py
vllm/worker/cpu_model_runner.py
+2
-5
No files found.
tests/entrypoints/openai/test_audio.py
View file @
bbf55c48
...
...
@@ -86,8 +86,12 @@ def server_function(port):
ModelRegistry
.
register_model
(
"OPTForCausalLM"
,
FakeAudioModel
)
with
patch
(
"vllm.entrypoints.chat_utils._mm_token_str"
,
lambda
*
_
,
**
__
:
"_"
):
with
patch
(
"vllm.entrypoints.chat_utils._mm_token_str"
,
lambda
*
_
,
**
__
:
"_"
),
patch
(
"vllm.model_executor.models.ModelRegistry.is_multimodal_model"
)
as
mock
:
mock
.
return_value
=
True
sys
.
argv
=
[
"placeholder.py"
]
+
\
(
f
"--model
{
MODEL_NAME
}
--gpu-memory-utilization 0.10 "
"--dtype bfloat16 --enforce-eager --api-key token-abc123 "
...
...
tests/multimodal/test_mapper.py
View file @
bbf55c48
...
...
@@ -4,7 +4,7 @@ import numpy as np
import
pytest
from
transformers
import
CLIPImageProcessor
,
LlavaNextImageProcessor
from
vllm.config
import
ModelConfig
,
MultiModalConfig
from
vllm.config
import
ModelConfig
from
vllm.multimodal
import
MultiModalRegistry
from
vllm.multimodal.utils
import
rescale_image_size
...
...
@@ -30,10 +30,10 @@ def test_clip_image_processor(image_assets, mm_registry, dtype, size_factor):
seed
=
0
,
dtype
=
dtype
,
revision
=
None
,
limit_mm_per_prompt
=
{
"image"
:
1
},
)
mm_config
=
MultiModalConfig
(
limit_per_prompt
=
{
"image"
:
1
})
mm_registry
.
init_mm_limits_per_prompt
(
model_config
,
mm_config
)
mm_registry
.
init_mm_limits_per_prompt
(
model_config
)
for
asset
in
image_assets
:
image
=
rescale_image_size
(
asset
.
pil_image
,
size_factor
)
...
...
@@ -73,10 +73,10 @@ def test_llava_next_image_processor(image_assets, mm_registry, dtype,
seed
=
0
,
dtype
=
dtype
,
revision
=
None
,
limit_mm_per_prompt
=
{
"image"
:
1
},
)
mm_config
=
MultiModalConfig
(
limit_per_prompt
=
{
"image"
:
1
})
mm_registry
.
init_mm_limits_per_prompt
(
model_config
,
mm_config
)
mm_registry
.
init_mm_limits_per_prompt
(
model_config
)
for
asset
in
image_assets
:
image
=
rescale_image_size
(
asset
.
pil_image
,
size_factor
)
...
...
@@ -115,10 +115,10 @@ def test_mm_limits(image_assets, mm_registry, num_images, limit, is_valid):
seed
=
0
,
dtype
=
"half"
,
revision
=
None
,
limit_mm_per_prompt
=
{
"image"
:
limit
},
)
mm_config
=
MultiModalConfig
(
limit_per_prompt
=
{
"image"
:
limit
})
mm_registry
.
init_mm_limits_per_prompt
(
model_config
,
mm_config
)
mm_registry
.
init_mm_limits_per_prompt
(
model_config
)
image
=
image_assets
[
0
].
pil_image
if
num_images
==
0
:
...
...
@@ -145,10 +145,10 @@ def test_image_mapper_multi(image_assets, mm_registry, num_images):
seed
=
0
,
dtype
=
"half"
,
revision
=
None
,
limit_mm_per_prompt
=
{
"image"
:
num_images
},
)
mm_config
=
MultiModalConfig
(
limit_per_prompt
=
{
"image"
:
num_images
})
mm_registry
.
init_mm_limits_per_prompt
(
model_config
,
mm_config
)
mm_registry
.
init_mm_limits_per_prompt
(
model_config
)
image
=
image_assets
[
0
].
pil_image
mm_inputs
=
{
"image"
:
[
image
]
*
num_images
}
...
...
vllm/config.py
View file @
bbf55c48
...
...
@@ -109,6 +109,8 @@ class ModelConfig:
matches the model name exposed via the APIs. If multiple model
names provided, the first name will be used. If not specified,
the model name will be the same as `model`.
limit_mm_per_prompt: Maximum number of data instances per modality
per prompt. Only applicable for multimodal models.
"""
def
__init__
(
...
...
@@ -134,7 +136,7 @@ class ModelConfig:
disable_sliding_window
:
bool
=
False
,
skip_tokenizer_init
:
bool
=
False
,
served_model_name
:
Optional
[
Union
[
str
,
List
[
str
]]]
=
None
,
multimodal_config
:
Optional
[
"MultiModalConfig"
]
=
None
,
limit_mm_per_prompt
:
Optional
[
Mapping
[
str
,
int
]
]
=
None
,
)
->
None
:
self
.
model
=
model
self
.
tokenizer
=
tokenizer
...
...
@@ -211,14 +213,29 @@ class ModelConfig:
sliding_window_len
=
self
.
get_hf_config_sliding_window
())
self
.
served_model_name
=
get_served_model_name
(
model
,
served_model_name
)
self
.
multimodal_config
=
multimodal_config
self
.
multimodal_config
=
self
.
_init_
multimodal_config
(
limit_mm_per_prompt
)
if
not
self
.
skip_tokenizer_init
:
self
.
_verify_tokenizer_mode
()
self
.
_verify_embedding_mode
()
self
.
_verify_quantization
()
self
.
_verify_cuda_graph
()
def
_init_multimodal_config
(
self
,
limit_mm_per_prompt
:
Optional
[
Mapping
[
str
,
int
]]
)
->
Optional
[
"MultiModalConfig"
]:
architectures
=
getattr
(
self
.
hf_config
,
"architectures"
,
[])
if
any
(
ModelRegistry
.
is_multimodal_model
(
arch
)
for
arch
in
architectures
):
return
MultiModalConfig
(
limit_per_prompt
=
limit_mm_per_prompt
or
{})
else
:
if
limit_mm_per_prompt
:
raise
ValueError
(
"limit_mm_per_prompt is only supported for multimodal "
"models."
)
return
None
def
_verify_tokenizer_mode
(
self
)
->
None
:
tokenizer_mode
=
self
.
tokenizer_mode
.
lower
()
if
tokenizer_mode
not
in
[
"auto"
,
"slow"
]:
...
...
@@ -467,6 +484,18 @@ class ModelConfig:
if
t
!=
"attention"
])
def
get_multimodal_config
(
self
)
->
"MultiModalConfig"
:
"""
Get the multimodal configuration of the model.
Raises:
ValueError: If the model is not multimodal.
"""
if
self
.
multimodal_config
is
None
:
raise
ValueError
(
"The model is not multimodal."
)
return
self
.
multimodal_config
@
property
def
is_encoder_decoder_model
(
self
)
->
bool
:
"""Extract the HF encoder/decoder model flag."""
...
...
@@ -1450,7 +1479,7 @@ class PromptAdapterConfig:
class
MultiModalConfig
:
"""Controls the behavior of multimodal models."""
limit_per_prompt
:
Mapping
[
str
,
int
]
limit_per_prompt
:
Mapping
[
str
,
int
]
=
field
(
default_factory
=
dict
)
"""
The maximum number of multi-modal input instances allowed per prompt
for each :class:`~vllm.multimodal.MultiModalPlugin`.
...
...
@@ -1710,7 +1739,6 @@ class EngineConfig:
device_config
:
DeviceConfig
load_config
:
LoadConfig
lora_config
:
Optional
[
LoRAConfig
]
multimodal_config
:
Optional
[
MultiModalConfig
]
speculative_config
:
Optional
[
SpeculativeConfig
]
decoding_config
:
Optional
[
DecodingConfig
]
observability_config
:
Optional
[
ObservabilityConfig
]
...
...
vllm/engine/arg_utils.py
View file @
bbf55c48
...
...
@@ -7,7 +7,7 @@ from typing import (TYPE_CHECKING, Dict, List, Mapping, Optional, Tuple, Type,
from
vllm.config
import
(
CacheConfig
,
DecodingConfig
,
DeviceConfig
,
EngineConfig
,
LoadConfig
,
LoRAConfig
,
ModelConfig
,
MultiModalConfig
,
ObservabilityConfig
,
ParallelConfig
,
ObservabilityConfig
,
ParallelConfig
,
PromptAdapterConfig
,
SchedulerConfig
,
SpeculativeConfig
,
TokenizerPoolConfig
)
from
vllm.executor.executor_base
import
ExecutorBase
...
...
@@ -765,9 +765,6 @@ class EngineArgs:
"CPU offload space must be non-negative"
f
", but got
{
self
.
cpu_offload_gb
}
"
)
multimodal_config
=
MultiModalConfig
(
limit_per_prompt
=
self
.
limit_mm_per_prompt
or
{})
device_config
=
DeviceConfig
(
device
=
self
.
device
)
model_config
=
ModelConfig
(
model
=
self
.
model
,
...
...
@@ -791,7 +788,8 @@ class EngineArgs:
disable_sliding_window
=
self
.
disable_sliding_window
,
skip_tokenizer_init
=
self
.
skip_tokenizer_init
,
served_model_name
=
self
.
served_model_name
,
multimodal_config
=
multimodal_config
)
limit_mm_per_prompt
=
self
.
limit_mm_per_prompt
,
)
cache_config
=
CacheConfig
(
block_size
=
self
.
block_size
,
gpu_memory_utilization
=
self
.
gpu_memory_utilization
,
...
...
@@ -970,7 +968,6 @@ class EngineArgs:
scheduler_config
=
scheduler_config
,
device_config
=
device_config
,
lora_config
=
lora_config
,
multimodal_config
=
multimodal_config
,
speculative_config
=
speculative_config
,
load_config
=
load_config
,
decoding_config
=
decoding_config
,
...
...
vllm/engine/llm_engine.py
View file @
bbf55c48
...
...
@@ -10,7 +10,7 @@ from typing_extensions import assert_never
import
vllm.envs
as
envs
from
vllm.config
import
(
CacheConfig
,
DecodingConfig
,
DeviceConfig
,
EngineConfig
,
LoadConfig
,
LoRAConfig
,
ModelConfig
,
MultiModalConfig
,
ObservabilityConfig
,
ParallelConfig
,
ObservabilityConfig
,
ParallelConfig
,
PromptAdapterConfig
,
SchedulerConfig
,
SpeculativeConfig
)
from
vllm.core.scheduler
import
(
ScheduledSequenceGroup
,
Scheduler
,
...
...
@@ -100,8 +100,6 @@ class LLMEngine:
scheduler_config: The configuration related to the request scheduler.
device_config: The configuration related to the device.
lora_config (Optional): The configuration related to serving multi-LoRA.
multimodal_config (Optional): The configuration related to multimodal
models.
speculative_config (Optional): The configuration related to speculative
decoding.
executor_class: The model executor class for managing distributed
...
...
@@ -172,7 +170,6 @@ class LLMEngine:
device_config
:
DeviceConfig
,
load_config
:
LoadConfig
,
lora_config
:
Optional
[
LoRAConfig
],
multimodal_config
:
Optional
[
MultiModalConfig
],
speculative_config
:
Optional
[
SpeculativeConfig
],
decoding_config
:
Optional
[
DecodingConfig
],
observability_config
:
Optional
[
ObservabilityConfig
],
...
...
@@ -235,7 +232,6 @@ class LLMEngine:
self
.
model_config
=
model_config
self
.
cache_config
=
cache_config
self
.
lora_config
=
lora_config
self
.
multimodal_config
=
multimodal_config
self
.
parallel_config
=
parallel_config
self
.
scheduler_config
=
scheduler_config
self
.
device_config
=
device_config
...
...
@@ -278,7 +274,6 @@ class LLMEngine:
scheduler_config
=
scheduler_config
,
device_config
=
device_config
,
lora_config
=
lora_config
,
multimodal_config
=
multimodal_config
,
speculative_config
=
speculative_config
,
load_config
=
load_config
,
prompt_adapter_config
=
prompt_adapter_config
,
...
...
vllm/executor/cpu_executor.py
View file @
bbf55c48
...
...
@@ -141,7 +141,6 @@ class CPUExecutor(ExecutorBase):
rank
=
rank
,
distributed_init_method
=
self
.
distributed_init_method
,
lora_config
=
self
.
lora_config
,
multimodal_config
=
self
.
multimodal_config
,
kv_cache_dtype
=
self
.
cache_config
.
cache_dtype
,
prompt_adapter_config
=
self
.
prompt_adapter_config
,
is_driver_worker
=
rank
==
0
,
...
...
vllm/executor/executor_base.py
View file @
bbf55c48
...
...
@@ -2,8 +2,8 @@ from abc import ABC, abstractmethod
from
typing
import
List
,
Optional
,
Set
,
Tuple
from
vllm.config
import
(
CacheConfig
,
DeviceConfig
,
LoadConfig
,
LoRAConfig
,
ModelConfig
,
MultiModalConfig
,
ObservabilityConfig
,
ParallelConfig
,
PromptAdapterConfig
,
SchedulerConfig
,
ModelConfig
,
ObservabilityConfig
,
ParallelConfig
,
PromptAdapterConfig
,
SchedulerConfig
,
SpeculativeConfig
)
from
vllm.lora.request
import
LoRARequest
from
vllm.prompt_adapter.request
import
PromptAdapterRequest
...
...
@@ -29,7 +29,6 @@ class ExecutorBase(ABC):
device_config
:
DeviceConfig
,
load_config
:
LoadConfig
,
lora_config
:
Optional
[
LoRAConfig
],
multimodal_config
:
Optional
[
MultiModalConfig
],
speculative_config
:
Optional
[
SpeculativeConfig
],
prompt_adapter_config
:
Optional
[
PromptAdapterConfig
],
observability_config
:
Optional
[
ObservabilityConfig
],
...
...
@@ -41,7 +40,6 @@ class ExecutorBase(ABC):
self
.
parallel_config
=
parallel_config
self
.
scheduler_config
=
scheduler_config
self
.
device_config
=
device_config
self
.
multimodal_config
=
multimodal_config
self
.
speculative_config
=
speculative_config
self
.
prompt_adapter_config
=
prompt_adapter_config
self
.
observability_config
=
observability_config
...
...
vllm/executor/gpu_executor.py
View file @
bbf55c48
...
...
@@ -55,7 +55,6 @@ class GPUExecutor(ExecutorBase):
rank
=
rank
,
distributed_init_method
=
distributed_init_method
,
lora_config
=
self
.
lora_config
,
multimodal_config
=
self
.
multimodal_config
,
speculative_config
=
self
.
speculative_config
,
prompt_adapter_config
=
self
.
prompt_adapter_config
,
is_driver_worker
=
(
not
self
.
parallel_config
)
...
...
vllm/executor/openvino_executor.py
View file @
bbf55c48
...
...
@@ -49,7 +49,6 @@ class OpenVINOExecutor(ExecutorBase):
rank
=
0
,
distributed_init_method
=
distributed_init_method
,
lora_config
=
self
.
lora_config
,
multimodal_config
=
self
.
multimodal_config
,
kv_cache_dtype
=
self
.
cache_config
.
cache_dtype
,
is_driver_worker
=
True
,
)
...
...
vllm/executor/ray_xpu_executor.py
View file @
bbf55c48
...
...
@@ -7,9 +7,8 @@ from typing import (TYPE_CHECKING, Any, Awaitable, Dict, List, Optional, Set,
import
vllm.envs
as
envs
from
vllm.config
import
(
CacheConfig
,
DeviceConfig
,
LoadConfig
,
LoRAConfig
,
ModelConfig
,
MultiModalConfig
,
ParallelConfig
,
PromptAdapterConfig
,
SchedulerConfig
,
SpeculativeConfig
)
ModelConfig
,
ParallelConfig
,
PromptAdapterConfig
,
SchedulerConfig
,
SpeculativeConfig
)
from
vllm.executor.distributed_gpu_executor
import
(
# yapf: disable
DistributedGPUExecutor
,
DistributedGPUExecutorAsync
)
from
vllm.executor.ray_utils
import
RayWorkerWrapper
,
ray
...
...
@@ -46,7 +45,6 @@ class RayXPUExecutor(DistributedGPUExecutor):
device_config
:
DeviceConfig
,
load_config
:
LoadConfig
,
lora_config
:
Optional
[
LoRAConfig
],
multimodal_config
:
Optional
[
MultiModalConfig
],
prompt_adapter_config
:
Optional
[
PromptAdapterConfig
],
speculative_config
:
Optional
[
SpeculativeConfig
],
)
->
None
:
...
...
@@ -61,7 +59,6 @@ class RayXPUExecutor(DistributedGPUExecutor):
self
.
parallel_config
=
parallel_config
self
.
scheduler_config
=
scheduler_config
self
.
device_config
=
device_config
self
.
multimodal_config
=
multimodal_config
self
.
prompt_adapter_config
=
prompt_adapter_config
placement_group
=
self
.
parallel_config
.
placement_group
...
...
@@ -203,7 +200,6 @@ class RayXPUExecutor(DistributedGPUExecutor):
rank
=
rank
,
distributed_init_method
=
distributed_init_method
,
lora_config
=
self
.
lora_config
,
multimodal_config
=
self
.
multimodal_config
,
is_driver_worker
=
rank
==
0
,
))
self
.
_run_workers
(
"init_worker"
,
all_kwargs
=
init_worker_all_kwargs
)
...
...
vllm/executor/tpu_executor.py
View file @
bbf55c48
...
...
@@ -52,7 +52,6 @@ class TPUExecutor(ExecutorBase):
local_rank
=
local_rank
,
rank
=
rank
,
distributed_init_method
=
distributed_init_method
,
multimodal_config
=
self
.
multimodal_config
,
is_driver_worker
=
rank
==
0
,
)
...
...
vllm/executor/xpu_executor.py
View file @
bbf55c48
...
...
@@ -3,9 +3,8 @@ from typing import List, Optional
import
torch
from
vllm.config
import
(
CacheConfig
,
DeviceConfig
,
LoadConfig
,
LoRAConfig
,
ModelConfig
,
MultiModalConfig
,
ParallelConfig
,
PromptAdapterConfig
,
SchedulerConfig
,
SpeculativeConfig
)
ModelConfig
,
ParallelConfig
,
PromptAdapterConfig
,
SchedulerConfig
,
SpeculativeConfig
)
from
vllm.executor.executor_base
import
ExecutorAsyncBase
from
vllm.executor.gpu_executor
import
GPUExecutor
from
vllm.logger
import
init_logger
...
...
@@ -29,7 +28,6 @@ class XPUExecutor(GPUExecutor):
device_config
:
DeviceConfig
,
load_config
:
LoadConfig
,
lora_config
:
Optional
[
LoRAConfig
],
multimodal_config
:
Optional
[
MultiModalConfig
],
prompt_adapter_config
:
Optional
[
PromptAdapterConfig
],
speculative_config
:
Optional
[
SpeculativeConfig
],
)
->
None
:
...
...
@@ -46,7 +44,6 @@ class XPUExecutor(GPUExecutor):
self
.
parallel_config
=
parallel_config
self
.
scheduler_config
=
scheduler_config
self
.
device_config
=
device_config
self
.
multimodal_config
=
multimodal_config
self
.
prompt_adapter_config
=
prompt_adapter_config
self
.
speculative_config
=
None
...
...
vllm/inputs/registry.py
View file @
bbf55c48
...
...
@@ -13,7 +13,7 @@ from vllm.logger import init_logger
from
.data
import
LLMInputs
if
TYPE_CHECKING
:
from
vllm.config
import
ModelConfig
,
MultiModalConfig
from
vllm.config
import
ModelConfig
from
vllm.multimodal
import
MultiModalDataDict
,
MultiModalRegistry
from
vllm.sequence
import
SequenceData
...
...
@@ -32,20 +32,6 @@ class InputContext:
model_config
:
"ModelConfig"
"""The configuration of the model."""
def
get_multimodal_config
(
self
)
->
"MultiModalConfig"
:
"""
Get the multimodal configuration of the model.
Raises:
ValueError: If the model is not multimodal.
"""
multimodal_config
=
self
.
model_config
.
multimodal_config
if
multimodal_config
is
None
:
raise
ValueError
(
"No multimodal config found"
)
return
multimodal_config
def
get_hf_config
(
self
,
hf_config_type
:
Type
[
C
]
=
PretrainedConfig
)
->
C
:
"""
Get the HuggingFace configuration
...
...
vllm/model_executor/model_loader/__init__.py
View file @
bbf55c48
...
...
@@ -3,8 +3,7 @@ from typing import Optional
from
torch
import
nn
from
vllm.config
import
(
CacheConfig
,
DeviceConfig
,
LoadConfig
,
LoRAConfig
,
ModelConfig
,
MultiModalConfig
,
ParallelConfig
,
SchedulerConfig
)
ModelConfig
,
ParallelConfig
,
SchedulerConfig
)
from
vllm.model_executor.model_loader.loader
import
(
BaseModelLoader
,
get_model_loader
)
from
vllm.model_executor.model_loader.utils
import
(
...
...
@@ -15,13 +14,11 @@ def get_model(*, model_config: ModelConfig, load_config: LoadConfig,
device_config
:
DeviceConfig
,
parallel_config
:
ParallelConfig
,
scheduler_config
:
SchedulerConfig
,
lora_config
:
Optional
[
LoRAConfig
],
multimodal_config
:
Optional
[
MultiModalConfig
],
cache_config
:
CacheConfig
)
->
nn
.
Module
:
loader
=
get_model_loader
(
load_config
)
return
loader
.
load_model
(
model_config
=
model_config
,
device_config
=
device_config
,
lora_config
=
lora_config
,
multimodal_config
=
multimodal_config
,
parallel_config
=
parallel_config
,
scheduler_config
=
scheduler_config
,
cache_config
=
cache_config
)
...
...
vllm/model_executor/model_loader/loader.py
View file @
bbf55c48
...
...
@@ -132,9 +132,7 @@ def _get_model_initialization_kwargs(
"please open an issue on github."
)
if
supports_multimodal
(
model_class
):
if
multimodal_config
is
None
:
raise
ValueError
(
"Provide multi-modal related configurations "
"through LLM entrypoint or engine arguments."
)
assert
multimodal_config
is
not
None
extra_kwargs
[
"multimodal_config"
]
=
multimodal_config
...
...
@@ -164,7 +162,6 @@ def _initialize_model(
model_config
:
ModelConfig
,
load_config
:
LoadConfig
,
lora_config
:
Optional
[
LoRAConfig
],
multimodal_config
:
Optional
[
MultiModalConfig
],
cache_config
:
CacheConfig
,
scheduler_config
:
Optional
[
SchedulerConfig
]
=
None
)
->
nn
.
Module
:
"""Initialize a model with the given configurations."""
...
...
@@ -173,10 +170,10 @@ def _initialize_model(
return
build_model
(
model_class
,
model_config
.
hf_config
,
cache_config
=
cache_config
,
quant_config
=
_get_quantization_config
(
model_config
,
load_config
),
lora_config
=
lora_config
,
multimodal_config
=
multimodal_config
,
cache_config
=
cache_config
,
multimodal_config
=
model_config
.
multimodal_config
,
scheduler_config
=
scheduler_config
,
)
...
...
@@ -191,7 +188,6 @@ class BaseModelLoader(ABC):
def
load_model
(
self
,
*
,
model_config
:
ModelConfig
,
device_config
:
DeviceConfig
,
lora_config
:
Optional
[
LoRAConfig
],
multimodal_config
:
Optional
[
MultiModalConfig
],
parallel_config
:
ParallelConfig
,
scheduler_config
:
SchedulerConfig
,
cache_config
:
CacheConfig
)
->
nn
.
Module
:
...
...
@@ -336,7 +332,6 @@ class DefaultModelLoader(BaseModelLoader):
def
load_model
(
self
,
*
,
model_config
:
ModelConfig
,
device_config
:
DeviceConfig
,
lora_config
:
Optional
[
LoRAConfig
],
multimodal_config
:
Optional
[
MultiModalConfig
],
parallel_config
:
ParallelConfig
,
scheduler_config
:
SchedulerConfig
,
cache_config
:
CacheConfig
)
->
nn
.
Module
:
...
...
@@ -344,8 +339,8 @@ class DefaultModelLoader(BaseModelLoader):
with
set_default_torch_dtype
(
model_config
.
dtype
):
with
target_device
:
model
=
_initialize_model
(
model_config
,
self
.
load_config
,
lora_config
,
multimodal
_config
,
cache_config
,
scheduler_config
)
lora_config
,
cache
_config
,
scheduler_config
)
model
.
load_weights
(
self
.
_get_weights_iterator
(
model_config
.
model
,
model_config
.
revision
,
...
...
@@ -379,15 +374,14 @@ class DummyModelLoader(BaseModelLoader):
def
load_model
(
self
,
*
,
model_config
:
ModelConfig
,
device_config
:
DeviceConfig
,
lora_config
:
Optional
[
LoRAConfig
],
multimodal_config
:
Optional
[
MultiModalConfig
],
parallel_config
:
ParallelConfig
,
scheduler_config
:
SchedulerConfig
,
cache_config
:
CacheConfig
)
->
nn
.
Module
:
with
set_default_torch_dtype
(
model_config
.
dtype
):
with
torch
.
device
(
device_config
.
device
):
model
=
_initialize_model
(
model_config
,
self
.
load_config
,
lora_config
,
multimodal
_config
,
cache_config
,
scheduler_config
)
lora_config
,
cache
_config
,
scheduler_config
)
# NOTE(woosuk): For accurate performance evaluation, we assign
# random values to the weights.
initialize_dummy_weights
(
model
)
...
...
@@ -420,7 +414,6 @@ class TensorizerLoader(BaseModelLoader):
model_config
:
ModelConfig
,
device_config
:
DeviceConfig
,
lora_config
:
Optional
[
LoRAConfig
],
multimodal_config
:
Optional
[
MultiModalConfig
],
cache_config
:
CacheConfig
,
)
->
nn
.
Module
:
"""Load a serialized model with tensorizer to the CPU.
...
...
@@ -433,8 +426,7 @@ class TensorizerLoader(BaseModelLoader):
with
set_default_torch_dtype
(
model_config
.
dtype
):
with
torch
.
device
(
device_config
.
device
):
model
=
_initialize_model
(
model_config
,
self
.
load_config
,
lora_config
,
multimodal_config
,
cache_config
)
lora_config
,
cache_config
)
model
.
load_weights
(
self
.
_get_weights_iterator
())
return
model
.
eval
()
...
...
@@ -444,7 +436,6 @@ class TensorizerLoader(BaseModelLoader):
model_config
:
ModelConfig
,
device_config
:
DeviceConfig
,
lora_config
:
Optional
[
LoRAConfig
],
multimodal_config
:
Optional
[
MultiModalConfig
],
cache_config
:
CacheConfig
,
)
->
nn
.
Module
:
"""Load a serialized model with tensorizer.
...
...
@@ -458,7 +449,7 @@ class TensorizerLoader(BaseModelLoader):
quant_config
=
_get_quantization_config
(
model_config
,
self
.
load_config
)
extra_kwargs
=
_get_model_initialization_kwargs
(
model_class
,
lora_config
,
multimodal_config
)
model_class
,
lora_config
,
model_config
.
multimodal_config
)
extra_kwargs
[
"quant_config"
]
=
quant_config
extra_kwargs
[
"cache_config"
]
=
cache_config
...
...
@@ -473,7 +464,6 @@ class TensorizerLoader(BaseModelLoader):
def
load_model
(
self
,
*
,
model_config
:
ModelConfig
,
device_config
:
DeviceConfig
,
lora_config
:
Optional
[
LoRAConfig
],
multimodal_config
:
Optional
[
MultiModalConfig
],
parallel_config
:
ParallelConfig
,
scheduler_config
:
SchedulerConfig
,
cache_config
:
CacheConfig
)
->
nn
.
Module
:
...
...
@@ -487,11 +477,9 @@ class TensorizerLoader(BaseModelLoader):
if
is_vllm_tensorized
(
self
.
tensorizer_config
):
return
self
.
_load_model_serialized
(
model_config
,
device_config
,
lora_config
,
multimodal_config
,
cache_config
)
lora_config
,
cache_config
)
return
self
.
_load_model_serialized_cpu
(
model_config
,
device_config
,
lora_config
,
multimodal_config
,
cache_config
)
lora_config
,
cache_config
)
@
staticmethod
def
save_model
(
...
...
@@ -577,7 +565,6 @@ class ShardedStateLoader(BaseModelLoader):
def
load_model
(
self
,
*
,
model_config
:
ModelConfig
,
device_config
:
DeviceConfig
,
lora_config
:
Optional
[
LoRAConfig
],
multimodal_config
:
Optional
[
MultiModalConfig
],
parallel_config
:
ParallelConfig
,
scheduler_config
:
SchedulerConfig
,
cache_config
:
CacheConfig
)
->
nn
.
Module
:
...
...
@@ -591,8 +578,7 @@ class ShardedStateLoader(BaseModelLoader):
with
set_default_torch_dtype
(
model_config
.
dtype
):
with
torch
.
device
(
device_config
.
device
):
model
=
_initialize_model
(
model_config
,
self
.
load_config
,
lora_config
,
multimodal_config
,
cache_config
)
lora_config
,
cache_config
)
rank
=
get_tensor_model_parallel_rank
()
pattern
=
os
.
path
.
join
(
local_model_path
,
...
...
@@ -955,15 +941,13 @@ class BitsAndBytesModelLoader(BaseModelLoader):
def
load_model
(
self
,
*
,
model_config
:
ModelConfig
,
device_config
:
DeviceConfig
,
lora_config
:
Optional
[
LoRAConfig
],
multimodal_config
:
Optional
[
MultiModalConfig
],
parallel_config
:
ParallelConfig
,
scheduler_config
:
SchedulerConfig
,
cache_config
:
CacheConfig
)
->
nn
.
Module
:
with
set_default_torch_dtype
(
model_config
.
dtype
):
with
torch
.
device
(
device_config
.
device
):
model
=
_initialize_model
(
model_config
,
self
.
load_config
,
lora_config
,
multimodal_config
,
cache_config
)
lora_config
,
cache_config
)
self
.
_load_weights
(
model_config
,
model
)
...
...
@@ -1032,7 +1016,6 @@ class GGUFModelLoader(BaseModelLoader):
def
load_model
(
self
,
*
,
model_config
:
ModelConfig
,
device_config
:
DeviceConfig
,
lora_config
:
Optional
[
LoRAConfig
],
multimodal_config
:
Optional
[
MultiModalConfig
],
parallel_config
:
ParallelConfig
,
scheduler_config
:
SchedulerConfig
,
cache_config
:
CacheConfig
)
->
nn
.
Module
:
...
...
@@ -1047,8 +1030,7 @@ class GGUFModelLoader(BaseModelLoader):
with
set_default_torch_dtype
(
model_config
.
dtype
):
with
torch
.
device
(
device_config
.
device
):
model
=
_initialize_model
(
model_config
,
self
.
load_config
,
lora_config
,
multimodal_config
,
cache_config
)
lora_config
,
cache_config
)
model
.
load_weights
(
self
.
_get_weights_iterator
(
local_model_path
,
gguf_weights_map
))
return
model
...
...
vllm/model_executor/models/__init__.py
View file @
bbf55c48
...
...
@@ -9,17 +9,12 @@ from vllm.utils import is_hip
logger
=
init_logger
(
__name__
)
# Architecture -> (module, class).
_GENERATION_MODELS
=
{
"AquilaModel"
:
(
"llama"
,
"LlamaForCausalLM"
),
"AquilaForCausalLM"
:
(
"llama"
,
"LlamaForCausalLM"
),
# AquilaChat2
"BaiChuanForCausalLM"
:
(
"baichuan"
,
"BaiChuanForCausalLM"
),
# baichuan-7b
"BaichuanForCausalLM"
:
(
"baichuan"
,
"BaichuanForCausalLM"
),
# baichuan-13b
"BloomForCausalLM"
:
(
"bloom"
,
"BloomForCausalLM"
),
"Blip2ForConditionalGeneration"
:
(
"blip2"
,
"Blip2ForConditionalGeneration"
),
"ChameleonForConditionalGeneration"
:
(
"chameleon"
,
"ChameleonForConditionalGeneration"
),
"ChatGLMModel"
:
(
"chatglm"
,
"ChatGLMForCausalLM"
),
"ChatGLMForConditionalGeneration"
:
(
"chatglm"
,
"ChatGLMForCausalLM"
),
"CohereForCausalLM"
:
(
"commandr"
,
"CohereForCausalLM"
),
...
...
@@ -28,7 +23,6 @@ _GENERATION_MODELS = {
"DeepseekForCausalLM"
:
(
"deepseek"
,
"DeepseekForCausalLM"
),
"DeepseekV2ForCausalLM"
:
(
"deepseek_v2"
,
"DeepseekV2ForCausalLM"
),
"FalconForCausalLM"
:
(
"falcon"
,
"FalconForCausalLM"
),
"FuyuForCausalLM"
:
(
"fuyu"
,
"FuyuForCausalLM"
),
"GemmaForCausalLM"
:
(
"gemma"
,
"GemmaForCausalLM"
),
"Gemma2ForCausalLM"
:
(
"gemma2"
,
"Gemma2ForCausalLM"
),
"GPT2LMHeadModel"
:
(
"gpt2"
,
"GPT2LMHeadModel"
),
...
...
@@ -37,13 +31,8 @@ _GENERATION_MODELS = {
"GPTNeoXForCausalLM"
:
(
"gpt_neox"
,
"GPTNeoXForCausalLM"
),
"InternLMForCausalLM"
:
(
"llama"
,
"LlamaForCausalLM"
),
"InternLM2ForCausalLM"
:
(
"internlm2"
,
"InternLM2ForCausalLM"
),
"InternVLChatModel"
:
(
"internvl"
,
"InternVLChatModel"
),
"JAISLMHeadModel"
:
(
"jais"
,
"JAISLMHeadModel"
),
"LlamaForCausalLM"
:
(
"llama"
,
"LlamaForCausalLM"
),
"LlavaForConditionalGeneration"
:
(
"llava"
,
"LlavaForConditionalGeneration"
),
"LlavaNextForConditionalGeneration"
:
(
"llava_next"
,
"LlavaNextForConditionalGeneration"
),
# For decapoda-research/llama-*
"LLaMAForCausalLM"
:
(
"llama"
,
"LlamaForCausalLM"
),
"MistralForCausalLM"
:
(
"llama"
,
"LlamaForCausalLM"
),
...
...
@@ -53,17 +42,13 @@ _GENERATION_MODELS = {
"MptForCausalLM"
:
(
"mpt"
,
"MPTForCausalLM"
),
"MPTForCausalLM"
:
(
"mpt"
,
"MPTForCausalLM"
),
"MiniCPMForCausalLM"
:
(
"minicpm"
,
"MiniCPMForCausalLM"
),
"MiniCPMV"
:
(
"minicpmv"
,
"MiniCPMV"
),
"NemotronForCausalLM"
:
(
"nemotron"
,
"NemotronForCausalLM"
),
"OlmoForCausalLM"
:
(
"olmo"
,
"OlmoForCausalLM"
),
"OPTForCausalLM"
:
(
"opt"
,
"OPTForCausalLM"
),
"OrionForCausalLM"
:
(
"orion"
,
"OrionForCausalLM"
),
"PersimmonForCausalLM"
:
(
"persimmon"
,
"PersimmonForCausalLM"
),
"PaliGemmaForConditionalGeneration"
:
(
"paligemma"
,
"PaliGemmaForConditionalGeneration"
),
"PhiForCausalLM"
:
(
"phi"
,
"PhiForCausalLM"
),
"Phi3ForCausalLM"
:
(
"llama"
,
"LlamaForCausalLM"
),
"Phi3VForCausalLM"
:
(
"phi3v"
,
"Phi3VForCausalLM"
),
"QWenLMHeadModel"
:
(
"qwen"
,
"QWenLMHeadModel"
),
"Qwen2ForCausalLM"
:
(
"qwen2"
,
"Qwen2ForCausalLM"
),
"Qwen2MoeForCausalLM"
:
(
"qwen2_moe"
,
"Qwen2MoeForCausalLM"
),
...
...
@@ -83,6 +68,22 @@ _EMBEDDING_MODELS = {
"MistralModel"
:
(
"llama_embedding"
,
"LlamaEmbeddingModel"
),
}
_MULTIMODAL_MODELS
=
{
"Blip2ForConditionalGeneration"
:
(
"blip2"
,
"Blip2ForConditionalGeneration"
),
"ChameleonForConditionalGeneration"
:
(
"chameleon"
,
"ChameleonForConditionalGeneration"
),
"FuyuForCausalLM"
:
(
"fuyu"
,
"FuyuForCausalLM"
),
"InternVLChatModel"
:
(
"internvl"
,
"InternVLChatModel"
),
"LlavaForConditionalGeneration"
:
(
"llava"
,
"LlavaForConditionalGeneration"
),
"LlavaNextForConditionalGeneration"
:
(
"llava_next"
,
"LlavaNextForConditionalGeneration"
),
"MiniCPMV"
:
(
"minicpmv"
,
"MiniCPMV"
),
"PaliGemmaForConditionalGeneration"
:
(
"paligemma"
,
"PaliGemmaForConditionalGeneration"
),
"Phi3VForCausalLM"
:
(
"phi3v"
,
"Phi3VForCausalLM"
),
}
_CONDITIONAL_GENERATION_MODELS
=
{
"BartModel"
:
(
"bart"
,
"BartForConditionalGeneration"
),
"BartForConditionalGeneration"
:
(
"bart"
,
"BartForConditionalGeneration"
),
...
...
@@ -91,7 +92,8 @@ _CONDITIONAL_GENERATION_MODELS = {
_MODELS
=
{
**
_GENERATION_MODELS
,
**
_EMBEDDING_MODELS
,
**
_CONDITIONAL_GENERATION_MODELS
**
_MULTIMODAL_MODELS
,
**
_CONDITIONAL_GENERATION_MODELS
,
}
# Architecture -> type.
...
...
@@ -182,6 +184,15 @@ class ModelRegistry:
def
is_embedding_model
(
model_arch
:
str
)
->
bool
:
return
model_arch
in
_EMBEDDING_MODELS
@
staticmethod
def
is_multimodal_model
(
model_arch
:
str
)
->
bool
:
# TODO: find a way to avoid initializing CUDA prematurely to
# use `supports_multimodal` to determine if a model is multimodal
# model_cls = ModelRegistry._try_load_model_cls(model_arch)
# from vllm.model_executor.models.interfaces import supports_multimodal
return
model_arch
in
_MULTIMODAL_MODELS
__all__
=
[
"ModelRegistry"
,
...
...
vllm/multimodal/registry.py
View file @
bbf55c48
...
...
@@ -2,7 +2,7 @@ import functools
from
collections
import
UserDict
from
typing
import
Dict
,
Mapping
,
Optional
,
Sequence
from
vllm.config
import
ModelConfig
,
MultiModalConfig
from
vllm.config
import
ModelConfig
from
vllm.logger
import
init_logger
from
.audio
import
AudioPlugin
...
...
@@ -181,7 +181,6 @@ class MultiModalRegistry:
def
init_mm_limits_per_prompt
(
self
,
model_config
:
ModelConfig
,
multimodal_config
:
Optional
[
MultiModalConfig
],
)
->
None
:
"""
Initialize the maximum number of multi-modal input instances for each
...
...
@@ -192,6 +191,7 @@ class MultiModalRegistry:
"`mm_limits` has already been set for model=%s, and will "
"be overwritten by the new values."
,
model_config
.
model
)
multimodal_config
=
model_config
.
multimodal_config
if
multimodal_config
is
None
:
limits_per_plugin
=
self
.
_disabled_limits_per_plugin
else
:
...
...
vllm/spec_decode/draft_model_runner.py
View file @
bbf55c48
...
...
@@ -23,8 +23,8 @@ except ImportError:
FLASHINFER_WORKSPACE_BUFFER_SIZE
=
0
from
vllm.config
import
(
CacheConfig
,
DeviceConfig
,
LoadConfig
,
LoRAConfig
,
ModelConfig
,
MultiModalConfig
,
ObservabilityConfig
,
ParallelConfig
,
PromptAdapterConfig
,
SchedulerConfig
)
ModelConfig
,
ObservabilityConfig
,
ParallelConfig
,
PromptAdapterConfig
,
SchedulerConfig
)
from
vllm.logger
import
init_logger
from
vllm.multimodal
import
MultiModalInputs
from
vllm.sequence
import
(
ExecuteModelRequest
,
IntermediateTensors
,
...
...
@@ -66,7 +66,6 @@ class TP1DraftModelRunner(ModelRunner):
lora_config
:
Optional
[
LoRAConfig
],
kv_cache_dtype
:
Optional
[
str
]
=
"auto"
,
is_driver_worker
:
bool
=
False
,
multimodal_config
:
Optional
[
MultiModalConfig
]
=
None
,
prompt_adapter_config
:
Optional
[
PromptAdapterConfig
]
=
None
,
return_hidden_states
:
bool
=
False
,
observability_config
:
Optional
[
ObservabilityConfig
]
=
None
,
...
...
@@ -86,7 +85,6 @@ class TP1DraftModelRunner(ModelRunner):
lora_config
=
lora_config
,
kv_cache_dtype
=
kv_cache_dtype
,
is_driver_worker
=
is_driver_worker
,
multimodal_config
=
multimodal_config
,
prompt_adapter_config
=
prompt_adapter_config
,
return_hidden_states
=
return_hidden_states
,
observability_config
=
observability_config
,
...
...
vllm/spec_decode/target_model_runner.py
View file @
bbf55c48
from
typing
import
List
,
Optional
from
vllm.config
import
(
CacheConfig
,
DeviceConfig
,
LoadConfig
,
LoRAConfig
,
ModelConfig
,
MultiModalConfig
,
ObservabilityConfig
,
ParallelConfig
,
PromptAdapterConfig
,
SchedulerConfig
)
ModelConfig
,
ObservabilityConfig
,
ParallelConfig
,
PromptAdapterConfig
,
SchedulerConfig
)
from
vllm.sequence
import
SequenceGroupMetadata
from
vllm.worker.model_runner
import
(
ModelInputForGPUWithSamplingMetadata
,
ModelRunner
)
...
...
@@ -31,7 +31,6 @@ class TargetModelRunner(ModelRunner):
kv_cache_dtype
:
Optional
[
str
]
=
"auto"
,
is_driver_worker
:
bool
=
False
,
prompt_adapter_config
:
Optional
[
PromptAdapterConfig
]
=
None
,
multimodal_config
:
Optional
[
MultiModalConfig
]
=
None
,
return_hidden_states
:
bool
=
False
,
observability_config
:
Optional
[
ObservabilityConfig
]
=
None
):
# An internal boolean member variable to indicate if token log
...
...
@@ -47,7 +46,6 @@ class TargetModelRunner(ModelRunner):
lora_config
=
lora_config
,
kv_cache_dtype
=
kv_cache_dtype
,
is_driver_worker
=
is_driver_worker
,
multimodal_config
=
multimodal_config
,
prompt_adapter_config
=
prompt_adapter_config
,
return_hidden_states
=
return_hidden_states
,
observability_config
=
observability_config
,
...
...
vllm/worker/cpu_model_runner.py
View file @
bbf55c48
...
...
@@ -6,8 +6,8 @@ from torch import nn
from
vllm.attention
import
AttentionMetadata
,
get_attn_backend
from
vllm.config
import
(
CacheConfig
,
DeviceConfig
,
LoadConfig
,
LoRAConfig
,
ModelConfig
,
MultiModa
lConfig
,
P
arallel
Config
,
PromptAdapterConfig
,
SchedulerConfig
)
ModelConfig
,
Paralle
lConfig
,
P
romptAdapter
Config
,
SchedulerConfig
)
from
vllm.logger
import
init_logger
from
vllm.model_executor
import
SamplingMetadata
from
vllm.model_executor.model_loader
import
get_model
...
...
@@ -79,7 +79,6 @@ class CPUModelRunner(ModelRunnerBase[CPUModelInput]):
cache_config
:
CacheConfig
,
load_config
:
LoadConfig
,
lora_config
:
Optional
[
LoRAConfig
],
multimodal_config
:
Optional
[
MultiModalConfig
],
kv_cache_dtype
:
Optional
[
str
]
=
"auto"
,
prompt_adapter_config
:
Optional
[
PromptAdapterConfig
]
=
None
,
is_driver_worker
:
bool
=
False
,
...
...
@@ -94,7 +93,6 @@ class CPUModelRunner(ModelRunnerBase[CPUModelInput]):
self
.
device_config
=
device_config
self
.
cache_config
=
cache_config
self
.
lora_config
=
lora_config
self
.
multimodal_config
=
multimodal_config
self
.
prompt_adapter_config
=
prompt_adapter_config
self
.
load_config
=
load_config
self
.
is_driver_worker
=
is_driver_worker
...
...
@@ -125,7 +123,6 @@ class CPUModelRunner(ModelRunnerBase[CPUModelInput]):
self
.
model
=
get_model
(
model_config
=
self
.
model_config
,
load_config
=
self
.
load_config
,
device_config
=
self
.
device_config
,
multimodal_config
=
self
.
multimodal_config
,
lora_config
=
self
.
lora_config
,
parallel_config
=
self
.
parallel_config
,
scheduler_config
=
self
.
scheduler_config
,
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment