Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
1bc3b5e7
Unverified
Commit
1bc3b5e7
authored
Feb 13, 2025
by
Cyrus Leung
Committed by
GitHub
Feb 13, 2025
Browse files
[VLM] Separate text-only and vision variants of the same model architecture (#13157)
parent
02ed8a1f
Changes
14
Expand all
Show whitespace changes
Inline
Side-by-side
Showing
14 changed files
with
1729 additions
and
1331 deletions
+1729
-1331
docs/source/models/supported_models.md
docs/source/models/supported_models.md
+8
-9
examples/offline_inference/vision_language.py
examples/offline_inference/vision_language.py
+3
-0
examples/offline_inference/vision_language_multi_image.py
examples/offline_inference/vision_language_multi_image.py
+3
-2
tests/distributed/test_pipeline_parallel.py
tests/distributed/test_pipeline_parallel.py
+87
-84
tests/models/decoder_only/vision_language/test_models.py
tests/models/decoder_only/vision_language/test_models.py
+3
-8
tests/models/decoder_only/vision_language/vlm_utils/core.py
tests/models/decoder_only/vision_language/vlm_utils/core.py
+39
-23
tests/models/decoder_only/vision_language/vlm_utils/types.py
tests/models/decoder_only/vision_language/vlm_utils/types.py
+3
-7
tests/models/registry.py
tests/models/registry.py
+22
-15
tests/models/test_initialization.py
tests/models/test_initialization.py
+1
-2
vllm/model_executor/models/chatglm.py
vllm/model_executor/models/chatglm.py
+58
-362
vllm/model_executor/models/glm4v.py
vllm/model_executor/models/glm4v.py
+662
-0
vllm/model_executor/models/qwen.py
vllm/model_executor/models/qwen.py
+42
-814
vllm/model_executor/models/qwen_vl.py
vllm/model_executor/models/qwen_vl.py
+794
-0
vllm/model_executor/models/registry.py
vllm/model_executor/models/registry.py
+4
-5
No files found.
docs/source/models/supported_models.md
View file @
1bc3b5e7
...
...
@@ -699,10 +699,10 @@ See [this page](#generative-models) for more information on how to use generativ
*
*
✅︎
*
✅︎
-
*
`DeepseekVLV2ForCausalLM`
-
*
`DeepseekVLV2ForCausalLM`
<sup>
^
</sup>
*
DeepSeek-VL2
*
T + I
<sup>
+
</sup>
*
`deepseek-ai/deepseek-vl2-tiny`
,
`deepseek-ai/deepseek-vl2-small`
,
`deepseek-ai/deepseek-vl2`
etc.
(see note)
*
`deepseek-ai/deepseek-vl2-tiny`
,
`deepseek-ai/deepseek-vl2-small`
,
`deepseek-ai/deepseek-vl2`
etc.
*
*
✅︎
*
✅︎
...
...
@@ -713,10 +713,10 @@ See [this page](#generative-models) for more information on how to use generativ
*
*
✅︎
*
✅︎
-
*
`
ChatGLMModel`
-
*
`
GLM4VForCausalLM`
<sup>
^
</sup>
*
GLM-4V
*
T + I
*
`THUDM/glm-4v-9b`
etc.
*
`THUDM/glm-4v-9b`
,
`THUDM/cogagent-9b-20241220`
etc.
*
✅︎
*
✅︎
*
✅︎
...
...
@@ -825,7 +825,7 @@ See [this page](#generative-models) for more information on how to use generativ
*
*
✅︎
*
✅︎
-
*
`Q
W
en
LMHeadModel`
-
*
`Q
w
en
VLForConditionalGeneration`
<sup>
^
</sup>
*
Qwen-VL
*
T + I
<sup>
E+
</sup>
*
`Qwen/Qwen-VL`
,
`Qwen/Qwen-VL-Chat`
, etc.
...
...
@@ -862,13 +862,12 @@ See [this page](#generative-models) for more information on how to use generativ
*
✅︎
:::
<sup>
^
</sup>
You need to set the architecture name via
`--hf-overrides`
to match the one in vLLM.
• For example, to use DeepSeek-VL2 series models:
`--hf-overrides '{"architectures": ["DeepseekVLV2ForCausalLM"]}'`
<sup>
E
</sup>
Pre-computed embeddings can be inputted for this modality.
<sup>
+
</sup>
Multiple items can be inputted per text prompt for this modality.
:::{note}
To use DeepSeek-VL2 series models, you have to pass
`--hf_overrides '{"architectures": ["DeepseekVLV2ForCausalLM"]}'`
when running vLLM.
:::
:::{note}
H2O-VL series models will be available in V1 once we support backends other than FlashAttention.
:::
...
...
examples/offline_inference/vision_language.py
View file @
1bc3b5e7
...
...
@@ -105,7 +105,9 @@ def run_glm4v(question: str, modality: str):
max_num_seqs
=
2
,
trust_remote_code
=
True
,
enforce_eager
=
True
,
hf_overrides
=
{
"architectures"
:
[
"GLM4VForCausalLM"
]},
disable_mm_preprocessor_cache
=
args
.
disable_mm_preprocessor_cache
)
prompt
=
f
"<|user|>
\n
<|begin_of_image|><|endoftext|><|end_of_image|>
\
{
question
}
<|assistant|>"
...
...
@@ -495,6 +497,7 @@ def run_qwen_vl(question: str, modality: str):
trust_remote_code
=
True
,
max_model_len
=
1024
,
max_num_seqs
=
2
,
hf_overrides
=
{
"architectures"
:
[
"QwenVLForConditionalGeneration"
]},
disable_mm_preprocessor_cache
=
args
.
disable_mm_preprocessor_cache
,
)
...
...
examples/offline_inference/vision_language_multi_image.py
View file @
1bc3b5e7
...
...
@@ -77,7 +77,7 @@ def load_deepseek_vl2(question: str, image_urls: List[str]):
)
def
load_h2o
n
vl
(
question
:
str
,
image_urls
:
List
[
str
])
->
ModelRequestData
:
def
load_h2ovl
(
question
:
str
,
image_urls
:
List
[
str
])
->
ModelRequestData
:
model_name
=
"h2oai/h2ovl-mississippi-2b"
llm
=
LLM
(
...
...
@@ -302,6 +302,7 @@ def load_qwen_vl_chat(question: str,
trust_remote_code
=
True
,
max_model_len
=
1024
,
max_num_seqs
=
2
,
hf_overrides
=
{
"architectures"
:
[
"QwenVLForConditionalGeneration"
]},
limit_mm_per_prompt
=
{
"image"
:
len
(
image_urls
)},
)
placeholders
=
""
.
join
(
f
"Picture
{
i
}
: <img></img>
\n
"
...
...
@@ -452,7 +453,7 @@ def load_qwen2_5_vl(question, image_urls: List[str]) -> ModelRequestData:
model_example_map
=
{
"aria"
:
load_aria
,
"deepseek_vl_v2"
:
load_deepseek_vl2
,
"h2ovl_chat"
:
load_h2o
n
vl
,
"h2ovl_chat"
:
load_h2ovl
,
"idefics3"
:
load_idefics3
,
"internvl_chat"
:
load_internvl
,
"mllama"
:
load_mllama
,
...
...
tests/distributed/test_pipeline_parallel.py
View file @
1bc3b5e7
...
...
@@ -6,6 +6,7 @@ WARNING: This test runs in both single-node (4 GPUs) and multi-node
all workers in a node other than the head node, which can cause the test
to fail.
"""
import
json
import
os
from
dataclasses
import
dataclass
from
typing
import
List
,
Literal
,
NamedTuple
,
Optional
...
...
@@ -15,6 +16,7 @@ import pytest
from
vllm.config
import
TaskOption
from
vllm.logger
import
init_logger
from
..models.registry
import
HF_EXAMPLE_MODELS
from
..utils
import
compare_two_settings
,
fork_new_process_for_each_test
logger
=
init_logger
(
"test_pipeline_parallel"
)
...
...
@@ -31,10 +33,7 @@ class ParallelSetup(NamedTuple):
class
PPTestOptions
(
NamedTuple
):
multi_node_only
:
bool
trust_remote_code
:
bool
tokenizer_mode
:
Optional
[
str
]
load_format
:
Optional
[
str
]
=
None
hf_overrides
:
Optional
[
str
]
=
None
@
dataclass
...
...
@@ -64,10 +63,7 @@ class PPTestSettings:
pp_base
:
int
=
2
,
multi_node_only
:
bool
=
False
,
task
:
TaskOption
=
"auto"
,
trust_remote_code
:
bool
=
False
,
tokenizer_mode
:
Optional
[
str
]
=
None
,
load_format
:
Optional
[
str
]
=
None
,
hf_overrides
:
Optional
[
str
]
=
None
,
):
return
PPTestSettings
(
parallel_setups
=
[
...
...
@@ -97,10 +93,7 @@ class PPTestSettings:
vllm_major_versions
=
[
"0"
,
"0"
,
"1"
],
task
=
task
,
test_options
=
PPTestOptions
(
multi_node_only
=
multi_node_only
,
trust_remote_code
=
trust_remote_code
,
tokenizer_mode
=
tokenizer_mode
,
load_format
=
load_format
,
hf_overrides
=
hf_overrides
),
load_format
=
load_format
),
)
@
staticmethod
...
...
@@ -110,10 +103,7 @@ class PPTestSettings:
pp_base
:
int
=
2
,
task
:
TaskOption
=
"auto"
,
multi_node_only
:
bool
=
False
,
trust_remote_code
:
bool
=
False
,
tokenizer_mode
:
Optional
[
str
]
=
None
,
load_format
:
Optional
[
str
]
=
None
,
hf_overrides
:
Optional
[
str
]
=
None
,
):
return
PPTestSettings
(
parallel_setups
=
[
...
...
@@ -126,19 +116,16 @@ class PPTestSettings:
vllm_major_versions
=
[
"0"
],
task
=
task
,
test_options
=
PPTestOptions
(
multi_node_only
=
multi_node_only
,
trust_remote_code
=
trust_remote_code
,
tokenizer_mode
=
tokenizer_mode
,
load_format
=
load_format
,
hf_overrides
=
hf_overrides
),
load_format
=
load_format
),
)
def
iter_params
(
self
,
model_
name
:
str
):
def
iter_params
(
self
,
model_
id
:
str
):
opts
=
self
.
test_options
for
parallel_setup
in
self
.
parallel_setups
:
for
backend
,
vllm_major_version
in
zip
(
self
.
distributed_backends
,
self
.
vllm_major_versions
):
yield
(
model_
name
,
parallel_setup
,
backend
,
vllm_major_version
,
yield
(
model_
id
,
parallel_setup
,
backend
,
vllm_major_version
,
self
.
task
,
opts
)
...
...
@@ -150,16 +137,16 @@ TEXT_GENERATION_MODELS = {
# [Decoder-only]
# Uses Llama
# "BAAI/AquilaChat-7B": PPTestSettings.fast(),
"Snowflake/snowflake-arctic-instruct"
:
PPTestSettings
.
fast
(
tp_base
=
8
,
trust_remote_code
=
True
),
# noqa: E501
"baichuan-inc/Baichuan-7B"
:
PPTestSettings
.
fast
(
trust_remote_code
=
True
),
"baichuan-inc/Baichuan2-13B-Chat"
:
PPTestSettings
.
fast
(
trust_remote_code
=
True
),
# noqa: E501
"Snowflake/snowflake-arctic-instruct"
:
PPTestSettings
.
fast
(
load_format
=
"dummy"
),
# noqa: E501
"baichuan-inc/Baichuan-7B"
:
PPTestSettings
.
fast
(),
"baichuan-inc/Baichuan2-13B-Chat"
:
PPTestSettings
.
fast
(
),
"bigscience/bloomz-1b1"
:
PPTestSettings
.
fast
(),
"THUDM/chatglm3-6b"
:
PPTestSettings
.
fast
(
trust_remote_code
=
True
),
"CohereForAI/c4ai-command-r-v01"
:
PPTestSettings
.
fast
(
tp_base
=
2
,
trust_remote_code
=
True
),
# noqa: E501
"databricks/dbrx-instruct"
:
PPTestSettings
.
fast
(
tp_base
=
8
),
"Deci/DeciLM-7B-instruct"
:
PPTestSettings
.
fast
(
trust_remote_code
=
True
),
"THUDM/chatglm3-6b"
:
PPTestSettings
.
fast
(),
"CohereForAI/c4ai-command-r-v01"
:
PPTestSettings
.
fast
(
load_format
=
"dummy"
),
"databricks/dbrx-instruct"
:
PPTestSettings
.
fast
(
load_format
=
"dummy"
),
"Deci/DeciLM-7B-instruct"
:
PPTestSettings
.
fast
(),
"deepseek-ai/deepseek-llm-7b-chat"
:
PPTestSettings
.
fast
(),
"deepseek-ai/DeepSeek-V2-Lite-Chat"
:
PPTestSettings
.
fast
(
trust_remote_code
=
True
),
# noqa: E501
"deepseek-ai/DeepSeek-V2-Lite-Chat"
:
PPTestSettings
.
fast
(
),
"LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct"
:
PPTestSettings
.
fast
(),
"tiiuae/falcon-7b"
:
PPTestSettings
.
fast
(),
"google/gemma-2b"
:
PPTestSettings
.
fast
(),
...
...
@@ -172,36 +159,36 @@ TEXT_GENERATION_MODELS = {
"ibm/PowerMoE-3b"
:
PPTestSettings
.
fast
(),
# Uses Llama
# "internlm/internlm-chat-7b": PPTestSettings.fast(),
"internlm/internlm2-chat-7b"
:
PPTestSettings
.
fast
(
trust_remote_code
=
True
),
"internlm/internlm2-chat-7b"
:
PPTestSettings
.
fast
(),
"inceptionai/jais-13b-chat"
:
PPTestSettings
.
fast
(),
"ai21labs/Jamba-tiny-dev"
:
PPTestSettings
.
fast
(),
"meta-llama/Meta-Llama-3-8B"
:
PPTestSettings
.
detailed
(),
"openbmb/MiniCPM-2B-sft-bf16"
:
PPTestSettings
.
fast
(
trust_remote_code
=
True
),
"openbmb/MiniCPM3-4B"
:
PPTestSettings
.
fast
(
trust_remote_code
=
True
),
"openbmb/MiniCPM-2B-sft-bf16"
:
PPTestSettings
.
fast
(),
"openbmb/MiniCPM3-4B"
:
PPTestSettings
.
fast
(),
# Uses Llama
# "mistralai/Mistral-7B-Instruct-v0.1": PPTestSettings.fast(),
"state-spaces/mamba-130m-hf"
:
PPTestSettings
.
fast
(),
"mistralai/Mixtral-8x7B-Instruct-v0.1"
:
PPTestSettings
.
fast
(
tp_base
=
4
),
"mistralai/Mixtral-8x7B-Instruct-v0.1"
:
PPTestSettings
.
fast
(
load_format
=
"dummy"
),
# noqa: E501
"mosaicml/mpt-7b"
:
PPTestSettings
.
fast
(),
"nvidia/Minitron-8B-Base"
:
PPTestSettings
.
fast
(),
"allenai/OLMo-1B-hf"
:
PPTestSettings
.
fast
(),
"shanearora/OLMo-7B-1124-hf"
:
PPTestSettings
.
fast
(),
"allenai/OLMoE-1B-7B-0924-Instruct"
:
PPTestSettings
.
fast
(),
"facebook/opt-iml-max-1.3b"
:
PPTestSettings
.
fast
(),
"OrionStarAI/Orion-14B-Chat"
:
PPTestSettings
.
fast
(
trust_remote_code
=
True
),
"OrionStarAI/Orion-14B-Chat"
:
PPTestSettings
.
fast
(),
"adept/persimmon-8b-chat"
:
PPTestSettings
.
fast
(),
"microsoft/phi-2"
:
PPTestSettings
.
fast
(),
"microsoft/Phi-3-small-8k-instruct"
:
PPTestSettings
.
fast
(
trust_remote_code
=
True
),
# noqa: E501
"microsoft/Phi-3.5-MoE-instruct"
:
PPTestSettings
.
detailed
(
trust_remote_code
=
True
,
multi_node_only
=
True
,
load_format
=
"dummy"
,
hf_overrides
=
'{"num_hidden_layers": 4, "hidden_size": 512, "intermediate_size": 800, "num_attention_heads": 4, "num_key_value_heads": 1}'
),
# noqa: E501
"Qwen/Qwen-7B-Chat"
:
PPTestSettings
.
fast
(
trust_remote_code
=
True
),
"microsoft/Phi-3-small-8k-instruct"
:
PPTestSettings
.
fast
(
),
"microsoft/Phi-3.5-MoE-instruct"
:
PPTestSettings
.
detailed
(
multi_node_only
=
True
,
load_format
=
"dummy"
),
# noqa: E501
"Qwen/Qwen-7B-Chat"
:
PPTestSettings
.
fast
(),
"Qwen/Qwen2-7B-Instruct"
:
PPTestSettings
.
fast
(),
"Qwen/Qwen1.5-MoE-A2.7B-Chat"
:
PPTestSettings
.
fast
(),
"stabilityai/stablelm-3b-4e1t"
:
PPTestSettings
.
fast
(),
"bigcode/starcoder2-3b"
:
PPTestSettings
.
fast
(),
"upstage/solar-pro-preview-instruct"
:
PPTestSettings
.
fast
(
tp_base
=
2
),
"upstage/solar-pro-preview-instruct"
:
PPTestSettings
.
fast
(
load_format
=
"dummy"
),
# noqa: E501
# FIXME: Cannot load tokenizer in latest transformers version.
# Need to use tokenizer from `meta-llama/Llama-2-7b-chat-hf`
# "xverse/XVERSE-7B-Chat": PPTestSettings.fast(
trust_remote_code=True
),
# "xverse/XVERSE-7B-Chat": PPTestSettings.fast(),
# [Encoder-only]
# TODO: Implement PP
# "facebook/bart-base": PPTestSettings.fast(),
...
...
@@ -211,7 +198,7 @@ EMBEDDING_MODELS = { # type: ignore[var-annotated]
# [Text-only]
"intfloat/e5-mistral-7b-instruct"
:
PPTestSettings
.
fast
(),
"BAAI/bge-multilingual-gemma2"
:
PPTestSettings
.
fast
(),
"Qwen/Qwen2.5-Math-RM-72B"
:
PPTestSettings
.
fast
(
tp_base
=
4
,
trust_remote_code
=
True
),
# noqa: E501
"Qwen/Qwen2.5-Math-RM-72B"
:
PPTestSettings
.
fast
(
load_format
=
"dummy"
),
}
MULTIMODAL_MODELS
=
{
...
...
@@ -219,20 +206,20 @@ MULTIMODAL_MODELS = {
"Salesforce/blip2-opt-2.7b"
:
PPTestSettings
.
fast
(),
"facebook/chameleon-7b"
:
PPTestSettings
.
fast
(),
"adept/fuyu-8b"
:
PPTestSettings
.
fast
(),
"THUDM/glm-4v-9b"
:
PPTestSettings
.
fast
(
trust_remote_code
=
True
),
"OpenGVLab/InternVL2-1B"
:
PPTestSettings
.
fast
(
trust_remote_code
=
True
),
"THUDM/glm-4v-9b"
:
PPTestSettings
.
fast
(),
"OpenGVLab/InternVL2-1B"
:
PPTestSettings
.
fast
(),
"llava-hf/llava-1.5-7b-hf"
:
PPTestSettings
.
fast
(),
"llava-hf/llava-v1.6-mistral-7b-hf"
:
PPTestSettings
.
fast
(),
"llava-hf/LLaVA-NeXT-Video-7B-hf"
:
PPTestSettings
.
fast
(),
"llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
:
PPTestSettings
.
fast
(),
"openbmb/MiniCPM-Llama3-V-2_5"
:
PPTestSettings
.
fast
(
trust_remote_code
=
True
),
"allenai/Molmo-7B-D-0924"
:
PPTestSettings
.
fast
(
trust_remote_code
=
True
),
"microsoft/Phi-3-vision-128k-instruct"
:
PPTestSettings
.
fast
(
trust_remote_code
=
True
),
# noqa: E501
"mistralai/Pixtral-12B-2409"
:
PPTestSettings
.
fast
(
tp_base
=
2
,
tokenizer_mode
=
"mistral"
),
# noqa: E501
"Qwen/Qwen-VL-Chat"
:
PPTestSettings
.
fast
(
trust_remote_code
=
True
),
"openbmb/MiniCPM-Llama3-V-2_5"
:
PPTestSettings
.
fast
(),
"allenai/Molmo-7B-D-0924"
:
PPTestSettings
.
fast
(),
"microsoft/Phi-3-vision-128k-instruct"
:
PPTestSettings
.
fast
(
),
"mistralai/Pixtral-12B-2409"
:
PPTestSettings
.
fast
(
load_format
=
"dummy"
),
"Qwen/Qwen-VL-Chat"
:
PPTestSettings
.
fast
(),
"Qwen/Qwen2-Audio-7B-Instruct"
:
PPTestSettings
.
fast
(),
"Qwen/Qwen2-VL-2B-Instruct"
:
PPTestSettings
.
fast
(),
"fixie-ai/ultravox-v0_5-llama-3_2-1b"
:
PPTestSettings
.
fast
(
trust_remote_code
=
True
),
# noqa: E501
"fixie-ai/ultravox-v0_5-llama-3_2-1b"
:
PPTestSettings
.
fast
(
),
# [Encoder-decoder]
# TODO: Implement PP
# "meta-llama/Llama-3.2-11B-Vision-Instruct": PPTestSettings.fast(),
...
...
@@ -258,7 +245,7 @@ TEST_MODELS = [
def
_compare_tp
(
model_
name
:
str
,
model_
id
:
str
,
parallel_setup
:
ParallelSetup
,
distributed_backend
:
str
,
vllm_major_version
:
str
,
...
...
@@ -267,6 +254,7 @@ def _compare_tp(
num_gpus_available
:
int
,
*
,
method
:
Literal
[
"generate"
,
"encode"
],
is_multimodal
:
bool
,
):
(
tp_size
,
...
...
@@ -274,13 +262,32 @@ def _compare_tp(
eager_mode
,
chunked_prefill
,
)
=
parallel_setup
(
multi_node_only
,
trust_remote_code
,
tokenizer_mode
,
load_format
,
hf_overrides
,
)
=
test_options
multi_node_only
,
load_format
=
test_options
model_info
=
HF_EXAMPLE_MODELS
.
find_hf_info
(
model_id
)
model_info
.
check_transformers_version
(
on_fail
=
"skip"
)
trust_remote_code
=
model_info
.
trust_remote_code
tokenizer_mode
=
model_info
.
tokenizer_mode
hf_overrides
=
model_info
.
hf_overrides
if
load_format
==
"dummy"
:
# Avoid OOM
text_overrides
=
{
"num_layers"
:
1
,
"num_hidden_layers"
:
1
,
"num_experts"
:
2
,
"num_experts_per_tok"
:
2
,
"num_local_experts"
:
2
,
}
if
is_multimodal
:
hf_overrides
.
update
({
"text_config"
:
text_overrides
})
else
:
hf_overrides
.
update
(
text_overrides
)
else
:
model_info
.
check_available_online
(
on_fail
=
"skip"
)
if
num_gpus_available
<
tp_size
*
pp_size
:
pytest
.
skip
(
f
"Need at least
{
tp_size
}
x
{
pp_size
}
GPUs"
)
...
...
@@ -312,7 +319,7 @@ def _compare_tp(
if
load_format
:
common_args
.
extend
([
"--load-format"
,
load_format
])
if
hf_overrides
:
common_args
.
extend
([
"--hf-overrides"
,
hf_overrides
])
common_args
.
extend
([
"--hf-overrides"
,
json
.
dumps
(
hf_overrides
)
])
specific_case
=
tp_size
==
2
and
pp_size
==
2
and
chunked_prefill
if
distributed_backend
==
"ray"
and
(
vllm_major_version
==
"1"
...
...
@@ -355,11 +362,7 @@ def _compare_tp(
]
try
:
compare_two_settings
(
model_name
,
pp_args
,
tp_args
,
pp_env
,
method
=
method
)
compare_two_settings
(
model_id
,
pp_args
,
tp_args
,
pp_env
,
method
=
method
)
except
Exception
:
if
pp_env
is
None
:
raise
...
...
@@ -369,17 +372,16 @@ def _compare_tp(
@
pytest
.
mark
.
parametrize
(
(
"model_
name
"
,
"parallel_setup"
,
"distributed_backend"
,
"vllm_major_version"
,
"task"
,
"test_options"
),
(
"model_
id
"
,
"parallel_setup"
,
"distributed_backend"
,
"vllm_major_version"
,
"task"
,
"test_options"
),
[
params
for
model_name
,
settings
in
TEXT_GENERATION_MODELS
.
items
()
for
params
in
settings
.
iter_params
(
model_name
)
if
model_name
in
TEST_MODELS
params
for
model_id
,
settings
in
TEXT_GENERATION_MODELS
.
items
()
for
params
in
settings
.
iter_params
(
model_id
)
if
model_id
in
TEST_MODELS
],
)
@
fork_new_process_for_each_test
def
test_tp_language_generation
(
model_
name
:
str
,
model_
id
:
str
,
parallel_setup
:
ParallelSetup
,
distributed_backend
:
str
,
vllm_major_version
:
str
,
...
...
@@ -387,28 +389,28 @@ def test_tp_language_generation(
test_options
:
PPTestOptions
,
num_gpus_available
,
):
_compare_tp
(
model_
name
,
_compare_tp
(
model_
id
,
parallel_setup
,
distributed_backend
,
vllm_major_version
,
task
,
test_options
,
num_gpus_available
,
method
=
"generate"
)
method
=
"generate"
,
is_multimodal
=
False
)
@
pytest
.
mark
.
parametrize
(
(
"model_
name
"
,
"parallel_setup"
,
"distributed_backend"
,
"vllm_major_version"
,
"task"
,
"test_options"
),
(
"model_
id
"
,
"parallel_setup"
,
"distributed_backend"
,
"vllm_major_version"
,
"task"
,
"test_options"
),
[
params
for
model_name
,
settings
in
EMBEDDING_MODELS
.
items
()
for
params
in
settings
.
iter_params
(
model_name
)
if
model_name
in
TEST_MODELS
params
for
model_id
,
settings
in
EMBEDDING_MODELS
.
items
()
for
params
in
settings
.
iter_params
(
model_id
)
if
model_id
in
TEST_MODELS
],
)
@
fork_new_process_for_each_test
def
test_tp_language_embedding
(
model_
name
:
str
,
model_
id
:
str
,
parallel_setup
:
ParallelSetup
,
distributed_backend
:
str
,
vllm_major_version
:
str
,
...
...
@@ -416,28 +418,28 @@ def test_tp_language_embedding(
test_options
:
PPTestOptions
,
num_gpus_available
,
):
_compare_tp
(
model_
name
,
_compare_tp
(
model_
id
,
parallel_setup
,
distributed_backend
,
vllm_major_version
,
task
,
test_options
,
num_gpus_available
,
method
=
"encode"
)
method
=
"encode"
,
is_multimodal
=
False
)
@
pytest
.
mark
.
parametrize
(
(
"model_
name
"
,
"parallel_setup"
,
"distributed_backend"
,
"vllm_major_version"
,
"task"
,
"test_options"
),
(
"model_
id
"
,
"parallel_setup"
,
"distributed_backend"
,
"vllm_major_version"
,
"task"
,
"test_options"
),
[
params
for
model_name
,
settings
in
MULTIMODAL_MODELS
.
items
()
for
params
in
settings
.
iter_params
(
model_name
)
if
model_name
in
TEST_MODELS
params
for
model_id
,
settings
in
MULTIMODAL_MODELS
.
items
()
for
params
in
settings
.
iter_params
(
model_id
)
if
model_id
in
TEST_MODELS
],
)
@
fork_new_process_for_each_test
def
test_tp_multimodal_generation
(
model_
name
:
str
,
model_
id
:
str
,
parallel_setup
:
ParallelSetup
,
distributed_backend
:
str
,
vllm_major_version
:
str
,
...
...
@@ -445,11 +447,12 @@ def test_tp_multimodal_generation(
test_options
:
PPTestOptions
,
num_gpus_available
,
):
_compare_tp
(
model_
name
,
_compare_tp
(
model_
id
,
parallel_setup
,
distributed_backend
,
vllm_major_version
,
task
,
test_options
,
num_gpus_available
,
method
=
"generate"
)
method
=
"generate"
,
is_multimodal
=
True
)
tests/models/decoder_only/vision_language/test_models.py
View file @
1bc3b5e7
...
...
@@ -155,10 +155,7 @@ VLM_TEST_SETTINGS = {
auto_cls
=
AutoModelForVision2Seq
,
vllm_output_post_proc
=
model_utils
.
qwen2_vllm_to_hf_output
,
image_size_factors
=
[(),
(
0.25
,),
(
0.25
,
0.25
,
0.25
),
(
0.25
,
0.2
,
0.15
)],
marks
=
[
pytest
.
mark
.
skipif
(
TRANSFORMERS_VERSION
<
"4.49.0"
,
reason
=
"HF model requires transformers>=4.49.0"
,
),
pytest
.
mark
.
core_model
,
pytest
.
mark
.
cpu_model
],
marks
=
[
pytest
.
mark
.
core_model
,
pytest
.
mark
.
cpu_model
],
),
#### Extended model tests
"aria"
:
VLMTestInfo
(
...
...
@@ -215,7 +212,6 @@ VLM_TEST_SETTINGS = {
"cherry_blossom"
:
"<image>
\n
Please infer the season with reason in details."
,
# noqa: E501
}),
multi_image_prompt
=
"image_1:<image>
\n
image_2:<image>
\n
Which image can we see the car and the tower?"
,
# noqa: E501
vllm_runner_kwargs
=
{
"hf_overrides"
:
{
"architectures"
:
[
"DeepseekVLV2ForCausalLM"
]}},
# noqa: E501
patch_hf_runner
=
model_utils
.
deepseekvl2_patch_hf_runner
,
postprocess_inputs
=
model_utils
.
cast_dtype_post_processor
(
"images"
),
hf_output_post_proc
=
model_utils
.
deepseekvl2_trunc_hf_output
,
...
...
@@ -240,7 +236,7 @@ VLM_TEST_SETTINGS = {
num_logprobs
=
10
,
image_size_factors
=
[(),
(
0.25
,),
(
0.25
,
0.25
,
0.25
),
(
0.25
,
0.2
,
0.15
)],
),
"glm4"
:
VLMTestInfo
(
"glm4
v
"
:
VLMTestInfo
(
models
=
[
"THUDM/glm-4v-9b"
],
test_type
=
VLMTestType
.
IMAGE
,
prompt_formatter
=
identity
,
...
...
@@ -351,7 +347,6 @@ VLM_TEST_SETTINGS = {
postprocess_inputs
=
model_utils
.
cast_dtype_post_processor
(
"pixel_values"
),
vllm_runner_kwargs
=
{
"hf_overrides"
:
{
"architectures"
:
[
"MantisForConditionalGeneration"
]}},
# noqa: E501
get_stop_token_ids
=
lambda
tok
:
[
128009
],
auto_cls
=
AutoModelForVision2Seq
,
vllm_output_post_proc
=
model_utils
.
mantis_vllm_to_hf_output
,
...
...
@@ -437,7 +432,7 @@ VLM_TEST_SETTINGS = {
auto_cls
=
AutoModelForVision2Seq
,
marks
=
[
large_gpu_mark
(
min_gb
=
48
)],
),
"qwen"
:
VLMTestInfo
(
"qwen
_vl
"
:
VLMTestInfo
(
models
=
[
"Qwen/Qwen-VL"
],
test_type
=
(
VLMTestType
.
IMAGE
,
VLMTestType
.
MULTI_IMAGE
),
prompt_formatter
=
identity
,
...
...
tests/models/decoder_only/vision_language/vlm_utils/core.py
View file @
1bc3b5e7
...
...
@@ -4,12 +4,14 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union
import
torch
from
PIL.Image
import
Image
from
transformers
import
AutoTokenizer
,
BatchEncoding
,
PreTrainedTokenizerBase
from
transformers
import
BatchEncoding
from
transformers.models.auto.auto_factory
import
_BaseAutoModelClass
from
vllm.config
import
TaskOption
from
vllm.transformers_utils.tokenizer
import
AnyTokenizer
from
.....conftest
import
HfRunner
,
VllmRunner
from
....registry
import
HF_EXAMPLE_MODELS
from
.types
import
RunnerOutput
...
...
@@ -31,10 +33,8 @@ def run_test(
use_tokenizer_eos
:
bool
,
postprocess_inputs
:
Callable
[[
BatchEncoding
],
BatchEncoding
],
comparator
:
Callable
[...,
None
],
get_stop_token_ids
:
Optional
[
Callable
[[
PreTrainedTokenizerBase
],
List
[
int
]]],
get_stop_token_ids
:
Optional
[
Callable
[[
AnyTokenizer
],
list
[
int
]]],
stop_str
:
Optional
[
List
[
str
]],
tokenizer_mode
:
str
,
limit_mm_per_prompt
:
Dict
[
str
,
int
],
vllm_runner_kwargs
:
Optional
[
Dict
[
str
,
Any
]],
hf_model_kwargs
:
Optional
[
Dict
[
str
,
Any
]],
...
...
@@ -48,7 +48,10 @@ def run_test(
"""Modality agnostic test test executor for comparing HF/vLLM outputs."""
# In the case of embeddings, vLLM takes separate input tensors
vllm_inputs
=
vllm_embeddings
if
vllm_embeddings
is
not
None
else
inputs
tokenizer
=
AutoTokenizer
.
from_pretrained
(
model
,
trust_remote_code
=
True
)
model_info
=
HF_EXAMPLE_MODELS
.
find_hf_info
(
model
)
model_info
.
check_available_online
(
on_fail
=
"skip"
)
model_info
.
check_transformers_version
(
on_fail
=
"skip"
)
vllm_outputs_per_mm
=
[]
hf_outputs_per_mm
=
[]
...
...
@@ -57,17 +60,19 @@ def run_test(
# vLLM needs a fresh new process without cuda initialization.
# if we run HF first, the cuda initialization will be done and it
# will hurt multiprocessing backend with fork method (the default method).
vllm_kwargs
:
Dict
[
str
,
Any
]
=
{}
if
get_stop_token_ids
is
not
None
:
vllm_kwargs
[
"stop_token_ids"
]
=
get_stop_token_ids
(
tokenizer
)
if
stop_str
:
vllm_kwargs
[
"stop"
]
=
stop_str
if
vllm_runner_kwargs
is
None
:
vllm_runner_kwargs
=
{}
vllm_runner_kwargs_
:
Dict
[
str
,
Any
]
=
{}
if
model_info
.
tokenizer
:
vllm_runner_kwargs_
[
"tokenizer"
]
=
model_info
.
tokenizer
if
model_info
.
tokenizer_mode
:
vllm_runner_kwargs_
[
"tokenizer_mode"
]
=
model_info
.
tokenizer_mode
if
model_info
.
hf_overrides
:
vllm_runner_kwargs_
[
"hf_overrides"
]
=
model_info
.
hf_overrides
if
vllm_runner_kwargs
:
vllm_runner_kwargs_
.
update
(
vllm_runner_kwargs
)
with
vllm_runner
(
model
,
tokenizer_mode
=
tokenizer_mode
,
max_model_len
=
max_model_len
,
max_num_seqs
=
max_num_seqs
,
dtype
=
dtype
,
...
...
@@ -76,7 +81,15 @@ def run_test(
distributed_executor_backend
=
distributed_executor_backend
,
enforce_eager
=
enforce_eager
,
task
=
task
,
**
vllm_runner_kwargs
)
as
vllm_model
:
**
vllm_runner_kwargs_
)
as
vllm_model
:
tokenizer
=
vllm_model
.
model
.
get_tokenizer
()
vllm_kwargs
:
Dict
[
str
,
Any
]
=
{}
if
get_stop_token_ids
is
not
None
:
vllm_kwargs
[
"stop_token_ids"
]
=
get_stop_token_ids
(
tokenizer
)
if
stop_str
:
vllm_kwargs
[
"stop"
]
=
stop_str
for
prompts
,
media
in
vllm_inputs
:
vllm_kwargs
[
runner_mm_key
]
=
media
vllm_output
=
vllm_model
.
generate_greedy_logprobs
(
...
...
@@ -93,16 +106,19 @@ def run_test(
if
patch_hf_runner
is
not
None
:
hf_model
=
patch_hf_runner
(
hf_model
)
# Some models need to explicitly pass the eos_token_id off the tokenizer or
# processor for a good comparison; currently assume processor/tokenizer
# agree on the EOS, and pull it off the tokenizer if requested.
with
hf_model
,
torch
.
no_grad
():
tokenizer
=
hf_model
.
tokenizer
# Some models need to explicitly pass the eos_token_id off the tokenizer
# or processor for a good comparison;
# currently assume processor/tokenizer agree on the EOS, and pull it off
# the tokenizer if requested.
hf_kwargs
=
{}
if
use_tokenizer_eos
:
hf_kwargs
[
"eos_token_id"
]
=
tokenizer
.
eos_token_id
if
stop_str
:
hf_kwargs
[
"stop_strings"
]
=
stop_str
with
hf_model
,
torch
.
no_grad
():
for
prompts
,
media
in
inputs
:
hf_kwargs
[
runner_mm_key
]
=
media
hf_output
=
hf_model
.
generate_greedy_logprobs_limit
(
...
...
tests/models/decoder_only/vision_language/vlm_utils/types.py
View file @
1bc3b5e7
...
...
@@ -8,12 +8,12 @@ from typing import (Any, Callable, Dict, Iterable, List, NamedTuple, Optional,
import
torch
from
PIL.Image
import
Image
from
pytest
import
MarkDecorator
from
transformers
import
(
AutoModelForCausalLM
,
BatchEncoding
,
PreTrainedTokenizerBase
)
from
transformers
import
AutoModelForCausalLM
,
BatchEncoding
from
transformers.models.auto.auto_factory
import
_BaseAutoModelClass
from
vllm.config
import
TaskOption
from
vllm.sequence
import
SampleLogprobs
from
vllm.transformers_utils.tokenizer
import
AnyTokenizer
from
vllm.utils
import
identity
from
.....conftest
import
IMAGE_ASSETS
,
HfRunner
,
ImageAsset
,
_ImageAssets
...
...
@@ -100,8 +100,7 @@ class VLMTestInfo(NamedTuple):
vllm_runner_kwargs
:
Optional
[
Dict
[
str
,
Any
]]
=
None
# Optional callable which gets a list of token IDs from the model tokenizer
get_stop_token_ids
:
Optional
[
Callable
[[
PreTrainedTokenizerBase
],
List
[
int
]]]
=
None
get_stop_token_ids
:
Optional
[
Callable
[[
AnyTokenizer
],
list
[
int
]]]
=
None
# Optional list of strings to stop generation, useful when stop tokens are
# not special tokens in the tokenizer
stop_str
:
Optional
[
List
[
str
]]
=
None
...
...
@@ -156,8 +155,6 @@ class VLMTestInfo(NamedTuple):
marks
:
Optional
[
List
[
MarkDecorator
]]
=
None
tokenizer_mode
:
str
=
"auto"
def
get_non_parametrized_runner_kwargs
(
self
):
"""Returns a dictionary of expandable kwargs for items that are used
in all test types, which are NOT used when creating the parametrized
...
...
@@ -180,7 +177,6 @@ class VLMTestInfo(NamedTuple):
"hf_model_kwargs"
:
self
.
hf_model_kwargs
,
"stop_str"
:
self
.
stop_str
,
"patch_hf_runner"
:
self
.
patch_hf_runner
,
"tokenizer_mode"
:
self
.
tokenizer_mode
}
...
...
tests/models/registry.py
View file @
1bc3b5e7
...
...
@@ -104,7 +104,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
trust_remote_code
=
True
),
"BambaForCausalLM"
:
_HfExamplesInfo
(
"ibm-ai-platform/Bamba-9B"
),
"BloomForCausalLM"
:
_HfExamplesInfo
(
"bigscience/bloomz-1b1"
),
# ChatGLMModel supports multimodal
"ChatGLMModel"
:
_HfExamplesInfo
(
"THUDM/chatglm3-6b"
,
trust_remote_code
=
True
),
"CohereForCausalLM"
:
_HfExamplesInfo
(
"CohereForAI/c4ai-command-r-v01"
,
trust_remote_code
=
True
),
"Cohere2ForCausalLM"
:
_HfExamplesInfo
(
"CohereForAI/c4ai-command-r7b-12-2024"
,
# noqa: E501
...
...
@@ -138,7 +139,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
"InternLM3ForCausalLM"
:
_HfExamplesInfo
(
"internlm/internlm3-8b-instruct"
,
trust_remote_code
=
True
),
"JAISLMHeadModel"
:
_HfExamplesInfo
(
"inceptionai/jais-13b-chat"
),
"JambaForCausalLM"
:
_HfExamplesInfo
(
"ai21labs/AI21-Jamba-1.5-Mini"
),
"JambaForCausalLM"
:
_HfExamplesInfo
(
"ai21labs/AI21-Jamba-1.5-Mini"
,
extras
=
{
"tiny"
:
"ai21labs/Jamba-tiny-dev"
}),
# noqa: E501
"LlamaForCausalLM"
:
_HfExamplesInfo
(
"meta-llama/Meta-Llama-3-8B"
),
"LLaMAForCausalLM"
:
_HfExamplesInfo
(
"decapoda-research/llama-7b-hf"
,
is_available_online
=
False
),
...
...
@@ -167,7 +169,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
trust_remote_code
=
True
),
"PhiMoEForCausalLM"
:
_HfExamplesInfo
(
"microsoft/Phi-3.5-MoE-instruct"
,
trust_remote_code
=
True
),
# QWenLMHeadModel supports multimodal
"QWenLMHeadModel"
:
_HfExamplesInfo
(
"Qwen/Qwen-7B-Chat"
,
trust_remote_code
=
True
),
"Qwen2ForCausalLM"
:
_HfExamplesInfo
(
"Qwen/Qwen2-7B-Instruct"
),
"Qwen2MoeForCausalLM"
:
_HfExamplesInfo
(
"Qwen/Qwen1.5-MoE-A2.7B-Chat"
),
"RWForCausalLM"
:
_HfExamplesInfo
(
"tiiuae/falcon-40b"
,
...
...
@@ -232,18 +235,19 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"AriaForConditionalGeneration"
:
_HfExamplesInfo
(
"rhymes-ai/Aria"
),
"Blip2ForConditionalGeneration"
:
_HfExamplesInfo
(
"Salesforce/blip2-opt-2.7b"
),
# noqa: E501
"ChameleonForConditionalGeneration"
:
_HfExamplesInfo
(
"facebook/chameleon-7b"
),
# noqa: E501
"ChatGLMModel"
:
_HfExamplesInfo
(
"THUDM/glm-4v-9b"
,
extras
=
{
"text_only"
:
"THUDM/chatglm3-6b"
},
trust_remote_code
=
True
),
"ChatGLMForConditionalGeneration"
:
_HfExamplesInfo
(
"chatglm2-6b"
,
is_available_online
=
False
),
"DeepseekVLV2ForCausalLM"
:
_HfExamplesInfo
(
"deepseek-ai/deepseek-vl2-tiny"
,
# noqa: E501
hf_overrides
=
{
"architectures"
:
[
"DeepseekVLV2ForCausalLM"
]}),
# noqa: E501
"FuyuForCausalLM"
:
_HfExamplesInfo
(
"adept/fuyu-8b"
),
"H2OVLChatModel"
:
_HfExamplesInfo
(
"h2oai/h2ovl-mississippi-800m"
),
"GLM4VForCausalLM"
:
_HfExamplesInfo
(
"THUDM/glm-4v-9b"
,
trust_remote_code
=
True
,
hf_overrides
=
{
"architectures"
:
[
"GLM4VForCausalLM"
]}),
# noqa: E501
"H2OVLChatModel"
:
_HfExamplesInfo
(
"h2oai/h2ovl-mississippi-800m"
,
extras
=
{
"2b"
:
"h2oai/h2ovl-mississippi-2b"
}),
# noqa: E501
"InternVLChatModel"
:
_HfExamplesInfo
(
"OpenGVLab/InternVL2-1B"
,
extras
=
{
"2B"
:
"OpenGVLab/InternVL2-2B"
},
# noqa: E501
trust_remote_code
=
True
),
"Idefics3ForConditionalGeneration"
:
_HfExamplesInfo
(
"HuggingFaceM4/Idefics3-8B-Llama3"
),
# noqa: E501
"Idefics3ForConditionalGeneration"
:
_HfExamplesInfo
(
"HuggingFaceM4/Idefics3-8B-Llama3"
,
# noqa: E501
{
"tiny"
:
"HuggingFaceTB/SmolVLM-256M-Instruct"
}),
# noqa: E501
"LlavaForConditionalGeneration"
:
_HfExamplesInfo
(
"llava-hf/llava-1.5-7b-hf"
,
extras
=
{
"mistral"
:
"mistral-community/pixtral-12b"
}),
# noqa: E501
"LlavaNextForConditionalGeneration"
:
_HfExamplesInfo
(
"llava-hf/llava-v1.6-mistral-7b-hf"
),
# noqa: E501
...
...
@@ -253,21 +257,24 @@ _MULTIMODAL_EXAMPLE_MODELS = {
hf_overrides
=
{
"architectures"
:
[
"MantisForConditionalGeneration"
]}),
# noqa: E501
"MiniCPMO"
:
_HfExamplesInfo
(
"openbmb/MiniCPM-o-2_6"
,
trust_remote_code
=
True
),
"MiniCPMV"
:
_HfExamplesInfo
(
"openbmb/MiniCPM-V-2_6"
,
"MiniCPMV"
:
_HfExamplesInfo
(
"openbmb/MiniCPM-Llama3-V-2_5"
,
extras
=
{
"2.6"
:
"openbmb/MiniCPM-V-2_6"
},
# noqa: E501
trust_remote_code
=
True
),
"MolmoForCausalLM"
:
_HfExamplesInfo
(
"allenai/Molmo-7B-D-0924"
,
extras
=
{
"olmo"
:
"allenai/Molmo-7B-O-0924"
},
# noqa: E501
trust_remote_code
=
True
),
"NVLM_D"
:
_HfExamplesInfo
(
"nvidia/NVLM-D-72B"
,
trust_remote_code
=
True
),
"PaliGemmaForConditionalGeneration"
:
_HfExamplesInfo
(
"google/paligemma-3b-pt-224"
),
# noqa: E501
"PaliGemmaForConditionalGeneration"
:
_HfExamplesInfo
(
"google/paligemma-3b-mix-224"
,
# noqa: E501
extras
=
{
"v2"
:
"google/paligemma2-3b-ft-docci-448"
}),
# noqa: E501
"Phi3VForCausalLM"
:
_HfExamplesInfo
(
"microsoft/Phi-3-vision-128k-instruct"
,
trust_remote_code
=
True
),
"PixtralForConditionalGeneration"
:
_HfExamplesInfo
(
"mistralai/Pixtral-12B-2409"
,
# noqa: E501
tokenizer_mode
=
"mistral"
),
"QWenLMHeadModel"
:
_HfExamplesInfo
(
"Qwen/Qwen-VL-Chat"
,
extras
=
{
"text_only"
:
"Qwen/Qwen-7B-Chat"
},
# noqa: E501
trust_remote_code
=
True
),
"QwenVLForConditionalGeneration"
:
_HfExamplesInfo
(
"Qwen/Qwen-VL"
,
extras
=
{
"chat"
:
"Qwen/Qwen-VL-Chat"
},
# noqa: E501
trust_remote_code
=
True
,
hf_overrides
=
{
"architectures"
:
[
"QwenVLForConditionalGeneration"
]}),
# noqa: E501
"Qwen2AudioForConditionalGeneration"
:
_HfExamplesInfo
(
"Qwen/Qwen2-Audio-7B-Instruct"
),
# noqa: E501
"Qwen2VLForConditionalGeneration"
:
_HfExamplesInfo
(
"Qwen/Qwen2-VL-2B-Instruct"
),
# noqa: E501
"Qwen2_5_VLForConditionalGeneration"
:
_HfExamplesInfo
(
"Qwen/Qwen2.5-VL-3B-Instruct"
,
# noqa: E501
...
...
tests/models/test_initialization.py
View file @
1bc3b5e7
...
...
@@ -18,8 +18,7 @@ def test_can_initialize(model_arch):
# Avoid OOM
def
hf_overrides
(
hf_config
:
PretrainedConfig
)
->
PretrainedConfig
:
if
hf_config
.
model_type
==
"deepseek_vl_v2"
:
hf_config
.
update
({
"architectures"
:
[
"DeepseekVLV2ForCausalLM"
]})
hf_config
.
update
(
model_info
.
hf_overrides
)
if
hasattr
(
hf_config
,
"text_config"
):
text_config
:
PretrainedConfig
=
hf_config
.
text_config
...
...
vllm/model_executor/models/chatglm.py
View file @
1bc3b5e7
# SPDX-License-Identifier: Apache-2.0
# Adapted from
# https://github.com/THUDM/CogAgent
"""Inference-only CogAgent model compatible with THUDM weights."""
from
argparse
import
Namespace
from
typing
import
(
Iterable
,
List
,
Mapping
,
Optional
,
Set
,
Tuple
,
TypedDict
,
Union
)
# https://github.com/THUDM/ChatGLM2-6B
"""Inference-only ChatGLM model compatible with THUDM weights."""
from
typing
import
Iterable
,
List
,
Optional
,
Set
,
Tuple
,
Union
import
torch
from
torch
import
nn
from
torch.nn
import
LayerNorm
from
torchvision
import
transforms
from
torchvision.transforms
import
InterpolationMode
from
transformers
import
PreTrainedTokenizer
,
TensorType
from
transformers.image_utils
import
ImageInput
from
transformers.tokenization_utils_base
import
TextInput
from
vllm.attention
import
Attention
,
AttentionMetadata
from
vllm.config
import
CacheConfig
,
VllmConfig
...
...
@@ -31,204 +23,14 @@ from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
ParallelLMHead
,
VocabParallelEmbedding
)
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.model_executor.models.glm4_vision_encoder
import
EVA2CLIPModel
from
vllm.model_executor.models.module_mapping
import
MultiModelKeys
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.inputs
import
MultiModalKwargs
,
NestedTensors
from
vllm.multimodal.parse
import
MultiModalDataItems
from
vllm.multimodal.processing
import
(
BaseMultiModalProcessor
,
BaseProcessingInfo
,
BatchFeature
,
MultiModalFieldConfig
,
PromptReplacement
)
from
vllm.multimodal.profiling
import
BaseDummyInputsBuilder
,
ProcessorInputs
from
vllm.sequence
import
IntermediateTensors
from
vllm.transformers_utils.configs
import
ChatGLMConfig
from
.interfaces
import
SupportsLoRA
,
SupportsMultiModal
,
SupportsPP
from
.interfaces
import
SupportsLoRA
,
SupportsPP
from
.utils
import
(
AutoWeightsLoader
,
WeightsMapper
,
is_pp_missing_parameter
,
make_empty_intermediate_tensors_factory
,
make_layers
,
maybe_prefix
,
merge_multimodal_embeddings
)
class GLMImagePixelInputs(TypedDict):
    """Typed dictionary wrapping the image tensor passed to the model."""

    # Shape: `(batch_size, num_channels, height, width)`
    pixel_values: torch.Tensor
class GLM4VProcessor:
    """
    This model doesn't define its own HF processor,
    so we implement our own one here.

    Text is passed to the supplied tokenizer. Images are resized,
    converted to tensors and normalized, but only when the config carries
    a truthy ``vision_config``; otherwise no image pipeline is built.
    """

    def __init__(
        self,
        config: ChatGLMConfig,
        tokenizer: PreTrainedTokenizer,
    ) -> None:
        super().__init__()

        self.config = config
        self.tokenizer = tokenizer

        vision_config = getattr(config, "vision_config", None)
        if not vision_config:
            # No vision settings: image inputs are rejected in __call__.
            self.image_transform = None
            return

        side = vision_config["image_size"]
        resize = transforms.Resize(
            (side, side),
            interpolation=InterpolationMode.BICUBIC,
        )
        to_tensor = transforms.ToTensor()
        normalize = transforms.Normalize(
            mean=(0.48145466, 0.4578275, 0.40821073),
            std=(0.26862954, 0.26130258, 0.27577711),
        )
        self.image_transform = transforms.Compose(
            [resize, to_tensor, normalize])

    def __call__(
        self,
        text: Optional[Union[TextInput, list[TextInput]]] = None,
        images: Optional[Union[ImageInput, list[ImageInput]]] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
    ) -> BatchFeature:
        """Tokenize ``text`` and preprocess ``images`` into one feature set.

        Raises:
            ValueError: if images are given but no image transform exists.
        """
        # Normalize both arguments to lists (None becomes an empty list).
        if text is None:
            prompts = []
        elif isinstance(text, list):
            prompts = text
        else:
            prompts = [text]

        if images is None:
            image_list = []
        elif isinstance(images, list):
            image_list = images
        else:
            image_list = [images]

        # The tokenizer runs first, before any image validation.
        features = {**self.tokenizer(prompts)}

        if image_list:
            if self.image_transform is None:
                raise ValueError("This model does not support image inputs")

            transformed = [self.image_transform(img) for img in image_list]
            features["pixel_values"] = torch.stack(transformed)

        return BatchFeature(features, tensor_type=return_tensors)
class GLM4VProcessingInfo(BaseProcessingInfo):
    """Processing metadata (tokenizer, config, token counts) for GLM-4V."""

    def get_tokenizer(self):
        """Return the context tokenizer, asserting its type."""
        tok = self.ctx.tokenizer
        assert isinstance(tok, PreTrainedTokenizer)
        return tok

    def get_hf_config(self):
        """Return the HF config, requested as a ``ChatGLMConfig``."""
        return self.ctx.get_hf_config(ChatGLMConfig)

    def get_hf_processor(self) -> GLM4VProcessor:
        """Build the custom processor from the config and tokenizer."""
        config = self.get_hf_config()
        tokenizer = self.get_tokenizer()
        return GLM4VProcessor(config, tokenizer)

    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
        """Report the per-modality item limit (one image)."""
        return {"image": 1}

    def get_mm_max_tokens_per_item(
        self,
        seq_len: int,
        mm_counts: Mapping[str, int],
    ) -> Mapping[str, int]:
        """Report the token count of one image, boi/eoi included."""
        max_image_tokens = self.get_num_image_feature_tokens()
        return {"image": max_image_tokens}

    def get_num_image_tokens(self) -> int:
        """Return the placeholder-token count per image (0 without vision)."""
        vision_config = getattr(self.get_hf_config(), "vision_config", None)
        if not vision_config:
            return 0

        image_size = vision_config["image_size"]
        patch_size = vision_config["patch_size"]
        # Patches per side, then halved, as a square grid.
        tokens_per_side = image_size // patch_size // 2
        return tokens_per_side * tokens_per_side

    def get_num_image_feature_tokens(self) -> int:
        """Return the image token count plus the boi and eoi embeddings."""
        # EVA2CLIPModel has embeddings for boi and eoi tokens as well
        return 2 + self.get_num_image_tokens()
class GLM4VDummyInputsBuilder(BaseDummyInputsBuilder[GLM4VProcessingInfo]):
    """Builds dummy prompt/image inputs for GLM-4V."""

    def get_dummy_processor_inputs(
        self,
        seq_len: int,
        mm_counts: Mapping[str, int],
    ) -> ProcessorInputs:
        """Return dummy inputs sized from ``vision_config["image_size"]``.

        When the config has no (truthy) ``vision_config``, an empty prompt
        with no multimodal data is returned.
        """
        vision_config = getattr(self.info.get_hf_config(), "vision_config",
                                None)
        if not vision_config:
            return ProcessorInputs(prompt_text="", mm_data={})

        side = vision_config["image_size"]
        image_count = mm_counts.get("image", 0)

        dummy_images = self._get_dummy_images(width=side,
                                              height=side,
                                              num_images=image_count)

        # One boi/placeholder/eoi triple per requested image.
        per_image_prompt = "<|begin_of_image|><|endoftext|><|end_of_image|>"

        return ProcessorInputs(
            prompt_text=per_image_prompt * image_count,
            mm_data={"image": dummy_images},
        )
class GLM4VMultiModalProcessor(BaseMultiModalProcessor[GLM4VProcessingInfo]):
    """Multimodal processor describing GLM-4V fields and prompt expansion."""

    def _get_mm_fields_config(
        self,
        hf_inputs: BatchFeature,
        hf_processor_mm_kwargs: Mapping[str, object],
    ) -> Mapping[str, MultiModalFieldConfig]:
        """Declare ``pixel_values`` as a batched field of the image modality."""
        return {"pixel_values": MultiModalFieldConfig.batched("image")}

    def _get_prompt_replacements(
        self,
        mm_items: MultiModalDataItems,
        hf_processor_mm_kwargs: Mapping[str, object],
        out_mm_kwargs: MultiModalKwargs,
    ) -> list[PromptReplacement]:
        """Expand each boi/pad/eoi triple to the full image token run.

        Returns no replacements when the config has no ``vision_config``
        attribute.
        """
        hf_config = self.info.get_hf_config()
        if not hasattr(hf_config, "vision_config"):
            return []

        boi = hf_config.boi_token_id
        placeholder = hf_config.pad_token_id
        eoi = hf_config.eoi_token_id

        def get_replacement(item_idx: int):
            # The token count is looked up at replacement time.
            repeat = self.info.get_num_image_tokens()
            return [boi] + [placeholder] * repeat + [eoi]

        image_replacement = PromptReplacement(
            modality="image",
            target=[boi, placeholder, eoi],
            replacement=get_replacement,
        )
        return [image_replacement]
maybe_prefix
)
class
GLMAttention
(
nn
.
Module
):
...
...
@@ -489,7 +291,7 @@ class GLMTransformer(nn.Module):
position_ids
:
torch
.
Tensor
,
kv_caches
:
List
[
torch
.
Tensor
],
attn_metadata
:
AttentionMetadata
,
)
->
torch
.
Tensor
:
)
->
Union
[
torch
.
Tensor
,
IntermediateTensors
]
:
for
i
in
range
(
self
.
start_layer
,
self
.
end_layer
):
layer
=
self
.
layers
[
i
]
hidden_states
=
layer
(
...
...
@@ -498,8 +300,12 @@ class GLMTransformer(nn.Module):
kv_cache
=
kv_caches
[
i
-
self
.
start_layer
],
attn_metadata
=
attn_metadata
,
)
if
not
get_pp_group
().
is_last_rank
:
return
IntermediateTensors
({
"hidden_states"
:
hidden_states
})
# Final layer norm.
if
get_pp_group
().
is_last_rank
and
self
.
post_layer_norm
:
if
self
.
post_layer_norm
:
hidden_states
=
self
.
final_layernorm
(
hidden_states
)
return
hidden_states
...
...
@@ -534,61 +340,11 @@ class ChatGLMModel(nn.Module):
quant_config
=
quant_config
,
prefix
=
f
"
{
prefix
}
.output_layer"
)
vision_config_flag
=
getattr
(
config
,
'vision_config'
,
None
)
if
vision_config_flag
is
not
None
:
self
.
vision_config
=
Namespace
(
**
config
.
vision_config
)
self
.
vision
=
EVA2CLIPModel
(
self
.
config
,
quant_config
,
prefix
=
f
"
{
prefix
}
.vision"
)
else
:
self
.
vision
=
None
self
.
make_empty_intermediate_tensors
=
(
self
.
encoder
.
make_empty_intermediate_tensors
)
def _parse_and_validate_image_input(
        self, **kwargs: object) -> GLMImagePixelInputs:
    """Extract ``pixel_values`` from ``kwargs`` and wrap it for the encoder.

    When a vision tower is present, a tensor with more than two dimensions
    is split along its first axis and re-concatenated, and a list of
    tensors is concatenated. The result is always returned wrapped in
    ``GLMImagePixelInputs`` (with ``pixel_values=None`` when nothing was
    supplied), so callers can index ``["pixel_values"]`` unconditionally.

    Raises:
        TypeError: if ``pixel_values`` is neither a tensor nor a list.
    """
    pixel_values = kwargs.pop("pixel_values", None)

    if pixel_values is not None and self.vision is not None:
        if isinstance(pixel_values, torch.Tensor):
            if pixel_values.ndim > 2:
                pixel_values = torch.concat(list(pixel_values))
        elif isinstance(pixel_values, list):
            # Previously this branch returned the bare tensor, which broke
            # the caller's `image_input["pixel_values"]` lookup.
            pixel_values = torch.concat(pixel_values)
        else:
            raise TypeError(
                "pixel_values must be a torch.Tensor\n"
                "or a list of torch.Tensor\n")

    return GLMImagePixelInputs(pixel_values=pixel_values)
def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]:
    """Run the vision tower on ``pixel_values`` found in ``kwargs``.

    Returns ``None`` when the parsed input carries no pixel values.
    """
    parsed = self._parse_and_validate_image_input(**kwargs)
    if parsed["pixel_values"] is None:
        return None

    # Cast to the dtype recorded on the model config before encoding.
    cast_pixels = parsed["pixel_values"].to(dtype=self.config.torch_dtype)
    return self.vision(cast_pixels)
def get_input_embeddings(
    self,
    input_ids: torch.Tensor,
    multimodal_embeddings: Optional[NestedTensors] = None,
) -> torch.Tensor:
    """Embed ``input_ids`` and merge in multimodal embeddings, if any."""
    text_embeds = self.embedding(input_ids)
    if multimodal_embeddings is None:
        return text_embeds

    # Positions holding boi, pad or eoi token ids are the merge targets.
    placeholder_ids = [
        self.config.boi_token_id,
        self.config.pad_token_id,
        self.config.eoi_token_id,
    ]
    return merge_multimodal_embeddings(
        input_ids=input_ids,
        inputs_embeds=text_embeds,
        multimodal_embeddings=multimodal_embeddings,
        placeholder_token_id=placeholder_ids,
    )
def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
    """Look up embeddings for ``input_ids`` via ``self.embedding``."""
    embedded = self.embedding(input_ids)
    return embedded
def
forward
(
self
,
...
...
@@ -599,26 +355,24 @@ class ChatGLMModel(nn.Module):
intermediate_tensors
:
Optional
[
IntermediateTensors
]
=
None
,
inputs_embeds
:
Optional
[
torch
.
Tensor
]
=
None
,
**
kwargs
:
object
,
)
->
torch
.
Tensor
:
)
->
Union
[
torch
.
Tensor
,
IntermediateTensors
]:
if
get_pp_group
().
is_first_rank
:
if
inputs_embeds
is
not
None
:
hidden_states
=
inputs_embeds
else
:
hidden_states
=
self
.
get_input_embeddings
(
input_ids
)
else
:
assert
intermediate_tensors
is
not
None
hidden_states
=
intermediate_tensors
[
"hidden_states"
]
# NOTE: In v1, inputs_embeds is always generated at model runner, this
# condition is for v0 compatibility.
if
intermediate_tensors
is
not
None
:
inputs_embeds
=
intermediate_tensors
[
"hidden_states"
]
elif
inputs_embeds
is
None
:
vision_embeddings
=
self
.
get_multimodal_embeddings
(
**
kwargs
)
inputs_embeds
=
self
.
get_input_embeddings
(
input_ids
,
vision_embeddings
)
# Run encoder.
hidden_states
=
self
.
encoder
(
hidden_states
=
inputs_embed
s
,
hidden_states
=
hidden_state
s
,
position_ids
=
positions
,
kv_caches
=
kv_caches
,
attn_metadata
=
attn_metadata
,
)
if
not
get_pp_group
().
is_last_rank
:
return
IntermediateTensors
({
"hidden_states"
:
hidden_states
})
return
hidden_states
def
load_weights
(
self
,
weights
:
Iterable
[
Tuple
[
str
,
...
...
@@ -660,12 +414,18 @@ class ChatGLMModel(nn.Module):
return
loaded_params
class
ChatGLMBaseModel
(
nn
.
Module
,
SupportsLoRA
,
SupportsPP
):
class
ChatGLMBaseModel
(
nn
.
Module
):
hf_to_vllm_mapper
=
WeightsMapper
(
orig_to_new_substr
=
{
".word_embeddings"
:
""
},
)
def
__init__
(
self
,
*
,
vllm_config
:
VllmConfig
,
prefix
:
str
=
""
):
def
__init__
(
self
,
*
,
vllm_config
:
VllmConfig
,
prefix
:
str
=
""
,
transformer_type
:
type
[
ChatGLMModel
]
=
ChatGLMModel
,
)
->
None
:
super
().
__init__
()
config
=
vllm_config
.
model_config
.
hf_config
quant_config
=
vllm_config
.
quant_config
...
...
@@ -678,7 +438,7 @@ class ChatGLMBaseModel(nn.Module, SupportsLoRA, SupportsPP):
self
.
quant_config
=
quant_config
self
.
max_position_embeddings
=
getattr
(
config
,
"max_sequence_length"
,
8192
)
self
.
transformer
=
ChatGLMModel
(
vllm_config
=
vllm_config
,
self
.
transformer
=
transformer_type
(
vllm_config
=
vllm_config
,
prefix
=
maybe_prefix
(
prefix
,
"transformer"
))
if
self
.
config
.
tie_word_embeddings
:
...
...
@@ -687,18 +447,8 @@ class ChatGLMBaseModel(nn.Module, SupportsLoRA, SupportsPP):
self
.
lm_head
=
self
.
transformer
.
output_layer
self
.
logits_processor
=
LogitsProcessor
(
config
.
padded_vocab_size
)
self
.
sampler
=
get_sampler
()
def
forward
(
self
,
input_ids
:
torch
.
Tensor
,
positions
:
torch
.
Tensor
,
kv_caches
:
List
[
torch
.
Tensor
],
attn_metadata
:
AttentionMetadata
,
intermediate_tensors
:
Optional
[
IntermediateTensors
]
=
None
,
**
kwargs
)
->
torch
.
Tensor
:
hidden_states
=
self
.
transformer
(
input_ids
,
positions
,
kv_caches
,
attn_metadata
,
intermediate_tensors
,
**
kwargs
)
return
hidden_states
self
.
make_empty_intermediate_tensors
=
(
self
.
transformer
.
make_empty_intermediate_tensors
)
def
compute_logits
(
self
,
...
...
@@ -722,7 +472,7 @@ class ChatGLMBaseModel(nn.Module, SupportsLoRA, SupportsPP):
return
loader
.
load_weights
(
weights
,
mapper
=
self
.
hf_to_vllm_mapper
)
class
ChatGLM
(
ChatGLMBaseModel
):
class
ChatGLM
ForCausalLM
(
ChatGLMBaseModel
,
SupportsLoRA
,
SupportsPP
):
packed_modules_mapping
=
{
"query_key_value"
:
[
"query_key_value"
],
"dense_h_to_4h"
:
[
"dense_h_to_4h"
]
...
...
@@ -738,82 +488,28 @@ class ChatGLM(ChatGLMBaseModel):
embedding_modules
=
{}
embedding_padding_modules
=
[]
def
__init__
(
self
,
*
,
vllm_config
:
VllmConfig
,
prefix
:
str
=
""
):
config
=
vllm_config
.
model_config
.
hf_config
if
hasattr
(
config
,
"vision_config"
):
hf_overrides
=
{
"architectures"
:
[
"GLM4VForCausalLM"
]}
raise
RuntimeError
(
"The configuration of this model indicates that it supports "
"vision inputs, but you instantiated the text-only version "
"of this model. Please use the vision model by setting "
f
"`--hf-overrides
{
hf_overrides
!
r
}
`"
)
class
ChatGLMV
(
ChatGLMBaseModel
,
SupportsMultiModal
):
packed_modules_mapping
=
{
"query_key_value"
:
[
"query_key_value"
],
"dense_h_to_4h"
:
[
"dense_h_to_4h"
],
"merged_proj"
:
[
"gate_proj"
,
"dense_h_to_4h"
]
}
# LoRA specific attributes
supported_lora_modules
=
[
"query_key_value"
,
"dense"
,
"dense_h_to_4h"
,
"dense_4h_to_h"
,
# vision
"fc1"
,
"fc2"
,
"merged_proj"
,
"linear_proj"
]
embedding_modules
=
{}
embedding_padding_modules
=
[]
def
get_mm_mapping
(
self
)
->
MultiModelKeys
:
"""
Get the module prefix in multimodal models
"""
return
MultiModelKeys
.
from_string_field
(
language_model
=
"transformer.encoder"
,
connector
=
"transformer.vision.linear_proj"
,
tower_model
=
"transformer.vision.transformer"
)
def
get_multimodal_embeddings
(
self
,
**
kwargs
)
->
Optional
[
NestedTensors
]:
return
self
.
transformer
.
get_multimodal_embeddings
(
**
kwargs
)
super
().
__init__
(
vllm_config
=
vllm_config
,
prefix
=
prefix
)
def
get_input_embeddings
(
def
forward
(
self
,
input_ids
:
torch
.
Tensor
,
multimodal_embeddings
:
Optional
[
NestedTensors
]
=
None
,
)
->
torch
.
Tensor
:
return
self
.
transformer
.
get_input_embeddings
(
input_ids
,
multimodal_embeddings
)
@MULTIMODAL_REGISTRY.register_processor(GLM4VMultiModalProcessor,
                                        info=GLM4VProcessingInfo,
                                        dummy_inputs=GLM4VDummyInputsBuilder)
class ChatGLMForCausalLM(ChatGLMBaseModel, SupportsLoRA, SupportsPP,
                         SupportsMultiModal):
    """Dispatching entry point that instantiates ``ChatGLMV`` or ``ChatGLM``.

    ``__new__`` inspects the HF config: if it has a ``vision_config``
    attribute the vision-language class is built, otherwise the text-only
    one. The chosen class's LoRA attributes are copied onto this class
    before the instance is created.
    """

    # Ensure that the LoRA support check passes when the class is not
    # initialized, but set all these attributes to empty.
    # These will be updated when an instance class is selected
    packed_modules_mapping = {}
    supported_lora_modules = []
    embedding_modules = {}
    embedding_padding_modules = []

    def __new__(
        cls,
        vllm_config: VllmConfig,
        prefix: str = "",
    ) -> ChatGLMBaseModel:
        """Create and return an instance of the selected concrete class."""
        config = vllm_config.model_config.hf_config

        # Initialize VL
        if hasattr(config, "vision_config"):  # noqa: SIM108
            instance_cls = ChatGLMV
        # Initialize LLM
        else:
            instance_cls = ChatGLM

        # quant_config references base class members,
        # so update values before init is called
        cls.packed_modules_mapping.update(instance_cls.packed_modules_mapping)
        cls.embedding_modules.update(instance_cls.embedding_modules)

        # Extend the shared lists in place, but only with entries not
        # already present: a plain `+=` appended duplicates on every
        # instantiation of this class.
        for module_name in instance_cls.supported_lora_modules:
            if module_name not in cls.supported_lora_modules:
                cls.supported_lora_modules.append(module_name)
        for module_name in instance_cls.embedding_padding_modules:
            if module_name not in cls.embedding_padding_modules:
                cls.embedding_padding_modules.append(module_name)

        return instance_cls(vllm_config=vllm_config, prefix=prefix)
positions
:
torch
.
Tensor
,
kv_caches
:
List
[
torch
.
Tensor
],
attn_metadata
:
AttentionMetadata
,
intermediate_tensors
:
Optional
[
IntermediateTensors
]
=
None
,
inputs_embeds
:
Optional
[
torch
.
Tensor
]
=
None
,
)
->
Union
[
torch
.
Tensor
,
IntermediateTensors
]:
hidden_states
=
self
.
transformer
(
input_ids
,
positions
,
kv_caches
,
attn_metadata
,
intermediate_tensors
,
inputs_embeds
)
return
hidden_states
vllm/model_executor/models/glm4
_vision_encoder
.py
→
vllm/model_executor/models/glm4
v
.py
View file @
1bc3b5e7
# SPDX-License-Identifier: Apache-2.0
# Adapted from
# https://github.com/THUDM/
GLM-4
"""Inference-only
GLM-4v model visual encoder
compatible with THUDM weights."""
# https://github.com/THUDM/
CogAgent
"""Inference-only
CogAgent model
compatible with THUDM weights."""
from
argparse
import
Namespace
from
typing
import
Optional
from
typing
import
List
,
Literal
,
Mapping
,
Optional
,
TypedDict
,
Union
import
torch
from
torch
import
nn
from
torch.nn
import
LayerNorm
from
torchvision
import
transforms
from
torchvision.transforms
import
InterpolationMode
from
transformers
import
PreTrainedTokenizer
,
TensorType
from
transformers.image_utils
import
ImageInput
from
transformers.tokenization_utils_base
import
TextInput
from
vllm.attention
import
AttentionMetadata
from
vllm.attention.layer
import
MultiHeadAttention
from
vllm.config
import
VllmConfig
from
vllm.distributed
import
get_tensor_model_parallel_world_size
from
vllm.model_executor.layers.activation
import
SiluAndMul
,
get_act_fn
from
vllm.model_executor.layers.linear
import
(
ColumnParallelLinear
,
...
...
@@ -18,11 +25,31 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear,
QKVParallelLinear
,
ReplicatedLinear
,
RowParallelLinear
)
from
vllm.model_executor.layers.quantization.base_config
import
(
QuantizationConfig
)
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.models.module_mapping
import
MultiModelKeys
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.inputs
import
MultiModalKwargs
,
NestedTensors
from
vllm.multimodal.parse
import
MultiModalDataItems
from
vllm.multimodal.processing
import
(
BaseMultiModalProcessor
,
BaseProcessingInfo
,
BatchFeature
,
MultiModalFieldConfig
,
PromptReplacement
)
from
vllm.multimodal.profiling
import
BaseDummyInputsBuilder
,
ProcessorInputs
from
vllm.sequence
import
IntermediateTensors
from
vllm.transformers_utils.configs
import
ChatGLMConfig
from
.chatglm
import
ChatGLMBaseModel
,
ChatGLMModel
from
.interfaces
import
SupportsLoRA
,
SupportsMultiModal
,
SupportsPP
from
.utils
import
flatten_bn
,
merge_multimodal_embeddings
class
PatchEmbedding
(
nn
.
Module
):
class GLMVImagePixelInputs(TypedDict):
    """Tagged dictionary carrying validated image pixel values."""

    # Discriminator tag; always the literal "pixel_values".
    type: Literal["pixel_values"]

    # Shape: `(batch_size, num_channels, height, width)`
    data: torch.Tensor
class
EVA2CLIPPatchEmbedding
(
nn
.
Module
):
def
__init__
(
self
,
config
):
super
().
__init__
()
...
...
@@ -54,7 +81,7 @@ class PatchEmbedding(nn.Module):
return
x
class
Attention
(
nn
.
Module
):
class
EVA2CLIP
Attention
(
nn
.
Module
):
def
__init__
(
self
,
...
...
@@ -97,7 +124,7 @@ class Attention(nn.Module):
return
output
class
MLP
(
nn
.
Module
):
class
EVA2CLIP
MLP
(
nn
.
Module
):
def
__init__
(
self
,
...
...
@@ -128,7 +155,7 @@ class MLP(nn.Module):
return
x
class
TransformerLayer
(
nn
.
Module
):
class
EVA2CLIP
TransformerLayer
(
nn
.
Module
):
def
__init__
(
self
,
...
...
@@ -139,10 +166,10 @@ class TransformerLayer(nn.Module):
super
().
__init__
()
self
.
input_layernorm
=
LayerNorm
(
config
.
hidden_size
,
eps
=
config
.
layer_norm_eps
)
self
.
attention
=
Attention
(
config
,
self
.
attention
=
EVA2CLIP
Attention
(
config
,
quant_config
=
quant_config
,
prefix
=
f
"
{
prefix
}
.attention"
)
self
.
mlp
=
MLP
(
config
,
self
.
mlp
=
EVA2CLIP
MLP
(
config
,
quant_config
=
quant_config
,
prefix
=
f
"
{
prefix
}
.mlp"
)
self
.
post_attention_layernorm
=
LayerNorm
(
config
.
hidden_size
,
...
...
@@ -159,7 +186,7 @@ class TransformerLayer(nn.Module):
return
output
class
Transformer
(
nn
.
Module
):
class
EVA2CLIP
Transformer
(
nn
.
Module
):
def
__init__
(
self
,
...
...
@@ -169,7 +196,7 @@ class Transformer(nn.Module):
):
super
().
__init__
()
self
.
layers
=
nn
.
ModuleList
([
TransformerLayer
(
config
,
EVA2CLIP
TransformerLayer
(
config
,
quant_config
=
quant_config
,
prefix
=
f
"
{
prefix
}
.layers.
{
layer_idx
}
"
)
for
layer_idx
in
range
(
config
.
num_hidden_layers
)
...
...
@@ -181,7 +208,7 @@ class Transformer(nn.Module):
return
hidden_states
class
GLU
(
nn
.
Module
):
class
EVA2CLIP
GLU
(
nn
.
Module
):
def
__init__
(
self
,
...
...
@@ -268,11 +295,11 @@ class EVA2CLIPModel(nn.Module):
):
super
().
__init__
()
vision_config
=
Namespace
(
**
config
.
vision_config
)
self
.
patch_embedding
=
PatchEmbedding
(
vision_config
)
self
.
transformer
=
Transformer
(
vision_config
,
self
.
patch_embedding
=
EVA2CLIP
PatchEmbedding
(
vision_config
)
self
.
transformer
=
EVA2CLIP
Transformer
(
vision_config
,
quant_config
=
quant_config
,
prefix
=
f
"
{
prefix
}
.transformer"
)
self
.
linear_proj
=
GLU
(
config
,
self
.
linear_proj
=
EVA2CLIP
GLU
(
config
,
in_features
=
config
.
hidden_size
,
quant_config
=
quant_config
,
prefix
=
f
"
{
prefix
}
.linear_proj"
)
...
...
@@ -310,3 +337,326 @@ class EVA2CLIPModel(nn.Module):
x
=
torch
.
cat
((
boi
,
x
,
eoi
),
dim
=
1
)
x
=
x
/
self
.
scaling_factor
return
x
class GLM4VModel(ChatGLMModel):
    """``ChatGLMModel`` extended with an ``EVA2CLIPModel`` vision tower."""

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None:
        """Build the base model, then attach the tower as ``self.vision``."""
        super().__init__(vllm_config=vllm_config, prefix=prefix)

        vision_prefix = f"{prefix}.vision"
        self.vision = EVA2CLIPModel(
            self.config,
            vllm_config.quant_config,
            prefix=vision_prefix,
        )
class GLM4VProcessor:
    """
    This model doesn't define its own HF processor,
    so we implement our own one here.

    Text goes through the supplied tokenizer; images are resized to the
    configured square size, converted to tensors and normalized.
    """

    def __init__(
        self,
        config: ChatGLMConfig,
        tokenizer: PreTrainedTokenizer,
    ) -> None:
        super().__init__()

        self.config = config
        self.tokenizer = tokenizer

        side = config.vision_config["image_size"]

        resize = transforms.Resize(
            (side, side),
            interpolation=InterpolationMode.BICUBIC,
        )
        to_tensor = transforms.ToTensor()
        normalize = transforms.Normalize(
            mean=(0.48145466, 0.4578275, 0.40821073),
            std=(0.26862954, 0.26130258, 0.27577711),
        )
        self.image_transform = transforms.Compose(
            [resize, to_tensor, normalize])

    def __call__(
        self,
        text: Optional[Union[TextInput, list[TextInput]]] = None,
        images: Optional[Union[ImageInput, list[ImageInput]]] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
    ) -> BatchFeature:
        """Tokenize ``text`` and preprocess ``images`` into one feature set."""
        # Normalize both arguments to lists (None becomes an empty list).
        if text is None:
            prompts = []
        elif isinstance(text, list):
            prompts = text
        else:
            prompts = [text]

        if images is None:
            image_list = []
        elif isinstance(images, list):
            image_list = images
        else:
            image_list = [images]

        features = {**self.tokenizer(prompts)}

        if image_list:
            transformed = [self.image_transform(img) for img in image_list]
            features["pixel_values"] = torch.stack(transformed)

        return BatchFeature(features, tensor_type=return_tensors)
class GLM4VProcessingInfo(BaseProcessingInfo):
    """Processing metadata (tokenizer, config, token counts) for GLM-4V."""

    def get_tokenizer(self):
        """Return the context tokenizer, asserting its type."""
        tok = self.ctx.tokenizer
        assert isinstance(tok, PreTrainedTokenizer)
        return tok

    def get_hf_config(self):
        """Return the HF config, requested as a ``ChatGLMConfig``."""
        return self.ctx.get_hf_config(ChatGLMConfig)

    def get_hf_processor(self) -> GLM4VProcessor:
        """Build the custom processor from the config and tokenizer."""
        config = self.get_hf_config()
        tokenizer = self.get_tokenizer()
        return GLM4VProcessor(config, tokenizer)

    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
        """Report the per-modality item limit (one image)."""
        return {"image": 1}

    def get_mm_max_tokens_per_item(
        self,
        seq_len: int,
        mm_counts: Mapping[str, int],
    ) -> Mapping[str, int]:
        """Report the token count of one image, boi/eoi included."""
        max_image_tokens = self.get_num_image_feature_tokens()
        return {"image": max_image_tokens}

    def get_num_image_tokens(self) -> int:
        """Return the number of placeholder tokens one image expands to."""
        vision_config = self.get_hf_config().vision_config

        image_size = vision_config["image_size"]
        patch_size = vision_config["patch_size"]
        # Patches per side, then halved, as a square grid.
        tokens_per_side = image_size // patch_size // 2
        return tokens_per_side * tokens_per_side

    def get_num_image_feature_tokens(self) -> int:
        """Return the image token count plus the boi and eoi embeddings."""
        # EVA2CLIPModel has embeddings for boi and eoi tokens as well
        return 2 + self.get_num_image_tokens()
class GLM4VDummyInputsBuilder(BaseDummyInputsBuilder[GLM4VProcessingInfo]):
    """Builds dummy prompt/image inputs for GLM-4V."""

    def get_dummy_processor_inputs(
        self,
        seq_len: int,
        mm_counts: Mapping[str, int],
    ) -> ProcessorInputs:
        """Return dummy inputs sized from ``vision_config["image_size"]``."""
        side = self.info.get_hf_config().vision_config["image_size"]
        image_count = mm_counts.get("image", 0)

        dummy_images = self._get_dummy_images(width=side,
                                              height=side,
                                              num_images=image_count)

        # One boi/placeholder/eoi triple per requested image.
        per_image_prompt = "<|begin_of_image|><|endoftext|><|end_of_image|>"

        return ProcessorInputs(
            prompt_text=per_image_prompt * image_count,
            mm_data={"image": dummy_images},
        )
class GLM4VMultiModalProcessor(BaseMultiModalProcessor[GLM4VProcessingInfo]):
    """Multimodal processor describing GLM-4V fields and prompt expansion."""

    def _get_mm_fields_config(
        self,
        hf_inputs: BatchFeature,
        hf_processor_mm_kwargs: Mapping[str, object],
    ) -> Mapping[str, MultiModalFieldConfig]:
        """Declare ``pixel_values`` as a batched field of the image modality."""
        return {"pixel_values": MultiModalFieldConfig.batched("image")}

    def _get_prompt_replacements(
        self,
        mm_items: MultiModalDataItems,
        hf_processor_mm_kwargs: Mapping[str, object],
        out_mm_kwargs: MultiModalKwargs,
    ) -> list[PromptReplacement]:
        """Expand each boi/pad/eoi triple to the full image token run."""
        hf_config = self.info.get_hf_config()

        boi = hf_config.boi_token_id
        placeholder = hf_config.pad_token_id
        eoi = hf_config.eoi_token_id

        def get_replacement(item_idx: int):
            # The token count is looked up at replacement time.
            repeat = self.info.get_num_image_tokens()
            return [boi] + [placeholder] * repeat + [eoi]

        image_replacement = PromptReplacement(
            modality="image",
            target=[boi, placeholder, eoi],
            replacement=get_replacement,
        )
        return [image_replacement]
@MULTIMODAL_REGISTRY.register_processor(GLM4VMultiModalProcessor,
                                        info=GLM4VProcessingInfo,
                                        dummy_inputs=GLM4VDummyInputsBuilder)
class GLM4VForCausalLM(ChatGLMBaseModel, SupportsLoRA, SupportsPP,
                       SupportsMultiModal):
    """Vision-language GLM-4V model built on ``ChatGLMBaseModel``."""

    packed_modules_mapping = {
        "query_key_value": ["query_key_value"],
        "dense_h_to_4h": ["dense_h_to_4h"],
        "merged_proj": ["gate_proj", "dense_h_to_4h"]
    }
    # LoRA specific attributes
    supported_lora_modules = [
        "query_key_value",
        "dense",
        "dense_h_to_4h",
        "dense_4h_to_h",
        # vision
        "fc1",
        "fc2",
        "merged_proj",
        "linear_proj"
    ]

    embedding_modules = {}
    embedding_padding_modules = []

    def get_mm_mapping(self) -> MultiModelKeys:
        """
        Get the module prefix in multimodal models
        """
        return MultiModelKeys.from_string_field(
            language_model="transformer.encoder",
            connector="transformer.vision.linear_proj",
            tower_model="transformer.vision.transformer")

    def __init__(
        self,
        *,
        vllm_config: VllmConfig,
        prefix: str = "",
        transformer_type: type[GLM4VModel] = GLM4VModel,
    ) -> None:
        """Initialize the base model with ``transformer_type`` as backbone."""
        super().__init__(
            vllm_config=vllm_config,
            prefix=prefix,
            transformer_type=transformer_type,
        )

        # Annotation only: narrows the attribute type, assigns nothing.
        self.transformer: GLM4VModel

    def _validate_pixel_values(self, data: torch.Tensor) -> torch.Tensor:
        """Check that ``data.shape[1:]`` is ``(3, image_size, image_size)``.

        Raises:
            ValueError: if the trailing dimensions do not match.
        """
        side = self.config.vision_config["image_size"]
        expected_dims = (3, side, side)

        if tuple(data.shape[1:]) == expected_dims:
            return data

        expected_expr = ("batch_size", *map(str, expected_dims))
        raise ValueError(
            f"The expected shape of pixel values is {expected_expr}. "
            f"You supplied {tuple(data.shape)}.")

    def _parse_and_validate_image_input(
            self, **kwargs: object) -> Optional[GLMVImagePixelInputs]:
        """Pop ``pixel_values`` from ``kwargs`` and validate it.

        Returns ``None`` when no pixel values were supplied.

        Raises:
            ValueError: if the value is not a tensor or has a wrong shape.
        """
        pixel_values = kwargs.pop("pixel_values", None)
        if pixel_values is None:
            return None

        if not isinstance(pixel_values, torch.Tensor):
            raise ValueError("Incorrect type of pixel values. "
                             f"Got type: {type(pixel_values)}")

        flattened = flatten_bn(pixel_values, concat=True)
        return GLMVImagePixelInputs(
            type="pixel_values",
            data=self._validate_pixel_values(flattened),
        )

    def _process_image_input(
            self, image_input: GLMVImagePixelInputs) -> torch.Tensor:
        """Cast the pixel data to the config dtype and run the vision tower."""
        cast_pixels = image_input["data"].to(dtype=self.config.torch_dtype)
        return self.transformer.vision(cast_pixels)

    def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]:
        """Return vision embeddings for ``kwargs``, or ``None`` without images."""
        image_input = self._parse_and_validate_image_input(**kwargs)
        if image_input is None:
            return None

        return self._process_image_input(image_input)

    def get_input_embeddings(
        self,
        input_ids: torch.Tensor,
        multimodal_embeddings: Optional[NestedTensors] = None,
    ) -> torch.Tensor:
        """Embed ``input_ids`` and merge in multimodal embeddings, if any."""
        text_embeds = self.transformer.get_input_embeddings(input_ids)
        if multimodal_embeddings is None:
            return text_embeds

        # Positions holding boi, pad or eoi token ids are the merge targets.
        placeholder_ids = [
            self.config.boi_token_id,
            self.config.pad_token_id,
            self.config.eoi_token_id,
        ]
        return merge_multimodal_embeddings(
            input_ids=input_ids,
            inputs_embeds=text_embeds,
            multimodal_embeddings=multimodal_embeddings,
            placeholder_token_id=placeholder_ids,
        )

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        kv_caches: List[torch.Tensor],
        attn_metadata: AttentionMetadata,
        intermediate_tensors: Optional[IntermediateTensors] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        **kwargs: object,
    ) -> Union[torch.Tensor, IntermediateTensors]:
        """Run the transformer, computing input embeddings first if needed."""
        if intermediate_tensors is not None:
            # Intermediate tensors were supplied: drop any input embeddings.
            inputs_embeds = None

        # NOTE: In v1, inputs_embeds is always generated at model runner, this
        # condition is for v0 compatibility.
        elif inputs_embeds is None:
            image_embeds = self.get_multimodal_embeddings(**kwargs)
            inputs_embeds = self.get_input_embeddings(input_ids, image_embeds)
            # The embeddings now stand in for the token ids.
            input_ids = None

        return self.transformer(input_ids, positions, kv_caches,
                                attn_metadata, intermediate_tensors,
                                inputs_embeds)
vllm/model_executor/models/qwen.py
View file @
1bc3b5e7
This diff is collapsed.
Click to expand it.
vllm/model_executor/models/qwen_vl.py
0 → 100644
View file @
1bc3b5e7
This diff is collapsed.
Click to expand it.
vllm/model_executor/models/registry.py
View file @
1bc3b5e7
...
...
@@ -39,7 +39,7 @@ _TEXT_GENERATION_MODELS = {
"BaichuanForCausalLM"
:
(
"baichuan"
,
"BaichuanForCausalLM"
),
"BambaForCausalLM"
:
(
"bamba"
,
"BambaForCausalLM"
),
"BloomForCausalLM"
:
(
"bloom"
,
"BloomForCausalLM"
),
#
ChatGLMModel
supports multimodal
"
ChatGLMModel
"
:
(
"chatglm"
,
"ChatGLMForCausalLM"
),
"CohereForCausalLM"
:
(
"commandr"
,
"CohereForCausalLM"
),
"Cohere2ForCausalLM"
:
(
"commandr"
,
"CohereForCausalLM"
),
"DbrxForCausalLM"
:
(
"dbrx"
,
"DbrxForCausalLM"
),
...
...
@@ -90,7 +90,7 @@ _TEXT_GENERATION_MODELS = {
"Phi3ForCausalLM"
:
(
"phi3"
,
"Phi3ForCausalLM"
),
"Phi3SmallForCausalLM"
:
(
"phi3_small"
,
"Phi3SmallForCausalLM"
),
"PhiMoEForCausalLM"
:
(
"phimoe"
,
"PhiMoEForCausalLM"
),
#
QWenLMHeadModel
supports multimodal
"
QWenLMHeadModel
"
:
(
"qwen"
,
"QWenLMHeadModel"
),
"Qwen2ForCausalLM"
:
(
"qwen2"
,
"Qwen2ForCausalLM"
),
"Qwen2MoeForCausalLM"
:
(
"qwen2_moe"
,
"Qwen2MoeForCausalLM"
),
"RWForCausalLM"
:
(
"falcon"
,
"FalconForCausalLM"
),
...
...
@@ -156,10 +156,9 @@ _MULTIMODAL_MODELS = {
"AriaForConditionalGeneration"
:
(
"aria"
,
"AriaForConditionalGeneration"
),
"Blip2ForConditionalGeneration"
:
(
"blip2"
,
"Blip2ForConditionalGeneration"
),
"ChameleonForConditionalGeneration"
:
(
"chameleon"
,
"ChameleonForConditionalGeneration"
),
# noqa: E501
"ChatGLMModel"
:
(
"chatglm"
,
"ChatGLMForCausalLM"
),
"ChatGLMForConditionalGeneration"
:
(
"chatglm"
,
"ChatGLMForCausalLM"
),
"DeepseekVLV2ForCausalLM"
:
(
"deepseek_vl2"
,
"DeepseekVLV2ForCausalLM"
),
"FuyuForCausalLM"
:
(
"fuyu"
,
"FuyuForCausalLM"
),
"GLM4VForCausalLM"
:
(
"glm4v"
,
"GLM4VForCausalLM"
),
"H2OVLChatModel"
:
(
"h2ovl"
,
"H2OVLChatModel"
),
"InternVLChatModel"
:
(
"internvl"
,
"InternVLChatModel"
),
"Idefics3ForConditionalGeneration"
:(
"idefics3"
,
"Idefics3ForConditionalGeneration"
),
...
...
@@ -175,7 +174,7 @@ _MULTIMODAL_MODELS = {
"PaliGemmaForConditionalGeneration"
:
(
"paligemma"
,
"PaliGemmaForConditionalGeneration"
),
# noqa: E501
"Phi3VForCausalLM"
:
(
"phi3v"
,
"Phi3VForCausalLM"
),
"PixtralForConditionalGeneration"
:
(
"pixtral"
,
"PixtralForConditionalGeneration"
),
# noqa: E501
"Q
W
en
LMHeadModel
"
:
(
"qwen"
,
"Q
W
en
LMHeadModel"
),
"Q
w
en
VLForConditionalGeneration
"
:
(
"qwen
_vl
"
,
"Q
w
en
VLForConditionalGeneration"
),
# noqa: E501
"Qwen2VLForConditionalGeneration"
:
(
"qwen2_vl"
,
"Qwen2VLForConditionalGeneration"
),
# noqa: E501
"Qwen2_5_VLForConditionalGeneration"
:
(
"qwen2_5_vl"
,
"Qwen2_5_VLForConditionalGeneration"
),
# noqa: E501
"Qwen2AudioForConditionalGeneration"
:
(
"qwen2_audio"
,
"Qwen2AudioForConditionalGeneration"
),
# noqa: E501
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment