Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
31330101
Commit
31330101
authored
Apr 16, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.8.4' into v0.8.4-dev
parents
e8933c34
dc1b4a6f
Changes
346
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
573 additions
and
85 deletions
+573
-85
tests/models/decoder_only/vision_language/test_phi4mm.py
tests/models/decoder_only/vision_language/test_phi4mm.py
+80
-17
tests/models/decoder_only/vision_language/test_pixtral.py
tests/models/decoder_only/vision_language/test_pixtral.py
+10
-16
tests/models/decoder_only/vision_language/vlm_utils/core.py
tests/models/decoder_only/vision_language/vlm_utils/core.py
+4
-0
tests/models/decoder_only/vision_language/vlm_utils/model_utils.py
...els/decoder_only/vision_language/vlm_utils/model_utils.py
+6
-0
tests/models/embedding/language/test_jina.py
tests/models/embedding/language/test_jina.py
+166
-0
tests/models/embedding/utils.py
tests/models/embedding/utils.py
+7
-0
tests/models/encoder_decoder/vision_language/test_mllama.py
tests/models/encoder_decoder/vision_language/test_mllama.py
+53
-12
tests/models/multimodal/processing/test_common.py
tests/models/multimodal/processing/test_common.py
+2
-0
tests/models/multimodal/processing/test_llama4.py
tests/models/multimodal/processing/test_llama4.py
+1
-16
tests/models/multimodal/processing/test_llava_next.py
tests/models/multimodal/processing/test_llava_next.py
+2
-2
tests/models/multimodal/processing/test_llava_onevision.py
tests/models/multimodal/processing/test_llava_onevision.py
+2
-2
tests/models/multimodal/processing/test_mllama.py
tests/models/multimodal/processing/test_mllama.py
+71
-0
tests/models/multimodal/processing/test_smolvlm.py
tests/models/multimodal/processing/test_smolvlm.py
+65
-0
tests/models/registry.py
tests/models/registry.py
+30
-2
tests/models/test_initialization.py
tests/models/test_initialization.py
+13
-4
tests/models/test_oot_registration.py
tests/models/test_oot_registration.py
+1
-0
tests/models/utils.py
tests/models/utils.py
+3
-0
tests/multimodal/test_processing.py
tests/multimodal/test_processing.py
+14
-2
tests/quantization/test_bitsandbytes.py
tests/quantization/test_bitsandbytes.py
+6
-4
tests/quantization/test_quark.py
tests/quantization/test_quark.py
+37
-8
No files found.
tests/models/decoder_only/vision_language/test_phi4mm.py
View file @
31330101
...
@@ -2,18 +2,22 @@
...
@@ -2,18 +2,22 @@
import
os
import
os
import
re
import
re
from
collections.abc
import
Sequence
from
typing
import
Optional
from
typing
import
Optional
import
librosa
import
pytest
import
pytest
from
huggingface_hub
import
snapshot_download
from
huggingface_hub
import
snapshot_download
from
transformers
import
AutoTokenizer
from
transformers
import
AutoTokenizer
from
vllm.assets.image
import
ImageAsset
from
vllm.lora.request
import
LoRARequest
from
vllm.lora.request
import
LoRARequest
from
vllm.multimodal.image
import
rescale_image_size
from
vllm.multimodal.image
import
rescale_image_size
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
from
vllm.sequence
import
SampleLogprobs
from
vllm.sequence
import
SampleLogprobs
from
....conftest
import
IMAGE_ASSETS
,
HfRunner
,
PromptImageInput
,
VllmRunner
from
....conftest
import
(
IMAGE_ASSETS
,
HfRunner
,
PromptAudioInput
,
PromptImageInput
,
VllmRunner
)
from
....utils
import
large_gpu_test
from
....utils
import
large_gpu_test
from
...utils
import
check_logprobs_close
from
...utils
import
check_logprobs_close
...
@@ -29,6 +33,8 @@ model_path = snapshot_download("microsoft/Phi-4-multimodal-instruct")
...
@@ -29,6 +33,8 @@ model_path = snapshot_download("microsoft/Phi-4-multimodal-instruct")
# Since the vision-lora and speech-lora co-exist with the base model,
# Since the vision-lora and speech-lora co-exist with the base model,
# we have to manually specify the path of the lora weights.
# we have to manually specify the path of the lora weights.
vision_lora_path
=
os
.
path
.
join
(
model_path
,
"vision-lora"
)
vision_lora_path
=
os
.
path
.
join
(
model_path
,
"vision-lora"
)
speech_question
=
os
.
path
.
join
(
model_path
,
"examples"
,
"what_is_shown_in_this_image.wav"
)
models
=
[
model_path
]
models
=
[
model_path
]
...
@@ -64,7 +70,8 @@ if current_platform.is_rocm():
...
@@ -64,7 +70,8 @@ if current_platform.is_rocm():
def
run_test
(
def
run_test
(
hf_runner
:
type
[
HfRunner
],
hf_runner
:
type
[
HfRunner
],
vllm_runner
:
type
[
VllmRunner
],
vllm_runner
:
type
[
VllmRunner
],
inputs
:
list
[
tuple
[
list
[
str
],
PromptImageInput
]],
inputs
:
Sequence
[
tuple
[
list
[
str
],
PromptImageInput
,
Optional
[
PromptAudioInput
]]],
model
:
str
,
model
:
str
,
*
,
*
,
max_model_len
:
int
,
max_model_len
:
int
,
...
@@ -104,28 +111,49 @@ def run_test(
...
@@ -104,28 +111,49 @@ def run_test(
enforce_eager
=
True
,
enforce_eager
=
True
,
)
as
vllm_model
:
)
as
vllm_model
:
lora_request
=
LoRARequest
(
"vision"
,
1
,
vision_lora_path
)
lora_request
=
LoRARequest
(
"vision"
,
1
,
vision_lora_path
)
vllm_model
.
model
.
llm_engine
.
add_lora
(
lora_request
=
lora_request
)
vllm_outputs_per_case
=
[
vllm_outputs_per_case
=
[
vllm_model
.
generate_greedy_logprobs
(
prompts
,
vllm_model
.
generate_greedy_logprobs
(
prompts
,
max_tokens
,
max_tokens
,
num_logprobs
=
num_logprobs
,
num_logprobs
=
num_logprobs
,
images
=
images
)
images
=
images
,
for
prompts
,
images
in
inputs
audios
=
audios
,
lora_request
=
lora_request
)
for
prompts
,
images
,
audios
in
inputs
]
]
# use eager mode for hf runner, since phi3_v didn't work with flash_attn
hf_model_kwargs
=
{
"_attn_implementation"
:
"sdpa"
}
hf_model_kwargs
=
{
"_attn_implementation"
:
"eager"
}
with
hf_runner
(
model
,
dtype
=
dtype
,
with
hf_runner
(
model
,
dtype
=
dtype
,
model_kwargs
=
hf_model_kwargs
)
as
hf_model
:
model_kwargs
=
hf_model_kwargs
)
as
hf_model
:
eos_token_id
=
hf_model
.
processor
.
tokenizer
.
eos_token_id
hf_processor
=
hf_model
.
processor
eos_token_id
=
hf_processor
.
tokenizer
.
eos_token_id
def
patch_hf_processor
(
*
args
,
text
=
""
,
images
=
None
,
audio
=
None
,
sampling_rate
=
None
,
**
kwargs
):
audios
=
None
if
audio
is
not
None
and
sampling_rate
is
not
None
:
audios
=
[(
audio
,
sampling_rate
)]
return
hf_processor
(
*
args
,
text
=
text
,
images
=
images
,
audios
=
audios
,
**
kwargs
)
hf_model
.
processor
=
patch_hf_processor
hf_outputs_per_case
=
[
hf_outputs_per_case
=
[
hf_model
.
generate_greedy_logprobs_limit
(
prompts
,
hf_model
.
generate_greedy_logprobs_limit
(
prompts
,
max_tokens
,
max_tokens
,
num_logprobs
=
num_logprobs
,
num_logprobs
=
num_logprobs
,
images
=
images
,
images
=
images
,
audios
=
audios
,
eos_token_id
=
eos_token_id
,
eos_token_id
=
eos_token_id
,
num_logits_to_keep
=
0
)
num_logits_to_keep
=
0
)
for
prompts
,
images
in
inputs
for
prompts
,
images
,
audios
in
inputs
]
]
for
hf_outputs
,
vllm_outputs
in
zip
(
hf_outputs_per_case
,
for
hf_outputs
,
vllm_outputs
in
zip
(
hf_outputs_per_case
,
...
@@ -138,8 +166,6 @@ def run_test(
...
@@ -138,8 +166,6 @@ def run_test(
)
)
# Since we use _attn_implementation="eager" for hf_runner, there is more
# significant numerical difference. The basic `logprobs=5` fails to pass.
@
pytest
.
mark
.
parametrize
(
"model"
,
models
)
@
pytest
.
mark
.
parametrize
(
"model"
,
models
)
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
"size_factors"
,
"size_factors"
,
...
@@ -151,7 +177,7 @@ def run_test(
...
@@ -151,7 +177,7 @@ def run_test(
# Single-scale, batched
# Single-scale, batched
[
1.0
,
1.0
,
1.0
],
[
1.0
,
1.0
,
1.0
],
# Multi-scale
# Multi-scale
[
0.
7
,
0.
7
5
,
1.0
],
[
0.
25
,
0.5
,
1.0
],
],
],
)
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
target_dtype
])
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
target_dtype
])
...
@@ -166,6 +192,7 @@ def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
...
@@ -166,6 +192,7 @@ def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
inputs_per_image
=
[(
inputs_per_image
=
[(
[
prompt
for
_
in
size_factors
],
[
prompt
for
_
in
size_factors
],
[
rescale_image_size
(
image
,
factor
)
for
factor
in
size_factors
],
[
rescale_image_size
(
image
,
factor
)
for
factor
in
size_factors
],
None
,
)
for
image
,
prompt
in
zip
(
images
,
HF_IMAGE_PROMPTS
)]
)
for
image
,
prompt
in
zip
(
images
,
HF_IMAGE_PROMPTS
)]
run_test
(
run_test
(
...
@@ -201,17 +228,18 @@ def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
...
@@ -201,17 +228,18 @@ def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
@
pytest
.
mark
.
parametrize
(
"max_model_len"
,
[
10000
])
@
pytest
.
mark
.
parametrize
(
"max_model_len"
,
[
10000
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
128
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
128
])
@
pytest
.
mark
.
parametrize
(
"num_logprobs"
,
[
10
])
@
pytest
.
mark
.
parametrize
(
"num_logprobs"
,
[
10
])
@
pytest
.
mark
.
xfail
(
reason
=
"Phi-4-MM multi-image inference is divergent with hf model."
)
def
test_multi_images_models
(
hf_runner
,
vllm_runner
,
image_assets
,
model
,
def
test_multi_images_models
(
hf_runner
,
vllm_runner
,
image_assets
,
model
,
size_factors
,
dtype
:
str
,
max_model_len
:
int
,
size_factors
,
dtype
:
str
,
max_model_len
:
int
,
max_tokens
:
int
,
num_logprobs
:
int
)
->
None
:
max_tokens
:
int
,
num_logprobs
:
int
)
->
None
:
images
=
[
asset
.
pil_image
for
asset
in
image_assets
]
images
=
[
asset
.
pil_image
for
asset
in
image_assets
]
inputs_per_case
=
[
inputs_per_case
=
[
([
HF_MULTIIMAGE_IMAGE_PROMPT
for
_
in
size_factors
],
(
[[
rescale_image_size
(
image
,
factor
)
for
image
in
images
]
[
HF_MULTIIMAGE_IMAGE_PROMPT
for
_
in
size_factors
],
for
factor
in
size_factors
])
[[
rescale_image_size
(
image
,
factor
)
for
image
in
images
]
for
factor
in
size_factors
],
None
,
),
]
]
run_test
(
run_test
(
...
@@ -226,3 +254,38 @@ def test_multi_images_models(hf_runner, vllm_runner, image_assets, model,
...
@@ -226,3 +254,38 @@ def test_multi_images_models(hf_runner, vllm_runner, image_assets, model,
mm_limit
=
2
,
mm_limit
=
2
,
tensor_parallel_size
=
1
,
tensor_parallel_size
=
1
,
)
)
@
pytest
.
mark
.
parametrize
(
"model"
,
models
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
target_dtype
])
@
pytest
.
mark
.
parametrize
(
"max_model_len"
,
[
10000
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
128
])
@
pytest
.
mark
.
parametrize
(
"num_logprobs"
,
[
10
])
def
test_vision_speech_models
(
hf_runner
,
vllm_runner
,
model
,
dtype
:
str
,
max_model_len
:
int
,
max_tokens
:
int
,
num_logprobs
:
int
)
->
None
:
# use the example speech question so that the model outputs are reasonable
audio
=
librosa
.
load
(
speech_question
,
sr
=
None
)
image
=
ImageAsset
(
"cherry_blossom"
).
pil_image
.
convert
(
"RGB"
)
inputs_vision_speech
=
[
(
[
"<|user|><|image_1|><|audio_1|><|end|><|assistant|>"
],
[
image
],
[
audio
],
),
]
run_test
(
hf_runner
,
vllm_runner
,
inputs_vision_speech
,
model
,
dtype
=
dtype
,
max_model_len
=
max_model_len
,
max_tokens
=
max_tokens
,
num_logprobs
=
num_logprobs
,
mm_limit
=
1
,
tensor_parallel_size
=
1
,
)
tests/models/decoder_only/vision_language/test_pixtral.py
View file @
31330101
...
@@ -178,6 +178,8 @@ def test_chat(
...
@@ -178,6 +178,8 @@ def test_chat(
model
,
model
,
dtype
=
dtype
,
dtype
=
dtype
,
tokenizer_mode
=
"mistral"
,
tokenizer_mode
=
"mistral"
,
load_format
=
"mistral"
,
config_format
=
"mistral"
,
max_model_len
=
max_model_len
,
max_model_len
=
max_model_len
,
limit_mm_per_prompt
=
LIMIT_MM_PER_PROMPT
,
limit_mm_per_prompt
=
LIMIT_MM_PER_PROMPT
,
)
as
vllm_model
:
)
as
vllm_model
:
...
@@ -200,22 +202,14 @@ def test_chat(
...
@@ -200,22 +202,14 @@ def test_chat(
@
large_gpu_test
(
min_gb
=
48
)
@
large_gpu_test
(
min_gb
=
48
)
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
"prompt,expected_ranges"
,
"prompt,expected_ranges"
,
[(
_create_engine_inputs_hf
(
IMG_URLS
[:
1
]),
[(
_create_engine_inputs_hf
(
IMG_URLS
[:
1
]),
[{
[
PlaceholderRange
(
offset
=
11
,
length
=
494
)]),
"offset"
:
11
,
(
_create_engine_inputs_hf
(
IMG_URLS
[
1
:
4
]),
[
"length"
:
494
PlaceholderRange
(
offset
=
11
,
length
=
266
),
}]),
PlaceholderRange
(
offset
=
277
,
length
=
1056
),
(
_create_engine_inputs_hf
(
IMG_URLS
[
1
:
4
]),
[{
PlaceholderRange
(
offset
=
1333
,
length
=
418
)
"offset"
:
11
,
])])
"length"
:
266
},
{
"offset"
:
277
,
"length"
:
1056
},
{
"offset"
:
1333
,
"length"
:
418
}])])
def
test_multi_modal_placeholders
(
vllm_runner
,
prompt
,
def
test_multi_modal_placeholders
(
vllm_runner
,
prompt
,
expected_ranges
:
list
[
PlaceholderRange
],
expected_ranges
:
list
[
PlaceholderRange
],
monkeypatch
)
->
None
:
monkeypatch
)
->
None
:
...
...
tests/models/decoder_only/vision_language/vlm_utils/core.py
View file @
31330101
...
@@ -51,6 +51,10 @@ def run_test(
...
@@ -51,6 +51,10 @@ def run_test(
model_info
.
check_available_online
(
on_fail
=
"skip"
)
model_info
.
check_available_online
(
on_fail
=
"skip"
)
model_info
.
check_transformers_version
(
on_fail
=
"skip"
)
model_info
.
check_transformers_version
(
on_fail
=
"skip"
)
# Disable other modalities to save memory
default_limits
=
{
"image"
:
0
,
"video"
:
0
,
"audio"
:
0
}
limit_mm_per_prompt
=
default_limits
|
limit_mm_per_prompt
vllm_outputs_per_mm
=
[]
vllm_outputs_per_mm
=
[]
hf_outputs_per_mm
=
[]
hf_outputs_per_mm
=
[]
...
...
tests/models/decoder_only/vision_language/vlm_utils/model_utils.py
View file @
31330101
...
@@ -204,6 +204,12 @@ def idefics3_trunc_hf_output(hf_output: RunnerOutput,
...
@@ -204,6 +204,12 @@ def idefics3_trunc_hf_output(hf_output: RunnerOutput,
return
output_ids
,
output_str
,
out_logprobs
return
output_ids
,
output_str
,
out_logprobs
def
smolvlm_trunc_hf_output
(
hf_output
:
RunnerOutput
,
model
:
str
)
->
RunnerOutput
:
# Based on Idefics3
return
idefics3_trunc_hf_output
(
hf_output
,
model
)
def
minicpmv_trunc_hf_output
(
hf_output
:
RunnerOutput
,
def
minicpmv_trunc_hf_output
(
hf_output
:
RunnerOutput
,
model
:
str
)
->
RunnerOutput
:
model
:
str
)
->
RunnerOutput
:
output_ids
,
output_str
,
out_logprobs
=
hf_output
output_ids
,
output_str
,
out_logprobs
=
hf_output
...
...
tests/models/embedding/language/test_jina
_reranker_v2
.py
→
tests/models/embedding/language/test_jina.py
View file @
31330101
...
@@ -2,13 +2,16 @@
...
@@ -2,13 +2,16 @@
# ruff: noqa: E501
# ruff: noqa: E501
"""Compare the scoring outputs of HF and vLLM models.
"""Compare the scoring outputs of HF and vLLM models.
Run `pytest tests/models/embedding/language/test_jina
_reranker_v2
.py`.
Run `pytest tests/models/embedding/language/test_jina.py`.
"""
"""
import
math
import
math
import
pytest
import
pytest
MODELS
=
[
from
tests.models.embedding.utils
import
check_embeddings_close
,
matryoshka_fy
from
vllm
import
PoolingParams
SCORING_MODELS
=
[
"jinaai/jina-reranker-v2-base-multilingual"
,
# Roberta
"jinaai/jina-reranker-v2-base-multilingual"
,
# Roberta
]
]
...
@@ -27,8 +30,21 @@ TEXTS_2 = [
...
@@ -27,8 +30,21 @@ TEXTS_2 = [
"新しいメイクのトレンドは鮮やかな色と革新的な技術に焦点を当てています"
,
"新しいメイクのトレンドは鮮やかな色と革新的な技術に焦点を当てています"
,
]
]
EMBEDDING_MODELS
=
[
"jinaai/jina-embeddings-v3"
,
]
EMBEDDING_PROMPTS
=
[
"Follow the white rabbit."
,
# English
"Sigue al conejo blanco."
,
# Spanish
"Suis le lapin blanc."
,
# French
"跟着白兔走。"
,
# Chinese
"اتبع الأرنب الأبيض."
,
# Arabic
"Folge dem weißen Kaninchen."
,
# German
]
@
pytest
.
fixture
(
scope
=
"module"
,
params
=
MODELS
)
@
pytest
.
fixture
(
scope
=
"module"
,
params
=
SCORING_
MODELS
)
def
model_name
(
request
):
def
model_name
(
request
):
yield
request
.
param
yield
request
.
param
...
@@ -68,3 +84,83 @@ def test_llm_1_to_N(vllm_runner, hf_runner, model_name, dtype: str):
...
@@ -68,3 +84,83 @@ def test_llm_1_to_N(vllm_runner, hf_runner, model_name, dtype: str):
assert
math
.
isclose
(
hf_outputs
[
0
],
vllm_outputs
[
0
],
rel_tol
=
0.01
)
assert
math
.
isclose
(
hf_outputs
[
0
],
vllm_outputs
[
0
],
rel_tol
=
0.01
)
assert
math
.
isclose
(
hf_outputs
[
1
],
vllm_outputs
[
1
],
rel_tol
=
0.01
)
assert
math
.
isclose
(
hf_outputs
[
1
],
vllm_outputs
[
1
],
rel_tol
=
0.01
)
@
pytest
.
fixture
(
scope
=
"module"
,
params
=
EMBEDDING_MODELS
)
def
emb_model_name
(
request
):
yield
request
.
param
def
test_is_matryoshka
(
vllm_runner
,
emb_model_name
):
with
vllm_runner
(
emb_model_name
,
task
=
"embed"
,
max_model_len
=
None
)
as
vllm_model
:
assert
vllm_model
.
model
.
llm_engine
.
model_config
.
is_matryoshka
@
pytest
.
mark
.
parametrize
(
"model"
,
EMBEDDING_MODELS
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
def
test_embeddings
(
hf_runner
,
vllm_runner
,
model
,
dtype
:
str
,
monkeypatch
,
)
->
None
:
example_prompts
=
EMBEDDING_PROMPTS
with
hf_runner
(
model
,
dtype
=
dtype
,
is_sentence_transformer
=
True
,
)
as
hf_model
:
hf_outputs
=
hf_model
.
encode
(
example_prompts
,
task
=
"text-matching"
)
with
vllm_runner
(
model
,
task
=
"embed"
,
dtype
=
dtype
,
max_model_len
=
None
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
encode
(
example_prompts
)
check_embeddings_close
(
embeddings_0_lst
=
hf_outputs
,
embeddings_1_lst
=
vllm_outputs
,
name_0
=
"hf"
,
name_1
=
"vllm"
,
tol
=
1e-2
,
)
@
pytest
.
mark
.
parametrize
(
"model"
,
EMBEDDING_MODELS
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
@
pytest
.
mark
.
parametrize
(
"dimensions"
,
[
16
,
32
])
def
test_matryoshka
(
hf_runner
,
vllm_runner
,
model
,
dtype
:
str
,
dimensions
:
int
,
monkeypatch
,
)
->
None
:
example_prompts
=
EMBEDDING_PROMPTS
with
hf_runner
(
model
,
dtype
=
dtype
,
is_sentence_transformer
=
True
,
)
as
hf_model
:
hf_outputs
=
hf_model
.
encode
(
example_prompts
,
task
=
"text-matching"
)
hf_outputs
=
matryoshka_fy
(
hf_outputs
,
dimensions
)
with
vllm_runner
(
model
,
task
=
"embed"
,
dtype
=
dtype
,
max_model_len
=
None
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
encode
(
example_prompts
,
pooling_params
=
PoolingParams
(
dimensions
=
dimensions
))
check_embeddings_close
(
embeddings_0_lst
=
hf_outputs
,
embeddings_1_lst
=
vllm_outputs
,
name_0
=
"hf"
,
name_1
=
"vllm"
,
tol
=
1e-2
,
)
tests/models/embedding/utils.py
View file @
31330101
...
@@ -30,3 +30,10 @@ def check_embeddings_close(
...
@@ -30,3 +30,10 @@ def check_embeddings_close(
f
"
\n
{
name_1
}
:
\t
{
embeddings_1
[:
16
]
!
r
}
"
)
f
"
\n
{
name_1
}
:
\t
{
embeddings_1
[:
16
]
!
r
}
"
)
assert
sim
>=
1
-
tol
,
fail_msg
assert
sim
>=
1
-
tol
,
fail_msg
def
matryoshka_fy
(
tensor
,
dimensions
):
tensor
=
torch
.
tensor
(
tensor
)
tensor
=
tensor
[...,
:
dimensions
]
tensor
=
F
.
normalize
(
tensor
,
p
=
2
,
dim
=
1
)
return
tensor
tests/models/encoder_decoder/vision_language/test_mllama.py
View file @
31330101
...
@@ -211,14 +211,15 @@ def _run_test(
...
@@ -211,14 +211,15 @@ def _run_test(
# will hurt multiprocessing backend with fork method (the default method).
# will hurt multiprocessing backend with fork method (the default method).
# max_model_len should be greater than image_feature_size
# max_model_len should be greater than image_feature_size
with
vllm_runner
(
model
,
with
vllm_runner
(
dtype
=
dtype
,
model
,
max_model_len
=
4096
,
dtype
=
dtype
,
max_num_seqs
=
3
,
max_model_len
=
19212
,
# 3 max size images
tensor_parallel_size
=
tensor_parallel_size
,
max_num_seqs
=
3
,
distributed_executor_backend
=
distributed_executor_backend
,
tensor_parallel_size
=
tensor_parallel_size
,
limit_mm_per_prompt
=
{
"image"
:
_LIMIT_IMAGE_PER_PROMPT
distributed_executor_backend
=
distributed_executor_backend
,
})
as
vllm_model
:
limit_mm_per_prompt
=
{
"image"
:
_LIMIT_IMAGE_PER_PROMPT
})
as
vllm_model
:
vllm_outputs_per_image
=
[
vllm_outputs_per_image
=
[
vllm_model
.
generate_greedy_logprobs
(
prompts
,
vllm_model
.
generate_greedy_logprobs
(
prompts
,
max_tokens
,
max_tokens
,
...
@@ -424,7 +425,7 @@ def test_bnb_regression(
...
@@ -424,7 +425,7 @@ def test_bnb_regression(
llm
=
LLM
(
llm
=
LLM
(
model
=
model
,
model
=
model
,
dtype
=
dtype
,
dtype
=
dtype
,
max_model_len
=
4096
,
max_model_len
=
8192
,
max_num_seqs
=
2
,
max_num_seqs
=
2
,
quantization
=
"bitsandbytes"
,
quantization
=
"bitsandbytes"
,
)
)
...
@@ -477,7 +478,7 @@ def test_explicit_implicit_prompt(
...
@@ -477,7 +478,7 @@ def test_explicit_implicit_prompt(
llm
=
LLM
(
llm
=
LLM
(
model
=
model
,
model
=
model
,
dtype
=
dtype
,
dtype
=
dtype
,
max_model_len
=
4096
,
max_model_len
=
8192
,
max_num_seqs
=
2
,
max_num_seqs
=
2
,
tensor_parallel_size
=
1
,
tensor_parallel_size
=
1
,
)
)
...
@@ -508,8 +509,8 @@ def test_regression(vllm_runner, image_assets, model, dtype, max_tokens,
...
@@ -508,8 +509,8 @@ def test_regression(vllm_runner, image_assets, model, dtype, max_tokens,
with
global_force_attn_backend_context_manager
(
attn_backend
),
vllm_runner
(
with
global_force_attn_backend_context_manager
(
attn_backend
),
vllm_runner
(
model
,
model
,
dtype
=
dtype
,
dtype
=
dtype
,
max_model_len
=
4096
,
max_model_len
=
8192
,
max_num_seqs
=
2
,
max_num_seqs
=
4
,
tensor_parallel_size
=
1
,
tensor_parallel_size
=
1
,
limit_mm_per_prompt
=
{
"image"
:
limit_mm_per_prompt
=
{
"image"
:
_LIMIT_IMAGE_PER_PROMPT
})
as
vllm_model
:
_LIMIT_IMAGE_PER_PROMPT
})
as
vllm_model
:
...
@@ -554,6 +555,23 @@ def test_regression(vllm_runner, image_assets, model, dtype, max_tokens,
...
@@ -554,6 +555,23 @@ def test_regression(vllm_runner, image_assets, model, dtype, max_tokens,
num_logprobs
,
num_logprobs
,
images
=
images
)
images
=
images
)
# Mixed batch with text and images with different numbers of tiles
prompts
=
[
"<|begin_of_text|>Hello!"
,
"<|begin_of_text|>Some text before.<|image|>What is in the image?"
,
# noqa: E501
"<|begin_of_text|>Some text before.<|image|>What is in the image?"
,
# noqa: E501
]
images
=
[
None
,
[
stop_sign
],
# smaller image must be 2nd for the repro
[
stop_sign
.
resize
((
448
,
448
))],
]
vllm_model
.
generate_greedy_logprobs
(
prompts
,
max_tokens
,
num_logprobs
,
images
=
images
)
class
DummyModel
:
class
DummyModel
:
image_token_id
=
MLLAMA_IMAGE_TOKEN_ID
image_token_id
=
MLLAMA_IMAGE_TOKEN_ID
...
@@ -676,3 +694,26 @@ def test_get_full_text_row_masked_out_mask(input_indices) -> None:
...
@@ -676,3 +694,26 @@ def test_get_full_text_row_masked_out_mask(input_indices) -> None:
f
"full_text_row_masked_out_mask[
{
idx
}
] must be "
\
f
"full_text_row_masked_out_mask[
{
idx
}
] must be "
\
f
"'
{
must_be_masked
}
' "
f
"'
{
must_be_masked
}
' "
idx
+=
1
idx
+=
1
@
pytest
.
mark
.
core_model
@
pytest
.
mark
.
parametrize
(
"encoder_seq_lens, num_tiles, expected"
,
[
([
6404
],
[[
4
]],
[
6404
]),
([
0
,
6404
],
[[
4
]],
[
6404
]),
([
0
,
1601
,
8005
],
[[
1
],
[
4
,
1
]],
[
1601
,
8005
]),
([
0
,
19212
,
0
,
3202
],
[[
4
,
4
,
4
],
[
2
]],
[
19212
,
3202
]),
])
def
test_parse_and_validate_encoder_lens
(
encoder_seq_lens
,
num_tiles
,
expected
)
->
None
:
dummy
=
DummyModel
()
num_tokens_per_tile
=
1601
actual_encoder_seq_lens
=
MllamaForConditionalGeneration
\
.
_get_and_validate_encoder_lens
(
dummy
,
encoder_seq_lens
,
num_tiles
,
num_tokens_per_tile
,
)
assert
actual_encoder_seq_lens
==
expected
,
\
f
"Expected
{
expected
}
but got
{
actual_encoder_seq_lens
}
"
tests/models/multimodal/processing/test_common.py
View file @
31330101
...
@@ -257,6 +257,8 @@ def _test_processing_correctness_mistral(
...
@@ -257,6 +257,8 @@ def _test_processing_correctness_mistral(
"h2oai/h2ovl-mississippi-800m"
,
"h2oai/h2ovl-mississippi-800m"
,
"OpenGVLab/InternVL2-1B"
,
"OpenGVLab/InternVL2-1B"
,
"HuggingFaceM4/Idefics3-8B-Llama3"
,
"HuggingFaceM4/Idefics3-8B-Llama3"
,
"HuggingFaceTB/SmolVLM2-2.2B-Instruct"
,
"meta-llama/Llama-4-Scout-17B-16E-Instruct"
,
"llava-hf/llava-1.5-7b-hf"
,
"llava-hf/llava-1.5-7b-hf"
,
"llava-hf/llava-v1.6-mistral-7b-hf"
,
"llava-hf/llava-v1.6-mistral-7b-hf"
,
"llava-hf/LLaVA-NeXT-Video-7B-hf"
,
"llava-hf/LLaVA-NeXT-Video-7B-hf"
,
...
...
tests/models/multimodal/processing/test_llama4.py
View file @
31330101
...
@@ -71,29 +71,14 @@ def test_processor_override(
...
@@ -71,29 +71,14 @@ def test_processor_override(
# image token offsets
# image token offsets
img_locs
=
processed_inputs
[
"mm_placeholders"
].
get
(
"image"
,
[])
img_locs
=
processed_inputs
[
"mm_placeholders"
].
get
(
"image"
,
[])
assert
len
(
img_locs
)
==
num_imgs
assert
len
(
img_locs
)
==
num_imgs
assert
[
img_loc
[
"
offset
"
]
for
img_loc
in
img_locs
]
==
\
assert
[
img_loc
.
offset
for
img_loc
in
img_locs
]
==
\
[
i
for
i
,
v
in
enumerate
(
prompt_token_ids
)
\
[
i
for
i
,
v
in
enumerate
(
prompt_token_ids
)
\
if
v
==
config
.
boi_token_index
]
if
v
==
config
.
boi_token_index
]
# patch sizes and masks
# patch sizes and masks
assert
prompt_token_ids
.
count
(
config
.
image_token_index
)
\
==
sum
(
img_patch
.
sum
()
for
img_patch
in
mm_kwargs
[
"embed_is_patch"
])
patch_token_id
=
vocab
[
hf_processor
.
img_patch_token
]
num_patches
=
processed_inputs
[
"prompt_token_ids"
].
count
(
patch_token_id
)
mm_counts
=
{
"image"
:
num_imgs
}
assert
num_patches
/
num_imgs
<=
\
processor
.
info
.
get_mm_max_tokens_per_item
(
32768
,
mm_counts
)[
"image"
]
num_patches_per_chunk
=
processor
.
info
.
get_patch_per_chunk
(
num_patches_per_chunk
=
processor
.
info
.
get_patch_per_chunk
(
config
.
vision_config
)
config
.
vision_config
)
assert
prompt_token_ids
.
count
(
config
.
image_token_index
)
\
assert
prompt_token_ids
.
count
(
config
.
image_token_index
)
\
==
mm_kwargs
[
"patches_per_image"
].
sum
()
*
num_patches_per_chunk
==
mm_kwargs
[
"patches_per_image"
].
sum
()
*
num_patches_per_chunk
assert
mm_kwargs
[
"pixel_values"
].
shape
[
0
]
\
assert
mm_kwargs
[
"pixel_values"
].
shape
[
0
]
\
==
mm_kwargs
[
"patches_per_image"
].
sum
()
==
mm_kwargs
[
"patches_per_image"
].
sum
()
for
embed_is_patch
,
aspect_ratio
in
zip
(
mm_kwargs
[
"embed_is_patch"
],
mm_kwargs
[
"aspect_ratios"
]):
assert
embed_is_patch
.
shape
[
0
]
==
\
len
(
tokenizer
.
encode
(
hf_processor
.
_prompt_split_image
(
aspect_ratio
,
num_patches_per_chunk
),
add_special_tokens
=
False
))
tests/models/multimodal/processing/test_llava_next.py
View file @
31330101
...
@@ -92,8 +92,8 @@ def _validate_image_prompt_replacements_one(
...
@@ -92,8 +92,8 @@ def _validate_image_prompt_replacements_one(
first_placeholder
=
image_placeholders
[
0
]
first_placeholder
=
image_placeholders
[
0
]
# NOTE: There is a BOS token
# NOTE: There is a BOS token
assert
first_placeholder
[
"
offset
"
]
==
1
assert
first_placeholder
.
offset
==
1
assert
first_placeholder
[
"
length
"
]
==
(
assert
first_placeholder
.
length
==
(
len
(
processed_inputs
[
"prompt_token_ids"
])
-
1
)
//
num_imgs
len
(
processed_inputs
[
"prompt_token_ids"
])
-
1
)
//
num_imgs
except
Exception
as
exc
:
except
Exception
as
exc
:
...
...
tests/models/multimodal/processing/test_llava_onevision.py
View file @
31330101
...
@@ -92,8 +92,8 @@ def _validate_image_prompt_replacements_one(
...
@@ -92,8 +92,8 @@ def _validate_image_prompt_replacements_one(
first_placeholder
=
image_placeholders
[
0
]
first_placeholder
=
image_placeholders
[
0
]
assert
first_placeholder
[
"
offset
"
]
==
0
assert
first_placeholder
.
offset
==
0
assert
first_placeholder
[
"
length
"
]
==
len
(
assert
first_placeholder
.
length
==
len
(
processed_inputs
[
"prompt_token_ids"
])
//
num_imgs
processed_inputs
[
"prompt_token_ids"
])
//
num_imgs
except
Exception
as
exc
:
except
Exception
as
exc
:
failed_size_excs
.
append
((
image_size
,
exc
))
failed_size_excs
.
append
((
image_size
,
exc
))
...
...
tests/models/multimodal/processing/test_mllama.py
0 → 100644
View file @
31330101
# SPDX-License-Identifier: Apache-2.0
"""Tests for mllama's multimodal preprocessing and profiling."""
import
pytest
from
transformers
import
MllamaConfig
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.profiling
import
MultiModalProfiler
from
...utils
import
build_model_context
@
pytest
.
mark
.
parametrize
(
"model_id"
,
[
"meta-llama/Llama-3.2-11B-Vision-Instruct"
])
@
pytest
.
mark
.
parametrize
(
"max_model_len"
,
[
4096
,
8192
,
25600
,
131072
])
@
pytest
.
mark
.
parametrize
(
"max_num_seqs"
,
[
1
,
2
,
8
])
def
test_profiling
(
model_id
:
str
,
max_model_len
:
int
,
max_num_seqs
:
int
,
):
# regression test for https://github.com/vllm-project/vllm/issues/13929
from
vllm.model_executor.models.mllama
import
calc_token_per_chunk
model_config_kwargs
=
{
"max_model_len"
:
max_model_len
,
}
ctx
=
build_model_context
(
model_id
,
model_config_kwargs
=
model_config_kwargs
,
limit_mm_per_prompt
=
{
"image"
:
1
},
)
mm_config
=
ctx
.
get_mm_config
()
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
ctx
.
model_config
)
profiler
=
MultiModalProfiler
(
processor
)
dummy_encoder_data
=
profiler
.
get_encoder_dummy_data
(
max_model_len
,
mm_counts
=
mm_config
.
limit_per_prompt
,
)
dummy_mm_data
=
processor
.
dummy_inputs
.
get_dummy_processor_inputs
(
max_model_len
,
mm_counts
=
mm_config
.
limit_per_prompt
,
)
hf_config
=
ctx
.
get_hf_config
(
MllamaConfig
)
image_size
=
hf_config
.
vision_config
.
image_size
encoder_seq_lens
=
[
len
(
dummy_encoder_data
.
prompt_token_ids
)
]
*
max_num_seqs
mm_kwargs
=
processor
.
apply
(
prompt
=
dummy_mm_data
.
prompt_text
,
mm_data
=
dummy_mm_data
.
mm_data
,
hf_processor_mm_kwargs
=
dict
(),
)[
"mm_kwargs"
]
# Get the actual number of encoder tokens for each sample.
# Because attn_metadata.encoder_seq_lens only counts the last
# group of images for each sample, which is used to cheat the
# block manager to allocate blocks for those images only.
# See MllamaMultiModalProcessor for more details.
num_tiles
=
[[
t
]
for
t
in
mm_kwargs
.
pop
(
"num_tiles"
)]
num_tokens_per_tile
=
calc_token_per_chunk
(
image_size
)
actual_encoder_seq_lens
=
[
sum
(
num_tile
)
*
num_tokens_per_tile
for
num_tile
in
num_tiles
]
# simulate mllama image-present prefill.
for
actual_len
,
last_group_len
in
zip
(
actual_encoder_seq_lens
,
encoder_seq_lens
):
assert
actual_len
>=
last_group_len
tests/models/multimodal/processing/test_smolvlm.py
0 → 100644
View file @
31330101
# SPDX-License-Identifier: Apache-2.0
"""Tests for smolvlm's multimodal preprocessing kwargs."""
import
pytest
from
transformers
import
SmolVLMConfig
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
....conftest
import
_ImageAssets
from
...utils
import
build_model_context
@
pytest
.
mark
.
parametrize
(
"model_id"
,
[
"HuggingFaceTB/SmolVLM2-2.2B-Instruct"
])
# yapf: disable
@
pytest
.
mark
.
parametrize
(
(
"mm_processor_kwargs"
,
"expected_toks_per_img"
),
[
({
"max_image_size"
:
{
"longest_edge"
:
384
}},
1377
),
({
"max_image_size"
:
{
"longest_edge"
:
768
}},
405
),
])
# yapf: enable
@
pytest
.
mark
.
parametrize
(
"num_imgs"
,
[
1
,
2
])
@
pytest
.
mark
.
parametrize
(
"kwargs_on_init"
,
[
True
,
False
])
def
test_processor_override
(
image_assets
:
_ImageAssets
,
model_id
:
str
,
mm_processor_kwargs
:
dict
[
str
,
object
],
expected_toks_per_img
:
int
,
num_imgs
:
int
,
kwargs_on_init
:
bool
,
):
"""Ensure Idefics3MultiModalProcessor handles num_crops properly."""
# Same as the previous test - don't initialize mm_processor_kwargs
# in this test and assume that the kwargs will be correctly expanded by
# the partial when calling the custom input processor.
ctx
=
build_model_context
(
model_id
,
mm_processor_kwargs
=
mm_processor_kwargs
if
kwargs_on_init
else
None
,
limit_mm_per_prompt
=
{
"image"
:
num_imgs
},
)
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
ctx
.
model_config
)
hf_processor_mm_kwargs
=
{}
if
kwargs_on_init
else
mm_processor_kwargs
# Build the image str / prompt based on the number of images we pass
placeholders
=
"<image>"
if
num_imgs
==
1
else
"
\n
"
.
join
(
f
"Image-
{
i
}
: <image>
\n
"
for
i
in
range
(
1
,
num_imgs
+
1
))
prompt
=
f
"<|im_start|>User:
{
placeholders
}
\n
<end_of_utterance>
\n
Assistant:"
# noqa: E501
# Build mm_data
image_size
=
ctx
.
get_hf_config
(
SmolVLMConfig
).
vision_config
.
image_size
dummy_image_size
=
(
image_size
*
4
,
image_size
*
4
)
dummy_image
=
image_assets
[
0
].
pil_image
.
resize
(
dummy_image_size
)
mm_data
=
{
"image"
:
[
dummy_image
]
*
num_imgs
}
processed_inputs
=
processor
.
apply
(
prompt
,
mm_data
,
hf_processor_mm_kwargs
)
# Ensure the placeholders format are correct
hf_processor
=
processor
.
info
.
get_hf_processor
(
**
hf_processor_mm_kwargs
)
hf_processed_inputs
=
hf_processor
(
text
=
prompt
,
images
=
mm_data
[
"image"
])
assert
processed_inputs
[
"prompt_token_ids"
]
==
hf_processed_inputs
[
"input_ids"
][
0
]
# Ensure we have the right number of placeholders per num_crops size
image_token_id
=
ctx
.
get_hf_config
().
image_token_id
img_tok_count
=
processed_inputs
[
"prompt_token_ids"
].
count
(
image_token_id
)
assert
img_tok_count
==
expected_toks_per_img
*
num_imgs
tests/models/registry.py
View file @
31330101
...
@@ -124,6 +124,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
...
@@ -124,6 +124,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
"BloomForCausalLM"
:
_HfExamplesInfo
(
"bigscience/bloomz-1b1"
),
"BloomForCausalLM"
:
_HfExamplesInfo
(
"bigscience/bloomz-1b1"
),
"ChatGLMModel"
:
_HfExamplesInfo
(
"THUDM/chatglm3-6b"
,
"ChatGLMModel"
:
_HfExamplesInfo
(
"THUDM/chatglm3-6b"
,
trust_remote_code
=
True
),
trust_remote_code
=
True
),
"ChatGLMForConditionalGeneration"
:
_HfExamplesInfo
(
"thu-coai/ShieldLM-6B-chatglm3"
,
# noqa: E501
trust_remote_code
=
True
),
"CohereForCausalLM"
:
_HfExamplesInfo
(
"CohereForAI/c4ai-command-r-v01"
,
"CohereForCausalLM"
:
_HfExamplesInfo
(
"CohereForAI/c4ai-command-r-v01"
,
trust_remote_code
=
True
),
trust_remote_code
=
True
),
"Cohere2ForCausalLM"
:
_HfExamplesInfo
(
"CohereForAI/c4ai-command-r7b-12-2024"
,
# noqa: E501
"Cohere2ForCausalLM"
:
_HfExamplesInfo
(
"CohereForAI/c4ai-command-r7b-12-2024"
,
# noqa: E501
...
@@ -144,6 +146,11 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
...
@@ -144,6 +146,11 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
"Gemma3ForCausalLM"
:
_HfExamplesInfo
(
"google/gemma-3-1b-it"
,
"Gemma3ForCausalLM"
:
_HfExamplesInfo
(
"google/gemma-3-1b-it"
,
min_transformers_version
=
"4.50"
),
min_transformers_version
=
"4.50"
),
"GlmForCausalLM"
:
_HfExamplesInfo
(
"THUDM/glm-4-9b-chat-hf"
),
"GlmForCausalLM"
:
_HfExamplesInfo
(
"THUDM/glm-4-9b-chat-hf"
),
"Glm4ForCausalLM"
:
_HfExamplesInfo
(
"THUDM/GLM-4-32B-Chat-0414"
,
is_available_online
=
False
,
min_transformers_version
=
"4.52.dev0"
),
"GPT2LMHeadModel"
:
_HfExamplesInfo
(
"gpt2"
),
"GPT2LMHeadModel"
:
_HfExamplesInfo
(
"gpt2"
),
"GPTBigCodeForCausalLM"
:
_HfExamplesInfo
(
"bigcode/starcoder"
),
"GPTBigCodeForCausalLM"
:
_HfExamplesInfo
(
"bigcode/starcoder"
),
"GPTJForCausalLM"
:
_HfExamplesInfo
(
"EleutherAI/gpt-j-6b"
),
"GPTJForCausalLM"
:
_HfExamplesInfo
(
"EleutherAI/gpt-j-6b"
),
...
@@ -202,6 +209,16 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
...
@@ -202,6 +209,16 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
"Qwen2ForCausalLM"
:
_HfExamplesInfo
(
"Qwen/Qwen2-7B-Instruct"
,
"Qwen2ForCausalLM"
:
_HfExamplesInfo
(
"Qwen/Qwen2-7B-Instruct"
,
extras
=
{
"2.5"
:
"Qwen/Qwen2.5-7B-Instruct"
}),
# noqa: E501
extras
=
{
"2.5"
:
"Qwen/Qwen2.5-7B-Instruct"
}),
# noqa: E501
"Qwen2MoeForCausalLM"
:
_HfExamplesInfo
(
"Qwen/Qwen1.5-MoE-A2.7B-Chat"
),
"Qwen2MoeForCausalLM"
:
_HfExamplesInfo
(
"Qwen/Qwen1.5-MoE-A2.7B-Chat"
),
"Qwen3ForCausalLM"
:
_HfExamplesInfo
(
"Qwen/Qwen3-8B"
,
is_available_online
=
False
,
min_transformers_version
=
"4.51"
),
"Qwen3MoeForCausalLM"
:
_HfExamplesInfo
(
"Qwen/Qwen3-MoE-15B-A2B"
,
is_available_online
=
False
,
min_transformers_version
=
"4.51"
),
"RWForCausalLM"
:
_HfExamplesInfo
(
"tiiuae/falcon-40b"
,
"RWForCausalLM"
:
_HfExamplesInfo
(
"tiiuae/falcon-40b"
,
is_available_online
=
False
),
is_available_online
=
False
),
"StableLMEpochForCausalLM"
:
_HfExamplesInfo
(
"stabilityai/stablelm-zephyr-3b"
,
# noqa: E501
"StableLMEpochForCausalLM"
:
_HfExamplesInfo
(
"stabilityai/stablelm-zephyr-3b"
,
# noqa: E501
...
@@ -277,12 +294,16 @@ _MULTIMODAL_EXAMPLE_MODELS = {
...
@@ -277,12 +294,16 @@ _MULTIMODAL_EXAMPLE_MODELS = {
trust_remote_code
=
True
,
trust_remote_code
=
True
,
hf_overrides
=
{
"architectures"
:
[
"GLM4VForCausalLM"
]}),
# noqa: E501
hf_overrides
=
{
"architectures"
:
[
"GLM4VForCausalLM"
]}),
# noqa: E501
"H2OVLChatModel"
:
_HfExamplesInfo
(
"h2oai/h2ovl-mississippi-800m"
,
"H2OVLChatModel"
:
_HfExamplesInfo
(
"h2oai/h2ovl-mississippi-800m"
,
extras
=
{
"2b"
:
"h2oai/h2ovl-mississippi-2b"
}),
# noqa: E501
extras
=
{
"2b"
:
"h2oai/h2ovl-mississippi-2b"
},
# noqa: E501
max_transformers_version
=
"4.48"
,
# noqa: E501
transformers_version_reason
=
"HF model is not compatible."
),
# noqa: E501
"InternVLChatModel"
:
_HfExamplesInfo
(
"OpenGVLab/InternVL2-1B"
,
"InternVLChatModel"
:
_HfExamplesInfo
(
"OpenGVLab/InternVL2-1B"
,
extras
=
{
"2B"
:
"OpenGVLab/InternVL2-2B"
},
# noqa: E501
extras
=
{
"2B"
:
"OpenGVLab/InternVL2-2B"
},
# noqa: E501
trust_remote_code
=
True
),
trust_remote_code
=
True
),
"Idefics3ForConditionalGeneration"
:
_HfExamplesInfo
(
"HuggingFaceM4/Idefics3-8B-Llama3"
,
# noqa: E501
"Idefics3ForConditionalGeneration"
:
_HfExamplesInfo
(
"HuggingFaceM4/Idefics3-8B-Llama3"
,
# noqa: E501
{
"tiny"
:
"HuggingFaceTB/SmolVLM-256M-Instruct"
}),
# noqa: E501
{
"tiny"
:
"HuggingFaceTB/SmolVLM-256M-Instruct"
}),
# noqa: E501
"Llama4ForConditionalGeneration"
:
_HfExamplesInfo
(
"meta-llama/Llama-4-Scout-17B-16E-Instruct"
,
# noqa: E501
min_transformers_version
=
"4.51"
),
"LlavaForConditionalGeneration"
:
_HfExamplesInfo
(
"llava-hf/llava-1.5-7b-hf"
,
"LlavaForConditionalGeneration"
:
_HfExamplesInfo
(
"llava-hf/llava-1.5-7b-hf"
,
extras
=
{
"mistral"
:
"mistral-community/pixtral-12b"
,
# noqa: E501
extras
=
{
"mistral"
:
"mistral-community/pixtral-12b"
,
# noqa: E501
"mistral-fp8"
:
"nm-testing/pixtral-12b-FP8-dynamic"
}),
# noqa: E501
"mistral-fp8"
:
"nm-testing/pixtral-12b-FP8-dynamic"
}),
# noqa: E501
...
@@ -305,7 +326,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
...
@@ -305,7 +326,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
extras
=
{
"fp8"
:
"nm-testing/Mistral-Small-3.1-24B-Instruct-2503-FP8-dynamic"
}),
# noqa: E501
extras
=
{
"fp8"
:
"nm-testing/Mistral-Small-3.1-24B-Instruct-2503-FP8-dynamic"
}),
# noqa: E501
"MolmoForCausalLM"
:
_HfExamplesInfo
(
"allenai/Molmo-7B-D-0924"
,
"MolmoForCausalLM"
:
_HfExamplesInfo
(
"allenai/Molmo-7B-D-0924"
,
max_transformers_version
=
"4.48"
,
max_transformers_version
=
"4.48"
,
transformers_version_reason
=
"
Use of private method which no longer exists
."
,
# noqa: E501
transformers_version_reason
=
"
Incorrectly-detected `tensorflow` import
."
,
# noqa: E501
extras
=
{
"olmo"
:
"allenai/Molmo-7B-O-0924"
},
# noqa: E501
extras
=
{
"olmo"
:
"allenai/Molmo-7B-O-0924"
},
# noqa: E501
trust_remote_code
=
True
),
trust_remote_code
=
True
),
"NVLM_D"
:
_HfExamplesInfo
(
"nvidia/NVLM-D-72B"
,
"NVLM_D"
:
_HfExamplesInfo
(
"nvidia/NVLM-D-72B"
,
...
@@ -314,6 +335,8 @@ _MULTIMODAL_EXAMPLE_MODELS = {
...
@@ -314,6 +335,8 @@ _MULTIMODAL_EXAMPLE_MODELS = {
extras
=
{
"v2"
:
"google/paligemma2-3b-ft-docci-448"
}),
# noqa: E501
extras
=
{
"v2"
:
"google/paligemma2-3b-ft-docci-448"
}),
# noqa: E501
"Phi3VForCausalLM"
:
_HfExamplesInfo
(
"microsoft/Phi-3-vision-128k-instruct"
,
"Phi3VForCausalLM"
:
_HfExamplesInfo
(
"microsoft/Phi-3-vision-128k-instruct"
,
trust_remote_code
=
True
,
trust_remote_code
=
True
,
max_transformers_version
=
"4.48"
,
transformers_version_reason
=
"Use of deprecated imports which have been removed."
,
# noqa: E501
extras
=
{
"phi3.5"
:
"microsoft/Phi-3.5-vision-instruct"
}),
# noqa: E501
extras
=
{
"phi3.5"
:
"microsoft/Phi-3.5-vision-instruct"
}),
# noqa: E501
"Phi4MMForCausalLM"
:
_HfExamplesInfo
(
"microsoft/Phi-4-multimodal-instruct"
,
"Phi4MMForCausalLM"
:
_HfExamplesInfo
(
"microsoft/Phi-4-multimodal-instruct"
,
trust_remote_code
=
True
),
trust_remote_code
=
True
),
...
@@ -328,6 +351,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
...
@@ -328,6 +351,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"Qwen2_5_VLForConditionalGeneration"
:
_HfExamplesInfo
(
"Qwen/Qwen2.5-VL-3B-Instruct"
,
# noqa: E501
"Qwen2_5_VLForConditionalGeneration"
:
_HfExamplesInfo
(
"Qwen/Qwen2.5-VL-3B-Instruct"
,
# noqa: E501
min_transformers_version
=
"4.49"
),
# noqa: E501
min_transformers_version
=
"4.49"
),
# noqa: E501
"SkyworkR1VChatModel"
:
_HfExamplesInfo
(
"Skywork/Skywork-R1V-38B"
),
"SkyworkR1VChatModel"
:
_HfExamplesInfo
(
"Skywork/Skywork-R1V-38B"
),
"SmolVLMForConditionalGeneration"
:
_HfExamplesInfo
(
"HuggingFaceTB/SmolVLM2-2.2B-Instruct"
),
# noqa: E501
"UltravoxModel"
:
_HfExamplesInfo
(
"fixie-ai/ultravox-v0_5-llama-3_2-1b"
,
# noqa: E501
"UltravoxModel"
:
_HfExamplesInfo
(
"fixie-ai/ultravox-v0_5-llama-3_2-1b"
,
# noqa: E501
trust_remote_code
=
True
),
trust_remote_code
=
True
),
# [Encoder-decoder]
# [Encoder-decoder]
...
@@ -351,6 +375,10 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = {
...
@@ -351,6 +375,10 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = {
"DeepSeekMTPModel"
:
_HfExamplesInfo
(
"luccafong/deepseek_mtp_main_random"
,
"DeepSeekMTPModel"
:
_HfExamplesInfo
(
"luccafong/deepseek_mtp_main_random"
,
speculative_model
=
"luccafong/deepseek_mtp_draft_random"
,
# noqa: E501
speculative_model
=
"luccafong/deepseek_mtp_draft_random"
,
# noqa: E501
trust_remote_code
=
True
),
trust_remote_code
=
True
),
"EagleLlamaForCausalLM"
:
_HfExamplesInfo
(
"yuhuili/EAGLE-LLaMA3-Instruct-8B"
,
trust_remote_code
=
True
,
speculative_model
=
"yuhuili/EAGLE-LLaMA3-Instruct-8B"
,
tokenizer
=
"meta-llama/Meta-Llama-3-8B-Instruct"
),
# noqa: E501
}
}
_TRANSFORMERS_MODELS
=
{
_TRANSFORMERS_MODELS
=
{
...
...
tests/models/test_initialization.py
View file @
31330101
...
@@ -7,6 +7,8 @@ from transformers import PretrainedConfig
...
@@ -7,6 +7,8 @@ from transformers import PretrainedConfig
from
vllm
import
LLM
from
vllm
import
LLM
from
vllm.engine.llm_engine
import
LLMEngine
as
V0LLMEngine
from
vllm.engine.llm_engine
import
LLMEngine
as
V0LLMEngine
from
vllm.utils
import
GiB_bytes
from
vllm.v1.core.kv_cache_utils
import
get_kv_cache_config
from
vllm.v1.engine.core
import
EngineCore
as
V1EngineCore
from
vllm.v1.engine.core
import
EngineCore
as
V1EngineCore
from
.registry
import
HF_EXAMPLE_MODELS
from
.registry
import
HF_EXAMPLE_MODELS
...
@@ -42,14 +44,21 @@ def test_can_initialize(model_arch):
...
@@ -42,14 +44,21 @@ def test_can_initialize(model_arch):
self
.
cache_config
.
num_gpu_blocks
=
0
self
.
cache_config
.
num_gpu_blocks
=
0
self
.
cache_config
.
num_cpu_blocks
=
0
self
.
cache_config
.
num_cpu_blocks
=
0
def
_initalize_kv_caches_v1
(
self
,
vllm_config
):
def
_initialize_kv_caches_v1
(
self
,
vllm_config
):
# gpu_blocks (> 0), cpu_blocks
kv_cache_specs
=
self
.
model_executor
.
get_kv_cache_specs
()
return
1
,
0
scheduler_kv_cache_config
=
get_kv_cache_config
(
vllm_config
,
kv_cache_specs
[
0
],
20
*
GiB_bytes
,
)
# gpu_blocks (> 0), cpu_blocks, scheduler_kv_cache_config
return
1
,
0
,
scheduler_kv_cache_config
with
(
patch
.
object
(
V0LLMEngine
,
"_initialize_kv_caches"
,
with
(
patch
.
object
(
V0LLMEngine
,
"_initialize_kv_caches"
,
_initialize_kv_caches_v0
),
_initialize_kv_caches_v0
),
patch
.
object
(
V1EngineCore
,
"_initialize_kv_caches"
,
patch
.
object
(
V1EngineCore
,
"_initialize_kv_caches"
,
_initalize_kv_caches_v1
)):
_init
i
alize_kv_caches_v1
)):
LLM
(
LLM
(
model_info
.
default
,
model_info
.
default
,
tokenizer
=
model_info
.
tokenizer
,
tokenizer
=
model_info
.
tokenizer
,
...
...
tests/models/test_oot_registration.py
View file @
31330101
...
@@ -90,6 +90,7 @@ def test_oot_registration_multimodal(
...
@@ -90,6 +90,7 @@ def test_oot_registration_multimodal(
max_model_len
=
4096
,
max_model_len
=
4096
,
enforce_eager
=
True
,
enforce_eager
=
True
,
limit_mm_per_prompt
=
{
"image"
:
1
})
limit_mm_per_prompt
=
{
"image"
:
1
})
first_token
=
llm
.
get_tokenizer
().
decode
(
0
)
first_token
=
llm
.
get_tokenizer
().
decode
(
0
)
outputs
=
llm
.
generate
(
prompts
,
sampling_params
)
outputs
=
llm
.
generate
(
prompts
,
sampling_params
)
...
...
tests/models/utils.py
View file @
31330101
...
@@ -255,6 +255,7 @@ def build_model_context(
...
@@ -255,6 +255,7 @@ def build_model_context(
model_id
:
str
,
model_id
:
str
,
task
:
TaskOption
=
"auto"
,
task
:
TaskOption
=
"auto"
,
dtype
:
Union
[
str
,
torch
.
dtype
]
=
"auto"
,
dtype
:
Union
[
str
,
torch
.
dtype
]
=
"auto"
,
model_config_kwargs
:
Optional
[
dict
[
str
,
Any
]]
=
None
,
mm_processor_kwargs
:
Optional
[
dict
[
str
,
Any
]]
=
None
,
mm_processor_kwargs
:
Optional
[
dict
[
str
,
Any
]]
=
None
,
limit_mm_per_prompt
:
Optional
[
dict
[
str
,
int
]]
=
None
,
limit_mm_per_prompt
:
Optional
[
dict
[
str
,
int
]]
=
None
,
disable_mm_preprocessor_cache
:
bool
=
True
,
disable_mm_preprocessor_cache
:
bool
=
True
,
...
@@ -274,6 +275,7 @@ def build_model_context(
...
@@ -274,6 +275,7 @@ def build_model_context(
model_info
.
check_available_online
(
on_fail
=
"skip"
)
model_info
.
check_available_online
(
on_fail
=
"skip"
)
model_info
.
check_transformers_version
(
on_fail
=
"skip"
)
model_info
.
check_transformers_version
(
on_fail
=
"skip"
)
model_config_kwargs
=
model_config_kwargs
or
{}
model_config
=
ModelConfig
(
model_config
=
ModelConfig
(
model_id
,
model_id
,
task
=
task
,
task
=
task
,
...
@@ -286,5 +288,6 @@ def build_model_context(
...
@@ -286,5 +288,6 @@ def build_model_context(
limit_mm_per_prompt
=
limit_mm_per_prompt
,
limit_mm_per_prompt
=
limit_mm_per_prompt
,
disable_mm_preprocessor_cache
=
disable_mm_preprocessor_cache
,
disable_mm_preprocessor_cache
=
disable_mm_preprocessor_cache
,
hf_overrides
=
model_info
.
hf_overrides
,
hf_overrides
=
model_info
.
hf_overrides
,
**
model_config_kwargs
,
)
)
return
InputContext
(
model_config
)
return
InputContext
(
model_config
)
tests/multimodal/test_processing.py
View file @
31330101
...
@@ -785,6 +785,7 @@ def test_find_update_tokens(
...
@@ -785,6 +785,7 @@ def test_find_update_tokens(
item_idx
=
0
,
item_idx
=
0
,
start_idx
=
6
,
start_idx
=
6
,
tokens
=
[
32000
,
32000
],
tokens
=
[
32000
,
32000
],
is_embed
=
None
,
),
),
],
],
"pattern_4"
:
[
"pattern_4"
:
[
...
@@ -793,6 +794,7 @@ def test_find_update_tokens(
...
@@ -793,6 +794,7 @@ def test_find_update_tokens(
item_idx
=
0
,
item_idx
=
0
,
start_idx
=
3
,
start_idx
=
3
,
tokens
=
[
32000
],
tokens
=
[
32000
],
is_embed
=
None
,
),
),
],
],
}
}
...
@@ -807,12 +809,14 @@ def test_find_update_tokens(
...
@@ -807,12 +809,14 @@ def test_find_update_tokens(
item_idx
=
0
,
item_idx
=
0
,
start_idx
=
1
,
start_idx
=
1
,
tokens
=
[
32000
,
32000
],
tokens
=
[
32000
,
32000
],
is_embed
=
None
,
),
),
PlaceholderFeaturesInfo
(
PlaceholderFeaturesInfo
(
modality
=
"pattern_1"
,
modality
=
"pattern_1"
,
item_idx
=
1
,
item_idx
=
1
,
start_idx
=
5
,
start_idx
=
5
,
tokens
=
[
32000
,
32000
],
tokens
=
[
32000
,
32000
],
is_embed
=
None
,
),
),
],
],
"pattern_3"
:
[
"pattern_3"
:
[
...
@@ -821,6 +825,7 @@ def test_find_update_tokens(
...
@@ -821,6 +825,7 @@ def test_find_update_tokens(
item_idx
=
0
,
item_idx
=
0
,
start_idx
=
7
,
start_idx
=
7
,
tokens
=
[
1550
,
918
,
1550
],
tokens
=
[
1550
,
918
,
1550
],
is_embed
=
None
,
),
),
],
],
# No match for pattern_4 as it has lower priority than pattern_1
# No match for pattern_4 as it has lower priority than pattern_1
...
@@ -835,12 +840,14 @@ def test_find_update_tokens(
...
@@ -835,12 +840,14 @@ def test_find_update_tokens(
item_idx
=
0
,
item_idx
=
0
,
start_idx
=
1
,
start_idx
=
1
,
tokens
=
[
32000
,
32000
],
tokens
=
[
32000
,
32000
],
is_embed
=
None
,
),
),
PlaceholderFeaturesInfo
(
PlaceholderFeaturesInfo
(
modality
=
"pattern_1"
,
modality
=
"pattern_1"
,
item_idx
=
1
,
item_idx
=
1
,
start_idx
=
3
,
start_idx
=
3
,
tokens
=
[
32000
,
32000
],
tokens
=
[
32000
,
32000
],
is_embed
=
None
,
),
),
],
],
"pattern_4"
:
[
"pattern_4"
:
[
...
@@ -849,6 +856,7 @@ def test_find_update_tokens(
...
@@ -849,6 +856,7 @@ def test_find_update_tokens(
item_idx
=
0
,
item_idx
=
0
,
start_idx
=
5
,
start_idx
=
5
,
tokens
=
[
32000
],
tokens
=
[
32000
],
is_embed
=
None
,
),
),
],
],
"pattern_3"
:
[
"pattern_3"
:
[
...
@@ -857,6 +865,7 @@ def test_find_update_tokens(
...
@@ -857,6 +865,7 @@ def test_find_update_tokens(
item_idx
=
0
,
item_idx
=
0
,
start_idx
=
6
,
start_idx
=
6
,
tokens
=
[
1550
,
918
,
1550
],
tokens
=
[
1550
,
918
,
1550
],
is_embed
=
None
,
),
),
],
],
}
}
...
@@ -963,10 +972,13 @@ def test_limit_mm_per_prompt_dummy(model_id, limit, num_supported, is_valid):
...
@@ -963,10 +972,13 @@ def test_limit_mm_per_prompt_dummy(model_id, limit, num_supported, is_valid):
if
is_valid
:
if
is_valid
:
exc_ctx
=
nullcontext
()
exc_ctx
=
nullcontext
()
else
:
else
:
exc_ctx
=
pytest
.
raises
(
ValueError
,
match
=
"
this
model only supports"
)
exc_ctx
=
pytest
.
raises
(
ValueError
,
match
=
"
The
model only supports"
)
with
exc_ctx
:
with
exc_ctx
:
profiler
.
get_decoder_dummy_data
(
model_config
.
max_model_len
)
profiler
.
get_decoder_dummy_data
(
model_config
.
max_model_len
,
mm_counts
=
limit_mm_per_prompt
,
)
@
pytest
.
mark
.
parametrize
(
"model_id"
,
[
"llava-hf/llava-v1.6-mistral-7b-hf"
])
@
pytest
.
mark
.
parametrize
(
"model_id"
,
[
"llava-hf/llava-v1.6-mistral-7b-hf"
])
...
...
tests/quantization/test_bitsandbytes.py
View file @
31330101
...
@@ -45,7 +45,7 @@ def test_load_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
...
@@ -45,7 +45,7 @@ def test_load_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
hf_model_kwargs
=
{
"load_in_4bit"
:
True
}
hf_model_kwargs
=
{
"load_in_4bit"
:
True
}
validate_generated_texts
(
hf_runner
,
vllm_runner
,
example_prompts
[:
1
],
validate_generated_texts
(
hf_runner
,
vllm_runner
,
example_prompts
[:
1
],
model_name
,
hf_model_kwargs
)
model_name
,
False
,
hf_model_kwargs
)
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"bitsandbytes"
)
or
current_platform
(),
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"bitsandbytes"
)
or
current_platform
(),
...
@@ -57,7 +57,7 @@ def test_load_pre_quant_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
...
@@ -57,7 +57,7 @@ def test_load_pre_quant_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
model_name
,
description
)
->
None
:
model_name
,
description
)
->
None
:
validate_generated_texts
(
hf_runner
,
vllm_runner
,
example_prompts
[:
1
],
validate_generated_texts
(
hf_runner
,
vllm_runner
,
example_prompts
[:
1
],
model_name
)
model_name
,
True
)
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"bitsandbytes"
)
or
current_platform
(),
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"bitsandbytes"
)
or
current_platform
(),
...
@@ -69,7 +69,7 @@ def test_load_8bit_bnb_model(hf_runner, vllm_runner, example_prompts,
...
@@ -69,7 +69,7 @@ def test_load_8bit_bnb_model(hf_runner, vllm_runner, example_prompts,
model_name
,
description
)
->
None
:
model_name
,
description
)
->
None
:
validate_generated_texts
(
hf_runner
,
vllm_runner
,
example_prompts
[:
1
],
validate_generated_texts
(
hf_runner
,
vllm_runner
,
example_prompts
[:
1
],
model_name
)
model_name
,
True
)
@
pytest
.
mark
.
skipif
(
torch
.
cuda
.
device_count
()
<
2
,
@
pytest
.
mark
.
skipif
(
torch
.
cuda
.
device_count
()
<
2
,
...
@@ -86,6 +86,7 @@ def test_load_tp_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
...
@@ -86,6 +86,7 @@ def test_load_tp_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
vllm_runner
,
vllm_runner
,
example_prompts
[:
1
],
example_prompts
[:
1
],
model_name
,
model_name
,
False
,
hf_model_kwargs
,
hf_model_kwargs
,
vllm_tp_size
=
2
)
vllm_tp_size
=
2
)
...
@@ -132,13 +133,14 @@ def validate_generated_texts(hf_runner,
...
@@ -132,13 +133,14 @@ def validate_generated_texts(hf_runner,
vllm_runner
,
vllm_runner
,
prompts
,
prompts
,
model_name
,
model_name
,
pre_quant
=
False
,
hf_model_kwargs
=
None
,
hf_model_kwargs
=
None
,
vllm_tp_size
=
1
):
vllm_tp_size
=
1
):
# NOTE: run vLLM first, as it requires a clean process
# NOTE: run vLLM first, as it requires a clean process
# when using distributed inference
# when using distributed inference
with
vllm_runner
(
model_name
,
with
vllm_runner
(
model_name
,
quantization
=
'bitsandbytes'
,
quantization
=
None
if
pre_quant
else
'bitsandbytes'
,
tensor_parallel_size
=
vllm_tp_size
,
tensor_parallel_size
=
vllm_tp_size
,
enforce_eager
=
False
)
as
llm
:
enforce_eager
=
False
)
as
llm
:
vllm_outputs
=
llm
.
generate_greedy
(
prompts
,
8
)
vllm_outputs
=
llm
.
generate_greedy
(
prompts
,
8
)
...
...
tests/quantization/test_quark.py
View file @
31330101
...
@@ -4,17 +4,28 @@
...
@@ -4,17 +4,28 @@
Run `pytest tests/quantization/test_quark.py`.
Run `pytest tests/quantization/test_quark.py`.
"""
"""
import
torch
import
pytest
from
vllm.model_executor.layers.quantization.quark.quark
import
(
# noqa: E501
from
vllm.model_executor.layers.quantization.quark.quark
import
(
# noqa: E501
QuarkLinearMethod
,
QuarkW8A8Fp8
)
QuarkLinearMethod
,
QuarkW8A8Fp8
,
QuarkW8A8Int8
)
from
vllm.platforms
import
current_platform
def
test_quark_fp8
(
vllm_runner
,
monkeypatch
):
@
pytest
.
fixture
(
scope
=
"function"
,
autouse
=
True
)
# vllm_runner.apply_model() relies on V0 internals.
def
use_v0_only
(
monkeypatch
):
monkeypatch
.
setenv
(
"VLLM_USE_V1"
,
"0"
)
"""
This module relies on V0 internals, so set VLLM_USE_V1=0.
"""
monkeypatch
.
setenv
(
'VLLM_USE_V1'
,
'0'
)
@
pytest
.
mark
.
parametrize
(
'kv_cache_dtype'
,
[
'auto'
,
'fp8'
])
@
pytest
.
mark
.
parametrize
(
'tp'
,
[
1
])
def
test_quark_fp8_w_per_tensor_a_per_tensor
(
vllm_runner
,
kv_cache_dtype
,
tp
):
model_path
=
"amd/Llama-3.1-8B-Instruct-FP8-KV-Quark-test"
model_path
=
"amd/Llama-3.1-8B-Instruct-FP8-KV-Quark-test"
with
vllm_runner
(
model_path
)
as
llm
:
with
vllm_runner
(
model_path
,
kv_cache_dtype
=
kv_cache_dtype
,
tensor_parallel_size
=
tp
)
as
llm
:
def
check_model
(
model
):
def
check_model
(
model
):
layer
=
model
.
model
.
layers
[
0
]
layer
=
model
.
model
.
layers
[
0
]
...
@@ -26,11 +37,29 @@ def test_quark_fp8(vllm_runner, monkeypatch):
...
@@ -26,11 +37,29 @@ def test_quark_fp8(vllm_runner, monkeypatch):
if
isinstance
(
qkv_proj
.
scheme
,
QuarkW8A8Fp8
):
if
isinstance
(
qkv_proj
.
scheme
,
QuarkW8A8Fp8
):
assert
len
(
qkv_proj
.
input_scale
.
shape
)
==
0
assert
len
(
qkv_proj
.
input_scale
.
shape
)
==
0
assert
qkv_proj
.
weight
.
dtype
is
torch
.
float8_e4m3fn
assert
qkv_proj
.
weight
.
dtype
is
current_platform
.
fp8_dtype
()
#assert qkv_proj.weight.dtype is torch.float8_e4m3fnuz
assert
len
(
qkv_proj
.
weight_scale
.
shape
)
==
0
assert
len
(
qkv_proj
.
weight_scale
.
shape
)
==
0
llm
.
apply_model
(
check_model
)
llm
.
apply_model
(
check_model
)
output
=
llm
.
generate_greedy
(
"Hello my name is"
,
max_tokens
=
20
)
output
=
llm
.
generate_greedy
(
"Hello my name is"
,
max_tokens
=
20
)
assert
output
assert
output
@
pytest
.
mark
.
parametrize
(
'tp'
,
[
1
])
def
test_quark_int8_w_per_tensor_a_per_tensor
(
vllm_runner
,
tp
):
model_path
=
"amd/Llama-3.1-8B-Instruct-w-int8-a-int8-sym-test"
with
vllm_runner
(
model_path
,
tensor_parallel_size
=
tp
)
as
llm
:
def
check_model
(
model
):
layer
=
model
.
model
.
layers
[
0
]
qkv_proj
=
layer
.
self_attn
.
qkv_proj
assert
isinstance
(
qkv_proj
.
quant_method
,
QuarkLinearMethod
)
assert
isinstance
(
qkv_proj
.
scheme
,
QuarkW8A8Int8
)
llm
.
apply_model
(
check_model
)
output
=
llm
.
generate_greedy
(
"Hello my name is"
,
max_tokens
=
20
)
assert
output
Prev
1
…
3
4
5
6
7
8
9
10
11
…
18
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment