Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
4eabe123
Commit
4eabe123
authored
May 28, 2025
by
zhuwenwen
Browse files
Merge remote-tracking branch 'mirror/releases/v0.9.0' into v0.9.0-ori
parents
45840cd2
58738772
Changes
670
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
509 additions
and
240 deletions
+509
-240
tests/models/multimodal/generation/test_phi4mm.py
tests/models/multimodal/generation/test_phi4mm.py
+3
-3
tests/models/multimodal/generation/vlm_utils/model_utils.py
tests/models/multimodal/generation/vlm_utils/model_utils.py
+67
-21
tests/models/multimodal/processing/test_common.py
tests/models/multimodal/processing/test_common.py
+106
-156
tests/models/multimodal/processing/test_mllama.py
tests/models/multimodal/processing/test_mllama.py
+1
-1
tests/models/quantization/test_gguf.py
tests/models/quantization/test_gguf.py
+6
-2
tests/models/quantization/test_nvfp4.py
tests/models/quantization/test_nvfp4.py
+3
-3
tests/models/registry.py
tests/models/registry.py
+33
-10
tests/models/test_initialization.py
tests/models/test_initialization.py
+13
-4
tests/models/test_oot_registration.py
tests/models/test_oot_registration.py
+2
-1
tests/models/test_transformers.py
tests/models/test_transformers.py
+42
-14
tests/models/test_utils.py
tests/models/test_utils.py
+70
-0
tests/multimodal/assets/rgba.png
tests/multimodal/assets/rgba.png
+0
-0
tests/multimodal/test_image.py
tests/multimodal/test_image.py
+36
-0
tests/multimodal/test_utils.py
tests/multimodal/test_utils.py
+2
-1
tests/neuron/2_core/test_eagle.py
tests/neuron/2_core/test_eagle.py
+82
-0
tests/neuron/2_core/test_mistral.py
tests/neuron/2_core/test_mistral.py
+39
-7
tests/plugins_tests/test_platform_plugins.py
tests/plugins_tests/test_platform_plugins.py
+1
-1
tests/quantization/test_bitsandbytes.py
tests/quantization/test_bitsandbytes.py
+0
-6
tests/runai_model_streamer_test/test_weight_utils.py
tests/runai_model_streamer_test/test_weight_utils.py
+3
-2
tests/tensorizer_loader/conftest.py
tests/tensorizer_loader/conftest.py
+0
-8
No files found.
tests/models/multimodal/generation/test_phi4mm.py
View file @
4eabe123
# SPDX-License-Identifier: Apache-2.0
import
os
import
re
from
collections.abc
import
Sequence
from
typing
import
Optional
import
librosa
import
pytest
import
regex
as
re
from
huggingface_hub
import
snapshot_download
from
transformers
import
AutoTokenizer
from
vllm.assets.image
import
ImageAsset
from
vllm.lora.request
import
LoRARequest
from
vllm.multimodal.image
import
rescale_image_size
from
vllm.multimodal.image
import
convert_image_mode
,
rescale_image_size
from
vllm.platforms
import
current_platform
from
vllm.sequence
import
SampleLogprobs
...
...
@@ -267,7 +267,7 @@ def test_vision_speech_models(hf_runner, vllm_runner, model, dtype: str,
# use the example speech question so that the model outputs are reasonable
audio
=
librosa
.
load
(
speech_question
,
sr
=
None
)
image
=
ImageAsset
(
"cherry_blossom"
).
pil_image
.
convert
(
"RGB"
)
image
=
convert_image_mode
(
ImageAsset
(
"cherry_blossom"
).
pil_image
,
"RGB"
)
inputs_vision_speech
=
[
(
...
...
tests/models/multimodal/generation/vlm_utils/model_utils.py
View file @
4eabe123
...
...
@@ -3,11 +3,13 @@
for manipulating the input / output of HF & vLLM test runners, which are
typically specific to a small subset of models.
"""
import
re
import
types
from
pathlib
import
PosixPath
from
typing
import
Optional
,
Union
import
numpy
as
np
import
numpy.typing
as
npt
import
regex
as
re
import
torch
from
PIL.Image
import
Image
from
transformers
import
(
AutoConfig
,
AutoTokenizer
,
BatchFeature
,
...
...
@@ -495,13 +497,20 @@ def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
self
.
max_num
=
self
.
config
.
max_dynamic_patch
self
.
image_size
=
self
.
vision_config
.
image_size
def
__call__
(
self
,
text
:
str
,
images
:
Union
[
Image
,
list
[
Image
]],
**
kwargs
):
def
__call__
(
self
,
text
:
str
,
images
:
Union
[
Image
,
list
[
Image
]]
=
None
,
videos
:
Union
[
npt
.
NDArray
,
list
[
npt
.
NDArray
]]
=
None
,
**
kwargs
,
):
from
vllm.model_executor.models.internvl
import
(
IMG_CONTEXT
,
IMG_END
,
IMG_START
,
image_to_pixel_values_internvl
)
image_to_pixel_values_internvl
,
video_to_pixel_values_internvl
)
images
=
[
images
]
if
isinstance
(
images
,
Image
)
else
images
pixel_values
=
[
videos
=
[
videos
]
if
isinstance
(
videos
,
np
.
ndarray
)
else
videos
if
images
is
not
None
:
pixel_values_images
=
[
image_to_pixel_values_internvl
(
image
,
input_size
=
self
.
image_size
,
...
...
@@ -510,15 +519,52 @@ def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
use_thumbnail
=
self
.
use_thumbnail
,
)
for
image
in
images
]
num_patches_
list
=
[
pixel_value
.
shape
[
0
]
for
pixel_value
in
pixel_values
num_patches_
images
=
[
pixel_value
.
shape
[
0
]
for
pixel_value
in
pixel_values
_images
]
pixel_values
=
torch
.
cat
(
pixel_values
,
dim
=
0
)
for
num_patches
in
num_patches_list
:
else
:
pixel_values_images
,
num_patches_images
=
[],
[]
if
videos
is
not
None
:
pixel_values_videos
=
[
video_to_pixel_values_internvl
(
video
,
input_size
=
self
.
image_size
,
min_num
=
1
,
max_num
=
1
,
use_thumbnail
=
False
,
)
for
video
in
videos
]
num_patches_videos
=
[
pixel_value
.
shape
[
0
]
for
pixel_value
in
pixel_values_videos
]
else
:
pixel_values_videos
,
num_patches_videos
=
[],
[]
pixel_values
=
[]
while
(
"<image>"
in
text
)
or
(
"<video>"
in
text
):
image_index
=
text
.
find
(
"<image>"
)
video_index
=
text
.
find
(
"<video>"
)
if
image_index
==
-
1
or
(
video_index
>
-
1
and
video_index
<
image_index
):
num_patches
=
num_patches_videos
.
pop
(
0
)
pixel_values
.
append
(
pixel_values_videos
.
pop
(
0
))
context_tokens
=
IMG_START
+
\
IMG_CONTEXT
*
self
.
num_image_token
+
IMG_END
video_tokens
=
''
.
join
([
f
'Frame
{
i
+
1
}
:
{
context_tokens
}
'
for
i
in
range
(
num_patches
)
])
text
=
text
.
replace
(
'<video>'
,
video_tokens
,
1
)
else
:
num_patches
=
num_patches_images
.
pop
(
0
)
pixel_values
.
append
(
pixel_values_images
.
pop
(
0
))
context_tokens
=
IMG_CONTEXT
*
self
.
num_image_token
\
*
num_patches
image_tokens
=
IMG_START
+
context_tokens
+
IMG_END
text
=
text
.
replace
(
'<image>'
,
image_tokens
,
1
)
pixel_values
=
torch
.
cat
(
pixel_values
,
dim
=
0
)
prompt
=
self
.
tokenizer
(
text
,
return_tensors
=
"pt"
)
prompt
.
update
({
"pixel_values"
:
pixel_values
})
return
prompt
...
...
tests/models/multimodal/processing/test_common.py
View file @
4eabe123
...
...
@@ -9,15 +9,15 @@ from mistral_common.protocol.instruct.messages import (ImageChunk, TextChunk,
UserMessage
)
from
mistral_common.protocol.instruct.request
import
ChatCompletionRequest
from
PIL
import
Image
from
transformers
import
PreTrainedTokenizer
,
PreTrainedTokenizerFast
from
vllm.config
import
ModelConfig
from
vllm.inputs
import
InputProcessingContext
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
,
MultiModalDataDict
from
vllm.multimodal.inputs
import
MultiModalInputs
from
vllm.multimodal.processing
import
BaseMultiModalProcessor
,
ProcessingCache
from
vllm.transformers_utils.tokenizer
import
(
MistralTokenizer
,
cached_tokenizer_from_config
)
from
vllm.transformers_utils.tokenizer
import
(
AnyTokenizer
,
MistralTokenizer
,
cached_tokenizer_from_config
,
encode_tokens
)
from
....multimodal.utils
import
random_audio
,
random_image
,
random_video
from
...registry
import
HF_EXAMPLE_MODELS
...
...
@@ -28,7 +28,6 @@ def _test_processing_correctness(
hit_rate
:
float
,
num_batches
:
int
,
simplify_rate
:
float
,
ignore_mm_keys
:
Optional
[
set
[
str
]]
=
None
,
):
model_info
=
HF_EXAMPLE_MODELS
.
find_hf_info
(
model_id
)
model_info
.
check_available_online
(
on_fail
=
"skip"
)
...
...
@@ -99,10 +98,23 @@ def _test_processing_correctness(
}
mm_counts
=
{
k
:
len
(
vs
)
for
k
,
vs
in
mm_data
.
items
()}
# Mistral chat outputs tokens directly, rather than text prompts
if
isinstance
(
tokenizer
,
MistralTokenizer
):
images
=
mm_data
.
get
(
"image"
,
[])
request
=
ChatCompletionRequest
(
messages
=
[
UserMessage
(
content
=
[
TextChunk
(
text
=
""
),
*
(
ImageChunk
(
image
=
image
)
for
image
in
images
),
]),
])
res
=
tokenizer
.
mistral
.
encode_chat_completion
(
request
)
prompt
=
res
.
tokens
else
:
prompt
=
dummy_inputs
.
get_dummy_processor_inputs
(
model_config
.
max_model_len
,
mm_counts
,
).
prompt
_text
).
prompt
# Drop unnecessary keys and test single -> multi conversion
if
rng
.
rand
()
<
simplify_rate
:
...
...
@@ -112,8 +124,7 @@ def _test_processing_correctness(
elif
len
(
mm_data
[
k
])
==
1
:
mm_data
[
k
]
=
mm_data
[
k
][
0
]
if
isinstance
(
tokenizer
,
MistralTokenizer
):
_test_processing_correctness_mistral
(
_test_processing_correctness_one
(
model_config
,
tokenizer
,
prompt
,
...
...
@@ -121,58 +132,51 @@ def _test_processing_correctness(
baseline_processor
,
cached_processor
,
batch_idx
,
ignore_mm_keys
=
ignore_mm_keys
,
)
else
:
_test_processing_correctness_hf
(
model_config
,
tokenizer
,
prompt
,
mm_data
,
baseline_processor
,
cached_processor
,
batch_idx
,
ignore_mm_keys
=
ignore_mm_keys
,
)
def
_test_processing_correctness_hf
(
# For some multimodal models, the tokenizer always adds bos_token
# at the beginning of the prompt by default, causing hf_processor to output
# incorrect token ids. So we need to use `add_special_tokens=False` here
# to leave bos_token to be added by the processor.
_ADD_SPECIAL_TOKENS_OVERRIDES
=
{
"mllama"
:
False
,
"ovis"
:
False
,
"ultravox"
:
False
,
"whisper"
:
False
,
}
_IGNORE_MM_KEYS
=
{
# In Ultravox, the audio_features can be different depending on padding
# The slight difference should not be a problem though, since
# attention_mask lets us ignore the difference.
"ultravox"
:
{
"audio_features"
},
}
def
_test_processing_correctness_one
(
model_config
:
ModelConfig
,
tokenizer
:
Union
[
PreTrainedTokenizer
,
PreTrainedTokenizerFast
]
,
prompt
:
str
,
tokenizer
:
AnyTokenizer
,
prompt
:
Union
[
str
,
list
[
int
]]
,
mm_data
:
MultiModalDataDict
,
baseline_processor
:
BaseMultiModalProcessor
,
cached_processor
:
BaseMultiModalProcessor
,
batch_idx
:
int
,
ignore_mm_keys
:
Optional
[
set
[
str
]]
=
None
,
):
if
model_config
.
hf_config
.
model_type
in
(
"mllama"
,
"ovis"
,
"ultravox"
,
"whisper"
):
# For some multimodal models, tokenizer will always add bos_token
# at the beginning of prompt by default, causing hf_processor outputs
# incorrect token ids. So we need use `add_special_tokens=False` here
# to leave bos_token to be added by the processor.
token_prompt
=
tokenizer
.
encode
(
prompt
,
add_special_tokens
=
False
)
else
:
token_prompt
=
tokenizer
.
encode
(
prompt
)
model_type
=
model_config
.
hf_config
.
model_type
ignore_mm_keys
=
_IGNORE_MM_KEYS
.
get
(
model_type
,
set
[
str
]())
baseline_result
=
baseline_processor
.
apply
(
prompt
,
mm_data
=
mm_data
,
hf_processor_mm_kwargs
=
{},
)
cached_result
=
cached_processor
.
apply
(
if
isinstance
(
prompt
,
str
):
text_prompt
=
prompt
token_prompt
=
encode_tokens
(
tokenizer
,
prompt
,
mm_data
=
mm_data
,
hf_processor_mm_kwargs
=
{},
)
_assert_inputs_equal
(
baseline_result
,
cached_result
,
ignore_mm_keys
=
ignore_mm_keys
,
msg
=
f
"Failed (
{
batch_idx
=
}
,
{
prompt
=
}
,
{
mm_data
=
}
)"
,
add_special_tokens
=
_ADD_SPECIAL_TOKENS_OVERRIDES
.
get
(
model_type
),
)
else
:
# Mistral does not support decode_tokens with skip_special_tokens=False
text_prompt
=
None
token_prompt
=
prompt
baseline_tokenized_result
=
baseline_processor
.
apply
(
token_prompt
,
...
...
@@ -180,13 +184,6 @@ def _test_processing_correctness_hf(
hf_processor_mm_kwargs
=
{},
)
_assert_inputs_equal
(
baseline_result
,
baseline_tokenized_result
,
ignore_mm_keys
=
ignore_mm_keys
,
msg
=
f
"Failed (
{
batch_idx
=
}
,
{
prompt
=
}
,
{
mm_data
=
}
)"
,
)
cached_tokenized_result
=
cached_processor
.
apply
(
token_prompt
,
mm_data
=
mm_data
,
...
...
@@ -194,53 +191,45 @@ def _test_processing_correctness_hf(
)
_assert_inputs_equal
(
cach
ed_result
,
baseline_tokeniz
ed_result
,
cached_tokenized_result
,
ignore_mm_keys
=
ignore_mm_keys
,
msg
=
f
"Failed (
{
batch_idx
=
}
,
{
prompt
=
}
,
{
mm_data
=
}
)"
,
msg
=
f
"Failed (
{
batch_idx
=
}
,
{
token_
prompt
=
}
,
{
mm_data
=
}
)"
,
)
def
_test_processing_correctness_mistral
(
model_config
:
ModelConfig
,
tokenizer
:
MistralTokenizer
,
prompt
:
str
,
mm_data
:
MultiModalDataDict
,
baseline_processor
:
BaseMultiModalProcessor
,
cached_processor
:
BaseMultiModalProcessor
,
batch_idx
:
int
,
ignore_mm_keys
:
Optional
[
set
[
str
]]
=
None
,
):
images
=
mm_data
.
get
(
"image"
,
[])
if
not
isinstance
(
images
,
list
):
images
=
[
images
]
request
=
ChatCompletionRequest
(
messages
=
[
UserMessage
(
content
=
[
TextChunk
(
text
=
prompt
),
*
(
ImageChunk
(
image
=
image
)
for
image
in
images
),
]),
])
res
=
tokenizer
.
mistral
.
encode_chat_completion
(
request
)
token_prompt
=
res
.
tokens
# Mistral chat outputs tokens directly, rather than text prompts
baseline_tokenized_result
=
baseline_processor
.
apply
(
token_prompt
,
if
text_prompt
is
not
None
:
baseline_text_result
=
baseline_processor
.
apply
(
text_prompt
,
mm_data
=
mm_data
,
hf_processor_mm_kwargs
=
{},
)
cached_t
okenized
_result
=
cached_processor
.
apply
(
token
_prompt
,
cached_t
ext
_result
=
cached_processor
.
apply
(
text
_prompt
,
mm_data
=
mm_data
,
hf_processor_mm_kwargs
=
{},
)
_assert_inputs_equal
(
baseline_text_result
,
cached_text_result
,
ignore_mm_keys
=
ignore_mm_keys
,
msg
=
f
"Failed (
{
batch_idx
=
}
,
{
text_prompt
=
}
,
{
mm_data
=
}
)"
,
)
_assert_inputs_equal
(
baseline_text_result
,
baseline_tokenized_result
,
ignore_mm_keys
=
ignore_mm_keys
,
msg
=
f
"Failed (
{
batch_idx
=
}
,
{
text_prompt
=
}
, "
f
"
{
token_prompt
=
}
,
{
mm_data
=
}
)"
,
)
_assert_inputs_equal
(
cached_text_result
,
cached_tokenized_result
,
ignore_mm_keys
=
ignore_mm_keys
,
msg
=
f
"Failed (
{
batch_idx
=
}
,
{
prompt
=
}
,
{
mm_data
=
}
)"
,
msg
=
f
"Failed (
{
batch_idx
=
}
,
{
text_prompt
=
}
, "
f
"
{
token_prompt
=
}
,
{
mm_data
=
}
)"
,
)
...
...
@@ -258,6 +247,7 @@ def _test_processing_correctness_mistral(
"ibm-granite/granite-speech-3.3-8b"
,
"h2oai/h2ovl-mississippi-800m"
,
"OpenGVLab/InternVL2-1B"
,
"OpenGVLab/InternVL3-1B"
,
"HuggingFaceM4/Idefics3-8B-Llama3"
,
"HuggingFaceTB/SmolVLM2-2.2B-Instruct"
,
"moonshotai/Kimi-VL-A3B-Instruct"
,
...
...
@@ -280,6 +270,7 @@ def _test_processing_correctness_mistral(
"AIDC-AI/Ovis2-1B"
,
"google/paligemma-3b-mix-224"
,
"google/paligemma2-3b-ft-docci-448"
,
"microsoft/Phi-3.5-vision-instruct"
,
"microsoft/Phi-4-multimodal-instruct"
,
"mistralai/Pixtral-12B-2409"
,
"mistral-community/pixtral-12b"
,
...
...
@@ -303,41 +294,6 @@ def test_processing_correctness(
num_batches
:
int
,
simplify_rate
:
float
,
):
ignore_mm_keys
=
None
if
'ultravox'
in
model_id
:
# In Ultravox, the audio_features can be different depending on padding
# The slight difference should not be a problem though, since
# attention_mask lets us ignore the difference.
ignore_mm_keys
=
{
"audio_features"
}
_test_processing_correctness
(
model_id
,
hit_rate
=
hit_rate
,
num_batches
=
num_batches
,
simplify_rate
=
simplify_rate
,
ignore_mm_keys
=
ignore_mm_keys
,
)
# yapf: disable
@
pytest
.
mark
.
parametrize
(
"model_id"
,
[
"microsoft/Phi-3.5-vision-instruct"
])
@
pytest
.
mark
.
parametrize
(
"hit_rate"
,
[
0.3
,
0.5
,
1.0
])
@
pytest
.
mark
.
parametrize
(
"num_batches"
,
[
32
])
@
pytest
.
mark
.
parametrize
(
"simplify_rate"
,
[
1.0
])
# yapf: enable
def
test_processing_correctness_phi3v
(
model_id
:
str
,
hit_rate
:
float
,
num_batches
:
int
,
simplify_rate
:
float
,
):
# HACK - this is an attempted workaround for the following bug
# https://github.com/huggingface/transformers/issues/34307
from
transformers
import
AutoImageProcessor
# noqa: F401
from
transformers
import
AutoProcessor
# noqa: F401
AutoImageProcessor
.
from_pretrained
(
model_id
,
trust_remote_code
=
True
)
_test_processing_correctness
(
model_id
,
hit_rate
=
hit_rate
,
...
...
@@ -356,16 +312,10 @@ def _assert_inputs_equal(
if
ignore_mm_keys
is
None
:
ignore_mm_keys
=
set
()
if
msg
is
None
:
assert
"mm_kwargs"
in
a
and
"mm_kwargs"
in
b
else
:
assert
"mm_kwargs"
in
a
and
"mm_kwargs"
in
b
,
msg
for
key
in
ignore_mm_keys
:
a
[
"mm_kwargs"
].
pop
(
key
,
None
)
b
[
"mm_kwargs"
].
pop
(
key
,
None
)
if
msg
is
None
:
assert
a
==
b
else
:
assert
a
==
b
,
msg
tests/models/multimodal/processing/test_mllama.py
View file @
4eabe123
...
...
@@ -49,7 +49,7 @@ def test_profiling(
]
*
max_num_seqs
mm_kwargs
=
processor
.
apply
(
prompt
=
dummy_mm_data
.
prompt
_text
,
prompt
=
dummy_mm_data
.
prompt
,
mm_data
=
dummy_mm_data
.
mm_data
,
hf_processor_mm_kwargs
=
dict
(),
)[
"mm_kwargs"
]
...
...
tests/models/quantization/test_gguf.py
View file @
4eabe123
...
...
@@ -78,8 +78,12 @@ DOLPHIN_CONFIG = GGUFTestConfig(
)
MODELS
=
[
LLAMA_CONFIG
,
QWEN2_CONFIG
,
PHI3_CONFIG
,
GPT2_CONFIG
,
STABLELM_CONFIG
,
DOLPHIN_CONFIG
LLAMA_CONFIG
,
QWEN2_CONFIG
,
PHI3_CONFIG
,
GPT2_CONFIG
,
# STABLELM_CONFIG, # enable this when v1 support head_size=80
DOLPHIN_CONFIG
,
# STARCODER_CONFIG, # broken
]
...
...
tests/models/quantization/test_nvfp4.py
View file @
4eabe123
...
...
@@ -41,8 +41,8 @@ EXPECTED_STRS_MAP = {
reason
=
"Prevent unstable test based on golden strings from breaking the build "
" and test input model being too large and hanging the system."
)
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"
nv
fp4"
),
reason
=
"
nv
fp4 is not supported on this GPU type."
)
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"
modelopt_
fp4"
),
reason
=
"
modelopt_
fp4 is not supported on this GPU type."
)
@
pytest
.
mark
.
parametrize
(
"model_name"
,
MODELS
)
def
test_models
(
example_prompts
,
model_name
)
->
None
:
model
=
LLM
(
...
...
@@ -50,7 +50,7 @@ def test_models(example_prompts, model_name) -> None:
max_model_len
=
MAX_MODEL_LEN
,
trust_remote_code
=
True
,
enforce_eager
=
True
,
quantization
=
"
nv
fp4"
,
quantization
=
"
modelopt_
fp4"
,
)
tokenizer
=
AutoTokenizer
.
from_pretrained
(
model_name
)
...
...
tests/models/registry.py
View file @
4eabe123
...
...
@@ -8,6 +8,8 @@ import pytest
from
packaging.version
import
Version
from
transformers
import
__version__
as
TRANSFORMERS_VERSION
from
vllm.config
import
TokenizerMode
@
dataclass
(
frozen
=
True
)
class
_HfExamplesInfo
:
...
...
@@ -20,7 +22,7 @@ class _HfExamplesInfo:
tokenizer
:
Optional
[
str
]
=
None
"""Set the tokenizer to load for this architecture."""
tokenizer_mode
:
str
=
"auto"
tokenizer_mode
:
TokenizerMode
=
"auto"
"""Set the tokenizer type for this architecture."""
speculative_model
:
Optional
[
str
]
=
None
...
...
@@ -55,9 +57,18 @@ class _HfExamplesInfo:
trust_remote_code
:
bool
=
False
"""The ``trust_remote_code`` level required to load the model."""
v0_only
:
bool
=
False
"""The model is only available with the vLLM V0 engine."""
hf_overrides
:
dict
[
str
,
Any
]
=
field
(
default_factory
=
dict
)
"""The ``hf_overrides`` required to load the model."""
max_model_len
:
Optional
[
int
]
=
None
"""
The maximum model length to use for this model. Some models default to a
length that is too large to fit into memory in CI.
"""
def
check_transformers_version
(
self
,
*
,
...
...
@@ -124,7 +135,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
"BaichuanForCausalLM"
:
_HfExamplesInfo
(
"baichuan-inc/Baichuan2-7B-chat"
,
trust_remote_code
=
True
),
"BambaForCausalLM"
:
_HfExamplesInfo
(
"ibm-ai-platform/Bamba-9B"
,
extras
=
{
"tiny"
:
"hmellor/
bamba-
tiny-random"
}),
# noqa: E501
extras
=
{
"tiny"
:
"hmellor/tiny-random
-BambaForCausalLM
"
}),
# noqa: E501
"BloomForCausalLM"
:
_HfExamplesInfo
(
"bigscience/bloom-560m"
,
{
"1b"
:
"bigscience/bloomz-1b1"
}),
"ChatGLMModel"
:
_HfExamplesInfo
(
"THUDM/chatglm3-6b"
,
...
...
@@ -147,6 +158,9 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
"ExaoneForCausalLM"
:
_HfExamplesInfo
(
"LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct"
),
# noqa: E501
"Fairseq2LlamaForCausalLM"
:
_HfExamplesInfo
(
"mgleize/fairseq2-dummy-Llama-3.2-1B"
),
# noqa: E501
"FalconForCausalLM"
:
_HfExamplesInfo
(
"tiiuae/falcon-7b"
),
"FalconH1ForCausalLM"
:
_HfExamplesInfo
(
"tiiuae/Falcon-H1-1.5B-Instruct"
,
is_available_online
=
False
,
min_transformers_version
=
"4.52.2"
),
"GemmaForCausalLM"
:
_HfExamplesInfo
(
"google/gemma-1.1-2b-it"
),
"Gemma2ForCausalLM"
:
_HfExamplesInfo
(
"google/gemma-2-9b"
),
"Gemma3ForCausalLM"
:
_HfExamplesInfo
(
"google/gemma-3-1b-it"
),
...
...
@@ -212,10 +226,11 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
"OrionForCausalLM"
:
_HfExamplesInfo
(
"OrionStarAI/Orion-14B-Chat"
,
trust_remote_code
=
True
),
"PersimmonForCausalLM"
:
_HfExamplesInfo
(
"adept/persimmon-8b-chat"
),
"PhiForCausalLM"
:
_HfExamplesInfo
(
"microsoft/phi-2"
),
"PhiForCausalLM"
:
_HfExamplesInfo
(
"microsoft/phi-2"
,
v0_only
=
True
),
"Phi3ForCausalLM"
:
_HfExamplesInfo
(
"microsoft/Phi-3-mini-4k-instruct"
),
"Phi3SmallForCausalLM"
:
_HfExamplesInfo
(
"microsoft/Phi-3-small-8k-instruct"
,
trust_remote_code
=
True
),
trust_remote_code
=
True
,
v0_only
=
True
),
"PhiMoEForCausalLM"
:
_HfExamplesInfo
(
"microsoft/Phi-3.5-MoE-instruct"
,
trust_remote_code
=
True
),
"Plamo2ForCausalLM"
:
_HfExamplesInfo
(
"pfnet/plamo-2-1b"
,
...
...
@@ -231,7 +246,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
is_available_online
=
False
),
"StableLMEpochForCausalLM"
:
_HfExamplesInfo
(
"stabilityai/stablelm-zephyr-3b"
,
# noqa: E501
is_available_online
=
False
),
"StableLmForCausalLM"
:
_HfExamplesInfo
(
"stabilityai/stablelm-3b-4e1t"
),
"StableLmForCausalLM"
:
_HfExamplesInfo
(
"stabilityai/stablelm-3b-4e1t"
,
v0_only
=
True
),
"Starcoder2ForCausalLM"
:
_HfExamplesInfo
(
"bigcode/starcoder2-3b"
),
"SolarForCausalLM"
:
_HfExamplesInfo
(
"upstage/solar-pro-preview-instruct"
),
"TeleChat2ForCausalLM"
:
_HfExamplesInfo
(
"Tele-AI/TeleChat2-3B"
,
...
...
@@ -300,7 +316,8 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"AriaForConditionalGeneration"
:
_HfExamplesInfo
(
"rhymes-ai/Aria"
),
"AyaVisionForConditionalGeneration"
:
_HfExamplesInfo
(
"CohereForAI/aya-vision-8b"
),
# noqa: E501
"Blip2ForConditionalGeneration"
:
_HfExamplesInfo
(
"Salesforce/blip2-opt-2.7b"
,
# noqa: E501
extras
=
{
"6b"
:
"Salesforce/blip2-opt-6.7b"
}),
# noqa: E501
extras
=
{
"6b"
:
"Salesforce/blip2-opt-6.7b"
},
# noqa: E501
v0_only
=
True
),
"ChameleonForConditionalGeneration"
:
_HfExamplesInfo
(
"facebook/chameleon-7b"
),
# noqa: E501
"DeepseekVLV2ForCausalLM"
:
_HfExamplesInfo
(
"deepseek-ai/deepseek-vl2-tiny"
,
# noqa: E501
extras
=
{
"fork"
:
"Isotr0py/deepseek-vl2-tiny"
},
# noqa: E501
...
...
@@ -319,15 +336,18 @@ _MULTIMODAL_EXAMPLE_MODELS = {
max_transformers_version
=
"4.48"
,
# noqa: E501
transformers_version_reason
=
"HF model is not compatible."
),
# noqa: E501
"InternVLChatModel"
:
_HfExamplesInfo
(
"OpenGVLab/InternVL2-1B"
,
extras
=
{
"2B"
:
"OpenGVLab/InternVL2-2B"
},
# noqa: E501
extras
=
{
"2B"
:
"OpenGVLab/InternVL2-2B"
,
"3.0"
:
"OpenGVLab/InternVL3-1B"
},
# noqa: E501
trust_remote_code
=
True
),
"Idefics3ForConditionalGeneration"
:
_HfExamplesInfo
(
"HuggingFaceM4/Idefics3-8B-Llama3"
,
# noqa: E501
{
"tiny"
:
"HuggingFaceTB/SmolVLM-256M-Instruct"
}),
# noqa: E501
"KimiVLForConditionalGeneration"
:
_HfExamplesInfo
(
"moonshotai/Kimi-VL-A3B-Instruct"
,
# noqa: E501
extras
=
{
"thinking"
:
"moonshotai/Kimi-VL-A3B-Thinking"
},
# noqa: E501
trust_remote_code
=
True
),
trust_remote_code
=
True
,
v0_only
=
True
),
"Llama4ForConditionalGeneration"
:
_HfExamplesInfo
(
"meta-llama/Llama-4-Scout-17B-16E-Instruct"
,
# noqa: E501
min_transformers_version
=
"4.51"
),
min_transformers_version
=
"4.51"
,
max_model_len
=
10240
),
"LlavaForConditionalGeneration"
:
_HfExamplesInfo
(
"llava-hf/llava-1.5-7b-hf"
,
extras
=
{
"mistral"
:
"mistral-community/pixtral-12b"
,
# noqa: E501
"mistral-fp8"
:
"nm-testing/pixtral-12b-FP8-dynamic"
}),
# noqa: E501
...
...
@@ -346,7 +366,8 @@ _MULTIMODAL_EXAMPLE_MODELS = {
extras
=
{
"2.6"
:
"openbmb/MiniCPM-V-2_6"
},
# noqa: E501
trust_remote_code
=
True
),
"MiniMaxVL01ForConditionalGeneration"
:
_HfExamplesInfo
(
"MiniMaxAI/MiniMax-VL-01"
,
# noqa: E501
trust_remote_code
=
True
),
trust_remote_code
=
True
,
v0_only
=
True
),
"Mistral3ForConditionalGeneration"
:
_HfExamplesInfo
(
"mistralai/Mistral-Small-3.1-24B-Instruct-2503"
,
# noqa: E501
extras
=
{
"fp8"
:
"nm-testing/Mistral-Small-3.1-24B-Instruct-2503-FP8-dynamic"
}),
# noqa: E501
"MolmoForCausalLM"
:
_HfExamplesInfo
(
"allenai/Molmo-7B-D-0924"
,
...
...
@@ -379,6 +400,8 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"Qwen2_5_VLForConditionalGeneration"
:
_HfExamplesInfo
(
"Qwen/Qwen2.5-VL-3B-Instruct"
),
# noqa: E501
"Qwen2_5OmniModel"
:
_HfExamplesInfo
(
"Qwen/Qwen2.5-Omni-3B"
,
min_transformers_version
=
"4.52"
),
"Qwen2_5OmniForConditionalGeneration"
:
_HfExamplesInfo
(
"Qwen/Qwen2.5-Omni-7B-AWQ"
,
# noqa: E501
min_transformers_version
=
"4.52"
),
"SkyworkR1VChatModel"
:
_HfExamplesInfo
(
"Skywork/Skywork-R1V-38B"
),
"SmolVLMForConditionalGeneration"
:
_HfExamplesInfo
(
"HuggingFaceTB/SmolVLM2-2.2B-Instruct"
),
# noqa: E501
"UltravoxModel"
:
_HfExamplesInfo
(
"fixie-ai/ultravox-v0_5-llama-3_2-1b"
,
# noqa: E501
...
...
tests/models/test_initialization.py
View file @
4eabe123
...
...
@@ -15,12 +15,12 @@ from .registry import HF_EXAMPLE_MODELS
@
pytest
.
mark
.
parametrize
(
"model_arch"
,
HF_EXAMPLE_MODELS
.
get_supported_archs
())
def
test_can_initialize
(
model_arch
):
def
test_can_initialize
(
model_arch
:
str
,
monkeypatch
:
pytest
.
MonkeyPatch
):
model_info
=
HF_EXAMPLE_MODELS
.
get_hf_info
(
model_arch
)
model_info
.
check_available_online
(
on_fail
=
"skip"
)
model_info
.
check_transformers_version
(
on_fail
=
"skip"
)
# Avoid OOM
# Avoid OOM
and reduce initialization time by only using 1 layer
def
hf_overrides
(
hf_config
:
PretrainedConfig
)
->
PretrainedConfig
:
hf_config
.
update
(
model_info
.
hf_overrides
)
...
...
@@ -34,6 +34,12 @@ def test_can_initialize(model_arch):
"num_local_experts"
:
2
,
})
if
hasattr
(
hf_config
,
"vision_config"
):
hf_config
.
vision_config
.
update
({
"num_layers"
:
1
,
"num_hidden_layers"
:
1
,
})
return
hf_config
# Avoid calling model.forward()
...
...
@@ -46,7 +52,7 @@ def test_can_initialize(model_arch):
scheduler_kv_cache_config
=
get_kv_cache_config
(
vllm_config
,
kv_cache_specs
[
0
],
2
0
*
GiB_bytes
,
1
0
*
GiB_bytes
,
)
# gpu_blocks (> 0), cpu_blocks, scheduler_kv_cache_config
...
...
@@ -55,7 +61,9 @@ def test_can_initialize(model_arch):
with
(
patch
.
object
(
V0LLMEngine
,
"_initialize_kv_caches"
,
_initialize_kv_caches_v0
),
patch
.
object
(
V1EngineCore
,
"_initialize_kv_caches"
,
_initialize_kv_caches_v1
)):
_initialize_kv_caches_v1
),
monkeypatch
.
context
()
as
m
):
if
model_info
.
v0_only
:
m
.
setenv
(
"VLLM_USE_V1"
,
"0"
)
LLM
(
model_info
.
default
,
tokenizer
=
model_info
.
tokenizer
,
...
...
@@ -65,6 +73,7 @@ def test_can_initialize(model_arch):
"num_speculative_tokens"
:
1
,
}
if
model_info
.
speculative_model
else
None
,
trust_remote_code
=
model_info
.
trust_remote_code
,
max_model_len
=
model_info
.
max_model_len
,
load_format
=
"dummy"
,
hf_overrides
=
hf_overrides
,
)
tests/models/test_oot_registration.py
View file @
4eabe123
...
...
@@ -4,6 +4,7 @@ import pytest
from
vllm
import
LLM
,
SamplingParams
from
vllm.assets.image
import
ImageAsset
from
vllm.multimodal.image
import
convert_image_mode
from
..utils
import
create_new_process_for_each_test
...
...
@@ -58,7 +59,7 @@ def test_oot_registration_embedding(
assert
all
(
v
==
0
for
v
in
output
.
outputs
.
embedding
)
image
=
ImageAsset
(
"cherry_blossom"
).
pil_image
.
convert
(
"RGB"
)
image
=
convert_image_mode
(
ImageAsset
(
"cherry_blossom"
).
pil_image
,
"RGB"
)
@
create_new_process_for_each_test
()
...
...
tests/models/test_transformers.py
View file @
4eabe123
# SPDX-License-Identifier: Apache-2.0
"""Test the functionality of the Transformers backend."""
from
typing
import
Any
,
Optional
,
Union
import
pytest
from
vllm.platforms
import
current_platform
from
..conftest
import
HfRunner
,
VllmRunner
from
..core.block.e2e.test_correctness_sliding_window
import
prep_prompts
from
..utils
import
multi_gpu_test
from
.utils
import
check_logprobs_close
def
check_implementation
(
hf_
runner
:
type
[
HfRunner
],
vllm_
runner
:
type
[
VllmRunner
],
runner
_ref
:
type
[
Union
[
HfRunner
,
VllmRunner
]
],
runner
_test
:
type
[
VllmRunner
],
example_prompts
:
list
[
str
],
model
:
str
,
kwargs_ref
:
Optional
[
dict
[
str
,
Any
]]
=
None
,
kwargs_test
:
Optional
[
dict
[
str
,
Any
]]
=
None
,
**
kwargs
,
):
if
kwargs_ref
is
None
:
kwargs_ref
=
{}
if
kwargs_test
is
None
:
kwargs_test
=
{}
max_tokens
=
32
num_logprobs
=
5
with
vllm_runner
(
model
,
**
kwargs
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
generate_greedy_logprobs
(
example_prompts
,
max_tokens
,
num_logprobs
)
args
=
(
example_prompts
,
max_tokens
,
num_logprobs
)
with
runner_test
(
model
,
**
kwargs_test
,
**
kwargs
)
as
model_test
:
outputs_test
=
model_test
.
generate_greedy_logprobs
(
*
args
)
with
hf_runner
(
model
)
as
hf_model
:
hf_outputs
=
hf_model
.
generate_greedy_logprobs_limit
(
example_prompts
,
max_tokens
,
num_logprobs
)
with
runner_ref
(
model
,
**
kwargs_ref
)
as
model_ref
:
if
isinstance
(
model_ref
,
VllmRunner
):
outputs_ref
=
model_ref
.
generate_greedy_logprobs
(
*
args
)
else
:
outputs_ref
=
model_ref
.
generate_greedy_logprobs_limit
(
*
args
)
check_logprobs_close
(
outputs_0_lst
=
hf_
outputs
,
outputs_1_lst
=
vllm_
outputs
,
name_0
=
"
h
f"
,
name_1
=
"
vllm
"
,
outputs_0_lst
=
outputs
_ref
,
outputs_1_lst
=
outputs
_test
,
name_0
=
"
re
f"
,
name_1
=
"
test
"
,
)
...
...
@@ -58,6 +71,18 @@ def test_models(
model_impl
=
model_impl
)
def test_hybrid_attention(vllm_runner: type[VllmRunner]) -> None:
    """Compare a Transformers-backend run against a reference vLLM run.

    Both runs use ``vllm_runner`` on the same tiny Gemma2 checkpoint; the
    only difference between them is that the run under test additionally
    sets ``model_impl="transformers"``.
    """
    # NOTE(review): presumably (batch size, prompt-length range) -- confirm
    # against prep_prompts. Only the first element of the result is used.
    prompts, _, _ = prep_prompts(4, (800, 801))

    # Engine settings shared by the reference run and the run under test.
    shared_kwargs = dict(max_model_len=8192, enforce_eager=True)
    # Same settings, with the Transformers implementation selected first.
    transformers_kwargs = dict(model_impl="transformers", **shared_kwargs)

    check_implementation(
        vllm_runner,
        vllm_runner,
        prompts,
        model="hmellor/tiny-random-Gemma2ForCausalLM",
        kwargs_ref=shared_kwargs,
        kwargs_test=transformers_kwargs,
    )
@
multi_gpu_test
(
num_gpus
=
2
)
def
test_distributed
(
hf_runner
:
type
[
HfRunner
],
...
...
@@ -65,8 +90,11 @@ def test_distributed(
example_prompts
,
):
kwargs
=
{
"model_impl"
:
"transformers"
,
"tensor_parallel_size"
:
2
}
check_implementation
(
hf_runner
,
vllm_runner
,
example_prompts
,
"meta-llama/Llama-3.2-1B-Instruct"
,
**
kwargs
)
check_implementation
(
hf_runner
,
vllm_runner
,
example_prompts
,
"meta-llama/Llama-3.2-1B-Instruct"
,
kwargs_test
=
kwargs
)
@
pytest
.
mark
.
skipif
(
...
...
tests/models/test_utils.py
View file @
4eabe123
...
...
@@ -77,3 +77,73 @@ def test_module_with_child_containing_batchnorm_can_autoload():
assert
torch
.
all
(
new_mod
.
nested_mod
.
bn
.
running_var
==
mod
.
nested_mod
.
bn
.
running_var
)
assert
new_mod
.
nested_mod
.
bn
.
num_batches_tracked
.
item
()
==
1
def test_module_skip_prefix():
    """Ensure the auto weight loader can skip prefix."""
    trained_mod = ModuleWithNestedBatchNorm()
    # One forward pass so the batchnorm statistics move off their defaults.
    trained_mod(torch.Tensor([[1, 2], [3, 4]]))

    def weight_generator():
        # Extra entries under "prefix." that the loader is expected to drop.
        extra_weights = {
            "prefix.bn.weight": torch.Tensor([1, 2]),
            "prefix.bn.bias": torch.Tensor([3, 4]),
        }
        combined = trained_mod.state_dict() | extra_weights
        yield from combined.items()

    fresh_mod = ModuleWithNestedBatchNorm()
    trained_bn = trained_mod.nested_mod.bn
    fresh_bn = fresh_mod.nested_mod.bn

    # Before loading, the new instance must still differ from the trained one.
    assert not torch.all(fresh_bn.running_mean == trained_bn.running_mean)
    assert not torch.all(fresh_bn.running_var == trained_bn.running_var)
    assert fresh_bn.num_batches_tracked.item() == 0

    AutoWeightsLoader(
        fresh_mod,
        skip_prefixes=["prefix."],
    ).load_weights(weight_generator())

    # After loading, the batchnorm statistics must match the trained module.
    assert torch.all(fresh_bn.running_mean == trained_bn.running_mean)
    assert torch.all(fresh_bn.running_var == trained_bn.running_var)
    assert fresh_bn.num_batches_tracked.item() == 1
def test_module_skip_substr():
    """Ensure the auto weight loader can skip weights matching a substring.

    The extra weights all contain ``"substr."`` somewhere inside their name
    (not at the start), so they are only filtered out if ``skip_substrs``
    is honoured; the real batchnorm weights must still be loaded.
    """
    mod = ModuleWithNestedBatchNorm()
    # Run some data through the module with batchnorm
    mod(torch.Tensor([[1, 2], [3, 4]]))

    # Try to load the weights to a new instance
    def weight_generator():
        # Weights that need to be filtered out by the "substr." rule
        redundant_weights = {
            "nested_mod.0.substr.weight": torch.Tensor([1, 2]),
            "nested_mod.0.substr.bias": torch.Tensor([3, 4]),
            "nested_mod.substr.weight": torch.Tensor([1, 2]),
            "nested_mod.substr.bias": torch.Tensor([3, 4]),
        }
        yield from (mod.state_dict() | redundant_weights).items()

    new_mod = ModuleWithNestedBatchNorm()

    # Sanity check: the fresh instance starts out different from `mod`
    assert not torch.all(
        new_mod.nested_mod.bn.running_mean == mod.nested_mod.bn.running_mean)
    assert not torch.all(
        new_mod.nested_mod.bn.running_var == mod.nested_mod.bn.running_var)
    assert new_mod.nested_mod.bn.num_batches_tracked.item() == 0

    loader = AutoWeightsLoader(new_mod, skip_substrs=["substr."])
    loader.load_weights(weight_generator())

    # Ensure the stats are updated
    assert torch.all(
        new_mod.nested_mod.bn.running_mean == mod.nested_mod.bn.running_mean)
    assert torch.all(
        new_mod.nested_mod.bn.running_var == mod.nested_mod.bn.running_var)
    assert new_mod.nested_mod.bn.num_batches_tracked.item() == 1
tests/multimodal/assets/rgba.png
0 → 100644
View file @
4eabe123
219 KB
tests/multimodal/test_image.py
0 → 100644
View file @
4eabe123
# SPDX-License-Identifier: Apache-2.0
from
pathlib
import
Path
import
numpy
as
np
from
PIL
import
Image
,
ImageChops
from
vllm.multimodal.image
import
convert_image_mode
ASSETS_DIR
=
Path
(
__file__
).
parent
/
"assets"
assert
ASSETS_DIR
.
exists
()
def test_rgb_to_rgb():
    """Converting an RGB image to RGB must leave every pixel unchanged."""
    # Force RGB up front so the input mode is known regardless of the asset.
    rgb_image = Image.open(ASSETS_DIR / "image1.png").convert("RGB")

    round_tripped = convert_image_mode(rgb_image, "RGB")

    # A difference image with no bounding box means the images are identical.
    pixel_delta = ImageChops.difference(rgb_image, round_tripped)
    assert pixel_delta.getbbox() is None
def test_rgba_to_rgb():
    """Fully transparent RGBA pixels must become white after RGB conversion."""
    original_image = Image.open(ASSETS_DIR / "rgba.png")
    original_image_numpy = np.array(original_image)

    converted_image = convert_image_mode(original_image, "RGB")
    converted_image_numpy = np.array(converted_image)

    # Boolean mask over (row, col) selecting pixels whose alpha channel is 0.
    transparent = original_image_numpy[:, :, 3] == 0

    # Verify that all transparent pixels are converted to white. One
    # vectorised comparison replaces the per-pixel Python loop; an empty
    # selection passes, exactly as the loop did.
    assert (converted_image_numpy[transparent][:, :3] == 255).all()
tests/multimodal/test_utils.py
View file @
4eabe123
...
...
@@ -10,6 +10,7 @@ import numpy as np
import
pytest
from
PIL
import
Image
,
ImageChops
from
vllm.multimodal.image
import
convert_image_mode
from
vllm.multimodal.inputs
import
PlaceholderRange
from
vllm.multimodal.utils
import
(
MediaConnector
,
merge_and_sort_multimodal_metadata
)
...
...
@@ -53,7 +54,7 @@ def get_supported_suffixes() -> tuple[str, ...]:
def
_image_equals
(
a
:
Image
.
Image
,
b
:
Image
.
Image
)
->
bool
:
return
(
np
.
asarray
(
a
)
==
np
.
asarray
(
b
.
convert
(
a
.
mode
))).
all
()
return
(
np
.
asarray
(
a
)
==
np
.
asarray
(
convert
_image_mode
(
b
,
a
.
mode
))).
all
()
@
pytest
.
mark
.
asyncio
...
...
tests/neuron/2_core/test_eagle.py
0 → 100644
View file @
4eabe123
# SPDX-License-Identifier: Apache-2.0
import
json
import
os
import
shutil
import
tempfile
import
torch
from
huggingface_hub
import
snapshot_download
from
safetensors
import
safe_open
from
vllm
import
LLM
,
SamplingParams
def patch_eagle_draft_with_lm_head(target_model_id: str,
                                   draft_model_id: str) -> str:
    """Copy the target model's ``lm_head`` weights into the draft checkpoint.

    Both repositories are downloaded into a temporary directory, the
    ``lm_head.weight`` tensor is read from the target's safetensors shard,
    stored (as float16) in the draft's ``pytorch_model.bin``, and the
    patched draft directory is copied to a fixed location.

    Args:
        target_model_id: Hub repo id of the target model.
        draft_model_id: Hub repo id of the EAGLE draft model.

    Returns:
        Path of the directory holding the patched draft checkpoint.
    """
    # In NxDI, draft model checkpoint must include lm_head weights from target
    # model. For more details see https://awsdocs-neuron.readthedocs-hosted.com
    # /en/latest/libraries/nxd-inference/developer_guides/feature-guide.html
    # #eagle-checkpoint-compatibility
    final_draft_dir = "/tmp/patched_eagle_draft"

    with tempfile.TemporaryDirectory() as tmp_dir:
        target_dir = snapshot_download(repo_id=target_model_id,
                                       local_dir=os.path.join(
                                           tmp_dir, "target"))
        draft_dir = snapshot_download(repo_id=draft_model_id,
                                      local_dir=os.path.join(tmp_dir, "draft"))

        lm_head_key = "lm_head.weight"
        # The index file maps each weight name to the shard that stores it.
        index_path = os.path.join(target_dir, "model.safetensors.index.json")
        with open(index_path) as f:
            index = json.load(f)
        shard_name = index["weight_map"][lm_head_key]
        target_safetensor_path = os.path.join(target_dir, shard_name)

        with safe_open(target_safetensor_path, framework="pt") as f:
            target_lm_head = f.get_tensor(lm_head_key)

        draft_path = os.path.join(draft_dir, "pytorch_model.bin")
        # The .bin file is a pickle fetched from the Hub: restrict loading to
        # tensor data so no arbitrary code can run during deserialisation.
        draft_state_dict = torch.load(draft_path,
                                      map_location="cpu",
                                      weights_only=True)
        draft_state_dict[lm_head_key] = target_lm_head.to(torch.float16)
        torch.save(draft_state_dict, draft_path)

        # Must happen before the temporary directory is removed on exit.
        shutil.copytree(draft_dir, final_draft_dir, dirs_exist_ok=True)

    return final_draft_dir
def test_eagle():
    """Run greedy EAGLE speculative decoding and compare to a fixed output."""
    target_model = "meta-llama/Llama-2-7b-hf"

    # The draft checkpoint is patched with the target's lm_head first.
    draft_path = patch_eagle_draft_with_lm_head(
        target_model_id=target_model,
        draft_model_id="yuhuili/EAGLE-llama2-chat-7B")

    speculative_config = {
        "model": draft_path,
        "num_speculative_tokens": 5,
        "max_model_len": 128
    }
    neuron_overrides = {
        "enable_eagle_speculation": True,
        "enable_fused_speculation": True,
        "fused_qkv": True
    }
    llm = LLM(
        model=target_model,
        speculative_config=speculative_config,
        max_num_seqs=1,
        max_model_len=128,
        tensor_parallel_size=2,
        override_neuron_config=neuron_overrides,
    )

    prompts = ["The president of the United States is"]
    # top_k=1 makes sampling greedy, so the text can be compared exactly.
    results = llm.generate(prompts, SamplingParams(top_k=1))

    expected_output = (" the head of state and head of government of "
                       "the United States. The president direct")

    for result in results:
        generated_text = result.outputs[0].text
        print(f"Prompt: {result.prompt!r}, Generated text: {generated_text!r}")
        assert generated_text == expected_output

    print("Neuron Eagle speculation test passed.")
tests/neuron/2_core/test_mistral.py
View file @
4eabe123
...
...
@@ -7,26 +7,58 @@ def test_mistral():
llm
=
LLM
(
model
=
"mistralai/Mistral-7B-v0.1"
,
tensor_parallel_size
=
2
,
max_num_seqs
=
4
,
max_model_len
=
5
12
,
max_model_len
=
12
8
,
use_v2_block_manager
=
True
,
override_neuron_config
=
{
"sequence_parallel_enabled"
:
False
,
"skip_warmup"
:
True
},
device
=
"neuron"
)
})
# Send more prompts than the compiled batch size (4) and request
# varying generation lengths to test accuracy related to Neuron
# specific sequence id sorting.
prompts
=
[
"The president of the United States is"
,
"The capital of France is"
,
"What is Annapurna labs?"
,
"I believe the meaning of life is"
,
"Tell me a story about a brave knight"
,
"Hello, my name is Llama"
,
]
outputs
=
llm
.
generate
(
prompts
,
SamplingParams
(
top_k
=
1
))
sampling_params
=
[
SamplingParams
(
top_k
=
1
,
max_tokens
=
10
),
SamplingParams
(
top_k
=
1
,
max_tokens
=
20
),
SamplingParams
(
top_k
=
1
,
max_tokens
=
30
),
SamplingParams
(
top_k
=
1
,
max_tokens
=
40
),
SamplingParams
(
top_k
=
1
,
max_tokens
=
50
),
SamplingParams
(
top_k
=
1
,
max_tokens
=
60
)
]
outputs
=
llm
.
generate
(
prompts
,
sampling_params
)
expected_outputs
=
[
" the most powerful person in the world. He is the head of state "
"and head"
,
" a city of many faces. It is a city of history, culture, art"
" the most powerful person in the world. He is"
,
" a city of many faces. It is a city of history, culture, art, "
"fashion, and"
,
"
\n\n
Annapurna Labs is a semiconductor company that was founded "
"in 2013 by Amazon. The company is"
,
" to be happy.
\n\n
I believe that happiness is a choice.
\n\n
I "
"believe that happiness is a state of mind.
\n\n
I believe that "
"happiness is a journey.
\n\n
I believe"
,
" who rescued a princess from a dragon.
\n\n
Tell me a story about"
" a princess who rescued herself from a dragon.
\n\n
Tell me a "
"story about a princess who rescued herself from a dragon and "
"then rescued a knight from"
,
" and I am a 10 year old male. I am a very friendly and "
"affectionate boy who loves to be around people. I am a very "
"active boy who loves to play and run around. I am a very smart "
"boy who loves to learn new things. I am a very loyal boy"
]
for
expected_output
,
output
in
zip
(
expected_outputs
,
outputs
):
generated_text
=
output
.
outputs
[
0
].
text
print
(
f
"Prompt:
{
output
.
prompt
!
r
}
, Generated text:
{
generated_text
!
r
}
"
)
assert
(
expected_output
==
generated_text
)
print
(
"Neuron Mistral test passed."
)
tests/plugins_tests/test_platform_plugins.py
View file @
4eabe123
...
...
@@ -29,5 +29,5 @@ def test_oot_attention_backend(monkeypatch: pytest.MonkeyPatch):
# ignore the backend env variable if it is set
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
STR_BACKEND_ENV_VAR
,
STR_INVALID_VAL
)
backend
=
get_attn_backend
(
16
,
torch
.
float16
,
torch
.
float16
,
16
,
False
)
backend
=
get_attn_backend
(
16
,
torch
.
float16
,
"auto"
,
16
,
False
)
assert
backend
.
get_name
()
==
"Dummy_Backend"
tests/quantization/test_bitsandbytes.py
View file @
4eabe123
...
...
@@ -37,12 +37,6 @@ models_pre_quant_8bit_to_test = [
(
"yec019/fbopt-350m-8bit"
,
"read pre-quantized 8-bit opt model"
),
]
models_pre_quant_8bit_to_test
=
[
(
'meta-llama/Llama-Guard-3-8B-INT8'
,
'read pre-quantized llama 8-bit model'
),
(
"yec019/fbopt-350m-8bit"
,
"read pre-quantized 8-bit opt model"
),
]
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"bitsandbytes"
),
reason
=
'bitsandbytes is not supported on this GPU type.'
)
...
...
tests/runai_model_streamer_test/test_weight_utils.py
View file @
4eabe123
...
...
@@ -23,10 +23,11 @@ def test_runai_model_loader():
runai_model_streamer_tensors
=
{}
hf_safetensors_tensors
=
{}
for
name
,
tensor
in
runai_safetensors_weights_iterator
(
safetensors
):
for
name
,
tensor
in
runai_safetensors_weights_iterator
(
safetensors
,
True
):
runai_model_streamer_tensors
[
name
]
=
tensor
for
name
,
tensor
in
safetensors_weights_iterator
(
safetensors
):
for
name
,
tensor
in
safetensors_weights_iterator
(
safetensors
,
True
):
hf_safetensors_tensors
[
name
]
=
tensor
assert
len
(
runai_model_streamer_tensors
)
==
len
(
hf_safetensors_tensors
)
...
...
tests/tensorizer_loader/conftest.py
View file @
4eabe123
...
...
@@ -5,14 +5,6 @@ from vllm.distributed import cleanup_dist_env_and_memory
from
vllm.model_executor.model_loader.tensorizer
import
TensorizerConfig
@
pytest
.
fixture
(
scope
=
"function"
,
autouse
=
True
)
def
use_v0_only
(
monkeypatch
):
"""
Tensorizer only tested on V0 so far.
"""
monkeypatch
.
setenv
(
'VLLM_USE_V1'
,
'0'
)
@
pytest
.
fixture
(
autouse
=
True
)
def
cleanup
():
cleanup_dist_env_and_memory
(
shutdown_ray
=
True
)
...
...
Prev
1
…
16
17
18
19
20
21
22
23
24
…
34
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment