Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
38d80967
Commit
38d80967
authored
Sep 12, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.10.2rc2' into v0.10.2rc2-ori
parents
33650733
880c741b
Changes
544
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
287 additions
and
626 deletions
+287
-626
tests/models/multimodal/generation/vlm_utils/builders.py
tests/models/multimodal/generation/vlm_utils/builders.py
+1
-1
tests/models/multimodal/generation/vlm_utils/case_filtering.py
.../models/multimodal/generation/vlm_utils/case_filtering.py
+1
-1
tests/models/multimodal/generation/vlm_utils/core.py
tests/models/multimodal/generation/vlm_utils/core.py
+4
-1
tests/models/multimodal/pooling/test_prithvi_mae.py
tests/models/multimodal/pooling/test_prithvi_mae.py
+1
-1
tests/models/multimodal/processing/test_common.py
tests/models/multimodal/processing/test_common.py
+5
-1
tests/models/multimodal/processing/test_glm4_1v.py
tests/models/multimodal/processing/test_glm4_1v.py
+47
-0
tests/models/multimodal/processing/test_mllama4.py
tests/models/multimodal/processing/test_mllama4.py
+1
-1
tests/models/multimodal/processing/test_tensor_schema.py
tests/models/multimodal/processing/test_tensor_schema.py
+4
-4
tests/models/multimodal/test_mapping.py
tests/models/multimodal/test_mapping.py
+3
-1
tests/models/registry.py
tests/models/registry.py
+80
-21
tests/models/test_initialization.py
tests/models/test_initialization.py
+17
-4
tests/models/test_terratorch.py
tests/models/test_terratorch.py
+45
-0
tests/models/utils.py
tests/models/utils.py
+23
-4
tests/multimodal/test_utils.py
tests/multimodal/test_utils.py
+55
-24
tests/neuron/1_core/test_activation.py
tests/neuron/1_core/test_activation.py
+0
-43
tests/neuron/1_core/test_block_table.py
tests/neuron/1_core/test_block_table.py
+0
-154
tests/neuron/1_core/test_cache.py
tests/neuron/1_core/test_cache.py
+0
-86
tests/neuron/1_core/test_layernorm.py
tests/neuron/1_core/test_layernorm.py
+0
-57
tests/neuron/1_core/test_logits_processor.py
tests/neuron/1_core/test_logits_processor.py
+0
-95
tests/neuron/1_core/test_neuron_model_runner.py
tests/neuron/1_core/test_neuron_model_runner.py
+0
-127
No files found.
Too many changes to show.
To preserve performance only
544 of 544+
files are displayed.
Plain diff
Email patch
tests/models/multimodal/generation/vlm_utils/builders.py
View file @
38d80967
...
...
@@ -250,7 +250,7 @@ def build_video_inputs_from_test_info(
def
apply_image_size_scaling
(
image
,
size
:
Union
[
float
,
tuple
[
int
,
int
]],
size_type
:
SizeType
):
"""Applies a size scaler to one image; this can be
a
an image size factor,
"""Applies a size scaler to one image; this can be an image size factor,
which scales the image while maintaining the aspect ratio"""
# Special case for embeddings; if it's a tensor, it's only valid if we
# are considering size factors at constant scale, i.e., we just clone
...
...
tests/models/multimodal/generation/vlm_utils/case_filtering.py
View file @
38d80967
...
...
@@ -42,7 +42,7 @@ def get_filtered_test_settings(
else
:
assert
test_info
.
prompt_formatter
is
not
None
# Everything looks okay; keep if this is
has
correct proc handling
# Everything looks okay; keep if this is correct proc handling
if
(
test_info
.
distributed_executor_backend
is
not
None
)
==
new_proc_per_test
:
matching_tests
[
test_name
]
=
test_info
...
...
tests/models/multimodal/generation/vlm_utils/core.py
View file @
38d80967
...
...
@@ -42,7 +42,7 @@ def run_test(
tensor_parallel_size
:
int
=
1
,
vllm_embeddings
:
Optional
[
torch
.
Tensor
]
=
None
,
):
"""Modality agnostic test
test
executor for comparing HF/vLLM outputs."""
"""Modality agnostic test executor for comparing HF/vLLM outputs."""
# In the case of embeddings, vLLM takes separate input tensors
vllm_inputs
=
vllm_embeddings
if
vllm_embeddings
is
not
None
else
inputs
...
...
@@ -69,6 +69,9 @@ def run_test(
vllm_runner_kwargs_
[
"tokenizer_mode"
]
=
model_info
.
tokenizer_mode
if
model_info
.
hf_overrides
:
vllm_runner_kwargs_
[
"hf_overrides"
]
=
model_info
.
hf_overrides
if
model_info
.
skip_tokenizer_init
:
vllm_runner_kwargs_
[
"skip_tokenizer_init"
]
=
model_info
.
skip_tokenizer_init
if
vllm_runner_kwargs
:
vllm_runner_kwargs_
.
update
(
vllm_runner_kwargs
)
...
...
tests/models/multimodal/pooling/test_prithvi_mae.py
View file @
38d80967
...
...
@@ -46,7 +46,7 @@ def _run_test(
vllm_model
.
encode
(
prompt
)
MODELS
=
[
"
christian-pinto
/Prithvi-EO-2.0-300M-TL-
VLLM
"
]
MODELS
=
[
"
mgazz
/Prithvi-EO-2.0-300M-TL-
Sen1Floods11
"
]
@
pytest
.
mark
.
core_model
...
...
tests/models/multimodal/processing/test_common.py
View file @
38d80967
...
...
@@ -66,7 +66,9 @@ def _test_processing_correctness(
hf_overrides
=
model_info
.
hf_overrides
,
# Ensure that the cache can fit all of the data
mm_processor_cache_gb
=
2048
,
)
skip_tokenizer_init
=
model_info
.
skip_tokenizer_init
,
enforce_eager
=
model_info
.
enforce_eager
,
dtype
=
model_info
.
dtype
)
model_cls
=
MULTIMODAL_REGISTRY
.
_get_model_cls
(
model_config
)
factories
=
MULTIMODAL_REGISTRY
.
_processor_factories
[
model_cls
]
...
...
@@ -293,6 +295,7 @@ def _test_processing_correctness_one(
"OpenGVLab/InternVL3_5-GPT-OSS-20B-A4B-Preview"
,
"OpenGVLab/InternVL3_5-30B-A3B"
,
"Kwai-Keye/Keye-VL-8B-Preview"
,
"Kwai-Keye/Keye-VL-1_5-8B"
,
"moonshotai/Kimi-VL-A3B-Instruct"
,
"meta-llama/Llama-4-Scout-17B-16E-Instruct"
,
"llava-hf/llava-1.5-7b-hf"
,
...
...
@@ -301,6 +304,7 @@ def _test_processing_correctness_one(
"llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
,
"meta-llama/Llama-3.2-11B-Vision-Instruct"
,
"TIGER-Lab/Mantis-8B-siglip-llama3"
,
"mispeech/midashenglm-7b"
,
"openbmb/MiniCPM-Llama3-V-2_5"
,
"openbmb/MiniCPM-o-2_6"
,
"openbmb/MiniCPM-V-2_6"
,
...
...
tests/models/multimodal/processing/test_glm4_1v.py
View file @
38d80967
...
...
@@ -5,6 +5,7 @@ import pytest
from
vllm.assets.video
import
VideoAsset
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.video
import
OpenCVDynamicVideoBackend
,
OpenCVVideoBackend
from
...utils
import
build_model_context
...
...
@@ -50,3 +51,49 @@ def test_processor_override(
assert
grid_t
==
expected_grid_t
assert
video_tok_count
==
expected_toks_per_frame
*
grid_t
@pytest.mark.parametrize("model_id", ["zai-org/GLM-4.1V-9B-Thinking"])
@pytest.mark.parametrize("fps", [2])
def test_video_loader_consistency(
    model_id: str,
    fps: int,
):
    """
    Check that the dynamic OpenCV video backend and the regular OpenCV
    video backend lead to identical multimodal processor outputs.

    The same video bytes are decoded by both backends (the dynamic one is
    given ``requested_fps=fps``), each result is run through the processor
    with the same prompt and ``fps`` kwarg, and the resulting prompt token
    ids and multimodal kwargs are compared for equality.
    """
    model_context = build_model_context(
        model_id,
        mm_processor_kwargs=None,
        limit_mm_per_prompt={"video": 1},
    )
    mm_processor = MULTIMODAL_REGISTRY.create_processor(
        model_context.model_config)
    processor_kwargs = {"fps": fps}

    # Prompt containing exactly one video placeholder
    video_prompt = "<|begin_of_video|><|video|><|end_of_video|>"

    # Read the raw bytes once so both backends decode the same data
    asset = VideoAsset(name="baby_reading", num_frames=-1)
    with open(asset.video_path, "rb") as video_file:
        raw_video = video_file.read()

    static_frames, static_meta = OpenCVVideoBackend.load_bytes(raw_video)
    dynamic_frames, dynamic_meta = OpenCVDynamicVideoBackend.load_bytes(
        raw_video, requested_fps=fps)

    # The pre-sampling loader is expected to return fewer frames than the
    # loader that reads the whole video
    assert len(dynamic_frames) < len(static_frames)

    static_result = mm_processor.apply(
        video_prompt,
        {"video": [(static_frames, static_meta)]},
        processor_kwargs,
    )
    dynamic_result = mm_processor.apply(
        video_prompt,
        {"video": [(dynamic_frames, dynamic_meta)]},
        processor_kwargs,
    )

    static_token_ids = static_result["prompt_token_ids"]
    dynamic_token_ids = dynamic_result["prompt_token_ids"]
    assert static_token_ids == dynamic_token_ids

    static_mm = static_result["mm_kwargs"].get_data()
    dynamic_mm = dynamic_result["mm_kwargs"].get_data()
    assert static_mm == dynamic_mm
tests/models/multimodal/processing/test_mllama4.py
View file @
38d80967
...
...
@@ -52,7 +52,7 @@ def test_profiling(model_id: str, max_model_len: int):
chunks_per_image
=
prod
(
mm_data
[
"patches_per_image"
])
total_num_patches
=
chunks_per_image
*
tokens_per_patch
num_tiles
=
mm_data
[
"aspect_ratios"
][
0
][
0
]
*
mm_data
[
"aspect_ratios"
][
0
][
1
]
# x-y sep
e
rator tokens
1
]
# x-y sep
a
rator tokens
total_tokens
=
total_num_patches
.
item
()
+
num_tiles
.
item
(
)
+
3
# image start, image, image end
...
...
tests/models/multimodal/processing/test_tensor_schema.py
View file @
38d80967
...
...
@@ -31,6 +31,7 @@ from ...utils import dummy_hf_overrides
ARCH_TO_SKIP
=
{
"MolmoForCausalLM"
:
"incompatible requirements"
,
"Florence2ForConditionalGeneration"
:
"not supported in V1"
,
}
ARCH_NEEDS_EXTRAS
=
[
"InternVLChatModel"
,
...
...
@@ -41,9 +42,6 @@ ARCH_NEEDS_EXTRAS = [
]
REPO_ID_TO_SKIP
=
{
"nm-testing/pixtral-12b-FP8-dynamic"
:
"duplicated test"
,
# FIXME(Isotr0py): enable GPT-OSS based InternVL3.5 model
# after support PP for GPT-OSS
"OpenGVLab/InternVL3_5-GPT-OSS-20B-A4B-Preview"
:
"Broken model"
,
}
ImageInput
=
list
[
Image
.
Image
]
...
...
@@ -199,7 +197,9 @@ def test_model_tensor_schema(model_arch: str, model_id: str):
revision
=
model_info
.
revision
,
trust_remote_code
=
model_info
.
trust_remote_code
,
hf_overrides
=
hf_overrides_fn
,
)
skip_tokenizer_init
=
model_info
.
skip_tokenizer_init
,
enforce_eager
=
model_info
.
enforce_eager
,
dtype
=
model_info
.
dtype
)
model_cls
=
MULTIMODAL_REGISTRY
.
_get_model_cls
(
model_config
)
factories
=
MULTIMODAL_REGISTRY
.
_processor_factories
[
model_cls
]
...
...
tests/models/multimodal/test_mapping.py
View file @
38d80967
...
...
@@ -59,7 +59,9 @@ def test_hf_model_weights_mapper(model_arch: str):
revision
=
model_info
.
revision
,
trust_remote_code
=
model_info
.
trust_remote_code
,
hf_overrides
=
model_info
.
hf_overrides
,
)
skip_tokenizer_init
=
model_info
.
skip_tokenizer_init
,
enforce_eager
=
model_info
.
enforce_eager
,
dtype
=
model_info
.
dtype
)
model_cls
=
MULTIMODAL_REGISTRY
.
_get_model_cls
(
model_config
)
original_weights
=
create_repo_dummy_weights
(
model_id
)
...
...
tests/models/registry.py
View file @
38d80967
...
...
@@ -6,10 +6,11 @@ from dataclasses import dataclass, field
from
typing
import
Any
,
Literal
,
Optional
import
pytest
import
torch
from
packaging.version
import
Version
from
transformers
import
__version__
as
TRANSFORMERS_VERSION
from
vllm.config
import
TokenizerMode
from
vllm.config
import
ModelDType
,
TokenizerMode
@
dataclass
(
frozen
=
True
)
...
...
@@ -47,6 +48,23 @@ class _HfExamplesInfo:
The reason for the minimum/maximum version requirement.
"""
skip_tokenizer_init
:
bool
=
False
"""
If true, skip initialization of tokenizer and detokenizer.
"""
dtype
:
ModelDType
=
"auto"
"""
The data type for the model weights and activations.
"""
enforce_eager
:
bool
=
False
"""
Whether to enforce eager execution. If True, we will
disable CUDA graph and always execute the model in eager mode.
If False, we will use CUDA graph and eager execution in hybrid.
"""
is_available_online
:
bool
=
True
"""
Set this to ``False`` if the name of this architecture no longer exists on
...
...
@@ -76,6 +94,15 @@ class _HfExamplesInfo:
If not specified, the default revision will be used.
"""
max_num_seqs
:
Optional
[
int
]
=
None
"""Maximum number of sequences to be processed in a single iteration."""
use_original_num_layers
:
bool
=
False
"""
If True, use the original number of layers from the model config
instead of minimal layers for testing.
"""
def
check_transformers_version
(
self
,
*
,
...
...
@@ -137,7 +164,7 @@ class _HfExamplesInfo:
# yapf: disable
_TEXT_GENERATION_EXAMPLE_MODELS
=
{
# [Decoder-only]
"ApertusForCausalLM"
:
_HfExamplesInfo
(
"swiss-ai/Apertus-8B"
,
"ApertusForCausalLM"
:
_HfExamplesInfo
(
"swiss-ai/Apertus-8B
-2509
"
,
min_transformers_version
=
"4.56.0"
,
trust_remote_code
=
True
),
"AquilaModel"
:
_HfExamplesInfo
(
"BAAI/AquilaChat-7B"
,
...
...
@@ -154,7 +181,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
"BailingMoeForCausalLM"
:
_HfExamplesInfo
(
"inclusionAI/Ling-lite-1.5"
,
trust_remote_code
=
True
),
"BambaForCausalLM"
:
_HfExamplesInfo
(
"ibm-ai-platform/Bamba-9B-v1"
,
min_transformers_version
=
"4.5
6.0
"
,
min_transformers_version
=
"4.5
5.3
"
,
extras
=
{
"tiny"
:
"hmellor/tiny-random-BambaForCausalLM"
}),
# noqa: E501
"BloomForCausalLM"
:
_HfExamplesInfo
(
"bigscience/bloom-560m"
,
{
"1b"
:
"bigscience/bloomz-1b1"
}),
...
...
@@ -208,7 +235,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
"GptOssForCausalLM"
:
_HfExamplesInfo
(
"lmsys/gpt-oss-20b-bf16"
),
"GraniteForCausalLM"
:
_HfExamplesInfo
(
"ibm/PowerLM-3b"
),
"GraniteMoeForCausalLM"
:
_HfExamplesInfo
(
"ibm/PowerMoE-3b"
),
"GraniteMoeHybridForCausalLM"
:
_HfExamplesInfo
(
"ibm-granite/granite-4.0-tiny-preview"
),
# noqa: E501
"GraniteMoeHybridForCausalLM"
:
_HfExamplesInfo
(
"ibm-granite/granite-4.0-tiny-preview"
,
# noqa: E501
min_transformers_version
=
"4.55.3"
),
"GraniteMoeSharedForCausalLM"
:
_HfExamplesInfo
(
"ibm-research/moe-7b-1b-active-shared-experts"
),
# noqa: E501
"Grok1ModelForCausalLM"
:
_HfExamplesInfo
(
"hpcai-tech/grok-1"
,
trust_remote_code
=
True
),
...
...
@@ -228,7 +256,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
trust_remote_code
=
True
),
"JAISLMHeadModel"
:
_HfExamplesInfo
(
"inceptionai/jais-13b-chat"
),
"JambaForCausalLM"
:
_HfExamplesInfo
(
"ai21labs/AI21-Jamba-1.5-Mini"
,
min_transformers_version
=
"4.5
6.0
"
,
min_transformers_version
=
"4.5
5.3
"
,
extras
=
{
"tiny"
:
"ai21labs/Jamba-tiny-dev"
,
"random"
:
"ai21labs/Jamba-tiny-random"
,
# noqa: E501
...
...
@@ -244,7 +272,11 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
"Llama4ForCausalLM"
:
_HfExamplesInfo
(
"meta-llama/Llama-4-Scout-17B-16E-Instruct"
,
# noqa: E501
is_available_online
=
False
),
"MambaForCausalLM"
:
_HfExamplesInfo
(
"state-spaces/mamba-130m-hf"
),
"Mamba2ForCausalLM"
:
_HfExamplesInfo
(
"mistralai/Mamba-Codestral-7B-v0.1"
),
"Mamba2ForCausalLM"
:
_HfExamplesInfo
(
"mistralai/Mamba-Codestral-7B-v0.1"
,
min_transformers_version
=
"4.55.3"
,
extras
=
{
"random"
:
"yujiepan/mamba2-codestral-v0.1-tiny-random"
,
# noqa: E501
}),
"FalconMambaForCausalLM"
:
_HfExamplesInfo
(
"tiiuae/falcon-mamba-7b-instruct"
),
# noqa: E501
"MiniCPMForCausalLM"
:
_HfExamplesInfo
(
"openbmb/MiniCPM-2B-sft-bf16"
,
trust_remote_code
=
True
),
...
...
@@ -259,7 +291,9 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
"MistralForCausalLM"
:
_HfExamplesInfo
(
"mistralai/Mistral-7B-Instruct-v0.1"
),
"MixtralForCausalLM"
:
_HfExamplesInfo
(
"mistralai/Mixtral-8x7B-Instruct-v0.1"
,
# noqa: E501
{
"tiny"
:
"TitanML/tiny-mixtral"
}),
# noqa: E501
"QuantMixtralForCausalLM"
:
_HfExamplesInfo
(
"mistral-community/Mixtral-8x22B-v0.1-AWQ"
),
# noqa: E501
"MotifForCausalLM"
:
_HfExamplesInfo
(
"Motif-Technologies/Motif-2.6B"
,
trust_remote_code
=
True
,
v0_only
=
True
),
"MptForCausalLM"
:
_HfExamplesInfo
(
"mpt"
,
is_available_online
=
False
),
"MPTForCausalLM"
:
_HfExamplesInfo
(
"mosaicml/mpt-7b"
),
"NemotronForCausalLM"
:
_HfExamplesInfo
(
"nvidia/Minitron-8B-Base"
),
...
...
@@ -282,8 +316,6 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
"PhiMoEForCausalLM"
:
_HfExamplesInfo
(
"microsoft/Phi-3.5-MoE-instruct"
,
trust_remote_code
=
True
),
"Plamo2ForCausalLM"
:
_HfExamplesInfo
(
"pfnet/plamo-2-1b"
,
max_transformers_version
=
"4.53"
,
transformers_version_reason
=
"vLLM impl inherits PreTrainedModel and clashes with get_input_embeddings"
,
# noqa: E501
trust_remote_code
=
True
),
"QWenLMHeadModel"
:
_HfExamplesInfo
(
"Qwen/Qwen-7B-Chat"
,
max_transformers_version
=
"4.53"
,
...
...
@@ -294,6 +326,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
"Qwen2MoeForCausalLM"
:
_HfExamplesInfo
(
"Qwen/Qwen1.5-MoE-A2.7B-Chat"
),
"Qwen3ForCausalLM"
:
_HfExamplesInfo
(
"Qwen/Qwen3-8B"
),
"Qwen3MoeForCausalLM"
:
_HfExamplesInfo
(
"Qwen/Qwen3-30B-A3B"
),
"Qwen3NextForCausalLM"
:
_HfExamplesInfo
(
"Qwen/Qwen3-Next-80B-A3B-Instruct"
,
min_transformers_version
=
"4.56.2"
),
"RWForCausalLM"
:
_HfExamplesInfo
(
"tiiuae/falcon-40b"
),
"SeedOssForCausalLM"
:
_HfExamplesInfo
(
"ByteDance-Seed/Seed-OSS-36B-Instruct"
,
# noqa: E501
trust_remote_code
=
True
,
...
...
@@ -328,6 +362,7 @@ _EMBEDDING_EXAMPLE_MODELS = {
# [Text-only]
"BertModel"
:
_HfExamplesInfo
(
"BAAI/bge-base-en-v1.5"
),
"Gemma2Model"
:
_HfExamplesInfo
(
"BAAI/bge-multilingual-gemma2"
),
# noqa: E501
"Gemma3TextModel"
:
_HfExamplesInfo
(
"google/embeddinggemma-300m"
),
"GritLM"
:
_HfExamplesInfo
(
"parasail-ai/GritLM-7B-vllm"
),
"GteModel"
:
_HfExamplesInfo
(
"Snowflake/snowflake-arctic-embed-m-v2.0"
,
trust_remote_code
=
True
),
...
...
@@ -359,7 +394,20 @@ _EMBEDDING_EXAMPLE_MODELS = {
trust_remote_code
=
True
),
"Qwen2VLForConditionalGeneration"
:
_HfExamplesInfo
(
"MrLight/dse-qwen2-2b-mrl-v1"
),
# noqa: E501
"PrithviGeoSpatialMAE"
:
_HfExamplesInfo
(
"ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11"
,
# noqa: E501
is_available_online
=
False
),
# noqa: E501
dtype
=
torch
.
float16
,
enforce_eager
=
True
,
skip_tokenizer_init
=
True
,
# This is to avoid the model
# going OOM in CI
max_num_seqs
=
32
,
),
"Terratorch"
:
_HfExamplesInfo
(
"ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11"
,
# noqa: E501
dtype
=
torch
.
float16
,
enforce_eager
=
True
,
skip_tokenizer_init
=
True
,
# This is to avoid the model going OOM in CI
max_num_seqs
=
32
,
),
}
_SEQUENCE_CLASSIFICATION_EXAMPLE_MODELS
=
{
...
...
@@ -438,6 +486,8 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"InternVLForConditionalGeneration"
:
_HfExamplesInfo
(
"OpenGVLab/InternVL3-1B-hf"
),
# noqa: E501
"KeyeForConditionalGeneration"
:
_HfExamplesInfo
(
"Kwai-Keye/Keye-VL-8B-Preview"
,
# noqa: E501
trust_remote_code
=
True
),
"KeyeVL1_5ForConditionalGeneration"
:
_HfExamplesInfo
(
"Kwai-Keye/Keye-VL-1_5-8B"
,
# noqa: E501
trust_remote_code
=
True
),
"KimiVLForConditionalGeneration"
:
_HfExamplesInfo
(
"moonshotai/Kimi-VL-A3B-Instruct"
,
# noqa: E501
extras
=
{
"thinking"
:
"moonshotai/Kimi-VL-A3B-Thinking"
},
# noqa: E501
trust_remote_code
=
True
),
...
...
@@ -455,6 +505,8 @@ _MULTIMODAL_EXAMPLE_MODELS = {
max_transformers_version
=
"4.48"
,
# noqa: E501
transformers_version_reason
=
"HF model is not compatible."
,
# noqa: E501
hf_overrides
=
{
"architectures"
:
[
"MantisForConditionalGeneration"
]}),
# noqa: E501
"MiDashengLMModel"
:
_HfExamplesInfo
(
"mispeech/midashenglm-7b"
,
trust_remote_code
=
True
),
"MiniCPMO"
:
_HfExamplesInfo
(
"openbmb/MiniCPM-o-2_6"
,
trust_remote_code
=
True
),
"MiniCPMV"
:
_HfExamplesInfo
(
"openbmb/MiniCPM-Llama3-V-2_5"
,
...
...
@@ -474,6 +526,9 @@ _MULTIMODAL_EXAMPLE_MODELS = {
trust_remote_code
=
True
),
"Llama_Nemotron_Nano_VL"
:
_HfExamplesInfo
(
"nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1"
,
# noqa: E501
trust_remote_code
=
True
),
"NemotronH_Nano_VL"
:
_HfExamplesInfo
(
"nano_vl_dummy"
,
is_available_online
=
False
,
trust_remote_code
=
True
),
"Ovis"
:
_HfExamplesInfo
(
"AIDC-AI/Ovis2-1B"
,
trust_remote_code
=
True
,
max_transformers_version
=
"4.53"
,
transformers_version_reason
=
"HF model is not compatible"
,
# noqa: E501
...
...
@@ -554,19 +609,21 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = {
"EagleDeepSeekMTPModel"
:
_HfExamplesInfo
(
"eagle618/deepseek-v3-random"
,
speculative_model
=
"eagle618/eagle-deepseek-v3-random"
,
# noqa: E501
trust_remote_code
=
True
),
"EagleLlamaForCausalLM"
:
_HfExamplesInfo
(
"
yuhuili/EAGLE-LLaMA3-Instruct-8B"
,
"EagleLlamaForCausalLM"
:
_HfExamplesInfo
(
"
meta-llama/Meta-Llama-3-8B-Instruct"
,
# noqa: E501
trust_remote_code
=
True
,
speculative_model
=
"yuhuili/EAGLE-LLaMA3-Instruct-8B"
,
tokenizer
=
"meta-llama/Meta-Llama-3-8B-Instruct"
),
# noqa: E501
"Eagle3LlamaForCausalLM"
:
_HfExamplesInfo
(
"yuhuili/EAGLE3-LLaMA3.1-Instruct-8B"
,
# noqa: E501
tokenizer
=
"meta-llama/Meta-Llama-3-8B-Instruct"
),
# noqa: E501
"Eagle3LlamaForCausalLM"
:
_HfExamplesInfo
(
"meta-llama/Llama-3.1-8B-Instruct"
,
# noqa: E501
trust_remote_code
=
True
,
speculative_model
=
"yuhuili/EAGLE3-LLaMA3.1-Instruct-8B"
,
# noqa: E501
tokenizer
=
"meta-llama/Llama-3.1-8B-Instruct"
,
use_original_num_layers
=
True
,
max_model_len
=
10240
),
"LlamaForCausalLMEagle3"
:
_HfExamplesInfo
(
"Qwen/Qwen3-8B"
,
# noqa: E501
trust_remote_code
=
True
,
speculative_model
=
"yuhuili/EAGLE3-LLaMA3.1-Instruct-8B"
,
tokenizer
=
"meta-llama/Llama-3.1-8B-Instruct"
),
# TODO: Re-enable this once tests/models/test_initialization.py is fixed, see PR #22333 #22611 # noqa: E501
# "LlamaForCausalLMEagle3": _HfExamplesInfo("AngelSlim/Qwen3-8B_eagle3", # noqa: E501
# trust_remote_code=True,
# speculative_model="AngelSlim/Qwen3-8B_eagle3", # noqa: E501
# tokenizer="Qwen/Qwen3-8B"),
speculative_model
=
"AngelSlim/Qwen3-8B_eagle3"
,
# noqa: E501
tokenizer
=
"Qwen/Qwen3-8B"
,
use_original_num_layers
=
True
),
"EagleLlama4ForCausalLM"
:
_HfExamplesInfo
(
"morgendave/EAGLE-Llama-4-Scout-17B-16E-Instruct"
,
trust_remote_code
=
True
,
...
...
@@ -586,7 +643,9 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = {
is_available_online
=
False
),
"MiMoMTPModel"
:
_HfExamplesInfo
(
"XiaomiMiMo/MiMo-7B-RL"
,
trust_remote_code
=
True
,
speculative_model
=
"XiaomiMiMo/MiMo-7B-RL"
)
speculative_model
=
"XiaomiMiMo/MiMo-7B-RL"
),
"Qwen3NextMTP"
:
_HfExamplesInfo
(
"Qwen/Qwen3-Next-80B-A3B-Instruct"
,
min_transformers_version
=
"4.56.2"
),
}
_TRANSFORMERS_BACKEND_MODELS
=
{
...
...
tests/models/test_initialization.py
View file @
38d80967
...
...
@@ -36,7 +36,10 @@ def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch,
hf_overrides_fn
=
partial
(
dummy_hf_overrides
,
model_arch
=
model_arch
,
exist_overrides
=
model_info
.
hf_overrides
)
exist_overrides
=
model_info
.
hf_overrides
,
use_original_num_layers
=
getattr
(
model_info
,
'use_original_num_layers'
,
False
))
# Avoid calling model.forward()
def
_initialize_kv_caches_v0
(
self
)
->
None
:
...
...
@@ -60,19 +63,29 @@ def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch,
_initialize_kv_caches_v1
),
monkeypatch
.
context
()
as
m
):
if
model_info
.
v0_only
:
m
.
setenv
(
"VLLM_USE_V1"
,
"0"
)
if
model_arch
==
"Phi4FlashForCausalLM"
:
# Phi4FlashForCausalLM only supports DIFFERENTIAL_FLASH_ATTN backend
if
model_arch
in
(
"Phi4FlashForCausalLM"
,
"MotifForCausalLM"
):
# Phi4FlashForCausalLM and MotifForCausalLM
# only support the DIFFERENTIAL_FLASH_ATTN backend
m
.
setenv
(
"VLLM_ATTENTION_BACKEND"
,
"DIFFERENTIAL_FLASH_ATTN"
)
if
model_arch
==
"GptOssForCausalLM"
:
# FIXME: A hack to bypass FA3 assertion because our CI's L4 GPU
# has cc==8.9 which hasn't supported FA3 yet. Remove this hack when
# L4 supports FA3.
m
.
setenv
(
"VLLM_ATTENTION_BACKEND"
,
"TRITON_ATTN_VLLM_V1"
)
if
model_arch
==
"Florence2ForConditionalGeneration"
:
# An encoder-decoder model that's V0-only. Just skip it
# since V0 is about to be removed.
pytest
.
skip
(
"Skipping Florence2ForConditionalGeneration"
)
if
model_arch
==
"WhisperForConditionalGeneration"
:
m
.
setenv
(
"VLLM_WORKER_MULTIPROC_METHOD"
,
"spawn"
)
LLM
(
model_info
.
default
,
tokenizer
=
model_info
.
tokenizer
,
tokenizer_mode
=
model_info
.
tokenizer_mode
,
revision
=
model_info
.
revision
,
enforce_eager
=
model_info
.
enforce_eager
,
skip_tokenizer_init
=
model_info
.
skip_tokenizer_init
,
dtype
=
model_info
.
dtype
,
speculative_config
=
{
"model"
:
model_info
.
speculative_model
,
"num_speculative_tokens"
:
1
,
...
...
@@ -85,7 +98,7 @@ def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch,
model_impl
=
ModelImpl
.
TRANSFORMERS
if
model_arch
in
_TRANSFORMERS_BACKEND_MODELS
else
ModelImpl
.
VLLM
,
hf_overrides
=
hf_overrides_fn
,
)
max_num_seqs
=
model_info
.
max_num_seqs
)
@
pytest
.
mark
.
parametrize
(
"model_arch"
,
HF_EXAMPLE_MODELS
.
get_supported_archs
())
...
...
tests/models/test_terratorch.py
0 → 100644
View file @
38d80967
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
pytest
import
torch
from
tests.conftest
import
VllmRunner
from
vllm.utils
import
set_default_torch_num_threads
@pytest.mark.parametrize(
    "model",
    [
        "ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11",
        "mgazz/Prithvi_v2_eo_300_tl_unet_agb"
    ],
)
def test_inference(
    vllm_runner: type[VllmRunner],
    model: str,
) -> None:
    """Encode a constant dummy input with ``model`` and check for NaNs.

    The prompt carries a single token id plus two float16 tensors filled
    with 1.0 as multimodal data; the test passes when the first output's
    data contains no NaN value.
    """
    # Constant-valued float16 inputs
    mm_data = {
        "pixel_values": torch.full((6, 512, 512), 1.0, dtype=torch.float16),
        "location_coords": torch.full((1, 2), 1.0, dtype=torch.float16),
    }
    prompt = {"prompt_token_ids": [1], "multi_modal_data": mm_data}

    with (
            set_default_torch_num_threads(1),
            vllm_runner(
                model,
                runner="pooling",
                dtype=torch.float16,
                enforce_eager=True,
                skip_tokenizer_init=True,
                # Cap the number of sequences so the warmup run does not
                # exhaust memory
                max_num_seqs=32,
            ) as vllm_model,
    ):
        outputs = vllm_model.llm.encode(prompt)

        has_nan = torch.isnan(outputs[0].outputs.data).any()
        assert torch.equal(has_nan, torch.tensor(False))
tests/models/utils.py
View file @
38d80967
...
...
@@ -294,6 +294,8 @@ def build_model_context(
limit_mm_per_prompt
=
limit_mm_per_prompt
,
mm_processor_cache_gb
=
mm_processor_cache_gb
,
hf_overrides
=
model_info
.
hf_overrides
,
skip_tokenizer_init
=
model_info
.
skip_tokenizer_init
,
enforce_eager
=
model_info
.
enforce_eager
,
**
model_config_kwargs
,
)
return
InputContext
(
model_config
)
...
...
@@ -345,6 +347,7 @@ class ModelInfo:
name
:
str
architecture
:
str
=
""
dtype
:
str
=
"auto"
hf_dtype
:
str
=
"float32"
hf_overrides
:
Optional
[
dict
[
str
,
Any
]]
=
None
default_pooling_type
:
str
=
""
enable_test
:
bool
=
True
...
...
@@ -352,6 +355,7 @@ class ModelInfo:
@
dataclass
class
EmbedModelInfo
(
ModelInfo
):
mteb_score
:
Optional
[
float
]
=
None
is_matryoshka
:
bool
=
False
matryoshka_dimensions
:
Optional
[
list
[
int
]]
=
None
...
...
@@ -368,7 +372,7 @@ class LASTPoolingEmbedModelInfo(EmbedModelInfo):
@
dataclass
class
RerankModelInfo
(
ModelInfo
):
pass
mteb_score
:
Optional
[
float
]
=
None
@
dataclass
...
...
@@ -381,11 +385,18 @@ class LASTPoolingRerankModelInfo(RerankModelInfo):
default_pooling_type
:
str
=
"LAST"
@
dataclass
class
GenerateModelInfo
(
ModelInfo
):
hf_dtype
:
str
=
"auto"
hf_ppl
:
Optional
[
float
]
=
None
def
dummy_hf_overrides
(
hf_config
:
PretrainedConfig
,
*
,
model_arch
:
str
=
""
,
exist_overrides
:
Optional
[
dict
[
str
,
Any
]]
=
None
,
use_original_num_layers
:
bool
=
False
,
)
->
PretrainedConfig
:
"""
Dummy HF overrides function used to create dummy model
...
...
@@ -402,10 +413,18 @@ def dummy_hf_overrides(
# we use three layers for Gemma-3n to check
# both normal layer and kv_shared_layer
num_hidden_layers
=
(
3
if
model_arch
==
"Gemma3nForConditionalGeneration"
else
1
)
if
use_original_num_layers
:
# Use the original number of layers from the config
num_layers
=
getattr
(
text_config
,
'num_layers'
,
1
)
num_hidden_layers
=
getattr
(
text_config
,
'num_hidden_layers'
,
1
)
else
:
# Use minimal layers for testing
num_layers
=
1
num_hidden_layers
=
(
3
if
model_arch
==
"Gemma3nForConditionalGeneration"
else
1
)
text_config
.
update
({
"num_layers"
:
1
,
"num_layers"
:
num_layers
,
"num_hidden_layers"
:
num_hidden_layers
,
"num_experts"
:
num_experts
,
"num_experts_per_tok"
:
2
,
...
...
tests/multimodal/test_utils.py
View file @
38d80967
...
...
@@ -31,11 +31,11 @@ if TYPE_CHECKING:
from
vllm.multimodal.inputs
import
MultiModalPlaceholderDict
# Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA)
TEST_IMAGE_
URL
S
=
[
"https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
,
"https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png"
,
"https://upload.wikimedia.org/wikipedia/commons/thumb/9/91/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png"
,
"https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png"
,
TEST_IMAGE_
ASSET
S
=
[
"2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
,
#
"https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
"Grayscale_8bits_palette_sample_image.png"
,
#
"https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png",
"1280px-Venn_diagram_rgb.svg.png"
,
#
"https://upload.wikimedia.org/wikipedia/commons/thumb/9/91/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png",
"RGBA_comp.png"
,
#
"https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png",
]
TEST_VIDEO_URLS
=
[
...
...
@@ -45,12 +45,11 @@ TEST_VIDEO_URLS = [
@
pytest
.
fixture
(
scope
=
"module"
)
def
url_images
()
->
dict
[
str
,
Image
.
Image
]:
connector
=
MediaConnector
()
def
url_images
(
local_asset_server
)
->
dict
[
str
,
Image
.
Image
]:
return
{
image_url
:
connecto
r
.
f
et
ch
_image
(
image_url
)
for
image_url
in
TEST_IMAGE_
URL
S
image_url
:
local_asset_serve
r
.
g
et_image
_asset
(
image_url
)
for
image_url
in
TEST_IMAGE_
ASSET
S
}
...
...
@@ -69,7 +68,7 @@ def _image_equals(a: Image.Image, b: Image.Image) -> bool:
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"image_url"
,
TEST_IMAGE_
URLS
)
@
pytest
.
mark
.
parametrize
(
"image_url"
,
TEST_IMAGE_
ASSETS
,
indirect
=
True
)
async
def
test_fetch_image_http
(
image_url
:
str
):
connector
=
MediaConnector
()
...
...
@@ -79,12 +78,12 @@ async def test_fetch_image_http(image_url: str):
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"image_url"
,
TEST_IMAGE_
URL
S
)
@
pytest
.
mark
.
parametrize
(
"
raw_
image_url"
,
TEST_IMAGE_
ASSET
S
)
@
pytest
.
mark
.
parametrize
(
"suffix"
,
get_supported_suffixes
())
async
def
test_fetch_image_base64
(
url_images
:
dict
[
str
,
Image
.
Image
],
image_url
:
str
,
suffix
:
str
):
raw_
image_url
:
str
,
suffix
:
str
):
connector
=
MediaConnector
()
url_image
=
url_images
[
image_url
]
url_image
=
url_images
[
raw_
image_url
]
try
:
mime_type
=
Image
.
MIME
[
Image
.
registered_extensions
()[
suffix
]]
...
...
@@ -117,7 +116,7 @@ async def test_fetch_image_base64(url_images: dict[str, Image.Image],
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"image_url"
,
TEST_IMAGE_
URLS
)
@
pytest
.
mark
.
parametrize
(
"image_url"
,
TEST_IMAGE_
ASSETS
,
indirect
=
True
)
async
def
test_fetch_image_local_files
(
image_url
:
str
):
connector
=
MediaConnector
()
...
...
@@ -152,8 +151,8 @@ async def test_fetch_image_local_files(image_url: str):
@
pytest
.
mark
.
asyncio
async
def
test_fetch_image_local_files_with_space_in_name
():
image_url
=
TEST_IMAGE_URLS
[
0
]
@
pytest
.
mark
.
parametrize
(
"image_url"
,
[
TEST_IMAGE_ASSETS
[
0
]],
indirect
=
True
)
async
def
test_fetch_image_local_files_with_space_in_name
(
image_url
:
str
):
connector
=
MediaConnector
()
with
TemporaryDirectory
()
as
temp_dir
:
...
...
@@ -205,6 +204,32 @@ async def test_fetch_video_http(video_url: str, num_frames: int):
assert
metadata_sync
==
metadata_async
@pytest.mark.asyncio
@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
@pytest.mark.parametrize("max_duration", [1, 60, 1800])
@pytest.mark.parametrize("requested_fps", [2, 24])
async def test_fetch_video_http_with_dynamic_loader(
    video_url: str,
    max_duration: int,
    requested_fps: int,
    monkeypatch: pytest.MonkeyPatch,
):
    """Fetch a video with the ``opencv_dynamic`` backend, sync and async.

    With ``VLLM_VIDEO_LOADER_BACKEND`` set to ``opencv_dynamic``, the
    synchronous and asynchronous fetches must return equal frames and
    equal metadata, and the metadata must report the dynamic backend.
    """
    video_io_kwargs = {
        "max_duration": max_duration,
        "requested_fps": requested_fps,
    }

    with monkeypatch.context() as patched_env:
        # Select the dynamic loader only for the duration of this test
        patched_env.setenv("VLLM_VIDEO_LOADER_BACKEND", "opencv_dynamic")
        connector = MediaConnector(
            media_io_kwargs={"video": video_io_kwargs})

        frames_sync, meta_sync = connector.fetch_video(video_url)
        frames_async, meta_async = await connector.fetch_video_async(
            video_url)

        assert np.array_equal(frames_sync, frames_async)
        assert meta_sync == meta_async
        assert meta_sync["video_backend"] == "opencv_dynamic"
# Used for `test_argsort_mm_positions`.
class
TestCase
(
NamedTuple
):
mm_positions
:
"MultiModalPlaceholderDict"
...
...
@@ -458,7 +483,7 @@ def run_dp_sharded_vision_model_vs_direct(local_rank: int, world_size: int,
with
torch
.
inference_mode
():
sharded_output
=
run_dp_sharded_vision_model
(
image_input
,
vision_model
)
# Check that the world size is setup correctly
# Check that the world size is set
up correctly
assert
get_tensor_model_parallel_world_size
()
==
world_size
# Check that the outputs have the same shape
...
...
@@ -636,11 +661,13 @@ def run_dp_sharded_mrope_vision_model_vs_direct(local_rank: int,
# Run the model through the sharded function
with
torch
.
inference_mode
():
sharded_output
=
run_dp_sharded_mrope_vision_model
(
vision_model
,
pixel_values
,
grid_thw_list
)
sharded_output
=
run_dp_sharded_mrope_vision_model
(
vision_model
,
pixel_values
,
grid_thw_list
,
rope_type
=
"rope_3d"
)
sharded_output
=
torch
.
cat
(
sharded_output
,
dim
=
0
)
# Check that the world size is setup correctly
# Check that the world size is set
up correctly
assert
get_tensor_model_parallel_world_size
()
==
world_size
# Compare outputs (only on rank 0)
...
...
@@ -691,8 +718,10 @@ def run_dp_sharded_mrope_vision_model_empty_input_worker(
# Should handle empty input gracefully
with
torch
.
inference_mode
():
output
=
run_dp_sharded_mrope_vision_model
(
vision_model
,
pixel_values
,
grid_thw_list
)
output
=
run_dp_sharded_mrope_vision_model
(
vision_model
,
pixel_values
,
grid_thw_list
,
rope_type
=
"rope_3d"
)
assert
len
(
output
)
==
0
...
...
@@ -745,8 +774,10 @@ def run_dp_sharded_mrope_vision_model_uneven_load_worker(
# Should handle uneven distribution without errors
with
torch
.
inference_mode
():
output_tuple
=
run_dp_sharded_mrope_vision_model
(
vision_model
,
pixel_values
,
grid_thw_list
)
output_tuple
=
run_dp_sharded_mrope_vision_model
(
vision_model
,
pixel_values
,
grid_thw_list
,
rope_type
=
"rope_3d"
)
# Verify output shape is reasonable
merge_factor
=
vision_model
.
spatial_merge_size
**
2
...
...
tests/neuron/1_core/test_activation.py
deleted
100644 → 0
View file @
33650733
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
pytest
import
torch
import
torch.nn.functional
as
F
from
vllm.model_executor.layers.activation
import
FastGELU
,
SiluAndMul
from
vllm.platforms
import
current_platform
@pytest.mark.parametrize("activation", ["silu_and_mul", "gelu_fast"])
@pytest.mark.parametrize("num_tokens,d,dtype", [
    (7, 512, torch.half),
    (7, 512, torch.float),
    (83, 512, torch.half),
])
@torch.inference_mode()
def test_act_and_mul(
    activation: str,
    num_tokens: int,
    d: int,
    dtype: torch.dtype,
) -> None:
    """Compare a layer's ``forward_neuron`` on XLA with a CPU reference.

    Args:
        activation: Which layer to test, ``"silu_and_mul"`` or
            ``"gelu_fast"``.
        num_tokens: Number of rows in the random input.
        d: Half of the input's last dimension (the input is
            ``num_tokens x 2 * d``).
        dtype: Dtype of the random input.

    Raises:
        NotImplementedError: If ``activation`` is not one of the two
            supported names.
    """
    import torch_xla.core.xla_model as xm

    xla_device = xm.xla_device()

    current_platform.seed_everything(0)
    torch.set_default_device("cpu")

    # The input is sampled on CPU (the default device set above) and then
    # moved to the XLA device.
    cpu_sample = torch.randn(num_tokens, 2 * d, dtype=dtype)
    x = cpu_sample.to(device=xla_device)

    if activation == "silu_and_mul":
        layer = SiluAndMul()
        reference_fn = layer.forward_native
    elif activation == "gelu_fast":
        layer = FastGELU()
        # NOTE(review): the reference here is plain F.gelu rather than the
        # layer's own native implementation; presumably the absolute
        # tolerance below is meant to absorb any difference - confirm.
        reference_fn = F.gelu
    else:
        raise NotImplementedError(
            f"activation {activation} is not implemented.")

    assert x.is_xla, "input tensor under testing is expected to be XLA tensor."

    layer_on_device = layer.to(device=xla_device)
    out = layer_on_device.forward_neuron(x)
    ref_out = reference_fn(x.cpu())

    torch.testing.assert_close(out.cpu(), ref_out, atol=0.01, rtol=0.0)
tests/neuron/1_core/test_block_table.py
deleted
100644 → 0
View file @
33650733
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
neuronxcc.nki.language
as
nl
import
pytest
import
torch
import
torch.nn.functional
as
F
from
neuronxcc
import
nki
from
vllm.attention.ops.nki_flash_attn
import
(
load_block_tables
,
transform_block_tables_for_indirect_load
)
def is_power_of_2(n):
    """Return True when ``n`` is a positive integer power of two."""
    if not n > 0:
        return False
    # A power of two has exactly one bit set, so clearing the lowest set
    # bit (n & (n - 1)) leaves zero.
    lowest_bit_cleared = n & (n - 1)
    return lowest_bit_cleared == 0
def nki_load_and_transform_block_tables(
    block_tables,
    num_tiles,
    num_blocks_per_tile,
    num_head,
    head_id,
    block_size_tiling_factor,
):
    """Load block tables, transform them, and copy the result out.

    Chains ``load_block_tables`` and
    ``transform_block_tables_for_indirect_load`` and stores the transformed
    tables row by row into a freshly allocated ``nl.int32`` array placed in
    ``nl.shared_hbm``, which is returned.

    ``num_blocks_per_tile`` must be a power of two, and the transformed
    tables must have a second dimension of 128; both are asserted.
    """
    assert is_power_of_2(
        num_blocks_per_tile), f"{num_blocks_per_tile=} must be power of 2"
    # NOTE(review): the "_sbuf" suffix suggests the loaded tables live in
    # on-chip SBUF - confirm against load_block_tables.
    block_tables_sbuf = load_block_tables(block_tables, num_tiles,
                                          num_blocks_per_tile)

    # we need to pass an Index as head_id
    head_id = nl.arange(1)[None, :] + head_id

    block_tables_transposed = transform_block_tables_for_indirect_load(
        block_tables_sbuf, block_size_tiling_factor, num_head, head_id)
    B_P_SIZE = 128
    assert block_tables_transposed.shape[1] == B_P_SIZE

    # Output buffer with the same shape as the transformed tables.
    out = nl.ndarray(
        block_tables_transposed.shape,
        dtype=nl.int32,
        buffer=nl.shared_hbm,
    )
    # Copy one leading-dimension slice per iteration into the output.
    for i in nl.affine_range(block_tables_transposed.shape[0]):
        nl.store(dst=out[i], value=block_tables_transposed[i])
    return out
def ref_block_tables_transform(
    block_tables,
    num_tiles,
    num_blocks_per_tile,
    num_head,
    head_id,
    block_size_tiling_factor,
):
    """Reference (pure torch) version of the block-table transform.

    Steps, in order:
      1. View the flat ``block_tables`` as ``(num_tiles, num_blocks_per_tile)``
         and zero-pad the tile dimension up to a multiple of 128.
      2. Map every entry ``b`` to ``b * num_head + head_id``.
      3. Expand every entry ``e`` into ``block_size_tiling_factor`` consecutive
         values ``e * block_size_tiling_factor + k``.
      4. Transpose so tiles become the last dimension and split the leading
         dimension into chunks of 128.

    Returns:
        Tensor of shape ``(rows // 128, 128, num_tiles_padded)`` where
        ``rows = num_blocks_per_tile * block_size_tiling_factor``.

    Raises:
        AssertionError: If ``block_tables`` does not hold exactly
            ``num_tiles * num_blocks_per_tile`` elements, or if ``rows`` is
            not a multiple of 128.
    """
    B_F_SIZE = 128

    expected_entries = num_tiles * num_blocks_per_tile
    assert block_tables.numel() == expected_entries
    table_2d = block_tables.view(num_tiles, num_blocks_per_tile)

    # Round the tile count up to the next multiple of B_F_SIZE and append
    # zero-filled rows to reach it.
    num_tiles_padded = (num_tiles + B_F_SIZE - 1) // B_F_SIZE * B_F_SIZE
    extra_rows = num_tiles_padded - num_tiles
    table_2d = F.pad(table_2d, (0, 0, 0, extra_rows),
                     mode="constant",
                     value=0)

    # Note the padding rows are included here, so they become `head_id`.
    head_indexed = table_2d * num_head + head_id
    head_indexed = head_indexed.view(num_tiles_padded, num_blocks_per_tile, 1)

    sub_block = torch.arange(0, block_size_tiling_factor).view(1, 1, -1)
    expanded = head_indexed * block_size_tiling_factor + sub_block

    transposed = expanded.view(num_tiles_padded, -1).t()
    rows = transposed.shape[0]
    assert rows % B_F_SIZE == 0
    return transposed.view(rows // B_F_SIZE, B_F_SIZE, num_tiles_padded)
@pytest.mark.parametrize(
    "q_head_per_kv_head,head_id",
    [
        (1, 0),
        (3, 1),
    ],
)
@pytest.mark.parametrize(
    "num_tiles,num_blocks_per_tile",
    [
        (1, 1),
        (13, 16),
        (17, 128),
        (35, 512),
        (128, 128),
        (130, 64),
        (280, 256),
        (315, 1),
    ],
)
@torch.inference_mode()
def test_load_and_transform_block_tables(
    monkeypatch: pytest.MonkeyPatch,
    num_tiles,
    num_blocks_per_tile,
    q_head_per_kv_head,
    head_id,
) -> None:
    """Check the NKI block-table kernel against the torch reference.

    Random int32 block tables are run through
    ``nki_load_and_transform_block_tables`` (launched via ``nki.jit`` on the
    XLA device) and through ``ref_block_tables_transform`` on CPU; shapes
    and every element must match.
    """
    import torch_xla.core.xla_model as xm

    device = xm.xla_device()

    compiler_flags_str = " ".join([
        "-O1",
        "--retry_failed_compilation",
    ])
    with monkeypatch.context() as m:
        # Compiler flags are passed through the environment for this test
        # only; monkeypatch restores the previous value on exit.
        m.setenv("NEURON_CC_FLAGS", compiler_flags_str)

        torch.manual_seed(10000)
        torch.set_printoptions(sci_mode=False)

        # On Neuron, we need B_P_SIZE = 128 blocks to make DMA efficient
        B_P_SIZE = 128
        if num_blocks_per_tile < B_P_SIZE:
            # Small tiles are expanded by an integer factor up to B_P_SIZE.
            assert B_P_SIZE % num_blocks_per_tile == 0
            block_size_tiling_factor = B_P_SIZE // num_blocks_per_tile
        else:
            block_size_tiling_factor = 1
        max_num_blocks = 100000
        block_tables = torch.randint(
            0,
            max_num_blocks,
            (num_tiles * num_blocks_per_tile, ),
            dtype=torch.int32,
        )
        # q_head_per_kv_head is passed in the kernel's `num_head` position.
        nki_out = nki.jit(nki_load_and_transform_block_tables)[1, 1](
            block_tables.to(device=device),
            num_tiles,
            num_blocks_per_tile,
            q_head_per_kv_head,
            head_id,
            block_size_tiling_factor,
        ).cpu()
        ref_out = ref_block_tables_transform(
            block_tables,
            num_tiles,
            num_blocks_per_tile,
            q_head_per_kv_head,
            head_id,
            block_size_tiling_factor,
        )
        assert (nki_out.shape == ref_out.shape
                ), f"{nki_out.shape=} != {ref_out.shape=}"
        assert torch.all(nki_out == ref_out)
tests/neuron/1_core/test_cache.py
deleted
100644 → 0
View file @
33650733
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
pytest
import
torch
from
vllm.attention.ops.nki_flash_attn
import
reshape_and_cache
@pytest.mark.parametrize(
    "num_tokens, n_kv_head, d_head, num_blocks, block_size",
    [
        # Small model configuration (e.g., GPT-2 small)
        (32, 12, 64, 4, 128),  # Typical sequence processing
        (1, 12, 64, 4, 128),  # Single token update
        (128, 12, 64, 4, 128),  # Longer sequence

        # Medium model configuration (e.g., GPT-2 medium)
        (64, 16, 96, 8, 256),  # Standard batch
        (256, 16, 96, 8, 256),  # Large batch

        # Large model configuration (e.g., GPT-3 style)
        (48, 32, 128, 16, 512),  # Typical processing window
        (512, 32, 128, 16, 512),  # Full context window

        # Edge cases and stress tests
        (1024, 8, 32, 32, 32),  # Many tokens, small heads
        (16, 64, 256, 4, 64),  # Few tokens, many heads
        (2048, 24, 128, 64, 128),  # Large scale test

        # Minimal configurations for debugging
        (4, 2, 16, 2, 16),  # Tiny test case
        (1, 1, 8, 1, 8),  # Minimal possible
    ])
def test_reshape_and_cache(num_tokens, n_kv_head, d_head, num_blocks,
                           block_size):
    """Compare ``reshape_and_cache`` on XLA with a per-token CPU reference.

    The reference writes each token's key/value into caches shaped
    ``(num_blocks, n_kv_head, block_size, d_head)`` at the block and
    in-block offset derived from its slot index. The XLA path runs
    ``reshape_and_cache`` on a stacked ``[key_cache, value_cache]`` tensor
    and the two results must agree within 1e-5.
    """
    # Set random seed for reproducibility
    torch.manual_seed(42)

    # Create CPU tensors for reference implementation
    key_cpu = torch.randn(num_tokens, n_kv_head, d_head) / torch.sqrt(
        torch.tensor(d_head))
    value_cpu = torch.randn(num_tokens, n_kv_head, d_head) / torch.sqrt(
        torch.tensor(d_head))

    key_cache_cpu = torch.zeros(num_blocks, n_kv_head, block_size, d_head)
    value_cache_cpu = torch.zeros(num_blocks, n_kv_head, block_size, d_head)
    # A prefix of a permutation gives num_tokens distinct slots.
    slot_mapping_cpu = torch.randperm(num_blocks * block_size)[:num_tokens]

    # Run reference implementation on CPU
    # slot = block_index * block_size + offset_in_block
    block_indices = torch.div(slot_mapping_cpu,
                              block_size,
                              rounding_mode="floor")
    block_offsets = slot_mapping_cpu % block_size

    for i in range(num_tokens):
        block_idx = block_indices[i]
        block_offset = block_offsets[i]
        key_cache_cpu[block_idx, :, block_offset, :] = key_cpu[i]
        value_cache_cpu[block_idx, :, block_offset, :] = value_cpu[i]

    # Create XLA device tensors
    device = torch.device('xla')
    key = key_cpu.to(device)
    value = value_cpu.to(device)
    key_cache = torch.zeros_like(key_cache_cpu, device=device)
    value_cache = torch.zeros_like(value_cache_cpu, device=device)
    slot_mapping = slot_mapping_cpu.to(device)
    # Keys and values share one tensor, stacked along a new leading dim.
    kv_cache = torch.stack([key_cache, value_cache])

    # Run vectorized implementation on XLA device
    reshape_and_cache(key, value, kv_cache, slot_mapping)
    key_cache, value_cache = torch.unbind(kv_cache, dim=0)

    # Move results back to CPU for comparison
    key_cache_result = key_cache.cpu()
    value_cache_result = value_cache.cpu()

    # Assert results match
    torch.testing.assert_close(key_cache_result,
                               key_cache_cpu,
                               rtol=1e-5,
                               atol=1e-5)
    torch.testing.assert_close(value_cache_result,
                               value_cache_cpu,
                               rtol=1e-5,
                               atol=1e-5)
tests/neuron/1_core/test_layernorm.py
deleted
100644 → 0
View file @
33650733
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
pytest
import
torch
from
vllm.model_executor.layers.layernorm
import
RMSNorm
from
vllm.platforms
import
current_platform
@pytest.mark.parametrize("num_tokens,hidden_size,add_residual,dtype", [
    (7, 8, False, torch.half),
    (83, 768, False, torch.half),
    (83, 768, True, torch.half),
    (83, 768, True, torch.bfloat16),
    (83, 768, True, torch.float32),
])
@torch.inference_mode()
def test_rms_norm(
    num_tokens: int,
    hidden_size: int,
    add_residual: bool,
    dtype: torch.dtype,
) -> None:
    """Compare ``RMSNorm`` called on XLA with ``forward_native`` on CPU.

    Args:
        num_tokens: Number of rows in the random input.
        hidden_size: Size of the normalised dimension.
        add_residual: Whether a residual tensor is passed to the layer; when
            set, both elements of the layer's output are compared.
        dtype: Dtype of the layer and of the input.
    """
    import torch_xla.core.xla_model as xm

    xla_device = xm.xla_device()
    current_platform.seed_everything(0)
    torch.set_default_device("cpu")

    layer = RMSNorm(hidden_size).to(dtype=dtype)
    layer.weight.data.normal_(mean=1.0, std=0.1)

    scale = 1 / (2 * hidden_size)
    x = torch.randn(num_tokens, hidden_size, dtype=dtype).to(device=xla_device)
    x *= scale

    if add_residual:
        residual = torch.randn_like(x) * scale
        residual_cpu = residual.cpu()
    else:
        residual = None
        residual_cpu = None

    # Reference first (layer on CPU), then the layer is moved to XLA.
    ref_out = layer.to(device="cpu").forward_native(x.cpu(), residual_cpu)
    assert x.is_xla, "input tensor under testing is expected to be XLA tensor."
    out = layer.to(device=xla_device)(x, residual)

    # NOTE(woosuk): LayerNorm operators (including RMS) typically have larger
    # numerical errors than other operators because they involve reductions.
    # Therefore, we use a larger tolerance.
    tolerances = {"atol": 1e-2, "rtol": 1e-2}
    if not add_residual:
        assert out.is_xla, "output tensor is expected to be XLA tensor"
        torch.testing.assert_close(out.cpu(), ref_out, **tolerances)
        return

    assert out[0].is_xla, "output tensor is expected to be XLA tensor"
    torch.testing.assert_close(out[0].cpu(), ref_out[0], **tolerances)
    torch.testing.assert_close(out[1].cpu(), ref_out[1], **tolerances)
tests/neuron/1_core/test_logits_processor.py
deleted
100644 → 0
View file @
33650733
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
random
from
unittest.mock
import
patch
import
pytest
import
torch
from
vllm.model_executor.layers.logits_processor
import
LogitsProcessor
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.model_executor.utils
import
set_random_seed
from
vllm.sequence
import
SamplingParams
,
SequenceData
,
SequenceGroupMetadata
from
vllm.utils
import
is_pin_memory_available
class MockLogitsProcessor(LogitsProcessor):
    """``LogitsProcessor`` whose computed logits are replaced by a fixed tensor.

    During ``forward`` two names in the logits-processor module are patched:
    ``_prune_hidden_states`` returns its first argument unchanged and
    ``LogitsProcessor._get_logits`` returns the stored ``fake_logits``.
    """

    def __init__(self, vocab_size: int, scale: float,
                 fake_logits: torch.Tensor):
        super().__init__(vocab_size=vocab_size, scale=scale)
        # Store a clone so the processor holds its own copy of the tensor.
        self.fake_logits = fake_logits.clone()

    def forward(self, *args, **kwargs):
        """Run the parent ``forward`` with the two patches active."""

        def passthrough(x, y):
            return x

        def canned_logits(*args, **kwargs):
            return self.fake_logits

        with patch(
                "vllm.model_executor.layers.logits_processor._prune_hidden_states",
                passthrough):
            with patch(
                    "vllm.model_executor.layers.logits_processor.LogitsProcessor._get_logits",
                    canned_logits):
                return super().forward(*args, **kwargs)
def _prepare_test(
        batch_size: int
) -> tuple[torch.Tensor, torch.Tensor, MockLogitsProcessor]:
    """Build the inputs shared by the logits-processor tests.

    Returns:
        A tuple ``(input_tensor, fake_logits, logits_processor)``:
        a random float16 tensor of shape ``(batch_size, 1024)``, a
        ``(batch_size, 32000)`` tensor filled with ``1e-2`` in the same
        dtype, and a ``MockLogitsProcessor`` (scale ``0.5``) built from
        those logits.
    """
    vocab_size = 32000
    hidden_shape = (batch_size, 1024)
    logits_shape = (batch_size, vocab_size)

    input_tensor = torch.rand(hidden_shape, dtype=torch.float16)
    fake_logits = torch.full(logits_shape, 1e-2, dtype=input_tensor.dtype)
    logits_processor = MockLogitsProcessor(vocab_size, 0.5, fake_logits)
    return input_tensor, fake_logits, logits_processor
# Seeds 0..7; test_logits_processors is parametrized over these.
RANDOM_SEEDS = list(range(8))
@pytest.mark.parametrize("seed", RANDOM_SEEDS)
def test_logits_processors(seed: int):
    """Run the mocked logits processor with a per-request logits processor.

    A batch of random size (1..256) is built in which every request carries
    ``pick_ith`` as its logits processor. The test then checks that column 1
    of the output equals column 1 of the fake logits scaled by the
    processor's ``scale``.
    """
    import torch_xla.core.xla_model as xm

    device = xm.xla_device()
    set_random_seed(seed)
    torch.set_default_device("cpu")
    batch_size = random.randint(1, 256)
    input_tensor, fake_logits, logits_processor = _prepare_test(batch_size)

    # This sample logits processor gives infinite score to the i-th token,
    # where i is the length of the input sequence.
    # We therefore expect the output token sequence to be [0, 1, 2, ...]
    def pick_ith(token_ids, logits):
        logits[len(token_ids)] = float("inf")
        return logits

    seq_group_metadata_list = []
    seq_lens = []
    for i in range(batch_size):
        seq_group_metadata_list.append(
            SequenceGroupMetadata(
                request_id=f"test_{i}",
                is_prompt=True,
                seq_data={0: SequenceData.from_seqs([1, 2, 3])},
                sampling_params=SamplingParams(temperature=0,
                                               logits_processors=[pick_ith]),
                block_tables={0: [1]},
            ))
        # Record the length of the sequence that was just appended.
        seq_lens.append(seq_group_metadata_list[-1].seq_data[0].get_len())

    sampling_metadata = SamplingMetadata.prepare(
        seq_group_metadata_list,
        seq_lens,
        query_lens=seq_lens,
        device=device,
        pin_memory=is_pin_memory_available())
    logits_processor_output = logits_processor(
        lm_head=None,
        hidden_states=input_tensor,
        sampling_metadata=sampling_metadata)

    # Expected values: the fake logits multiplied by the processor's scale.
    fake_logits *= logits_processor.scale
    # Only column 1 is compared.
    torch.testing.assert_close(logits_processor_output[:, 1],
                               fake_logits[:, 1],
                               rtol=1e-4,
                               atol=0.0)
deleted
100644 → 0
View file @
33650733
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
os
from
unittest.mock
import
MagicMock
from
vllm.config
import
VllmConfig
from
vllm.engine.arg_utils
import
EngineArgs
from
vllm.platforms
import
current_platform
from
vllm.platforms.neuron
import
NeuronFramework
from
vllm.sampling_params
import
SamplingParams
from
vllm.sequence
import
SequenceData
,
SequenceGroupMetadata
from
vllm.worker.neuron_model_runner
import
NeuronModelRunner
# Set at import time, for every test in this module: point
# VLLM_NEURON_FRAMEWORK at the transformers-neuronx framework value.
# NOTE(review): this is a process-wide change that is never undone - confirm
# no other test module relies on a different value.
os.environ[
    'VLLM_NEURON_FRAMEWORK'] = NeuronFramework.TRANSFORMERS_NEURONX.value
def _create_neuron_model_runner(model: str, *args,
                                **kwargs) -> NeuronModelRunner:
    """Build a ``NeuronModelRunner`` for ``model``.

    ``model`` and any extra positional/keyword arguments are forwarded to
    ``EngineArgs``. The resulting engine config supplies the model,
    parallel, scheduler and device configs of the ``VllmConfig`` handed to
    the runner.
    """
    engine_config = EngineArgs(model, *args, **kwargs).create_engine_config()

    forwarded_configs = {
        "model_config": engine_config.model_config,
        "parallel_config": engine_config.parallel_config,
        "scheduler_config": engine_config.scheduler_config,
        "device_config": engine_config.device_config,
    }
    vllm_config = VllmConfig(**forwarded_configs)
    return NeuronModelRunner(vllm_config=vllm_config)
def test_update_neuron_sampling_params_not_full_batch():
    """Sampling params are updated correctly when the batch is not full.

    The runner is created with ``max_num_seqs=2`` but only one sequence is
    scheduled, so one slot keeps the default sampling parameters.
    """
    # NOTE(review): the environment variable is set directly and not restored
    # afterwards - confirm this leak into later tests is acceptable.
    os.environ["NEURON_ON_DEVICE_SAMPLING_DISABLED"] = "0"
    model_runner = _create_neuron_model_runner(
        "facebook/opt-125m",
        seed=0,
        dtype="float16",
        max_num_seqs=2,
    )
    assert not model_runner._on_device_sampling_disabled
    # Test sampling param updating only when TNx is framework
    # NxDI handles sampling parameter updating inside model
    if current_platform.use_transformers_neuronx():
        # Replace the model with a mock so the call can be inspected below.
        model_mock = MagicMock()
        model_runner.model = model_mock

        seq_group_metadata_list = [
            SequenceGroupMetadata(
                request_id="test_0",
                is_prompt=True,
                seq_data={0: SequenceData.from_seqs([1, 2, 3])},
                sampling_params=SamplingParams(temperature=0.5,
                                               top_k=1,
                                               top_p=0.5),
                block_tables={0: [1]},
            )
        ]

        model_runner.prepare_model_input(seq_group_metadata_list)

        # Index neuron sampling parameters based on block_tables indices.
        # The first block_id of the sequence 0 is 1, so its parameters are
        # placed at index 1. So the sampling parameters will be:
        # Index 0: default sampling parameters
        # Index 1: sequence 0's sampling parameters.
        neuron_sampling_params = (
            model_runner.model_config.neuron_sampling_params)
        assert neuron_sampling_params.temperature == [1.0, 0.5]
        assert neuron_sampling_params.top_k == [
            model_runner._MAX_NEURON_SAMPLING_TOP_K, 1
        ]
        assert neuron_sampling_params.top_p == [1.0, 0.5]
        model_mock.model.update_generation_config.assert_called_once_with(
            neuron_sampling_params)
def test_update_neuron_sampling_params_full_batch():
    """Sampling params are updated correctly when the batch is full.

    The runner is created with ``max_num_seqs=2`` and two sequences are
    scheduled, so both slots carry request-specific sampling parameters.
    """
    # NOTE(review): the environment variable is set directly and not restored
    # afterwards - confirm this leak into later tests is acceptable.
    os.environ["NEURON_ON_DEVICE_SAMPLING_DISABLED"] = "0"
    model_runner = _create_neuron_model_runner(
        "facebook/opt-125m",
        seed=0,
        dtype="float16",
        max_num_seqs=2,
    )
    assert not model_runner._on_device_sampling_disabled

    # Test sampling param updating only when TNx is framework
    # NxDI handles sampling parameter updating inside model
    if current_platform.use_transformers_neuronx():
        # Replace the model with a mock so the call can be inspected below.
        model_mock = MagicMock()
        model_runner.model = model_mock

        # NOTE(review): both entries use request_id "test_0" - presumably the
        # second was meant to be distinct; confirm whether it matters here.
        seq_group_metadata_list = [
            SequenceGroupMetadata(
                request_id="test_0",
                is_prompt=True,
                seq_data={0: SequenceData.from_seqs([1, 2, 3])},
                sampling_params=SamplingParams(temperature=0.5,
                                               top_k=1,
                                               top_p=0.5),
                block_tables={0: [1]},
            ),
            SequenceGroupMetadata(
                request_id="test_0",
                is_prompt=True,
                seq_data={1: SequenceData.from_seqs([4, 5, 6])},
                sampling_params=SamplingParams(temperature=0.2,
                                               top_k=2,
                                               top_p=0.2),
                block_tables={1: [0]},
            )
        ]

        model_runner.prepare_model_input(seq_group_metadata_list)

        # Index neuron sampling parameters based on block_tables indices.
        # The first block_id of the sequence 0 is 1, so its parameters are
        # placed at index 1. So the sampling parameters will be:
        # Index 0: sequence 1's sampling parameters
        # Index 1: sequence 0's sampling parameters.
        neuron_sampling_params = (
            model_runner.model_config.neuron_sampling_params)
        assert neuron_sampling_params.temperature == [0.2, 0.5]
        assert neuron_sampling_params.top_k == [2, 1]
        assert neuron_sampling_params.top_p == [0.2, 0.5]
        model_mock.model.update_generation_config.assert_called_once_with(
            neuron_sampling_params)
Prev
1
…
11
12
13
14
15
16
17
18
19
…
28
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment