Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
c721b814
Commit
c721b814
authored
Feb 05, 2026
by
zhuwenwen
Browse files
sync v0.15.1
parent
d53fe7e5
Changes
328
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
78 additions
and
628 deletions
+78
-628
tests/kernels/attention/test_flashmla.py
tests/kernels/attention/test_flashmla.py
+12
-10
tests/kernels/moe/modular_kernel_tools/common.py
tests/kernels/moe/modular_kernel_tools/common.py
+4
-7
tests/kernels/moe/modular_kernel_tools/mk_objects.py
tests/kernels/moe/modular_kernel_tools/mk_objects.py
+25
-3
tests/kernels/moe/test_flashinfer.py
tests/kernels/moe/test_flashinfer.py
+6
-1
tests/kernels/moe/test_flashinfer_moe.py
tests/kernels/moe/test_flashinfer_moe.py
+6
-1
tests/kernels/moe/test_nvfp4_moe.py
tests/kernels/moe/test_nvfp4_moe.py
+1
-1
tests/models/language/pooling/test_token_classification.py
tests/models/language/pooling/test_token_classification.py
+0
-25
tests/models/multimodal/generation/test_common.py
tests/models/multimodal/generation/test_common.py
+0
-29
tests/models/multimodal/generation/test_vit_backend_functionality.py
...s/multimodal/generation/test_vit_backend_functionality.py
+0
-13
tests/models/multimodal/processing/test_common.py
tests/models/multimodal/processing/test_common.py
+0
-1
tests/models/registry.py
tests/models/registry.py
+12
-22
tests/models/test_initialization.py
tests/models/test_initialization.py
+1
-0
tests/models/test_transformers.py
tests/models/test_transformers.py
+1
-1
tests/plugins/lora_resolvers/test_hf_hub_resolver.py
tests/plugins/lora_resolvers/test_hf_hub_resolver.py
+0
-107
tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py
...dd_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py
+1
-1
tests/test_access_log_filter.py
tests/test_access_log_filter.py
+0
-371
tests/v1/e2e/test_spec_decode.py
tests/v1/e2e/test_spec_decode.py
+1
-1
tests/v1/kv_connector/unit/utils.py
tests/v1/kv_connector/unit/utils.py
+0
-7
tests/v1/spec_decode/test_eagle.py
tests/v1/spec_decode/test_eagle.py
+0
-2
vllm/_custom_ops.py
vllm/_custom_ops.py
+8
-25
No files found.
tests/kernels/attention/test_flashmla.py
View file @
c721b814
...
...
@@ -104,16 +104,18 @@ def test_flash_mla(
descale_k
=
None
def
flash_mla
():
return
flash_mla_with_kvcache
(
q
,
blocked_k
,
block_table
,
cache_seqlens
,
dv
,
tile_scheduler_metadata
,
num_splits
,
causal
=
causal
,
descale_q
=
descale_q
,
descale_k
=
descale_k
)
return
flash_mla_with_kvcache
(
q
,
blocked_k
,
block_table
,
cache_seqlens
,
dv
,
tile_scheduler_metadata
,
num_splits
,
causal
=
causal
,
descale_q
=
descale_q
,
descale_k
=
descale_k
,
)
def
scaled_dot_product_attention
(
query
,
key
,
value
,
is_causal
=
False
):
query
=
query
.
float
()
...
...
tests/kernels/moe/modular_kernel_tools/common.py
View file @
c721b814
...
...
@@ -22,9 +22,6 @@ from vllm.distributed import (
)
from
vllm.forward_context
import
set_forward_context
from
vllm.model_executor.layers.fused_moe
import
fused_topk
from
vllm.model_executor.layers.fused_moe.all2all_utils
import
(
maybe_make_prepare_finalize
,
)
from
vllm.model_executor.layers.fused_moe.config
import
(
FusedMoEConfig
,
FusedMoEParallelConfig
,
...
...
@@ -43,6 +40,7 @@ from .mk_objects import (
TestMoEQuantConfig
,
expert_info
,
make_fused_experts
,
make_prepare_finalize
,
prepare_finalize_info
,
)
from
.parallel_utils
import
ProcessGroupInfo
...
...
@@ -605,10 +603,9 @@ def make_modular_kernel(
routing_method
=
RoutingMethodType
.
DeepSeekV3
,
)
prepare_finalize
=
maybe_make_prepare_finalize
(
moe
=
moe
,
quant_config
=
quant_config
,
allow_new_interface
=
True
,
# make modular kernel
prepare_finalize
=
make_prepare_finalize
(
config
.
prepare_finalize_type
,
config
.
all2all_backend
(),
moe
,
quant_config
)
assert
prepare_finalize
is
not
None
...
...
tests/kernels/moe/modular_kernel_tools/mk_objects.py
View file @
c721b814
...
...
@@ -7,6 +7,9 @@ import torch
# Fused experts and PrepareFinalize imports
import
vllm.model_executor.layers.fused_moe.modular_kernel
as
mk
from
vllm.model_executor.layers.fused_moe
import
TritonExperts
from
vllm.model_executor.layers.fused_moe.all2all_utils
import
(
maybe_make_prepare_finalize
,
)
from
vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe
import
(
BatchedDeepGemmExperts
,
)
...
...
@@ -252,12 +255,13 @@ if has_pplx():
)
if
has_flashinfer_cutlass_fused_moe
()
and
current_platform
.
has_device_capability
(
100
):
from
vllm.model_executor.layers.fused_moe.flashinfer_a2a_prepare_finalize
import
(
# noqa: E501
FlashInferCutlassMoEPrepareAndFinalize
,
)
from
vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe
import
(
FlashInferExperts
,
)
from
vllm.model_executor.layers.fused_moe.flashinfer_cutlass_prepare_finalize
import
(
# noqa: E501
FlashInferCutlassMoEPrepareAndFinalize
,
create_flashinfer_prepare_finalize
,
)
register_prepare_and_finalize
(
FlashInferCutlassMoEPrepareAndFinalize
,
...
...
@@ -425,6 +429,24 @@ if cutlass_fp4_supported() or has_flashinfer_cutlass_fused_moe():
]
def
make_prepare_finalize
(
prepare_finalize_type
:
mk
.
FusedMoEPrepareAndFinalize
,
backend
:
str
|
None
,
moe
:
FusedMoEConfig
,
quant_config
:
FusedMoEQuantConfig
,
)
->
mk
.
FusedMoEPrepareAndFinalize
:
if
backend
!=
"naive"
and
backend
is
not
None
:
prepare_finalize
=
maybe_make_prepare_finalize
(
moe
,
quant_config
)
assert
prepare_finalize
is
not
None
return
prepare_finalize
elif
prepare_finalize_type
==
FlashInferCutlassMoEPrepareAndFinalize
:
return
create_flashinfer_prepare_finalize
(
use_dp
=
moe
.
moe_parallel_config
.
dp_size
>
1
)
else
:
return
MoEPrepareAndFinalizeNoEP
()
def
_slice
(
rank
:
int
,
num_local_experts
:
int
,
t
:
torch
.
Tensor
)
->
torch
.
Tensor
:
s
=
rank
*
num_local_experts
e
=
s
+
num_local_experts
...
...
tests/kernels/moe/test_flashinfer.py
View file @
c721b814
...
...
@@ -294,7 +294,12 @@ def test_flashinfer_cutlass_moe_fp8_no_graph(
)
kernel
=
mk
.
FusedMoEModularKernel
(
MoEPrepareAndFinalizeNoEP
(),
MoEPrepareAndFinalizeNoEP
(
defer_input_quant
=
FlashInferExperts
.
expects_unquantized_inputs
(
moe_config
=
moe_config
,
quant_config
=
quant_config
,
)
),
FlashInferExperts
(
moe_config
=
moe_config
,
quant_config
=
quant_config
,
...
...
tests/kernels/moe/test_flashinfer_moe.py
View file @
c721b814
...
...
@@ -106,7 +106,12 @@ def test_flashinfer_fp4_moe_no_graph(
)
flashinfer_experts
=
FusedMoEModularKernel
(
MoEPrepareAndFinalizeNoEP
(),
MoEPrepareAndFinalizeNoEP
(
defer_input_quant
=
FlashInferExperts
.
expects_unquantized_inputs
(
moe_config
=
moe_config
,
quant_config
=
quant_config
,
)
),
FlashInferExperts
(
moe_config
=
moe_config
,
quant_config
=
quant_config
),
)
...
...
tests/kernels/moe/test_nvfp4_moe.py
View file @
c721b814
...
...
@@ -90,7 +90,7 @@ def test_cutlass_fp4_moe_no_graph(
)
kernel
=
mk
.
FusedMoEModularKernel
(
MoEPrepareAndFinalizeNoEP
(),
MoEPrepareAndFinalizeNoEP
(
defer_input_quant
=
True
),
CutlassExpertsFp4
(
moe_config
=
make_dummy_moe_config
(),
quant_config
=
quant_config
,
...
...
tests/models/language/pooling/test_token_classification.py
View file @
c721b814
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
random
import
numpy
as
np
import
pytest
import
torch
from
transformers
import
AutoModelForTokenClassification
...
...
@@ -11,20 +9,6 @@ from tests.models.utils import softmax
from
vllm.platforms
import
current_platform
@
pytest
.
fixture
(
autouse
=
True
)
def
seed_everything
():
"""Seed all random number generators for reproducibility."""
seed
=
0
random
.
seed
(
seed
)
np
.
random
.
seed
(
seed
)
torch
.
manual_seed
(
seed
)
if
torch
.
cuda
.
is_available
():
torch
.
cuda
.
manual_seed_all
(
seed
)
torch
.
backends
.
cudnn
.
deterministic
=
True
torch
.
backends
.
cudnn
.
benchmark
=
False
yield
@
pytest
.
mark
.
parametrize
(
"model"
,
[
"boltuix/NeuroBERT-NER"
])
# The float32 is required for this tiny model to pass the test.
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"float"
])
...
...
@@ -68,7 +52,6 @@ def test_bert_models(
@
pytest
.
mark
.
parametrize
(
"model"
,
[
"disham993/electrical-ner-ModernBERT-base"
])
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"float"
])
@
pytest
.
mark
.
flaky
(
reruns
=
3
)
@
torch
.
inference_mode
def
test_modernbert_models
(
hf_runner
,
...
...
@@ -77,14 +60,6 @@ def test_modernbert_models(
model
:
str
,
dtype
:
str
,
)
->
None
:
# NOTE: https://github.com/vllm-project/vllm/pull/32403
# `disham993/electrical-ner-ModernBERT-base` is a randomly initialized
# model, which can cause numerical precision variance and edge cases.
# We use @flaky(reruns=3) to mitigate intermittent failures.
print
(
f
"
\n
[NOTE] Testing
{
model
}
(randomly initialized weights) - "
"flaky tolerance enabled due to numerical precision variance."
)
with
vllm_runner
(
model
,
max_model_len
=
None
,
dtype
=
dtype
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
token_classify
(
example_prompts
)
...
...
tests/models/multimodal/generation/test_common.py
View file @
c721b814
...
...
@@ -458,20 +458,6 @@ VLM_TEST_SETTINGS = {
],
marks
=
[
large_gpu_mark
(
min_gb
=
32
)],
),
"glm_ocr"
:
VLMTestInfo
(
models
=
[
"zai-org/GLM-OCR"
],
test_type
=
(
VLMTestType
.
IMAGE
,
VLMTestType
.
MULTI_IMAGE
),
prompt_formatter
=
lambda
img_prompt
:
f
"[gMASK]<|user|>
\n
{
img_prompt
}
<|assistant|>
\n
"
,
# noqa: E501
img_idx_to_prompt
=
lambda
idx
:
"<|begin_of_image|><|image|><|end_of_image|>"
,
video_idx_to_prompt
=
lambda
idx
:
"<|begin_of_video|><|video|><|end_of_video|>"
,
max_model_len
=
2048
,
max_num_seqs
=
2
,
get_stop_token_ids
=
lambda
tok
:
[
151329
,
151336
,
151338
],
num_logprobs
=
10
,
image_size_factors
=
[(),
(
0.25
,),
(
0.25
,
0.25
,
0.25
),
(
0.25
,
0.2
,
0.15
)],
auto_cls
=
AutoModelForImageTextToText
,
marks
=
[
large_gpu_mark
(
min_gb
=
32
)],
),
"h2ovl"
:
VLMTestInfo
(
models
=
[
"h2oai/h2ovl-mississippi-800m"
,
...
...
@@ -587,21 +573,6 @@ VLM_TEST_SETTINGS = {
vllm_output_post_proc
=
model_utils
.
kimiv_vl_vllm_to_hf_output
,
marks
=
[
large_gpu_mark
(
min_gb
=
48
)],
),
"llama4"
:
VLMTestInfo
(
models
=
[
"meta-llama/Llama-4-Scout-17B-16E-Instruct"
],
prompt_formatter
=
lambda
img_prompt
:
f
"<|begin_of_text|><|header_start|>user<|header_end|>
\n\n
{
img_prompt
}
<|eot|><|header_start|>assistant<|header_end|>
\n\n
"
,
# noqa: E501
img_idx_to_prompt
=
lambda
_
:
"<|image|>"
,
test_type
=
(
VLMTestType
.
IMAGE
,
VLMTestType
.
MULTI_IMAGE
),
distributed_executor_backend
=
"mp"
,
image_size_factors
=
[(
0.25
,
0.5
,
1.0
)],
hf_model_kwargs
=
{
"device_map"
:
"auto"
},
max_model_len
=
8192
,
max_num_seqs
=
4
,
dtype
=
"bfloat16"
,
auto_cls
=
AutoModelForImageTextToText
,
tensor_parallel_size
=
4
,
marks
=
multi_gpu_marks
(
num_gpus
=
4
),
),
"llava_next"
:
VLMTestInfo
(
models
=
[
"llava-hf/llava-v1.6-mistral-7b-hf"
],
test_type
=
(
VLMTestType
.
IMAGE
,
VLMTestType
.
CUSTOM_INPUTS
),
...
...
tests/models/multimodal/generation/test_vit_backend_functionality.py
View file @
c721b814
...
...
@@ -91,19 +91,6 @@ MODEL_CONFIGS: dict[str, dict[str, Any]] = {
"use_processor"
:
True
,
"question"
:
"What is the content of each image?"
,
},
"glm_ocr"
:
{
"model_name"
:
"zai-org/GLM-OCR"
,
"interface"
:
"llm_generate"
,
"max_model_len"
:
131072
,
"max_num_seqs"
:
2
,
"sampling_params"
:
{
"temperature"
:
0.0
,
"max_tokens"
:
256
,
"stop_token_ids"
:
None
,
},
"use_processor"
:
True
,
"question"
:
"Text Recognition:"
,
},
"keye_vl"
:
{
"model_name"
:
"Kwai-Keye/Keye-VL-8B-Preview"
,
"interface"
:
"llm_generate"
,
...
...
tests/models/multimodal/processing/test_common.py
View file @
c721b814
...
...
@@ -122,7 +122,6 @@ MM_DATA_PATCHES = {
"ernie4_5_moe_vl"
:
qwen3_vl_patch_mm_data
,
"glm4v"
:
glm4_1v_patch_mm_data
,
"glm4v_moe"
:
glm4_1v_patch_mm_data
,
"glm_ocr"
:
glm4_1v_patch_mm_data
,
"glmasr"
:
glmasr_patch_mm_data
,
"molmo2"
:
qwen3_vl_patch_mm_data
,
"qwen3_vl"
:
qwen3_vl_patch_mm_data
,
...
...
tests/models/registry.py
View file @
c721b814
...
...
@@ -256,7 +256,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
),
"Exaone4ForCausalLM"
:
_HfExamplesInfo
(
"LGAI-EXAONE/EXAONE-4.0-32B"
),
"ExaoneMoEForCausalLM"
:
_HfExamplesInfo
(
"LGAI-EXAONE/K-EXAONE-236B-A23B"
,
min_transformers_version
=
"5.
1
.0"
"LGAI-EXAONE/K-EXAONE-236B-A23B"
,
min_transformers_version
=
"5.
0
.0"
),
"Fairseq2LlamaForCausalLM"
:
_HfExamplesInfo
(
"mgleize/fairseq2-dummy-Llama-3.2-1B"
),
"FalconForCausalLM"
:
_HfExamplesInfo
(
"tiiuae/falcon-7b"
),
...
...
@@ -273,7 +273,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
"Glm4MoeForCausalLM"
:
_HfExamplesInfo
(
"zai-org/GLM-4.5"
),
"Glm4MoeLiteForCausalLM"
:
_HfExamplesInfo
(
"zai-org/GLM-4.7-Flash"
,
min_transformers_version
=
"5.0.0"
,
min_transformers_version
=
"5.0.0.dev"
,
is_available_online
=
False
,
),
"GPT2LMHeadModel"
:
_HfExamplesInfo
(
"openai-community/gpt2"
,
{
"alias"
:
"gpt2"
}),
"GPTBigCodeForCausalLM"
:
_HfExamplesInfo
(
...
...
@@ -653,7 +654,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
# [Decoder-only]
"AriaForConditionalGeneration"
:
_HfExamplesInfo
(
"rhymes-ai/Aria"
),
"AudioFlamingo3ForConditionalGeneration"
:
_HfExamplesInfo
(
"nvidia/audio-flamingo-3-hf"
,
min_transformers_version
=
"5.0.0"
"nvidia/audio-flamingo-3-hf"
,
min_transformers_version
=
"5.0.0
.dev
"
),
"AyaVisionForConditionalGeneration"
:
_HfExamplesInfo
(
"CohereLabs/aya-vision-8b"
),
"BagelForConditionalGeneration"
:
_HfExamplesInfo
(
"ByteDance-Seed/BAGEL-7B-MoT"
),
...
...
@@ -696,7 +697,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"GlmAsrForConditionalGeneration"
:
_HfExamplesInfo
(
"zai-org/GLM-ASR-Nano-2512"
,
trust_remote_code
=
True
,
min_transformers_version
=
"5.0
.0
"
,
min_transformers_version
=
"5.0"
,
),
"GraniteVision"
:
_HfExamplesInfo
(
"ibm-granite/granite-vision-3.3-2b"
),
"GraniteSpeechForConditionalGeneration"
:
_HfExamplesInfo
(
...
...
@@ -709,11 +710,6 @@ _MULTIMODAL_EXAMPLE_MODELS = {
),
"Glm4vForConditionalGeneration"
:
_HfExamplesInfo
(
"zai-org/GLM-4.1V-9B-Thinking"
),
"Glm4vMoeForConditionalGeneration"
:
_HfExamplesInfo
(
"zai-org/GLM-4.5V"
),
"GlmOcrForConditionalGeneration"
:
_HfExamplesInfo
(
"zai-org/GLM-OCR"
,
is_available_online
=
False
,
min_transformers_version
=
"5.1.0"
,
),
"H2OVLChatModel"
:
_HfExamplesInfo
(
"h2oai/h2ovl-mississippi-800m"
,
trust_remote_code
=
True
,
...
...
@@ -1056,7 +1052,7 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = {
"ExaoneMoeMTP"
:
_HfExamplesInfo
(
"LGAI-EXAONE/K-EXAONE-236B-A23B"
,
speculative_model
=
"LGAI-EXAONE/K-EXAONE-236B-A23B"
,
min_transformers_version
=
"5.
1
.0"
,
min_transformers_version
=
"5.
0
.0"
,
),
"Glm4MoeMTPModel"
:
_HfExamplesInfo
(
"zai-org/GLM-4.5"
,
...
...
@@ -1067,12 +1063,6 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = {
speculative_model
=
"zai-org/GLM-4.7-Flash"
,
min_transformers_version
=
"5.0.0"
,
),
"GlmOcrMTPModel"
:
_HfExamplesInfo
(
"zai-org/GLM-OCR"
,
speculative_model
=
"zai-org/GLM-OCR"
,
is_available_online
=
False
,
min_transformers_version
=
"5.1.0"
,
),
"LongCatFlashMTPModel"
:
_HfExamplesInfo
(
"meituan-longcat/LongCat-Flash-Chat"
,
trust_remote_code
=
True
,
...
...
@@ -1104,27 +1094,27 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = {
_TRANSFORMERS_BACKEND_MODELS
=
{
"TransformersEmbeddingModel"
:
_HfExamplesInfo
(
"BAAI/bge-base-en-v1.5"
,
min_transformers_version
=
"5.0.0"
"BAAI/bge-base-en-v1.5"
,
min_transformers_version
=
"5.0.0
.dev
"
),
"TransformersForSequenceClassification"
:
_HfExamplesInfo
(
"papluca/xlm-roberta-base-language-detection"
,
min_transformers_version
=
"5.0.0"
,
min_transformers_version
=
"5.0.0
.dev
"
,
),
"TransformersForCausalLM"
:
_HfExamplesInfo
(
"hmellor/Ilama-3.2-1B"
,
trust_remote_code
=
True
),
"TransformersMultiModalForCausalLM"
:
_HfExamplesInfo
(
"BAAI/Emu3-Chat-hf"
),
"TransformersMoEForCausalLM"
:
_HfExamplesInfo
(
"allenai/OLMoE-1B-7B-0924"
,
min_transformers_version
=
"5.0.0"
"allenai/OLMoE-1B-7B-0924"
,
min_transformers_version
=
"5.0.0
.dev
"
),
"TransformersMultiModalMoEForCausalLM"
:
_HfExamplesInfo
(
"Qwen/Qwen3-VL-30B-A3B-Instruct"
,
min_transformers_version
=
"5.0.0"
"Qwen/Qwen3-VL-30B-A3B-Instruct"
,
min_transformers_version
=
"5.0.0
.dev
"
),
"TransformersMoEEmbeddingModel"
:
_HfExamplesInfo
(
"Qwen/Qwen3-30B-A3B"
,
min_transformers_version
=
"5.0.0"
"Qwen/Qwen3-30B-A3B"
,
min_transformers_version
=
"5.0.0
.dev
"
),
"TransformersMoEForSequenceClassification"
:
_HfExamplesInfo
(
"Qwen/Qwen3-30B-A3B"
,
min_transformers_version
=
"5.0.0"
"Qwen/Qwen3-30B-A3B"
,
min_transformers_version
=
"5.0.0
.dev
"
),
"TransformersMultiModalEmbeddingModel"
:
_HfExamplesInfo
(
"google/gemma-3-4b-it"
),
"TransformersMultiModalForSequenceClassification"
:
_HfExamplesInfo
(
...
...
tests/models/test_initialization.py
View file @
c721b814
...
...
@@ -88,6 +88,7 @@ def can_initialize(
[
10
*
GiB_bytes
],
)
scheduler_kv_cache_config
=
generate_scheduler_kv_cache_config
(
kv_cache_configs
)
# gpu_blocks (> 0), cpu_blocks, scheduler_kv_cache_config
return
1
,
0
,
scheduler_kv_cache_config
...
...
tests/models/test_transformers.py
View file @
c721b814
...
...
@@ -78,7 +78,7 @@ def test_models(
from
packaging.version
import
Version
installed
=
Version
(
transformers
.
__version__
)
required
=
Version
(
"5.0.0"
)
required
=
Version
(
"5.0.0
.dev
"
)
if
model
==
"allenai/OLMoE-1B-7B-0924"
and
installed
<
required
:
pytest
.
skip
(
"MoE models with the Transformers modeling backend require "
...
...
tests/plugins/lora_resolvers/test_hf_hub_resolver.py
deleted
100644 → 0
View file @
d53fe7e5
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
os
import
pytest
from
huggingface_hub.constants
import
HF_HUB_CACHE
from
vllm.plugins.lora_resolvers.hf_hub_resolver
import
HfHubResolver
LORA_LIB_MODEL_NAME
=
"ibm-granite/granite-3.3-8b-instruct"
# Repo with multiple LoRAs contained in it
LORA_LIB
=
"ibm-granite/granite-3.3-8b-rag-agent-lib"
LORA_NAME
=
"ibm-granite/granite-3.3-8b-rag-agent-lib/answerability_prediction_lora"
# noqa: E501
NON_LORA_SUBPATH
=
"ibm-granite/granite-3.3-8b-rag-agent-lib/README.md"
LIB_DOWNLOAD_DIR
=
os
.
path
.
join
(
HF_HUB_CACHE
,
"models--ibm-granite--granite-3.3-8b-rag-agent-lib"
)
INVALID_REPO_NAME
=
"thisrepodoesnotexist"
# Repo with only one LoRA in the root dir
LORA_REPO_MODEL_NAME
=
"meta-llama/Llama-2-7b-hf"
LORA_REPO
=
"yard1/llama-2-7b-sql-lora-test"
REPO_DOWNLOAD_DIR
=
os
.
path
.
join
(
HF_HUB_CACHE
,
"models--yard1--llama-2-7b-sql-lora-test"
)
@
pytest
.
mark
.
asyncio
async
def
test_hf_resolver_with_direct_path
():
hf_resolver
=
HfHubResolver
([
LORA_REPO
])
assert
hf_resolver
is
not
None
lora_request
=
await
hf_resolver
.
resolve_lora
(
LORA_REPO_MODEL_NAME
,
LORA_REPO
)
assert
lora_request
.
lora_name
==
LORA_REPO
assert
REPO_DOWNLOAD_DIR
in
lora_request
.
lora_path
assert
"adapter_config.json"
in
os
.
listdir
(
lora_request
.
lora_path
)
@
pytest
.
mark
.
asyncio
async
def
test_hf_resolver_with_nested_paths
():
hf_resolver
=
HfHubResolver
([
LORA_LIB
])
assert
hf_resolver
is
not
None
lora_request
=
await
hf_resolver
.
resolve_lora
(
LORA_LIB_MODEL_NAME
,
LORA_NAME
)
assert
lora_request
is
not
None
assert
lora_request
.
lora_name
==
LORA_NAME
assert
LIB_DOWNLOAD_DIR
in
lora_request
.
lora_path
assert
"adapter_config.json"
in
os
.
listdir
(
lora_request
.
lora_path
)
@
pytest
.
mark
.
asyncio
async
def
test_hf_resolver_with_multiple_repos
():
hf_resolver
=
HfHubResolver
([
LORA_LIB
,
LORA_REPO
])
assert
hf_resolver
is
not
None
lora_request
=
await
hf_resolver
.
resolve_lora
(
LORA_LIB_MODEL_NAME
,
LORA_NAME
)
assert
lora_request
is
not
None
assert
lora_request
.
lora_name
==
LORA_NAME
assert
LIB_DOWNLOAD_DIR
in
lora_request
.
lora_path
assert
"adapter_config.json"
in
os
.
listdir
(
lora_request
.
lora_path
)
@
pytest
.
mark
.
asyncio
async
def
test_missing_adapter
():
hf_resolver
=
HfHubResolver
([
LORA_LIB
])
assert
hf_resolver
is
not
None
missing_lora_request
=
await
hf_resolver
.
resolve_lora
(
LORA_LIB_MODEL_NAME
,
"foobar"
)
assert
missing_lora_request
is
None
@
pytest
.
mark
.
asyncio
async
def
test_nonlora_adapter
():
hf_resolver
=
HfHubResolver
([
LORA_LIB
])
assert
hf_resolver
is
not
None
readme_request
=
await
hf_resolver
.
resolve_lora
(
LORA_LIB_MODEL_NAME
,
NON_LORA_SUBPATH
)
assert
readme_request
is
None
@
pytest
.
mark
.
asyncio
async
def
test_invalid_repo
():
hf_resolver
=
HfHubResolver
([
LORA_LIB
])
assert
hf_resolver
is
not
None
invalid_repo_req
=
await
hf_resolver
.
resolve_lora
(
INVALID_REPO_NAME
,
f
"
{
INVALID_REPO_NAME
}
/foo"
,
)
assert
invalid_repo_req
is
None
@
pytest
.
mark
.
asyncio
async
def
test_trailing_slash
():
hf_resolver
=
HfHubResolver
([
LORA_LIB
])
assert
hf_resolver
is
not
None
lora_request
=
await
hf_resolver
.
resolve_lora
(
LORA_LIB_MODEL_NAME
,
f
"
{
LORA_NAME
}
/"
,
)
assert
lora_request
is
not
None
assert
lora_request
.
lora_name
==
f
"
{
LORA_NAME
}
/"
assert
LIB_DOWNLOAD_DIR
in
lora_request
.
lora_path
assert
"adapter_config.json"
in
os
.
listdir
(
lora_request
.
lora_path
)
tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py
View file @
c721b814
...
...
@@ -36,7 +36,7 @@ class MyGemma2Embedding(nn.Module):
def
forward
(
self
,
input_ids
:
torch
.
Tensor
|
None
,
input_ids
:
torch
.
Tensor
,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
...
tests/test_access_log_filter.py
deleted
100644 → 0
View file @
d53fe7e5
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Tests for the UvicornAccessLogFilter class.
"""
import
logging
from
vllm.logging_utils.access_log_filter
import
(
UvicornAccessLogFilter
,
create_uvicorn_log_config
,
)
class
TestUvicornAccessLogFilter
:
"""Test cases for UvicornAccessLogFilter."""
def
test_filter_allows_all_when_no_excluded_paths
(
self
):
"""Filter should allow all logs when no paths are excluded."""
filter
=
UvicornAccessLogFilter
(
excluded_paths
=
[])
record
=
logging
.
LogRecord
(
name
=
"uvicorn.access"
,
level
=
logging
.
INFO
,
pathname
=
""
,
lineno
=
0
,
msg
=
'%s - "%s %s HTTP/%s" %d'
,
args
=
(
"127.0.0.1:12345"
,
"GET"
,
"/v1/completions"
,
"1.1"
,
200
),
exc_info
=
None
,
)
assert
filter
.
filter
(
record
)
is
True
def
test_filter_allows_all_when_excluded_paths_is_none
(
self
):
"""Filter should allow all logs when excluded_paths is None."""
filter
=
UvicornAccessLogFilter
(
excluded_paths
=
None
)
record
=
logging
.
LogRecord
(
name
=
"uvicorn.access"
,
level
=
logging
.
INFO
,
pathname
=
""
,
lineno
=
0
,
msg
=
'%s - "%s %s HTTP/%s" %d'
,
args
=
(
"127.0.0.1:12345"
,
"GET"
,
"/health"
,
"1.1"
,
200
),
exc_info
=
None
,
)
assert
filter
.
filter
(
record
)
is
True
def
test_filter_excludes_health_endpoint
(
self
):
"""Filter should exclude /health endpoint when configured."""
filter
=
UvicornAccessLogFilter
(
excluded_paths
=
[
"/health"
])
record
=
logging
.
LogRecord
(
name
=
"uvicorn.access"
,
level
=
logging
.
INFO
,
pathname
=
""
,
lineno
=
0
,
msg
=
'%s - "%s %s HTTP/%s" %d'
,
args
=
(
"127.0.0.1:12345"
,
"GET"
,
"/health"
,
"1.1"
,
200
),
exc_info
=
None
,
)
assert
filter
.
filter
(
record
)
is
False
def
test_filter_excludes_metrics_endpoint
(
self
):
"""Filter should exclude /metrics endpoint when configured."""
filter
=
UvicornAccessLogFilter
(
excluded_paths
=
[
"/metrics"
])
record
=
logging
.
LogRecord
(
name
=
"uvicorn.access"
,
level
=
logging
.
INFO
,
pathname
=
""
,
lineno
=
0
,
msg
=
'%s - "%s %s HTTP/%s" %d'
,
args
=
(
"127.0.0.1:12345"
,
"GET"
,
"/metrics"
,
"1.1"
,
200
),
exc_info
=
None
,
)
assert
filter
.
filter
(
record
)
is
False
def
test_filter_allows_non_excluded_endpoints
(
self
):
"""Filter should allow endpoints not in the excluded list."""
filter
=
UvicornAccessLogFilter
(
excluded_paths
=
[
"/health"
,
"/metrics"
])
record
=
logging
.
LogRecord
(
name
=
"uvicorn.access"
,
level
=
logging
.
INFO
,
pathname
=
""
,
lineno
=
0
,
msg
=
'%s - "%s %s HTTP/%s" %d'
,
args
=
(
"127.0.0.1:12345"
,
"POST"
,
"/v1/completions"
,
"1.1"
,
200
),
exc_info
=
None
,
)
assert
filter
.
filter
(
record
)
is
True
def
test_filter_excludes_multiple_endpoints
(
self
):
"""Filter should exclude multiple configured endpoints."""
filter
=
UvicornAccessLogFilter
(
excluded_paths
=
[
"/health"
,
"/metrics"
,
"/ping"
])
# Test /health
record_health
=
logging
.
LogRecord
(
name
=
"uvicorn.access"
,
level
=
logging
.
INFO
,
pathname
=
""
,
lineno
=
0
,
msg
=
'%s - "%s %s HTTP/%s" %d'
,
args
=
(
"127.0.0.1:12345"
,
"GET"
,
"/health"
,
"1.1"
,
200
),
exc_info
=
None
,
)
assert
filter
.
filter
(
record_health
)
is
False
# Test /metrics
record_metrics
=
logging
.
LogRecord
(
name
=
"uvicorn.access"
,
level
=
logging
.
INFO
,
pathname
=
""
,
lineno
=
0
,
msg
=
'%s - "%s %s HTTP/%s" %d'
,
args
=
(
"127.0.0.1:12345"
,
"GET"
,
"/metrics"
,
"1.1"
,
200
),
exc_info
=
None
,
)
assert
filter
.
filter
(
record_metrics
)
is
False
# Test /ping
record_ping
=
logging
.
LogRecord
(
name
=
"uvicorn.access"
,
level
=
logging
.
INFO
,
pathname
=
""
,
lineno
=
0
,
msg
=
'%s - "%s %s HTTP/%s" %d'
,
args
=
(
"127.0.0.1:12345"
,
"GET"
,
"/ping"
,
"1.1"
,
200
),
exc_info
=
None
,
)
assert
filter
.
filter
(
record_ping
)
is
False
def
test_filter_with_query_parameters
(
self
):
"""Filter should exclude endpoints even with query parameters."""
filter
=
UvicornAccessLogFilter
(
excluded_paths
=
[
"/health"
])
record
=
logging
.
LogRecord
(
name
=
"uvicorn.access"
,
level
=
logging
.
INFO
,
pathname
=
""
,
lineno
=
0
,
msg
=
'%s - "%s %s HTTP/%s" %d'
,
args
=
(
"127.0.0.1:12345"
,
"GET"
,
"/health?verbose=true"
,
"1.1"
,
200
),
exc_info
=
None
,
)
assert
filter
.
filter
(
record
)
is
False
def
test_filter_different_http_methods
(
self
):
"""Filter should exclude endpoints regardless of HTTP method."""
filter
=
UvicornAccessLogFilter
(
excluded_paths
=
[
"/ping"
])
# Test GET
record_get
=
logging
.
LogRecord
(
name
=
"uvicorn.access"
,
level
=
logging
.
INFO
,
pathname
=
""
,
lineno
=
0
,
msg
=
'%s - "%s %s HTTP/%s" %d'
,
args
=
(
"127.0.0.1:12345"
,
"GET"
,
"/ping"
,
"1.1"
,
200
),
exc_info
=
None
,
)
assert
filter
.
filter
(
record_get
)
is
False
# Test POST
record_post
=
logging
.
LogRecord
(
name
=
"uvicorn.access"
,
level
=
logging
.
INFO
,
pathname
=
""
,
lineno
=
0
,
msg
=
'%s - "%s %s HTTP/%s" %d'
,
args
=
(
"127.0.0.1:12345"
,
"POST"
,
"/ping"
,
"1.1"
,
200
),
exc_info
=
None
,
)
assert
filter
.
filter
(
record_post
)
is
False
def
test_filter_with_different_status_codes
(
self
):
"""Filter should exclude endpoints regardless of status code."""
filter
=
UvicornAccessLogFilter
(
excluded_paths
=
[
"/health"
])
for
status_code
in
[
200
,
500
,
503
]:
record
=
logging
.
LogRecord
(
name
=
"uvicorn.access"
,
level
=
logging
.
INFO
,
pathname
=
""
,
lineno
=
0
,
msg
=
'%s - "%s %s HTTP/%s" %d'
,
args
=
(
"127.0.0.1:12345"
,
"GET"
,
"/health"
,
"1.1"
,
status_code
),
exc_info
=
None
,
)
assert
filter
.
filter
(
record
)
is
False
class
TestCreateUvicornLogConfig
:
"""Test cases for create_uvicorn_log_config function."""
def
test_creates_valid_config_structure
(
self
):
"""Config should have required logging configuration keys."""
config
=
create_uvicorn_log_config
(
excluded_paths
=
[
"/health"
])
assert
"version"
in
config
assert
config
[
"version"
]
==
1
assert
"disable_existing_loggers"
in
config
assert
"formatters"
in
config
assert
"handlers"
in
config
assert
"loggers"
in
config
assert
"filters"
in
config
def
test_config_includes_access_log_filter
(
self
):
"""Config should include the access log filter."""
config
=
create_uvicorn_log_config
(
excluded_paths
=
[
"/health"
,
"/metrics"
])
assert
"access_log_filter"
in
config
[
"filters"
]
filter_config
=
config
[
"filters"
][
"access_log_filter"
]
assert
filter_config
[
"()"
]
==
UvicornAccessLogFilter
assert
filter_config
[
"excluded_paths"
]
==
[
"/health"
,
"/metrics"
]
def
test_config_applies_filter_to_access_handler
(
self
):
"""Config should apply the filter to the access handler."""
config
=
create_uvicorn_log_config
(
excluded_paths
=
[
"/health"
])
assert
"access"
in
config
[
"handlers"
]
assert
"filters"
in
config
[
"handlers"
][
"access"
]
assert
"access_log_filter"
in
config
[
"handlers"
][
"access"
][
"filters"
]
def
test_config_with_custom_log_level
(
self
):
"""Config should respect custom log level."""
config
=
create_uvicorn_log_config
(
excluded_paths
=
[
"/health"
],
log_level
=
"debug"
)
assert
config
[
"loggers"
][
"uvicorn"
][
"level"
]
==
"DEBUG"
assert
config
[
"loggers"
][
"uvicorn.access"
][
"level"
]
==
"DEBUG"
assert
config
[
"loggers"
][
"uvicorn.error"
][
"level"
]
==
"DEBUG"
def
test_config_with_empty_excluded_paths
(
self
):
"""Config should work with empty excluded paths."""
config
=
create_uvicorn_log_config
(
excluded_paths
=
[])
assert
config
[
"filters"
][
"access_log_filter"
][
"excluded_paths"
]
==
[]
def
test_config_with_none_excluded_paths
(
self
):
"""Config should work with None excluded paths."""
config
=
create_uvicorn_log_config
(
excluded_paths
=
None
)
assert
config
[
"filters"
][
"access_log_filter"
][
"excluded_paths"
]
==
[]
class
TestIntegration
:
"""Integration tests for the access log filter."""
def
test_filter_with_real_logger
(
self
):
"""Test filter works with a real Python logger simulating uvicorn."""
# Create a logger with our filter (simulating uvicorn.access)
logger
=
logging
.
getLogger
(
"uvicorn.access"
)
logger
.
setLevel
(
logging
.
INFO
)
# Clear any existing handlers
logger
.
handlers
=
[]
# Create a custom handler that tracks messages
logged_messages
:
list
[
str
]
=
[]
class
TrackingHandler
(
logging
.
Handler
):
def
emit
(
self
,
record
):
logged_messages
.
append
(
record
.
getMessage
())
handler
=
TrackingHandler
()
handler
.
setLevel
(
logging
.
INFO
)
filter
=
UvicornAccessLogFilter
(
excluded_paths
=
[
"/health"
,
"/metrics"
])
handler
.
addFilter
(
filter
)
logger
.
addHandler
(
handler
)
# Log using uvicorn's format with args tuple
# Format: '%s - "%s %s HTTP/%s" %d'
logger
.
info
(
'%s - "%s %s HTTP/%s" %d'
,
"127.0.0.1:12345"
,
"GET"
,
"/health"
,
"1.1"
,
200
,
)
logger
.
info
(
'%s - "%s %s HTTP/%s" %d'
,
"127.0.0.1:12345"
,
"GET"
,
"/v1/completions"
,
"1.1"
,
200
,
)
logger
.
info
(
'%s - "%s %s HTTP/%s" %d'
,
"127.0.0.1:12345"
,
"GET"
,
"/metrics"
,
"1.1"
,
200
,
)
logger
.
info
(
'%s - "%s %s HTTP/%s" %d'
,
"127.0.0.1:12345"
,
"POST"
,
"/v1/chat/completions"
,
"1.1"
,
200
,
)
# Verify only non-excluded endpoints were logged
assert
len
(
logged_messages
)
==
2
assert
"/v1/completions"
in
logged_messages
[
0
]
assert
"/v1/chat/completions"
in
logged_messages
[
1
]
def
test_filter_allows_non_uvicorn_access_logs
(
self
):
"""Test filter allows logs from non-uvicorn.access loggers."""
filter
=
UvicornAccessLogFilter
(
excluded_paths
=
[
"/health"
])
# Log record from a different logger name
record
=
logging
.
LogRecord
(
name
=
"uvicorn.error"
,
level
=
logging
.
INFO
,
pathname
=
""
,
lineno
=
0
,
msg
=
"Some error message about /health"
,
args
=
(),
exc_info
=
None
,
)
# Should allow because it's not from uvicorn.access
assert
filter
.
filter
(
record
)
is
True
def
test_filter_handles_malformed_args
(
self
):
"""Test filter handles log records with unexpected args format."""
filter
=
UvicornAccessLogFilter
(
excluded_paths
=
[
"/health"
])
# Log record with insufficient args
record
=
logging
.
LogRecord
(
name
=
"uvicorn.access"
,
level
=
logging
.
INFO
,
pathname
=
""
,
lineno
=
0
,
msg
=
"Some message"
,
args
=
(
"only"
,
"two"
),
exc_info
=
None
,
)
# Should allow because args doesn't have expected format
assert
filter
.
filter
(
record
)
is
True
def
test_filter_handles_non_tuple_args
(
self
):
"""Test filter handles log records with non-tuple args."""
filter
=
UvicornAccessLogFilter
(
excluded_paths
=
[
"/health"
])
# Log record with None args
record
=
logging
.
LogRecord
(
name
=
"uvicorn.access"
,
level
=
logging
.
INFO
,
pathname
=
""
,
lineno
=
0
,
msg
=
"Some message without args"
,
args
=
None
,
exc_info
=
None
,
)
# Should allow because args is None
assert
filter
.
filter
(
record
)
is
True
tests/v1/e2e/test_spec_decode.py
View file @
c721b814
...
...
@@ -455,7 +455,7 @@ def test_eagle_correctness(
from
packaging.version
import
Version
installed
=
Version
(
transformers
.
__version__
)
required
=
Version
(
"5.0.0"
)
required
=
Version
(
"5.0.0
.dev
"
)
if
installed
<
required
:
pytest
.
skip
(
"Eagle3 with the Transformers modeling backend requires "
...
...
tests/v1/kv_connector/unit/utils.py
View file @
c721b814
...
...
@@ -112,13 +112,6 @@ def create_vllm_config(
enable_chunked_prefill
=
enable_chunked_prefill
,
is_encoder_decoder
=
model_config
.
is_encoder_decoder
,
)
scheduler_config
=
SchedulerConfig
(
max_num_seqs
=
max_num_seqs
,
max_num_batched_tokens
=
max_num_batched_tokens
,
max_model_len
=
max_model_len
,
enable_chunked_prefill
=
enable_chunked_prefill
,
is_encoder_decoder
=
model_config
.
is_encoder_decoder
,
)
# Cache config, optionally force APC
cache_config
=
CacheConfig
(
block_size
=
block_size
,
...
...
tests/v1/spec_decode/test_eagle.py
View file @
c721b814
...
...
@@ -372,8 +372,6 @@ def test_load_model(
all_indx_layers
:
dict
[
str
,
mock
.
MagicMock
]
=
{}
all_indx_layers
:
dict
[
str
,
mock
.
MagicMock
]
=
{}
# Make mock_get_layers return different values for each call
mock_get_layers
.
side_effect
=
[
target_attn_layers
,
...
...
vllm/_custom_ops.py
View file @
c721b814
...
...
@@ -2831,13 +2831,13 @@ if hasattr(torch.ops._C, "int8_scaled_mm_with_quant"):
class
CPUDNNLGEMMHandler
:
def
__init__
(
self
)
->
None
:
self
.
handler
_tensor
:
torch
.
Tensor
|
None
=
None
self
.
handler
:
int
|
None
=
None
self
.
n
=
-
1
self
.
k
=
-
1
def
__del__
(
self
):
if
self
.
handler
_tensor
is
not
None
:
torch
.
ops
.
_C
.
release_dnnl_matmul_handler
(
self
.
handler
_tensor
.
item
()
)
if
self
.
handler
is
not
None
:
torch
.
ops
.
_C
.
release_dnnl_matmul_handler
(
self
.
handler
)
_supports_onednn
=
bool
(
hasattr
(
torch
.
ops
.
_C
,
"create_onednn_mm_handler"
))
...
...
@@ -2853,10 +2853,8 @@ def create_onednn_mm(
)
->
CPUDNNLGEMMHandler
:
handler
=
CPUDNNLGEMMHandler
()
handler
.
k
,
handler
.
n
=
weight
.
size
()
# store the handler pointer in a tensor it doesn't get inlined
handler
.
handler_tensor
=
torch
.
tensor
(
torch
.
ops
.
_C
.
create_onednn_mm_handler
(
weight
,
primitive_cache_size
),
dtype
=
torch
.
int64
,
handler
.
handler
=
torch
.
ops
.
_C
.
create_onednn_mm_handler
(
weight
,
primitive_cache_size
)
return
handler
...
...
@@ -2884,17 +2882,8 @@ def create_onednn_scaled_mm(
)
->
CPUDNNLGEMMHandler
:
handler
=
CPUDNNLGEMMHandler
()
handler
.
k
,
handler
.
n
=
weight
.
size
()
# store the handler pointer in a tensor so it doesn't get inlined
handler
.
handler_tensor
=
torch
.
tensor
(
torch
.
ops
.
_C
.
create_onednn_scaled_mm_handler
(
weight
,
weight_scales
,
output_type
,
dynamic_quant
,
use_azp
,
primitive_cache_size
,
),
dtype
=
torch
.
int64
,
handler
.
handler
=
torch
.
ops
.
_C
.
create_onednn_scaled_mm_handler
(
weight
,
weight_scales
,
output_type
,
dynamic_quant
,
use_azp
,
primitive_cache_size
)
return
handler
...
...
@@ -2947,13 +2936,7 @@ def onednn_scaled_mm(
bias
:
torch
.
Tensor
|
None
,
)
->
torch
.
Tensor
:
torch
.
ops
.
_C
.
onednn_scaled_mm
(
output
,
x
,
input_scale
,
input_zp
,
input_zp_adj
,
bias
,
dnnl_handler
.
handler_tensor
,
output
,
x
,
input_scale
,
input_zp
,
input_zp_adj
,
bias
,
dnnl_handler
.
handler
)
return
output
...
...
Prev
1
2
3
4
5
6
…
17
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment