Commit c721b814 authored by zhuwenwen
Browse files

sync v0.15.1

parent d53fe7e5
...@@ -104,16 +104,18 @@ def test_flash_mla( ...@@ -104,16 +104,18 @@ def test_flash_mla(
descale_k = None descale_k = None
def flash_mla(): def flash_mla():
return flash_mla_with_kvcache(q, return flash_mla_with_kvcache(
blocked_k, q,
block_table, blocked_k,
cache_seqlens, block_table,
dv, cache_seqlens,
tile_scheduler_metadata, dv,
num_splits, tile_scheduler_metadata,
causal=causal, num_splits,
descale_q=descale_q, causal=causal,
descale_k=descale_k) descale_q=descale_q,
descale_k=descale_k,
)
def scaled_dot_product_attention(query, key, value, is_causal=False): def scaled_dot_product_attention(query, key, value, is_causal=False):
query = query.float() query = query.float()
......
...@@ -22,9 +22,6 @@ from vllm.distributed import ( ...@@ -22,9 +22,6 @@ from vllm.distributed import (
) )
from vllm.forward_context import set_forward_context from vllm.forward_context import set_forward_context
from vllm.model_executor.layers.fused_moe import fused_topk from vllm.model_executor.layers.fused_moe import fused_topk
from vllm.model_executor.layers.fused_moe.all2all_utils import (
maybe_make_prepare_finalize,
)
from vllm.model_executor.layers.fused_moe.config import ( from vllm.model_executor.layers.fused_moe.config import (
FusedMoEConfig, FusedMoEConfig,
FusedMoEParallelConfig, FusedMoEParallelConfig,
...@@ -43,6 +40,7 @@ from .mk_objects import ( ...@@ -43,6 +40,7 @@ from .mk_objects import (
TestMoEQuantConfig, TestMoEQuantConfig,
expert_info, expert_info,
make_fused_experts, make_fused_experts,
make_prepare_finalize,
prepare_finalize_info, prepare_finalize_info,
) )
from .parallel_utils import ProcessGroupInfo from .parallel_utils import ProcessGroupInfo
...@@ -605,10 +603,9 @@ def make_modular_kernel( ...@@ -605,10 +603,9 @@ def make_modular_kernel(
routing_method=RoutingMethodType.DeepSeekV3, routing_method=RoutingMethodType.DeepSeekV3,
) )
prepare_finalize = maybe_make_prepare_finalize( # make modular kernel
moe=moe, prepare_finalize = make_prepare_finalize(
quant_config=quant_config, config.prepare_finalize_type, config.all2all_backend(), moe, quant_config
allow_new_interface=True,
) )
assert prepare_finalize is not None assert prepare_finalize is not None
......
...@@ -7,6 +7,9 @@ import torch ...@@ -7,6 +7,9 @@ import torch
# Fused experts and PrepareFinalize imports # Fused experts and PrepareFinalize imports
import vllm.model_executor.layers.fused_moe.modular_kernel as mk import vllm.model_executor.layers.fused_moe.modular_kernel as mk
from vllm.model_executor.layers.fused_moe import TritonExperts from vllm.model_executor.layers.fused_moe import TritonExperts
from vllm.model_executor.layers.fused_moe.all2all_utils import (
maybe_make_prepare_finalize,
)
from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import ( from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
BatchedDeepGemmExperts, BatchedDeepGemmExperts,
) )
...@@ -252,12 +255,13 @@ if has_pplx(): ...@@ -252,12 +255,13 @@ if has_pplx():
) )
if has_flashinfer_cutlass_fused_moe() and current_platform.has_device_capability(100): if has_flashinfer_cutlass_fused_moe() and current_platform.has_device_capability(100):
from vllm.model_executor.layers.fused_moe.flashinfer_a2a_prepare_finalize import ( # noqa: E501
FlashInferCutlassMoEPrepareAndFinalize,
)
from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import ( from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import (
FlashInferExperts, FlashInferExperts,
) )
from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_prepare_finalize import ( # noqa: E501
FlashInferCutlassMoEPrepareAndFinalize,
create_flashinfer_prepare_finalize,
)
register_prepare_and_finalize( register_prepare_and_finalize(
FlashInferCutlassMoEPrepareAndFinalize, FlashInferCutlassMoEPrepareAndFinalize,
...@@ -425,6 +429,24 @@ if cutlass_fp4_supported() or has_flashinfer_cutlass_fused_moe(): ...@@ -425,6 +429,24 @@ if cutlass_fp4_supported() or has_flashinfer_cutlass_fused_moe():
] ]
def make_prepare_finalize(
    prepare_finalize_type: type[mk.FusedMoEPrepareAndFinalize],
    backend: str | None,
    moe: FusedMoEConfig,
    quant_config: FusedMoEQuantConfig,
) -> mk.FusedMoEPrepareAndFinalize:
    """Build the prepare/finalize object used by the modular-kernel tests.

    Args:
        prepare_finalize_type: The prepare/finalize *class* requested by the
            test config. It is only compared against
            ``FlashInferCutlassMoEPrepareAndFinalize``; it is never
            instantiated directly here.
        backend: All2all backend name. ``None`` and ``"naive"`` select the
            locally constructed objects; any other value defers to
            ``maybe_make_prepare_finalize``.
        moe: MoE configuration forwarded to the factory helpers.
        quant_config: Quantization configuration forwarded to
            ``maybe_make_prepare_finalize``.

    Returns:
        A prepare/finalize instance; never ``None``.

    Raises:
        AssertionError: If ``maybe_make_prepare_finalize`` returns ``None``
            for a non-naive backend.
    """
    # Any real all2all backend is delegated to the library factory.
    if backend is not None and backend != "naive":
        prepare_finalize = maybe_make_prepare_finalize(moe, quant_config)
        assert prepare_finalize is not None
        return prepare_finalize

    # NOTE(review): FlashInferCutlassMoEPrepareAndFinalize appears to be
    # imported only under a flashinfer/device-capability guard in this module;
    # confirm the name is always defined before this comparison can run.
    if prepare_finalize_type == FlashInferCutlassMoEPrepareAndFinalize:
        return create_flashinfer_prepare_finalize(
            use_dp=moe.moe_parallel_config.dp_size > 1
        )

    # Default: no expert parallelism.
    return MoEPrepareAndFinalizeNoEP()
def _slice(rank: int, num_local_experts: int, t: torch.Tensor) -> torch.Tensor: def _slice(rank: int, num_local_experts: int, t: torch.Tensor) -> torch.Tensor:
s = rank * num_local_experts s = rank * num_local_experts
e = s + num_local_experts e = s + num_local_experts
......
...@@ -294,7 +294,12 @@ def test_flashinfer_cutlass_moe_fp8_no_graph( ...@@ -294,7 +294,12 @@ def test_flashinfer_cutlass_moe_fp8_no_graph(
) )
kernel = mk.FusedMoEModularKernel( kernel = mk.FusedMoEModularKernel(
MoEPrepareAndFinalizeNoEP(), MoEPrepareAndFinalizeNoEP(
defer_input_quant=FlashInferExperts.expects_unquantized_inputs(
moe_config=moe_config,
quant_config=quant_config,
)
),
FlashInferExperts( FlashInferExperts(
moe_config=moe_config, moe_config=moe_config,
quant_config=quant_config, quant_config=quant_config,
......
...@@ -106,7 +106,12 @@ def test_flashinfer_fp4_moe_no_graph( ...@@ -106,7 +106,12 @@ def test_flashinfer_fp4_moe_no_graph(
) )
flashinfer_experts = FusedMoEModularKernel( flashinfer_experts = FusedMoEModularKernel(
MoEPrepareAndFinalizeNoEP(), MoEPrepareAndFinalizeNoEP(
defer_input_quant=FlashInferExperts.expects_unquantized_inputs(
moe_config=moe_config,
quant_config=quant_config,
)
),
FlashInferExperts(moe_config=moe_config, quant_config=quant_config), FlashInferExperts(moe_config=moe_config, quant_config=quant_config),
) )
......
...@@ -90,7 +90,7 @@ def test_cutlass_fp4_moe_no_graph( ...@@ -90,7 +90,7 @@ def test_cutlass_fp4_moe_no_graph(
) )
kernel = mk.FusedMoEModularKernel( kernel = mk.FusedMoEModularKernel(
MoEPrepareAndFinalizeNoEP(), MoEPrepareAndFinalizeNoEP(defer_input_quant=True),
CutlassExpertsFp4( CutlassExpertsFp4(
moe_config=make_dummy_moe_config(), moe_config=make_dummy_moe_config(),
quant_config=quant_config, quant_config=quant_config,
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import random
import numpy as np
import pytest import pytest
import torch import torch
from transformers import AutoModelForTokenClassification from transformers import AutoModelForTokenClassification
...@@ -11,20 +9,6 @@ from tests.models.utils import softmax ...@@ -11,20 +9,6 @@ from tests.models.utils import softmax
from vllm.platforms import current_platform from vllm.platforms import current_platform
@pytest.fixture(autouse=True)
def seed_everything():
    """Reset every RNG to a fixed seed before each test in this module."""
    fixed_seed = 0

    # Python, NumPy and torch CPU generators, in that order.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(fixed_seed)

    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(fixed_seed)
        cudnn = torch.backends.cudnn
        cudnn.deterministic = True
        cudnn.benchmark = False

    yield
@pytest.mark.parametrize("model", ["boltuix/NeuroBERT-NER"]) @pytest.mark.parametrize("model", ["boltuix/NeuroBERT-NER"])
# The float32 is required for this tiny model to pass the test. # The float32 is required for this tiny model to pass the test.
@pytest.mark.parametrize("dtype", ["float"]) @pytest.mark.parametrize("dtype", ["float"])
...@@ -68,7 +52,6 @@ def test_bert_models( ...@@ -68,7 +52,6 @@ def test_bert_models(
@pytest.mark.parametrize("model", ["disham993/electrical-ner-ModernBERT-base"]) @pytest.mark.parametrize("model", ["disham993/electrical-ner-ModernBERT-base"])
@pytest.mark.parametrize("dtype", ["float"]) @pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.flaky(reruns=3)
@torch.inference_mode @torch.inference_mode
def test_modernbert_models( def test_modernbert_models(
hf_runner, hf_runner,
...@@ -77,14 +60,6 @@ def test_modernbert_models( ...@@ -77,14 +60,6 @@ def test_modernbert_models(
model: str, model: str,
dtype: str, dtype: str,
) -> None: ) -> None:
# NOTE: https://github.com/vllm-project/vllm/pull/32403
# `disham993/electrical-ner-ModernBERT-base` is a randomly initialized
# model, which can cause numerical precision variance and edge cases.
# We use @flaky(reruns=3) to mitigate intermittent failures.
print(
f"\n[NOTE] Testing {model} (randomly initialized weights) - "
"flaky tolerance enabled due to numerical precision variance."
)
with vllm_runner(model, max_model_len=None, dtype=dtype) as vllm_model: with vllm_runner(model, max_model_len=None, dtype=dtype) as vllm_model:
vllm_outputs = vllm_model.token_classify(example_prompts) vllm_outputs = vllm_model.token_classify(example_prompts)
......
...@@ -458,20 +458,6 @@ VLM_TEST_SETTINGS = { ...@@ -458,20 +458,6 @@ VLM_TEST_SETTINGS = {
], ],
marks=[large_gpu_mark(min_gb=32)], marks=[large_gpu_mark(min_gb=32)],
), ),
"glm_ocr": VLMTestInfo(
models=["zai-org/GLM-OCR"],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: f"[gMASK]<|user|>\n{img_prompt}<|assistant|>\n", # noqa: E501
img_idx_to_prompt=lambda idx: "<|begin_of_image|><|image|><|end_of_image|>",
video_idx_to_prompt=lambda idx: "<|begin_of_video|><|video|><|end_of_video|>",
max_model_len=2048,
max_num_seqs=2,
get_stop_token_ids=lambda tok: [151329, 151336, 151338],
num_logprobs=10,
image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
auto_cls=AutoModelForImageTextToText,
marks=[large_gpu_mark(min_gb=32)],
),
"h2ovl": VLMTestInfo( "h2ovl": VLMTestInfo(
models=[ models=[
"h2oai/h2ovl-mississippi-800m", "h2oai/h2ovl-mississippi-800m",
...@@ -587,21 +573,6 @@ VLM_TEST_SETTINGS = { ...@@ -587,21 +573,6 @@ VLM_TEST_SETTINGS = {
vllm_output_post_proc=model_utils.kimiv_vl_vllm_to_hf_output, vllm_output_post_proc=model_utils.kimiv_vl_vllm_to_hf_output,
marks=[large_gpu_mark(min_gb=48)], marks=[large_gpu_mark(min_gb=48)],
), ),
"llama4": VLMTestInfo(
models=["meta-llama/Llama-4-Scout-17B-16E-Instruct"],
prompt_formatter=lambda img_prompt: f"<|begin_of_text|><|header_start|>user<|header_end|>\n\n{img_prompt}<|eot|><|header_start|>assistant<|header_end|>\n\n", # noqa: E501
img_idx_to_prompt=lambda _: "<|image|>",
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
distributed_executor_backend="mp",
image_size_factors=[(0.25, 0.5, 1.0)],
hf_model_kwargs={"device_map": "auto"},
max_model_len=8192,
max_num_seqs=4,
dtype="bfloat16",
auto_cls=AutoModelForImageTextToText,
tensor_parallel_size=4,
marks=multi_gpu_marks(num_gpus=4),
),
"llava_next": VLMTestInfo( "llava_next": VLMTestInfo(
models=["llava-hf/llava-v1.6-mistral-7b-hf"], models=["llava-hf/llava-v1.6-mistral-7b-hf"],
test_type=(VLMTestType.IMAGE, VLMTestType.CUSTOM_INPUTS), test_type=(VLMTestType.IMAGE, VLMTestType.CUSTOM_INPUTS),
......
...@@ -91,19 +91,6 @@ MODEL_CONFIGS: dict[str, dict[str, Any]] = { ...@@ -91,19 +91,6 @@ MODEL_CONFIGS: dict[str, dict[str, Any]] = {
"use_processor": True, "use_processor": True,
"question": "What is the content of each image?", "question": "What is the content of each image?",
}, },
"glm_ocr": {
"model_name": "zai-org/GLM-OCR",
"interface": "llm_generate",
"max_model_len": 131072,
"max_num_seqs": 2,
"sampling_params": {
"temperature": 0.0,
"max_tokens": 256,
"stop_token_ids": None,
},
"use_processor": True,
"question": "Text Recognition:",
},
"keye_vl": { "keye_vl": {
"model_name": "Kwai-Keye/Keye-VL-8B-Preview", "model_name": "Kwai-Keye/Keye-VL-8B-Preview",
"interface": "llm_generate", "interface": "llm_generate",
......
...@@ -122,7 +122,6 @@ MM_DATA_PATCHES = { ...@@ -122,7 +122,6 @@ MM_DATA_PATCHES = {
"ernie4_5_moe_vl": qwen3_vl_patch_mm_data, "ernie4_5_moe_vl": qwen3_vl_patch_mm_data,
"glm4v": glm4_1v_patch_mm_data, "glm4v": glm4_1v_patch_mm_data,
"glm4v_moe": glm4_1v_patch_mm_data, "glm4v_moe": glm4_1v_patch_mm_data,
"glm_ocr": glm4_1v_patch_mm_data,
"glmasr": glmasr_patch_mm_data, "glmasr": glmasr_patch_mm_data,
"molmo2": qwen3_vl_patch_mm_data, "molmo2": qwen3_vl_patch_mm_data,
"qwen3_vl": qwen3_vl_patch_mm_data, "qwen3_vl": qwen3_vl_patch_mm_data,
......
...@@ -256,7 +256,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { ...@@ -256,7 +256,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
), ),
"Exaone4ForCausalLM": _HfExamplesInfo("LGAI-EXAONE/EXAONE-4.0-32B"), "Exaone4ForCausalLM": _HfExamplesInfo("LGAI-EXAONE/EXAONE-4.0-32B"),
"ExaoneMoEForCausalLM": _HfExamplesInfo( "ExaoneMoEForCausalLM": _HfExamplesInfo(
"LGAI-EXAONE/K-EXAONE-236B-A23B", min_transformers_version="5.1.0" "LGAI-EXAONE/K-EXAONE-236B-A23B", min_transformers_version="5.0.0"
), ),
"Fairseq2LlamaForCausalLM": _HfExamplesInfo("mgleize/fairseq2-dummy-Llama-3.2-1B"), "Fairseq2LlamaForCausalLM": _HfExamplesInfo("mgleize/fairseq2-dummy-Llama-3.2-1B"),
"FalconForCausalLM": _HfExamplesInfo("tiiuae/falcon-7b"), "FalconForCausalLM": _HfExamplesInfo("tiiuae/falcon-7b"),
...@@ -273,7 +273,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { ...@@ -273,7 +273,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
"Glm4MoeForCausalLM": _HfExamplesInfo("zai-org/GLM-4.5"), "Glm4MoeForCausalLM": _HfExamplesInfo("zai-org/GLM-4.5"),
"Glm4MoeLiteForCausalLM": _HfExamplesInfo( "Glm4MoeLiteForCausalLM": _HfExamplesInfo(
"zai-org/GLM-4.7-Flash", "zai-org/GLM-4.7-Flash",
min_transformers_version="5.0.0", min_transformers_version="5.0.0.dev",
is_available_online=False,
), ),
"GPT2LMHeadModel": _HfExamplesInfo("openai-community/gpt2", {"alias": "gpt2"}), "GPT2LMHeadModel": _HfExamplesInfo("openai-community/gpt2", {"alias": "gpt2"}),
"GPTBigCodeForCausalLM": _HfExamplesInfo( "GPTBigCodeForCausalLM": _HfExamplesInfo(
...@@ -653,7 +654,7 @@ _MULTIMODAL_EXAMPLE_MODELS = { ...@@ -653,7 +654,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
# [Decoder-only] # [Decoder-only]
"AriaForConditionalGeneration": _HfExamplesInfo("rhymes-ai/Aria"), "AriaForConditionalGeneration": _HfExamplesInfo("rhymes-ai/Aria"),
"AudioFlamingo3ForConditionalGeneration": _HfExamplesInfo( "AudioFlamingo3ForConditionalGeneration": _HfExamplesInfo(
"nvidia/audio-flamingo-3-hf", min_transformers_version="5.0.0" "nvidia/audio-flamingo-3-hf", min_transformers_version="5.0.0.dev"
), ),
"AyaVisionForConditionalGeneration": _HfExamplesInfo("CohereLabs/aya-vision-8b"), "AyaVisionForConditionalGeneration": _HfExamplesInfo("CohereLabs/aya-vision-8b"),
"BagelForConditionalGeneration": _HfExamplesInfo("ByteDance-Seed/BAGEL-7B-MoT"), "BagelForConditionalGeneration": _HfExamplesInfo("ByteDance-Seed/BAGEL-7B-MoT"),
...@@ -696,7 +697,7 @@ _MULTIMODAL_EXAMPLE_MODELS = { ...@@ -696,7 +697,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"GlmAsrForConditionalGeneration": _HfExamplesInfo( "GlmAsrForConditionalGeneration": _HfExamplesInfo(
"zai-org/GLM-ASR-Nano-2512", "zai-org/GLM-ASR-Nano-2512",
trust_remote_code=True, trust_remote_code=True,
min_transformers_version="5.0.0", min_transformers_version="5.0",
), ),
"GraniteVision": _HfExamplesInfo("ibm-granite/granite-vision-3.3-2b"), "GraniteVision": _HfExamplesInfo("ibm-granite/granite-vision-3.3-2b"),
"GraniteSpeechForConditionalGeneration": _HfExamplesInfo( "GraniteSpeechForConditionalGeneration": _HfExamplesInfo(
...@@ -709,11 +710,6 @@ _MULTIMODAL_EXAMPLE_MODELS = { ...@@ -709,11 +710,6 @@ _MULTIMODAL_EXAMPLE_MODELS = {
), ),
"Glm4vForConditionalGeneration": _HfExamplesInfo("zai-org/GLM-4.1V-9B-Thinking"), "Glm4vForConditionalGeneration": _HfExamplesInfo("zai-org/GLM-4.1V-9B-Thinking"),
"Glm4vMoeForConditionalGeneration": _HfExamplesInfo("zai-org/GLM-4.5V"), "Glm4vMoeForConditionalGeneration": _HfExamplesInfo("zai-org/GLM-4.5V"),
"GlmOcrForConditionalGeneration": _HfExamplesInfo(
"zai-org/GLM-OCR",
is_available_online=False,
min_transformers_version="5.1.0",
),
"H2OVLChatModel": _HfExamplesInfo( "H2OVLChatModel": _HfExamplesInfo(
"h2oai/h2ovl-mississippi-800m", "h2oai/h2ovl-mississippi-800m",
trust_remote_code=True, trust_remote_code=True,
...@@ -1056,7 +1052,7 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = { ...@@ -1056,7 +1052,7 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = {
"ExaoneMoeMTP": _HfExamplesInfo( "ExaoneMoeMTP": _HfExamplesInfo(
"LGAI-EXAONE/K-EXAONE-236B-A23B", "LGAI-EXAONE/K-EXAONE-236B-A23B",
speculative_model="LGAI-EXAONE/K-EXAONE-236B-A23B", speculative_model="LGAI-EXAONE/K-EXAONE-236B-A23B",
min_transformers_version="5.1.0", min_transformers_version="5.0.0",
), ),
"Glm4MoeMTPModel": _HfExamplesInfo( "Glm4MoeMTPModel": _HfExamplesInfo(
"zai-org/GLM-4.5", "zai-org/GLM-4.5",
...@@ -1067,12 +1063,6 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = { ...@@ -1067,12 +1063,6 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = {
speculative_model="zai-org/GLM-4.7-Flash", speculative_model="zai-org/GLM-4.7-Flash",
min_transformers_version="5.0.0", min_transformers_version="5.0.0",
), ),
"GlmOcrMTPModel": _HfExamplesInfo(
"zai-org/GLM-OCR",
speculative_model="zai-org/GLM-OCR",
is_available_online=False,
min_transformers_version="5.1.0",
),
"LongCatFlashMTPModel": _HfExamplesInfo( "LongCatFlashMTPModel": _HfExamplesInfo(
"meituan-longcat/LongCat-Flash-Chat", "meituan-longcat/LongCat-Flash-Chat",
trust_remote_code=True, trust_remote_code=True,
...@@ -1104,27 +1094,27 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = { ...@@ -1104,27 +1094,27 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = {
_TRANSFORMERS_BACKEND_MODELS = { _TRANSFORMERS_BACKEND_MODELS = {
"TransformersEmbeddingModel": _HfExamplesInfo( "TransformersEmbeddingModel": _HfExamplesInfo(
"BAAI/bge-base-en-v1.5", min_transformers_version="5.0.0" "BAAI/bge-base-en-v1.5", min_transformers_version="5.0.0.dev"
), ),
"TransformersForSequenceClassification": _HfExamplesInfo( "TransformersForSequenceClassification": _HfExamplesInfo(
"papluca/xlm-roberta-base-language-detection", "papluca/xlm-roberta-base-language-detection",
min_transformers_version="5.0.0", min_transformers_version="5.0.0.dev",
), ),
"TransformersForCausalLM": _HfExamplesInfo( "TransformersForCausalLM": _HfExamplesInfo(
"hmellor/Ilama-3.2-1B", trust_remote_code=True "hmellor/Ilama-3.2-1B", trust_remote_code=True
), ),
"TransformersMultiModalForCausalLM": _HfExamplesInfo("BAAI/Emu3-Chat-hf"), "TransformersMultiModalForCausalLM": _HfExamplesInfo("BAAI/Emu3-Chat-hf"),
"TransformersMoEForCausalLM": _HfExamplesInfo( "TransformersMoEForCausalLM": _HfExamplesInfo(
"allenai/OLMoE-1B-7B-0924", min_transformers_version="5.0.0" "allenai/OLMoE-1B-7B-0924", min_transformers_version="5.0.0.dev"
), ),
"TransformersMultiModalMoEForCausalLM": _HfExamplesInfo( "TransformersMultiModalMoEForCausalLM": _HfExamplesInfo(
"Qwen/Qwen3-VL-30B-A3B-Instruct", min_transformers_version="5.0.0" "Qwen/Qwen3-VL-30B-A3B-Instruct", min_transformers_version="5.0.0.dev"
), ),
"TransformersMoEEmbeddingModel": _HfExamplesInfo( "TransformersMoEEmbeddingModel": _HfExamplesInfo(
"Qwen/Qwen3-30B-A3B", min_transformers_version="5.0.0" "Qwen/Qwen3-30B-A3B", min_transformers_version="5.0.0.dev"
), ),
"TransformersMoEForSequenceClassification": _HfExamplesInfo( "TransformersMoEForSequenceClassification": _HfExamplesInfo(
"Qwen/Qwen3-30B-A3B", min_transformers_version="5.0.0" "Qwen/Qwen3-30B-A3B", min_transformers_version="5.0.0.dev"
), ),
"TransformersMultiModalEmbeddingModel": _HfExamplesInfo("google/gemma-3-4b-it"), "TransformersMultiModalEmbeddingModel": _HfExamplesInfo("google/gemma-3-4b-it"),
"TransformersMultiModalForSequenceClassification": _HfExamplesInfo( "TransformersMultiModalForSequenceClassification": _HfExamplesInfo(
......
...@@ -88,6 +88,7 @@ def can_initialize( ...@@ -88,6 +88,7 @@ def can_initialize(
[10 * GiB_bytes], [10 * GiB_bytes],
) )
scheduler_kv_cache_config = generate_scheduler_kv_cache_config(kv_cache_configs) scheduler_kv_cache_config = generate_scheduler_kv_cache_config(kv_cache_configs)
# gpu_blocks (> 0), cpu_blocks, scheduler_kv_cache_config # gpu_blocks (> 0), cpu_blocks, scheduler_kv_cache_config
return 1, 0, scheduler_kv_cache_config return 1, 0, scheduler_kv_cache_config
......
...@@ -78,7 +78,7 @@ def test_models( ...@@ -78,7 +78,7 @@ def test_models(
from packaging.version import Version from packaging.version import Version
installed = Version(transformers.__version__) installed = Version(transformers.__version__)
required = Version("5.0.0") required = Version("5.0.0.dev")
if model == "allenai/OLMoE-1B-7B-0924" and installed < required: if model == "allenai/OLMoE-1B-7B-0924" and installed < required:
pytest.skip( pytest.skip(
"MoE models with the Transformers modeling backend require " "MoE models with the Transformers modeling backend require "
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
import pytest
from huggingface_hub.constants import HF_HUB_CACHE
from vllm.plugins.lora_resolvers.hf_hub_resolver import HfHubResolver
# Model name passed as the first argument to resolve_lora() in the tests that
# target the multi-LoRA library repo (presumably the base model; verify
# against HfHubResolver.resolve_lora).
LORA_LIB_MODEL_NAME = "ibm-granite/granite-3.3-8b-instruct"
# Repo with multiple LoRAs contained in it
LORA_LIB = "ibm-granite/granite-3.3-8b-rag-agent-lib"
# One adapter inside LORA_LIB, addressed as "<repo>/<subdirectory>".
LORA_NAME = "ibm-granite/granite-3.3-8b-rag-agent-lib/answerability_prediction_lora" # noqa: E501
# A path inside LORA_LIB that is a README file rather than an adapter;
# test_nonlora_adapter expects resolving it to yield None.
NON_LORA_SUBPATH = "ibm-granite/granite-3.3-8b-rag-agent-lib/README.md"
# Directory under the HF hub cache that the tests expect to appear in the
# resolved lora_path for adapters from LORA_LIB.
LIB_DOWNLOAD_DIR = os.path.join(
HF_HUB_CACHE, "models--ibm-granite--granite-3.3-8b-rag-agent-lib"
)
# Used by test_invalid_repo, which expects resolution to return None.
INVALID_REPO_NAME = "thisrepodoesnotexist"
# Repo with only one LoRA in the root dir
LORA_REPO_MODEL_NAME = "meta-llama/Llama-2-7b-hf"
LORA_REPO = "yard1/llama-2-7b-sql-lora-test"
# Directory under the HF hub cache that the tests expect to appear in the
# resolved lora_path for LORA_REPO.
REPO_DOWNLOAD_DIR = os.path.join(
HF_HUB_CACHE, "models--yard1--llama-2-7b-sql-lora-test"
)
@pytest.mark.asyncio
async def test_hf_resolver_with_direct_path():
    """Resolve a repo whose single LoRA adapter lives in the repo root."""
    hf_resolver = HfHubResolver([LORA_REPO])
    assert hf_resolver is not None

    lora_request = await hf_resolver.resolve_lora(LORA_REPO_MODEL_NAME, LORA_REPO)
    # Fail with a clear assertion (as the sibling tests do) instead of an
    # AttributeError on the next line when resolution returns None.
    assert lora_request is not None
    assert lora_request.lora_name == LORA_REPO
    assert REPO_DOWNLOAD_DIR in lora_request.lora_path
    assert "adapter_config.json" in os.listdir(lora_request.lora_path)
@pytest.mark.asyncio
async def test_hf_resolver_with_nested_paths():
    """Resolve an adapter addressed as a sub-path of a multi-LoRA repo."""
    resolver = HfHubResolver([LORA_LIB])
    assert resolver is not None

    resolved = await resolver.resolve_lora(LORA_LIB_MODEL_NAME, LORA_NAME)
    assert resolved is not None
    assert resolved.lora_name == LORA_NAME

    # The adapter must have been materialised under the library's cache dir.
    assert LIB_DOWNLOAD_DIR in resolved.lora_path
    adapter_files = os.listdir(resolved.lora_path)
    assert "adapter_config.json" in adapter_files
@pytest.mark.asyncio
async def test_hf_resolver_with_multiple_repos():
    """Resolve a nested adapter when the resolver is given two repos."""
    allowed_repos = [LORA_LIB, LORA_REPO]
    resolver = HfHubResolver(allowed_repos)
    assert resolver is not None

    resolved = await resolver.resolve_lora(LORA_LIB_MODEL_NAME, LORA_NAME)
    assert resolved is not None
    assert resolved.lora_name == LORA_NAME

    # The adapter must come from the library repo's cache directory.
    assert LIB_DOWNLOAD_DIR in resolved.lora_path
    adapter_files = os.listdir(resolved.lora_path)
    assert "adapter_config.json" in adapter_files
@pytest.mark.asyncio
async def test_missing_adapter():
    """An adapter name that matches nothing must resolve to None."""
    resolver = HfHubResolver([LORA_LIB])
    assert resolver is not None

    unknown_adapter = "foobar"
    result = await resolver.resolve_lora(LORA_LIB_MODEL_NAME, unknown_adapter)
    assert result is None
@pytest.mark.asyncio
async def test_nonlora_adapter():
    """A repo path that points at README.md must resolve to None."""
    resolver = HfHubResolver([LORA_LIB])
    assert resolver is not None

    result = await resolver.resolve_lora(LORA_LIB_MODEL_NAME, NON_LORA_SUBPATH)
    assert result is None
@pytest.mark.asyncio
async def test_invalid_repo():
    """A LoRA name under INVALID_REPO_NAME must resolve to None."""
    resolver = HfHubResolver([LORA_LIB])
    assert resolver is not None

    bogus_lora_name = f"{INVALID_REPO_NAME}/foo"
    result = await resolver.resolve_lora(INVALID_REPO_NAME, bogus_lora_name)
    assert result is None
@pytest.mark.asyncio
async def test_trailing_slash():
    """A trailing slash on the adapter name must not break resolution."""
    resolver = HfHubResolver([LORA_LIB])
    assert resolver is not None

    name_with_slash = f"{LORA_NAME}/"
    resolved = await resolver.resolve_lora(LORA_LIB_MODEL_NAME, name_with_slash)
    assert resolved is not None

    # The request keeps the name exactly as it was passed in, slash included.
    assert resolved.lora_name == f"{LORA_NAME}/"
    assert LIB_DOWNLOAD_DIR in resolved.lora_path
    adapter_files = os.listdir(resolved.lora_path)
    assert "adapter_config.json" in adapter_files
...@@ -36,7 +36,7 @@ class MyGemma2Embedding(nn.Module): ...@@ -36,7 +36,7 @@ class MyGemma2Embedding(nn.Module):
def forward( def forward(
self, self,
input_ids: torch.Tensor | None, input_ids: torch.Tensor,
positions: torch.Tensor, positions: torch.Tensor,
intermediate_tensors: IntermediateTensors | None = None, intermediate_tensors: IntermediateTensors | None = None,
inputs_embeds: torch.Tensor | None = None, inputs_embeds: torch.Tensor | None = None,
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Tests for the UvicornAccessLogFilter class.
"""
import logging
from vllm.logging_utils.access_log_filter import (
UvicornAccessLogFilter,
create_uvicorn_log_config,
)
class TestUvicornAccessLogFilter:
    """Test cases for UvicornAccessLogFilter."""

    @staticmethod
    def _make_record(
        method: str, path: str, status_code: int = 200
    ) -> logging.LogRecord:
        """Build a ``uvicorn.access`` record for one synthetic HTTP request.

        The message template and the 5-tuple of args
        (client address, method, path, HTTP version, status) are the same
        ones every test in this class feeds to the filter.
        """
        return logging.LogRecord(
            name="uvicorn.access",
            level=logging.INFO,
            pathname="",
            lineno=0,
            msg='%s - "%s %s HTTP/%s" %d',
            args=("127.0.0.1:12345", method, path, "1.1", status_code),
            exc_info=None,
        )

    def test_filter_allows_all_when_no_excluded_paths(self):
        """Filter should allow all logs when no paths are excluded."""
        log_filter = UvicornAccessLogFilter(excluded_paths=[])
        record = self._make_record("GET", "/v1/completions")

        assert log_filter.filter(record) is True

    def test_filter_allows_all_when_excluded_paths_is_none(self):
        """Filter should allow all logs when excluded_paths is None."""
        log_filter = UvicornAccessLogFilter(excluded_paths=None)
        record = self._make_record("GET", "/health")

        assert log_filter.filter(record) is True

    def test_filter_excludes_health_endpoint(self):
        """Filter should exclude /health endpoint when configured."""
        log_filter = UvicornAccessLogFilter(excluded_paths=["/health"])
        record = self._make_record("GET", "/health")

        assert log_filter.filter(record) is False

    def test_filter_excludes_metrics_endpoint(self):
        """Filter should exclude /metrics endpoint when configured."""
        log_filter = UvicornAccessLogFilter(excluded_paths=["/metrics"])
        record = self._make_record("GET", "/metrics")

        assert log_filter.filter(record) is False

    def test_filter_allows_non_excluded_endpoints(self):
        """Filter should allow endpoints not in the excluded list."""
        log_filter = UvicornAccessLogFilter(excluded_paths=["/health", "/metrics"])
        record = self._make_record("POST", "/v1/completions")

        assert log_filter.filter(record) is True

    def test_filter_excludes_multiple_endpoints(self):
        """Filter should exclude multiple configured endpoints."""
        log_filter = UvicornAccessLogFilter(
            excluded_paths=["/health", "/metrics", "/ping"]
        )

        # Each configured path is checked in turn.
        for path in ("/health", "/metrics", "/ping"):
            record = self._make_record("GET", path)
            assert log_filter.filter(record) is False

    def test_filter_with_query_parameters(self):
        """Filter should exclude endpoints even with query parameters."""
        log_filter = UvicornAccessLogFilter(excluded_paths=["/health"])
        record = self._make_record("GET", "/health?verbose=true")

        assert log_filter.filter(record) is False

    def test_filter_different_http_methods(self):
        """Filter should exclude endpoints regardless of HTTP method."""
        log_filter = UvicornAccessLogFilter(excluded_paths=["/ping"])

        for method in ("GET", "POST"):
            record = self._make_record(method, "/ping")
            assert log_filter.filter(record) is False

    def test_filter_with_different_status_codes(self):
        """Filter should exclude endpoints regardless of status code."""
        log_filter = UvicornAccessLogFilter(excluded_paths=["/health"])

        for status_code in [200, 500, 503]:
            record = self._make_record("GET", "/health", status_code)
            assert log_filter.filter(record) is False
class TestCreateUvicornLogConfig:
    """Test cases for create_uvicorn_log_config function."""

    def test_creates_valid_config_structure(self):
        """Config should have required logging configuration keys."""
        log_config = create_uvicorn_log_config(excluded_paths=["/health"])

        required_keys = (
            "version",
            "disable_existing_loggers",
            "formatters",
            "handlers",
            "loggers",
            "filters",
        )
        for key in required_keys:
            assert key in log_config
        assert log_config["version"] == 1

    def test_config_includes_access_log_filter(self):
        """Config should include the access log filter."""
        excluded = ["/health", "/metrics"]
        log_config = create_uvicorn_log_config(excluded_paths=excluded)

        filters = log_config["filters"]
        assert "access_log_filter" in filters

        # "()" holds the filter class and the excluded paths ride alongside it.
        access_filter = filters["access_log_filter"]
        assert access_filter["()"] == UvicornAccessLogFilter
        assert access_filter["excluded_paths"] == ["/health", "/metrics"]

    def test_config_applies_filter_to_access_handler(self):
        """Config should apply the filter to the access handler."""
        log_config = create_uvicorn_log_config(excluded_paths=["/health"])

        handlers = log_config["handlers"]
        assert "access" in handlers
        assert "filters" in handlers["access"]
        assert "access_log_filter" in handlers["access"]["filters"]

    def test_config_with_custom_log_level(self):
        """Config should respect custom log level."""
        log_config = create_uvicorn_log_config(
            excluded_paths=["/health"], log_level="debug"
        )

        # The lowercase input is expected back upper-cased on every logger.
        for logger_name in ("uvicorn", "uvicorn.access", "uvicorn.error"):
            assert log_config["loggers"][logger_name]["level"] == "DEBUG"

    def test_config_with_empty_excluded_paths(self):
        """Config should work with empty excluded paths."""
        log_config = create_uvicorn_log_config(excluded_paths=[])

        access_filter = log_config["filters"]["access_log_filter"]
        assert access_filter["excluded_paths"] == []

    def test_config_with_none_excluded_paths(self):
        """Config should work with None excluded paths."""
        log_config = create_uvicorn_log_config(excluded_paths=None)

        # None is expected to be normalised to an empty list.
        access_filter = log_config["filters"]["access_log_filter"]
        assert access_filter["excluded_paths"] == []
class TestIntegration:
"""Integration tests for the access log filter."""
def test_filter_with_real_logger(self):
    """Test filter works with a real Python logger simulating uvicorn.

    Four access-log lines are emitted through the ``uvicorn.access`` logger;
    only the two whose paths are not excluded may reach the handler. The
    logger is process-global, so its handlers and level are restored
    afterwards to keep this test from leaking state into others.
    """
    logger = logging.getLogger("uvicorn.access")
    saved_handlers = logger.handlers[:]
    saved_level = logger.level

    # Create a custom handler that tracks messages
    logged_messages: list[str] = []

    class TrackingHandler(logging.Handler):
        def emit(self, record):
            logged_messages.append(record.getMessage())

    handler = TrackingHandler()
    handler.setLevel(logging.INFO)
    handler.addFilter(
        UvicornAccessLogFilter(excluded_paths=["/health", "/metrics"])
    )

    # (method, path) pairs, logged in this order.
    requests = (
        ("GET", "/health"),
        ("GET", "/v1/completions"),
        ("GET", "/metrics"),
        ("POST", "/v1/chat/completions"),
    )

    try:
        logger.setLevel(logging.INFO)
        # Clear any existing handlers so only the tracking handler is attached
        logger.handlers = []
        logger.addHandler(handler)

        # Log using uvicorn's format with args tuple
        # Format: '%s - "%s %s HTTP/%s" %d'
        for method, path in requests:
            logger.info(
                '%s - "%s %s HTTP/%s" %d',
                "127.0.0.1:12345",
                method,
                path,
                "1.1",
                200,
            )
    finally:
        # Put the shared logger back exactly as it was found.
        logger.handlers = saved_handlers
        logger.setLevel(saved_level)

    # Verify only non-excluded endpoints were logged
    assert len(logged_messages) == 2
    assert "/v1/completions" in logged_messages[0]
    assert "/v1/chat/completions" in logged_messages[1]
def test_filter_allows_non_uvicorn_access_logs(self):
"""Test filter allows logs from non-uvicorn.access loggers."""
filter = UvicornAccessLogFilter(excluded_paths=["/health"])
# Log record from a different logger name
record = logging.LogRecord(
name="uvicorn.error",
level=logging.INFO,
pathname="",
lineno=0,
msg="Some error message about /health",
args=(),
exc_info=None,
)
# Should allow because it's not from uvicorn.access
assert filter.filter(record) is True
def test_filter_handles_malformed_args(self):
"""Test filter handles log records with unexpected args format."""
filter = UvicornAccessLogFilter(excluded_paths=["/health"])
# Log record with insufficient args
record = logging.LogRecord(
name="uvicorn.access",
level=logging.INFO,
pathname="",
lineno=0,
msg="Some message",
args=("only", "two"),
exc_info=None,
)
# Should allow because args doesn't have expected format
assert filter.filter(record) is True
def test_filter_handles_non_tuple_args(self):
"""Test filter handles log records with non-tuple args."""
filter = UvicornAccessLogFilter(excluded_paths=["/health"])
# Log record with None args
record = logging.LogRecord(
name="uvicorn.access",
level=logging.INFO,
pathname="",
lineno=0,
msg="Some message without args",
args=None,
exc_info=None,
)
# Should allow because args is None
assert filter.filter(record) is True
...@@ -455,7 +455,7 @@ def test_eagle_correctness( ...@@ -455,7 +455,7 @@ def test_eagle_correctness(
from packaging.version import Version from packaging.version import Version
installed = Version(transformers.__version__) installed = Version(transformers.__version__)
required = Version("5.0.0") required = Version("5.0.0.dev")
if installed < required: if installed < required:
pytest.skip( pytest.skip(
"Eagle3 with the Transformers modeling backend requires " "Eagle3 with the Transformers modeling backend requires "
......
...@@ -112,13 +112,6 @@ def create_vllm_config( ...@@ -112,13 +112,6 @@ def create_vllm_config(
enable_chunked_prefill=enable_chunked_prefill, enable_chunked_prefill=enable_chunked_prefill,
is_encoder_decoder=model_config.is_encoder_decoder, is_encoder_decoder=model_config.is_encoder_decoder,
) )
scheduler_config = SchedulerConfig(
max_num_seqs=max_num_seqs,
max_num_batched_tokens=max_num_batched_tokens,
max_model_len=max_model_len,
enable_chunked_prefill=enable_chunked_prefill,
is_encoder_decoder=model_config.is_encoder_decoder,
)
# Cache config, optionally force APC # Cache config, optionally force APC
cache_config = CacheConfig( cache_config = CacheConfig(
block_size=block_size, block_size=block_size,
......
...@@ -372,8 +372,6 @@ def test_load_model( ...@@ -372,8 +372,6 @@ def test_load_model(
all_indx_layers: dict[str, mock.MagicMock] = {} all_indx_layers: dict[str, mock.MagicMock] = {}
all_indx_layers: dict[str, mock.MagicMock] = {}
# Make mock_get_layers return different values for each call # Make mock_get_layers return different values for each call
mock_get_layers.side_effect = [ mock_get_layers.side_effect = [
target_attn_layers, target_attn_layers,
......
...@@ -2831,13 +2831,13 @@ if hasattr(torch.ops._C, "int8_scaled_mm_with_quant"): ...@@ -2831,13 +2831,13 @@ if hasattr(torch.ops._C, "int8_scaled_mm_with_quant"):
class CPUDNNLGEMMHandler: class CPUDNNLGEMMHandler:
def __init__(self) -> None: def __init__(self) -> None:
self.handler_tensor: torch.Tensor | None = None self.handler: int | None = None
self.n = -1 self.n = -1
self.k = -1 self.k = -1
def __del__(self): def __del__(self):
if self.handler_tensor is not None: if self.handler is not None:
torch.ops._C.release_dnnl_matmul_handler(self.handler_tensor.item()) torch.ops._C.release_dnnl_matmul_handler(self.handler)
_supports_onednn = bool(hasattr(torch.ops._C, "create_onednn_mm_handler")) _supports_onednn = bool(hasattr(torch.ops._C, "create_onednn_mm_handler"))
...@@ -2853,10 +2853,8 @@ def create_onednn_mm( ...@@ -2853,10 +2853,8 @@ def create_onednn_mm(
) -> CPUDNNLGEMMHandler: ) -> CPUDNNLGEMMHandler:
handler = CPUDNNLGEMMHandler() handler = CPUDNNLGEMMHandler()
handler.k, handler.n = weight.size() handler.k, handler.n = weight.size()
# store the handler pointer in a tensor it doesn't get inlined handler.handler = torch.ops._C.create_onednn_mm_handler(
handler.handler_tensor = torch.tensor( weight, primitive_cache_size
torch.ops._C.create_onednn_mm_handler(weight, primitive_cache_size),
dtype=torch.int64,
) )
return handler return handler
...@@ -2884,17 +2882,8 @@ def create_onednn_scaled_mm( ...@@ -2884,17 +2882,8 @@ def create_onednn_scaled_mm(
) -> CPUDNNLGEMMHandler: ) -> CPUDNNLGEMMHandler:
handler = CPUDNNLGEMMHandler() handler = CPUDNNLGEMMHandler()
handler.k, handler.n = weight.size() handler.k, handler.n = weight.size()
# store the handler pointer in a tensor so it doesn't get inlined handler.handler = torch.ops._C.create_onednn_scaled_mm_handler(
handler.handler_tensor = torch.tensor( weight, weight_scales, output_type, dynamic_quant, use_azp, primitive_cache_size
torch.ops._C.create_onednn_scaled_mm_handler(
weight,
weight_scales,
output_type,
dynamic_quant,
use_azp,
primitive_cache_size,
),
dtype=torch.int64,
) )
return handler return handler
...@@ -2947,13 +2936,7 @@ def onednn_scaled_mm( ...@@ -2947,13 +2936,7 @@ def onednn_scaled_mm(
bias: torch.Tensor | None, bias: torch.Tensor | None,
) -> torch.Tensor: ) -> torch.Tensor:
torch.ops._C.onednn_scaled_mm( torch.ops._C.onednn_scaled_mm(
output, output, x, input_scale, input_zp, input_zp_adj, bias, dnnl_handler.handler
x,
input_scale,
input_zp,
input_zp_adj,
bias,
dnnl_handler.handler_tensor,
) )
return output return output
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment