Commit c721b814 authored by zhuwenwen's avatar zhuwenwen
Browse files

sync v0.15.1

parent d53fe7e5
......@@ -104,16 +104,18 @@ def test_flash_mla(
descale_k = None
def flash_mla():
return flash_mla_with_kvcache(q,
blocked_k,
block_table,
cache_seqlens,
dv,
tile_scheduler_metadata,
num_splits,
causal=causal,
descale_q=descale_q,
descale_k=descale_k)
return flash_mla_with_kvcache(
q,
blocked_k,
block_table,
cache_seqlens,
dv,
tile_scheduler_metadata,
num_splits,
causal=causal,
descale_q=descale_q,
descale_k=descale_k,
)
def scaled_dot_product_attention(query, key, value, is_causal=False):
query = query.float()
......
......@@ -22,9 +22,6 @@ from vllm.distributed import (
)
from vllm.forward_context import set_forward_context
from vllm.model_executor.layers.fused_moe import fused_topk
from vllm.model_executor.layers.fused_moe.all2all_utils import (
maybe_make_prepare_finalize,
)
from vllm.model_executor.layers.fused_moe.config import (
FusedMoEConfig,
FusedMoEParallelConfig,
......@@ -43,6 +40,7 @@ from .mk_objects import (
TestMoEQuantConfig,
expert_info,
make_fused_experts,
make_prepare_finalize,
prepare_finalize_info,
)
from .parallel_utils import ProcessGroupInfo
......@@ -605,10 +603,9 @@ def make_modular_kernel(
routing_method=RoutingMethodType.DeepSeekV3,
)
prepare_finalize = maybe_make_prepare_finalize(
moe=moe,
quant_config=quant_config,
allow_new_interface=True,
# make modular kernel
prepare_finalize = make_prepare_finalize(
config.prepare_finalize_type, config.all2all_backend(), moe, quant_config
)
assert prepare_finalize is not None
......
......@@ -7,6 +7,9 @@ import torch
# Fused experts and PrepareFinalize imports
import vllm.model_executor.layers.fused_moe.modular_kernel as mk
from vllm.model_executor.layers.fused_moe import TritonExperts
from vllm.model_executor.layers.fused_moe.all2all_utils import (
maybe_make_prepare_finalize,
)
from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
BatchedDeepGemmExperts,
)
......@@ -252,12 +255,13 @@ if has_pplx():
)
if has_flashinfer_cutlass_fused_moe() and current_platform.has_device_capability(100):
from vllm.model_executor.layers.fused_moe.flashinfer_a2a_prepare_finalize import ( # noqa: E501
FlashInferCutlassMoEPrepareAndFinalize,
)
from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import (
FlashInferExperts,
)
from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_prepare_finalize import ( # noqa: E501
FlashInferCutlassMoEPrepareAndFinalize,
create_flashinfer_prepare_finalize,
)
register_prepare_and_finalize(
FlashInferCutlassMoEPrepareAndFinalize,
......@@ -425,6 +429,24 @@ if cutlass_fp4_supported() or has_flashinfer_cutlass_fused_moe():
]
def make_prepare_finalize(
prepare_finalize_type: mk.FusedMoEPrepareAndFinalize,
backend: str | None,
moe: FusedMoEConfig,
quant_config: FusedMoEQuantConfig,
) -> mk.FusedMoEPrepareAndFinalize:
if backend != "naive" and backend is not None:
prepare_finalize = maybe_make_prepare_finalize(moe, quant_config)
assert prepare_finalize is not None
return prepare_finalize
elif prepare_finalize_type == FlashInferCutlassMoEPrepareAndFinalize:
return create_flashinfer_prepare_finalize(
use_dp=moe.moe_parallel_config.dp_size > 1
)
else:
return MoEPrepareAndFinalizeNoEP()
def _slice(rank: int, num_local_experts: int, t: torch.Tensor) -> torch.Tensor:
s = rank * num_local_experts
e = s + num_local_experts
......
......@@ -294,7 +294,12 @@ def test_flashinfer_cutlass_moe_fp8_no_graph(
)
kernel = mk.FusedMoEModularKernel(
MoEPrepareAndFinalizeNoEP(),
MoEPrepareAndFinalizeNoEP(
defer_input_quant=FlashInferExperts.expects_unquantized_inputs(
moe_config=moe_config,
quant_config=quant_config,
)
),
FlashInferExperts(
moe_config=moe_config,
quant_config=quant_config,
......
......@@ -106,7 +106,12 @@ def test_flashinfer_fp4_moe_no_graph(
)
flashinfer_experts = FusedMoEModularKernel(
MoEPrepareAndFinalizeNoEP(),
MoEPrepareAndFinalizeNoEP(
defer_input_quant=FlashInferExperts.expects_unquantized_inputs(
moe_config=moe_config,
quant_config=quant_config,
)
),
FlashInferExperts(moe_config=moe_config, quant_config=quant_config),
)
......
......@@ -90,7 +90,7 @@ def test_cutlass_fp4_moe_no_graph(
)
kernel = mk.FusedMoEModularKernel(
MoEPrepareAndFinalizeNoEP(),
MoEPrepareAndFinalizeNoEP(defer_input_quant=True),
CutlassExpertsFp4(
moe_config=make_dummy_moe_config(),
quant_config=quant_config,
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import random
import numpy as np
import pytest
import torch
from transformers import AutoModelForTokenClassification
......@@ -11,20 +9,6 @@ from tests.models.utils import softmax
from vllm.platforms import current_platform
@pytest.fixture(autouse=True)
def seed_everything():
"""Seed all random number generators for reproducibility."""
seed = 0
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
if torch.cuda.is_available():
torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
yield
@pytest.mark.parametrize("model", ["boltuix/NeuroBERT-NER"])
# The float32 is required for this tiny model to pass the test.
@pytest.mark.parametrize("dtype", ["float"])
......@@ -68,7 +52,6 @@ def test_bert_models(
@pytest.mark.parametrize("model", ["disham993/electrical-ner-ModernBERT-base"])
@pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.flaky(reruns=3)
@torch.inference_mode
def test_modernbert_models(
hf_runner,
......@@ -77,14 +60,6 @@ def test_modernbert_models(
model: str,
dtype: str,
) -> None:
# NOTE: https://github.com/vllm-project/vllm/pull/32403
# `disham993/electrical-ner-ModernBERT-base` is a randomly initialized
# model, which can cause numerical precision variance and edge cases.
# We use @flaky(reruns=3) to mitigate intermittent failures.
print(
f"\n[NOTE] Testing {model} (randomly initialized weights) - "
"flaky tolerance enabled due to numerical precision variance."
)
with vllm_runner(model, max_model_len=None, dtype=dtype) as vllm_model:
vllm_outputs = vllm_model.token_classify(example_prompts)
......
......@@ -458,20 +458,6 @@ VLM_TEST_SETTINGS = {
],
marks=[large_gpu_mark(min_gb=32)],
),
"glm_ocr": VLMTestInfo(
models=["zai-org/GLM-OCR"],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: f"[gMASK]<|user|>\n{img_prompt}<|assistant|>\n", # noqa: E501
img_idx_to_prompt=lambda idx: "<|begin_of_image|><|image|><|end_of_image|>",
video_idx_to_prompt=lambda idx: "<|begin_of_video|><|video|><|end_of_video|>",
max_model_len=2048,
max_num_seqs=2,
get_stop_token_ids=lambda tok: [151329, 151336, 151338],
num_logprobs=10,
image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
auto_cls=AutoModelForImageTextToText,
marks=[large_gpu_mark(min_gb=32)],
),
"h2ovl": VLMTestInfo(
models=[
"h2oai/h2ovl-mississippi-800m",
......@@ -587,21 +573,6 @@ VLM_TEST_SETTINGS = {
vllm_output_post_proc=model_utils.kimiv_vl_vllm_to_hf_output,
marks=[large_gpu_mark(min_gb=48)],
),
"llama4": VLMTestInfo(
models=["meta-llama/Llama-4-Scout-17B-16E-Instruct"],
prompt_formatter=lambda img_prompt: f"<|begin_of_text|><|header_start|>user<|header_end|>\n\n{img_prompt}<|eot|><|header_start|>assistant<|header_end|>\n\n", # noqa: E501
img_idx_to_prompt=lambda _: "<|image|>",
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
distributed_executor_backend="mp",
image_size_factors=[(0.25, 0.5, 1.0)],
hf_model_kwargs={"device_map": "auto"},
max_model_len=8192,
max_num_seqs=4,
dtype="bfloat16",
auto_cls=AutoModelForImageTextToText,
tensor_parallel_size=4,
marks=multi_gpu_marks(num_gpus=4),
),
"llava_next": VLMTestInfo(
models=["llava-hf/llava-v1.6-mistral-7b-hf"],
test_type=(VLMTestType.IMAGE, VLMTestType.CUSTOM_INPUTS),
......
......@@ -91,19 +91,6 @@ MODEL_CONFIGS: dict[str, dict[str, Any]] = {
"use_processor": True,
"question": "What is the content of each image?",
},
"glm_ocr": {
"model_name": "zai-org/GLM-OCR",
"interface": "llm_generate",
"max_model_len": 131072,
"max_num_seqs": 2,
"sampling_params": {
"temperature": 0.0,
"max_tokens": 256,
"stop_token_ids": None,
},
"use_processor": True,
"question": "Text Recognition:",
},
"keye_vl": {
"model_name": "Kwai-Keye/Keye-VL-8B-Preview",
"interface": "llm_generate",
......
......@@ -122,7 +122,6 @@ MM_DATA_PATCHES = {
"ernie4_5_moe_vl": qwen3_vl_patch_mm_data,
"glm4v": glm4_1v_patch_mm_data,
"glm4v_moe": glm4_1v_patch_mm_data,
"glm_ocr": glm4_1v_patch_mm_data,
"glmasr": glmasr_patch_mm_data,
"molmo2": qwen3_vl_patch_mm_data,
"qwen3_vl": qwen3_vl_patch_mm_data,
......
......@@ -256,7 +256,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
),
"Exaone4ForCausalLM": _HfExamplesInfo("LGAI-EXAONE/EXAONE-4.0-32B"),
"ExaoneMoEForCausalLM": _HfExamplesInfo(
"LGAI-EXAONE/K-EXAONE-236B-A23B", min_transformers_version="5.1.0"
"LGAI-EXAONE/K-EXAONE-236B-A23B", min_transformers_version="5.0.0"
),
"Fairseq2LlamaForCausalLM": _HfExamplesInfo("mgleize/fairseq2-dummy-Llama-3.2-1B"),
"FalconForCausalLM": _HfExamplesInfo("tiiuae/falcon-7b"),
......@@ -273,7 +273,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
"Glm4MoeForCausalLM": _HfExamplesInfo("zai-org/GLM-4.5"),
"Glm4MoeLiteForCausalLM": _HfExamplesInfo(
"zai-org/GLM-4.7-Flash",
min_transformers_version="5.0.0",
min_transformers_version="5.0.0.dev",
is_available_online=False,
),
"GPT2LMHeadModel": _HfExamplesInfo("openai-community/gpt2", {"alias": "gpt2"}),
"GPTBigCodeForCausalLM": _HfExamplesInfo(
......@@ -653,7 +654,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
# [Decoder-only]
"AriaForConditionalGeneration": _HfExamplesInfo("rhymes-ai/Aria"),
"AudioFlamingo3ForConditionalGeneration": _HfExamplesInfo(
"nvidia/audio-flamingo-3-hf", min_transformers_version="5.0.0"
"nvidia/audio-flamingo-3-hf", min_transformers_version="5.0.0.dev"
),
"AyaVisionForConditionalGeneration": _HfExamplesInfo("CohereLabs/aya-vision-8b"),
"BagelForConditionalGeneration": _HfExamplesInfo("ByteDance-Seed/BAGEL-7B-MoT"),
......@@ -696,7 +697,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"GlmAsrForConditionalGeneration": _HfExamplesInfo(
"zai-org/GLM-ASR-Nano-2512",
trust_remote_code=True,
min_transformers_version="5.0.0",
min_transformers_version="5.0",
),
"GraniteVision": _HfExamplesInfo("ibm-granite/granite-vision-3.3-2b"),
"GraniteSpeechForConditionalGeneration": _HfExamplesInfo(
......@@ -709,11 +710,6 @@ _MULTIMODAL_EXAMPLE_MODELS = {
),
"Glm4vForConditionalGeneration": _HfExamplesInfo("zai-org/GLM-4.1V-9B-Thinking"),
"Glm4vMoeForConditionalGeneration": _HfExamplesInfo("zai-org/GLM-4.5V"),
"GlmOcrForConditionalGeneration": _HfExamplesInfo(
"zai-org/GLM-OCR",
is_available_online=False,
min_transformers_version="5.1.0",
),
"H2OVLChatModel": _HfExamplesInfo(
"h2oai/h2ovl-mississippi-800m",
trust_remote_code=True,
......@@ -1056,7 +1052,7 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = {
"ExaoneMoeMTP": _HfExamplesInfo(
"LGAI-EXAONE/K-EXAONE-236B-A23B",
speculative_model="LGAI-EXAONE/K-EXAONE-236B-A23B",
min_transformers_version="5.1.0",
min_transformers_version="5.0.0",
),
"Glm4MoeMTPModel": _HfExamplesInfo(
"zai-org/GLM-4.5",
......@@ -1067,12 +1063,6 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = {
speculative_model="zai-org/GLM-4.7-Flash",
min_transformers_version="5.0.0",
),
"GlmOcrMTPModel": _HfExamplesInfo(
"zai-org/GLM-OCR",
speculative_model="zai-org/GLM-OCR",
is_available_online=False,
min_transformers_version="5.1.0",
),
"LongCatFlashMTPModel": _HfExamplesInfo(
"meituan-longcat/LongCat-Flash-Chat",
trust_remote_code=True,
......@@ -1104,27 +1094,27 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = {
_TRANSFORMERS_BACKEND_MODELS = {
"TransformersEmbeddingModel": _HfExamplesInfo(
"BAAI/bge-base-en-v1.5", min_transformers_version="5.0.0"
"BAAI/bge-base-en-v1.5", min_transformers_version="5.0.0.dev"
),
"TransformersForSequenceClassification": _HfExamplesInfo(
"papluca/xlm-roberta-base-language-detection",
min_transformers_version="5.0.0",
min_transformers_version="5.0.0.dev",
),
"TransformersForCausalLM": _HfExamplesInfo(
"hmellor/Ilama-3.2-1B", trust_remote_code=True
),
"TransformersMultiModalForCausalLM": _HfExamplesInfo("BAAI/Emu3-Chat-hf"),
"TransformersMoEForCausalLM": _HfExamplesInfo(
"allenai/OLMoE-1B-7B-0924", min_transformers_version="5.0.0"
"allenai/OLMoE-1B-7B-0924", min_transformers_version="5.0.0.dev"
),
"TransformersMultiModalMoEForCausalLM": _HfExamplesInfo(
"Qwen/Qwen3-VL-30B-A3B-Instruct", min_transformers_version="5.0.0"
"Qwen/Qwen3-VL-30B-A3B-Instruct", min_transformers_version="5.0.0.dev"
),
"TransformersMoEEmbeddingModel": _HfExamplesInfo(
"Qwen/Qwen3-30B-A3B", min_transformers_version="5.0.0"
"Qwen/Qwen3-30B-A3B", min_transformers_version="5.0.0.dev"
),
"TransformersMoEForSequenceClassification": _HfExamplesInfo(
"Qwen/Qwen3-30B-A3B", min_transformers_version="5.0.0"
"Qwen/Qwen3-30B-A3B", min_transformers_version="5.0.0.dev"
),
"TransformersMultiModalEmbeddingModel": _HfExamplesInfo("google/gemma-3-4b-it"),
"TransformersMultiModalForSequenceClassification": _HfExamplesInfo(
......
......@@ -88,6 +88,7 @@ def can_initialize(
[10 * GiB_bytes],
)
scheduler_kv_cache_config = generate_scheduler_kv_cache_config(kv_cache_configs)
# gpu_blocks (> 0), cpu_blocks, scheduler_kv_cache_config
return 1, 0, scheduler_kv_cache_config
......
......@@ -78,7 +78,7 @@ def test_models(
from packaging.version import Version
installed = Version(transformers.__version__)
required = Version("5.0.0")
required = Version("5.0.0.dev")
if model == "allenai/OLMoE-1B-7B-0924" and installed < required:
pytest.skip(
"MoE models with the Transformers modeling backend require "
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
import pytest
from huggingface_hub.constants import HF_HUB_CACHE
from vllm.plugins.lora_resolvers.hf_hub_resolver import HfHubResolver
LORA_LIB_MODEL_NAME = "ibm-granite/granite-3.3-8b-instruct"
# Repo with multiple LoRAs contained in it
LORA_LIB = "ibm-granite/granite-3.3-8b-rag-agent-lib"
LORA_NAME = "ibm-granite/granite-3.3-8b-rag-agent-lib/answerability_prediction_lora" # noqa: E501
NON_LORA_SUBPATH = "ibm-granite/granite-3.3-8b-rag-agent-lib/README.md"
LIB_DOWNLOAD_DIR = os.path.join(
HF_HUB_CACHE, "models--ibm-granite--granite-3.3-8b-rag-agent-lib"
)
INVALID_REPO_NAME = "thisrepodoesnotexist"
# Repo with only one LoRA in the root dir
LORA_REPO_MODEL_NAME = "meta-llama/Llama-2-7b-hf"
LORA_REPO = "yard1/llama-2-7b-sql-lora-test"
REPO_DOWNLOAD_DIR = os.path.join(
HF_HUB_CACHE, "models--yard1--llama-2-7b-sql-lora-test"
)
@pytest.mark.asyncio
async def test_hf_resolver_with_direct_path():
hf_resolver = HfHubResolver([LORA_REPO])
assert hf_resolver is not None
lora_request = await hf_resolver.resolve_lora(LORA_REPO_MODEL_NAME, LORA_REPO)
assert lora_request.lora_name == LORA_REPO
assert REPO_DOWNLOAD_DIR in lora_request.lora_path
assert "adapter_config.json" in os.listdir(lora_request.lora_path)
@pytest.mark.asyncio
async def test_hf_resolver_with_nested_paths():
hf_resolver = HfHubResolver([LORA_LIB])
assert hf_resolver is not None
lora_request = await hf_resolver.resolve_lora(LORA_LIB_MODEL_NAME, LORA_NAME)
assert lora_request is not None
assert lora_request.lora_name == LORA_NAME
assert LIB_DOWNLOAD_DIR in lora_request.lora_path
assert "adapter_config.json" in os.listdir(lora_request.lora_path)
@pytest.mark.asyncio
async def test_hf_resolver_with_multiple_repos():
hf_resolver = HfHubResolver([LORA_LIB, LORA_REPO])
assert hf_resolver is not None
lora_request = await hf_resolver.resolve_lora(LORA_LIB_MODEL_NAME, LORA_NAME)
assert lora_request is not None
assert lora_request.lora_name == LORA_NAME
assert LIB_DOWNLOAD_DIR in lora_request.lora_path
assert "adapter_config.json" in os.listdir(lora_request.lora_path)
@pytest.mark.asyncio
async def test_missing_adapter():
hf_resolver = HfHubResolver([LORA_LIB])
assert hf_resolver is not None
missing_lora_request = await hf_resolver.resolve_lora(LORA_LIB_MODEL_NAME, "foobar")
assert missing_lora_request is None
@pytest.mark.asyncio
async def test_nonlora_adapter():
hf_resolver = HfHubResolver([LORA_LIB])
assert hf_resolver is not None
readme_request = await hf_resolver.resolve_lora(
LORA_LIB_MODEL_NAME, NON_LORA_SUBPATH
)
assert readme_request is None
@pytest.mark.asyncio
async def test_invalid_repo():
hf_resolver = HfHubResolver([LORA_LIB])
assert hf_resolver is not None
invalid_repo_req = await hf_resolver.resolve_lora(
INVALID_REPO_NAME,
f"{INVALID_REPO_NAME}/foo",
)
assert invalid_repo_req is None
@pytest.mark.asyncio
async def test_trailing_slash():
hf_resolver = HfHubResolver([LORA_LIB])
assert hf_resolver is not None
lora_request = await hf_resolver.resolve_lora(
LORA_LIB_MODEL_NAME,
f"{LORA_NAME}/",
)
assert lora_request is not None
assert lora_request.lora_name == f"{LORA_NAME}/"
assert LIB_DOWNLOAD_DIR in lora_request.lora_path
assert "adapter_config.json" in os.listdir(lora_request.lora_path)
......@@ -36,7 +36,7 @@ class MyGemma2Embedding(nn.Module):
def forward(
self,
input_ids: torch.Tensor | None,
input_ids: torch.Tensor,
positions: torch.Tensor,
intermediate_tensors: IntermediateTensors | None = None,
inputs_embeds: torch.Tensor | None = None,
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Tests for the UvicornAccessLogFilter class.
"""
import logging
from vllm.logging_utils.access_log_filter import (
UvicornAccessLogFilter,
create_uvicorn_log_config,
)
class TestUvicornAccessLogFilter:
"""Test cases for UvicornAccessLogFilter."""
def test_filter_allows_all_when_no_excluded_paths(self):
"""Filter should allow all logs when no paths are excluded."""
filter = UvicornAccessLogFilter(excluded_paths=[])
record = logging.LogRecord(
name="uvicorn.access",
level=logging.INFO,
pathname="",
lineno=0,
msg='%s - "%s %s HTTP/%s" %d',
args=("127.0.0.1:12345", "GET", "/v1/completions", "1.1", 200),
exc_info=None,
)
assert filter.filter(record) is True
def test_filter_allows_all_when_excluded_paths_is_none(self):
"""Filter should allow all logs when excluded_paths is None."""
filter = UvicornAccessLogFilter(excluded_paths=None)
record = logging.LogRecord(
name="uvicorn.access",
level=logging.INFO,
pathname="",
lineno=0,
msg='%s - "%s %s HTTP/%s" %d',
args=("127.0.0.1:12345", "GET", "/health", "1.1", 200),
exc_info=None,
)
assert filter.filter(record) is True
def test_filter_excludes_health_endpoint(self):
"""Filter should exclude /health endpoint when configured."""
filter = UvicornAccessLogFilter(excluded_paths=["/health"])
record = logging.LogRecord(
name="uvicorn.access",
level=logging.INFO,
pathname="",
lineno=0,
msg='%s - "%s %s HTTP/%s" %d',
args=("127.0.0.1:12345", "GET", "/health", "1.1", 200),
exc_info=None,
)
assert filter.filter(record) is False
def test_filter_excludes_metrics_endpoint(self):
"""Filter should exclude /metrics endpoint when configured."""
filter = UvicornAccessLogFilter(excluded_paths=["/metrics"])
record = logging.LogRecord(
name="uvicorn.access",
level=logging.INFO,
pathname="",
lineno=0,
msg='%s - "%s %s HTTP/%s" %d',
args=("127.0.0.1:12345", "GET", "/metrics", "1.1", 200),
exc_info=None,
)
assert filter.filter(record) is False
def test_filter_allows_non_excluded_endpoints(self):
"""Filter should allow endpoints not in the excluded list."""
filter = UvicornAccessLogFilter(excluded_paths=["/health", "/metrics"])
record = logging.LogRecord(
name="uvicorn.access",
level=logging.INFO,
pathname="",
lineno=0,
msg='%s - "%s %s HTTP/%s" %d',
args=("127.0.0.1:12345", "POST", "/v1/completions", "1.1", 200),
exc_info=None,
)
assert filter.filter(record) is True
def test_filter_excludes_multiple_endpoints(self):
"""Filter should exclude multiple configured endpoints."""
filter = UvicornAccessLogFilter(excluded_paths=["/health", "/metrics", "/ping"])
# Test /health
record_health = logging.LogRecord(
name="uvicorn.access",
level=logging.INFO,
pathname="",
lineno=0,
msg='%s - "%s %s HTTP/%s" %d',
args=("127.0.0.1:12345", "GET", "/health", "1.1", 200),
exc_info=None,
)
assert filter.filter(record_health) is False
# Test /metrics
record_metrics = logging.LogRecord(
name="uvicorn.access",
level=logging.INFO,
pathname="",
lineno=0,
msg='%s - "%s %s HTTP/%s" %d',
args=("127.0.0.1:12345", "GET", "/metrics", "1.1", 200),
exc_info=None,
)
assert filter.filter(record_metrics) is False
# Test /ping
record_ping = logging.LogRecord(
name="uvicorn.access",
level=logging.INFO,
pathname="",
lineno=0,
msg='%s - "%s %s HTTP/%s" %d',
args=("127.0.0.1:12345", "GET", "/ping", "1.1", 200),
exc_info=None,
)
assert filter.filter(record_ping) is False
def test_filter_with_query_parameters(self):
"""Filter should exclude endpoints even with query parameters."""
filter = UvicornAccessLogFilter(excluded_paths=["/health"])
record = logging.LogRecord(
name="uvicorn.access",
level=logging.INFO,
pathname="",
lineno=0,
msg='%s - "%s %s HTTP/%s" %d',
args=("127.0.0.1:12345", "GET", "/health?verbose=true", "1.1", 200),
exc_info=None,
)
assert filter.filter(record) is False
def test_filter_different_http_methods(self):
"""Filter should exclude endpoints regardless of HTTP method."""
filter = UvicornAccessLogFilter(excluded_paths=["/ping"])
# Test GET
record_get = logging.LogRecord(
name="uvicorn.access",
level=logging.INFO,
pathname="",
lineno=0,
msg='%s - "%s %s HTTP/%s" %d',
args=("127.0.0.1:12345", "GET", "/ping", "1.1", 200),
exc_info=None,
)
assert filter.filter(record_get) is False
# Test POST
record_post = logging.LogRecord(
name="uvicorn.access",
level=logging.INFO,
pathname="",
lineno=0,
msg='%s - "%s %s HTTP/%s" %d',
args=("127.0.0.1:12345", "POST", "/ping", "1.1", 200),
exc_info=None,
)
assert filter.filter(record_post) is False
def test_filter_with_different_status_codes(self):
"""Filter should exclude endpoints regardless of status code."""
filter = UvicornAccessLogFilter(excluded_paths=["/health"])
for status_code in [200, 500, 503]:
record = logging.LogRecord(
name="uvicorn.access",
level=logging.INFO,
pathname="",
lineno=0,
msg='%s - "%s %s HTTP/%s" %d',
args=("127.0.0.1:12345", "GET", "/health", "1.1", status_code),
exc_info=None,
)
assert filter.filter(record) is False
class TestCreateUvicornLogConfig:
"""Test cases for create_uvicorn_log_config function."""
def test_creates_valid_config_structure(self):
"""Config should have required logging configuration keys."""
config = create_uvicorn_log_config(excluded_paths=["/health"])
assert "version" in config
assert config["version"] == 1
assert "disable_existing_loggers" in config
assert "formatters" in config
assert "handlers" in config
assert "loggers" in config
assert "filters" in config
def test_config_includes_access_log_filter(self):
"""Config should include the access log filter."""
config = create_uvicorn_log_config(excluded_paths=["/health", "/metrics"])
assert "access_log_filter" in config["filters"]
filter_config = config["filters"]["access_log_filter"]
assert filter_config["()"] == UvicornAccessLogFilter
assert filter_config["excluded_paths"] == ["/health", "/metrics"]
def test_config_applies_filter_to_access_handler(self):
"""Config should apply the filter to the access handler."""
config = create_uvicorn_log_config(excluded_paths=["/health"])
assert "access" in config["handlers"]
assert "filters" in config["handlers"]["access"]
assert "access_log_filter" in config["handlers"]["access"]["filters"]
def test_config_with_custom_log_level(self):
"""Config should respect custom log level."""
config = create_uvicorn_log_config(
excluded_paths=["/health"], log_level="debug"
)
assert config["loggers"]["uvicorn"]["level"] == "DEBUG"
assert config["loggers"]["uvicorn.access"]["level"] == "DEBUG"
assert config["loggers"]["uvicorn.error"]["level"] == "DEBUG"
def test_config_with_empty_excluded_paths(self):
"""Config should work with empty excluded paths."""
config = create_uvicorn_log_config(excluded_paths=[])
assert config["filters"]["access_log_filter"]["excluded_paths"] == []
def test_config_with_none_excluded_paths(self):
"""Config should work with None excluded paths."""
config = create_uvicorn_log_config(excluded_paths=None)
assert config["filters"]["access_log_filter"]["excluded_paths"] == []
class TestIntegration:
"""Integration tests for the access log filter."""
def test_filter_with_real_logger(self):
"""Test filter works with a real Python logger simulating uvicorn."""
# Create a logger with our filter (simulating uvicorn.access)
logger = logging.getLogger("uvicorn.access")
logger.setLevel(logging.INFO)
# Clear any existing handlers
logger.handlers = []
# Create a custom handler that tracks messages
logged_messages: list[str] = []
class TrackingHandler(logging.Handler):
def emit(self, record):
logged_messages.append(record.getMessage())
handler = TrackingHandler()
handler.setLevel(logging.INFO)
filter = UvicornAccessLogFilter(excluded_paths=["/health", "/metrics"])
handler.addFilter(filter)
logger.addHandler(handler)
# Log using uvicorn's format with args tuple
# Format: '%s - "%s %s HTTP/%s" %d'
logger.info(
'%s - "%s %s HTTP/%s" %d',
"127.0.0.1:12345",
"GET",
"/health",
"1.1",
200,
)
logger.info(
'%s - "%s %s HTTP/%s" %d',
"127.0.0.1:12345",
"GET",
"/v1/completions",
"1.1",
200,
)
logger.info(
'%s - "%s %s HTTP/%s" %d',
"127.0.0.1:12345",
"GET",
"/metrics",
"1.1",
200,
)
logger.info(
'%s - "%s %s HTTP/%s" %d',
"127.0.0.1:12345",
"POST",
"/v1/chat/completions",
"1.1",
200,
)
# Verify only non-excluded endpoints were logged
assert len(logged_messages) == 2
assert "/v1/completions" in logged_messages[0]
assert "/v1/chat/completions" in logged_messages[1]
def test_filter_allows_non_uvicorn_access_logs(self):
"""Test filter allows logs from non-uvicorn.access loggers."""
filter = UvicornAccessLogFilter(excluded_paths=["/health"])
# Log record from a different logger name
record = logging.LogRecord(
name="uvicorn.error",
level=logging.INFO,
pathname="",
lineno=0,
msg="Some error message about /health",
args=(),
exc_info=None,
)
# Should allow because it's not from uvicorn.access
assert filter.filter(record) is True
def test_filter_handles_malformed_args(self):
"""Test filter handles log records with unexpected args format."""
filter = UvicornAccessLogFilter(excluded_paths=["/health"])
# Log record with insufficient args
record = logging.LogRecord(
name="uvicorn.access",
level=logging.INFO,
pathname="",
lineno=0,
msg="Some message",
args=("only", "two"),
exc_info=None,
)
# Should allow because args doesn't have expected format
assert filter.filter(record) is True
def test_filter_handles_non_tuple_args(self):
"""Test filter handles log records with non-tuple args."""
filter = UvicornAccessLogFilter(excluded_paths=["/health"])
# Log record with None args
record = logging.LogRecord(
name="uvicorn.access",
level=logging.INFO,
pathname="",
lineno=0,
msg="Some message without args",
args=None,
exc_info=None,
)
# Should allow because args is None
assert filter.filter(record) is True
......@@ -455,7 +455,7 @@ def test_eagle_correctness(
from packaging.version import Version
installed = Version(transformers.__version__)
required = Version("5.0.0")
required = Version("5.0.0.dev")
if installed < required:
pytest.skip(
"Eagle3 with the Transformers modeling backend requires "
......
......@@ -112,13 +112,6 @@ def create_vllm_config(
enable_chunked_prefill=enable_chunked_prefill,
is_encoder_decoder=model_config.is_encoder_decoder,
)
scheduler_config = SchedulerConfig(
max_num_seqs=max_num_seqs,
max_num_batched_tokens=max_num_batched_tokens,
max_model_len=max_model_len,
enable_chunked_prefill=enable_chunked_prefill,
is_encoder_decoder=model_config.is_encoder_decoder,
)
# Cache config, optionally force APC
cache_config = CacheConfig(
block_size=block_size,
......
......@@ -372,8 +372,6 @@ def test_load_model(
all_indx_layers: dict[str, mock.MagicMock] = {}
all_indx_layers: dict[str, mock.MagicMock] = {}
# Make mock_get_layers return different values for each call
mock_get_layers.side_effect = [
target_attn_layers,
......
......@@ -2831,13 +2831,13 @@ if hasattr(torch.ops._C, "int8_scaled_mm_with_quant"):
class CPUDNNLGEMMHandler:
def __init__(self) -> None:
self.handler_tensor: torch.Tensor | None = None
self.handler: int | None = None
self.n = -1
self.k = -1
def __del__(self):
if self.handler_tensor is not None:
torch.ops._C.release_dnnl_matmul_handler(self.handler_tensor.item())
if self.handler is not None:
torch.ops._C.release_dnnl_matmul_handler(self.handler)
_supports_onednn = bool(hasattr(torch.ops._C, "create_onednn_mm_handler"))
......@@ -2853,10 +2853,8 @@ def create_onednn_mm(
) -> CPUDNNLGEMMHandler:
handler = CPUDNNLGEMMHandler()
handler.k, handler.n = weight.size()
# store the handler pointer in a tensor it doesn't get inlined
handler.handler_tensor = torch.tensor(
torch.ops._C.create_onednn_mm_handler(weight, primitive_cache_size),
dtype=torch.int64,
handler.handler = torch.ops._C.create_onednn_mm_handler(
weight, primitive_cache_size
)
return handler
......@@ -2884,17 +2882,8 @@ def create_onednn_scaled_mm(
) -> CPUDNNLGEMMHandler:
handler = CPUDNNLGEMMHandler()
handler.k, handler.n = weight.size()
# store the handler pointer in a tensor so it doesn't get inlined
handler.handler_tensor = torch.tensor(
torch.ops._C.create_onednn_scaled_mm_handler(
weight,
weight_scales,
output_type,
dynamic_quant,
use_azp,
primitive_cache_size,
),
dtype=torch.int64,
handler.handler = torch.ops._C.create_onednn_scaled_mm_handler(
weight, weight_scales, output_type, dynamic_quant, use_azp, primitive_cache_size
)
return handler
......@@ -2947,13 +2936,7 @@ def onednn_scaled_mm(
bias: torch.Tensor | None,
) -> torch.Tensor:
torch.ops._C.onednn_scaled_mm(
output,
x,
input_scale,
input_zp,
input_zp_adj,
bias,
dnnl_handler.handler_tensor,
output, x, input_scale, input_zp, input_zp_adj, bias, dnnl_handler.handler
)
return output
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment