Commit dcb5624a authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.8.5' into v0.8.5-dev

parents 55880ca2 ba41cc90
# SPDX-License-Identifier: Apache-2.0
from contextlib import suppress
from dataclasses import dataclass, field
from http import HTTPStatus
from typing import Optional
from unittest.mock import MagicMock
import pytest
from vllm.config import MultiModalConfig
from vllm.engine.multiprocessing.client import MQLLMEngineClient
from vllm.entrypoints.openai.protocol import CompletionRequest, ErrorResponse
from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
from vllm.entrypoints.openai.serving_models import (BaseModelPath,
OpenAIServingModels)
from vllm.lora.request import LoRARequest
from vllm.lora.resolver import LoRAResolver, LoRAResolverRegistry
from vllm.transformers_utils.tokenizer import get_tokenizer
MODEL_NAME = "openai-community/gpt2"
BASE_MODEL_PATHS = [BaseModelPath(name=MODEL_NAME, model_path=MODEL_NAME)]
MOCK_RESOLVER_NAME = "mock_test_resolver"
@dataclass
class MockHFConfig:
model_type: str = "any"
@dataclass
class MockModelConfig:
"""Minimal mock ModelConfig for testing."""
model: str = MODEL_NAME
tokenizer: str = MODEL_NAME
trust_remote_code: bool = False
tokenizer_mode: str = "auto"
max_model_len: int = 100
tokenizer_revision: Optional[str] = None
multimodal_config: MultiModalConfig = field(
default_factory=MultiModalConfig)
hf_config: MockHFConfig = field(default_factory=MockHFConfig)
logits_processor_pattern: Optional[str] = None
diff_sampling_param: Optional[dict] = None
allowed_local_media_path: str = ""
encoder_config = None
generation_config: str = "auto"
def get_diff_sampling_param(self):
return self.diff_sampling_param or {}
class MockLoRAResolver(LoRAResolver):
async def resolve_lora(self, base_model_name: str,
lora_name: str) -> Optional[LoRARequest]:
if lora_name == "test-lora":
return LoRARequest(lora_name="test-lora",
lora_int_id=1,
lora_local_path="/fake/path/test-lora")
elif lora_name == "invalid-lora":
return LoRARequest(lora_name="invalid-lora",
lora_int_id=2,
lora_local_path="/fake/path/invalid-lora")
return None
@pytest.fixture(autouse=True)
def register_mock_resolver():
"""Fixture to register and unregister the mock LoRA resolver."""
resolver = MockLoRAResolver()
LoRAResolverRegistry.register_resolver(MOCK_RESOLVER_NAME, resolver)
yield
# Cleanup: remove the resolver after the test runs
if MOCK_RESOLVER_NAME in LoRAResolverRegistry.resolvers:
del LoRAResolverRegistry.resolvers[MOCK_RESOLVER_NAME]
@pytest.fixture
def mock_serving_setup():
"""Provides a mocked engine and serving completion instance."""
mock_engine = MagicMock(spec=MQLLMEngineClient)
mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
mock_engine.errored = False
def mock_add_lora_side_effect(lora_request: LoRARequest):
"""Simulate engine behavior when adding LoRAs."""
if lora_request.lora_name == "test-lora":
# Simulate successful addition
return
elif lora_request.lora_name == "invalid-lora":
# Simulate failure during addition (e.g. invalid format)
raise ValueError(f"Simulated failure adding LoRA: "
f"{lora_request.lora_name}")
mock_engine.add_lora.side_effect = mock_add_lora_side_effect
mock_engine.generate.reset_mock()
mock_engine.add_lora.reset_mock()
mock_model_config = MockModelConfig()
models = OpenAIServingModels(engine_client=mock_engine,
base_model_paths=BASE_MODEL_PATHS,
model_config=mock_model_config)
serving_completion = OpenAIServingCompletion(mock_engine,
mock_model_config,
models,
request_logger=None)
return mock_engine, serving_completion
@pytest.mark.asyncio
async def test_serving_completion_with_lora_resolver(mock_serving_setup,
monkeypatch):
monkeypatch.setenv("VLLM_ALLOW_RUNTIME_LORA_UPDATING", "true")
mock_engine, serving_completion = mock_serving_setup
lora_model_name = "test-lora"
req_found = CompletionRequest(
model=lora_model_name,
prompt="Generate with LoRA",
)
# Suppress potential errors during the mocked generate call,
# as we are primarily checking for add_lora and generate calls
with suppress(Exception):
await serving_completion.create_completion(req_found)
mock_engine.add_lora.assert_called_once()
called_lora_request = mock_engine.add_lora.call_args[0][0]
assert isinstance(called_lora_request, LoRARequest)
assert called_lora_request.lora_name == lora_model_name
mock_engine.generate.assert_called_once()
called_lora_request = mock_engine.generate.call_args[1]['lora_request']
assert isinstance(called_lora_request, LoRARequest)
assert called_lora_request.lora_name == lora_model_name
@pytest.mark.asyncio
async def test_serving_completion_resolver_not_found(mock_serving_setup,
monkeypatch):
monkeypatch.setenv("VLLM_ALLOW_RUNTIME_LORA_UPDATING", "true")
mock_engine, serving_completion = mock_serving_setup
non_existent_model = "non-existent-lora-adapter"
req = CompletionRequest(
model=non_existent_model,
prompt="what is 1+1?",
)
response = await serving_completion.create_completion(req)
mock_engine.add_lora.assert_not_called()
mock_engine.generate.assert_not_called()
assert isinstance(response, ErrorResponse)
assert response.code == HTTPStatus.NOT_FOUND.value
assert non_existent_model in response.message
@pytest.mark.asyncio
async def test_serving_completion_resolver_add_lora_fails(
mock_serving_setup, monkeypatch):
monkeypatch.setenv("VLLM_ALLOW_RUNTIME_LORA_UPDATING", "true")
mock_engine, serving_completion = mock_serving_setup
invalid_model = "invalid-lora"
req = CompletionRequest(
model=invalid_model,
prompt="what is 1+1?",
)
response = await serving_completion.create_completion(req)
# Assert add_lora was called before the failure
mock_engine.add_lora.assert_called_once()
called_lora_request = mock_engine.add_lora.call_args[0][0]
assert isinstance(called_lora_request, LoRARequest)
assert called_lora_request.lora_name == invalid_model
# Assert generate was *not* called due to the failure
mock_engine.generate.assert_not_called()
# Assert the correct error response
assert isinstance(response, ErrorResponse)
assert response.code == HTTPStatus.BAD_REQUEST.value
assert invalid_model in response.message
@pytest.mark.asyncio
async def test_serving_completion_flag_not_set(mock_serving_setup):
mock_engine, serving_completion = mock_serving_setup
lora_model_name = "test-lora"
req_found = CompletionRequest(
model=lora_model_name,
prompt="Generate with LoRA",
)
await serving_completion.create_completion(req_found)
mock_engine.add_lora.assert_not_called()
mock_engine.generate.assert_not_called()
# SPDX-License-Identifier: Apache-2.0
import pytest
import schemathesis
from schemathesis import GenerationConfig
from ...utils import RemoteOpenAIServer
schemathesis.experimental.OPEN_API_3_1.enable()
MODEL_NAME = "HuggingFaceTB/SmolVLM-256M-Instruct"
MAXIMUM_IMAGES = 2
@pytest.fixture(scope="module")
def server():
args = [
"--task",
"generate",
"--max-model-len",
"2048",
"--max-num-seqs",
"5",
"--enforce-eager",
"--trust-remote-code",
"--limit-mm-per-prompt",
f"image={MAXIMUM_IMAGES}",
]
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server
@pytest.fixture(scope="module")
def get_schema(server):
# avoid generating null (\x00) bytes in strings during test case generation
return schemathesis.openapi.from_uri(
f"{server.url_root}/openapi.json",
generation_config=GenerationConfig(allow_x00=False),
)
schema = schemathesis.from_pytest_fixture("get_schema")
@schema.parametrize()
@schema.override(headers={"Content-Type": "application/json"})
async def test_openapi_stateless(case):
#No need to verify SSL certificate for localhost
await case.call_and_validate(verify=False)
......@@ -192,3 +192,36 @@ async def test_stream_options(winning_call):
else:
continuous = continuous and hasattr(chunk, 'usage')
assert final and continuous
@pytest.mark.asyncio
async def test_sampling_params(mary_had_lamb):
"""
Compare sampling with params and greedy sampling to assert results
are different when extreme sampling parameters values are picked.
"""
model_name = "openai/whisper-small"
server_args = ["--enforce-eager"]
with RemoteOpenAIServer(model_name, server_args) as remote_server:
client = remote_server.get_async_client()
transcription = await client.audio.transcriptions.create(
model=model_name,
file=mary_had_lamb,
language="en",
temperature=0.8,
extra_body=dict(seed=42,
repetition_penalty=1.9,
top_k=12,
top_p=0.4,
min_p=0.5,
frequency_penalty=1.8,
presence_penalty=2.0))
greedy_transcription = await client.audio.transcriptions.create(
model=model_name,
file=mary_had_lamb,
language="en",
temperature=0.0,
extra_body=dict(seed=42))
assert greedy_transcription.text != transcription.text
# SPDX-License-Identifier: Apache-2.0
import os
import json
import openai
import pytest
import pytest_asyncio
......@@ -39,7 +41,7 @@ def server():
"--enforce-eager",
"--trust-remote-code",
"--limit-mm-per-prompt",
f"video={MAXIMUM_VIDEOS}",
json.dumps({"video": MAXIMUM_VIDEOS}),
]
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
......@@ -114,6 +116,35 @@ async def test_single_chat_session_video(client: openai.AsyncOpenAI,
assert message.content is not None and len(message.content) >= 0
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
async def test_error_on_invalid_video_url_type(client: openai.AsyncOpenAI,
model_name: str,
video_url: str):
messages = [{
"role":
"user",
"content": [
{
"type": "video_url",
"video_url": video_url
},
{
"type": "text",
"text": "What's in this video?"
},
],
}]
# video_url should be a dict {"url": "some url"}, not directly a string
with pytest.raises(openai.BadRequestError):
_ = await client.chat.completions.create(model=model_name,
messages=messages,
max_completion_tokens=10,
temperature=0.0)
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
......
# SPDX-License-Identifier: Apache-2.0
import json
import openai
import pytest
import os
......@@ -44,7 +46,7 @@ def server():
"--enforce-eager",
"--trust-remote-code",
"--limit-mm-per-prompt",
f"image={MAXIMUM_IMAGES}",
json.dumps({"image": MAXIMUM_IMAGES}),
]
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
......@@ -144,6 +146,36 @@ async def test_single_chat_session_image(client: openai.AsyncOpenAI,
assert message.content is not None and len(message.content) >= 0
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
async def test_error_on_invalid_image_url_type(client: openai.AsyncOpenAI,
model_name: str,
image_url: str):
content_text = "What's in this image?"
messages = [{
"role":
"user",
"content": [
{
"type": "image_url",
"image_url": image_url
},
{
"type": "text",
"text": content_text
},
],
}]
# image_url should be a dict {"url": "some url"}, not directly a string
with pytest.raises(openai.BadRequestError):
_ = await client.chat.completions.create(model=model_name,
messages=messages,
max_completion_tokens=10,
temperature=0.0)
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
......
# SPDX-License-Identifier: Apache-2.0
import os
import json
import pytest
import requests
from PIL import Image
......@@ -45,7 +47,7 @@ def server():
"--enforce-eager",
"--trust-remote-code",
"--limit-mm-per-prompt",
f"image={MAXIMUM_IMAGES}",
json.dumps({"image": MAXIMUM_IMAGES}),
"--chat-template",
str(vlm2vec_jinja_path),
]
......
......@@ -6,13 +6,12 @@ from typing import Optional
import pytest
import torch
from tests.kernels.allclose_default import get_default_atol, get_default_rtol
from tests.kernels.utils import opcheck
from vllm import _custom_ops as ops
from vllm.platforms import current_platform
from vllm.utils import get_max_shared_memory_bytes
from .allclose_default import get_default_atol, get_default_rtol
if not current_platform.is_rocm():
from xformers import ops as xops
from xformers.ops.fmha.attn_bias import BlockDiagonalCausalMask
......
......@@ -6,14 +6,13 @@ from typing import Optional
import pytest
import torch
from tests.kernels.allclose_default import get_default_atol, get_default_rtol
from vllm import _custom_ops as ops
from vllm.attention.ops.blocksparse_attention.interface import (
LocalStridedBlockSparseAttn)
from vllm.platforms import current_platform
from vllm.utils import get_max_shared_memory_bytes
from .allclose_default import get_default_atol, get_default_rtol
FLOAT32_BYTES = torch.finfo(torch.float).bits // 8
# This will change depending on the compute capability.
# - 512 as a buffer
......
......@@ -16,6 +16,7 @@ NUM_LAYERS = [1] # Arbitrary values for testing
NUM_HEADS = [8] # Arbitrary values for testing
HEAD_SIZES = [64, 80, 120, 256]
BLOCK_SIZES = [8, 16, 32]
CACHE_LAYOUTS = ["NHD", "HND"]
# Parameters for MLA tests.
KV_LORA_RANKS = [512]
......@@ -221,6 +222,7 @@ def test_reshape_and_cache(
@pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("device", CUDA_DEVICES)
@pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE)
@pytest.mark.parametrize("kv_cache_layout", CACHE_LAYOUTS)
@torch.inference_mode()
def test_reshape_and_cache_flash(
kv_cache_factory_flashinfer,
......@@ -233,17 +235,21 @@ def test_reshape_and_cache_flash(
seed: int,
device: str,
kv_cache_dtype: str,
kv_cache_layout: str,
) -> None:
current_platform.seed_everything(seed)
torch.set_default_device(device)
# fp8 conversion requires continugous memory buffer. Reduce the number of
# blocks and tokens to consume less memory.
num_tokens = num_tokens // 2
num_blocks = num_blocks // 2
# Create a random slot mapping.
num_slots = block_size * num_blocks
slot_mapping_lst = random.sample(range(num_slots), num_tokens)
slot_mapping = torch.tensor(slot_mapping_lst,
dtype=torch.long,
device=device)
qkv = torch.randn(num_tokens,
3,
num_heads,
......@@ -262,27 +268,35 @@ def test_reshape_and_cache_flash(
kv_cache_dtype,
dtype,
device=device,
cache_layout=kv_cache_layout,
)
key_cache, value_cache = key_caches[0].contiguous(
), value_caches[0].contiguous()
key_cache, value_cache = key_caches[0], value_caches[0]
del key_caches
del value_caches
k_scale = (key.amax() / 64.0).to(torch.float32)
v_scale = (value.amax() / 64.0).to(torch.float32)
def permute_and_compact(x):
y = x if kv_cache_layout == "NHD" else x.permute(0, 2, 1, 3)
return y.contiguous()
key_cache_compact = permute_and_compact(key_cache)
value_cache_compact = permute_and_compact(value_cache)
# Clone the KV caches.
if kv_cache_dtype == "fp8":
cloned_key_cache = torch.empty_like(key_cache, dtype=torch.float16)
ops.convert_fp8(cloned_key_cache, key_cache, k_scale.item(),
kv_cache_dtype)
cloned_value_cache = torch.empty_like(value_cache, dtype=torch.float16)
ops.convert_fp8(cloned_value_cache, value_cache, v_scale.item(),
cloned_key_cache = torch.empty_like(key_cache_compact,
dtype=torch.float16)
ops.convert_fp8(cloned_key_cache, key_cache_compact, k_scale.item(),
kv_cache_dtype)
cloned_value_cache = torch.empty_like(value_cache_compact,
dtype=torch.float16)
ops.convert_fp8(cloned_value_cache, value_cache_compact,
v_scale.item(), kv_cache_dtype)
else:
cloned_key_cache = key_cache.clone()
cloned_value_cache = value_cache.clone()
cloned_key_cache = key_cache_compact.clone()
cloned_value_cache = value_cache_compact.clone()
# Call the reshape_and_cache kernel.
opcheck(torch.ops._C_cache_ops.reshape_and_cache_flash,
(key, value, key_cache, value_cache, slot_mapping, kv_cache_dtype,
......@@ -290,16 +304,20 @@ def test_reshape_and_cache_flash(
cond=(head_size == HEAD_SIZES[0]))
ops.reshape_and_cache_flash(key, value, key_cache, value_cache,
slot_mapping, kv_cache_dtype, k_scale, v_scale)
key_cache_compact = permute_and_compact(key_cache)
value_cache_compact = permute_and_compact(value_cache)
if kv_cache_dtype == "fp8":
result_key_cache = torch.empty_like(key_cache, dtype=torch.float16)
result_key_cache = torch.empty_like(key_cache_compact,
dtype=torch.float16)
ops.convert_fp8(result_key_cache,
key_cache,
key_cache_compact,
k_scale.item(),
kv_dtype=kv_cache_dtype)
result_value_cache = torch.empty_like(value_cache, dtype=torch.float16)
result_value_cache = torch.empty_like(value_cache_compact,
dtype=torch.float16)
ops.convert_fp8(result_value_cache,
value_cache,
value_cache_compact,
v_scale.item(),
kv_dtype=kv_cache_dtype)
......@@ -311,8 +329,12 @@ def test_reshape_and_cache_flash(
for i in range(num_tokens):
block_idx = block_indicies_lst[i]
block_offset = block_offsets_lst[i]
cloned_key_cache[block_idx, block_offset, :, :] = key[i]
cloned_value_cache[block_idx, block_offset, :, :] = value[i]
if kv_cache_layout == "NHD":
cloned_key_cache[block_idx, block_offset, :, :] = key[i]
cloned_value_cache[block_idx, block_offset, :, :] = value[i]
else:
cloned_key_cache[block_idx, :, block_offset, :] = key[i]
cloned_value_cache[block_idx, :, block_offset, :] = value[i]
if kv_cache_dtype == "fp8":
torch.testing.assert_close(result_key_cache,
......@@ -324,9 +346,9 @@ def test_reshape_and_cache_flash(
atol=0.001,
rtol=0.1)
else:
torch.testing.assert_close(key_cache, cloned_key_cache)
torch.testing.assert_close(value_cache, cloned_value_cache)
torch.testing.assert_close(key_cache_compact, cloned_key_cache)
torch.testing.assert_close(value_cache_compact, cloned_value_cache)
@pytest.mark.parametrize("direction", COPYING_DIRECTION)
@pytest.mark.parametrize("num_mappings", NUM_MAPPINGS)
......
......@@ -151,7 +151,7 @@ def test_flash_attn_with_paged_kv(
v_descale = None
if q_dtype is not None:
# QKV are drawn from N(0, 1): no need for a fp8 scaling factor
maybe_quantized_query = query.to(q_dtype)
maybe_quantized_query = q.to(q_dtype)
maybe_quantized_key_cache = key_cache.to(q_dtype)
maybe_quantized_value_cache = value_cache.to(q_dtype)
......
......@@ -28,7 +28,34 @@ def test_selector(monkeypatch: pytest.MonkeyPatch):
assert (backend.get_name() == "ROCM_FLASH"
or backend.get_name() == "TRITON_ATTN_VLLM_V1")
# mla test for deepseek related
# MLA test for deepseek related
# change the attention backend to triton MLA
m.setenv(STR_BACKEND_ENV_VAR, "TRITON_MLA")
backend = get_attn_backend(576, torch.bfloat16, "auto", 16, False,
False, True)
assert backend.get_name() == "TRITON_MLA"
# If attention backend is None
# If use_mla is true
# The selected backend is triton MLA
m.setenv(STR_BACKEND_ENV_VAR, None)
backend = get_attn_backend(576, torch.bfloat16, "auto", 16, False,
False, True)
assert backend.get_name() == "TRITON_MLA"
# change the attention backend to AITER MLA
m.setenv(STR_BACKEND_ENV_VAR, "ROCM_AITER_MLA")
backend = get_attn_backend(576, torch.bfloat16, "auto", 1, False,
False, True)
assert backend.get_name() == "ROCM_AITER_MLA"
# If attention backend is None
# If use_mla is true
# If VLLM_ROCM_USE_AITER is enabled
# The selected backend is ROCM_AITER_MLA
m.setenv(STR_BACKEND_ENV_VAR, None)
m.setenv("VLLM_ROCM_USE_AITER", "1")
backend = get_attn_backend(576, torch.bfloat16, "auto", 1, False,
False, True)
assert backend.get_name() == "ROCM_AITER_MLA"
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment