Commit 7e63ef82 authored by zhuwenwen
Browse files

Merge tag 'v0.14.0' into v0.14.0-dev

parents 8cbcac5d b17039bc
......@@ -30,8 +30,9 @@ example_prompts = [first_prompt, "In one word, the capital of France is "] + [
default_params = dict(
temperature=0.0, # greedy
max_tokens=23,
min_tokens=18,
max_tokens=30,
# spec decoding currently doesn't support min_tokens
# min_tokens=28,
)
......@@ -50,6 +51,14 @@ def test_without_spec_decoding(
dict(logprobs=2),
dict(logprobs=2, presence_penalty=-1.0),
dict(structured_outputs=struct_outputs),
dict(
structured_outputs=struct_outputs,
logprobs=2,
),
dict(
structured_outputs=struct_outputs,
presence_penalty=-1.0,
),
dict(
structured_outputs=struct_outputs,
logprobs=2,
......@@ -86,7 +95,7 @@ def test_without_spec_decoding(
run_tests(monkeypatch, MODEL, test_configs, test_sampling_params)
def test_with_spec_decoding(monkeypatch: pytest.MonkeyPatch):
def test_with_spec_decoding(sample_json_schema, monkeypatch: pytest.MonkeyPatch):
"""Test consistency and acceptance rates with some different combos of
preemption, executor, async scheduling, prefill chunking,
spec decoding model length.
......@@ -100,9 +109,20 @@ def test_with_spec_decoding(monkeypatch: pytest.MonkeyPatch):
# Set small draft model len to force doesn't-fit-in-drafter case.
spec_config_short = spec_config | {"max_model_len": 50}
struct_outputs = StructuredOutputsParams(json=sample_json_schema)
test_sampling_params = [
dict(),
dict(presence_penalty=-1.0),
dict(bad_words=["the", " the"]),
dict(logprobs=2),
dict(logprobs=2, presence_penalty=-1.0),
dict(structured_outputs=struct_outputs),
dict(
structured_outputs=struct_outputs,
logprobs=2,
presence_penalty=-1.0,
),
]
# test_preemption, executor, async_scheduling,
......@@ -142,18 +162,12 @@ def run_tests(
"""Test consistency of combos of async scheduling, preemption,
uni/multiproc executor with spec decoding."""
# Determine attention config based on platform
attention_config = {"backend": "FLEX_ATTENTION"}
with monkeypatch.context() as m:
# avoid precision errors
if current_platform.is_rocm():
if is_testing_with_spec_decoding:
# Use TRITON_ATTN for spec decoding test for consistency
m.setenv("VLLM_ATTENTION_BACKEND", "TRITON_ATTN")
else:
m.setenv("VLLM_ATTENTION_BACKEND", "ROCM_AITER_FA")
else:
m.setenv("VLLM_ATTENTION_BACKEND", "FLEX_ATTENTION")
# lock matmul precision to full FP32 (IEEE)
m.setenv("VLLM_FLOAT32_MATMUL_PRECISION", "ieee")
m.setenv("VLLM_FLOAT32_MATMUL_PRECISION", "highest")
# m.setenv("VLLM_BATCH_INVARIANT", "1")
outputs: list[tuple[str, list, list]] = []
for n, (
......@@ -174,6 +188,7 @@ def run_tests(
spec_config,
test_prefill_chunking=test_prefill_chunking,
is_testing_with_spec_decoding=is_testing_with_spec_decoding,
attention_config=attention_config,
)
outputs.append(test_results)
......@@ -204,15 +219,7 @@ def run_tests(
name_1=f"config=[{test_config}], params={params}",
)
# On ROCm with TRITON_ATTN (spec decoding test), skip strict
# logprobs comparison when logprobs are requested
skip_logprobs_check = (
current_platform.is_rocm()
and params.get("logprobs")
and is_testing_with_spec_decoding
)
if not skip_logprobs_check:
assert _all_logprobs_match(base_logprobs, test_logprobs)
assert _all_logprobs_match(base_logprobs, test_logprobs)
if (
base_acceptance_rate is not None
......@@ -262,6 +269,7 @@ def run_test(
spec_config: dict[str, Any] | None,
test_prefill_chunking: bool,
is_testing_with_spec_decoding: bool = False,
attention_config: dict[str, Any] | None = None,
):
spec_decoding = spec_config is not None
cache_arg: dict[str, Any] = (
......@@ -281,14 +289,6 @@ def run_test(
print(f"---- TESTING {test_str}: {test_config}")
print("-" * 80)
# On ROCm: use float16 for first test (ROCM_AITER_FA), but float32 for
# spec decoding test (TRITON_ATTN) for better precision.
# On others: always use float32.
if current_platform.is_rocm() and not is_testing_with_spec_decoding:
dtype = "float16"
else:
dtype = "float32"
with VllmRunner(
model,
max_model_len=512,
......@@ -298,9 +298,10 @@ def run_test(
# enforce_eager=True,
async_scheduling=async_scheduling,
distributed_executor_backend=executor,
dtype=dtype,
dtype="float32",
speculative_config=spec_config,
disable_log_stats=False,
attention_config=attention_config,
**cache_arg,
) as vllm_model:
results = []
......@@ -358,12 +359,7 @@ def _all_logprobs_match(req_a, req_b) -> bool:
def _logprobs_match(lps_a: dict[int, Logprob], lps_b: dict[int, Logprob]) -> bool:
if current_platform.is_rocm():
# ROCm has higher numerical variance
# due to use of float16.
rel_tol, abs_tol = 5e-2, 1e-5
else:
rel_tol, abs_tol = 1e-3, 1e-6
rel_tol, abs_tol = 1e-3, 1e-6
return (
len(lps_a) == len(lps_b)
and lps_a.keys() == lps_b.keys()
......
......@@ -19,7 +19,7 @@ def sync_tracker():
Fixture that patches CommonAttentionMetadata.seq_lens_cpu to detect
lazy init syncs. Prints stack traces immediately when syncs occur.
"""
from vllm.v1.attention.backends.utils import CommonAttentionMetadata
from vllm.v1.attention.backend import CommonAttentionMetadata
# Shared counter for cross-process communication (inherited by fork)
sync_count = multiprocessing.Value("i", 0)
......
......@@ -445,25 +445,26 @@ def test_eagle_correctness(
should be the same when using eagle speculative decoding.
model_setup: (method, model_name, eagle_model_name, tp_size)
"""
with monkeypatch.context() as m:
if "Llama-4-Scout" in model_setup[1] and attn_backend == "FLASH_ATTN":
# Scout requires default backend selection
# because vision encoder has head_dim 88 being incompatible
# with FLASH_ATTN and needs to fall back to Flex Attn
# pass if not ROCm
if current_platform.is_rocm():
# TODO: Enable Flex Attn for spec_decode on ROCm
pytest.skip("Flex Attn for spec_decode not supported on ROCm currently")
else:
m.setenv("VLLM_MLA_DISABLE", "1")
m.setenv("VLLM_ATTENTION_BACKEND", attn_backend)
# Determine attention config
# Scout requires default backend selection because vision encoder has
# head_dim 88 being incompatible with FLASH_ATTN and needs to fall back
# to Flex Attn
if "Llama-4-Scout" in model_setup[1] and attn_backend == "FLASH_ATTN":
if current_platform.is_rocm():
# TODO: Enable Flex Attn for spec_decode on ROCm
pytest.skip("Flex Attn for spec_decode not supported on ROCm currently")
attention_config = None # Let it fall back to default
else:
attention_config = {"backend": attn_backend}
if attn_backend == "TRITON_ATTN" and not current_platform.is_rocm():
pytest.skip(
"TRITON_ATTN does not support "
"multi-token eagle spec decode on current platform"
)
if attn_backend == "TRITON_ATTN" and not current_platform.is_rocm():
pytest.skip(
"TRITON_ATTN does not support "
"multi-token eagle spec decode on current platform"
)
with monkeypatch.context() as m:
m.setenv("VLLM_MLA_DISABLE", "1")
if attn_backend == "ROCM_AITER_FA" and current_platform.is_rocm():
if "deepseek" in model_setup[1].lower():
......@@ -478,7 +479,10 @@ def test_eagle_correctness(
max_num_batched_tokens = 128 if enable_chunked_prefill else max_model_len
ref_llm = LLM(
model=model_name, max_model_len=max_model_len, tensor_parallel_size=tp_size
model=model_name,
max_model_len=max_model_len,
tensor_parallel_size=tp_size,
attention_config=attention_config,
)
ref_outputs = ref_llm.chat(test_prompts, sampling_config)
del ref_llm
......@@ -499,6 +503,7 @@ def test_eagle_correctness(
max_num_batched_tokens=max_num_batched_tokens,
enable_chunked_prefill=enable_chunked_prefill,
model_impl=model_impl,
attention_config=attention_config,
)
spec_outputs = spec_llm.chat(test_prompts, sampling_config)
matches = 0
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
import pytest
from vllm import LLM, SamplingParams
from ...utils import create_new_process_for_each_test
from ...utils import models_path_prefix
@create_new_process_for_each_test()
@pytest.mark.parametrize("attn_backend", ["FLASH_ATTN", "FLASHINFER"])
def test_cascade_attention(example_system_message, monkeypatch, attn_backend):
def test_cascade_attention(example_system_message, attn_backend):
prompt = "\n<User>: Implement fibonacci sequence in Python.\n<Claude>:"
if attn_backend == "FLASHINFER":
......@@ -19,19 +21,18 @@ def test_cascade_attention(example_system_message, monkeypatch, attn_backend):
"needs investigation. See issue #25679."
)
with monkeypatch.context() as m:
m.setenv("VLLM_ATTENTION_BACKEND", attn_backend)
llm = LLM(model="Qwen/Qwen2-1.5B-Instruct")
sampling_params = SamplingParams(temperature=0.0, max_tokens=100)
# No cascade attention.
single_prompt = [example_system_message + prompt]
responses = llm.generate(single_prompt, sampling_params)
ref_output = responses[0].outputs[0].text
# (Probably) Use cascade attention.
prompts = [example_system_message + prompt] * 64
responses = llm.generate(prompts, sampling_params)
for response in responses:
assert response.outputs[0].text == ref_output
\ No newline at end of file
llm = LLM(
model=os.path.join(models_path_prefix, "Qwen/Qwen2-1.5B-Instruct"), attention_config={"backend": attn_backend}
)
sampling_params = SamplingParams(temperature=0.0, max_tokens=100)
# No cascade attention.
single_prompt = [example_system_message + prompt]
responses = llm.generate(single_prompt, sampling_params)
ref_output = responses[0].outputs[0].text
# (Probably) Use cascade attention.
prompts = [example_system_message + prompt] * 64
responses = llm.generate(prompts, sampling_params)
for response in responses:
assert response.outputs[0].text == ref_output
......@@ -31,7 +31,7 @@ import openai
import requests
from vllm.assets.image import ImageAsset
from vllm.multimodal.utils import encode_image_base64
from vllm.multimodal.utils import encode_image_url
MAX_OUTPUT_LEN = 256
......@@ -49,9 +49,7 @@ SAMPLE_PROMPTS_MM: list[dict] = [
"content": [
{
"type": "image_url",
"image_url": {
"url": f"data:image;base64,{encode_image_base64(image_1)}"
},
"image_url": {"url": encode_image_url(image_1)},
},
{"type": "text", "text": "What's in this image?"},
],
......@@ -66,9 +64,7 @@ SAMPLE_PROMPTS_MM: list[dict] = [
"content": [
{
"type": "image_url",
"image_url": {
"url": f"data:image;base64,{encode_image_base64(image_2)}"
},
"image_url": {"url": encode_image_url(image_2)},
},
{
"type": "image_url",
......
......@@ -12,6 +12,13 @@ from vllm import SamplingParams
from vllm.assets.image import ImageAsset
from vllm.config import VllmConfig
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.entrypoints.openai.protocol import (
ChatCompletionRequest,
ChatCompletionResponse,
ErrorResponse,
)
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels
from vllm.inputs import PromptType
from vllm.outputs import RequestOutput
from vllm.platforms import current_platform
......@@ -255,7 +262,7 @@ async def test_multi_abort(output_kind: RequestOutputKind):
# Use multi-abort to abort multiple requests at once
abort_request_ids = [request_ids[i] for i in REQUEST_IDS_TO_ABORT]
await engine.abort(abort_request_ids)
await engine.abort(abort_request_ids, internal=False)
# Wait for all tasks to complete
results = await asyncio.gather(*tasks, return_exceptions=True)
......@@ -486,6 +493,60 @@ async def test_dp_rank_argument():
pass
@pytest.mark.asyncio(scope="module")
async def test_header_dp_rank_argument():
    """Check how the chat endpoint reacts to the ``X-data-parallel-rank`` header.

    The same chat completion request is issued twice against one engine:
    first with header value ``"0"`` (a ``ChatCompletionResponse`` is
    expected), then with ``"1"`` (an ``ErrorResponse`` is expected).
    """
    with ExitStack() as after:
        with set_default_torch_num_threads(1):
            engine = AsyncLLM.from_engine_args(TEXT_ENGINE_ARGS)
        # Register shutdown right away so the engine is released on any exit.
        after.callback(engine.shutdown)

        model_name = "test-model"

        # The serving-models registry has to exist before the chat handler.
        serving_models = OpenAIServingModels(
            engine_client=engine,
            base_model_paths=[BaseModelPath(name=model_name, model_path=model_name)],
        )
        chat_handler = OpenAIServingChat(
            engine_client=engine,
            models=serving_models,
            response_role="assistant",
            chat_template=None,
            chat_template_content_format="auto",
            request_logger=None,
        )

        # One request object is reused for both header variants.
        chat_request = ChatCompletionRequest(
            model=model_name,
            messages=[{"role": "user", "content": TEXT_PROMPT}],
            max_tokens=100,
            temperature=1.0,
            seed=33,
        )

        # Stand-in for the raw HTTP request that carries the headers.
        raw_request = MagicMock()
        raw_request.state = MagicMock()

        # Case 1: rank 0 should be accepted.
        raw_request.headers = {"X-data-parallel-rank": "0"}
        ok_response = await chat_handler.create_chat_completion(
            chat_request, raw_request
        )
        assert isinstance(ok_response, ChatCompletionResponse), (
            "Expected a ChatCompletionResponse for valid DP rank"
        )

        # Case 2: rank 1 should be rejected with an error payload.
        raw_request.headers = {"X-data-parallel-rank": "1"}
        rejected_response = await chat_handler.create_chat_completion(
            chat_request, raw_request
        )
        assert isinstance(rejected_response, ErrorResponse), (
            "Expected an ErrorResponse for out-of-range DP rank"
        )
@pytest.mark.asyncio
async def test_check_health():
"""Test that check_health returns normally for healthy engine
......@@ -550,7 +611,7 @@ async def test_abort_final_output(output_kind: RequestOutputKind):
await asyncio.sleep(0.5)
# Abort the request
await engine.abort(request_id)
await engine.abort(request_id, internal=False)
# Wait for generation to complete and return final output
final_output = await generated
......
......@@ -42,10 +42,16 @@ TOKENIZER = AutoTokenizer.from_pretrained(MODEL_NAME)
PROMPT = "I am Gyoubu Masataka Oniwa"
PROMPT_TOKENS = TOKENIZER(PROMPT).input_ids
_REQUEST_COUNTER = 0
def make_request() -> EngineCoreRequest:
global _REQUEST_COUNTER
_REQUEST_COUNTER += 1
request_id = f"request-{_REQUEST_COUNTER}"
return EngineCoreRequest(
request_id=str(uuid.uuid4()),
request_id=request_id,
external_req_id=f"{request_id}-{uuid.uuid4()}",
prompt_token_ids=PROMPT_TOKENS,
mm_features=None,
sampling_params=SamplingParams(),
......
......@@ -2,12 +2,14 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import asyncio
import importlib
import os
import signal
import time
import uuid
from dataclasses import dataclass
from threading import Thread
from types import SimpleNamespace
from typing import Any
from unittest.mock import MagicMock
......@@ -25,7 +27,11 @@ from vllm.usage.usage_lib import UsageContext
from vllm.utils.torch_utils import set_default_torch_num_threads
from vllm.v1.engine import EngineCoreRequest
from vllm.v1.engine.core import EngineCore
from vllm.v1.engine.core_client import AsyncMPClient, EngineCoreClient, SyncMPClient
from vllm.v1.engine.core_client import (
AsyncMPClient,
EngineCoreClient,
SyncMPClient,
)
from vllm.v1.engine.utils import CoreEngineProcManager
from vllm.v1.executor.abstract import Executor
......@@ -33,14 +39,19 @@ from ...distributed.conftest import MockSubscriber
from ...utils import create_new_process_for_each_test
from ...utils import models_path_prefix
if not current_platform.is_cuda():
pytest.skip(reason="V1 currently only supported on CUDA.", allow_module_level=True)
if not current_platform.is_cuda_alike():
pytest.skip(
reason="V1 currently only supported on CUDA-alike platforms.",
allow_module_level=True,
)
MODEL_NAME = os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct")
TOKENIZER = AutoTokenizer.from_pretrained(MODEL_NAME)
PROMPT = "Hello my name is Robert and I love quantization kernels"
PROMPT_TOKENS = TOKENIZER(PROMPT).input_ids
_REQUEST_COUNTER = 0
def make_request(
params: SamplingParams, prompt_tokens_ids: list[int] | None = None
......@@ -48,8 +59,12 @@ def make_request(
if not prompt_tokens_ids:
prompt_tokens_ids = PROMPT_TOKENS
global _REQUEST_COUNTER
_REQUEST_COUNTER += 1
request_id = f"request-{_REQUEST_COUNTER}"
return EngineCoreRequest(
request_id=str(uuid.uuid4()),
request_id=request_id,
external_req_id=f"{request_id}-{uuid.uuid4()}",
prompt_token_ids=prompt_tokens_ids,
mm_features=None,
sampling_params=params,
......@@ -62,6 +77,92 @@ def make_request(
)
def _reload_envs_module():
    """Reload ``vllm.envs`` and return the reloaded module.

    If the module-level ``__getattr__`` exposes a ``cache_clear`` attribute,
    it is called before the reload.
    """
    import vllm.envs as envs_mod

    module_getattr = getattr(envs_mod, "__getattr__", None)
    clear_cache = getattr(module_getattr, "cache_clear", None)
    if clear_cache is not None:
        clear_cache()
    return importlib.reload(envs_mod)
def _reload_core_client_module():
    """Import ``vllm.v1.engine.core_client``, reload it, and return the result."""
    return importlib.reload(importlib.import_module("vllm.v1.engine.core_client"))
def test_mp_client_uses_env_timeout(monkeypatch: pytest.MonkeyPatch):
    """Check that ``MPClient`` polls with the ``VLLM_ENGINE_READY_TIMEOUT_S`` value.

    The env var is set in seconds; the test expects exactly one ``poll`` call
    on the shadow socket, made with that value converted to milliseconds.
    ZMQ sockets are replaced with in-test stubs.
    """
    ready_timeout_s = 654
    monkeypatch.setenv("VLLM_ENGINE_READY_TIMEOUT_S", str(ready_timeout_s))

    # Reload so the new environment value is picked up even if lookups
    # were cached before the variable was set.
    _reload_envs_module()
    core_client_mod = _reload_core_client_module()

    observed_poll_timeouts: list[int] = []

    class ShadowSocket:
        """Stub returned by ``zmq.Socket.shadow``; records poll timeouts."""

        def poll(self, timeout: int) -> int:
            # Remember every timeout the client polls with.
            observed_poll_timeouts.append(timeout)
            return 1

        def recv_multipart(self):
            return (b"\x00\x00", b"ready")

    class DummySocket:
        """No-op stub returned by the patched ``make_zmq_socket``."""

        def send_multipart(self, _msg, *, copy: bool = False, track: bool = False):
            return SimpleNamespace(done=True) if track else None

        def recv_multipart(self, *, copy: bool = False):
            return (b"", b"")

        def close(self, *, linger: int = 0):
            return None

        def bind(self, _address):
            return None

        def connect(self, _address):
            return None

        def setsockopt(self, *_args, **_kwargs):
            return None

    monkeypatch.setattr(core_client_mod.zmq.Socket, "shadow", lambda *_: ShadowSocket())
    monkeypatch.setattr(
        core_client_mod, "make_zmq_socket", lambda *_, **__: DummySocket()
    )

    # Attribute bags stand in for the real config objects.
    vllm_config = SimpleNamespace(
        parallel_config=SimpleNamespace(
            data_parallel_size=1,
            data_parallel_rank=0,
            data_parallel_index=0,
            data_parallel_size_local=1,
            data_parallel_rank_local=None,
            data_parallel_hybrid_lb=False,
            data_parallel_external_lb=False,
        )
    )
    client_addresses = {
        "input_address": "inproc://input",
        "output_address": "inproc://output",
    }

    client = core_client_mod.MPClient(
        asyncio_mode=False,
        vllm_config=vllm_config,
        executor_class=object,
        log_stats=False,
        client_addresses=client_addresses,
    )
    try:
        # The env var is in seconds, while poll() takes milliseconds.
        assert observed_poll_timeouts == [ready_timeout_s * 1000]
    finally:
        client.shutdown()
def loop_until_done(client: EngineCoreClient, outputs: dict):
while True:
engine_core_outputs = client.get_output().outputs
......@@ -638,6 +739,7 @@ def test_kv_cache_events(
)
assert event.parent_block_hash is None, "Parent block hash should be None"
assert event.lora_id is None, "Lora id should be None"
assert event.lora_name is None, "Lora name should be None"
assert len(event.token_ids) == num_blocks * block_size, (
"Token ids should be the same as the custom tokens"
)
......
......@@ -29,6 +29,7 @@ def test_fast_inc_detok_invalid_utf8_err_case():
params = SamplingParams(skip_special_tokens=True)
request = EngineCoreRequest(
request_id="test",
external_req_id="test-ext",
prompt_token_ids=prompt_token_ids,
mm_features=None,
sampling_params=params,
......
......@@ -59,12 +59,12 @@ def test_incremental_detokenization(
output_processor = OutputProcessor(
dummy_test_vectors.tokenizer, log_stats=False, stream_interval=stream_interval
)
engine_core = MockEngineCore(tokens_list=dummy_test_vectors.generation_tokens)
# Make N requests.
requests = [
EngineCoreRequest(
request_id=f"request-{idx}",
request_id=f"request-{idx}-int",
external_req_id=f"request-{idx}",
prompt_token_ids=prompt_tokens,
mm_features=None,
eos_token_id=None,
......@@ -84,6 +84,11 @@ def test_incremental_detokenization(
for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens)
]
engine_core = MockEngineCore(
tokens_list=dummy_test_vectors.generation_tokens,
request_ids=[req.request_id for req in requests],
)
# Add requests to the detokenizer.
for request, prompt in zip(requests, dummy_test_vectors.prompt_strings):
output_processor.add_request(request, prompt)
......@@ -270,12 +275,28 @@ def _validate_logprobs(
# the logprob token id at this sequence position
decoded_token = pos_logprob_dict[lp_tok].decoded_token
ref_decoded_token = _ref_convert_id_to_token(dtv.tokenizer, lp_tok)
assert decoded_token == ref_decoded_token, (
f"Sampled logprob token id {lp_tok} decodes to"
f" {ref_decoded_token} but Logprob decoded"
f" token is {decoded_token} instead"
f" (at position {idx})"
)
# With UTF-8 correction logic, tokens ending with "�"
# (incomplete byte sequences) are corrected to either
# empty string or proper UTF-8 characters
if ref_decoded_token.endswith("�"):
# Token needs UTF-8 correction
assert not decoded_token.endswith("�"), (
f"Sampled logprob token id {lp_tok} decodes to"
f" '{ref_decoded_token}' (ends with replacement char)"
f" but corrected decoded token '{decoded_token}'"
f" still ends with replacement char"
f" (at position {idx}). UTF-8 correction should"
f" have removed it."
)
else:
# No correction needed, should match exactly
assert decoded_token == ref_decoded_token, (
f"Sampled logprob token id {lp_tok} decodes to"
f" {ref_decoded_token} but Logprob decoded"
f" token is {decoded_token} instead"
f" (at position {idx})"
)
ref_cumulative_logprob += pos_logprob_dict[sampled_token].logprob
# Assert that cumulative logprobs are correct
......@@ -416,12 +437,28 @@ def _validate_logprobs(
# the logprob token id at this sequence position
decoded_token = pos_logprob_dict[plp_tok].decoded_token
ref_decoded_token = _ref_convert_id_to_token(dtv.tokenizer, plp_tok)
assert decoded_token == ref_decoded_token, (
f"Prompt logprob token id {plp_tok} decodes to"
f" {ref_decoded_token} but Logprob decoded"
f" token is {decoded_token} instead"
f" (at position {idx})"
)
# With UTF-8 correction logic, tokens ending with "�"
# (incomplete byte sequences) are corrected to either
# empty string or proper UTF-8 characters
if ref_decoded_token.endswith("�"):
# Token needs UTF-8 correction
assert not decoded_token.endswith("�"), (
f"Prompt logprob token id {plp_tok} decodes to"
f" '{ref_decoded_token}' (ends with replacement char)"
f" but corrected decoded token '{decoded_token}'"
f" still ends with replacement char"
f" (at position {idx}). UTF-8 correction should"
f" have removed it."
)
else:
# No correction needed, should match exactly
assert decoded_token == ref_decoded_token, (
f"Prompt logprob token id {plp_tok} decodes to"
f" {ref_decoded_token} but Logprob decoded"
f" token is {decoded_token} instead"
f" (at position {idx})"
)
else:
# Prompt logprobs disabled for this request
assert prompt_logprobs is None
......@@ -439,15 +476,6 @@ def test_logprobs_processor(
dummy_test_vectors,
):
output_processor = OutputProcessor(dummy_test_vectors.tokenizer, log_stats=False)
engine_core = MockEngineCore(
tokens_list=dummy_test_vectors.generation_tokens,
generated_logprobs_raw=None
if num_sample_logprobs is None
else dummy_test_vectors.generation_logprobs,
prompt_logprobs_raw=None
if num_prompt_logprobs is None
else dummy_test_vectors.prompt_logprobs,
)
# Make N requests.
request_id_list = [
......@@ -455,7 +483,8 @@ def test_logprobs_processor(
]
requests = [
EngineCoreRequest(
request_id=request_id_list[idx],
request_id=request_id_list[idx] + "-int",
external_req_id=request_id_list[idx],
prompt_token_ids=prompt_tokens,
mm_features=None,
eos_token_id=None,
......@@ -477,6 +506,17 @@ def test_logprobs_processor(
for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens)
]
engine_core = MockEngineCore(
tokens_list=dummy_test_vectors.generation_tokens,
generated_logprobs_raw=None
if num_sample_logprobs is None
else dummy_test_vectors.generation_logprobs,
prompt_logprobs_raw=None
if num_prompt_logprobs is None
else dummy_test_vectors.prompt_logprobs,
request_ids=[req.request_id for req in requests],
)
# Add requests to the detokenizer.
for request, prompt in zip(requests, dummy_test_vectors.prompt_strings):
output_processor.add_request(request, prompt)
......@@ -622,19 +662,12 @@ def test_stop_token(
]
prompt_string = dummy_test_vectors.prompt_strings[0]
prompt_tokens = dummy_test_vectors.prompt_tokens[0]
engine_core = MockEngineCore(
tokens_list=[generation_tokens],
generated_logprobs_raw=[generation_logprobs] if do_logprobs else None,
prompt_logprobs_raw=None,
eos_token_id=eos_token_id,
stop_token_ids=stop_token_ids,
ignore_eos=ignore_eos,
)
# Make request.
request_id = "request-0"
request = EngineCoreRequest(
request_id=request_id,
external_req_id=request_id + "-ext",
prompt_token_ids=prompt_tokens,
mm_features=None,
eos_token_id=eos_token_id,
......@@ -656,6 +689,16 @@ def test_stop_token(
pooling_params=None,
)
engine_core = MockEngineCore(
tokens_list=[generation_tokens],
generated_logprobs_raw=[generation_logprobs] if do_logprobs else None,
prompt_logprobs_raw=None,
eos_token_id=eos_token_id,
stop_token_ids=stop_token_ids,
ignore_eos=ignore_eos,
request_ids=[request.request_id],
)
# Add request to the detokenizer.
output_processor.add_request(request, prompt_string)
......@@ -721,13 +764,6 @@ def test_stop_string(
dummy_test_vectors,
):
output_processor = OutputProcessor(dummy_test_vectors.tokenizer, log_stats=False)
engine_core = MockEngineCore(
tokens_list=dummy_test_vectors.generation_tokens,
generated_logprobs_raw=dummy_test_vectors.generation_logprobs
if num_sample_logprobs
else None,
prompt_logprobs_raw=None,
)
# Make N requests.
request_id_list = [
......@@ -735,7 +771,8 @@ def test_stop_string(
]
requests = [
EngineCoreRequest(
request_id=request_id_list[idx],
request_id=request_id_list[idx] + "-int",
external_req_id=request_id_list[idx],
prompt_token_ids=prompt_tokens,
mm_features=None,
eos_token_id=None,
......@@ -757,6 +794,15 @@ def test_stop_string(
for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens)
]
engine_core = MockEngineCore(
tokens_list=dummy_test_vectors.generation_tokens,
generated_logprobs_raw=dummy_test_vectors.generation_logprobs
if num_sample_logprobs
else None,
prompt_logprobs_raw=None,
request_ids=[req.request_id for req in requests],
)
# Add requests to the detokenizer.
for request, prompt in zip(requests, dummy_test_vectors.prompt_strings):
output_processor.add_request(request, prompt)
......@@ -814,9 +860,12 @@ def test_stop_string(
for idx, (ref_gen_str, stop_str) in enumerate(
zip(dummy_test_vectors.generation_strings, STOP_STRINGS)
):
# Request should be aborted.
# Request should be aborted (check internal ID in abort list).
internal_request_id = f"request-{idx}-int"
assert internal_request_id in aborted
# Use external ID for collecting outputs
request_id = f"request-{idx}"
assert request_id in aborted
# Collected values that were generated.
gen_str = gen_strings[request_id]
......@@ -849,13 +898,13 @@ def test_stop_string(
def test_iteration_stats(dummy_test_vectors):
output_processor = OutputProcessor(dummy_test_vectors.tokenizer, log_stats=True)
engine_core = MockEngineCore(dummy_test_vectors.generation_tokens)
engine_core_timestamp = time.monotonic()
# Make N requests.
requests = [
EngineCoreRequest(
request_id=f"request-{idx}",
external_req_id=f"request-{idx}-ext",
prompt_token_ids=prompt_tokens,
mm_features=None,
eos_token_id=None,
......@@ -869,6 +918,11 @@ def test_iteration_stats(dummy_test_vectors):
for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens)
]
engine_core = MockEngineCore(
dummy_test_vectors.generation_tokens,
request_ids=[req.request_id for req in requests],
)
# Add all requests except one to the OutputProcessor.
num_active = len(dummy_test_vectors.generation_tokens) - 1
for request in requests[:num_active]:
......@@ -923,7 +977,6 @@ def test_lora_request_tracking(log_stats: bool, dummy_test_vectors):
output_processor = OutputProcessor(
dummy_test_vectors.tokenizer, log_stats=log_stats
)
engine_core = MockEngineCore(dummy_test_vectors.generation_tokens)
engine_core_timestamp = time.monotonic()
# Create LoRA requests
......@@ -937,7 +990,8 @@ def test_lora_request_tracking(log_stats: bool, dummy_test_vectors):
lora_assignments = [lora1, lora2, None]
requests = [
EngineCoreRequest(
request_id=f"request-{idx}",
request_id=f"request-{idx}-int",
external_req_id=f"request-{idx}",
prompt_token_ids=prompt_tokens,
mm_features=None,
eos_token_id=None,
......@@ -951,6 +1005,11 @@ def test_lora_request_tracking(log_stats: bool, dummy_test_vectors):
for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens)
]
engine_core = MockEngineCore(
dummy_test_vectors.generation_tokens,
request_ids=[req.request_id for req in requests],
)
# Add all requests to the OutputProcessor
for request in requests:
output_processor.add_request(request, None)
......@@ -1016,9 +1075,9 @@ def test_lora_request_tracking(log_stats: bool, dummy_test_vectors):
outputs = EngineCoreOutputs(
outputs=engine_core.get_outputs(), scheduler_stats=SchedulerStats()
)
# Find and mark request-0 as finished (it uses lora-1)
# Find and mark request-0-int as finished (it uses lora-1)
for output in outputs.outputs:
if output.request_id == "request-0":
if output.request_id == "request-0-int":
output.finish_reason = FinishReason.LENGTH
break
......@@ -1041,9 +1100,9 @@ def test_lora_request_tracking(log_stats: bool, dummy_test_vectors):
outputs = EngineCoreOutputs(
outputs=engine_core.get_outputs(), scheduler_stats=SchedulerStats()
)
# Find and mark request-1 as finished (it uses lora-2)
# Find and mark request-1-int as finished (it uses lora-2)
for output in outputs.outputs:
if output.request_id == "request-1":
if output.request_id == "request-1-int":
output.finish_reason = FinishReason.LENGTH
break
......@@ -1065,9 +1124,9 @@ def test_lora_request_tracking(log_stats: bool, dummy_test_vectors):
outputs = EngineCoreOutputs(
outputs=engine_core.get_outputs(), scheduler_stats=SchedulerStats()
)
# Find and mark request-2 as finished (it has no LoRA)
# Find and mark request-2-int as finished (it has no LoRA)
for output in outputs.outputs:
if output.request_id == "request-2":
if output.request_id == "request-2-int":
output.finish_reason = FinishReason.LENGTH
break
......@@ -1108,7 +1167,9 @@ async def test_request_output_collector():
for idx in range(NUM_REQS)
]
collector = RequestOutputCollector(RequestOutputKind.DELTA)
collector = RequestOutputCollector(
RequestOutputKind.DELTA, request_id="my-request-id-int"
)
# CASE 1: Put then get.
outputs = make_outputs()
......@@ -1164,7 +1225,9 @@ async def test_request_output_collector():
@pytest.mark.asyncio
async def test_cumulative_output_collector_n():
"""Test collector correctly handles multiple outputs by index."""
collector = RequestOutputCollector(RequestOutputKind.CUMULATIVE)
collector = RequestOutputCollector(
RequestOutputKind.CUMULATIVE, request_id="my-request-id-int"
)
outputs = [
RequestOutput(
request_id="my-request-id",
......@@ -1243,11 +1306,13 @@ async def test_cumulative_output_collector_n():
@pytest.mark.parametrize("runner", ["generate", "pooling"])
def test_abort_requests(runner: str, dummy_test_vectors):
@pytest.mark.parametrize("abort_by", ["internal", "external"])
def test_abort_requests(runner: str, abort_by: str, dummy_test_vectors):
output_processor = OutputProcessor(dummy_test_vectors.tokenizer, log_stats=True)
requests = [
EngineCoreRequest(
request_id=f"request-{idx}",
external_req_id=f"external-{idx}",
prompt_token_ids=prompt_tokens,
mm_features=None,
eos_token_id=None,
......@@ -1266,8 +1331,13 @@ def test_abort_requests(runner: str, dummy_test_vectors):
output_kind = request.sampling_params.output_kind
else:
output_kind = request.pooling_params.output_kind
queue = RequestOutputCollector(output_kind=output_kind)
queue = RequestOutputCollector(
output_kind=output_kind, request_id=request.request_id
)
output_processor.add_request(request, None, queue=queue)
for request in requests:
output_processor.abort_requests([request.request_id])
if abort_by == "internal":
output_processor.abort_requests([request.request_id], internal=True)
else:
output_processor.abort_requests([request.external_req_id], internal=False)
......@@ -4,11 +4,12 @@
from vllm import SamplingParams
from vllm.outputs import CompletionOutput
from vllm.sampling_params import RequestOutputKind
from vllm.v1.engine import EngineCoreRequest
from vllm.v1.engine.parallel_sampling import ParentRequest
def test_parent_request_to_output_stream() -> None:
parent_request = ParentRequest("parent_id", SamplingParams(n=2))
parent_request = ParentRequest(make_request(SamplingParams(n=2)))
parent_request.child_requests = {"child_id_0", "child_id_1"}
output_0 = CompletionOutput(
index=0, text="child 0", token_ids=[], cumulative_logprob=None, logprobs=None
......@@ -17,51 +18,31 @@ def test_parent_request_to_output_stream() -> None:
index=1, text="child 1", token_ids=[], cumulative_logprob=None, logprobs=None
)
# Request not finished
assert ("parent_id", [output_0], False) == parent_request.get_outputs(
"child_id_0", output_0
)
assert ("parent_id", [output_1], False) == parent_request.get_outputs(
"child_id_1", output_1
)
assert ("parent_id", [output_0], False) == parent_request.get_outputs(
"child_id_0", output_0
)
assert ("parent_id", [output_1], False) == parent_request.get_outputs(
"child_id_1", output_1
)
assert ([output_0], False) == parent_request.get_outputs("child_id_0", output_0)
assert ([output_1], False) == parent_request.get_outputs("child_id_1", output_1)
assert ([output_0], False) == parent_request.get_outputs("child_id_0", output_0)
assert ([output_1], False) == parent_request.get_outputs("child_id_1", output_1)
# output_1 finished
output_1.finish_reason = "ended"
assert ("parent_id", [output_0], False) == parent_request.get_outputs(
"child_id_0", output_0
)
assert ("parent_id", [output_1], False) == parent_request.get_outputs(
"child_id_1", output_1
)
assert ([output_0], False) == parent_request.get_outputs("child_id_0", output_0)
assert ([output_1], False) == parent_request.get_outputs("child_id_1", output_1)
# Finished output_1 had already returned, DO NOT returned again
assert ("parent_id", [output_0], False) == parent_request.get_outputs(
"child_id_0", output_0
)
assert parent_request.get_outputs("child_id_1", output_1) == (
"parent_id",
[],
False,
)
assert ([output_0], False) == parent_request.get_outputs("child_id_0", output_0)
assert parent_request.get_outputs("child_id_1", output_1) == ([], False)
# output_0 finished
output_0.finish_reason = "ended"
assert ("parent_id", [output_0], True) == parent_request.get_outputs(
"child_id_0", output_0
)
assert parent_request.get_outputs("child_id_1", output_1) == ("parent_id", [], True)
assert ([output_0], True) == parent_request.get_outputs("child_id_0", output_0)
assert parent_request.get_outputs("child_id_1", output_1) == ([], True)
# Finished output_0 had already returned, DO NOT returned again
assert parent_request.get_outputs("child_id_0", output_0) == ("parent_id", [], True)
assert parent_request.get_outputs("child_id_1", output_1) == ("parent_id", [], True)
assert parent_request.get_outputs("child_id_0", output_0) == ([], True)
assert parent_request.get_outputs("child_id_1", output_1) == ([], True)
def test_parent_request_to_output_final_only() -> None:
parent_request = ParentRequest(
"parent_id", SamplingParams(n=2, output_kind=RequestOutputKind.FINAL_ONLY)
make_request(SamplingParams(n=2, output_kind=RequestOutputKind.FINAL_ONLY))
)
parent_request.child_requests = {"child_id_0", "child_id_1"}
output_0 = CompletionOutput(
......@@ -71,33 +52,33 @@ def test_parent_request_to_output_final_only() -> None:
index=1, text="child 1", token_ids=[], cumulative_logprob=None, logprobs=None
)
# Request not finished, return nothing
assert parent_request.get_outputs("child_id_0", output_0) == (
"parent_id",
[],
False,
)
assert parent_request.get_outputs("child_id_1", output_1) == (
"parent_id",
[],
False,
)
assert parent_request.get_outputs("child_id_0", output_0) == ([], False)
assert parent_request.get_outputs("child_id_1", output_1) == ([], False)
# output_1 finished, but outputs won't be returned until all child requests finished
output_1.finish_reason = "ended"
assert parent_request.get_outputs("child_id_0", output_0) == (
"parent_id",
[],
False,
)
assert parent_request.get_outputs("child_id_1", output_1) == (
"parent_id",
[],
False,
)
assert parent_request.get_outputs("child_id_0", output_0) == ([], False)
assert parent_request.get_outputs("child_id_1", output_1) == ([], False)
# output_0 finished, as all child requests finished, the output would be returned
output_0.finish_reason = "ended"
assert ("parent_id", [output_0, output_1], True) == parent_request.get_outputs(
assert ([output_0, output_1], True) == parent_request.get_outputs(
"child_id_0", output_0
)
assert ("parent_id", [output_0, output_1], True) == parent_request.get_outputs(
assert ([output_0, output_1], True) == parent_request.get_outputs(
"child_id_1", output_1
)
def make_request(sampling_params: SamplingParams) -> EngineCoreRequest:
    """Build the ``EngineCoreRequest`` used to construct a ``ParentRequest``.

    The request always has internal id ``"parent_id"`` and external id
    ``"ext_parent_id"``. Only ``sampling_params`` comes from the caller;
    ``arrival_time`` is fixed at ``0.0`` and every other field is ``None``.
    """
    return EngineCoreRequest(
        # Identity of the request.
        request_id="parent_id",
        external_req_id="ext_parent_id",
        # The only caller-controlled part of the request.
        sampling_params=sampling_params,
        arrival_time=0.0,
        # Everything below is left unset for these tests.
        prompt_token_ids=None,
        mm_features=None,
        pooling_params=None,
        eos_token_id=None,
        lora_request=None,
        cache_salt=None,
        data_parallel_rank=None,
    )
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import torch.cuda
from vllm import LLM, SamplingParams
from vllm.platforms import current_platform
from vllm.v1.engine import EngineCoreRequest
from vllm.v1.engine.core import EngineCore
MODEL_NAME = "hmellor/tiny-random-LlamaForCausalLM"
def test_preprocess_error_handling(monkeypatch: pytest.MonkeyPatch):
    """Check that a preprocessing failure produces an error output.

    ``EngineCore.preprocess_add_request`` is patched so that it raises a
    ``ValueError`` for any request whose first prompt token id is 333 and
    defers to the original implementation otherwise. The test then expects:

    * generating from the prompt ``[333]`` returns a single finished output
      with no generated tokens and finish reason ``"error"``;
    * a following plain-text prompt still generates tokens and finishes with
      ``"stop"`` or ``"length"``.
    """
    on_spawn_platform = current_platform.is_rocm() or current_platform.is_xpu()
    if on_spawn_platform:
        pytest.skip(
            "Skipped on ROCm/XPU: this test only works with 'fork', "
            "but ROCm/XPU uses 'spawn'."
        )

    assert not torch.cuda.is_initialized(), (
        "fork needs to be used for the engine "
        "core process and this isn't possible if cuda is already initialized"
    )

    # First prompt token id that makes the patched preprocessing step fail.
    trigger_token_id = 333

    # Keep the real implementation so unaffected requests still go through it.
    real_preprocess = EngineCore.preprocess_add_request

    def preprocess_failing_on_trigger(self, request: EngineCoreRequest):
        token_ids = request.prompt_token_ids
        if not token_ids or token_ids[0] != trigger_token_id:
            return real_preprocess(self, request)
        raise ValueError("Simulated preprocessing error!")

    monkeypatch.setattr(
        EngineCore, "preprocess_add_request", preprocess_failing_on_trigger
    )

    llm = LLM(model=MODEL_NAME)

    # A token-id prompt is used so the trigger id can be chosen directly
    # instead of depending on how a text prompt would be tokenized.
    from vllm.inputs import TokensPrompt

    failing_prompt = TokensPrompt(prompt_token_ids=[trigger_token_id])
    outputs = llm.generate(failing_prompt, SamplingParams(max_tokens=10))  # type: ignore

    # The failure is reported as a finished, empty, "error" output.
    assert len(outputs) == 1
    failed_request = outputs[0]
    failed_completion = failed_request.outputs[0]
    assert len(failed_completion.token_ids) == 0
    assert failed_request.finished
    assert failed_completion.finish_reason == "error"

    # Verify the engine is still functional with a normal request.
    outputs = llm.generate("Hello, my name is", SamplingParams(max_tokens=10))
    assert len(outputs) == 1
    normal_completion = outputs[0].outputs[0]
    assert len(normal_completion.token_ids) > 0
    assert normal_completion.finish_reason in ("stop", "length")
......@@ -6,6 +6,7 @@ import pytest
from vllm.assets.image import ImageAsset
from vllm.assets.video import VideoAsset
from vllm.config import CacheConfig, DeviceConfig, ModelConfig, VllmConfig
from vllm.multimodal import MultiModalUUIDDict
from vllm.sampling_params import SamplingParams
from vllm.v1.engine import input_processor as input_processor_mod
from vllm.v1.engine.input_processor import InputProcessor
......@@ -166,7 +167,7 @@ def test_multi_modal_uuids_ignored_when_caching_disabled(monkeypatch):
monkeypatch, mm_cache_gb=0.0, enable_prefix_caching=False
)
captured: dict[str, object] = {}
captured: dict[str, MultiModalUUIDDict] = {}
def fake_preprocess(
prompt, *, tokenization_kwargs=None, lora_request=None, mm_uuids=None
......@@ -196,7 +197,16 @@ def test_multi_modal_uuids_ignored_when_caching_disabled(monkeypatch):
)
# Expect request-id-based overrides are passed through
assert captured["mm_uuids"] == {
"image": [f"{request_id}-image-0", f"{request_id}-image-1"],
"video": [f"{request_id}-video-0"],
}
mm_uuids = captured["mm_uuids"]
assert set(mm_uuids.keys()) == {"image", "video"}
assert len(mm_uuids["image"]) == 2
assert len(mm_uuids["video"]) == 1
assert mm_uuids["image"][0].startswith(f"{request_id}-image-") and mm_uuids[
"image"
][0].endswith("-0")
assert mm_uuids["image"][1].startswith(f"{request_id}-image-") and mm_uuids[
"image"
][1].endswith("-1")
assert mm_uuids["video"][0].startswith(f"{request_id}-video-") and mm_uuids[
"video"
][0].endswith("-0")
......@@ -343,6 +343,7 @@ class MockEngineCore:
eos_token_id: int | None = None,
stop_token_ids: list[int] | None = None,
ignore_eos: bool = False,
request_ids: list[str] | None = None,
) -> None:
self.num_requests = len(tokens_list)
self.tokens_list = tokens_list
......@@ -355,6 +356,11 @@ class MockEngineCore:
self.eos_token_id = eos_token_id
self.stop_token_ids = stop_token_ids
self.ignore_eos = ignore_eos
self.request_ids = (
request_ids
if request_ids is not None
else [f"request-{i}" for i in range(self.num_requests)]
)
def get_outputs(self) -> list[EngineCoreOutput]:
do_logprobs = self.do_logprobs
......@@ -386,7 +392,7 @@ class MockEngineCore:
prompt_logprobs = None
new_token_id = token_ids[token_idx]
output = EngineCoreOutput(
request_id=f"request-{req_idx}",
request_id=self.request_ids[req_idx],
new_token_ids=[new_token_id],
new_logprobs=logprobs,
new_prompt_logprobs_tensors=prompt_logprobs,
......
......@@ -610,7 +610,7 @@ Make the response as short as possible.
@pytest.mark.parametrize(
"model_name, backend, tokenizer_mode, reasoning_parser, speculative_config", # noqa: E501
"model_name, backend, tokenizer_mode, reasoning_parser, speculative_config, async_scheduling", # noqa: E501
[
(
"deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
......@@ -618,8 +618,10 @@ Make the response as short as possible.
"auto",
"deepseek_r1",
NGRAM_SPEC_CONFIG,
False,
),
("Qwen/Qwen3-1.7B", "xgrammar", "auto", "deepseek_r1", None),
("Qwen/Qwen3-1.7B", "xgrammar", "auto", "deepseek_r1", None, False),
("Qwen/Qwen3-1.7B", "xgrammar", "auto", "deepseek_r1", None, True),
],
)
def test_structured_output_with_reasoning_matrices(
......@@ -628,6 +630,7 @@ def test_structured_output_with_reasoning_matrices(
reasoning_parser: str,
model_name: str,
speculative_config: dict[str, Any] | None,
async_scheduling: bool,
):
if current_platform.is_tpu() and speculative_config:
pytest.skip("TPU does not support speculative decoding")
......@@ -648,6 +651,7 @@ def test_structured_output_with_reasoning_matrices(
),
tokenizer_mode=tokenizer_mode,
speculative_config=speculative_config,
async_scheduling=async_scheduling,
)
tokenizer = llm.get_tokenizer()
reasoner = ReasoningParserManager.get_reasoning_parser(reasoning_parser)(
......
......@@ -8,7 +8,7 @@ import pytest
import pytest_asyncio
from tests.utils import RemoteOpenAIServer
from vllm.multimodal.utils import encode_image_base64
from vllm.multimodal.utils import encode_image_url
# Use a small vision model for testing
MODEL_NAME = "Qwen/Qwen2.5-VL-3B-Instruct"
......@@ -52,9 +52,9 @@ async def client(image_server):
@pytest.fixture(scope="session")
def base64_encoded_image(local_asset_server) -> dict[str, str]:
def url_encoded_image(local_asset_server) -> dict[str, str]:
return {
image_url: encode_image_base64(local_asset_server.get_image_asset(image_url))
image_url: encode_image_url(local_asset_server.get_image_asset(image_url))
for image_url in TEST_IMAGE_ASSETS
}
......@@ -95,7 +95,7 @@ async def test_single_chat_session_image_base64encoded(
client: openai.AsyncOpenAI,
model_name: str,
raw_image_url: str,
base64_encoded_image: dict[str, str],
url_encoded_image: dict[str, str],
):
content_text = "What's in this image?"
messages = [
......@@ -104,7 +104,7 @@ async def test_single_chat_session_image_base64encoded(
"content": [
{
"type": "input_image",
"image_url": f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}", # noqa: E501
"image_url": url_encoded_image[raw_image_url],
"detail": "auto",
},
{"type": "input_text", "text": content_text},
......
......@@ -5,24 +5,37 @@ set -euo pipefail
SCRIPT="v1/kv_connector/nixl_integration/run_accuracy_test.sh"
# Define test configurations
configs=(
tp_configs=(
"GPU_MEMORY_UTILIZATION=0.6 PREFILLER_TP_SIZE=2 DECODER_TP_SIZE=2"
"GPU_MEMORY_UTILIZATION=0.6 PREFILLER_TP_SIZE=1 DECODER_TP_SIZE=2"
"GPU_MEMORY_UTILIZATION=0.6 PREFILLER_TP_SIZE=2 DECODER_TP_SIZE=1"
"GPU_MEMORY_UTILIZATION=0.8 MODEL_NAMES=deepseek-ai/deepseek-vl2-tiny" # MLA case
"GPU_MEMORY_UTILIZATION=0.8 PREFILLER_TP_SIZE=1 DECODER_TP_SIZE=2 MODEL_NAMES=deepseek-ai/deepseek-vl2-tiny"
"DP_EP=1 GPU_MEMORY_UTILIZATION=0.8 PREFILLER_TP_SIZE=1 DECODER_TP_SIZE=2 MODEL_NAMES=deepseek-ai/deepseek-vl2-tiny" # MLA+P-TP1, D-DPEP=2 (TP=1)
"GPU_MEMORY_UTILIZATION=0.8 PREFILLER_TP_SIZE=2 DECODER_TP_SIZE=1 MODEL_NAMES=deepseek-ai/deepseek-vl2-tiny"
)
dp_ep_configs=(
"DP_EP=1 GPU_MEMORY_UTILIZATION=0.8 PREFILLER_TP_SIZE=1 DECODER_TP_SIZE=2 MODEL_NAMES=deepseek-ai/deepseek-vl2-tiny" # MLA+P-TP1, D-DPEP=2 (TP=1)
"DP_EP=1 GPU_MEMORY_UTILIZATION=0.8 PREFILLER_TP_SIZE=2 DECODER_TP_SIZE=2 MODEL_NAMES=deepseek-ai/deepseek-vl2-tiny" # MLA+P-TP2, D-DPEP=2 (TP=1)
)
# Select config array based on DP_EP env var
if [[ -n "${DP_EP:-}" ]]; then
configs=("${dp_ep_configs[@]}")
echo "DP_EP is set, using dp_ep_configs"
else
configs=("${tp_configs[@]}")
fi
run_tests() {
local label=$1
local extra_env=$2
local extra_args=$2
echo "=== Running tests (${label}) ==="
for cfg in "${configs[@]}"; do
echo "-> Running with ${cfg} ${extra_env:+and ${extra_env}}"
echo "-> Running with ${cfg} ${extra_args:+and ${extra_args}}"
# Use 'env' to safely set variables without eval
if ! env ${extra_env} ${cfg} bash "${SCRIPT}"; then
echo "❌ Test failed for config: ${cfg} ${extra_env:+(${extra_env})}"
if ! env ${cfg} bash "${SCRIPT}" ${extra_args}; then
echo "❌ Test failed for config: ${cfg} ${extra_args:+(${extra_args})}"
exit 1
fi
done
......@@ -34,8 +47,8 @@ run_tests "default backend" ""
# Check if FLASHINFER is set (non-empty)
if [[ -n "${FLASHINFER:-}" ]]; then
echo "FLASHINFER is set, rerunning with VLLM_ATTENTION_BACKEND=FLASHINFER"
run_tests "FLASHINFER backend" "VLLM_ATTENTION_BACKEND=FLASHINFER"
echo "FLASHINFER is set, rerunning with --attention-backend FLASHINFER"
run_tests "FLASHINFER backend" "--attention-backend FLASHINFER"
else
echo "FLASHINFER not set, skipping FLASHINFER runs."
fi
......@@ -3,21 +3,29 @@ set -xe
# Parse command line arguments
KV_BUFFER_DEVICE="cuda" # Default to cuda
ATTENTION_BACKEND="" # Default to empty (use vllm default)
while [[ $# -gt 0 ]]; do
case $1 in
--kv_buffer_device)
KV_BUFFER_DEVICE="$2"
shift 2
;;
--attention-backend)
ATTENTION_BACKEND="$2"
shift 2
;;
*)
echo "Unknown option $1"
echo "Usage: $0 [--kv_buffer_device <cuda|cpu>]"
echo "Usage: $0 [--kv_buffer_device <cuda|cpu>] [--attention-backend <backend>]"
exit 1
;;
esac
done
echo "Running accuracy tests with kv_buffer_device=$KV_BUFFER_DEVICE"
if [[ -n "$ATTENTION_BACKEND" ]]; then
echo "Using attention backend: $ATTENTION_BACKEND"
fi
DECODER_KV_LAYOUT=${DECODER_KV_LAYOUT:-"HND"} # Default to HND, optional NHD
if [[ "$DECODER_KV_LAYOUT" == "NHD" ]]; then
......@@ -148,6 +156,11 @@ run_tests_for_model() {
--tensor-parallel-size $PREFILLER_TP_SIZE \
--kv-transfer-config '$KV_CONFIG'"
# Add attention backend config if specified
if [[ -n "$ATTENTION_BACKEND" ]]; then
BASE_CMD="${BASE_CMD} --attention-backend=$ATTENTION_BACKEND"
fi
if [ -n "$model_args" ]; then
FULL_CMD="$BASE_CMD $model_args"
else
......@@ -188,7 +201,12 @@ run_tests_for_model() {
--block-size ${DECODE_BLOCK_SIZE} \
--gpu-memory-utilization $GPU_MEMORY_UTILIZATION \
--kv-transfer-config '$KV_CONFIG'"
# Add attention backend config if specified
if [[ -n "$ATTENTION_BACKEND" ]]; then
BASE_CMD="${BASE_CMD} --attention-backend=$ATTENTION_BACKEND"
fi
# DP-EP attention mode
if [[ -z "$DP_EP" ]]; then
BASE_CMD="${BASE_CMD} --tensor-parallel-size $DECODER_TP_SIZE"
......
......@@ -14,12 +14,12 @@ from unittest.mock import patch
import pytest
from vllm.attention.backends.abstract import AttentionMetadata
from vllm.distributed.kv_transfer.kv_connector.factory import KVConnectorFactory
from vllm.distributed.kv_transfer.kv_connector.v1 import (
KVConnectorBase_V1,
KVConnectorRole,
)
from vllm.v1.attention.backend import AttentionMetadata
from vllm.v1.core.sched.output import SchedulerOutput
from .utils import create_scheduler, create_vllm_config
......
......@@ -15,11 +15,12 @@ pytestmark = pytest.mark.cpu_test
[
("native", 4.0, 1, 1, "OffloadingConnector", 4.0 * (1 << 30)),
# bytes per rank: 8.0 GiB / (2 * 2) = 2.0 GiB
("native", 8.0, 2, 2, "OffloadingConnector", 8.0 * (1 << 30) / 4),
("native", 8.0, 2, 2, "OffloadingConnector", 8.0 * (1 << 30)),
("lmcache", 4.0, 1, 1, "LMCacheConnectorV1", 4.0),
# size per rank: 8.0 GiB / (2 * 2) = 2.0 GiB
("lmcache", 8.0, 2, 2, "LMCacheConnectorV1", 2.0),
(None, None, 1, 1, None, None),
# When kv_offloading_size is None, offloading is disabled (backend is ignored)
("native", None, 1, 1, None, None),
],
)
def test_kv_connector(
......@@ -54,8 +55,7 @@ def test_kv_connector(
assert kv_transfer_config.kv_role == "kv_both"
if kv_offloading_backend == "native":
assert kv_connector_extra_config["kv_bytes_per_rank"] == expected_bytes
assert kv_connector_extra_config["num_cpu_blocks"] == 0
assert kv_connector_extra_config["cpu_bytes_to_use"] == expected_bytes
# Existing config should be preserved
assert kv_connector_extra_config["existing_key"] == "existing_value"
elif kv_offloading_backend == "lmcache":
......@@ -63,3 +63,19 @@ def test_kv_connector(
assert kv_connector_extra_config["lmcache.max_local_cpu_size"] == expected_bytes
# Existing config should be replaced
assert "existing_key" not in kv_connector_extra_config
def test_kv_offloading_size_only_uses_native_default():
    """Setting only ``kv_offloading_size`` should enable native offloading.

    With a size of 4.0 and no backend given, the resulting transfer config is
    expected to use ``OffloadingConnector`` in the ``kv_both`` role, with
    ``cpu_bytes_to_use`` equal to ``4.0 * (1 << 30)``.
    """
    # kv_offloading_backend is deliberately not passed; the assertions below
    # expect the same connector as the "native" backend.
    cache_config = CacheConfig(kv_offloading_size=4.0)
    transfer_cfg = VllmConfig(cache_config=cache_config).kv_transfer_config
    extra_cfg = transfer_cfg.kv_connector_extra_config

    assert transfer_cfg.kv_connector == "OffloadingConnector"
    assert transfer_cfg.kv_role == "kv_both"
    assert extra_cfg["cpu_bytes_to_use"] == 4.0 * (1 << 30)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment