"docs/vscode:/vscode.git/clone" did not exist on "189860102539b54098cfa04b6381ee86c53a16c1"
Commit 9c4ecf15 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.8.4' into v0.8.4-ori

parents bfc2d6f7 dc1b4a6f
...@@ -47,12 +47,10 @@ def test_filter_subtensors(): ...@@ -47,12 +47,10 @@ def test_filter_subtensors():
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def llama_3p2_1b_files(): def llama_3p2_1b_files():
with TemporaryDirectory() as cache_dir: input_dir = snapshot_download("meta-llama/Llama-3.2-1B-Instruct",
input_dir = snapshot_download("meta-llama/Llama-3.2-1B-Instruct", ignore_patterns=["*.bin*", "original/*"])
cache_dir=cache_dir,
ignore_patterns=["*.bin*", "original/*"])
yield input_dir yield input_dir
def _run_writer(input_dir, output_dir, weights_patterns, **kwargs): def _run_writer(input_dir, output_dir, weights_patterns, **kwargs):
...@@ -64,9 +62,9 @@ def _run_writer(input_dir, output_dir, weights_patterns, **kwargs): ...@@ -64,9 +62,9 @@ def _run_writer(input_dir, output_dir, weights_patterns, **kwargs):
# Copy metadata files to output directory # Copy metadata files to output directory
for file in os.listdir(input_dir): for file in os.listdir(input_dir):
if not any( if os.path.isdir(os.path.join(input_dir, file)):
file.endswith(ext) and not os.path.isdir(file) continue
for ext in weights_patterns): if not any(file.endswith(ext) for ext in weights_patterns):
shutil.copy(f"{input_dir}/{file}", output_dir) shutil.copy(f"{input_dir}/{file}", output_dir)
...@@ -81,7 +79,8 @@ def _run_generate(input_dir, queue: mp.Queue, **kwargs): ...@@ -81,7 +79,8 @@ def _run_generate(input_dir, queue: mp.Queue, **kwargs):
@pytest.mark.parametrize("enable_lora", [False, True]) @pytest.mark.parametrize("enable_lora", [False, True])
@pytest.mark.parametrize("tp_size", [1, 2]) @pytest.mark.parametrize("tp_size", [1, 2])
def test_sharded_state_loader(enable_lora, tp_size, num_gpus_available, def test_sharded_state_loader(enable_lora, tp_size, num_gpus_available,
llama_3p2_1b_files): llama_3p2_1b_files,
monkeypatch: pytest.MonkeyPatch):
if num_gpus_available < tp_size: if num_gpus_available < tp_size:
pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}") pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")
...@@ -89,6 +88,8 @@ def test_sharded_state_loader(enable_lora, tp_size, num_gpus_available, ...@@ -89,6 +88,8 @@ def test_sharded_state_loader(enable_lora, tp_size, num_gpus_available,
gpu_memory_utilization = 0.8 gpu_memory_utilization = 0.8
input_dir = llama_3p2_1b_files input_dir = llama_3p2_1b_files
ctx = mp.get_context("spawn") ctx = mp.get_context("spawn")
# The V1 engine's interface has changed and running this test on V1 hangs, so force V0.
monkeypatch.setenv("VLLM_USE_V1", "0")
# Run in separate processes for memory & CUDA isolation # Run in separate processes for memory & CUDA isolation
with TemporaryDirectory() as output_dir: with TemporaryDirectory() as output_dir:
......
...@@ -10,10 +10,33 @@ from vllm.platforms import current_platform ...@@ -10,10 +10,33 @@ from vllm.platforms import current_platform
from .utils import ARGS, CONFIGS, ServerConfig from .utils import ARGS, CONFIGS, ServerConfig
# select models to test based on command line arguments
def pytest_addoption(parser):
    """Register the command line options used to select tool-use test configs.

    ``--models`` takes one or more names; ``--extended`` is a boolean flag
    that defaults to ``False``.
    """
    option_table = (
        ("--models", {
            "nargs": "+",
            "help": "Specify one or more models to test",
        }),
        ("--extended", {
            "action": "store_true",
            "default": False,
            "help": "invoke extended tests requiring large GPUs",
        }),
    )
    # Register in table order so the options are added in a fixed sequence.
    for flag, settings in option_table:
        parser.addoption(flag, **settings)
# for each server config, download the model and return the config # for each server config, download the model and return the config
@pytest.fixture(scope="session", params=CONFIGS.keys()) @pytest.fixture(scope="session", params=CONFIGS.keys())
def server_config(request): def server_config(request):
config = CONFIGS[request.param] extended = request.config.getoption("--extended")
models = request.config.getoption("--models")
config_keys_to_test = [
key for key in CONFIGS if (models is None or key in models) and (
extended or not CONFIGS[key].get("extended", False))
]
config_key = request.param
if config_key not in config_keys_to_test:
pytest.skip(f"Skipping config '{config_key}'")
config = CONFIGS[config_key]
if current_platform.is_rocm() and not config.get("supports_rocm", True): if current_platform.is_rocm() and not config.get("supports_rocm", True):
pytest.skip("The {} model can't be tested on the ROCm platform".format( pytest.skip("The {} model can't be tested on the ROCm platform".format(
......
...@@ -16,6 +16,7 @@ class ServerConfig(TypedDict, total=False): ...@@ -16,6 +16,7 @@ class ServerConfig(TypedDict, total=False):
system_prompt: Optional[str] system_prompt: Optional[str]
supports_parallel: Optional[bool] supports_parallel: Optional[bool]
supports_rocm: Optional[bool] supports_rocm: Optional[bool]
extended: Optional[bool] # tests do not run in CI automatically
def patch_system_prompt(messages: list[dict[str, Any]], def patch_system_prompt(messages: list[dict[str, Any]],
...@@ -82,6 +83,21 @@ CONFIGS: dict[str, ServerConfig] = { ...@@ -82,6 +83,21 @@ CONFIGS: dict[str, ServerConfig] = {
"supports_parallel": "supports_parallel":
False, False,
}, },
"llama4": {
"model":
"meta-llama/Llama-4-Scout-17B-16E-Instruct",
"arguments": [
"--enforce-eager", "--no-enable-prefix-caching",
"--tool-call-parser", "pythonic", "--chat-template",
str(VLLM_PATH /
"examples/tool_chat_template_llama4_pythonic.jinja"), "-tp",
"4"
],
"supports_parallel":
False,
"extended":
True
},
"mistral": { "mistral": {
"model": "model":
"mistralai/Mistral-7B-Instruct-v0.3", "mistralai/Mistral-7B-Instruct-v0.3",
......
...@@ -44,7 +44,7 @@ def test_tpu_compilation(): ...@@ -44,7 +44,7 @@ def test_tpu_compilation():
assert generated_text.startswith(answer) assert generated_text.startswith(answer)
compiled_codes = sorted( compiled_codes = sorted(
glob.glob(os.path.join(temp_dir, "__transformed_code*.py"))) glob.glob(os.path.join(temp_dir, "__transformed_code*for_forward.py")))
for i, compiled_code in enumerate(compiled_codes): for i, compiled_code in enumerate(compiled_codes):
print("{} file: {}".format(i + 1, compiled_code)) print("{} file: {}".format(i + 1, compiled_code))
...@@ -52,15 +52,21 @@ def test_tpu_compilation(): ...@@ -52,15 +52,21 @@ def test_tpu_compilation():
# We should only trigger Dynamo compilation 2 times: # We should only trigger Dynamo compilation 2 times:
# 1. Forward pass without kv_caches # 1. Forward pass without kv_caches
# 2. Forward pass with kv_caches # 2. Forward pass with kv_caches
# Check we have 4 compiled codes # Check we have 2 compiled codes
assert len(compiled_codes) == 2 assert len(compiled_codes) == 2
kv_cache_prefix = "kv_cache" kv_cache_prefix = "kv_cache"
attn_prefix = "ragged_paged_attention" attn_prefix = "ragged_paged_attention"
def extract_compiled_index(s):
    """Return the first integer component in the file name of path ``s``.

    The file name is split on ``_`` and ``.``; the first piece made up only
    of digits is returned as an ``int``. Only the base name is inspected, so
    a numeric component in a parent directory (for example a numbered temp
    directory) cannot be mistaken for the compilation index.

    Raises:
        IndexError: if the file name has no purely numeric component.
    """
    file_name = os.path.basename(s)
    parts = file_name.replace(".", "_").split("_")
    numbers = [int(part) for part in parts if part.isdigit()]
    return numbers[0]
# Check all the compilations are as expected # Check all the compilations are as expected
compiled_fns = sorted( compiled_fns = sorted(glob.glob(
glob.glob(os.path.join(temp_dir, "__compiled_fn*Captured*.py"))) os.path.join(temp_dir, "__compiled_fn*Captured*.py")),
key=lambda s: extract_compiled_index(s))
for i, compiled_fn in enumerate(compiled_fns): for i, compiled_fn in enumerate(compiled_fns):
print("{} file: {}".format(i + 1, compiled_fn)) print("{} file: {}".format(i + 1, compiled_fn))
......
...@@ -3,14 +3,17 @@ ...@@ -3,14 +3,17 @@
import pytest import pytest
import torch import torch
from vllm.multimodal.inputs import MultiModalKwargs from vllm.config import ModelConfig, SchedulerConfig, VllmConfig
from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange
from vllm.sampling_params import SamplingParams from vllm.sampling_params import SamplingParams
from vllm.utils import sha256 from vllm.utils import GiB_bytes, sha256
from vllm.v1.core.kv_cache_manager import KVCacheManager
# disable yapf here as it formats differently than isort such that both fail # disable yapf here as it formats differently than isort such that both fail
# yapf: disable # yapf: disable
from vllm.v1.core.kv_cache_utils import (NONE_HASH, BlockHashType, from vllm.v1.core.kv_cache_utils import (NONE_HASH, BlockHashType,
FreeKVCacheBlockQueue, KVCacheBlock, FreeKVCacheBlockQueue, KVCacheBlock,
PrefixCachingMetrics, PrefixCachingMetrics,
estimate_max_model_len,
generate_block_hash_extra_keys, generate_block_hash_extra_keys,
hash_block_tokens, hash_block_tokens,
hash_request_tokens, hash_request_tokens,
...@@ -46,6 +49,18 @@ def make_request(request_id, ...@@ -46,6 +49,18 @@ def make_request(request_id,
) )
def new_kv_cache_spec(block_size=16,
                      num_kv_heads=2,
                      head_size=64,
                      dtype=torch.float32,
                      use_mla=False):
    """Build a ``FullAttentionSpec`` for tests.

    Every argument is forwarded unchanged to the keyword of the same name;
    the defaults give tests a small spec without spelling out each field.
    """
    spec_fields = {
        "block_size": block_size,
        "num_kv_heads": num_kv_heads,
        "head_size": head_size,
        "dtype": dtype,
        "use_mla": use_mla,
    }
    return FullAttentionSpec(**spec_fields)
def test_none_hash(): def test_none_hash():
assert NONE_HASH is not None assert NONE_HASH is not None
assert isinstance(NONE_HASH, int) assert isinstance(NONE_HASH, int)
...@@ -158,13 +173,10 @@ def test_generate_block_hash_extra_keys(): ...@@ -158,13 +173,10 @@ def test_generate_block_hash_extra_keys():
request = make_request( request = make_request(
request_id=0, request_id=0,
prompt_token_ids=[_ for _ in range(20)], prompt_token_ids=[_ for _ in range(20)],
mm_positions=[{ mm_positions=[
"offset": 0, PlaceholderRange(offset=0, length=5),
"length": 5 PlaceholderRange(offset=10, length=5),
}, { ],
"offset": 10,
"length": 5
}],
mm_hashes=["hash1", "hash2"], mm_hashes=["hash1", "hash2"],
) )
...@@ -222,13 +234,10 @@ def test_hash_request_tokens(hash_fn): ...@@ -222,13 +234,10 @@ def test_hash_request_tokens(hash_fn):
request = make_request( request = make_request(
request_id=0, request_id=0,
prompt_token_ids=[_ for _ in range(6)], prompt_token_ids=[_ for _ in range(6)],
mm_positions=[{ mm_positions=[
"offset": 0, PlaceholderRange(offset=0, length=3),
"length": 3 PlaceholderRange(offset=3, length=3),
}, { ],
"offset": 3,
"length": 3
}],
mm_hashes=["hash1", "hash2"], mm_hashes=["hash1", "hash2"],
) )
...@@ -253,25 +262,19 @@ def test_hash_tokens_different_mm_input(hash_fn): ...@@ -253,25 +262,19 @@ def test_hash_tokens_different_mm_input(hash_fn):
request1 = make_request( request1 = make_request(
request_id=0, request_id=0,
prompt_token_ids=[_ for _ in range(6)], prompt_token_ids=[_ for _ in range(6)],
mm_positions=[{ mm_positions=[
"offset": 0, PlaceholderRange(offset=0, length=3),
"length": 3 PlaceholderRange(offset=3, length=3),
}, { ],
"offset": 3,
"length": 3
}],
mm_hashes=["hash1", "hash2"], mm_hashes=["hash1", "hash2"],
) )
request2 = make_request( request2 = make_request(
request_id=1, request_id=1,
prompt_token_ids=[_ for _ in range(6)], prompt_token_ids=[_ for _ in range(6)],
mm_positions=[{ mm_positions=[
"offset": 0, PlaceholderRange(offset=0, length=3),
"length": 3 PlaceholderRange(offset=3, length=3),
}, { ],
"offset": 3,
"length": 3
}],
mm_hashes=["hash3", "hash2"], mm_hashes=["hash3", "hash2"],
) )
block_size = 3 block_size = 3
...@@ -337,18 +340,6 @@ def test_metrics(): ...@@ -337,18 +340,6 @@ def test_metrics():
def test_unify_kv_cache_configs(): def test_unify_kv_cache_configs():
def new_kv_cache_spec(block_size=16,
num_kv_heads=2,
head_size=64,
dtype=torch.float32,
use_mla=False):
return FullAttentionSpec(block_size=block_size,
num_kv_heads=num_kv_heads,
head_size=head_size,
dtype=dtype,
use_mla=use_mla)
same_kv_cache_config = [ same_kv_cache_config = [
KVCacheConfig( KVCacheConfig(
num_blocks=10, num_blocks=10,
...@@ -438,3 +429,106 @@ def test_unify_kv_cache_configs(): ...@@ -438,3 +429,106 @@ def test_unify_kv_cache_configs():
] ]
with pytest.raises(AssertionError): with pytest.raises(AssertionError):
unify_kv_cache_configs(diff_kv_cache_config) unify_kv_cache_configs(diff_kv_cache_config)
@pytest.mark.parametrize(
    ("model_id", "max_model_len", "want_estimated_max_len"), [
        ("Qwen/Qwen1.5-7B", 16385, 16384),
        ("Qwen/Qwen1.5-7B", 16383, 16383),
    ])
def test_estimate_max_model_len(model_id, max_model_len,
                                want_estimated_max_len):
    """Check ``estimate_max_model_len`` against an 8 GiB KV cache budget.

    Per the parametrized expectations, a requested length of 16385 is
    estimated as 16384, while 16383 is returned as-is.
    """
    # Build the VllmConfig; the model config is constructed first, then the
    # scheduler config.
    vllm_config = VllmConfig(
        model_config=ModelConfig(
            model_id,
            task="generate",
            tokenizer=model_id,
            tokenizer_mode="auto",
            trust_remote_code=False,
            seed=0,
            dtype="float16",
            max_model_len=max_model_len,
        ),
        scheduler_config=SchedulerConfig(max_num_batched_tokens=32768),
    )

    # One identical full-attention spec per layer, 32 layers in total.
    kv_cache_spec = {
        f"layer_{layer_idx}": FullAttentionSpec(
            block_size=16,
            num_kv_heads=32,
            head_size=128,
            dtype=torch.float16,
            use_mla=False,
        )
        for layer_idx in range(32)
    }

    # Estimate the maximum model length; a model_len of 16384 needs 8 GiB.
    available_memory = 8 * GiB_bytes
    estimated_max_len = estimate_max_model_len(vllm_config, kv_cache_spec,
                                               available_memory)
    assert estimated_max_len == want_estimated_max_len
def test_allocate_with_lookahead():
    """Verify that lookahead tokens correctly affect block allocation"""
    block_size = 4
    config = KVCacheConfig(
        num_blocks=10,
        tensors={
            "layer1": KVCacheTensor(100),
        },
        kv_cache_groups=[
            KVCacheGroupSpec(["layer1"],
                             new_kv_cache_spec(block_size=block_size)),
        ],
    )

    request = make_request(
        request_id=0,
        prompt_token_ids=[],
        mm_positions=None,
        mm_hashes=None,
    )

    # Each case: (num_preallocate_tokens, num_lookahead_tokens,
    #             expected number of allocated blocks).
    # A fresh KVCacheManager is created per case; all cases allocate
    # 3 tokens for the same request.
    cases = (
        # Case 1: lookahead only.
        #   total required = 3 + 2 = 5 tokens -> ceil(5 / 4) = 2 blocks
        (0, 2, 2),
        # Case 2: with preallocated tokens.
        #   num_preallocate_blocks = 4 // 4 - 2 // 4 = 1
        #   required_blocks = ceil((3 + 2) / 4) = 2
        #   total_blocks = 1 + 2 = 3
        (4, 2, 3),
        # Case 3: with preallocated tokens and a longer lookahead.
        #   num_preallocate_blocks = 4 // 4 - 4 // 4 = 0
        #   required_blocks = ceil((3 + 4) / 4) = 2
        #   total_blocks = 0 + 2 = 2
        (4, 4, 2),
    )
    for preallocate_tokens, lookahead_tokens, expected_blocks in cases:
        kv_cache_manager = KVCacheManager(
            kv_cache_config=config,
            max_model_len=100,
            num_preallocate_tokens=preallocate_tokens)
        blocks = kv_cache_manager.allocate_slots(
            request,
            num_tokens=3,
            num_lookahead_tokens=lookahead_tokens,
        )
        assert len(blocks) == expected_blocks
...@@ -24,6 +24,7 @@ def create_scheduler( ...@@ -24,6 +24,7 @@ def create_scheduler(
max_num_batched_tokens: int = 8192, max_num_batched_tokens: int = 8192,
enable_prefix_caching: Optional[bool] = None, enable_prefix_caching: Optional[bool] = None,
long_prefill_token_threshold: int = 0, long_prefill_token_threshold: int = 0,
disable_chunked_mm_input: bool = False,
) -> Scheduler: ) -> Scheduler:
'''Create scheduler under test. '''Create scheduler under test.
...@@ -43,6 +44,7 @@ def create_scheduler( ...@@ -43,6 +44,7 @@ def create_scheduler(
max_num_batched_tokens=max_num_batched_tokens, max_num_batched_tokens=max_num_batched_tokens,
max_model_len=max_num_batched_tokens, max_model_len=max_num_batched_tokens,
long_prefill_token_threshold=long_prefill_token_threshold, long_prefill_token_threshold=long_prefill_token_threshold,
disable_chunked_mm_input=disable_chunked_mm_input,
) )
model_config = ModelConfig( model_config = ModelConfig(
model=model, model=model,
...@@ -278,6 +280,58 @@ def test_schedule_partial_requests(): ...@@ -278,6 +280,58 @@ def test_schedule_partial_requests():
assert requests[2].request_id not in output.num_scheduled_tokens assert requests[2].request_id not in output.num_scheduled_tokens
def test_no_mm_input_chunking():
    """Check scheduling when chunking of multimodal inputs is disabled.

    A 1200-token request whose placeholder starts at offset 400 and spans
    800 tokens is scheduled under a 1024-token budget. The first step is
    expected to schedule only the 400 leading tokens and the second step the
    800 placeholder tokens. A budget of 100 tokens must be rejected with
    ``ValueError`` when the scheduler is created.
    """
    # Disable multimodal input chunking.
    scheduler = create_scheduler(
        model="llava-hf/llava-1.5-7b-hf",
        max_num_batched_tokens=1024,
        disable_chunked_mm_input=True,
    )
    placeholder = PlaceholderRange(offset=400, length=800)
    requests = create_requests(num_requests=1,
                               num_tokens=1200,
                               mm_positions=[[placeholder]])
    for req in requests:
        scheduler.add_request(req)

    # Step 1: the request is new; nothing is cached or finished yet.
    output = scheduler.schedule()
    assert len(output.scheduled_new_reqs) == 1
    assert len(output.scheduled_cached_reqs) == 0
    assert len(output.finished_req_ids) == 0
    # We want to only see the 400 text tokens at the start scheduled
    assert output.num_scheduled_tokens[requests[0].request_id] == 400

    # Report an empty sampling result for every request back to the
    # scheduler.
    req_to_index = {}
    for position, req in enumerate(requests):
        req_to_index[req.request_id] = position
    model_runner_output = ModelRunnerOutput(
        req_ids=[req.request_id for req in requests],
        req_id_to_index=req_to_index,
        sampled_token_ids=[[] for _ in requests],
        spec_token_ids=None,
        logprobs=None,
        prompt_logprobs_dict={},
    )
    scheduler.update_from_output(output, model_runner_output)

    # Step 2: the same request continues with the 800 placeholder tokens.
    output = scheduler.schedule()
    assert len(scheduler.running) == 1
    assert len(output.scheduled_new_reqs) == 0
    assert len(output.scheduled_cached_reqs) == 1
    assert len(output.finished_req_ids) == 0
    assert output.num_scheduled_tokens[requests[0].request_id] == 800

    # Test that we fail if we disable chunked mm input and use too small
    # of a max_num_batched_tokens for the mm input.
    with pytest.raises(ValueError):
        create_scheduler(
            model="llava-hf/llava-1.5-7b-hf",
            max_num_batched_tokens=100,
            disable_chunked_mm_input=True,
        )
@pytest.mark.parametrize("enable_prefix_caching", [True, False]) @pytest.mark.parametrize("enable_prefix_caching", [True, False])
def test_schedule_concurrent_partial_requests(enable_prefix_caching: bool): def test_schedule_concurrent_partial_requests(enable_prefix_caching: bool):
"""Test scheduling behavior with concurrent partial requests. """Test scheduling behavior with concurrent partial requests.
......
...@@ -53,6 +53,11 @@ def model_name(): ...@@ -53,6 +53,11 @@ def model_name():
return "meta-llama/Meta-Llama-3-8B-Instruct" return "meta-llama/Meta-Llama-3-8B-Instruct"
@pytest.fixture
def eagle_model_name():
    """Provide the model identifier of the EAGLE draft model for the tests."""
    draft_model_id = "yuhuili/EAGLE-LLaMA3-Instruct-8B"
    return draft_model_id
def test_ngram_correctness( def test_ngram_correctness(
monkeypatch: pytest.MonkeyPatch, monkeypatch: pytest.MonkeyPatch,
test_prompts: list[list[dict[str, Any]]], test_prompts: list[list[dict[str, Any]]],
...@@ -95,3 +100,47 @@ def test_ngram_correctness( ...@@ -95,3 +100,47 @@ def test_ngram_correctness(
# Upon failure, inspect the outputs to check for inaccuracy. # Upon failure, inspect the outputs to check for inaccuracy.
assert matches > int(0.7 * len(ref_outputs)) assert matches > int(0.7 * len(ref_outputs))
del spec_llm del spec_llm
def test_eagle_correctness(
    monkeypatch: pytest.MonkeyPatch,
    test_prompts: list[list[dict[str, Any]]],
    sampling_config: SamplingParams,
    model_name: str,
    eagle_model_name: str,
):
    '''
    Compare the outputs of the original LLM and a speculative LLM.
    They should be the same when using EAGLE speculative decoding.
    '''
    with monkeypatch.context() as m:
        m.setenv("VLLM_USE_V1", "1")

        # Reference run without speculative decoding.
        ref_llm = LLM(model=model_name, max_model_len=1024)
        ref_outputs = ref_llm.chat(test_prompts, sampling_config)
        del ref_llm

        # Same target model, now drafting with the EAGLE model.
        speculative_config = {
            "method": "eagle",
            "model": eagle_model_name,
            "num_speculative_tokens": 3,
        }
        spec_llm = LLM(
            model=model_name,
            speculative_config=speculative_config,
            max_model_len=1024,
        )
        spec_outputs = spec_llm.chat(test_prompts, sampling_config)

        # Count exact text matches; print both sides of every mismatch.
        matches = 0
        for ref_output, spec_output in zip(ref_outputs, spec_outputs):
            if ref_output.outputs[0].text == spec_output.outputs[0].text:
                matches += 1
                continue
            print(f"ref_output: {ref_output.outputs[0].text}")
            print(f"spec_output: {spec_output.outputs[0].text}")

        # Heuristic: expect at least 70% of the prompts to match exactly
        # Upon failure, inspect the outputs to check for inaccuracy.
        assert matches > int(0.7 * len(ref_outputs))
        del spec_llm
...@@ -64,15 +64,17 @@ def test_defaults_with_usage_context(): ...@@ -64,15 +64,17 @@ def test_defaults_with_usage_context():
# For H100 and H200, we use larger default values. # For H100 and H200, we use larger default values.
default_llm_tokens = 16384 default_llm_tokens = 16384
default_server_tokens = 8192 default_server_tokens = 8192
default_max_num_seqs = 1024
else: else:
default_llm_tokens = 8192 default_llm_tokens = 8192
default_server_tokens = 2048 default_server_tokens = 2048
default_max_num_seqs = 256
assert vllm_config.scheduler_config.max_num_seqs == 1024 assert vllm_config.scheduler_config.max_num_seqs == default_max_num_seqs
assert vllm_config.scheduler_config.max_num_batched_tokens == default_llm_tokens # noqa: E501 assert vllm_config.scheduler_config.max_num_batched_tokens == default_llm_tokens # noqa: E501
engine_args = EngineArgs(model="facebook/opt-125m") engine_args = EngineArgs(model="facebook/opt-125m")
vllm_config = engine_args.create_engine_config( vllm_config = engine_args.create_engine_config(
UsageContext.OPENAI_API_SERVER) UsageContext.OPENAI_API_SERVER)
assert vllm_config.scheduler_config.max_num_seqs == 1024 assert vllm_config.scheduler_config.max_num_seqs == default_max_num_seqs
assert vllm_config.scheduler_config.max_num_batched_tokens == default_server_tokens # noqa: E501 assert vllm_config.scheduler_config.max_num_batched_tokens == default_server_tokens # noqa: E501
...@@ -3,8 +3,10 @@ ...@@ -3,8 +3,10 @@
import asyncio import asyncio
import time import time
import uuid import uuid
from threading import Thread
from typing import Optional from typing import Optional
import psutil
import pytest import pytest
from transformers import AutoTokenizer from transformers import AutoTokenizer
...@@ -245,3 +247,42 @@ async def test_engine_core_client_asyncio(monkeypatch: pytest.MonkeyPatch): ...@@ -245,3 +247,42 @@ async def test_engine_core_client_asyncio(monkeypatch: pytest.MonkeyPatch):
await core_client.call_utility_async("echo", None, "help!") await core_client.call_utility_async("echo", None, "help!")
assert str(e_info.value) == "Call to echo method failed: help!" assert str(e_info.value) == "Call to echo method failed: help!"
@pytest.mark.timeout(10)
def test_startup_failure(monkeypatch: pytest.MonkeyPatch):
    """Check that client creation fails when the engine core process dies.

    A background thread kills the first new child process of this process
    while the client is being created; the resulting exception message must
    mention "Engine core initialization failed".
    """
    with monkeypatch.context() as m, pytest.raises(Exception) as e_info:
        m.setenv("VLLM_USE_V1", "1")

        vllm_config = EngineArgs(model=MODEL_NAME).create_engine_config(
            usage_context=UsageContext.UNKNOWN_CONTEXT)
        executor_class = Executor.get_class(vllm_config)

        # Start another thread to wait for engine core process to start
        # and kill it - simulate fatal uncaught process exit.
        current_proc = psutil.Process()
        preexisting_children = set(current_proc.children())

        def kill_first_child():
            # Poll every 0.5 s until a child that did not exist before
            # shows up, then kill it and stop.
            while True:
                time.sleep(0.5)
                spawned = set(current_proc.children()) - preexisting_children
                if not spawned:
                    continue
                victim = spawned.pop()
                print("Killing child core process", victim.pid)
                victim.kill()
                return

        killer = Thread(target=kill_first_child, daemon=True)
        killer.start()

        _core_client = EngineCoreClient.make_client(
            multiprocess_mode=True,
            asyncio_mode=True,
            vllm_config=vllm_config,
            executor_class=executor_class,
            log_stats=True,
        )

    assert "Engine core initialization failed" in str(e_info.value)
...@@ -325,6 +325,45 @@ def test_structured_output( ...@@ -325,6 +325,45 @@ def test_structured_output(
output_json = json.loads(generated_text) output_json = json.loads(generated_text)
jsonschema.validate(instance=output_json, schema=json_schema) jsonschema.validate(instance=output_json, schema=json_schema)
#
# Test 10: Generate structured with minLength and maxLength
#
min_length = 50
max_length = 50
json_schema = {
"type": "object",
"properties": {
"description": {
"type": "string",
"maxLength": max_length,
"minLength": min_length
}
},
"required": ["description"]
}
sampling_params = SamplingParams(
temperature=1.0,
max_tokens=1000,
guided_decoding=GuidedDecodingParams(json=json_schema))
outputs = llm.generate(
prompts="Generate a description of a frog using 50 characters.",
sampling_params=sampling_params,
use_tqdm=True)
assert outputs is not None
for output in outputs:
assert output is not None
assert isinstance(output, RequestOutput)
prompt = output.prompt
generated_text = output.outputs[0].text
assert generated_text is not None
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
output_json = json.loads(generated_text)
jsonschema.validate(instance=output_json, schema=json_schema)
@pytest.mark.skip_global_cleanup @pytest.mark.skip_global_cleanup
@pytest.mark.parametrize("model_name, tokenizer_mode", @pytest.mark.parametrize("model_name, tokenizer_mode",
......
...@@ -13,14 +13,6 @@ def unsupported_string_schemas(): ...@@ -13,14 +13,6 @@ def unsupported_string_schemas():
"type": "string", "type": "string",
"pattern": "^[a-zA-Z]+$" "pattern": "^[a-zA-Z]+$"
}, },
{
"type": "string",
"minLength": 1
},
{
"type": "string",
"maxLength": 100
},
{ {
"type": "string", "type": "string",
"format": "email" "format": "email"
...@@ -164,6 +156,14 @@ def supported_schema(): ...@@ -164,6 +156,14 @@ def supported_schema():
"type": "string", "type": "string",
"enum": ["sedan", "suv", "truck"] "enum": ["sedan", "suv", "truck"]
}, },
"short_description": {
"type": "string",
"maxLength": 50
},
"long_description": {
"type": "string",
"minLength": 50
},
"address": { "address": {
"type": "object", "type": "object",
"properties": { "properties": {
......
# SPDX-License-Identifier: Apache-2.0
from collections import UserDict
from dataclasses import dataclass
import numpy as np
import torch
from vllm.v1.serial_utils import MsgpackDecoder, MsgpackEncoder
class UnrecognizedType(UserDict):
    """An empty ``UserDict`` subclass that also carries an ``an_int`` attribute."""

    def __init__(self, an_int: int):
        # Initialise the (empty) mapping first, then attach the extra field.
        UserDict.__init__(self)
        self.an_int = an_int
@dataclass
class MyType:
    """Container mixing tensors, an array, a string and a custom object.

    Used as the payload type for the encode/decode round-trip test.
    """

    # Plain tensor, string and list-of-tensors fields.
    tensor1: torch.Tensor
    a_string: str
    list_of_tensors: list[torch.Tensor]
    # NumPy array field.
    numpy_array: np.ndarray
    # UserDict subclass carrying an ``an_int`` attribute.
    unrecognized: UnrecognizedType
    # Tensors the test fills with transposed (``.t()``) data.
    small_f_contig_tensor: torch.Tensor
    large_f_contig_tensor: torch.Tensor
    # Tensors the test fills with column slices of larger tensors.
    small_non_contig_tensor: torch.Tensor
    large_non_contig_tensor: torch.Tensor
def test_encode_decode():
    """Test encode/decode loop with zero-copy tensors."""
    # Field values are created in the same order as the dataclass fields.
    tensor1 = torch.randint(low=0, high=100, size=(1024, ), dtype=torch.int32)
    list_of_tensors = [
        torch.rand((1, 10), dtype=torch.float32),
        torch.rand((3, 5, 4000), dtype=torch.float64),
        torch.tensor(1984),  # test scalar too
    ]
    numpy_array = np.arange(512)
    unrecognized = UnrecognizedType(33)
    small_f_contig = torch.rand(5, 4).t()
    large_f_contig = torch.rand(1024, 4).t()
    small_non_contig = torch.rand(2, 4)[:, 1:3]
    large_non_contig = torch.rand(1024, 512)[:, 10:20]

    obj = MyType(
        tensor1=tensor1,
        a_string="hello",
        list_of_tensors=list_of_tensors,
        numpy_array=numpy_array,
        unrecognized=unrecognized,
        small_f_contig_tensor=small_f_contig,
        large_f_contig_tensor=large_f_contig,
        small_non_contig_tensor=small_non_contig,
        large_non_contig_tensor=large_non_contig,
    )

    encoder = MsgpackEncoder()
    decoder = MsgpackDecoder(MyType)

    # Expect the main buffer + 4 large tensor buffers + 1 large numpy array.
    # NOTE(review): "large" presumably means above the encoder's inline-size
    # threshold, with the remaining small tensors encoded inline in the main
    # buffer - confirm the threshold against vllm.v1.serial_utils.
    expected_num_buffers = 6

    encoded = encoder.encode(obj)
    assert len(encoded) == expected_num_buffers
    decoded: MyType = decoder.decode(encoded)
    assert_equal(decoded, obj)

    # Test encode_into case: the first returned buffer must be the
    # caller-provided bytearray itself.
    preallocated = bytearray()
    encoded2 = encoder.encode_into(obj, preallocated)
    assert len(encoded2) == expected_num_buffers
    assert encoded2[0] is preallocated
    decoded2: MyType = decoder.decode(encoded2)
    assert_equal(decoded2, obj)
def assert_equal(obj1: "MyType", obj2: "MyType"):
    """Assert that two ``MyType`` instances hold equal field values.

    Tensors are compared with ``torch.equal``, the NumPy array with
    ``np.array_equal``, and ``unrecognized`` by its ``an_int`` attribute.

    Raises:
        AssertionError: on the first field that differs, including when the
            two ``list_of_tensors`` have different lengths.
    """
    assert torch.equal(obj1.tensor1, obj2.tensor1)
    assert obj1.a_string == obj2.a_string
    # zip() stops at the shorter list, so a truncated (or empty) list would
    # otherwise compare equal; check the lengths explicitly first.
    assert len(obj1.list_of_tensors) == len(obj2.list_of_tensors)
    assert all(
        torch.equal(a, b)
        for a, b in zip(obj1.list_of_tensors, obj2.list_of_tensors))
    assert np.array_equal(obj1.numpy_array, obj2.numpy_array)
    assert obj1.unrecognized.an_int == obj2.unrecognized.an_int
    assert torch.equal(obj1.small_f_contig_tensor, obj2.small_f_contig_tensor)
    assert torch.equal(obj1.large_f_contig_tensor, obj2.large_f_contig_tensor)
    assert torch.equal(obj1.small_non_contig_tensor,
                       obj2.small_non_contig_tensor)
    assert torch.equal(obj1.large_non_contig_tensor,
                       obj2.large_non_contig_tensor)
...@@ -4,9 +4,7 @@ from unittest.mock import ANY, patch ...@@ -4,9 +4,7 @@ from unittest.mock import ANY, patch
import torch import torch
from vllm.attention.backends.abstract import AttentionType from vllm.attention.backends.abstract import AttentionType
from vllm.v1.attention.backends.pallas import (NUM_KV_PAGES_PER_BLOCK, from vllm.v1.attention.backends.pallas import (PallasAttentionBackendImpl,
NUM_QUERIES_PER_BLOCK,
PallasAttentionBackendImpl,
PallasMetadata) PallasMetadata)
...@@ -32,8 +30,6 @@ def test_ragged_paged_attention(): ...@@ -32,8 +30,6 @@ def test_ragged_paged_attention():
logits_soft_cap=logits_soft_cap, logits_soft_cap=logits_soft_cap,
attn_type=AttentionType.DECODER, attn_type=AttentionType.DECODER,
) )
mock_vmem_limit_bytes = 1024
attn_impl.vmem_limit_bytes = mock_vmem_limit_bytes
class FakeAttentionLayer: class FakeAttentionLayer:
_k_scale_float: float _k_scale_float: float
...@@ -88,9 +84,9 @@ def test_ragged_paged_attention(): ...@@ -88,9 +84,9 @@ def test_ragged_paged_attention():
ANY, # block_tables ANY, # block_tables
ANY, # query_start_loc ANY, # query_start_loc
ANY, # num_seqs ANY, # num_seqs
num_kv_pages_per_block=NUM_KV_PAGES_PER_BLOCK, num_kv_pages_per_block=None,
num_queries_per_block=NUM_QUERIES_PER_BLOCK, num_queries_per_block=None,
vmem_limit_bytes=mock_vmem_limit_bytes, vmem_limit_bytes=None,
use_kernel=True, use_kernel=True,
sm_scale=scale, sm_scale=scale,
sliding_window=sliding_window, sliding_window=sliding_window,
......
...@@ -34,3 +34,8 @@ def test_sampler_different(model_name: str): ...@@ -34,3 +34,8 @@ def test_sampler_different(model_name: str):
sampling_params = SamplingParams(temperature=0.1, min_p=0.8, max_tokens=64) sampling_params = SamplingParams(temperature=0.1, min_p=0.8, max_tokens=64)
output2 = llm.generate(prompts, sampling_params) output2 = llm.generate(prompts, sampling_params)
assert output[0].outputs[0].text != output2[0].outputs[0].text assert output[0].outputs[0].text != output2[0].outputs[0].text
with pytest.raises(ValueError):
# Unsupported `seed` param.
sampling_params = SamplingParams(temperature=0.3, seed=42)
output2 = llm.generate(prompts, sampling_params)
...@@ -7,9 +7,9 @@ from vllm.config import CacheConfig, ModelConfig, SchedulerConfig, VllmConfig ...@@ -7,9 +7,9 @@ from vllm.config import CacheConfig, ModelConfig, SchedulerConfig, VllmConfig
from vllm.sampling_params import SamplingParams from vllm.sampling_params import SamplingParams
from vllm.v1.core.sched.output import (CachedRequestData, NewRequestData, from vllm.v1.core.sched.output import (CachedRequestData, NewRequestData,
SchedulerOutput) SchedulerOutput)
from vllm.v1.worker.tpu_model_runner import (TPUModelRunner, from vllm.v1.worker.tpu_model_runner import (
_get_padded_token_len, TPUModelRunner, _get_padded_num_reqs_with_upper_limit,
_get_paddings) _get_padded_token_len, _get_req_paddings, _get_token_paddings)
# Mock torch_xla module since it may not be available in the test environments # Mock torch_xla module since it may not be available in the test environments
torch_xla_patcher = mock.patch.dict( torch_xla_patcher = mock.patch.dict(
...@@ -296,16 +296,29 @@ def test_update_states_request_unscheduled(model_runner): ...@@ -296,16 +296,29 @@ def test_update_states_request_unscheduled(model_runner):
def test_get_paddings(): def test_get_paddings():
min_token_size, max_token_size, padding_gap = 16, 512, 64 min_token_size, max_token_size, padding_gap = 16, 512, 64
expected_paddings = [16, 32, 64, 128, 192, 256, 320, 384, 448, 512] expected_paddings = [16, 32, 64, 128, 192, 256, 320, 384, 448, 512]
actual_paddings = _get_paddings(min_token_size, max_token_size, actual_paddings = _get_token_paddings(min_token_size, max_token_size,
padding_gap) padding_gap)
assert actual_paddings == expected_paddings assert actual_paddings == expected_paddings
def test_get_padded_token_len(): def test_get_padded_token_len():
min_token_size, max_token_size, padding_gap = 16, 512, 64 min_token_size, max_token_size, padding_gap = 16, 512, 64
paddings = _get_paddings(min_token_size, max_token_size, padding_gap) paddings = _get_token_paddings(min_token_size, max_token_size, padding_gap)
assert _get_padded_token_len(paddings, 1) == 16 assert _get_padded_token_len(paddings, 1) == 16
assert _get_padded_token_len(paddings, 16) == 16 assert _get_padded_token_len(paddings, 16) == 16
assert _get_padded_token_len(paddings, 20) == 32 assert _get_padded_token_len(paddings, 20) == 32
assert _get_padded_token_len(paddings, 300) == 320 assert _get_padded_token_len(paddings, 300) == 320
assert _get_padded_token_len(paddings, 512) == 512 assert _get_padded_token_len(paddings, 512) == 512
def test_get_padded_num_reqs_with_upper_limit():
    """Pin the padded request count returned for several argument pairs."""
    # Each row: (first argument, second argument, expected result).
    # Argument roles (request count, upper limit) are inferred from the
    # helper's name -- confirm against its definition.
    cases = (
        (3, 32, 8),
        (9, 32, 16),
        (19, 32, 32),
        (17, 28, 28),
    )
    for num_reqs, upper_limit, expected in cases:
        padded = _get_padded_num_reqs_with_upper_limit(num_reqs, upper_limit)
        assert padded == expected
def test_get_req_paddings():
    """Pin the request padding lists returned for several argument pairs."""
    # Maps the positional arguments passed to the helper onto the list it
    # must return; dict order keeps the checks in their original sequence.
    expected_by_args = {
        (1, 32): [8, 16, 32],
        (8, 32): [8, 16, 32],
        (8, 36): [8, 16, 32, 36],
    }
    for call_args, expected in expected_by_args.items():
        assert _get_req_paddings(*call_args) == expected
#!/bin/bash
# Update Dockerfile dependency graph when docker/Dockerfile changes.
# This script is designed to be used as a pre-commit hook.
#
# Exit status:
#   0 - docker/Dockerfile is not staged, Docker is not usable, or the
#       regenerated graph is identical to the existing one.
#   1 - the regenerated graph differs from the previous one (the updated
#       file must be staged), or the generated PNG could not be located.

set -euo pipefail

# Check if docker/Dockerfile is staged for commit.
# The staged file list is captured first and matched through a here-string
# instead of being piped into `grep -q`: with `pipefail`, grep exiting on its
# first match can kill the producer with SIGPIPE and make the whole pipeline
# report failure even though the file is staged. `|| true` keeps the original
# behaviour of treating a failing `git diff` as "nothing staged".
STAGED_FILES=$(git diff --cached --name-only || true)
if grep -qxF "docker/Dockerfile" <<< "$STAGED_FILES"; then
  echo "docker/Dockerfile has changed, attempting to update dependency graph..."

  # Check if Docker is installed and running
  if ! command -v docker &> /dev/null; then
    echo "Warning: Docker command not found. Skipping Dockerfile graph update."
    echo "Please install Docker to automatically update the graph: https://docs.docker.com/get-docker/"
    exit 0
  fi
  if ! docker info &> /dev/null; then
    echo "Warning: Docker daemon is not running. Skipping Dockerfile graph update."
    echo "Please start Docker to automatically update the graph."
    exit 0
  fi

  # Define the target file path
  TARGET_GRAPH_FILE="docs/source/assets/contributing/dockerfile-stages-dependency.png"

  # Ensure target directory exists
  mkdir -p "$(dirname "$TARGET_GRAPH_FILE")"

  # Store old image hash in a variable if the file exists
  OLD_HASH=""
  if [ -f "$TARGET_GRAPH_FILE" ]; then
    OLD_HASH=$(sha256sum "$TARGET_GRAPH_FILE")
  fi

  # Generate Dockerfile graph
  echo "Running dockerfilegraph tool..."
  docker run \
    --rm \
    --user "$(id -u):$(id -g)" \
    --workdir /workspace \
    --volume "$(pwd)":/workspace \
    ghcr.io/patrickhoefler/dockerfilegraph:alpine \
    --output png \
    --dpi 200 \
    --max-label-length 50 \
    --filename docker/Dockerfile \
    --legend

  echo "Finding generated PNG file..."
  # Check for Dockerfile.png in the root directory (most likely location)
  if [ -f "./Dockerfile.png" ]; then
    echo "Found generated file at: ./Dockerfile.png"
    mv "./Dockerfile.png" "$TARGET_GRAPH_FILE"
  else
    # Try to find it elsewhere. `-print -quit` stops at the first match
    # without a `| head -1` pipeline, which under `set -e` + `pipefail`
    # could abort the script if find were killed by SIGPIPE.
    DOCKERFILE_PNG=$(find . -name "Dockerfile.png" -type f -print -quit)
    if [ -n "$DOCKERFILE_PNG" ]; then
      echo "Found generated file at: $DOCKERFILE_PNG"
      mv "$DOCKERFILE_PNG" "$TARGET_GRAPH_FILE"
    else
      echo "Error: Could not find the generated PNG file"
      # Diagnostic aid: list PNG files modified in the last five minutes.
      find . -name "*.png" -type f -mmin -5
      exit 1
    fi
  fi

  # Check if the graph has changed
  NEW_HASH=$(sha256sum "$TARGET_GRAPH_FILE")
  if [ "$NEW_HASH" != "$OLD_HASH" ]; then
    echo "Graph has changed. Please stage the updated file: $TARGET_GRAPH_FILE"
    exit 1
  else
    echo "No changes in graph detected."
  fi
fi

exit 0
...@@ -138,6 +138,17 @@ def mla_decode_kvcache_cpu( ...@@ -138,6 +138,17 @@ def mla_decode_kvcache_cpu(
block_tables, seq_lens) block_tables, seq_lens)
# merge attn states ops
def merge_attn_states(output: torch.Tensor,
                      prefix_output: torch.Tensor,
                      prefix_lse: torch.Tensor,
                      suffix_output: torch.Tensor,
                      suffix_lse: torch.Tensor,
                      output_lse: Optional[torch.Tensor] = None) -> None:
    """Forward prefix/suffix attention states to the ``_C`` custom op.

    Thin wrapper around ``torch.ops._C.merge_attn_states``. All arguments
    are passed positionally. Nothing is returned from this function;
    presumably the op writes its result into ``output`` (and into
    ``output_lse`` when one is supplied) -- confirm against the kernel.
    """
    merge_op = torch.ops._C.merge_attn_states
    # ``output_lse`` is the last Python parameter but is handed to the op as
    # its second positional argument, right after ``output``.
    merge_op(
        output,
        output_lse,
        prefix_output,
        prefix_lse,
        suffix_output,
        suffix_lse,
    )
# pos encoding ops # pos encoding ops
def rotary_embedding( def rotary_embedding(
positions: torch.Tensor, positions: torch.Tensor,
......
...@@ -326,7 +326,7 @@ class FlashAttentionMetadata(AttentionMetadata): ...@@ -326,7 +326,7 @@ class FlashAttentionMetadata(AttentionMetadata):
assert self.use_cuda_graph assert self.use_cuda_graph
if turn_prefills_into_decodes: if turn_prefills_into_decodes:
# When Mutli-Step is enabled with Chunked-Prefill, prefills and # When Multi-Step is enabled with Chunked-Prefill, prefills and
# decodes are scheduled together. In the first step, all the # decodes are scheduled together. In the first step, all the
# prefills turn into decodes. This update reflects that # prefills turn into decodes. This update reflects that
# conversion. # conversion.
...@@ -617,10 +617,15 @@ class FlashAttentionImpl(AttentionImpl): ...@@ -617,10 +617,15 @@ class FlashAttentionImpl(AttentionImpl):
blocksparse_params: Optional[Dict[str, Any]] = None, blocksparse_params: Optional[Dict[str, Any]] = None,
logits_soft_cap: Optional[float] = None, logits_soft_cap: Optional[float] = None,
attn_type: str = AttentionType.DECODER, attn_type: str = AttentionType.DECODER,
use_irope: bool = False,
) -> None: ) -> None:
if blocksparse_params is not None: if blocksparse_params is not None:
raise ValueError( raise ValueError(
"FlashAttention does not support block-sparse attention.") "FlashAttention does not support block-sparse attention.")
if use_irope:
logger.warning(
"Using irope in V0 is not supported yet, it will fall back "
"to global attention for long context.")
self.num_heads = num_heads self.num_heads = num_heads
self.head_size = head_size self.head_size = head_size
self.scale = float(scale) self.scale = float(scale)
......
...@@ -38,9 +38,12 @@ from vllm.attention.backends.utils import (PAD_SLOT_ID, compute_slot_mapping, ...@@ -38,9 +38,12 @@ from vllm.attention.backends.utils import (PAD_SLOT_ID, compute_slot_mapping,
from vllm.attention.layer import Attention from vllm.attention.layer import Attention
from vllm.attention.ops.paged_attn import PagedAttention from vllm.attention.ops.paged_attn import PagedAttention
from vllm.config import VllmConfig, get_current_vllm_config from vllm.config import VllmConfig, get_current_vllm_config
from vllm.logger import init_logger
from vllm.utils import (async_tensor_h2d, get_kv_cache_torch_dtype, from vllm.utils import (async_tensor_h2d, get_kv_cache_torch_dtype,
make_tensor_with_pad) make_tensor_with_pad)
logger = init_logger(__name__)
if TYPE_CHECKING: if TYPE_CHECKING:
from vllm.worker.model_runner import (ModelInputForGPUBuilder, from vllm.worker.model_runner import (ModelInputForGPUBuilder,
ModelInputForGPUWithSamplingMetadata) ModelInputForGPUWithSamplingMetadata)
...@@ -907,7 +910,12 @@ class FlashInferImpl(AttentionImpl): ...@@ -907,7 +910,12 @@ class FlashInferImpl(AttentionImpl):
blocksparse_params: Optional[Dict[str, Any]] = None, blocksparse_params: Optional[Dict[str, Any]] = None,
logits_soft_cap: Optional[float] = None, logits_soft_cap: Optional[float] = None,
attn_type: str = AttentionType.DECODER, attn_type: str = AttentionType.DECODER,
use_irope: bool = False,
) -> None: ) -> None:
if use_irope:
logger.warning_once(
"Using irope in FlashInfer is not supported yet, it will fall"
" back to global attention for long context.")
self.num_heads = num_heads self.num_heads = num_heads
self.head_size = head_size self.head_size = head_size
self.scale = float(scale) self.scale = float(scale)
......
...@@ -108,8 +108,13 @@ class HPUAttentionImpl(AttentionImpl, torch.nn.Module): ...@@ -108,8 +108,13 @@ class HPUAttentionImpl(AttentionImpl, torch.nn.Module):
blocksparse_params: Optional[Dict[str, Any]] = None, blocksparse_params: Optional[Dict[str, Any]] = None,
max_seq_len: int = 4096, max_seq_len: int = 4096,
attn_type: str = AttentionType.DECODER, attn_type: str = AttentionType.DECODER,
use_irope: bool = False,
) -> None: ) -> None:
super(AttentionImpl, self).__init__() super(AttentionImpl, self).__init__()
if use_irope:
logger.warning_once(
"Using irope in HPU is not supported yet, it will fall back "
"to global attention for long context.")
self.kv_cache_dtype = kv_cache_dtype self.kv_cache_dtype = kv_cache_dtype
self.num_heads = num_heads self.num_heads = num_heads
self.head_size = head_size self.head_size = head_size
...@@ -144,14 +149,14 @@ class HPUAttentionImpl(AttentionImpl, torch.nn.Module): ...@@ -144,14 +149,14 @@ class HPUAttentionImpl(AttentionImpl, torch.nn.Module):
self.fused_scaled_dot_product_attention = ModuleFusedSDPA( self.fused_scaled_dot_product_attention = ModuleFusedSDPA(
FusedSDPA) FusedSDPA)
except ImportError: except ImportError:
logger().warning("Could not import HPU FusedSDPA kernel. " logger.warning("Could not import HPU FusedSDPA kernel. "
"vLLM will use native implementation.") "vLLM will use native implementation.")
suppored_head_sizes = HPUPagedAttention.get_supported_head_sizes() supported_head_sizes = HPUPagedAttention.get_supported_head_sizes()
if head_size not in suppored_head_sizes: if head_size not in supported_head_sizes:
raise ValueError( raise ValueError(
f"Head size {head_size} is not supported by PagedAttention. " f"Head size {head_size} is not supported by PagedAttention. "
f"Supported head sizes are: {suppored_head_sizes}.") f"Supported head sizes are: {supported_head_sizes}.")
if attn_type != AttentionType.DECODER: if attn_type != AttentionType.DECODER:
raise NotImplementedError("Encoder self-attention and " raise NotImplementedError("Encoder self-attention and "
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment