Commit 31330101 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.8.4' into v0.8.4-dev

parents e8933c34 dc1b4a6f
# SPDX-License-Identifier: Apache-2.0
import importlib.metadata
import importlib.util
import pytest
DTYPE = ["bfloat16"]
TORCHAO_AVAILABLE = importlib.util.find_spec("torchao") is not None
@pytest.mark.skipif(not TORCHAO_AVAILABLE, reason="torchao is not available")
def test_pre_quantized_model(vllm_runner):
    """Smoke-test greedy generation from a torchao pre-quantized checkpoint.

    The test only checks that generation yields a non-empty result; the
    generated text itself is printed for manual inspection.
    """
    runner_kwargs = {
        "quantization": "torchao",
        "dtype": "bfloat16",
        "enforce_eager": True,
    }
    prompts = ["The capital of France is"]

    with vllm_runner("drisspg/float8_dynamic_act_float8_weight-opt-125m",
                     **runner_kwargs) as llm:
        output = llm.generate_greedy(prompts, max_tokens=32)

    assert output
    print(output)
if __name__ == "__main__":
pytest.main([__file__])
# SPDX-License-Identifier: Apache-2.0
"""Tests for the SamplingParams class.
"""
import pytest
from vllm import SamplingParams
from vllm.config import ModelConfig
from vllm.entrypoints.openai.protocol import ChatCompletionRequest
MODEL_NAME = "Qwen/Qwen1.5-7B"
def test_max_tokens_none():
......@@ -9,6 +16,74 @@ def test_max_tokens_none():
SamplingParams(temperature=0.01, top_p=0.1, max_tokens=None)
if __name__ == "__main__":
import pytest
pytest.main([__file__])
@pytest.fixture(scope="module")
def model_config():
    """Build one ``ModelConfig`` for ``MODEL_NAME``, shared by the module."""
    config_kwargs = {
        "task": "auto",
        "tokenizer": MODEL_NAME,
        "tokenizer_mode": "auto",
        "trust_remote_code": False,
        "seed": 0,
        "dtype": "float16",
        "revision": None,
    }
    return ModelConfig(MODEL_NAME, **config_kwargs)
@pytest.fixture(scope="module")
def default_max_tokens():
    """Token budget handed to ``to_sampling_params`` by the tests below."""
    max_tokens = 4096
    return max_tokens
def test_sampling_params_from_request_with_no_guided_decoding_backend(
        model_config, default_max_tokens):
    """A request without ``guided_decoding_backend`` yields backend ``None``."""
    # The payload deliberately omits guided_decoding_backend.
    payload = {
        'messages': [{
            'role': 'user',
            'content': 'Hello'
        }],
        'model': MODEL_NAME,
        'response_format': {
            'type': 'json_object',
        },
    }
    request = ChatCompletionRequest.model_validate(payload)

    sampling_params = request.to_sampling_params(
        default_max_tokens,
        model_config.logits_processor_pattern,
    )

    # No request-level backend is expected here; the engine-level default
    # guided_decoding_backend will be used instead.
    backend = sampling_params.guided_decoding.backend
    assert backend is None
@pytest.mark.parametrize("request_level_guided_decoding_backend,expected",
                         [("xgrammar", "xgrammar"),
                          ("lm-format-enforcer", "lm-format-enforcer"),
                          ("outlines", "outlines")])
def test_sampling_params_from_request_with_guided_decoding_backend(
        request_level_guided_decoding_backend: str, expected: str,
        model_config, default_max_tokens):
    """A request-level ``guided_decoding_backend`` reaches the sampling params."""
    payload = {
        'messages': [{
            'role': 'user',
            'content': 'Hello'
        }],
        'model': MODEL_NAME,
        'response_format': {
            'type': 'json_object',
        },
        'guided_decoding_backend': request_level_guided_decoding_backend,
    }
    request = ChatCompletionRequest.model_validate(payload)

    sampling_params = request.to_sampling_params(
        default_max_tokens,
        model_config.logits_processor_pattern,
    )

    # The backend named in the request must be carried over unchanged.
    backend = sampling_params.guided_decoding.backend
    assert backend == expected
......@@ -48,12 +48,11 @@ def test_filter_subtensors():
@pytest.fixture(scope="module")
def llama_3p2_1b_files():
with TemporaryDirectory() as cache_dir:
# input_dir = snapshot_download("meta-llama/Llama-3.2-1B-Instruct",
# cache_dir=cache_dir,
# ignore_patterns=["*.bin*", "original/*"])
input_dir = os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct")
yield input_dir
# input_dir = snapshot_download("meta-llama/Llama-3.2-1B-Instruct",
# ignore_patterns=["*.bin*", "original/*"])
input_dir = os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct")
yield input_dir
def _run_writer(input_dir, output_dir, weights_patterns, **kwargs):
......@@ -65,9 +64,9 @@ def _run_writer(input_dir, output_dir, weights_patterns, **kwargs):
# Copy metadata files to output directory
for file in os.listdir(input_dir):
if not any(
file.endswith(ext) and not os.path.isdir(file)
for ext in weights_patterns):
if os.path.isdir(os.path.join(input_dir, file)):
continue
if not any(file.endswith(ext) for ext in weights_patterns):
shutil.copy(f"{input_dir}/{file}", output_dir)
......@@ -82,7 +81,8 @@ def _run_generate(input_dir, queue: mp.Queue, **kwargs):
@pytest.mark.parametrize("enable_lora", [False, True])
@pytest.mark.parametrize("tp_size", [1, 2])
def test_sharded_state_loader(enable_lora, tp_size, num_gpus_available,
llama_3p2_1b_files):
llama_3p2_1b_files,
monkeypatch: pytest.MonkeyPatch):
if num_gpus_available < tp_size:
pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")
......@@ -90,6 +90,8 @@ def test_sharded_state_loader(enable_lora, tp_size, num_gpus_available,
gpu_memory_utilization = 0.8
input_dir = llama_3p2_1b_files
ctx = mp.get_context("spawn")
# The V1 engine interface has changed and this test hangs under V1, so force V0.
monkeypatch.setenv("VLLM_USE_V1", "0")
# Run in separate processes for memory & CUDA isolation
with TemporaryDirectory() as output_dir:
......
......@@ -10,10 +10,33 @@ from vllm.platforms import current_platform
from .utils import ARGS, CONFIGS, ServerConfig
# select models to test based on command line arguments
def pytest_addoption(parser):
    """Register the ``--models`` and ``--extended`` command line options.

    ``--models`` takes one or more model names; ``--extended`` is a boolean
    flag that defaults to ``False``.
    """
    option_specs = (
        ("--models", {
            "nargs": "+",
            "help": "Specify one or more models to test",
        }),
        ("--extended", {
            "action": "store_true",
            "default": False,
            "help": "invoke extended tests requiring large GPUs",
        }),
    )
    for flag, spec in option_specs:
        parser.addoption(flag, **spec)
# for each server config, download the model and return the config
@pytest.fixture(scope="session", params=CONFIGS.keys())
def server_config(request):
config = CONFIGS[request.param]
extended = request.config.getoption("--extended")
models = request.config.getoption("--models")
config_keys_to_test = [
key for key in CONFIGS if (models is None or key in models) and (
extended or not CONFIGS[key].get("extended", False))
]
config_key = request.param
if config_key not in config_keys_to_test:
pytest.skip(f"Skipping config '{config_key}'")
config = CONFIGS[config_key]
if current_platform.is_rocm() and not config.get("supports_rocm", True):
pytest.skip("The {} model can't be tested on the ROCm platform".format(
......
......@@ -19,6 +19,7 @@ class ServerConfig(TypedDict, total=False):
system_prompt: Optional[str]
supports_parallel: Optional[bool]
supports_rocm: Optional[bool]
extended: Optional[bool] # tests do not run in CI automatically
def patch_system_prompt(messages: list[dict[str, Any]],
......@@ -85,6 +86,21 @@ CONFIGS: dict[str, ServerConfig] = {
"supports_parallel":
False,
},
"llama4": {
"model":
"meta-llama/Llama-4-Scout-17B-16E-Instruct",
"arguments": [
"--enforce-eager", "--no-enable-prefix-caching",
"--tool-call-parser", "pythonic", "--chat-template",
str(VLLM_PATH /
"examples/tool_chat_template_llama4_pythonic.jinja"), "-tp",
"4"
],
"supports_parallel":
False,
"extended":
True
},
"mistral": {
"model":
os.path.join(models_path_prefix, "mistralai/Mistral-7B-Instruct-v0.3"),
......
......@@ -44,7 +44,7 @@ def test_tpu_compilation():
assert generated_text.startswith(answer)
compiled_codes = sorted(
glob.glob(os.path.join(temp_dir, "__transformed_code*.py")))
glob.glob(os.path.join(temp_dir, "__transformed_code*for_forward.py")))
for i, compiled_code in enumerate(compiled_codes):
print("{} file: {}".format(i + 1, compiled_code))
......@@ -52,15 +52,21 @@ def test_tpu_compilation():
# We should only trigger Dynamo compilation 2 times:
# 1. Forward pass without kv_caches
# 2. Forward pass with kv_caches
# Check we have 4 compiled codes
# Check we have 2 compiled codes
assert len(compiled_codes) == 2
kv_cache_prefix = "kv_cache"
attn_prefix = "ragged_paged_attention"
def extract_compiled_index(s):
    """Return the first all-digit component of the file name in path ``s``.

    The name is split on ``_`` and ``.``; the first component consisting
    only of digits is returned as an ``int``. It is used as a numeric sort
    key so that e.g. index 10 sorts after index 2.

    Raises:
        IndexError: if the file name contains no all-digit component.
    """
    # Look at the file name only. The directory part of the path (for
    # example a random temporary directory such as ``tmpx_12_ab``) can
    # contain all-digit segments that would otherwise be picked up first.
    file_name = os.path.basename(s)
    parts = file_name.replace(".", "_").split("_")
    numbers = [int(part) for part in parts if part.isdigit()]
    return numbers[0]
# Check all the compilations are as expected
compiled_fns = sorted(
glob.glob(os.path.join(temp_dir, "__compiled_fn*Captured*.py")))
compiled_fns = sorted(glob.glob(
os.path.join(temp_dir, "__compiled_fn*Captured*.py")),
key=lambda s: extract_compiled_index(s))
for i, compiled_fn in enumerate(compiled_fns):
print("{} file: {}".format(i + 1, compiled_fn))
......
......@@ -3,14 +3,17 @@
import pytest
import torch
from vllm.multimodal.inputs import MultiModalKwargs
from vllm.config import ModelConfig, SchedulerConfig, VllmConfig
from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange
from vllm.sampling_params import SamplingParams
from vllm.utils import sha256
from vllm.utils import GiB_bytes, sha256
from vllm.v1.core.kv_cache_manager import KVCacheManager
# disable yapf here as it formats differently than isort such that both fail
# yapf: disable
from vllm.v1.core.kv_cache_utils import (NONE_HASH, BlockHashType,
FreeKVCacheBlockQueue, KVCacheBlock,
PrefixCachingMetrics,
estimate_max_model_len,
generate_block_hash_extra_keys,
hash_block_tokens,
hash_request_tokens,
......@@ -46,6 +49,18 @@ def make_request(request_id,
)
def new_kv_cache_spec(block_size=16,
                      num_kv_heads=2,
                      head_size=64,
                      dtype=torch.float32,
                      use_mla=False):
    """Create a ``FullAttentionSpec`` with small defaults for the tests."""
    spec_kwargs = {
        "block_size": block_size,
        "num_kv_heads": num_kv_heads,
        "head_size": head_size,
        "dtype": dtype,
        "use_mla": use_mla,
    }
    return FullAttentionSpec(**spec_kwargs)
def test_none_hash():
assert NONE_HASH is not None
assert isinstance(NONE_HASH, int)
......@@ -158,13 +173,10 @@ def test_generate_block_hash_extra_keys():
request = make_request(
request_id=0,
prompt_token_ids=[_ for _ in range(20)],
mm_positions=[{
"offset": 0,
"length": 5
}, {
"offset": 10,
"length": 5
}],
mm_positions=[
PlaceholderRange(offset=0, length=5),
PlaceholderRange(offset=10, length=5),
],
mm_hashes=["hash1", "hash2"],
)
......@@ -222,13 +234,10 @@ def test_hash_request_tokens(hash_fn):
request = make_request(
request_id=0,
prompt_token_ids=[_ for _ in range(6)],
mm_positions=[{
"offset": 0,
"length": 3
}, {
"offset": 3,
"length": 3
}],
mm_positions=[
PlaceholderRange(offset=0, length=3),
PlaceholderRange(offset=3, length=3),
],
mm_hashes=["hash1", "hash2"],
)
......@@ -253,25 +262,19 @@ def test_hash_tokens_different_mm_input(hash_fn):
request1 = make_request(
request_id=0,
prompt_token_ids=[_ for _ in range(6)],
mm_positions=[{
"offset": 0,
"length": 3
}, {
"offset": 3,
"length": 3
}],
mm_positions=[
PlaceholderRange(offset=0, length=3),
PlaceholderRange(offset=3, length=3),
],
mm_hashes=["hash1", "hash2"],
)
request2 = make_request(
request_id=1,
prompt_token_ids=[_ for _ in range(6)],
mm_positions=[{
"offset": 0,
"length": 3
}, {
"offset": 3,
"length": 3
}],
mm_positions=[
PlaceholderRange(offset=0, length=3),
PlaceholderRange(offset=3, length=3),
],
mm_hashes=["hash3", "hash2"],
)
block_size = 3
......@@ -337,18 +340,6 @@ def test_metrics():
def test_unify_kv_cache_configs():
def new_kv_cache_spec(block_size=16,
num_kv_heads=2,
head_size=64,
dtype=torch.float32,
use_mla=False):
return FullAttentionSpec(block_size=block_size,
num_kv_heads=num_kv_heads,
head_size=head_size,
dtype=dtype,
use_mla=use_mla)
same_kv_cache_config = [
KVCacheConfig(
num_blocks=10,
......@@ -438,3 +429,106 @@ def test_unify_kv_cache_configs():
]
with pytest.raises(AssertionError):
unify_kv_cache_configs(diff_kv_cache_config)
@pytest.mark.parametrize(
    ("model_id", "max_model_len", "want_estimated_max_len"), [
        ("Qwen/Qwen1.5-7B", 16385, 16384),
        ("Qwen/Qwen1.5-7B", 16383, 16383),
    ])
def test_estimate_max_model_len(model_id, max_model_len,
                                want_estimated_max_len):
    """Check ``estimate_max_model_len`` against an 8 GiB KV-cache budget.

    With the budget below, a requested length of 16385 is expected to be
    estimated down to 16384, while 16383 is returned unchanged.
    """
    # Assemble the VllmConfig from its model and scheduler parts.
    vllm_config = VllmConfig(
        model_config=ModelConfig(
            model_id,
            task="generate",
            tokenizer=model_id,
            tokenizer_mode="auto",
            trust_remote_code=False,
            seed=0,
            dtype="float16",
            max_model_len=max_model_len,
        ),
        scheduler_config=SchedulerConfig(max_num_batched_tokens=32768),
    )

    # One identical full-attention spec per layer, 32 layers in total.
    kv_cache_spec = {
        f"layer_{layer_idx}": FullAttentionSpec(
            block_size=16,
            num_kv_heads=32,
            head_size=128,
            dtype=torch.float16,
            use_mla=False,
        )
        for layer_idx in range(32)
    }

    # A model length of 16384 needs 8GB of KV cache with these specs.
    available_memory = 8 * GiB_bytes
    estimated_max_len = estimate_max_model_len(vllm_config, kv_cache_spec,
                                               available_memory)
    assert estimated_max_len == want_estimated_max_len
def test_allocate_with_lookahead():
    """Verify that lookahead tokens correctly affect block allocation"""
    block_size = 4
    config = KVCacheConfig(
        num_blocks=10,
        tensors={
            "layer1": KVCacheTensor(100),
        },
        kv_cache_groups=[
            KVCacheGroupSpec(["layer1"],
                             new_kv_cache_spec(block_size=block_size)),
        ],
    )
    request = make_request(
        request_id=0,
        prompt_token_ids=[],
        mm_positions=None,
        mm_hashes=None,
    )

    def allocate(num_preallocate_tokens, num_lookahead_tokens):
        # Each case uses a fresh manager over the same config and request.
        manager = KVCacheManager(
            kv_cache_config=config,
            max_model_len=100,
            num_preallocate_tokens=num_preallocate_tokens)
        return manager.allocate_slots(
            request,
            num_tokens=3,
            num_lookahead_tokens=num_lookahead_tokens,
        )

    # Case 1: no preallocation, 2 lookahead tokens.
    # Total required: 3+2=5 tokens -> ceil(5/4)=2 blocks
    blocks = allocate(num_preallocate_tokens=0, num_lookahead_tokens=2)
    assert len(blocks) == 2

    # Case 2: preallocation of 4 tokens, 2 lookahead tokens.
    # num_preallocate_blocks = 4 // 4 - 2 // 4 = 1
    # required_blocks = ceil((3 + 2) /4) = 2
    # total_blocks = 1 + 2 = 3
    blocks = allocate(num_preallocate_tokens=4, num_lookahead_tokens=2)
    assert len(blocks) == 3

    # Case 3: preallocation of 4 tokens, 4 lookahead tokens.
    # num_preallocate_blocks = 4 // 4 - 4 // 4 = 0
    # required_blocks = ceil((3 + 4) / 4) = 2
    # total_blocks = 0 + 2 = 2
    blocks = allocate(num_preallocate_tokens=4, num_lookahead_tokens=4)
    assert len(blocks) == 2
......@@ -24,6 +24,7 @@ def create_scheduler(
max_num_batched_tokens: int = 8192,
enable_prefix_caching: Optional[bool] = None,
long_prefill_token_threshold: int = 0,
disable_chunked_mm_input: bool = False,
) -> Scheduler:
'''Create scheduler under test.
......@@ -43,6 +44,7 @@ def create_scheduler(
max_num_batched_tokens=max_num_batched_tokens,
max_model_len=max_num_batched_tokens,
long_prefill_token_threshold=long_prefill_token_threshold,
disable_chunked_mm_input=disable_chunked_mm_input,
)
model_config = ModelConfig(
model=model,
......@@ -278,6 +280,58 @@ def test_schedule_partial_requests():
assert requests[2].request_id not in output.num_scheduled_tokens
def test_no_mm_input_chunking():
    """Multimodal placeholders must not be split when chunking is disabled.

    A 1200-token request carries one placeholder covering tokens 400..1199.
    With a 1024-token budget, the first step is expected to schedule only
    the 400 leading text tokens and the second step the 800 placeholder
    tokens.
    """
    # Disable multimodal input chunking.
    scheduler = create_scheduler(
        model="llava-hf/llava-1.5-7b-hf",
        max_num_batched_tokens=1024,
        disable_chunked_mm_input=True,
    )
    placeholder = PlaceholderRange(offset=400, length=800)
    requests = create_requests(num_requests=1,
                               num_tokens=1200,
                               mm_positions=[[placeholder]])
    for req in requests:
        scheduler.add_request(req)
    first_req_id = requests[0].request_id

    # First step: only the text prefix is scheduled.
    output = scheduler.schedule()
    assert len(output.scheduled_new_reqs) == 1
    assert len(output.scheduled_cached_reqs) == 0
    assert len(output.finished_req_ids) == 0
    # We want to only see the 400 text tokens at the start scheduled
    assert output.num_scheduled_tokens[first_req_id] == 400

    # Report an empty sampling result for every request back to the scheduler.
    req_to_index = {
        req.request_id: idx
        for idx, req in enumerate(requests)
    }
    req_ids = [req.request_id for req in requests]
    model_runner_output = ModelRunnerOutput(
        req_ids=req_ids,
        req_id_to_index=req_to_index,
        sampled_token_ids=[[] for _ in range(len(requests))],
        spec_token_ids=None,
        logprobs=None,
        prompt_logprobs_dict={},
    )
    scheduler.update_from_output(output, model_runner_output)

    # Second step: the whole multimodal span is scheduled at once.
    output = scheduler.schedule()
    assert len(scheduler.running) == 1
    assert len(output.scheduled_new_reqs) == 0
    assert len(output.scheduled_cached_reqs) == 1
    assert len(output.finished_req_ids) == 0
    assert output.num_scheduled_tokens[first_req_id] == 800

    # Disabling chunked mm input with a max_num_batched_tokens that is too
    # small for the mm input must be rejected at construction time.
    with pytest.raises(ValueError):
        create_scheduler(
            model="llava-hf/llava-1.5-7b-hf",
            max_num_batched_tokens=100,
            disable_chunked_mm_input=True,
        )
@pytest.mark.parametrize("enable_prefix_caching", [True, False])
def test_schedule_concurrent_partial_requests(enable_prefix_caching: bool):
"""Test scheduling behavior with concurrent partial requests.
......
......@@ -53,6 +53,11 @@ def model_name():
return "meta-llama/Meta-Llama-3-8B-Instruct"
@pytest.fixture
def eagle_model_name():
    """Name of the EAGLE draft model used for speculative decoding."""
    draft_model = "yuhuili/EAGLE-LLaMA3-Instruct-8B"
    return draft_model
def test_ngram_correctness(
monkeypatch: pytest.MonkeyPatch,
test_prompts: list[list[dict[str, Any]]],
......@@ -95,3 +100,47 @@ def test_ngram_correctness(
# Upon failure, inspect the outputs to check for inaccuracy.
assert matches > int(0.7 * len(ref_outputs))
del spec_llm
def test_eagle_correctness(
    monkeypatch: pytest.MonkeyPatch,
    test_prompts: list[list[dict[str, Any]]],
    sampling_config: SamplingParams,
    model_name: str,
    eagle_model_name: str,
):
    '''
    Compare the outputs of the original LLM and a speculative LLM.
    They should be the same when using eagle speculative decoding.
    '''
    with monkeypatch.context() as m:
        m.setenv("VLLM_USE_V1", "1")

        # Reference run without speculative decoding.
        ref_llm = LLM(model=model_name, max_model_len=1024)
        ref_outputs = ref_llm.chat(test_prompts, sampling_config)
        # Drop the reference before the speculative LLM is created.
        del ref_llm

        spec_llm = LLM(
            model=model_name,
            speculative_config={
                "method": "eagle",
                "model": eagle_model_name,
                "num_speculative_tokens": 3,
            },
            max_model_len=1024,
        )
        spec_outputs = spec_llm.chat(test_prompts, sampling_config)

        # Count exact text matches; print every mismatching pair so that a
        # failure can be inspected from the test log.
        matches = 0
        for ref_output, spec_output in zip(ref_outputs, spec_outputs):
            if ref_output.outputs[0].text == spec_output.outputs[0].text:
                matches += 1
            else:
                print(f"ref_output: {ref_output.outputs[0].text}")
                print(f"spec_output: {spec_output.outputs[0].text}")

        # Heuristic: expect more than int(70%) of the prompts to match
        # exactly (strict inequality).
        # Upon failure, inspect the outputs to check for inaccuracy.
        assert matches > int(0.7 * len(ref_outputs))
        del spec_llm
......@@ -66,15 +66,17 @@ def test_defaults_with_usage_context():
# For H100 and H200, we use larger default values.
default_llm_tokens = 16384
default_server_tokens = 8192
default_max_num_seqs = 1024
else:
default_llm_tokens = 8192
default_server_tokens = 2048
default_max_num_seqs = 256
assert vllm_config.scheduler_config.max_num_seqs == 1024
assert vllm_config.scheduler_config.max_num_seqs == default_max_num_seqs
assert vllm_config.scheduler_config.max_num_batched_tokens == default_llm_tokens # noqa: E501
engine_args = EngineArgs(model=os.path.join(models_path_prefix, "facebook/opt-125m"))
vllm_config = engine_args.create_engine_config(
UsageContext.OPENAI_API_SERVER)
assert vllm_config.scheduler_config.max_num_seqs == 1024
assert vllm_config.scheduler_config.max_num_seqs == default_max_num_seqs
assert vllm_config.scheduler_config.max_num_batched_tokens == default_server_tokens # noqa: E501
......@@ -3,9 +3,11 @@
import asyncio
import time
import uuid
from threading import Thread
from typing import Optional
import os
import psutil
import pytest
from transformers import AutoTokenizer
......@@ -247,3 +249,42 @@ async def test_engine_core_client_asyncio(monkeypatch: pytest.MonkeyPatch):
await core_client.call_utility_async("echo", None, "help!")
assert str(e_info.value) == "Call to echo method failed: help!"
@pytest.mark.timeout(10)
def test_startup_failure(monkeypatch: pytest.MonkeyPatch):
    """Killing the engine core process during startup must raise an error.

    A background thread kills the first new child process of this test
    process while the client is being created. The resulting exception is
    expected to mention "Engine core initialization failed".
    """
    with monkeypatch.context() as m, pytest.raises(Exception) as e_info:
        m.setenv("VLLM_USE_V1", "1")
        engine_args = EngineArgs(model=MODEL_NAME)
        vllm_config = engine_args.create_engine_config(
            usage_context=UsageContext.UNKNOWN_CONTEXT)
        executor_class = Executor.get_class(vllm_config)
        # Start another thread to wait for engine core process to start
        # and kill it - simulate fatal uncaught process exit.
        this_proc = psutil.Process()
        # Snapshot existing children so only newly spawned ones are killed.
        children_before = set(this_proc.children())

        def kill_first_child():
            # Poll every 0.5s until a new child shows up, then kill it once.
            while True:
                time.sleep(0.5)
                children = set(this_proc.children()) - children_before
                if children:
                    child = children.pop()
                    print("Killing child core process", child.pid)
                    child.kill()
                    break

        # Daemon thread: it must not keep the interpreter alive on failure.
        Thread(target=kill_first_child, daemon=True).start()
        _core_client = EngineCoreClient.make_client(
            multiprocess_mode=True,
            asyncio_mode=True,
            vllm_config=vllm_config,
            executor_class=executor_class,
            log_stats=True,
        )
    # Checked after the pytest.raises context has captured the exception.
    assert "Engine core initialization failed" in str(e_info.value)
......@@ -325,6 +325,45 @@ def test_structured_output(
output_json = json.loads(generated_text)
jsonschema.validate(instance=output_json, schema=json_schema)
#
# Test 10: Generate structured with minLength and maxLength
#
min_length = 50
max_length = 50
json_schema = {
"type": "object",
"properties": {
"description": {
"type": "string",
"maxLength": max_length,
"minLength": min_length
}
},
"required": ["description"]
}
sampling_params = SamplingParams(
temperature=1.0,
max_tokens=1000,
guided_decoding=GuidedDecodingParams(json=json_schema))
outputs = llm.generate(
prompts="Generate a description of a frog using 50 characters.",
sampling_params=sampling_params,
use_tqdm=True)
assert outputs is not None
for output in outputs:
assert output is not None
assert isinstance(output, RequestOutput)
prompt = output.prompt
generated_text = output.outputs[0].text
assert generated_text is not None
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
output_json = json.loads(generated_text)
jsonschema.validate(instance=output_json, schema=json_schema)
@pytest.mark.skip_global_cleanup
@pytest.mark.parametrize("model_name, tokenizer_mode",
......
......@@ -13,14 +13,6 @@ def unsupported_string_schemas():
"type": "string",
"pattern": "^[a-zA-Z]+$"
},
{
"type": "string",
"minLength": 1
},
{
"type": "string",
"maxLength": 100
},
{
"type": "string",
"format": "email"
......@@ -164,6 +156,14 @@ def supported_schema():
"type": "string",
"enum": ["sedan", "suv", "truck"]
},
"short_description": {
"type": "string",
"maxLength": 50
},
"long_description": {
"type": "string",
"minLength": 50
},
"address": {
"type": "object",
"properties": {
......
# SPDX-License-Identifier: Apache-2.0
from collections import UserDict
from dataclasses import dataclass
import numpy as np
import torch
from vllm.v1.serial_utils import MsgpackDecoder, MsgpackEncoder
class UnrecognizedType(UserDict):
    """Empty ``UserDict`` subclass that carries one extra integer attribute.

    NOTE(review): presumably stands in for a type the msgpack encoder has
    no dedicated handling for - confirm against ``vllm.v1.serial_utils``.
    """

    def __init__(self, an_int: int):
        super().__init__()
        # Stored as a plain attribute, not as a dict entry.
        self.an_int = an_int
@dataclass
class MyType:
    """Payload used by the encode/decode round-trip test in this file."""
    # Plain tensor field.
    tensor1: torch.Tensor
    a_string: str
    # Tensors nested inside a list.
    list_of_tensors: list[torch.Tensor]
    numpy_array: np.ndarray
    # Custom UserDict subclass instance (see UnrecognizedType).
    unrecognized: UnrecognizedType
    # The test fills these with transposed tensors of two different sizes.
    small_f_contig_tensor: torch.Tensor
    large_f_contig_tensor: torch.Tensor
    # The test fills these with column slices of two different sizes.
    small_non_contig_tensor: torch.Tensor
    large_non_contig_tensor: torch.Tensor
def test_encode_decode():
    """Test encode/decode loop with zero-copy tensors."""
    tensor1 = torch.randint(low=0, high=100, size=(1024, ), dtype=torch.int32)
    list_of_tensors = [
        torch.rand((1, 10), dtype=torch.float32),
        torch.rand((3, 5, 4000), dtype=torch.float64),
        torch.tensor(1984),  # test scalar too
    ]
    obj = MyType(
        tensor1=tensor1,
        a_string="hello",
        list_of_tensors=list_of_tensors,
        numpy_array=np.arange(512),
        unrecognized=UnrecognizedType(33),
        small_f_contig_tensor=torch.rand(5, 4).t(),
        large_f_contig_tensor=torch.rand(1024, 4).t(),
        small_non_contig_tensor=torch.rand(2, 4)[:, 1:3],
        large_non_contig_tensor=torch.rand(1024, 512)[:, 10:20],
    )

    encoder = MsgpackEncoder()
    decoder = MsgpackDecoder(MyType)

    # Expected buffers: the main buffer + 4 large tensor buffers
    # + 1 large numpy array; the small tensors are encoded inline.
    # NOTE(review): the previous comment described "large" as "<= 512
    # bytes", but every buffer counted as large here is several KiB, so the
    # limit is presumably a lower bound - confirm against serial_utils.
    expected_num_buffers = 6

    encoded = encoder.encode(obj)
    assert len(encoded) == expected_num_buffers
    decoded: MyType = decoder.decode(encoded)
    assert_equal(decoded, obj)

    # Test encode_into case
    preallocated = bytearray()
    encoded2 = encoder.encode_into(obj, preallocated)
    assert len(encoded2) == expected_num_buffers
    # The caller-provided bytearray must be reused as the first buffer.
    assert encoded2[0] is preallocated
    decoded2: MyType = decoder.decode(encoded2)
    assert_equal(decoded2, obj)
def assert_equal(obj1: MyType, obj2: MyType):
    """Assert that two ``MyType`` instances hold equal field values.

    Tensors are compared with ``torch.equal``, the numpy array with
    ``np.array_equal``, and ``unrecognized`` by its ``an_int`` attribute.

    Raises:
        AssertionError: on the first field that differs.
    """
    assert torch.equal(obj1.tensor1, obj2.tensor1)
    assert obj1.a_string == obj2.a_string
    # zip() stops at the shorter list, so compare the lengths explicitly;
    # otherwise a list that lost trailing tensors would still pass.
    assert len(obj1.list_of_tensors) == len(obj2.list_of_tensors)
    assert all(
        torch.equal(a, b)
        for a, b in zip(obj1.list_of_tensors, obj2.list_of_tensors))
    assert np.array_equal(obj1.numpy_array, obj2.numpy_array)
    assert obj1.unrecognized.an_int == obj2.unrecognized.an_int
    assert torch.equal(obj1.small_f_contig_tensor, obj2.small_f_contig_tensor)
    assert torch.equal(obj1.large_f_contig_tensor, obj2.large_f_contig_tensor)
    assert torch.equal(obj1.small_non_contig_tensor,
                       obj2.small_non_contig_tensor)
    assert torch.equal(obj1.large_non_contig_tensor,
                       obj2.large_non_contig_tensor)
......@@ -4,9 +4,7 @@ from unittest.mock import ANY, patch
import torch
from vllm.attention.backends.abstract import AttentionType
from vllm.v1.attention.backends.pallas import (NUM_KV_PAGES_PER_BLOCK,
NUM_QUERIES_PER_BLOCK,
PallasAttentionBackendImpl,
from vllm.v1.attention.backends.pallas import (PallasAttentionBackendImpl,
PallasMetadata)
......@@ -32,8 +30,6 @@ def test_ragged_paged_attention():
logits_soft_cap=logits_soft_cap,
attn_type=AttentionType.DECODER,
)
mock_vmem_limit_bytes = 1024
attn_impl.vmem_limit_bytes = mock_vmem_limit_bytes
class FakeAttentionLayer:
_k_scale_float: float
......@@ -88,9 +84,9 @@ def test_ragged_paged_attention():
ANY, # block_tables
ANY, # query_start_loc
ANY, # num_seqs
num_kv_pages_per_block=NUM_KV_PAGES_PER_BLOCK,
num_queries_per_block=NUM_QUERIES_PER_BLOCK,
vmem_limit_bytes=mock_vmem_limit_bytes,
num_kv_pages_per_block=None,
num_queries_per_block=None,
vmem_limit_bytes=None,
use_kernel=True,
sm_scale=scale,
sliding_window=sliding_window,
......
......@@ -34,3 +34,8 @@ def test_sampler_different(model_name: str):
sampling_params = SamplingParams(temperature=0.1, min_p=0.8, max_tokens=64)
output2 = llm.generate(prompts, sampling_params)
assert output[0].outputs[0].text != output2[0].outputs[0].text
with pytest.raises(ValueError):
# Unsupported `seed` param.
sampling_params = SamplingParams(temperature=0.3, seed=42)
output2 = llm.generate(prompts, sampling_params)
......@@ -7,9 +7,9 @@ from vllm.config import CacheConfig, ModelConfig, SchedulerConfig, VllmConfig
from vllm.sampling_params import SamplingParams
from vllm.v1.core.sched.output import (CachedRequestData, NewRequestData,
SchedulerOutput)
from vllm.v1.worker.tpu_model_runner import (TPUModelRunner,
_get_padded_token_len,
_get_paddings)
from vllm.v1.worker.tpu_model_runner import (
TPUModelRunner, _get_padded_num_reqs_with_upper_limit,
_get_padded_token_len, _get_req_paddings, _get_token_paddings)
# Mock torch_xla module since it may not be available in the test environments
torch_xla_patcher = mock.patch.dict(
......@@ -296,16 +296,29 @@ def test_update_states_request_unscheduled(model_runner):
def test_get_paddings():
min_token_size, max_token_size, padding_gap = 16, 512, 64
expected_paddings = [16, 32, 64, 128, 192, 256, 320, 384, 448, 512]
actual_paddings = _get_paddings(min_token_size, max_token_size,
padding_gap)
actual_paddings = _get_token_paddings(min_token_size, max_token_size,
padding_gap)
assert actual_paddings == expected_paddings
def test_get_padded_token_len():
min_token_size, max_token_size, padding_gap = 16, 512, 64
paddings = _get_paddings(min_token_size, max_token_size, padding_gap)
paddings = _get_token_paddings(min_token_size, max_token_size, padding_gap)
assert _get_padded_token_len(paddings, 1) == 16
assert _get_padded_token_len(paddings, 16) == 16
assert _get_padded_token_len(paddings, 20) == 32
assert _get_padded_token_len(paddings, 300) == 320
assert _get_padded_token_len(paddings, 512) == 512
def test_get_padded_num_reqs_with_upper_limit():
    """Check the padded request count for several (count, limit) pairs."""
    # (num_reqs, upper_limit, expected padded num_reqs)
    cases = (
        (3, 32, 8),
        (9, 32, 16),
        (19, 32, 32),
        (17, 28, 28),
    )
    for num_reqs, upper_limit, expected in cases:
        padded = _get_padded_num_reqs_with_upper_limit(num_reqs, upper_limit)
        assert padded == expected
def test_get_req_paddings():
    """Check the request padding lists for several (min, max) pairs."""
    # (min_req_size, max_req_size, expected paddings)
    cases = (
        (1, 32, [8, 16, 32]),
        (8, 32, [8, 16, 32]),
        (8, 36, [8, 16, 32, 36]),
    )
    for min_req_size, max_req_size, expected in cases:
        paddings = _get_req_paddings(min_req_size, max_req_size)
        assert paddings == expected
#!/bin/bash
# Update Dockerfile dependency graph when docker/Dockerfile changes.
# This script is designed to be used as a pre-commit hook.
#
# Exit status:
#   0 - docker/Dockerfile is not staged, Docker is unavailable, or the
#       regenerated graph is identical to the existing one.
#   1 - the graph changed (the updated file must be staged) or the generated
#       PNG could not be found.

set -euo pipefail

# Check if docker/Dockerfile is staged for commit.
# The file list is captured first instead of being piped into `grep -q`:
# with `pipefail`, grep exiting on its first match can terminate `git diff`
# with SIGPIPE, which would make the pipeline report "not staged".
STAGED_FILES=$(git diff --cached --name-only)
if grep -qxF "docker/Dockerfile" <<< "$STAGED_FILES"; then
  echo "docker/Dockerfile has changed, attempting to update dependency graph..."

  # Check if Docker is installed and running
  if ! command -v docker &> /dev/null; then
    echo "Warning: Docker command not found. Skipping Dockerfile graph update."
    echo "Please install Docker to automatically update the graph: https://docs.docker.com/get-docker/"
    exit 0
  fi
  if ! docker info &> /dev/null; then
    echo "Warning: Docker daemon is not running. Skipping Dockerfile graph update."
    echo "Please start Docker to automatically update the graph."
    exit 0
  fi

  # Define the target file path
  TARGET_GRAPH_FILE="docs/source/assets/contributing/dockerfile-stages-dependency.png"

  # Ensure target directory exists
  mkdir -p "$(dirname "$TARGET_GRAPH_FILE")"

  # Store old image hash in a variable if the file exists
  OLD_HASH=""
  if [ -f "$TARGET_GRAPH_FILE" ]; then
    OLD_HASH=$(sha256sum "$TARGET_GRAPH_FILE")
  fi

  # Generate Dockerfile graph
  echo "Running dockerfilegraph tool..."
  docker run \
    --rm \
    --user "$(id -u):$(id -g)" \
    --workdir /workspace \
    --volume "$(pwd)":/workspace \
    ghcr.io/patrickhoefler/dockerfilegraph:alpine \
    --output png \
    --dpi 200 \
    --max-label-length 50 \
    --filename docker/Dockerfile \
    --legend

  echo "Finding generated PNG file..."
  # Check for Dockerfile.png in the root directory (most likely location)
  if [ -f "./Dockerfile.png" ]; then
    echo "Found generated file at: ./Dockerfile.png"
    mv "./Dockerfile.png" "$TARGET_GRAPH_FILE"
  else
    # Try to find it elsewhere. `-print -quit` stops after the first match
    # without a `| head -1` pipe, which could fail under `pipefail` when
    # head closes the pipe early.
    DOCKERFILE_PNG=$(find . -name "Dockerfile.png" -type f -print -quit)
    if [ -n "$DOCKERFILE_PNG" ]; then
      echo "Found generated file at: $DOCKERFILE_PNG"
      mv "$DOCKERFILE_PNG" "$TARGET_GRAPH_FILE"
    else
      echo "Error: Could not find the generated PNG file"
      # List recently modified PNGs to help with debugging.
      find . -name "*.png" -type f -mmin -5
      exit 1
    fi
  fi

  # Check if the graph has changed
  NEW_HASH=$(sha256sum "$TARGET_GRAPH_FILE")
  if [ "$NEW_HASH" != "$OLD_HASH" ]; then
    echo "Graph has changed. Please stage the updated file: $TARGET_GRAPH_FILE"
    exit 1
  else
    echo "No changes in graph detected."
  fi
fi

exit 0
......@@ -469,6 +469,17 @@ def mla_decode_kvcache_cpu(
block_tables, seq_lens)
# merge attn states ops
def merge_attn_states(output: torch.Tensor,
                      prefix_output: torch.Tensor,
                      prefix_lse: torch.Tensor,
                      suffix_output: torch.Tensor,
                      suffix_lse: torch.Tensor,
                      output_lse: Optional[torch.Tensor] = None) -> None:
    """Forward to the custom op ``torch.ops._C.merge_attn_states``.

    Returns ``None``.
    NOTE(review): presumably the op writes the merged prefix/suffix
    attention result into ``output`` (and ``output_lse`` when given) -
    confirm against the kernel.
    """
    # The op takes output_lse as its second argument, whereas this wrapper
    # accepts it last so that it can be optional.
    torch.ops._C.merge_attn_states(output, output_lse, prefix_output,
                                   prefix_lse, suffix_output, suffix_lse)
# pos encoding ops
def rotary_embedding(
positions: torch.Tensor,
......
......@@ -326,7 +326,7 @@ class FlashAttentionMetadata(AttentionMetadata):
assert self.use_cuda_graph
if turn_prefills_into_decodes:
# When Mutli-Step is enabled with Chunked-Prefill, prefills and
# When Multi-Step is enabled with Chunked-Prefill, prefills and
# decodes are scheduled together. In the first step, all the
# prefills turn into decodes. This update reflects that
# conversion.
......@@ -619,10 +619,15 @@ class FlashAttentionImpl(AttentionImpl):
blocksparse_params: Optional[Dict[str, Any]] = None,
logits_soft_cap: Optional[float] = None,
attn_type: str = AttentionType.DECODER,
use_irope: bool = False,
) -> None:
if blocksparse_params is not None:
raise ValueError(
"FlashAttention does not support block-sparse attention.")
if use_irope:
logger.warning(
"Using irope in V0 is not supported yet, it will fall back "
"to global attention for long context.")
self.num_heads = num_heads
self.head_size = head_size
self.scale = float(scale)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment