Commit 6d2051cc authored by zhuwenwen
Browse files

Merge tag 'v0.6.3.post1' into v0.6.3.post1-dev

parents 2c7f740a a2c71c54
...@@ -2,14 +2,9 @@ ...@@ -2,14 +2,9 @@
Run `pytest tests/prefix_caching/test_prefix_caching.py`. Run `pytest tests/prefix_caching/test_prefix_caching.py`.
""" """
from typing import List
import pytest import pytest
from tests.kernels.utils import override_backend_env_variable from tests.kernels.utils import override_backend_env_variable
from vllm.block import PhysicalTokenBlock
from vllm.core.block_manager_v1 import CachedBlockAllocator
from vllm.utils import Device
from ..models.utils import check_outputs_equal from ..models.utils import check_outputs_equal
...@@ -18,86 +13,11 @@ MODELS = [ ...@@ -18,86 +13,11 @@ MODELS = [
] ]
@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("num_blocks", [16])
def test_block_allocator(
    block_size: int,
    num_blocks: int,
):
    """Check block sharing by hash, ref counting and the hit-rate metric.

    Every allocation in this test uses the same hash, so each call after
    the first is expected to return the block handed out by the first.
    """
    shared_hash = 1
    allocator = CachedBlockAllocator(Device.CPU, block_size, num_blocks)

    # Two allocations with the same hash must yield the same
    # PhysicalTokenBlock, now referenced twice.
    block_a = allocator.allocate(shared_hash, 0)
    block_b = allocator.allocate(shared_hash, 0)
    assert block_a == block_b
    assert block_b.ref_count == 2

    # Metric so far: 1 hit out of 2 queries.
    assert allocator.get_prefix_cache_hit_rate() == 0.5

    # Releasing one handle must decrement the count seen through the other.
    allocator.free(block_a)
    assert block_b.ref_count == 1

    # Release the remaining handle as well.
    allocator.free(block_b)

    # Even after the ref_count went to 0, allocating the same hash again
    # must give the very same block back.
    block_a = allocator.allocate(shared_hash, 0)
    assert block_a == block_b
    assert block_a.block_hash == shared_hash

    # A fourth query brings the metric to an easy-to-check 3/4.
    allocator.allocate(shared_hash, 0)
    assert allocator.get_prefix_cache_hit_rate() == 0.75
@pytest.mark.parametrize("num_blocks", [16])
def test_eviction(num_blocks: int, ):
    """Check that freed blocks are reused in least-recently-used order.

    The allocator is filled with ``num_blocks`` blocks hashed
    ``0..num_blocks-1``, all of which are then freed. Allocating unseen
    hashes afterwards must recycle the freed blocks oldest first, while
    re-allocating a known hash must revive its block and take it out of
    the free list.

    Args:
        num_blocks: Capacity of the allocator under test. The final
            assertion expects block number 2, so this must be at least 3.
    """
    block_size = 16
    block_allocator = CachedBlockAllocator(Device.CPU, block_size, num_blocks)

    # Fill the allocator, using the loop index as each block's hash.
    blocks: List[PhysicalTokenBlock] = [
        block_allocator.allocate(block_hash, 0)
        for block_hash in range(num_blocks)
    ]

    # Free all blocks, in the order they were allocated.
    for block in blocks:
        block_allocator.free(block)

    # Hashes 0..num_blocks-1 are already taken, so num_blocks is the first
    # hash guaranteed to be unseen. (Deriving it from block_size only
    # worked because both values happen to be 16.)
    # The new block must be the first block freed, i.e. the least
    # recently used one.
    new_block_hash = num_blocks
    new_block = block_allocator.allocate(new_block_hash, 0)
    assert (new_block == blocks[0])
    assert (new_block.block_hash == new_block_hash)

    # Reallocate the second of the original blocks to remove it from the
    # free list.
    realloc_block_hash = 1
    realloc_block = block_allocator.allocate(realloc_block_hash, 0)
    assert (realloc_block == blocks[realloc_block_hash])
    assert (realloc_block.block_hash == realloc_block_hash)

    # Another unseen hash must not be served by realloc_block, since that
    # block is no longer in the free list.
    new_block_hash = num_blocks + 1
    new_block = block_allocator.allocate(new_block_hash, 0)
    assert (realloc_block != new_block)
    assert (new_block.block_hash == new_block_hash)
    assert (new_block.block_number == 2)
@pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("backend", ["FLASH_ATTN", "FLASHINFER", "XFORMERS"]) @pytest.mark.parametrize("backend", ["FLASH_ATTN", "FLASHINFER", "XFORMERS"])
@pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [5]) @pytest.mark.parametrize("max_tokens", [5])
@pytest.mark.parametrize("cached_position", [0, 1]) @pytest.mark.parametrize("cached_position", [0, 1])
@pytest.mark.parametrize("use_v2_block_manager", [False, True])
def test_mixed_requests( def test_mixed_requests(
hf_runner, hf_runner,
vllm_runner, vllm_runner,
...@@ -107,7 +27,6 @@ def test_mixed_requests( ...@@ -107,7 +27,6 @@ def test_mixed_requests(
dtype: str, dtype: str,
max_tokens: int, max_tokens: int,
cached_position: int, cached_position: int,
use_v2_block_manager: bool,
monkeypatch, monkeypatch,
) -> None: ) -> None:
""" """
...@@ -125,7 +44,6 @@ def test_mixed_requests( ...@@ -125,7 +44,6 @@ def test_mixed_requests(
model, model,
dtype=dtype, dtype=dtype,
enable_prefix_caching=True, enable_prefix_caching=True,
use_v2_block_manager=use_v2_block_manager,
) as vllm_model: ) as vllm_model:
# Run the first prompt so the cache is populated # Run the first prompt so the cache is populated
vllm_outputs = vllm_model.generate_greedy([cached_prompt], max_tokens) vllm_outputs = vllm_model.generate_greedy([cached_prompt], max_tokens)
......
...@@ -9,22 +9,22 @@ import pytest ...@@ -9,22 +9,22 @@ import pytest
import torch import torch
from tests.quantization.utils import is_quant_method_supported from tests.quantization.utils import is_quant_method_supported
from tests.utils import fork_new_process_for_each_test
from ..utils import fork_new_process_for_each_test
models_4bit_to_test = [ models_4bit_to_test = [
('huggyllama/llama-7b', 'quantize model inflight'), ("facebook/opt-125m", "quantize opt model inflight"),
] ]
models_pre_qaunt_4bit_to_test = [ models_pre_qaunt_4bit_to_test = [
('lllyasviel/omost-llama-3-8b-4bits',
'read pre-quantized 4-bit NF4 model'),
('PrunaAI/Einstein-v6.1-Llama3-8B-bnb-4bit-smashed', ('PrunaAI/Einstein-v6.1-Llama3-8B-bnb-4bit-smashed',
'read pre-quantized 4-bit FP4 model'), 'read pre-quantized 4-bit FP4 model'),
('poedator/opt-125m-bnb-4bit', 'read pre-quantized 4-bit NF4 opt model'),
] ]
models_pre_quant_8bit_to_test = [ models_pre_quant_8bit_to_test = [
('meta-llama/Llama-Guard-3-8B-INT8', 'read pre-quantized 8-bit model'), ('meta-llama/Llama-Guard-3-8B-INT8',
'read pre-quantized llama 8-bit model'),
("yec019/fbopt-350m-8bit", "read pre-quantized 8-bit opt model"),
] ]
...@@ -133,6 +133,7 @@ def validate_generated_texts(hf_runner, ...@@ -133,6 +133,7 @@ def validate_generated_texts(hf_runner,
hf_str = hf_log["generated_text"] hf_str = hf_log["generated_text"]
vllm_str = vllm_log["generated_text"] vllm_str = vllm_log["generated_text"]
prompt = hf_log["prompt"] prompt = hf_log["prompt"]
assert hf_str == vllm_str, (f"Model: {model_name}" assert hf_str == vllm_str, (f"Model: {model_name}"
f"Mismatch between HF and vLLM outputs:\n" f"Mismatch between HF and vLLM outputs:\n"
f"Prompt: {prompt}\n" f"Prompt: {prompt}\n"
......
...@@ -2,26 +2,28 @@ ...@@ -2,26 +2,28 @@
Run `pytest tests/quantization/test_compressed_tensors.py`. Run `pytest tests/quantization/test_compressed_tensors.py`.
""" """
from typing import Optional
import pytest import pytest
import torch import torch
from compressed_tensors.quantization import QuantizationType
from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import ( # noqa: E501 from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import ( # noqa: E501
CompressedTensorsLinearMethod, CompressedTensorsW4A16Sparse24, CompressedTensorsLinearMethod, CompressedTensorsW4A16Sparse24,
CompressedTensorsW8A8Fp8, CompressedTensorsW8A8Int8, CompressedTensorsW8A8Fp8, CompressedTensorsW8A8Int8,
CompressedTensorsW8A16Fp8, CompressedTensorsWNA16) CompressedTensorsW8A16Fp8, CompressedTensorsWNA16)
from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
QuantizationType)
@pytest.mark.parametrize("model_args", [ @pytest.mark.parametrize(
("nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", "tensor", "model_args",
QuantizationType.INT, 2560), [("nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", "tensor",
("nm-testing/tinyllama-oneshot-w8-channel-a8-tensor", "channel", QuantizationType.INT, 2560, True),
QuantizationType.INT, 2560), ("nm-testing/tinyllama-oneshot-w8-channel-a8-tensor", "channel",
]) QuantizationType.INT, 2560, True),
("nm-testing/asym-w8w8-int8-static-per-tensor-tiny-llama", "tensor",
QuantizationType.INT, 2560, False)])
def test_compressed_tensors_w8a8_static_setup(vllm_runner, model_args): def test_compressed_tensors_w8a8_static_setup(vllm_runner, model_args):
model_path, strategy, quant_type, shape_0 = model_args model_path, strategy, quant_type, shape_0, is_symmetric = model_args
with vllm_runner(model_path, enforce_eager=True) as llm: with vllm_runner(model_path, enforce_eager=True) as llm:
model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501
layer = model.model.layers[0] layer = model.model.layers[0]
...@@ -31,6 +33,18 @@ def test_compressed_tensors_w8a8_static_setup(vllm_runner, model_args): ...@@ -31,6 +33,18 @@ def test_compressed_tensors_w8a8_static_setup(vllm_runner, model_args):
gate_up_proj = layer.mlp.gate_up_proj gate_up_proj = layer.mlp.gate_up_proj
down_proj = layer.mlp.down_proj down_proj = layer.mlp.down_proj
# assert zp for symmetric and asymmetric cases
def zp_valid(zp: Optional[torch.Tensor]):
if is_symmetric:
return zp is None
return zp is not None and zp.dtype is torch.int32
assert zp_valid(qkv_proj.input_zero_point)
assert zp_valid(o_proj.input_zero_point)
assert zp_valid(gate_up_proj.input_zero_point)
assert zp_valid(down_proj.input_zero_point)
assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod) assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
assert isinstance(o_proj.quant_method, CompressedTensorsLinearMethod) assert isinstance(o_proj.quant_method, CompressedTensorsLinearMethod)
assert isinstance(gate_up_proj.quant_method, assert isinstance(gate_up_proj.quant_method,
...@@ -69,9 +83,12 @@ def test_compressed_tensors_no_enforce_eager(vllm_runner): ...@@ -69,9 +83,12 @@ def test_compressed_tensors_no_enforce_eager(vllm_runner):
@pytest.mark.parametrize("model_args", [ @pytest.mark.parametrize("model_args", [
("nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2", "tensor"), ("nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2", "tensor"),
("nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2-asym", "tensor"),
("nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2", "channel"), ("nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2", "channel"),
("nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2-asym",
"channel"),
]) ])
def test_compressed_tensors_w8a8_dynanmic_per_token(vllm_runner, model_args): def test_compressed_tensors_w8a8_dynamic_per_token(vllm_runner, model_args):
model_path, strategy = model_args model_path, strategy = model_args
with vllm_runner(model_path, dtype=torch.float16) as llm: with vllm_runner(model_path, dtype=torch.float16) as llm:
model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501
...@@ -160,4 +177,4 @@ def test_compressed_tensors_kv_cache(vllm_runner): ...@@ -160,4 +177,4 @@ def test_compressed_tensors_kv_cache(vllm_runner):
model_path = "nm-testing/TinyLlama-1.1B-compressed-tensors-kv-cache-scheme" model_path = "nm-testing/TinyLlama-1.1B-compressed-tensors-kv-cache-scheme"
with vllm_runner(model_path, kv_cache_dtype="fp8") as llm: with vllm_runner(model_path, kv_cache_dtype="fp8") as llm:
output = llm.generate_greedy("Hello world!", max_tokens=20) output = llm.generate_greedy("Hello world!", max_tokens=20)
assert output assert output
\ No newline at end of file
"""Test model set-up and inference for quantized HF models supported
on the CPU backend using IPEX (including AWQ).
Validating the configuration and printing results for manual checking.
Run `pytest tests/quantization/test_ipex_quant.py`.
"""
import pytest
from vllm.platforms import current_platform
# Model identifiers that test_ipex_quant is parametrized over.
MODELS = [
"casperhansen/llama-3-8b-instruct-awq",
]
# dtype strings that test_ipex_quant is parametrized over.
DTYPE = ["bfloat16"]
@pytest.mark.skipif(not current_platform.is_cpu(),
                    reason="only supports the CPU backend.")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", DTYPE)
def test_ipex_quant(vllm_runner, model, dtype):
    """Run a short greedy generation with a quantized model on CPU.

    The test only asserts that the generation result is truthy; the
    result is printed so it can be checked manually.
    """
    prompts = ["The capital of France is"]
    with vllm_runner(model, dtype=dtype) as llm:
        output = llm.generate_greedy(prompts, max_tokens=32)
    assert output
    print(output)
...@@ -33,8 +33,8 @@ def test_beam_search_single_input( ...@@ -33,8 +33,8 @@ def test_beam_search_single_input(
max_tokens) max_tokens)
with vllm_runner(model, dtype=dtype) as vllm_model: with vllm_runner(model, dtype=dtype) as vllm_model:
vllm_outputs = vllm_model.generate_beam_search_new( vllm_outputs = vllm_model.generate_beam_search(example_prompts,
example_prompts, beam_width, max_tokens) beam_width, max_tokens)
for i in range(len(example_prompts)): for i in range(len(example_prompts)):
hf_output_ids, hf_output_texts = hf_outputs[i] hf_output_ids, hf_output_texts = hf_outputs[i]
......
import itertools import itertools
import random import random
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple from typing import Dict, List, Optional, Tuple
from unittest.mock import Mock, patch from unittest.mock import Mock, patch
...@@ -158,26 +159,6 @@ def test_sampler_all_random_seed_deterministic(seed: int, device: str): ...@@ -158,26 +159,6 @@ def test_sampler_all_random_seed_deterministic(seed: int, device: str):
assert first_sampler_output == second_sampler_output assert first_sampler_output == second_sampler_output
@pytest.mark.parametrize("seed", RANDOM_SEEDS)
@pytest.mark.parametrize("device", CUDA_DEVICES)
def test_sampler_all_beam(seed: int, device: str):
    """Smoke-test the sampler on a batch made up only of beam search.

    Nothing is asserted about the sampled output; the test passes as long
    as the sampler raises no exception.
    """
    set_random_seed(seed)
    torch.set_default_device(device)
    batch_size = random.randint(1, 256)
    _, fake_logits, sampler = _prepare_test(batch_size)

    beam_params = SamplingParams(temperature=0,
                                 best_of=2,
                                 use_beam_search=True)

    # Deliberately no assertion: it is unclear how to decide whether the
    # outputs are the expected ones, so this only exercises the all-beam
    # search path for exceptions.
    _do_sample(batch_size, fake_logits, sampler, beam_params, device)
@pytest.mark.parametrize("seed", RANDOM_SEEDS) @pytest.mark.parametrize("seed", RANDOM_SEEDS)
@pytest.mark.parametrize("device", CUDA_DEVICES) @pytest.mark.parametrize("device", CUDA_DEVICES)
def test_sampler_min_tokens_penalty(seed: int, device: str): def test_sampler_min_tokens_penalty(seed: int, device: str):
...@@ -433,7 +414,7 @@ def test_sampler_min_tokens_penalty(seed: int, device: str): ...@@ -433,7 +414,7 @@ def test_sampler_min_tokens_penalty(seed: int, device: str):
sampling_metadata = SamplingMetadata.prepare( sampling_metadata = SamplingMetadata.prepare(
seq_group_metadata_list, seq_group_metadata_list,
seq_lens=seq_lens if seq_lens else None, seq_lens=seq_lens if seq_lens else None,
query_lens=seq_lens if seq_lens else None, query_lens=seq_lens if seq_lens else [1] * batch_size,
device=device, device=device,
pin_memory=is_pin_memory_available()) pin_memory=is_pin_memory_available())
# the logits tensor is modified in-place by the sampler # the logits tensor is modified in-place by the sampler
...@@ -478,7 +459,7 @@ def test_sampler_mixed(seed: int, device: str): ...@@ -478,7 +459,7 @@ def test_sampler_mixed(seed: int, device: str):
seq_lens: List[int] = [] seq_lens: List[int] = []
for i in range(batch_size): for i in range(batch_size):
expected: Optional[List[int]] = None expected: Optional[List[int]] = None
sampling_type = random.randint(0, 3) sampling_type = random.randint(0, 2)
if sampling_type == 0: if sampling_type == 0:
sampling_params = SamplingParams(temperature=0) sampling_params = SamplingParams(temperature=0)
expected = [int(torch.argmax(fake_logits[i], dim=-1).item())] expected = [int(torch.argmax(fake_logits[i], dim=-1).item())]
...@@ -497,10 +478,7 @@ def test_sampler_mixed(seed: int, device: str): ...@@ -497,10 +478,7 @@ def test_sampler_mixed(seed: int, device: str):
for idx in range(n): for idx in range(n):
fake_logits[i, i + idx] = 1e2 fake_logits[i, i + idx] = 1e2
expected = list(range(i, i + n)) expected = list(range(i, i + n))
else:
sampling_params = SamplingParams(temperature=0,
use_beam_search=True,
best_of=2)
expected_tokens.append(expected) expected_tokens.append(expected)
seq_group_metadata_list.append( seq_group_metadata_list.append(
SequenceGroupMetadata( SequenceGroupMetadata(
...@@ -529,9 +507,6 @@ def test_sampler_mixed(seed: int, device: str): ...@@ -529,9 +507,6 @@ def test_sampler_mixed(seed: int, device: str):
zip(sampler_output, seq_group_metadata_list)): zip(sampler_output, seq_group_metadata_list)):
assert metadata.sampling_params is not None assert metadata.sampling_params is not None
if metadata.sampling_params.use_beam_search:
continue
if (metadata.sampling_params.seed is not None if (metadata.sampling_params.seed is not None
and expected_tokens[i] is None): and expected_tokens[i] is None):
# Record seeded random result to compare with results of # Record seeded random result to compare with results of
...@@ -596,8 +571,19 @@ def test_sampler_top_k_top_p(seed: int, device: str): ...@@ -596,8 +571,19 @@ def test_sampler_top_k_top_p(seed: int, device: str):
generation_config = GenerationConfig(top_k=top_k, generation_config = GenerationConfig(top_k=top_k,
top_p=top_p, top_p=top_p,
do_sample=True) do_sample=True)
warpers = generation_model._get_logits_warper(generation_config, device)
assert len(warpers) == 2 # top_p and top_k @dataclass
class MockConfig:
is_encoder_decoder: bool = False
generation_model.config = MockConfig() # needed by the following method
generation_model._prepare_special_tokens(generation_config, device=device)
processors = generation_model._get_logits_processor(generation_config,
None,
None,
None, [],
device=device)
assert len(processors) == 2 # top_p and top_k
seq_group_metadata_list: List[SequenceGroupMetadata] = [] seq_group_metadata_list: List[SequenceGroupMetadata] = []
seq_lens: List[int] = [] seq_lens: List[int] = []
...@@ -639,7 +625,7 @@ def test_sampler_top_k_top_p(seed: int, device: str): ...@@ -639,7 +625,7 @@ def test_sampler_top_k_top_p(seed: int, device: str):
assert sample_probs is not None assert sample_probs is not None
hf_probs = warpers(torch.zeros_like(fake_logits), fake_logits.clone()) hf_probs = processors(torch.zeros_like(fake_logits), fake_logits.clone())
hf_probs = torch.softmax(hf_probs, dim=-1, dtype=torch.float) hf_probs = torch.softmax(hf_probs, dim=-1, dtype=torch.float)
torch.testing.assert_close(hf_probs, sample_probs, rtol=0.0, atol=1e-5) torch.testing.assert_close(hf_probs, sample_probs, rtol=0.0, atol=1e-5)
assert torch.equal(hf_probs.eq(0), sample_probs.eq(0)) assert torch.equal(hf_probs.eq(0), sample_probs.eq(0))
......
...@@ -5,16 +5,11 @@ from vllm import SamplingParams ...@@ -5,16 +5,11 @@ from vllm import SamplingParams
from .conftest import get_output_from_llm_generator from .conftest import get_output_from_llm_generator
@pytest.mark.parametrize( @pytest.mark.parametrize("common_llm_kwargs", [{
"common_llm_kwargs", "model": "JackFram/llama-68m",
[{ "speculative_model": "JackFram/llama-68m",
"model": "JackFram/llama-68m", "num_speculative_tokens": 5,
"speculative_model": "JackFram/llama-68m", }])
"num_speculative_tokens": 5,
# Required for spec decode.
"use_v2_block_manager": True
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [ @pytest.mark.parametrize("per_test_common_llm_kwargs", [
{ {
"enable_chunked_prefill": True, "enable_chunked_prefill": True,
...@@ -44,16 +39,11 @@ def test_spec_decode_xfail_chunked_prefill(test_llm_generator): ...@@ -44,16 +39,11 @@ def test_spec_decode_xfail_chunked_prefill(test_llm_generator):
sampling_params) sampling_params)
@pytest.mark.parametrize( @pytest.mark.parametrize("common_llm_kwargs", [{
"common_llm_kwargs", "model": "meta-llama/Llama-2-7b-chat-hf",
[{ "speculative_model": "JackFram/llama-68m",
"model": "meta-llama/Llama-2-7b-chat-hf", "num_speculative_tokens": 5,
"speculative_model": "JackFram/llama-68m", }])
"num_speculative_tokens": 5,
# Required for spec decode.
"use_v2_block_manager": True
}])
@pytest.mark.parametrize( @pytest.mark.parametrize(
"per_test_common_llm_kwargs", "per_test_common_llm_kwargs",
[ [
...@@ -94,33 +84,3 @@ def test_spec_decode_xfail_spec_max_model_len(test_llm_generator): ...@@ -94,33 +84,3 @@ def test_spec_decode_xfail_spec_max_model_len(test_llm_generator):
with pytest.raises(ValueError, match="cannot be larger than"): with pytest.raises(ValueError, match="cannot be larger than"):
get_output_from_llm_generator(test_llm_generator, prompts, get_output_from_llm_generator(test_llm_generator, prompts,
sampling_params) sampling_params)
@pytest.mark.parametrize("common_llm_kwargs", [{
    "model": "JackFram/llama-68m",
    "speculative_model": "JackFram/llama-68m",
    "num_speculative_tokens": 5,
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [{}])
@pytest.mark.parametrize("seed", [1])
def test_spec_decode_xfail_block_manager_v1(test_llm_generator):
    """Expect speculative decoding under block manager v1 to be rejected.

    Running the generator must raise a ``ValueError`` whose message
    mentions that the V2 block manager is required.
    """
    prompts = ["Hello, my name is"]
    sampling_params = SamplingParams(max_tokens=128,
                                     ignore_eos=True,
                                     temperature=0.0)
    expected_message = "Speculative decoding requires usage of the V2"

    with pytest.raises(ValueError, match=expected_message):
        get_output_from_llm_generator(test_llm_generator, prompts,
                                      sampling_params)
...@@ -43,9 +43,6 @@ PRECISION = "float32" ...@@ -43,9 +43,6 @@ PRECISION = "float32"
# Skip cuda graph recording for fast test. # Skip cuda graph recording for fast test.
"enforce_eager": True, "enforce_eager": True,
# Required for spec decode.
"use_v2_block_manager": True,
# Print spec metrics. # Print spec metrics.
"disable_log_stats": False, "disable_log_stats": False,
...@@ -86,9 +83,6 @@ def test_eagle_e2e_greedy_correctness(vllm_runner, common_llm_kwargs, ...@@ -86,9 +83,6 @@ def test_eagle_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
# Skip cuda graph recording for fast test. # Skip cuda graph recording for fast test.
"enforce_eager": True, "enforce_eager": True,
# Required for spec decode.
"use_v2_block_manager": True,
# Print spec metrics. # Print spec metrics.
"disable_log_stats": False, "disable_log_stats": False,
...@@ -143,9 +137,6 @@ def test_eagle_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs, ...@@ -143,9 +137,6 @@ def test_eagle_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs,
[{ [{
"enforce_eager": False, "enforce_eager": False,
# Required for spec decode.
"use_v2_block_manager": True,
# Print spec metrics. # Print spec metrics.
"disable_log_stats": False, "disable_log_stats": False,
...@@ -191,9 +182,6 @@ def test_eagle_e2e_greedy_correctness_cuda_graph( ...@@ -191,9 +182,6 @@ def test_eagle_e2e_greedy_correctness_cuda_graph(
# Skip cuda graph recording for fast test. # Skip cuda graph recording for fast test.
"enforce_eager": True, "enforce_eager": True,
# Required for spec decode.
"use_v2_block_manager": True,
# Precision # Precision
"dtype": PRECISION, "dtype": PRECISION,
...@@ -235,9 +223,6 @@ def test_eagle_e2e_greedy_correctness_with_preemption( ...@@ -235,9 +223,6 @@ def test_eagle_e2e_greedy_correctness_with_preemption(
# Skip cuda graph recording for fast test. # Skip cuda graph recording for fast test.
"enforce_eager": True, "enforce_eager": True,
# Required for spec decode.
"use_v2_block_manager": True,
# Precision # Precision
"dtype": PRECISION, "dtype": PRECISION,
...@@ -283,9 +268,6 @@ def test_eagle_different_k(vllm_runner, common_llm_kwargs, ...@@ -283,9 +268,6 @@ def test_eagle_different_k(vllm_runner, common_llm_kwargs,
# Skip cuda graph recording for fast test. # Skip cuda graph recording for fast test.
"enforce_eager": True, "enforce_eager": True,
# Required for spec decode.
"use_v2_block_manager": True,
# Precision # Precision
"dtype": PRECISION, "dtype": PRECISION,
......
...@@ -12,8 +12,6 @@ MAIN_MODEL = "JackFram/llama-68m" ...@@ -12,8 +12,6 @@ MAIN_MODEL = "JackFram/llama-68m"
@pytest.mark.parametrize( @pytest.mark.parametrize(
"common_llm_kwargs", "common_llm_kwargs",
[{ [{
# Required for spec decode.
"use_v2_block_manager": True,
# Verify equality when cuda graphs allowed. # Verify equality when cuda graphs allowed.
"enforce_eager": False, "enforce_eager": False,
...@@ -57,9 +55,6 @@ def test_spec_decode_cuda_graph(vllm_runner, common_llm_kwargs, ...@@ -57,9 +55,6 @@ def test_spec_decode_cuda_graph(vllm_runner, common_llm_kwargs,
# Skip cuda graph recording for fast test. # Skip cuda graph recording for fast test.
"enforce_eager": True, "enforce_eager": True,
# Required for spec decode.
"use_v2_block_manager": True,
}]) }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [ @pytest.mark.parametrize("per_test_common_llm_kwargs", [
{ {
...@@ -102,3 +97,44 @@ def test_speculative_model_quantization_config(vllm_runner, common_llm_kwargs, ...@@ -102,3 +97,44 @@ def test_speculative_model_quantization_config(vllm_runner, common_llm_kwargs,
max_output_len=32, max_output_len=32,
seed=seed, seed=seed,
temperature=0.0) temperature=0.0)
@pytest.mark.parametrize(
    "common_llm_kwargs",
    [{
        "model_name": MAIN_MODEL,

        # Skip cuda graph recording for fast test.
        "enforce_eager": True,
        "speculative_model": "JackFram/llama-68m",
        "num_speculative_tokens": 3,
    }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs",
                         [{
                             "speculative_disable_mqa_scorer": True,
                         }])
@pytest.mark.parametrize("batch_size", [1, 5])
@pytest.mark.parametrize(
    "output_len",
    [
        # Use smaller output len for fast test.
        32,
    ])
@pytest.mark.parametrize("seed", [1])
def test_mqa_scorer(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
                    baseline_llm_kwargs, test_llm_kwargs, batch_size: int,
                    output_len: int, seed: int):
    """Verify that draft-model speculative decoding generates the same
    output with batch expansion scorer and mqa scorer.

    Both runs share the draft model given in ``common_llm_kwargs``. The
    test run additionally sets ``speculative_disable_mqa_scorer`` while
    the baseline run leaves it unset, and the two outputs are compared at
    temperature 0.
    """
    run_equality_correctness_test(vllm_runner,
                                  common_llm_kwargs,
                                  per_test_common_llm_kwargs,
                                  baseline_llm_kwargs,
                                  test_llm_kwargs,
                                  batch_size,
                                  max_output_len=output_len,
                                  seed=seed,
                                  temperature=0.0)
...@@ -17,9 +17,6 @@ from .conftest import run_equality_correctness_test_tp ...@@ -17,9 +17,6 @@ from .conftest import run_equality_correctness_test_tp
[[ [[
# Skip cuda graph recording for fast test. # Skip cuda graph recording for fast test.
"--enforce-eager", "--enforce-eager",
# Required for spec decode.
"--use-v2-block-manager",
"--tensor-parallel-size", "--tensor-parallel-size",
"2" "2"
]]) ]])
...@@ -74,9 +71,6 @@ def test_target_model_tp_gt_1(common_llm_kwargs, per_test_common_llm_kwargs, ...@@ -74,9 +71,6 @@ def test_target_model_tp_gt_1(common_llm_kwargs, per_test_common_llm_kwargs,
[[ [[
# Skip cuda graph recording for fast test. # Skip cuda graph recording for fast test.
"--enforce-eager", "--enforce-eager",
# Required for spec decode.
"--use_v2_block_manager",
"--tensor_parallel_size", "--tensor_parallel_size",
"2", "2",
......
...@@ -19,9 +19,6 @@ SPEC_MODEL = "JackFram/llama-68m" ...@@ -19,9 +19,6 @@ SPEC_MODEL = "JackFram/llama-68m"
[[ [[
# Skip cuda graph recording for fast test. # Skip cuda graph recording for fast test.
"--enforce_eager", "--enforce_eager",
# Required for spec decode.
"--use-v2-block-manager",
"--tensor-parallel-size", "--tensor-parallel-size",
"4", "4",
]]) ]])
...@@ -71,9 +68,6 @@ def test_draft_model_tp_lt_target_model_tp4(common_llm_kwargs, ...@@ -71,9 +68,6 @@ def test_draft_model_tp_lt_target_model_tp4(common_llm_kwargs,
# Skip cuda graph recording for fast test. # Skip cuda graph recording for fast test.
"--enforce-eager", "--enforce-eager",
# Required for spec decode.
"--use-v2-block-manager",
"--tensor-parallel-size", "--tensor-parallel-size",
"4", "4",
]]) ]])
......
...@@ -14,9 +14,6 @@ from .conftest import run_equality_correctness_test ...@@ -14,9 +14,6 @@ from .conftest import run_equality_correctness_test
# Skip cuda graph recording for fast test. # Skip cuda graph recording for fast test.
"enforce_eager": True, "enforce_eager": True,
# Required for spec decode.
"use_v2_block_manager": True,
}]) }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
...@@ -67,9 +64,6 @@ def test_logprobs_equality(vllm_runner, common_llm_kwargs, ...@@ -67,9 +64,6 @@ def test_logprobs_equality(vllm_runner, common_llm_kwargs,
# Skip cuda graph recording for fast test. # Skip cuda graph recording for fast test.
"enforce_eager": True, "enforce_eager": True,
# Required for spec decode.
"use_v2_block_manager": True
}]) }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
...@@ -119,9 +113,6 @@ def test_logprobs_different_k(vllm_runner, common_llm_kwargs, ...@@ -119,9 +113,6 @@ def test_logprobs_different_k(vllm_runner, common_llm_kwargs,
# Skip cuda graph recording for fast test. # Skip cuda graph recording for fast test.
"enforce_eager": True, "enforce_eager": True,
# Required for spec decode.
"use_v2_block_manager": True
}]) }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
...@@ -173,9 +164,6 @@ def test_logprobs_when_skip_speculation(vllm_runner, common_llm_kwargs, ...@@ -173,9 +164,6 @@ def test_logprobs_when_skip_speculation(vllm_runner, common_llm_kwargs,
# Skip cuda graph recording for fast test. # Skip cuda graph recording for fast test.
"enforce_eager": True, "enforce_eager": True,
# Required for spec decode.
"use_v2_block_manager": True
}]) }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
...@@ -251,8 +239,6 @@ def test_logprobs_temp_1(vllm_runner, common_llm_kwargs, ...@@ -251,8 +239,6 @@ def test_logprobs_temp_1(vllm_runner, common_llm_kwargs,
"model_name": "JackFram/llama-160m", "model_name": "JackFram/llama-160m",
# Skip cuda graph recording for fast test. # Skip cuda graph recording for fast test.
"enforce_eager": True, "enforce_eager": True,
# Required for spec decode.
"use_v2_block_manager": True,
}]) }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
......
...@@ -45,9 +45,6 @@ PRECISION = "float32" ...@@ -45,9 +45,6 @@ PRECISION = "float32"
# Skip cuda graph recording for fast test. # Skip cuda graph recording for fast test.
"enforce_eager": True, "enforce_eager": True,
# Required for spec decode.
"use_v2_block_manager": True,
# Print spec metrics. # Print spec metrics.
"disable_log_stats": False, "disable_log_stats": False,
...@@ -93,9 +90,6 @@ def test_medusa_e2e_greedy_correctness(vllm_runner, common_llm_kwargs, ...@@ -93,9 +90,6 @@ def test_medusa_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
# Skip cuda graph recording for fast test. # Skip cuda graph recording for fast test.
"enforce_eager": True, "enforce_eager": True,
# Required for spec decode.
"use_v2_block_manager": True,
# Print spec metrics. # Print spec metrics.
"disable_log_stats": False, "disable_log_stats": False,
...@@ -151,9 +145,6 @@ def test_medusa_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs, ...@@ -151,9 +145,6 @@ def test_medusa_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs,
[{ [{
"enforce_eager": False, "enforce_eager": False,
# Required for spec decode.
"use_v2_block_manager": True,
# Print spec metrics. # Print spec metrics.
"disable_log_stats": False, "disable_log_stats": False,
...@@ -204,9 +195,6 @@ def test_medusa_e2e_greedy_correctness_cuda_graph( ...@@ -204,9 +195,6 @@ def test_medusa_e2e_greedy_correctness_cuda_graph(
# Skip cuda graph recording for fast test. # Skip cuda graph recording for fast test.
"enforce_eager": True, "enforce_eager": True,
# Required for spec decode.
"use_v2_block_manager": True,
# Precision # Precision
"dtype": PRECISION, "dtype": PRECISION,
...@@ -253,9 +241,6 @@ def test_medusa_e2e_greedy_correctness_with_preemption( ...@@ -253,9 +241,6 @@ def test_medusa_e2e_greedy_correctness_with_preemption(
# Skip cuda graph recording for fast test. # Skip cuda graph recording for fast test.
"enforce_eager": True, "enforce_eager": True,
# Required for spec decode.
"use_v2_block_manager": True,
# Precision # Precision
"dtype": PRECISION, "dtype": PRECISION,
...@@ -306,9 +291,6 @@ def test_medusa_different_k(vllm_runner, common_llm_kwargs, ...@@ -306,9 +291,6 @@ def test_medusa_different_k(vllm_runner, common_llm_kwargs,
# Skip cuda graph recording for fast test. # Skip cuda graph recording for fast test.
"enforce_eager": True, "enforce_eager": True,
# Required for spec decode.
"use_v2_block_manager": True,
# Precision # Precision
"dtype": PRECISION, "dtype": PRECISION,
...@@ -350,6 +332,52 @@ def test_medusa_disable_queue(vllm_runner, common_llm_kwargs, ...@@ -350,6 +332,52 @@ def test_medusa_disable_queue(vllm_runner, common_llm_kwargs,
temperature=0.0) temperature=0.0)
@pytest.mark.parametrize(
    "common_llm_kwargs",
    [{
        # Eager mode: no cuda graph capture, keeps the test quick.
        "enforce_eager": True,

        # Numeric precision shared by both runs.
        "dtype": PRECISION,

        # Target model and its speculative companion.
        "model_name": MAIN_MODEL,
        "speculative_model": SPEC_MODEL,
        "num_speculative_tokens": MAX_SPEC_TOKENS,
        "speculative_disable_by_batch_size": 4
    }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs",
                         [{
                             "speculative_disable_mqa_scorer": True,
                         }])
@pytest.mark.parametrize("batch_size", [1, 5])
@pytest.mark.parametrize(
    "output_len",
    [
        # A short generation is enough here and keeps the test fast.
        32,
    ])
@pytest.mark.parametrize("seed", [1])
def test_mqa_scorer(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
                    baseline_llm_kwargs, test_llm_kwargs, batch_size: int,
                    output_len: int, seed: int):
    """Check that the batch expansion scorer and the mqa scorer yield the
    same speculative decoding output.

    The test-side kwargs set ``speculative_disable_mqa_scorer`` to True while
    the baseline-side kwargs add nothing on top of the common kwargs; the
    comparison itself is delegated to ``run_equality_correctness_test``.
    """
    # Sampling is done at temperature 0.0 for both runs.
    run_equality_correctness_test(
        vllm_runner,
        common_llm_kwargs,
        per_test_common_llm_kwargs,
        baseline_llm_kwargs,
        test_llm_kwargs,
        batch_size,
        max_output_len=output_len,
        seed=seed,
        temperature=0.0,
    )
if __name__ == "__main__": if __name__ == "__main__":
import pytest import pytest
pytest.main([__file__]) pytest.main([__file__])
...@@ -47,9 +47,6 @@ PRECISION = "float32" ...@@ -47,9 +47,6 @@ PRECISION = "float32"
# Skip cuda graph recording for fast test. # Skip cuda graph recording for fast test.
"enforce_eager": True, "enforce_eager": True,
# Required for spec decode.
"use_v2_block_manager": True,
# Print spec metrics. # Print spec metrics.
"disable_log_stats": False, "disable_log_stats": False,
...@@ -94,9 +91,6 @@ def test_mlp_e2e_greedy_correctness(vllm_runner, common_llm_kwargs, ...@@ -94,9 +91,6 @@ def test_mlp_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
# Skip cuda graph recording for fast test. # Skip cuda graph recording for fast test.
"enforce_eager": True, "enforce_eager": True,
# Required for spec decode.
"use_v2_block_manager": True,
# Print spec metrics. # Print spec metrics.
"disable_log_stats": False, "disable_log_stats": False,
...@@ -149,9 +143,6 @@ def test_mlp_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs, ...@@ -149,9 +143,6 @@ def test_mlp_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs,
# Skip cuda graph recording for fast test. # Skip cuda graph recording for fast test.
"enforce_eager": True, "enforce_eager": True,
# Required for spec decode.
"use_v2_block_manager": True,
# Print spec metrics. # Print spec metrics.
"disable_log_stats": False, "disable_log_stats": False,
...@@ -195,9 +186,6 @@ def test_mlp_e2e_acceptance_rate(vllm_runner, common_llm_kwargs, ...@@ -195,9 +186,6 @@ def test_mlp_e2e_acceptance_rate(vllm_runner, common_llm_kwargs,
# Skip cuda graph recording for fast test. # Skip cuda graph recording for fast test.
"enforce_eager": True, "enforce_eager": True,
# Required for spec decode.
"use_v2_block_manager": True,
# Print spec metrics. # Print spec metrics.
"disable_log_stats": False, "disable_log_stats": False,
...@@ -258,9 +246,6 @@ def test_mlp_e2e_seeded_correctness(vllm_runner, common_llm_kwargs, ...@@ -258,9 +246,6 @@ def test_mlp_e2e_seeded_correctness(vllm_runner, common_llm_kwargs,
# Skip cuda graph recording for fast test. # Skip cuda graph recording for fast test.
"enforce_eager": True, "enforce_eager": True,
# Required for spec decode.
"use_v2_block_manager": True,
# Precision # Precision
"dtype": PRECISION, "dtype": PRECISION,
...@@ -311,9 +296,6 @@ def test_mlp_e2e_greedy_correctness_with_preemption( ...@@ -311,9 +296,6 @@ def test_mlp_e2e_greedy_correctness_with_preemption(
# Skip cuda graph recording for fast test. # Skip cuda graph recording for fast test.
"enforce_eager": True, "enforce_eager": True,
# Required for spec decode.
"use_v2_block_manager": True,
# Precision # Precision
"dtype": PRECISION, "dtype": PRECISION,
...@@ -366,9 +348,6 @@ def test_mlp_e2e_greedy_correctness_with_padding( ...@@ -366,9 +348,6 @@ def test_mlp_e2e_greedy_correctness_with_padding(
# Skip cuda graph recording for fast test. # Skip cuda graph recording for fast test.
"enforce_eager": True, "enforce_eager": True,
# Required for spec decode.
"use_v2_block_manager": True,
# Precision # Precision
"dtype": PRECISION, "dtype": PRECISION,
...@@ -419,9 +398,6 @@ def test_mlp_different_k(vllm_runner, common_llm_kwargs, ...@@ -419,9 +398,6 @@ def test_mlp_different_k(vllm_runner, common_llm_kwargs,
# Skip cuda graph recording for fast test. # Skip cuda graph recording for fast test.
"enforce_eager": True, "enforce_eager": True,
# Required for spec decode.
"use_v2_block_manager": True,
# Precision # Precision
"dtype": PRECISION, "dtype": PRECISION,
...@@ -460,3 +436,43 @@ def test_mlp_disable_queue(vllm_runner, common_llm_kwargs, ...@@ -460,3 +436,43 @@ def test_mlp_disable_queue(vllm_runner, common_llm_kwargs,
max_output_len=output_len, max_output_len=output_len,
seed=seed, seed=seed,
temperature=0.0) temperature=0.0)
@pytest.mark.parametrize(
    "common_llm_kwargs",
    [{
        "model_name": MAIN_MODEL,

        # Eager mode: no cuda graph capture, keeps the test quick.
        "enforce_eager": True,
        "speculative_model": SPEC_MODEL,
    }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs",
                         [{
                             "speculative_disable_mqa_scorer": True,
                         }])
@pytest.mark.parametrize("batch_size", [1, 5])
@pytest.mark.parametrize(
    "output_len",
    [
        # A short generation is enough here and keeps the test fast.
        32,
    ])
@pytest.mark.parametrize("seed", [1])
def test_mqa_scorer(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
                    baseline_llm_kwargs, test_llm_kwargs, batch_size: int,
                    output_len: int, seed: int):
    """Check that the batch expansion scorer and the mqa scorer yield the
    same speculative decoding output.

    The test-side kwargs set ``speculative_disable_mqa_scorer`` to True while
    the baseline-side kwargs add nothing on top of the common kwargs; the
    comparison itself is delegated to ``run_equality_correctness_test``.
    """
    # Sampling is done at temperature 0.0 for both runs.
    run_equality_correctness_test(
        vllm_runner,
        common_llm_kwargs,
        per_test_common_llm_kwargs,
        baseline_llm_kwargs,
        test_llm_kwargs,
        batch_size,
        max_output_len=output_len,
        seed=seed,
        temperature=0.0,
    )
...@@ -55,9 +55,6 @@ from .conftest import (get_output_from_llm_generator, ...@@ -55,9 +55,6 @@ from .conftest import (get_output_from_llm_generator,
# Skip cuda graph recording for fast test. # Skip cuda graph recording for fast test.
"enforce_eager": True, "enforce_eager": True,
# Required for spec decode.
"use_v2_block_manager": True,
}]) }])
@pytest.mark.parametrize( @pytest.mark.parametrize(
"per_test_common_llm_kwargs", "per_test_common_llm_kwargs",
...@@ -124,9 +121,6 @@ def test_spec_decode_e2e_with_detokenization(test_llm_generator, ...@@ -124,9 +121,6 @@ def test_spec_decode_e2e_with_detokenization(test_llm_generator,
# Skip cuda graph recording for fast test. # Skip cuda graph recording for fast test.
"enforce_eager": True, "enforce_eager": True,
# Required for spec decode.
"use_v2_block_manager": True,
# Print spec metrics. # Print spec metrics.
"disable_log_stats": False, "disable_log_stats": False,
}]) }])
...@@ -190,9 +184,6 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_bs1( ...@@ -190,9 +184,6 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_bs1(
# Skip cuda graph recording for fast test. # Skip cuda graph recording for fast test.
"enforce_eager": True, "enforce_eager": True,
# Required for spec decode.
"use_v2_block_manager": True,
# Print spec metrics. # Print spec metrics.
"disable_log_stats": False, "disable_log_stats": False,
}]) }])
...@@ -246,9 +237,6 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs( ...@@ -246,9 +237,6 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs(
[{ [{
# Skip cuda graph recording for fast test. # Skip cuda graph recording for fast test.
"enforce_eager": True, "enforce_eager": True,
# Required for spec decode.
"use_v2_block_manager": True
}]) }])
@pytest.mark.parametrize( @pytest.mark.parametrize(
"per_test_common_llm_kwargs", "per_test_common_llm_kwargs",
...@@ -303,9 +291,6 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs_diff_output_len( ...@@ -303,9 +291,6 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs_diff_output_len(
# Skip cuda graph recording for fast test. # Skip cuda graph recording for fast test.
"enforce_eager": True, "enforce_eager": True,
# Required for spec decode.
"use_v2_block_manager": True,
# Print spec metrics. # Print spec metrics.
"disable_log_stats": False, "disable_log_stats": False,
}]) }])
...@@ -353,9 +338,6 @@ def test_spec_decode_e2e_greedy_correctness_real_model_bs1( ...@@ -353,9 +338,6 @@ def test_spec_decode_e2e_greedy_correctness_real_model_bs1(
# Skip cuda graph recording for fast test. # Skip cuda graph recording for fast test.
"enforce_eager": True, "enforce_eager": True,
# Required for spec decode.
"use_v2_block_manager": True,
# Print spec metrics. # Print spec metrics.
"disable_log_stats": False, "disable_log_stats": False,
}]) }])
...@@ -404,9 +386,6 @@ def test_spec_decode_e2e_greedy_correctness_real_model_large_bs( ...@@ -404,9 +386,6 @@ def test_spec_decode_e2e_greedy_correctness_real_model_large_bs(
# Skip cuda graph recording for fast test. # Skip cuda graph recording for fast test.
"enforce_eager": True, "enforce_eager": True,
# Required for spec decode.
"use_v2_block_manager": True
}]) }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [ @pytest.mark.parametrize("per_test_common_llm_kwargs", [
{ {
...@@ -454,9 +433,6 @@ def test_spec_decode_e2e_greedy_correctness_with_preemption( ...@@ -454,9 +433,6 @@ def test_spec_decode_e2e_greedy_correctness_with_preemption(
# Skip cuda graph recording for fast test. # Skip cuda graph recording for fast test.
"enforce_eager": True, "enforce_eager": True,
# Required for spec decode.
"use_v2_block_manager": True
}]) }])
@pytest.mark.parametrize( @pytest.mark.parametrize(
"per_test_common_llm_kwargs", "per_test_common_llm_kwargs",
...@@ -514,9 +490,6 @@ def test_spec_decode_different_block_size(vllm_runner, common_llm_kwargs, ...@@ -514,9 +490,6 @@ def test_spec_decode_different_block_size(vllm_runner, common_llm_kwargs,
# Skip cuda graph recording for fast test. # Skip cuda graph recording for fast test.
"enforce_eager": True, "enforce_eager": True,
# Required for spec decode.
"use_v2_block_manager": True
}]) }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
...@@ -570,9 +543,6 @@ def test_skip_speculation(vllm_runner, common_llm_kwargs, ...@@ -570,9 +543,6 @@ def test_skip_speculation(vllm_runner, common_llm_kwargs,
# Skip cuda graph recording for fast test. # Skip cuda graph recording for fast test.
"enforce_eager": True, "enforce_eager": True,
# Required for spec decode.
"use_v2_block_manager": True
}]) }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
...@@ -611,9 +581,6 @@ def test_disable_speculation(vllm_runner, common_llm_kwargs, ...@@ -611,9 +581,6 @@ def test_disable_speculation(vllm_runner, common_llm_kwargs,
# Skip cuda graph recording for fast test. # Skip cuda graph recording for fast test.
"enforce_eager": True, "enforce_eager": True,
# Required for spec decode.
"use_v2_block_manager": True
}]) }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
...@@ -660,9 +627,6 @@ def test_many_k(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, ...@@ -660,9 +627,6 @@ def test_many_k(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
# Skip cuda graph recording for fast test. # Skip cuda graph recording for fast test.
"enforce_eager": True, "enforce_eager": True,
# Required for spec decode.
"use_v2_block_manager": True
}]) }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
......
...@@ -35,9 +35,6 @@ from .conftest import run_equality_correctness_test ...@@ -35,9 +35,6 @@ from .conftest import run_equality_correctness_test
# Skip cuda graph recording for fast test. # Skip cuda graph recording for fast test.
"enforce_eager": True, "enforce_eager": True,
# Required for spec decode.
"use_v2_block_manager": True,
# Print spec metrics. # Print spec metrics.
"disable_log_stats": False, "disable_log_stats": False,
}]) }])
...@@ -82,9 +79,6 @@ def test_ngram_e2e_greedy_correctness(vllm_runner, common_llm_kwargs, ...@@ -82,9 +79,6 @@ def test_ngram_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
# Skip cuda graph recording for fast test. # Skip cuda graph recording for fast test.
"enforce_eager": True, "enforce_eager": True,
# Required for spec decode.
"use_v2_block_manager": True,
# Print spec metrics. # Print spec metrics.
"disable_log_stats": False, "disable_log_stats": False,
}]) }])
...@@ -145,9 +139,6 @@ def test_ngram_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs, ...@@ -145,9 +139,6 @@ def test_ngram_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs,
# Skip cuda graph recording for fast test. # Skip cuda graph recording for fast test.
"enforce_eager": True, "enforce_eager": True,
# Required for spec decode.
"use_v2_block_manager": True
}]) }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [ @pytest.mark.parametrize("per_test_common_llm_kwargs", [
{ {
...@@ -195,9 +186,6 @@ def test_ngram_e2e_greedy_correctness_with_preemption( ...@@ -195,9 +186,6 @@ def test_ngram_e2e_greedy_correctness_with_preemption(
# Skip cuda graph recording for fast test. # Skip cuda graph recording for fast test.
"enforce_eager": True, "enforce_eager": True,
# Required for spec decode.
"use_v2_block_manager": True
}]) }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
...@@ -254,9 +242,6 @@ def test_ngram_different_k(vllm_runner, common_llm_kwargs, ...@@ -254,9 +242,6 @@ def test_ngram_different_k(vllm_runner, common_llm_kwargs,
# Skip cuda graph recording for fast test. # Skip cuda graph recording for fast test.
"enforce_eager": True, "enforce_eager": True,
# Required for spec decode.
"use_v2_block_manager": True
}]) }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
...@@ -292,3 +277,48 @@ def test_ngram_disable_queue(vllm_runner, common_llm_kwargs, ...@@ -292,3 +277,48 @@ def test_ngram_disable_queue(vllm_runner, common_llm_kwargs,
max_output_len=output_len, max_output_len=output_len,
seed=seed, seed=seed,
temperature=0.0) temperature=0.0)
@pytest.mark.parametrize(
    "common_llm_kwargs",
    [{
        "model_name": "JackFram/llama-68m",

        # Eager mode: no cuda graph capture, keeps the test quick.
        "enforce_eager": True,

        # Speculative decoding settings for the "[ngram]" speculative model.
        "speculative_model": "[ngram]",
        "num_speculative_tokens": 5,
        "ngram_prompt_lookup_max": 3,
    }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs",
                         [{
                             "speculative_disable_mqa_scorer": True,
                         }])
@pytest.mark.parametrize("batch_size", [1, 5])
@pytest.mark.parametrize(
    "output_len",
    [
        # A short generation is enough here and keeps the test fast.
        32,
    ])
@pytest.mark.parametrize("seed", [1])
def test_ngram_scorer(vllm_runner, common_llm_kwargs,
                      per_test_common_llm_kwargs, baseline_llm_kwargs,
                      test_llm_kwargs, batch_size: int, output_len: int,
                      seed: int):
    """Check that ngram speculative decoding yields the same output with the
    batch expansion scorer and with the mqa scorer.

    The test-side kwargs set ``speculative_disable_mqa_scorer`` to True while
    the baseline-side kwargs add nothing on top of the common kwargs; the
    comparison itself is delegated to ``run_equality_correctness_test``.
    """
    # Sampling is done at temperature 0.0 for both runs.
    run_equality_correctness_test(
        vllm_runner,
        common_llm_kwargs,
        per_test_common_llm_kwargs,
        baseline_llm_kwargs,
        test_llm_kwargs,
        batch_size,
        max_output_len=output_len,
        seed=seed,
        temperature=0.0,
    )
...@@ -17,9 +17,6 @@ SPEC_MODEL = "JackFram/llama-160m" ...@@ -17,9 +17,6 @@ SPEC_MODEL = "JackFram/llama-160m"
# Skip cuda graph recording for fast test. # Skip cuda graph recording for fast test.
"enforce_eager": True, "enforce_eager": True,
# Required for spec decode.
"use_v2_block_manager": True,
# speculative model # speculative model
"speculative_model": "JackFram/llama-160m", "speculative_model": "JackFram/llama-160m",
......
...@@ -173,7 +173,6 @@ def test_same_output_for_multi_step(): ...@@ -173,7 +173,6 @@ def test_same_output_for_multi_step():
block_size, block_size,
num_gpu_blocks, num_gpu_blocks,
seed, seed,
model_runner_cls=TP1DraftModelRunner,
) )
worker = create_worker( worker = create_worker(
...@@ -673,7 +672,10 @@ def test_use_draft_model_runner_advance_step(): ...@@ -673,7 +672,10 @@ def test_use_draft_model_runner_advance_step():
worker.model_runner._gpu_advance_step.side_effect = ValueError( worker.model_runner._gpu_advance_step.side_effect = ValueError(
exception_secret) exception_secret)
seq_group_metadata_list, _, _ = create_batch(batch_size, k) seq_group_metadata_list, _, _ = create_batch(batch_size,
k,
block_size=block_size,
num_gpu_blocks=num_gpu_blocks)
# Fallback (should not call) when num_steps=1. # Fallback (should not call) when num_steps=1.
execute_model_req = ExecuteModelRequest( execute_model_req = ExecuteModelRequest(
......
import random
from typing import List
import pytest
import torch
from vllm.sequence import ExecuteModelRequest
from vllm.spec_decode.batch_expansion import BatchExpansionTop1Scorer
from vllm.spec_decode.interfaces import SpeculativeProposals, SpeculativeScores
from vllm.spec_decode.mqa_scorer import MQAScorer
from vllm.worker.worker import Worker
from .utils import create_batch, create_worker
def create_proposal(propose_lens: List[int], vocab_size: int,
                    device: str) -> SpeculativeProposals:
    """Build a ``SpeculativeProposals`` from random proposal probabilities.

    Args:
        propose_lens: Number of proposed tokens for each sequence; the list
            length is the batch size.
        vocab_size: Size of the last dimension of the probability tensor.
        device: Device on which every tensor is created.

    Returns:
        Proposals holding the token ids, the random probabilities and the
        proposal lengths as a tensor.
    """
    num_seqs = len(propose_lens)
    widest = max(propose_lens)

    probs = torch.rand((num_seqs, widest, vocab_size), device=device)

    # Slots past a sequence's own proposal length keep the -1 fill value.
    token_ids = torch.full((num_seqs, widest), fill_value=-1, device=device)
    for row, length in enumerate(propose_lens):
        # The proposed token is the most probable one at each position.
        token_ids[row, :length] = probs[row, :length].argmax(dim=-1)

    lens_tensor = torch.tensor(propose_lens, device=device)
    return SpeculativeProposals(token_ids, probs, lens_tensor)
def assert_score_equal(score1: SpeculativeScores,
                       score2: SpeculativeScores) -> None:
    """Assert that two scorer outputs agree.

    ``probs`` and ``logprobs`` are compared with ``torch.allclose`` using its
    default tolerances; ``token_ids`` must match exactly (``torch.equal``).

    Args:
        score1: First score to compare.
        score2: Second score to compare.

    Raises:
        AssertionError: If any of the three fields differ. The message names
            the mismatching field so a failure can be triaged from the log.
    """
    # The messages are only evaluated on failure, so the extra tensor math
    # costs nothing on the passing path.
    assert torch.allclose(score1.probs, score2.probs), (
        "probs differ: max abs diff "
        f"{(score1.probs - score2.probs).abs().max().item()}")
    assert torch.allclose(score1.logprobs, score2.logprobs), (
        "logprobs differ: max abs diff "
        f"{(score1.logprobs - score2.logprobs).abs().max().item()}")
    assert torch.equal(score1.token_ids, score2.token_ids), (
        f"token_ids differ: {score1.token_ids}, {score2.token_ids}")
@pytest.mark.parametrize('model_name', ['facebook/opt-125m'])
@pytest.mark.parametrize('batch_size', [1, 2, 4, 8, 16])
@pytest.mark.parametrize('max_propose_len', [1, 3, 5])
@pytest.mark.parametrize('mixed_propose_len', [True])
@pytest.mark.parametrize('device', ['cuda'])
def test_scorer(model_name: str, batch_size: int, max_propose_len: int,
                mixed_propose_len: bool, device: str) -> None:
    """
    Check that the batch expansion scorer and the mqa scorer return the same
    score for the same request and proposals.

    With ``mixed_propose_len`` set, a random subset of the sequences gets
    ``max_propose_len`` proposed tokens and the rest get none; otherwise every
    sequence gets ``max_propose_len``. Only the mixed case is currently
    parametrized.
    """
    seed = 0
    block_size = 32
    num_gpu_blocks = 2048 // block_size

    scorer_worker = create_worker(Worker, model_name, block_size,
                                  num_gpu_blocks, seed)
    # Configure the target model's sampler before any scoring happens.
    sampler = scorer_worker.model_runner.model.sampler
    sampler.include_gpu_probs_tensor = True
    sampler.should_modify_greedy_probs_inplace = True

    vocab_size = scorer_worker.vocab_size

    if mixed_propose_len:
        # Random number of proposing sequences, placed at random positions.
        non_zero_cnt = random.randint(0, batch_size)
        zero_cnt = batch_size - non_zero_cnt
        propose_lens = [max_propose_len] * non_zero_cnt + [0] * zero_cnt
        random.shuffle(propose_lens)
    else:
        propose_lens = [max_propose_len] * batch_size

    proposals = create_proposal(propose_lens, vocab_size, device)
    seq_group_metadatalist, _, _ = create_batch(
        batch_size,
        max_propose_len,
        block_size=block_size,
        num_gpu_blocks=num_gpu_blocks,
    )
    requests = ExecuteModelRequest(seq_group_metadatalist,
                                   num_lookahead_slots=max_propose_len)

    # Score the same request/proposals with each scorer, batch expansion
    # first, then compare the two results.
    batch_expansion_score = BatchExpansionTop1Scorer(
        scorer_worker, device, vocab_size).score_proposals(requests, proposals)
    mqa_score = MQAScorer(scorer_worker, device,
                          vocab_size).score_proposals(requests, proposals)

    assert_score_equal(batch_expansion_score, mqa_score)
...@@ -63,10 +63,10 @@ def test_correctly_calls_draft_model(k: int, batch_size: int, ...@@ -63,10 +63,10 @@ def test_correctly_calls_draft_model(k: int, batch_size: int,
@pytest.mark.parametrize("acceptance_sampler_method", @pytest.mark.parametrize("acceptance_sampler_method",
["rejection_sampler", "typical_acceptance_sampler"]) ["rejection_sampler", "typical_acceptance_sampler"])
@torch.inference_mode() @torch.inference_mode()
def test_correctly_calls_target_model(k: int, batch_size: int, def test_batch_expansion_correctly_calls_target_model(
acceptance_sampler_method: str): k: int, batch_size: int, acceptance_sampler_method: str):
"""Verify SpecDecodeWorker calls the target model with correct """Verify SpecDecodeWorker calls the target model with correct
inputs. Everything else is mocked out. inputs with batch expansion. Everything else is mocked out.
""" """
draft_worker = mock_worker(cls=MultiStepWorker, use_spec=False) draft_worker = mock_worker(cls=MultiStepWorker, use_spec=False)
target_worker = mock_worker(use_spec=False) target_worker = mock_worker(use_spec=False)
...@@ -82,7 +82,8 @@ def test_correctly_calls_target_model(k: int, batch_size: int, ...@@ -82,7 +82,8 @@ def test_correctly_calls_target_model(k: int, batch_size: int,
target_worker, target_worker,
mock_spec_decode_sampler(acceptance_sampler_method), mock_spec_decode_sampler(acceptance_sampler_method),
disable_logprobs=False, disable_logprobs=False,
metrics_collector=metrics_collector) metrics_collector=metrics_collector,
disable_mqa_scorer=True)
worker.init_device() worker.init_device()
vocab_size = 32_000 vocab_size = 32_000
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment