Commit 500b93c8 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.5.3.post1' into v0.5.3.post1-dtk24.04.1

parents 99426767 38c4b7e8
from typing import List
import pytest
from vllm.lora.models import LoRAModel
from vllm.lora.utils import get_adapter_absolute_path
from vllm.model_executor.models.llama import LlamaForCausalLM
# Provide absolute path and huggingface lora ids
lora_fixture_name = ["sql_lora_files", "sql_lora_huggingface_id"]
@pytest.mark.parametrize("lora_fixture_name", lora_fixture_name)
def test_load_checkpoints_from_huggingface(lora_fixture_name, request):
    """Load a LoRA checkpoint referenced by a local path or a Hub id.

    The fixture named by ``lora_fixture_name`` is looked up dynamically, its
    value is resolved with ``get_adapter_absolute_path`` and the result is
    handed to ``LoRAModel.from_local_checkpoint``.
    """
    adapter_ref = request.getfixturevalue(lora_fixture_name)

    llama = LlamaForCausalLM
    packed = llama.packed_modules_mapping

    # A packed module is replaced by the entries it maps to; every other
    # supported module is expected under its own name.
    expected_lora_modules: List[str] = []
    for name in llama.supported_lora_modules:
        expected_lora_modules += packed[name] if name in packed else [name]

    # Loading should work for both an absolute path and a Hugging Face id.
    checkpoint_dir = get_adapter_absolute_path(adapter_ref)
    loaded = LoRAModel.from_local_checkpoint(
        checkpoint_dir,
        expected_lora_modules,
        lora_model_id=1,
        device="cpu",
        embedding_modules=llama.embedding_modules,
        embedding_padding_modules=llama.embedding_padding_modules,
    )

    # Make sure something was actually returned.
    assert loaded is not None, "LoRAModel is not loaded correctly"
from collections import OrderedDict from collections import OrderedDict
from unittest.mock import patch
import pytest import pytest
from huggingface_hub.utils import HfHubHTTPError
from torch import nn from torch import nn
from vllm.lora.utils import parse_fine_tuned_lora_name, replace_submodule from vllm.lora.utils import (get_adapter_absolute_path,
parse_fine_tuned_lora_name, replace_submodule)
from vllm.utils import LRUCache from vllm.utils import LRUCache
...@@ -182,3 +185,55 @@ def test_lru_cache(): ...@@ -182,3 +185,55 @@ def test_lru_cache():
assert 2 in cache assert 2 in cache
assert 4 in cache assert 4 in cache
assert 6 in cache assert 6 in cache
# Unit tests for get_adapter_absolute_path
@patch('os.path.isabs')
def test_get_adapter_absolute_path_absolute(mock_isabs):
    """With ``os.path.isabs`` reporting True the input must come back as-is."""
    mock_isabs.return_value = True
    lora_dir = '/absolute/path/to/lora'
    resolved = get_adapter_absolute_path(lora_dir)
    assert resolved == lora_dir
@patch('os.path.expanduser')
def test_get_adapter_absolute_path_expanduser(mock_expanduser):
    """A ``~``-prefixed path must resolve to what ``expanduser`` returns."""
    expanded = '/home/user/relative/path/to/lora'
    mock_expanduser.return_value = expanded
    # The leading "~" is the part that needs expanding.
    resolved = get_adapter_absolute_path('~/relative/path/to/lora')
    assert resolved == expanded
@patch('os.path.exists')
@patch('os.path.abspath')
def test_get_adapter_absolute_path_local_existing(mock_abspath, mock_exist):
    """A relative path reported as existing must resolve via ``abspath``."""
    resolved_dir = '/absolute/path/to/lora'
    mock_abspath.return_value = resolved_dir
    # Pretend the relative path is present on the local filesystem.
    mock_exist.return_value = True
    result = get_adapter_absolute_path('relative/path/to/lora')
    assert result == resolved_dir
@patch('huggingface_hub.snapshot_download')
@patch('os.path.exists')
def test_get_adapter_absolute_path_huggingface(mock_exist,
                                               mock_snapshot_download):
    """A non-local ``org/repo`` id must resolve to the snapshot location."""
    snapshot_dir = '/mock/snapshot/path'
    mock_snapshot_download.return_value = snapshot_dir
    # Nothing exists locally, so the value is treated as a Hub identifier.
    mock_exist.return_value = False
    result = get_adapter_absolute_path('org/repo')
    assert result == snapshot_dir
@patch('huggingface_hub.snapshot_download')
@patch('os.path.exists')
def test_get_adapter_absolute_path_huggingface_error(mock_exist,
                                                     mock_snapshot_download):
    """If the Hub download raises, the original value must be returned."""
    repo_id = 'org/repo'
    # Nothing exists locally and the download attempt fails.
    mock_exist.return_value = False
    download_error = HfHubHTTPError("failed to query model info")
    mock_snapshot_download.side_effect = download_error
    result = get_adapter_absolute_path(repo_id)
    assert result == repo_id
from typing import List from typing import List
import pytest import pytest
import ray
from prometheus_client import REGISTRY from prometheus_client import REGISTRY
from vllm import EngineArgs, LLMEngine from vllm import EngineArgs, LLMEngine
from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.engine.metrics import RayPrometheusStatLogger
from vllm.sampling_params import SamplingParams from vllm.sampling_params import SamplingParams
MODELS = [ MODELS = [
...@@ -168,6 +170,55 @@ def test_engine_log_metrics_regression( ...@@ -168,6 +170,55 @@ def test_engine_log_metrics_regression(
assert_metrics(engine, disable_log_stats, len(example_prompts)) assert_metrics(engine, disable_log_stats, len(example_prompts))
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [10])
def test_metric_spec_decode(
    vllm_runner,
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
) -> None:
    """Check that the speculative-decoding metrics hold plausible values.

    The same model is passed as target and as ``speculative_model`` with
    ``k`` speculative tokens, and a single prompt is generated greedily. Each
    spec-decode metric read from the 'prometheus' stat logger must then pass
    a deliberately loose range check; functional correctness is out of scope.
    """
    k = 5

    def is_fraction(v):
        return 0 <= v <= 1

    # The purpose is to verify that the metrics are reported at all, so the
    # expected ranges are intentionally loose.
    metric_name_to_expected_fn = {
        "gauge_spec_decode_draft_acceptance_rate": is_fraction,
        "gauge_spec_decode_efficiency": is_fraction,
        "counter_spec_decode_num_accepted_tokens": lambda v: 0 <= v <= k,
        "counter_spec_decode_num_draft_tokens": lambda v: v == k,
        "counter_spec_decode_num_emitted_tokens": lambda v: 0 <= v <= k + 1,
    }

    with vllm_runner(
            model,
            dtype=dtype,
            disable_log_stats=False,
            gpu_memory_utilization=0.4,
            speculative_model=model,
            num_speculative_tokens=k,
            use_v2_block_manager=True,
    ) as vllm_model:
        # A log interval of 0 makes sure every metric update is caught.
        stat_logger = vllm_model.model.llm_engine.stat_loggers['prometheus']
        stat_logger.local_interval = 0

        # A single request keeps the metrics easy to inspect.
        vllm_model.generate_greedy(example_prompts[:1], max_tokens)

        for metric_name, is_expected in metric_name_to_expected_fn.items():
            metric = getattr(stat_logger.metrics, metric_name)
            metric_val = metric.labels(**stat_logger.labels)._value.get()
            assert is_expected(metric_val), (
                f"the value of metric {metric_name} ({metric_val}) "
                "does not meet expectation")
def assert_metrics(engine: LLMEngine, disable_log_stats: bool, def assert_metrics(engine: LLMEngine, disable_log_stats: bool,
num_requests: int) -> None: num_requests: int) -> None:
if disable_log_stats: if disable_log_stats:
...@@ -192,3 +243,55 @@ def assert_metrics(engine: LLMEngine, disable_log_stats: bool, ...@@ -192,3 +243,55 @@ def assert_metrics(engine: LLMEngine, disable_log_stats: bool,
labels) labels)
assert ( assert (
metric_value == num_requests), "Metrics should be collected" metric_value == num_requests), "Metrics should be collected"
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [16])
def test_engine_log_metrics_ray(
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
) -> None:
    """Smoke-test ``RayPrometheusStatLogger`` attached to an ``LLMEngine``.

    This is a weak check: it only verifies that the logger can be used
    without raising and that ``log`` is invoked at least once. Verifying that
    metrics are actually emitted is non-trivial and is not attempted here.
    """

    # Ray metrics are only emitted correctly from inside a Ray task.
    @ray.remote(num_gpus=1)
    def _inner():

        class _RayPrometheusStatLogger(RayPrometheusStatLogger):
            """Stat logger that counts how many times ``log`` is called."""

            def __init__(self, *args, **kwargs):
                self._i = 0
                super().__init__(*args, **kwargs)

            def log(self, *args, **kwargs):
                self._i += 1
                return super().log(*args, **kwargs)

        engine = LLMEngine.from_engine_args(
            EngineArgs(
                model=model,
                dtype=dtype,
                disable_log_stats=False,
            ))
        model_config = engine.model_config
        logger = _RayPrometheusStatLogger(
            local_interval=0.5,
            labels={"model_name": model_config.served_model_name},
            max_model_len=model_config.max_model_len,
        )
        engine.add_logger("ray", logger)

        # Queue every prompt, then step the engine until all are finished.
        for i, prompt in enumerate(example_prompts):
            engine.add_request(
                f"request-id-{i}",
                prompt,
                SamplingParams(max_tokens=max_tokens),
            )
        while engine.has_unfinished_requests():
            engine.step()

        assert logger._i > 0, ".log must be called at least once"

    ray.get(_inner.remote())
import re
from typing import List, Optional, Type
import pytest
from vllm.multimodal.utils import rescale_image_size
from ..conftest import IMAGE_ASSETS, VllmRunner, _ImageAssets
pytestmark = pytest.mark.vlm
# Prompt text per image, built through IMAGE_ASSETS.prompts. The keys are
# presumably the names of the image assets defined in conftest -- TODO confirm.
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
    "stop_sign":
    "USER: <image>\nWhat's the content of the image?\nASSISTANT:",
    "cherry_blossom":
    "USER: <image>\nWhat is the season?\nASSISTANT:",
})
# Model identifiers that test_models is parametrized over.
models = ["facebook/chameleon-7b"]
#TODO (ywang96): Add correctness test when chameleon is
# available on transformers.
def run_test(
    vllm_runner: Type[VllmRunner],
    image_assets: _ImageAssets,
    model: str,
    *,
    size_factors: List[float],
    dtype: str,
    max_tokens: int,
    tensor_parallel_size: int,
    distributed_executor_backend: Optional[str] = None,
):
    """Check that the model generates text for batches of images and prompts.

    For every (image, prompt) pair one batch is built that repeats the prompt
    once per entry of ``size_factors`` and rescales the image by each factor.
    Each output, after stripping a few special tokens and ``<image>``
    markers, must start with its prompt, and the raw output must be more than
    10 characters longer than the prompt.
    """
    # Special tokens removed from the generated text before it is compared
    # with the prompt; each one is replaced by the mapped (empty) string.
    special_tokens = {
        "<racm3:break>": "",
        "<eoss>": "",
        "<reserved08706>": ""
    }
    special_token_pattern = '|'.join(special_tokens.keys())

    pil_images = [asset.pil_image for asset in image_assets]
    batches = []
    for pil_image, prompt in zip(pil_images, HF_IMAGE_PROMPTS):
        batches.append((
            [prompt] * len(size_factors),
            [rescale_image_size(pil_image, scale) for scale in size_factors],
        ))

    with vllm_runner(model,
                     max_model_len=4096,
                     dtype=dtype,
                     tensor_parallel_size=tensor_parallel_size,
                     distributed_executor_backend=distributed_executor_backend,
                     enforce_eager=True) as vllm_model:
        for batch_prompts, batch_images in batches:
            vllm_outputs = vllm_model.generate_greedy(batch_prompts,
                                                      max_tokens,
                                                      images=batch_images)
            for idx, output in enumerate(vllm_outputs):
                expected_prefix = batch_prompts[idx]
                generated_text = output[1]

                # Format the output back to the shape of the original prompt.
                cleaned = re.sub(
                    special_token_pattern,
                    lambda match: special_tokens[match.group(0)],
                    generated_text)
                cleaned = cleaned.replace("<image>", "", 1023)
                assert cleaned[:len(expected_prefix)] == expected_prefix

                # At least 10 new characters must have been generated
                # (to take the stop token into account).
                assert len(generated_text) - len(expected_prefix) > 10
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize(
    "size_factors",
    [
        [1.0],  # single-scale
        [1.0, 1.0, 1.0],  # single-scale, batched
        [0.25, 0.5, 1.0],  # multi-scale
    ],
)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [128])
def test_models(vllm_runner, image_assets, model, size_factors, dtype: str,
                max_tokens: int) -> None:
    """Run ``run_test`` for each parametrized combination on a single GPU."""
    keyword_args = dict(
        size_factors=size_factors,
        dtype=dtype,
        max_tokens=max_tokens,
        tensor_parallel_size=1,
    )
    run_test(vllm_runner, image_assets, model, **keyword_args)
...@@ -12,9 +12,10 @@ from .utils import check_logprobs_close ...@@ -12,9 +12,10 @@ from .utils import check_logprobs_close
pytestmark = pytest.mark.vlm pytestmark = pytest.mark.vlm
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
"stop_sign": "What's the content of the image?\n", # noqa: E501 "stop_sign":
"cherry_blossom": "What is the season?\n", "What's the content of the image?\n",
"boardwalk": "What's in this image?\n", "cherry_blossom":
"What is the season?\n",
}) })
models = ["adept/fuyu-8b"] models = ["adept/fuyu-8b"]
......
import pytest import pytest
from tests.models.utils import check_outputs_equal
from vllm.worker.model_runner import _get_graph_batch_size from vllm.worker.model_runner import _get_graph_batch_size
MODELS = ["ai21labs/Jamba-tiny-random"] MODELS = ["ai21labs/Jamba-tiny-random"]
...@@ -34,6 +35,34 @@ def test_models( ...@@ -34,6 +35,34 @@ def test_models(
f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}") f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize("max_tokens", [20])
def test_batching(
    vllm_runner,
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
) -> None:
    """Compare one-prompt-at-a-time generation with batched generation.

    Full precision is needed for the small test model to pass.
    """
    with vllm_runner(model, dtype=dtype) as vllm_model:
        # One greedy generation per prompt, keeping the single result of each.
        for_loop_outputs = [
            vllm_model.generate_greedy([prompt], max_tokens)[0]
            for prompt in example_prompts
        ]
        # All prompts in a single call.
        batched_outputs = vllm_model.generate_greedy(example_prompts,
                                                     max_tokens)

    check_outputs_equal(
        outputs_0_lst=for_loop_outputs,
        outputs_1_lst=batched_outputs,
        name_0="for_loop_vllm",
        name_1="batched_vllm",
    )
@pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"]) @pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [20]) @pytest.mark.parametrize("max_tokens", [20])
...@@ -60,6 +89,60 @@ def test_mamba_cache_cg_padding( ...@@ -60,6 +89,60 @@ def test_mamba_cache_cg_padding(
"Could be related to mamba cache not padded correctly") "Could be related to mamba cache not padded correctly")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize("max_tokens", [20])
def test_models_preemption_recompute(
    hf_runner,
    vllm_runner,
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
) -> None:
    """Outputs must be identical with and without preemption (recompute).

    The same prompts are generated twice on one engine, first with
    ``ENABLE_ARTIFICIAL_PREEMPT`` switched on for scheduler 0 and then with
    it switched off, and the two sets of outputs are compared.
    """
    assert dtype == "float"

    outputs_by_preempt = {}
    with vllm_runner(model, dtype=dtype) as vllm_model:
        # Order matters: the preempting run comes first, as in the original.
        for preempt in (True, False):
            vllm_model.model.llm_engine.scheduler[
                0].ENABLE_ARTIFICIAL_PREEMPT = preempt
            outputs_by_preempt[preempt] = vllm_model.generate_greedy(
                example_prompts, max_tokens)

    check_outputs_equal(
        outputs_0_lst=outputs_by_preempt[True],
        outputs_1_lst=outputs_by_preempt[False],
        name_0="vllm_preepmtions",
        name_1="vllm",
    )
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
def test_fail_upon_inc_requests_and_finished_requests_lt_available_blocks(
    vllm_runner,
    model: str,
    dtype: str,
    example_prompts,
) -> None:
    """Generate many requests against a small ``max_num_seqs`` budget.

    Verifies that the Jamba inner state management doesn't collapse when the
    number of incoming requests and finished_requests_ids is larger than the
    maximum mamba block capacity. This can happen because Jamba supports a
    statelessness mechanism in which it can clean up new incoming requests in
    a single step.

    The test fails (via ``pytest.fail``) if generation raises ``ValueError``.
    """
    try:
        with vllm_runner(model, dtype=dtype, max_num_seqs=10) as vllm_model:
            # 100 copies of the same prompt against max_num_seqs=10.
            vllm_model.generate_greedy([example_prompts[0]] * 100, 10)
    except ValueError:
        # The two literals are concatenated implicitly, so the separating
        # space has to be part of the first one.
        pytest.fail("Jamba inner state wasn't cleaned up properly between "
                    "steps finished requests registered unnecessarily")
@pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"]) @pytest.mark.parametrize("dtype", ["float"])
def test_state_cleanup( def test_state_cleanup(
......
...@@ -16,8 +16,6 @@ HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ ...@@ -16,8 +16,6 @@ HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
"USER: <image>\nWhat's the content of the image?\nASSISTANT:", "USER: <image>\nWhat's the content of the image?\nASSISTANT:",
"cherry_blossom": "cherry_blossom":
"USER: <image>\nWhat is the season?\nASSISTANT:", "USER: <image>\nWhat is the season?\nASSISTANT:",
"boardwalk":
"USER: <image>\nWhat's in this image?\nASSISTANT:",
}) })
IMAGE_TOKEN_ID = 32000 IMAGE_TOKEN_ID = 32000
......
...@@ -23,8 +23,6 @@ HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ ...@@ -23,8 +23,6 @@ HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
f"{_PREFACE} USER: <image>\nWhat's the content of the image? ASSISTANT:", f"{_PREFACE} USER: <image>\nWhat's the content of the image? ASSISTANT:",
"cherry_blossom": "cherry_blossom":
f"{_PREFACE} USER: <image>\nWhat is the season? ASSISTANT:", f"{_PREFACE} USER: <image>\nWhat is the season? ASSISTANT:",
"boardwalk":
f"{_PREFACE} USER: <image>\nWhat's in this image? ASSISTANT:",
}) })
IMAGE_TOKEN_ID = 32000 IMAGE_TOKEN_ID = 32000
......
import os
from typing import List, Optional, Tuple, Type from typing import List, Optional, Tuple, Type
import pytest import pytest
...@@ -5,6 +6,7 @@ from transformers import AutoTokenizer ...@@ -5,6 +6,7 @@ from transformers import AutoTokenizer
from vllm.multimodal.utils import rescale_image_size from vllm.multimodal.utils import rescale_image_size
from vllm.sequence import SampleLogprobs from vllm.sequence import SampleLogprobs
from vllm.utils import is_hip
from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
from .utils import check_logprobs_close from .utils import check_logprobs_close
...@@ -12,15 +14,22 @@ from .utils import check_logprobs_close ...@@ -12,15 +14,22 @@ from .utils import check_logprobs_close
pytestmark = pytest.mark.vlm pytestmark = pytest.mark.vlm
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
"stop_sign": "caption es", "stop_sign":
"cherry_blossom": "What is in the picture?", "caption es",
"boardwalk": "What is in the picture?", "cherry_blossom":
"What is in the picture?",
}) })
IMAGE_TOKEN_ID = 257152 IMAGE_TOKEN_ID = 257152
models = ["google/paligemma-3b-mix-224"] models = ["google/paligemma-3b-mix-224"]
# ROCm Triton FA can run into compilation issues with these models due to
# excessive use of shared memory. Use other backends in the meantime.
# FIXME (mattwong, gshtrasb, hongxiayan)
if is_hip():
os.environ["VLLM_USE_TRITON_FLASH_ATTN"] = "0"
def vllm_to_hf_output(vllm_output: Tuple[List[int], str, def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
Optional[SampleLogprobs]], Optional[SampleLogprobs]],
...@@ -129,7 +138,15 @@ def run_test( ...@@ -129,7 +138,15 @@ def run_test(
[0.25, 0.5, 1.0], [0.25, 0.5, 1.0],
], ],
) )
@pytest.mark.parametrize("dtype", ["float", "half"]) @pytest.mark.parametrize("dtype", [
pytest.param(
"float",
marks=pytest.mark.skipif(
is_hip(),
reason=
"ROCm FA does not yet fully support 32-bit precision on PaliGemma")
), "half"
])
@pytest.mark.parametrize("max_tokens", [128]) @pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5]) @pytest.mark.parametrize("num_logprobs", [5])
def test_models(hf_runner, vllm_runner, image_assets, model, size_factors, def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
......
import os
import re import re
from typing import List, Optional, Tuple, Type from typing import List, Optional, Tuple, Type
...@@ -6,7 +7,7 @@ from transformers import AutoTokenizer ...@@ -6,7 +7,7 @@ from transformers import AutoTokenizer
from vllm.multimodal.utils import rescale_image_size from vllm.multimodal.utils import rescale_image_size
from vllm.sequence import SampleLogprobs from vllm.sequence import SampleLogprobs
from vllm.utils import is_cpu from vllm.utils import is_cpu, is_hip
from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
from .utils import check_logprobs_close from .utils import check_logprobs_close
...@@ -18,8 +19,6 @@ HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ ...@@ -18,8 +19,6 @@ HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
"<|user|>\n<|image_1|>\nWhat's the content of the image?<|end|>\n<|assistant|>\n", # noqa: E501 "<|user|>\n<|image_1|>\nWhat's the content of the image?<|end|>\n<|assistant|>\n", # noqa: E501
"cherry_blossom": "cherry_blossom":
"<|user|>\n<|image_1|>\nWhat is the season?<|end|>\n<|assistant|>\n", "<|user|>\n<|image_1|>\nWhat is the season?<|end|>\n<|assistant|>\n",
"boardwalk":
"<|user|>\n<|image_1|>\nWhat's in this image?<|end|>\n<|assistant|>\n",
}) })
models = ["microsoft/Phi-3-vision-128k-instruct"] models = ["microsoft/Phi-3-vision-128k-instruct"]
...@@ -49,6 +48,12 @@ target_dtype = "half" ...@@ -49,6 +48,12 @@ target_dtype = "half"
if is_cpu(): if is_cpu():
target_dtype = "bfloat16" target_dtype = "bfloat16"
# ROCm Triton FA can run into shared memory issues with these models,
# use other backends in the meantime
# FIXME (mattwong, gshtrasb, hongxiayan)
if is_hip():
os.environ["VLLM_USE_TRITON_FLASH_ATTN"] = "0"
def run_test( def run_test(
hf_runner: Type[HfRunner], hf_runner: Type[HfRunner],
......
...@@ -7,7 +7,7 @@ import numpy as np ...@@ -7,7 +7,7 @@ import numpy as np
import pytest import pytest
from PIL import Image from PIL import Image
from vllm.multimodal.utils import ImageFetchAiohttp, fetch_image from vllm.multimodal.utils import async_fetch_image, fetch_image
# Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA) # Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA)
TEST_IMAGE_URLS = [ TEST_IMAGE_URLS = [
...@@ -37,15 +37,15 @@ def _image_equals(a: Image.Image, b: Image.Image) -> bool: ...@@ -37,15 +37,15 @@ def _image_equals(a: Image.Image, b: Image.Image) -> bool:
return (np.asarray(a) == np.asarray(b.convert(a.mode))).all() return (np.asarray(a) == np.asarray(b.convert(a.mode))).all()
@pytest.mark.asyncio(scope="module") @pytest.mark.asyncio
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) @pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
async def test_fetch_image_http(image_url: str): async def test_fetch_image_http(image_url: str):
image_sync = fetch_image(image_url) image_sync = fetch_image(image_url)
image_async = await ImageFetchAiohttp.fetch_image(image_url) image_async = await async_fetch_image(image_url)
assert _image_equals(image_sync, image_async) assert _image_equals(image_sync, image_async)
@pytest.mark.asyncio(scope="module") @pytest.mark.asyncio
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) @pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
@pytest.mark.parametrize("suffix", get_supported_suffixes()) @pytest.mark.parametrize("suffix", get_supported_suffixes())
async def test_fetch_image_base64(url_images: Dict[str, Image.Image], async def test_fetch_image_base64(url_images: Dict[str, Image.Image],
...@@ -78,5 +78,5 @@ async def test_fetch_image_base64(url_images: Dict[str, Image.Image], ...@@ -78,5 +78,5 @@ async def test_fetch_image_base64(url_images: Dict[str, Image.Image],
else: else:
pass # Lossy format; only check that image can be opened pass # Lossy format; only check that image can be opened
data_image_async = await ImageFetchAiohttp.fetch_image(data_url) data_image_async = await async_fetch_image(data_url)
assert _image_equals(data_image_sync, data_image_async) assert _image_equals(data_image_sync, data_image_async)
...@@ -150,3 +150,10 @@ def test_compressed_tensors_fp8(vllm_runner): ...@@ -150,3 +150,10 @@ def test_compressed_tensors_fp8(vllm_runner):
output = llm.generate_greedy("Hello my name is", max_tokens=20) output = llm.generate_greedy("Hello my name is", max_tokens=20)
assert output assert output
def test_compressed_tensors_kv_cache(vllm_runner):
    """Smoke-test greedy generation with ``kv_cache_dtype="fp8"``.

    Only checks that the checkpoint runs and returns a truthy output; no
    accuracy check is made.
    """
    with vllm_runner(
            "nm-testing/TinyLlama-1.1B-compressed-tensors-kv-cache-scheme",
            kv_cache_dtype="fp8") as llm:
        generated = llm.generate_greedy("Hello world!", max_tokens=20)
        assert generated
...@@ -44,9 +44,9 @@ MODEL_ARG_EXPTYPES = [ ...@@ -44,9 +44,9 @@ MODEL_ARG_EXPTYPES = [
("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit", "awq", "ERROR"), ("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit", "awq", "ERROR"),
# AUTOAWQ # AUTOAWQ
("TheBloke/OpenHermes-2.5-Mistral-7B-AWQ", None, "awq"), ("TheBloke/OpenHermes-2.5-Mistral-7B-AWQ", None, "awq_marlin"),
("TheBloke/OpenHermes-2.5-Mistral-7B-AWQ", "awq", "awq"), ("TheBloke/OpenHermes-2.5-Mistral-7B-AWQ", "awq", "awq"),
("TheBloke/OpenHermes-2.5-Mistral-7B-AWQ", "marlin", "ERROR"), ("TheBloke/OpenHermes-2.5-Mistral-7B-AWQ", "marlin", "awq_marlin"),
("TheBloke/OpenHermes-2.5-Mistral-7B-AWQ", "gptq", "ERROR"), ("TheBloke/OpenHermes-2.5-Mistral-7B-AWQ", "gptq", "ERROR"),
] ]
......
...@@ -7,19 +7,49 @@ import torch ...@@ -7,19 +7,49 @@ import torch
from tests.quantization.utils import is_quant_method_supported from tests.quantization.utils import is_quant_method_supported
from vllm import _custom_ops as ops from vllm import _custom_ops as ops
from vllm.model_executor.layers.quantization.fp8 import Fp8LinearMethod from vllm.model_executor.layers.quantization.fp8 import (Fp8KVCacheMethod,
Fp8LinearMethod)
MODELS = [ MODELS = [
"neuralmagic/Meta-Llama-3-8B-Instruct-FP8", "neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV",
"nm-testing/Phi-3-mini-128k-instruct-FP8", "nm-testing/Phi-3-mini-128k-instruct-FP8",
] ]
@pytest.mark.skipif(not is_quant_method_supported("fp8"), @pytest.mark.skipif(not is_quant_method_supported("fp8"),
reason="FP8 is not supported on this GPU type.") reason="FP8 is not supported on this GPU type.")
@pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("model_id", MODELS)
def test_model_load_and_run(vllm_runner, model: str): def test_model_load_and_run(vllm_runner, model_id: str):
with vllm_runner(model) as llm: with vllm_runner(model_id) as llm:
# note: this does not test accuracy, just that we can run through
# see lm-eval tests for accuracy
outputs = llm.generate_greedy(prompts=["Hello my name is"],
max_tokens=10)
print(outputs[0][1])
# Checkpoints that test_kv_cache_model_load_and_run is parametrized over;
# the two entries differ in how the KV-cache scales are stored.
KV_CACHE_MODELS = [
    # Deprecated AutoFP8 format using .kv_scale
    "neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV",
    # AutoFP8 format using separate .k_scale and .v_scale
    "nm-testing/Qwen2-1.5B-Instruct-FP8-K-V",
]
@pytest.mark.skipif(not is_quant_method_supported("fp8"),
reason="FP8 is not supported on this GPU type.")
@pytest.mark.parametrize("model_id", KV_CACHE_MODELS)
def test_kv_cache_model_load_and_run(vllm_runner, model_id: str):
with vllm_runner(model_id, kv_cache_dtype="fp8") as llm:
model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501
attn = model.model.layers[0].self_attn.attn
assert isinstance(attn.quant_method, Fp8KVCacheMethod)
# NOTE: it is valid for scales to be 1.0 (default value), but we know
# these checkpoints have scales < 1.0
assert 0.0 < attn._k_scale < 1.0
assert 0.0 < attn._v_scale < 1.0
# note: this does not test accuracy, just that we can run through # note: this does not test accuracy, just that we can run through
# see lm-eval tests for accuracy # see lm-eval tests for accuracy
outputs = llm.generate_greedy(prompts=["Hello my name is"], outputs = llm.generate_greedy(prompts=["Hello my name is"],
......
...@@ -150,9 +150,54 @@ def test_no_crash_with_varying_dims(k: int, vocab_size: int, batch_size: int, ...@@ -150,9 +150,54 @@ def test_no_crash_with_varying_dims(k: int, vocab_size: int, batch_size: int,
high=vocab_size, high=vocab_size,
size=(batch_size, k), size=(batch_size, k),
dtype=torch.int64) dtype=torch.int64)
generators = [None] * batch_size
rejection_sampler(target_probs, bonus_token_ids, draft_probs, rejection_sampler(target_probs, bonus_token_ids, draft_probs,
draft_token_ids) draft_token_ids, generators)
@pytest.mark.parametrize("frac_seeded", [0.0, 0.25, 0.5, 1.0])
@pytest.mark.parametrize("k", [1, 3, 6])
@pytest.mark.parametrize("vocab_size", [30_000, 50_000])
@pytest.mark.parametrize("batch_size", [1, 8, 32, 128])
@pytest.mark.parametrize("n_rep", [100])
@pytest.mark.parametrize("device", CUDA_DEVICES)
@torch.inference_mode()
def test_deterministic_when_seeded(k: int, vocab_size: int, batch_size: int,
                                   frac_seeded: float, n_rep: int,
                                   device: str):
    """Rows that receive a seeded generator must produce repeatable output.

    A random subset of rows (selected by comparing uniform draws against
    ``frac_seeded``) gets a freshly seeded ``torch.Generator`` on every
    repetition; the remaining rows get ``None``. The sampler is invoked
    ``n_rep`` times on identical inputs, and for every seeded row each
    repetition must equal the first one.
    """
    torch.set_default_device(device)
    sampler = RejectionSampler()
    sampler.init_gpu_tensors(rank=0)

    # The order of the random draws below is kept as in the original.
    probs_shape = (batch_size, k, vocab_size)
    draft_probs = torch.rand(*probs_shape, dtype=torch.float32)
    target_probs = torch.rand(*probs_shape, dtype=torch.float32)
    bonus_token_ids = torch.randint(low=0,
                                    high=vocab_size,
                                    size=(batch_size, 1),
                                    dtype=torch.int64)
    draft_token_ids = torch.randint(low=0,
                                    high=vocab_size,
                                    size=(batch_size, k),
                                    dtype=torch.int64)
    seeded_mask = torch.rand(batch_size, dtype=torch.float32) <= frac_seeded

    def fresh_generators():
        # Seeded rows get a new generator seeded with the row index; the
        # others get None.
        return [
            torch.Generator(device=device).manual_seed(row)
            if seeded_mask[row] else None for row in range(batch_size)
        ]

    results = [
        sampler(target_probs, bonus_token_ids, draft_probs, draft_token_ids,
                fresh_generators()) for _ in range(n_rep)
    ]

    first_run = results[0]
    for row in range(batch_size):
        if not seeded_mask[row]:
            continue
        for rep in range(1, n_rep):
            assert torch.equal(results[rep][row], first_run[row])
@pytest.mark.parametrize("above_or_below_vocab_range", ["above", "below"]) @pytest.mark.parametrize("above_or_below_vocab_range", ["above", "below"])
...@@ -197,10 +242,11 @@ def test_raises_when_vocab_oob(above_or_below_vocab_range: str, ...@@ -197,10 +242,11 @@ def test_raises_when_vocab_oob(above_or_below_vocab_range: str,
raise AssertionError() raise AssertionError()
oob_token_ids[0][0] = rogue_token_id oob_token_ids[0][0] = rogue_token_id
generators = [None] * batch_size
with pytest.raises(AssertionError): with pytest.raises(AssertionError):
rejection_sampler(target_probs, bonus_token_ids, draft_probs, rejection_sampler(target_probs, bonus_token_ids, draft_probs,
draft_token_ids) draft_token_ids, generators)
@pytest.mark.parametrize("draft_and_target_probs_equal", [True, False]) @pytest.mark.parametrize("draft_and_target_probs_equal", [True, False])
...@@ -371,11 +417,15 @@ class _CorrectnessTestHelper: ...@@ -371,11 +417,15 @@ class _CorrectnessTestHelper:
dtype=torch.int64, dtype=torch.int64,
device="cuda").repeat(num_samples, 1) device="cuda").repeat(num_samples, 1)
# unseeded
generators = [None]
# Get output tokens via rejection sampling. # Get output tokens via rejection sampling.
output_token_ids = self.rejection_sampler(target_probs.to("cuda"), output_token_ids = self.rejection_sampler(target_probs.to("cuda"),
bonus_token_ids.to("cuda"), bonus_token_ids.to("cuda"),
draft_probs.to("cuda"), draft_probs.to("cuda"),
draft_token_ids.to("cuda")) draft_token_ids.to("cuda"),
generators)
# Remove bonus tokens # Remove bonus tokens
output_token_ids = output_token_ids[:, :-1].flatten() output_token_ids = output_token_ids[:, :-1].flatten()
......
import asyncio import asyncio
from itertools import cycle from itertools import cycle
from typing import Dict, List, Optional, Tuple, Union from typing import Dict, List, Optional, Sequence, Tuple, Union
import pytest import pytest
import ray import ray
...@@ -128,7 +128,9 @@ class AsyncLLM: ...@@ -128,7 +128,9 @@ class AsyncLLM:
try: try:
for i in range(num_requests): for i in range(num_requests):
prompt = prompts[i] if prompts is not None else None prompt = prompts[i] if prompts is not None else None
res = asyncio.run(get_output(prompt, sampling_params)) params = sampling_params[i] if isinstance(
sampling_params, Sequence) else sampling_params
res = asyncio.run(get_output(prompt, params))
outputs.append(res) outputs.append(res)
finally: finally:
ray.shutdown() ray.shutdown()
...@@ -162,6 +164,11 @@ def create_llm_generator(baseline_or_test, request, common_llm_kwargs, ...@@ -162,6 +164,11 @@ def create_llm_generator(baseline_or_test, request, common_llm_kwargs,
} }
test_name = request.node.name test_name = request.node.name
model = kwargs["model"]
draft_model = kwargs.get("speculative_model", None)
same_draft_target_model = (draft_model is not None
and draft_model == model)
def generator_inner(): def generator_inner():
wait_for_gpu_memory_to_clear( wait_for_gpu_memory_to_clear(
...@@ -177,6 +184,13 @@ def create_llm_generator(baseline_or_test, request, common_llm_kwargs, ...@@ -177,6 +184,13 @@ def create_llm_generator(baseline_or_test, request, common_llm_kwargs,
print(f'Creating {baseline_or_test=} LLM for {test_name=}. {kwargs=}') print(f'Creating {baseline_or_test=} LLM for {test_name=}. {kwargs=}')
llm = AsyncLLM(**kwargs) if use_async else LLM(**kwargs) llm = AsyncLLM(**kwargs) if use_async else LLM(**kwargs)
# Override logging interval to 0 for spec decode test run to
# log all metrics in time.
if (baseline_or_test == "test" and not use_async
and llm.llm_engine.log_stats):
for sate_logger in llm.llm_engine.stat_loggers.values():
sate_logger.local_interval = 0
set_random_seed(seed) set_random_seed(seed)
yield llm yield llm
...@@ -188,6 +202,9 @@ def create_llm_generator(baseline_or_test, request, common_llm_kwargs, ...@@ -188,6 +202,9 @@ def create_llm_generator(baseline_or_test, request, common_llm_kwargs,
yield llm yield llm
del llm del llm
# Set an attribute to the generator_outer function to allow us to
# determine whether to further check the acceptance rate in tests.
generator_outer.same_draft_target_model = same_draft_target_model # type: ignore
return generator_outer return generator_outer
...@@ -204,18 +221,27 @@ def maybe_assert_ngram_worker(llm): ...@@ -204,18 +221,27 @@ def maybe_assert_ngram_worker(llm):
def get_output_from_llm_generator(
        llm_generator, prompts,
        sampling_params) -> Tuple[List[str], List[List[int]], float]:
    """Generate with every LLM yielded by ``llm_generator`` and collect results.

    Args:
        llm_generator: Zero-argument callable returning an iterable of LLM
            objects. Each object must provide ``generate`` and ``llm_engine``.
        prompts: Prompts passed through unchanged to ``llm.generate``.
        sampling_params: Sampling parameters passed through unchanged to
            ``llm.generate``.

    Returns:
        A ``(tokens, token_ids, acceptance_rate)`` tuple. ``tokens`` and
        ``token_ids`` hold the text and token ids of the first completion of
        each request. ``acceptance_rate`` is the value read from the
        "prometheus" stat logger's draft-acceptance-rate gauge, or ``-1.0``
        when the engine has no (or an empty) ``stat_loggers`` attribute.
        If the generator yields more than one LLM, the values from the last
        one are returned.

    Raises:
        KeyError: If ``stat_loggers`` is non-empty but has no "prometheus"
            entry.
    """
    tokens: List[str] = []
    token_ids: List[List[int]] = []
    # Sentinel meaning "no acceptance rate could be read".
    acceptance_rate: float = -1.0

    for llm in llm_generator():
        maybe_assert_ngram_worker(llm)

        outputs = llm.generate(prompts, sampling_params, use_tqdm=True)

        # Only the first completion of each request is inspected.
        token_ids = [output.outputs[0].token_ids for output in outputs]
        tokens = [output.outputs[0].text for output in outputs]

        # Fetch acceptance rate if logging is enabled.
        if stat_loggers := getattr(llm.llm_engine, "stat_loggers", None):
            stat_logger = stat_loggers["prometheus"]
            # NOTE(review): reads the gauge through its private ``_value``
            # holder; presumably a prometheus_client Gauge -- confirm that no
            # public accessor is available before relying on this elsewhere.
            acceptance_rate = (stat_logger.metrics.
                               gauge_spec_decode_draft_acceptance_rate.labels(
                                   **stat_logger.labels)._value.get())

        # Drop the reference before the generator advances to its next step.
        del llm

    return tokens, token_ids, acceptance_rate
def get_logprobs_from_llm_generator( def get_logprobs_from_llm_generator(
...@@ -237,12 +263,37 @@ def run_greedy_equality_correctness_test(baseline_llm_generator, ...@@ -237,12 +263,37 @@ def run_greedy_equality_correctness_test(baseline_llm_generator,
batch_size, batch_size,
max_output_len, max_output_len,
force_output_len: bool, force_output_len: bool,
print_tokens: bool = False): print_tokens: bool = False,
ensure_all_accepted: bool = False):
"""Helper method that compares the outputs of both the baseline LLM and """Helper method that compares the outputs of both the baseline LLM and
the test LLM. It asserts greedy equality, e.g. that the outputs are exactly the test LLM. It asserts greedy equality, e.g. that the outputs are exactly
the same when temperature is zero. the same when temperature is zero.
""" """
temperature = 0.0
run_equality_correctness_test(baseline_llm_generator,
test_llm_generator,
batch_size,
max_output_len,
force_output_len,
temperature=0.0,
seeded=False,
print_tokens=print_tokens,
ensure_all_accepted=ensure_all_accepted)
def run_equality_correctness_test(baseline_llm_generator,
test_llm_generator,
batch_size,
max_output_len,
force_output_len: bool,
temperature: float,
seeded: bool,
print_tokens: bool = False,
ensure_all_accepted: bool = False):
"""Helper method that compares the outputs of both the baseline LLM and
the test LLM. It asserts greedy equality, e.g. that the outputs are exactly
the same when temperature is zero (or when temperature is > 0 and seeded).
"""
prompts = [ prompts = [
"Hello, my name is", "Hello, my name is",
...@@ -261,18 +312,29 @@ def run_greedy_equality_correctness_test(baseline_llm_generator, ...@@ -261,18 +312,29 @@ def run_greedy_equality_correctness_test(baseline_llm_generator,
# sampling params to ignore eos token. # sampling params to ignore eos token.
ignore_eos = force_output_len ignore_eos = force_output_len
sampling_params = SamplingParams( if seeded:
max_tokens=max_output_len, sampling_params = [
ignore_eos=ignore_eos, SamplingParams(
temperature=temperature, max_tokens=max_output_len,
) ignore_eos=ignore_eos,
temperature=temperature,
seed=i,
) for i in range(len(prompts))
]
else:
sampling_params = SamplingParams(
max_tokens=max_output_len,
ignore_eos=ignore_eos,
temperature=temperature,
)
spec_batch_tokens, spec_batch_token_ids = get_output_from_llm_generator( (spec_batch_tokens, spec_batch_token_ids,
test_llm_generator, prompts, sampling_params) acceptance_rate) = get_output_from_llm_generator(test_llm_generator,
prompts, sampling_params)
(baseline_batch_tokens, (baseline_batch_tokens, baseline_batch_token_ids,
baseline_batch_token_ids) = get_output_from_llm_generator( _) = get_output_from_llm_generator(baseline_llm_generator, prompts,
baseline_llm_generator, prompts, sampling_params) sampling_params)
assert len(baseline_batch_token_ids) == len(prompts) assert len(baseline_batch_token_ids) == len(prompts)
assert len(spec_batch_token_ids) == len(prompts) assert len(spec_batch_token_ids) == len(prompts)
...@@ -287,3 +349,6 @@ def run_greedy_equality_correctness_test(baseline_llm_generator, ...@@ -287,3 +349,6 @@ def run_greedy_equality_correctness_test(baseline_llm_generator,
print(f'{i=} {baseline_token_ids=}') print(f'{i=} {baseline_token_ids=}')
print(f'{i=} {spec_token_ids=}') print(f'{i=} {spec_token_ids=}')
assert baseline_token_ids == spec_token_ids assert baseline_token_ids == spec_token_ids
if ensure_all_accepted:
assert acceptance_rate == 1.0
...@@ -83,6 +83,9 @@ def test_target_model_tp_gt_1(baseline_llm_generator, test_llm_generator, ...@@ -83,6 +83,9 @@ def test_target_model_tp_gt_1(baseline_llm_generator, test_llm_generator,
# cleaned up properly, and its server host thread leaks, causing the # cleaned up properly, and its server host thread leaks, causing the
# second run of the test to fail with internal NCCL error. # second run of the test to fail with internal NCCL error.
"use_async": True, "use_async": True,
# precision
"dtype": "float32",
}]) }])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize( @pytest.mark.parametrize(
......
...@@ -58,3 +58,65 @@ def test_draft_model_tp_lt_target_model_tp4(test_llm_generator, ...@@ -58,3 +58,65 @@ def test_draft_model_tp_lt_target_model_tp4(test_llm_generator,
batch_size, batch_size,
max_output_len=32, max_output_len=32,
force_output_len=True) force_output_len=True)
@pytest.mark.skipif(torch.cuda.device_count() < 4,
                    reason="Need at least 4 GPUs to run the test.")
@pytest.mark.parametrize(
    "common_llm_kwargs",
    [{
        "model": "JackFram/llama-160m",

        # Eager mode avoids cuda graph capture, keeping the test fast.
        "enforce_eager": True,

        # Speculative decoding needs the v2 block manager.
        "use_v2_block_manager": True,
        "tensor_parallel_size": 4,

        # Run through the AsyncLLM engine so the engine lives in its own
        # process. vLLM does not follow true SPMD, so otherwise the test
        # runner process would host both the engine and the rank0 worker;
        # NCCL is then not cleaned up properly, its server host thread leaks,
        # and the second run of the test fails with an internal NCCL error.
        "use_async": True,
    }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize(
    "test_llm_kwargs",
    [
        {
            "speculative_model": "JackFram/llama-68m",
            "num_speculative_tokens": 5,

            # Deliberately tiny draft-model context: vLLM is forced to skip
            # speculation once the sequences grow beyond 32-k tokens.
            "speculative_max_model_len": 32,
        },
    ])
@pytest.mark.parametrize("batch_size", [8])
@pytest.mark.parametrize(
    "output_len",
    [
        # Comfortably larger than speculative_max_model_len so that the
        # "every sequence is skipped" case is reached, yet still small
        # enough for the test to stay fast.
        64,
    ])
@pytest.mark.parametrize("seed", [1])
def test_skip_speculation(baseline_llm_generator, test_llm_generator,
                          batch_size: int, output_len: int):
    """Expect a RuntimeError once all sequences skip speculation.

    The draft model's max model len is set artificially low, so sequences
    that grow beyond it are skipped in speculative decoding.

    TODO: fix it to pass without raising Error. (#5814)
    """
    with pytest.raises(RuntimeError):
        run_greedy_equality_correctness_test(
            baseline_llm_generator,
            test_llm_generator,
            batch_size,
            max_output_len=output_len,
            force_output_len=True,
        )
...@@ -22,10 +22,12 @@ from .conftest import get_logprobs_from_llm_generator ...@@ -22,10 +22,12 @@ from .conftest import get_logprobs_from_llm_generator
}]) }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [{ @pytest.mark.parametrize("test_llm_kwargs",
"speculative_model": "JackFram/llama-160m", [{
"num_speculative_tokens": 3, "speculative_model": "JackFram/llama-160m",
}]) "num_speculative_tokens": 3,
"disable_logprobs_during_spec_decoding": False,
}])
@pytest.mark.parametrize("batch_size", [8]) @pytest.mark.parametrize("batch_size", [8])
@pytest.mark.parametrize( @pytest.mark.parametrize(
"output_len", "output_len",
...@@ -59,10 +61,12 @@ def test_logprobs_equality(baseline_llm_generator, test_llm_generator, ...@@ -59,10 +61,12 @@ def test_logprobs_equality(baseline_llm_generator, test_llm_generator,
}]) }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [{ @pytest.mark.parametrize("test_llm_kwargs",
"speculative_model": "JackFram/llama-160m", [{
"num_speculative_tokens": 3, "speculative_model": "JackFram/llama-160m",
}]) "num_speculative_tokens": 3,
"disable_logprobs_during_spec_decoding": False,
}])
@pytest.mark.parametrize("batch_size", [1]) @pytest.mark.parametrize("batch_size", [1])
@pytest.mark.parametrize("num_logprobs", [6]) @pytest.mark.parametrize("num_logprobs", [6])
@pytest.mark.parametrize( @pytest.mark.parametrize(
...@@ -99,13 +103,16 @@ def test_diff_num_logprobs(baseline_llm_generator, test_llm_generator, ...@@ -99,13 +103,16 @@ def test_diff_num_logprobs(baseline_llm_generator, test_llm_generator,
}]) }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [{ @pytest.mark.parametrize("test_llm_kwargs",
"speculative_model": "JackFram/llama-160m", [{
"num_speculative_tokens": 3, "speculative_model": "JackFram/llama-160m",
}, { "num_speculative_tokens": 3,
"speculative_model": "JackFram/llama-160m", "disable_logprobs_during_spec_decoding": False,
"num_speculative_tokens": 6, }, {
}]) "speculative_model": "JackFram/llama-160m",
"num_speculative_tokens": 6,
"disable_logprobs_during_spec_decoding": False,
}])
@pytest.mark.parametrize("batch_size", [8]) @pytest.mark.parametrize("batch_size", [8])
@pytest.mark.parametrize( @pytest.mark.parametrize(
"output_len", "output_len",
...@@ -143,6 +150,7 @@ def test_logprobs_different_k(baseline_llm_generator, test_llm_generator, ...@@ -143,6 +150,7 @@ def test_logprobs_different_k(baseline_llm_generator, test_llm_generator,
[{ [{
"speculative_model": "JackFram/llama-160m", "speculative_model": "JackFram/llama-160m",
"num_speculative_tokens": 3, "num_speculative_tokens": 3,
"disable_logprobs_during_spec_decoding": False,
# Artificially limit the draft model max model len; this forces vLLM # Artificially limit the draft model max model len; this forces vLLM
# to skip speculation once the sequences grow beyond 32-k tokens. # to skip speculation once the sequences grow beyond 32-k tokens.
...@@ -181,10 +189,12 @@ def test_logprobs_when_skip_speculation(baseline_llm_generator, ...@@ -181,10 +189,12 @@ def test_logprobs_when_skip_speculation(baseline_llm_generator,
}]) }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [{ @pytest.mark.parametrize("test_llm_kwargs",
"speculative_model": "JackFram/llama-160m", [{
"num_speculative_tokens": 3, "speculative_model": "JackFram/llama-160m",
}]) "num_speculative_tokens": 3,
"disable_logprobs_during_spec_decoding": False,
}])
@pytest.mark.parametrize("batch_size", [1]) @pytest.mark.parametrize("batch_size", [1])
@pytest.mark.parametrize( @pytest.mark.parametrize(
"output_len", "output_len",
......
...@@ -97,7 +97,7 @@ def test_spec_decode_e2e_with_detokenization(test_llm_generator, ...@@ -97,7 +97,7 @@ def test_spec_decode_e2e_with_detokenization(test_llm_generator,
temperature=temperature, temperature=temperature,
) )
batch_tokens, batch_token_ids = get_output_from_llm_generator( batch_tokens, batch_token_ids, _ = get_output_from_llm_generator(
test_llm_generator, prompts, sampling_params) test_llm_generator, prompts, sampling_params)
# Expect a generation for each prompt in the batch. # Expect a generation for each prompt in the batch.
...@@ -200,12 +200,18 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_bs1( ...@@ -200,12 +200,18 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_bs1(
Since this test is cheaper than other e2e correctness tests, we generate Since this test is cheaper than other e2e correctness tests, we generate
with a higher output_len. with a higher output_len.
When the draft model is the same as the target model, we further check
whether all speculative tokens are accepted.
""" """
run_greedy_equality_correctness_test(baseline_llm_generator, ensure_all_accepted = test_llm_generator.same_draft_target_model
test_llm_generator, run_greedy_equality_correctness_test(
batch_size, baseline_llm_generator,
max_output_len=output_len, test_llm_generator,
force_output_len=True) batch_size,
max_output_len=output_len,
force_output_len=True,
ensure_all_accepted=ensure_all_accepted)
@pytest.mark.parametrize( @pytest.mark.parametrize(
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment