"vscode:/vscode.git/clone" did not exist on "53d2420b4447fbcab572dc23d2c3bb9224a8a561"
Commit 7c4f76e3 authored by zhuwenwen's avatar zhuwenwen
Browse files

merge v0.4.0

parents 2da0dd3e 51c31bc1
...@@ -2,57 +2,43 @@ import pytest ...@@ -2,57 +2,43 @@ import pytest
from transformers import AutoTokenizer, PreTrainedTokenizerBase from transformers import AutoTokenizer, PreTrainedTokenizerBase
from vllm.lora.request import LoRARequest from vllm.lora.request import LoRARequest
from vllm.transformers_utils.tokenizer import TokenizerGroup, get_lora_tokenizer from vllm.transformers_utils.tokenizer import get_lora_tokenizer
from vllm.transformers_utils.tokenizer_group import get_tokenizer_group
from ..conftest import get_tokenizer_pool_config
@pytest.mark.asyncio
async def test_transformers_tokenizer():
reference_tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer = TokenizerGroup(
tokenizer_id="gpt2",
enable_lora=False,
max_num_seqs=1,
max_input_length=None,
)
assert reference_tokenizer.encode("prompt") == tokenizer.encode(
request_id="request_id", prompt="prompt", lora_request=None)
assert reference_tokenizer.encode(
"prompt") == await tokenizer.encode_async(request_id="request_id",
prompt="prompt",
lora_request=None)
assert isinstance(tokenizer.get_lora_tokenizer(None),
PreTrainedTokenizerBase)
assert tokenizer.get_lora_tokenizer(
None) == await tokenizer.get_lora_tokenizer_async(None)
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_transformers_tokenizer_lora(sql_lora_files): @pytest.mark.parametrize("tokenizer_group_type", [None, "ray"])
async def test_tokenizer_group_lora(sql_lora_files, tokenizer_group_type):
reference_tokenizer = AutoTokenizer.from_pretrained(sql_lora_files) reference_tokenizer = AutoTokenizer.from_pretrained(sql_lora_files)
tokenizer = TokenizerGroup( tokenizer_group = get_tokenizer_group(
get_tokenizer_pool_config(tokenizer_group_type),
tokenizer_id="gpt2", tokenizer_id="gpt2",
enable_lora=True, enable_lora=True,
max_num_seqs=1, max_num_seqs=1,
max_input_length=None, max_input_length=None,
) )
lora_request = LoRARequest("1", 1, sql_lora_files) lora_request = LoRARequest("1", 1, sql_lora_files)
assert reference_tokenizer.encode("prompt") == tokenizer.encode( assert reference_tokenizer.encode("prompt") == tokenizer_group.encode(
request_id="request_id", prompt="prompt", lora_request=lora_request) request_id="request_id", prompt="prompt", lora_request=lora_request)
assert reference_tokenizer.encode( assert reference_tokenizer.encode(
"prompt") == await tokenizer.encode_async(request_id="request_id", "prompt") == await tokenizer_group.encode_async(
prompt="prompt", request_id="request_id",
lora_request=lora_request) prompt="prompt",
assert isinstance(tokenizer.get_lora_tokenizer(None), lora_request=lora_request)
assert isinstance(tokenizer_group.get_lora_tokenizer(None),
PreTrainedTokenizerBase) PreTrainedTokenizerBase)
assert tokenizer.get_lora_tokenizer( assert tokenizer_group.get_lora_tokenizer(
None) == await tokenizer.get_lora_tokenizer_async(None) None) == await tokenizer_group.get_lora_tokenizer_async(None)
assert isinstance(tokenizer.get_lora_tokenizer(lora_request), assert isinstance(tokenizer_group.get_lora_tokenizer(lora_request),
PreTrainedTokenizerBase) PreTrainedTokenizerBase)
assert tokenizer.get_lora_tokenizer( assert tokenizer_group.get_lora_tokenizer(
lora_request) != tokenizer.get_lora_tokenizer(None) lora_request) != tokenizer_group.get_lora_tokenizer(None)
assert tokenizer.get_lora_tokenizer( assert tokenizer_group.get_lora_tokenizer(
lora_request) == await tokenizer.get_lora_tokenizer_async(lora_request) lora_request) == await tokenizer_group.get_lora_tokenizer_async(
lora_request)
def test_get_lora_tokenizer(sql_lora_files, tmpdir): def test_get_lora_tokenizer(sql_lora_files, tmpdir):
......
...@@ -2,8 +2,8 @@ from collections import OrderedDict ...@@ -2,8 +2,8 @@ from collections import OrderedDict
from torch import nn from torch import nn
from vllm.lora.utils import parse_fine_tuned_lora_name, replace_submodule
from vllm.utils import LRUCache from vllm.utils import LRUCache
from vllm.lora.utils import (parse_fine_tuned_lora_name, replace_submodule)
def test_parse_fine_tuned_lora_name(): def test_parse_fine_tuned_lora_name():
......
...@@ -3,10 +3,10 @@ import random ...@@ -3,10 +3,10 @@ import random
import tempfile import tempfile
from unittest.mock import patch from unittest.mock import patch
from vllm.config import (DeviceConfig, LoRAConfig, ModelConfig, ParallelConfig,
SchedulerConfig)
from vllm.lora.models import LoRAMapping from vllm.lora.models import LoRAMapping
from vllm.lora.request import LoRARequest from vllm.lora.request import LoRARequest
from vllm.config import (ModelConfig, ParallelConfig, SchedulerConfig,
DeviceConfig, LoRAConfig)
from vllm.worker.worker import Worker from vllm.worker.worker import Worker
...@@ -25,7 +25,7 @@ def test_worker_apply_lora(sql_lora_files): ...@@ -25,7 +25,7 @@ def test_worker_apply_lora(sql_lora_files):
revision=None, revision=None,
), ),
parallel_config=ParallelConfig(1, 1, False), parallel_config=ParallelConfig(1, 1, False),
scheduler_config=SchedulerConfig(32, 32, 32, 256), scheduler_config=SchedulerConfig(32, 32, 32),
device_config=DeviceConfig("cuda"), device_config=DeviceConfig("cuda"),
local_rank=0, local_rank=0,
rank=0, rank=0,
...@@ -33,7 +33,7 @@ def test_worker_apply_lora(sql_lora_files): ...@@ -33,7 +33,7 @@ def test_worker_apply_lora(sql_lora_files):
max_loras=32), max_loras=32),
distributed_init_method=f"file://{tempfile.mkstemp()[1]}", distributed_init_method=f"file://{tempfile.mkstemp()[1]}",
) )
worker.init_model() worker.init_device()
worker.load_model() worker.load_model()
worker.model_runner.set_active_loras([], LoRAMapping([], [])) worker.model_runner.set_active_loras([], LoRAMapping([], []))
......
...@@ -21,7 +21,8 @@ def test_metric_counter_prompt_tokens( ...@@ -21,7 +21,8 @@ def test_metric_counter_prompt_tokens(
gpu_memory_utilization=0.4) gpu_memory_utilization=0.4)
tokenizer = vllm_model.model.get_tokenizer() tokenizer = vllm_model.model.get_tokenizer()
prompt_token_counts = [len(tokenizer.encode(p)) for p in example_prompts] prompt_token_counts = [len(tokenizer.encode(p)) for p in example_prompts]
# This test needs at least 2 prompts in a batch of different lengths to verify their token count is correct despite padding. # This test needs at least 2 prompts in a batch of different lengths to
# verify their token count is correct despite padding.
assert len(example_prompts) > 1, "at least 2 prompts are required" assert len(example_prompts) > 1, "at least 2 prompts are required"
assert prompt_token_counts[0] != prompt_token_counts[1], ( assert prompt_token_counts[0] != prompt_token_counts[1], (
"prompts of different lengths are required") "prompts of different lengths are required")
...@@ -33,8 +34,8 @@ def test_metric_counter_prompt_tokens( ...@@ -33,8 +34,8 @@ def test_metric_counter_prompt_tokens(
**stat_logger.labels)._value.get() **stat_logger.labels)._value.get()
assert vllm_prompt_token_count == metric_count, ( assert vllm_prompt_token_count == metric_count, (
f"prompt token count: {vllm_prompt_token_count!r}\nmetric: {metric_count!r}" f"prompt token count: {vllm_prompt_token_count!r}\n"
) f"metric: {metric_count!r}")
@pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("model", MODELS)
...@@ -60,9 +61,10 @@ def test_metric_counter_generation_tokens( ...@@ -60,9 +61,10 @@ def test_metric_counter_generation_tokens(
for i in range(len(example_prompts)): for i in range(len(example_prompts)):
vllm_output_ids, vllm_output_str = vllm_outputs[i] vllm_output_ids, vllm_output_str = vllm_outputs[i]
prompt_ids = tokenizer.encode(example_prompts[i]) prompt_ids = tokenizer.encode(example_prompts[i])
# vllm_output_ids contains both prompt tokens and generation tokens. We're interested only in the count of the generation tokens. # vllm_output_ids contains both prompt tokens and generation tokens.
# We're interested only in the count of the generation tokens.
vllm_generation_count += len(vllm_output_ids) - len(prompt_ids) vllm_generation_count += len(vllm_output_ids) - len(prompt_ids)
assert vllm_generation_count == metric_count, ( assert vllm_generation_count == metric_count, (
f"generation token count: {vllm_generation_count!r}\nmetric: {metric_count!r}" f"generation token count: {vllm_generation_count!r}\n"
) f"metric: {metric_count!r}")
"""Compare the outputs of HF and vLLM when using greedy sampling.
This tests bigger models and use half precision.
Run `pytest tests/models/test_big_models.py`.
"""
import pytest
# Model ids used to parametrize test_models below. Entries that are commented
# out carry a "Broken" marker and are therefore not exercised.
MODELS = [
"meta-llama/Llama-2-7b-hf",
# "mistralai/Mistral-7B-v0.1", # Broken
# "Deci/DeciLM-7b", # Broken
# "tiiuae/falcon-7b", # Broken
"EleutherAI/gpt-j-6b",
"mosaicml/mpt-7b",
# "Qwen/Qwen1.5-0.5B" # Broken,
]
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [32])
def test_models(
    hf_runner,
    vllm_runner,
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
) -> None:
    """Check that greedy generation matches between the HF and vLLM runners.

    Both runners are built for the same ``model``/``dtype`` and asked to
    greedily generate ``max_tokens`` tokens for every entry of
    ``example_prompts``. For each prompt the generated string and the
    generated token ids must be equal.
    """
    # Reference run; the runner is dropped before the vLLM one is created.
    reference_runner = hf_runner(model, dtype=dtype)
    hf_outputs = reference_runner.generate_greedy(example_prompts, max_tokens)
    del reference_runner

    # Run under test.
    engine_runner = vllm_runner(model, dtype=dtype)
    vllm_outputs = engine_runner.generate_greedy(example_prompts, max_tokens)
    del engine_runner

    num_prompts = len(example_prompts)
    for i in range(num_prompts):
        hf_output_ids, hf_output_str = hf_outputs[i]
        vllm_output_ids, vllm_output_str = vllm_outputs[i]

        # Compare the text first, then the raw token ids.
        assert hf_output_str == vllm_output_str, (
            f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
        assert hf_output_ids == vllm_output_ids, (
            f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
import gc
from dataclasses import fields
from enum import Enum
from typing import Dict, List, Tuple
import pytest
import torch
from transformers import AutoTokenizer
from vllm.config import VisionLanguageConfig
# (model id, VisionLanguageConfig) pairs used to parametrize test_models.
# The same model id appears twice: once with the PIXEL_VALUES image input type
# and once with IMAGE_FEATURES, each with its own image_input_shape.
model_and_vl_config = [
("llava-hf/llava-1.5-7b-hf",
VisionLanguageConfig(
image_input_type=VisionLanguageConfig.ImageInputType.PIXEL_VALUES,
image_feature_size=576,
image_token_id=32000,
image_input_shape=(1, 3, 336, 336))),
("llava-hf/llava-1.5-7b-hf",
VisionLanguageConfig(
image_input_type=VisionLanguageConfig.ImageInputType.IMAGE_FEATURES,
image_feature_size=576,
image_token_id=32000,
image_input_shape=(1, 576, 1024)))
]
def as_dict(vision_language_config: VisionLanguageConfig) -> Dict:
    """Flatten vision language config to pure args.

    Every dataclass field of ``vision_language_config`` becomes one entry of
    the returned dict, keyed by field name:

    * ``Enum`` values are replaced by their lower-cased member name,
    * tuples are rendered as a comma-separated string of their items,
    * any other value is passed through unchanged.

    Compatible with what llm entrypoint expects.
    """

    def _to_plain(value):
        # Convert a single field value into its flat representation.
        if isinstance(value, Enum):
            return value.name.lower()
        if isinstance(value, tuple):
            return ",".join(map(str, value))
        return value

    return {
        field.name: _to_plain(getattr(vision_language_config, field.name))
        for field in fields(vision_language_config)
    }
def sanitize_vllm_output(vllm_output: Tuple[List[int], str],
                         vision_language_config: VisionLanguageConfig,
                         model_id: str):
    """Sanitize vllm output to be comparable with hf output.

    The function reduces `input_ids` from 1, 32000, 32000, ..., 32000,
    x1, x2, x3 ... to 1, 32000, x1, x2, x3 ...
    It also reduces `output_str` from "<image><image>bla" to "bla".

    Returns a ``(token_ids, text)`` pair with the repeated image tokens
    stripped as described above.
    """
    # The decoded image token gives the length of one placeholder in the text.
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    image_token_text = tokenizer.decode(vision_language_config.image_token_id)
    chars_per_image_token = len(image_token_text)

    token_ids, text = vllm_output
    feature_size = vision_language_config.image_feature_size

    # Keep the first two ids, then drop the following `feature_size - 1` ids.
    resume_at = 2 + feature_size - 1
    trimmed_ids = token_ids[0:2] + token_ids[resume_at:]

    # Drop `feature_size` leading copies of the image token text.
    trimmed_text = text[feature_size * chars_per_image_token:]
    return trimmed_ids, trimmed_text
@pytest.mark.parametrize("worker_use_ray", [False])
@pytest.mark.parametrize("model_and_config", model_and_vl_config)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [128])
def test_models(hf_runner, vllm_runner, hf_image_prompts, hf_images,
                vllm_image_prompts, vllm_images, model_and_config: tuple,
                dtype: str, max_tokens: int, worker_use_ray: bool) -> None:
    """Inference result should be the same between hf and vllm.

    All the image fixtures for the test is under tests/images.
    For huggingface runner, we provide the raw images as input.
    For vllm runner, we provide image tensors and corresponding
    vision language config as input.
    Note, the text input is also adjusted to abide by vllm contract.
    The text output is sanitized to be able to compare with hf.
    """
    model_id, vision_language_config = model_and_config

    # Reference run with the HF runner.
    reference_runner = hf_runner(model_id, dtype=dtype)
    hf_outputs = reference_runner.generate_greedy(hf_image_prompts,
                                                  max_tokens,
                                                  images=hf_images)
    del reference_runner

    # Run under test; the vision language config is passed as flat kwargs.
    vision_kwargs = as_dict(vision_language_config)
    engine_runner = vllm_runner(model_id,
                                dtype=dtype,
                                worker_use_ray=worker_use_ray,
                                **vision_kwargs)
    vllm_outputs = engine_runner.generate_greedy(vllm_image_prompts,
                                                 max_tokens,
                                                 images=vllm_images)
    del engine_runner

    # Collect garbage and empty the CUDA cache once the runner is dropped.
    gc.collect()
    torch.cuda.empty_cache()

    num_prompts = len(hf_image_prompts)
    for i in range(num_prompts):
        hf_output_ids, hf_output_str = hf_outputs[i]
        # Strip the repeated image tokens before comparing with HF.
        vllm_output_ids, vllm_output_str = sanitize_vllm_output(
            vllm_outputs[i], vision_language_config, model_id)

        assert hf_output_str == vllm_output_str, (
            f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
        assert hf_output_ids == vllm_output_ids, (
            f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
"""Compare the outputs of a GPTQ model to a Marlin model. """Compare the outputs of a GPTQ model to a Marlin model.
Note: GPTQ and Marlin do not have bitwise correctness. Note: GPTQ and Marlin do not have bitwise correctness.
As a result, in this test, we just confirm that the top selected tokens of the As a result, in this test, we just confirm that the top selected tokens of the
Marlin/GPTQ models are in the top 3 selections of each other. Marlin/GPTQ models are in the top 3 selections of each other.
Note: Marlin internally uses locks to synchronize the threads. This can Note: Marlin internally uses locks to synchronize the threads. This can
result in very slight nondeterminism for Marlin. As a result, we re-run the test result in very slight nondeterminism for Marlin. As a result, we re-run the test
up to 3 times to see if we pass. up to 3 times to see if we pass.
Run `pytest tests/models/test_marlin.py --forked`. Run `pytest tests/models/test_marlin.py`.
""" """
from dataclasses import dataclass
import pytest import pytest
import torch import torch
from dataclasses import dataclass
from vllm.model_executor.layers.quantization import _QUANTIZATION_CONFIG_REGISTRY from vllm.model_executor.layers.quantization import (
_QUANTIZATION_CONFIG_REGISTRY)
capability = torch.cuda.get_device_capability() capability = torch.cuda.get_device_capability()
capability = capability[0] * 10 + capability[1] capability = capability[0] * 10 + capability[1]
...@@ -60,7 +63,6 @@ def test_models( ...@@ -60,7 +63,6 @@ def test_models(
# Note: not sure why, but deleting just the model on Ada Lovelace # Note: not sure why, but deleting just the model on Ada Lovelace
# does not free the GPU memory. On Ampere, deleting just the model # does not free the GPU memory. On Ampere, deleting just the model
# frees the memory. # frees the memory.
del marlin_model.model.llm_engine.driver_worker
del marlin_model del marlin_model
gptq_model = vllm_runner(model_pair.model_gptq, dtype=dtype) gptq_model = vllm_runner(model_pair.model_gptq, dtype=dtype)
...@@ -71,7 +73,6 @@ def test_models( ...@@ -71,7 +73,6 @@ def test_models(
# Note: not sure why, but deleting just the model on Ada Lovelace # Note: not sure why, but deleting just the model on Ada Lovelace
# does not free the GPU memory. On Ampere, deleting just the model # does not free the GPU memory. On Ampere, deleting just the model
# frees the memory. # frees the memory.
del gptq_model.model.llm_engine.driver_worker
del gptq_model del gptq_model
# loop through the prompts # loop through the prompts
...@@ -87,11 +88,11 @@ def test_models( ...@@ -87,11 +88,11 @@ def test_models(
if marlin_output_id != gptq_output_id: if marlin_output_id != gptq_output_id:
# Each predicted token must be in top 5 of the other's # Each predicted token must be in top 5 of the other's
assert gptq_output_id in marlin_logprobs[idx], ( assert gptq_output_id in marlin_logprobs[idx], (
f"Test{prompt_idx}:\nGPTQ:\t{gptq_output_str!r}\nMarlin:\t{marlin_output_str!r}" f"Test{prompt_idx}:\nGPTQ:\t{gptq_output_str!r}\n"
) f"Marlin:\t{marlin_output_str!r}")
assert marlin_output_id in gptq_logprobs[idx], ( assert marlin_output_id in gptq_logprobs[idx], (
f"Test{prompt_idx}:\nGPTQ:\t{gptq_output_str!r}\nMarlin:\t{marlin_output_str!r}" f"Test{prompt_idx}:\nGPTQ:\t{gptq_output_str!r}\n"
) f"Marlin:\t{marlin_output_str!r}")
# Break out since sequences will now diverge. # Break out since sequences will now diverge.
break break
"""Compare the outputs of HF and vLLM for Mistral models using greedy sampling. """Compare the outputs of HF and vLLM for Mistral models using greedy sampling.
Run `pytest tests/models/test_mistral.py --forked`. Run `pytest tests/models/test_mistral.py`.
""" """
import pytest import pytest
...@@ -12,6 +12,9 @@ MODELS = [ ...@@ -12,6 +12,9 @@ MODELS = [
@pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"]) @pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [128]) @pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.skip(
"Two problems: 1. Failing correctness tests. 2. RuntimeError: expected "
"scalar type BFloat16 but found Half (only in CI).")
def test_models( def test_models(
hf_runner, hf_runner,
vllm_runner, vllm_runner,
......
"""Compare the outputs of HF and vLLM when using greedy sampling. """Compare the outputs of HF and vLLM when using greedy sampling.
Run `pytest tests/models/test_models.py --forked`. This test only tests small models. Big models such as 7B should be tested from
test_big_models.py because it could use a larger instance to run tests.
Run `pytest tests/models/test_models.py`.
""" """
import pytest import pytest
MODELS = [ MODELS = [
"facebook/opt-125m", "facebook/opt-125m",
"meta-llama/Llama-2-7b-hf",
"mistralai/Mistral-7B-v0.1",
"Deci/DeciLM-7b",
"tiiuae/falcon-7b",
"gpt2", "gpt2",
"bigcode/tiny_starcoder_py", "bigcode/tiny_starcoder_py",
"EleutherAI/gpt-j-6b",
"EleutherAI/pythia-70m", "EleutherAI/pythia-70m",
"bigscience/bloom-560m", "bigscience/bloom-560m",
"mosaicml/mpt-7b",
"microsoft/phi-2", "microsoft/phi-2",
"stabilityai/stablelm-3b-4e1t", "stabilityai/stablelm-3b-4e1t",
"allenai/OLMo-1B", # "allenai/OLMo-1B", # Broken
"bigcode/starcoder2-3b", "bigcode/starcoder2-3b",
] ]
@pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"]) @pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize("max_tokens", [128]) @pytest.mark.parametrize("max_tokens", [96])
def test_models( def test_models(
hf_runner, hf_runner,
vllm_runner, vllm_runner,
...@@ -34,6 +31,9 @@ def test_models( ...@@ -34,6 +31,9 @@ def test_models(
dtype: str, dtype: str,
max_tokens: int, max_tokens: int,
) -> None: ) -> None:
# To pass the small model tests, we need full precision.
assert dtype == "float"
hf_model = hf_runner(model, dtype=dtype) hf_model = hf_runner(model, dtype=dtype)
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
del hf_model del hf_model
......
...@@ -4,38 +4,72 @@ Run `pytest tests/prefix_caching/test_prefix_caching.py`. ...@@ -4,38 +4,72 @@ Run `pytest tests/prefix_caching/test_prefix_caching.py`.
""" """
import pytest import pytest
from vllm import LLM, SamplingParams from vllm.core.block_manager_v1 import CachedBlockAllocator
from vllm.utils import Device
prefix = (
"You are an expert school principal, skilled in effectively managing "
"faculty and staff. Draft 10-15 questions for a potential first grade " @pytest.mark.parametrize("block_size", [16])
"Head Teacher for my K-12, all-girls', independent school that emphasizes " @pytest.mark.parametrize("num_blocks", [16])
"community, joyful discovery, and life-long learning. The candidate is " def test_block_allocator(
"coming in for a first-round panel interview for a 8th grade Math " block_size: int,
"teaching role. They have 5 years of previous teaching experience " num_blocks: int,
"as an assistant teacher at a co-ed, public school with experience "
"in middle school math teaching. Based on these information, fulfill "
"the following paragraph: ")
@pytest.mark.parametrize("model", ["facebook/opt-125m"])
@pytest.mark.parametrize("max_tokens", [16])
def test_prefix_caching(
example_prompts,
model: str,
max_tokens: int,
): ):
llm = LLM(model=model) block_hash = 1
# -1 since the last token can change when concatenating prompts. block_allocator = CachedBlockAllocator(Device.CPU, block_size, num_blocks)
prefix_pos = len(llm.llm_engine.tokenizer.encode(prefix)) - 1
prompts = [prefix + prompt for prompt in example_prompts] # Allocate two PysicalTokenBlocks with the same hash and check
sampling_params = SamplingParams(temperature=0.0, max_tokens=max_tokens) # that they are the same PhysicalTokenBlock
outputs_without_prefix = llm.generate(prompts, sampling_params) first_block = block_allocator.allocate(block_hash, 0)
outputs_with_prefix = llm.generate(prompts, second_block = block_allocator.allocate(block_hash, 0)
sampling_params, assert (first_block == second_block)
prefix_pos=[prefix_pos] * len(prompts)) assert (second_block.ref_count == 2)
for output_without_prefix, output_with_prefix in zip(
outputs_without_prefix, outputs_with_prefix): # Free the first_block and confirm that the ref_count is correctly
assert (output_without_prefix.outputs[0].token_ids == # decremented on the second block
output_with_prefix.outputs[0].token_ids) block_allocator.free(first_block)
assert len(llm.llm_engine.scheduler.prefix_pool.prefixes) == 1 assert (second_block.ref_count == 1)
# Free the second block
block_allocator.free(second_block)
# Reallocate the first block and confirm that, even after the block
# had its ref_count go to 0, we still get the same block back
first_block = block_allocator.allocate(block_hash, 0)
assert (first_block == second_block)
assert (first_block.block_hash == block_hash)
@pytest.mark.parametrize("num_blocks", [16])
def test_eviction(num_blocks: int):
    """Exercise eviction and reuse of freed blocks in CachedBlockAllocator.

    All ``num_blocks`` blocks are allocated (block ``i`` gets hash ``i``) and
    then freed in the same order. Subsequent allocations check which freed
    block is handed back for a new hash and for a previously used hash.
    """
    block_size = 16
    allocator = CachedBlockAllocator(Device.CPU, block_size, num_blocks)

    # Fill the allocator, using the loop index as the block_hash.
    allocated = [allocator.allocate(block_hash, 0)
                 for block_hash in range(num_blocks)]

    # Free all blocks, in allocation order.
    for block in allocated:
        allocator.free(block)

    # A new hash must be served by the first block that was freed,
    # i.e. the least recently used block.
    # NOTE(review): hashes 0..num_blocks-1 are already taken, so block_size
    # is only an unused hash while block_size >= num_blocks.
    fresh_hash = block_size
    fresh_block = allocator.allocate(fresh_hash, 0)
    assert fresh_block == allocated[0]
    assert fresh_block.block_hash == fresh_hash

    # Reallocate the second entry of `allocated` to remove it from the
    # free list.
    reused_hash = 1
    reused_block = allocator.allocate(reused_hash, 0)
    assert reused_block == allocated[reused_hash]
    assert reused_block.block_hash == reused_hash

    # Another new hash must not be given the reallocated block, since that
    # block shouldn't be in the free list any more.
    next_hash = block_size + 1
    next_block = allocator.allocate(next_hash, 0)
    assert reused_block != next_block
    assert next_block.block_hash == next_hash
    assert next_block.block_number == 2
"""Compare the outputs of HF and vLLM when using beam search. """Compare the outputs of HF and vLLM when using beam search.
Run `pytest tests/samplers/test_beam_search.py --forked`. Run `pytest tests/samplers/test_beam_search.py`.
""" """
import gc
import pytest import pytest
import torch
# FIXME(zhuohan): The test can not pass if we: # FIXME(zhuohan): The test can not pass if we:
# 1. Increase max_tokens to 256. # 1. Increase max_tokens to 256.
...@@ -36,6 +39,10 @@ def test_beam_search_single_input( ...@@ -36,6 +39,10 @@ def test_beam_search_single_input(
vllm_outputs = vllm_model.generate_beam_search(example_prompts, beam_width, vllm_outputs = vllm_model.generate_beam_search(example_prompts, beam_width,
max_tokens) max_tokens)
del vllm_model del vllm_model
# NOTE(woosuk): For some reason, the following GC is required to avoid
# GPU OOM errors in the following tests using `vllm_runner`.
gc.collect()
torch.cuda.empty_cache()
for i in range(len(example_prompts)): for i in range(len(example_prompts)):
hf_output_ids, _ = hf_outputs[i] hf_output_ids, _ = hf_outputs[i]
......
import pytest import pytest
import torch import torch
from tests.conftest import VllmRunner
from vllm import SamplingParams from vllm import SamplingParams
MODELS = ["facebook/opt-125m"] MODELS = ["facebook/opt-125m"]
...@@ -16,6 +17,7 @@ def test_get_prompt_logprobs( ...@@ -16,6 +17,7 @@ def test_get_prompt_logprobs(
example_prompts, example_prompts,
): ):
max_tokens = 5 max_tokens = 5
num_top_logprobs = 6
hf_model = hf_runner(model, dtype=dtype) hf_model = hf_runner(model, dtype=dtype)
hf_logprobs = hf_model.generate_greedy_logprobs( hf_logprobs = hf_model.generate_greedy_logprobs(
example_prompts, example_prompts,
...@@ -23,19 +25,32 @@ def test_get_prompt_logprobs( ...@@ -23,19 +25,32 @@ def test_get_prompt_logprobs(
) )
del hf_model del hf_model
vllm_model = vllm_runner(model, dtype=dtype) vllm_model = vllm_runner(model, dtype=dtype, max_logprobs=num_top_logprobs)
vllm_sampling_params = SamplingParams(max_tokens=max_tokens, vllm_sampling_params = SamplingParams(max_tokens=max_tokens,
logprobs=5, logprobs=num_top_logprobs,
prompt_logprobs=5, prompt_logprobs=5,
temperature=0.0) temperature=0.0)
vllm_results = vllm_model.model.generate( vllm_results = vllm_model.model.generate(
example_prompts, sampling_params=vllm_sampling_params) example_prompts, sampling_params=vllm_sampling_params)
del vllm_model
# Test whether logprobs are included in the results. # Test whether logprobs are included in the results.
for result in vllm_results: for result in vllm_results:
assert result.prompt_logprobs is not None assert result.prompt_logprobs is not None
assert result.outputs[0].logprobs is not None assert result.outputs[0].logprobs is not None
assert len(result.outputs[0].logprobs) == max_tokens
for logprobs in result.outputs[0].logprobs:
assert len(logprobs) == num_top_logprobs
output_text = result.outputs[0].text
output_string_from_most_likely_tokens = []
for top_logprobs in result.outputs[0].logprobs:
top_logprob = next(iter(top_logprobs.values()))
output_string_from_most_likely_tokens.append(
top_logprob.decoded_token)
output_string_from_most_likely_tokens = "".join(
output_string_from_most_likely_tokens)
assert output_text == output_string_from_most_likely_tokens, (
"The output text from the top logprob for each token position "
"should be the same as the output text in the result.")
# Test whether prompt logprobs are consistent with HF # Test whether prompt logprobs are consistent with HF
for vllm_result, hf_logprob in zip(vllm_results, hf_logprobs): for vllm_result, hf_logprob in zip(vllm_results, hf_logprobs):
...@@ -43,14 +58,29 @@ def test_get_prompt_logprobs( ...@@ -43,14 +58,29 @@ def test_get_prompt_logprobs(
vllm_prompt_logprobs = vllm_result.prompt_logprobs[1:] vllm_prompt_logprobs = vllm_result.prompt_logprobs[1:]
for i, vllm_prompt_logprob_dict in enumerate(vllm_prompt_logprobs): for i, vllm_prompt_logprob_dict in enumerate(vllm_prompt_logprobs):
for token_id, logprob in vllm_prompt_logprob_dict.items(): for token_id, logprob in vllm_prompt_logprob_dict.items():
torch.testing.assert_close(logprob, torch.testing.assert_close(logprob.logprob,
hf_logprob[0][i][token_id].item(), hf_logprob[0][i][token_id].item(),
atol=1e-2, atol=1e-2,
rtol=1e-2) rtol=1e-2)
vllm_sample_logprobs = vllm_result.outputs[0].logprobs vllm_sample_logprobs = vllm_result.outputs[0].logprobs
for i, vllm_sample_logprob_dict in enumerate(vllm_sample_logprobs): for i, top_logprobs in enumerate(vllm_sample_logprobs):
for token_id, logprob in vllm_sample_logprob_dict.items(): for token_id, sample_logprob in top_logprobs.items():
logprob = sample_logprob.logprob
torch.testing.assert_close(logprob, torch.testing.assert_close(logprob,
hf_logprob[i][-1][token_id].item(), hf_logprob[i][-1][token_id].item(),
atol=1e-2, atol=1e-2,
rtol=1e-2) rtol=1e-2)
assert isinstance(sample_logprob.decoded_token, str), (
"The token should be decoded by the time it is returned "
" to the user.")
def test_max_logprobs():
    """Check that requesting more logprobs than ``max_logprobs`` is rejected.

    A runner is created with ``max_logprobs=1``; generating with
    ``logprobs=1`` is expected to succeed and generating with ``logprobs=2``
    is expected to raise ``ValueError``.
    """
    limited_runner = VllmRunner("facebook/opt-125m", max_logprobs=1)

    # should pass
    allowed_params = SamplingParams(logprobs=1)
    limited_runner.generate(["Hello world"], sampling_params=allowed_params)

    # Built outside the `raises` block so that only generate() is expected
    # to raise.
    rejected_params = SamplingParams(logprobs=2)
    with pytest.raises(ValueError):
        limited_runner.generate(["Hello world"],
                                sampling_params=rejected_params)
import pytest
from vllm import SamplingParams
# Model ids used to parametrize test_ranks below.
MODELS = ["facebook/opt-125m"]
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
def test_ranks(
    vllm_runner,
    model,
    dtype,
    example_prompts,
):
    """Check the ``rank`` reported with the logprobs of generated tokens.

    With greedy sampling (temperature 0) every chosen token must appear in
    its logprobs entry with rank 1. With temperature 1 the chosen token must
    have a rank of at least 1.
    """
    max_tokens = 5
    num_top_logprobs = 5
    num_prompt_logprobs = 5

    vllm_model = vllm_runner(model, dtype=dtype, max_logprobs=num_top_logprobs)

    # Settings shared by both sampling configurations.
    common_kwargs = dict(top_p=1.0,
                         max_tokens=max_tokens,
                         logprobs=num_top_logprobs,
                         prompt_logprobs=num_prompt_logprobs)

    ## Test greedy logprobs ranks
    greedy_params = SamplingParams(temperature=0.0, **common_kwargs)
    greedy_results = vllm_model.generate_w_logprobs(example_prompts,
                                                    greedy_params)
    for result in greedy_results:
        # Position 0 is iterated as tokens and position 2 as their logprobs.
        chosen_tokens, per_token_logprobs = result[0], result[2]
        assert per_token_logprobs is not None
        assert len(per_token_logprobs) == len(chosen_tokens)
        # check whether all chosen tokens have ranks = 1
        for token, logprobs in zip(chosen_tokens, per_token_logprobs):
            assert token in logprobs
            assert logprobs[token].rank == 1

    ## Test non-greedy logprobs ranks
    random_params = SamplingParams(temperature=1.0, **common_kwargs)
    random_results = vllm_model.generate_w_logprobs(example_prompts,
                                                    random_params)
    for result in random_results:
        chosen_tokens, per_token_logprobs = result[0], result[2]
        assert per_token_logprobs is not None
        assert len(per_token_logprobs) == len(chosen_tokens)
        # check whether all chosen tokens have ranks
        for token, logprobs in zip(chosen_tokens, per_token_logprobs):
            assert logprobs[token].rank >= 1
"""Tests for rejection sampling.""" """Tests for rejection sampling."""
import pytest
from typing import List, Tuple from typing import List, Tuple
import pytest
import torch import torch
import torch.nn.functional as F import torch.nn.functional as F
from vllm.model_executor.utils import set_random_seed
from vllm.model_executor.layers.rejection_sampler import RejectionSampler from vllm.model_executor.layers.rejection_sampler import RejectionSampler
from vllm.model_executor.utils import set_random_seed
CUDA_DEVICES = [ CUDA_DEVICES = [
f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
......
import random import random
from typing import Tuple, List from typing import List, Optional, Tuple
from unittest.mock import patch from unittest.mock import patch
import pytest import pytest
import torch import torch
from transformers import GenerationConfig, GenerationMixin from transformers import GenerationConfig, GenerationMixin
from typing import Optional
from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.sampler import Sampler
from vllm.model_executor.utils import set_random_seed from vllm.model_executor.utils import set_random_seed
from vllm.sequence import SamplingParams, SequenceData, SequenceGroupMetadata from vllm.sequence import SamplingParams, SequenceData, SequenceGroupMetadata
from vllm.utils import Counter
from vllm.worker.model_runner import ModelRunner from vllm.worker.model_runner import ModelRunner
class MockLogitsSampler(Sampler): class MockLogitsSampler(Sampler):
def __init__(self, vocab_size: int, fake_logits: torch.Tensor): def __init__(self, fake_logits: torch.Tensor):
super().__init__(vocab_size=vocab_size) super().__init__()
self.fake_logits = fake_logits self.fake_logits = fake_logits
def forward(self, *args, **kwargs): def forward(self, *args, **kwargs):
with patch( return super().forward(*args, **kwargs)
"vllm.model_executor.layers.sampler._prune_hidden_states",
lambda x, y: x), patch(
"vllm.model_executor.layers.sampler.Sampler._get_logits",
lambda *args, **kwargs: self.fake_logits):
return super().forward(*args, **kwargs)
def _prepare_test( def _prepare_test(
batch_size: int batch_size: int
) -> Tuple[torch.Tensor, torch.Tensor, MockLogitsSampler, ModelRunner]: ) -> Tuple[torch.Tensor, torch.Tensor, MockLogitsSampler, ModelRunner]:
vocab_size = 32000
input_tensor = torch.rand((batch_size, 1024), dtype=torch.float16) input_tensor = torch.rand((batch_size, 1024), dtype=torch.float16)
fake_logits = torch.full((batch_size, vocab_size), fake_logits = torch.full((batch_size, VOCAB_SIZE),
1e-2, 1e-2,
dtype=input_tensor.dtype) dtype=input_tensor.dtype)
sampler = MockLogitsSampler(32000, fake_logits) sampler = MockLogitsSampler(fake_logits)
model_runner = ModelRunner(None, None, None, None, None) model_runner = ModelRunner(None, None, None, None, None)
return input_tensor, fake_logits, sampler, model_runner return input_tensor, fake_logits, sampler, model_runner
VOCAB_SIZE = 32000
RANDOM_SEEDS = list(range(128)) RANDOM_SEEDS = list(range(128))
CUDA_DEVICES = [ CUDA_DEVICES = [
f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
...@@ -70,9 +65,7 @@ def _do_sample( ...@@ -70,9 +65,7 @@ def _do_sample(
sampling_metadata = model_runner._prepare_sample(seq_group_metadata_list, sampling_metadata = model_runner._prepare_sample(seq_group_metadata_list,
prompt_lens, prompt_lens,
subquery_lens=prompt_lens) subquery_lens=prompt_lens)
return sampler(embedding=None, return sampler(logits=input_tensor, sampling_metadata=sampling_metadata)
hidden_states=input_tensor,
sampling_metadata=sampling_metadata)
@pytest.mark.parametrize("seed", RANDOM_SEEDS) @pytest.mark.parametrize("seed", RANDOM_SEEDS)
...@@ -85,8 +78,8 @@ def test_sampler_all_greedy(seed: int, device: str): ...@@ -85,8 +78,8 @@ def test_sampler_all_greedy(seed: int, device: str):
batch_size) batch_size)
sampling_params = SamplingParams(temperature=0) sampling_params = SamplingParams(temperature=0)
sampler_output = _do_sample(batch_size, input_tensor, sampler, sampler_output = _do_sample(batch_size, fake_logits, sampler, model_runner,
model_runner, sampling_params) sampling_params)
expected = torch.argmax(fake_logits, dim=-1) expected = torch.argmax(fake_logits, dim=-1)
for i, sequence_output in enumerate(sampler_output): for i, sequence_output in enumerate(sampler_output):
for nth_output in sequence_output.samples: for nth_output in sequence_output.samples:
...@@ -111,8 +104,8 @@ def test_sampler_all_random(seed: int, device: str): ...@@ -111,8 +104,8 @@ def test_sampler_all_random(seed: int, device: str):
temperature=1.0, temperature=1.0,
n=random.randint(1, 10), n=random.randint(1, 10),
) )
sampler_output = _do_sample(batch_size, input_tensor, sampler, sampler_output = _do_sample(batch_size, fake_logits, sampler, model_runner,
model_runner, sampling_params) sampling_params)
for i, sequence_output in enumerate(sampler_output): for i, sequence_output in enumerate(sampler_output):
for nth_output in sequence_output.samples: for nth_output in sequence_output.samples:
...@@ -127,8 +120,7 @@ def test_sampler_all_random_seed(seed: int, device: str): ...@@ -127,8 +120,7 @@ def test_sampler_all_random_seed(seed: int, device: str):
set_random_seed(seed) set_random_seed(seed)
torch.set_default_device(device) torch.set_default_device(device)
batch_size = random.randint(1, 256) batch_size = random.randint(1, 256)
input_tensor, fake_logits, sampler, model_runner = _prepare_test( _, fake_logits, sampler, model_runner = _prepare_test(batch_size)
batch_size)
for i in range(batch_size): for i in range(batch_size):
fake_logits[i, i] = 1e2 fake_logits[i, i] = 1e2
...@@ -138,8 +130,8 @@ def test_sampler_all_random_seed(seed: int, device: str): ...@@ -138,8 +130,8 @@ def test_sampler_all_random_seed(seed: int, device: str):
n=random.randint(1, 10), n=random.randint(1, 10),
seed=random.randint(0, 10000), seed=random.randint(0, 10000),
) )
sampler_output = _do_sample(batch_size, input_tensor, sampler, sampler_output = _do_sample(batch_size, fake_logits, sampler, model_runner,
model_runner, sampling_params) sampling_params)
for i, sequence_output in enumerate(sampler_output): for i, sequence_output in enumerate(sampler_output):
for nth_output in sequence_output.samples: for nth_output in sequence_output.samples:
...@@ -154,18 +146,17 @@ def test_sampler_all_random_seed_deterministic(seed: int, device: str): ...@@ -154,18 +146,17 @@ def test_sampler_all_random_seed_deterministic(seed: int, device: str):
set_random_seed(seed) set_random_seed(seed)
torch.set_default_device(device) torch.set_default_device(device)
batch_size = random.randint(1, 256) batch_size = random.randint(1, 256)
input_tensor, fake_logits, sampler, model_runner = _prepare_test( _, fake_logits, sampler, model_runner = _prepare_test(batch_size)
batch_size)
sampling_params = SamplingParams( sampling_params = SamplingParams(
temperature=1.0, temperature=1.0,
n=random.randint(1, 10), n=random.randint(1, 10),
seed=random.randint(0, 10000), seed=random.randint(0, 10000),
) )
first_sampler_output = _do_sample(batch_size, input_tensor, sampler, first_sampler_output = _do_sample(batch_size, fake_logits, sampler,
model_runner, sampling_params) model_runner, sampling_params)
second_sampler_output = _do_sample(batch_size, input_tensor, sampler, second_sampler_output = _do_sample(batch_size, fake_logits, sampler,
model_runner, sampling_params) model_runner, sampling_params)
assert first_sampler_output == second_sampler_output assert first_sampler_output == second_sampler_output
...@@ -179,15 +170,14 @@ def test_sampler_all_beam(seed: int, device: str): ...@@ -179,15 +170,14 @@ def test_sampler_all_beam(seed: int, device: str):
set_random_seed(seed) set_random_seed(seed)
torch.set_default_device(device) torch.set_default_device(device)
batch_size = random.randint(1, 256) batch_size = random.randint(1, 256)
input_tensor, _, sampler, model_runner = _prepare_test(batch_size) _, fake_logits, sampler, model_runner = _prepare_test(batch_size)
sampling_params = SamplingParams( sampling_params = SamplingParams(
temperature=0, temperature=0,
best_of=2, best_of=2,
use_beam_search=True, use_beam_search=True,
) )
_do_sample(batch_size, input_tensor, sampler, model_runner, _do_sample(batch_size, fake_logits, sampler, model_runner, sampling_params)
sampling_params)
# no assertion here as I am not sure how to determine whether # no assertion here as I am not sure how to determine whether
# the outputs are expected - in other words, this just tests # the outputs are expected - in other words, this just tests
# whether there are no exceptions in the sampler # whether there are no exceptions in the sampler
...@@ -195,6 +185,225 @@ def test_sampler_all_beam(seed: int, device: str): ...@@ -195,6 +185,225 @@ def test_sampler_all_beam(seed: int, device: str):
del model_runner del model_runner
@pytest.mark.parametrize("seed", RANDOM_SEEDS)
@pytest.mark.parametrize("device", CUDA_DEVICES)
def test_sampler_min_tokens_penalty(seed: int, device: str):
    """Verify which logits the sampler sets to -inf while ``min_tokens`` is unmet.

    For every sequence whose number of generated tokens is below
    ``min_tokens``, the EOS token and all stop tokens are expected to be
    -inf in that sequence's logits row after sampling, and nothing else.
    Sequences that already reached ``min_tokens`` must have no -inf entries.

    Seed 0 runs a fixed set of hand-written edge cases; every other seed
    runs one randomly generated batch.
    """
    # NOTE: the start value is drawn before set_random_seed(seed) is called.
    seq_id_counter = Counter(start=random.randint(0, 100))
    set_random_seed(seed)
    torch.set_default_device(device)

    def create_sampling_params(min_tokens,
                               eos_token_id=0,
                               stop_token_ids=None):
        """Build SamplingParams and attach the EOS token id to it."""
        sampling_params = SamplingParams(
            min_tokens=min_tokens,
            max_tokens=9999,  # keep higher than max of min_tokens
            stop_token_ids=stop_token_ids,
        )
        sampling_params.eos_token_id = eos_token_id
        return sampling_params

    def create_sequence_data(num_input=3, num_generated=0):
        """Build SequenceData with random prompt and (optional) output ids."""
        seq_data = SequenceData(
            random.choices(range(0, VOCAB_SIZE), k=num_input))
        if num_generated > 0:
            seq_data.output_token_ids = random.choices(range(0, VOCAB_SIZE),
                                                       k=num_generated)
        return seq_data

    def generate_test_case():
        """Build one random batch plus the per-sequence expected outcome."""
        # generate multiple seq groups but limit total batch size
        batch_size = random.randint(1, 128)

        expected_penalization = []
        sequence_metadata_list = []

        while batch_size > 0:
            # 20% chance to generate prompt seq group with single sequence
            is_prompt = random.random() < 0.2
            num_seqs = 1 if is_prompt else random.randint(1, batch_size)

            eos_token_id = random.randint(0, VOCAB_SIZE - 1)
            min_tokens = random.randint(0, 50)
            num_stop_tokens = random.randint(0, 8)
            if num_stop_tokens > 0:
                # range() already excludes its upper bound, so use VOCAB_SIZE
                # here to allow every valid token id (the last one included).
                stop_token_ids = random.choices(range(0, VOCAB_SIZE),
                                                k=num_stop_tokens)
            else:
                stop_token_ids = None

            sampling_params = create_sampling_params(
                min_tokens=min_tokens,
                eos_token_id=eos_token_id,
                stop_token_ids=stop_token_ids)

            seq_data = {}
            seq_group_penalization = []
            for _ in range(num_seqs):
                num_input = random.randint(1, 100)
                num_generated = random.randint(1, 100) if not is_prompt else 0
                seq_data[next(seq_id_counter)] = create_sequence_data(
                    num_input=num_input, num_generated=num_generated)
                # a sequence is penalized while it is short of min_tokens
                seq_group_penalization.append(num_generated < min_tokens)

            expected_penalization.extend(seq_group_penalization)
            sequence_metadata_list.append(
                SequenceGroupMetadata(
                    request_id=f"test_{batch_size}",
                    is_prompt=is_prompt,
                    seq_data=seq_data,
                    sampling_params=sampling_params,
                    block_tables={},
                ))
            batch_size -= num_seqs

        return {
            "expected_penalization": expected_penalization,
            "seq_group_metadata_list": sequence_metadata_list,
        }

    # define some explicit test cases for edge case behavior
    prompt_without_penalization = {
        "expected_penalization": [False],
        "seq_group_metadata_list": [
            SequenceGroupMetadata(
                request_id="test_1",
                is_prompt=True,
                seq_data={
                    next(seq_id_counter): create_sequence_data(),
                },
                sampling_params=create_sampling_params(0),
                block_tables={},
            ),
        ]
    }

    prompt_with_penalization = {
        "expected_penalization": [True],
        "seq_group_metadata_list": [
            SequenceGroupMetadata(
                request_id="test_1",
                is_prompt=True,
                seq_data={
                    next(seq_id_counter): create_sequence_data(),
                },
                sampling_params=create_sampling_params(1),
                block_tables={},
            ),
        ]
    }

    stop_penalizing_after_min_tokens = {
        "expected_penalization": [False],
        "seq_group_metadata_list": [
            SequenceGroupMetadata(
                request_id="test_1",
                is_prompt=False,
                seq_data={
                    next(seq_id_counter):
                    create_sequence_data(num_generated=1),
                },
                sampling_params=create_sampling_params(1),
                block_tables={},
            )
        ]
    }

    stop_token_ids = [42, 99, 42, 0]  # intentional duplication
    simple_combination = {
        "expected_penalization": [True, False, False],
        "seq_group_metadata_list": [
            SequenceGroupMetadata(
                request_id="test_1",
                is_prompt=False,
                seq_data={
                    next(seq_id_counter):
                    create_sequence_data(num_generated=1),
                    next(seq_id_counter):
                    create_sequence_data(num_generated=100),
                },
                sampling_params=create_sampling_params(
                    2, stop_token_ids=stop_token_ids),
                block_tables={},
            ),
            SequenceGroupMetadata(
                request_id="test_2",
                is_prompt=True,
                seq_data={
                    next(seq_id_counter): create_sequence_data(),
                },
                sampling_params=create_sampling_params(
                    0, stop_token_ids=stop_token_ids),
                block_tables={},
            )
        ]
    }

    if seed == 0:
        test_cases = [
            prompt_without_penalization,
            prompt_with_penalization,
            stop_penalizing_after_min_tokens,
            simple_combination,
        ]
    else:
        test_cases = [generate_test_case()]

    def run_test_case(*,
                      expected_penalization=None,
                      seq_group_metadata_list=None):
        """Run the sampler on one case and check the -inf pattern per row."""
        assert expected_penalization, "Invalid test case"
        assert seq_group_metadata_list, "Invalid test case"

        batch_size = 0
        prompt_lens = []
        sampling_params_per_seq = []
        for sgm in seq_group_metadata_list:
            num_seqs = len(sgm.seq_data)
            batch_size += num_seqs
            sampling_params = sgm.sampling_params
            for seq_id in sgm.seq_data:
                prompt_lens.append(sgm.seq_data[seq_id].get_prompt_len())
                sampling_params_per_seq.append(sampling_params)

        _, fake_logits, sampler, model_runner = _prepare_test(batch_size)
        sampling_metadata = model_runner._prepare_sample(
            seq_group_metadata_list,
            prompt_lens=prompt_lens,
            subquery_lens=prompt_lens)
        # the logits tensor is modified in-place by the sampler
        _ = sampler(logits=fake_logits, sampling_metadata=sampling_metadata)

        for logits_idx, (should_penalize, sampling_params) in enumerate(
                zip(expected_penalization, sampling_params_per_seq)):

            tokens_to_check = [sampling_params.eos_token_id]
            if sampling_params.stop_token_ids:
                tokens_to_check.extend(sampling_params.stop_token_ids)
            # de-duplicate so the -inf count below compares distinct ids
            tokens_to_check = set(tokens_to_check)

            if should_penalize:
                for token_id in tokens_to_check:
                    # The message is parenthesized so both halves belong to
                    # the assert; a second string on its own line would be a
                    # separate no-op statement.
                    assert fake_logits[logits_idx, token_id] == -float(
                        'inf'
                    ), (f"Expected token {token_id} for logits row "
                        f"{logits_idx} to be penalized")
                # no other tokens should be set to -inf
                assert torch.count_nonzero(
                    fake_logits[logits_idx, :] == -float('inf')) == len(
                        tokens_to_check
                    ), f"Expected only {len(tokens_to_check)} to be penalized"
            else:
                # no tokens should be set to -inf
                assert torch.count_nonzero(
                    fake_logits[logits_idx, :] ==
                    -float('inf')) == 0, "No tokens should have been penalized"

        del model_runner

    for test_case in test_cases:
        run_test_case(**test_case)
@pytest.mark.parametrize("seed", RANDOM_SEEDS) @pytest.mark.parametrize("seed", RANDOM_SEEDS)
@pytest.mark.parametrize("device", CUDA_DEVICES) @pytest.mark.parametrize("device", CUDA_DEVICES)
def test_sampler_mixed(seed: int, device: str): def test_sampler_mixed(seed: int, device: str):
...@@ -246,8 +455,7 @@ def test_sampler_mixed(seed: int, device: str): ...@@ -246,8 +455,7 @@ def test_sampler_mixed(seed: int, device: str):
def test_sampling(model_runner: ModelRunner): def test_sampling(model_runner: ModelRunner):
sampling_metadata = model_runner._prepare_sample( sampling_metadata = model_runner._prepare_sample(
seq_group_metadata_list, prompt_lens, subquery_lens=prompt_lens) seq_group_metadata_list, prompt_lens, subquery_lens=prompt_lens)
sampler_output = sampler(embedding=None, sampler_output = sampler(logits=fake_logits,
hidden_states=input_tensor,
sampling_metadata=sampling_metadata) sampling_metadata=sampling_metadata)
for i, (sequence_output, metadata) in enumerate( for i, (sequence_output, metadata) in enumerate(
...@@ -255,9 +463,10 @@ def test_sampler_mixed(seed: int, device: str): ...@@ -255,9 +463,10 @@ def test_sampler_mixed(seed: int, device: str):
if metadata.sampling_params.use_beam_search: if metadata.sampling_params.use_beam_search:
continue continue
if metadata.sampling_params.seed is not None \ if (metadata.sampling_params.seed is not None
and expected_tokens[i] is None: and expected_tokens[i] is None):
# Record seeded random result to compare with results of second invocation # Record seeded random result to compare with results of
# second invocation
expected_tokens[i] = [ expected_tokens[i] = [
nth_output.output_token nth_output.output_token
for nth_output in sequence_output.samples for nth_output in sequence_output.samples
...@@ -265,11 +474,13 @@ def test_sampler_mixed(seed: int, device: str): ...@@ -265,11 +474,13 @@ def test_sampler_mixed(seed: int, device: str):
continue continue
for n, nth_output in enumerate(sequence_output.samples): for n, nth_output in enumerate(sequence_output.samples):
if metadata.sampling_params.temperature == 0 or metadata.sampling_params.seed is not None: if (metadata.sampling_params.temperature == 0
or metadata.sampling_params.seed is not None):
# Ensure exact matches for greedy or random with seed # Ensure exact matches for greedy or random with seed
assert nth_output.output_token == expected_tokens[i][n] assert nth_output.output_token == expected_tokens[i][n]
else: else:
# For non-seeded random check that one of the high-logit tokens were chosen # For non-seeded random check that one of the high-logit
# tokens were chosen
assert nth_output.output_token in expected_tokens[i] assert nth_output.output_token in expected_tokens[i]
# Test batch # Test batch
...@@ -284,55 +495,13 @@ def test_sampler_mixed(seed: int, device: str): ...@@ -284,55 +495,13 @@ def test_sampler_mixed(seed: int, device: str):
input_tensor.data = input_tensor.index_select(0, target_index) input_tensor.data = input_tensor.index_select(0, target_index)
fake_logits.data = fake_logits.index_select(0, target_index) fake_logits.data = fake_logits.index_select(0, target_index)
# This time, results of seeded random samples will be compared with the corresponding # This time, results of seeded random samples will be compared with
# sample in the pre-shuffled batch # the corresponding sample in the pre-shuffled batch
test_sampling(model_runner) test_sampling(model_runner)
del model_runner del model_runner
@pytest.mark.parametrize("seed", RANDOM_SEEDS)
@pytest.mark.parametrize("device", CUDA_DEVICES)
def test_sampler_logits_processors(seed: int, device: str):
    """Check that a per-request logits processor steers greedy sampling."""
    set_random_seed(seed)
    torch.set_default_device(device)
    batch_size = random.randint(1, 256)
    input_tensor, _, sampler, model_runner = _prepare_test(batch_size)

    # This sample logits processor gives infinite score to the i-th token,
    # where i is the length of the input sequence.
    # We therefore expect the output token sequence to be [0, 1, 2, ...]
    def pick_ith(token_ids, logits):
        logits[len(token_ids)] = float("inf")
        return logits

    # One single-sequence prompt group per batch row, all sharing the same
    # prompt and the same processor.
    seq_group_metadata_list = [
        SequenceGroupMetadata(
            request_id=f"test_{i}",
            is_prompt=True,
            seq_data={0: SequenceData([1, 2, 3])},
            sampling_params=SamplingParams(temperature=0,
                                           logits_processors=[pick_ith]),
            block_tables={0: [1]},
        ) for i in range(batch_size)
    ]
    prompt_lens = [
        group.seq_data[0].get_len() for group in seq_group_metadata_list
    ]

    sampling_metadata = model_runner._prepare_sample(seq_group_metadata_list,
                                                     prompt_lens,
                                                     subquery_lens=prompt_lens)
    sampler_output = sampler(embedding=None,
                             hidden_states=input_tensor,
                             sampling_metadata=sampling_metadata)
    for sequence_output in sampler_output:
        for position, sample in enumerate(sequence_output.samples):
            assert sample.output_token == position

    del model_runner
@pytest.mark.parametrize("seed", RANDOM_SEEDS) @pytest.mark.parametrize("seed", RANDOM_SEEDS)
@pytest.mark.parametrize("device", CUDA_DEVICES) @pytest.mark.parametrize("device", CUDA_DEVICES)
def test_sampler_top_k_top_p(seed: int, device: str): def test_sampler_top_k_top_p(seed: int, device: str):
...@@ -349,7 +518,7 @@ def test_sampler_top_k_top_p(seed: int, device: str): ...@@ -349,7 +518,7 @@ def test_sampler_top_k_top_p(seed: int, device: str):
size=(batch_size, vocab_size), size=(batch_size, vocab_size),
device=input_tensor.device, device=input_tensor.device,
dtype=input_tensor.dtype) dtype=input_tensor.dtype)
sampler = MockLogitsSampler(32000, fake_logits) sampler = MockLogitsSampler(fake_logits)
model_runner = ModelRunner(None, None, None, None, None) model_runner = ModelRunner(None, None, None, None, None)
generation_model = GenerationMixin() generation_model = GenerationMixin()
...@@ -382,15 +551,13 @@ def test_sampler_top_k_top_p(seed: int, device: str): ...@@ -382,15 +551,13 @@ def test_sampler_top_k_top_p(seed: int, device: str):
sample_probs = None sample_probs = None
def mock_sample(probs, logprobs, sampling_metadata): def mock_sample(probs, *args, **kwargs):
nonlocal sample_probs nonlocal sample_probs
sample_probs = probs sample_probs = probs
return [[prob.topk(1, dim=-1).indices.tolist(), [0]] for prob in probs] return [[prob.topk(1, dim=-1).indices.tolist(), [0]] for prob in probs]
with patch("vllm.model_executor.layers.sampler._sample", mock_sample): with patch("vllm.model_executor.layers.sampler._sample", mock_sample):
sampler(embedding=None, sampler(logits=fake_logits, sampling_metadata=sampling_metadata)
hidden_states=input_tensor,
sampling_metadata=sampling_metadata)
hf_probs = warpers(torch.zeros_like(fake_logits), fake_logits.clone()) hf_probs = warpers(torch.zeros_like(fake_logits), fake_logits.clone())
hf_probs = torch.softmax(hf_probs, dim=-1, dtype=torch.float) hf_probs = torch.softmax(hf_probs, dim=-1, dtype=torch.float)
assert torch.allclose(hf_probs, sample_probs, atol=1e-5) assert torch.allclose(hf_probs, sample_probs, atol=1e-5)
......
"""Verify that seeded random sampling is deterministic. """Verify that seeded random sampling is deterministic.
Run `pytest tests/samplers/test_seeded_generate.py --forked`. Run `pytest tests/samplers/test_seeded_generate.py`.
""" """
import copy import copy
import random import random
...@@ -8,8 +8,8 @@ from itertools import combinations ...@@ -8,8 +8,8 @@ from itertools import combinations
import pytest import pytest
from vllm.model_executor.utils import set_random_seed
from vllm import SamplingParams from vllm import SamplingParams
from vllm.model_executor.utils import set_random_seed
MODEL = "facebook/opt-125m" MODEL = "facebook/opt-125m"
RANDOM_SEEDS = list(range(5)) RANDOM_SEEDS = list(range(5))
......
"""Test the different finish_reason="stop" situations during generation:
1. One of the provided stop strings
2. One of the provided stop tokens
3. The EOS token
Run `pytest tests/samplers/test_stop_reason.py`.
"""
import pytest
import transformers
from vllm import SamplingParams
MODEL = "facebook/opt-350m"
STOP_STR = "."
SEED = 42
MAX_TOKENS = 1024
@pytest.fixture
def vllm_model(vllm_runner):
    """Yield a runner built for ``MODEL``; drop the local reference afterwards."""
    runner = vllm_runner(MODEL)
    yield runner
    # Teardown: release this fixture's reference to the runner.
    del runner
def test_stop_reason(vllm_model, example_prompts):
    """Check ``finish_reason``/``stop_reason`` for the three stop situations.

    Generates from ``example_prompts`` three times: with a stop token id,
    with a stop string, and with neither, asserting the reported reason in
    each case.
    """
    tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL)
    stop_token_id = tokenizer.convert_tokens_to_ids(STOP_STR)
    llm = vllm_model.model

    # test stop token
    outputs = llm.generate(example_prompts,
                           sampling_params=SamplingParams(
                               seed=SEED,
                               max_tokens=MAX_TOKENS,
                               stop_token_ids=[stop_token_id]))
    for output in outputs:
        completion = output.outputs[0]
        assert completion.finish_reason == "stop"
        assert completion.stop_reason == stop_token_id

    # test stop string
    # Use STOP_STR (not a repeated literal) so the request and the assertion
    # below cannot drift apart.
    outputs = llm.generate(example_prompts,
                           sampling_params=SamplingParams(
                               seed=SEED,
                               max_tokens=MAX_TOKENS,
                               stop=STOP_STR))
    for output in outputs:
        completion = output.outputs[0]
        assert completion.finish_reason == "stop"
        assert completion.stop_reason == STOP_STR

    # test EOS token
    # Without explicit stops a completion may also run into MAX_TOKENS, so
    # "length" is accepted alongside a "stop" that carries no stop_reason.
    outputs = llm.generate(example_prompts,
                           sampling_params=SamplingParams(
                               seed=SEED, max_tokens=MAX_TOKENS))
    for output in outputs:
        completion = output.outputs[0]
        assert completion.finish_reason == "length" or (
            completion.finish_reason == "stop"
            and completion.stop_reason is None)
import pytest
import torch
from vllm.spec_decode.batch_expansion import BatchExpansionTop1Scorer
from .utils import create_seq_group_metadata_from_prompts, mock_worker
@pytest.mark.parametrize('num_target_seq_ids', [100])
def test_create_target_seq_id_iterator(num_target_seq_ids: int):
    """Every id produced by the iterator must exceed all input seq ids."""
    scorer = BatchExpansionTop1Scorer(mock_worker(), 'cuda:0', 32_000)

    seq_id_cases = [
        [1, 3, 5, 7],
        list(range(100)) + [0],
        [100],
    ]

    for seq_ids in seq_id_cases:
        largest_input_id = max(seq_ids)
        id_stream = scorer._create_target_seq_id_iterator(seq_ids)  # pylint: disable=protected-access
        # Draw the requested number of fresh ids and compare each one.
        for _ in range(num_target_seq_ids):
            assert next(id_stream) > largest_input_id
@pytest.mark.parametrize('k', [1, 2, 6])
def test_get_token_ids_to_score(k: int):
    """The scorer must return every prefix of the proposal, empty one first."""
    proposal_token_ids = torch.tensor(
        list(range(k)),
        dtype=torch.int64,
        device='cuda',
    )

    # Expected: the empty prefix followed by prefixes of length 1..len.
    num_proposed = proposal_token_ids.shape[0]
    expected_output = [[]] + [
        proposal_token_ids[:end].tolist()
        for end in range(1, num_proposed + 1)
    ]

    scorer = BatchExpansionTop1Scorer(mock_worker(), 'cuda:0', 32_000)
    raw_output = scorer._get_token_ids_to_score(proposal_token_ids)  # pylint: disable=protected-access

    # Normalize tensors to plain lists so the comparison is list-to-list.
    actual_output = [
        item.tolist() if isinstance(item, torch.Tensor) else item
        for item in raw_output
    ]

    assert actual_output == expected_output
@pytest.mark.parametrize('k', [1, 2, 6])
def test_create_single_target_seq_group_metadata(k: int):
    """A batch-expanded seq group must mirror its input plus the new tokens."""
    prompt_tokens = [1, 2, 3]
    prev_output_tokens = [4, 5, 6]
    token_ids = list(range(k))

    context_len = len(prompt_tokens) + len(prev_output_tokens)
    num_tokens_processed = context_len - 1
    final_seq_len = context_len + len(token_ids)

    block_size = 32
    input_seq_group_metadata = create_seq_group_metadata_from_prompts(
        [prompt_tokens], 2048 // block_size, block_size, [final_seq_len],
        [prev_output_tokens], [num_tokens_processed])[0]

    input_seq_id = list(input_seq_group_metadata.seq_data.keys())[0]
    target_seq_id = 100

    scorer = BatchExpansionTop1Scorer(mock_worker(), 'cuda:0', 32_000)
    output = scorer._create_single_target_seq_group_metadata(  # pylint: disable=protected-access
        input_seq_group_metadata,
        input_seq_id,
        target_seq_id,
        token_ids,
    )

    # Same request, exactly one sequence keyed by the target id.
    assert output.request_id == input_seq_group_metadata.request_id
    assert len(output.seq_data) == 1

    target_seq_data = output.seq_data[target_seq_id]
    assert target_seq_data.get_prompt_token_ids() == prompt_tokens
    assert target_seq_data.get_output_token_ids(
    ) == prev_output_tokens + token_ids

    # The block table is carried over from the input sequence.
    assert len(output.block_tables) == 1
    expected_block_table = input_seq_group_metadata.block_tables[input_seq_id]
    assert output.block_tables[target_seq_id] == expected_block_table
import math
from unittest.mock import MagicMock
import pytest
import torch
from vllm.spec_decode.metrics import AsyncMetricsCollector
def test_initial_call_returns_none():
    """The very first metrics collection attempt must yield None."""
    counter_kwargs = {"dtype": torch.long, "device": 'cuda'}

    rej_sampler = MagicMock()
    rej_sampler.num_accepted_tokens = torch.tensor(0, **counter_kwargs)
    rej_sampler.num_emitted_tokens = torch.tensor(0, **counter_kwargs)
    rej_sampler.num_draft_tokens = 0

    collector = AsyncMetricsCollector(rej_sampler)
    collector.init_gpu_tensors(rank=0)

    assert collector.maybe_collect_rejsample_metrics(k=5) is None
def test_second_call_returns_metrics():
    """With the mocked timer past the interval, the second call yields metrics."""
    counter_kwargs = {"dtype": torch.long, "device": 'cuda'}

    rej_sampler = MagicMock()
    rej_sampler.num_accepted_tokens = torch.tensor(0, **counter_kwargs)
    rej_sampler.num_emitted_tokens = torch.tensor(0, **counter_kwargs)
    rej_sampler.num_draft_tokens = 0

    collect_interval_s = 5.0
    # Mocked clock readings: start, then two readings beyond the interval.
    timer = MagicMock(side_effect=[
        0.0, collect_interval_s + 0.1, collect_interval_s + 0.2
    ])

    collector = AsyncMetricsCollector(rejection_sampler=rej_sampler,
                                      timer=timer,
                                      collect_interval_s=collect_interval_s)
    collector.init_gpu_tensors(rank=0)

    collector.maybe_collect_rejsample_metrics(k=5)
    second_result = collector.maybe_collect_rejsample_metrics(k=5)
    assert second_result is not None
@pytest.mark.parametrize("rank", [1, 2, 3, 4])
def test_nonzero_rank_noop(rank):
    """A collector initialized with a nonzero rank must never return metrics."""
    counter_kwargs = {"dtype": torch.long, "device": 'cuda'}

    rej_sampler = MagicMock()
    rej_sampler.num_accepted_tokens = torch.tensor(0, **counter_kwargs)
    rej_sampler.num_emitted_tokens = torch.tensor(0, **counter_kwargs)
    rej_sampler.num_draft_tokens = 0

    collector = AsyncMetricsCollector(rej_sampler)
    collector.init_gpu_tensors(rank=rank)

    # Two attempts, as in the rank-0 tests; only the second is inspected.
    collector.maybe_collect_rejsample_metrics(k=5)
    second_result = collector.maybe_collect_rejsample_metrics(k=5)
    assert second_result is None
def test_noop_until_time():
    """No metrics before the collect interval elapses; metrics once it has."""
    counter_kwargs = {"dtype": torch.long, "device": 'cuda'}

    rej_sampler = MagicMock()
    rej_sampler.num_accepted_tokens = torch.tensor(0, **counter_kwargs)
    rej_sampler.num_emitted_tokens = torch.tensor(0, **counter_kwargs)
    rej_sampler.num_draft_tokens = 0

    collect_interval_s = 5.0
    just_before = collect_interval_s - 0.1
    just_after = collect_interval_s + 0.1
    # Mocked clock readings: start, two short of the interval, two past it.
    timer = MagicMock(
        side_effect=[0.0, just_before, just_before, just_after, just_after])

    collector = AsyncMetricsCollector(rejection_sampler=rej_sampler,
                                      timer=timer,
                                      collect_interval_s=collect_interval_s)
    collector.init_gpu_tensors(rank=0)

    # First pair of calls: interval not yet reached.
    collector.maybe_collect_rejsample_metrics(k=5)
    early_result = collector.maybe_collect_rejsample_metrics(k=5)
    assert early_result is None

    # Second pair of calls: interval exceeded.
    collector.maybe_collect_rejsample_metrics(k=5)
    late_result = collector.maybe_collect_rejsample_metrics(k=5)
    assert late_result is not None
@pytest.mark.parametrize("has_data", [True, False])
def test_initial_metrics_has_correct_values(has_data: bool):
    """Collected metrics must echo the sampler counters and derived ratios."""
    if has_data:
        num_accepted_tokens, num_emitted_tokens, num_draft_tokens = (103, 104,
                                                                     105)
    else:
        num_accepted_tokens, num_emitted_tokens, num_draft_tokens = 0, 0, 0
    k = 5

    num_possible_tokens = AsyncMetricsCollector.get_max_num_accepted_tokens(
        num_draft_tokens, k)

    counter_kwargs = {"dtype": torch.long, "device": 'cuda'}
    rej_sampler = MagicMock()
    rej_sampler.num_accepted_tokens = torch.tensor(num_accepted_tokens,
                                                   **counter_kwargs)
    rej_sampler.num_emitted_tokens = torch.tensor(num_emitted_tokens,
                                                  **counter_kwargs)
    rej_sampler.num_draft_tokens = num_draft_tokens

    collect_interval_s = 5.0
    # Mocked clock readings: start, then two readings beyond the interval.
    timer = MagicMock(side_effect=[
        0.0, collect_interval_s + 0.1, collect_interval_s + 0.2
    ])

    collector = AsyncMetricsCollector(rejection_sampler=rej_sampler,
                                      timer=timer,
                                      collect_interval_s=collect_interval_s)
    collector.init_gpu_tensors(rank=0)
    collector.maybe_collect_rejsample_metrics(k)
    metrics = collector.maybe_collect_rejsample_metrics(k)

    assert metrics.num_spec_tokens == k
    assert metrics.accepted_tokens == num_accepted_tokens
    assert metrics.draft_tokens == num_draft_tokens
    assert metrics.emitted_tokens == num_emitted_tokens

    if not has_data:
        # With all-zero counters the ratios are expected to be NaN.
        assert math.isnan(metrics.draft_acceptance_rate)
        assert math.isnan(metrics.system_efficiency)
        return

    expected_acceptance_rate = num_accepted_tokens / num_draft_tokens
    expected_efficiency = num_emitted_tokens / num_possible_tokens
    assert (metrics.draft_acceptance_rate == expected_acceptance_rate)
    assert (metrics.system_efficiency == expected_efficiency)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment