Commit 7c4f76e3 authored by zhuwenwen's avatar zhuwenwen
Browse files

merge v0.4.0

parents 2da0dd3e 51c31bc1
import contextlib
import gc
import os import os
from typing import List, Optional, Tuple from typing import List, Optional, Tuple
import pytest import pytest
import torch import torch
from transformers import AutoModelForCausalLM from PIL import Image
from transformers import (AutoModelForCausalLM, AutoProcessor,
LlavaForConditionalGeneration)
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
from vllm.config import TokenizerPoolConfig, VisionLanguageConfig
from vllm.model_executor.parallel_utils.parallel_state import (
destroy_model_parallel)
from vllm.sequence import MultiModalData
from vllm.transformers_utils.tokenizer import get_tokenizer from vllm.transformers_utils.tokenizer import get_tokenizer
_TEST_DIR = os.path.dirname(__file__) _TEST_DIR = os.path.dirname(__file__)
_TEST_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "example.txt")] _TEST_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "example.txt")]
_LONG_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "summary.txt")] _LONG_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "summary.txt")]
# Multi modal related
_PIXEL_VALUES_FILES = [
os.path.join(_TEST_DIR, "images", filename) for filename in
["stop_sign_pixel_values.pt", "cherry_blossom_pixel_values.pt"]
]
_IMAGE_FEATURES_FILES = [
os.path.join(_TEST_DIR, "images", filename) for filename in
["stop_sign_image_features.pt", "cherry_blossom_image_features.pt"]
]
_IMAGE_FILES = [
os.path.join(_TEST_DIR, "images", filename)
for filename in ["stop_sign.jpg", "cherry_blossom.jpg"]
]
_IMAGE_PROMPTS = [
"<image>\nUSER: What's the content of the image?\nASSISTANT:",
"<image>\nUSER: What is the season?\nASSISTANT:"
]
assert len(_PIXEL_VALUES_FILES) == len(_IMAGE_FEATURES_FILES) == len(
_IMAGE_FILES) == len(_IMAGE_PROMPTS)
def _read_prompts(filename: str) -> List[str]: def _read_prompts(filename: str) -> List[str]:
with open(filename, "r") as f: with open(filename, "r") as f:
...@@ -19,6 +47,53 @@ def _read_prompts(filename: str) -> List[str]: ...@@ -19,6 +47,53 @@ def _read_prompts(filename: str) -> List[str]:
return prompts return prompts
def cleanup():
destroy_model_parallel()
with contextlib.suppress(AssertionError):
torch.distributed.destroy_process_group()
gc.collect()
torch.cuda.empty_cache()
@pytest.fixture(autouse=True)
def cleanup_fixture():
yield
cleanup()
@pytest.fixture(scope="session")
def hf_image_prompts() -> List[str]:
return _IMAGE_PROMPTS
@pytest.fixture(scope="session")
def hf_images() -> List[Image.Image]:
return [Image.open(filename) for filename in _IMAGE_FILES]
@pytest.fixture()
def vllm_images(request) -> "torch.Tensor":
vision_language_config = request.getfixturevalue("model_and_config")[1]
all_images = []
if vision_language_config.image_input_type == (
VisionLanguageConfig.ImageInputType.IMAGE_FEATURES):
filenames = _IMAGE_FEATURES_FILES
else:
filenames = _PIXEL_VALUES_FILES
for filename in filenames:
all_images.append(torch.load(filename))
return torch.concat(all_images, dim=0)
@pytest.fixture()
def vllm_image_prompts(request) -> List[str]:
vision_language_config = request.getfixturevalue("model_and_config")[1]
return [
"<image>" * (vision_language_config.image_feature_size - 1) + p
for p in _IMAGE_PROMPTS
]
@pytest.fixture @pytest.fixture
def example_prompts() -> List[str]: def example_prompts() -> List[str]:
prompts = [] prompts = []
...@@ -41,6 +116,10 @@ _STR_DTYPE_TO_TORCH_DTYPE = { ...@@ -41,6 +116,10 @@ _STR_DTYPE_TO_TORCH_DTYPE = {
"float": torch.float, "float": torch.float,
} }
_VISION_LANGUAGE_MODELS = {
"llava-hf/llava-1.5-7b-hf": LlavaForConditionalGeneration,
}
class HfRunner: class HfRunner:
...@@ -52,11 +131,24 @@ class HfRunner: ...@@ -52,11 +131,24 @@ class HfRunner:
) -> None: ) -> None:
assert dtype in _STR_DTYPE_TO_TORCH_DTYPE assert dtype in _STR_DTYPE_TO_TORCH_DTYPE
torch_dtype = _STR_DTYPE_TO_TORCH_DTYPE[dtype] torch_dtype = _STR_DTYPE_TO_TORCH_DTYPE[dtype]
self.model = AutoModelForCausalLM.from_pretrained( self.model_name = model_name
model_name, if model_name not in _VISION_LANGUAGE_MODELS:
torch_dtype=torch_dtype, self.model = AutoModelForCausalLM.from_pretrained(
trust_remote_code=True, model_name,
).cuda() torch_dtype=torch_dtype,
trust_remote_code=True,
).cuda()
self.processor = None
else:
self.model = _VISION_LANGUAGE_MODELS[model_name].from_pretrained(
model_name,
torch_dtype=torch_dtype,
trust_remote_code=True,
).cuda()
self.processor = AutoProcessor.from_pretrained(
model_name,
torch_dtype=torch_dtype,
)
if tokenizer_name is None: if tokenizer_name is None:
tokenizer_name = model_name tokenizer_name = model_name
self.tokenizer = get_tokenizer(tokenizer_name, trust_remote_code=True) self.tokenizer = get_tokenizer(tokenizer_name, trust_remote_code=True)
...@@ -64,13 +156,28 @@ class HfRunner: ...@@ -64,13 +156,28 @@ class HfRunner:
def generate( def generate(
self, self,
prompts: List[str], prompts: List[str],
images: Optional[List[Image.Image]] = None,
**kwargs, **kwargs,
) -> List[Tuple[List[int], str]]: ) -> List[Tuple[List[int], str]]:
outputs: List[Tuple[List[int], str]] = [] outputs: List[Tuple[List[int], str]] = []
for prompt in prompts: if images:
input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids assert len(prompts) == len(images)
for i, prompt in enumerate(prompts):
if self.model_name not in _VISION_LANGUAGE_MODELS:
input_ids = self.tokenizer(prompt,
return_tensors="pt").input_ids
inputs = {"input_ids": input_ids.cuda()}
else:
image = images[i] if images else None
inputs = self.processor(text=prompt,
images=image,
return_tensors="pt")
inputs = {
key: value.cuda() if value is not None else None
for key, value in inputs.items()
}
output_ids = self.model.generate( output_ids = self.model.generate(
input_ids.cuda(), **inputs,
use_cache=True, use_cache=True,
**kwargs, **kwargs,
) )
...@@ -87,10 +194,12 @@ class HfRunner: ...@@ -87,10 +194,12 @@ class HfRunner:
self, self,
prompts: List[str], prompts: List[str],
max_tokens: int, max_tokens: int,
images: Optional["torch.Tensor"] = None,
) -> List[Tuple[List[int], str]]: ) -> List[Tuple[List[int], str]]:
outputs = self.generate(prompts, outputs = self.generate(prompts,
do_sample=False, do_sample=False,
max_new_tokens=max_tokens) max_new_tokens=max_tokens,
images=images)
for i in range(len(outputs)): for i in range(len(outputs)):
output_ids, output_str = outputs[i] output_ids, output_str = outputs[i]
outputs[i] = (output_ids[0], output_str[0]) outputs[i] = (output_ids[0], output_str[0])
...@@ -150,6 +259,10 @@ class HfRunner: ...@@ -150,6 +259,10 @@ class HfRunner:
all_logprobs.append(seq_logprobs) all_logprobs.append(seq_logprobs)
return all_logprobs return all_logprobs
def __del__(self):
del self.model
cleanup()
@pytest.fixture @pytest.fixture
def hf_runner(): def hf_runner():
...@@ -162,9 +275,14 @@ class VllmRunner: ...@@ -162,9 +275,14 @@ class VllmRunner:
self, self,
model_name: str, model_name: str,
tokenizer_name: Optional[str] = None, tokenizer_name: Optional[str] = None,
# Use smaller max model length, otherwise bigger model cannot run due
# to kv cache size limit.
max_model_len=1024,
dtype: str = "half", dtype: str = "half",
disable_log_stats: bool = True, disable_log_stats: bool = True,
tensor_parallel_size: int = 1, tensor_parallel_size: int = 1,
block_size: int = 16,
enable_chunked_prefill: bool = False,
**kwargs, **kwargs,
) -> None: ) -> None:
self.model = LLM( self.model = LLM(
...@@ -175,6 +293,9 @@ class VllmRunner: ...@@ -175,6 +293,9 @@ class VllmRunner:
swap_space=0, swap_space=0,
disable_log_stats=disable_log_stats, disable_log_stats=disable_log_stats,
tensor_parallel_size=tensor_parallel_size, tensor_parallel_size=tensor_parallel_size,
max_model_len=max_model_len,
block_size=block_size,
enable_chunked_prefill=enable_chunked_prefill,
**kwargs, **kwargs,
) )
...@@ -182,9 +303,16 @@ class VllmRunner: ...@@ -182,9 +303,16 @@ class VllmRunner:
self, self,
prompts: List[str], prompts: List[str],
sampling_params: SamplingParams, sampling_params: SamplingParams,
images: Optional["torch.Tensor"] = None,
) -> List[Tuple[List[int], str]]: ) -> List[Tuple[List[int], str]]:
req_outputs = self.model.generate(prompts, if images is not None:
sampling_params=sampling_params) assert len(prompts) == images.shape[0]
req_outputs = self.model.generate(
prompts,
sampling_params=sampling_params,
multi_modal_data=MultiModalData(type=MultiModalData.Type.IMAGE,
data=images)
if images is not None else None)
outputs = [] outputs = []
for req_output in req_outputs: for req_output in req_outputs:
prompt_str = req_output.prompt prompt_str = req_output.prompt
...@@ -221,9 +349,10 @@ class VllmRunner: ...@@ -221,9 +349,10 @@ class VllmRunner:
self, self,
prompts: List[str], prompts: List[str],
max_tokens: int, max_tokens: int,
images: Optional[torch.Tensor] = None,
) -> List[Tuple[List[int], str]]: ) -> List[Tuple[List[int], str]]:
greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens) greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens)
outputs = self.generate(prompts, greedy_params) outputs = self.generate(prompts, greedy_params, images=images)
return [(output_ids[0], output_str[0]) return [(output_ids[0], output_str[0])
for output_ids, output_str in outputs] for output_ids, output_str in outputs]
...@@ -254,7 +383,21 @@ class VllmRunner: ...@@ -254,7 +383,21 @@ class VllmRunner:
outputs = self.generate(prompts, beam_search_params) outputs = self.generate(prompts, beam_search_params)
return outputs return outputs
def __del__(self):
del self.model
cleanup()
@pytest.fixture @pytest.fixture
def vllm_runner(): def vllm_runner():
return VllmRunner return VllmRunner
def get_tokenizer_pool_config(tokenizer_group_type):
if tokenizer_group_type is None:
return None
if tokenizer_group_type == "ray":
return TokenizerPoolConfig(pool_size=1,
pool_type="ray",
extra_config={})
raise ValueError(f"Unknown tokenizer_group_type: {tokenizer_group_type}")
import contextlib
import gc
import pytest
import ray
import torch
from vllm import LLM
from vllm.model_executor.parallel_utils.parallel_state import (
destroy_model_parallel)
from vllm.model_executor.utils import set_random_seed
def cleanup():
destroy_model_parallel()
with contextlib.suppress(AssertionError):
torch.distributed.destroy_process_group()
gc.collect()
torch.cuda.empty_cache()
ray.shutdown()
@pytest.fixture
def baseline_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs,
baseline_llm_kwargs, seed):
return create_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs,
baseline_llm_kwargs, seed)
@pytest.fixture
def test_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs,
test_llm_kwargs, seed):
return create_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs,
test_llm_kwargs, seed)
def create_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs,
distinct_llm_kwargs, seed):
kwargs = {
**common_llm_kwargs,
**per_test_common_llm_kwargs,
**distinct_llm_kwargs,
}
def generator_inner():
llm = LLM(**kwargs)
set_random_seed(seed)
yield llm
del llm
cleanup()
for llm in generator_inner():
yield llm
del llm
from itertools import cycle
import pytest
from vllm import SamplingParams
@pytest.mark.parametrize(
"common_llm_kwargs",
[{
# Use a small model for a fast test.
"model": "facebook/opt-125m",
# skip cuda graph creation for fast test.
"enforce_eager": True,
# Allow only 5 sequences of ~1024 tokens in worst case.
"block_size": 16,
"forced_num_gpu_blocks": 5 * (64 + 1),
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{
"use_v2_block_manager": False
}])
@pytest.mark.parametrize("test_llm_kwargs", [{"use_v2_block_manager": True}])
@pytest.mark.parametrize("batch_size", [10])
@pytest.mark.parametrize("seed", [1])
def test_v1_v2_greedy_equality_with_preemption(baseline_llm_generator,
test_llm_generator, batch_size):
"""Verify block manager v2 produces same outputs as block manager v1, even
when there is preemption.
This constructs two LLM, each with limited number of GPU blocks. The limit
is decided such that as the sequences in the batch grow, sequences must be
preempted and removed from cache.
If the output token ids are equivalent, then we have confidence that the KV
cache is not corrupted in the v2 block manager.
NOTE: We want a significant number of generated tokens so that any incorrect
KV mapping has time to build up error.
"""
output_len = 1024
temperature = 0.0
# We want to ensure equality even with preemption.
# We force the total block size to be 1 + cdiv(output_len, block_size)
# so that only one sequence can fit at a time (once the sequences grow).
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
prompts = [prompt for prompt, _ in zip(cycle(prompts), range(batch_size))]
sampling_params = SamplingParams(
max_tokens=output_len,
ignore_eos=True,
temperature=temperature,
)
print('Getting token ids from block manager v1')
baseline_token_ids = get_token_ids_from_llm_generator(
baseline_llm_generator, prompts, sampling_params)
print('Getting token ids from block manager v2')
test_token_ids = get_token_ids_from_llm_generator(test_llm_generator,
prompts, sampling_params)
for expected_token_ids, actual_token_ids in zip(baseline_token_ids,
test_token_ids):
assert expected_token_ids == actual_token_ids
assert baseline_token_ids == test_token_ids
def get_token_ids_from_llm_generator(llm_generator, prompts, sampling_params):
for llm in llm_generator:
outputs = llm.generate(prompts, sampling_params, use_tqdm=True)
token_ids = [output.outputs[0].token_ids for output in outputs]
del llm
return token_ids
import pytest
from vllm.core.block_manager_v2 import BlockSpaceManagerV2
from vllm.core.interfaces import AllocStatus
from ..utils import create_seq_group
@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("num_gpu_blocks", [8, 40, 80])
@pytest.mark.parametrize("num_seqs_per_group", [1, 4])
@pytest.mark.parametrize("watermark", [0.0, 0.5])
def test_can_allocate_seq_group(block_size: int, num_seqs_per_group: int,
num_gpu_blocks: int, watermark: float):
block_manager = BlockSpaceManagerV2(
block_size=block_size,
num_gpu_blocks=num_gpu_blocks,
num_cpu_blocks=1024,
watermark=watermark,
)
num_watermark_blocks = int(watermark * num_gpu_blocks)
num_output_blocks_per_seq = 1
# NOTE: This should be num_output_blocks_per_seq * num_seqs_per_group, but
# the current implementation assumes all seqs are new prompts / don't have
# different output lens.
num_output_blocks = num_output_blocks_per_seq
for num_prompt_blocks in range(1, num_gpu_blocks - num_output_blocks):
seq_group = create_seq_group(
seq_prompt_lens=block_size * num_prompt_blocks,
seq_output_lens=[
block_size * num_output_blocks_per_seq
for _ in range(num_seqs_per_group)
],
)
assert num_prompt_blocks + num_output_blocks <= num_gpu_blocks
can_allocate_result = block_manager.can_allocate(seq_group)
num_required_blocks = num_prompt_blocks + num_output_blocks
if num_gpu_blocks - num_required_blocks < num_watermark_blocks:
assert can_allocate_result == AllocStatus.NEVER
elif num_gpu_blocks >= num_required_blocks:
assert can_allocate_result == AllocStatus.OK
else:
assert can_allocate_result == AllocStatus.LATER
import pytest
from vllm.core.block.block_table import BlockTable
from vllm.core.block.cpu_gpu_block_allocator import CpuGpuBlockAllocator
from vllm.utils import Device, cdiv, chunk_list
@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("sequence_len", [1, 16, 129])
def test_allocate_naive(block_size: int, sequence_len: int):
"""Test the allocation of blocks using the naive allocator.
This test creates a CpuGpuBlockAllocator with the specified block size and
number of blocks. It then allocates multiple BlockTables with varying
sequence lengths and verifies that the number of free blocks decreases as
expected after each allocation.
"""
assert block_size > 1
num_gpu_blocks = 1024
allocator = CpuGpuBlockAllocator.create(
allocator_type="naive",
num_gpu_blocks=num_gpu_blocks,
num_cpu_blocks=1024,
block_size=block_size,
)
token_ids = list(range(sequence_len))
num_blocks_per_alloc = len(list(chunk_list(token_ids, block_size)))
block_tables = []
for i in range(5):
assert allocator.get_num_free_blocks(
device=Device.GPU) == num_gpu_blocks - i * num_blocks_per_alloc
block_tables.append(
BlockTable(
block_size=block_size,
block_allocator=allocator,
))
block_tables[-1].allocate(token_ids=token_ids, device=Device.GPU)
@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("sequence_len", [1, 16, 129])
def test_allocate_prefix_caching(block_size: int, sequence_len: int):
"""Test the allocation of blocks using the prefix caching allocator.
This test creates a CpuGpuBlockAllocator with the specified block size and
number of blocks, using the prefix caching allocator. It then allocates
multiple BlockTables with varying sequence lengths and verifies that the
number of free blocks decreases as expected after each allocation.
The test expects all sequences to share allocations, except for their last
block, which may be mutable. It calculates the expected number of immutable
and mutable blocks per allocation based on the sequence length and block
size.
"""
assert block_size > 1
num_gpu_blocks = 1024
allocator = CpuGpuBlockAllocator.create(
allocator_type="prefix_caching",
num_gpu_blocks=num_gpu_blocks,
num_cpu_blocks=1024,
block_size=block_size,
)
token_ids = list(range(sequence_len))
chunked_tokens = list(chunk_list(token_ids, block_size))
num_mutable_blocks_per_alloc = 0 if len(
chunked_tokens[-1]) == block_size else 1
num_immutable_blocks_per_alloc = len(
chunked_tokens) - num_mutable_blocks_per_alloc
block_tables = []
for alloc_i in range(1, 6):
block_tables.append(
BlockTable(
block_size=block_size,
block_allocator=allocator,
))
block_tables[-1].allocate(token_ids=token_ids, device=Device.GPU)
# Expect all sequences to share allocations, except for their last block
# (which may be mutable).
assert allocator.get_num_free_blocks(
device=Device.GPU) == num_gpu_blocks - (
num_immutable_blocks_per_alloc + num_mutable_blocks_per_alloc *
(alloc_i))
@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("sequence_len", [1, 16, 129])
@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
@pytest.mark.parametrize("device", ["cpu", "gpu"])
def test_allocate_free(block_size: int, sequence_len: int, allocator_type: str,
device: str):
"""Test the allocation and freeing of blocks using different allocators and
devices.
This test creates a CpuGpuBlockAllocator with the specified block size,
number of blocks, allocator type, and device. It then allocates a BlockTable
multiple times with the same sequence and verifies that the number of free
blocks remains consistent after each allocation and freeing.
"""
device = Device[device.upper()]
num_device_blocks = 1024
allocator = CpuGpuBlockAllocator.create(
allocator_type=allocator_type,
num_gpu_blocks=num_device_blocks,
num_cpu_blocks=num_device_blocks,
block_size=block_size,
)
token_ids = list(range(sequence_len))
num_blocks_per_alloc = len(list(chunk_list(token_ids, block_size)))
block_table = BlockTable(
block_size=block_size,
block_allocator=allocator,
)
for i in range(5):
block_table.allocate(token_ids=token_ids, device=device)
assert allocator.get_num_free_blocks(
device) == num_device_blocks - num_blocks_per_alloc
assert all(block_id is not None
for block_id in block_table.physical_block_ids)
block_table.free()
assert allocator.get_num_free_blocks(device) == num_device_blocks
@pytest.mark.parametrize("block_size", [1, 8])
@pytest.mark.parametrize("sequence_len", [1, 16, 129])
@pytest.mark.parametrize("append_len", [1, 16, 129])
@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
def test_append_token_ids_allocation(block_size: int, sequence_len: int,
append_len: int, allocator_type: str):
"""Test the allocation behavior when appending token IDs to a BlockTable.
This test creates a CpuGpuBlockAllocator with the specified block size,
number of blocks, and allocator type. It then allocates a BlockTable with an
initial sequence and appends additional token IDs to it. The test verifies
that the number of allocated blocks before and after appending matches the
expected values.
"""
num_gpu_blocks = 1024
allocator = CpuGpuBlockAllocator.create(
allocator_type=allocator_type,
num_gpu_blocks=num_gpu_blocks,
num_cpu_blocks=1024,
block_size=block_size,
)
token_ids = list(range(sequence_len))
token_ids_to_append = list(range(append_len))
block_table = BlockTable(
block_size=block_size,
block_allocator=allocator,
)
num_expected_blocks_before_append = len(
list(chunk_list(token_ids, block_size)))
num_expected_appended_blocks = len(
list(chunk_list(token_ids + token_ids_to_append,
block_size))) - num_expected_blocks_before_append
block_table.allocate(token_ids=token_ids, device=Device.GPU)
assert len(
block_table.physical_block_ids) == num_expected_blocks_before_append
block_table.append_token_ids(token_ids_to_append)
assert len(
block_table.physical_block_ids
) == num_expected_blocks_before_append + num_expected_appended_blocks
@pytest.mark.parametrize("block_size", [1, 8])
@pytest.mark.parametrize("sequence_len", [1, 16, 129])
@pytest.mark.parametrize("num_empty_slots", [1, 16, 129])
@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
def test_ensure_num_empty_slots_allocation(block_size: int, sequence_len: int,
num_empty_slots: int,
allocator_type: str):
"""Test the allocation behavior when ensuring a certain number of empty
slots in a BlockTable.
This test creates a CpuGpuBlockAllocator with the specified block size,
number of blocks, and allocator type. It then allocates a BlockTable with an
initial sequence and ensures a certain number of empty slots. The test
verifies that the number of allocated blocks before and after ensuring empty
slots matches the expected values. It also checks that filling up the empty
slots does not consume additional blocks.
"""
num_gpu_blocks = 1024
allocator = CpuGpuBlockAllocator.create(
allocator_type=allocator_type,
num_gpu_blocks=num_gpu_blocks,
num_cpu_blocks=1024,
block_size=block_size,
)
token_ids = list(range(sequence_len))
block_table = BlockTable(
block_size=block_size,
block_allocator=allocator,
)
num_expected_blocks_before_append = len(
list(chunk_list(token_ids, block_size)))
num_expected_appended_blocks = len(
list(chunk_list(token_ids + [-1] * num_empty_slots,
block_size))) - num_expected_blocks_before_append
block_table.allocate(token_ids=token_ids, device=Device.GPU)
# Assert that the empty slots consume the expected number of additional
# blocks.
assert len(
block_table.physical_block_ids) == num_expected_blocks_before_append
block_table.ensure_num_empty_slots(num_empty_slots)
assert len(
block_table.physical_block_ids
) == num_expected_blocks_before_append + num_expected_appended_blocks
# Now, ensure no additional blocks consumed as we fill up the empty slots.
num_free_blocks = allocator.get_num_free_blocks(device=Device.GPU)
block_table.append_token_ids(token_ids=list(range(num_empty_slots)))
assert num_free_blocks == allocator.get_num_free_blocks(device=Device.GPU)
@pytest.mark.parametrize("block_size", [1, 8])
@pytest.mark.parametrize("sequence_len", [1, 9])
@pytest.mark.parametrize("append_len", [1, 16, 129])
@pytest.mark.parametrize("append_size", [1, 4, 129])
@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
def test_append_token_ids_correct_content(block_size: int, sequence_len: int,
append_len: int, allocator_type: str,
append_size: int):
"""Verify token ids are correctly appended. Appends various amounts of
token ids in various append sizes, and verifies the final sequence is
correct.
"""
num_gpu_blocks = 1024
allocator = CpuGpuBlockAllocator.create(
allocator_type=allocator_type,
num_gpu_blocks=num_gpu_blocks,
num_cpu_blocks=1024,
block_size=block_size,
)
token_ids = list(range(sequence_len))
token_ids_to_append = list(range(append_len))
block_table = BlockTable(
block_size=block_size,
block_allocator=allocator,
)
block_table.allocate(token_ids=token_ids, device=Device.GPU)
appended_so_far = []
for append in chunk_list(token_ids_to_append, append_size):
block_table.append_token_ids(append)
appended_so_far.extend(append)
assert block_table._get_all_token_ids() == token_ids + appended_so_far
assert block_table._get_all_token_ids() == token_ids + token_ids_to_append
@pytest.mark.parametrize("seq_len", [1, 9, 129])
@pytest.mark.parametrize("block_size", [1, 8])
@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
def test_fork(seq_len: int, block_size: int, allocator_type: str):
"""Create a sequence using the specified allocator.
1. Assert that after forking the sequence, the free block count is the
same.
2. Assert that the forked sequence has the same physical mappings.
3. Then free the original sequence; verify that the free block count is
the same.
4. Finally, free the forked sequence and verify that the free block
count drops to zero.
"""
num_gpu_blocks = 1024
allocator = CpuGpuBlockAllocator.create(
allocator_type=allocator_type,
num_gpu_blocks=num_gpu_blocks,
num_cpu_blocks=0,
block_size=block_size,
)
token_ids = list(range(seq_len))
block_table = BlockTable(
block_size=block_size,
block_allocator=allocator,
)
block_table.allocate(token_ids)
num_free_blocks_before_fork = allocator.get_num_free_blocks(
device=Device.GPU)
forked_block_table = block_table.fork()
# Expect physical_block_ids and token_ids to match.
assert (block_table.physical_block_ids ==
forked_block_table.physical_block_ids)
assert block_table._get_all_token_ids(
) == forked_block_table._get_all_token_ids()
# Do not expect any additional allocations.
assert allocator.get_num_free_blocks(
device=Device.GPU) == num_free_blocks_before_fork
# Free the original blocks. Assert num free blocks does not change, since
# refcount is nonzero.
block_table.free()
assert allocator.get_num_free_blocks(
device=Device.GPU) == num_free_blocks_before_fork
# Expect the forked block table to be unaffected by the free.
assert all(block_id is not None
for block_id in forked_block_table.physical_block_ids)
# Free the forked blocks. Assert num free blocks does change, since
# refcount is now zero.
forked_block_table.free()
assert allocator.get_num_free_blocks(device=Device.GPU) == num_gpu_blocks
@pytest.mark.parametrize("block_size", [8])
@pytest.mark.parametrize("sequence_len", [1, 16, 129])
@pytest.mark.parametrize("append_len", [1, 16, 129])
@pytest.mark.parametrize("appender", ["forked", "original"])
@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
def test_cow(block_size: int, sequence_len: int, append_len: int,
allocator_type: str, appender: str):
"""Fork a sequence; append to the forked sequence; verify there's a CoW.
"""
num_gpu_blocks = 1024
allocator = CpuGpuBlockAllocator.create(
allocator_type=allocator_type,
num_gpu_blocks=num_gpu_blocks,
num_cpu_blocks=0,
block_size=block_size,
)
token_ids = list(range(sequence_len))
token_ids_to_append = list(range(append_len))
original_block_table = BlockTable(
block_size=block_size,
block_allocator=allocator,
)
num_expected_non_cow_blocks = cdiv(sequence_len, block_size)
num_expected_cow_blocks = cdiv(sequence_len + append_len,
block_size) - (sequence_len // block_size)
original_block_table.allocate(token_ids=token_ids, device=Device.GPU)
original_block_ids = original_block_table.physical_block_ids
forked_block_table = original_block_table.fork()
# Expect no additional allocation (copy on _write_).
assert allocator.get_num_free_blocks(
Device.GPU) == (num_gpu_blocks - num_expected_non_cow_blocks)
if appender == "forked":
appender_block_table = forked_block_table
static_block_table = original_block_table
elif appender == "original":
appender_block_table = original_block_table
static_block_table = forked_block_table
else:
raise ValueError(f"unknown test config {appender=}")
# Write tokens.
appender_block_table.append_token_ids(token_ids_to_append)
# Expect the non-appending block table to have no change.
assert static_block_table.physical_block_ids == original_block_ids
assert appender_block_table.physical_block_ids != original_block_ids
# Expect the blocks changed during append to have a CoW.
assert allocator.get_num_free_blocks(
Device.GPU) == num_gpu_blocks - (num_expected_non_cow_blocks +
num_expected_cow_blocks)
cows = allocator.clear_copy_on_writes()
if sequence_len % block_size > 0:
# If the last block in the sequence is not full, then when appending we
# expect a CoW.
assert cows
cow_block_id = sequence_len // block_size
expected_src = static_block_table.physical_block_ids[cow_block_id]
expected_dst = appender_block_table.physical_block_ids[cow_block_id]
assert expected_src in cows
assert expected_dst in cows[expected_src]
else:
# Otherwise, there should be no copy-on-write.
assert not cows
static_block_table.free()
appender_block_table.free()
# After free, expect all blocks to be freed.
assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks
@pytest.mark.parametrize("block_size", [8])
@pytest.mark.parametrize("sequence_len", [1, 16, 129])
@pytest.mark.parametrize("append_len", [1, 16, 129])
@pytest.mark.parametrize("lookahead_slots", [1, 16, 129])
@pytest.mark.parametrize("appender", ["forked", "original"])
@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
def test_cow_lookahead_simple(block_size: int, sequence_len: int,
append_len: int, lookahead_slots: int,
allocator_type: str, appender: str):
"""Similar to test_cow, except with lookahead allocation. The assertions are
less rigorous due to the complexity of the property under test.
"""
num_gpu_blocks = 1024
allocator = CpuGpuBlockAllocator.create(
allocator_type=allocator_type,
num_gpu_blocks=num_gpu_blocks,
num_cpu_blocks=0,
block_size=block_size,
)
token_ids = list(range(sequence_len))
token_ids_to_append = list(range(append_len))
original_block_table = BlockTable(
block_size=block_size,
block_allocator=allocator,
)
original_block_table.allocate(token_ids=token_ids, device=Device.GPU)
# Allocate lookahead slots.
original_block_table.ensure_num_empty_slots(lookahead_slots)
original_block_ids = original_block_table.physical_block_ids
forked_block_table = original_block_table.fork()
if appender == "forked":
appender_block_table = forked_block_table
static_block_table = original_block_table
elif appender == "original":
appender_block_table = original_block_table
static_block_table = forked_block_table
else:
raise ValueError(f"unknown test config {appender=}")
# Write tokens.
appender_block_table.append_token_ids(token_ids_to_append)
# Expect the non-appending block table to have no change.
assert static_block_table.physical_block_ids == original_block_ids
assert appender_block_table.physical_block_ids != original_block_ids
cows = allocator.clear_copy_on_writes()
# Always expect copy-on-write
assert cows
if sequence_len % block_size > 0:
# If the last block in the sequence is not full, then when appending we
# expect a CoW.
assert cows
cow_block_id = sequence_len // block_size
expected_src = static_block_table.physical_block_ids[cow_block_id]
expected_dst = appender_block_table.physical_block_ids[cow_block_id]
assert expected_src in cows
assert expected_dst in cows[expected_src]
static_block_table.free()
appender_block_table.free()
# After free, expect all blocks to be freed.
assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks
import random
import pytest
from vllm.core.block.common import RefCounter
@pytest.mark.parametrize("seed", list(range(20)))
@pytest.mark.parametrize("num_incrs", [1, 100])
@pytest.mark.parametrize("num_blocks", [1024])
def test_incr(seed: int, num_incrs: int, num_blocks: int):
random.seed(seed)
all_block_indices = list(range(num_blocks))
counter = RefCounter(all_block_indices=all_block_indices)
block_id = random.randint(0, num_blocks - 1)
for i in range(num_incrs):
value = counter.incr(block_id)
assert value == i + 1
@pytest.mark.parametrize("seed", list(range(20)))
@pytest.mark.parametrize("num_incrs", [1, 100])
@pytest.mark.parametrize("num_blocks", [1024])
def test_incr_decr(seed: int, num_incrs: int, num_blocks: int):
random.seed(seed)
all_block_indices = list(range(num_blocks))
counter = RefCounter(all_block_indices=all_block_indices)
block_id = random.randint(0, num_blocks - 1)
for i in range(num_incrs):
value = counter.incr(block_id)
assert value == i + 1
for i in range(num_incrs):
value = counter.decr(block_id)
assert value == num_incrs - (i + 1)
with pytest.raises(AssertionError):
counter.decr(block_id)
import pytest
from vllm.core.block.cpu_gpu_block_allocator import CpuGpuBlockAllocator
from vllm.utils import Device, chunk_list
@pytest.mark.parametrize("num_cpu_blocks", [0, 512])
@pytest.mark.parametrize("num_gpu_blocks", [1024])
@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
def test_allocate_mutable(num_cpu_blocks: int, num_gpu_blocks: int,
block_size: int, allocator_type: str):
allocator = CpuGpuBlockAllocator.create(
allocator_type=allocator_type,
num_gpu_blocks=num_gpu_blocks,
num_cpu_blocks=num_cpu_blocks,
block_size=block_size,
)
assert allocator.get_num_free_blocks(Device.CPU) == num_cpu_blocks
assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks
cpu_blocks = [
allocator.allocate_mutable(prev_block=None, device=Device.CPU)
for _ in range(num_cpu_blocks)
]
assert allocator.get_num_free_blocks(Device.CPU) == 0
assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks
gpu_blocks = [
allocator.allocate_mutable(prev_block=None, device=Device.GPU)
for _ in range(num_gpu_blocks)
]
assert allocator.get_num_free_blocks(Device.CPU) == 0
assert allocator.get_num_free_blocks(Device.GPU) == 0
_ = [allocator.free(block) for block in cpu_blocks]
assert allocator.get_num_free_blocks(Device.CPU) == num_cpu_blocks
assert allocator.get_num_free_blocks(Device.GPU) == 0
_ = [allocator.free(block) for block in gpu_blocks]
assert allocator.get_num_free_blocks(Device.CPU) == num_cpu_blocks
assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks
@pytest.mark.parametrize("num_cpu_blocks", [0, 512])
@pytest.mark.parametrize("num_gpu_blocks", [1024])
@pytest.mark.parametrize("block_size", [2])
@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
def test_allocate_immutable(num_cpu_blocks: int, num_gpu_blocks: int,
block_size: int, allocator_type: str):
allocator = CpuGpuBlockAllocator.create(
allocator_type=allocator_type,
num_gpu_blocks=num_gpu_blocks,
num_cpu_blocks=num_cpu_blocks,
block_size=block_size,
)
unique_token_ids = list(
range((num_cpu_blocks + num_gpu_blocks) * block_size))
gpu_token_ids = chunk_list(unique_token_ids[:num_gpu_blocks * block_size],
block_size)
cpu_token_ids = chunk_list(unique_token_ids[num_gpu_blocks * block_size:],
block_size)
assert allocator.get_num_free_blocks(Device.CPU) == num_cpu_blocks
assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks
cpu_blocks = [
allocator.allocate_immutable(prev_block=None,
token_ids=token_ids,
device=Device.CPU)
for token_ids in cpu_token_ids
]
assert allocator.get_num_free_blocks(Device.CPU) == 0
assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks
gpu_blocks = [
allocator.allocate_immutable(prev_block=None,
token_ids=token_ids,
device=Device.GPU)
for token_ids in gpu_token_ids
]
assert allocator.get_num_free_blocks(Device.CPU) == 0
assert allocator.get_num_free_blocks(Device.GPU) == 0
_ = [allocator.free(block) for block in cpu_blocks]
assert allocator.get_num_free_blocks(Device.CPU) == num_cpu_blocks
assert allocator.get_num_free_blocks(Device.GPU) == 0
_ = [allocator.free(block) for block in gpu_blocks]
assert allocator.get_num_free_blocks(Device.CPU) == num_cpu_blocks
assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks
from typing import List, Optional
import pytest
from vllm.core.block.interfaces import Block, BlockAllocator
from vllm.core.block.naive_block import NaiveBlock, NaiveBlockAllocator
class TestNaiveBlockAllocator:
@staticmethod
def create_allocate_lambda(allocate_type: str,
allocator: NaiveBlockAllocator,
prev_block: Optional[Block],
token_ids: List[int]):
if allocate_type == "immutable":
allocate_block = lambda: allocator.allocate_immutable(
prev_block=prev_block, token_ids=token_ids)
elif allocate_type == "mutable":
allocate_block = lambda: allocator.allocate_mutable(prev_block=
prev_block)
else:
raise ValueError()
return allocate_block
@staticmethod
@pytest.mark.parametrize("allocate_type", ["immutable", "mutable"])
@pytest.mark.parametrize("num_blocks", [1, 1024])
@pytest.mark.parametrize("block_size", [1, 16])
def test_allocate_ooms(allocate_type: str, num_blocks: int,
block_size: int):
allocator = NaiveBlockAllocator(create_block=NaiveBlock,
num_blocks=num_blocks,
block_size=block_size)
allocate_block = TestNaiveBlockAllocator.create_allocate_lambda(
allocate_type,
allocator,
prev_block=None,
token_ids=list(range(block_size)))
[allocate_block() for _ in range(num_blocks)]
with pytest.raises(BlockAllocator.NoFreeBlocksError):
allocate_block()
@staticmethod
@pytest.mark.parametrize("allocate_type", ["immutable", "mutable"])
@pytest.mark.parametrize("num_blocks", [1, 1024])
@pytest.mark.parametrize("block_size", [1, 16])
def test_free_prevents_oom(allocate_type: str, num_blocks: int,
block_size: int):
allocator = NaiveBlockAllocator(create_block=NaiveBlock,
num_blocks=num_blocks,
block_size=block_size)
allocate_block = TestNaiveBlockAllocator.create_allocate_lambda(
allocate_type,
allocator,
prev_block=None,
token_ids=list(range(block_size)))
blocks = [allocate_block() for _ in range(num_blocks)]
with pytest.raises(BlockAllocator.NoFreeBlocksError):
allocate_block()
block_to_free = blocks.pop()
for _ in range(100):
block_id = block_to_free.block_id
allocator.free(block_to_free)
assert block_to_free.block_id is None
new_block = allocate_block()
assert new_block.block_id == block_id
with pytest.raises(BlockAllocator.NoFreeBlocksError):
allocate_block()
block_to_free = new_block
@staticmethod
@pytest.mark.parametrize("allocate_type", ["immutable", "mutable"])
@pytest.mark.parametrize("num_blocks", [1024])
@pytest.mark.parametrize("block_size", [16])
def test_get_num_free_blocks(allocate_type: str, num_blocks: int,
block_size: int):
allocator = NaiveBlockAllocator(create_block=NaiveBlock,
num_blocks=num_blocks,
block_size=block_size)
allocate_block = TestNaiveBlockAllocator.create_allocate_lambda(
allocate_type,
allocator,
prev_block=None,
token_ids=list(range(block_size)))
assert allocator.get_num_free_blocks() == num_blocks
blocks = [allocate_block() for _ in range(num_blocks)]
for i, block in enumerate(blocks):
assert allocator.get_num_free_blocks() == i
allocator.free(block)
import math
import random
from typing import List, Optional
from unittest.mock import MagicMock
import pytest
from vllm.core.block.interfaces import Block, BlockAllocator
from vllm.core.block.prefix_caching_block import (PrefixCachingBlock,
PrefixCachingBlockAllocator)
class TestPrefixCachingBlock:
@staticmethod
@pytest.mark.parametrize("seed", list(range(10)))
@pytest.mark.parametrize("block_size", [1, 16])
@pytest.mark.parametrize("is_curr_block_full", [True, False])
def test_first_block_has_correct_content_hash(seed: int, block_size: int,
is_curr_block_full: bool):
"""Verify a block which is first in the sequence has the correct hash.
"""
random.seed(seed)
num_to_fill = block_size if is_curr_block_full else random.randint(
0, block_size - 1)
token_ids = list(range(num_to_fill))
mock_allocator = MagicMock(spec=PrefixCachingBlockAllocator)
block_with_prev = PrefixCachingBlock(
prev_block=None,
token_ids=token_ids,
block_size=block_size,
prefix_caching_allocator=mock_allocator)
if is_curr_block_full:
# Expect hash since block is full.
assert block_with_prev.content_hash == (
PrefixCachingBlock.hash_block_tokens(
is_first_block=True,
prev_block_hash=None,
cur_block_token_ids=token_ids))
else:
# Do not expect hash since block is not full.
assert block_with_prev.content_hash is None
@staticmethod
@pytest.mark.parametrize("seed", list(range(10)))
@pytest.mark.parametrize("block_size", [1, 16])
@pytest.mark.parametrize("is_curr_block_full", [True, False])
@pytest.mark.parametrize("prev_block_has_hash", [True, False])
def test_nth_block_has_correct_content_hash(seed: int, block_size: int,
is_curr_block_full: bool,
prev_block_has_hash: bool):
"""Verify a block which is not first in the sequence has the correct
hash.
"""
random.seed(seed)
previous_block = MagicMock(spec=PrefixCachingBlock)
prev_block_hash = random.randint(0, 1000)
previous_block.content_hash = (prev_block_hash
if prev_block_has_hash else None)
num_to_fill = block_size if is_curr_block_full else random.randint(
0, block_size - 1)
token_ids = list(range(num_to_fill))
mock_allocator = MagicMock(spec=PrefixCachingBlockAllocator)
block_with_prev = PrefixCachingBlock(
prev_block=previous_block,
token_ids=token_ids,
block_size=block_size,
prefix_caching_allocator=mock_allocator,
)
if is_curr_block_full and prev_block_has_hash:
# Expect hash since block is full and previous block has hash.
assert (block_with_prev.content_hash ==
PrefixCachingBlock.hash_block_tokens(
is_first_block=False,
prev_block_hash=prev_block_hash,
cur_block_token_ids=token_ids))
else:
# Do not expect hash since block is not full or the previous block
# does not have a hash.
assert block_with_prev.content_hash is None
@staticmethod
@pytest.mark.parametrize("block_size", [1, 2, 16])
@pytest.mark.parametrize("num_tokens", list(range(3)))
@pytest.mark.parametrize("num_empty_trailing_blocks", [0, 1, 10])
def test_blocks_have_correct_hash_in_chain(block_size: int,
num_tokens: int,
num_empty_trailing_blocks: int):
"""Create two chains of logical blocks with the same contents.
Assert the hashes are equal.
"""
random.seed(0)
token_ids = [random.randint(0, 50_000) for _ in range(num_tokens)]
first_chain, second_chain = [
TestPrefixCachingBlock.create_chain(
block_size=block_size,
token_ids=token_ids,
num_empty_trailing_blocks=num_empty_trailing_blocks)
for _ in range(2)
]
for first_chain_block, second_chain_block in zip(
first_chain, second_chain):
assert (first_chain_block.content_hash ==
second_chain_block.content_hash)
if not first_chain or not second_chain:
assert first_chain == second_chain
assert num_tokens == 0
@staticmethod
def create_chain(block_size: int,
token_ids: List[int],
num_empty_trailing_blocks=0) -> List[PrefixCachingBlock]:
"""Helper method which creates a chain of blocks.
"""
blocks = []
num_blocks = math.ceil(
len(token_ids) / block_size) + num_empty_trailing_blocks
if num_blocks == 0:
return []
allocator = MagicMock(spec=PrefixCachingBlockAllocator)
prev_block = None
for block_number in range(0, num_blocks):
prev_block = PrefixCachingBlock(
prev_block=prev_block,
token_ids=[],
block_size=block_size,
prefix_caching_allocator=allocator,
)
tokens_to_append = token_ids[block_number *
block_size:(block_number + 1) *
block_size]
if tokens_to_append:
prev_block.append_token_ids(tokens_to_append)
blocks.append(prev_block)
return blocks
class TestPrefixCachingBlockAllocator:
@staticmethod
def create_allocate_lambda(allocate_type: str, allocator: BlockAllocator,
prev_block: Optional[Block],
token_ids: List[int]):
if allocate_type == "immutable":
allocate_block = lambda: allocator.allocate_immutable(
prev_block=prev_block, token_ids=token_ids)
elif allocate_type == "mutable":
allocate_block = lambda: allocator.allocate_mutable(prev_block=
prev_block)
else:
raise ValueError()
return allocate_block
@staticmethod
@pytest.mark.parametrize("num_blocks", [1, 1024])
@pytest.mark.parametrize("block_size", [1, 16])
def test_allocate_mutable_ooms(num_blocks: int, block_size: int):
allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks,
block_size=block_size)
allocate_block = TestPrefixCachingBlockAllocator.create_allocate_lambda(
allocate_type="mutable",
allocator=allocator,
prev_block=None,
token_ids=list(range(block_size)),
)
[allocate_block() for _ in range(num_blocks)]
with pytest.raises(BlockAllocator.NoFreeBlocksError):
allocate_block()
@staticmethod
@pytest.mark.parametrize("num_blocks", [1, 1024])
@pytest.mark.parametrize("block_size", [1, 16])
def test_allocate_immutable_does_not_oom_single_hash(
num_blocks: int, block_size: int):
allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks,
block_size=block_size)
allocate_block = TestPrefixCachingBlockAllocator.create_allocate_lambda(
allocate_type="immutable",
allocator=allocator,
prev_block=None,
token_ids=list(range(block_size)),
)
blocks = [allocate_block() for _ in range(num_blocks)]
# Expect no OOM. If these were mutable blocks, this would OOM.
non_oom_block = allocate_block()
# Expect all blocks to have same physical block index.
for block in blocks:
assert (block.block_id == non_oom_block.block_id)
@staticmethod
@pytest.mark.parametrize("num_blocks", [1, 1024])
@pytest.mark.parametrize("block_size", [1, 16])
def test_allocate_immutable_ooms_many_hash(num_blocks: int,
block_size: int):
"""Consume all blocks using many different hashes/block content.
Do this by creating a sequence that is very long.
Expect next block to OOM.
"""
allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks,
block_size=block_size)
# Create token ids that will exhaust all blocks.
token_ids = list(range(num_blocks * block_size))
chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
block_size=block_size,
token_ids=token_ids,
allocator=allocator,
)
# Expect allocation with unseen hash to fail.
with pytest.raises(BlockAllocator.NoFreeBlocksError):
allocator.allocate_immutable(prev_block=chain[-1],
token_ids=list(range(block_size)))
# Expect mutable allocation to fail.
with pytest.raises(BlockAllocator.NoFreeBlocksError):
allocator.allocate_mutable(prev_block=chain[-1])
# Expect allocation of exact same chain to pass.
second_chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
block_size=block_size,
token_ids=token_ids,
allocator=allocator,
)
# Expect physical block indices to be the same in both chains.
assert chain and second_chain
for first_chain_block, second_chain_block in zip(chain, second_chain):
assert (first_chain_block.block_id == second_chain_block.block_id)
@staticmethod
@pytest.mark.parametrize("num_blocks", [1, 1024])
@pytest.mark.parametrize("block_size", [1, 16])
def test_free_prevents_oom(num_blocks: int, block_size: int):
allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks,
block_size=block_size)
# Create token ids that will exhaust all blocks.
token_ids = list(range(num_blocks * block_size))
chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
block_size=block_size,
token_ids=token_ids,
allocator=allocator,
)
# Expect mutable allocation to fail.
with pytest.raises(BlockAllocator.NoFreeBlocksError):
allocator.allocate_mutable(prev_block=None)
block_to_free = chain[-1]
# Expect free/allocate loop to succeed many times.
for i in range(100):
block_id = block_to_free.block_id
allocator.free(block_to_free)
assert block_to_free.block_id is None, i
new_block = allocator.allocate_mutable(prev_block=None)
assert new_block.block_id == block_id, i
with pytest.raises(BlockAllocator.NoFreeBlocksError):
allocator.allocate_mutable(prev_block=None)
block_to_free = new_block
@staticmethod
@pytest.mark.parametrize("num_blocks", [1024])
@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("seed", list(range(20)))
def test_get_num_free_blocks(num_blocks: int, block_size: int, seed: int):
random.seed(seed)
allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks,
block_size=block_size)
num_blocks_to_consume = random.randint(1, num_blocks - 1)
# Create token ids that will exhaust all blocks.
token_ids = list(range(num_blocks_to_consume * block_size))
chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
block_size=block_size,
token_ids=token_ids,
allocator=allocator,
)
# Free each block in chain, assert num free blocks includes new free
# block.
for i, block in enumerate(chain):
assert allocator.get_num_free_blocks() == (num_blocks -
num_blocks_to_consume +
i)
allocator.free(block)
@staticmethod
@pytest.mark.parametrize("num_blocks", [1024])
@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("seed", list(range(20)))
def test_get_num_free_blocks_shared(num_blocks: int, block_size: int,
seed: int):
"""Verify sharing occurs by allocating two sequences that share prefixes
and incrementally freeing blocks.
"""
random.seed(seed)
allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks,
block_size=block_size)
num_blocks_to_consume = random.randint(1, num_blocks - 1)
# Create token ids that will exhaust all blocks.
token_ids = list(range(num_blocks_to_consume * block_size))
first_chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
block_size=block_size,
token_ids=token_ids,
allocator=allocator,
)
second_chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
block_size=block_size,
token_ids=token_ids,
allocator=allocator,
)
# Free each block in the first chain. Since all blocks are shared, the
# free count should stay constant.
for i, block in enumerate(first_chain):
assert allocator.get_num_free_blocks() == (num_blocks -
num_blocks_to_consume)
allocator.free(block)
# Free each block in the second chain. Since the refcount is now zero,
# the free count should increment with each free.
for i, block in enumerate(second_chain):
assert allocator.get_num_free_blocks() == (num_blocks -
num_blocks_to_consume +
i)
allocator.free(block)
@staticmethod
def create_immutable_chain(
block_size: int,
token_ids: List[int],
allocator: PrefixCachingBlockAllocator,
) -> List[PrefixCachingBlock]:
"""Helper method which creates a chain of blocks.
"""
blocks = []
num_blocks = math.ceil(len(token_ids) / block_size)
if num_blocks == 0:
return []
prev_block = None
for block_number in range(0, num_blocks):
block_token_ids = token_ids[block_number *
block_size:(block_number + 1) *
block_size]
prev_block = allocator.allocate_immutable(
prev_block=prev_block, token_ids=block_token_ids)
blocks.append(prev_block)
return blocks
import time
from typing import List
import pytest
from vllm import SamplingParams
from vllm.block import PhysicalTokenBlock
from vllm.core.block_manager_v1 import (BlockSpaceManagerV1,
UncachedBlockAllocator)
from vllm.core.interfaces import AllocStatus
from vllm.sequence import Logprob, Sequence, SequenceGroup, SequenceStatus
from vllm.utils import Device
from .utils import create_dummy_prompt
def test_block_allocator_allocate():
block_size = 4
num_cpu_blocks = 4
cpu_allocator = UncachedBlockAllocator(Device.CPU, block_size,
num_cpu_blocks)
# Allocate all available cpu blocks.
num_free = num_cpu_blocks
assert cpu_allocator.get_num_free_blocks() == num_free
for _ in range(num_cpu_blocks):
block = cpu_allocator.allocate()
num_free -= 1
assert block not in cpu_allocator.free_blocks
assert cpu_allocator.get_num_free_blocks() == num_free
with pytest.raises(ValueError):
cpu_allocator.allocate()
def test_block_allocator_free():
block_size = 4
num_cpu_blocks = 4
cpu_allocator = UncachedBlockAllocator(Device.CPU, block_size,
num_cpu_blocks)
# Allocate all available cpu blocks.
blocks: List[PhysicalTokenBlock] = []
for _ in range(num_cpu_blocks):
block = cpu_allocator.allocate()
blocks.append(block)
assert block not in cpu_allocator.free_blocks
# Free all allocated cpu blocks.
num_free = 0
assert cpu_allocator.get_num_free_blocks() == num_free
for block in blocks:
cpu_allocator.free(block)
num_free += 1
assert block in cpu_allocator.free_blocks
assert cpu_allocator.get_num_free_blocks() == num_free
with pytest.raises(ValueError):
cpu_allocator.free(block)
def test_allocate():
block_size = 4
num_cpu_blocks = 4
num_gpu_blocks = 4
block_manager = BlockSpaceManagerV1(block_size,
num_cpu_blocks,
num_gpu_blocks,
watermark=0)
# Allocate same sequence group to all available gpu blocks.
for i in range(num_gpu_blocks):
_, seq_group = create_dummy_prompt(str(i), block_size)
assert block_manager.can_allocate(seq_group)
block_manager.allocate(seq_group)
assert block_manager.can_allocate(seq_group) != AllocStatus.OK
# Allocate same sequence group to all available gpu blocks.
# Use watermark to reserve one gpu block.
block_manager = BlockSpaceManagerV1(block_size,
num_cpu_blocks,
num_gpu_blocks,
watermark=1 / num_gpu_blocks)
for i in range(num_gpu_blocks - 1):
_, seq_group = create_dummy_prompt(str(i), block_size)
assert block_manager.can_allocate(seq_group)
block_manager.allocate(seq_group)
assert block_manager.can_allocate(seq_group) != AllocStatus.OK
def test_append_slot_single_seq():
block_size = 4
num_cpu_blocks = 4
num_gpu_blocks = 4
block_manager = BlockSpaceManagerV1(block_size,
num_cpu_blocks,
num_gpu_blocks,
watermark=0)
# Allocate single seq to gpu block.
prompt, seq_group = create_dummy_prompt("1", block_size)
block_manager.allocate(seq_group)
# Nothing to append. Sequence has no new logical blocks.
assert block_manager.can_append_slot(seq_group)
before_blocks = block_manager.get_num_free_gpu_blocks()
assert not block_manager.append_slot(prompt)
after_blocks = block_manager.get_num_free_gpu_blocks()
assert before_blocks == after_blocks
# Add block_size number of new tokens and append slot.
for i in range(block_size):
token_id = i + 5
prompt.append_token_id(token_id, {token_id: Logprob(0.0)})
assert block_manager.can_append_slot(seq_group)
before_blocks = block_manager.get_num_free_gpu_blocks()
assert not block_manager.append_slot(prompt)
after_blocks = block_manager.get_num_free_gpu_blocks()
assert before_blocks - after_blocks == 1
def test_append_slot_cow():
block_size = 4
num_cpu_blocks = 4
num_gpu_blocks = 4
block_manager = BlockSpaceManagerV1(block_size=block_size,
num_cpu_blocks=num_cpu_blocks,
num_gpu_blocks=num_gpu_blocks,
watermark=0)
# Allocate prompt to gpu block. There is one slot left in the block.
prompt = Sequence(seq_id=1,
prompt="one two three",
prompt_token_ids=[1, 2, 3],
block_size=block_size)
# Fork the sequence, such that a COW will be required when we append a new
# token id.
child = prompt.fork(new_seq_id=2)
# Allocate space for the sequence group.
seq_group = SequenceGroup("1", [prompt, child], SamplingParams(),
time.time(), time.perf_counter)
block_manager.allocate(seq_group)
# Fork and append a new token id. We expect a COW to be scheduled.
token_id = 4
child.append_token_id(token_id, {token_id: Logprob(0.0)})
block_manager.fork(prompt, child)
assert block_manager.can_append_slot(seq_group)
before_blocks = block_manager.get_num_free_gpu_blocks()
maybe_src_dst_block = block_manager.append_slot(child)
assert maybe_src_dst_block is not None
src_block, dst_block = maybe_src_dst_block
assert src_block != dst_block
after_blocks = block_manager.get_num_free_gpu_blocks()
assert before_blocks - after_blocks == 1
def test_fork():
block_size = 4
num_cpu_blocks = 4
num_gpu_blocks = 4
block_manager = BlockSpaceManagerV1(block_size,
num_cpu_blocks,
num_gpu_blocks,
watermark=0)
prompt, seq_group = create_dummy_prompt("1",
block_size - 1,
block_size=block_size)
block_manager.allocate(seq_group)
# Fork prompt and copy block tables.
child = prompt.fork(2)
block_manager.fork(prompt, child)
assert block_manager.get_block_table(
prompt) == block_manager.get_block_table(child)
token_id = 4
# Append token to child. Block is shared so copy on write occurs.
child.append_token_id(token_id, {token_id: Logprob(0.0)})
block_manager.append_slot(child)
assert block_manager.get_block_table(
prompt) != block_manager.get_block_table(child)
def test_swap():
block_size = 4
num_cpu_blocks = 4
num_gpu_blocks = 4
block_manager = BlockSpaceManagerV1(block_size,
num_cpu_blocks,
num_gpu_blocks,
watermark=0)
prompt, seq_group = create_dummy_prompt("1", prompt_length=block_size - 1)
prompt.status = SequenceStatus.WAITING
block_manager.allocate(seq_group)
# Emulate a forward pass by appending a single token.
# The block manager then knows how many unprocessed
# tokens will be written in the next forward pass.
token_id = 0
prompt.status = SequenceStatus.RUNNING
prompt.append_token_id(token_id, {token_id: Logprob(0.0)})
# Swap seq group from GPU -> CPU.
gpu_blocks = block_manager.get_block_table(prompt)
assert block_manager.can_swap_out(seq_group)
before_cpu_blocks = block_manager.get_num_free_cpu_blocks()
before_gpu_blocks = block_manager.get_num_free_gpu_blocks()
mapping = block_manager.swap_out(seq_group)
assert list(mapping.keys()) == gpu_blocks
after_cpu_blocks = block_manager.get_num_free_cpu_blocks()
after_gpu_blocks = block_manager.get_num_free_gpu_blocks()
assert before_cpu_blocks == after_cpu_blocks + len(gpu_blocks)
assert before_gpu_blocks + len(gpu_blocks) == after_gpu_blocks
prompt.status = SequenceStatus.SWAPPED
# Swap seq group from CPU -> GPU.
cpu_blocks = block_manager.get_block_table(prompt)
assert block_manager.can_swap_in(seq_group)
before_cpu_blocks = block_manager.get_num_free_cpu_blocks()
before_gpu_blocks = block_manager.get_num_free_gpu_blocks()
mapping = block_manager.swap_in(seq_group)
assert list(mapping.keys()) == cpu_blocks
after_cpu_blocks = block_manager.get_num_free_cpu_blocks()
after_gpu_blocks = block_manager.get_num_free_gpu_blocks()
assert before_cpu_blocks + len(cpu_blocks) == after_cpu_blocks
assert before_gpu_blocks == after_gpu_blocks + len(cpu_blocks)
def test_free():
block_size = 4
num_cpu_blocks = 4
num_gpu_blocks = 4
block_manager = BlockSpaceManagerV1(block_size,
num_cpu_blocks,
num_gpu_blocks,
watermark=0)
prompt, seq_group = create_dummy_prompt("1", block_size)
block_manager.allocate(seq_group)
# Free allocated seq.
prompt_blocks = len(block_manager.get_block_table(prompt))
before_blocks = block_manager.get_num_free_gpu_blocks()
block_manager.free(prompt)
after_blocks = block_manager.get_num_free_gpu_blocks()
assert after_blocks == before_blocks + prompt_blocks
# Block table for freed seq is deleted.
with pytest.raises(KeyError):
block_manager.get_block_table(prompt)
def test_reset():
block_size = 4
num_cpu_blocks = 4
num_gpu_blocks = 4
block_manager = BlockSpaceManagerV1(block_size,
num_cpu_blocks,
num_gpu_blocks,
watermark=0)
# Allocate same seq group on all available gpu blocks.
original_blocks = block_manager.get_num_free_gpu_blocks()
for i in range(num_gpu_blocks):
_, seq_group = create_dummy_prompt(str(i), block_size)
block_manager.allocate(seq_group)
assert block_manager.get_num_free_gpu_blocks() == 0
# Resetting block manager frees all allocated blocks.
block_manager.reset()
assert block_manager.get_num_free_gpu_blocks() == original_blocks
def test_sliding_window_multi_seq():
"""
Tests that memory allocation and deallocation is handled
correctly with multiple sequences that exceed the sliding
window's capacity.
"""
block_size = 1
num_cpu_blocks = 8
num_gpu_blocks = 8
sliding_window = 2
block_manager = BlockSpaceManagerV1(block_size,
num_cpu_blocks,
num_gpu_blocks,
sliding_window=sliding_window,
watermark=0)
assert block_manager.get_num_free_gpu_blocks() == num_gpu_blocks
parent = Sequence(1, "one two three", [0, 1, 2], block_size)
seq_group = SequenceGroup("1", [parent], SamplingParams(), time.time(),
None)
block_manager.allocate(seq_group)
# assert the number of blocks allocated is correct
# the parent seq has len 3, but since sliding_window is 2,
# we will use at most 2 blocks
assert block_manager.get_num_free_gpu_blocks(
) == num_gpu_blocks - sliding_window
# Fork prompt and copy block tables.
child = parent.fork(2)
block_manager.fork(parent, child)
# assert the number of blocks allocated is correct
# forking does not increase memory consumption
assert block_manager.get_num_free_gpu_blocks(
) == num_gpu_blocks - sliding_window
# assert both parent and child share all blocks
assert block_manager.get_block_table(
parent) == block_manager.get_block_table(child)
token_id = 4
# Append token to child. Block is shared so copy on write occurs.
child.append_token_id(token_id, {token_id: Logprob(0.0)})
block_manager.append_slot(child)
# assert the number of blocks allocated is correct
# we will use now one block more. Each seq will use 2 blocks,
# but only one can be shared
assert block_manager.get_num_free_gpu_blocks(
) == num_gpu_blocks - sliding_window - 1
token_id = 5
parent.append_token_id(token_id, {token_id: Logprob(0.0)})
block_manager.append_slot(parent)
# assert the number of blocks allocated is correct
# no change, because both sequences are still just sharing one block
assert block_manager.get_num_free_gpu_blocks(
) == num_gpu_blocks - sliding_window - 1
block_table_parent = block_manager.get_block_table(parent)
block_table_child = block_manager.get_block_table(child)
assert block_table_parent != block_table_child
# assert both blocks are sharing the second-last block
assert block_table_parent[-2] == block_table_child[-2]
# now let's clean up...
block_manager.free(parent)
# assert the number of blocks allocated is correct
# We have freed one seq, reducing the ref count of two blocks by one.
# One of the two was only used by the parent seq, so this is now free.
# The child seq still consumes sliding_window blocks
assert block_manager.get_num_free_gpu_blocks(
) == num_gpu_blocks - sliding_window
# free all blocks
block_manager.free(child)
# assert all blocks are free now
assert block_manager.get_num_free_gpu_blocks() == num_gpu_blocks
import time
from typing import List
import pytest # noqa
from vllm.config import CacheConfig, SchedulerConfig
from vllm.core.scheduler import Scheduler
from vllm.sequence import Logprob, SequenceGroup
from .utils import create_dummy_prompt
def get_sequence_groups(scheduler_output):
return [s.seq_group for s in scheduler_output.scheduled_seq_groups]
def test_scheduler_add_seq_group():
block_size = 4
scheduler_config = SchedulerConfig(100, 64, 1)
cache_config = CacheConfig(block_size, 1.0, 1, cache_dtype="auto")
cache_config.num_cpu_blocks = 4
cache_config.num_gpu_blocks = 4
scheduler = Scheduler(scheduler_config, cache_config, None)
# Add seq group to scheduler.
num_seq_group = 4
for i in range(num_seq_group):
_, seq_group = create_dummy_prompt(str(i), block_size)
scheduler.add_seq_group(seq_group)
assert scheduler.get_num_unfinished_seq_groups() == i + 1
def test_scheduler_abort_seq_group():
block_size = 4
scheduler_config = SchedulerConfig(100, 64, 1)
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 4
cache_config.num_gpu_blocks = 4
scheduler = Scheduler(scheduler_config, cache_config, None)
# Add multiple seq groups to scheduler.
num_seq_group = 4
request_ids = set()
for i in range(num_seq_group):
_, seq_group = create_dummy_prompt(str(i), block_size)
scheduler.add_seq_group(seq_group)
request_ids.add(str(i))
# Abort all added seq groups.
assert scheduler.get_num_unfinished_seq_groups() == num_seq_group
scheduler.abort_seq_group(request_ids)
assert scheduler.get_num_unfinished_seq_groups() == 0
def test_scheduler_schedule_simple():
block_size = 4
num_seq_group = 4
max_model_len = 16
scheduler_config = SchedulerConfig(64, num_seq_group, max_model_len)
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 8
cache_config.num_gpu_blocks = 8
scheduler = Scheduler(scheduler_config, cache_config, None)
running: List[SequenceGroup] = []
# Add seq groups to scheduler.
for i in range(num_seq_group):
_, seq_group = create_dummy_prompt(str(i), prompt_length=block_size)
scheduler.add_seq_group(seq_group)
running.append(seq_group)
# Schedule seq groups prompts.
num_tokens = block_size * num_seq_group
seq_group_meta, out = scheduler.schedule()
assert set(get_sequence_groups(out)) == set(running)
assert out.num_batched_tokens == num_tokens
assert (not out.blocks_to_copy and not out.blocks_to_swap_in
and not out.blocks_to_swap_out)
assert len(seq_group_meta) == num_seq_group
# Schedule seq groups generation.
seq_group_meta, out = scheduler.schedule()
assert set(get_sequence_groups(out)) == set(running)
assert out.num_batched_tokens == num_seq_group
assert (not out.blocks_to_copy and not out.blocks_to_swap_in
and not out.blocks_to_swap_out)
assert len(seq_group_meta) == num_seq_group
def test_scheduler_schedule_preempt_abort():
block_size = 4
max_model_len = 16
scheduler_config = SchedulerConfig(64, 2, max_model_len)
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 2
cache_config.num_gpu_blocks = 2
scheduler = Scheduler(scheduler_config, cache_config, None)
# Add seq groups to scheduler.
seq_a, seq_group_a = create_dummy_prompt("1", block_size)
seq_b, seq_group_b = create_dummy_prompt("2", block_size)
scheduler.add_seq_group(seq_group_a)
scheduler.add_seq_group(seq_group_b)
# Schedule seq groups prompts.
seq_group_meta, out = scheduler.schedule()
assert get_sequence_groups(out) == [seq_group_a, seq_group_b]
assert out.num_batched_tokens == block_size * 2 # seq_a and seq_b
assert (not out.blocks_to_copy and not out.blocks_to_swap_in
and not out.blocks_to_swap_out)
assert len(seq_group_meta) == 2
assert scheduler.get_num_unfinished_seq_groups() == 2
# Append "generated" tokens, allowing the sequence to mark prompt tokens as
# processed.
token_id = 0
seq_a.append_token_id(token_id, {token_id: Logprob(0.0)})
seq_b.append_token_id(token_id, {token_id: Logprob(0.0)})
# Schedule seq groups generation and preempt seq group b.
seq_group_meta, out = scheduler.schedule()
assert get_sequence_groups(out) == [seq_group_a]
assert out.num_batched_tokens == 1
assert (not out.blocks_to_copy and not out.blocks_to_swap_in
and not out.blocks_to_swap_out)
assert len(seq_group_meta) == 1
assert scheduler.get_num_unfinished_seq_groups() == 2
# Abort seq group a. Re-schedule seq group b prompt with recomputation.
scheduler.abort_seq_group("1")
seq_group_meta, out = scheduler.schedule()
assert get_sequence_groups(out) == [seq_group_b]
assert out.num_batched_tokens == 5 # 4 prompt + 1 generation.
assert (not out.blocks_to_copy and not out.blocks_to_swap_in
and not out.blocks_to_swap_out)
assert len(seq_group_meta) == 1
assert scheduler.get_num_unfinished_seq_groups() == 1
def test_scheduler_max_seqs():
block_size = 4
num_seq_group = 4
max_seq_group = 2
max_model_len = 16
scheduler_config = SchedulerConfig(64, max_seq_group, max_model_len)
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 8
cache_config.num_gpu_blocks = 8
scheduler = Scheduler(scheduler_config, cache_config, None)
all_seq_groups: List[SequenceGroup] = []
# Add seq groups to scheduler.
for i in range(num_seq_group):
_, seq_group = create_dummy_prompt(str(i), prompt_length=block_size)
all_seq_groups.append(seq_group)
# Append 1 seq group
scheduler.add_seq_group(all_seq_groups[0])
# Schedule seq groups prompts.
_, out = scheduler.schedule()
assert set(get_sequence_groups(out)) == set([all_seq_groups[0]])
# Schedule seq groups generation.
_, out = scheduler.schedule()
assert set(get_sequence_groups(out)) == set([all_seq_groups[0]])
# Append 2 more seq group
scheduler.add_seq_group(all_seq_groups[1])
scheduler.add_seq_group(all_seq_groups[2])
# Schedule seq groups prompts.
# Only 1 seq group should be scheduled since max_seq_group is 2
# and one is prompting.
_, out = scheduler.schedule()
assert set(get_sequence_groups(out)) == set([all_seq_groups[1]])
def test_scheduler_delay_factor():
block_size = 4
scheduler_config = SchedulerConfig(100, 64, 16, delay_factor=0.5)
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 8
cache_config.num_gpu_blocks = 8
scheduler = Scheduler(scheduler_config, cache_config, None)
# schedule first prompt
_, seq_group = create_dummy_prompt("0", prompt_length=block_size)
scheduler.add_seq_group(seq_group)
seq_group_meta, out = scheduler.schedule()
assert out.prompt_run
assert seq_group_meta[0].request_id == '0'
# wait for a second before scheduling next prompt
time.sleep(1)
_, seq_group = create_dummy_prompt("1", prompt_length=block_size)
scheduler.add_seq_group(seq_group)
# second prompt should *not* be scheduled
seq_group_meta, out = scheduler.schedule()
assert not out.prompt_run
assert seq_group_meta[0].request_id == '0'
# wait for more than 0.5 second and try again
time.sleep(0.6)
seq_group_meta, out = scheduler.schedule()
assert out.prompt_run
assert seq_group_meta[0].request_id == '1'
import time
from typing import Tuple
from vllm import SamplingParams
from vllm.sequence import Logprob, Sequence, SequenceGroup
def create_dummy_prompt(
request_id: str,
prompt_length: int,
block_size: int = None) -> Tuple[Sequence, SequenceGroup]:
if not block_size:
block_size = prompt_length
# Create dummy prompt sequence with tokens 0...block_size-1
# and prompt "0 ... block_size".
prompt_tokens = list(range(prompt_length))
prompt_str = " ".join([str(t) for t in prompt_tokens])
prompt = Sequence(int(request_id), prompt_str, prompt_tokens, block_size)
seq_group = SequenceGroup(request_id, [prompt], SamplingParams(),
time.time(), None)
return prompt, seq_group
def create_seq_group(
seq_prompt_lens=1024,
seq_output_lens=(128, ),
request_id='0',
seq_id_start=0,
) -> SequenceGroup:
assert len(seq_output_lens) > 0
prompt_token_ids = [0] * seq_prompt_lens
seqs = []
for seq_id_offset, output_len in enumerate(seq_output_lens):
seq = Sequence(
seq_id=seq_id_start + seq_id_offset,
prompt="",
prompt_token_ids=prompt_token_ids,
block_size=16,
)
for i in range(output_len):
seq.append_token_id(
token_id=i,
logprobs={i: Logprob(0.0)},
)
seqs.append(seq)
seq_group = SequenceGroup(
request_id=request_id,
seqs=seqs,
sampling_params=SamplingParams(),
arrival_time=time.time(),
)
return seq_group
def round_up_to_next_block(seq_len: int, block_size: int) -> int:
return (seq_len + block_size - 1) // block_size
"""Compare the outputs of HF and distributed vLLM when using greedy sampling. """Compare the outputs of HF and distributed vLLM when using greedy sampling.
vLLM will allocate all the available memory, so we need to run the tests one
Run `pytest tests/distributed/test_basic_distributed_correctness.py --forked`. by one. The solution is to pass arguments (model name) by environment
variables.
Run:
```sh
TEST_DIST_MODEL=facebook/opt-125m pytest \
test_basic_distributed_correctness.py
TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf \
test_basic_distributed_correctness.py
```
""" """
import os
import pytest import pytest
import torch import torch
MODELS = [ MODELS = [
"facebook/opt-125m", os.environ["TEST_DIST_MODEL"],
"meta-llama/Llama-2-7b-hf",
] ]
......
"""Test the communication operators. """Test the communication operators.
Run `pytest tests/distributed/test_comm_ops.py --forked`. Run `pytest tests/distributed/test_comm_ops.py`.
""" """
import os
import pytest import pytest
import torch
import ray import ray
import torch
from vllm.model_executor.parallel_utils.communication_op import ( from vllm.model_executor.parallel_utils.communication_op import (
tensor_model_parallel_all_reduce, broadcast_tensor_dict, tensor_model_parallel_all_gather,
tensor_model_parallel_all_gather, tensor_model_parallel_all_reduce)
broadcast_tensor_dict,
)
from vllm.test_utils import (init_test_distributed_environment, from vllm.test_utils import (init_test_distributed_environment,
multi_process_tensor_parallel) multi_process_tensor_parallel)
...@@ -18,6 +18,12 @@ from vllm.test_utils import (init_test_distributed_environment, ...@@ -18,6 +18,12 @@ from vllm.test_utils import (init_test_distributed_environment,
@ray.remote(num_gpus=1, max_calls=1) @ray.remote(num_gpus=1, max_calls=1)
def all_reduce_test_worker(tensor_parallel_size: int, rank: int, def all_reduce_test_worker(tensor_parallel_size: int, rank: int,
distributed_init_port: str): distributed_init_port: str):
# it is important to delete the CUDA_VISIBLE_DEVICES environment variable
# so that each worker can see all the GPUs
# they will be able to set the device to the correct GPU
del os.environ["CUDA_VISIBLE_DEVICES"]
device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device)
init_test_distributed_environment(1, tensor_parallel_size, rank, init_test_distributed_environment(1, tensor_parallel_size, rank,
distributed_init_port) distributed_init_port)
num_elements = 8 num_elements = 8
...@@ -34,6 +40,12 @@ def all_reduce_test_worker(tensor_parallel_size: int, rank: int, ...@@ -34,6 +40,12 @@ def all_reduce_test_worker(tensor_parallel_size: int, rank: int,
@ray.remote(num_gpus=1, max_calls=1) @ray.remote(num_gpus=1, max_calls=1)
def all_gather_test_worker(tensor_parallel_size: int, rank: int, def all_gather_test_worker(tensor_parallel_size: int, rank: int,
distributed_init_port: str): distributed_init_port: str):
# it is important to delete the CUDA_VISIBLE_DEVICES environment variable
# so that each worker can see all the GPUs
# they will be able to set the device to the correct GPU
del os.environ["CUDA_VISIBLE_DEVICES"]
device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device)
init_test_distributed_environment(1, tensor_parallel_size, rank, init_test_distributed_environment(1, tensor_parallel_size, rank,
distributed_init_port) distributed_init_port)
num_dimensions = 3 num_dimensions = 3
...@@ -56,6 +68,12 @@ def all_gather_test_worker(tensor_parallel_size: int, rank: int, ...@@ -56,6 +68,12 @@ def all_gather_test_worker(tensor_parallel_size: int, rank: int,
@ray.remote(num_gpus=1, max_calls=1) @ray.remote(num_gpus=1, max_calls=1)
def broadcast_tensor_dict_test_worker(tensor_parallel_size: int, rank: int, def broadcast_tensor_dict_test_worker(tensor_parallel_size: int, rank: int,
distributed_init_port: str): distributed_init_port: str):
# it is important to delete the CUDA_VISIBLE_DEVICES environment variable
# so that each worker can see all the GPUs
# they will be able to set the device to the correct GPU
del os.environ["CUDA_VISIBLE_DEVICES"]
device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device)
init_test_distributed_environment(1, tensor_parallel_size, rank, init_test_distributed_environment(1, tensor_parallel_size, rank,
distributed_init_port) distributed_init_port)
test_dict = { test_dict = {
......
import os
import random import random
import os
import pytest import pytest
import ray import ray
import torch import torch
......
import multiprocessing
import os
import pytest
import torch
from vllm.model_executor.parallel_utils.pynccl import (NCCLCommunicator,
ncclGetUniqueId)
def distributed_run(fn, world_size):
number_of_processes = world_size
processes = []
for i in range(number_of_processes):
env = os.environ.copy()
env['RANK'] = str(i)
env['LOCAL_RANK'] = str(i)
env['WORLD_SIZE'] = str(number_of_processes)
env['LOCAL_WORLD_SIZE'] = str(number_of_processes)
env['MASTER_ADDR'] = 'localhost'
env['MASTER_PORT'] = '12345'
p = multiprocessing.Process(target=fn, args=(env, ))
processes.append(p)
p.start()
for p in processes:
p.join()
def update_env(fn):
# `multiprocessing.Process` cannot accept environment variables directly
# so we need to pass the environment variables as arguments
# and update the environment variables in the function
def wrapper(env):
import os
os.environ.update(env)
fn()
return wrapper
@update_env
def worker_fn():
comm = NCCLCommunicator()
tensor = torch.ones(16, 1024, 1024, dtype=torch.float32).cuda(comm.rank)
comm.all_reduce(tensor)
result = tensor.mean().cpu().item()
assert result == comm.world_size
@pytest.mark.skipif(torch.cuda.device_count() < 2,
reason="Need at least 2 GPUs to run the test.")
def test_pynccl():
distributed_run(worker_fn, 2)
@update_env
def worker_fn_with_cudagraph():
with torch.no_grad():
graph = torch.cuda.CUDAGraph()
comm = NCCLCommunicator()
# run something in the default stream to initialize torch engine
a = torch.ones((4, 4), device=f'cuda:{comm.rank}')
torch.cuda.synchronize()
with torch.cuda.graph(graph, stream=comm.stream):
# operation during the graph capture is recorded but not executed
# see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#creating-a-graph-using-stream-capture # noqa
comm.all_reduce(a)
comm.stream.synchronize()
assert a.mean().cpu().item() == comm.world_size**0
graph.replay()
comm.stream.synchronize()
assert a.mean().cpu().item() == comm.world_size**1
@pytest.mark.skipif(torch.cuda.device_count() < 2,
reason="Need at least 2 GPUs to run the test.")
def test_pynccl_with_cudagraph():
distributed_run(worker_fn_with_cudagraph, 2)
def test_ncclGetUniqueId():
unique_id = ncclGetUniqueId()
# `list(unique_id.internal)` is something like this:
# [34, -16, 23, 83, 109, -19, 59, 95, 2, 0, -86, 55, 10, -128, 0, 29, 0,
# 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
# 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
# 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
# 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
# 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
# as long as the function doesn't raise an exception, we're good
assert unique_id is not None
import pytest
from vllm.engine.arg_utils import EngineArgs
from vllm.engine.llm_engine import LLMEngine
from vllm.sampling_params import SamplingParams
@pytest.mark.parametrize("model", ["facebook/opt-125m"])
@pytest.mark.parametrize("block_size", [16])
def test_computed_prefix_blocks(model: str, block_size: int):
# This test checks if we are able to run the engine to completion
# without triggering asserts.
# We are in a scenario where all blocks from the second request's prompt
# are full and already computed when the second request arrives.
prompt = (
"You are a helpful assistant. How do I build a car from cardboard and "
"paper clips? Is there an easy to follow video tutorial available "
"online for free?")
prompt2 = (
" Please recommend to me some resources where I can learn not only to "
"handle technical difficulties of building a car, but also "
"decoration.")
engine_args = EngineArgs(model=model,
block_size=block_size,
enable_prefix_caching=True)
engine = LLMEngine.from_engine_args(engine_args)
sampling_params = SamplingParams()
engine.add_request("0", prompt + prompt2, sampling_params)
engine.step()
engine.add_request("1", prompt, sampling_params)
engine.step()
import pytest
from transformers import AutoTokenizer
from vllm.transformers_utils.tokenizer import detokenize_incrementally
TRUTH = [
"Hello here, this is a simple test", # noqa: E501
"vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs. It is designed to be used in production environments, where inference and serving", # noqa: E501
"我很感谢你的热情" # noqa: E501
]
TOKENIZERS = [
"facebook/opt-125m",
"gpt2",
"bigcode/tiny_starcoder_py",
"EleutherAI/gpt-j-6b",
"EleutherAI/pythia-70m",
"bigscience/bloom-560m",
"mosaicml/mpt-7b",
"tiiuae/falcon-7b",
"meta-llama/Llama-2-7b-hf",
"codellama/CodeLlama-7b-hf",
]
def _run_incremental_decode(tokenizer, all_input_ids,
skip_special_tokens: bool):
decoded_text = ""
offset = 0
token_offset = 0
prev_tokens = None
for i in range(len(all_input_ids)):
new_tokens, text, offset, token_offset = detokenize_incrementally(
tokenizer,
all_input_ids[:i + 1],
prev_tokens,
offset,
token_offset,
skip_special_tokens=skip_special_tokens)
decoded_text += text
if prev_tokens is None:
prev_tokens = new_tokens
else:
prev_tokens += new_tokens
return decoded_text
@pytest.mark.parametrize("truth", TRUTH)
@pytest.mark.parametrize("tokenizer_id", TOKENIZERS)
@pytest.mark.parametrize("skip_special_tokens", (True, False))
def test_decode_streaming(tokenizer_id, truth, skip_special_tokens):
tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)
all_input_ids = tokenizer(truth, add_special_tokens=False)["input_ids"]
if skip_special_tokens:
all_input_ids = ([tokenizer.bos_token_id]
if tokenizer.bos_token_id is not None else
[]) + all_input_ids + [tokenizer.eos_token_id]
decoded_text = _run_incremental_decode(
tokenizer, all_input_ids, skip_special_tokens=skip_special_tokens)
assert decoded_text == truth
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment