Unverified Commit 81ede99c authored by Kuntai Du's avatar Kuntai Du Committed by GitHub
Browse files

[Core] Deprecating block manager v1 and make block manager v2 default (#8704)

Removing the block manager v1. This is the initial piece of prefix-caching-centric design. In order to achieve prefix-caching-centric design, we need to simplify the code path so that we only use v2 block manager (which has much higher performance on prefix caching).
parent 5eda21e7
......@@ -77,8 +77,8 @@ steps:
- vllm/
- tests/basic_correctness/test_chunked_prefill
commands:
- VLLM_ATTENTION_BACKEND=XFORMERS VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s basic_correctness/test_chunked_prefill.py
- VLLM_ATTENTION_BACKEND=FLASH_ATTN VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s basic_correctness/test_chunked_prefill.py
- VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
- VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
- label: Core Test # 10min
mirror_hardwares: [amd]
......@@ -88,11 +88,7 @@ steps:
- vllm/distributed
- tests/core
commands:
- VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s core/test_scheduler.py
- VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s core core/test_chunked_prefill_scheduler.py
- VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s core core/block/e2e/test_correctness.py
- VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s core core/block/e2e/test_correctness_sliding_window.py
- pytest -v -s core --ignore=core/block/e2e/test_correctness.py --ignore=core/test_scheduler.py --ignore=core/test_chunked_prefill_scheduler.py --ignore=core/block/e2e/test_correctness.py --ignore=core/block/e2e/test_correctness_sliding_window.py
- pytest -v -s core
- label: Entrypoints Test # 40min
working_dir: "/vllm-workspace/tests"
......@@ -192,8 +188,7 @@ steps:
- vllm/
- tests/prefix_caching
commands:
- VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s prefix_caching/test_prefix_caching.py
- pytest -v -s prefix_caching --ignore=prefix_caching/test_prefix_caching.py
- pytest -v -s prefix_caching
- label: Samplers Test # 36min
source_file_dependencies:
......@@ -217,8 +212,7 @@ steps:
- tests/spec_decode
commands:
- pytest -v -s spec_decode/e2e/test_multistep_correctness.py
- VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s spec_decode/e2e/test_compatibility.py
- VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py --ignore=spec_decode/e2e/test_compatibility.py
- VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py
- label: LoRA Test %N # 15min each
mirror_hardwares: [amd]
......@@ -405,7 +399,7 @@ steps:
- pytest -v -s ./compile/test_basic_correctness.py
- pytest -v -s ./compile/test_wrapper.py
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep -q 'Same node test passed'
- TARGET_TEST_SUITE=L4 VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest basic_correctness/ -v -s -m distributed_2_gpus
- TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m distributed_2_gpus
# Avoid importing model tests that cause CUDA reinitialization error
- pytest models/encoder_decoder/language/test_bart.py -v -s -m distributed_2_gpus
- pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m distributed_2_gpus
......
......@@ -38,7 +38,6 @@ def main(args: argparse.Namespace):
quantization_param_path=args.quantization_param_path,
device=args.device,
ray_workers_use_nsight=args.ray_workers_use_nsight,
use_v2_block_manager=args.use_v2_block_manager,
enable_chunked_prefill=args.enable_chunked_prefill,
download_dir=args.download_dir,
block_size=args.block_size,
......@@ -221,9 +220,6 @@ if __name__ == '__main__':
parser.add_argument("--enable-prefix-caching",
action='store_true',
help="Enable automatic prefix caching")
parser.add_argument('--use-v2-block-manager',
action='store_true',
default=EngineArgs.use_v2_block_manager)
parser.add_argument(
"--ray-workers-use-nsight",
action='store_true',
......
......@@ -33,7 +33,6 @@ from typing import List, Optional, Tuple
from transformers import PreTrainedTokenizerBase
from vllm import LLM, SamplingParams
from vllm.engine.arg_utils import EngineArgs
from vllm.utils import FlexibleArgumentParser
try:
......@@ -134,7 +133,6 @@ def main(args):
tokenizer_mode='auto',
trust_remote_code=True,
enforce_eager=True,
use_v2_block_manager=args.use_v2_block_manager,
tensor_parallel_size=args.tensor_parallel_size,
enable_prefix_caching=args.enable_prefix_caching)
......@@ -176,10 +174,6 @@ if __name__ == "__main__":
parser.add_argument('--enable-prefix-caching',
action='store_true',
help='enable prefix caching')
parser.add_argument('--use-v2-block-manager',
action='store_true',
default=EngineArgs.use_v2_block_manager,
help='Use BlockSpaceMangerV2')
parser.add_argument('--num-prompts',
type=int,
default=1,
......
......@@ -86,7 +86,6 @@ def run_vllm(
distributed_executor_backend: Optional[str],
gpu_memory_utilization: float = 0.9,
num_scheduler_steps: int = 1,
use_v2_block_manager: bool = False,
download_dir: Optional[str] = None,
load_format: str = EngineArgs.load_format,
disable_async_output_proc: bool = False,
......@@ -113,7 +112,6 @@ def run_vllm(
distributed_executor_backend=distributed_executor_backend,
load_format=load_format,
num_scheduler_steps=num_scheduler_steps,
use_v2_block_manager=use_v2_block_manager,
disable_async_output_proc=disable_async_output_proc,
)
......@@ -176,7 +174,6 @@ async def run_vllm_async(
distributed_executor_backend: Optional[str],
gpu_memory_utilization: float = 0.9,
num_scheduler_steps: int = 1,
use_v2_block_manager: bool = False,
download_dir: Optional[str] = None,
load_format: str = EngineArgs.load_format,
disable_async_output_proc: bool = False,
......@@ -204,7 +201,6 @@ async def run_vllm_async(
distributed_executor_backend=distributed_executor_backend,
load_format=load_format,
num_scheduler_steps=num_scheduler_steps,
use_v2_block_manager=use_v2_block_manager,
disable_async_output_proc=disable_async_output_proc,
worker_use_ray=False,
disable_log_requests=True,
......@@ -341,8 +337,7 @@ def main(args: argparse.Namespace):
args.enable_prefix_caching, args.enable_chunked_prefill,
args.max_num_batched_tokens, args.distributed_executor_backend,
args.gpu_memory_utilization, args.num_scheduler_steps,
args.use_v2_block_manager, args.download_dir, args.load_format,
args.disable_async_output_proc
args.download_dir, args.load_format, args.disable_async_output_proc
]
if args.async_engine:
......@@ -471,10 +466,6 @@ if __name__ == "__main__":
type=int,
default=1,
help="Maximum number of forward steps per scheduler call.")
parser.add_argument("--use-v2-block-manager",
action='store_true',
default=EngineArgs.use_v2_block_manager,
help="Enable block manager v2.")
parser.add_argument(
"--enable-prefix-caching",
action='store_true',
......
......@@ -16,7 +16,6 @@ def main(args):
enforce_eager=True,
enable_prefix_caching=True,
tensor_parallel_size=args.tensor_parallel_size,
use_v2_block_manager=args.use_v2_block_manager,
)
sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len)
......@@ -56,8 +55,5 @@ if __name__ == "__main__":
parser.add_argument('--enable-prefix-caching',
action='store_true',
help='enable prefix caching')
parser.add_argument('--use-v2-block-manager',
action='store_true',
help='Use BlockSpaceMangerV2')
args = parser.parse_args()
main(args)
......@@ -30,7 +30,6 @@ The following code configures vLLM in an offline mode to use speculative decodin
tensor_parallel_size=1,
speculative_model="facebook/opt-125m",
num_speculative_tokens=5,
use_v2_block_manager=True,
)
outputs = llm.generate(prompts, sampling_params)
......@@ -104,7 +103,6 @@ matching n-grams in the prompt. For more information read `this thread. <https:/
speculative_model="[ngram]",
num_speculative_tokens=5,
ngram_prompt_lookup_max=4,
use_v2_block_manager=True,
)
outputs = llm.generate(prompts, sampling_params)
......@@ -135,7 +133,6 @@ For more information see `this blog <https://pytorch.org/blog/hitchhikers-guide-
tensor_parallel_size=4,
speculative_model="ibm-fms/llama3-70b-accelerator",
speculative_draft_tensor_parallel_size=1,
use_v2_block_manager=True,
)
outputs = llm.generate(prompts, sampling_params)
......
......@@ -50,8 +50,6 @@ if __name__ == "__main__":
llm = LLM(
model="meta-llama/Llama-2-13b-chat-hf",
speculative_model="ibm-fms/llama-13b-accelerator",
# These are currently required for MLPSpeculator decoding
use_v2_block_manager=True,
)
print("With speculation")
......
......@@ -12,7 +12,7 @@ from contextlib import nullcontext
import pytest
from ..models.utils import check_logprobs_close, check_outputs_equal
from ..utils import check_deprecated_block_manager_usage, multi_gpu_test
from ..utils import multi_gpu_test
MODELS = [
"facebook/opt-125m",
......@@ -20,12 +20,6 @@ MODELS = [
]
@pytest.fixture(scope="module", autouse=True)
def check_deprecated_block_manager():
check_deprecated_block_manager_usage(
'tests/basic_correctness/test_chunked_prefill.py')
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [32])
......@@ -197,7 +191,6 @@ def test_models_with_fp8_kv_cache(
@pytest.mark.parametrize("max_tokens", [16])
@pytest.mark.parametrize("enforce_eager", [False])
@pytest.mark.parametrize("chunk_size", [30, 32])
@pytest.mark.parametrize("use_v2_block_manager", [False, True])
# NOTE: Increasing this in this suite will fail CI because we currently cannot
# reset distributed env properly. Use a value > 1 just when you test.
@pytest.mark.parametrize("tensor_parallel_size", [1])
......@@ -206,7 +199,6 @@ def test_with_prefix_caching(
max_tokens: int,
enforce_eager: bool,
chunk_size: int,
use_v2_block_manager: bool,
tensor_parallel_size: int,
) -> None:
"""
......@@ -234,7 +226,6 @@ def test_with_prefix_caching(
enable_chunked_prefill=True,
enable_prefix_caching=enable,
tensor_parallel_size=tensor_parallel_size,
use_v2_block_manager=use_v2_block_manager,
enforce_eager=enforce_eager,
max_num_seqs=max_num_seqs,
) as vllm_model:
......
......@@ -2,18 +2,11 @@ from itertools import cycle
import pytest
from tests.utils import check_deprecated_block_manager_usage
from vllm import SamplingParams
from .conftest import get_token_ids_from_llm_generator
@pytest.fixture(scope="module", autouse=True)
def check_deprecated_block_manager():
check_deprecated_block_manager_usage(
'tests/core/block/e2e/test_correctness.py')
@pytest.mark.parametrize(
"common_llm_kwargs",
[{
......@@ -28,32 +21,32 @@ def check_deprecated_block_manager():
"num_gpu_blocks_override": 5 * (64 + 1),
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{
"use_v2_block_manager": False
}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [{
"use_v2_block_manager": True,
"preemption_mode": "swap"
}, {
"use_v2_block_manager": True,
"preemption_mode": "recompute"
}])
@pytest.mark.parametrize("batch_size", [10])
@pytest.mark.parametrize("seed", [1])
def test_v1_v2_greedy_equality_with_preemption(baseline_llm_generator,
test_llm_generator, batch_size):
"""Verify block manager v2 produces same outputs as block manager v1, even
when there is preemption.
def test_block_manager_with_preemption(baseline_llm_generator,
test_llm_generator, batch_size):
"""Verify block manager produces same outputs even when there is preemption.
This constructs two LLM, each with limited number of GPU blocks. The limit
is decided such that as the sequences in the batch grow, sequences must be
preempted and removed from cache.
If the output token ids are equivalent, then we have confidence that the KV
cache is not corrupted in the v2 block manager.
cache is not corrupted.
NOTE: We want a significant number of generated tokens so that any incorrect
KV mapping has time to build up error.
NOTE(Kuntai): Though we have removed block manager v1, this test is still
useful as it asserts the behavior of block manager v2 (now it is called
SelfAttnBlockSpaceManager) is the same when swapping / preemption, so we
keep this test.
"""
output_len = 1024
temperature = 0.0
......@@ -77,11 +70,9 @@ def test_v1_v2_greedy_equality_with_preemption(baseline_llm_generator,
temperature=temperature,
)
print('Getting token ids from block manager v1')
baseline_token_ids = get_token_ids_from_llm_generator(
baseline_llm_generator, prompts, sampling_params)
print('Getting token ids from block manager v2')
test_token_ids = get_token_ids_from_llm_generator(test_llm_generator,
prompts, sampling_params)
......@@ -104,9 +95,6 @@ def test_v1_v2_greedy_equality_with_preemption(baseline_llm_generator,
# skip cuda graph creation for fast test.
"enforce_eager": True,
# Lookahead scheduling only supported in v2 block manager.
"use_v2_block_manager": True,
}])
@pytest.mark.parametrize(
"per_test_common_llm_kwargs",
......@@ -218,26 +206,22 @@ def test_lookahead_greedy_equality_with_preemption(baseline_llm_generator,
"max_num_seqs": 10,
}])
@pytest.mark.parametrize("baseline_llm_kwargs", [
{
"use_v2_block_manager": False,
},
{},
])
@pytest.mark.parametrize("test_llm_kwargs", [
{
"use_v2_block_manager": True,
"num_lookahead_slots": 0,
},
{
"use_v2_block_manager": True,
"num_lookahead_slots": 5,
},
])
@pytest.mark.parametrize("batch_size", [4])
@pytest.mark.parametrize("seed", [1])
def test_chunked_prefill_block_manager_v2(baseline_llm_generator,
test_llm_generator, batch_size):
"""Verify that chunked prefill works with BlockManagerV2, with and without
lookahead scheduling.
def test_chunked_prefill_block_manager(baseline_llm_generator,
test_llm_generator, batch_size):
"""Verify that chunked prefill works with SelfAttnBlockSpaceManager,
with and without lookahead scheduling.
"""
output_len = 32
temperature = 0.0
......@@ -258,11 +242,11 @@ def test_chunked_prefill_block_manager_v2(baseline_llm_generator,
temperature=temperature,
)
print('Getting token ids with BlockManagerV1')
print('Getting token ids with BlockManager')
baseline_token_ids = get_token_ids_from_llm_generator(
baseline_llm_generator, prompts, sampling_params)
print('Getting token ids with BlockManagerV2')
print('Getting token ids with BlockManager, with lookahead slots.')
test_token_ids = get_token_ids_from_llm_generator(test_llm_generator,
prompts, sampling_params)
......@@ -290,32 +274,32 @@ def test_chunked_prefill_block_manager_v2(baseline_llm_generator,
"enable_prefix_caching": True,
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{
"use_v2_block_manager": False
}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [{
"use_v2_block_manager": True,
"preemption_mode": "swap"
}, {
"use_v2_block_manager": True,
"preemption_mode": "recompute"
}])
@pytest.mark.parametrize("batch_size", [10])
@pytest.mark.parametrize("seed", [1])
def test_v1_v2_greedy_equality_prefix_caching_enabled_with_preemption(
def test_block_manager_prefix_caching_enabled_with_preemption(
baseline_llm_generator, test_llm_generator, batch_size):
"""Verify block manager v2 produces same outputs as block manager v1, even
when there is preemption.
"""Verify block manager produces same outputs even when there is preemption.
This constructs two LLM, each with limited number of GPU blocks. The limit
is decided such that as the sequences in the batch grow, sequences must be
preempted and removed from cache.
If the output token ids are equivalent, then we have confidence that the KV
cache is not corrupted in the v2 block manager.
cache is not corrupted.
NOTE: We want a significant number of generated tokens so that any incorrect
KV mapping has time to build up error.
NOTE(Kuntai): Though we have removed block manager v1, this test is still
useful as it asserts the behavior of block manager v2 (now it is called
SelfAttnBlockSpaceManager) is the same when swapping / preemption, so we
keep this test.
"""
output_len = 1024
temperature = 0.0
......@@ -339,11 +323,11 @@ def test_v1_v2_greedy_equality_prefix_caching_enabled_with_preemption(
temperature=temperature,
)
print('Getting token ids from block manager v1')
print('Getting token ids from block manager')
baseline_token_ids = get_token_ids_from_llm_generator(
baseline_llm_generator, prompts, sampling_params)
print('Getting token ids from block manager v2')
print('Getting token ids from block manager, with preemption')
test_token_ids = get_token_ids_from_llm_generator(test_llm_generator,
prompts, sampling_params)
......@@ -366,9 +350,6 @@ def test_v1_v2_greedy_equality_prefix_caching_enabled_with_preemption(
# Allow only 5 sequences of ~1024 tokens in worst case.
"block_size": 16,
"num_gpu_blocks_override": 5 * (64 + 1),
# Test APC in v2 block
"use_v2_block_manager": True,
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{
......@@ -444,9 +425,6 @@ def test_auto_prefix_caching_with_preemption(baseline_llm_generator,
"max_model_len": 48,
"block_size": 16,
"num_gpu_blocks_override": 3,
# Test APC in v2 block
"use_v2_block_manager": True,
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{
......
......@@ -3,7 +3,6 @@ from typing import List
import pytest
from tests.utils import check_deprecated_block_manager_usage
from vllm import LLM, SamplingParams
from .conftest import get_text_from_llm_generator
......@@ -13,12 +12,6 @@ MODEL = "bigcode/starcoder2-3b"
BLOCK_SIZE = 16
@pytest.fixture(scope="module", autouse=True)
def check_deprecated_block_manager():
check_deprecated_block_manager_usage(
'tests/core/block/e2e/test_correctness_sliding_window.py')
@pytest.mark.parametrize(
"common_llm_kwargs",
[{
......@@ -31,10 +24,8 @@ def check_deprecated_block_manager():
"num_gpu_blocks_override": 100000 // BLOCK_SIZE,
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{
"use_v2_block_manager": False
}])
@pytest.mark.parametrize("test_llm_kwargs", [{"use_v2_block_manager": True}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [{}])
@pytest.mark.parametrize("batch_size", [5])
@pytest.mark.parametrize("seed", [1])
def test_sliding_window_retrival(baseline_llm_generator, test_llm_generator,
......@@ -55,7 +46,6 @@ def test_sliding_window_retrival(baseline_llm_generator, test_llm_generator,
prompts, answer, indices = prep_prompts(batch_size)
print('Getting token ids from block manager v1')
baseline_texts = get_text_from_llm_generator(baseline_llm_generator,
prompts,
sampling_params,
......@@ -91,10 +81,7 @@ def test_sliding_window_retrival(baseline_llm_generator, test_llm_generator,
"num_gpu_blocks_override": 100000 // BLOCK_SIZE,
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [{
"use_v2_block_manager": True,
"enable_chunked_prefill": True
}])
@pytest.mark.parametrize("test_llm_kwargs", [{"enable_chunked_prefill": True}])
@pytest.mark.parametrize("batch_size", [5])
@pytest.mark.parametrize("seed", [1])
def test_sliding_window_chunked_prefill(test_llm_generator, batch_size, seed):
......
......@@ -2,7 +2,7 @@ import pytest
from vllm.core.block.utils import (STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE,
STR_NOT_IMPL_ENC_DEC_SWA)
from vllm.core.block_manager_v2 import BlockSpaceManagerV2
from vllm.core.block_manager import SelfAttnBlockSpaceManager
from vllm.core.interfaces import AllocStatus
from vllm.sequence import Logprob, SequenceStatus
from vllm.utils import chunk_list
......@@ -17,7 +17,7 @@ from ..utils import (create_dummy_prompt, create_seq_group,
@pytest.mark.parametrize("watermark", [0.0, 0.5])
def test_can_allocate_seq_group(block_size: int, num_seqs_per_group: int,
num_gpu_blocks: int, watermark: float):
block_manager = BlockSpaceManagerV2(
block_manager = SelfAttnBlockSpaceManager(
block_size=block_size,
num_gpu_blocks=num_gpu_blocks,
num_cpu_blocks=1024,
......@@ -63,7 +63,7 @@ def test_can_allocate_seq_group_encoder_decoder(block_size: int,
num_seqs_per_group: int,
num_gpu_blocks: int,
watermark: float):
block_manager = BlockSpaceManagerV2(
block_manager = SelfAttnBlockSpaceManager(
block_size=block_size,
num_gpu_blocks=num_gpu_blocks,
num_cpu_blocks=1024,
......@@ -117,16 +117,16 @@ def test_can_allocate_encoder_decoder_fails_with_swa(block_size: int,
'''
SWA short for Sliding Window Attention.
At time of writing block manager v2 does not support SWA.
At time of writing block manager does not support SWA.
However even when SWA is implemented for block manager v2,
However even when SWA is implemented for block manager,
there will still most likely be a separate workstream required
to enable SWA for encoder/decoder models.
Therefore this test enforces that one of the following cases
hold true:
1. Block manager v2 does not support SWA at all (true at time of writing)
2. Block manager v2 fails with NotImplementError when SWA is enabled
1. Block manager does not support SWA at all (true at time of writing)
2. Block manager fails with NotImplementError when SWA is enabled
AND a SequenceGroup with an encoder sequence (i.e. in support of an
encoder/decoder model) is passed into can_allocate() as an argument
......@@ -135,7 +135,7 @@ def test_can_allocate_encoder_decoder_fails_with_swa(block_size: int,
'''
with pytest.raises((NotImplementedError, AssertionError)) as exc_info:
block_manager = BlockSpaceManagerV2(
block_manager = SelfAttnBlockSpaceManager(
block_size=block_size,
num_gpu_blocks=num_gpu_blocks,
num_cpu_blocks=1024,
......@@ -158,7 +158,7 @@ def test_can_allocate_encoder_decoder_fails_with_swa(block_size: int,
block_manager.can_allocate(seq_group)
# Assert that either
# 1. Block manager v2 constructor fails with assertion that sliding window
# 1. Block manager constructor fails with assertion that sliding window
# is not yet supported (most likely near-term outcome at time of
# writing), or
# 2. can_allocate() fails with NotImplementedError due to combination of
......@@ -177,7 +177,7 @@ def test_can_allocate_encoder_decoder_fails_with_prefix_cache(
block_size: int, num_seqs_per_group: int, num_gpu_blocks: int,
watermark: float):
block_manager = BlockSpaceManagerV2(
block_manager = SelfAttnBlockSpaceManager(
block_size=block_size,
num_gpu_blocks=num_gpu_blocks,
num_cpu_blocks=1024,
......@@ -217,7 +217,7 @@ def test_append_slots(block_size, prompt_len, num_slots_to_append,
num_gpu_blocks = 1024
watermark = 0.1
block_manager = BlockSpaceManagerV2(
block_manager = SelfAttnBlockSpaceManager(
block_size=block_size,
num_gpu_blocks=num_gpu_blocks,
num_cpu_blocks=0,
......@@ -269,14 +269,15 @@ def test_swap(block_size, num_cpu_blocks, num_gpu_blocks, num_lookahead_slots,
"""Verify blocks number on src/desc device is correct after swapping in/out
sequence group (not missing or extra blocks).
"""
block_manager = BlockSpaceManagerV2(block_size,
num_cpu_blocks,
num_gpu_blocks,
watermark=0,
enable_caching=enable_caching)
block_manager = SelfAttnBlockSpaceManager(block_size,
num_cpu_blocks,
num_gpu_blocks,
watermark=0,
enable_caching=enable_caching)
prompt, seq_group = create_dummy_prompt("1", prompt_length=block_size - 1)
prompt.status = SequenceStatus.WAITING
block_manager.allocate(seq_group)
# Emulate a forward pass by appending a single token.
# The block manager then knows how many unprocessed
# tokens will be written in the next forward pass.
......@@ -321,11 +322,11 @@ def test_can_swap(block_size, num_gpu_blocks, num_lookahead_slots,
can be swapped in/out.
"""
num_cpu_blocks = num_gpu_blocks
block_manager = BlockSpaceManagerV2(block_size,
num_cpu_blocks,
num_gpu_blocks,
watermark=0,
enable_caching=enable_caching)
block_manager = SelfAttnBlockSpaceManager(block_size,
num_cpu_blocks,
num_gpu_blocks,
watermark=0,
enable_caching=enable_caching)
prompt, seq_group = create_dummy_prompt(
"1", prompt_length=(num_gpu_blocks - 1) * block_size - 1)
prompt.status = SequenceStatus.WAITING
......@@ -382,11 +383,11 @@ def test_swap_in_infeasible(num_lookahead_slots, enable_caching):
block_size = 8
num_cpu_blocks = 1
num_gpu_blocks = 1
block_manager = BlockSpaceManagerV2(block_size,
num_cpu_blocks,
num_gpu_blocks,
watermark=0,
enable_caching=enable_caching)
block_manager = SelfAttnBlockSpaceManager(block_size,
num_cpu_blocks,
num_gpu_blocks,
watermark=0,
enable_caching=enable_caching)
prompt_length = block_size - 3
assert prompt_length > 0
prompt, seq_group = create_dummy_prompt("1", prompt_length=prompt_length)
......@@ -434,7 +435,7 @@ def test_sliding_window(block_size, prompt_len, num_slots_to_append,
num_gpu_blocks = 1024
watermark = 0.1
block_manager = BlockSpaceManagerV2(
block_manager = SelfAttnBlockSpaceManager(
block_size=block_size,
num_gpu_blocks=num_gpu_blocks,
num_cpu_blocks=0,
......@@ -474,7 +475,7 @@ def test_sliding_window(block_size, prompt_len, num_slots_to_append,
seq.data.update_num_computed_tokens(prompt_len)
check_used(num_blocks(prompt_len))
# this is how we compute it in BlockSpaceManagerV2.__init__
# this is how we compute it in SelfAttnBlockSpaceManager.__init__
sliding_blocks = (sliding_window // block_size) + 2
# plus one block for null block
sliding_blocks += 1
......
This diff is collapsed.
......@@ -8,7 +8,6 @@ from vllm.core.interfaces import AllocStatus
from vllm.core.scheduler import Scheduler
from vllm.sequence import Logprob, SequenceGroup
from ..utils import check_deprecated_block_manager_usage
from .utils import create_dummy_prompt
......@@ -28,25 +27,16 @@ def schedule_and_update_computed_tokens(scheduler):
return metas, out
@pytest.fixture(scope="module", autouse=True)
def check_deprecated_block_manager():
check_deprecated_block_manager_usage(
'tests/core/test_chunked_prefill_scheduler.py')
@pytest.mark.parametrize('use_v2_block_manager', [True, False])
def test_simple(use_v2_block_manager: bool):
def test_simple():
"""Verify basic scheduling works."""
block_size = 4
num_seq_group = 4
max_model_len = 16
max_num_batched_tokens = 64
scheduler_config = SchedulerConfig(
max_num_batched_tokens,
num_seq_group,
max_model_len,
enable_chunked_prefill=True,
use_v2_block_manager=use_v2_block_manager)
scheduler_config = SchedulerConfig(max_num_batched_tokens,
num_seq_group,
max_model_len,
enable_chunked_prefill=True)
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 8
cache_config.num_gpu_blocks = 8
......@@ -81,8 +71,7 @@ def test_simple(use_v2_block_manager: bool):
assert len(seq_group_meta) == num_seq_group
@pytest.mark.parametrize('use_v2_block_manager', [True, False])
def test_chunk(use_v2_block_manager: bool):
def test_chunk():
"""Verify prefills are chunked properly."""
block_size = 4
max_seqs = 60
......@@ -93,7 +82,7 @@ def test_chunk(use_v2_block_manager: bool):
max_seqs,
max_model_len,
enable_chunked_prefill=True,
use_v2_block_manager=use_v2_block_manager)
)
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 32
cache_config.num_gpu_blocks = 32
......@@ -131,8 +120,7 @@ def test_chunk(use_v2_block_manager: bool):
assert out.num_batched_tokens == 57
@pytest.mark.parametrize('use_v2_block_manager', [True, False])
def test_complex(use_v2_block_manager: bool):
def test_complex():
block_size = 4
max_seqs = 60
max_model_len = 80
......@@ -142,7 +130,7 @@ def test_complex(use_v2_block_manager: bool):
max_seqs,
max_model_len,
enable_chunked_prefill=True,
use_v2_block_manager=use_v2_block_manager)
)
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 64
cache_config.num_gpu_blocks = 64
......@@ -201,8 +189,7 @@ def test_complex(use_v2_block_manager: bool):
assert running[2].is_prefill()
@pytest.mark.parametrize('use_v2_block_manager', [True, False])
def test_maximal_decoding(use_v2_block_manager: bool):
def test_maximal_decoding():
"""Verify decoding requests are prioritized."""
block_size = 4
max_seqs = 2
......@@ -213,7 +200,7 @@ def test_maximal_decoding(use_v2_block_manager: bool):
max_seqs,
max_model_len,
enable_chunked_prefill=True,
use_v2_block_manager=use_v2_block_manager)
)
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 8
cache_config.num_gpu_blocks = 8
......@@ -295,8 +282,7 @@ def test_maximal_decoding(use_v2_block_manager: bool):
assert out.num_batched_tokens == 2
@pytest.mark.parametrize('use_v2_block_manager', [True, False])
def test_prompt_limit(use_v2_block_manager: bool):
def test_prompt_limit():
"""Verify max_num_batched_tokens < max_model_len is possible."""
block_size = 4
max_seqs = 32
......@@ -307,7 +293,7 @@ def test_prompt_limit(use_v2_block_manager: bool):
max_seqs,
max_model_len,
enable_chunked_prefill=True,
use_v2_block_manager=use_v2_block_manager)
)
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 16
cache_config.num_gpu_blocks = 16
......@@ -330,8 +316,7 @@ def test_prompt_limit(use_v2_block_manager: bool):
assert out.num_batched_tokens == 32
@pytest.mark.parametrize('use_v2_block_manager', [True, False])
def test_prompt_limit_exceed(use_v2_block_manager: bool):
def test_prompt_limit_exceed():
block_size = 4
max_seqs = 64
max_model_len = 32
......@@ -356,8 +341,7 @@ def test_prompt_limit_exceed(use_v2_block_manager: bool):
assert out.ignored_seq_groups[0] == seq_group
@pytest.mark.parametrize('use_v2_block_manager', [True, False])
def test_swap(use_v2_block_manager: bool):
def test_swap():
"""Verify swapping works with chunked prefill requests"""
block_size = 4
max_seqs = 30
......@@ -368,7 +352,7 @@ def test_swap(use_v2_block_manager: bool):
max_seqs,
max_model_len,
enable_chunked_prefill=True,
use_v2_block_manager=use_v2_block_manager)
)
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 16
cache_config.num_gpu_blocks = 16
......@@ -414,8 +398,7 @@ def test_swap(use_v2_block_manager: bool):
assert out.blocks_to_swap_out == []
@pytest.mark.parametrize('use_v2_block_manager', [True, False])
def test_running_prefill_prioritized_over_swap(use_v2_block_manager: bool):
def test_running_prefill_prioritized_over_swap():
block_size = 4
max_seqs = 30
max_model_len = 200
......@@ -425,7 +408,7 @@ def test_running_prefill_prioritized_over_swap(use_v2_block_manager: bool):
max_seqs,
max_model_len,
enable_chunked_prefill=True,
use_v2_block_manager=use_v2_block_manager)
)
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 32
cache_config.num_gpu_blocks = 32
......@@ -508,8 +491,7 @@ def test_running_prefill_prioritized_over_swap(use_v2_block_manager: bool):
assert out.blocks_to_swap_out == []
@pytest.mark.parametrize('use_v2_block_manager', [True, False])
def test_chunked_prefill_preempt(use_v2_block_manager: bool):
def test_chunked_prefill_preempt():
"""Verify preempt works with chunked prefill requests"""
block_size = 4
max_seqs = 30
......@@ -520,7 +502,7 @@ def test_chunked_prefill_preempt(use_v2_block_manager: bool):
max_seqs,
max_model_len,
enable_chunked_prefill=True,
use_v2_block_manager=use_v2_block_manager)
)
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 16
cache_config.num_gpu_blocks = 16
......@@ -575,8 +557,7 @@ def test_chunked_prefill_preempt(use_v2_block_manager: bool):
assert out.num_batched_tokens == max_num_batched_tokens
@pytest.mark.parametrize('use_v2_block_manager', [True, False])
def test_chunked_prefill_max_seqs(use_v2_block_manager: bool):
def test_chunked_prefill_max_seqs():
block_size = 4
max_seqs = 2
max_model_len = 80
......@@ -586,7 +567,7 @@ def test_chunked_prefill_max_seqs(use_v2_block_manager: bool):
max_seqs,
max_model_len,
enable_chunked_prefill=True,
use_v2_block_manager=use_v2_block_manager)
)
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 128
cache_config.num_gpu_blocks = 128
......@@ -629,8 +610,7 @@ def test_chunked_prefill_max_seqs(use_v2_block_manager: bool):
assert not running[1].is_prefill()
@pytest.mark.parametrize('use_v2_block_manager', [True, False])
def test_perfix_caching(use_v2_block_manager: bool):
def test_perfix_caching():
"""Verify allocating full blocks when prefix caching is enabled."""
block_size = 4
max_seqs = 10
......@@ -641,7 +621,7 @@ def test_perfix_caching(use_v2_block_manager: bool):
max_seqs,
max_model_len,
enable_chunked_prefill=True,
use_v2_block_manager=use_v2_block_manager)
)
cache_config = CacheConfig(block_size,
1.0,
1,
......
......@@ -31,7 +31,6 @@ def test_num_computed_tokens_update(num_scheduler_steps: int,
# Make a vllm engine
runner = VllmRunner(model_name=MODEL,
gpu_memory_utilization=0.7,
use_v2_block_manager=True,
num_scheduler_steps=num_scheduler_steps,
enable_chunked_prefill=enable_chunked_prefill,
enforce_eager=enforce_eager)
......
......@@ -3,7 +3,7 @@ from collections import deque
from typing import List, Set, Tuple
from unittest.mock import MagicMock
import pytest
import pytest # noqa
from torch import Use # noqa
from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig
......@@ -12,23 +12,18 @@ from vllm.core.scheduler import Scheduler, SchedulingBudget
from vllm.lora.request import LoRARequest
from vllm.sequence import SequenceGroup, SequenceStatus
from ..utils import check_deprecated_block_manager_usage
from .utils import (append_new_token, append_new_token_seq_group,
create_dummy_prompt, get_sequence_groups,
schedule_and_update_computed_tokens)
@pytest.fixture(scope="module", autouse=True)
def check_deprecated_block_manager():
check_deprecated_block_manager_usage(
"tests/core/test_chunked_prefill_scheduler.py")
@pytest.mark.parametrize('use_v2_block_manager', [True, False])
def test_scheduler_add_seq_group(use_v2_block_manager: bool):
def test_scheduler_add_seq_group():
block_size = 4
scheduler_config = SchedulerConfig(
100, 64, 1, use_v2_block_manager=use_v2_block_manager)
100,
64,
1,
)
cache_config = CacheConfig(block_size, 1.0, 1, cache_dtype="auto")
cache_config.num_cpu_blocks = 4
cache_config.num_gpu_blocks = 4
......@@ -44,11 +39,13 @@ def test_scheduler_add_seq_group(use_v2_block_manager: bool):
assert scheduler.get_num_unfinished_seq_groups() == i + 1
@pytest.mark.parametrize('use_v2_block_manager', [True, False])
def test_scheduler_abort_seq_group(use_v2_block_manager: bool):
def test_scheduler_abort_seq_group():
block_size = 4
scheduler_config = SchedulerConfig(
100, 64, 1, use_v2_block_manager=use_v2_block_manager)
100,
64,
1,
)
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 4
cache_config.num_gpu_blocks = 4
......@@ -68,8 +65,7 @@ def test_scheduler_abort_seq_group(use_v2_block_manager: bool):
assert scheduler.get_num_unfinished_seq_groups() == 0
@pytest.mark.parametrize('use_v2_block_manager', [True, False])
def test_scheduler_schedule_simple(use_v2_block_manager: bool):
def test_scheduler_schedule_simple():
block_size = 4
num_seq_group = 4
max_model_len = 16
......@@ -77,7 +73,7 @@ def test_scheduler_schedule_simple(use_v2_block_manager: bool):
64,
num_seq_group,
max_model_len,
use_v2_block_manager=use_v2_block_manager)
)
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 8
cache_config.num_gpu_blocks = 8
......@@ -112,8 +108,7 @@ def test_scheduler_schedule_simple(use_v2_block_manager: bool):
append_new_token(out, 1)
@pytest.mark.parametrize('use_v2_block_manager', [True, False])
def test_scheduler_prefill_prioritized(use_v2_block_manager: bool):
def test_scheduler_prefill_prioritized():
"""Verify running batched tokens are not applied to prefill requests."""
block_size = 4
max_model_len = 30
......@@ -122,7 +117,7 @@ def test_scheduler_prefill_prioritized(use_v2_block_manager: bool):
max_batched_num_tokens,
2,
max_model_len,
use_v2_block_manager=use_v2_block_manager)
)
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 16
cache_config.num_gpu_blocks = 16
......@@ -146,12 +141,14 @@ def test_scheduler_prefill_prioritized(use_v2_block_manager: bool):
assert get_sequence_groups(out) == [seq_group_b]
@pytest.mark.parametrize('use_v2_block_manager', [True, False])
def test_scheduler_schedule_preempt_abort(use_v2_block_manager: bool):
def test_scheduler_schedule_preempt_abort():
block_size = 4
max_model_len = 16
scheduler_config = SchedulerConfig(
64, 2, max_model_len, use_v2_block_manager=use_v2_block_manager)
64,
2,
max_model_len,
)
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 2
cache_config.num_gpu_blocks = 2
......@@ -201,8 +198,7 @@ def test_scheduler_schedule_preempt_abort(use_v2_block_manager: bool):
assert scheduler.get_num_unfinished_seq_groups() == 1
@pytest.mark.parametrize('use_v2_block_manager', [True, False])
def test_scheduler_max_seqs(use_v2_block_manager: bool):
def test_scheduler_max_seqs():
block_size = 4
num_seq_group = 4
max_seq_group = 2
......@@ -211,7 +207,7 @@ def test_scheduler_max_seqs(use_v2_block_manager: bool):
64,
max_seq_group,
max_model_len,
use_v2_block_manager=use_v2_block_manager)
)
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 8
cache_config.num_gpu_blocks = 8
......@@ -249,15 +245,14 @@ def test_scheduler_max_seqs(use_v2_block_manager: bool):
assert set(get_sequence_groups(out)) == set([all_seq_groups[1]])
@pytest.mark.parametrize('use_v2_block_manager', [True, False])
def test_scheduler_delay_factor(use_v2_block_manager: bool):
def test_scheduler_delay_factor():
block_size = 4
scheduler_config = SchedulerConfig(
100,
64,
16,
delay_factor=0.5,
use_v2_block_manager=use_v2_block_manager)
)
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 8
cache_config.num_gpu_blocks = 8
......@@ -294,12 +289,10 @@ def test_scheduler_delay_factor(use_v2_block_manager: bool):
append_new_token(out, 1)
@pytest.mark.parametrize('use_v2_block_manager', [True, False])
def test_swapped_out_prioritized(use_v2_block_manager: bool):
def test_swapped_out_prioritized():
block_size = 4
scheduler = initialize_scheduler(max_num_seqs=6,
block_size=block_size,
use_v2_block_manager=use_v2_block_manager,
num_cpu_blocks=64,
num_gpu_blocks=64)
# best_of=2 * 3 == 6 sequences.
......@@ -351,7 +344,6 @@ def initialize_scheduler(
max_token_budget=1000,
max_model_len=1000,
lora_config=None,
use_v2_block_manager=False,
block_size=4,
num_cpu_blocks=8,
num_gpu_blocks=8,
......@@ -361,7 +353,7 @@ def initialize_scheduler(
max_token_budget,
max_num_seqs,
max_model_len,
use_v2_block_manager=use_v2_block_manager)
)
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = num_cpu_blocks
cache_config.num_gpu_blocks = num_gpu_blocks
......@@ -386,15 +378,12 @@ def add_token_budget(budget: SchedulingBudget,
budget.add_num_seqs(mock_seq_group.request_id, num_curr_seqs)
@pytest.mark.parametrize('use_v2_block_manager', [True, False])
def test_prefill_schedule_max_prompt_len(use_v2_block_manager: bool):
def test_prefill_schedule_max_prompt_len():
"""
Test prompt longer than max_prompt_len is aborted.
"""
block_size = 4
scheduler = initialize_scheduler(max_model_len=30,
use_v2_block_manager=use_v2_block_manager,
block_size=block_size)
scheduler = initialize_scheduler(max_model_len=30, block_size=block_size)
_, seq_group = create_dummy_prompt("0",
prompt_length=60,
block_size=block_size)
......@@ -409,14 +398,12 @@ def test_prefill_schedule_max_prompt_len(use_v2_block_manager: bool):
assert len(remaining_waiting) == 0
@pytest.mark.parametrize('use_v2_block_manager', [True, False])
def test_prefill_schedule_token_budget(use_v2_block_manager: bool):
def test_prefill_schedule_token_budget():
"""
Test token budget respected.
"""
block_size = 4
scheduler = initialize_scheduler(use_v2_block_manager=use_v2_block_manager,
block_size=block_size,
scheduler = initialize_scheduler(block_size=block_size,
num_cpu_blocks=64,
num_gpu_blocks=64)
budget = create_token_budget(token_budget=0)
......@@ -446,8 +433,7 @@ def test_prefill_schedule_token_budget(use_v2_block_manager: bool):
assert len(remaining_waiting) == 1
# Test when current_batched_tokens respected.
scheduler = initialize_scheduler(use_v2_block_manager=use_v2_block_manager,
block_size=block_size,
scheduler = initialize_scheduler(block_size=block_size,
num_cpu_blocks=16,
num_gpu_blocks=16)
budget = create_token_budget(token_budget=60)
......@@ -474,14 +460,12 @@ def test_prefill_schedule_token_budget(use_v2_block_manager: bool):
assert len(remaining_waiting) == 0
@pytest.mark.parametrize('use_v2_block_manager', [True, False])
def test_prefill_schedule_max_seqs(use_v2_block_manager: bool):
def test_prefill_schedule_max_seqs():
"""
Test max seq respected.
"""
block_size = 4
scheduler = initialize_scheduler(use_v2_block_manager=use_v2_block_manager,
block_size=block_size,
scheduler = initialize_scheduler(block_size=block_size,
num_cpu_blocks=64,
num_gpu_blocks=64)
budget = create_token_budget(max_num_seqs=2)
......@@ -515,15 +499,13 @@ def test_prefill_schedule_max_seqs(use_v2_block_manager: bool):
assert len(remaining_waiting) == 1
@pytest.mark.parametrize('use_v2_block_manager', [True, False])
def test_prefill_schedule_max_lora(use_v2_block_manager: bool):
def test_prefill_schedule_max_lora():
"""
Test max lora is respected and prioritized.
"""
block_size = 4
lora_config = LoRAConfig(max_lora_rank=8, max_loras=1)
scheduler = initialize_scheduler(lora_config=lora_config,
use_v2_block_manager=use_v2_block_manager,
block_size=block_size,
num_cpu_blocks=64,
num_gpu_blocks=64)
......@@ -570,14 +552,12 @@ def test_prefill_schedule_max_lora(use_v2_block_manager: bool):
assert budget.num_batched_tokens == 60
@pytest.mark.parametrize('use_v2_block_manager', [True, False])
def test_prefill_schedule_no_block_manager_capacity(use_v2_block_manager):
def test_prefill_schedule_no_block_manager_capacity():
"""
Test sequence cannot be scheduled due to block manager has no capacity.
"""
block_size = 4
scheduler = initialize_scheduler(use_v2_block_manager=use_v2_block_manager,
block_size=block_size,
scheduler = initialize_scheduler(block_size=block_size,
num_gpu_blocks=128,
num_cpu_blocks=128)
budget = create_token_budget()
......@@ -614,14 +594,12 @@ def test_prefill_schedule_no_block_manager_capacity(use_v2_block_manager):
assert len(remaining_waiting) == 0
@pytest.mark.parametrize('use_v2_block_manager', [True, False])
def test_decode_schedule_preempted(use_v2_block_manager: bool):
def test_decode_schedule_preempted():
"""
Test decodes cannot be scheduled and preempted.
"""
block_size = 4
scheduler = initialize_scheduler(use_v2_block_manager=use_v2_block_manager,
block_size=block_size,
scheduler = initialize_scheduler(block_size=block_size,
num_cpu_blocks=64,
num_gpu_blocks=64)
curr_loras = None
......@@ -660,14 +638,12 @@ def test_decode_schedule_preempted(use_v2_block_manager: bool):
assert output.blocks_to_copy == []
@pytest.mark.parametrize('use_v2_block_manager', [True, False])
def test_decode_swap_beam_search(use_v2_block_manager: bool):
def test_decode_swap_beam_search():
"""
Test best_of > 1 swap out blocks
"""
block_size = 4
scheduler = initialize_scheduler(use_v2_block_manager=use_v2_block_manager,
block_size=block_size,
scheduler = initialize_scheduler(block_size=block_size,
num_gpu_blocks=64,
num_cpu_blocks=64)
curr_loras = None
......@@ -716,14 +692,12 @@ def test_decode_swap_beam_search(use_v2_block_manager: bool):
assert output.blocks_to_copy == []
@pytest.mark.parametrize('use_v2_block_manager', [True, False])
def test_schedule_decode_blocks_to_copy_update(use_v2_block_manager: bool):
def test_schedule_decode_blocks_to_copy_update():
"""
Verify blocks_to_copy is updated.
"""
block_size = 4
scheduler = initialize_scheduler(use_v2_block_manager=use_v2_block_manager,
block_size=4,
scheduler = initialize_scheduler(block_size=4,
num_cpu_blocks=16,
num_gpu_blocks=16)
_, seq_group = create_dummy_prompt("1",
......@@ -754,11 +728,9 @@ def test_schedule_decode_blocks_to_copy_update(use_v2_block_manager: bool):
assert output.blocks_to_copy == [(2, 3)]
@pytest.mark.parametrize('use_v2_block_manager', [True, False])
def test_schedule_swapped_simple(use_v2_block_manager: bool):
def test_schedule_swapped_simple():
block_size = 4
scheduler = initialize_scheduler(use_v2_block_manager=use_v2_block_manager,
block_size=block_size)
scheduler = initialize_scheduler(block_size=block_size)
curr_loras = None
blocks_to_swap_out: List[Tuple[int, int]] = []
_, seq_group = create_dummy_prompt("1",
......@@ -785,11 +757,9 @@ def test_schedule_swapped_simple(use_v2_block_manager: bool):
assert blocks_to_swap_out == blocks_to_swap_in_reverse
@pytest.mark.parametrize('use_v2_block_manager', [True, False])
def test_schedule_swapped_max_token_budget(use_v2_block_manager: bool):
def test_schedule_swapped_max_token_budget():
block_size = 4
scheduler = initialize_scheduler(use_v2_block_manager=use_v2_block_manager,
block_size=block_size,
scheduler = initialize_scheduler(block_size=block_size,
num_cpu_blocks=32,
num_gpu_blocks=32)
curr_loras = None
......@@ -822,11 +792,9 @@ def test_schedule_swapped_max_token_budget(use_v2_block_manager: bool):
assert len(output.prefill_seq_groups) == 0
@pytest.mark.parametrize('use_v2_block_manager', [True, False])
def test_schedule_swapped_max_seqs(use_v2_block_manager: bool):
def test_schedule_swapped_max_seqs():
block_size = 4
scheduler = initialize_scheduler(use_v2_block_manager=use_v2_block_manager,
block_size=block_size,
scheduler = initialize_scheduler(block_size=block_size,
num_cpu_blocks=64,
num_gpu_blocks=64)
curr_loras = None
......@@ -859,12 +827,10 @@ def test_schedule_swapped_max_seqs(use_v2_block_manager: bool):
assert len(output.prefill_seq_groups) == 0
@pytest.mark.parametrize('use_v2_block_manager', [True, False])
def test_schedule_swapped_max_loras(use_v2_block_manager: bool):
def test_schedule_swapped_max_loras():
block_size = 4
lora_config = LoRAConfig(max_lora_rank=8, max_loras=1)
scheduler = initialize_scheduler(lora_config=lora_config,
use_v2_block_manager=use_v2_block_manager,
block_size=block_size,
num_cpu_blocks=32,
num_gpu_blocks=32)
......@@ -894,11 +860,9 @@ def test_schedule_swapped_max_loras(use_v2_block_manager: bool):
assert len(curr_loras) == 1
@pytest.mark.parametrize('use_v2_block_manager', [True, False])
def test_schedule_swapped_cannot_swap_in(use_v2_block_manager: bool):
def test_schedule_swapped_cannot_swap_in():
block_size = 4
scheduler = initialize_scheduler(use_v2_block_manager=use_v2_block_manager,
block_size=block_size,
scheduler = initialize_scheduler(block_size=block_size,
num_cpu_blocks=32,
num_gpu_blocks=32)
curr_loras = None
......@@ -927,11 +891,9 @@ def test_schedule_swapped_cannot_swap_in(use_v2_block_manager: bool):
assert len(output.prefill_seq_groups) == 0
@pytest.mark.parametrize('use_v2_block_manager', [True, False])
def test_infeasible_swap(use_v2_block_manager: bool):
def test_infeasible_swap():
block_size = 4
scheduler = initialize_scheduler(use_v2_block_manager=use_v2_block_manager,
block_size=block_size,
scheduler = initialize_scheduler(block_size=block_size,
num_cpu_blocks=32,
num_gpu_blocks=32)
curr_loras = None
......@@ -961,11 +923,9 @@ def test_infeasible_swap(use_v2_block_manager: bool):
assert len(output.prefill_seq_groups) == 0
@pytest.mark.parametrize('use_v2_block_manager', [True, False])
def test_schedule_swapped_blocks_to_copy(use_v2_block_manager: bool):
def test_schedule_swapped_blocks_to_copy():
block_size = 4
scheduler = initialize_scheduler(use_v2_block_manager=use_v2_block_manager,
block_size=block_size,
scheduler = initialize_scheduler(block_size=block_size,
num_cpu_blocks=32,
num_gpu_blocks=32)
curr_loras = None
......
......@@ -185,13 +185,14 @@ def test_metric_spec_decode(
) -> None:
k = 5
with vllm_runner(model,
dtype=dtype,
disable_log_stats=False,
gpu_memory_utilization=0.4,
speculative_model=model,
num_speculative_tokens=k,
use_v2_block_manager=True) as vllm_model:
with vllm_runner(
model,
dtype=dtype,
disable_log_stats=False,
gpu_memory_utilization=0.4,
speculative_model=model,
num_speculative_tokens=k,
) as vllm_model:
# Force log interval to be 0 to catch all metrics.
stat_logger = vllm_model.model.llm_engine.stat_loggers['prometheus']
......@@ -242,7 +243,6 @@ def test_metric_spec_decode_interval(
gpu_memory_utilization=0.4,
speculative_model=model,
num_speculative_tokens=k,
use_v2_block_manager=True,
enforce_eager=True)
engine = LLMEngine.from_engine_args(engine_args)
......
......@@ -17,7 +17,6 @@ NUM_PROMPTS = [10]
DEFAULT_SERVER_ARGS: List[str] = [
"--disable-log-requests",
"--use-v2-block-manager",
"--worker-use-ray",
"--gpu-memory-utilization",
"0.85",
......
......@@ -76,7 +76,6 @@ def test_multi_step_llm(
enforce_eager=enforce_eager,
gpu_memory_utilization=0.7,
tensor_parallel_size=tp_size,
use_v2_block_manager=True,
enable_chunked_prefill=enable_chunked_prefill,
num_scheduler_steps=num_scheduler_steps,
) as vllm_model:
......@@ -169,7 +168,6 @@ def test_multi_step_llm_w_prompt_logprobs(
enforce_eager=enforce_eager,
gpu_memory_utilization=0.7,
tensor_parallel_size=tp_size,
use_v2_block_manager=True,
num_scheduler_steps=num_scheduler_steps,
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy_logprobs(
......@@ -305,7 +303,6 @@ def test_multi_step_llm_chunked_prefill_prefix_cache(
enforce_eager=enforce_eager,
gpu_memory_utilization=0.7,
tensor_parallel_size=tp_size,
use_v2_block_manager=True,
num_scheduler_steps=num_scheduler_steps,
max_model_len=48,
max_num_batched_tokens=48,
......@@ -324,7 +321,6 @@ def test_multi_step_llm_chunked_prefill_prefix_cache(
enforce_eager=enforce_eager,
gpu_memory_utilization=0.7,
tensor_parallel_size=tp_size,
use_v2_block_manager=True,
enable_chunked_prefill=True,
enable_prefix_caching=True,
num_scheduler_steps=num_scheduler_steps,
......
......@@ -2,15 +2,9 @@
Run `pytest tests/prefix_caching/test_prefix_caching.py`.
"""
from typing import List
import pytest
from tests.kernels.utils import override_backend_env_variable
from tests.utils import check_deprecated_block_manager_usage
from vllm.block import PhysicalTokenBlock
from vllm.core.block_manager_v1 import CachedBlockAllocator
from vllm.utils import Device
from ..models.utils import check_outputs_equal
......@@ -19,92 +13,11 @@ MODELS = [
]
@pytest.fixture(scope="module", autouse=True)
def check_deprecated_block_manager():
check_deprecated_block_manager_usage(
'tests/prefix_caching/test_prefix_caching.py')
@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("num_blocks", [16])
def test_block_allocator(
block_size: int,
num_blocks: int,
):
block_hash = 1
block_allocator = CachedBlockAllocator(Device.CPU, block_size, num_blocks)
# Allocate two PysicalTokenBlocks with the same hash and check
# that they are the same PhysicalTokenBlock
first_block = block_allocator.allocate(block_hash, 0)
second_block = block_allocator.allocate(block_hash, 0)
assert (first_block == second_block)
assert (second_block.ref_count == 2)
# Check metric: 1 hit of 2 queries
assert block_allocator.get_prefix_cache_hit_rate() == 0.5
# Free the first_block and confirm that the ref_count is correctly
# decremented on the second block
block_allocator.free(first_block)
assert (second_block.ref_count == 1)
# Free the second block
block_allocator.free(second_block)
# Reallocate the first block and confirm that, even after the block
# had its ref_count go to 0, we still get the same block back
first_block = block_allocator.allocate(block_hash, 0)
assert (first_block == second_block)
assert (first_block.block_hash == block_hash)
# Allocate one more time to get 3/4 hit rate for easy checking
block_allocator.allocate(block_hash, 0)
assert block_allocator.get_prefix_cache_hit_rate() == 0.75
@pytest.mark.parametrize("num_blocks", [16])
def test_eviction(num_blocks: int, ):
block_size = 16
block_allocator = CachedBlockAllocator(Device.CPU, block_size, num_blocks)
blocks: List[PhysicalTokenBlock] = []
for i in range(num_blocks):
# use i as the block_hash
blocks.append(block_allocator.allocate(i, 0))
#Free all blocks
for block in blocks:
block_allocator.free(block)
# Allocate a new block and confirm that it's the first block freed.
# I.E The Least Recently Used block
new_block_hash = block_size
new_block = block_allocator.allocate(new_block_hash, 0)
assert (new_block == blocks[0])
assert (new_block.block_hash == new_block_hash)
# Reallocate the second in blocks to remove it from the free list
realloc_block_hash = 1
realloc_block = block_allocator.allocate(realloc_block_hash, 0)
assert (realloc_block == blocks[realloc_block_hash])
assert (realloc_block.block_hash == realloc_block_hash)
# Allocate a new block and confirm that it's not the realloc_block,
# since the realloc_block shouldn't be in the free list
new_block_hash = block_size + 1
new_block = block_allocator.allocate(new_block_hash, 0)
assert (realloc_block != new_block)
assert (new_block.block_hash == new_block_hash)
assert (new_block.block_number == 2)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("backend", ["FLASH_ATTN", "FLASHINFER", "XFORMERS"])
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [5])
@pytest.mark.parametrize("cached_position", [0, 1])
@pytest.mark.parametrize("use_v2_block_manager", [False, True])
def test_mixed_requests(
hf_runner,
vllm_runner,
......@@ -114,7 +27,6 @@ def test_mixed_requests(
dtype: str,
max_tokens: int,
cached_position: int,
use_v2_block_manager: bool,
monkeypatch,
) -> None:
"""
......@@ -132,7 +44,6 @@ def test_mixed_requests(
model,
dtype=dtype,
enable_prefix_caching=True,
use_v2_block_manager=use_v2_block_manager,
) as vllm_model:
# Run the first prompt so the cache is populated
vllm_outputs = vllm_model.generate_greedy([cached_prompt], max_tokens)
......
import pytest
from tests.utils import check_deprecated_block_manager_usage
from vllm import SamplingParams
from .conftest import get_output_from_llm_generator
@pytest.fixture(scope="module", autouse=True)
def check_deprecated_block_manager():
check_deprecated_block_manager_usage(
'tests/spec_decode/e2e/test_compatibility.py')
@pytest.mark.parametrize(
"common_llm_kwargs",
[{
"model": "JackFram/llama-68m",
"speculative_model": "JackFram/llama-68m",
"num_speculative_tokens": 5,
# Required for spec decode.
"use_v2_block_manager": True
}])
@pytest.mark.parametrize("common_llm_kwargs", [{
"model": "JackFram/llama-68m",
"speculative_model": "JackFram/llama-68m",
"num_speculative_tokens": 5,
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [
{
"enable_chunked_prefill": True,
......@@ -51,16 +39,11 @@ def test_spec_decode_xfail_chunked_prefill(test_llm_generator):
sampling_params)
@pytest.mark.parametrize(
"common_llm_kwargs",
[{
"model": "meta-llama/Llama-2-7b-chat-hf",
"speculative_model": "JackFram/llama-68m",
"num_speculative_tokens": 5,
# Required for spec decode.
"use_v2_block_manager": True
}])
@pytest.mark.parametrize("common_llm_kwargs", [{
"model": "meta-llama/Llama-2-7b-chat-hf",
"speculative_model": "JackFram/llama-68m",
"num_speculative_tokens": 5,
}])
@pytest.mark.parametrize(
"per_test_common_llm_kwargs",
[
......@@ -101,34 +84,3 @@ def test_spec_decode_xfail_spec_max_model_len(test_llm_generator):
with pytest.raises(ValueError, match="cannot be larger than"):
get_output_from_llm_generator(test_llm_generator, prompts,
sampling_params)
@pytest.mark.parametrize("common_llm_kwargs", [{
"model": "JackFram/llama-68m",
"speculative_model": "JackFram/llama-68m",
"num_speculative_tokens": 5,
"use_v2_block_manager": False,
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [{}])
@pytest.mark.parametrize("seed", [1])
def test_spec_decode_xfail_block_manager_v1(test_llm_generator):
"""Verify that speculative decoding with block manager v1 fails.
"""
output_len = 128
temperature = 0.0
prompts = [
"Hello, my name is",
]
sampling_params = SamplingParams(
max_tokens=output_len,
ignore_eos=True,
temperature=temperature,
)
with pytest.raises(ValueError,
match="Speculative decoding requires usage of the V2"):
get_output_from_llm_generator(test_llm_generator, prompts,
sampling_params)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment