# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

4
import os
ElizaWszola's avatar
ElizaWszola committed
5
6
7
8
9
import pytest

from vllm.engine.arg_utils import EngineArgs
from vllm.engine.llm_engine import LLMEngine
from vllm.sampling_params import SamplingParams
10
11
12
from ..utils import models_path_prefix
from vllm.utils import SUPPORT_TC, gpuname
import vllm.envs as envs
ElizaWszola's avatar
ElizaWszola committed
13
14


15
@pytest.mark.parametrize("model", [os.path.join(models_path_prefix, "distilbert/distilgpt2")])
@pytest.mark.parametrize("block_size", [64] if envs.VLLM_USE_FLASH_ATTN_PA else [16])
def test_computed_prefix_blocks(model: str, block_size: int) -> None:
    """Step an engine through two requests that share a prompt prefix.

    The first request submits ``prompt + prompt2``; the second submits
    ``prompt`` alone, so the second request's whole prompt is a prefix of
    the first. The engine is built with ``enable_prefix_caching=True`` and
    stepped once after each request is added.

    The test makes no explicit assertions: it passes when both
    ``engine.step()`` calls return without raising.

    Args:
        model: Path of the model to load, supplied by the ``model``
            parametrization.
        block_size: Block size forwarded to ``EngineArgs``, supplied by the
            ``block_size`` parametrization (64 when
            ``envs.VLLM_USE_FLASH_ATTN_PA`` is truthy, otherwise 16).
    """
    # This test checks that stepping the engine does not trigger asserts.
    # Intended scenario: all blocks from the second request's prompt are
    # full and already computed when the second request arrives.
    prompt = (
        "You are a helpful assistant. How do I build a car from cardboard and "
        "paper clips? Is there an easy to follow video tutorial available "
        "online for free?")
    prompt2 = (
        " Please recommend to me some resources where I can learn not only to "
        "handle technical difficulties of building a car, but also "
        "decoration.")

    engine_args = EngineArgs(model=model,
                             block_size=block_size,
                             enable_prefix_caching=True)

    engine = LLMEngine.from_engine_args(engine_args)
    sampling_params = SamplingParams()

    # The longer request goes first so its step happens before the
    # prefix-only request is added.
    engine.add_request("0", prompt + prompt2, sampling_params)
    engine.step()

    # The second request reuses only the shared prefix of the first one.
    engine.add_request("1", prompt, sampling_params)
    engine.step()