test_engine_args.py 2.24 KB
Newer Older
zhuwenwen's avatar
zhuwenwen committed
1
import os
2
3
4
5
6
7
import pytest

from vllm import envs
from vllm.config import VllmConfig
from vllm.engine.arg_utils import EngineArgs
from vllm.usage.usage_lib import UsageContext
8
from vllm.utils import FlexibleArgumentParser
zhuwenwen's avatar
zhuwenwen committed
9
from ...utils import models_path_prefix
10
11
12
13
14
15
16
17

# This module only exercises V1 behaviour: when envs.VLLM_USE_V1 is falsy,
# skip every test in the file as soon as the module is imported/collected.
if not envs.VLLM_USE_V1:
    pytest.skip(
        "Skipping V1 tests. Rerun with `VLLM_USE_V1=1` to test.",
        # Needed because pytest.skip() is being called outside a test function.
        allow_module_level=True,
    )


18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
def test_prefix_caching_from_cli():
    """Exercise the prefix-caching CLI switches.

    Parses three argument lists through the ``EngineArgs`` CLI parser and
    checks ``enable_prefix_caching`` on the resulting ``EngineArgs``: no
    flags (expected on), ``--no-enable-prefix-caching`` (expected off) and
    ``--enable-prefix-caching`` (expected on).
    """
    cli_parser = EngineArgs.add_cli_args(FlexibleArgumentParser())

    def build_engine_args(argv):
        # Parse argv with the shared parser, then convert to EngineArgs.
        namespace = cli_parser.parse_args(argv)
        return EngineArgs.from_cli_args(args=namespace)

    # No flags at all: the default must already enable prefix caching.
    from_defaults = build_engine_args([])
    assert from_defaults.enable_prefix_caching, (
        "V1 turns on prefix caching by default.")

    # Explicit opt-out via the negated flag.
    opted_out = build_engine_args(["--no-enable-prefix-caching"])
    assert not opted_out.enable_prefix_caching

    # Explicit opt-in via the positive flag.
    opted_in = build_engine_args(["--enable-prefix-caching"])
    assert opted_in.enable_prefix_caching


36
def test_defaults():
    """Check that default-constructed ``EngineArgs`` enable prefix caching.

    Only the model path (``facebook/opt-125m`` under ``models_path_prefix``)
    is supplied; every other argument is left at its default.
    """
    engine_args = EngineArgs(
        model=os.path.join(models_path_prefix, "facebook/opt-125m"))

    # Assert V1 defaults
    assert engine_args.enable_prefix_caching, (
        "V1 turns on prefix caching by default")


def test_defaults_with_usage_context():
    """Check the scheduler defaults produced for two usage contexts.

    Builds an engine config from otherwise-default ``EngineArgs`` for
    ``UsageContext.LLM_CLASS`` and for ``UsageContext.OPENAI_API_SERVER``
    and asserts the expected ``max_num_seqs`` and
    ``max_num_batched_tokens`` on the resulting scheduler config.
    """
    model_path = os.path.join(models_path_prefix, "facebook/opt-125m")

    # Offline LLM class usage.
    engine_args = EngineArgs(model=model_path)
    vllm_config: VllmConfig = engine_args.create_engine_config(
        UsageContext.LLM_CLASS)

    assert vllm_config.scheduler_config.max_num_seqs == 1024
    assert vllm_config.scheduler_config.max_num_batched_tokens == 8192

    # OpenAI API server usage; a fresh EngineArgs is built so this check
    # does not reuse the object from the first one.
    engine_args = EngineArgs(model=model_path)
    vllm_config = engine_args.create_engine_config(
        UsageContext.OPENAI_API_SERVER)

    assert vllm_config.scheduler_config.max_num_seqs == 1024
    assert vllm_config.scheduler_config.max_num_batched_tokens == 2048


def test_prefix_cache_disabled_with_multimodel():
    """Check that prefix caching ends up disabled for the LLaVA model.

    Builds the engine config for ``llava-hf/llava-1.5-7b-hf`` (under
    ``models_path_prefix``) with ``UsageContext.LLM_CLASS`` and asserts that
    ``cache_config.enable_prefix_caching`` is falsy.
    """
    engine_args = EngineArgs(
        model=os.path.join(models_path_prefix, "llava-hf/llava-1.5-7b-hf"))

    vllm_config = engine_args.create_engine_config(UsageContext.LLM_CLASS)
    assert not vllm_config.cache_config.enable_prefix_caching