test_worker.py 3.42 KB
Newer Older
1
2
# SPDX-License-Identifier: Apache-2.0

3
4
5
import os
import random
import tempfile
6
from typing import Union
7
8
from unittest.mock import patch

9
10
11
import pytest

import vllm.envs as envs
12
from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
13
14
                         ModelConfig, ParallelConfig, SchedulerConfig,
                         VllmConfig)
15
16
from vllm.lora.models import LoRAMapping
from vllm.lora.request import LoRARequest
17
from vllm.v1.worker.gpu_worker import Worker as V1Worker
18
from vllm.worker.worker import Worker
19
from ..utils import models_path_prefix
20
21


22
23
24
25
26
27
28
29
@pytest.fixture(autouse=True)
def v1(run_with_both_engines_lora):
    """Autouse shim that requests ``run_with_both_engines_lora`` per test.

    The fixture does no work of its own; depending on
    ``run_with_both_engines_lora`` is what makes each test in this module
    run under both engines. It could be moved up into ``conftest.py`` to
    apply to every test in a package.
    """


30
31
@patch.dict(os.environ, {"RANK": "0"})
def test_worker_apply_lora(sql_lora_files):
    """Check that a worker reports the LoRA adapters it was asked to activate.

    A single worker (``V1Worker`` when ``envs.VLLM_USE_V1`` is set, otherwise
    the v0 ``Worker``) is built with ``load_format="dummy"``, and
    ``worker.list_loras()`` is checked after activating:

    1. no adapters,
    2. all ``n_loras`` adapters,
    3. a series of seeded, randomly chosen subsets of those adapters.

    Args:
        sql_lora_files: Fixture value forwarded as the third positional
            argument of every ``LoRARequest`` (presumably the adapter
            location -- the fixture is defined outside this file).
    """

    def set_active_loras(worker: Union[Worker, V1Worker],
                         lora_requests: list[LoRARequest]):
        """Activate ``lora_requests`` on ``worker`` using an empty mapping."""
        lora_mapping = LoRAMapping([], [])
        if isinstance(worker, Worker):
            # v0 case
            worker.model_runner.set_active_loras(lora_requests, lora_mapping)
        else:
            # v1 case
            worker.model_runner.lora_manager.set_active_adapters(
                lora_requests, lora_mapping)

    worker_cls = V1Worker if envs.VLLM_USE_V1 else Worker

    # The same location serves as both the model and the tokenizer path.
    model_path = os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-hf")

    vllm_config = VllmConfig(
        model_config=ModelConfig(
            model_path,
            task="auto",
            tokenizer=model_path,
            tokenizer_mode="auto",
            trust_remote_code=False,
            seed=0,
            dtype="float16",
            revision=None,
            enforce_eager=True,
        ),
        load_config=LoadConfig(
            download_dir=None,
            load_format="dummy",
        ),
        parallel_config=ParallelConfig(1, 1, False),
        scheduler_config=SchedulerConfig("generate", 32, 32, 32),
        device_config=DeviceConfig("cuda"),
        cache_config=CacheConfig(block_size=16,
                                 gpu_memory_utilization=1.,
                                 swap_space=0,
                                 cache_dtype="auto"),
        lora_config=LoRAConfig(max_lora_rank=8, max_cpu_loras=32,
                               max_loras=32),
    )

    # mkstemp() returns an already-open OS-level descriptor together with the
    # path. Only the path is needed here, so close the descriptor instead of
    # leaking it for the rest of the test session.
    init_fd, init_path = tempfile.mkstemp()
    os.close(init_fd)

    worker = worker_cls(
        vllm_config=vllm_config,
        local_rank=0,
        rank=0,
        distributed_init_method=f"file://{init_path}",
    )

    worker.init_device()
    worker.load_model()

    # Activating nothing must leave the worker with no adapters listed.
    set_active_loras(worker, [])
    assert worker.list_loras() == set()

    n_loras = 32
    lora_requests = [
        LoRARequest(str(i + 1), i + 1, sql_lora_files) for i in range(n_loras)
    ]

    # Activating every request must list exactly their integer ids.
    set_active_loras(worker, lora_requests)
    assert worker.list_loras() == {
        lora_request.lora_int_id
        for lora_request in lora_requests
    }

    for i in range(32):
        # Seed per iteration so each round's selection is reproducible.
        random.seed(i)
        iter_lora_requests = random.choices(lora_requests,
                                            k=random.randint(1, n_loras))
        random.shuffle(iter_lora_requests)
        # NOTE: a drawn value of 0 yields the slice [:-0], which is empty
        # (not "drop nothing"), so some rounds activate an empty selection.
        iter_lora_requests = iter_lora_requests[:-random.randint(0, n_loras)]
        # Activate the random selection itself. Passing the full
        # ``lora_requests`` list here would make the check below trivially
        # true and leave the subset path untested.
        set_active_loras(worker, iter_lora_requests)
        assert worker.list_loras().issuperset(
            {lora_request.lora_int_id
             for lora_request in iter_lora_requests})