test_worker.py 3.11 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3

4
5
6
7
8
import os
import random
import tempfile
from unittest.mock import patch

9
10
11
12
13
14
15
from vllm.config import (
    CacheConfig,
    DeviceConfig,
    ModelConfig,
    ParallelConfig,
    SchedulerConfig,
    VllmConfig,
16
    set_current_vllm_config,
17
)
18
from vllm.config.load import LoadConfig
19
from vllm.config.lora import LoRAConfig
20
from vllm.lora.model_manager import LoRAMapping
21
from vllm.lora.request import LoRARequest
22
from vllm.platforms import current_platform
23
from vllm.v1.worker.gpu_worker import Worker
24

25
# Model identifier handed to ModelConfig by the test in this module.
MODEL_PATH = "Qwen/Qwen3-0.6B"
Jee Jee Li's avatar
Jee Jee Li committed
26
27
# Number of LoRA requests the test creates; the same value is used for
# LoRAConfig's max_loras and max_cpu_loras and as the iteration count of the
# randomized activation loop.
NUM_LORAS = 16

28
29
# Device type reported by the current platform; passed to DeviceConfig.
DEVICE_TYPE = current_platform.device_type

30
31

@patch.dict(os.environ, {"RANK": "0"})
def test_worker_apply_lora(qwen3_lora_files):
    """Check that a worker reports the LoRA adapters that were activated on it.

    The test builds a single-rank ``Worker`` for ``MODEL_PATH``, then:

    1. activates no adapters and expects ``list_loras()`` to be empty;
    2. activates ``NUM_LORAS`` adapters and expects exactly their ids;
    3. repeatedly activates a seeded random selection of those adapters and
       expects every selected id to be reported by ``list_loras()``.

    Args:
        qwen3_lora_files: Passed as the third positional argument of every
            ``LoRARequest`` (presumably a pytest fixture providing the
            adapter location -- confirm against conftest).
    """

    def set_active_loras(worker: Worker, lora_requests: list[LoRARequest]):
        """Activate ``lora_requests`` on ``worker`` using an empty mapping."""
        lora_mapping = LoRAMapping([], [])
        worker.model_runner.lora_manager.set_active_adapters(
            lora_requests, lora_mapping
        )

    model_config = ModelConfig(
        MODEL_PATH,
        seed=0,
        dtype="float16",
        max_model_len=127,
        enforce_eager=True,
    )

    vllm_config = VllmConfig(
        model_config=model_config,
        load_config=LoadConfig(
            download_dir=None,
            load_format="dummy",
        ),
        parallel_config=ParallelConfig(
            pipeline_parallel_size=1,
            tensor_parallel_size=1,
            data_parallel_size=1,
        ),
        scheduler_config=SchedulerConfig(
            max_model_len=model_config.max_model_len,
            is_encoder_decoder=model_config.is_encoder_decoder,
            runner_type="generate",
            max_num_batched_tokens=32,
            max_num_seqs=32,
            max_num_partial_prefills=32,
        ),
        device_config=DeviceConfig(DEVICE_TYPE),
        cache_config=CacheConfig(
            block_size=16,
            cache_dtype="auto",
        ),
        lora_config=LoRAConfig(
            max_lora_rank=8, max_cpu_loras=NUM_LORAS, max_loras=NUM_LORAS
        ),
    )

    # mkstemp() returns an already-open OS-level descriptor together with the
    # path. Only the path is needed for the init method, so close the
    # descriptor instead of leaking it for the lifetime of the test process.
    init_fd, init_path = tempfile.mkstemp()
    os.close(init_fd)

    worker = Worker(
        vllm_config=vllm_config,
        local_rank=0,
        rank=0,
        distributed_init_method=f"file://{init_path}",
    )

    with set_current_vllm_config(vllm_config):
        worker.init_device()
        worker.load_model()

    # Nothing activated yet: the worker must report no adapters.
    set_active_loras(worker, [])
    assert worker.list_loras() == set()

    # Adapter ids start at 1; the name is the id rendered as a string.
    lora_requests = [
        LoRARequest(str(i + 1), i + 1, qwen3_lora_files) for i in range(NUM_LORAS)
    ]

    set_active_loras(worker, lora_requests)
    assert worker.list_loras() == {
        lora_request.lora_int_id for lora_request in lora_requests
    }

    for i in range(NUM_LORAS):
        # Seed per iteration so each selection is reproducible.
        random.seed(i)
        iter_lora_requests = random.choices(
            lora_requests, k=random.randint(1, NUM_LORAS)
        )
        random.shuffle(iter_lora_requests)
        # NOTE: a draw of 0 gives the slice [:-0] == [:0], i.e. an empty
        # selection rather than an untrimmed one; a draw larger than the
        # list length also yields an empty selection.
        iter_lora_requests = iter_lora_requests[: -random.randint(0, NUM_LORAS)]
        # Activate the random selection itself (it may contain duplicates),
        # so the assertion below actually exercises partial activation.
        set_active_loras(worker, iter_lora_requests)
        assert worker.list_loras().issuperset(
            {lora_request.lora_int_id for lora_request in iter_lora_requests}
        )