test_add_lora.py 4.01 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3
4
5
import asyncio
import time

zhuwenwen's avatar
zhuwenwen committed
6
import os
7
8
9
import pytest

from vllm.engine.arg_utils import AsyncEngineArgs
10
from vllm.entrypoints.openai.api_server import (
11
12
    build_async_engine_client_from_engine_args,
)
13
14
15
from vllm.inputs import TextPrompt
from vllm.lora.request import LoRARequest
from vllm.sampling_params import SamplingParams
16

17
from vllm.utils.async_utils import merge_async_iterators
zhuwenwen's avatar
zhuwenwen committed
18
from ..utils import models_path_prefix
19

20
# Location of the base model the engine is started with; the path is built
# relative to `models_path_prefix` (imported from the test utils package).
MODEL_PATH = os.path.join(models_path_prefix, "zai-org/chatglm3-6b")

# Passed to the engine as `max_lora_rank` in test_add_lora.
LORA_RANK = 64
# Number of LoRA requests built by get_lora_requests. Written as 4 * 3
# because test_add_lora splits the requests into 3 equally sized parts.
DEFAULT_MAX_LORAS = 4 * 3
23
24


25
def get_lora_requests(lora_path) -> list[LoRARequest]:
    """Build one LoRARequest per adapter id in 1..DEFAULT_MAX_LORAS.

    Every request points at the same ``lora_path``; only the integer id and
    the name (the id rendered as a string) differ between requests.
    """
    adapter_ids = range(1, DEFAULT_MAX_LORAS + 1)
    return [
        LoRARequest(
            lora_name=str(adapter_id),
            lora_int_id=adapter_id,
            lora_path=lora_path,
        )
        for adapter_id in adapter_ids
    ]


33
34
35
36
async def requests_processing_time(llm, lora_requests: list[LoRARequest]) -> float:
    """Return the seconds taken to submit and drain one request per adapter.

    A generate call is issued on ``llm`` for every entry of
    ``lora_requests``; the resulting streams are merged and consumed until
    exhausted. The measured interval covers both the submission of the
    requests and the consumption of their outputs.
    """
    # Deterministic decoding limited to a single output token per request.
    decode_params = SamplingParams(
        n=1, temperature=0.0, top_p=1.0, ignore_eos=True, max_tokens=1
    )

    def submit(request: LoRARequest):
        # Prompt text and request id both embed the adapter id.
        adapter_id = request.lora_int_id
        prompt = TextPrompt(prompt=f"hello {adapter_id}", multi_modal_data=None)  # type: ignore
        return llm.generate(
            prompt=prompt,
            sampling_params=decode_params,
            lora_request=request,
            request_id=f"test{adapter_id}",
        )

    started_at = time.perf_counter()

    streams = [submit(request) for request in lora_requests]

    # The outputs themselves are irrelevant; only the elapsed time matters.
    async for _index, _output in merge_async_iterators(*streams):
        pass

    return time.perf_counter() - started_at


@pytest.mark.asyncio
async def test_add_lora(chatglm3_lora_files):
    """
    Check that preloading adapters via add_lora() makes requests faster.

    The add_lora function is used to preload some LoRA adapters into the
    engine in anticipation of future requests using these adapters. To test
    this functionality, we use the async engine to process some requests - We
    do it twice, once with add_lora() preloading and once without.

    We measure the request processing time in both cases and expect the time
    to be lower in the case with add_lora() calls.
    """
    lora_requests: list[LoRARequest] = get_lora_requests(chatglm3_lora_files)

    # max_loras is sized to the number of distinct adapter ids in use.
    max_loras = len({lr.lora_int_id for lr in lora_requests})
    # Create engine in eager-mode. Due to high max_loras, the CI can
    # OOM during cuda-graph capture.
    engine_args = AsyncEngineArgs(
        model=MODEL_PATH,
        enable_lora=True,
        max_loras=max_loras,
        max_lora_rank=LORA_RANK,
        max_model_len=128,
        gpu_memory_utilization=0.8,  # avoid OOM
        trust_remote_code=True,
        enforce_eager=True,
    )

    # Split lora_requests into 3 parts: dummy run, preloaded run, cold run.
    # Any remainder of the integer division ends up in the cold-run part.
    part_size = len(lora_requests) // 3
    dummy_run_requests = lora_requests[:part_size]
    warmup_run_requests = lora_requests[part_size : part_size * 2]
    cold_run_requests = lora_requests[part_size * 2 :]

    async with build_async_engine_client_from_engine_args(engine_args) as llm:
        # Dummy run - So any 1-time functionality like triton kernel compilation
        # is complete here.
        await requests_processing_time(llm, dummy_run_requests)

        # Run with warmup: preload the adapters before timing the requests.
        add_lora_tasks = [llm.add_lora(lr) for lr in warmup_run_requests]
        add_lora_results = await asyncio.gather(*add_lora_tasks)

        # Test that all add_lora calls are successful.
        assert all(add_lora_results)

        time_with_add_lora = await requests_processing_time(llm, warmup_run_requests)

        # Run without any warmup: these adapters were never passed to add_lora.
        time_cold_start = await requests_processing_time(llm, cold_run_requests)

    print(f"time hot-start {time_with_add_lora} vs time cold-start {time_cold_start} ")

    # The ". " separator keeps the measured values apart from the explanation
    # in the failure message.
    assert time_with_add_lora < time_cold_start, (
        f"time_with_add_lora={time_with_add_lora}, "
        f"time_cold_start={time_cold_start}. "
        "The engine request processing time with LoRA pre-loading "
        "must be less than the version that does on-demand LoRA loading."
    )