# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from contextlib import suppress
from dataclasses import dataclass, field
from http import HTTPStatus
7
from unittest.mock import AsyncMock, MagicMock
8
9
10

import pytest

11
from vllm.config.multimodal import MultiModalConfig
12
13
14
15
16
from vllm.entrypoints.openai.completion.protocol import CompletionRequest
from vllm.entrypoints.openai.completion.serving import OpenAIServingCompletion
from vllm.entrypoints.openai.engine.protocol import ErrorResponse
from vllm.entrypoints.openai.models.protocol import BaseModelPath
from vllm.entrypoints.openai.models.serving import OpenAIServingModels
17
18
from vllm.lora.request import LoRARequest
from vllm.lora.resolver import LoRAResolver, LoRAResolverRegistry
19
from vllm.tokenizers import get_tokenizer
20
from vllm.v1.engine.async_llm import AsyncLLM
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35

# Model identifier used for the tokenizer and as the mock config's defaults.
MODEL_NAME = "openai-community/gpt2"
# Base model entry passed to OpenAIServingModels in the serving fixture.
BASE_MODEL_PATHS = [BaseModelPath(name=MODEL_NAME, model_path=MODEL_NAME)]

# Registry key under which the mock LoRA resolver is registered for each test.
MOCK_RESOLVER_NAME = "mock_test_resolver"


@dataclass
class MockHFConfig:
    """Minimal stand-in config object exposing only ``model_type``."""

    model_type: str = field(default="any")


@dataclass
class MockModelConfig:
    """Minimal mock ModelConfig for testing.

    Holds plain default-valued attributes plus ``get_diff_sampling_param``.
    Every annotated attribute is a dataclass field and can be overridden
    through the constructor.
    """

    model: str = MODEL_NAME
    tokenizer: str = MODEL_NAME
    trust_remote_code: bool = False
    tokenizer_mode: str = "auto"
    max_model_len: int = 100
    tokenizer_revision: str | None = None
    multimodal_config: MultiModalConfig = field(default_factory=MultiModalConfig)
    hf_config: MockHFConfig = field(default_factory=MockHFConfig)
    logits_processors: list[str] | None = None
    logits_processor_pattern: str | None = None
    diff_sampling_param: dict | None = None
    allowed_local_media_path: str = ""
    allowed_media_domains: list[str] | None = None
    # Unannotated, so this is a plain class attribute rather than a dataclass
    # field: it is not a constructor argument.
    encoder_config = None
    generation_config: str = "auto"
    skip_tokenizer_init: bool = False

    def get_diff_sampling_param(self):
        """Return ``diff_sampling_param``, or ``{}`` when it is None or empty."""
        return self.diff_sampling_param or {}


class MockLoRAResolver(LoRAResolver):
    """Resolver stub that recognises exactly two adapter names.

    ``"test-lora"`` and ``"invalid-lora"`` resolve to ``LoRARequest`` objects
    pointing at fake paths; any other name resolves to ``None``.
    """

    async def resolve_lora(
        self, base_model_name: str, lora_name: str
    ) -> LoRARequest | None:
        """Return a canned ``LoRARequest`` for known names, else ``None``.

        Args:
            base_model_name: Ignored by this mock.
            lora_name: Adapter name to look up.
        """
        if lora_name == "test-lora":
            return LoRARequest(
                lora_name="test-lora",
                lora_int_id=1,
                lora_path="/fake/path/test-lora",
            )
        elif lora_name == "invalid-lora":
            return LoRARequest(
                lora_name="invalid-lora",
                lora_int_id=2,
                lora_path="/fake/path/invalid-lora",
            )
        return None


@pytest.fixture(autouse=True)
def register_mock_resolver():
    """Register the mock LoRA resolver for each test and remove it afterwards."""
    LoRAResolverRegistry.register_resolver(MOCK_RESOLVER_NAME, MockLoRAResolver())
    yield
    # Teardown: drop the entry only if it is still registered.
    LoRAResolverRegistry.resolvers.pop(MOCK_RESOLVER_NAME, None)


@pytest.fixture
def mock_serving_setup():
    """Provides a mocked engine and serving completion instance.

    Returns:
        A ``(mock_engine, serving_completion)`` tuple. ``mock_engine`` is a
        ``MagicMock`` specced on ``AsyncLLM`` whose ``add_lora`` succeeds for
        every adapter except ``"invalid-lora"`` (which raises ``ValueError``)
        and whose ``generate`` yields nothing.
    """
    mock_engine = MagicMock(spec=AsyncLLM)
    mock_engine.errored = False

    tokenizer = get_tokenizer(MODEL_NAME)
    mock_engine.get_tokenizer = AsyncMock(return_value=tokenizer)

    async def mock_add_lora_side_effect(lora_request: LoRARequest):
        """Simulate engine behavior when adding LoRAs."""
        if lora_request.lora_name == "test-lora":
            # Simulate successful addition
            return True
        if lora_request.lora_name == "invalid-lora":
            # Simulate failure during addition (e.g. invalid format)
            raise ValueError(f"Simulated failure adding LoRA: {lora_request.lora_name}")
        return True

    mock_engine.add_lora = AsyncMock(side_effect=mock_add_lora_side_effect)

    async def mock_generate(*args, **kwargs):
        # Async generator that yields nothing; the loop over an empty list
        # exists only so that the body contains a ``yield``.
        for _ in []:
            yield _

    mock_engine.generate = MagicMock(spec=AsyncLLM.generate, side_effect=mock_generate)

    # Clear any recorded calls so each test's call-count assertions start at zero.
    mock_engine.generate.reset_mock()
    mock_engine.add_lora.reset_mock()

    mock_engine.model_config = MockModelConfig()
    mock_engine.input_processor = MagicMock()
    mock_engine.io_processor = MagicMock()

    models = OpenAIServingModels(
        engine_client=mock_engine,
        base_model_paths=BASE_MODEL_PATHS,
    )

    serving_completion = OpenAIServingCompletion(
        mock_engine, models, request_logger=None
    )

    # Replace input processing with a stub returning a placeholder engine
    # request and an empty dict.
    serving_completion._process_inputs = AsyncMock(
        return_value=(MagicMock(name="engine_request"), {})
    )

    return mock_engine, serving_completion


@pytest.mark.asyncio
async def test_serving_completion_with_lora_resolver(mock_serving_setup, monkeypatch):
    """A resolvable LoRA name is added to the engine and passed to generate."""
    monkeypatch.setenv("VLLM_ALLOW_RUNTIME_LORA_UPDATING", "true")

    mock_engine, serving_completion = mock_serving_setup

    lora_model_name = "test-lora"
    req_found = CompletionRequest(
        model=lora_model_name,
        prompt="Generate with LoRA",
    )

    # Suppress potential errors during the mocked generate call,
    # as we are primarily checking for add_lora and generate calls
    with suppress(Exception):
        await serving_completion.create_completion(req_found)

    # The resolved adapter must have been loaded into the engine exactly once.
    mock_engine.add_lora.assert_awaited_once()
    added_lora_request = mock_engine.add_lora.call_args[0][0]
    assert isinstance(added_lora_request, LoRARequest)
    assert added_lora_request.lora_name == lora_model_name

    # The same adapter must be forwarded to generate via the keyword argument.
    mock_engine.generate.assert_called_once()
    generate_lora_request = mock_engine.generate.call_args[1]["lora_request"]
    assert isinstance(generate_lora_request, LoRARequest)
    assert generate_lora_request.lora_name == lora_model_name


@pytest.mark.asyncio
async def test_serving_completion_resolver_not_found(mock_serving_setup, monkeypatch):
    """An unresolvable model name yields a 404 error without touching the engine."""
    monkeypatch.setenv("VLLM_ALLOW_RUNTIME_LORA_UPDATING", "true")

    mock_engine, serving_completion = mock_serving_setup

    non_existent_model = "non-existent-lora-adapter"
    req = CompletionRequest(
        model=non_existent_model,
        prompt="what is 1+1?",
    )

    response = await serving_completion.create_completion(req)

    # Neither adapter loading nor generation may be attempted.
    mock_engine.add_lora.assert_not_awaited()
    mock_engine.generate.assert_not_called()

    assert isinstance(response, ErrorResponse)
    assert response.error.code == HTTPStatus.NOT_FOUND.value
    assert non_existent_model in response.error.message


@pytest.mark.asyncio
async def test_serving_completion_resolver_add_lora_fails(
    mock_serving_setup, monkeypatch
):
    """A resolved adapter that fails to load yields a 400 error and no generation."""
    monkeypatch.setenv("VLLM_ALLOW_RUNTIME_LORA_UPDATING", "true")

    mock_engine, serving_completion = mock_serving_setup

    invalid_model = "invalid-lora"
    req = CompletionRequest(
        model=invalid_model,
        prompt="what is 1+1?",
    )

    response = await serving_completion.create_completion(req)

    # Assert add_lora was called before the failure
    mock_engine.add_lora.assert_awaited_once()
    called_lora_request = mock_engine.add_lora.call_args[0][0]
    assert isinstance(called_lora_request, LoRARequest)
    assert called_lora_request.lora_name == invalid_model

    # Assert generate was *not* called due to the failure
    mock_engine.generate.assert_not_called()

    # Assert the correct error response
    assert isinstance(response, ErrorResponse)
    assert response.error.code == HTTPStatus.BAD_REQUEST.value
    assert invalid_model in response.error.message


@pytest.mark.asyncio
async def test_serving_completion_flag_not_set(mock_serving_setup, monkeypatch):
    """Without the runtime-LoRA flag, the engine is never asked to add or generate."""
    # Make sure the flag is really unset, even if the surrounding environment
    # exports it; otherwise this test depends on how the runner was launched.
    monkeypatch.delenv("VLLM_ALLOW_RUNTIME_LORA_UPDATING", raising=False)

    mock_engine, serving_completion = mock_serving_setup

    lora_model_name = "test-lora"
    req_found = CompletionRequest(
        model=lora_model_name,
        prompt="Generate with LoRA",
    )

    await serving_completion.create_completion(req_found)

    mock_engine.add_lora.assert_not_called()
    mock_engine.generate.assert_not_called()