# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from contextlib import suppress
from dataclasses import dataclass, field
from http import HTTPStatus
7
from unittest.mock import AsyncMock, MagicMock
8
9
10

import pytest

11
from vllm.config.multimodal import MultiModalConfig
12
13
14
15
16
from vllm.entrypoints.openai.completion.protocol import CompletionRequest
from vllm.entrypoints.openai.completion.serving import OpenAIServingCompletion
from vllm.entrypoints.openai.engine.protocol import ErrorResponse
from vllm.entrypoints.openai.models.protocol import BaseModelPath
from vllm.entrypoints.openai.models.serving import OpenAIServingModels
17
from vllm.entrypoints.serve.render.serving import OpenAIServingRender
18
19
from vllm.lora.request import LoRARequest
from vllm.lora.resolver import LoRAResolver, LoRAResolverRegistry
20
21
from vllm.renderers.hf import HfRenderer
from vllm.tokenizers.registry import tokenizer_args_from_config
22
from vllm.v1.engine.async_llm import AsyncLLM
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37

# Name of the base model; also used as the default model and tokenizer of
# MockModelConfig and as the single entry of BASE_MODEL_PATHS.
MODEL_NAME = "openai-community/gpt2"
BASE_MODEL_PATHS = [BaseModelPath(name=MODEL_NAME, model_path=MODEL_NAME)]

# Key under which the mock resolver is registered in LoRAResolverRegistry.
MOCK_RESOLVER_NAME = "mock_test_resolver"


@dataclass
class MockHFConfig:
    """Minimal mock of the ``hf_config`` object; exposes only ``model_type``."""

    model_type: str = "any"


@dataclass
class MockModelConfig:
    """Minimal mock ModelConfig for testing.

    Only the attributes declared below are provided. ``runner_type`` and
    ``encoder_config`` carry no annotation, so ``@dataclass`` leaves them as
    plain class attributes instead of turning them into ``__init__``
    parameters.
    """

    model: str = MODEL_NAME
    # Unannotated: a class attribute, not a dataclass field.
    runner_type = "generate"
    tokenizer: str = MODEL_NAME
    trust_remote_code: bool = False
    tokenizer_mode: str = "auto"
    max_model_len: int = 100
    tokenizer_revision: str | None = None
    multimodal_config: MultiModalConfig = field(default_factory=MultiModalConfig)
    hf_config: MockHFConfig = field(default_factory=MockHFConfig)
    logits_processors: list[str] | None = None
    diff_sampling_param: dict | None = None
    allowed_local_media_path: str = ""
    allowed_media_domains: list[str] | None = None
    # Unannotated: a class attribute, not a dataclass field.
    encoder_config = None
    generation_config: str = "auto"
    skip_tokenizer_init: bool = False
    is_encoder_decoder: bool = False
    is_multimodal_model: bool = False

    def get_diff_sampling_param(self) -> dict:
        """Return ``diff_sampling_param``, or an empty dict when it is falsy."""
        return self.diff_sampling_param or {}


62
63
64
65
66
@dataclass
class MockParallelConfig:
    """Minimal mock parallel config; provides only ``_api_process_rank``."""

    _api_process_rank: int = 0


67
68
69
@dataclass
class MockVllmConfig:
    """Container pairing a mock model config with a mock parallel config."""

    model_config: MockModelConfig
    parallel_config: MockParallelConfig
71
72


73
class MockLoRAResolver(LoRAResolver):
    """Resolver that knows exactly two adapters: "test-lora" and "invalid-lora"."""

    async def resolve_lora(
        self, base_model_name: str, lora_name: str
    ) -> LoRARequest | None:
        """Return a ``LoRARequest`` for a known adapter name, else ``None``.

        Args:
            base_model_name: Ignored by this mock.
            lora_name: Adapter name to look up.

        Returns:
            A request with a fixed integer id and a fake path for "test-lora"
            (id 1) or "invalid-lora" (id 2); ``None`` for any other name.
        """
        known_adapter_ids = {"test-lora": 1, "invalid-lora": 2}
        lora_int_id = known_adapter_ids.get(lora_name)
        if lora_int_id is None:
            return None
        return LoRARequest(
            lora_name=lora_name,
            lora_int_id=lora_int_id,
            lora_path=f"/fake/path/{lora_name}",
        )


@pytest.fixture(autouse=True)
def register_mock_resolver():
    """Register the mock LoRA resolver for a test and drop it afterwards."""
    LoRAResolverRegistry.register_resolver(MOCK_RESOLVER_NAME, MockLoRAResolver())
    yield
    # Teardown: tolerate the entry already being gone.
    LoRAResolverRegistry.resolvers.pop(MOCK_RESOLVER_NAME, None)


103
104
105
def _build_renderer(model_config: MockModelConfig):
    """Create an ``HfRenderer`` for *model_config*.

    The tokenizer name and keyword arguments come from
    ``tokenizer_args_from_config``; the other two values it returns are not
    used here.
    """
    _, tokenizer_name, _, kwargs = tokenizer_args_from_config(model_config)

    return HfRenderer.from_config(
        MockVllmConfig(model_config, parallel_config=MockParallelConfig()),
        tokenizer_kwargs={**kwargs, "tokenizer_name": tokenizer_name},
    )


112
113
114
@pytest.fixture
def mock_serving_setup():
    """Provide a mocked engine and a serving completion instance built on it.

    Returns:
        A ``(mock_engine, serving_completion)`` tuple.
    """
    mock_engine = MagicMock(spec=AsyncLLM)
    mock_engine.errored = False

    async def mock_add_lora_side_effect(lora_request: LoRARequest):
        """Simulate engine behavior when adding LoRAs."""
        if lora_request.lora_name == "invalid-lora":
            # Simulate failure during addition (e.g. invalid format)
            raise ValueError(f"Simulated failure adding LoRA: {lora_request.lora_name}")
        # Every other adapter, including "test-lora", is added successfully.
        return True

    mock_engine.add_lora = AsyncMock(side_effect=mock_add_lora_side_effect)

    async def mock_generate(*args, **kwargs):
        """Async generator that yields nothing."""
        for _ in []:
            yield _

    mock_engine.generate = MagicMock(spec=AsyncLLM.generate, side_effect=mock_generate)

    mock_engine.generate.reset_mock()
    mock_engine.add_lora.reset_mock()

    mock_engine.model_config = MockModelConfig()
    mock_engine.input_processor = MagicMock()
    mock_engine.io_processor = MagicMock()
    mock_engine.renderer = _build_renderer(mock_engine.model_config)

    models = OpenAIServingModels(
        engine_client=mock_engine,
        base_model_paths=BASE_MODEL_PATHS,
    )

    serving_render = OpenAIServingRender(
        model_config=mock_engine.model_config,
        renderer=mock_engine.renderer,
        io_processor=mock_engine.io_processor,
        model_registry=models.registry,
        request_logger=None,
        chat_template=None,
        chat_template_content_format="auto",
    )
    serving_completion = OpenAIServingCompletion(
        mock_engine, models, openai_serving_render=serving_render, request_logger=None
    )

    return mock_engine, serving_completion


@pytest.mark.asyncio
async def test_serving_completion_with_lora_resolver(mock_serving_setup, monkeypatch):
    """A resolver-known adapter is added to the engine and passed to generate."""
    monkeypatch.setenv("VLLM_ALLOW_RUNTIME_LORA_UPDATING", "true")

    mock_engine, serving_completion = mock_serving_setup

    lora_model_name = "test-lora"
    req_found = CompletionRequest(
        model=lora_model_name,
        prompt="Generate with LoRA",
    )

    # Suppress potential errors during the mocked generate call,
    # as we are primarily checking for add_lora and generate calls
    with suppress(Exception):
        await serving_completion.create_completion(req_found)

    mock_engine.add_lora.assert_awaited_once()
    added_lora_request = mock_engine.add_lora.call_args.args[0]
    assert isinstance(added_lora_request, LoRARequest)
    assert added_lora_request.lora_name == lora_model_name

    mock_engine.generate.assert_called_once()
    generate_lora_request = mock_engine.generate.call_args.kwargs["lora_request"]
    assert isinstance(generate_lora_request, LoRARequest)
    assert generate_lora_request.lora_name == lora_model_name


@pytest.mark.asyncio
async def test_serving_completion_resolver_not_found(mock_serving_setup, monkeypatch):
    """An adapter unknown to the resolver yields a NOT_FOUND error response."""
    monkeypatch.setenv("VLLM_ALLOW_RUNTIME_LORA_UPDATING", "true")

    mock_engine, serving_completion = mock_serving_setup

    non_existent_model = "non-existent-lora-adapter"
    req = CompletionRequest(
        model=non_existent_model,
        prompt="what is 1+1?",
    )

    response = await serving_completion.create_completion(req)

    # Nothing was resolved, so the engine must not have been touched.
    mock_engine.add_lora.assert_not_awaited()
    mock_engine.generate.assert_not_called()

    assert isinstance(response, ErrorResponse)
    assert response.error.code == HTTPStatus.NOT_FOUND.value
    assert non_existent_model in response.error.message
213
214
215
216


@pytest.mark.asyncio
async def test_serving_completion_resolver_add_lora_fails(
    mock_serving_setup, monkeypatch
):
    """A resolved adapter that the engine fails to add yields BAD_REQUEST."""
    monkeypatch.setenv("VLLM_ALLOW_RUNTIME_LORA_UPDATING", "true")

    mock_engine, serving_completion = mock_serving_setup

    invalid_model = "invalid-lora"
    req = CompletionRequest(
        model=invalid_model,
        prompt="what is 1+1?",
    )

    response = await serving_completion.create_completion(req)

    # Assert add_lora was called before the failure
    mock_engine.add_lora.assert_awaited_once()
    called_lora_request = mock_engine.add_lora.call_args.args[0]
    assert isinstance(called_lora_request, LoRARequest)
    assert called_lora_request.lora_name == invalid_model

    # Assert generate was *not* called due to the failure
    mock_engine.generate.assert_not_called()

    # Assert the correct error response
    assert isinstance(response, ErrorResponse)
    assert response.error.code == HTTPStatus.BAD_REQUEST.value
    assert invalid_model in response.error.message
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259


@pytest.mark.asyncio
async def test_serving_completion_flag_not_set(mock_serving_setup, monkeypatch):
    """Without the runtime-update flag, neither add_lora nor generate is called."""
    # The premise of this test is that the flag is absent. Remove it explicitly
    # so a value exported in the developer/CI environment cannot break the test.
    monkeypatch.delenv("VLLM_ALLOW_RUNTIME_LORA_UPDATING", raising=False)

    mock_engine, serving_completion = mock_serving_setup

    lora_model_name = "test-lora"
    req_found = CompletionRequest(
        model=lora_model_name,
        prompt="Generate with LoRA",
    )

    await serving_completion.create_completion(req_found)

    mock_engine.add_lora.assert_not_called()
    mock_engine.generate.assert_not_called()