# test_lora_resolvers.py 7.9 KB
# Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3
4
5
6

from contextlib import suppress
from dataclasses import dataclass, field
from http import HTTPStatus
7
from unittest.mock import AsyncMock, MagicMock
8
9
10

import pytest

11
from vllm.config.multimodal import MultiModalConfig
12
13
14
15
16
from vllm.entrypoints.openai.completion.protocol import CompletionRequest
from vllm.entrypoints.openai.completion.serving import OpenAIServingCompletion
from vllm.entrypoints.openai.engine.protocol import ErrorResponse
from vllm.entrypoints.openai.models.protocol import BaseModelPath
from vllm.entrypoints.openai.models.serving import OpenAIServingModels
17
18
from vllm.lora.request import LoRARequest
from vllm.lora.resolver import LoRAResolver, LoRAResolverRegistry
19
20
from vllm.renderers.hf import HfRenderer
from vllm.tokenizers.registry import tokenizer_args_from_config
21
from vllm.v1.engine.async_llm import AsyncLLM
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36

# Model identifier used both as the served model name and as its model path.
MODEL_NAME = "openai-community/gpt2"
BASE_MODEL_PATHS = [BaseModelPath(name=MODEL_NAME, model_path=MODEL_NAME)]

# Key under which the mock resolver is registered in LoRAResolverRegistry.
MOCK_RESOLVER_NAME = "mock_test_resolver"


@dataclass
class MockHFConfig:
    """Stand-in for a Hugging Face config that exposes only ``model_type``."""

    # Defaults to the literal placeholder string "any".
    model_type: str = "any"


@dataclass
class MockModelConfig:
    """Minimal mock ModelConfig for testing.

    Every attribute has a default, so ``MockModelConfig()`` builds a usable
    instance. ``runner_type`` and ``encoder_config`` carry no annotation, so
    ``@dataclass`` treats them as plain class attributes rather than
    constructor fields.
    """

    model: str = MODEL_NAME
    # Unannotated: a class attribute, not a dataclass field.
    runner_type = "generate"
    tokenizer: str = MODEL_NAME
    trust_remote_code: bool = False
    tokenizer_mode: str = "auto"
    max_model_len: int = 100
    tokenizer_revision: str | None = None
    multimodal_config: MultiModalConfig = field(default_factory=MultiModalConfig)
    hf_config: MockHFConfig = field(default_factory=MockHFConfig)
    logits_processors: list[str] | None = None
    diff_sampling_param: dict | None = None
    allowed_local_media_path: str = ""
    allowed_media_domains: list[str] | None = None
    # Unannotated: a class attribute, not a dataclass field.
    encoder_config = None
    generation_config: str = "auto"
    skip_tokenizer_init: bool = False
    is_encoder_decoder: bool = False

    def get_diff_sampling_param(self):
        """Return ``diff_sampling_param``, or an empty dict when it is falsy."""
        return self.diff_sampling_param or {}


60
61
62
63
64
@dataclass
class MockVllmConfig:
    """Wrapper holding a single ``model_config`` attribute."""

    # Required constructor argument; no default is provided.
    model_config: MockModelConfig


65
class MockLoRAResolver(LoRAResolver):
    """Resolver stub that recognises exactly two adapter names."""

    async def resolve_lora(
        self, base_model_name: str, lora_name: str
    ) -> LoRARequest | None:
        """Map a known adapter name to a ``LoRARequest``.

        Args:
            base_model_name: Ignored by this mock.
            lora_name: Adapter name to resolve.

        Returns:
            A request for ``"test-lora"`` (id 1) or ``"invalid-lora"`` (id 2)
            pointing at ``/fake/path/<name>``; ``None`` for any other name.
        """
        # "invalid-lora" resolves here as well; this module's engine mock is
        # what rejects it later, in its ``add_lora`` side effect.
        known_ids = {"test-lora": 1, "invalid-lora": 2}
        lora_int_id = known_ids.get(lora_name)
        if lora_int_id is None:
            return None
        return LoRARequest(
            lora_name=lora_name,
            lora_int_id=lora_int_id,
            lora_path=f"/fake/path/{lora_name}",
        )


@pytest.fixture(autouse=True)
def register_mock_resolver():
    """Register the mock LoRA resolver for each test and remove it afterwards."""
    LoRAResolverRegistry.register_resolver(MOCK_RESOLVER_NAME, MockLoRAResolver())
    yield
    # Teardown: drop the registry entry if it is still present.
    LoRAResolverRegistry.resolvers.pop(MOCK_RESOLVER_NAME, None)


95
96
97
98
def _build_renderer(model_config: MockModelConfig):
    """Build an ``HfRenderer`` for ``model_config``.

    The tokenizer name and keyword arguments come from
    ``tokenizer_args_from_config``; the name is merged into the kwargs under
    the ``"tokenizer_name"`` key, overriding any same-named entry.
    """
    # Only the second and fourth elements of the returned 4-tuple are used.
    _, tokenizer_name, _, kwargs = tokenizer_args_from_config(model_config)
    tokenizer_kwargs = {**kwargs, "tokenizer_name": tokenizer_name}

    return HfRenderer(
        MockVllmConfig(model_config),
        tokenizer_kwargs=tokenizer_kwargs,
    )


104
105
106
@pytest.fixture
def mock_serving_setup():
    """Provides a mocked engine and serving completion instance.

    Returns:
        A ``(mock_engine, serving_completion)`` tuple. ``mock_engine`` is a
        ``MagicMock`` specced on ``AsyncLLM`` whose ``add_lora`` raises
        ``ValueError`` for ``"invalid-lora"`` and returns ``True`` otherwise,
        and whose ``generate`` is an async generator that yields nothing.
    """
    mock_engine = MagicMock(spec=AsyncLLM)
    mock_engine.errored = False

    async def mock_add_lora_side_effect(lora_request: LoRARequest):
        """Simulate engine behavior when adding LoRAs."""
        if lora_request.lora_name == "invalid-lora":
            # Simulate failure during addition (e.g. invalid format)
            raise ValueError(f"Simulated failure adding LoRA: {lora_request.lora_name}")
        # Every other adapter, including "test-lora", is added successfully.
        return True

    async def mock_generate(*args, **kwargs):
        """Async generator that accepts any arguments and yields nothing."""
        for item in ():
            yield item

    mock_engine.add_lora = AsyncMock(side_effect=mock_add_lora_side_effect)
    mock_engine.generate = MagicMock(spec=AsyncLLM.generate, side_effect=mock_generate)

    # Clear recorded calls so each test starts from an empty call history.
    mock_engine.generate.reset_mock()
    mock_engine.add_lora.reset_mock()

    mock_engine.model_config = MockModelConfig()
    mock_engine.input_processor = MagicMock()
    mock_engine.io_processor = MagicMock()
    mock_engine.renderer = _build_renderer(mock_engine.model_config)

    models = OpenAIServingModels(
        engine_client=mock_engine,
        base_model_paths=BASE_MODEL_PATHS,
    )
    serving_completion = OpenAIServingCompletion(
        mock_engine, models, request_logger=None
    )

    return mock_engine, serving_completion


@pytest.mark.asyncio
async def test_serving_completion_with_lora_resolver(mock_serving_setup, monkeypatch):
    """A resolvable adapter is added to the engine and passed to ``generate``."""
    monkeypatch.setenv("VLLM_ALLOW_RUNTIME_LORA_UPDATING", "true")

    mock_engine, serving_completion = mock_serving_setup

    lora_model_name = "test-lora"
    req_found = CompletionRequest(
        model=lora_model_name,
        prompt="Generate with LoRA",
    )

    # Suppress potential errors during the mocked generate call,
    # as we are primarily checking for add_lora and generate calls
    with suppress(Exception):
        await serving_completion.create_completion(req_found)

    # add_lora receives the resolved request as its first positional argument.
    mock_engine.add_lora.assert_awaited_once()
    added_request = mock_engine.add_lora.call_args.args[0]
    assert isinstance(added_request, LoRARequest)
    assert added_request.lora_name == lora_model_name

    # generate receives a request for the same adapter via ``lora_request``.
    mock_engine.generate.assert_called_once()
    generated_with = mock_engine.generate.call_args.kwargs["lora_request"]
    assert isinstance(generated_with, LoRARequest)
    assert generated_with.lora_name == lora_model_name


@pytest.mark.asyncio
async def test_serving_completion_resolver_not_found(mock_serving_setup, monkeypatch):
    """An adapter name the resolver does not know yields a NOT_FOUND error."""
    monkeypatch.setenv("VLLM_ALLOW_RUNTIME_LORA_UPDATING", "true")

    mock_engine, serving_completion = mock_serving_setup

    non_existent_model = "non-existent-lora-adapter"
    req = CompletionRequest(
        model=non_existent_model,
        prompt="what is 1+1?",
    )

    response = await serving_completion.create_completion(req)

    # Nothing was resolved, so the engine must not have been touched.
    mock_engine.add_lora.assert_not_awaited()
    mock_engine.generate.assert_not_called()

    assert isinstance(response, ErrorResponse)
    assert response.error.code == HTTPStatus.NOT_FOUND.value
    assert non_existent_model in response.error.message
196
197
198
199


@pytest.mark.asyncio
async def test_serving_completion_resolver_add_lora_fails(
    mock_serving_setup, monkeypatch
):
    """A resolved adapter that the engine rejects yields a BAD_REQUEST error."""
    monkeypatch.setenv("VLLM_ALLOW_RUNTIME_LORA_UPDATING", "true")

    mock_engine, serving_completion = mock_serving_setup

    invalid_model = "invalid-lora"
    req = CompletionRequest(
        model=invalid_model,
        prompt="what is 1+1?",
    )

    response = await serving_completion.create_completion(req)

    # Assert add_lora was called before the failure
    mock_engine.add_lora.assert_awaited_once()
    attempted_request = mock_engine.add_lora.call_args.args[0]
    assert isinstance(attempted_request, LoRARequest)
    assert attempted_request.lora_name == invalid_model

    # Assert generate was *not* called due to the failure
    mock_engine.generate.assert_not_called()

    # Assert the correct error response
    assert isinstance(response, ErrorResponse)
    assert response.error.code == HTTPStatus.BAD_REQUEST.value
    assert invalid_model in response.error.message
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242


@pytest.mark.asyncio
async def test_serving_completion_flag_not_set(mock_serving_setup, monkeypatch):
    """Without the runtime-update flag, a resolvable adapter is not loaded."""
    # Guarantee the flag is absent even if the surrounding environment (shell,
    # CI job) exports it; otherwise the assertions below would fail spuriously.
    monkeypatch.delenv("VLLM_ALLOW_RUNTIME_LORA_UPDATING", raising=False)

    mock_engine, serving_completion = mock_serving_setup

    lora_model_name = "test-lora"
    req_found = CompletionRequest(
        model=lora_model_name,
        prompt="Generate with LoRA",
    )

    await serving_completion.create_completion(req_found)

    # The engine must not have been touched.
    mock_engine.add_lora.assert_not_called()
    mock_engine.generate.assert_not_called()