# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import asyncio
import time
from unittest.mock import Mock

import pytest

from vllm.config import ModelConfig, RendererConfig
from vllm.entrypoints.openai.serving_engine import OpenAIServing
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
from vllm.tokenizers import MistralTokenizer


@pytest.fixture()
def serving() -> OpenAIServing:
    """Create a minimal OpenAIServing instance for testing.

    Every collaborator is a ``Mock``; the config and model-registry mocks
    are built with ``spec=`` so that accessing an attribute the real class
    does not define raises ``AttributeError`` instead of silently returning
    another mock.

    Returns:
        An ``OpenAIServing`` built from a mocked engine client and a mocked
        ``OpenAIServingModels``, with request logging disabled.
    """
    # The engine client is an unconstrained mock.
    engine_client = Mock()

    model_config = Mock(spec=ModelConfig)
    model_config.max_model_len = 32768

    # The renderer config points back at the same model config object.
    renderer_config = Mock(spec=RendererConfig)
    renderer_config.model_config = model_config

    models = Mock(spec=OpenAIServingModels)
    models.model_config = model_config
    models.renderer_config = renderer_config
    models.input_processor = Mock()
    models.io_processor = Mock()

    # Return directly so no local shadows the fixture's own name.
    return OpenAIServing(
        engine_client=engine_client,
        models=models,
        request_logger=None,
    )


@pytest.mark.asyncio
async def test_async_mistral_tokenizer_does_not_block_event_loop(
    serving: OpenAIServing,
):
    """Check that a slow Mistral chat-template call leaves the loop responsive.

    The tokenizer's ``apply_chat_template`` is replaced by a function that
    sleeps for two seconds. While that call is outstanding, the test
    repeatedly yields to the event loop and measures how long each yield
    takes; a yield of half a second or more counts as the loop being blocked.

    Args:
        serving: The ``OpenAIServing`` instance provided by the ``serving``
            fixture.
    """
    expected_tokens = [1, 2, 3]

    # Mock the blocking version to sleep
    def mocked_apply_chat_template(*_args, **_kwargs):
        time.sleep(2)
        return expected_tokens

    mock_tokenizer = Mock(spec=MistralTokenizer)
    mock_tokenizer.apply_chat_template.side_effect = mocked_apply_chat_template

    # NOTE(review): the result is deliberately not awaited here. This test is
    # only meaningful if the call returns an awaitable that is already
    # running in the background (presumably executor-backed) - confirm
    # against OpenAIServing; a bare coroutine would not start until awaited.
    task = serving._apply_mistral_chat_template_async(
        tokenizer=mock_tokenizer, messages=[], chat_template=None, tools=[]
    )

    # Ensure the event loop is not blocked
    blocked_count = 0
    for _i in range(20):  # Check over ~2 seconds
        # A zero-length sleep only yields control, so the elapsed time is
        # how long the loop took to hand control back to this coroutine.
        start = time.perf_counter()
        await asyncio.sleep(0)
        elapsed = time.perf_counter() - start

        # an overly generous elapsed time for slow machines
        if elapsed >= 0.5:
            blocked_count += 1

        await asyncio.sleep(0.1)

    # Ensure task completes
    tokens = await task
    assert tokens == expected_tokens, "Mocked blocking tokenizer was not called"
    assert blocked_count == 0, "Event loop blocked during tokenization"