test_serving_engine.py 2.06 KB
Newer Older
1
2
3
4
5
6
7
8
9
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import asyncio
import time
from unittest.mock import Mock

import pytest

10
from vllm.config import ModelConfig
11
from vllm.entrypoints.openai.engine.serving import OpenAIServing
12
from vllm.entrypoints.openai.models.serving import OpenAIServingModels
13
from vllm.tokenizers.mistral import MistralTokenizer
14
15
16
17
18
19
20
21
22
23
24


@pytest.fixture()
def serving() -> OpenAIServing:
    """Create a minimal OpenAIServing instance for testing.

    Every collaborator is a mock: the engine client is a plain ``Mock``,
    while the model config and the serving-models registry are spec'd mocks
    so that access to attributes missing from the real classes raises
    ``AttributeError``. No request logger is attached.

    Returns:
        An ``OpenAIServing`` built on top of the mocked collaborators.
    """
    # Spec'd config mock; only ``max_model_len`` is given a concrete value.
    model_config = Mock(spec=ModelConfig)
    model_config.max_model_len = 32768

    # Registry mock exposing the attributes this fixture wires up explicitly.
    models = Mock(spec=OpenAIServingModels)
    models.model_config = model_config
    models.input_processor = Mock()
    models.io_processor = Mock()

    return OpenAIServing(
        engine_client=Mock(),
        models=models,
        request_logger=None,
    )


@pytest.mark.asyncio
async def test_async_mistral_tokenizer_does_not_block_event_loop(
    serving: OpenAIServing,
):
    """Check that Mistral chat-template application leaves the event loop free.

    The tokenizer's ``apply_chat_template`` is replaced by a stub that blocks
    its calling thread for two seconds. While that call is in flight the test
    repeatedly yields to the event loop and times each yield; a slow yield
    means the loop itself was stalled by the blocking stub.
    """
    expected_tokens = [1, 2, 3]

    # Mock the blocking version to sleep
    def mocked_apply_chat_template(*_args, **_kwargs):
        time.sleep(2)
        return expected_tokens

    mock_tokenizer = Mock(spec=MistralTokenizer)
    mock_tokenizer.apply_chat_template.side_effect = mocked_apply_chat_template

    # ensure_future() hands back an existing Future unchanged and wraps a bare
    # coroutine in a Task, so the work is in flight before the probe loop
    # starts whichever of the two the helper returns.
    task = asyncio.ensure_future(
        serving._apply_mistral_chat_template_async(
            tokenizer=mock_tokenizer, messages=[], chat_template=None, tools=[]
        )
    )

    # Ensure the event loop is not blocked
    blocked_count = 0
    for _ in range(20):  # Check over ~2 seconds
        start = time.perf_counter()
        # A zero-length sleep is a bare yield: it returns promptly unless
        # something is hogging the loop.
        await asyncio.sleep(0)
        elapsed = time.perf_counter() - start

        # an overly generous elapsed time for slow machines
        if elapsed >= 0.5:
            blocked_count += 1

        await asyncio.sleep(0.1)

    # Ensure task completes
    tokens = await task
    assert tokens == expected_tokens, "Mocked blocking tokenizer was not called"
    assert blocked_count == 0, "Event loop blocked during tokenization"