conftest.py 1.71 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

"""Shared fixtures and utilities for SageMaker tests."""

import pytest
import pytest_asyncio

from ...utils import RemoteOpenAIServer

# Model name constants used across tests
# Base model identifier; handed to RemoteOpenAIServer by basic_server_with_lora.
MODEL_NAME_SMOLLM = "HuggingFaceTB/SmolLM2-135M-Instruct"
# Repo id of the LoRA adapter; fetched by smollm2_lora_files via snapshot_download.
LORA_ADAPTER_NAME_SMOLLM = "jekunz/smollm-135m-lora-fineweb-faroese"

# SageMaker header constants
# NOTE(review): not referenced in this file; presumably header names used by
# the session-related tests that import them -- confirm against those tests.
HEADER_SAGEMAKER_CLOSED_SESSION_ID = "X-Amzn-SageMaker-Closed-Session-Id"
HEADER_SAGEMAKER_SESSION_ID = "X-Amzn-SageMaker-Session-Id"
HEADER_SAGEMAKER_NEW_SESSION_ID = "X-Amzn-SageMaker-New-Session-Id"


@pytest.fixture(scope="session")
def smollm2_lora_files():
    """Fetch the LoRA adapter snapshot a single time for the whole test session.

    Returns:
        The value returned by ``snapshot_download`` for the
        ``LORA_ADAPTER_NAME_SMOLLM`` repo.
    """
    # Function-local import: huggingface_hub is only imported when this
    # fixture actually runs.
    from huggingface_hub import snapshot_download

    lora_snapshot = snapshot_download(repo_id=LORA_ADAPTER_NAME_SMOLLM)
    return lora_snapshot


@pytest.fixture(scope="module")
def basic_server_with_lora(smollm2_lora_files):
    """Launch one LoRA-enabled server per test module and yield it.

    The ``smollm2_lora_files`` fixture is requested so that it runs before the
    server starts; its return value is not read here.

    Yields:
        The ``RemoteOpenAIServer`` instance, which stays inside its context
        manager until the module's tests have finished.
    """
    engine_flags = [
        "--dtype", "bfloat16",
        "--max-model-len", "8192",
        "--enforce-eager",
    ]  # fmt: skip
    lora_flags = [
        "--enable-lora",
        "--max-lora-rank", "256",
        "--max-cpu-loras", "2",
    ]  # fmt: skip
    # Flag order matches the original single list: engine, LoRA, then seq cap.
    server_args = [*engine_flags, *lora_flags, "--max-num-seqs", "64"]

    extra_env = {"VLLM_ALLOW_RUNTIME_LORA_UPDATING": "True"}
    with RemoteOpenAIServer(
        MODEL_NAME_SMOLLM, server_args, env_dict=extra_env
    ) as server:
        yield server


@pytest_asyncio.fixture
async def async_client(basic_server_with_lora: RemoteOpenAIServer):
    """Yield an async OpenAI client bound to ``basic_server_with_lora``.

    The client is obtained from the server's ``get_async_client()`` and is
    kept inside its async context manager for the duration of the test.
    """
    client_context = basic_server_with_lora.get_async_client()
    async with client_context as client:
        yield client