# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for HF_HUB_OFFLINE mode"""

5
import dataclasses
Joe Runde's avatar
Joe Runde committed
6
7
8
9
import importlib
import sys

import pytest
10
import regex as re
11
import urllib3
Joe Runde's avatar
Joe Runde committed
12
13

from vllm import LLM
14
from vllm.distributed import cleanup_dist_env_and_memory
15
from vllm.engine.arg_utils import EngineArgs
Joe Runde's avatar
Joe Runde committed
16

17
18
19
20
21
22
23
24
25
26
# Engine configurations exercised by the offline-mode tests. Each entry is
# unpacked as keyword arguments to ``LLM(...)``: first by the ``cache_models``
# fixture, then again by ``test_offline_mode`` once HF_HUB_OFFLINE is set.
MODEL_CONFIGS = [
    {
        "model": "facebook/opt-125m",
        "enforce_eager": True,
        "gpu_memory_utilization": 0.20,
        "max_model_len": 64,
        "max_num_batched_tokens": 64,
        "max_num_seqs": 64,
        "tensor_parallel_size": 1,
    },
    {
        # The tokenizer is taken from a different repo than the weights.
        "model": "Qwen/Qwen3-0.6B",
        "enforce_eager": True,
        "gpu_memory_utilization": 0.50,
        "max_model_len": 64,
        "max_num_batched_tokens": 64,
        "max_num_seqs": 64,
        "tensor_parallel_size": 1,
        "tokenizer": "Qwen/Qwen3-4B",
    },
    {
        # Uses the "mistral" tokenizer mode instead of the default one.
        "model": "mistralai/Mistral-7B-Instruct-v0.1",
        "enforce_eager": True,
        "gpu_memory_utilization": 0.95,
        "max_model_len": 64,
        "max_num_batched_tokens": 64,
        "max_num_seqs": 64,
        "tensor_parallel_size": 1,
        "tokenizer_mode": "mistral",
    },
    # TODO: re-enable once these tests are run with V1
    # {
    #     "model": "sentence-transformers/all-MiniLM-L12-v2",
    #     "enforce_eager": True,
    #     "gpu_memory_utilization": 0.20,
    #     "max_model_len": 64,
    #     "max_num_batched_tokens": 64,
    #     "max_num_seqs": 64,
    #     "tensor_parallel_size": 1,
    # },
]
Joe Runde's avatar
Joe Runde committed
58
59
60


@pytest.fixture(scope="module")
def cache_models():
    """Construct every model in ``MODEL_CONFIGS`` once before the tests run.

    The models are built while the tests are still online so that their files
    are cached locally first; the offline tests then rely on that cache.
    Distributed state and memory are cleaned up after each construction.
    The fixture is module-scoped, so this happens once per test module.
    """
    # Cache model files first
    for model_config in MODEL_CONFIGS:
        LLM(**model_config)
        cleanup_dist_env_and_memory()

    yield
Joe Runde's avatar
Joe Runde committed
68
69
70


@pytest.mark.skip_global_cleanup
@pytest.mark.usefixtures("cache_models")
def test_offline_mode(monkeypatch: pytest.MonkeyPatch):
    """Check that every model in ``MODEL_CONFIGS`` loads with HF_HUB_OFFLINE=1.

    Inside a monkeypatch context the test sets ``HF_HUB_OFFLINE`` and
    ``VLLM_NO_USAGE_STATS``, replaces urllib3's ``connect`` methods with a
    function that raises, reloads the Hugging Face modules so they pick up
    the new environment, and then constructs each ``LLM``.

    The ``try``/``finally`` deliberately wraps the monkeypatch context rather
    than sitting inside it: the final reload must happen *after* the
    environment variables and the ``connect`` patches have been undone,
    otherwise the modules would be re-imported while HF_HUB_OFFLINE is still
    set and later tests in the same process would stay in offline mode.
    """
    try:
        # Set HF to offline mode and ensure we can still construct an LLM
        with monkeypatch.context() as m:
            m.setenv("HF_HUB_OFFLINE", "1")
            m.setenv("VLLM_NO_USAGE_STATS", "1")

            def disable_connect(*args, **kwargs):
                raise RuntimeError("No http calls allowed")

            # Any attempt to open an HTTP(S) connection fails loudly.
            m.setattr(
                urllib3.connection.HTTPConnection,
                "connect",
                disable_connect,
            )
            m.setattr(
                urllib3.connection.HTTPSConnection,
                "connect",
                disable_connect,
            )

            # Need to re-import huggingface_hub
            # and friends to set up offline mode
            _re_import_modules()
            # Cached model files should be used in offline mode
            for model_config in MODEL_CONFIGS:
                LLM(**model_config)
    finally:
        # Reset the environment after the test. At this point the monkeypatch
        # context has already restored the original environment.
        # NB: Assuming tests are run in online mode
        _re_import_modules()
Joe Runde's avatar
Joe Runde committed
103
104
105


def _re_import_modules():
    """Reload all currently imported ``huggingface_hub`` and ``transformers`` modules.

    Module names are snapshotted from ``sys.modules`` up front. Names starting
    with ``transformers_modules`` are skipped. Modules whose names match one
    of the alias patterns are not reloaded; they are removed from
    ``sys.modules`` so that they get re-aliased on their next import.

    A failing reload does not stop the loop: the remaining modules are still
    processed so that other tests are less likely to be affected.

    Raises:
        Exception: the most recent exception raised by ``importlib.reload``
            (or by looking the module up), re-raised after every module has
            been attempted.
    """
    hf_hub_module_names = [k for k in sys.modules if k.startswith("huggingface_hub")]
    transformers_module_names = [
        k
        for k in sys.modules
        if k.startswith("transformers") and not k.startswith("transformers_modules")
    ]

    # These modules are aliased in Transformers v5 and so cannot be reloaded directly
    # Compiled once here rather than once per module name in the loop below.
    aliased_module_patterns = [
        re.compile(r".+\.tokenization_utils$"),
        re.compile(r".+\.tokenization_utils_fast$"),
        re.compile(r".+\.models\..+\.image_processing_.+_fast$"),
    ]

    reload_exception = None
    for module_name in hf_hub_module_names + transformers_module_names:
        if any(pattern.match(module_name) for pattern in aliased_module_patterns):
            # Remove from sys.modules so they are re-aliased on next import.
            # ``pop`` with a default is used because the names were captured
            # before the loop started: an earlier reload may already have
            # dropped this entry, and a KeyError here would abort the rest
            # of the clean-up.
            sys.modules.pop(module_name, None)
            continue
        try:
            importlib.reload(sys.modules[module_name])
        except Exception as e:
            reload_exception = e
            # Try to continue clean up so that other tests are less likely to
            # be affected

    # Error this test if reloading a module failed
    if reload_exception is not None:
        raise reload_exception
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160


@pytest.mark.skip_global_cleanup
@pytest.mark.usefixtures("cache_models")
def test_model_from_huggingface_offline(monkeypatch: pytest.MonkeyPatch):
    """Check that ``facebook/opt-125m`` loads from default ``EngineArgs`` offline.

    Inside a monkeypatch context the test sets ``HF_HUB_OFFLINE`` and
    ``VLLM_NO_USAGE_STATS``, replaces urllib3's ``connect`` methods with a
    function that raises, reloads the Hugging Face modules, and constructs an
    ``LLM`` from ``dataclasses.asdict(EngineArgs(model=...))``.

    The ``try``/``finally`` deliberately wraps the monkeypatch context rather
    than sitting inside it: the final reload must happen *after* the
    environment variables and the ``connect`` patches have been undone,
    otherwise the modules would be re-imported while HF_HUB_OFFLINE is still
    set and later tests in the same process would stay in offline mode.
    """
    try:
        # Set HF to offline mode and ensure we can still construct an LLM
        with monkeypatch.context() as m:
            m.setenv("HF_HUB_OFFLINE", "1")
            m.setenv("VLLM_NO_USAGE_STATS", "1")

            def disable_connect(*args, **kwargs):
                raise RuntimeError("No http calls allowed")

            # Any attempt to open an HTTP(S) connection fails loudly.
            m.setattr(
                urllib3.connection.HTTPConnection,
                "connect",
                disable_connect,
            )
            m.setattr(
                urllib3.connection.HTTPSConnection,
                "connect",
                disable_connect,
            )
            # Need to re-import huggingface_hub
            # and friends to set up offline mode
            _re_import_modules()
            engine_args = EngineArgs(model="facebook/opt-125m")
            LLM(**dataclasses.asdict(engine_args))
    finally:
        # Reset the environment after the test. At this point the monkeypatch
        # context has already restored the original environment.
        # NB: Assuming tests are run in online mode
        _re_import_modules()