test_offline_mode.py 4.58 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
Joe Runde's avatar
Joe Runde committed
3
"""Tests for HF_HUB_OFFLINE mode"""
4
import dataclasses
Joe Runde's avatar
Joe Runde committed
5
6
import importlib
import sys
7
import os
Joe Runde's avatar
Joe Runde committed
8
9

import pytest
10
import urllib3
Joe Runde's avatar
Joe Runde committed
11
12

from vllm import LLM
13
from vllm.distributed import cleanup_dist_env_and_memory
14
from vllm.engine.arg_utils import EngineArgs
15
from ...utils import models_path_prefix
Joe Runde's avatar
Joe Runde committed
16

17
18
# Keyword-argument sets passed to ``LLM(**config)`` by the tests in this file.
# Each entry is loaded once online (to populate the cache) and then again
# with HF_HUB_OFFLINE set.
MODEL_CONFIGS = [
    {
        "model": os.path.join(models_path_prefix, "facebook/opt-125m"),
        "enforce_eager": True,
        "gpu_memory_utilization": 0.20,
        "max_model_len": 64,
        "max_num_batched_tokens": 64,
        "max_num_seqs": 64,
        "tensor_parallel_size": 1,
    },
    {
        "model": os.path.join(
            models_path_prefix, "mistralai/Mistral-7B-Instruct-v0.1"
        ),
        "enforce_eager": True,
        "gpu_memory_utilization": 0.95,
        "max_model_len": 64,
        "max_num_batched_tokens": 64,
        "max_num_seqs": 64,
        "tensor_parallel_size": 1,
        "tokenizer_mode": "mistral",
    },
    # TODO: re-enable once these tests are run with V1
    # {
    #     "model": "sentence-transformers/all-MiniLM-L12-v2",
    #     "enforce_eager": True,
    #     "gpu_memory_utilization": 0.20,
    #     "max_model_len": 64,
    #     "max_num_batched_tokens": 64,
    #     "max_num_seqs": 64,
    #     "tensor_parallel_size": 1,
    # },
]
Joe Runde's avatar
Joe Runde committed
48
49
50


@pytest.fixture(scope="module")
def cache_models():
    """Construct every model in ``MODEL_CONFIGS`` once before the tests run.

    Building each ``LLM`` caches its model files first, so that the offline
    tests can later construct the same models without network access. The
    distributed environment and memory are cleaned up after each model is
    built.
    """
    # Cache model files first
    for model_config in MODEL_CONFIGS:
        LLM(**model_config)
        cleanup_dist_env_and_memory()

    yield
Joe Runde's avatar
Joe Runde committed
58
59
60


@pytest.mark.skip_global_cleanup
@pytest.mark.usefixtures("cache_models")
def test_offline_mode(monkeypatch: pytest.MonkeyPatch):
    """Ensure every model in ``MODEL_CONFIGS`` can be built in HF offline mode.

    ``HF_HUB_OFFLINE`` is set, urllib3 connections are patched to raise, and
    huggingface_hub / transformers are reloaded so they pick up offline mode.
    Constructing each ``LLM`` must then succeed without any HTTP call.
    """

    def disable_connect(*args, **kwargs):
        raise RuntimeError("No http calls allowed")

    try:
        # Set HF to offline mode and ensure we can still construct an LLM
        with monkeypatch.context() as m:
            m.setenv("HF_HUB_OFFLINE", "1")
            m.setenv("VLLM_NO_USAGE_STATS", "1")

            m.setattr(
                urllib3.connection.HTTPConnection,
                "connect",
                disable_connect,
            )
            m.setattr(
                urllib3.connection.HTTPSConnection,
                "connect",
                disable_connect,
            )

            # Need to re-import huggingface_hub
            # and friends to setup offline mode
            _re_import_modules()
            # Cached model files should be used in offline mode
            for model_config in MODEL_CONFIGS:
                LLM(**model_config)
    finally:
        # Reset the environment after the test
        # NB: Assuming tests are run in online mode
        # This reload must run after the monkeypatch context has exited:
        # inside the context HF_HUB_OFFLINE is still set, so reloading there
        # would leave the libraries configured for offline mode.
        _re_import_modules()
Joe Runde's avatar
Joe Runde committed
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115


def _re_import_modules():
    """Reload every loaded ``huggingface_hub`` and ``transformers`` module.

    Modules whose name starts with ``transformers_modules`` are left alone.
    huggingface_hub modules are reloaded first, then transformers modules.
    Every reload is attempted even if an earlier one failed; afterwards the
    most recent failure, if any, is re-raised so the calling test errors.
    """
    # Snapshot the module names before any reload runs.
    loaded_names = list(sys.modules)
    hub_names = [
        name for name in loaded_names if name.startswith("huggingface_hub")
    ]
    tf_names = [
        name for name in loaded_names
        if name.startswith("transformers")
        and not name.startswith("transformers_modules")
    ]

    last_error = None
    for name in [*hub_names, *tf_names]:
        try:
            module = sys.modules[name]
            importlib.reload(module)
        except Exception as err:
            # Keep going so the remaining modules still get reloaded and
            # other tests are less likely to be affected.
            last_error = err

    # Error this test if reloading a module failed
    if last_error is not None:
        raise last_error
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148


@pytest.mark.skip_global_cleanup
@pytest.mark.usefixtures("cache_models")
def test_model_from_huggingface_offline(monkeypatch: pytest.MonkeyPatch):
    """Ensure an ``LLM`` built from default ``EngineArgs`` works offline.

    ``HF_HUB_OFFLINE`` is set, urllib3 connections are patched to raise, and
    huggingface_hub / transformers are reloaded so they pick up offline mode.
    An ``LLM`` is then constructed from ``EngineArgs(model="facebook/opt-125m")``
    converted to keyword arguments.
    """

    def disable_connect(*args, **kwargs):
        raise RuntimeError("No http calls allowed")

    try:
        # Set HF to offline mode and ensure we can still construct an LLM
        with monkeypatch.context() as m:
            m.setenv("HF_HUB_OFFLINE", "1")
            m.setenv("VLLM_NO_USAGE_STATS", "1")

            m.setattr(
                urllib3.connection.HTTPConnection,
                "connect",
                disable_connect,
            )
            m.setattr(
                urllib3.connection.HTTPSConnection,
                "connect",
                disable_connect,
            )
            # Need to re-import huggingface_hub
            # and friends to setup offline mode
            _re_import_modules()
            engine_args = EngineArgs(model="facebook/opt-125m")
            LLM(**dataclasses.asdict(engine_args))
    finally:
        # Reset the environment after the test
        # NB: Assuming tests are run in online mode
        # This reload must run after the monkeypatch context has exited:
        # inside the context HF_HUB_OFFLINE is still set, so reloading there
        # would leave the libraries configured for offline mode.
        _re_import_modules()