# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import os

import requests

from ...utils import RemoteOpenAIServer, models_path_prefix

# Model served by the test: the shared models path prefix joined with the
# model identifier.
MODEL_NAME = os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B")


def _assert_is_sleeping(remote_server, expected):
    """Query the ``is_sleeping`` endpoint and check the reported state."""
    response = requests.get(remote_server.url_for("is_sleeping"))
    assert response.status_code == 200
    assert response.json().get("is_sleeping") is expected


def test_sleep_mode():
    """Exercise the ``sleep`` / ``wake_up`` / ``is_sleeping`` endpoints.

    Launches a server with ``--enable-sleep-mode`` and verifies two flows:

    * a level-1 sleep is reported by ``is_sleeping`` and a plain
      ``wake_up`` clears it;
    * after a level-1 sleep, waking only the ``weights`` tag still reports
      sleeping, and additionally waking the ``kv_cache`` tag clears it.
    """
    # dtype, max-len etc set so that this can run in CI
    args = [
        "--dtype",
        "bfloat16",
        "--max-model-len",
        "8192",
        "--max-num-seqs",
        "128",
        "--enable-sleep-mode",
    ]

    # NOTE(review): VLLM_SERVER_DEV_MODE is presumably what exposes the
    # sleep/wake_up endpoints used below -- confirm against the server code.
    with RemoteOpenAIServer(MODEL_NAME,
                            args,
                            env_dict={
                                "VLLM_SERVER_DEV_MODE": "1",
                                "CUDA_VISIBLE_DEVICES": "0"
                            }) as remote_server:
        # Plain sleep followed by a full wake-up.
        response = requests.post(remote_server.url_for("sleep"),
                                 params={"level": "1"})
        assert response.status_code == 200
        _assert_is_sleeping(remote_server, True)

        response = requests.post(remote_server.url_for("wake_up"))
        assert response.status_code == 200
        _assert_is_sleeping(remote_server, False)

        # test wake up with tags
        response = requests.post(remote_server.url_for("sleep"),
                                 params={"level": "1"})
        assert response.status_code == 200

        response = requests.post(remote_server.url_for("wake_up"),
                                 params={"tags": ["weights"]})
        assert response.status_code == 200

        # Waking only the weights is not enough: the server is expected to
        # keep reporting is_sleeping until the kv_cache is woken as well.
        _assert_is_sleeping(remote_server, True)

        response = requests.post(remote_server.url_for("wake_up"),
                                 params={"tags": ["kv_cache"]})
        assert response.status_code == 200

        # Both tags have now been woken, so the server must report awake.
        _assert_is_sleeping(remote_server, False)