# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

# Expanded quantized model tests for CPU offloading
# Base tests: tests/basic_correctness/test_cpu_offload.py

# Standard library
import os

# Third-party
import pytest

from tests.quantization.utils import is_quant_method_supported

from vllm.platforms import current_platform
from ..utils import compare_two_settings, models_path_prefix


@pytest.mark.skipif(not is_quant_method_supported("fp8") or current_platform.is_rocm(),
                    reason="fp8 is not supported on this GPU type.")
def test_cpu_offload_fp8():
    """Run fp8 models with and without ``--cpu-offload-gb 1``.

    Skipped when fp8 is reported as unsupported or when the current
    platform is ROCm. Two checkpoints under ``models_path_prefix`` are
    exercised: an unquantized one that is quantized via
    ``--quantization fp8``, and one that is already stored in fp8.

    NOTE(review): ``compare_two_settings`` comes from ``..utils``;
    presumably it checks that both argument lists yield matching
    results -- confirm against its definition.
    """
    # Test quantization of an unquantized checkpoint
    unquantized_model = os.path.join(models_path_prefix,
                                     "meta-llama/Llama-3.2-1B-Instruct")
    compare_two_settings(unquantized_model,
                         ["--quantization", "fp8"],
                         ["--quantization", "fp8", "--cpu-offload-gb", "1"],
                         max_wait_seconds=480)

    # Test loading a quantized checkpoint
    prequantized_model = os.path.join(models_path_prefix,
                                      "neuralmagic/Qwen2-1.5B-Instruct-FP8")
    compare_two_settings(prequantized_model, [],
                         ["--cpu-offload-gb", "1"],
                         max_wait_seconds=480)


@pytest.mark.skipif(
    not is_quant_method_supported("gptq_marlin") or current_platform.is_rocm(),
    reason="gptq_marlin is not supported on this GPU type.",
)
def test_cpu_offload_gptq(monkeypatch):
    """Run the GPTQ Int4 checkpoint with and without ``--cpu-offload-gb 1``.

    The same checkpoint is exercised twice: once with no explicit
    quantization flag (the "GPTQ Marlin" case) and once with
    ``--quantization gptq`` forced on both sides of the comparison.
    """
    # Dummy weights upset this quant method, so real weights are forced.
    monkeypatch.setenv("VLLM_TEST_FORCE_LOAD_FORMAT", "auto")

    checkpoint = os.path.join(models_path_prefix,
                              "Qwen/Qwen2-1.5B-Instruct-GPTQ-Int4")
    # Kept as a tuple and unpacked per call so every call receives its own
    # fresh list objects.
    offload_flags = ("--cpu-offload-gb", "1")

    # GPTQ Marlin
    compare_two_settings(checkpoint, [], [*offload_flags],
                         max_wait_seconds=480)

    # GPTQ
    gptq_flags = ("--quantization", "gptq")
    compare_two_settings(checkpoint,
                         [*gptq_flags],
                         [*gptq_flags, *offload_flags],
                         max_wait_seconds=480)


@pytest.mark.skipif(
    not is_quant_method_supported("awq_marlin") or current_platform.is_rocm(),
    reason="awq_marlin is not supported on this GPU type.",
)
def test_cpu_offload_awq(monkeypatch):
    """Run the AWQ checkpoint with and without ``--cpu-offload-gb 1``.

    Two (baseline, offloaded) argument pairs are compared in order: first
    with no explicit quantization flag (the "AWQ Marlin" case), then with
    ``--quantization awq`` on both sides.
    """
    # Real weights are required: this quant method is sensitive to dummy ones.
    monkeypatch.setenv("VLLM_TEST_FORCE_LOAD_FORMAT", "auto")

    model_dir = os.path.join(models_path_prefix, "Qwen/Qwen2-1.5B-Instruct-AWQ")

    scenarios = (
        # AWQ Marlin
        ([], ["--cpu-offload-gb", "1"]),
        # AWQ
        (["--quantization", "awq"],
         ["--quantization", "awq", "--cpu-offload-gb", "1"]),
    )
    for baseline_args, offloaded_args in scenarios:
        compare_two_settings(model_dir, baseline_args, offloaded_args,
                             max_wait_seconds=480)


@pytest.mark.skipif(
    not is_quant_method_supported("gptq_marlin") or current_platform.is_rocm(),
    reason="gptq_marlin is not supported on this GPU type.",
)
def test_cpu_offload_compressed_tensors(monkeypatch):
    """Run compressed-tensors checkpoints with and without ``--cpu-offload-gb 1``.

    Three checkpoints are compared in order: wNa16, w4a16_marlin24 and w8a8.

    NOTE(review): the skip condition is keyed on ``gptq_marlin`` support,
    presumably because these checkpoints rely on the same kernels -- confirm.
    """
    # Force real weights; this quant method is sensitive to dummy weights.
    monkeypatch.setenv("VLLM_TEST_FORCE_LOAD_FORMAT", "auto")

    checkpoints = (
        # wNa16
        "nm-testing/tinyllama-oneshot-w4a16-channel-v2",
        # w4a16_marlin24
        "nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t",
        # w8a8
        "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change",
    )
    for repo_id in checkpoints:
        compare_two_settings(os.path.join(models_path_prefix, repo_id),
                             [],
                             ["--cpu-offload-gb", "1"],
                             max_wait_seconds=480)