"tests/vscode:/vscode.git/clone" did not exist on "0fca3cdcf265cd375bca684d951702b6b7adf65a"
test_gpt_oss.py 3.24 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
End-to-end accuracy test for GPT-OSS model quantization.

Config:
    Task:   gsm8k_platinum
    Filter: flexible-extract
    n-shot: 5
    Metric: exact_match

Run: pytest tests/models/quantization/test_gpt_oss.py
"""

import importlib.metadata
16
import importlib.util
17
18
19
20
21
22
23
from dataclasses import dataclass

import huggingface_hub
import lm_eval
import pytest
from packaging import version

24
25
from vllm.utils.torch_utils import cuda_device_count_stateless

26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
# Maps each quantized GPT-OSS checkpoint (Hugging Face Hub repo id) to the
# accuracy it is expected to reach. The keys are also the repos whose
# accessibility is probed at import time, and the items drive the
# parametrization of the accuracy test in this module.
MODEL_ACCURACIES = {
    # Full quantization: attention linears and MoE linears
    "amd/gpt-oss-20b-WFP8-AFP8-KVFP8": 0.89,
    # MoE linears only quantization
    "amd/gpt-oss-20b-MoE-Quant-W-MXFP4-A-FP8-KV-FP8": 0.89,
    # MoE linears only quantization (entry currently disabled)
    # "amd/gpt-oss-20b-MoE-Quant-W-MXFP4-A-MXFP4-KV-FP8": 0.90,
}

def _quark_mxfp4_available() -> bool:
    """Return True when the ``quark`` module and ``amd-quark>=0.9.0`` are installed.

    A module named ``quark`` can be importable without the ``amd-quark``
    distribution metadata being present; in that case
    ``importlib.metadata.version`` raises ``PackageNotFoundError``. This is
    reported as "not available" so the test is skipped instead of failing at
    collection time.
    """
    if importlib.util.find_spec("quark") is None:
        return False
    try:
        installed = importlib.metadata.version("amd-quark")
    except importlib.metadata.PackageNotFoundError:
        return False
    return version.parse(installed) >= version.parse("0.9.0")


# Evaluated once at import time; used as a skip condition for the test below.
QUARK_MXFP4_AVAILABLE = _quark_mxfp4_available()


def has_huggingface_access(repo):
    """Return True when the refs of ``repo`` can be listed on the Hugging Face Hub.

    Args:
        repo: Hub repository id, e.g. ``"org/model-name"``.

    Returns:
        ``True`` if ``huggingface_hub.list_repo_refs`` succeeds, ``False`` if
        the Hub rejects the request or cannot be reached.
    """
    try:
        huggingface_hub.list_repo_refs(repo)
    except (huggingface_hub.errors.HfHubHTTPError, OSError):
        # HfHubHTTPError is the base of RepositoryNotFoundError, so the
        # original "repo not found / no read access" case is still handled.
        # Other Hub HTTP errors and connection-level failures (OSError) are
        # treated the same way: this function runs at import time, and an
        # uncaught error here would break test collection instead of
        # producing a skip.
        return False
    return True


# True only when every model listed in MODEL_ACCURACIES is readable on the Hub.
# A generator (rather than a list) lets all() stop at the first inaccessible
# repo, avoiding further network requests once the answer is known.
HF_HUB_AMD_ORG_ACCESS = all(
    has_huggingface_access(model_name) for model_name in MODEL_ACCURACIES
)


@dataclass
class ModelCase:
    """Pairs a model identifier with a ``tp`` value.

    NOTE(review): ``tp`` presumably stands for the tensor-parallel size --
    confirm against callers; this class is not referenced in the visible code.
    """

    model_id: str
    tp: int


@dataclass
class EvaluationConfig:
    """Holds the model name and derives the model arguments for an evaluation run."""

    model_name: str

    def get_model_args(self, tp_size: int):
        """Build the model-argument mapping for the given tensor-parallel size.

        Args:
            tp_size: Value stored under ``"tensor_parallel_size"``.

        Returns:
            A new dict whose ``"pretrained"`` entry is ``self.model_name``;
            every other entry is a fixed setting.
        """
        chat_template_args = dict(reasoning_effort="low")
        model_args = dict(
            pretrained=self.model_name,
            chat_template_args=chat_template_args,
            enable_thinking=True,
            think_end_token="200008",
            tensor_parallel_size=tp_size,
            dtype="auto",
            gpu_memory_utilization=0.95,
            trust_remote_code=False,
            enable_prefix_caching=False,
            enforce_eager=False,
        )
        return model_args


@pytest.mark.skipif(not QUARK_MXFP4_AVAILABLE, reason="amd-quark>=0.9 is not available")
@pytest.mark.skipif(
    not HF_HUB_AMD_ORG_ACCESS,
    reason="Read access to huggingface.co/amd is required for this test.",
)
@pytest.mark.parametrize("tp_size", [1, 2, 4, 8])
@pytest.mark.parametrize("model_name, expected_accuracy", MODEL_ACCURACIES.items())
def test_gpt_oss_attention_quantization(
    model_name: str, tp_size: int, expected_accuracy: float
):
    """Check that a quantized GPT-OSS model reaches its expected accuracy.

    Runs the ``gsm8k_platinum`` task (5-shot, chat template applied) through
    ``lm_eval`` with the ``vllm`` backend and compares the
    ``exact_match,flexible-extract`` score against ``expected_accuracy``
    minus a fixed absolute tolerance.

    Args:
        model_name: Hub repo id of the model under test.
        tp_size: Tensor-parallel size; the case is skipped when fewer GPUs
            are available.
        expected_accuracy: Reference accuracy for ``model_name``.
    """
    if tp_size > cuda_device_count_stateless():
        pytest.skip("Not enough GPUs to run this test case")

    model_args = EvaluationConfig(model_name).get_model_args(tp_size)

    extra_run_kwargs = {
        "gen_kwargs": {"max_gen_toks": 8000},
        "apply_chat_template": True,
        "fewshot_as_multiturn": True,
        "num_fewshot": 5,
    }

    lm_eval_out = lm_eval.simple_evaluate(
        model="vllm",
        model_args=model_args,
        tasks="gsm8k_platinum",
        batch_size="auto",
        **extra_run_kwargs,
    )
    measured_accuracy = float(
        lm_eval_out["results"]["gsm8k_platinum"]["exact_match,flexible-extract"]
    )

    # The margin is subtracted from the expected value, i.e. it is an
    # absolute tolerance (not a relative one).
    tolerance = 0.02
    threshold = expected_accuracy - tolerance
    assert measured_accuracy >= threshold, (
        f"Accuracy {measured_accuracy:.4f} is below threshold "
        f"{threshold:.4f} (expected >= {expected_accuracy} - {tolerance})"
    )