# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from dataclasses import dataclass

import lm_eval
import pytest

# lm_eval task name; also the key used to read this task's results back.
TASK = "gsm8k"
# Metric key looked up under the task's entry in the lm_eval results.
FILTER = "exact_match,strict-match"
# Allowed deviation between measured and expected score. Despite the name,
# the assertion applies it as an absolute margin (measured +/- RTOL).
RTOL = 0.03


@dataclass
class GSM8KAccuracyTestConfig:
    """One GSM8K accuracy test case: a model and the score expected from it."""

    # Model identifier, forwarded to the backend as ``pretrained=<model_name>``.
    model_name: str
    # Reference score that the measured result is compared against.
    expected_value: float

    def get_model_args(self) -> str:
        """Return the comma-separated ``model_args`` string for this model.

        The string carries the model name plus fixed ``max_model_len`` and
        ``max_num_seqs`` settings.
        """
        return f"pretrained={self.model_name},max_model_len=4096,max_num_seqs=32"


# Test cases fed to the parametrized GSM8K accuracy test.
# NOTE: Accuracy scores measured on GPUs.
ACCURACY_CONFIGS = [
    GSM8KAccuracyTestConfig(
        model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
        expected_value=0.76,
    ),  # no bias
    # NOTE(rob): We cannot re-initialize vLLM in the same process for TPU,
    # so only one of these tests can run in a single call to pytest. As
    # a follow-up, move this into the LM-EVAL section of the CI.
    # GSM8KAccuracyTestConfig(
    #     model_name="neuralmagic/Qwen2-7B-Instruct-quantized.w8a8",
    #     expected_value=0.66),  # bias in QKV layers
]


@pytest.mark.parametrize("config", ACCURACY_CONFIGS)
def test_gsm8k_correctness(config: GSM8KAccuracyTestConfig):
    """Check that a model's measured GSM8K score matches its expected score.

    Runs ``lm_eval.simple_evaluate`` with the ``vllm`` model type on the task
    named by ``TASK``, then compares the ``FILTER`` metric against
    ``config.expected_value``.

    Args:
        config: The model under test and the score it is expected to reach.

    Raises:
        AssertionError: If the measured score is not strictly within
            ``RTOL`` (applied as an absolute margin) of the expected score.
    """
    results = lm_eval.simple_evaluate(
        model="vllm",
        model_args=config.get_model_args(),
        # Use the shared constant so the evaluated task and the key used to
        # read its result below cannot drift apart.
        tasks=TASK,
        batch_size="auto",
    )

    expected_value = config.expected_value
    measured_value = results["results"][TASK][FILTER]
    assert measured_value - RTOL < expected_value < measured_value + RTOL, (
        f"Expected: {expected_value} |  Measured: {measured_value}"
    )