test_bitsandbytes.py 8.78 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3
4
5
6
'''Tests whether bitsandbytes computation is enabled correctly.

Run `pytest tests/quantization/test_bitsandbytes.py`.
'''
7
8

import gc
9
import os
10

11
12
import pytest
import torch
13
from transformers import BitsAndBytesConfig
14

15
from tests.quantization.utils import is_quant_method_supported
zhuwenwen's avatar
zhuwenwen committed
16
17
from ..utils import models_path_prefix
from vllm.platforms import current_platform
18

19

20
from ..models.utils import check_embeddings_close
21
from ..utils import compare_two_settings, create_new_process_for_each_test
youkaichao's avatar
youkaichao committed
22

23
# (model path, description) cases for the in-flight 4-bit quantization tests.
# Each path is resolved under `models_path_prefix`; the description is only a
# human-readable label for the parametrized case.
models_4bit_to_test = [
    (os.path.join(models_path_prefix, "facebook/opt-125m"),
     "quantize opt model inflight"),
    (os.path.join(models_path_prefix, "mistralai/Mistral-7B-Instruct-v0.3"),
     "quantize inflight model with both HF and Mistral format weights"),
]

29
30
31
32
# (model id, description) cases for the 4-bit embedding test.
# NOTE(review): unlike the other model lists in this file, this id is not
# joined with `models_path_prefix` -- confirm whether that is intentional.
models_4bit_to_embedding_test = [
    ("intfloat/e5-mistral-7b-instruct", "quantize embedding model inflight"),
]

33
# (model path, description) cases for checkpoints that are already stored in
# 4-bit bitsandbytes format.
# NOTE: the "qaunt" spelling in the name is kept on purpose; the parametrize
# decorator of test_load_pre_quant_4bit_bnb_model refers to it by this name.
models_pre_qaunt_4bit_to_test = [
    (os.path.join(models_path_prefix,
                  'PrunaAI/Einstein-v6.1-Llama3-8B-bnb-4bit-smashed'),
     'read pre-quantized 4-bit FP4 model'),
    (os.path.join(models_path_prefix, 'poedator/opt-125m-bnb-4bit'),
     'read pre-quantized 4-bit NF4 opt model'),
]

# (model path, description) cases for checkpoints that are already stored in
# 8-bit bitsandbytes format. Paths are resolved under `models_path_prefix`.
models_pre_quant_8bit_to_test = [
    (os.path.join(models_path_prefix, 'meta-llama/Llama-Guard-3-8B-INT8'),
     'read pre-quantized llama 8-bit model'),
    (os.path.join(models_path_prefix, "yec019/fbopt-350m-8bit"),
     "read pre-quantized 8-bit opt model"),
]


46
@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes")
                    or current_platform.is_rocm(),
                    reason='bitsandbytes is not supported on this GPU type.')
@pytest.mark.parametrize("model_name, description", models_4bit_to_test)
@create_new_process_for_each_test()
def test_load_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
                             model_name, description) -> None:
    """Compare vLLM and HF outputs with in-flight 4-bit quantization.

    The HF side is loaded with ``BitsAndBytesConfig(load_in_4bit=True)``;
    ``pre_quant=False`` makes ``validate_generated_texts`` ask vLLM for
    ``quantization='bitsandbytes'``.

    Args:
        hf_runner: fixture used to build the HF reference model.
        vllm_runner: fixture used to build the vLLM model.
        example_prompts: fixture with prompts; only the first one is used.
        model_name: model path of the parametrized case.
        description: label of the parametrized case (unused in the body).
    """
    hf_model_kwargs = dict(quantization_config=BitsAndBytesConfig(
        load_in_4bit=True))
    validate_generated_texts(hf_runner,
                             vllm_runner,
                             example_prompts[:1],
                             model_name,
                             pre_quant=False,
                             hf_model_kwargs=hf_model_kwargs)
57
58


59
@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes")
                    or current_platform.is_rocm(),
                    reason='bitsandbytes is not supported on this GPU type.')
@pytest.mark.parametrize("model_name, description",
                         models_pre_qaunt_4bit_to_test)
@create_new_process_for_each_test()
def test_load_pre_quant_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
                                       model_name, description) -> None:
    """Compare vLLM and HF outputs for pre-quantized 4-bit checkpoints.

    ``pre_quant=True`` makes ``validate_generated_texts`` pass
    ``quantization=None`` to vLLM, and no extra HF model kwargs are supplied.

    Args:
        hf_runner: fixture used to build the HF reference model.
        vllm_runner: fixture used to build the vLLM model.
        example_prompts: fixture with prompts; only the first one is used.
        model_name: model path of the parametrized case.
        description: label of the parametrized case (unused in the body).
    """
    validate_generated_texts(hf_runner,
                             vllm_runner,
                             example_prompts[:1],
                             model_name,
                             pre_quant=True)
69

70

71
@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes")
                    or current_platform.is_rocm(),
                    reason='bitsandbytes is not supported on this GPU type.')
@pytest.mark.parametrize("model_name, description",
                         models_pre_quant_8bit_to_test)
@create_new_process_for_each_test()
def test_load_8bit_bnb_model(hf_runner, vllm_runner, example_prompts,
                             model_name, description) -> None:
    """Compare vLLM and HF outputs for pre-quantized 8-bit checkpoints.

    ``pre_quant=True`` makes ``validate_generated_texts`` pass
    ``quantization=None`` to vLLM, and no extra HF model kwargs are supplied.

    Args:
        hf_runner: fixture used to build the HF reference model.
        vllm_runner: fixture used to build the vLLM model.
        example_prompts: fixture with prompts; only the first one is used.
        model_name: model path of the parametrized case.
        description: label of the parametrized case (unused in the body).
    """
    validate_generated_texts(hf_runner,
                             vllm_runner,
                             example_prompts[:1],
                             model_name,
                             pre_quant=True)
81
82


83
84
@pytest.mark.skipif(torch.cuda.device_count() < 2,
                    reason='Test requires at least 2 GPUs.')
@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes")
                    or current_platform.is_rocm(),
                    reason='bitsandbytes is not supported on this GPU type.')
@pytest.mark.parametrize("model_name, description", models_4bit_to_test)
@create_new_process_for_each_test()
def test_load_tp_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
                                model_name, description) -> None:
    """Compare vLLM (tensor_parallel_size=2) and HF outputs, 4-bit in-flight.

    Same comparison as the single-GPU in-flight 4-bit test, except that
    ``vllm_tp_size=2`` is forwarded to ``validate_generated_texts``. The test
    is skipped when fewer than two CUDA devices are visible.

    Args:
        hf_runner: fixture used to build the HF reference model.
        vllm_runner: fixture used to build the vLLM model.
        example_prompts: fixture with prompts; only the first one is used.
        model_name: model path of the parametrized case.
        description: label of the parametrized case (unused in the body).
    """
    hf_model_kwargs = dict(quantization_config=BitsAndBytesConfig(
        load_in_4bit=True))
    validate_generated_texts(hf_runner,
                             vllm_runner,
                             example_prompts[:1],
                             model_name,
                             pre_quant=False,
                             hf_model_kwargs=hf_model_kwargs,
                             vllm_tp_size=2)


103
104
105
106
107
# NOTE(review): unlike the other generation tests in this file, this skip
# condition does not check current_platform.is_rocm() -- confirm whether that
# is intentional.
@pytest.mark.skipif(torch.cuda.device_count() < 2,
                    reason='Test requires at least 2 GPUs.')
@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
                    reason='bitsandbytes is not supported on this GPU type.')
@pytest.mark.parametrize("model_name, description", models_4bit_to_test)
@create_new_process_for_each_test()
def test_load_pp_4bit_bnb_model(model_name, description) -> None:
    """Compare a baseline run against the same run with pipeline parallelism.

    Two argument lists are handed to ``compare_two_settings``: a common
    bitsandbytes configuration, and that same configuration plus
    ``--pipeline-parallel-size 2``. The comparison itself is done by that
    helper.

    Args:
        model_name: model path of the parametrized case.
        description: label of the parametrized case (unused in the body).
    """
    common_args = [
        "--disable-log-stats",
        "--disable-log-requests",
        "--dtype",
        "bfloat16",
        "--enable-prefix-caching",
        "--quantization",
        "bitsandbytes",
        "--gpu-memory-utilization",
        "0.7",
    ]
    # Identical to the baseline except for the pipeline-parallel size.
    pp_args = [
        *common_args,
        "--pipeline-parallel-size",
        "2",
    ]
    compare_two_settings(model_name, common_args, pp_args)


129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
                    reason='bitsandbytes is not supported on this GPU type.')
@pytest.mark.parametrize("model_name, description",
                         models_4bit_to_embedding_test)
@pytest.mark.parametrize("dtype", ["half"])
@create_new_process_for_each_test()
def test_4bit_bnb_embedding_model(
    model_name,
    description,
    hf_runner,
    vllm_runner,
    example_prompts,
    dtype: str,
) -> None:
    """Check that 4-bit in-flight quantized embeddings match between HF and vLLM.

    The HF model is loaded as a sentence-transformer with
    ``BitsAndBytesConfig(load_in_4bit=True)``; the vLLM model is loaded with
    ``task="embed"`` and ``quantization="bitsandbytes"``. Both encode the same
    stripped prompts and the results are handed to ``check_embeddings_close``
    with ``tol=5e-2``.
    """
    # sentence_transformers strips its input texts, see:
    # https://github.com/UKPLab/sentence-transformers/blob/v3.1.1/sentence_transformers/models/Transformer.py#L159
    # The example prompts end with "\n" (e.g.
    # "Write a short story about a robot that dreams for the first time.\n"),
    # so without stripping them here the input_ids would differ between the
    # HF model and the vLLM model and the comparison would fail.
    stripped_prompts = [str(text).strip() for text in example_prompts]

    # Reference embeddings: HF with in-flight 4-bit quantization.
    quant_kwargs = {
        "quantization_config": BitsAndBytesConfig(load_in_4bit=True),
    }
    with hf_runner(
            model_name,
            dtype=dtype,
            model_kwargs=quant_kwargs,
            is_sentence_transformer=True,
    ) as reference_model:
        reference_embeddings = reference_model.encode(stripped_prompts)

    # Candidate embeddings: vLLM with bitsandbytes quantization.
    with vllm_runner(model_name,
                     task="embed",
                     dtype=dtype,
                     quantization="bitsandbytes") as candidate_model:
        candidate_embeddings = candidate_model.encode(stripped_prompts)

    check_embeddings_close(
        embeddings_0_lst=reference_embeddings,
        embeddings_1_lst=candidate_embeddings,
        name_0="hf",
        name_1="vllm",
        tol=5e-2,
    )


177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
def log_generated_texts(prompts, outputs, runner_name):
    """Pair each generated text with its prompt and the runner's name.

    Args:
        prompts: indexable collection of prompts, aligned with ``outputs``.
        outputs: iterable of 2-item entries; only the second item (the
            generated text) of each entry is used.
        runner_name: label stored in every returned entry.

    Returns:
        A list with one dict per output, holding the keys ``"prompt"``,
        ``"runner_name"`` and ``"generated_text"``.
    """
    return [
        {
            "prompt": prompts[position],
            "runner_name": runner_name,
            "generated_text": text,
        }
        for position, (_, text) in enumerate(outputs)
    ]


def validate_generated_texts(hf_runner,
                             vllm_runner,
                             prompts,
                             model_name,
                             pre_quant=False,
                             hf_model_kwargs=None,
                             vllm_tp_size=1):
    """Assert that vLLM and HF generate identical greedy text for a model.

    Args:
        hf_runner: callable returning a context manager for the HF model.
        vllm_runner: callable returning a context manager for the vLLM model.
        prompts: prompts passed to both runners.
        model_name: model path or id passed to both runners.
        pre_quant: if True the checkpoint is treated as already quantized and
            vLLM gets ``quantization=None``; otherwise vLLM is asked for
            ``quantization='bitsandbytes'``.
        hf_model_kwargs: extra ``model_kwargs`` for the HF runner; ``None``
            is replaced by an empty dict.
        vllm_tp_size: forwarded to vLLM as ``tensor_parallel_size``.

    Raises:
        AssertionError: if the runners return a different number of outputs
            or any pair of generated strings differs.
    """
    # NOTE: run vLLM first, as it requires a clean process
    # when using distributed inference
    with vllm_runner(model_name,
                     quantization=None if pre_quant else 'bitsandbytes',
                     tensor_parallel_size=vllm_tp_size,
                     enforce_eager=False) as vllm_model:
        # The second argument is presumably the max number of new tokens --
        # confirm against the runner's generate_greedy signature.
        vllm_outputs = vllm_model.generate_greedy(prompts, 8)
        vllm_logs = log_generated_texts(prompts, vllm_outputs, "VllmRunner")

    # Clean up the GPU memory before loading the HF model
    gc.collect()
    torch.cuda.empty_cache()

    if hf_model_kwargs is None:
        hf_model_kwargs = {}

    # Run with HF runner
    with hf_runner(model_name, model_kwargs=hf_model_kwargs) as hf_model:
        hf_outputs = hf_model.generate_greedy(prompts, 8)
        hf_logs = log_generated_texts(prompts, hf_outputs, "HfRunner")

    # Clean up the GPU memory for the next test
    gc.collect()
    torch.cuda.empty_cache()

    # zip() below would silently drop unmatched entries, so check first.
    assert len(hf_logs) == len(vllm_logs), (
        f"Model: {model_name}\n"
        f"HF produced {len(hf_logs)} outputs but vLLM produced "
        f"{len(vllm_logs)}")

    # Compare the generated strings
    for hf_log, vllm_log in zip(hf_logs, vllm_logs):
        hf_str = hf_log["generated_text"]
        vllm_str = vllm_log["generated_text"]
        prompt = hf_log["prompt"]

        # The newline after the model name keeps it from running straight
        # into the "Mismatch ..." line of the message.
        assert hf_str == vllm_str, (f"Model: {model_name}\n"
                                    f"Mismatch between HF and vLLM outputs:\n"
                                    f"Prompt: {prompt}\n"
                                    f"HF Output: '{hf_str}'\n"
                                    f"vLLM Output: '{vllm_str}'")