inference_benchmark.py

"""
Inference benchmarking tool.

Requirements:
    transformers
    accelerate
    bitsandbytes
    optimum-benchmark

Usage: python inference_benchmark.py model_id

options:
    -h, --help            show this help message and exit
    --configs {bf16,fp16,nf4,nf4-dq,int8,int8-decomp} [{bf16,fp16,nf4,nf4-dq,int8,int8-decomp} ...]
    --bf16
    --fp16
    --nf4
    --nf4-dq
    --int8
    --int8-decomp
    --batches BATCHES [BATCHES ...]
    --input-length INPUT_LENGTH
    --out-dir OUT_DIR
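
Example:
    python inference_benchmark.py <model_id> --configs nf4 int8 --batches 1 8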
"""

import argparse
from pathlib import Path

from optimum_benchmark import Benchmark, BenchmarkConfig, InferenceConfig, ProcessConfig, PyTorchConfig
from optimum_benchmark.logging_utils import setup_logging
import torch

# bfloat16 requires a GPU with CUDA compute capability >= 8.0 (Ampere or newer).
BFLOAT16_SUPPORT = torch.cuda.get_device_capability()[0] >= 8

# Maps each config name to the torch_dtype and bitsandbytes quantization settings
# forwarded to the PyTorch backend.
WEIGHTS_CONFIGS = {
    "fp16": {"torch_dtype": "float16", "quantization_scheme": None, "quantization_config": {}},
    "bf16": {"torch_dtype": "bfloat16", "quantization_scheme": None, "quantization_config": {}},
    # 4-bit NormalFloat (NF4) quantization without double quantization.
    "nf4": {
        "torch_dtype": "bfloat16" if BFLOAT16_SUPPORT else "float16",
        "quantization_scheme": "bnb",
        "quantization_config": {
            "load_in_4bit": True,
            "bnb_4bit_quant_type": "nf4",
            "bnb_4bit_use_double_quant": False,
            "bnb_4bit_compute_dtype": torch.bfloat16 if BFLOAT16_SUPPORT else "float16",
        },
    },
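    # NF4 with double quantization: the quantization constants are themselves quantized to save memory.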
    "nf4-dq": {
        "torch_dtype": "bfloat16" if BFLOAT16_SUPPORT else "float16",
        "quantization_scheme": "bnb",
        "quantization_config": {
            "load_in_4bit": True,
            "bnb_4bit_quant_type": "nf4",
            "bnb_4bit_use_double_quant": True,
            "bnb_4bit_compute_dtype": torch.bfloat16 if BFLOAT16_SUPPORT else "float16",
        },
    },
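    # LLM.int8() with the default outlier threshold (6.0): outlier activations are computed in fp16.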
    "int8-decomp": {
        "torch_dtype": "float16",
        "quantization_scheme": "bnb",
        "quantization_config": {
            "load_in_8bit": True,
            "llm_int8_threshold": 6.0,
        },
    },
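    # LLM.int8() with the threshold set to 0.0, which disables the fp16 outlier decomposition.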
    "int8": {
        "torch_dtype": "float16",
        "quantization_scheme": "bnb",
        "quantization_config": {
            "load_in_8bit": True,
            "llm_int8_threshold": 0.0,
        },
    },
}

if __name__ == "__main__":
    setup_logging(level="INFO")

    parser = argparse.ArgumentParser(description="bitsandbytes inference benchmark tool")

    parser.add_argument("model_id", type=str, help="The model checkpoint to use.")

    parser.add_argument(
        "--configs",
        nargs="+",
        choices=["bf16", "fp16", "nf4", "nf4-dq", "int8", "int8-decomp"],
        default=["nf4", "int8", "int8-decomp"],
    )
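    # The per-dtype flags below append their config to the --configs selection.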
    parser.add_argument("--bf16", dest="configs", action="append_const", const="bf16")
    parser.add_argument("--fp16", dest="configs", action="append_const", const="fp16")
    parser.add_argument("--nf4", dest="configs", action="append_const", const="nf4")
    parser.add_argument("--nf4-dq", dest="configs", action="append_const", const="nf4-dq")
    parser.add_argument("--int8", dest="configs", action="append_const", const="int8")
    parser.add_argument("--int8-decomp", dest="configs", action="append_const", const="int8-decomp")

    parser.add_argument("--batches", nargs="+", type=int, default=[1, 8, 16, 32])
    parser.add_argument("--input-length", type=int, default=64)

    parser.add_argument("--out-dir", type=str, default="reports")

    args = parser.parse_args()

    out_dir = Path(args.out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    for batch_size in args.batches:
        print(f"Benchmarking batch size: {batch_size}")
        for config in args.configs:
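            # Each run executes in a fresh spawned subprocess; device isolation checks that
            # no other process is using the GPU during the benchmark.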
            launcher_config = ProcessConfig(device_isolation=True, start_method="spawn")
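            # Measure latency and memory for this batch size and input sequence length.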
            scenario_config = InferenceConfig(
                latency=True,
                memory=True,
                input_shapes={"batch_size": batch_size, "sequence_length": args.input_length},
            )
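            # Load the real model weights on GPU 0 with this config's dtype and quantization settings.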
            backend_config = PyTorchConfig(
                device="cuda",
                device_ids="0",
                device_map="auto",
                no_weights=False,
                model=args.model_id,
                **WEIGHTS_CONFIGS[config],
            )
            benchmark_config = BenchmarkConfig(
                name=f"benchmark-{config}-bsz{batch_size}",
                scenario=scenario_config,
                launcher=launcher_config,
                backend=backend_config,
            )

            out_path = out_dir / f"benchmark_{config}_bsz{batch_size}.json"

            # Launch the benchmark, log the results, and save the report as JSON.
            benchmark_report = Benchmark.launch(benchmark_config)
            benchmark_report.log()
            benchmark_report.save_json(out_path)