# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
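"""Benchmark vLLM's scaled int8/fp8 quantization kernels.

Measures the average latency of ops.scaled_int8_quant / ops.scaled_fp8_quant
on a (num_tokens, hidden_size) activation tensor.

Example (assumes a CUDA device is available):
    python benchmark_quant.py --num-tokens 4096 --hidden-size 8192 --quant-dtype fp8
"""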

import time

import torch

from vllm import _custom_ops as ops
from vllm.platforms import current_platform
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser


@torch.inference_mode()
def main(
    num_tokens: int,
    hidden_size: int,
    static_scale: bool,
    quant_dtype: torch.dtype,
    dtype: torch.dtype,
    seed: int = 0,
    do_profile: bool = False,
    num_warmup_iters: int = 5,
    num_iters: int = 100,
) -> None:
    current_platform.seed_everything(seed)
    torch.set_default_device("cuda")

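    # Random activations to quantize. With --static-scale, a precomputed
    # per-tensor scale is passed to the kernel; with scale=None, the kernel
    # computes the scale dynamically on each call.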
    x = torch.randn(num_tokens, hidden_size, dtype=dtype)
    scale = torch.randn(1, 1, dtype=torch.float32) if static_scale else None

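    # Wall-clock timing of num_iters back-to-back kernel launches, with a
    # synchronize before and after so all queued GPU work is measured.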
    def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
        torch.cuda.synchronize()
        if profile:
            torch.cuda.cudart().cudaProfilerStart()
        start_time = time.perf_counter()

        for _ in range(num_iters):
            if quant_dtype == torch.int8:
                ops.scaled_int8_quant(x, scale)
            else:
                ops.scaled_fp8_quant(x, scale)
        torch.cuda.synchronize()

        end_time = time.perf_counter()
        if profile:
            torch.cuda.cudart().cudaProfilerStop()
        return (end_time - start_time) / num_iters

    # Warmup.
    print("Warming up...")
    run_benchmark = run_cuda_benchmark
    run_benchmark(num_iters=num_warmup_iters, profile=False)

    # Benchmark.
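    # Under --profile, a single iteration runs between cudaProfilerStart and
    # cudaProfilerStop so an external CUDA profiler captures just that range.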
    if do_profile:
        latency = run_benchmark(num_iters=1, profile=True)
    else:
        latency = run_benchmark(num_iters=num_iters, profile=False)
    print(f"Kernel running time: {latency * 1000000:.3f} us")


if __name__ == "__main__":

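    # Map the CLI dtype string to a torch dtype ("fp8" -> float8_e4m3fn).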
    def to_torch_dtype(dt):
        if dt == "int8":
            return torch.int8
        if dt == "fp8":
            return torch.float8_e4m3fn
        raise ValueError(f"Unsupported dtype: {dt}")

    parser = FlexibleArgumentParser(
        description="Benchmark the quantization (fp8 or int8) kernel."
    )
    parser.add_argument("--num-tokens", type=int, default=4096)
    parser.add_argument("--hidden-size", type=int, default=8192)
    parser.add_argument("--static-scale", action="store_true")
    parser.add_argument(
        "--quant-dtype", type=str, choices=["fp8", "int8"], default="int8"
    )
    parser.add_argument(
        "--dtype", type=str, choices=["half", "bfloat16", "float"], default="half"
    )

    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument("--profile", action="store_true")
    parser.add_argument("--num-warmup-iters", type=int, default=5)
    parser.add_argument(
        "--num-iters",
        type=int,
        default=100,
        help="Number of benchmark iterations. "
        "If --profile is set, this number is ignored",
    )

    args = parser.parse_args()
    print(args)

    main(
        num_tokens=args.num_tokens,
        hidden_size=args.hidden_size,
        static_scale=args.static_scale,
        quant_dtype=to_torch_dtype(args.quant_dtype),
        dtype=STR_DTYPE_TO_TORCH_DTYPE[args.dtype],
        seed=args.seed,
        do_profile=args.profile,
        num_warmup_iters=args.num_warmup_iters,
        num_iters=args.num_iters,
    )