benchmark_rht_cast.py 4.36 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# See LICENSE for license information.

import argparse
import torch
import pandas as pd
import torch.utils.benchmark as benchmark

import transformer_engine.pytorch as te
import transformer_engine_torch as tex
import transformer_engine.pytorch.cpp_extensions as ext

from transformer_engine.pytorch.tensor.nvfp4_tensor import NVFP4Quantizer

# Module-level settings. Neither name is referenced by the code visible in this
# script. NOTE(review): presumably leftovers from a related benchmark -- confirm
# before relying on them.
scale_padding_to = 1
permute_scale = False

# Maps a torch dtype to the matching Transformer Engine DType enum value.
# Only bfloat16 is listed; the mapping is not used by the code visible here.
TORCH_TO_TE_FLOAT_MAP = {
    torch.bfloat16: tex.DType.kBFloat16,
}


def run_kernel(shape, stochastic_rounding: bool, input_dtype=torch.bfloat16):
    """Time ``NVFP4Quantizer.update_quantized`` on a random ``shape`` input.

    Args:
        shape: ``(M, K)`` pair; both dimensions must be multiples of 16.
        stochastic_rounding: Forwarded to ``NVFP4Quantizer(stochastic_rounding=...)``.
        input_dtype: torch dtype of the random CUDA input tensor.

    Returns:
        ``(timing_us, throughput_GBps)``: the median time of one
        ``update_quantized`` call in microseconds, and the estimated memory
        traffic divided by that time.

    Raises:
        AssertionError: if either dimension of ``shape`` is not divisible by 16.
    """
    M, K = shape

    # Validate before touching the GPU so that a bad shape fails immediately
    # with this message instead of after (or in place of) a device allocation.
    assert M % 16 == 0, "Shape must be divisible by 16"
    assert K % 16 == 0, "Shape must be divisible by 16"

    # Generate random input data
    x = torch.randn([M, K], dtype=input_dtype, device="cuda")

    # Quantize
    nvfp4_quantizer = NVFP4Quantizer(
        fp4_dtype=tex.DType.kFloat4E2M1,
        rowwise=True,
        columnwise=True,
        with_amax_reduction=False,
        amax_reduction_group=None,
        with_rht=True,
        with_post_rht_amax=True,
        with_random_sign_mask=True,
        stochastic_rounding=stochastic_rounding,
    )
    x_nvfp4_sut = nvfp4_quantizer.make_empty(
        (M, K), dtype=x.dtype, device=x.device, requires_grad=False
    )
    # One call outside the timed region; the timed statement below reuses the
    # same input/output pair.
    x_nvfp4_sut = nvfp4_quantizer.update_quantized(x, x_nvfp4_sut)

    with torch.no_grad():
        stmt = "kernel_func(input, output)"
        globals_dict = {
            "kernel_func": nvfp4_quantizer.update_quantized,
            "input": x,
            "output": x_nvfp4_sut,
        }

        timing = benchmark.Timer(
            stmt=stmt,
            globals=globals_dict,
            num_threads=1,
        ).blocked_autorange(min_run_time=5)
    print(timing)
    timing_us = timing.median * 1e6

    # Use the real element size so the estimate follows ``input_dtype``
    # (2 bytes for the default bf16) instead of assuming bf16.
    input_nbytes = M * K * x.element_size()
    output_nbytes = M * K // 2  # //2 for fp4
    sf_nbytes = M * K // 16  # //16 for 1 byte per 16 elems

    total_nbytes = (
        0
        + input_nbytes
        * 3  # Reading input for Amax(x)&Amax(RHT(x.T)), Reading input for Cast(x), Reading input for Cast(RHT(x.T))
        + 2 * 4  # Output 2 * float for scale & amax
        + 2 * 4  # Input 2 * float
        + output_nbytes * 2  # Output from Cast(x) and Cast(RHT(x.T))
        + sf_nbytes * 2  # Scale factor
    )

    # NOTE: the divisor is 1024**3, so the figure is strictly GiB/s; the
    # "GB/s" label is kept so results stay comparable with earlier runs.
    throughput_GBps = total_nbytes / (1024 * 1024 * 1024) / (timing_us / 1e6)

    print(
        f"Stochastic rounding: {stochastic_rounding}, Total: {total_nbytes} bytes, Throughput:"
        f" {throughput_GBps} GB/s"
    )
    return timing_us, throughput_GBps


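# Nsight Compute profiling command.
# NOTE(review): the kernel name, output name and script name below look like they were
# carried over from benchmark_cast_transpose_1d_block.py -- adjust them for this script.
# ncu -f -o block_scaled_1d_cast_transpose_kernel --set=full --kernel-name "block_scaled_1d_cast_transpose_kernel" -s 5 -c 5 python benchmark_cast_transpose_1d_block.py --profile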

if __name__ == "__main__":
    # Command line: a single switch that selects the profiling shape set.
    cli = argparse.ArgumentParser()
    cli.add_argument("--profile", action="store_true", help="Enable profiling mode")
    args = cli.parse_args()

    if args.profile:
        print("Profiling is enabled.")
        # Profiling mode benchmarks only this one shape.
        shapes = [(16384, 6144)]
    else:
        print("Profiling is disabled.")
        shapes = [
            (8192, 5120),
            (8192, 10240),
            (8192, 2560),
            (8192, 11328),
            (8192, 512),
            (8192, 3584),
            (5120, 8192),
            (10240, 8192),
            (2560, 8192),
            (11328, 8192),
            (512, 8192),
            (3584, 8192),
            (4096, 16384),
            (14336, 16384),
        ]

    # Column order must match the order of the values appended to each row below.
    column_names = [
        "kernel",
        "shape",
        "stochastic_rounding",
        "timing_us",
        "throughput(GB/s)",
    ]

    rows = []
    # Only stochastic rounding is benchmarked; add False to the tuple to cover both modes.
    for stochastic_rounding in (True,):
        for shape in shapes:
            print(
                f"Running benchmark_func with shape {shape} and stochastic_rounding"
                f" {stochastic_rounding}"
            )
            timing_us, throughput_GBps = run_kernel(shape, stochastic_rounding)
            row = ["benchmark_func", shape, stochastic_rounding, timing_us, throughput_GBps]
            rows.append(row)

    # Show the collected results and write them to a CSV file in the working directory.
    results = pd.DataFrame(data=rows, columns=column_names)
    print(results)
    results.to_csv("benchmark_cast_nvfp4.csv", index=False)