rms_norm.py 4.94 KB
Newer Older
xgqdut2016's avatar
xgqdut2016 committed
1
2
import torch
import ctypes
3
from ctypes import c_uint64
xgqdut2016's avatar
xgqdut2016 committed
4
from libinfiniop import (
5
6
    LIBINFINIOP,
    TestTensor,
xgqdut2016's avatar
xgqdut2016 committed
7
    get_test_devices,
PanZezhongQY's avatar
PanZezhongQY committed
8
    check_error,
xgqdut2016's avatar
xgqdut2016 committed
9
10
11
12
13
    test_operator,
    get_args,
    debug,
    get_tolerance,
    profile_operation,
14
15
16
17
18
    TestWorkspace,
    InfiniDtype,
    InfiniDtypeNames,
    InfiniDeviceNames,
    infiniopOperatorDescriptor_t,
PanZezhongQY's avatar
PanZezhongQY committed
19
20
)

xgqdut2016's avatar
xgqdut2016 committed
21
22
23
24
# ==============================================================================
#  Configuration (Internal Use Only)
# ==============================================================================
# These are not meant to be imported from other modules
25
26
27
# Base shape/stride combinations. Every entry is unique: a repeated entry would
# only re-run an identical case for each weight/tensor dtype pairing.
_TEST_CASES_ = [
    # y_shape, x_shape, w_shape, y_stride, x_stride
    # (a stride of None is forwarded to TestTensor as-is; presumably it selects
    #  the default contiguous layout -- confirm against TestTensor)
    ((1, 4), (1, 4), (4,), None, None),
    ((2, 4), (2, 4), (4,), None, None),
    ((2, 2, 4), (2, 2, 4), (4,), None, None),
    ((2, 2, 4), (2, 2, 4), (4,), (12, 8, 1), (12, 8, 1)),
    ((16, 2048), (16, 2048), (2048,), None, None),
    ((16, 2048), (16, 2048), (2048,), (4096, 1), (4096, 1)),
    ((15, 3584), (15, 3584), (3584,), None, None),
    ((4, 4, 2048), (4, 4, 2048), (2048,), None, None),
    ((4, 4, 2048), (4, 4, 2048), (2048,), (2048, 8192, 1), (2048, 8192, 1)),
    ((4, 4, 2048), (4, 4, 2048), (2048,), (16384, 4096, 1), (16384, 4096, 1)),
    ((15, 8192), (15, 8192), (8192,), None, None),
]
xgqdut2016's avatar
xgqdut2016 committed
40

41
# Weight (w) dtypes to exercise.
# A None entry stands for "use the same dtype as the input tensor".
_WEIGHT_DTYPES = [None, InfiniDtype.F32, InfiniDtype.F16, InfiniDtype.BF16]

# Input (x) dtypes to exercise
_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.BF16]

# Expand the base cases: each tuple in _TEST_CASES_ is repeated once per weight
# dtype, with that dtype appended as its last element.
_TEST_CASES = [
    (*base_case, weight_dtype)
    for base_case in _TEST_CASES_
    for weight_dtype in _WEIGHT_DTYPES
]
xgqdut2016's avatar
xgqdut2016 committed
51
52
53

# Absolute/relative comparison tolerances, keyed by the tensor dtype under test
_TOLERANCE_MAP = {
    InfiniDtype.F16: dict(atol=2e-3, rtol=2e-3),
    InfiniDtype.BF16: dict(atol=1e-2, rtol=1e-2),
}

# Runtime switches; replaced by the command-line options when run as a script
DEBUG = False
PROFILE = False
# Pre-run and iteration counts forwarded to profile_operation()
NUM_PRERUN = 10
NUM_ITERATIONS = 1000
62

xgqdut2016's avatar
xgqdut2016 committed
63

64
def rms_norm(ans, x, w, eps):
    """Compute the PyTorch reference result for RMSNorm.

    The value ``x * rsqrt(mean(x ** 2, dim=-1) + eps) * w`` is evaluated on a
    float32 view of ``x`` and then cast back to the dtype of ``x``.

    Args:
        ans: Tensor that receives the result. It is re-pointed at the freshly
            computed tensor through ``Tensor.set_``, so it adopts that
            tensor's storage, shape and strides.
        x: Input tensor. It is never modified.
        w: Weight tensor multiplied element-wise into the normalized values.
        eps: Constant added to the mean square before the reciprocal square
            root is taken.
    """
    input_dtype = x.dtype
    # ``Tensor.to`` returns ``x`` itself when it is already float32, so every
    # step below is out-of-place; an in-place multiply here would silently
    # overwrite the caller's input.
    hidden_states = x.to(torch.float32)
    scale = (hidden_states.pow(2).mean(-1, keepdim=True) + eps).rsqrt()
    ans.set_((hidden_states * scale * w).to(input_dtype))
PanZezhongQY's avatar
PanZezhongQY committed
69
70


71
def test(
    handle,
    device,
    y_shape,
    x_shape,
    w_shape,
    y_stride,
    x_stride,
    w_dtype=InfiniDtype.F32,
    dtype=InfiniDtype.F16,
    sync=None,
):
    """Run one RMSNorm case and compare the library result with PyTorch.

    The reference result is computed first with ``rms_norm``; the library
    operator is then created, executed once and its output compared against
    the reference with the tolerances from ``_TOLERANCE_MAP``. When the
    module-level ``PROFILE`` flag is set, both implementations are also timed.

    Args:
        handle: Library handle passed to ``infiniopCreateRMSNormDescriptor``.
        device: Device identifier used to build the tensors and to look up
            the device name for the log line.
        y_shape: Shape of the output tensor.
        x_shape: Shape of the input tensor.
        w_shape: Shape of the weight tensor.
        y_stride: Strides handed to ``TestTensor`` for the output, or None.
        x_stride: Strides handed to ``TestTensor`` for the input, or None.
        w_dtype: Weight dtype; None means "same as ``dtype``".
        dtype: Dtype of the input and output tensors.
        sync: Optional callable invoked after the reference computation.

    Raises:
        AssertionError: If the library output is not within tolerance of the
            reference.
    """
    # None is the documented sentinel for "same dtype as the input"; test for
    # it explicitly instead of relying on truthiness.
    if w_dtype is None:
        w_dtype = dtype
    print(
        f"Testing RMS_Norm on {InfiniDeviceNames[device]} with y_shape:{y_shape} x_shape:{x_shape} w_shape:{w_shape}"
        f" y_stride:{y_stride} x_stride:{x_stride} w_dtype:{InfiniDtypeNames[w_dtype]} dtype:{InfiniDtypeNames[dtype]}"
    )

    y = TestTensor(y_shape, y_stride, dtype, device, mode="ones")
    x = TestTensor(x_shape, x_stride, dtype, device, scale=0.01)
    w = TestTensor(w_shape, None, w_dtype, device)

    # Reference result, written into y's torch-side tensor
    eps = 1e-6
    rms_norm(y.torch_tensor(), x.torch_tensor(), w.torch_tensor(), eps)

    if sync is not None:
        sync()

    descriptor = infiniopOperatorDescriptor_t()
    check_error(
        LIBINFINIOP.infiniopCreateRMSNormDescriptor(
            handle,
            ctypes.byref(descriptor),
            y.descriptor,
            x.descriptor,
            w.descriptor,
            eps,
        )
    )

    # From here on the operator descriptor exists; the finally clause makes
    # sure it is destroyed even when a later step or the comparison fails.
    try:
        # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
        for tensor in (x, y, w):
            tensor.destroy_desc()

        workspace_size = c_uint64(0)
        check_error(
            LIBINFINIOP.infiniopGetRMSNormWorkspaceSize(
                descriptor, ctypes.byref(workspace_size)
            )
        )
        workspace = TestWorkspace(workspace_size.value, y.device)

        def lib_rms_norm():
            check_error(
                LIBINFINIOP.infiniopRMSNorm(
                    descriptor,
                    workspace.data(),
                    workspace_size.value,
                    y.data(),
                    x.data(),
                    w.data(),
                    None,
                )
            )

        lib_rms_norm()

        atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
        if DEBUG:
            debug(y.actual_tensor(), y.torch_tensor(), atol=atol, rtol=rtol)
        assert torch.allclose(y.actual_tensor(), y.torch_tensor(), atol=atol, rtol=rtol)

        # Profiling workflow
        if PROFILE:
            # fmt: off
            profile_operation("PyTorch", lambda: rms_norm(y.torch_tensor(), x.torch_tensor(), w.torch_tensor(), eps), device, NUM_PRERUN, NUM_ITERATIONS)
            profile_operation("    lib", lib_rms_norm, device, NUM_PRERUN, NUM_ITERATIONS)
            # fmt: on
    finally:
        check_error(LIBINFINIOP.infiniopDestroyRMSNormDescriptor(descriptor))
PanZezhongQY's avatar
PanZezhongQY committed
151

152

PanZezhongQY's avatar
PanZezhongQY committed
153
154
155
if __name__ == "__main__":
    args = get_args()

    # Copy the command-line options onto the module-level switches
    DEBUG, PROFILE = args.debug, args.profile
    NUM_PRERUN, NUM_ITERATIONS = args.num_prerun, args.num_iterations

    # Run the full case matrix on each selected device
    for device in get_test_devices(args):
        test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)

    print("\033[92mTest passed!\033[0m")