import math
import random
import time

import einops
import numpy as np
import pytest
import torch

import bitsandbytes as bnb
from bitsandbytes import functional as F
from tests.helpers import (
    BOOLEAN_TUPLES,
    TRUE_FALSE,
    describe_dtype,
    get_available_devices,
    get_test_dims,
    id_formatter,
    is_supported_on_hpu,
)

torch.set_printoptions(precision=5, sci_mode=False, linewidth=120, edgeitems=20, threshold=10000)
k = 20


def assert_all_approx_close(a, b, rtol=1e-3, atol=1e-3, count=0, throw=True):
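    """Assert that `a` and `b` are elementwise close, tolerating up to `count` mismatched elements.

    Returns the number of elements outside the given rtol/atol tolerances; the hard assertion
    via torch.testing.assert_close is only triggered when that number exceeds `count` and
    `throw` is True.
    """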
    idx = torch.isclose(a, b, rtol=rtol, atol=atol)
    sumval = (idx == 0).sum().item()
    if sumval > count:
        if throw:
            print(f"Too many values not close: assert {sumval} < {count}")
            torch.testing.assert_close(a, b, rtol=rtol, atol=atol)

    return sumval


class FFN(torch.nn.Module):
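    """Small two-layer feed-forward block (Linear -> ReLU -> Linear) with Xavier-initialized weights."""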
    def __init__(self, input_features, hidden_size, bias=True):
        super().__init__()
        self.fc1 = torch.nn.Linear(input_features, hidden_size, bias=bias)
        self.fc2 = torch.nn.Linear(hidden_size, input_features, bias=bias)

        with torch.no_grad():
            torch.nn.init.xavier_uniform_(self.fc1.weight)
            torch.nn.init.xavier_uniform_(self.fc2.weight)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        return x


class Timer:
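    """CUDA-event based timer used by the benchmark tests.

    tick()/tock() pairs accumulate elapsed milliseconds per name; tock() synchronizes,
    optionally prints the aggregate, and returns it. reset() clears all recorded timings.
    """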
    def __init__(self):
        self.starts = {}
        self.ends = {}
        self.agg = {}

    def tick(self, name="default"):
        if name not in self.starts:
            self.starts[name] = torch.cuda.Event(enable_timing=True)
            self.ends[name] = torch.cuda.Event(enable_timing=True)
            self.starts[name].record()
        else:
            ms = self.tock(name, evict=True, print_ms=False)

    def tock(self, name="default", evict=True, print_ms=True):
        if name in self.ends:
            self.ends[name].record()
            torch.cuda.synchronize()
            ms = self.starts[name].elapsed_time(self.ends[name])
            if name not in self.agg:
                self.agg[name] = 0.0
            self.agg[name] += ms
            if evict:
                self.starts.pop(name)
                self.ends.pop(name)

        if print_ms and name in self.agg:
            print(f"{name} took: {self.agg[name] / 1000.0:.5f}s")

        return self.agg[name]

    def reset(self):
        self.starts = {}
        self.ends = {}
        self.agg = {}
        print("Resetting benchmark data")


class Test8BitBlockwiseQuantizeFunctional:
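    """Tests for 8-bit blockwise quantization (F.quantize_blockwise / F.dequantize_blockwise).

    The round-trip under test is roughly (illustrative only):

        C, S = F.quantize_blockwise(A, blocksize=blocksize, nested=nested)  # uint8 codes + quant state
        A2 = F.dequantize_blockwise(C, S)                                   # approximate reconstruction of A
    """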
    @pytest.mark.parametrize("device", get_available_devices())
    @pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16], ids=describe_dtype)
    @pytest.mark.parametrize("nested", TRUE_FALSE, ids=id_formatter("nested"))
    @pytest.mark.parametrize("blocksize", [4096, 2048, 1024, 512, 256, 128, 64])
    @pytest.mark.parametrize("signed", TRUE_FALSE, ids=id_formatter("signed"))
    def test_dynamic_blockwise_quantization(self, device, dtype, nested, blocksize, signed):
        iters = 100

        if device == "cpu":
            iters = 10

            # This test is slow on CPU, so avoid atypical use cases.
            if nested:
                pytest.skip("Not a typical use case.")
            if blocksize != 256:
                pytest.skip("Only blocksize 256 is used in CPU/XPU")
            if dtype != torch.float32:
                pytest.skip("Only float32 is used in CPU/XPU")

        diffs = []
        reldiffs = []
        for i in range(iters):
            A1 = torch.randn(1024, 1024, device=device, dtype=dtype)
            C, S = F.quantize_blockwise(A1, blocksize=blocksize, nested=nested)
            A2 = F.dequantize_blockwise(C, S)
            diff = torch.abs(A1 - A2).float()
            reldiff = diff / torch.abs(A1.float() + 1e-8)
            diffs.append(diff.mean().item())
            reldiffs.append(reldiff.mean().item())
        abserr = sum(diffs) / len(diffs)
        relerr = sum(reldiffs) / len(reldiffs)
        assert abserr < 0.011
        assert relerr < 0.018
        assert A2.dtype == dtype

        diffs = []
        code = F.create_dynamic_map(signed=signed)
        for i in range(iters):
            A1 = torch.rand(1024, 1024, device=device, dtype=dtype)
            C, S = F.quantize_blockwise(A1, blocksize=blocksize, nested=nested, code=code)
            A2 = F.dequantize_blockwise(C, S)
            diff = torch.abs(A1 - A2).float()
            reldiff = diff / torch.abs(A1.float() + 1e-8)
            diffs.append(diff.mean().item())
            reldiffs.append(reldiff.mean().item())
            # torch.testing.assert_close(A1, A2, atol=1e-2, rtol=0)
        abserr = sum(diffs) / len(diffs)
        relerr = sum(reldiffs) / len(reldiffs)
        if signed:
            threshold_abserr = 0.0036 if device in ("cpu", "xpu") and (F.ipex_cpu or F.ipex_xpu) else 0.0035
            assert abserr < threshold_abserr
            assert relerr < 0.015
        else:
            threshold_abserr = 0.00175 if device in ("cpu", "xpu") and (F.ipex_cpu or F.ipex_xpu) else 0.0023
            assert abserr < threshold_abserr
            assert relerr < 0.012
        assert A2.dtype == dtype

    @pytest.mark.skipif("cpu" not in get_available_devices(), reason="CPU is required")
    @pytest.mark.parametrize("hidden", [128])
    @pytest.mark.parametrize("blocksize", [4096, 16384])
    def test_blockwise_cpu_large(self, hidden, blocksize):
        diffs = []
        reldiffs = []
        batch = 128
        seq = 128

        for i in range(2):
            A1 = torch.randn(batch, seq, hidden, device="cpu")
            t0 = time.time()
            C, S = F.quantize_blockwise(A1, blocksize=blocksize)
            A2 = F.dequantize_blockwise(C, S, blocksize=blocksize)
            print(time.time() - t0)
            diff = torch.abs(A1 - A2)
            reldiff = diff / torch.abs(A1 + 1e-8)
            diffs.append(diff.mean().item())
            reldiffs.append(reldiff.mean().item())
            assert diffs[-1] < 0.011
        # print(sum(diffs)/len(diffs))
        # print(sum(reldiffs)/len(reldiffs))

    @pytest.mark.parametrize("device", get_available_devices())
    @pytest.mark.parametrize("bits", range(2, 9), ids=id_formatter("bits"))
    @pytest.mark.parametrize("method", ["linear", "fp8", "dynamic"])
    def test_few_bit_quant(self, device, bits, method):
        if bits != 8 and (device == "cpu" or (device == "xpu" and F.ipex_xpu)):
            pytest.skip("CPU/XPU implementation only supports 8 bits")

        abserrs = []
        relerrs = []
        code = None
        if method == "linear":
            code = F.create_linear_map(True, total_bits=bits).to(device)
        elif method == "fp8":
            ebits = math.ceil(bits / 2)
            pbits = bits - ebits - 1
            code = F.create_fp8_map(True, ebits, pbits, bits).to(device)
        elif method == "dynamic":
            code = F.create_dynamic_map(True, bits - 0, bits).to(device)

        # for some data types we have no zero
        # for some data types we have one zero
        # for some data types we have two zeros
        assert torch.unique(code).numel() in [2**bits, 2**bits - 1], f"bits: {bits}, method: {method}"
        # print(method, (code==0).sum())
        assert code.numel() == 256
        for i in range(10):
            values = torch.randn(1, 32, device=device)
            values /= values.abs().max()
            # values[values.abs() < 1e-6] += 1e-5

            q1 = []
            v1 = []
            for v in values[0]:
                idx = torch.abs(v - code).argmin()
                q1.append(idx.item())
                v1.append(code[idx].item())

            q1 = torch.tensor(q1, device=device)
            v1 = torch.tensor(v1, device=device)

            q2, S2 = F.quantize_blockwise(values, code=code)
            v2 = F.dequantize_blockwise(q2, S2)

            idx = torch.isclose(q1.int(), q2.int())
            err2 = torch.abs(v2 - values)
            abserrs.append(err2.mean().item())
            relerrs.append((err2 / (1e-10 + values).abs()).mean().item())
            if idx.sum():
                # some weird cases
                err1 = torch.abs(v1 - values).mean()
                # assert err2.mean() <= err1
            else:
                torch.testing.assert_close(q1, q2)

    @pytest.mark.parametrize("device", get_available_devices())
    def test_fp8_quant(self, device):
        # TODO
        if device == "cpu":
            pytest.skip("CPU implementation segfaults")

        for e_bits in range(1, 7):
            p_bits = 7 - e_bits
            code = F.create_fp8_map(True, e_bits, p_bits).to(device)

            abserr = []
            relerr = []
            for i in range(100):
                A1 = torch.randn(1024, 1024, device=device)
                C, SC = F.quantize_blockwise(A1, code=code)
                A2 = F.dequantize_blockwise(C, SC)
                diff = torch.abs(A1 - A2)
                reldiff = diff / torch.abs(A1 + 1e-8)
                abserr.append(diff.mean().item())
                relerr.append(reldiff.mean().item())
                # assert diff < 0.0075
            # print(sum(abserr)/len(abserr))
            # print(sum(relerr)/len(relerr))

            abserr = []
            relerr = []
            for i in range(100):
                A1 = torch.rand(1024, 1024, device=device)
                C, SC = F.quantize_blockwise(A1, code=code)
                A2 = F.dequantize_blockwise(C, SC)
                diff = torch.abs(A1 - A2)
                reldiff = diff / torch.abs(A1 + 1e-8)
                abserr.append(diff.mean().item())
                relerr.append(reldiff.mean().item())
                # assert diff < 0.0075
            # print(sum(abserr)/len(abserr))
            # print(sum(relerr)/len(relerr))

            abserr = []
            relerr = []
            for i in range(100):
                A1 = torch.randn(1024, 1024, device=device)
                C, SC = F.quantize_blockwise(A1)
                A2 = F.dequantize_blockwise(C, SC)
                diff = torch.abs(A1 - A2)
                reldiff = diff / torch.abs(A1 + 1e-8)
                abserr.append(diff.mean().item())
                relerr.append(reldiff.mean().item())
                # assert diff < 0.0075
            # print(3, sum(abserr)/len(abserr))
            # print(3, sum(relerr)/len(relerr))

    @pytest.mark.benchmark
    def test_bench_dequantization(self):
        a = torch.rand(1024, 1024, device="cuda").half()
        code = F.create_fp8_map(True, 3, 0, 4).cuda()
        qa, SA = F.quantize_blockwise(a, code=code)
        print(qa.max())

        max_theoretical_mu = 1024 * 1024 * 2 / 1024**3 / 672 * 1000 * 1000
        # print(max_theoretical_mu)

        torch.cuda.synchronize()
        t0 = time.time()
        for i in range(100):
            qa, SA = F.quantize_blockwise(a)
        torch.cuda.synchronize()
        # print((time.time()-t0)/1e6)


def test_stable_embedding():
    layer = bnb.nn.StableEmbedding(1024, 1024)
    layer.reset_parameters()
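

# Reference (plain PyTorch) absmax int8 quantization helpers used by the igemm tests below:
# quantize with round(x / absmax * 127), dequantize by scaling back with absmax / 127.
# A minimal sketch of how the tests compose them (illustrative only):
#
#   maxA, Ac = quant_multi(A, dim=1)   # per-row int8 quantization of A
#   maxB, Bc = quant_multi(B, dim=0)   # per-column int8 quantization of B
#   C = F.igemm(Ac, Bc)                # int8 matmul with int32 accumulation
#   out = mm_dequant(maxA, maxB, C)    # rescale the int32 result back to float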
def quant(x):
    max1 = torch.abs(x).max()
    x = torch.round(x / max1 * 127)
    return max1, x.to(torch.int8)


def dequant(c, maxC):
    return c.float() * (maxC / 127)


def mm_dequant(maxA, maxB, C):
    return C.float() * (maxA / 127) * (maxB / 127)


def quant_multi(x, dim):
    max1 = torch.amax(torch.abs(x), dim=dim, keepdim=True)
    max1[max1 == 0] = 1.0
    x = torch.round(x / max1 * 127)
    return max1, x.to(torch.int8)


def quant_multi_chunk(x, dim, chunk_size=32):
    if dim == 1:
        x_chunked = einops.rearrange(x, "(c a) b -> c a b", c=chunk_size)
        max1 = torch.amax(torch.abs(x_chunked), dim=dim + 1, keepdim=True)
        max1 = torch.tile(max1, (1, 1, x.shape[1]))
        max1 = max1.view(x.shape)
    elif dim == 0:
        x_chunked = einops.rearrange(x, "a (b c) -> a b c", c=chunk_size)
        max1 = torch.amax(torch.abs(x_chunked), dim=dim, keepdim=True)
        max1 = torch.tile(max1, (x.shape[0], 1, 1))
        max1 = max1.view(x.shape)
    max1[max1 == 0] = 1.0
    x = torch.round(x / max1 * 127)
    return max1, x.to(torch.int8)


def mean(xx):
    return sum(xx) / float(len(xx))


methods = {
    "linear": (
        lambda x, dim: quant(x),
        lambda x, dim: quant(x),
        dequant,
        dequant,
        mm_dequant,
    ),
    "vectorwise": (quant_multi, quant_multi, dequant, dequant, mm_dequant),
}


@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA is required")
class TestIGEMMFunctional:
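    """Tests for F.igemm (int8 GEMM with int32 accumulation) against float matmul references."""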
    @pytest.mark.parametrize("dim1", [1024 * 2], ids=id_formatter("dim1"))
    @pytest.mark.parametrize("dim2", [1024 * 16], ids=id_formatter("dim2"))
    @pytest.mark.parametrize("quant_methods", methods.values(), ids=methods.keys())
    @pytest.mark.parametrize("batched", TRUE_FALSE, ids=id_formatter("batched"))
    def test_approx_igemm(self, dim1, dim2, quant_methods, batched):
        dim1 = dim1 - (dim1 % 32)
        dim2 = dim2 - (dim2 % 32)
        errors = []
        relerrors = []
        # print("")
        for i in range(5):
            if batched:
                A = torch.normal(0, 0.5, size=(32, dim1, dim2 // 32), device="cuda")
                B = torch.normal(0, 0.5, size=(32, dim2 // 32, dim1), device="cuda")
                maxA, Ac = quant_methods[0](A, 2)
                maxB, Bc = quant_methods[1](B, 1)
            else:
                A = torch.normal(0, 0.5, size=(dim1, dim2), device="cuda")
                B = torch.normal(0, 0.5, size=(dim2, dim1), device="cuda")
                maxA, Ac = quant_methods[0](A, 1)
                maxB, Bc = quant_methods[1](B, 0)
            torch.testing.assert_close(quant_methods[2](maxA, Ac), A, atol=0.025, rtol=0.05)
            if batched:
                out2 = torch.bmm(A, B)
                C = torch.bmm(Ac.float(), Bc.float())
            else:
                out2 = torch.mm(A, B)
                C = F.igemm(Ac, Bc)
            out = quant_methods[4](maxA, maxB, C)
            std = out2.std()
            out /= std
            out2 /= std
            err = torch.abs(out - out2)
            relerr = err / torch.abs(out2)
            errors.append(err.mean().item())
            relerrors.append(relerr.mean().item())
        # print(mean(errors))
        # print(mean(relerrors))

    @pytest.mark.parametrize("hidden_dim", [32, 256], ids=id_formatter("hidden_dim"))
    @pytest.mark.parametrize("batch_dim", [16, 256], ids=id_formatter("batch_dim"))
    @pytest.mark.parametrize("seq_dim", [16, 256], ids=id_formatter("seq_dim"))
    @pytest.mark.parametrize("transpose", BOOLEAN_TUPLES, ids=id_formatter("transpose"))
    def test_igemm(self, hidden_dim, batch_dim, transpose, seq_dim):
        hidden_dim = hidden_dim - (hidden_dim % 32)
        batch_dim = batch_dim - (batch_dim % 16)
        seq_dim = seq_dim - (seq_dim % 16)
        for i in range(k):
            shapeA = (batch_dim, hidden_dim) if not transpose[0] else (hidden_dim, batch_dim)
            shapeB = (
                (32 * random.randint(1, 4), hidden_dim) if transpose[1] else (hidden_dim, 32 * random.randint(1, 4))
            )
            A = torch.randint(-128, 127, size=shapeA, device="cuda").to(torch.int8)
            B = torch.randint(-128, 127, size=shapeB, device="cuda").to(torch.int8)
            if not transpose[0] and not transpose[1]:
                out2 = torch.matmul(A.float(), B.float())
                out = F.igemm(A, B)
            elif not transpose[0] and transpose[1]:
                out2 = torch.matmul(A.float(), B.t().float())
                out = F.igemm(A, B.t())
            elif transpose[0] and not transpose[1]:
                out2 = torch.matmul(A.t().float(), B.float())
                out = F.igemm(A.t(), B)
            elif transpose[0] and transpose[1]:
                out2 = torch.matmul(A.t().float(), B.t().float())
                out = F.igemm(A.t(), B.t())

            torch.testing.assert_close(out.float(), out2)

        for i in range(k):
            shapeA = (batch_dim, seq_dim, hidden_dim)
            shapeB = (
                (32 * random.randint(1, 4), hidden_dim) if transpose[1] else (hidden_dim, 32 * random.randint(1, 4))
            )
            A = torch.randint(-128, 127, size=shapeA, device="cuda").to(torch.int8)
            B = torch.randint(-128, 127, size=shapeB, device="cuda").to(torch.int8)
            if not transpose[0] and not transpose[1]:
                out2 = torch.matmul(A.float(), B.float())
                out = F.igemm(A, B)
            elif not transpose[0] and transpose[1]:
                out2 = torch.matmul(A.float(), B.t().float())
                out = F.igemm(A, B.t())

            torch.testing.assert_close(out.float(), out2)

    @pytest.mark.parametrize("seq_dim", [32, 256, 512], ids=id_formatter("seq_dim"))
    @pytest.mark.parametrize("hidden_dim", [64, 1024, 4096], ids=id_formatter("hidden_dim"))
    @pytest.mark.parametrize("batch_dim", [2, 8, 16], ids=id_formatter("batch_dim"))
    def test_dim3_igemm(self, seq_dim, hidden_dim, batch_dim):
        seq_dim = seq_dim - (seq_dim % 32)
        hidden_dim = hidden_dim - (hidden_dim % 32)
        batch_dim = batch_dim - (batch_dim % 2)
        for i in range(25):
            A = torch.randint(-128, 127, size=(batch_dim, seq_dim, hidden_dim), device="cuda").to(torch.int8)
            B = torch.randint(-128, 127, size=(batch_dim, seq_dim, 1024), device="cuda").to(torch.int8)
            out2 = torch.einsum("bsi, bso->io", A.float(), B.float())
            iout = torch.empty(A.shape[2], B.shape[2], dtype=torch.int32, device=A.device)
            out = F.igemm(A, B, out=iout)

            torch.testing.assert_close(out.float(), out2)

    @pytest.mark.parametrize("seq_dim", [32, 512], ids=id_formatter("seq_dim"))
    @pytest.mark.parametrize("hidden_dim", [32, 1024 * 4], ids=id_formatter("hidden_dim"))
    @pytest.mark.parametrize("batch_dim", [2, 16], ids=id_formatter("batch_dim"))
    @pytest.mark.parametrize("transpose", TRUE_FALSE, ids=id_formatter("transpose"))
    def test_minmax_igemm(self, seq_dim, hidden_dim, batch_dim, transpose):
        def min_max(x):
            maxA = torch.amax(x, dim=2, keepdim=True)
            minA = torch.amin(x, dim=2, keepdim=True)
            scale = (maxA - minA) / 2.0
            return (127 * (x - minA - scale) / scale).to(torch.int8), minA, scale

        seq_dim = seq_dim - (seq_dim % 16)
        hidden_dim = hidden_dim - (hidden_dim % 16)
        batch_dim = batch_dim - (batch_dim % 2)
        errs = []
        relerrs = []
        errs2 = []
        relerrs2 = []
        for i in range(k):
            A = torch.normal(0.0, 0.5, size=(batch_dim, seq_dim, hidden_dim), device="cuda")
            if transpose:
                B = torch.normal(0, 0.5, size=(256, hidden_dim), device="cuda")
            else:
                B = torch.normal(0, 0.5, size=(hidden_dim, 256), device="cuda")
            Ac, minA, scale = min_max(A)
            if transpose:
                maxB, Bc = quant_multi(B, dim=(1 if transpose else 0))
                out = F.igemm(Ac, Bc.t())
                out2 = torch.matmul(A, B.t())
                offset = B.t().sum(0) * (minA + scale)
                out = out.float()
                out = (out * maxB.t() * scale / (127 * 127)) + offset

                maxA, Ac = quant_multi(A, dim=2)
                out3 = F.igemm(Ac, Bc.t())
                out3 = mm_dequant(maxA, maxB.t(), out3)
            else:
                maxB, Bc = quant_multi(B, dim=0)
                offset = B.sum(0) * (minA + scale)
                out = F.igemm(Ac, Bc)
                out2 = torch.matmul(A, B)
                out = out.float()
                out = (out * maxB * scale / (127 * 127)) + offset

                maxA, Ac = quant_multi(A, dim=2)
                out3 = F.igemm(Ac, Bc)
                out3 = mm_dequant(maxA, maxB, out3)

            std = out2.std()
            out2 /= std
            out /= std
            out3 /= std

            err = torch.abs(out - out2)
            relerr = err / (torch.abs(out2) + 1e-7)

            err2 = torch.abs(out3 - out2)
            relerr2 = err2 / (torch.abs(out2) + 1e-7)

            errs.append(err.mean().item())
            relerrs.append(relerr.mean().item())
            errs2.append(err2.mean().item())
            relerrs2.append(relerr2.mean().item())
        # print(mean(errs))
        # print(mean(relerrs))
        # print(mean(errs2))
        # print(mean(relerrs2))
        assert mean(errs) < 0.015

        # There's a higher relerr on L40S with torch 2.4+cu118.
        is_sm89 = torch.cuda.get_device_capability() == (8, 9)
        if torch.version.cuda == "11.8" and is_sm89 and torch.__version__ < (2, 5):
            assert mean(relerrs) < 0.41
        else:
            assert mean(relerrs) < 0.3

    @pytest.mark.parametrize("dim1", [1, 64], ids=id_formatter("dim1"))
    @pytest.mark.parametrize("dim2", [32, 128], ids=id_formatter("dim2"))
    @pytest.mark.parametrize("dim3", [32, 256], ids=id_formatter("dim3"))
    @pytest.mark.parametrize("dim4", [32, 256], ids=id_formatter("dim4"))
    @pytest.mark.parametrize("transpose", BOOLEAN_TUPLES, ids=id_formatter("transpose"))
    def test_ibmm(self, dim1, dim2, dim3, dim4, transpose):
        dim2 = dim2 - (dim2 % 16)
        dim3 = dim3 - (dim3 % 16)
        dim4 = dim4 - (dim4 % 16)
        for i in range(k):
            shapeA = (dim1, dim3, dim2) if transpose[0] else (dim1, dim2, dim3)
            shapeB = (dim1, dim4, dim3) if transpose[1] else (dim1, dim3, dim4)
            A = torch.randint(-128, 127, size=shapeA, device="cuda").to(torch.int8)
            B = torch.randint(-128, 127, size=shapeB, device="cuda").to(torch.int8)

            if not transpose[0] and not transpose[1]:
                out2 = torch.bmm(A.float(), B.float())
                out = F.igemm(A, B)
            elif not transpose[0] and transpose[1]:
                out2 = torch.bmm(A.float(), B.permute([0, 2, 1]).float())
                out = F.igemm(A, B.permute([0, 2, 1]))
            elif transpose[0] and not transpose[1]:
                out2 = torch.bmm(A.permute([0, 2, 1]).float(), B.float())
                out = F.igemm(A.permute([0, 2, 1]), B)
            elif transpose[0] and transpose[1]:
                out2 = torch.bmm(A.permute([0, 2, 1]).float(), B.permute([0, 2, 1]).float())
                out = F.igemm(A.permute([0, 2, 1]), B.permute([0, 2, 1]))
            torch.testing.assert_close(out.float(), out2.float())


class TestLLMInt8Functional:
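    """Tests for the LLM.int8() primitives: int8 vectorwise/double quantization,
    int8 matmul (F.int8_linear_matmul), and dequantization (F.int8_mm_dequant)."""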
    @staticmethod
    def vectorwise_mm_dequant(xq, S1, S2, dtype=torch.half):
        """Reference implementation for the F.int8_mm_dequant function."""
        C = 127.0

        x = xq.float()
        if len(S1.shape) == 3 and len(x.shape) == 2:
            S1 = S1.squeeze(0)
        if len(S2.shape) == 3 and len(x.shape) == 2:
            S2 = S2.squeeze(0)
        if len(S1.shape) == 2:
            x *= S1 / C
        else:
            x *= S1 / C
        x *= S2 / C
        return x.to(dtype)

    @staticmethod
    def vectorwise_quant(x, dim=1):
        """Reference implementation"""
        max1 = torch.amax(torch.abs(x), dim=dim, keepdim=True)
        xq = torch.round(x * (127.0 / max1)).to(torch.int8)
        return xq, max1

    @pytest.mark.parametrize("device", get_available_devices())
    @pytest.mark.parametrize("dim1", [128], ids=id_formatter("dim1"))
    @pytest.mark.parametrize("dim2", [256], ids=id_formatter("dim2"))
    @pytest.mark.parametrize("dim3", [499, 512], ids=id_formatter("dim3"))
    @pytest.mark.parametrize("dim4", [512], ids=id_formatter("dim4"))
    @pytest.mark.parametrize("dims", (2, 3), ids=id_formatter("dims"))
    @pytest.mark.parametrize("ldb", (0,), ids=id_formatter("ldb"))
    def test_int8_linear_matmul(self, device, dim1, dim2, dim3, dim4, dims, ldb):
        for i in range(k):
            if dims == 2:
                A = torch.randint(-128, 127, size=(dim1, dim3), dtype=torch.int8, device=device)
            elif dims == 3:
                A = torch.randint(-128, 127, size=(dim1, dim2, dim3), dtype=torch.int8, device=device)
            B = torch.randint(-128, 127, size=(dim4, dim3), dtype=torch.int8, device=device)
            C1 = torch.matmul(A.float(), B.t().float())

            C2 = F.int8_linear_matmul(A, B)
            torch.testing.assert_close(C1, C2.float())

    @pytest.mark.parametrize("device", get_available_devices())
    @pytest.mark.parametrize("dim1", [32], ids=id_formatter("dim1"))
    @pytest.mark.parametrize("dim2", [32], ids=id_formatter("dim2"))
    @pytest.mark.parametrize("dim3", [32], ids=id_formatter("dim3"))
    @pytest.mark.parametrize("dim4", [32], ids=id_formatter("dim4"))
    @pytest.mark.parametrize("dims", (2,), ids=id_formatter("dims"))
    def test_int8_linear_matmul_half(self, device, dim1, dim2, dim3, dim4, dims):
        for i in range(k):
            if dims == 2:
                A = torch.normal(0, 0.5, size=(dim1, dim3), device=device).half()
            elif dims == 3:
                A = torch.normal(0, 0.5, size=(dim1, dim2, dim3), device=device).half()
            B = torch.randn((dim4, dim3), device=device).half()
            torch.nn.init.xavier_uniform_(B)
            C1 = torch.matmul(A, B.t())

            A = A.view(-1, A.shape[-1])

            CA, statsA, _ = F.int8_vectorwise_quant(A)
            CB, statsB, _ = F.int8_vectorwise_quant(B)
            output = F.int8_mm_dequant(F.int8_linear_matmul(CA, CB), statsA, statsB)

            torch.testing.assert_close(C1.view(-1, C1.shape[-1]), output, atol=0.025, rtol=0.05)

    @pytest.mark.parametrize("device", get_available_devices())
    @pytest.mark.parametrize("dim1", (64, 256), ids=id_formatter("dim1"))
    @pytest.mark.parametrize("dim4", (64, 1024), ids=id_formatter("dim4"))
    @pytest.mark.parametrize("dims", (2,), ids=id_formatter("dims"))
    @pytest.mark.parametrize("has_bias", TRUE_FALSE, ids=id_formatter("has_bias"))
    def test_dequant_mm(self, device, dim1, dim4, dims, has_bias):
        inner = 128
        bias = None
        if has_bias:
            bias = torch.randn(dim4, device=device, dtype=torch.float16)

        for i in range(1):
            A = torch.randn(dim1, inner, device=device)
            B = torch.randn(dim4, inner, device=device)
            C1 = torch.matmul(A.half(), B.t().half())
            if has_bias:
                C1 += bias

            A1, maxA = self.vectorwise_quant(A, dim=1)
            B1, maxB = self.vectorwise_quant(B, dim=1)

            C2 = F.int8_linear_matmul(A1, B1)

            C4 = self.vectorwise_mm_dequant(C2.float(), maxA, maxB.t())
            if has_bias:
                C4 += bias

            # TODO: is something wrong here? If so, the problem goes deeper
            # n = C1.numel()
            # p = 0.06
            std = C1.std(0).view(1, -1)
            C1 /= std
            C4 /= std
            # assert_all_approx_close(C1, C4, atol=0.02, rtol=0.1, count=int(n*0.06))
            # assert (count / n < p), f"error in more than {p} of elements: {count}/{n}={count/n}"

            C5 = F.int8_mm_dequant(C2, maxA, maxB, bias=bias)
            C5 /= std
            torch.testing.assert_close(C5, C4, atol=0.015, rtol=0.1)
            n = C5.numel()
            assert_all_approx_close(C1, C4, atol=0.015, rtol=0.1, count=int(0.01 * n))

    @pytest.mark.parametrize("dim1", [1 * 1024], ids=id_formatter("dim1"))
    @pytest.mark.parametrize("dim2", [1 * 1024], ids=id_formatter("dim2"))
    @pytest.mark.parametrize("dims", (2,), ids=id_formatter("dims"))
    @pytest.mark.parametrize("threshold", [0.0, 3.0], ids=id_formatter("decomp"))
    @pytest.mark.deprecated
    def test_colrow_absmax(self, dim1, dim2, dims, threshold):
        for i in range(k):
            A = torch.randn(dim1, dim2, device="cuda").half()

            assert dims == 2

            row_stats1, _ = torch.abs(A.float()).max(1)
            col_stats1, _ = torch.abs(A.float()).max(0)

            if threshold > 0.0:
                A_truncated = A.clone()
                A_truncated[torch.abs(A_truncated) >= threshold] = 0.0
                row_stats1_trunc, _ = torch.abs(A_truncated.float()).max(1)
                col_stats1_trunc, _ = torch.abs(A_truncated.float()).max(0)

                row_stats2, col_stats2, nnz_block_ptr2 = F.get_colrow_absmax(A, threshold=threshold)

                nnz_rows1_counts = (torch.abs(A) >= threshold).sum(1).flatten()
                nnz_block_ptr1 = torch.zeros(
                    nnz_rows1_counts.shape[0] + 1,
                    dtype=nnz_rows1_counts.dtype,
                    device=nnz_rows1_counts.device,
                )
                nnz_block_ptr1[1:] = nnz_rows1_counts.cumsum(0)

                torch.testing.assert_close(col_stats1_trunc, col_stats2)
                torch.testing.assert_close(row_stats1_trunc, row_stats2)
                # torch.testing.assert_close(nnz_block_ptr1, nnz_block_ptr2)
            else:
                row_stats2, col_stats2, nnz_block_ptr2 = F.get_colrow_absmax(A, threshold=0.0)
                assert nnz_block_ptr2 is None
                torch.testing.assert_close(col_stats1, col_stats2)
                torch.testing.assert_close(row_stats1, row_stats2)

    @pytest.mark.parametrize("dim1", [2048, 4096], ids=id_formatter("dim1"))
    @pytest.mark.parametrize("dim2", [512, 1024], ids=id_formatter("dim2"))
    @pytest.mark.deprecated
    def test_int8_double_quant(self, dim1, dim2):
        for i in range(k):
            A = torch.randn(dim1, dim2, device="cuda").half()
            out_col1, Scol = self.vectorwise_quant(A, dim=0)
            out_row1, Srow = self.vectorwise_quant(A, dim=1)

            CA, CAt, statsA, statsAt, _ = F.int8_double_quant(A)

            # max difference is 1 due to rounding differences
            torch.testing.assert_close(CA, out_row1, atol=1, rtol=0)
            torch.testing.assert_close(CAt, out_col1, atol=1, rtol=0)

            n = CAt.numel()
            num_not_close_rows = (torch.isclose(CA, out_row1, atol=1) == 0).sum().item()
            num_not_close_cols = (torch.isclose(CAt, out_col1, atol=1) == 0).sum().item()

            # allow for 1:500 error due to rounding differences
            min_error = 1 / 500
            if num_not_close_cols > (min_error * n):
                print(
                    f"Min error exceeded {num_not_close_cols} elements are different. Error: {num_not_close_cols / n:.4f}"
                )
                assert False
            if num_not_close_rows > (min_error * n):
                print(
                    f"Min error exceeded {num_not_close_rows} elements are different. Error: {num_not_close_rows / n:.4f}"
                )
                assert False

            torch.testing.assert_close(Srow.flatten().float(), statsA)
            torch.testing.assert_close(Scol.flatten().float(), statsAt)

    @pytest.mark.parametrize("device", get_available_devices())
    @pytest.mark.parametrize(
        ("dim1", "dim4", "inner"),
        (
            pytest.param(dim1, dim4, inner, id=f"{dim1=},{dim4=},{inner=}")
            for (dim1, dim4, inner) in zip(
                (1, 8, 2048, 4096),
                (2, 128, 2048, 4096),
                (4, 256, 512, 4096),
            )
        ),
    )
    def test_integrated_int8_linear_matmul(self, device, dim1, dim4, inner):
        if device == "cpu" and inner > 2048:
            pytest.skip("Slow on CPU")

        for i in range(k):
            A = torch.randn(dim1, inner, device=device).half()
            B = torch.randn(dim4, inner, device=device).half()

            out1 = torch.matmul(A.half(), B.t().half())

            C1a, stats1a, _ = F.int8_vectorwise_quant(A)
            C2a, stats2a, _ = F.int8_vectorwise_quant(B)
            A1, maxA = self.vectorwise_quant(A, dim=1)
            B1, maxB = self.vectorwise_quant(B, dim=1)

            torch.testing.assert_close(maxA.flatten().float(), stats1a)
            torch.testing.assert_close(maxB.flatten().float(), stats2a)
            torch.testing.assert_close(C1a, A1, rtol=0, atol=1)
            torch.testing.assert_close(C2a, B1, rtol=0, atol=1)

            out2 = F.int8_linear_matmul(A1, B1)

            C2 = F.int8_linear_matmul(A1, B1)

            out3 = self.vectorwise_mm_dequant(C2.float(), maxA, maxB.t())

            err1 = torch.abs(out1 - out2).mean().item()
            err2 = torch.abs(out1 - out3).mean().item()
            assert err2 <= err1 * 1.025

    @pytest.mark.parametrize("device", get_available_devices())
    @pytest.mark.parametrize("dim1", [512, 2048], ids=id_formatter("dim1"))
    @pytest.mark.parametrize("dim2", [1024, 4096], ids=id_formatter("dim2"))
    def test_coo_double_quant(self, device, dim1, dim2):
        threshold = 2.00
        for i in range(k):
            A = torch.randn(dim1, dim2, device=device).half()

            idx = torch.abs(A) >= threshold
            CA, statsA, outlier_cols = F.int8_vectorwise_quant(A, threshold=threshold)

            if outlier_cols is not None:
                A1 = A * idx
                A2 = torch.zeros_like(A) + A1
                torch.testing.assert_close(A1, A2)

                A[:, outlier_cols] = 0
                A2 = (CA.float() * statsA.unsqueeze(1) / 127).half()
                torch.testing.assert_close(A, A2, rtol=0.05, atol=1.5e-2)

    @pytest.mark.parametrize("device", get_available_devices())
    @pytest.mark.parametrize("dim1", [512, 2048], ids=id_formatter("dim1"))
    @pytest.mark.parametrize("dim2", [1024, 4096], ids=id_formatter("dim2"))
    def test_coo_int8_vectorwise_quant(self, device, dim1, dim2):
        threshold = 3.00
        for i in range(k):
            A = torch.randn(dim1, dim2, device=device).half()

            idx = torch.abs(A) >= threshold
            CA, statsA, outlier_cols = F.int8_vectorwise_quant(A, threshold=threshold)

            if outlier_cols is not None:
                A2 = (CA.float() * statsA.unsqueeze(1) / 127).half()
                A[:, outlier_cols] = 0
                torch.testing.assert_close(A * (idx == 0), A2, rtol=0.05, atol=1.5e-2)


@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA is required")
class TestSpMMFunctional:
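    """Tests and benchmarks for the sparse COO matmul kernels (F.spmm_coo, F.spmm_coo_very_sparse)."""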
    @pytest.mark.parametrize("dim1", [256, 1024], ids=id_formatter("dim1"))
    @pytest.mark.parametrize("dim2", [128, 512], ids=id_formatter("dim2"))
    @pytest.mark.parametrize("transposed_B", TRUE_FALSE, ids=id_formatter("transposed_B"))
    def test_spmm_coo(self, dim1, dim2, transposed_B):
        threshold = 1.5
        dim3 = torch.randint(32, 128, size=(1,)).item()
        # dim3 = 17
        for i in range(k):
            A = torch.randn(dim1, dim2).cuda().half()
            if transposed_B:
                B = torch.randn(dim3, dim2).cuda().half()
            else:
                B = torch.randn(dim2, dim3).cuda().half()

            idx = torch.abs(A) >= threshold
            nnz = (idx == 1).sum().item()
            rows, cols = torch.where(idx)
            values = A[idx]
            cooA = F.COOSparseTensor(A.shape[0], A.shape[1], nnz, rows.int(), cols.int(), values)
            A2 = A * idx

            if transposed_B:
                out2 = F.spmm_coo(cooA, B.t())
                out1 = torch.matmul(A2, B.t())
            else:
                out2 = F.spmm_coo(cooA, B)
                out1 = torch.matmul(A2, B)

            assert_all_approx_close(out1, out2, rtol=0.01, atol=3.0e-2, count=30)

    @pytest.mark.benchmark
    def test_spmm_bench(self):
        batch = 2
        model = 1024 * 1
        hidden = model * 4
        seq = 1024
        dim1 = batch * seq
        dim2 = model
        dim3 = hidden
        threshold = 4
        A = torch.randn(dim1, dim2, device="cuda").half()
        B = torch.randn(dim2, dim3, device="cuda").half()
        for i in range(10):
            C1 = bnb.matmul(A, B.t())

        torch.cuda.synchronize()
        t0 = time.time()
        for i in range(k):
            C1 = bnb.matmul(A, B.t())
        torch.cuda.synchronize()
        t8 = time.time() - t0

        idx = torch.abs(A) >= threshold
        nnz = (idx == 1).sum().item()
        print(nnz / idx.numel())
        rows, cols = torch.where(idx)
        values = A[idx]
        cooA = F.COOSparseTensor(A.shape[0], A.shape[1], nnz, rows.int(), cols.int(), values)

        for i in range(10):
            out2 = F.spmm_coo(cooA, B)

        torch.cuda.synchronize()
        t0 = time.time()
        for i in range(k):
            out2 = F.spmm_coo(cooA, B)
        torch.cuda.synchronize()
        tsp = time.time() - t0
        print(tsp, t8)
        print(tsp / t8)

    @pytest.mark.parametrize("dim1", [1 * 2048], ids=id_formatter("dim1"))
    @pytest.mark.parametrize("dim2", [12288], ids=id_formatter("dim2"))
    @pytest.mark.parametrize("dtype", [torch.float16], ids=describe_dtype)
    @pytest.mark.parametrize("out_func", ["zeros", "ones"], ids=id_formatter("out_func"))
    def test_spmm_coo_very_sparse(self, dim1, dim2, dtype, out_func):
        out_func = getattr(torch, out_func)

        threshold = 3.3
        # threshold = 2.8
        # threshold = 0.0
        A = torch.randn(dim1, dim2, device="cuda").half()
        if dtype == torch.float16:
            B = torch.randn(dim2, dim2 * 4, device="cuda").half()
            torch.nn.init.xavier_uniform_(B)
        else:
            B = torch.randn(dim2, dim2 * 4, device="cuda").half()
            torch.nn.init.xavier_uniform_(B)

            SB = torch.abs(B).max().float()
            B = torch.round(B / SB * 127).to(torch.int8)

        print("")
        idx = torch.abs(A) >= threshold
        nnz = (idx == 1).sum().item()
        rows, cols = torch.where(idx)
        values = A[idx]
        cooA = F.COOSparseTensor(A.shape[0], A.shape[1], nnz, rows.int(), cols.int(), values)
        A2 = A * idx
        out1 = torch.matmul(A2.half(), B.half())
        out = out_func(out1.shape, dtype=torch.float16, device=out1.device)
        out1 += out.clone()
        out2 = F.spmm_coo_very_sparse(cooA, B, out=out)
        # print(B)
        # print(out1)
        # print(out2)
        p = 200 / (2048 * 12288 * 4)
        n = out1.numel()
        count = math.ceil(p * n)
        std = out1.std()
        out1 /= std
        out2 /= std
        assert_all_approx_close(out1, out2.half(), rtol=0.01, atol=3.0e-2, count=count)
        # assert_all_approx_close(out1, out2.half(), rtol=0.05, atol=0.01, count=count)

        idx_col = torch.randint(0, A2.shape[-1], size=(15,))

        # torch.testing.assert_close(out1, out2.half(), rtol=0.05, atol=0.001)

        # Bt = torch.randn(dim2*4, dim2, device='cuda').half()
        # torch.cuda.synchronize()
        # t0 = time.time()
        # print(A2.shape, B.shape)
        # for i in range(100):
        #   #out3 = F.spmm_coo(cooA, Bt.t())
        #   #out2 = F.spmm_coo(cooA, B)
        #   #out2 = F.spmm_coo_very_sparse(cooA, B)
        #   #out1 = torch.matmul(A, Bt.t())

        # torch.cuda.synchronize()
        # print(time.time() - t0)

    @pytest.mark.parametrize("dim1", [1 * 2048])
    @pytest.mark.parametrize("dim2", [2048])
    @pytest.mark.parametrize("dtype", [torch.int8])
    def test_spmm_coo_dequant(self, dim1, dim2, dtype):
        threshold = 6.0
        # threshold = 2.8
        # threshold = 0.0
        A = torch.randn(dim1, dim2, device="cuda").half()
        B = torch.empty(dim2, dim2 * 4, device="cuda", dtype=torch.float16)
        torch.nn.init.xavier_uniform_(B)
        Bt = B.t().contiguous()

        CB, CBt, statsB, statsBt, coo_tensor = F.int8_double_quant(B)

        rowidx = torch.randint(0, A.shape[-1], size=(15,))

        A[:, rowidx] = 8.0

        idx = torch.abs(A) >= threshold
        nnz = (idx == 1).sum().item()
        rows, cols = torch.where(idx)
        values = A[idx]
        cooA = F.COOSparseTensor(A.shape[0], A.shape[1], nnz, rows.int(), cols.int(), values)
        A2 = A * idx
        out2 = F.spmm_coo_very_sparse(cooA, CBt, dequant_stats=statsBt)
        out1 = torch.matmul(A2, B.half())
        out3 = F.spmm_coo_very_sparse(cooA, CBt.half())
        out3 = out3 * statsBt.half() / 127

        values, counts = torch.unique(cooA.rowidx, return_counts=True)
        offset = counts.cumsum(0).int()
        max_count, max_idx = torch.sort(counts, descending=True)
        print(torch.median(max_count.float()))

        torch.testing.assert_close(out2, out3, rtol=0.05, atol=0.001)

        p = 200 / (2048 * 12288 * 4)
        n = out1.numel()
        count = math.ceil(p * n)
        assert_all_approx_close(out1, out2, rtol=0.01, atol=3.0e-2, count=count)

        # torch.cuda.synchronize()
        # t0 = time.time()
        # for i in range(100):
        #   out2 = F.spmm_coo_very_sparse(cooA, B)
        # torch.cuda.synchronize()
        # print('fp16', time.time() - t0)

        torch.cuda.synchronize()
        t0 = time.time()
        for i in range(100):
            out2 = F.spmm_coo(cooA, B)
        torch.cuda.synchronize()
        print("cusparse fp16", time.time() - t0)

        torch.cuda.synchronize()
        t0 = time.time()
        for i in range(100):
            out2 = F.spmm_coo_very_sparse(cooA, CBt)
        torch.cuda.synchronize()
        print("int8", time.time() - t0)

        torch.cuda.synchronize()
        t0 = time.time()
        for i in range(100):
            out2 = F.spmm_coo_very_sparse(cooA, CBt, dequant_stats=statsBt)
        torch.cuda.synchronize()
        print("int8+dequant", time.time() - t0)

        torch.cuda.synchronize()
        t0 = time.time()
        for i in range(100):
            out2 = torch.matmul(A, B)
        torch.cuda.synchronize()
        print("matmul", time.time() - t0)

        torch.cuda.synchronize()
        t0 = time.time()
        for i in range(100):
            out1 = bnb.matmul(A, Bt)
            out2 = F.spmm_coo_very_sparse(cooA, CBt, dequant_stats=statsBt)
            out = out1 + out2
        torch.cuda.synchronize()
        print("sparse+ matmul", time.time() - t0)

        torch.cuda.synchronize()
        t0 = time.time()
        for i in range(100):
            out1 = bnb.matmul(A, Bt)
            torch.matmul(A[:, rowidx], Bt.t()[rowidx], out=out1)
        torch.cuda.synchronize()
        print("partial matmul", time.time() - t0)

        torch.cuda.synchronize()
        t0 = time.time()
        for i in range(100):
            out1 = bnb.matmul(A, Bt)
        torch.cuda.synchronize()
        print("partial matmul", time.time() - t0)


@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA is required")
class TestSparseTensorFunctional:
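    """Tests for COO -> CSR/CSC sparse tensor conversions (F.coo2csr, F.coo2csc)."""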
    def test_coo2csr(self):
        threshold = 1
        A = torch.randn(128, 128).half().cuda()
        idx = torch.abs(A) >= threshold
        nnz = (idx == 1).sum().item()
        rows, cols = torch.where(idx)
        values = A[idx]
        cooA = F.COOSparseTensor(A.shape[0], A.shape[1], nnz, rows.int(), cols.int(), values)
        A2 = A * idx
        csrA = F.coo2csr(cooA)
        counts = csrA.rowptr[1:] - csrA.rowptr[:-1]
        assert counts.numel() == A.shape[0]

        torch.testing.assert_close(counts.long(), (A2 != 0).sum(1))
        idx = A2 != 0
        torch.testing.assert_close(A2[idx], csrA.values)

    def test_coo2csc(self):
        threshold = 1
        A = torch.randn(128, 128).half().cuda()
        idx = torch.abs(A) >= threshold
        nnz = (idx == 1).sum().item()
        rows, cols = torch.where(idx)
        values = A[idx]
        cooA = F.COOSparseTensor(A.shape[0], A.shape[1], nnz, rows.int(), cols.int(), values)
        A2 = A * idx
        cscA = F.coo2csc(cooA)
        counts = cscA.colptr[1:] - cscA.colptr[:-1]
        assert counts.numel() == A.shape[1]

        torch.testing.assert_close(counts.long(), (A2 != 0).sum(0))
        # torch uses row-major -> use transpose to transfer to col-major
        idx = A2.t() != 0
        torch.testing.assert_close(A2.t()[idx], cscA.values)


class TestQuantize4BitFunctional:
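    """Tests for 4-bit (FP4/NF4) blockwise quantization.

    The round-trip under test is roughly (illustrative only):

        qa, SA = F.quantize_4bit(A, blocksize=blocksize, quant_type="nf4")
        A2 = F.dequantize_4bit(qa, SA, blocksize=blocksize, quant_type="nf4")
    """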
    @pytest.mark.parametrize("device", get_available_devices())
    @pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16], ids=describe_dtype)
    @pytest.mark.parametrize("quant_type", ["fp4", "nf4"])
    @pytest.mark.parametrize("blocksize", [64, 128, 256, 512, 1024, 2048, 4096])
    def test_4bit_quant(self, device, dtype, quant_type, blocksize):
        if device == "hpu" and not is_supported_on_hpu(quant_type, dtype):
            pytest.skip("This configuration is not supported on HPU.")

        A1 = torch.randn(1024, 1024, device=device, dtype=dtype)
        qa, SA = F.quantize_4bit(A1, blocksize=blocksize, quant_type=quant_type)
        A2 = F.dequantize_4bit(qa, SA, blocksize=blocksize, quant_type=quant_type)

        err = (A1 - A2).abs().float()
        relerr = (err / (A1.abs().float() + 1e-8)).mean()
        err = err.mean()

        assert A2.dtype == dtype

        # With larger block sizes, we can expect this to blow up.
        # At blocksize>=1024, don't even bother looking at relerr.
        if blocksize <= 64:
            assert err.item() < 0.1
            assert relerr.item() < 0.28
        elif blocksize <= 256:
            assert err.item() < 0.11
            assert relerr.item() < 0.30
        elif blocksize <= 512:
            assert err.item() < 0.12
            assert relerr.item() < 0.31
        elif quant_type == "fp4":
            # 1024 => 0.48, 2048 => 0.52, 4096 => 0.56
            assert err.item() < 0.08 + math.log2(blocksize) * 4e-2
        else:
            # 1024 => 0.8, 2048 => 0.88, 4096 => 0.96
            assert err.item() < math.log2(blocksize) * 8e-2

    @pytest.mark.parametrize("device", get_available_devices())
    @pytest.mark.parametrize("quant_type", ["fp4", "nf4"])
    @pytest.mark.parametrize("blocksize", [64, 128], ids=id_formatter("blocksize"))
    @pytest.mark.parametrize("dtype", [torch.float32, torch.float16], ids=describe_dtype)
    def test_4bit_compressed_stats(self, device, quant_type, blocksize, dtype):
        if device == "hpu" and not is_supported_on_hpu(quant_type, dtype):
            pytest.skip("FP4 quantization is not supported on HPU.")

        errs1 = []
        errs2 = []
        for i in range(10):
            A1 = torch.randn(1024, 1024, device=device, dtype=dtype)
            q2, SA2 = F.quantize_4bit(A1, blocksize=blocksize, quant_type=quant_type)
            q3, SA3 = F.quantize_4bit(A1, blocksize=blocksize, compress_statistics=True, quant_type=quant_type)
            A2 = F.dequantize_4bit(q2, SA2, quant_type=quant_type)
            A3 = F.dequantize_4bit(q3, SA3, quant_type=quant_type)

            err = (A1 - A2).abs().float()
            relerr = (err / (A1.abs().float() + 1e-15)).mean()
            err = err.mean()

            errs1.append(err.item())

            assert err.item() < 0.11
            assert relerr.item() < 0.28

            err = (A1 - A3).abs().float()
            relerr = (err / (A1.abs().float() + 1e-15)).mean()
            err = err.mean()

            errs2.append(err.item())

            assert err.item() < 0.11
            assert relerr.item() < 0.28

    # @pytest.mark.parametrize("quant_type", ['fp4', 'nf4'])
    @pytest.mark.parametrize("quant_type", ["nf4"])
    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA is required")
    @pytest.mark.benchmark
    def test_bench_4bit_dequant(self, quant_type):
        blocksize = 256
        a = torch.rand(1024 * 12 * 4, 1024 * 12, device="cuda").half()
        qa, SA = F.quantize_4bit(a, blocksize=blocksize, quant_type=quant_type)

        input_size = a.numel() / 2
        output_size = a.numel() * 2
        num_bytes = input_size + output_size
        GB = num_bytes / 1e9
        max_theoretical_s = GB / 768
        # print(max_theoretical_s*1e6)
        b = torch.randn(128, 1024 * 12, device="cuda").half()

        iters = 100
        torch.cuda.synchronize()
        t0 = time.time()
        for i in range(iters):
            F.dequantize_4bit(qa, SA, blocksize=blocksize, quant_type=quant_type)
            # b.copy_(a)
        torch.cuda.synchronize()
        # print((time.time()-t0)/iters*1e6)

        # torch.cuda.synchronize()
        # t0 = time.time()
        # for i in range(iters):
        #    torch.matmul(b, a.t())
        # torch.cuda.synchronize()
        # print((time.time()-t0)/iters*1e6)

    @pytest.mark.parametrize("device", get_available_devices())
    @pytest.mark.parametrize("double_quant", TRUE_FALSE, ids=lambda double_quant: f"DQ_{double_quant}")
    @pytest.mark.parametrize("storage_type", ["nf4", "fp4"])
    @pytest.mark.parametrize("kind", ["fc1", "fc2", "attn", "attn_packed"])
    @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype)
    @pytest.mark.parametrize(
        "quant_storage",
        [torch.uint8, torch.float16, torch.bfloat16, torch.float32],
        ids=describe_dtype,
    )
    @pytest.mark.parametrize("dim", [128, 256, 512, 1024], ids=id_formatter("dim"))
    def test_gemv_4bit(self, device, dim, dtype, storage_type, quant_storage, double_quant, kind):
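        """Compare the 4-bit GEMV paths against a full-precision matmul reference
        for inference-shaped (batch size 1) inputs across typical transformer layer
        shapes."""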
        if device == "hpu" and not is_supported_on_hpu(storage_type, dtype, quant_storage):
            pytest.skip("This configuration is not supported on HPU.")

        errs1 = []
        errs2 = []
        errs3 = []
        relerrs1 = []
        relerrs2 = []
        relerrs3 = []
        max_errs1 = []
        max_errs2 = []
        max_errs3 = []

        # Large number of iterations is excessive and slow on CPU.
        # Keep for CUDA for now.
        iters = 100 if device == "cuda" else 10

        for i in range(iters):
            if kind == "fc1":
                A = torch.randn(1, dim, dtype=dtype, device=device)
                B = torch.randn(dim * 4, dim, dtype=dtype, device=device) / math.sqrt(dim)
            elif kind == "fc2":
                A = torch.randn(1, 4 * dim, dtype=dtype, device=device)
                B = torch.randn(dim, 4 * dim, dtype=dtype, device=device) / math.sqrt(dim)
            elif kind == "attn":
                A = torch.randn(1, dim, dtype=dtype, device=device)
                B = torch.randn(dim, dim, dtype=dtype, device=device) / math.sqrt(dim)
            elif kind == "attn_packed":
                A = torch.randn(1, dim, dtype=dtype, device=device)
                B = torch.randn(dim * 3, dim, dtype=dtype, device=device) / math.sqrt(dim)

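            # Three results to compare: C3 is the full-precision torch.matmul
            # reference, C2 the fused 4-bit inference kernel (F.gemv_4bit), and C1
            # the autograd-enabled path through bnb.matmul_4bit.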
            qB, state = F.quantize_4bit(
                B,
                quant_type=storage_type,
                compress_statistics=double_quant,
                quant_storage=quant_storage,
            )
            C3 = torch.matmul(A, B.t())
            C2 = F.gemv_4bit(A, qB.t(), state=state)
            A.requires_grad = True
            C1 = bnb.matmul_4bit(A, qB.t(), state)

            err1 = (C1 - C2).abs().float()
            err2 = (C3 - C2).abs().float()
            err3 = (C3 - C1).abs().float()

            mag1 = torch.abs(C1).float() + 1e-5
            mag2 = torch.abs(C3).float() + 1e-5
            mag3 = torch.abs(C3).float() + 1e-5

            relerr1 = err1 / mag1
            relerr2 = err2 / mag2
            relerr3 = err3 / mag3

            max_err1 = err1.max()
            max_err2 = err2.max()
            max_err3 = err3.max()

            errs1.append(err1.mean().item())
            errs2.append(err2.mean().item())
            errs3.append(err3.mean().item())

            relerrs1.append(relerr1.mean().item())
            relerrs2.append(relerr2.mean().item())
            relerrs3.append(relerr3.mean().item())

            max_errs1.append(max_err1.item())
            max_errs2.append(max_err2.item())
            max_errs3.append(max_err3.item())

            # Allowed number of mismatched elements, scaled with the problem size.
            c = int(C1.numel() * 0.0014 * (dim / 256)) + 1

            c = assert_all_approx_close(C1, C2, 1e-5, 0.01, count=c, throw=False)
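        # Average per-iteration errors, normalized by sqrt(dim) so the thresholds
        # below are roughly independent of the matrix dimension.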
        err1 = sum(errs1) / len(errs1) / math.sqrt(dim)
        err2 = sum(errs2) / len(errs2) / math.sqrt(dim)
        err3 = sum(errs3) / len(errs3) / math.sqrt(dim)
        relerr1 = sum(relerrs1) / len(relerrs1) / math.sqrt(dim)
        relerr2 = sum(relerrs2) / len(relerrs2) / math.sqrt(dim)
        relerr3 = sum(relerrs3) / len(relerrs3) / math.sqrt(dim)
        maxerr1 = sum(max_errs1) / len(max_errs1) / math.sqrt(dim)
        maxerr2 = sum(max_errs2) / len(max_errs2) / math.sqrt(dim)
        maxerr3 = sum(max_errs3) / len(max_errs3) / math.sqrt(dim)
        absratio = err2 / err3
        relratio = relerr2 / relerr3
        maxratio = relerr2 / relerr3

        # for debugging if the tests fails
        #
        # print('='*80)
        # print(f'For matmul: {A.shape}, {B.shape}, {kind}, {dtype}, {storage_type}, double_quant={double_quant}:')
        # print(C1.flatten()[-20:])
        # print(C2.flatten()[-20:])
        # print(f'inference vs training abs: {err1}')
        # print(f'inference vs training rel: {relerr1}')
        # print(f'inference vs training max: {maxerr1}')
        # print(f'inference vs training vs torch err ratio abs: {absratio}')
        # print(f'inference vs training vs torch err ratio rel: {relratio}')
        # print(f'inference vs training vs torch err ratio max: {maxratio}')
        if dtype == torch.float16:
            if dim <= 512:
                assert err1 < 7e-5

                # TODO(matthewdouglas): On T4, dim=128-fp16-fc2-fp4-DQ will have relerror ~ 0.00092727
                if (
                    device == "cuda"
                    and double_quant
                    and storage_type == "fp4"
                    and kind == "fc2"
                    and torch.cuda.get_device_capability() == (7, 5)
                ):
                    assert relerr1 < 0.00093
                else:
                    assert relerr1 < 0.0008
            else:
                assert err1 < 6e-5
                assert relerr1 < 2e-4
            assert absratio < 1.005 and absratio > 0.995
            assert relratio < 1.005 and relratio > 0.995
            assert maxratio < 1.005 and maxratio > 0.995
        elif dtype == torch.float32:
            if dim <= 512:
                assert err1 < 5e-8
                assert relerr1 < 1e-6
                assert maxerr1 < 1e-7
            else:
                assert err1 < 5e-8
                assert relerr1 < 8e-6
                assert maxerr1 < 1e-7
            assert absratio < 1.005 and absratio > 0.995
            assert relratio < 1.005 and relratio > 0.995
            assert maxratio < 1.005 and maxratio > 0.995
        elif dtype == torch.bfloat16:
            if dim <= 512:
                assert err1 < 6e-4
                assert relerr1 < 0.007
                assert maxerr1 < 0.015
            else:
                assert err1 < 2e-4
                assert relerr1 < 0.002
                assert maxerr1 < 0.0012
            assert absratio < 1.005 and absratio > 0.995
            assert relratio < 1.04 and relratio > 0.96
            assert maxratio < 1.02 and maxratio > 0.98

    @pytest.mark.parametrize("device", get_available_devices())
    @pytest.mark.parametrize("storage_type", ["nf4", "fp4"], ids=["nf4", "fp4"])
    @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype)
    @pytest.mark.parametrize("double_quant", [False], ids=["DQ_True"])
    def test_gemv_eye_4bit(self, device, storage_type, dtype, double_quant):
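        """B is an identity matrix, so every matmul path should reproduce A.

        Each 64-element block of eye(dim) has an absmax of 0 or 1, and both the nf4
        and fp4 code books contain 0 and 1 exactly, so the 4-bit round trip should
        be lossless here.
        """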
        if device == "cpu" and dtype == torch.bfloat16 and torch.__version__ < (2, 3):
            pytest.skip("eye doe not support bfloat16 on CPU in torch < 2.3")

        if device == "hpu" and not is_supported_on_hpu(storage_type, dtype):
            pytest.skip("This configuration is not supported on HPU.")

        dims = 10
        torch.random.manual_seed(np.random.randint(0, 412424242))
        dims = get_test_dims(0, 8192, n=dims)
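        # Round each dim up to the next multiple of 64 (the default 4-bit blocksize).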
        dims = [dim + (64 - (dim % 64)) for dim in dims]
        # for dim in [576, 5120, 3520, 5184, 1280, 4992, 5312, 2048]:
        for dim in dims:
            A = torch.normal(0, 0.1, size=(1, 1, dim), dtype=dtype, device=device)
            B = torch.eye(dim, dtype=dtype, device=device)

            qB, state = F.quantize_4bit(B, quant_type=storage_type, compress_statistics=double_quant)
            C3 = torch.matmul(A, B.t())
            C2 = bnb.matmul_4bit(A, qB.t(), state)
            A.requires_grad = True
            C1 = bnb.matmul_4bit(A, qB.t(), state)

            torch.testing.assert_close(A, C3)
            torch.testing.assert_close(A, C1)
            torch.testing.assert_close(A, C2)
        # torch.testing.assert_close(A, C1, rtol=1e-5, atol=0.00001)
        # torch.testing.assert_close(A, C2, rtol=1e-5, atol=0.080)


def test_normal_map_tree():
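    """Build the midpoint pivots for a binary search over the normal-map code.

    The values come from F.create_normal_map() (the NF4 code); this only exercises
    the level-by-level construction of the pivots.
    """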
    code = F.create_normal_map()
    values = code[:8].tolist() + code[-8:].tolist()
    num_pivots = 1
    # print(values)
    while num_pivots < 16:
        idx = list(range(16 // num_pivots // 2, 16, 16 // num_pivots))
        # print(idx)
        num_pivots *= 2
        pivots = []
        for i in idx:
            pivots.append((values[i - 1] + values[i]) / 2)
        # print(pivots)