test_mm_int8int2.py 2.47 KB
Newer Older
cmx's avatar
cmx committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
import pytest
import torch

from liger_kernel.ops.experimental.mm_int8int2 import matmul
from liger_kernel.ops.experimental.mm_int8int2 import pack_weights
from liger_kernel.ops.experimental.mm_int8int2 import unpack_weights
from liger_kernel.utils import infer_device

# Device used for every tensor created in this module. It is resolved once at
# import time and fed into the ``device`` entries of the parametrize lists below.
# NOTE(review): presumably infer_device() picks the available accelerator — confirm
# against liger_kernel.utils.
device = infer_device()


# The weight is stored packed as uint8 and unpacked with bits=2, so the activation
# feature dimension is input_features = size * 4.
@pytest.mark.skip(reason="mm_int8int2 is under experimentation")
@pytest.mark.parametrize(
    "size",
    [
        2048,
        1024,
        512,
    ],
)
@pytest.mark.parametrize(
    "batch_size",
    [1, 2, 3, 8],
)
@pytest.mark.parametrize(
    "seq_len",
    [1, 7, 16, 2048],
)
@pytest.mark.parametrize(
    "out_features",
    [
        1024,
        2048,
        4096,
        10000,
    ],
)
@pytest.mark.parametrize(
    "atol, rtol, device",
    [
        (1e-2, 1e-2, device),
    ],
)
def test_kernel_correctness(batch_size, seq_len, out_features, size, atol, rtol, device):
    """Compare the mm_int8int2 ``matmul`` kernel with a plain torch reference.

    Random int8 activations of shape ``(batch_size, seq_len, size * 4)`` are
    multiplied by a random packed uint8 weight created as ``(out_features, size)``
    and handed to the kernel transposed. The reference result unpacks the same
    transposed weight with ``unpack_weights(..., bits=2)`` and multiplies in
    float32; both results must agree within ``atol`` / ``rtol``.
    """
    print(f"\nTesting kernel with size: {size}, atol: {atol}, rtol: {rtol}")

    # torch.randint's upper bound is exclusive: use 128 and 256 so that the extreme
    # values 127 and 255 (every 2-bit field set) are actually generated.
    # -128 stays excluded, as in the original lower bound, keeping the range symmetric.
    ht = torch.randint(-127, 128, (batch_size, seq_len, size * 4), device=device, dtype=torch.int8)
    u = torch.randint(0, 256, (out_features, size), device=device, dtype=torch.uint8)

    # Calculate dimensions
    B, M, N = ht.size()
    packed_t = u.T  # (size, out_features) view of the packed weight

    # Compute triton output: the kernel is fed 2-D operands, so batch and sequence
    # dims are flattened for the call and restored afterwards.
    triton_output = matmul(ht.view(B * M, N), packed_t.contiguous()).view(B, M, -1)

    # Unpack weights and compute torch output.
    # The unpacked weight must be (size * 4, out_features) for this matmul to line up.
    unpacked = unpack_weights(packed_t, bits=2)
    torch_output = torch.matmul(ht.to(torch.float32), unpacked.contiguous().to(torch.float32))

    # Print the results (optional, can be commented out)
    print("triton_output =", triton_output)
    print("torch_output =", torch_output)

    # Check if outputs are close within the given tolerances
    assert torch.allclose(triton_output, torch_output.to(torch.int32), atol=atol, rtol=rtol), "Results differ"


@pytest.mark.skip(reason="mm_int8int2 is under experimentation")
@pytest.mark.parametrize(
    "size",
    [
        2048,
        1024,
        512,
    ],
)
@pytest.mark.parametrize(
    "out_features",
    [
        1024,
        2048,
        4096,
        10000,
    ],
)
@pytest.mark.parametrize(
    "device",
    [
        device,
    ],
)
def test_unpack_pack_correctness(out_features, size, device):
    """Check that ``pack_weights`` inverts ``unpack_weights`` for 2-bit packing.

    A random uint8 tensor created as ``(out_features, size)`` is transposed,
    unpacked and packed again; the round trip must reproduce the transposed
    tensor element for element.
    """
    # torch.randint's upper bound is exclusive: 256 is needed for byte 255 to occur.
    u = torch.randint(0, 256, (out_features, size), device=device, dtype=torch.uint8)
    packed = u.T

    # Pass the bit width explicitly on both sides so the round trip does not rely
    # on unpack_weights' default matching the 2 given to pack_weights.
    roundtrip = pack_weights(unpack_weights(packed, bits=2), 2)

    assert (roundtrip == packed).all(), "Packed weights do not match original weights."