"online_apiserver_test/benchmarks/kernels/benchmark_aqlm.py" did not exist on "fba2e3b53349552607f568c17f48428c716c8c65"
test_tvd.py 5.5 KB
Newer Older
cmx's avatar
cmx committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
import pytest
import torch

from test.utils import supports_bfloat16

from liger_kernel.transformers.tvd import LigerTVDLoss
from liger_kernel.utils import infer_device


class TorchTVDLoss(torch.nn.Module):
    """Pure-PyTorch reference for total variation distance loss.

    TVD(p, q) = 0.5 * |p - q| element-wise, optionally masking out rows whose
    label equals ``ignore_index``, then reduced per ``reduction``
    ("batchmean", "sum", "mean", or "none").
    """

    def __init__(self, reduction="batchmean", ignore_index: int = -100):
        super().__init__()
        self.reduction = reduction
        self.ignore_index = ignore_index

    def forward(self, p, q, label=None):
        # Element-wise total variation distance between the two distributions.
        pointwise = (p - q).abs() / 2.0
        denom = p.size(0)
        if label is not None:
            # Zero out every row whose label is the ignore index and count
            # only the remaining rows in the denominator.
            keep = label.unsqueeze(1) != self.ignore_index
            pointwise = torch.where(keep, pointwise, torch.zeros_like(pointwise))
            denom = (label != self.ignore_index).sum().item()
            if denom == 0:
                # Everything is ignored: the loss is defined to be zero.
                return torch.tensor(0.0).to(pointwise.device)

        if self.reduction == "sum":
            return torch.sum(pointwise)
        if self.reduction == "none":
            return pointwise
        if self.reduction == "mean":
            return torch.sum(pointwise) / (denom * p.size(1))
        if self.reduction == "batchmean":
            return torch.sum(pointwise) / denom
        raise ValueError("Invalid reduction type.")


# (B, T, V) shapes shared by every test below: batch size, sequence length,
# vocabulary size.  Inputs are flattened to B * T rows of V columns.
_SHAPE_PARAMS = (
    "B, T, V",
    [
        (1, 4096, 32000),
        (32, 4096, 1024),
        (41, 401, 1271),  # deliberately non-round sizes to exercise odd strides
        pytest.param(
            1,
            4096,
            128256,
            # Skip this large-vocabulary case on devices with < 36 GB memory.
            # NOTE(review): assumes infer_device() names a torch device module
            # with is_available()/get_device_properties() (e.g. torch.cuda) —
            # confirm this also holds for non-CUDA backends.
            marks=pytest.mark.skipif(
                hasattr(torch, infer_device())
                and getattr(torch, infer_device()).is_available()
                and getattr(torch, infer_device()).get_device_properties(0).total_memory < 36e9,
                reason="This test requires a GPU with at least 36GB of memory",
            ),
        ),
        (3, 423, 32000),
    ],
)

# (dtype, atol, rtol) triples shared by every test below.  bfloat16 is skipped
# on hardware without support; float16 is disabled entirely (see comment).
_DTYPE_PARAMS = (
    "dtype, atol, rtol",
    [
        pytest.param(
            torch.bfloat16,
            1e-8,
            1e-6,
            marks=pytest.mark.skipif(not supports_bfloat16(), reason="bfloat16 not supported on this GPU"),
        ),
        (torch.float32, 1e-8, 1e-6),
        # (torch.float16, 1e-1, 1e-2), # turn off because of numerical instability of torch.float16
    ],
)


def _test_correctness_once(
    target_tvd,
    torch_tvd,
    B,
    T,
    V,
    dtype,
    atol,
    rtol,
    reduction,
    is_last_layer=True,
    device=infer_device(),
):
    torch.manual_seed(0)
    input = torch.randn(B * T, V, device=device, dtype=dtype, requires_grad=True)

    x1 = input.detach().clone().requires_grad_(True)
    x2 = input.detach().clone().requires_grad_(True)

    with torch.no_grad():
        target = torch.randn(B * T, V, device=device).softmax(dim=-1)

    output = target_tvd(x1, target)
    output2 = torch_tvd(x2, target)

    assert torch.allclose(output, output2, atol=atol, rtol=rtol)

    if not is_last_layer:
        output = output * 2.0
        output2 = output2 * 2.0

    if reduction == "none":
        return

    output.backward()
    output2.backward()
    assert torch.allclose(x1.grad, x2.grad, atol=atol, rtol=rtol)


def _test_correctness_with_ignore_index_once(
    target_tvd,
    torch_tvd,
    ignore_index,
    B,
    T,
    V,
    dtype,
    atol,
    rtol,
    reduction,
    device=infer_device(),
):
    input = torch.randn(B * T, V, device=device, dtype=dtype, requires_grad=True)

    x1 = input.detach().clone().requires_grad_(True)
    x2 = input.detach().clone().requires_grad_(True)

    with torch.no_grad():
        target = torch.randn(B * T, V, device=device).softmax(dim=-1)

    label = torch.randint(0, V, (B * T,), device=device, dtype=torch.long)

    num_elements_to_assign = torch.randint(1, B * T // 2, (1,)).item()
    indices_to_assign = torch.randperm(B * T)[:num_elements_to_assign]
    label[indices_to_assign] = ignore_index

    output = torch_tvd(x1, target, label)
    output2 = target_tvd(x2, target, label)

    assert torch.allclose(output, output2, atol=atol, rtol=rtol)

    if reduction == "none":
        return

    output.backward()
    output2.backward()
    assert torch.allclose(x1.grad, x2.grad, atol=atol, rtol=rtol)


@pytest.mark.parametrize(*_SHAPE_PARAMS)
@pytest.mark.parametrize("reduction", ["batchmean", "sum", "mean", "none"])
@pytest.mark.parametrize(*_DTYPE_PARAMS)
def test_correctness(B, T, V, reduction, dtype, atol, rtol):
    """Liger TVD loss must match the PyTorch reference as the final layer."""
    _test_correctness_once(
        LigerTVDLoss(reduction=reduction),
        TorchTVDLoss(reduction=reduction),
        B,
        T,
        V,
        dtype,
        atol,
        rtol,
        reduction,
    )


@pytest.mark.parametrize(*_SHAPE_PARAMS)
@pytest.mark.parametrize("reduction", ["batchmean", "sum", "mean", "none"])
@pytest.mark.parametrize(*_DTYPE_PARAMS)
def test_correctness_not_last(B, T, V, reduction, dtype, atol, rtol):
    """Same agreement check, but with downstream scaling applied to the loss."""
    liger_loss = LigerTVDLoss(reduction=reduction)
    reference_loss = TorchTVDLoss(reduction=reduction)
    _test_correctness_once(liger_loss, reference_loss, B, T, V, dtype, atol, rtol, reduction, is_last_layer=False)


@pytest.mark.parametrize(*_SHAPE_PARAMS)
@pytest.mark.parametrize("reduction", ["batchmean", "sum", "mean", "none"])
@pytest.mark.parametrize(*_DTYPE_PARAMS)
@pytest.mark.parametrize("ignore_index", [-100, 0, 1])
def test_correctness_with_ignore_index(B, T, V, reduction, dtype, atol, rtol, ignore_index):
    """Liger TVD loss must match the reference when labels carry an ignore index."""
    liger_loss = LigerTVDLoss(reduction=reduction, ignore_index=ignore_index)
    reference_loss = TorchTVDLoss(reduction=reduction, ignore_index=ignore_index)
    _test_correctness_with_ignore_index_once(
        liger_loss, reference_loss, ignore_index, B, T, V, dtype, atol, rtol, reduction
    )