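"""Tests for the `learned_nonlin` op from torch_learned_nonlin: checks that
the CPU and CUDA forward/backward implementations agree, and verifies the
gradients with randomized finite-difference checks."""
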
# Caution: these tests will fail occasionally because the numerical tolerance
# cutoffs are not quite large enough.  As long as they pass most of the time,
# that's OK.

import random
import torch
from torch_learned_nonlin import learned_nonlin


def test_learned_nonlin_basic():
    for dtype in [torch.float32, torch.float64]:
        B = 2
        C = 4
        T = 10
        x = -2.0 + 0.4 * torch.arange(T, dtype=dtype)
        x = x.reshape(1, 1, T).repeat(B, C, 1)


        K = 4
        N = K * 2
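        # params has one row of N + 1 values per channel (presumably an offset
        # plus N piecewise-linear slopes -- an assumption about the op's
        # layout); the rows differ across channels so mix-ups would be caught.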
        params = torch.arange(N + 1, dtype=dtype).unsqueeze(0) + torch.arange(C, dtype=dtype).unsqueeze(1) - 3
        x.requires_grad = True
        params.requires_grad = True
        print("x = ", x)
        print("params = ", params)
        print("x.shape = ", x.shape)

        y = learned_nonlin(x, params, dim = 1)


        # Check that the output is unchanged by reshapes that keep the
        # channel dimension intact, with `dim` pointing at it.
        x2 = x.reshape(B, C, 5, 2)
        assert torch.allclose(learned_nonlin(x, params, dim = 1),
                              learned_nonlin(x2, params, dim = 1).reshape(x.shape))

        x2 = x.reshape(B, 1, C, 10)
        assert torch.allclose(learned_nonlin(x, params, dim = 1),
                              learned_nonlin(x2, params, dim = 2).reshape(x.shape))

        print("y = ", y)
        y.sum().backward()

        if torch.cuda.is_available():
            # test that the CUDA forward is the same as the CPU forward.
            device = torch.device('cuda:0')
            x2 = x.to(device).detach()
            x2.requires_grad = True
            params2 = params.to(device).detach()
            params2.requires_grad = True
            y2 = learned_nonlin(x2, params2, dim = 1).to(torch.device('cpu'))
            print("Checking CUDA is same")
            if not torch.allclose(y, y2, atol=1.0e-06):
                print(f"Error: CPU versus CUDA not the same: {y} vs. {y2}, diff = {y2-y}")
                assert 0

            y2.sum().backward()

            if not torch.allclose(x.grad, x2.grad.to('cpu'), atol=1.0e-06):
                print(f"Error: CPU x-grad versus CUDA grad not the same: {x.grad} vs. {x2.grad}, diff = {x2.grad.to('cpu')-x.grad}")
                assert 0
            if not torch.allclose(params.grad, params2.grad.to('cpu'), atol=1.0e-06):
                print(f"Error: CPU params-grad versus CUDA grad not the same: {params.grad} vs. {params2.grad}, diff = {params2.grad.to('cpu')-params.grad}")
                assert 0


        print("x.grad = ", x.grad)
        print("params.grad = ", params.grad)

        # Just eyeballing the above to make sure it looks reasonable.


def test_learned_nonlin_deriv():
    """ Tests derivatives in randomized way """
    for _ in range(10):
        for dtype in [torch.float32, torch.float64]:
            B = random.randrange(1, 10)
            C = random.randrange(1, 10)
            T = random.randrange(1, 20)
            x = torch.randn(B, C, T, dtype=dtype)

            K = 2 ** random.randrange(0, 4)
            N = K * 2
            params = torch.randn(C, N + 1, dtype=dtype)
            x.requires_grad = True
            params.requires_grad = True
            print(f"B,C,T,K = {B},{C},{T},{K}")
            y = learned_nonlin(x, params, dim = 1)

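            # Backprop a random output gradient: x.grad and params.grad then
            # hold vector-Jacobian products, which the finite-difference
            # checks below are compared against.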
            y_deriv = torch.randn_like(y)
            y.backward(gradient=y_deriv)

            if torch.cuda.is_available():
                # test that the CUDA forward is the same as the CPU forward.
                device = torch.device('cuda:0')
                x2, params2 = x.to(device).detach(), params.to(device).detach()
                x2.requires_grad = True
                params2.requires_grad = True
                y2 = learned_nonlin(x2, params2, dim = 1)

                if 4 <= N <= 16:  # Currently backprop requires these conditions
                    y2.backward(gradient=y_deriv.to(device))
                    x2grad, params2grad = x2.grad.to('cpu'), params2.grad.to('cpu')

                y2 = y2.to('cpu')

                print("Checking CUDA is same")
                if not torch.allclose(y, y2, atol=1.0e-05):
                    print(f"Error: CPU output versus CUDA not the same: {y} vs. {y2}, diff = {y2-y}, max-diff = {(y2-y).abs().max()}")
                    assert 0

                if 4 <= N <= 16:  # Currently backprop requires these conditions
                    if not torch.allclose(x.grad, x2grad, atol=1.0e-05):
                        print(f"Error: CPU x.grad versus CUDA not the same: {x.grad} vs. {x2grad}, diff = {x2grad-x.grad}, max-diff = {(x2grad-x.grad).abs().max()}")
                        assert 0
                    if not torch.allclose(params.grad, params2grad, atol=1.0e-05):
                        print(f"Error: CPU params.grad versus CUDA not the same: {params.grad} vs. {params2grad}, "
                              f"diff = {params2grad-params.grad}, max-diff = {(params2grad-params.grad).abs().max()}")
                        assert 0


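            # First-order finite-difference check on the input gradient: for a
            # small delta_x, sum(y_deriv * (f(x + delta_x) - f(x))) should be
            # close to sum(x.grad * delta_x).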
            delta = 1.0e-04
            delta_x = torch.randn_like(x) * delta
            pred_change = (x.grad * delta_x).sum()
            y2 = learned_nonlin(x + delta_x, params, dim = 1)
            observed_change = (y_deriv * (y2 - y)).sum()
            print(f"for input: pred_change = {pred_change}, observed_change={observed_change}")
            if not torch.allclose(pred_change, observed_change, rtol=5.0e-02, atol=3.0e-05):
                print(f"For changed input, output differs too much: params={params}, input={x}, mod_input={x+delta_x}, y={y}, y2={y2}, diff={y2-y}")
                assert 0

            delta_params = torch.randn_like(params) * delta
            pred_change = (params.grad * delta_params).sum()
            observed_change = (y_deriv * (learned_nonlin(x, params + delta_params, dim = 1) - y)).sum()
            print(f"for params: pred_change = {pred_change}, observed_change={observed_change}")
            assert torch.allclose(pred_change, observed_change, rtol=1.0e-02, atol=1.0e-05)



def test_learned_nonlin_zeros():
    N = 1
    C = 2
    H = 3
    W = 4
    for device in [ torch.device('cpu'), torch.device('cuda:0') ]:
        if device == torch.device('cuda:0') and not torch.cuda.is_available():
            print("Warning: torch not available, not testing this part.")
            continue
        for dtype in [torch.float32, torch.float64]:
            print("device=", device, ", dtype=", dtype)
            input = torch.zeros(N, 2 * C, H, W, device=device, dtype=dtype)
            kH = 5
            kW = 5
            pos_add = torch.zeros(C, kH, kW, device=device, dtype=dtype)
            pos_mul = torch.ones(C, kH, kW, device=device, dtype=dtype)
            input.requires_grad = True
            pos_add.requires_grad = True
            pos_mul.requires_grad = True

            output_ref = torch.zeros(N, C, H, W, device=device, dtype=dtype)
            output = learned_nonlin(input, pos_add, pos_mul)
            assert torch.allclose(output, output_ref)

            output.sum().backward()
            print("input_grad=", input.grad)
            print("pos_add_grad=", pos_add.grad)
            print("pos_mul_grad=", pos_mul.grad)


def test_learned_nonlin_compare():
    N = 1
    C = 2
    H = 3
    W = 4
    if not torch.cuda.is_available():
        print("Warning: torch not available, not testing this part.")
        return
    for dtype in [torch.float32, torch.float64]:
        print("dtype=", dtype)
        input = torch.randn(N, 2 * C, H, W, dtype=dtype)
        device = torch.device('cuda:0')
        input_cuda = input.to(device).detach()

        kH = 5
        kW = 5
        pos_add = torch.randn(C, kH, kW, dtype=dtype)
        pos_mul = torch.randn(C, kH, kW, dtype=dtype)

        pos_add_cuda = pos_add.to(device).detach()
        pos_mul_cuda = pos_mul.to(device).detach()

        for x in [ pos_add, pos_mul, pos_add_cuda, pos_mul_cuda, input, input_cuda ]:
            x.requires_grad = True

        output = learned_nonlin(input, pos_add, pos_mul)
        output_cuda = learned_nonlin(input_cuda, pos_add_cuda, pos_mul_cuda)
        print("output = ", output)
        print("output_cuda = ", output_cuda)

        output_grad = torch.randn(*output.shape, dtype=dtype)
        output.backward(gradient=output_grad)
        output_cuda.backward(gradient=output_grad.to(device))

        diff = (output - output_cuda.to(torch.device('cpu'))).abs().sum()
        sum_abs = output.abs().sum()
        print("Diff = ", diff, ", abs = ", sum_abs)
        assert torch.allclose(output, output_cuda.to(torch.device('cpu')),
                              atol=1.0e-05)


        for a,b,name in [ (pos_add, pos_add_cuda, 'pos_add'),
                          (pos_mul, pos_mul_cuda, 'pos_mul'),
                          (input, input_cuda, 'input') ]:
            grad = a.grad
            cuda_grad = b.grad.to(torch.device('cpu'))
            diff_abs = (grad - cuda_grad).abs().sum().item()
            sum_abs = (grad + cuda_grad).abs().sum().item()
            print(f"Comparing grad of {name}: diff={diff_abs}, sum={sum_abs}")
            if diff_abs > 1.0e-05 * sum_abs:
                print(f"Error: too much difference in grad of {name}.")
                print("grad = ", grad)
                print("cuda_grad = ", cuda_grad)
                assert 0



def test_learned_nonlin_rand_compare():
    for _ in range(30):
        N = random.randint(1, 256)
        C = random.randint(1, 64)
        H = random.randint(1, 128)
        W = random.randint(1, 128)

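        # Halve the largest dimension until the total element count fits;
        # 65535 is presumably a kernel launch limit in the CUDA code.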
        while N * C * H * W > 65535:
            if N >= C and N >= H and N >= W:
                N = N // 2
            elif C >= H and C >= W:
                C = C // 2
            elif H >= W:
                H = H // 2
            else:
                W = W // 2


        if not torch.cuda.is_available():
            print("Warning: torch not available, not testing this part.")
            return
        for dtype in [torch.float32, torch.float64]:
            print("dtype=", dtype)
            input = torch.randn(N, 2 * C, H, W, dtype=dtype)
            device = torch.device('cuda:0')
            input_cuda = input.to(device)

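            # Round kernel sizes up to odd values (presumably so the kernel
            # has a well-defined center position).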
            kH = random.randint(1, 10)
            kW = random.randint(1, 10)
            if kH % 2 == 0:
                kH += 1
            if kW % 2 == 0:
                kW += 1
            pos_add = torch.randn(C, kH, kW, dtype=dtype)
            pos_mul = torch.randn(C, kH, kW, dtype=dtype)
            pos_add_cuda = pos_add.to(device)
            pos_mul_cuda = pos_mul.to(device)

            output = learned_nonlin(input, pos_add, pos_mul)
            output_cuda = learned_nonlin(input_cuda, pos_add_cuda, pos_mul_cuda)

            diff = (output - output_cuda.to(torch.device('cpu'))).abs().sum()
            sum_abs = output.abs().sum()
            print("Diff = ", diff, ", abs = ", sum_abs)

            if (diff / sum_abs).item() > 0.001:
                print("output = ", output)
                print("output_cuda = ", output_cuda)
                assert 0, "outputs differ"



def test_learned_nonlin_rand_grad():
    for _ in range(30):
        N = random.randint(1, 256)
        C = random.randint(1, 64)
        H = random.randint(1, 128)
        W = random.randint(1, 128)

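        # As above, halve the largest dimension to keep the total size under
        # the presumed CUDA launch limit of 65535.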
        while N * C * H * W > 65535:
            if N >= C and N >= H and N >= W:
                N = N // 2
            elif C >= H and C >= W:
                C = C // 2
            elif H >= W:
                H = H // 2
            else:
                W = W // 2

        for device in [ torch.device('cpu'), torch.device('cuda:0') ]:
            if device == torch.device('cuda:0') and not torch.cuda.is_available():
                print("Warning: torch not available, not testing this part.")
                continue
            for dtype in [torch.float32, torch.float64]:
                print("dtype=", dtype, ", device=", device)
                input = torch.randn(N, 2 * C, H, W, dtype=dtype, device=device)


                kH = random.randint(1, 10)
                kW = random.randint(1, 10)
                if kH % 2 == 0:
                    kH += 1
                if kW % 2 == 0:
                    kW += 1
                pos_add = torch.randn(C, kH, kW, dtype=dtype, device=device)
                pos_mul = torch.randn(C, kH, kW, dtype=dtype, device=device)
                input.requires_grad = True
                pos_add.requires_grad = True
                pos_mul.requires_grad = True

                output = learned_nonlin(input, pos_add, pos_mul)
                output_grad = torch.randn(N, C, H, W, dtype=dtype, device=device)

                output.backward(gradient=output_grad)

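                # First-order finite-difference checks, as in
                # test_learned_nonlin_deriv: the change predicted from the
                # gradient should match the observed change in the output.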
                delta = 1.0e-05
                pos_delta = delta * torch.randn(C, kH, kW, dtype=dtype, device=device)
                pred_change = (pos_delta * pos_add.grad).sum().to('cpu').item()
                change = (output_grad * (learned_nonlin(input, pos_add + pos_delta, pos_mul) - output )).sum().to('cpu').item()
                print(f"For pos_add: pred_change={pred_change}, change={change}")
                #assert abs(pred_change - change)  < 1.0e-04

                pred_change = (pos_delta * pos_mul.grad).sum().to('cpu').item()
                change = (output_grad * (learned_nonlin(input, pos_add, pos_mul + pos_delta) - output )).sum().to('cpu').item()
                print(f"For pos_mul: pred_change={pred_change}, change={change}")
                #assert abs(pred_change - change) / abs(change) < 1.0e-04

                input_delta = delta * torch.randn(N, 2*C, H, W, dtype=dtype, device=device)
                pred_change = (input_delta * input.grad).sum().to('cpu').item()
                change = (output_grad * (learned_nonlin(input + input_delta, pos_add, pos_mul) - output )).sum().to('cpu').item()
                print(f"For input: pred_change={pred_change}, change={change}")
                #assert abs(pred_change - change) / abs(change) < 1.0e-04


if __name__ == "__main__":
    test_learned_nonlin_basic()
    test_learned_nonlin_deriv()
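
    # The tests below use a learned_nonlin(input, pos_add, pos_mul) interface
    # that does not match the learned_nonlin(x, params, dim) API exercised
    # above; presumably they target a different version of the op, hence
    # they are disabled.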
    if False:
        test_learned_nonlin_rand_grad()
        test_learned_nonlin_zeros()
        test_learned_nonlin_compare()
        test_learned_nonlin_rand_compare()