import random
import torch
from torch_learned_nonlin import learned_nonlin
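
# learned_nonlin(x, params, dim) applies a learned nonlinearity along axis
# `dim` of x, with one parameter row per channel; the tests below assume
# params has shape (C, N + 1) with N = 2 * K.  (The exact semantics live in
# the torch_learned_nonlin extension.)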


def test_learned_nonlin_basic():
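    """Basic forward/backward check, comparing CPU and CUDA where available."""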
    for dtype in [torch.float32, torch.float64]:
        B = 2
        C = 4
        T = 10
        x = -2.0 + 0.4 * torch.arange(T, dtype=dtype)
        x = x.reshape(1, 1, T).repeat(B, C, 1)

        K = 4
        N = K * 2
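        # params: one row of N + 1 coefficients per channel, shape (C, N + 1).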
        params = torch.arange(N + 1, dtype=dtype).unsqueeze(0) + torch.arange(C, dtype=dtype).unsqueeze(1) - 3
        x.requires_grad = True
        params.requires_grad = True
        print("x = ", x)
        print("params = ", params)
        print("x.shape = ", x.shape)

        y = learned_nonlin(x, params, dim = 1)

        # Check that reshapings which preserve the `dim` axis give the same result.
        x2 = x.reshape(B, C, 5, 2)
        assert torch.allclose(learned_nonlin(x, params, dim = 1), learned_nonlin(x2, params, dim = 1).reshape(x.shape))

        x2 = x.reshape(B, 1, C, T)
        assert torch.allclose(learned_nonlin(x, params, dim = 1), learned_nonlin(x2, params, dim = 2).reshape(x.shape))

        print("y = ", y)
        y.sum().backward()

        if torch.cuda.is_available():
            # test that the CUDA forward is the same as the CPU forward.
            device = torch.device('cuda:0')
            x2 = x.to(device).detach()
            x2.requires_grad = True
            params2 = params.to(device).detach()
            params2.requires_grad = True
            y2 = learned_nonlin(x2, params2, dim = 1).to(torch.device('cpu'))
            print("Checking CUDA is same")
            if not torch.allclose(y, y2, atol=1.0e-06):
                print(f"Error: CPU versus CUDA not the same: {y} vs. {y2}, diff = {y2-y}")
                assert(0);

            y2.sum().backward()

            if not torch.allclose(x.grad, x2.grad.to('cpu'), atol=1.0e-06):
                print(f"Error: CPU x-grad versus CUDA grad not the same: {x.grad} vs. {x2.grad}, diff = {x2.grad.to('cpu')-x.grad}")
                assert False
            if not torch.allclose(params.grad, params2.grad.to('cpu'), atol=1.0e-06):
                print(f"Error: CPU params-grad versus CUDA grad not the same: {params.grad} vs. {params2.grad}, diff = {params2.grad.to('cpu')-params.grad}")
                assert False


        print("x.grad = ", x.grad)
        print("params.grad = ", params.grad)

        # Just eyeballing the above to make sure it looks reasonable.


def test_learned_nonlin_deriv():
    """ Tests derivatives in randomized way """
    for _ in range(10):
        for dtype in [torch.float32, torch.float64]:
            B = random.randrange(1, 10)
            C = random.randrange(1, 10)
            T = random.randrange(1, 20)
            x = torch.randn(B, C, T, dtype=dtype)

            K = 2 ** random.randrange(0, 4)
            N = K * 2
            params = torch.randn(C, N + 1, dtype=dtype)
            x.requires_grad = True
            params.requires_grad = True
            print(f"B,C,T,K = {B},{C},{T},{K}")
            y = learned_nonlin(x, params, dim = 1)

            y_deriv = torch.randn_like(y)
            y.backward(gradient=y_deriv)
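            # x.grad and params.grad now hold vector-Jacobian products with
            # y_deriv; the finite-difference checks below compare against them.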

            if torch.cuda.is_available():
                # test that the CUDA forward is the same as the CPU forward.
                device = torch.device('cuda:0')
                x2, params2 = x.to(device).detach(), params.to(device).detach()
                x2.requires_grad = True
                params2.requires_grad = True
                y2 = learned_nonlin(x2, params2, dim = 1)

                if 4 <= N <= 16:  # Currently backprop requires these conditions
                    y2.backward(gradient=y_deriv.to(device))
                    x2grad, params2grad = x2.grad.to('cpu'), params2.grad.to('cpu')

                y2 = y2.to('cpu')

                print("Checking CUDA is same")
                if not torch.allclose(y, y2, atol=1.0e-05):
                    print(f"Error: CPU output versus CUDA not the same: {y} vs. {y2}, diff = {y2-y}, max-diff = {(y2-y).abs().max()}")
                    assert False

                if 4 <= N <= 16:  # Currently backprop requires these conditions
                    if not torch.allclose(x.grad, x2grad, atol=1.0e-05):
                        print(f"Error: CPU x.grad versus CUDA not the same: {x.grad} vs. {x2grad}, diff = {x2grad-x.grad}, max-diff = {(x2grad-x.grad).abs().max()}")
                        assert False
                    if not torch.allclose(params.grad, params2grad, atol=1.0e-05):
                        print(f"Error: CPU params.grad versus CUDA not the same: {params.grad} vs. {params2grad}, "
                              f"diff = {params2grad-params.grad}, max-diff = {(params2grad-params.grad).abs().max()}")
                        assert False


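            # First-order check: for a small random perturbation delta_x,
            # sum(y_deriv * (f(x + delta_x) - f(x))) should be close to
            # sum(x.grad * delta_x).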
            delta = 1.0e-04
            delta_x = torch.randn_like(x) * delta
            pred_change = (x.grad * delta_x).sum()
            y2 = learned_nonlin(x + delta_x, params, dim = 1)
            observed_change = (y_deriv * (y2 - y)).sum()
            print(f"for input: pred_change = {pred_change}, observed_change={observed_change}")
125
            if not torch.allclose(pred_change, observed_change, rtol=5.0e-02, atol=3.0e-05):
126
127
                print(f"For changed input, output differs too much: params={params}, input={x}, mod_input={x+delta_x}, y={y}, y2={y2}, diff={y2-y}")
                assert 0

            delta_params = torch.randn_like(params) * delta
            pred_change = (params.grad * delta_params).sum()
            observed_change = (y_deriv * (learned_nonlin(x, params + delta_params, dim = 1) - y)).sum()
            print(f"for params: pred_change = {pred_change}, observed_change={observed_change}")
            assert torch.allclose(pred_change, observed_change, rtol=1.0e-02, atol=1.0e-05)
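

# As an alternative to the manual finite-difference checks above, PyTorch's
# numerical checker torch.autograd.gradcheck can verify the backward pass.
# This is a minimal sketch, assuming learned_nonlin supports float64 end to
# end on CPU (gradcheck needs double precision to be reliable); it is not
# part of the original test suite.
def test_learned_nonlin_gradcheck():
    B, C, T, K = 2, 3, 5, 4
    N = K * 2
    x = torch.randn(B, C, T, dtype=torch.float64, requires_grad=True)
    params = torch.randn(C, N + 1, dtype=torch.float64, requires_grad=True)
    # gradcheck raises an informative error if the analytic and numeric
    # Jacobians disagree.
    torch.autograd.gradcheck(lambda x, p: learned_nonlin(x, p, dim = 1),
                             (x, params))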



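# NOTE: the tests below call learned_nonlin with an (input, pos_add, pos_mul)
# signature that differs from the (x, params, dim) interface exercised above;
# in __main__ they are disabled under `if False:`.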
def test_learned_nonlin_zeros():
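    """Zero input with zero pos_add and unit pos_mul should give zero output."""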
    N = 1
    C = 2
    H = 3
    W = 4
    for device in [ torch.device('cpu'), torch.device('cuda:0') ]:
        if device == torch.device('cuda:0') and not torch.cuda.is_available():
            print("Warning: torch not available, not testing this part.")
            continue
        for dtype in [torch.float32, torch.float64]:
            print("device=", device, ", dtype=", dtype)
            input = torch.zeros(N, 2 * C, H, W, device=device, dtype=dtype)
            kH = 5
            kW = 5
            pos_add = torch.zeros(C, kH, kW, device=device, dtype=dtype)
            pos_mul = torch.ones(C, kH, kW, device=device, dtype=dtype)
            input.requires_grad = True
            pos_add.requires_grad = True
            pos_mul.requires_grad = True

            output_ref = torch.zeros(N, C, H, W, device=device, dtype=dtype)
            output = learned_nonlin(input, pos_add, pos_mul)
            assert torch.allclose(output, output_ref)

            output.sum().backward()
            print("input_grad=", input.grad)
            print("pos_add_grad=", pos_add.grad)
            print("pos_mul_grad=", pos_mul.grad)


def test_learned_nonlin_compare():
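    """Compares CPU and CUDA forward and backward on a fixed-size random input."""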
    N = 1
    C = 2
    H = 3
    W = 4
    if not torch.cuda.is_available():
        print("Warning: torch not available, not testing this part.")
        return
    for dtype in [torch.float32, torch.float64]:
        print("dtype=", dtype)
        input = torch.randn(N, 2 * C, H, W, dtype=dtype)
        device = torch.device('cuda:0')
        input_cuda = input.to(device).detach()

        kH = 5
        kW = 5
        pos_add = torch.randn(C, kH, kW, dtype=dtype)
        pos_mul = torch.randn(C, kH, kW, dtype=dtype)

        pos_add_cuda = pos_add.to(device).detach()
        pos_mul_cuda = pos_mul.to(device).detach()

        for x in [ pos_add, pos_mul, pos_add_cuda, pos_mul_cuda, input, input_cuda ]:
            x.requires_grad = True

        output = learned_nonlin(input, pos_add, pos_mul)
        output_cuda = learned_nonlin(input_cuda, pos_add_cuda, pos_mul_cuda)
        print("output = ", output)
        print("output_cuda = ", output_cuda)

        output_grad = torch.randn(*output.shape, dtype=dtype)
        output.backward(gradient=output_grad)
        output_cuda.backward(gradient=output_grad.to(device))

        diff = (output - output_cuda.to(torch.device('cpu'))).abs().sum()
        sum_abs = output.abs().sum()
        print("Diff = ", diff, ", abs = ", sum_abs)
        assert torch.allclose(output, output_cuda.to(torch.device('cpu')),
                              atol=1.0e-05)


        for a, b, name in [ (pos_add, pos_add_cuda, 'pos_add'),
                            (pos_mul, pos_mul_cuda, 'pos_mul'),
                            (input, input_cuda, 'input') ]:
            grad = a.grad
            cuda_grad = b.grad.to(torch.device('cpu'))
            diff_abs = (grad - cuda_grad).abs().sum().item()
            sum_abs = (grad + cuda_grad).abs().sum().item()
            print(f"Comparing grad of {name}: diff={diff_abs}, sum={sum_abs}")
            if diff_abs > 1.0e-05 * sum_abs:
                print(f"Error: too much difference in grad of {name}.")
                print("grad = ", grad)
                print("cuda_grad = ", cuda_grad)
                assert False



def test_learned_nonlin_rand_compare():
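    """Compares CPU and CUDA forward output on randomly sized inputs."""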
    for _ in range(30):
        N = random.randint(1, 256)
        C = random.randint(1, 64)
        H = random.randint(1, 128)
        W = random.randint(1, 128)

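        # Halve the largest dimension until the total size is small enough.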
        while N * C * H * W > 65535:
            if N >= C and N >= H and N >= W:
                N = N // 2
            elif C >= H and C >= W:
                C = C // 2
            elif H >= W:
                H = H // 2
            else:
                W = W // 2


        if not torch.cuda.is_available():
            print("Warning: torch not available, not testing this part.")
            return
        for dtype in [torch.float32, torch.float64]:
            print("dtype=", dtype)
            input = torch.randn(N, 2 * C, H, W, dtype=dtype)
            device = torch.device('cuda:0')
            input_cuda = input.to(device)

            kH = random.randint(1, 10)
            kW = random.randint(1, 10)
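            # Round kernel sizes up to odd values (even sizes are presumably unsupported).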
            if kH % 2 == 0:
                kH += 1
            if kW % 2 == 0:
                kW += 1
            pos_add = torch.randn(C, kH, kW, dtype=dtype)
            pos_mul = torch.randn(C, kH, kW, dtype=dtype)
            pos_add_cuda = pos_add.to(device)
            pos_mul_cuda = pos_mul.to(device)

            output = learned_nonlin(input, pos_add, pos_mul)
            output_cuda = learned_nonlin(input_cuda, pos_add_cuda, pos_mul_cuda)

            diff = (output - output_cuda.to(torch.device('cpu'))).abs().sum()
            sum_abs = output.abs().sum()
            print("Diff = ", diff, ", abs = ", sum_abs)

            if (diff / sum_abs).item() > 0.001:
                print("output = ", output)
                print("output_cuda = ", output_cuda)
                assert 0, "outputs differ"



def test_learned_nonlin_rand_grad():
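    """Finite-difference gradient check on randomly sized inputs, CPU and CUDA."""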
    for _ in range(30):
        N = random.randint(1, 256)
        C = random.randint(1, 64)
        H = random.randint(1, 128)
        W = random.randint(1, 128)

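        # Halve the largest dimension until the total size is small enough.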
        while N * C * H * W > 65535:
            if N >= C and N >= H and N >= W:
                N = N // 2
            elif C >= H and C >= W:
                C = C // 2
            elif H >= W:
                H = H // 2
            else:
                W = W // 2

        for device in [ torch.device('cpu'), torch.device('cuda:0') ]:
            if device == torch.device('cuda:0') and not torch.cuda.is_available():
                print("Warning: torch not available, not testing this part.")
                continue
            for dtype in [torch.float32, torch.float64]:
                print("dtype=", dtype, ", device=", device)
                input = torch.randn(N, 2 * C, H, W, dtype=dtype, device=device)


                kH = random.randint(1, 10)
                kW = random.randint(1, 10)
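                # Round kernel sizes up to odd values (even sizes are presumably unsupported).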
                if kH % 2 == 0:
                    kH += 1
                if kW % 2 == 0:
                    kW += 1
                pos_add = torch.randn(C, kH, kW, dtype=dtype, device=device)
                pos_mul = torch.randn(C, kH, kW, dtype=dtype, device=device)
                input.requires_grad = True
                pos_add.requires_grad = True
                pos_mul.requires_grad = True

                output = learned_nonlin(input, pos_add, pos_mul)
                output_grad = torch.randn(N, C, H, W, dtype=dtype, device=device)

                output.backward(gradient=output_grad)

                delta = 1.0e-05
                pos_delta = delta * torch.randn(C, kH, kW, dtype=dtype, device=device)
                pred_change = (pos_delta * pos_add.grad).sum().to('cpu').item()
                change = (output_grad * (learned_nonlin(input, pos_add + pos_delta, pos_mul) - output )).sum().to('cpu').item()
                print(f"For pos_add: pred_change={pred_change}, change={change}")
                #assert abs(pred_change - change)  < 1.0e-04

                pred_change = (pos_delta * pos_mul.grad).sum().to('cpu').item()
                change = (output_grad * (learned_nonlin(input, pos_add, pos_mul + pos_delta) - output )).sum().to('cpu').item()
                print(f"For pos_mul: pred_change={pred_change}, change={change}")
                #assert abs(pred_change - change) / abs(change) < 1.0e-04

                input_delta = delta * torch.randn(N, 2*C, H, W, dtype=dtype, device=device)
                pred_change = (input_delta * input.grad).sum().to('cpu').item()
                change = (output_grad * (learned_nonlin(input + input_delta, pos_add, pos_mul) - output )).sum().to('cpu').item()
                print(f"For input: pred_change={pred_change}, change={change}")
                #assert abs(pred_change - change) / abs(change) < 1.0e-04


if __name__ == "__main__":
    test_learned_nonlin_basic()
    test_learned_nonlin_deriv()
    if False:
        test_learned_nonlin_rand_grad()
        test_learned_nonlin_zeros()
        test_learned_nonlin_compare()
        test_learned_nonlin_rand_compare()