import random
import torch
from torch_learned_nonlin import learned_nonlin
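
# Conventions used throughout these tests (inferred from this file, not from
# external documentation): x has shape (B, C, T), params has shape (C, N + 1)
# with N = 2 * K, and learned_nonlin(x, params, dim=1) is differentiable in
# both x and params.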


def test_learned_nonlin_basic():
    for dtype in [torch.float32, torch.float64]:
        B = 2
        C = 4
        T = 10
        x = -2.0 + 0.4 * torch.arange(T, dtype=dtype)
        x = x.reshape(1, 1, T).repeat(B, C, 1)

        K = 4
        N = K * 2
        # (C, N + 1) parameter matrix: row c is [c - 3, c - 2, ..., c + N - 3].
        params = torch.arange(N + 1, dtype=dtype).unsqueeze(0) + torch.arange(C, dtype=dtype).unsqueeze(1) - 3
        x.requires_grad = True
        params.requires_grad = True
        print("x = ", x)
        print("params = ", params)
        print("x.shape = ", x.shape)
        y = learned_nonlin(x, params, dim=1)

        print("y = ", y)
        y.sum().backward()

        if torch.cuda.is_available():
            # test that the CUDA forward is the same as the CPU forward.
            device = torch.device('cuda:0')
            x2 = x.to(device).detach()
            x2.requires_grad = True
            params2 = params.to(device).detach()
            params2.requires_grad = True
            y2 = learned_nonlin(x2, params2, dim=1).to(torch.device('cpu'))
            print("Checking CUDA is same")
            if not torch.allclose(y, y2, atol=1.0e-06):
                print(f"Error: CPU versus CUDA not the same: {y} vs. {y2}, diff = {y2-y}")
                assert 0

            y2.sum().backward()

            if not torch.allclose(x.grad, x2.grad.to('cpu'), atol=1.0e-06):
                print(f"Error: CPU x-grad versus CUDA grad not the same: {x.grad} vs. {x2.grad}, diff = {x2.grad.to('cpu')-x.grad}")
                assert 0
            if not torch.allclose(params.grad, params2.grad.to('cpu'), atol=1.0e-06):
                print(f"Error: CPU params-grad versus CUDA grad not the same: {params.grad} vs. {params2.grad}, diff = {params2.grad.to('cpu')-params.grad}")
                assert 0


        print("x.grad = ", x.grad)
        print("params.grad = ", params.grad)

        # Just eyeballing the above to make sure it looks reasonable.
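

# A minimal sketch of the CPU-vs-CUDA comparison pattern inlined above (and
# again in the tests below): run the same function on both devices and require
# the outputs to agree within a tolerance.  Illustrative only; the tests keep
# their inline versions.
def _assert_cpu_cuda_close(f, *tensors, atol=1.0e-06):
    y_cpu = f(*tensors)
    device = torch.device('cuda:0')
    y_cuda = f(*[t.to(device) for t in tensors]).to('cpu')
    assert torch.allclose(y_cpu, y_cuda, atol=atol), f"diff = {y_cuda - y_cpu}"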


def test_learned_nonlin_deriv():
    """ Tests derivatives in randomized way """
    for _ in range(10):
        for dtype in [torch.float32, torch.float64]:
            B = random.randrange(1, 10)
            C = random.randrange(1, 10)
            T = random.randrange(1, 20)
            x = torch.randn(B, C, T, dtype=dtype)

            K = 2 ** random.randrange(0, 4)
            N = K * 2
            params = torch.randn(C, N + 1, dtype=dtype)
            x.requires_grad = True
            params.requires_grad = True
            print(f"B,C,T,K = {B},{C},{T},{K}")
            y = learned_nonlin(x, params, dim=1)

            y_deriv = torch.randn_like(y)
            y.backward(gradient=y_deriv)

            if torch.cuda.is_available():
                # test that the CUDA forward is the same as the CPU forward.
                device = torch.device('cuda:0')
                x2, params2 = x.to(device).detach(), params.to(device).detach()
                x2.requires_grad = True
                params2.requires_grad = True
                y2 = learned_nonlin(x2, params2, dim=1)

                if N >= 4 and N <= 16:  # Currently backprop requires these conditions
                    y2.backward(gradient=y_deriv.to(device))
                    x2grad, params2grad = x2.grad.to('cpu'), params2.grad.to('cpu')

                y2 = y2.to('cpu')

                print("Checking CUDA is same")
                if not torch.allclose(y, y2, atol=1.0e-05):
                    print(f"Error: CPU output versus CUDA not the same: {y} vs. {y2}, diff = {y2-y}, max-diff = {(y2-y).abs().max()}")
                    assert 0

                if N >= 4 and N <= 16:  # Currently backprop requires these conditions
                    if not torch.allclose(x.grad, x2grad, atol=1.0e-05):
                        print(f"Error: CPU x.grad versus CUDA not the same: {x.grad} vs. {x2grad}, diff = {x2grad-x.grad}, max-diff = {(x2grad-x.grad).abs().max()}")
                        assert 0
                    if not torch.allclose(params.grad, params2grad, atol=1.0e-05):
                        print(f"Error: CPU params.grad versus CUDA not the same: {params.grad} vs. {params2grad}, "
                              f"diff = {params2grad-params.grad}, max-diff = {(params2grad-params.grad).abs().max()}")
                        assert 0


            delta = 1.0e-04
            delta_x = torch.randn_like(x) * delta
            pred_change = (x.grad * delta_x).sum()
            y2 = learned_nonlin(x + delta_x, params, dim=1)
            observed_change = (y_deriv * (y2 - y)).sum()
            print(f"for input: pred_change = {pred_change}, observed_change={observed_change}")
            if not torch.allclose(pred_change, observed_change, rtol=5.0e-02, atol=3.0e-05):
                print(f"For changed input, output differs too much: params={params}, input={x}, mod_input={x+delta_x}, y={y}, y2={y2}, diff={y2-y}")
                assert 0

            delta_params = torch.randn_like(params) * delta
            pred_change = (params.grad * delta_params).sum()
            observed_change = (y_deriv * (learned_nonlin(x, params + delta_params, dim=1) - y)).sum()
            print(f"for params: pred_change = {pred_change}, observed_change={observed_change}")
            assert torch.allclose(pred_change, observed_change, rtol=1.0e-02, atol=1.0e-05)
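

# A generic version of the first-order check inlined above: for a small random
# perturbation delta_x, a correct gradient should satisfy
# (x.grad * delta_x).sum() ~= (y_deriv * (f(x + delta_x) - f(x))).sum().
# Illustrative sketch; `f` is any differentiable function of a single tensor.
def _finite_diff_check(f, x, delta=1.0e-04, rtol=5.0e-02, atol=3.0e-05):
    x = x.detach().clone().requires_grad_(True)
    y = f(x)
    y_deriv = torch.randn_like(y)
    y.backward(gradient=y_deriv)
    delta_x = torch.randn_like(x) * delta
    pred_change = (x.grad * delta_x).sum()
    observed_change = (y_deriv * (f(x + delta_x) - y)).sum()
    assert torch.allclose(pred_change, observed_change, rtol=rtol, atol=atol)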


# NOTE: the remaining tests call learned_nonlin with an (input, pos_add,
# pos_mul) signature on 4-D data, unlike the (x, params, dim) signature
# exercised above; they are disabled in the __main__ block below.
def test_learned_nonlin_zeros():
    N = 1
    C = 2
    H = 3
    W = 4
    for device in [ torch.device('cpu'), torch.device('cuda:0') ]:
        if device == torch.device('cuda:0') and not torch.cuda.is_available():
            print("Warning: torch not available, not testing this part.")
            continue
        for dtype in [torch.float32, torch.float64]:
            print("device=", device, ", dtype=", dtype)
            input = torch.zeros(N, 2 * C, H, W, device=device, dtype=dtype)
            kH = 5
            kW = 5
            pos_add = torch.zeros(C, kH, kW, device=device, dtype=dtype)
            pos_mul = torch.ones(C, kH, kW, device=device, dtype=dtype)
            input.requires_grad = True
            pos_add.requires_grad = True
            pos_mul.requires_grad = True

            output_ref = torch.zeros(N, C, H, W, device=device, dtype=dtype)
            output = learned_nonlin(input, pos_add, pos_mul)
            assert torch.allclose(output, output_ref)

            output.sum().backward()
            print("input_grad=", input.grad)
            print("pos_add_grad=", pos_add.grad)
            print("pos_mul_grad=", pos_mul.grad)


def test_learned_nonlin_compare():
    N = 1
    C = 2
    H = 3
    W = 4
    if not torch.cuda.is_available():
        print("Warning: torch not available, not testing this part.")
        return
    for dtype in [torch.float32, torch.float64]:
        print("dtype=", dtype)
        input = torch.randn(N, 2 * C, H, W, dtype=dtype)
        device = torch.device('cuda:0')
        input_cuda = input.to(device).detach()

        kH = 5
        kW = 5
        pos_add = torch.randn(C, kH, kW, dtype=dtype)
        pos_mul = torch.randn(C, kH, kW, dtype=dtype)

        pos_add_cuda = pos_add.to(device).detach()
        pos_mul_cuda = pos_mul.to(device).detach()

        for x in [ pos_add, pos_mul, pos_add_cuda, pos_mul_cuda, input, input_cuda ]:
            x.requires_grad = True

        output = learned_nonlin(input, pos_add, pos_mul)
        output_cuda = learned_nonlin(input_cuda, pos_add_cuda, pos_mul_cuda)
        print("output = ", output)
        print("output_cuda = ", output_cuda)

        output_grad = torch.randn(*output.shape, dtype=dtype)
        output.backward(gradient=output_grad)
        output_cuda.backward(gradient=output_grad.to(device))

        diff = (output - output_cuda.to(torch.device('cpu'))).abs().sum()
        sum_abs = output.abs().sum()
        print("Diff = ", diff, ", abs = ", sum_abs)
        assert torch.allclose(output, output_cuda.to(torch.device('cpu')),
                              atol=1.0e-05)


        for a,b,name in [ (pos_add, pos_add_cuda, 'pos_add'),
                          (pos_mul, pos_mul_cuda, 'pos_mul'),
                          (input, input_cuda, 'input') ]:
            grad = a.grad
            cuda_grad = b.grad.to(torch.device('cpu'))
            diff_abs = (grad - cuda_grad).abs().sum().item()
            sum_abs = (grad + cuda_grad).abs().sum().item()
            print(f"Comparing grad of {name}: diff={diff_abs}, sum={sum_abs}")
            if diff_abs > 1.0e-05 * sum_abs:
                print(f"Error: too much difference in grad of {name}.")
                print("grad = ", grad)
                print("cuda_grad = ", cuda_grad)



def test_learned_nonlin_rand_compare():
    for _ in range(30):
        N = random.randint(1, 256)
        C = random.randint(1, 64)
        H = random.randint(1, 128)
        W = random.randint(1, 128)

        # Shrink the largest dimension until the total element count fits
        # under 65535 (presumably a CUDA kernel launch limit).
        while N * C * H * W > 65535:
            if N >= C and N >= H and N >= W:
                N = N // 2
            elif C >= H and C >= W:
                C = C // 2
            elif H >= W:
                H = H // 2
            else:
                W = W // 2


        if not torch.cuda.is_available():
            print("Warning: torch not available, not testing this part.")
            return
        for dtype in [torch.float32, torch.float64]:
            print("dtype=", dtype)
            input = torch.randn(N, 2 * C, H, W, dtype=dtype)
            device = torch.device('cuda:0')
            input_cuda = input.to(device)

            kH = random.randint(1, 10)
            kW = random.randint(1, 10)
            if kH % 2 == 0:
                kH += 1
            if kW % 2 == 0:
                kW += 1
            pos_add = torch.randn(C, kH, kW, dtype=dtype)
            pos_mul = torch.randn(C, kH, kW, dtype=dtype)
            pos_add_cuda = pos_add.to(device)
            pos_mul_cuda = pos_mul.to(device)

            output = learned_nonlin(input, pos_add, pos_mul)
            output_cuda = learned_nonlin(input_cuda, pos_add_cuda, pos_mul_cuda)

            diff = (output - output_cuda.to(torch.device('cpu'))).abs().sum()
            sum_abs = output.abs().sum()
            print("Diff = ", diff, ", abs = ", sum_abs)

            if (diff / sum_abs).item() > 0.001:
                print("output = ", output)
                print("output_cuda = ", output_cuda)
                assert 0, "outputs differ"



def test_learned_nonlin_rand_grad():
    for _ in range(30):
        N = random.randint(1, 256)
        C = random.randint(1, 64)
        H = random.randint(1, 128)
        W = random.randint(1, 128)

        # As above, cap the total element count (presumably a CUDA launch limit).
        while N * C * H * W > 65535:
            if N >= C and N >= H and N >= W:
                N = N // 2
            elif C >= H and C >= W:
                C = C // 2
            elif H >= W:
                H = H // 2
            else:
                W = W // 2

        for device in [ torch.device('cpu'), torch.device('cuda:0') ]:
            if device == torch.device('cuda:0') and not torch.cuda.is_available():
                print("Warning: torch not available, not testing this part.")
                continue
            for dtype in [torch.float32, torch.float64]:
                print("dtype=", dtype, ", device=", device)
                input = torch.randn(N, 2 * C, H, W, dtype=dtype, device=device)


                kH = random.randint(1, 10)
                kW = random.randint(1, 10)
                if kH % 2 == 0:
                    kH += 1
                if kW % 2 == 0:
                    kW += 1
                pos_add = torch.randn(C, kH, kW, dtype=dtype, device=device)
                pos_mul = torch.randn(C, kH, kW, dtype=dtype, device=device)
                input.requires_grad = True
                pos_add.requires_grad = True
                pos_mul.requires_grad = True

                output = learned_nonlin(input, pos_add, pos_mul)
                output_grad = torch.randn(N, C, H, W, dtype=dtype, device=device)

                output.backward(gradient=output_grad)

                delta = 1.0e-05
                pos_delta = delta * torch.randn(C, kH, kW, dtype=dtype, device=device)
                pred_change = (pos_delta * pos_add.grad).sum().to('cpu').item()
                change = (output_grad * (learned_nonlin(input, pos_add + pos_delta, pos_mul) - output)).sum().to('cpu').item()
                print(f"For pos_add: pred_change={pred_change}, change={change}")
                #assert abs(pred_change - change)  < 1.0e-04

                pred_change = (pos_delta * pos_mul.grad).sum().to('cpu').item()
                change = (output_grad * (learned_nonlin(input, pos_add, pos_mul + pos_delta) - output)).sum().to('cpu').item()
                print(f"For pos_mul: pred_change={pred_change}, change={change}")
                #assert abs(pred_change - change) / abs(change) < 1.0e-04

                input_delta = delta * torch.randn(N, 2*C, H, W, dtype=dtype, device=device)
                pred_change = (input_delta * input.grad).sum().to('cpu').item()
                change = (output_grad * (learned_nonlin(input + input_delta, pos_add, pos_mul) - output)).sum().to('cpu').item()
                print(f"For input: pred_change={pred_change}, change={change}")
                #assert abs(pred_change - change) / abs(change) < 1.0e-04


if __name__ == "__main__":
    test_learned_nonlin_basic()
    test_learned_nonlin_deriv()
    # These use the (input, pos_add, pos_mul) signature noted above and are
    # currently disabled.
    if False:
        test_learned_nonlin_rand_grad()
        test_learned_nonlin_zeros()
        test_learned_nonlin_compare()
        test_learned_nonlin_rand_compare()