import numpy as np
import warnings
from numba.cuda.testing import unittest
from numba.cuda.testing import (skip_on_cudasim, skip_if_cuda_includes_missing)
from numba.cuda.testing import CUDATestCase, test_data_dir
from numba.cuda.cudadrv.driver import (CudaAPIError, Linker,
                                       LinkerError)
from numba.cuda.cudadrv.error import NvrtcError
from numba.cuda import require_context
from numba.tests.support import ignore_internal_warnings
from numba import cuda, void, float64, int64, int32, typeof, float32


# Reference data placed into CUDA constant memory by simple_const_mem;
# its nbytes is the lower bound asserted in test_get_const_mem_size.
CONST1D = np.arange(10, dtype=np.float64)


def simple_const_mem(A):
    """Kernel fixture: copy CONST1D into a constant-memory array and write
    ``C[i] + 1.0`` to ``A`` at this thread's flat grid index."""
    C = cuda.const.array_like(CONST1D)
    i = cuda.grid(1)

    A[i] = C[i] + 1.0


def func_with_lots_of_registers(x, a, b, c, d, e, f):
    """Kernel fixture deliberately written with 20 live accumulators so the
    compiled kernel uses many registers (> 57, see test_set_registers_no_max).

    NOTE: do not simplify or restructure this body — the test_set_registers_*
    tests depend on the register pressure this exact shape produces.
    """
    a1 = 1.0
    a2 = 1.0
    a3 = 1.0
    a4 = 1.0
    a5 = 1.0
    b1 = 1.0
    b2 = 1.0
    b3 = 1.0
    b4 = 1.0
    b5 = 1.0
    c1 = 1.0
    c2 = 1.0
    c3 = 1.0
    c4 = 1.0
    c5 = 1.0
    d1 = 10
    d2 = 10
    d3 = 10
    d4 = 10
    d5 = 10
    # Every accumulator is updated each iteration, keeping all of them live
    # across the loop so the register allocator cannot coalesce them.
    for i in range(a):
        a1 += b
        a2 += c
        a3 += d
        a4 += e
        a5 += f
        b1 *= b
        b2 *= c
        b3 *= d
        b4 *= e
        b5 *= f
        c1 /= b
        c2 /= c
        c3 /= d
        c4 /= e
        c5 /= f
        d1 <<= b
        d2 <<= c
        d3 <<= d
        d4 <<= e
        d5 <<= f
    # Fold all accumulators into the output so none are dead-code eliminated.
    x[cuda.grid(1)] = a1 + a2 + a3 + a4 + a5
    x[cuda.grid(1)] += b1 + b2 + b3 + b4 + b5
    x[cuda.grid(1)] += c1 + c2 + c3 + c4 + c5
    x[cuda.grid(1)] += d1 + d2 + d3 + d4 + d5


def simple_smem(ary, dty):
    """Kernel fixture: allocate a 100-element shared array of dtype ``dty``,
    fill it from thread 0, then copy it to ``ary``.

    The 100-element allocation is what the get_shared_mem_per_block tests
    measure (e.g. 400 bytes for int32, 800 for float64).
    """
    sm = cuda.shared.array(100, dty)
    i = cuda.grid(1)
    if i == 0:
        # Single thread populates the shared array before the barrier.
        for j in range(100):
            sm[j] = j
    cuda.syncthreads()
    ary[i] = sm[i]


def coop_smem2d(ary):
    """Kernel fixture: stage a value per (i, j) thread through a 10x20
    float32 shared array. Used by test_get_max_threads_per_block."""
    i, j = cuda.grid(2)
    sm = cuda.shared.array((10, 20), float32)
    sm[i, j] = (i + 1) / (j + 1)
    cuda.syncthreads()
    ary[i, j] = sm[i, j]


def simple_maxthreads(ary):
    """Trivial kernel fixture used to query max threads per block and to
    trigger a launch failure when that limit is exceeded."""
    i = cuda.grid(1)
    ary[i] = i


# Element count of the local-memory array allocated by simple_lmem; the
# get_local_mem_per_thread tests multiply this by the dtype's itemsize.
LMEM_SIZE = 1000


def simple_lmem(A, B, dty):
    """Kernel fixture: copy ``A`` to ``B`` through an LMEM_SIZE-element
    local array of dtype ``dty``, forcing per-thread local memory usage."""
    C = cuda.local.array(LMEM_SIZE, dty)
    for i in range(C.shape[0]):
        C[i] = A[i]
    for i in range(C.shape[0]):
        B[i] = C[i]


@skip_on_cudasim('Linking unsupported in the simulator')
class TestLinker(CUDATestCase):
    """Tests for the CUDA linker: linking external PTX/CU files into kernels,
    NVRTC warning/error propagation, register-count limiting, and queries of
    compiled-kernel resource usage (const/shared/local memory, max threads).
    """

    # Environment override used to force the ctypes binding path
    # (NUMBA_CUDA_USE_NVIDIA_BINDING disabled).
    _NUMBA_NVIDIA_BINDING_0_ENV = {'NUMBA_CUDA_USE_NVIDIA_BINDING': '0'}

    @require_context
    def test_linker_basic(self):
        '''Simply go through the constructor and destructor
        '''
        linker = Linker.new(cc=(5, 3))
        del linker

    def _test_linking(self, eager):
        """Link jitlink.ptx into a kernel, compiled eagerly or lazily
        depending on ``eager``, and check the linked function's result."""
        global bar  # must be a global; otherwise it is recognized as a freevar
        bar = cuda.declare_device('bar', 'int32(int32)')

        link = str(test_data_dir / 'jitlink.ptx')

        if eager:
            args = ['void(int32[:], int32[:])']
        else:
            args = []

        @cuda.jit(*args, link=[link])
        def foo(x, y):
            i = cuda.grid(1)
            x[i] += bar(y[i])

        A = np.array([123], dtype=np.int32)
        B = np.array([321], dtype=np.int32)

        foo[1, 1](A, B)

        # bar() in jitlink.ptx doubles its argument, hence 2 * 321.
        self.assertTrue(A[0] == 123 + 2 * 321)

    def test_linking_lazy_compile(self):
        self._test_linking(eager=False)

    def test_linking_eager_compile(self):
        self._test_linking(eager=True)

    def test_linking_cu(self):
        """Link a CUDA C source file (compiled via NVRTC) into a kernel."""
        bar = cuda.declare_device('bar', 'int32(int32)')

        link = str(test_data_dir / 'jitlink.cu')

        @cuda.jit(link=[link])
        def kernel(r, x):
            i = cuda.grid(1)

            if i < len(r):
                r[i] = bar(x[i])

        x = np.arange(10, dtype=np.int32)
        r = np.zeros_like(x)

        kernel[1, 32](r, x)

        # Matches the operation of bar() in jitlink.cu
        expected = x * 2
        np.testing.assert_array_equal(r, expected)

    def test_linking_cu_log_warning(self):
        """NVRTC log output from linked .cu files surfaces as a warning."""
        bar = cuda.declare_device('bar', 'int32(int32)')

        link = str(test_data_dir / 'warn.cu')

        with warnings.catch_warnings(record=True) as w:
            ignore_internal_warnings()

            @cuda.jit('void(int32)', link=[link])
            def kernel(x):
                bar(x)

        self.assertEqual(len(w), 1, 'Expected warnings from NVRTC')
        # Check the warning refers to the log messages
        self.assertIn('NVRTC log messages', str(w[0].message))
        # Check the message pertaining to the unused variable is provided
        self.assertIn('declared but never referenced', str(w[0].message))

    def test_linking_cu_error(self):
        """An NVRTC compile failure in a linked .cu file raises NvrtcError
        with the compiler diagnostics and the source filename in the message.
        """
        bar = cuda.declare_device('bar', 'int32(int32)')

        link = str(test_data_dir / 'error.cu')

        with self.assertRaises(NvrtcError) as e:
            @cuda.jit('void(int32)', link=[link])
            def kernel(x):
                bar(x)

        msg = e.exception.args[0]
        # Check the error message refers to the NVRTC compile
        self.assertIn('NVRTC Compilation failure', msg)
        # Check the expected error in the CUDA source is reported
        self.assertIn('identifier "SYNTAX" is undefined', msg)
        # Check the filename is reported correctly
        self.assertIn('in the compilation of "error.cu"', msg)

    def test_linking_unknown_filetype_error(self):
        # Unsupported link-file extensions are rejected up front.
        expected_err = "Don't know how to link file with extension .cuh"
        with self.assertRaisesRegex(RuntimeError, expected_err):
            @cuda.jit('void()', link=['header.cuh'])
            def kernel():
                pass

    def test_linking_file_with_no_extension_error(self):
        # A link file with no extension cannot be classified and is rejected.
        expected_err = "Don't know how to link file with no extension"
        with self.assertRaisesRegex(RuntimeError, expected_err):
            @cuda.jit('void()', link=['data'])
            def kernel():
                pass

    @skip_if_cuda_includes_missing
    def test_linking_cu_cuda_include(self):
        link = str(test_data_dir / 'cuda_include.cu')

        # An exception will be raised when linking this kernel due to the
        # compile failure if CUDA includes cannot be found by Nvrtc.
        @cuda.jit('void()', link=[link])
        def kernel():
            pass

    def test_try_to_link_nonexistent(self):
        # Linking a file that does not exist raises LinkerError naming it.
        with self.assertRaises(LinkerError) as e:
            @cuda.jit('void(int32[::1])', link=['nonexistent.a'])
            def f(x):
                x[0] = 0
        self.assertIn('nonexistent.a not found', e.exception.args)

    def test_set_registers_no_max(self):
        """Ensure that the jitted kernel used in the test_set_registers_* tests
        uses more than 57 registers - this ensures that test_set_registers_*
        are really checking that they reduced the number of registers used from
        something greater than the maximum."""
        compiled = cuda.jit(func_with_lots_of_registers)
        compiled = compiled.specialize(np.empty(32), *range(6))
        self.assertGreater(compiled.get_regs_per_thread(), 57)

    def test_set_registers_57(self):
        """max_registers=57 caps register usage at 57 (lazy compile)."""
        compiled = cuda.jit(max_registers=57)(func_with_lots_of_registers)
        compiled = compiled.specialize(np.empty(32), *range(6))
        self.assertLessEqual(compiled.get_regs_per_thread(), 57)

    def test_set_registers_38(self):
        """max_registers=38 caps register usage at 38 (lazy compile)."""
        compiled = cuda.jit(max_registers=38)(func_with_lots_of_registers)
        compiled = compiled.specialize(np.empty(32), *range(6))
        self.assertLessEqual(compiled.get_regs_per_thread(), 38)

    def test_set_registers_eager(self):
        """max_registers is also honoured with an eager (signature) compile."""
        sig = void(float64[::1], int64, int64, int64, int64, int64, int64)
        compiled = cuda.jit(sig, max_registers=38)(func_with_lots_of_registers)
        self.assertLessEqual(compiled.get_regs_per_thread(), 38)

    def test_get_const_mem_size(self):
        # The kernel embeds CONST1D, so const memory is at least its nbytes.
        sig = void(float64[::1])
        compiled = cuda.jit(sig)(simple_const_mem)
        const_mem_size = compiled.get_const_mem_size()
        self.assertGreaterEqual(const_mem_size, CONST1D.nbytes)

    def test_get_no_shared_memory(self):
        # A kernel with no shared arrays reports zero shared memory.
        compiled = cuda.jit(func_with_lots_of_registers)
        compiled = compiled.specialize(np.empty(32), *range(6))
        shared_mem_size = compiled.get_shared_mem_per_block()
        self.assertEqual(shared_mem_size, 0)

    def test_get_shared_mem_per_block(self):
        # 100 int32 elements -> 400 bytes of shared memory.
        sig = void(int32[::1], typeof(np.int32))
        compiled = cuda.jit(sig)(simple_smem)
        shared_mem_size = compiled.get_shared_mem_per_block()
        self.assertEqual(shared_mem_size, 400)

    def test_get_shared_mem_per_specialized(self):
        # 100 float64 elements -> 800 bytes of shared memory.
        compiled = cuda.jit(simple_smem)
        compiled_specialized = compiled.specialize(
            np.zeros(100, dtype=np.int32), np.float64)
        shared_mem_size = compiled_specialized.get_shared_mem_per_block()
        self.assertEqual(shared_mem_size, 800)

    def test_get_max_threads_per_block(self):
        # The query should return a positive thread count for this device.
        compiled = cuda.jit("void(float32[:,::1])")(coop_smem2d)
        max_threads = compiled.get_max_threads_per_block()
        self.assertGreater(max_threads, 0)

    def test_max_threads_exceeded(self):
        # Launching with one thread over the kernel's limit should fail in
        # cuLaunchKernel.
        compiled = cuda.jit("void(int32[::1])")(simple_maxthreads)
        max_threads = compiled.get_max_threads_per_block()
        nelem = max_threads + 1
        ary = np.empty(nelem, dtype=np.int32)
        try:
            compiled[1, nelem](ary)
        except CudaAPIError as e:
            self.assertIn("cuLaunchKernel", e.msg)

    def test_get_local_mem_per_thread(self):
        # Local memory should be at least LMEM_SIZE int32 elements.
        sig = void(int32[::1], int32[::1], typeof(np.int32))
        compiled = cuda.jit(sig)(simple_lmem)
        local_mem_size = compiled.get_local_mem_per_thread()
        calc_size = np.dtype(np.int32).itemsize * LMEM_SIZE
        self.assertGreaterEqual(local_mem_size, calc_size)

    def test_get_local_mem_per_specialized(self):
        # Specializing with float64 doubles the expected local memory size.
        compiled = cuda.jit(simple_lmem)
        compiled_specialized = compiled.specialize(
            np.zeros(LMEM_SIZE, dtype=np.int32),
            np.zeros(LMEM_SIZE, dtype=np.int32),
            np.float64)
        local_mem_size = compiled_specialized.get_local_mem_per_thread()
        calc_size = np.dtype(np.float64).itemsize * LMEM_SIZE
        self.assertGreaterEqual(local_mem_size, calc_size)


# Allow running this test module directly.
if __name__ == '__main__':
    unittest.main()