"""
Test async kernel copy
"""

import logging

import numpy as np

from numba import roc
import unittest
from numba.roc.hsadrv.driver import dgpu_present

logger = logging.getLogger()


@unittest.skipUnless(dgpu_present, 'test only on dGPU system')
class TestAsyncKernel(unittest.TestCase):
    """Exercise asynchronous kernel launches and copies on a single stream.

    Both tests enqueue the same chain of work -- ``ntimes`` rounds of an
    element-wise "add one" kernel whose output is copied back in as the
    next round's input -- and differ only in how the stream is
    synchronized before the result is checked: explicitly via
    ``stream.synchronize()``, or implicitly on exit from the
    ``stream.auto_synchronize()`` context manager.
    """

    # Launch configuration shared by both tests.
    blksz = 256       # threads per block
    gridsz = 10**5    # blocks per grid
    ntimes = 500      # number of chained kernel launches

    def _make_kernel(self):
        """Compile and return the element-wise "add one" kernel."""
        @roc.jit("int32[:], int32[:]")
        def add1_kernel(dst, src):
            i = roc.get_global_id(0)
            if i < dst.size:
                dst[i] = src[i] + 1
        return add1_kernel

    def _make_host_arrays(self):
        """Return (input, zeroed result) coarse-grained host arrays.

        Both arrays have ``blksz * gridsz`` int32 elements; the input
        holds 0..n-1 so the expected final result is easy to compute.
        """
        arr = np.arange(self.blksz * self.gridsz, dtype=np.int32)

        logger.info('make coarse_arr')
        coarse_arr = roc.coarsegrain_array(shape=arr.shape, dtype=arr.dtype)
        coarse_arr[:] = arr

        logger.info('make coarse_res_arr')
        coarse_res_arr = roc.coarsegrain_array(shape=arr.shape, dtype=arr.dtype)
        coarse_res_arr[:] = 0
        return coarse_arr, coarse_res_arr

    def _enqueue_work(self, kernel, stream, coarse_arr, coarse_res_arr):
        """Enqueue the chained launches plus the final copy back to host.

        All operations are queued on *stream*; the caller is responsible
        for synchronizing before inspecting ``coarse_res_arr``.
        """
        logger.info('make gpu_res_arr')
        gpu_res_arr = roc.device_array_like(coarse_arr)

        logger.info('make gpu_arr')
        gpu_arr = roc.to_device(coarse_arr, stream=stream)

        for i in range(self.ntimes):
            logger.info('launch kernel: %d', i)
            kernel[self.gridsz, self.blksz, stream](gpu_res_arr, gpu_arr)
            # Feed the result back in as the next iteration's input.
            gpu_arr.copy_to_device(gpu_res_arr, stream=stream)

        logger.info('get kernel result')
        gpu_res_arr.copy_to_host(coarse_res_arr, stream=stream)

    def test_manual_stream(self):
        """Synchronize explicitly with ``stream.synchronize()``."""
        logger.info('context info: %s', roc.get_context().agent)
        kernel = self._make_kernel()
        coarse_arr, coarse_res_arr = self._make_host_arrays()

        logger.info("make stream")
        stream = roc.stream()

        self._enqueue_work(kernel, stream, coarse_arr, coarse_res_arr)

        logger.info("synchronize")
        stream.synchronize()

        logger.info("compare result")
        np.testing.assert_equal(coarse_res_arr, coarse_arr + self.ntimes)

    def test_ctx_managed_stream(self):
        """Synchronize implicitly on ``auto_synchronize()`` context exit."""
        logger.info('context info: %s', roc.get_context().agent)
        kernel = self._make_kernel()
        coarse_arr, coarse_res_arr = self._make_host_arrays()

        logger.info("make stream")
        stream = roc.stream()

        with stream.auto_synchronize():
            self._enqueue_work(kernel, stream, coarse_arr, coarse_res_arr)

        logger.info("synchronize on ctx __exit__")

        logger.info("compare result")
        np.testing.assert_equal(coarse_res_arr, coarse_arr + self.ntimes)


if __name__ == '__main__':
    # Show the per-step logger.info() progress messages when run as a script.
    logging.basicConfig(level=logging.INFO)
    unittest.main()