test_multigpu.py 4.04 KB
Newer Older
dugupeiwen's avatar
dugupeiwen committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
from numba import cuda
import numpy as np
from numba.cuda.testing import skip_on_cudasim, CUDATestCase
import threading
import unittest


class TestMultiGPUContext(CUDATestCase):
    """Exercise kernel launches and memory transfers across multiple GPUs."""

    @unittest.skipIf(len(cuda.gpus) < 2, "need more than 1 gpus")
    def test_multigpu_context(self):
        # Trivial kernel: dst[i] = src[i] + 1 for every in-range index.
        @cuda.jit("void(float64[:], float64[:])")
        def add_one(src, dst):
            idx = cuda.grid(1)
            if idx < dst.size:
                dst[idx] = src[idx] + 1

        def verify(src, dst):
            np.testing.assert_equal(src + 1, dst)

        n = 32
        a = np.arange(n, dtype=np.float64)
        b = np.arange(n, dtype=np.float64)

        # Launch inside an explicitly selected device context.
        with cuda.gpus[0]:
            add_one[1, n](a, b)

        verify(a, b)

        # Launch with no explicit selection (implicit current context).
        add_one[1, n](a, b)
        verify(a, b)

        # Nested selection: enter GPU 1's context while inside GPU 0's.
        with cuda.gpus[0]:
            a0 = np.arange(n, dtype=np.float64)
            b0 = np.arange(n, dtype=np.float64)
            add_one[1, n](a0, b0)

            with cuda.gpus[1]:
                a1 = np.arange(n, dtype=np.float64)
                b1 = np.arange(n, dtype=np.float64)
                add_one[1, n](a1, b1)

        verify(a0, b0)
        verify(a1, b1)

        # Implicit context still works after the nested contexts unwound.
        a = np.arange(n, dtype=np.float64)
        b = np.arange(n, dtype=np.float64)
        add_one[1, n](a, b)
        verify(a, b)

    @skip_on_cudasim('Simulator does not support multiple threads')
    def test_multithreaded(self):
        def worker(gpu, darr, out, slot):
            # Store either the comparison outcome or the raised exception in
            # the shared list, so worker-thread failures surface in the main
            # thread instead of being lost.
            try:
                with gpu:
                    host = darr.copy_to_host()
            except Exception as exc:
                out[slot] = exc
            else:
                out[slot] = np.all(host == np.arange(10))

        darr = cuda.to_device(np.arange(10))

        nthreads = 10
        outcomes = [None] * nthreads
        threads = []
        for slot in range(nthreads):
            threads.append(threading.Thread(target=worker,
                                            args=(cuda.gpus.current, darr,
                                                  outcomes, slot)))

        for t in threads:
            t.start()

        for t in threads:
            t.join()

        for outcome in outcomes:
            if isinstance(outcome, BaseException):
                raise outcome
            else:
                self.assertTrue(outcome)

    @unittest.skipIf(len(cuda.gpus) < 2, "need more than 1 gpus")
    def test_with_context(self):
        # Each GPU holds its own copy of the data and runs its own launch;
        # a different scalar per device distinguishes the results.
        @cuda.jit
        def add_scalar(arr, val):
            pos = cuda.grid(1)
            if pos < arr.size:
                arr[pos] += val

        host = np.arange(10, dtype=np.float32)

        with cuda.gpus[0]:
            darr0 = cuda.to_device(host)

        with cuda.gpus[1]:
            darr1 = cuda.to_device(host)

        with cuda.gpus[0]:
            add_scalar[1, 10](darr0, 1)

        with cuda.gpus[1]:
            add_scalar[1, 10](darr1, 2)

        with cuda.gpus[0]:
            np.testing.assert_equal(darr0.copy_to_host(), (host + 1))

        with cuda.gpus[1]:
            np.testing.assert_equal(darr1.copy_to_host(), (host + 2))

    @unittest.skipIf(len(cuda.gpus) < 2, "need more than 1 gpus")
    def test_with_context_peer_copy(self):
        # Peer access is not always possible - for example, with one GPU in
        # TCC mode and one in WDDM - so skip rather than fail in that case.
        with cuda.gpus[0]:
            if not cuda.current_context().can_access_peer(1):
                self.skipTest('Peer access between GPUs disabled')

        # 1. Create a range in a host array
        host = np.arange(10, dtype=np.float32)

        # 2. Copy the range array host -> GPU 0
        with cuda.gpus[0]:
            src = cuda.to_device(host)

        # 3. Initialize a zero-filled array on GPU 1
        with cuda.gpus[1]:
            dst = cuda.to_device(np.zeros_like(host))

        with cuda.gpus[0]:
            # 4. Copy the range GPU 0 -> GPU 1 (peer-to-peer)
            dst.copy_to_device(src)

            # 5. Copy the range GPU 1 -> host and check the contents
            np.testing.assert_equal(dst.copy_to_host(), host)


# Allow running this test module directly (e.g. `python test_multigpu.py`).
if __name__ == '__main__':
    unittest.main()