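"""
Tests of casting between scalar types on the CUDA target, with a focus on
float16: conversions to and from integer, floating-point, and complex types,
and checks that the expected PTX conversion instructions are emitted.
"""
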
import numpy as np

from numba.cuda import compile_ptx
from numba.core.types import f2, i1, i2, i4, i8, u1, u2, u4, u8
from numba import cuda
from numba.core import types
from numba.cuda.testing import (CUDATestCase, skip_on_cudasim,
                                skip_unless_cc_53)
from numba.types import float16, float32
import itertools
import unittest


def native_cast(x):
    return float(x)


def to_int8(x):
    return np.int8(x)


def to_int16(x):
    return np.int16(x)


def to_int32(x):
    return np.int32(x)


def to_int64(x):
    return np.int64(x)


def to_uint8(x):
    return np.uint8(x)


def to_uint16(x):
    return np.uint16(x)


def to_uint32(x):
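    # Numba type instances such as types.uint32 can be called in jitted
    # code to cast a value, like their np.uint32 / np.uint64 counterparts.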
    return types.uint32(x)


def to_uint64(x):
    return types.uint64(x)


def to_float16(x):
    # When division and operators on float16 types are supported, this should
    # be changed to match the implementation in to_float32.
    return (np.float16(x) * np.float16(0.5))


def to_float32(x):
    return np.float32(x) / np.float32(2)


def to_float64(x):
    return np.float64(x) / np.float64(2)


def to_complex64(x):
    return np.complex64(x)


def to_complex128(x):
    return np.complex128(x)


# Since multiplication via the * operator is not yet supported for float16 on
# the CUDA target, and the host does not implement cuda.fp16.*, we need two
# versions of the following functions:
#
# - The device version uses cuda.fp16.hmul
# - The host version uses the * operator

def cuda_int_literal_to_float16(x):
    # Note that we need to use `2` and not `np.float16(2)` to ensure that this
    # types as a literal int and not a const float16.
    return cuda.fp16.hmul(np.float16(x), 2)


def reference_int_literal_to_float16(x):
    return np.float16(x) * np.float16(2)


def cuda_float_literal_to_float16(x):
    # Note that `2.5` types as a const float64 and not a literal float, but
    # this case is provided in case that changes in the future.
    return cuda.fp16.hmul(np.float16(x), 2.5)


def reference_float_literal_to_float16(x):
    return np.float16(x) * np.float16(2.5)


class TestCasting(CUDATestCase):
    def _create_wrapped(self, pyfunc, intype, outtype):
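        """Compile pyfunc as a CUDA device function and return a host-side
        wrapper that launches it on a one-element input array of dtype
        intype and returns the scalar result from a one-element output
        array of dtype outtype."""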
        wrapped_func = cuda.jit(device=True)(pyfunc)

        @cuda.jit
        def cuda_wrapper_fn(arg, res):
            res[0] = wrapped_func(arg[0])

        def wrapper_fn(arg):
            argarray = np.zeros(1, dtype=intype)
            argarray[0] = arg
            resarray = np.zeros(1, dtype=outtype)
            cuda_wrapper_fn[1, 1](argarray, resarray)
            return resarray[0]

        return wrapper_fn

    @skip_unless_cc_53
    def test_float_to_int(self):
        pyfuncs = (to_int8, to_int16, to_int32, to_int64)
        totys = (np.int8, np.int16, np.int32, np.int64)
        fromtys = (np.float16, np.float32, np.float64)

        for pyfunc, toty in zip(pyfuncs, totys):
            for fromty in fromtys:
                with self.subTest(fromty=fromty, toty=toty):
                    cfunc = self._create_wrapped(pyfunc, fromty, toty)
                    self.assertEqual(cfunc(12.3), pyfunc(12.3))
                    self.assertEqual(cfunc(12.3), int(12.3))
                    self.assertEqual(cfunc(-12.3), pyfunc(-12.3))
                    self.assertEqual(cfunc(-12.3), int(-12.3))

    @skip_on_cudasim('Compilation unsupported in the simulator')
    def test_float16_to_int_ptx(self):
        pyfuncs = (to_int8, to_int16, to_int32, to_int64)
        sizes = (8, 16, 32, 64)

        for pyfunc, size in zip(pyfuncs, sizes):
            ptx, _ = compile_ptx(pyfunc, (f2,), device=True)
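            # A float16-to-signed-integer cast should compile to a single
            # conversion instruction; .rni is the round-to-nearest-integer
            # modifier in PTX.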
            self.assertIn(f"cvt.rni.s{size}.f16", ptx)

    @skip_unless_cc_53
    def test_float_to_uint(self):
        pyfuncs = (to_int8, to_int16, to_int32, to_int64)
        totys = (np.uint8, np.uint16, np.uint32, np.uint64)
        fromtys = (np.float16, np.float32, np.float64)

        for pyfunc, toty in zip(pyfuncs, totys):
            for fromty in fromtys:
                with self.subTest(fromty=fromty, toty=toty):
                    cfunc = self._create_wrapped(pyfunc, fromty, toty)
                    self.assertEqual(cfunc(12.3), pyfunc(12.3))
                    self.assertEqual(cfunc(12.3), int(12.3))

    @skip_on_cudasim('Compilation unsupported in the simulator')
    def test_float16_to_uint_ptx(self):
        pyfuncs = (to_uint8, to_uint16, to_uint32, to_uint64)
        sizes = (8, 16, 32, 64)

        for pyfunc, size in zip(pyfuncs, sizes):
            ptx, _ = compile_ptx(pyfunc, (f2,), device=True)
            self.assertIn(f"cvt.rni.u{size}.f16", ptx)

    @skip_unless_cc_53
    def test_int_to_float(self):
        pyfuncs = (to_float16, to_float32, to_float64)
        totys = (np.float16, np.float32, np.float64)

        for pyfunc, toty in zip(pyfuncs, totys):
            with self.subTest(toty=toty):
                cfunc = self._create_wrapped(pyfunc, np.int64, toty)
                self.assertEqual(cfunc(321), pyfunc(321))

    @skip_unless_cc_53
    def test_literal_to_float16(self):
        cudafuncs = (cuda_int_literal_to_float16,
                     cuda_float_literal_to_float16)
        hostfuncs = (reference_int_literal_to_float16,
                     reference_float_literal_to_float16)

        for cudafunc, hostfunc in zip(cudafuncs, hostfuncs):
            with self.subTest(func=cudafunc):
                cfunc = self._create_wrapped(cudafunc, np.float16, np.float16)
                self.assertEqual(cfunc(321), hostfunc(321))

    @skip_on_cudasim('Compilation unsupported in the simulator')
    def test_int_to_float16_ptx(self):
        fromtys = (i1, i2, i4, i8)
        sizes = (8, 16, 32, 64)

        for ty, size in zip(fromtys, sizes):
            ptx, _ = compile_ptx(to_float16, (ty,), device=True)
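            # A signed-integer-to-float16 cast should use cvt with the .rn
            # (round-to-nearest-even) modifier.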
            self.assertIn(f"cvt.rn.f16.s{size}", ptx)

    @skip_on_cudasim('Compilation unsupported in the simulator')
    def test_uint_to_float16_ptx(self):
        fromtys = (u1, u2, u4, u8)
        sizes = (8, 16, 32, 64)

        for ty, size in zip(fromtys, sizes):
            ptx, _ = compile_ptx(to_float16, (ty,), device=True)
            self.assertIn(f"cvt.rn.f16.u{size}", ptx)

    @skip_unless_cc_53
    def test_float_to_float(self):
        pyfuncs = (to_float16, to_float32, to_float64)
        tys = (np.float16, np.float32, np.float64)

        for (pyfunc, fromty), toty in itertools.product(zip(pyfuncs, tys), tys):
            with self.subTest(fromty=fromty, toty=toty):
                cfunc = self._create_wrapped(pyfunc, fromty, toty)
                # For this test we cannot use the pyfunc for comparison because
                # the CUDA target doesn't yet implement division (or operators)
                # for float16 values, so we test by comparing with the computed
                # expression instead.
                np.testing.assert_allclose(cfunc(12.3),
                                           toty(12.3) / toty(2), rtol=0.0003)
                np.testing.assert_allclose(cfunc(-12.3),
                                           toty(-12.3) / toty(2), rtol=0.0003)

    @skip_on_cudasim('Compilation unsupported in the simulator')
    def test_float16_to_float_ptx(self):
        pyfuncs = (to_float32, to_float64)
        postfixes = ("f32", "f64")

        for pyfunc, postfix in zip(pyfuncs, postfixes):
            ptx, _ = compile_ptx(pyfunc, (f2,), device=True)
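            # Widening float16 to float32 or float64 is exact, so the cvt
            # instruction carries no rounding modifier.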
            self.assertIn(f"cvt.{postfix}.f16", ptx)

    @skip_unless_cc_53
    def test_float_to_complex(self):
        pyfuncs = (to_complex64, to_complex128)
        totys = (np.complex64, np.complex128)
        fromtys = (np.float16, np.float32, np.float64)

        for pyfunc, toty in zip(pyfuncs, totys):
            for fromty in fromtys:
                with self.subTest(fromty=fromty, toty=toty):
                    cfunc = self._create_wrapped(pyfunc, fromty, toty)
                    # Here we need to explicitly cast the input to the pyfunc
                    # to match the casting that is automatically applied when
                    # passing the input to the cfunc as part of wrapping it in
                    # an array of type fromty.
                    np.testing.assert_allclose(cfunc(3.21),
                                               pyfunc(fromty(3.21)))
                    np.testing.assert_allclose(cfunc(-3.21),
                                               pyfunc(fromty(-3.21)))

    @skip_on_cudasim('Compilation unsupported in the simulator')
    def test_native_cast(self):
        float32_ptx, _ = cuda.compile_ptx(native_cast, (float32,), device=True)
        self.assertIn("st.f32", float32_ptx)

        float16_ptx, _ = cuda.compile_ptx(native_cast, (float16,), device=True)
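        # The float16 result is written with a 16-bit integer store
        # (st.u16) in the generated PTX rather than an f16-typed store.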
        self.assertIn("st.u16", float16_ptx)


if __name__ == '__main__':
    unittest.main()