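"""HSA (ROC) dispatch machinery for ufuncs and generalized ufuncs.

Provides the dispatcher invoked when a ROC-targeted ufunc is called, plus
the UFuncMechanism/GUFuncCallSteps specializations that move data between
host and device and launch the compiled kernels.
"""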
import numpy as np

from numba.np.ufunc.deviceufunc import (UFuncMechanism, GeneralizedUFunc,
                                        GUFuncCallSteps)
from numba.roc.hsadrv.driver import dgpu_present
import numba.roc.hsadrv.devicearray as devicearray
import numba.roc.api as api

class HsaUFuncDispatcher(object):
    """
    Invoke the HSA ufunc specialization for the given inputs.
    """

    def __init__(self, types_to_retty_kernels):
        self.functions = types_to_retty_kernels

    def __call__(self, *args, **kws):
        """
        *args: numpy arrays
        **kws:
            stream -- hsa stream; when defined, asynchronous mode is used.
            out    -- output array. Can be a numpy array or DeviceArrayBase
                      depending on the input arguments.  Type must match
                      the input arguments.
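
        Example (a minimal sketch; ``add_functions`` is a hypothetical
        mapping of input types to compiled kernels, as produced by the
        ROC vectorize machinery):

            ufunc = HsaUFuncDispatcher(add_functions)
            result = ufunc(a, b)           # returns a new output array
            ufunc(a, b, out=result)        # write into an existing array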
        """
        return HsaUFuncMechanism.call(self.functions, args, kws)

    def reduce(self, arg, stream=0):
        raise NotImplementedError


class HsaUFuncMechanism(UFuncMechanism):
    """
    Provide OpenCL specialization
    """
    DEFAULT_STREAM = 0
    ARRAY_ORDER = 'A'

    def is_device_array(self, obj):
        if dgpu_present:
            return devicearray.is_hsa_ndarray(obj)
        else:
            return isinstance(obj, np.ndarray)

    def is_host_array(self, obj):
        if dgpu_present:
            return False
        else:
            return isinstance(obj, np.ndarray)

    def to_device(self, hostary, stream):
        if dgpu_present:
            return api.to_device(hostary)
        else:
            # No discrete GPU: host memory is device-visible, so no copy
            # is needed.
            return hostary

    def launch(self, func, count, stream, args):
        # ILP (instruction-level parallelism) factor; must match the
        # vectorize kernel source, which handles this many elements per
        # work-item.
        ilp = 4
        # Two wavefronts (2 * 64 lanes) per block to help hide latency.
        tpb = 64 * 2
        # Ceiling-divide: work-items needed after ILP, then blocks of tpb.
        # E.g. count=1000 -> ceil(1000/4)=250 -> ceil(250/128)=2 blocks.
        count = (count + (ilp - 1)) // ilp
        blockcount = (count + (tpb - 1)) // tpb
        func[blockcount, tpb](*args)

    def device_array(self, shape, dtype, stream):
        if dgpu_present:
            return api.device_array(shape=shape, dtype=dtype)
        else:
            return np.empty(shape=shape, dtype=dtype)

    def broadcast_device(self, ary, shape):
        if dgpu_present:
            raise NotImplementedError('broadcast_device is not yet '
                                      'implemented for dGPU arrays')
        else:
            ax_differs = [ax for ax in range(len(shape))
                          if ax >= ary.ndim
                          or ary.shape[ax] != shape[ax]]

            missingdim = len(shape) - len(ary.shape)
            strides = [0] * missingdim + list(ary.strides)

            for ax in ax_differs:
                strides[ax] = 0

            return np.ndarray(shape=shape, strides=strides,
                              dtype=ary.dtype, buffer=ary)
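
    # Host-path sketch of the zero-stride trick used above: a (3,) array
    # viewed as (2, 3) without copying, by prepending a 0 stride so both
    # rows alias the same data:
    #
    #   a = np.arange(3)
    #   b = np.ndarray(shape=(2, 3), strides=(0,) + a.strides,
    #                  dtype=a.dtype, buffer=a)
    #   assert (b[0] == b[1]).all()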


class _HsaGUFuncCallSteps(GUFuncCallSteps):
    __slots__ = ()

    def is_device_array(self, obj):
        if dgpu_present:
            return devicearray.is_hsa_ndarray(obj)
        else:
            # Without a discrete GPU, memory is shared, so any input is
            # treated as already device-resident.
            return True

    def to_device(self, hostary):
        if dgpu_present:
            return api.to_device(hostary)
        else:
            return hostary

    def to_host(self, devary, hostary):
        if dgpu_present:
            return devary.copy_to_host(hostary)
        else:
            # No copy needed: without a discrete GPU the "device" array
            # is already host memory.
            pass

    def device_array(self, shape, dtype):
        if dgpu_present:
            return api.device_array(shape=shape, dtype=dtype)
        else:
            return np.empty(shape=shape, dtype=dtype)

    def launch_kernel(self, kernel, nelem, args):
        # One work-item per element, with the workgroup size capped at a
        # single wavefront (64 lanes).
        kernel.configure(nelem, min(nelem, 64))(*args)


class HSAGenerializedUFunc(GeneralizedUFunc):
    @property
    def _call_steps(self):
        return _HsaGUFuncCallSteps

    def _broadcast_scalar_input(self, ary, shape):
        if dgpu_present:
            return devicearray.DeviceNDArray(shape=shape,
                                             strides=(0,),
                                             dtype=ary.dtype,
                                             dgpu_data=ary.dgpu_data)
        else:
            return np.lib.stride_tricks.as_strided(ary, shape=(shape,),
                                                   strides=(0,))

    def _broadcast_add_axis(self, ary, newshape):
        newax = len(newshape) - len(ary.shape)
        # Add zero strides for the missing leading dimensions.
        newstrides = (0,) * newax + ary.strides
        if dgpu_present:
            return devicearray.DeviceNDArray(shape=newshape,
                                             strides=newstrides,
                                             dtype=ary.dtype,
                                             dgpu_data=ary.dgpu_data)
        else:
            raise NotImplementedError
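

# End-to-end usage sketch (assumes a working ROC/HSA stack; the 'roc'
# vectorize target compiles the function and dispatches through the
# classes above):
#
#   from numba import vectorize
#   import numpy as np
#
#   @vectorize(['float32(float32, float32)'], target='roc')
#   def add(a, b):
#       return a + b
#
#   x = np.arange(16, dtype=np.float32)
#   print(add(x, x))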