# sugon
# This file corresponds to the CUDA backend and should be moved into a vectorizers.py file.
# Refer to the CUDA dispatcher.py in numba-0.58; this file should be reworked into a kernel-scheduling dispatcher.
import numpy as np

# from numba.np.ufunc.deviceufunc import (UFuncMechanism, GenerializedUFunc,
#                                         GUFuncCallSteps)
from numba.np.ufunc.deviceufunc import (UFuncMechanism, GeneralizedUFunc,
                                        GUFuncCallSteps)
from numba.roc.hsadrv.driver import dgpu_present
import numba.roc.hsadrv.devicearray as devicearray
import numba.roc.api as api

class HsaUFuncDispatcher(object):
    """
    Invoke the HSA ufunc specialization for the given inputs.
    """

    def __init__(self, types_to_retty_kernels):
        self.functions = types_to_retty_kernels

    def __call__(self, *args, **kws):
        """
        *args: numpy arrays
        **kws:
            stream -- hsa stream; when defined, asynchronous mode is used.
            out    -- output array. Can be a numpy array or DeviceArrayBase
                      depending on the input arguments.  Type must match
                      the input arguments.
        """
        return HsaUFuncMechanism.call(self.functions, args, kws)

    def reduce(self, arg, stream=0):
        raise NotImplementedError
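
# Usage sketch (illustrative, not part of this file): a dispatcher instance is
# normally produced by compiling a ufunc for the ROC/HSA target, e.g. with
# numba.vectorize(..., target='roc'), and is then called like a NumPy ufunc.
# The names below (add, hsa_add, a, b, buf, s) are hypothetical.
#
#     hsa_add = vectorize(['float32(float32, float32)'], target='roc')(add)
#     out = hsa_add(a, b)                       # synchronous call
#     out = hsa_add(a, b, out=buf, stream=s)    # asynchronous, explicit output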


class HsaUFuncMechanism(UFuncMechanism):
    """
    Provide the HSA specialization.
    """
    DEFAULT_STREAM = 0
    ARRAY_ORDER = 'A'

    def is_device_array(self, obj):
        if dgpu_present:
            return devicearray.is_hsa_ndarray(obj)
        else:
            return isinstance(obj, np.ndarray)

    def is_host_array(self, obj):
        if dgpu_present:
            return False
        else:
            return isinstance(obj, np.ndarray)

    def to_device(self, hostary, stream):
        if dgpu_present:
            return api.to_device(hostary)
        else:
            return hostary

    def launch(self, func, count, stream, args):
        # ILP must match vectorize kernel source
        ilp = 4
        # Use more wavefronts to allow latency hiding
        tpb = 64 * 2
        count = (count + (ilp - 1)) // ilp
        blockcount = (count + (tpb - 1)) // tpb
        func[blockcount, tpb](*args)
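
    # Worked example of the launch configuration above (illustrative numbers):
    # with count = 1000 elements and ilp = 4, each work item processes four
    # elements, so count becomes (1000 + 3) // 4 = 250 work items; with
    # tpb = 128 the kernel is launched over (250 + 127) // 128 = 2 workgroups.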
    # sugon: adapt for numba-0.58
    def allocate_device_array(self, shape, dtype, stream):
        if dgpu_present:
            return api.device_array(shape=shape, dtype=dtype)
        else:
            return np.empty(shape=shape, dtype=dtype)

    def broadcast_device(self, ary, shape):
        if dgpu_present:
            raise NotImplementedError('device broadcast_device NIY')
        else:
            ax_differs = [ax for ax in range(len(shape))
                          if ax >= ary.ndim
                          or ary.shape[ax] != shape[ax]]

            missingdim = len(shape) - len(ary.shape)
            strides = [0] * missingdim + list(ary.strides)

            for ax in ax_differs:
                strides[ax] = 0

            return np.ndarray(shape=shape, strides=strides,
                              dtype=ary.dtype, buffer=ary)
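
    # Illustrative example of the host-side broadcast above: an array of shape
    # (1, 3) broadcast to shape (4, 3) keeps its original data buffer and gets
    # strides (0, itemsize), so every row is a view of the single source row.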


class _HsaGUFuncCallSteps(GUFuncCallSteps):
    __slots__ = ()

    def is_device_array(self, obj):
        if dgpu_present:
            return devicearray.is_hsa_ndarray(obj)
        else:
            return True
    # sugon: adapt for numba-0.58
    def as_device_array(self, obj):
        # Assumption: any object reaching this point already satisfies
        # is_device_array(), so it is passed through unchanged.
        return obj

    def to_device(self, hostary):
        if dgpu_present:
            return api.to_device(hostary)
        else:
            return hostary

    def to_host(self, devary, hostary):
        if dgpu_present:
            out = devary.copy_to_host(hostary)
            return out
        else:
            # No discrete GPU: the data already lives in host memory,
            # so there is nothing to copy back.
            pass
    # sugon: adapt for numba-0.58
    def allocate_device_array(self, shape, dtype):
        if dgpu_present:
            return api.device_array(shape=shape, dtype=dtype)
        else:
            return np.empty(shape=shape, dtype=dtype)

    def launch_kernel(self, kernel, nelem, args):
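        # A workgroup size of min(nelem, 64) keeps each group within a single
        # wavefront (64 lanes on the AMD GCN hardware targeted by the HSA
        # backend) while never exceeding the number of elements.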
        kernel.configure(nelem, min(nelem, 64))(*args)


class HSAGenerializedUFunc(GeneralizedUFunc):
    @property
    def _call_steps(self):
        return _HsaGUFuncCallSteps

    def _broadcast_scalar_input(self, ary, shape):
        if dgpu_present:
            return devicearray.DeviceNDArray(shape=shape,
                                             strides=(0,),
                                             dtype=ary.dtype,
                                             dgpu_data=ary.dgpu_data)
        else:
            return np.lib.stride_tricks.as_strided(ary, shape=(shape,),
                                                   strides=(0,))

    def _broadcast_add_axis(self, ary, newshape):
        newax = len(newshape) - len(ary.shape)
        # Add 0 strides for missing dimension
        newstrides = (0,) * newax + ary.strides
        if dgpu_present:
            return devicearray.DeviceNDArray(shape=newshape,
                                             strides=newstrides,
                                             dtype=ary.dtype,
                                             dgpu_data=ary.dgpu_data)
        else:
            raise NotImplementedError
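
# Illustrative example of the device-side broadcast helpers above (shapes are
# hypothetical): broadcasting a device array of shape (2, 3) to newshape
# (4, 2, 3) via _broadcast_add_axis yields a view with strides
# (0,) + ary.strides, so the new leading axis repeats the same data, while
# _broadcast_scalar_input turns a one-element array into a length-n view with
# a single zero stride.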