import numpy as np

from numba.np.ufunc.deviceufunc import (UFuncMechanism, GenerializedUFunc,
                                        GUFuncCallSteps)
from numba.roc.hsadrv.driver import dgpu_present
import numba.roc.hsadrv.devicearray as devicearray
import numba.roc.api as api

class HsaUFuncDispatcher(object):
    """
    Invoke the HSA ufunc specialization for the given inputs.
    """

    def __init__(self, types_to_retty_kernels):
        self.functions = types_to_retty_kernels

    def __call__(self, *args, **kws):
        """
        *args: numpy arrays
        **kws:
            stream -- hsa stream; when defined, asynchronous mode is used.
            out    -- output array. Can be a numpy array or DeviceArrayBase
                      depending on the input arguments.  Type must match
                      the input arguments.
        """
        return HsaUFuncMechanism.call(self.functions, args, kws)

    def reduce(self, arg, stream=0):
        raise NotImplementedError
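
# Usage sketch: dispatchers of this kind are normally produced by
# numba.vectorize with the ROC/HSA target (assuming the target is registered
# as 'roc') rather than instantiated directly, e.g.:
#
#     from numba import vectorize
#     import numpy as np
#
#     @vectorize(['float32(float32, float32)'], target='roc')
#     def add_f32(a, b):
#         return a + b
#
#     out = add_f32(np.arange(8, dtype=np.float32),
#                   np.ones(8, dtype=np.float32))
#
# The call goes through HsaUFuncDispatcher.__call__, which selects the kernel
# matching the input types from ``self.functions`` and launches it via
# HsaUFuncMechanism below.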


class HsaUFuncMechanism(UFuncMechanism):
    """
    Provide HSA specialization
    """
    DEFAULT_STREAM = 0
    ARRAY_ORDER = 'A'

    def is_device_array(self, obj):
        if dgpu_present:
            return devicearray.is_hsa_ndarray(obj)
        else:
            # Without a dGPU, host ndarrays stand in for device arrays.
            return isinstance(obj, np.ndarray)

    def is_host_array(self, obj):
        if dgpu_present:
            return False
        else:
            return isinstance(obj, np.ndarray)

    def to_device(self, hostary, stream):
        if dgpu_present:
            return api.to_device(hostary)
        else:
            return hostary

    def launch(self, func, count, stream, args):
        # ILP must match vectorize kernel source
        ilp = 4
        # Use more wavefronts to allow latency hiding
        tpb = 64 * 2
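        # Two ceil-divisions: each work-item processes `ilp` elements and each
        # workgroup runs `tpb` work-items; e.g. (illustrative) 10000 elements
        # -> 2500 work-items -> 20 workgroups of 128.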
        count = (count + (ilp - 1)) // ilp
        blockcount = (count + (tpb - 1)) // tpb
        func[blockcount, tpb](*args)

    def device_array(self, shape, dtype, stream):
        if dgpu_present:
            return api.device_array(shape=shape, dtype=dtype)
        else:
            return np.empty(shape=shape, dtype=dtype)

    def broadcast_device(self, ary, shape):
        if dgpu_present:
            raise NotImplementedError(
                'broadcast_device for device arrays is not implemented yet')
        else:
            ax_differs = [ax for ax in range(len(shape))
                          if ax >= ary.ndim
                          or ary.shape[ax] != shape[ax]]
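            # Axes listed here get stride 0 so every index along them aliases
            # the same data; e.g. (illustrative) broadcasting (1, 3) to (4, 3)
            # keeps the column stride and zeroes the row stride.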

            missingdim = len(shape) - len(ary.shape)
            strides = [0] * missingdim + list(ary.strides)

            for ax in ax_differs:
                strides[ax] = 0

            return np.ndarray(shape=shape, strides=strides,
                              dtype=ary.dtype, buffer=ary)


class _HsaGUFuncCallSteps(GUFuncCallSteps):
    __slots__ = ()

    def is_device_array(self, obj):
        if dgpu_present:
            return devicearray.is_hsa_ndarray(obj)
        else:
            # Without a dGPU, host arrays are used in place, so everything is
            # treated as if it were already on the device.
            return True

    def to_device(self, hostary):
        if dgpu_present:
            return api.to_device(hostary)
        else:
            return hostary

    def to_host(self, devary, hostary):
        if dgpu_present:
            out = devary.copy_to_host(hostary)
            return out
        else:
            # Without a dGPU the "device" array is already host memory, so
            # there is nothing to copy back.
            pass

    def device_array(self, shape, dtype):
        if dgpu_present:
            return api.device_array(shape=shape, dtype=dtype)
        else:
            return np.empty(shape=shape, dtype=dtype)

    def launch_kernel(self, kernel, nelem, args):
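        # Workgroup size is capped at 64 work-items (one wavefront on AMD GCN
        # hardware); with fewer than 64 elements the group shrinks to match.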
        kernel.configure(nelem, min(nelem, 64))(*args)


class HSAGenerializedUFunc(GenerializedUFunc):
    @property
    def _call_steps(self):
        return _HsaGUFuncCallSteps

    def _broadcast_scalar_input(self, ary, shape):
        if dgpu_present:
            return devicearray.DeviceNDArray(shape=shape,
                                             strides=(0,),
                                             dtype=ary.dtype,
                                             dgpu_data=ary.dgpu_data)
        else:
            return np.lib.stride_tricks.as_strided(ary, shape=(shape,),
                                                   strides=(0,))

    def _broadcast_add_axis(self, ary, newshape):
        newax = len(newshape) - len(ary.shape)
        # Add 0 strides for missing dimensions
        newstrides = (0,) * newax + ary.strides
        if dgpu_present:
            return devicearray.DeviceNDArray(shape=newshape,
                                             strides=newstrides,
                                             dtype=ary.dtype,
                                             dgpu_data=ary.dgpu_data)
        else:
            raise NotImplementedError