nvrtc.pyx 8.9 KB
Newer Older
root's avatar
root committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
# distutils: language = c++

"""Thin wrapper of NVRTC API.

There are four differences compared to the original C API.

1. Not all functions are ported.
2. Errors are translated into NVRTCError exceptions.
3. The 'nvrtc' prefix of each API is omitted and the next character is set to
   lower case.
4. The resulting values are returned directly instead of references.

"""
import sys as _sys  # no-cython-lint

cimport cython  # NOQA
from libcpp cimport vector

from cupy_backends.cuda.api cimport runtime
from cupy_backends.cuda._softlink cimport SoftLink


###############################################################################
# Extern
###############################################################################

IF CUPY_USE_CUDA_PYTHON:
    from cuda.cnvrtc cimport *
ELSE:
    cdef extern from '../../cupy_rtc.h' nogil:
        const char *nvrtcGetErrorString(Result result)
        int nvrtcVersion(int *major, int *minor)
        int nvrtcCreateProgram(
            Program* prog, const char* src, const char* name, int numHeaders,
            const char** headers, const char** includeNames)
        int nvrtcDestroyProgram(Program *prog)
        int nvrtcCompileProgram(Program prog, int numOptions,
                                const char** options)
        int nvrtcGetPTXSize(Program prog, size_t *ptxSizeRet)
        int nvrtcGetPTX(Program prog, char *ptx)
        int nvrtcGetCUBINSize(Program prog, size_t *cubinSizeRet)
        int nvrtcGetCUBIN(Program prog, char *cubin)
        int nvrtcGetProgramLogSize(Program prog, size_t* logSizeRet)
        int nvrtcGetProgramLog(Program prog, char* log)
        int nvrtcAddNameExpression(Program, const char*)
        int nvrtcGetLoweredName(Program, const char*, const char**)

    ctypedef int (*f_type)(...) nogil  # NOQA
    IF 11020 <= CUPY_CUDA_VERSION < 12000:
        if _sys.platform == 'linux':
            _libname = 'libnvrtc.so.11.2'
        else:
            _libname = 'nvrtc64_112_0.dll'
    ELIF 12000 <= CUPY_CUDA_VERSION < 13000:
        if _sys.platform == 'linux':
            _libname = 'libnvrtc.so.12'
        else:
            _libname = 'nvrtc64_120_0.dll'
    ELSE:
        _libname = None

    cdef SoftLink _lib = SoftLink(_libname, 'nvrtc')
    # APIs added after CUDA 11.2+.
    cdef f_type nvrtcGetNumSupportedArchs = <f_type>_lib.get('GetNumSupportedArchs')  # NOQA
    cdef f_type nvrtcGetSupportedArchs = <f_type>_lib.get('GetSupportedArchs')  # NOQA
    cdef f_type nvrtcGetNVVMSize = <f_type>_lib.get('GetNVVMSize')
    cdef f_type nvrtcGetNVVM = <f_type>_lib.get('GetNVVM')


###############################################################################
# Error handling
###############################################################################

class NVRTCError(RuntimeError):

    def __init__(self, status):
        self.status = status
        cdef bytes msg = nvrtcGetErrorString(<Result>status)
        super(NVRTCError, self).__init__(
            '{} ({})'.format(msg.decode(), status))

    def __reduce__(self):
        return (type(self), (self.status,))


@cython.profile(False)
cpdef inline check_status(int status):
    if status != 0:
        raise NVRTCError(status)


cpdef tuple getVersion():
    cdef int major, minor
    with nogil:
        status = nvrtcVersion(&major, &minor)
    check_status(status)
    return major, minor


cpdef tuple getSupportedArchs():
    cdef int status, num_archs
    cdef vector.vector[int] archs
    if runtime._is_hip_environment:
        raise RuntimeError("HIP does not support getSupportedArchs")
    if runtime.runtimeGetVersion() < 11020:
        raise RuntimeError("getSupportedArchs is supported since CUDA 11.2")
    with nogil:
        status = nvrtcGetNumSupportedArchs(&num_archs)
        if status == 0:
            archs.resize(num_archs)
            status = nvrtcGetSupportedArchs(archs.data())
    check_status(status)
    return tuple(archs)


###############################################################################
# Program
###############################################################################

cpdef intptr_t createProgram(unicode src, unicode name, headers,
                             include_names) except? 0:
    cdef Program prog
    cdef bytes b_src = src.encode()
    cdef const char* src_ptr = b_src
    cdef bytes b_name = name.encode()
    cdef const char* name_ptr
    if len(name) > 0:
        name_ptr = b_name
    else:
        name_ptr = NULL
    cdef int num_headers = len(headers)
    cdef vector.vector[const char*] header_vec
    cdef vector.vector[const char*] include_name_vec
    cdef const char** header_vec_ptr = NULL
    cdef const char** include_name_vec_ptr = NULL
    assert num_headers == len(include_names)
    for i in headers:
        header_vec.push_back(<const char*>i)
    for i in include_names:
        include_name_vec.push_back(<const char*>i)
    if num_headers > 0:
        header_vec_ptr = header_vec.data()
        include_name_vec_ptr = include_name_vec.data()
    with nogil:
        status = nvrtcCreateProgram(
            &prog, src_ptr, name_ptr, num_headers, header_vec_ptr,
            include_name_vec_ptr)
    check_status(status)
    return <intptr_t>prog


cpdef destroyProgram(intptr_t prog):
    cdef Program p = <Program>prog
    with nogil:
        status = nvrtcDestroyProgram(&p)
    check_status(status)


cpdef compileProgram(intptr_t prog, options):
    cdef int option_num = len(options)
    cdef vector.vector[const char*] option_vec
    cdef option_list = [opt.encode() for opt in options]
    cdef const char** option_vec_ptr = NULL
    for i in option_list:
        option_vec.push_back(<const char*>i)
    if option_num > 0:
        option_vec_ptr = option_vec.data()
    with nogil:
        status = nvrtcCompileProgram(<Program>prog, option_num,
                                     option_vec_ptr)
    check_status(status)


cpdef bytes getPTX(intptr_t prog):
    cdef size_t ptxSizeRet
    cdef vector.vector[char] ptx
    cdef char* ptx_ptr = NULL
    with nogil:
        status = nvrtcGetPTXSize(<Program>prog, &ptxSizeRet)
    check_status(status)
    if ptxSizeRet == 0:
        return b''
    ptx.resize(ptxSizeRet)
    ptx_ptr = ptx.data()
    with nogil:
        status = nvrtcGetPTX(<Program>prog, ptx_ptr)
    check_status(status)

    # Strip the trailing NULL.
    return ptx_ptr[:ptxSizeRet-1]


cpdef bytes getCUBIN(intptr_t prog):
    cdef size_t cubinSizeRet = 0
    cdef vector.vector[char] cubin
    cdef char* cubin_ptr = NULL
    if runtime._is_hip_environment:
        raise RuntimeError("HIP does not support getCUBIN")
    if runtime.runtimeGetVersion() < 11010:
        raise RuntimeError("getCUBIN is supported since CUDA 11.1")
    with nogil:
        status = nvrtcGetCUBINSize(<Program>prog, &cubinSizeRet)
    check_status(status)
    if cubinSizeRet <= 1:
        # On CUDA 11.1, cubinSizeRet=1 if -arch=compute_XX is used, but the
        # spec says it should be 0 in this case...
        raise RuntimeError('cubin is requested, but the real arch (sm_XX) is '
                           'not provided')
    cubin.resize(cubinSizeRet)
    cubin_ptr = cubin.data()
    with nogil:
        status = nvrtcGetCUBIN(<Program>prog, cubin_ptr)
    check_status(status)

    # Strip the trailing NULL.
    return cubin_ptr[:cubinSizeRet-1]


cpdef bytes getNVVM(intptr_t prog):
    if runtime._is_hip_environment:
        raise RuntimeError("HIP does not support getNVVM")
    if runtime.runtimeGetVersion() < 11040:
        raise RuntimeError("getNVVM is supported since CUDA 11.4")

    cdef size_t nvvmSizeRet = 0
    cdef vector.vector[char] nvvm
    cdef char* nvvm_ptr = NULL

    with nogil:
        status = nvrtcGetNVVMSize(<Program>prog, &nvvmSizeRet)
    check_status(status)

    nvvm.resize(nvvmSizeRet)
    nvvm_ptr = nvvm.data()
    with nogil:
        status = nvrtcGetNVVM(<Program>prog, nvvm_ptr)
    check_status(status)

    # Strip the trailing NULL.
    return nvvm_ptr[:nvvmSizeRet-1]


cpdef unicode getProgramLog(intptr_t prog):
    cdef size_t logSizeRet
    cdef vector.vector[char] log
    cdef char* log_ptr = NULL
    with nogil:
        status = nvrtcGetProgramLogSize(<Program>prog, &logSizeRet)
    check_status(status)
    if logSizeRet == 0:
        return ''
    log.resize(logSizeRet)
    log_ptr = log.data()
    with nogil:
        status = nvrtcGetProgramLog(<Program>prog, log_ptr)
    check_status(status)

    # Strip the trailing NULL.
    return log_ptr[:logSizeRet-1].decode('UTF-8')


cpdef addNameExpression(intptr_t prog, str name):
    cdef bytes b_name = name.encode()
    cdef const char* c_name = b_name
    with nogil:
        status = nvrtcAddNameExpression(<Program>prog, c_name)
    check_status(status)


cpdef str getLoweredName(intptr_t prog, str name):
    cdef bytes b_name = name.encode()
    cdef const char* c_name = b_name
    cdef const char* mangled_name
    with nogil:
        status = nvrtcGetLoweredName(<Program>prog, c_name, &mangled_name)
    check_status(status)
    b_name = mangled_name
    return b_name.decode('UTF-8')