_command.py 7.75 KB
Newer Older
root's avatar
root committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
import json
import os
import os.path
import subprocess
import sys
from typing import Any, Dict, List, Tuple

import setuptools
import setuptools.command.build_ext

import cupy_builder
import cupy_builder.install_build as build
from cupy_builder._context import Context
from cupy_builder._compiler import DeviceCompilerUnix, DeviceCompilerWin32


def filter_files_by_extension(
        sources: List[str],
        extension: str,
) -> Tuple[List[str], List[str]]:
    sources_selected = []
    sources_others = []
    for src in sources:
        if os.path.splitext(src)[1] == extension:
            sources_selected.append(src)
        else:
            sources_others.append(src)
    return sources_selected, sources_others


def compile_device_code(
        ctx: Context,
        ext: setuptools.Extension
) -> Tuple[List[str], List[str]]:
    """Compiles device code ("*.cu").

    This method invokes the device compiler (nvcc/hipcc) to build object
    files from device code, then returns the tuple of:
    - list of remaining (non-device) source files ("*.cpp")
    - list of compiled object files for device code ("*.o")
    """
    sources_cu, sources_cpp = filter_files_by_extension(
        ext.sources, '.cu')
    if len(sources_cu) == 0:
        # No device code used in this extension.
        return ext.sources, []

    if sys.platform == 'win32':
        compiler = DeviceCompilerWin32(ctx)
    else:
        compiler = DeviceCompilerUnix(ctx)

    objects = []
    for src in sources_cu:
        print(f'{ext.name}: Device code: {src}')
        obj_ext = 'obj' if sys.platform == 'win32' else 'o'
        # TODO(kmaehashi): embed CUDA version in path
        obj = f'build/temp.device_objects/{src}.{obj_ext}'
        if os.path.exists(obj) and (_get_timestamp(src) < _get_timestamp(obj)):
            print(f'{ext.name}: Reusing cached object file: {obj}')
        else:
            os.makedirs(os.path.dirname(obj), exist_ok=True)
            print(f'{ext.name}: Building: {obj}')
            compiler.compile(obj, src, ext)
        objects.append(obj)

    return sources_cpp, objects


def _get_timestamp(path: str) -> float:
    stat = os.lstat(path)
    return max(stat.st_atime, stat.st_mtime, stat.st_ctime)


def dumpbin_dependents(dumpbin: str, path: str) -> List[str]:
    args = [dumpbin, '/nologo', '/dependents', path]
    try:
        p = subprocess.run(args, stdout=subprocess.PIPE)
    except FileNotFoundError:
        print(f'*** DUMPBIN not found: {args}')
        return []
    if p.returncode != 0:
        print(f'*** DUMPBIN failed ({p.returncode}): {args}')
        return []
    sections = p.stdout.decode().split('\r\n\r\n')
    for num, section in enumerate(sections):
        if 'Image has the following dependencies:' in section:
            return [line.strip() for line in sections[num+1].splitlines()]
    print(f'*** DUMPBIN output could not be parsed: {args}')
    return []


class custom_build_ext(setuptools.command.build_ext.build_ext):

    """Custom `build_ext` command to include CUDA C source files."""

    def _cythonize(self, nthreads: int) -> None:
        # Defer importing Cython as it may be installed via setup_requires if
        # the user does not have Cython installed.
        import Cython.Build

        ctx = cupy_builder.get_context()
        compiler_directives = {
            'linetrace': ctx.linetrace,
            'profile': ctx.profile,
            # Embed signatures for Sphinx documentation.
            'embedsignature': True,
        }

        # Compile-time constants to be used in Cython code
        compile_time_env: Dict[str, Any] = {}

        # Enable CUDA Python.
        # TODO: add `cuda` to `setup_requires` only when this flag is set
        use_cuda_python = ctx.use_cuda_python
        compile_time_env['CUPY_USE_CUDA_PYTHON'] = use_cuda_python
        if use_cuda_python:
            print('Using CUDA Python')

        compile_time_env['CUPY_CUFFT_STATIC'] = False
        compile_time_env['CUPY_CYTHON_VERSION'] = Cython.__version__
        if ctx.use_stub:  # on RTD
            compile_time_env['CUPY_CUDA_VERSION'] = 0
            compile_time_env['CUPY_HIP_VERSION'] = 0
        elif ctx.use_hip:  # on ROCm/HIP
            compile_time_env['CUPY_CUDA_VERSION'] = 0
            compile_time_env['CUPY_HIP_VERSION'] = build.get_hip_version()
        else:  # on CUDA
            compile_time_env['CUPY_CUDA_VERSION'] = (
                ctx.features['cuda'].get_version())
            compile_time_env['CUPY_HIP_VERSION'] = 0

        print('Compile-time constants: ' +
              json.dumps(compile_time_env, indent=4))

        if sys.platform == 'win32':
            # Disable multiprocessing on Windows (spawn)
            nthreads = 0

        Cython.Build.cythonize(
            self.extensions, verbose=True, nthreads=nthreads, language_level=3,
            compiler_directives=compiler_directives, annotate=ctx.annotate,
            compile_time_env=compile_time_env)

    def build_extensions(self) -> None:
        num_jobs = int(os.environ.get('CUPY_NUM_BUILD_JOBS', '4'))
        if num_jobs > 1:
            self.parallel = num_jobs
            if hasattr(self.compiler, 'initialize'):
                # Workarounds a bug in setuptools/distutils on Windows by
                # initializing the compiler before starting a thread.
                # By default, MSVCCompiler performs initialization in the
                # first compilation. However, in parallel compilation mode,
                # the init code runs in each thread and messes up the internal
                # state as the init code is not locked and is not idempotent.
                # https://github.com/pypa/setuptools/blob/v60.0.0/setuptools/_distutils/_msvccompiler.py#L322-L327
                self.compiler.initialize()

        # Compile "*.pyx" files into "*.cpp" files.
        print('Cythonizing...')
        self._cythonize(num_jobs)

        # Change an extension in each source filenames from "*.pyx" to "*.cpp".
        # c.f. `Cython.Distutils.old_build_ext`
        for ext in self.extensions:
            sources_pyx, sources_others = filter_files_by_extension(
                ext.sources, '.pyx')
            sources_cpp = ['{}.cpp'.format(os.path.splitext(src)[0])
                           for src in sources_pyx]
            ext.sources = sources_cpp + sources_others
            for src in ext.sources:
                if not os.path.isfile(src):
                    raise RuntimeError(f'Fatal error: missing file: {src}')

        print('Building extensions...')
        super().build_extensions()

        if sys.platform == 'win32':
            print('Generating DLL dependency list...')

            # Find "dumpbin.exe" next to "lib.exe".
            dumpbin = os.path.join(
                os.path.dirname(self.compiler.lib), 'dumpbin.exe')

            # Save list of dependent DLLs for all extension modules.
            depends = sorted(set(sum([
                dumpbin_dependents(dumpbin, f) for f in self.get_outputs()
            ], [])))

            depends_json = os.path.join(
                self.build_lib, 'cupy', '.data', '_depends.json')
            os.makedirs(os.path.dirname(depends_json), exist_ok=True)
            with open(depends_json, 'w') as f:
                json.dump({'depends': depends}, f)

    def build_extension(self, ext: setuptools.Extension) -> None:
        ctx = cupy_builder.get_context()

        # Compile "*.cu" files into object files.
        sources_cpp, extra_objects = compile_device_code(ctx, ext)

        # Remove device code from list of sources, and instead add compiled
        # object files to link.
        ext.sources = sources_cpp
        ext.extra_objects += extra_objects

        # Let setuptools do the rest of the build process, i.e., compile
        # "*.cpp" files and link object files generated from "*.cu".
        super().build_extension(ext)