builder.py 29.7 KB
Newer Older
aiss's avatar
aiss committed
1
2
3
4
5
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0

# DeepSpeed Team

6
import os
aiss's avatar
aiss committed
7
import sys
8
9
10
11
import time
import importlib
from pathlib import Path
import subprocess
aiss's avatar
aiss committed
12
13
14
15
16
17
18
import shlex
import shutil
import tempfile
import distutils.ccompiler
import distutils.log
import distutils.sysconfig
from distutils.errors import CompileError, LinkError
19
from abc import ABC, abstractmethod
aiss's avatar
aiss committed
20
from typing import List
21
22
23
24
25
26

# ANSI escape sequences used to colour the "[WARNING]" prefix of console messages.
YELLOW = '\033[93m'
END = '\033[0m'
WARNING = f"{YELLOW} [WARNING] {END}"

# Default torch extension directory.
DEFAULT_TORCH_EXTENSION_PATH = "/tmp/torch_extensions"
# Semicolon separated compute capabilities used as the starting point for cross-compilation
# (see get_default_compute_capabilities()).
DEFAULT_COMPUTE_CAPABILITIES = "6.0;6.1;7.0"
28

aiss's avatar
aiss committed
29
30
31
try:
    import torch
except ImportError:
aiss's avatar
aiss committed
32
    print(f"{WARNING} unable to import torch, please install it if you want to pre-compile any deepspeed ops.")
aiss's avatar
aiss committed
33
34
35
36
else:
    TORCH_MAJOR = int(torch.__version__.split('.')[0])
    TORCH_MINOR = int(torch.__version__.split('.')[1])

37

aiss's avatar
aiss committed
38
def installed_cuda_version(name=""):
    '''
    Return the (major, minor) version of the nvcc compiler found under torch's CUDA_HOME.

    name: accepted for backward compatibility with existing callers; it is not used here.

    Raises AssertionError when torch reports no CUDA_HOME. Errors from running or parsing
    `nvcc -V` (missing binary, non-zero exit, no "release" token) propagate unchanged.
    '''
    import torch.utils.cpp_extension
    cuda_home = torch.utils.cpp_extension.CUDA_HOME
    # Raised explicitly (rather than via `assert`) so the check also runs under `python -O`.
    if cuda_home is None:
        raise AssertionError("CUDA_HOME does not exist, unable to compile CUDA op(s)")
    # Ensure there is not a cuda version mismatch between torch and nvcc compiler
    nvcc = os.path.join(cuda_home, "bin", "nvcc")
    output = subprocess.check_output([nvcc, "-V"], universal_newlines=True)
    output_split = output.split()
    # The version is the token right after "release", e.g. "... release 11.3, V11.3.109".
    release_idx = output_split.index("release")
    release = output_split[release_idx + 1].replace(',', '').split(".")
    # Ignore patch versions, only look at major + minor
    cuda_major, cuda_minor = release[:2]
    return int(cuda_major), int(cuda_minor)


aiss's avatar
aiss committed
52
def get_default_compute_capabilities():
    '''
    Return the semicolon separated list of compute capabilities to cross-compile for by default.

    Starts from DEFAULT_COMPUTE_CAPABILITIES and, when a CUDA toolkit of major version 11 or newer
    is installed, appends the Ampere targets supported by that toolkit.
    '''
    compute_caps = DEFAULT_COMPUTE_CAPABILITIES
    import torch.utils.cpp_extension
    if torch.utils.cpp_extension.CUDA_HOME is not None:
        # Query nvcc once; every call to installed_cuda_version() spawns an `nvcc -V` subprocess.
        cuda_major, cuda_minor = installed_cuda_version()
        if cuda_major >= 11:
            if cuda_major == 11 and cuda_minor == 0:
                # Special treatment of CUDA 11.0 because compute_86 is not supported.
                compute_caps += ";8.0"
            else:
                compute_caps += ";8.0;8.6"
    return compute_caps


aiss's avatar
aiss committed
64
65
66
67
68
69
70
71
# List of compatible minor CUDA versions, keyed by CUDA major version - so that, for example,
# pytorch built with cuda-11.0 can be used to build deepspeed against a system-wide installed cuda 11.2.
cuda_minor_mismatch_ok = {
    10: [
        "10.0",
        "10.1",
        "10.2",
    ],
    11: ["11.0", "11.1", "11.2", "11.3", "11.4", "11.5", "11.6", "11.7", "11.8"],
}


aiss's avatar
aiss committed
76
77
def assert_no_cuda_mismatch(name=""):
    '''
    Check that the installed CUDA toolkit is usable with the CUDA version torch was compiled with.

    Returns True when the major.minor versions are identical, or when both are listed as mutually
    compatible in `cuda_minor_mismatch_ok` (a notice is printed in that case).

    Raises Exception when the versions are incompatible or when torch was built without CUDA.
    Errors from installed_cuda_version() propagate unchanged.
    '''
    cuda_major, cuda_minor = installed_cuda_version(name)
    sys_cuda_version = f'{cuda_major}.{cuda_minor}'

    # torch.version.cuda is None for torch builds without CUDA; fail with a readable message
    # instead of an AttributeError on None.
    if torch.version.cuda is None:
        raise Exception(">- DeepSpeed Op Builder: torch was not compiled with CUDA support "
                        "(torch.version.cuda is None), unable to compile cuda/cpp extensions.")
    torch_cuda_version = ".".join(torch.version.cuda.split('.')[:2])

    if sys_cuda_version == torch_cuda_version:
        return True

    compatible_versions = cuda_minor_mismatch_ok.get(cuda_major, [])
    if sys_cuda_version in compatible_versions and torch_cuda_version in compatible_versions:
        print(f"Installed CUDA version {sys_cuda_version} does not match the "
              f"version torch was compiled with {torch.version.cuda} "
              "but since the APIs are compatible, accepting this combination")
        return True

    # This is a show-stopping error, should probably not proceed past this
    raise Exception(f">- DeepSpeed Op Builder: Installed CUDA version {sys_cuda_version} does not match the "
                    f"version torch was compiled with {torch.version.cuda}, unable to compile "
                    "cuda/cpp extensions without a matching cuda version.")
92
93
94


class OpBuilder(ABC):
    '''
    Abstract description of one native op and of how to build it.

    Subclasses provide the extension name and source list (`absolute_name`, `sources`) and may override
    the include path / compiler flag / linker flag hooks. The op is then either built ahead of time
    through `builder()` or imported/compiled on demand through `load()` and `jit_load()`.
    '''
    # Class-level caches shared by all builders; None means "not probed yet".
    _rocm_version = None
    _is_rocm_pytorch = None

    def __init__(self, name):
        '''
        name: op name; used in messages, as the key into `installed_ops` and as the JIT module name.
        '''
        self.name = name
        self.jit_mode = False  # switched to True by jit_load()
        self.build_for_cpu = False  # set by jit_load() when the CUDA toolkit check fails
        self.error_log = None  # last message passed to warning()

    @abstractmethod
    def absolute_name(self):
        '''
        Returns absolute build path for cases where the op is pre-installed, e.g., deepspeed.ops.adam.cpu_adam
        will be installed as something like: deepspeed/ops/adam/cpu_adam.so
        '''
        pass

    @abstractmethod
    def sources(self):
        '''
        Returns list of source files for your op, relative to root of deepspeed package (i.e., DeepSpeed/deepspeed)
        '''
        pass

    def hipify_extension(self):
        '''
        Hook for converting the op's sources with hipify; the base implementation does nothing.
        '''
        pass

    @staticmethod
    def validate_torch_version(torch_info):
        '''
        Raise RuntimeError if the torch major.minor version recorded at install time differs from the
        torch that is currently imported.

        torch_info: mapping whose 'version' entry holds the install-time "major.minor" string.
        '''
        install_torch_version = torch_info['version']
        current_torch_version = ".".join(torch.__version__.split('.')[:2])
        if install_torch_version != current_torch_version:
            raise RuntimeError("PyTorch version mismatch! DeepSpeed ops were compiled and installed "
                               "with a different version than what is being used at runtime. "
                               f"Please re-install DeepSpeed or switch torch versions. "
                               f"Install torch version={install_torch_version}, "
                               f"Runtime torch version={current_torch_version}")

    @staticmethod
    def validate_torch_op_version(torch_info):
        '''
        Raise RuntimeError if the CUDA (or, on ROCm PyTorch, HIP) major.minor version recorded at
        install time differs from the one reported by the running torch.

        torch_info: mapping with a 'cuda_version' entry (CUDA builds) or 'hip_version' entry (ROCm builds).
        '''
        if not OpBuilder.is_rocm_pytorch():
            current_cuda_version = ".".join(torch.version.cuda.split('.')[:2])
            install_cuda_version = torch_info['cuda_version']
            if install_cuda_version != current_cuda_version:
                raise RuntimeError("CUDA version mismatch! DeepSpeed ops were compiled and installed "
                                   "with a different version than what is being used at runtime. "
                                   f"Please re-install DeepSpeed or switch torch versions. "
                                   f"Install CUDA version={install_cuda_version}, "
                                   f"Runtime CUDA version={current_cuda_version}")
        else:
            current_hip_version = ".".join(torch.version.hip.split('.')[:2])
            install_hip_version = torch_info['hip_version']
            if install_hip_version != current_hip_version:
                raise RuntimeError("HIP version mismatch! DeepSpeed ops were compiled and installed "
                                   "with a different version than what is being used at runtime. "
                                   f"Please re-install DeepSpeed or switch torch versions. "
                                   f"Install HIP version={install_hip_version}, "
                                   f"Runtime HIP version={current_hip_version}")

    @staticmethod
    def is_rocm_pytorch():
        '''
        Return True when torch (>= 1.5) reports a HIP version and ROCM_HOME is set.

        The answer is computed once and cached on the class.
        '''
        if OpBuilder._is_rocm_pytorch is not None:
            return OpBuilder._is_rocm_pytorch

        _is_rocm_pytorch = False
        try:
            import torch
        except ImportError:
            pass
        else:
            if TORCH_MAJOR > 1 or (TORCH_MAJOR == 1 and TORCH_MINOR >= 5):
                _is_rocm_pytorch = hasattr(torch.version, 'hip') and torch.version.hip is not None
                if _is_rocm_pytorch:
                    from torch.utils.cpp_extension import ROCM_HOME
                    _is_rocm_pytorch = ROCM_HOME is not None
        OpBuilder._is_rocm_pytorch = _is_rocm_pytorch
        return OpBuilder._is_rocm_pytorch

    @staticmethod
    def installed_rocm_version():
        '''
        Return the installed ROCm version as an (int major, int minor) tuple, (0, 0) when torch is not
        a ROCm build. The result is cached on the class.

        The version is read from `<ROCM_HOME>/.info/version-dev` when that file exists, otherwise from
        the "rocm" suffix of `torch.__version__`. Raises AssertionError when neither source is usable.
        '''
        if OpBuilder._rocm_version:
            return OpBuilder._rocm_version

        ROCM_MAJOR = '0'
        ROCM_MINOR = '0'
        if OpBuilder.is_rocm_pytorch():
            from torch.utils.cpp_extension import ROCM_HOME
            rocm_ver_file = Path(ROCM_HOME).joinpath(".info/version-dev")
            if rocm_ver_file.is_file():
                with open(rocm_ver_file, 'r') as file:
                    ROCM_VERSION_DEV_RAW = file.read()
            elif "rocm" in torch.__version__:
                ROCM_VERSION_DEV_RAW = torch.__version__.split("rocm")[1]
            else:
                # Raised explicitly so the failure is not lost when assertions are disabled (-O).
                raise AssertionError("Could not detect ROCm version")
            if ROCM_VERSION_DEV_RAW == "":
                raise AssertionError("Could not detect ROCm version")
            version_parts = ROCM_VERSION_DEV_RAW.split('.')
            ROCM_MAJOR = version_parts[0]
            ROCM_MINOR = version_parts[1]
        OpBuilder._rocm_version = (int(ROCM_MAJOR), int(ROCM_MINOR))
        return OpBuilder._rocm_version

    def include_paths(self):
        '''
        Returns list of include paths, relative to root of deepspeed package (i.e., DeepSpeed/deepspeed)
        '''
        return []

    def nvcc_args(self):
        '''
        Returns optional list of compiler flags to forward to nvcc when building CUDA sources
        '''
        return []

    def cxx_args(self):
        '''
        Returns optional list of compiler flags to forward to the build
        '''
        return []

    def is_compatible(self, verbose=True):
        '''
        Check if all non-python dependencies are satisfied to build this op
        '''
        return True

    def extra_ldflags(self):
        '''
        Returns list of extra linker flags; the base implementation links against libiomp5.
        '''
        return ['-liomp5']

    def libraries_installed(self, libraries):
        '''
        Return True if `dpkg -l` succeeds for at least one entry of `libraries`, False otherwise
        (including when `libraries` is empty or dpkg cannot be executed).
        '''
        valid = False
        for lib in libraries:
            try:
                # No shell and no pipes: nothing is interpolated into a command line and there is
                # no unread pipe buffer that could block the child process.
                result = subprocess.run(['dpkg', '-l'] + shlex.split(lib),
                                        stdout=subprocess.DEVNULL,
                                        stderr=subprocess.DEVNULL)
            except OSError:
                # dpkg is not available on this system, so no package can be reported as installed.
                return False
            valid = valid or result.returncode == 0
        return valid

    def has_function(self, funcname, libraries, verbose=False):
        '''
        Test for existence of a function within a tuple of libraries.

        This is used as a smoke test to check whether a certain library is available.
        As a test, this creates a simple C program that calls the specified function,
        and then distutils is used to compile that program and link it with the specified libraries.
        Returns True if both the compile and link are successful, False otherwise.
        '''
        tempdir = None  # we create a temporary directory to hold various files
        filestderr = None  # handle to open file to which we redirect stderr
        oldstderr = None  # file descriptor for stderr
        try:
            # Echo compile and link commands that are used.
            if verbose:
                distutils.log.set_verbosity(1)

            # Create a compiler object.
            compiler = distutils.ccompiler.new_compiler(verbose=verbose)

            # Configure compiler and linker to build according to Python install.
            distutils.sysconfig.customize_compiler(compiler)

            # Create a temporary directory to hold test files.
            tempdir = tempfile.mkdtemp()

            # Define a simple C program that calls the function in question
            prog = "void %s(void); int main(int argc, char** argv) { %s(); return 0; }" % (funcname, funcname)

            # Write the test program to a file.
            filename = os.path.join(tempdir, 'test.c')
            with open(filename, 'w') as f:
                f.write(prog)

            # Redirect stderr file descriptor to a file to silence compile/link warnings.
            if not verbose:
                filestderr = open(os.path.join(tempdir, 'stderr.txt'), 'w')
                oldstderr = os.dup(sys.stderr.fileno())
                os.dup2(filestderr.fileno(), sys.stderr.fileno())

            # Workaround for behavior in distutils.ccompiler.CCompiler.object_filenames()
            # Otherwise, a local directory will be used instead of tempdir
            drive, driveless_filename = os.path.splitdrive(filename)
            root_dir = driveless_filename[0] if os.path.isabs(driveless_filename) else ''
            output_dir = os.path.join(drive, root_dir)

            # Attempt to compile the C program into an object file.
            cflags = shlex.split(os.environ.get('CFLAGS', ""))
            objs = compiler.compile([filename], output_dir=output_dir, extra_preargs=self.strip_empty_entries(cflags))

            # Attempt to link the object file into an executable.
            # Be sure to tack on any libraries that have been specified.
            ldflags = shlex.split(os.environ.get('LDFLAGS', ""))
            compiler.link_executable(objs,
                                     os.path.join(tempdir, 'a.out'),
                                     extra_preargs=self.strip_empty_entries(ldflags),
                                     libraries=libraries)

            # Compile and link succeeded
            return True

        except (CompileError, LinkError):
            return False

        except Exception:
            # Any other failure (no compiler, unwritable temp dir, ...) also means "not available".
            # KeyboardInterrupt/SystemExit are deliberately allowed to propagate.
            return False

        finally:
            # Restore stderr file descriptor and close the stderr redirect file.
            if oldstderr is not None:
                os.dup2(oldstderr, sys.stderr.fileno())
                # The duplicate made by os.dup() is no longer needed; close it so it does not leak.
                os.close(oldstderr)
            if filestderr is not None:
                filestderr.close()

            # Delete the temporary directory holding the test program and stderr files.
            if tempdir is not None:
                shutil.rmtree(tempdir)

    def strip_empty_entries(self, args):
        '''
        Drop any empty strings from the list of compile and link flags
        '''
        return [x for x in args if len(x) > 0]

    def _query_cpu_info(self):
        '''
        Return a py-cpuinfo style dict describing the CPU, or None when neither py-cpuinfo nor
        `lscpu` can provide one.
        '''
        try:
            from cpuinfo import get_cpu_info
        except ImportError:
            # py-cpuinfo is not installed: go straight to lscpu, there is no failure to report.
            return self._backup_cpuinfo()

        try:
            return get_cpu_info()
        except Exception as e:
            self.warning(f"{self.name} attempted to use `py-cpuinfo` but failed (exception type: {type(e)}, {e}), "
                         "falling back to `lscpu` to get this information.")
            return self._backup_cpuinfo()

    def cpu_arch(self):
        '''
        Return the compiler flag selecting the native CPU: '-mcpu=native' on PowerPC,
        '-march=native' otherwise (also when the CPU could not be identified).
        '''
        cpu_info = self._query_cpu_info()
        if cpu_info is None:
            return "-march=native"

        # 'arch' is None when lscpu output matched none of the known vendors.
        arch = cpu_info.get('arch') or ''
        if arch.startswith('PPC_'):
            # gcc does not provide -march on PowerPC, use -mcpu instead
            return '-mcpu=native'
        return '-march=native'

    def is_cuda_enable(self):
        '''
        Return '-D__ENABLE_CUDA__' when torch reports an available CUDA device, '-D__DISABLE_CUDA__'
        otherwise (a warning is printed if the probe itself fails).
        '''
        try:
            if torch.cuda.is_available():
                return '-D__ENABLE_CUDA__'
        except Exception:
            print(f"{WARNING} {self.name} cuda is missing or is incompatible with installed torch, "
                  "only cpu ops can be compiled!")
            return '-D__DISABLE_CUDA__'
        return '-D__DISABLE_CUDA__'

    def _backup_cpuinfo(self):
        '''
        Construct cpu_info dict from lscpu that is similar to what py-cpuinfo provides.

        Returns None when `lscpu` is not available; otherwise a dict with an 'arch' entry
        ('X86_64', 'PPC_' or None) and a comma separated 'flags' string.
        '''
        if not self.command_exists('lscpu'):
            self.warning(f"{self.name} attempted to query 'lscpu' after failing to use py-cpuinfo "
                         "to detect the CPU architecture. 'lscpu' does not appear to exist on "
                         "your system, will fall back to use -march=native and non-vectorized execution.")
            return None
        result = subprocess.check_output(['lscpu'])
        result = result.decode('utf-8').strip().lower()

        cpu_info = {}
        cpu_info['arch'] = None
        cpu_info['flags'] = ""
        if 'genuineintel' in result or 'authenticamd' in result:
            cpu_info['arch'] = 'X86_64'
            # 'avx512f' contains 'avx512', so the more specific name has to be tested first.
            if 'avx512f' in result:
                cpu_info['flags'] += 'avx512f,'
            elif 'avx512' in result:
                cpu_info['flags'] += 'avx512,'
            if 'avx2' in result:
                cpu_info['flags'] += 'avx2'
        elif 'ppc64le' in result:
            cpu_info['arch'] = "PPC_"

        return cpu_info

    def simd_width(self):
        '''
        Return the preprocessor define selecting the vector width: '-D__AVX512__' or '-D__AVX256__' on
        x86-64 CPUs advertising those extensions, '-D__SCALAR__' otherwise.
        '''
        cpu_info = self._query_cpu_info()
        if cpu_info is None:
            return '-D__SCALAR__'

        if cpu_info.get('arch') == 'X86_64':
            flags = cpu_info.get('flags') or ''
            if 'avx512' in flags or 'avx512f' in flags:
                return '-D__AVX512__'
            elif 'avx2' in flags:
                return '-D__AVX256__'
        return '-D__SCALAR__'

    def command_exists(self, cmd):
        '''
        Return True if the shell can resolve `cmd`. Several alternatives may be given separated by
        '|', in which case one resolvable alternative is enough. A warning is printed when none is found.
        '''
        if '|' in cmd:
            cmds = cmd.split("|")
        else:
            cmds = [cmd]
        valid = False
        for candidate in cmds:
            # stdout is discarded instead of piped: the output is never read, and run() waits for
            # the child and releases its resources.
            result = subprocess.run(f'type {candidate}', stdout=subprocess.DEVNULL, shell=True)
            valid = valid or result.returncode == 0

        if not valid and len(cmds) > 1:
            print(f"{WARNING} {self.name} requires one of the following commands '{cmds}', but it does not exist!")
        elif not valid and len(cmds) == 1:
            print(f"{WARNING} {self.name} requires the '{cmds[0]}' command, but it does not exist!")
        return valid

    def warning(self, msg):
        '''
        Record `msg` as the builder's latest error_log and print it with the warning prefix.
        '''
        self.error_log = f"{msg}"
        print(f"{WARNING} {msg}")

    def deepspeed_src_path(self, code_path):
        '''
        Return `code_path` unchanged when absolute, otherwise joined onto the directory two levels
        above this file.
        '''
        if os.path.isabs(code_path):
            return code_path
        else:
            return os.path.join(Path(__file__).parent.parent.absolute(), code_path)

    def builder(self):
        '''
        Return a torch CppExtension describing this op for an ahead-of-time build.
        '''
        from torch.utils.cpp_extension import CppExtension
        return CppExtension(name=self.absolute_name(),
                            sources=self.strip_empty_entries(self.sources()),
                            include_dirs=self.strip_empty_entries(self.include_paths()),
                            extra_compile_args={'cxx': self.strip_empty_entries(self.cxx_args())},
                            extra_link_args=self.strip_empty_entries(self.extra_ldflags()))

    def load(self, verbose=True):
        '''
        Return the op module: imported directly when the op was pre-installed, JIT compiled otherwise.

        Raises RuntimeError when a pre-installed op was built against different torch/CUDA/HIP versions.
        '''
        from deepspeed.git_version_info import installed_ops, torch_info
        # An op that is not listed at all is treated as "not pre-installed" rather than a KeyError.
        if installed_ops.get(self.name, False):
            # Ensure the op we're about to load was compiled with the same
            # torch/cuda versions we are currently using at runtime.
            self.validate_torch_version(torch_info)
            if torch.cuda.is_available() and isinstance(self, CUDAOpBuilder):
                self.validate_torch_op_version(torch_info)

            return importlib.import_module(self.absolute_name())
        else:
            return self.jit_load(verbose)

    def jit_load(self, verbose=True):
        '''
        Compile the op with torch's JIT extension loader and return the resulting module.

        Raises RuntimeError when the op is not compatible with this system or ninja is missing.
        '''
        if not self.is_compatible(verbose):
            raise RuntimeError(
                f"Unable to JIT load the {self.name} op due to it not being compatible due to hardware/software issue. {self.error_log}"
            )
        try:
            import ninja  # noqa: F401
        except ImportError as err:
            raise RuntimeError(f"Unable to JIT load the {self.name} op due to ninja not being installed.") from err

        if isinstance(self, CUDAOpBuilder) and not self.is_rocm_pytorch():
            try:
                assert_no_cuda_mismatch(self.name)
                self.build_for_cpu = False
            except Exception:
                # No usable CUDA toolkit for the installed torch: build the CPU variant instead.
                self.build_for_cpu = True

        self.jit_mode = True
        from torch.utils.cpp_extension import load

        start_build = time.time()
        sources = [self.deepspeed_src_path(path) for path in self.sources()]
        extra_include_paths = [self.deepspeed_src_path(path) for path in self.include_paths()]

        # Torch will try and apply whatever CCs are in the arch list at compile time,
        # we have already set the intended targets ourselves we know that will be
        # needed at runtime. This prevents CC collisions such as multiple __half
        # implementations. Stash arch list to reset after build.
        torch_arch_list = None
        if "TORCH_CUDA_ARCH_LIST" in os.environ:
            torch_arch_list = os.environ.get("TORCH_CUDA_ARCH_LIST")
            os.environ["TORCH_CUDA_ARCH_LIST"] = ""

        try:
            op_module = load(name=self.name,
                             sources=self.strip_empty_entries(sources),
                             extra_include_paths=self.strip_empty_entries(extra_include_paths),
                             extra_cflags=self.strip_empty_entries(self.cxx_args()),
                             extra_cuda_cflags=self.strip_empty_entries(self.nvcc_args()),
                             extra_ldflags=self.strip_empty_entries(self.extra_ldflags()),
                             verbose=verbose)
        finally:
            # Reset arch list so we are not silently removing it for other possible use cases,
            # even when the build itself failed.
            if torch_arch_list:
                os.environ["TORCH_CUDA_ARCH_LIST"] = torch_arch_list

        build_duration = time.time() - start_build
        if verbose:
            print(f"Time to load {self.name} op: {build_duration} seconds")

        return op_module


class CUDAOpBuilder(OpBuilder):
aiss's avatar
aiss committed
504

505
    def compute_capability_args(self, cross_compile_archs=None):
        """
        Returns nvcc compute capability compile flags.

        1. `TORCH_CUDA_ARCH_LIST` takes priority over `cross_compile_archs`.
        2. If neither is set default compute capabilities will be used
        3. Under `jit_mode` compute capabilities of all visible cards will be used plus PTX

        Format:

        - `TORCH_CUDA_ARCH_LIST` may use ; or whitespace separators. Examples:

        TORCH_CUDA_ARCH_LIST="6.1;7.5;8.6" pip install ...
        TORCH_CUDA_ARCH_LIST="6.0 6.1 7.0 7.5 8.0 8.6+PTX" pip install ...

        - `cross_compile_archs` uses ; separator.

        Raises RuntimeError when no compute capability is left after `filter_ccs`.
        """
        ccs = []
        if self.jit_mode:
            # Compile for underlying architectures since we know those at runtime
            detected = set()
            for i in range(torch.cuda.device_count()):
                detected.add(tuple(torch.cuda.get_device_capability(i)))
            # Sort numerically on (major, minor); a string sort would place "10.0" before "8.0".
            ccs = [f"{cc_major}.{cc_minor}" for cc_major, cc_minor in sorted(detected)]
            # Only the highest capability gets PTX; with no visible device there is nothing to tag.
            if ccs:
                ccs[-1] += '+PTX'
        else:
            # Cross-compile mode, compile for various architectures
            # env override takes priority
            cross_compile_archs_env = os.environ.get('TORCH_CUDA_ARCH_LIST', None)
            if cross_compile_archs_env is not None:
                if cross_compile_archs is not None:
                    print(
                        f"{WARNING} env var `TORCH_CUDA_ARCH_LIST={cross_compile_archs_env}` overrides `cross_compile_archs={cross_compile_archs}`"
                    )
                cross_compile_archs = cross_compile_archs_env.replace(' ', ';')
            else:
                if cross_compile_archs is None:
                    cross_compile_archs = get_default_compute_capabilities()
            # Repeated or trailing separators produce empty entries; drop them.
            ccs = [cc.strip() for cc in cross_compile_archs.split(';') if cc.strip()]

        ccs = self.filter_ccs(ccs)
        if len(ccs) == 0:
            raise RuntimeError(
                f"Unable to load {self.name} op due to no compute capabilities remaining after filtering")

        args = []
        for cc in ccs:
            # "8.6+PTX" -> "86", "10.0" -> "100": strip the PTX marker, then join major and minor.
            cc_major, _, cc_minor = cc.split('+')[0].partition('.')
            num = cc_major + cc_minor
            args.append(f'-gencode=arch=compute_{num},code=sm_{num}')
            if cc.endswith('+PTX'):
                args.append(f'-gencode=arch=compute_{num},code=compute_{num}')

        return args

aiss's avatar
aiss committed
562
563
564
565
566
567
568
    def filter_ccs(self, ccs: List[str]):
        """
        Hook that lets a builder discard compute capabilities it cannot be compiled for.

        The default keeps everything and hands back the list it was given. Overrides are expected
        to log the capabilities they remove.
        """
        retained = ccs
        return retained

569
570
571
572
573
574
575
576
577
578
579
580
581
    def version_dependent_macros(self):
        """
        Return the `-DVERSION_GE_1_x` defines (x in 1, 3, 5) that the installed torch version satisfies.

        Fix from apex that might be relevant for us as well, related to
        https://github.com/NVIDIA/apex/issues/456
        """
        installed = (TORCH_MAJOR, TORCH_MINOR)
        thresholds = (
            ((1, 1), '-DVERSION_GE_1_1'),
            ((1, 3), '-DVERSION_GE_1_3'),
            ((1, 5), '-DVERSION_GE_1_5'),
        )
        macros = []
        for minimum, define in thresholds:
            if installed >= minimum:
                macros.append(define)
        return macros

aiss's avatar
aiss committed
582
583
    def is_compatible(self, verbose=True):
        return super().is_compatible(verbose)
584
585

    def builder(self):
        """
        Return the torch extension object describing this op for an ahead-of-time build.

        On non-ROCm torch the installed CUDA toolkit is checked first with assert_no_cuda_mismatch();
        any exception it raises propagates to the caller. On ROCm torch a CUDAExtension is always
        used and absolute source paths are converted back to paths relative to the package root.
        """
        if not self.is_rocm_pytorch():
            # assert_no_cuda_mismatch() is only expected to return True or raise, so the CPU-only
            # branch below is taken only if it ever returns a falsy value.
            self.build_for_cpu = not assert_no_cuda_mismatch(self.name)
            if self.build_for_cpu:
                from torch.utils.cpp_extension import CppExtension as ExtensionBuilder
            else:
                from torch.utils.cpp_extension import CUDAExtension as ExtensionBuilder
        else:
            from torch.utils.cpp_extension import CUDAExtension as ExtensionBuilder

        compile_args = {'cxx': self.strip_empty_entries(self.cxx_args())}
        if not self.build_for_cpu:
            # nvcc flags are only meaningful when CUDA sources are actually compiled.
            compile_args['nvcc'] = self.strip_empty_entries(self.nvcc_args())

        cuda_ext = ExtensionBuilder(name=self.absolute_name(),
                                    sources=self.strip_empty_entries(self.sources()),
                                    include_dirs=self.strip_empty_entries(self.include_paths()),
                                    libraries=self.strip_empty_entries(self.libraries_args()),
                                    extra_compile_args=compile_args)

        if self.is_rocm_pytorch():
            # hip converts paths to absolute, this converts back to relative
            sources = cuda_ext.sources
            curr_file = Path(__file__).parent.parent  # ds root
            for i, source in enumerate(sources):
                src = Path(source)
                if src.is_absolute():
                    sources[i] = str(src.relative_to(curr_file))
                else:
                    sources[i] = str(src)
            cuda_ext.sources = sources
        return cuda_ext

    def hipify_extension(self):
        """Run torch's ``hipify`` over this op's sources when building for ROCm.

        On a non-ROCm PyTorch this is a no-op and returns ``None`` immediately.
        On ROCm the translation is done in place: the current working
        directory is used as both project and output directory, and the
        absolute paths of ``self.sources()`` are passed as the extra files.

        The keyword arguments handed to ``hipify_python.hipify`` depend on the
        installed torch version: for ``1.13*`` the call passes
        ``header_include_dirs``, ``show_detailed`` and
        ``hipify_extra_files_only``; every other version gets the older call
        form with ``show_progress``.
        """
        # Guard first: ``hipify_python`` is only imported for ROCm builds, so
        # nothing below may run on a CUDA/CPU build.
        if not self.is_rocm_pytorch():
            return

        import torch
        from torch.utils.hipify import hipify_python

        cwd = os.getcwd()
        abs_sources = [os.path.abspath(s) for s in self.sources()]
        includes = [os.path.join(cwd, '*')] + abs_sources

        torch_version = torch.__version__
        print("torch version: ", torch.__version__)
        # NOTE(review): only an exact "1.13" prefix selects the newer call
        # form; confirm which form later torch releases should use.
        if torch_version.startswith('1.13'):
            print("compile with torch1.13!")
            hipify_python.hipify(
                project_directory=cwd,
                output_directory=cwd,
                header_include_dirs=self.include_paths(),
                includes=includes,
                extra_files=abs_sources,
                show_detailed=True,
                is_pytorch_extension=True,
                hipify_extra_files_only=True,
            )
        else:
            print("compile with torch1.10")
            hipify_python.hipify(
                project_directory=cwd,
                output_directory=cwd,
                includes=includes,
                extra_files=abs_sources,
                show_progress=True,
                is_pytorch_extension=True,
            )

    def cxx_args(self):
        """Return the host C++ compiler flags for the current platform.

        Windows gets only ``-O2``; every other platform gets the ``-O3``,
        C++14, debug-info and ``-Wno-reorder`` set.
        """
        on_windows = sys.platform == "win32"
        return ['-O2'] if on_windows else ['-O3', '-std=c++14', '-g', '-Wno-reorder']

    def nvcc_args(self):
        """Return the device-compiler flags for this op.

        Returns:
            ``[]`` when ``self.build_for_cpu`` is set. Otherwise a list that
            starts with ``-O3`` followed by either the ROCm flag set (HIP half
            operator un-defines, ``ROCM_VERSION_MAJOR/MINOR`` defines taken
            from ``self.installed_rocm_version()``, and two extra flags) or
            the CUDA flag set plus ``self.compute_capability_args()``.

        The CUDA list may contain an empty string on non-Windows platforms
        (the slot used for ``-allow-unsupported-compiler``); that entry is
        kept as-is here.
        """
        if self.build_for_cpu:
            return []

        args = ['-O3']
        if self.is_rocm_pytorch():
            rocm_major, rocm_minor = self.installed_rocm_version()
            args += [
                '-std=c++14',
                '-U__HIP_NO_HALF_OPERATORS__',
                '-U__HIP_NO_HALF_CONVERSIONS__',
                '-U__HIP_NO_HALF2_OPERATORS__',
                f'-DROCM_VERSION_MAJOR={rocm_major}',
                f'-DROCM_VERSION_MINOR={rocm_minor}',
                '--gpu-max-threads-per-block=1024',
                '-mllvm -amdgpu-enable-flat-scratch=false',
            ]
        else:
            cuda_major, _ = installed_cuda_version()
            on_windows = sys.platform == "win32"
            args += [
                '-allow-unsupported-compiler' if on_windows else '',
                '--use_fast_math',
                '-std=c++17' if on_windows and cuda_major > 10 else '-std=c++14',
                '-U__CUDA_NO_HALF_OPERATORS__',
                '-U__CUDA_NO_HALF_CONVERSIONS__',
                '-U__CUDA_NO_HALF2_OPERATORS__',
            ]
            # Opt-in verbose ptxas output for debugging CUDA builds.
            if os.environ.get('DS_DEBUG_CUDA_BUILD', '0') == '1':
                args.append('--ptxas-options=-v')
            args += self.compute_capability_args()
        return args

    def libraries_args(self):
        """Return the library names to link the extension against.

        Returns:
            ``[]`` for CPU-only builds (``self.build_for_cpu``),
            ``['cublas', 'curand']`` on Windows, and ``['iomp5']`` on every
            other platform.
        """
        if self.build_for_cpu:
            return []
        if sys.platform == "win32":
            return ['cublas', 'curand']
        return ['iomp5']


class TorchCPUOpBuilder(CUDAOpBuilder):
aiss's avatar
aiss committed
711

aiss's avatar
aiss committed
712
    def extra_ldflags(self):
        """Return extra linker flags for this op.

        Returns:
            ``['-fopenmp']`` for CPU-only builds, ``['-lcurand']`` for
            non-ROCm GPU builds, and ``['-liomp5']`` for ROCm builds.
        """
        if self.build_for_cpu:
            return ['-fopenmp']
        if not self.is_rocm_pytorch():
            return ['-lcurand']
        return ['-liomp5']

    def cxx_args(self):
        """Return host C++ compiler flags for a CPU op that may link GPU runtimes.

        For GPU builds (``self.build_for_cpu`` is false) the list starts with
        the parent class's ``cxx_args()`` followed by a ``-L`` entry for the
        toolkit library directory (``CUDA_HOME/lib64``, or ``ROCM_HOME/lib``
        on ROCm), ``-lcudart``, ``-lcublas`` and ``-g``.

        For every build the list then ends with, in order,
        ``self.cpu_arch()``, ``-fopenmp``, ``self.simd_width()`` and
        ``self.is_cuda_enable()``.
        """
        args = []
        if not self.build_for_cpu:
            # Import the submodule explicitly: a bare ``import torch`` does not
            # guarantee ``torch.utils.cpp_extension`` has been loaded.
            import torch.utils.cpp_extension

            if not self.is_rocm_pytorch():
                lib_dir = os.path.join(torch.utils.cpp_extension.CUDA_HOME, "lib64")
            else:
                lib_dir = os.path.join(torch.utils.cpp_extension.ROCM_HOME, "lib")

            args += super().cxx_args()
            args += [
                f'-L{lib_dir}',
                '-lcudart',
                '-lcublas',
                '-g',
            ]

        args += [
            self.cpu_arch(),
            '-fopenmp',
            self.simd_width(),
            self.is_cuda_enable(),
        ]
        return args