# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0

# DeepSpeed Team
from .builder import CUDAOpBuilder, installed_cuda_version

class InferenceBuilder(CUDAOpBuilder):
    """Op builder for the DeepSpeed transformer-inference CUDA kernels."""

    BUILD_VAR = "DS_BUILD_TRANSFORMER_INFERENCE"
    NAME = "transformer_inference"

    def __init__(self, name=None):
        # Default to the canonical op name unless the caller overrides it.
        super().__init__(name=self.NAME if name is None else name)

    def absolute_name(self):
        # Fully-qualified module path under which the compiled op is exposed.
        return f'deepspeed.ops.transformer.inference.{self.NAME}_op'

    def is_compatible(self, verbose=True):
        """Report whether this environment can build/run the inference kernels.

        Requires torch to be importable.  On CUDA (non-ROCm) systems with a
        visible GPU, additionally requires a Pascal (SM 6.0) or newer device,
        and CUDA 11+ (both the system toolkit and the version torch was built
        against) when the device is Ampere (SM 8.0) or newer.
        """
        try:
            import torch
        except ImportError:
            self.warning("Please install torch if trying to pre-compile inference kernels")
            return False

        cuda_okay = True
        # ROCm builds (and CUDA-less hosts) skip the CUDA-specific checks.
        if not self.is_rocm_pytorch() and torch.cuda.is_available():
            sys_cuda_major, _ = installed_cuda_version()
            torch_cuda_major = int(torch.version.cuda.split('.')[0])
            device_major = torch.cuda.get_device_properties(0).major
            if device_major < 6:
                self.warning("NVIDIA Inference is only supported on Pascal and newer architectures")
                cuda_okay = False
            # Ampere+ kernels need CUDA 11 from both the toolkit and torch.
            if device_major >= 8 and (torch_cuda_major < 11 or sys_cuda_major < 11):
                self.warning("On Ampere and higher architectures please use CUDA 11+")
                cuda_okay = False
        return super().is_compatible(verbose) and cuda_okay

    def filter_ccs(self, ccs):
        """Drop compute capabilities below 6.0, warning about anything pruned."""
        retained = [cc for cc in ccs if int(cc[0]) >= 6]
        pruned = [cc for cc in ccs if int(cc[0]) < 6]
        if pruned:
            self.warning(f"Filtered compute capabilities {pruned}")
        return retained

    def sources(self):
        # CUDA kernel sources plus the pybind11 entry point.
        return [
            'csrc/transformer/inference/csrc/pt_binding.cpp',
            'csrc/transformer/inference/csrc/gelu.cu',
            'csrc/transformer/inference/csrc/relu.cu',
            'csrc/transformer/inference/csrc/layer_norm.cu',
            'csrc/transformer/inference/csrc/softmax.cu',
            'csrc/transformer/inference/csrc/dequantize.cu',
            'csrc/transformer/inference/csrc/apply_rotary_pos_emb.cu',
            'csrc/transformer/inference/csrc/transform.cu',
        ]

    def extra_ldflags(self):
        # cuRAND only exists (and is only needed) on CUDA builds, not ROCm.
        return [] if self.is_rocm_pytorch() else ['-lcurand']

    def include_paths(self):
        return ['csrc/transformer/inference/includes', 'csrc/includes']