import os
import re
import subprocess

from setuptools import Extension, find_packages, setup

# PyTorch must already be installed: its build helpers are needed to compile
# the extensions, and releases older than 1.10 are rejected outright.
try:
    import torch
    from torch.utils.cpp_extension import CUDA_HOME, BuildExtension, CUDAExtension
except ImportError:
    raise ModuleNotFoundError('torch is not found. You need to install PyTorch before installing Colossal-AI.')
else:
    print(f"\n\ntorch.__version__  = {torch.__version__}\n\n")
    TORCH_MAJOR = int(torch.__version__.split('.')[0])
    TORCH_MINOR = int(torch.__version__.split('.')[1])

    # Lexicographic tuple comparison: anything below (1, 10) is too old.
    if (TORCH_MAJOR, TORCH_MINOR) < (1, 10):
        raise RuntimeError("Colossal-AI requires Pytorch 1.10 or newer.\n"
                           "The latest stable release can be obtained from https://pytorch.org/")

# Absolute directory of this script; ninja builds do not work unless
# include_dirs are absolute paths.
this_dir = os.path.dirname(os.path.abspath(__file__))

# Extension modules handed to setup(); stays empty when CUDA kernels are skipped.
ext_modules = []

# CUDA extensions are built by default; exporting NO_CUDA_EXT=1 opts out.
build_cuda_ext = int(os.environ.get('NO_CUDA_EXT', '0')) != 1


def get_cuda_bare_metal_version(cuda_dir):
    """Run ``nvcc -V`` from ``cuda_dir`` and extract the CUDA release version.

    Args:
        cuda_dir: Root of the CUDA toolkit; the compiler is invoked as
            ``<cuda_dir>/bin/nvcc``.

    Returns:
        A ``(raw_output, major, minor)`` tuple. ``raw_output`` is the complete
        text printed by ``nvcc -V``; ``major`` and ``minor`` are the digit
        strings taken from the token following the word ``release``
        (``"11"`` and ``"3"`` for ``release 11.3,``).

    Raises:
        OSError, subprocess.CalledProcessError: If nvcc cannot be executed or
            exits with a non-zero status.
        ValueError: If the output contains no ``release`` token.
    """
    raw_output = subprocess.check_output([cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True)
    tokens = raw_output.split()
    release = tokens[tokens.index("release") + 1].split(".")
    bare_metal_major = release[0]
    # The minor component carries trailing punctuation (e.g. "3,"). Keep every
    # leading digit rather than only the first character, so that a two-digit
    # minor such as "10," is reported as "10" and not "1".
    bare_metal_minor = re.match(r"\d*", release[1]).group()

    return raw_output, bare_metal_major, bare_metal_minor


def check_cuda_torch_binary_vs_bare_metal(cuda_dir):
    """Compare the local nvcc release with the CUDA version PyTorch was built with.

    The nvcc banner is always printed. A differing major version prints an
    explanation and disables the extension build; a differing minor version
    only prints a warning.

    Args:
        cuda_dir: Root of the CUDA toolkit whose ``bin/nvcc`` is queried.

    Returns:
        ``False`` when the major versions differ, ``True`` otherwise.
    """
    nvcc_banner, nvcc_major, nvcc_minor = get_cuda_bare_metal_version(cuda_dir)
    torch_cuda_fields = torch.version.cuda.split(".")
    torch_major, torch_minor = torch_cuda_fields[0], torch_cuda_fields[1]

    print("\nCompiling cuda extensions with")
    print(f"{nvcc_banner}from {cuda_dir}/bin\n")

    majors_match = nvcc_major == torch_major
    if not majors_match:
        print(f'The detected CUDA version ({nvcc_banner}) mismatches the version that was used to compile PyTorch '
              f'({torch.version.cuda}). CUDA extension will not be installed.')
        return False

    minors_match = nvcc_minor == torch_minor
    if not minors_match:
        # A minor mismatch is tolerated; it is reported but does not stop the build.
        print("\nWarning: Cuda extensions are being compiled with a version of Cuda that does "
              "not match the version used to compile Pytorch binaries.  "
              f"Pytorch binaries were compiled with Cuda {torch.version.cuda}.\n"
              "In some cases, a minor-version mismatch will not cause later errors:  "
              "https://github.com/NVIDIA/apex/pull/323#discussion_r287021798. ")
    return True


def check_cuda_availability(cuda_dir):
    """Decide whether the CUDA extensions can be built on this machine.

    Args:
        cuda_dir: Root of the CUDA toolkit, or ``None`` when no toolkit was
            located.

    Returns:
        ``True`` only when torch reports an available GPU and ``cuda_dir`` is
        set. In every other case the reason is printed and ``False`` is
        returned.

    Side effects:
        When no GPU is visible, ``TORCH_CUDA_ARCH_LIST`` is unset and a
        toolkit directory is known, a default architecture list is exported
        through ``os.environ``.
    """
    if not torch.cuda.is_available():
        # https://github.com/NVIDIA/apex/issues/486
        # Extension builds after https://github.com/pytorch/pytorch/pull/23408 attempt to query
        # torch.cuda.get_device_capability(), which will fail if you are compiling in an environment
        # without visible GPUs (e.g. during an nvidia-docker build command).
        print('\nWarning: Torch did not find available GPUs on this system.\n'
              'If your intention is to cross-compile, this is not an error.\n'
              'By default, Colossal-AI will cross-compile for Pascal (compute capabilities 6.0, 6.1, 6.2),\n'
              'Volta (compute capability 7.0), Turing (compute capability 7.5),\n'
              'and, if the CUDA version is >= 11.0, Ampere (compute capability 8.0).\n'
              'If you wish to cross-compile for a single specific architecture,\n'
              'export TORCH_CUDA_ARCH_LIST="compute capability" before running setup.py.\n')
        # Only probe nvcc when a toolkit directory is known; without this guard
        # a missing toolkit raised TypeError instead of skipping the build.
        if cuda_dir is not None and os.environ.get("TORCH_CUDA_ARCH_LIST") is None:
            _, bare_metal_major, _ = get_cuda_bare_metal_version(cuda_dir)
            arch_list = "6.0;6.1;6.2;7.0;7.5"
            # ">= 11" matches the message above; "== 11" dropped Ampere for CUDA 12+.
            if int(bare_metal_major) >= 11:
                arch_list += ";8.0"
            os.environ["TORCH_CUDA_ARCH_LIST"] = arch_list
        return False

    if cuda_dir is None:
        print("nvcc was not found. CUDA extension will not be installed. If you're installing within a container from "
              "https://hub.docker.com/r/pytorch/pytorch, only images whose names contain 'devel' will provide nvcc.")
        return False
    return True


def append_nvcc_threads(nvcc_extra_args):
    """Add ``--threads 4`` to the nvcc flags when the local toolkit is CUDA 11.2 or newer.

    Args:
        nvcc_extra_args: List of nvcc command-line flags.

    Returns:
        A new list with ``["--threads", "4"]`` appended when the toolkit found
        under ``CUDA_HOME`` is at least 11.2; otherwise ``nvcc_extra_args``
        itself, unchanged.
    """
    _, bare_metal_major, bare_metal_minor = get_cuda_bare_metal_version(CUDA_HOME)
    # Compare (major, minor) as one tuple. Testing the two parts independently
    # wrongly rejected releases such as 12.0, whose minor is below 2.
    if (int(bare_metal_major), int(bare_metal_minor)) >= (11, 2):
        return nvcc_extra_args + ["--threads", "4"]
    return nvcc_extra_args


def fetch_requirements(path):
    """Return every line of the file at ``path`` with surrounding whitespace removed.

    Blank lines are kept and appear as empty strings in the result.
    """
    with open(path, 'r') as requirements_file:
        raw_lines = requirements_file.readlines()
    return list(map(str.strip, raw_lines))


def fetch_readme():
    """Return the UTF-8 decoded text of ``README.md`` in the current working directory."""
    with open('README.md', encoding='utf-8') as readme_file:
        content = readme_file.read()
    return content


def get_version():
    """Build the package version string, write it to ``colossalai/version.py`` and return it.

    The base version is the stripped content of ``version.txt`` next to this
    script. When ``build_cuda_ext`` is true, the suffix
    ``+torch<major.minor>cu<major>.<minor>`` is appended, using the installed
    torch version and the nvcc release found under ``CUDA_HOME``.
    """
    project_path = os.path.dirname(os.path.abspath(__file__))

    with open(os.path.join(project_path, 'version.txt')) as version_txt:
        version = version_txt.read().strip()

    if build_cuda_ext:
        torch_version = '.'.join(torch.__version__.split('.')[:2])
        _, cuda_major, cuda_minor = get_cuda_bare_metal_version(CUDA_HOME)
        version = f'{version}+torch{torch_version}cu{cuda_major}.{cuda_minor}'

    # write version into version.py
    with open(os.path.join(project_path, 'colossalai/version.py'), 'w') as version_py:
        version_py.write(f"__version__ = '{version}'\n")

    return version


# Even when requested, the CUDA build is only attempted if a GPU and nvcc are
# available and nvcc's major version matches the one torch was compiled with.
if build_cuda_ext:
    build_cuda_ext = check_cuda_availability(CUDA_HOME) and check_cuda_torch_binary_vs_bare_metal(CUDA_HOME)

if build_cuda_ext:
    # Set up macros for forward/backward compatibility hack around
    # https://github.com/pytorch/pytorch/commit/4404762d7dd955383acee92e6f06b48144a0742e
    # and
    # https://github.com/NVIDIA/apex/issues/456
    # https://github.com/pytorch/pytorch/commit/eb7b39e02f7d75c26d8a795ea8c7fd911334da7e#diff-4632522f237f1e4e728cb824300403ac
    version_dependent_macros = ['-DVERSION_GE_1_1', '-DVERSION_GE_1_3', '-DVERSION_GE_1_5']

    def cuda_ext_helper(name, sources, extra_cuda_flags, extra_cxx_flags=None):
        """Create a ``CUDAExtension`` for one Colossal-AI native kernel module.

        Args:
            name: Dotted import name of the extension module.
            sources: Source file paths relative to
                ``colossalai/kernel/cuda_native/csrc``.
            extra_cuda_flags: Flags appended to the common nvcc flags.
            extra_cxx_flags: Flags appended to the common host-compiler flags;
                ``None`` (the default) means no extra flags.

        Returns:
            The configured ``CUDAExtension`` instance.
        """
        # A None sentinel replaces the former mutable ``[]`` default.
        if extra_cxx_flags is None:
            extra_cxx_flags = []
        return CUDAExtension(
            name=name,
            sources=[os.path.join('colossalai/kernel/cuda_native/csrc', path) for path in sources],
            include_dirs=[os.path.join(this_dir, 'colossalai/kernel/cuda_native/csrc/kernels/include')],
            extra_compile_args={
                'cxx': ['-O3'] + version_dependent_macros + extra_cxx_flags,
                'nvcc': append_nvcc_threads(['-O3', '--use_fast_math'] + version_dependent_macros + extra_cuda_flags)
            })

    # One -gencode pair for every sm_XX architecture (XX >= 60) that the
    # installed torch reports through torch.cuda.get_arch_list().
    cc_flag = []
    for arch in torch.cuda.get_arch_list():
        res = re.search(r'sm_(\d+)', arch)
        if res:
            arch_cap = res[1]
            if int(arch_cap) >= 60:
                cc_flag.extend(['-gencode', f'arch=compute_{arch_cap},code={arch}'])

    extra_cuda_flags = ['-lineinfo']

    ext_modules.append(
        cuda_ext_helper('colossalai._C.fused_optim', [
            'colossal_C_frontend.cpp', 'multi_tensor_sgd_kernel.cu', 'multi_tensor_scale_kernel.cu',
            'multi_tensor_adam.cu', 'multi_tensor_l2norm_kernel.cu', 'multi_tensor_lamb.cu'
        ], extra_cuda_flags + cc_flag))

    extra_cuda_flags = [
        '-U__CUDA_NO_HALF_OPERATORS__', '-U__CUDA_NO_HALF_CONVERSIONS__', '--expt-relaxed-constexpr',
        '--expt-extended-lambda'
    ]

    ext_modules.append(
        cuda_ext_helper('colossalai._C.scaled_upper_triang_masked_softmax',
                        ['scaled_upper_triang_masked_softmax.cpp', 'scaled_upper_triang_masked_softmax_cuda.cu'],
                        extra_cuda_flags + cc_flag))

    ext_modules.append(
        cuda_ext_helper('colossalai._C.scaled_masked_softmax',
                        ['scaled_masked_softmax.cpp', 'scaled_masked_softmax_cuda.cu'], extra_cuda_flags + cc_flag))

    ext_modules.append(
        cuda_ext_helper('colossalai._C.moe', ['moe_cuda.cpp', 'moe_cuda_kernel.cu'], extra_cuda_flags + cc_flag))

    extra_cuda_flags = ['-maxrregcount=50']

    ext_modules.append(
        cuda_ext_helper('colossalai._C.layer_norm', ['layer_norm_cuda.cpp', 'layer_norm_cuda_kernel.cu'],
                        extra_cuda_flags + cc_flag))

    extra_cuda_flags = [
        '-std=c++14', '-U__CUDA_NO_HALF_OPERATORS__', '-U__CUDA_NO_HALF_CONVERSIONS__', '-U__CUDA_NO_HALF2_OPERATORS__',
        '-DTHRUST_IGNORE_CUB_VERSION_CHECK'
    ]

    ext_modules.append(
        cuda_ext_helper('colossalai._C.multihead_attention', [
            'multihead_attention_1d.cpp', 'kernels/cublas_wrappers.cu', 'kernels/transform_kernels.cu',
            'kernels/dropout_kernels.cu', 'kernels/normalize_kernels.cu', 'kernels/softmax_kernels.cu',
            'kernels/general_kernels.cu', 'kernels/cuda_util.cu'
        ], extra_cuda_flags + cc_flag))

    extra_cxx_flags = ['-std=c++14', '-lcudart', '-lcublas', '-g', '-Wno-reorder', '-fopenmp', '-march=native']
    # NOTE(review): cpu_optim reuses the multihead_attention nvcc flags and,
    # unlike the other modules, does not add cc_flag - confirm this is intended.
    ext_modules.append(cuda_ext_helper('colossalai._C.cpu_optim', ['cpu_adam.cpp'], extra_cuda_flags, extra_cxx_flags))

# Directory patterns that find_packages() must not pick up as packages.
# NOTE(review): 'tests' is listed twice; the duplicate is kept as found.
excluded_packages = (
    'benchmark',
    'docker',
    'tests',
    'docs',
    'examples',
    'tests',
    'scripts',
    'requirements',
    '*.egg-info',
)

# Links passed to setup() as project_urls.
project_links = {
    'Forum': 'https://github.com/hpcaitech/ColossalAI/discussions',
    'Bug Tracker': 'https://github.com/hpcaitech/ColossalAI/issues',
    'Examples': 'https://github.com/hpcaitech/ColossalAI-Examples',
    'Documentation': 'http://colossalai.readthedocs.io',
    'Github': 'https://github.com/hpcaitech/ColossalAI',
}

# Trove classifiers passed to setup().
trove_classifiers = [
    'Programming Language :: Python :: 3',
    'License :: OSI Approved :: Apache Software License',
    'Environment :: GPU :: NVIDIA CUDA',
    'Topic :: Scientific/Engineering :: Artificial Intelligence',
    'Topic :: System :: Distributed Computing',
]

# BuildExtension is registered only when at least one extension module was
# collected; otherwise no custom build command is installed.
setup(
    name='colossalai',
    version=get_version(),
    packages=find_packages(exclude=excluded_packages),
    description='An integrated large-scale model training system with efficient parallelization techniques',
    long_description=fetch_readme(),
    long_description_content_type='text/markdown',
    license='Apache Software License 2.0',
    url='https://www.colossalai.org',
    project_urls=project_links,
    ext_modules=ext_modules,
    cmdclass={'build_ext': BuildExtension} if ext_modules else {},
    install_requires=fetch_requirements('requirements/requirements.txt'),
    entry_points='''
        [console_scripts]
        colossalai=colossalai.cli:cli
    ''',
    python_requires='>=3.6',
    classifiers=trove_classifiers,
    package_data={'colossalai': ['_C/*.pyi']},
)