setup.py 25.4 KB
Newer Older
1
import torch
2
from setuptools import setup, find_packages
mcarilli's avatar
mcarilli committed
3
import subprocess
4

jjsjann123's avatar
jjsjann123 committed
5
import sys
Marek Kolodziej's avatar
Marek Kolodziej committed
6
import warnings
mcarilli's avatar
mcarilli committed
7
import os
jjsjann123's avatar
jjsjann123 committed
8

9
10
11
# ninja build does not work unless include_dirs are abs path
this_dir = os.path.dirname(os.path.abspath(__file__))

12
if not torch.cuda.is_available():
    # https://github.com/NVIDIA/apex/issues/486
    # Extension builds after https://github.com/pytorch/pytorch/pull/23408 attempt to query
    # torch.cuda.get_device_capability(), which will fail if you are compiling in an
    # environment without visible GPUs (e.g. during an nvidia-docker build command).
    #
    # The notice is a single implicitly-concatenated string.  Passing the pieces as
    # separate print() arguments would make print() insert its separator (a space)
    # at the start of a line.
    print('\nWarning: Torch did not find available GPUs on this system.\n'
          'If your intention is to cross-compile, this is not an error.\n'
          'By default, Apex will cross-compile for Pascal (compute capabilities 6.0, 6.1, 6.2),\n'
          'Volta (compute capability 7.0), and Turing (compute capability 7.5).\n'
          'If you wish to cross-compile for a single specific architecture,\n'
          'export TORCH_CUDA_ARCH_LIST="compute capability" before running setup.py.\n')
    # Keep an architecture list exported by the user; otherwise fall back to the
    # default set advertised in the notice above.
    os.environ.setdefault("TORCH_CUDA_ARCH_LIST", "6.0;6.1;6.2;7.0;7.5")
24

25
# Report the PyTorch build this setup script is running against.
print("\n\ntorch.__version__  = {}\n\n".format(torch.__version__))

# Integer major/minor components of the PyTorch version, used by the
# version-dependent checks further down in this file.
TORCH_MAJOR, TORCH_MINOR = map(int, torch.__version__.split('.')[:2])

# Refuse to continue on PyTorch releases older than 0.4.
if TORCH_MAJOR == 0 and TORCH_MINOR < 4:
    raise RuntimeError(
        "Apex requires Pytorch 0.4 or newer.\n"
        "The latest stable release can be obtained from https://pytorch.org/"
    )

jjsjann123's avatar
jjsjann123 committed
33
34
35
# Containers populated by the command-line option handling in this file and
# passed to setup() at the bottom: custom commands, compiled extensions, and
# optional dependency groups.
cmdclass = {}
ext_modules = []
extras = {}

if "--pyprof" in sys.argv:
    string = "\n\nPyprof has been moved to its own dedicated repository and will " + \
             "soon be removed from Apex.  Please visit\n" + \
             "https://github.com/NVIDIA/PyProf\n" + \
             "for the latest version."
    warnings.warn(string, DeprecationWarning)

    # The PyProf extra installs everything listed in requirements.txt.
    with open('requirements.txt') as f:
        required_packages = f.read().splitlines()
    extras['pyprof'] = required_packages

    # The membership test above guarantees the flag is present, so remove()
    # cannot raise here; no exception handler is needed.  The flag is stripped
    # so that it is not passed on to setup() as an unknown option.
    sys.argv.remove("--pyprof")
else:
    warnings.warn("Option --pyprof not specified. Not installing PyProf dependencies!")

53
# Either extension flag means compiled modules are requested, which are built
# through torch's BuildExtension command.
if any(flag in sys.argv for flag in ("--cpp_ext", "--cuda_ext")):
    if TORCH_MAJOR == 0:
        raise RuntimeError("--cpp_ext requires Pytorch 1.0 or later, "
                           "found torch.__version__ = {}".format(torch.__version__))
    from torch.utils.cpp_extension import BuildExtension
    cmdclass['build_ext'] = BuildExtension

# --cpp_ext: register the C++-only flatten/unflatten helper module.
if "--cpp_ext" in sys.argv:
    from torch.utils.cpp_extension import CppExtension
    sys.argv.remove("--cpp_ext")
    apex_c_sources = ['csrc/flatten_unflatten.cpp']
    ext_modules.append(CppExtension('apex_C', apex_c_sources))

mcarilli's avatar
mcarilli committed
67
68
69
70
71
72
73
74
75
76
77
78
79
80
def check_cuda_torch_binary_vs_bare_metal(cuda_dir):
    """Check that the local nvcc matches the CUDA version PyTorch was built with.

    Runs ``<cuda_dir>/bin/nvcc -V``, reads the ``major.minor`` token that
    follows the word ``release`` in its output, prints the compiler banner,
    and compares the result with ``torch.version.cuda``.

    Args:
        cuda_dir: Root directory of the CUDA toolkit (the one containing ``bin/nvcc``).

    Raises:
        RuntimeError: If ``torch.version.cuda`` is ``None``, or if the major or
            minor version of nvcc differs from the one recorded in
            ``torch.version.cuda``.
        subprocess.CalledProcessError: If nvcc exits with a non-zero status.
        ValueError: If the nvcc output does not contain the word ``release``.
    """
    raw_output = subprocess.check_output([cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True)
    tokens = raw_output.split()
    # The token after "release" looks like "10.1," -- drop the trailing comma
    # and keep the complete minor component.  Taking only its first character
    # would read a two-digit minor such as "10" as "1".
    release = tokens[tokens.index("release") + 1].rstrip(",").split(".")
    bare_metal_major = release[0]
    bare_metal_minor = release[1]

    if torch.version.cuda is None:
        # Without a recorded CUDA version there is nothing to compare against.
        raise RuntimeError("Cuda extensions were requested, but torch.version.cuda is None, "
                           "so the installed Pytorch binaries do not report a Cuda version.")
    torch_binary_version = torch.version.cuda.split(".")
    torch_binary_major = torch_binary_version[0]
    torch_binary_minor = torch_binary_version[1]

    print("\nCompiling cuda extensions with")
    print(raw_output + "from " + cuda_dir + "/bin\n")

    if (bare_metal_major != torch_binary_major) or (bare_metal_minor != torch_binary_minor):
        raise RuntimeError("Cuda extensions are being compiled with a version of Cuda that does " +
                           "not match the version used to compile Pytorch binaries.  " +
                           "Pytorch binaries were compiled with Cuda {}.\n".format(torch.version.cuda) +
                           "In some cases, a minor-version mismatch will not cause later errors:  " +
                           "https://github.com/NVIDIA/apex/pull/323#discussion_r287021798.  "
                           "You can try commenting out this check (at your own risk).")
mcarilli's avatar
mcarilli committed
87

88
89
90
91
92
93
94
95
def check_if_rocm_pytorch():
    """Return True when the installed PyTorch is a ROCm (HIP) build.

    A ROCm build is reported only when the PyTorch version is at least 1.5,
    ``torch.version.hip`` is set, and ``torch.utils.cpp_extension.ROCM_HOME``
    is set.  ROCM_HOME is imported only after the version check passes.

    Returns:
        bool: True for a ROCm build, False otherwise (including when the
        version string does not start with ``<major>.<minor>``).
    """
    # Local import keeps this helper self-contained.
    import re

    # Compare numeric components: as plain strings, "1.10" sorts before "1.5",
    # which would wrongly classify 1.10+ releases as older than 1.5.
    version_match = re.match(r"(\d+)\.(\d+)", str(torch.__version__))
    if version_match is None:
        return False
    major, minor = (int(component) for component in version_match.groups())
    if (major, minor) < (1, 5):
        return False

    from torch.utils.cpp_extension import ROCM_HOME
    return (torch.version.hip is not None) and (ROCM_HOME is not None)

mcarilli's avatar
mcarilli committed
96
97
98
99
100
101
102
103
104
105
106
# Preprocessor macros for a forward/backward compatibility hack around
# https://github.com/pytorch/pytorch/commit/4404762d7dd955383acee92e6f06b48144a0742e
# and
# https://github.com/NVIDIA/apex/issues/456
# https://github.com/pytorch/pytorch/commit/eb7b39e02f7d75c26d8a795ea8c7fd911334da7e#diff-4632522f237f1e4e728cb824300403ac
#
# Each list holds one -D flag when the installed PyTorch is at least the named
# release and is empty otherwise.
version_ge_1_1 = ['-DVERSION_GE_1_1'] if (TORCH_MAJOR, TORCH_MINOR) >= (1, 1) else []
version_ge_1_3 = ['-DVERSION_GE_1_3'] if (TORCH_MAJOR, TORCH_MINOR) >= (1, 3) else []
version_ge_1_5 = ['-DVERSION_GE_1_5'] if (TORCH_MAJOR, TORCH_MINOR) >= (1, 5) else []

# Combined flag list appended to the compile arguments of the extensions below.
version_dependent_macros = version_ge_1_1 + version_ge_1_3 + version_ge_1_5
mcarilli's avatar
mcarilli committed
111

112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
# --distributed_lamb: build the distributed LAMB optimizer CUDA extension.
if "--distributed_lamb" in sys.argv:
    from torch.utils.cpp_extension import CUDAExtension
    sys.argv.remove("--distributed_lamb")

    from torch.utils.cpp_extension import BuildExtension
    cmdclass['build_ext'] = BuildExtension

    # CUDA_HOME is None when torch could not locate a CUDA toolkit.
    if torch.utils.cpp_extension.CUDA_HOME is None:
        raise RuntimeError("--distributed_lamb was requested, but nvcc was not found.  Are you sure your environment has nvcc available?  If you're installing within a container from https://hub.docker.com/r/pytorch/pytorch, only images whose names contain 'devel' will provide nvcc.")

    distributed_lamb_sources = [
        'apex/contrib/csrc/optimizers/multi_tensor_distopt_lamb.cpp',
        'apex/contrib/csrc/optimizers/multi_tensor_distopt_lamb_kernel.cu',
    ]
    distributed_lamb_args = {
        'cxx': ['-O3'] + version_dependent_macros,
        'nvcc': ['-O3', '--use_fast_math'] + version_dependent_macros,
    }
    ext_modules.append(
        CUDAExtension(name='distributed_lamb_cuda',
                      sources=distributed_lamb_sources,
                      include_dirs=[os.path.join(this_dir, 'csrc')],
                      extra_compile_args=distributed_lamb_args))

jjsjann123's avatar
jjsjann123 committed
131
# --cuda_ext: build the core CUDA/HIP extensions (multi-tensor apply, syncbn,
# fused layer norm, MLP).
if "--cuda_ext" in sys.argv:
    from torch.utils.cpp_extension import CUDAExtension
    sys.argv.remove("--cuda_ext")

    # ROCm detection.  The version gate compares the integer components parsed
    # earlier in this file: as plain strings, "1.10" sorts before "1.5", which
    # would wrongly skip the ROCm check on PyTorch 1.10 and later 1.x releases.
    is_rocm_pytorch = False
    if (TORCH_MAJOR, TORCH_MINOR) >= (1, 5):
        from torch.utils.cpp_extension import ROCM_HOME
        is_rocm_pytorch = (torch.version.hip is not None) and (ROCM_HOME is not None)

    if torch.utils.cpp_extension.CUDA_HOME is None and (not is_rocm_pytorch):
        raise RuntimeError("--cuda_ext was requested, but nvcc was not found.  Are you sure your environment has nvcc available?  If you're installing within a container from https://hub.docker.com/r/pytorch/pytorch, only images whose names contain 'devel' will provide nvcc.")
    else:
        # The nvcc-vs-torch CUDA version comparison only applies to CUDA builds.
        if not is_rocm_pytorch:
            check_cuda_torch_binary_vs_bare_metal(torch.utils.cpp_extension.CUDA_HOME)

        # For each extension, the 'nvcc' entry carries the device-compiler
        # flags: the nvcc set for CUDA builds, the reduced hipcc set for ROCm.
        print ("INFO: Building the multi-tensor apply extension.")
        nvcc_args_multi_tensor = ['-lineinfo', '-O3', '--use_fast_math'] + version_dependent_macros
        hipcc_args_multi_tensor = ['-O3'] + version_dependent_macros
        ext_modules.append(
            CUDAExtension(name='amp_C',
                          sources=['csrc/amp_C_frontend.cpp',
                                   'csrc/multi_tensor_sgd_kernel.cu',
                                   'csrc/multi_tensor_scale_kernel.cu',
                                   'csrc/multi_tensor_axpby_kernel.cu',
                                   'csrc/multi_tensor_l2norm_kernel.cu',
                                   'csrc/multi_tensor_lamb_stage_1.cu',
                                   'csrc/multi_tensor_lamb_stage_2.cu',
                                   'csrc/multi_tensor_adam.cu',
                                   'csrc/multi_tensor_adagrad.cu',
                                   'csrc/multi_tensor_novograd.cu',
                                   'csrc/multi_tensor_lamb.cu'],
                          extra_compile_args={'cxx': ['-O3'] + version_dependent_macros,
                                              'nvcc': nvcc_args_multi_tensor if not is_rocm_pytorch else hipcc_args_multi_tensor}))

        print ("INFO: Building syncbn extension.")
        ext_modules.append(
            CUDAExtension(name='syncbn',
                          sources=['csrc/syncbn.cpp',
                                   'csrc/welford.cu'],
                          extra_compile_args=['-O3'] + version_dependent_macros))

        # nvcc options need their leading dash; without it nvcc would treat
        # "maxrregcount=50" as an input file rather than as an option.
        nvcc_args_layer_norm = ['-maxrregcount=50', '-O3', '--use_fast_math'] + version_dependent_macros
        hipcc_args_layer_norm = ['-O3'] + version_dependent_macros
        print ("INFO: Building fused layernorm extension.")
        ext_modules.append(
            CUDAExtension(name='fused_layer_norm_cuda',
                          sources=['csrc/layer_norm_cuda.cpp',
                                   'csrc/layer_norm_cuda_kernel.cu'],
                          extra_compile_args={'cxx': ['-O3'] + version_dependent_macros,
                                              'nvcc': nvcc_args_layer_norm if not is_rocm_pytorch else hipcc_args_layer_norm}))

        print ("INFO: Building the MLP Extension.")
        ext_modules.append(
            CUDAExtension(name='mlp_cuda',
                          sources=['csrc/mlp.cpp',
                                   'csrc/mlp_cuda.cu'],
                          extra_compile_args=['-O3'] + version_dependent_macros))
188

jjsjann123's avatar
jjsjann123 committed
189
190
191
192
193
194
195
196
# --bnp: build the group batch-norm CUDA extension from apex/contrib.
if "--bnp" in sys.argv:
    from torch.utils.cpp_extension import CUDAExtension
    sys.argv.remove("--bnp")

    from torch.utils.cpp_extension import BuildExtension
    cmdclass['build_ext'] = BuildExtension

    # CUDA_HOME is None when torch could not locate a CUDA toolkit.
    if torch.utils.cpp_extension.CUDA_HOME is None:
        raise RuntimeError("--bnp was requested, but nvcc was not found.  Are you sure your environment has nvcc available?  If you're installing within a container from https://hub.docker.com/r/pytorch/pytorch, only images whose names contain 'devel' will provide nvcc.")

    bnp_sources = [
        'apex/contrib/csrc/groupbn/batch_norm.cu',
        'apex/contrib/csrc/groupbn/ipc.cu',
        'apex/contrib/csrc/groupbn/interface.cpp',
        'apex/contrib/csrc/groupbn/batch_norm_add_relu.cu',
    ]
    bnp_nvcc_defines = [
        '-DCUDA_HAS_FP16=1',
        '-D__CUDA_NO_HALF_OPERATORS__',
        '-D__CUDA_NO_HALF_CONVERSIONS__',
        '-D__CUDA_NO_HALF2_OPERATORS__',
    ]
    ext_modules.append(
        CUDAExtension(name='bnp',
                      sources=bnp_sources,
                      include_dirs=[os.path.join(this_dir, 'csrc')],
                      extra_compile_args={'cxx': list(version_dependent_macros),
                                          'nvcc': bnp_nvcc_defines + version_dependent_macros}))
jjsjann123's avatar
jjsjann123 committed
211

212
213
214
215
216
217
218
# --xentropy: build the cross-entropy extension from apex/contrib.
if "--xentropy" in sys.argv:
    from torch.utils.cpp_extension import CUDAExtension
    sys.argv.remove("--xentropy")

    from torch.utils.cpp_extension import BuildExtension
    cmdclass['build_ext'] = BuildExtension

    is_rocm_pytorch = check_if_rocm_pytorch()

    # A missing CUDA toolkit is only an error when this is not a ROCm build.
    if not is_rocm_pytorch and torch.utils.cpp_extension.CUDA_HOME is None:
        raise RuntimeError("--xentropy was requested, but nvcc was not found.  Are you sure your environment has nvcc available?  If you're installing within a container from https://hub.docker.com/r/pytorch/pytorch, only images whose names contain 'devel' will provide nvcc.")

    print ("INFO: Building the xentropy extension.")
    xentropy_sources = [
        'apex/contrib/csrc/xentropy/interface.cpp',
        'apex/contrib/csrc/xentropy/xentropy_kernel.cu',
    ]
    ext_modules.append(
        CUDAExtension(name='xentropy_cuda',
                      sources=xentropy_sources,
                      include_dirs=[os.path.join(this_dir, 'csrc')],
                      extra_compile_args=['-O3'] + version_dependent_macros))
231
   
232

233
234
235
236
237
238
239
# --deprecated_fused_adam: build the legacy fused Adam extension from apex/contrib.
if "--deprecated_fused_adam" in sys.argv:
    from torch.utils.cpp_extension import CUDAExtension
    sys.argv.remove("--deprecated_fused_adam")

    from torch.utils.cpp_extension import BuildExtension
    cmdclass['build_ext'] = BuildExtension

    is_rocm_pytorch = check_if_rocm_pytorch()

    # A missing CUDA toolkit is only an error when this is not a ROCm build.
    if not is_rocm_pytorch and torch.utils.cpp_extension.CUDA_HOME is None:
        raise RuntimeError("--deprecated_fused_adam was requested, but nvcc was not found.  Are you sure your environment has nvcc available?  If you're installing within a container from https://hub.docker.com/r/pytorch/pytorch, only images whose names contain 'devel' will provide nvcc.")

    print ("INFO: Building deprecated fused adam extension.")
    nvcc_args_fused_adam = ['-O3', '--use_fast_math'] + version_dependent_macros
    hipcc_args_fused_adam = ['-O3'] + version_dependent_macros
    # ROCm builds receive the reduced hipcc flag set under the 'nvcc' key.
    fused_adam_device_args = hipcc_args_fused_adam if is_rocm_pytorch else nvcc_args_fused_adam
    ext_modules.append(
        CUDAExtension(name='fused_adam_cuda',
                      sources=['apex/contrib/csrc/optimizers/fused_adam_cuda.cpp',
                               'apex/contrib/csrc/optimizers/fused_adam_cuda_kernel.cu'],
                      include_dirs=[os.path.join(this_dir, 'csrc')],
                      extra_compile_args={'cxx': ['-O3'] + version_dependent_macros,
                                          'nvcc': fused_adam_device_args}))
255
256
257
258
259
260
261
# --deprecated_fused_lamb: build the legacy fused LAMB extension from apex/contrib.
if "--deprecated_fused_lamb" in sys.argv:
    from torch.utils.cpp_extension import CUDAExtension
    sys.argv.remove("--deprecated_fused_lamb")

    from torch.utils.cpp_extension import BuildExtension
    cmdclass['build_ext'] = BuildExtension

    is_rocm_pytorch = check_if_rocm_pytorch()

    # A missing CUDA toolkit is only an error when this is not a ROCm build.
    if not is_rocm_pytorch and torch.utils.cpp_extension.CUDA_HOME is None:
        raise RuntimeError("--deprecated_fused_lamb was requested, but nvcc was not found.  Are you sure your environment has nvcc available?  If you're installing within a container from https://hub.docker.com/r/pytorch/pytorch, only images whose names contain 'devel' will provide nvcc.")

    print ("INFO: Building deprecated fused lamb extension.")
    nvcc_args_fused_lamb = ['-O3', '--use_fast_math'] + version_dependent_macros
    hipcc_args_fused_lamb = ['-O3'] + version_dependent_macros
    # Unlike the fused Adam extension, the compile arguments here are a flat
    # list rather than a per-compiler dict.
    fused_lamb_args = hipcc_args_fused_lamb if is_rocm_pytorch else nvcc_args_fused_lamb
    ext_modules.append(
        CUDAExtension(name='fused_lamb_cuda',
                      sources=['apex/contrib/csrc/optimizers/fused_lamb_cuda.cpp',
                               'apex/contrib/csrc/optimizers/fused_lamb_cuda_kernel.cu',
                               'csrc/multi_tensor_l2norm_kernel.cu'],
                      include_dirs=[os.path.join(this_dir, 'csrc')],
                      extra_compile_args=fused_lamb_args))
277

278
# Check whether ATen/CUDAGenerator.h is present in the installed torch headers; if it is
# not, the sources use the new ATen/CUDAGeneratorImpl.h instead, due to the breaking
# change in https://github.com/pytorch/pytorch/pull/36026
torch_dir = torch.__path__[0]
generator_flag = (
    ['-DOLD_GENERATOR']
    if os.path.exists(os.path.join(torch_dir, 'include', 'ATen', 'CUDAGenerator.h'))
    else []
)

284
285
286
287
288
# --fast_multihead_attn: build the fused multi-head attention extensions from apex/contrib.
if "--fast_multihead_attn" in sys.argv:
    from torch.utils.cpp_extension import CUDAExtension
    sys.argv.remove("--fast_multihead_attn")

    from torch.utils.cpp_extension import BuildExtension
    # These extensions are built with ninja disabled.
    cmdclass['build_ext'] = BuildExtension.with_options(use_ninja=False)

    if torch.utils.cpp_extension.CUDA_HOME is None:
        raise RuntimeError("--fast_multihead_attn was requested, but nvcc was not found.  Are you sure your environment has nvcc available?  If you're installing within a container from https://hub.docker.com/r/pytorch/pytorch, only images whose names contain 'devel' will provide nvcc.")
    else:
        # Fetch the cutlass submodule that the nvcc include path below points at.
        subprocess.run(["git", "submodule", "update", "--init", "apex/contrib/csrc/multihead_attn/cutlass"])

        mha_source_dir = 'apex/contrib/csrc/multihead_attn/'
        # (extension name, source file stem) in registration order.  Each
        # extension is built from <stem>.cpp and <stem>_cuda.cu.
        mha_extensions = (
            ('fast_additive_mask_softmax_dropout', 'additive_masked_softmax_dropout'),
            ('fast_mask_softmax_dropout', 'masked_softmax_dropout'),
            ('fast_self_multihead_attn_bias_additive_mask', 'self_multihead_attn_bias_additive_mask'),
            ('fast_self_multihead_attn_bias', 'self_multihead_attn_bias'),
            ('fast_self_multihead_attn', 'self_multihead_attn'),
            ('fast_self_multihead_attn_norm_add', 'self_multihead_attn_norm_add'),
            ('fast_encdec_multihead_attn', 'encdec_multihead_attn'),
            ('fast_encdec_multihead_attn_norm_add', 'encdec_multihead_attn_norm_add'),
        )
        for mha_name, mha_stem in mha_extensions:
            # Fresh argument lists are built per extension so that no two
            # extensions share a list object.
            mha_cxx_args = ['-O3'] + version_dependent_macros + generator_flag
            mha_nvcc_args = ['-O3',
                             '-gencode', 'arch=compute_70,code=sm_70',
                             '-I./apex/contrib/csrc/multihead_attn/cutlass/',
                             '-U__CUDA_NO_HALF_OPERATORS__',
                             '-U__CUDA_NO_HALF_CONVERSIONS__',
                             '--expt-relaxed-constexpr',
                             '--expt-extended-lambda',
                             '--use_fast_math'] + version_dependent_macros + generator_flag
            ext_modules.append(
                CUDAExtension(name=mha_name,
                              sources=[mha_source_dir + mha_stem + '.cpp',
                                       mha_source_dir + mha_stem + '_cuda.cu'],
                              extra_compile_args={'cxx': mha_cxx_args,
                                                  'nvcc': mha_nvcc_args}))
399

Christian Sarofeen's avatar
Christian Sarofeen committed
400
# Directories that are not part of the installable Python package.
excluded_packages = (
    'build',
    'csrc',
    'include',
    'tests',
    'dist',
    'docs',
    'tests',
    'examples',
    'apex.egg-info',
)

# Register the package together with whatever extensions, build commands and
# optional dependency groups the command-line flags above selected.
setup(
    name='apex',
    version='0.1',
    packages=find_packages(exclude=excluded_packages),
    description='PyTorch Extensions written by NVIDIA',
    ext_modules=ext_modules,
    cmdclass=cmdclass,
    extras_require=extras,
)