Commit 56e16cba authored by Jared Casper

Merge branch 'vijay/fused_kernel_compiliation' into 'main'

Avoid non-deterministic arch order in compilation

See merge request ADLR/megatron-lm!146
parents d80433e1 3dcd7dc0
@@ -15,8 +15,16 @@
 import pathlib
 import subprocess
 import os
 from torch.utils import cpp_extension
 
+# Setting this param to a list has the problem of generating
+# different compilation commands (with a different order of architectures),
+# leading to recompilation of the fused kernels.
+# Set it to an empty string to avoid recompilation,
+# and assign the arch flags explicitly in extra_cuda_cflags below.
+os.environ["TORCH_CUDA_ARCH_LIST"] = ""
+
+
 def get_cuda_bare_metal_version(cuda_dir):
     raw_output = subprocess.check_output([cuda_dir + "/bin/nvcc", "-V"],
                                          universal_newlines=True)
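For readers outside the diff context, here is a minimal, self-contained sketch of the pattern this hunk applies; the extension name and source files below are hypothetical, not from the repository. Per the comment in the hunk above, clearing TORCH_CUDA_ARCH_LIST keeps torch.utils.cpp_extension from deriving its own -gencode flags (whose order can vary between runs), and the architectures are instead pinned in a fixed order via extra_cuda_cflags.

```python
# Minimal sketch, assuming a hypothetical extension 'my_kernel' with the
# usual .cpp/.cu source pair; only the flag-handling pattern mirrors the diff.
import os
from torch.utils import cpp_extension

# An empty string (rather than a list) keeps the generated build command
# identical across runs, so the cached build stays valid.
os.environ["TORCH_CUDA_ARCH_LIST"] = ""

my_kernel = cpp_extension.load(
    name='my_kernel',
    sources=['my_kernel.cpp', 'my_kernel_cuda.cu'],
    extra_cflags=['-O3'],
    # Arch flags pinned explicitly, in a fixed order.
    extra_cuda_cflags=['-O3',
                       '-gencode', 'arch=compute_70,code=sm_70'],
)
```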
@@ -42,7 +50,8 @@ def load_scaled_upper_triang_masked_softmax_fusion_kernel():
     scaled_upper_triang_masked_softmax_cuda = cpp_extension.load(
         name='scaled_upper_triang_masked_softmax_cuda',
         sources=[srcpath / 'scaled_upper_triang_masked_softmax.cpp',
-                 srcpath / 'scaled_upper_triang_masked_softmax_cuda.cu'],
+                 srcpath / 'scaled_upper_triang_masked_softmax_cuda.cu'],
+        build_directory=srcpath / 'build',
         extra_cflags=['-O3',],
         extra_cuda_cflags=['-O3',
                            '-gencode', 'arch=compute_70,code=sm_70',
@@ -66,7 +75,8 @@ def load_scaled_masked_softmax_fusion_kernel():
     scaled_masked_softmax_cuda = cpp_extension.load(
         name='scaled_masked_softmax_cuda',
         sources=[srcpath / 'scaled_masked_softmax.cpp',
-                 srcpath / 'scaled_masked_softmax_cuda.cu'],
+                 srcpath / 'scaled_masked_softmax_cuda.cu'],
+        build_directory=srcpath / 'build',
         extra_cflags=['-O3',],
         extra_cuda_cflags=['-O3',
                            '-gencode', 'arch=compute_70,code=sm_70',
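A hedged usage sketch of the effect of this change: the loader function name comes from the hunk header above, but the import path is an assumption. With the build command now deterministic and build_directory fixed, only the first run of the program should trigger a compile; later runs find the cached build and skip recompilation.

```python
# Hypothetical usage: the module path below is assumed, the function name
# is from the diff. With a stable command line and a fixed build_directory,
# re-running the program reuses the cached build instead of recompiling.
from megatron.fused_kernels import load_scaled_masked_softmax_fusion_kernel

load_scaled_masked_softmax_fusion_kernel()  # compiles only if not cached
```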