Unverified Commit 27a47345 authored by Hubert Lu, committed by GitHub

Apex transformer (#77)

* Add setup_simple.py for debugging the compilation issue of scaled_masked_softmax_cuda

* Comment out CUDA-specific implementations

* Resolve the filename collision between *.cpp files containing to-be-hipified code and *.cu files

parent dd584a59
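The setup.py hunk below selects between nvcc and hipcc argument lists via an
IS_ROCM_PYTORCH flag. For reference, a minimal sketch of how that flag is
typically derived in apex-style setup scripts; the exact probe in this
repository's setup.py may differ:

    # Sketch only: assumes the usual apex-style ROCm probe; not part of this diff.
    import torch

    TORCH_MAJOR = int(torch.__version__.split('.')[0])
    TORCH_MINOR = int(torch.__version__.split('.')[1])

    IS_ROCM_PYTORCH = False
    if TORCH_MAJOR > 1 or (TORCH_MAJOR == 1 and TORCH_MINOR >= 5):
        from torch.utils.cpp_extension import ROCM_HOME
        # ROCm builds of PyTorch set torch.version.hip; CUDA builds leave it None.
        IS_ROCM_PYTORCH = (torch.version.hip is not None) and (ROCM_HOME is not None)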
csrc/megatron/scaled_masked_softmax.cpp
@@ -14,7 +14,6 @@
  * limitations under the License.
  */
-#include <cuda_fp16.h>
 #include <torch/extension.h>
 #include <vector>
csrc/megatron/scaled_masked_softmax_cuda.cu
@@ -18,7 +18,7 @@
 #include <cuda.h>
 #include <cuda_runtime.h>
 #include <cuda_fp16.h>
-#include <cuda_profiler_api.h>
+//#include <cuda_profiler_api.h>
 #include <ATen/cuda/CUDAContext.h>
 #include <torch/extension.h>
 #include "scaled_masked_softmax.h"
csrc/megatron/scaled_upper_triang_masked_softmax.cpp
@@ -14,7 +14,6 @@
  * limitations under the License.
  */
-#include <cuda_fp16.h>
 #include <torch/extension.h>
 #include <vector>
csrc/megatron/scaled_upper_triang_masked_softmax_cuda.cu
@@ -18,7 +18,7 @@
 #include <cuda.h>
 #include <cuda_runtime.h>
 #include <cuda_fp16.h>
-#include <cuda_profiler_api.h>
+//#include <cuda_profiler_api.h>
 #include <ATen/cuda/CUDAContext.h>
 #include <torch/extension.h>
 #include "scaled_upper_triang_masked_softmax.h"
setup.py
@@ -261,31 +261,30 @@ if "--cuda_ext" in sys.argv:
                                    'csrc/fused_dense_cuda.cu'],
                       extra_compile_args={'cxx': ['-O3'] + version_dependent_macros,
                                           'nvcc':['-O3'] + version_dependent_macros}))
-    """
+    nvcc_args_transformer = ['-O3',
+                             '-U__CUDA_NO_HALF_OPERATORS__',
+                             '-U__CUDA_NO_HALF_CONVERSIONS__',
+                             '--expt-relaxed-constexpr',
+                             '--expt-extended-lambda'] + version_dependent_macros
+    hipcc_args_transformer = ['-O3',
+                              '-U__CUDA_NO_HALF_OPERATORS__',
+                              '-U__CUDA_NO_HALF_CONVERSIONS__'] + version_dependent_macros
     ext_modules.append(
         CUDAExtension(name='scaled_upper_triang_masked_softmax_cuda',
                       sources=['csrc/megatron/scaled_upper_triang_masked_softmax.cpp',
                                'csrc/megatron/scaled_upper_triang_masked_softmax_cuda.cu'],
                       include_dirs=[os.path.join(this_dir, 'csrc')],
                       extra_compile_args={'cxx': ['-O3'] + version_dependent_macros,
-                                          'nvcc':['-O3',
-                                                  '-U__CUDA_NO_HALF_OPERATORS__',
-                                                  '-U__CUDA_NO_HALF_CONVERSIONS__',
-                                                  '--expt-relaxed-constexpr',
-                                                  '--expt-extended-lambda'] + version_dependent_macros}))
+                                          'nvcc':nvcc_args_transformer if not IS_ROCM_PYTORCH else hipcc_args_transformer}))
     ext_modules.append(
         CUDAExtension(name='scaled_masked_softmax_cuda',
                       sources=['csrc/megatron/scaled_masked_softmax.cpp',
                                'csrc/megatron/scaled_masked_softmax_cuda.cu'],
-                      include_dirs=[os.path.join(this_dir, 'csrc')],
+                      include_dirs=[os.path.join(this_dir, 'csrc'),
+                                    os.path.join(this_dir, 'csrc/megatron')],
                       extra_compile_args={'cxx': ['-O3'] + version_dependent_macros,
-                                          'nvcc':['-O3',
-                                                  '-U__CUDA_NO_HALF_OPERATORS__',
-                                                  '-U__CUDA_NO_HALF_CONVERSIONS__',
-                                                  '--expt-relaxed-constexpr',
-                                                  '--expt-extended-lambda'] + version_dependent_macros}))
-    """
+                                          'nvcc':nvcc_args_transformer if not IS_ROCM_PYTORCH else hipcc_args_transformer}))
 if "--bnp" in sys.argv or "--cuda_ext" in sys.argv:
     from torch.utils.cpp_extension import CUDAExtension
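Once the extensions build, a quick way to confirm that the renamed and
relocated sources still produce importable modules on either backend is an
import smoke test (sketch only; module names taken from the setup.py hunk
above):

    # Hypothetical smoke test, assuming apex was installed with --cuda_ext.
    import importlib

    for name in ("scaled_masked_softmax_cuda",
                 "scaled_upper_triang_masked_softmax_cuda"):
        mod = importlib.import_module(name)
        print(name, "->", mod.__file__)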