Unverified commit cc329b79, authored by Tim Moon, committed by GitHub
Browse files

Bump minimum CUDA version to 12.0 (#1103)



* Bump minimum CUDA version to 12.0
Signed-off-by: Tim Moon <tmoon@nvidia.com>

* Debug CUDA version check
Signed-off-by: Tim Moon <tmoon@nvidia.com>

* Debug CMake build
Signed-off-by: Tim Moon <tmoon@nvidia.com>

* Review suggestions from @ksivaman and @ptrendx

Remove logic for CUDA <12.0 in PyTorch and Paddle builds. Update version in docs and README.
Signed-off-by: Tim Moon <tmoon@nvidia.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci



---------
Signed-off-by: Tim Moon <tmoon@nvidia.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
parent 8ef3308a
...@@ -12,7 +12,7 @@ jobs: ...@@ -12,7 +12,7 @@ jobs:
name: 'Core' name: 'Core'
runs-on: ubuntu-latest runs-on: ubuntu-latest
container: container:
image: nvcr.io/nvidia/cuda:12.5.0-devel-ubuntu22.04 image: nvcr.io/nvidia/cuda:12.0.0-devel-ubuntu22.04
options: --user root options: --user root
steps: steps:
- name: 'Dependencies' - name: 'Dependencies'
......
...@@ -149,8 +149,8 @@ Installation ...@@ -149,8 +149,8 @@ Installation
Pre-requisites Pre-requisites
^^^^^^^^^^^^^^^^^^^^ ^^^^^^^^^^^^^^^^^^^^
* Linux x86_64 * Linux x86_64
* CUDA 11.8+ for Hopper and CUDA 12.1+ for Ada * CUDA 12.0+ for Hopper and CUDA 12.1+ for Ada
* NVIDIA Driver supporting CUDA 11.8 or later * NVIDIA Driver supporting CUDA 12.0 or later
* cuDNN 8.1 or later * cuDNN 8.1 or later
* For fused attention, CUDA 12.1 or later, NVIDIA Driver supporting CUDA 12.1 or later, and cuDNN 8.9 or later. * For fused attention, CUDA 12.1 or later, NVIDIA Driver supporting CUDA 12.1 or later, and cuDNN 8.9 or later.
......
...@@ -70,8 +70,8 @@ class CMakeExtension(setuptools.Extension): ...@@ -70,8 +70,8 @@ class CMakeExtension(setuptools.Extension):
configure_command.append(f"-Dpybind11_DIR={pybind11_dir}") configure_command.append(f"-Dpybind11_DIR={pybind11_dir}")
# CMake build and install commands # CMake build and install commands
build_command = [_cmake_bin, "--build", build_dir] build_command = [_cmake_bin, "--build", build_dir, "--verbose"]
install_command = [_cmake_bin, "--install", build_dir] install_command = [_cmake_bin, "--install", build_dir, "--verbose"]
# Check whether parallel build is restricted # Check whether parallel build is restricted
max_jobs = get_max_jobs_for_parallel_build() max_jobs = get_max_jobs_for_parallel_build()
......
...@@ -62,12 +62,18 @@ def setup_paddle_extension( ...@@ -62,12 +62,18 @@ def setup_paddle_extension(
except FileNotFoundError: except FileNotFoundError:
print("Could not determine CUDA Toolkit version") print("Could not determine CUDA Toolkit version")
else: else:
if version >= (11, 2): if version < (12, 0):
nvcc_flags.extend(["--threads", os.getenv("NVTE_BUILD_THREADS_PER_JOB", "1")]) raise RuntimeError("Transformer Engine requires CUDA 12.0 or newer")
if version >= (11, 0): nvcc_flags.extend(
nvcc_flags.extend(["-gencode", "arch=compute_80,code=sm_80"]) (
if version >= (11, 8): "--threads",
nvcc_flags.extend(["-gencode", "arch=compute_90,code=sm_90"]) os.getenv("NVTE_BUILD_THREADS_PER_JOB", "1"),
"-gencode",
"arch=compute_80,code=sm_80",
"-gencode",
"arch=compute_90,code=sm_90",
)
)
# Construct Paddle CUDA extension # Construct Paddle CUDA extension
sources = [str(path) for path in sources] sources = [str(path) for path in sources]
......
...@@ -67,12 +67,18 @@ def setup_pytorch_extension( ...@@ -67,12 +67,18 @@ def setup_pytorch_extension(
except FileNotFoundError: except FileNotFoundError:
print("Could not determine CUDA Toolkit version") print("Could not determine CUDA Toolkit version")
else: else:
if version >= (11, 2): if version < (12, 0):
nvcc_flags.extend(["--threads", os.getenv("NVTE_BUILD_THREADS_PER_JOB", "1")]) raise RuntimeError("Transformer Engine requires CUDA 12.0 or newer")
if version >= (11, 0): nvcc_flags.extend(
nvcc_flags.extend(["-gencode", "arch=compute_80,code=sm_80"]) (
if version >= (11, 8): "--threads",
nvcc_flags.extend(["-gencode", "arch=compute_90,code=sm_90"]) os.getenv("NVTE_BUILD_THREADS_PER_JOB", "1"),
"-gencode",
"arch=compute_80,code=sm_80",
"-gencode",
"arch=compute_90,code=sm_90",
)
)
# Libraries # Libraries
library_dirs = [] library_dirs = []
......
...@@ -12,8 +12,8 @@ Prerequisites ...@@ -12,8 +12,8 @@ Prerequisites
.. _driver link: https://www.nvidia.com/drivers .. _driver link: https://www.nvidia.com/drivers
1. Linux x86_64 1. Linux x86_64
2. `CUDA 11.8 <https://developer.nvidia.com/cuda-downloads>`__ 2. `CUDA 12.0 <https://developer.nvidia.com/cuda-downloads>`__
3. |driver link|_ supporting CUDA 11.8 or later. 3. |driver link|_ supporting CUDA 12.0 or later.
4. `cuDNN 8.1 <https://developer.nvidia.com/cudnn>`__ or later. 4. `cuDNN 8.1 <https://developer.nvidia.com/cudnn>`__ or later.
5. For FP8/FP16/BF16 fused attention, `CUDA 12.1 <https://developer.nvidia.com/cuda-downloads>`__ or later, |driver link|_ supporting CUDA 12.1 or later, and `cuDNN 8.9.1 <https://developer.nvidia.com/cudnn>`__ or later. 5. For FP8/FP16/BF16 fused attention, `CUDA 12.1 <https://developer.nvidia.com/cuda-downloads>`__ or later, |driver link|_ supporting CUDA 12.1 or later, and `cuDNN 8.9.1 <https://developer.nvidia.com/cudnn>`__ or later.
......
...@@ -4,39 +4,27 @@ ...@@ -4,39 +4,27 @@
cmake_minimum_required(VERSION 3.21) cmake_minimum_required(VERSION 3.21)
# Language options
if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES) if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
set(CMAKE_CUDA_ARCHITECTURES 70 80 89 90) set(CMAKE_CUDA_ARCHITECTURES 70 80 89 90)
endif() endif()
set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CUDA_STANDARD 17) set(CMAKE_CUDA_STANDARD 17)
set(CMAKE_CUDA_STANDARD_REQUIRED ON) set(CMAKE_CUDA_STANDARD_REQUIRED ON)
project(transformer_engine LANGUAGES CUDA CXX)
set(BUILD_THREADS_PER_JOB $ENV{NVTE_BUILD_THREADS_PER_JOB})
if (NOT BUILD_THREADS_PER_JOB)
set(BUILD_THREADS_PER_JOB 1)
endif()
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --threads ${BUILD_THREADS_PER_JOB}")
if(DEFINED ENV{MAX_JOBS})
set(JOBS $ENV{MAX_JOBS})
elseif(DEFINED ENV{NVTE_BUILD_MAX_JOBS})
set(JOBS $ENV{NVTE_BUILD_MAX_JOBS})
else()
set(JOBS "max number of")
endif()
message(STATUS "Parallel build with ${JOBS} jobs and ${BUILD_THREADS_PER_JOB} threads per job")
if (CMAKE_BUILD_TYPE STREQUAL "Debug") if (CMAKE_BUILD_TYPE STREQUAL "Debug")
set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -G") set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -G")
endif() endif()
# Transformer Engine library
project(transformer_engine LANGUAGES CUDA CXX)
# CUDA Toolkit
find_package(CUDAToolkit REQUIRED) find_package(CUDAToolkit REQUIRED)
if (CUDAToolkit_VERSION VERSION_LESS 12.0)
message(FATAL_ERROR "CUDA 12.0+ is required, but found CUDA ${CUDAToolkit_VERSION}")
endif()
# Check for cuDNN frontend API # cuDNN frontend API
set(CUDNN_FRONTEND_INCLUDE_DIR set(CUDNN_FRONTEND_INCLUDE_DIR
"${CMAKE_SOURCE_DIR}/../../3rdparty/cudnn-frontend/include") "${CMAKE_SOURCE_DIR}/../../3rdparty/cudnn-frontend/include")
if(NOT EXISTS "${CUDNN_FRONTEND_INCLUDE_DIR}") if(NOT EXISTS "${CUDNN_FRONTEND_INCLUDE_DIR}")
...@@ -47,10 +35,11 @@ if(NOT EXISTS "${CUDNN_FRONTEND_INCLUDE_DIR}") ...@@ -47,10 +35,11 @@ if(NOT EXISTS "${CUDNN_FRONTEND_INCLUDE_DIR}")
endif() endif()
include(${CMAKE_SOURCE_DIR}/../../3rdparty/cudnn-frontend/cmake/cuDNN.cmake) include(${CMAKE_SOURCE_DIR}/../../3rdparty/cudnn-frontend/cmake/cuDNN.cmake)
# Python
find_package(Python COMPONENTS Interpreter Development.Module REQUIRED) find_package(Python COMPONENTS Interpreter Development.Module REQUIRED)
include_directories(${PROJECT_SOURCE_DIR}/..)
# Configure Transformer Engine library # Configure Transformer Engine library
include_directories(${PROJECT_SOURCE_DIR}/..)
set(transformer_engine_SOURCES) set(transformer_engine_SOURCES)
list(APPEND transformer_engine_SOURCES list(APPEND transformer_engine_SOURCES
pycudnn.cpp pycudnn.cpp
...@@ -89,8 +78,6 @@ add_library(transformer_engine SHARED ${transformer_engine_SOURCES}) ...@@ -89,8 +78,6 @@ add_library(transformer_engine SHARED ${transformer_engine_SOURCES})
target_include_directories(transformer_engine PUBLIC target_include_directories(transformer_engine PUBLIC
"${CMAKE_CURRENT_SOURCE_DIR}/include") "${CMAKE_CURRENT_SOURCE_DIR}/include")
target_compile_definitions(transformer_engine PUBLIC NV_CUDNN_FRONTEND_USE_DYNAMIC_LOADING)
# Configure dependencies # Configure dependencies
target_link_libraries(transformer_engine PUBLIC target_link_libraries(transformer_engine PUBLIC
CUDA::cublas CUDA::cublas
...@@ -100,7 +87,10 @@ target_include_directories(transformer_engine PRIVATE ...@@ -100,7 +87,10 @@ target_include_directories(transformer_engine PRIVATE
${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
target_include_directories(transformer_engine PRIVATE "${CUDNN_FRONTEND_INCLUDE_DIR}") target_include_directories(transformer_engine PRIVATE "${CUDNN_FRONTEND_INCLUDE_DIR}")
# Make header files with C++ strings # Hack to enable dynamic loading in cuDNN frontend
target_compile_definitions(transformer_engine PUBLIC NV_CUDNN_FRONTEND_USE_DYNAMIC_LOADING)
# Helper functions to make header files with C++ strings
function(make_string_header STRING STRING_NAME) function(make_string_header STRING STRING_NAME)
configure_file(util/string_header.h.in configure_file(util/string_header.h.in
"string_headers/${STRING_NAME}.h" "string_headers/${STRING_NAME}.h"
...@@ -112,10 +102,11 @@ function(make_string_header_from_file file_ STRING_NAME) ...@@ -112,10 +102,11 @@ function(make_string_header_from_file file_ STRING_NAME)
"string_headers/${STRING_NAME}.h" "string_headers/${STRING_NAME}.h"
@ONLY) @ONLY)
endfunction() endfunction()
# Header files with C++ strings
list(GET CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES 0 cuda_include_path) list(GET CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES 0 cuda_include_path)
make_string_header("${cuda_include_path}" make_string_header("${cuda_include_path}"
string_path_cuda_include) string_path_cuda_include)
make_string_header_from_file(transpose/rtc/cast_transpose_fusion.cu make_string_header_from_file(transpose/rtc/cast_transpose_fusion.cu
string_code_transpose_rtc_cast_transpose_fusion_cu) string_code_transpose_rtc_cast_transpose_fusion_cu)
make_string_header_from_file(transpose/rtc/cast_transpose.cu make_string_header_from_file(transpose/rtc/cast_transpose.cu
...@@ -126,7 +117,6 @@ make_string_header_from_file(utils.cuh ...@@ -126,7 +117,6 @@ make_string_header_from_file(utils.cuh
string_code_utils_cuh) string_code_utils_cuh)
make_string_header_from_file(util/math.h make_string_header_from_file(util/math.h
string_code_util_math_h) string_code_util_math_h)
target_include_directories(transformer_engine PRIVATE target_include_directories(transformer_engine PRIVATE
"${CMAKE_CURRENT_BINARY_DIR}/string_headers") "${CMAKE_CURRENT_BINARY_DIR}/string_headers")
...@@ -139,6 +129,23 @@ set_source_files_properties(fused_softmax/scaled_masked_softmax.cu ...@@ -139,6 +129,23 @@ set_source_files_properties(fused_softmax/scaled_masked_softmax.cu
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr") set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -O3") set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -O3")
# Number of parallel build jobs
if(ENV{MAX_JOBS})
set(BUILD_JOBS_STR "$ENV{MAX_JOBS}")
elseif(ENV{NVTE_BUILD_MAX_JOBS})
set(BUILD_JOBS_STR "$ENV{NVTE_BUILD_MAX_JOBS}")
else()
set(BUILD_JOBS_STR "max")
endif()
message(STATUS "Parallel build jobs: ${BUILD_JOBS_STR}")
# Number of threads per parallel build job
set(BUILD_THREADS_PER_JOB $ENV{NVTE_BUILD_THREADS_PER_JOB})
if (NOT BUILD_THREADS_PER_JOB)
set(BUILD_THREADS_PER_JOB 1)
endif()
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --threads ${BUILD_THREADS_PER_JOB}")
message(STATUS "Threads per parallel build job: ${BUILD_THREADS_PER_JOB}")
# Install library # Install library
install(TARGETS transformer_engine DESTINATION .) install(TARGETS transformer_engine DESTINATION .)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment