Unverified commit cc329b79, authored by Tim Moon, committed by GitHub
Browse files

Bump minimum CUDA version to 12.0 (#1103)



* Bump minimum CUDA version to 12.0
Signed-off-by: Tim Moon <tmoon@nvidia.com>

* Debug CUDA version check
Signed-off-by: Tim Moon <tmoon@nvidia.com>

* Debug CMake build
Signed-off-by: Tim Moon <tmoon@nvidia.com>

* Review suggestions from @ksivaman and @ptrendx

Remove logic for CUDA <12.0 in PyTorch and Paddle builds. Update version in docs and README.
Signed-off-by: Tim Moon <tmoon@nvidia.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci



---------
Signed-off-by: Tim Moon <tmoon@nvidia.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
parent 8ef3308a
...@@ -12,7 +12,7 @@ jobs: ...@@ -12,7 +12,7 @@ jobs:
name: 'Core' name: 'Core'
runs-on: ubuntu-latest runs-on: ubuntu-latest
container: container:
image: nvcr.io/nvidia/cuda:12.5.0-devel-ubuntu22.04 image: nvcr.io/nvidia/cuda:12.0.0-devel-ubuntu22.04
options: --user root options: --user root
steps: steps:
- name: 'Dependencies' - name: 'Dependencies'
......
...@@ -149,8 +149,8 @@ Installation ...@@ -149,8 +149,8 @@ Installation
Pre-requisites Pre-requisites
^^^^^^^^^^^^^^^^^^^^ ^^^^^^^^^^^^^^^^^^^^
* Linux x86_64 * Linux x86_64
* CUDA 11.8+ for Hopper and CUDA 12.1+ for Ada * CUDA 12.0+ for Hopper and CUDA 12.1+ for Ada
* NVIDIA Driver supporting CUDA 11.8 or later * NVIDIA Driver supporting CUDA 12.0 or later
* cuDNN 8.1 or later * cuDNN 8.1 or later
* For fused attention, CUDA 12.1 or later, NVIDIA Driver supporting CUDA 12.1 or later, and cuDNN 8.9 or later. * For fused attention, CUDA 12.1 or later, NVIDIA Driver supporting CUDA 12.1 or later, and cuDNN 8.9 or later.
......
...@@ -70,8 +70,8 @@ class CMakeExtension(setuptools.Extension): ...@@ -70,8 +70,8 @@ class CMakeExtension(setuptools.Extension):
configure_command.append(f"-Dpybind11_DIR={pybind11_dir}") configure_command.append(f"-Dpybind11_DIR={pybind11_dir}")
# CMake build and install commands # CMake build and install commands
build_command = [_cmake_bin, "--build", build_dir] build_command = [_cmake_bin, "--build", build_dir, "--verbose"]
install_command = [_cmake_bin, "--install", build_dir] install_command = [_cmake_bin, "--install", build_dir, "--verbose"]
# Check whether parallel build is restricted # Check whether parallel build is restricted
max_jobs = get_max_jobs_for_parallel_build() max_jobs = get_max_jobs_for_parallel_build()
......
...@@ -62,12 +62,18 @@ def setup_paddle_extension( ...@@ -62,12 +62,18 @@ def setup_paddle_extension(
except FileNotFoundError: except FileNotFoundError:
print("Could not determine CUDA Toolkit version") print("Could not determine CUDA Toolkit version")
else: else:
if version >= (11, 2): if version < (12, 0):
nvcc_flags.extend(["--threads", os.getenv("NVTE_BUILD_THREADS_PER_JOB", "1")]) raise RuntimeError("Transformer Engine requires CUDA 12.0 or newer")
if version >= (11, 0): nvcc_flags.extend(
nvcc_flags.extend(["-gencode", "arch=compute_80,code=sm_80"]) (
if version >= (11, 8): "--threads",
nvcc_flags.extend(["-gencode", "arch=compute_90,code=sm_90"]) os.getenv("NVTE_BUILD_THREADS_PER_JOB", "1"),
"-gencode",
"arch=compute_80,code=sm_80",
"-gencode",
"arch=compute_90,code=sm_90",
)
)
# Construct Paddle CUDA extension # Construct Paddle CUDA extension
sources = [str(path) for path in sources] sources = [str(path) for path in sources]
......
...@@ -67,12 +67,18 @@ def setup_pytorch_extension( ...@@ -67,12 +67,18 @@ def setup_pytorch_extension(
except FileNotFoundError: except FileNotFoundError:
print("Could not determine CUDA Toolkit version") print("Could not determine CUDA Toolkit version")
else: else:
if version >= (11, 2): if version < (12, 0):
nvcc_flags.extend(["--threads", os.getenv("NVTE_BUILD_THREADS_PER_JOB", "1")]) raise RuntimeError("Transformer Engine requires CUDA 12.0 or newer")
if version >= (11, 0): nvcc_flags.extend(
nvcc_flags.extend(["-gencode", "arch=compute_80,code=sm_80"]) (
if version >= (11, 8): "--threads",
nvcc_flags.extend(["-gencode", "arch=compute_90,code=sm_90"]) os.getenv("NVTE_BUILD_THREADS_PER_JOB", "1"),
"-gencode",
"arch=compute_80,code=sm_80",
"-gencode",
"arch=compute_90,code=sm_90",
)
)
# Libraries # Libraries
library_dirs = [] library_dirs = []
......
...@@ -12,8 +12,8 @@ Prerequisites ...@@ -12,8 +12,8 @@ Prerequisites
.. _driver link: https://www.nvidia.com/drivers .. _driver link: https://www.nvidia.com/drivers
1. Linux x86_64 1. Linux x86_64
2. `CUDA 11.8 <https://developer.nvidia.com/cuda-downloads>`__ 2. `CUDA 12.0 <https://developer.nvidia.com/cuda-downloads>`__
3. |driver link|_ supporting CUDA 11.8 or later. 3. |driver link|_ supporting CUDA 12.0 or later.
4. `cuDNN 8.1 <https://developer.nvidia.com/cudnn>`__ or later. 4. `cuDNN 8.1 <https://developer.nvidia.com/cudnn>`__ or later.
5. For FP8/FP16/BF16 fused attention, `CUDA 12.1 <https://developer.nvidia.com/cuda-downloads>`__ or later, |driver link|_ supporting CUDA 12.1 or later, and `cuDNN 8.9.1 <https://developer.nvidia.com/cudnn>`__ or later. 5. For FP8/FP16/BF16 fused attention, `CUDA 12.1 <https://developer.nvidia.com/cuda-downloads>`__ or later, |driver link|_ supporting CUDA 12.1 or later, and `cuDNN 8.9.1 <https://developer.nvidia.com/cudnn>`__ or later.
......
...@@ -4,39 +4,27 @@ ...@@ -4,39 +4,27 @@
cmake_minimum_required(VERSION 3.21) cmake_minimum_required(VERSION 3.21)
# Language options
if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES) if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
set(CMAKE_CUDA_ARCHITECTURES 70 80 89 90) set(CMAKE_CUDA_ARCHITECTURES 70 80 89 90)
endif() endif()
set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CUDA_STANDARD 17) set(CMAKE_CUDA_STANDARD 17)
set(CMAKE_CUDA_STANDARD_REQUIRED ON) set(CMAKE_CUDA_STANDARD_REQUIRED ON)
project(transformer_engine LANGUAGES CUDA CXX)
set(BUILD_THREADS_PER_JOB $ENV{NVTE_BUILD_THREADS_PER_JOB})
if (NOT BUILD_THREADS_PER_JOB)
set(BUILD_THREADS_PER_JOB 1)
endif()
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --threads ${BUILD_THREADS_PER_JOB}")
if(DEFINED ENV{MAX_JOBS})
set(JOBS $ENV{MAX_JOBS})
elseif(DEFINED ENV{NVTE_BUILD_MAX_JOBS})
set(JOBS $ENV{NVTE_BUILD_MAX_JOBS})
else()
set(JOBS "max number of")
endif()
message(STATUS "Parallel build with ${JOBS} jobs and ${BUILD_THREADS_PER_JOB} threads per job")
if (CMAKE_BUILD_TYPE STREQUAL "Debug") if (CMAKE_BUILD_TYPE STREQUAL "Debug")
set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -G") set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -G")
endif() endif()
# Transformer Engine library
project(transformer_engine LANGUAGES CUDA CXX)
# CUDA Toolkit
find_package(CUDAToolkit REQUIRED) find_package(CUDAToolkit REQUIRED)
if (CUDAToolkit_VERSION VERSION_LESS 12.0)
message(FATAL_ERROR "CUDA 12.0+ is required, but found CUDA ${CUDAToolkit_VERSION}")
endif()
# Check for cuDNN frontend API # cuDNN frontend API
set(CUDNN_FRONTEND_INCLUDE_DIR set(CUDNN_FRONTEND_INCLUDE_DIR
"${CMAKE_SOURCE_DIR}/../../3rdparty/cudnn-frontend/include") "${CMAKE_SOURCE_DIR}/../../3rdparty/cudnn-frontend/include")
if(NOT EXISTS "${CUDNN_FRONTEND_INCLUDE_DIR}") if(NOT EXISTS "${CUDNN_FRONTEND_INCLUDE_DIR}")
...@@ -47,10 +35,11 @@ if(NOT EXISTS "${CUDNN_FRONTEND_INCLUDE_DIR}") ...@@ -47,10 +35,11 @@ if(NOT EXISTS "${CUDNN_FRONTEND_INCLUDE_DIR}")
endif() endif()
include(${CMAKE_SOURCE_DIR}/../../3rdparty/cudnn-frontend/cmake/cuDNN.cmake) include(${CMAKE_SOURCE_DIR}/../../3rdparty/cudnn-frontend/cmake/cuDNN.cmake)
# Python
find_package(Python COMPONENTS Interpreter Development.Module REQUIRED) find_package(Python COMPONENTS Interpreter Development.Module REQUIRED)
include_directories(${PROJECT_SOURCE_DIR}/..)
# Configure Transformer Engine library # Configure Transformer Engine library
include_directories(${PROJECT_SOURCE_DIR}/..)
set(transformer_engine_SOURCES) set(transformer_engine_SOURCES)
list(APPEND transformer_engine_SOURCES list(APPEND transformer_engine_SOURCES
pycudnn.cpp pycudnn.cpp
...@@ -89,8 +78,6 @@ add_library(transformer_engine SHARED ${transformer_engine_SOURCES}) ...@@ -89,8 +78,6 @@ add_library(transformer_engine SHARED ${transformer_engine_SOURCES})
target_include_directories(transformer_engine PUBLIC target_include_directories(transformer_engine PUBLIC
"${CMAKE_CURRENT_SOURCE_DIR}/include") "${CMAKE_CURRENT_SOURCE_DIR}/include")
target_compile_definitions(transformer_engine PUBLIC NV_CUDNN_FRONTEND_USE_DYNAMIC_LOADING)
# Configure dependencies # Configure dependencies
target_link_libraries(transformer_engine PUBLIC target_link_libraries(transformer_engine PUBLIC
CUDA::cublas CUDA::cublas
...@@ -100,7 +87,10 @@ target_include_directories(transformer_engine PRIVATE ...@@ -100,7 +87,10 @@ target_include_directories(transformer_engine PRIVATE
${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
target_include_directories(transformer_engine PRIVATE "${CUDNN_FRONTEND_INCLUDE_DIR}") target_include_directories(transformer_engine PRIVATE "${CUDNN_FRONTEND_INCLUDE_DIR}")
# Make header files with C++ strings # Hack to enable dynamic loading in cuDNN frontend
target_compile_definitions(transformer_engine PUBLIC NV_CUDNN_FRONTEND_USE_DYNAMIC_LOADING)
# Helper functions to make header files with C++ strings
function(make_string_header STRING STRING_NAME) function(make_string_header STRING STRING_NAME)
configure_file(util/string_header.h.in configure_file(util/string_header.h.in
"string_headers/${STRING_NAME}.h" "string_headers/${STRING_NAME}.h"
...@@ -112,10 +102,11 @@ function(make_string_header_from_file file_ STRING_NAME) ...@@ -112,10 +102,11 @@ function(make_string_header_from_file file_ STRING_NAME)
"string_headers/${STRING_NAME}.h" "string_headers/${STRING_NAME}.h"
@ONLY) @ONLY)
endfunction() endfunction()
# Header files with C++ strings
list(GET CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES 0 cuda_include_path) list(GET CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES 0 cuda_include_path)
make_string_header("${cuda_include_path}" make_string_header("${cuda_include_path}"
string_path_cuda_include) string_path_cuda_include)
make_string_header_from_file(transpose/rtc/cast_transpose_fusion.cu make_string_header_from_file(transpose/rtc/cast_transpose_fusion.cu
string_code_transpose_rtc_cast_transpose_fusion_cu) string_code_transpose_rtc_cast_transpose_fusion_cu)
make_string_header_from_file(transpose/rtc/cast_transpose.cu make_string_header_from_file(transpose/rtc/cast_transpose.cu
...@@ -126,7 +117,6 @@ make_string_header_from_file(utils.cuh ...@@ -126,7 +117,6 @@ make_string_header_from_file(utils.cuh
string_code_utils_cuh) string_code_utils_cuh)
make_string_header_from_file(util/math.h make_string_header_from_file(util/math.h
string_code_util_math_h) string_code_util_math_h)
target_include_directories(transformer_engine PRIVATE target_include_directories(transformer_engine PRIVATE
"${CMAKE_CURRENT_BINARY_DIR}/string_headers") "${CMAKE_CURRENT_BINARY_DIR}/string_headers")
...@@ -139,6 +129,23 @@ set_source_files_properties(fused_softmax/scaled_masked_softmax.cu ...@@ -139,6 +129,23 @@ set_source_files_properties(fused_softmax/scaled_masked_softmax.cu
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr") set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -O3") set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -O3")
# Number of parallel build jobs
if(ENV{MAX_JOBS})
set(BUILD_JOBS_STR "$ENV{MAX_JOBS}")
elseif(ENV{NVTE_BUILD_MAX_JOBS})
set(BUILD_JOBS_STR "$ENV{NVTE_BUILD_MAX_JOBS}")
else()
set(BUILD_JOBS_STR "max")
endif()
message(STATUS "Parallel build jobs: ${BUILD_JOBS_STR}")
# Number of threads per parallel build job
set(BUILD_THREADS_PER_JOB $ENV{NVTE_BUILD_THREADS_PER_JOB})
if (NOT BUILD_THREADS_PER_JOB)
set(BUILD_THREADS_PER_JOB 1)
endif()
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --threads ${BUILD_THREADS_PER_JOB}")
message(STATUS "Threads per parallel build job: ${BUILD_THREADS_PER_JOB}")
# Install library # Install library
install(TARGETS transformer_engine DESTINATION .) install(TARGETS transformer_engine DESTINATION .)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment