CMakeLists.txt 8.84 KB
Newer Older
1
# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
Przemek Tredak's avatar
Przemek Tredak committed
2
3
#
# See LICENSE for license information.
4

5
cmake_minimum_required(VERSION 3.21)

# Language options
#
# Choose a default list of CUDA architectures, but only when the caller has
# not already defined CMAKE_CUDA_ARCHITECTURES. The list depends on
# CUDAToolkit_VERSION:
#   >= 13.0 : 75 80 89 90 100 120
#   >= 12.8 : 70 80 89 90 100 120
#   else    : 70 80 89 90
# NOTE(review): CUDAToolkit_VERSION is read here, ahead of the project() and
# find_package(CUDAToolkit) calls further down in this file. Unless the value
# is supplied from outside, the comparisons presumably fall through to the
# base list - confirm.
if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
  # Base list, used when neither version check below matches.
  set(CMAKE_CUDA_ARCHITECTURES 70 80 89 90)
  if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 13.0)
    set(CMAKE_CUDA_ARCHITECTURES 75 80 89 90 100 120)
  elseif(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 12.8)
    list(APPEND CMAKE_CUDA_ARCHITECTURES 100 120)
  endif()
endif()
# Compile host C++ and CUDA sources as C++17; the CUDA standard is mandatory.
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CUDA_STANDARD 17)
set(CMAKE_CUDA_STANDARD_REQUIRED ON)

# Extra CUDA flags for Debug builds (-g -G).
# CMAKE_CUDA_FLAGS_DEBUG is only applied to the Debug configuration, so it is
# extended unconditionally. A guard on CMAKE_BUILD_TYPE would be case-sensitive
# (missing e.g. "debug") and would never match with multi-config generators,
# where CMAKE_BUILD_TYPE is empty.
string(APPEND CMAKE_CUDA_FLAGS_DEBUG " -g -G")

24
25
26
27
# Hide non-necessary symbols in the shared object by passing the linker
# version script that lives next to this file. The same option is appended
# to both the C++ and the CUDA flag strings.
# NOTE(review): this is a linker option carried in the compile-flag
# variables; a target-level link option may be a better fit - confirm.
string(APPEND CMAKE_CXX_FLAGS
       " -Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/libtransformer_engine.version")
string(APPEND CMAKE_CUDA_FLAGS
       " -Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/libtransformer_engine.version")

28
29
30
31
# Transformer Engine library: a project that uses the CUDA and C++ languages.
project(transformer_engine LANGUAGES CUDA CXX)

# CUDA Toolkit
# The toolkit must be found, and configuration stops with a fatal error when
# the reported version is older than 12.0.
find_package(CUDAToolkit REQUIRED)
if(CUDAToolkit_VERSION VERSION_LESS 12.0)
  message(
    FATAL_ERROR
    "CUDA 12.0+ is required, but found CUDA ${CUDAToolkit_VERSION}")
endif()
36

37
# cuDNN frontend API
#
# The headers are expected inside the 3rdparty/cudnn-frontend directory two
# levels above this one. Configuration aborts with a hint about git
# submodules when that directory does not exist.
set(CUDNN_FRONTEND_INCLUDE_DIR
    "${CMAKE_CURRENT_SOURCE_DIR}/../../3rdparty/cudnn-frontend/include")
if(NOT EXISTS "${CUDNN_FRONTEND_INCLUDE_DIR}")
  message(
    FATAL_ERROR
    "Could not find cuDNN frontend API at ${CUDNN_FRONTEND_INCLUDE_DIR}. "
    "Try running 'git submodule update --init --recursive' "
    "within the Transformer Engine source.")
endif()

# Pull in the cuDNN CMake module shipped with the same 3rdparty directory.
include(${CMAKE_CURRENT_SOURCE_DIR}/../../3rdparty/cudnn-frontend/cmake/cuDNN.cmake)
47

48
# Python: the interpreter and the Development.Module component are both
# mandatory.
find_package(Python
             REQUIRED
             COMPONENTS Interpreter Development.Module)

51
# Configure Transformer Engine library
#
# The parent of the project source directory is added as a directory-scoped
# include path.
include_directories(${PROJECT_SOURCE_DIR}/..)

# Source files that make up the shared library. The order matches the
# original listing; paths are relative to this directory.
set(transformer_engine_SOURCES
    # Core
    cudnn_utils.cpp
    transformer_engine.cpp
    common.cu
    # Multi-tensor kernels
    multi_tensor/adam.cu
    multi_tensor/compute_scale.cu
    multi_tensor/l2norm.cu
    multi_tensor/scale.cu
    multi_tensor/sgd.cu
    # Transpose kernels
    transpose/cast_transpose.cu
    transpose/transpose.cu
    transpose/cast_transpose_fusion.cu
    transpose/transpose_fusion.cu
    transpose/multi_cast_transpose.cu
    transpose/quantize_transpose_square_blockwise.cu
    transpose/quantize_transpose_vector_blockwise.cu
    transpose/swap_first_dims.cu
    # Activations and fused attention
    activation/gelu.cu
    fused_attn/flash_attn.cu
    fused_attn/context_parallel.cu
    fused_attn/kv_cache.cu
    fused_attn/fused_attn_f16_max512_seqlen.cu
    fused_attn/fused_attn_f16_arbitrary_seqlen.cu
    activation/relu.cu
    activation/swiglu.cu
    fused_attn/fused_attn_fp8.cu
    fused_attn/fused_attn.cpp
    fused_attn/utils.cu
    # GEMM
    gemm/cublaslt_gemm.cu
    # Normalization
    normalization/common.cpp
    normalization/layernorm/ln_api.cpp
    normalization/layernorm/ln_bwd_semi_cuda_kernel.cu
    normalization/layernorm/ln_fwd_cuda_kernel.cu
    normalization/rmsnorm/rmsnorm_api.cpp
    normalization/rmsnorm/rmsnorm_bwd_semi_cuda_kernel.cu
    normalization/rmsnorm/rmsnorm_fwd_cuda_kernel.cu
    # Permutation
    permutation/permutation.cu
    # Utilities
    util/cast.cu
    util/padding.cu
    util/cuda_driver.cpp
    util/cuda_nvml.cpp
    util/cuda_runtime.cpp
    util/multi_stream.cpp
    util/rtc.cpp
    # Swizzle
    swizzle/swizzle.cu
    # Fused softmax, RoPE and router kernels
    fused_softmax/scaled_masked_softmax.cu
    fused_softmax/scaled_upper_triang_masked_softmax.cu
    fused_softmax/scaled_aligned_causal_masked_softmax.cu
    fused_rope/fused_rope.cu
    fused_router/fused_moe_aux_loss.cu
    fused_router/fused_score_for_moe_aux_loss.cu
    fused_router/fused_topk_with_score_function.cu
    # Scaling recipes
    recipe/current_scaling.cu
    recipe/delayed_scaling.cu
    recipe/fp8_block_scaling.cu
    # Communication/GEMM overlap
    comm_gemm_overlap/userbuffers/ipcsocket.cc
    comm_gemm_overlap/userbuffers/userbuffers-host.cpp
    comm_gemm_overlap/userbuffers/userbuffers.cu
    comm_gemm_overlap/comm_gemm_overlap.cpp)
113
# Build the shared library from the collected sources and expose the local
# "include" directory to the library itself and to its consumers.
add_library(transformer_engine
            SHARED
            ${transformer_engine_SOURCES})
target_include_directories(transformer_engine
                           PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/include")

117
118


119
120
121
# Configure dependencies
#
# cuBLAS, the CUDA runtime and cuDNN are linked publicly. The CUDA toolkit
# include directories and the cuDNN frontend headers are private include
# paths, in that order.
target_link_libraries(transformer_engine
                      PUBLIC
                      CUDA::cublas
                      CUDA::cudart
                      CUDNN::cudnn_all)
target_include_directories(transformer_engine
                           PRIVATE
                           ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}
                           "${CUDNN_FRONTEND_INCLUDE_DIR}")
127

128
129
130
131
132
133
134
135
136
# Userbuffers can optionally be bootstrapped with native MPI, which requires
# linking against MPI. Off by default. When enabled, MPI must be found; the
# library then links MPI::MPI_CXX publicly, adds the MPI C++ include paths
# privately and publicly defines NVTE_UB_WITH_MPI.
option(NVTE_UB_WITH_MPI "Bootstrap Userbuffers with MPI" OFF)
if(NVTE_UB_WITH_MPI)
  find_package(MPI REQUIRED)
  target_link_libraries(transformer_engine
                        PUBLIC MPI::MPI_CXX)
  target_include_directories(transformer_engine
                             PRIVATE ${MPI_CXX_INCLUDES})
  target_compile_definitions(transformer_engine
                             PUBLIC NVTE_UB_WITH_MPI)
endif()

137
138
139
140
141
142
143
# Optional NVSHMEM support, off by default. When enabled, the nvshmem_api
# subdirectory is added, its nvshmemapi target is linked publicly, and
# NVSHMEMAPI_INCLUDE_DIR becomes a public include path.
# NOTE(review): NVSHMEMAPI_INCLUDE_DIR is not set in this file; presumably
# the nvshmem_api subdirectory provides it - confirm.
option(NVTE_ENABLE_NVSHMEM "Compile with NVSHMEM library" OFF)
if(NVTE_ENABLE_NVSHMEM)
  add_subdirectory(nvshmem_api)
  target_link_libraries(transformer_engine
                        PUBLIC nvshmemapi)
  target_include_directories(transformer_engine
                             PUBLIC ${NVSHMEMAPI_INCLUDE_DIR})
endif()

144
145
146
147
# Hack: define NV_CUDNN_FRONTEND_USE_DYNAMIC_LOADING publicly so that the
# cuDNN frontend is built with dynamic loading enabled, both for this library
# and for targets that link against it.
target_compile_definitions(transformer_engine
                           PUBLIC
                           NV_CUDNN_FRONTEND_USE_DYNAMIC_LOADING)

# Helper functions to make header files with C++ strings
Tim Moon's avatar
Tim Moon committed
148
149
150
151
152
153
154
155
156
157
158
# make_string_header(<STRING> <STRING_NAME>)
#
# Generates "string_headers/<STRING_NAME>.h" from the template
# util/string_header.h.in, substituting only @VAR@ references (@ONLY).
#   STRING      - text made available to the template.
#   STRING_NAME - identifier made available to the template; also the base
#                 name of the generated header.
# The argument names are presumably the ones the template refers to, so they
# must not be renamed - verify against util/string_header.h.in.
function(make_string_header STRING STRING_NAME)
    configure_file(
        util/string_header.h.in
        "string_headers/${STRING_NAME}.h"
        @ONLY)
endfunction()
# make_string_header_from_file(<file_> <STRING_NAME>)
#
# Reads the whole content of <file_> into STRING and generates
# "string_headers/<STRING_NAME>.h" from the template
# util/string_header.h.in, substituting only @VAR@ references (@ONLY).
#   file_       - file whose content is embedded.
#   STRING_NAME - identifier made available to the template; also the base
#                 name of the generated header.
# STRING and STRING_NAME are presumably the names the template refers to, so
# they must not be renamed - verify against util/string_header.h.in.
function(make_string_header_from_file file_ STRING_NAME)
    # file(READ) does not register its input with the build system. Without
    # this, editing the embedded file would leave a stale generated header
    # until CMake happened to re-run for another reason.
    set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS "${file_}")
    file(READ "${file_}" STRING)
    configure_file(util/string_header.h.in
                   "string_headers/${STRING_NAME}.h"
                   @ONLY)
endfunction()
159
160

# Header files with C++ strings
#
# The first CUDA toolkit include directory is embedded as a string.
list(GET CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES 0 cuda_include_path)
make_string_header("${cuda_include_path}" string_path_cuda_include)

# Source and header files whose full text is embedded as strings.
make_string_header_from_file(transpose/rtc/cast_transpose_fusion.cu string_code_transpose_rtc_cast_transpose_fusion_cu)
make_string_header_from_file(transpose/rtc/cast_transpose.cu string_code_transpose_rtc_cast_transpose_cu)
make_string_header_from_file(transpose/rtc/transpose.cu string_code_transpose_rtc_transpose_cu)
make_string_header_from_file(transpose/rtc/swap_first_dims.cu string_code_transpose_rtc_swap_first_dims_cu)
make_string_header_from_file(utils.cuh string_code_utils_cuh)
make_string_header_from_file(util/math.h string_code_util_math_h)

# Make the generated headers visible to the library's own sources.
target_include_directories(transformer_engine
                           PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/string_headers")

179
# Compiler options
#
# These sources are always compiled with --use_fast_math: the fused softmax
# kernels, the multi-tensor kernels and three of the fused attention files.
set_source_files_properties(
  fused_softmax/scaled_masked_softmax.cu
  fused_softmax/scaled_upper_triang_masked_softmax.cu
  fused_softmax/scaled_aligned_causal_masked_softmax.cu
  multi_tensor/adam.cu
  multi_tensor/compute_scale.cu
  multi_tensor/l2norm.cu
  multi_tensor/scale.cu
  multi_tensor/sgd.cu
  fused_attn/flash_attn.cu
  fused_attn/context_parallel.cu
  fused_attn/kv_cache.cu
  PROPERTIES COMPILE_OPTIONS "--use_fast_math")
193
194
195
196
197
# Opt-in (off by default): also compile the activation kernels and
# util/cast.cu with --use_fast_math.
option(NVTE_BUILD_ACTIVATION_WITH_FAST_MATH "Compile activation kernels with --use_fast_math option" OFF)
if(NVTE_BUILD_ACTIVATION_WITH_FAST_MATH)
  set_source_files_properties(
    activation/gelu.cu
    activation/relu.cu
    activation/swiglu.cu
    util/cast.cu
    PROPERTIES COMPILE_OPTIONS "--use_fast_math")
endif()
202
203
# Global CUDA compile flags: relaxed constexpr support and -O3.
string(APPEND CMAKE_CUDA_FLAGS " --expt-relaxed-constexpr")
string(APPEND CMAKE_CUDA_FLAGS " -O3")
Tim Moon's avatar
Tim Moon committed
204

205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
# Number of parallel build jobs
#
# Reported value, in order of precedence: the MAX_JOBS environment variable,
# then NVTE_BUILD_MAX_JOBS, then the literal "max". A variable that is unset
# or empty is skipped.
# Environment variables cannot be tested with if(ENV{...}); that form always
# evaluates to false, so the expanded value is compared against the empty
# string instead.
if(NOT "$ENV{MAX_JOBS}" STREQUAL "")
  set(BUILD_JOBS_STR "$ENV{MAX_JOBS}")
elseif(NOT "$ENV{NVTE_BUILD_MAX_JOBS}" STREQUAL "")
  set(BUILD_JOBS_STR "$ENV{NVTE_BUILD_MAX_JOBS}")
else()
  set(BUILD_JOBS_STR "max")
endif()
message(STATUS "Parallel build jobs: ${BUILD_JOBS_STR}")

# Number of threads per parallel build job
#
# Read from the NVTE_BUILD_THREADS_PER_JOB environment variable and passed
# to the CUDA compiler via --threads. The value falls back to 1 when the
# variable is unset, empty, or holds a value CMake treats as false.
# NOTE(review): a value of 0 is therefore also replaced by 1 - confirm that
# this is intended.
set(BUILD_THREADS_PER_JOB $ENV{NVTE_BUILD_THREADS_PER_JOB})
if(NOT BUILD_THREADS_PER_JOB)
  set(BUILD_THREADS_PER_JOB 1)
endif()
string(APPEND CMAKE_CUDA_FLAGS " --threads ${BUILD_THREADS_PER_JOB}")
message(STATUS "Threads per parallel build job: ${BUILD_THREADS_PER_JOB}")

Tim Moon's avatar
Tim Moon committed
223
224
# Install library: the shared library goes directly into the install prefix.
install(
  TARGETS transformer_engine
  DESTINATION .)