#
# Attempt to find the python package that uses the same python executable as
# `EXECUTABLE` and is one of the `SUPPORTED_VERSIONS`.
#
# Arguments:
#   EXECUTABLE         - Path to the python interpreter to build against.
#   SUPPORTED_VERSIONS - First supported `<major>.<minor>` version; any extra
#                        positional arguments are treated as further supported
#                        versions.
#
# Sets `Python_EXECUTABLE` and runs `find_package(Python ...)` for the
# Interpreter, Development.Module and Development.SABIModule components.
# Issues a FATAL_ERROR when python is not found or when its
# `<major>.<minor>` version is not in the supported list.
#
# This is a macro, so `Python_EXECUTABLE`, `EXECUTABLE`, `_VER`,
# `_SUPPORTED_VERSIONS_LIST` and the `Python_*` results are set in the
# caller's scope.
#
# NOTE(review): inside a macro `${EXECUTABLE}` is a textual substitution of
# the argument, so the result that `file(REAL_PATH ...)` stores in the variable
# named `EXECUTABLE` is not what the later `${EXECUTABLE}` references expand
# to. Confirm whether symlink resolution was meant to take effect before
# changing this; the statements are kept exactly as they were.
#
macro (find_python_from_executable EXECUTABLE SUPPORTED_VERSIONS)
  file(REAL_PATH ${EXECUTABLE} EXECUTABLE)
  set(Python_EXECUTABLE ${EXECUTABLE})
  find_package(Python COMPONENTS Interpreter Development.Module Development.SABIModule)
  if (NOT Python_FOUND)
    message(FATAL_ERROR "Unable to find python matching: ${EXECUTABLE}.")
  endif()

  # Only the major.minor pair is compared against the supported versions.
  set(_VER "${Python_VERSION_MAJOR}.${Python_VERSION_MINOR}")
  set(_SUPPORTED_VERSIONS_LIST ${SUPPORTED_VERSIONS} ${ARGN})
  if (NOT _VER IN_LIST _SUPPORTED_VERSIONS_LIST)
    message(FATAL_ERROR
      "Python version (${_VER}) is not one of the supported versions: "
      "${_SUPPORTED_VERSIONS_LIST}.")
  endif()
  message(STATUS "Found python matching: ${EXECUTABLE}.")
endmacro()

#
# Run `EXPR` in python.  The standard output of python is stored in `OUT` and
# has trailing whitespace stripped.  If an error is encountered when running
# python, a fatal message `ERR_MSG` is issued.
#
# Arguments:
#   OUT     - Name of the variable (in the caller's scope) receiving stdout.
#   EXPR    - Python source passed to `${Python_EXECUTABLE} -c`.
#   ERR_MSG - Prefix of the FATAL_ERROR message; python's stderr is appended.
#
# `OUT` is always defined after a successful call: when python prints nothing
# it is set to the empty string.
#
function (run_python OUT EXPR ERR_MSG)
  execute_process(
    COMMAND
    "${Python_EXECUTABLE}" "-c" "${EXPR}"
    OUTPUT_VARIABLE PYTHON_OUT
    RESULT_VARIABLE PYTHON_ERROR_CODE
    ERROR_VARIABLE PYTHON_STDERR
    OUTPUT_STRIP_TRAILING_WHITESPACE)

  # A non-numeric result (e.g. the process could not be started) also fails
  # the EQUAL test and is therefore reported as an error.
  if(NOT PYTHON_ERROR_CODE EQUAL 0)
    message(FATAL_ERROR "${ERR_MSG}: ${PYTHON_STDERR}")
  endif()

  # Quoted on purpose: an unquoted empty value would turn this into
  # `set(<var> PARENT_SCOPE)`, which unsets the variable instead of setting it.
  set(${OUT} "${PYTHON_OUT}" PARENT_SCOPE)
endfunction()

#
# Import `PKG` in python, print the value of `EXPR`, and append whatever was
# printed to `CMAKE_PREFIX_PATH` (e.g. so the torch cmake configuration can be
# imported).
#
# Arguments:
#   PKG  - Python package to import before evaluating `EXPR`.
#   EXPR - Python expression whose printed value is appended.
#
# This is a macro: `CMAKE_PREFIX_PATH` and the scratch variable `_PREFIX_PATH`
# are updated in the caller's scope.
#
macro (append_cmake_prefix_path PKG EXPR)
  run_python(
    _PREFIX_PATH
    "import ${PKG}; print(${EXPR})"
    "Failed to locate ${PKG} path")
  list(APPEND CMAKE_PREFIX_PATH ${_PREFIX_PATH})
endmacro()

#
# Add a target named `hipify${NAME}` that runs the hipify preprocessor on a set
# of CUDA source files. The names of the corresponding "hipified" sources are
# stored in `OUT_SRCS`.
#
# Arguments:
#   OUT_SRCS  - Name of the variable (in the caller's scope) receiving the
#               hipified source paths followed by the untouched C++ sources.
#   NAME      - Suffix for the generated `hipify${NAME}` target.
#   ORIG_SRCS - Source list, passed as a single (quoted) argument.
#
function (hipify_sources_target OUT_SRCS NAME ORIG_SRCS)
  #
  # Split into C++ and non-C++ (i.e. CUDA) sources. The pattern is anchored on
  # the file extension: the dot is escaped and the alternatives are grouped,
  # so a path that merely contains "cpp" or ".cc" is not treated as C++.
  #
  set(SRCS ${ORIG_SRCS})
  set(CXX_SRCS ${ORIG_SRCS})
  list(FILTER SRCS EXCLUDE REGEX "\\.(cc|cpp|hip)$")
  list(FILTER CXX_SRCS INCLUDE REGEX "\\.(cc|cpp|hip)$")

  #
  # Generate ROCm/HIP source file names from CUDA file names.
  # Since HIP files are generated code, they will appear in the build area
  # `CMAKE_CURRENT_BINARY_DIR` directory rather than the original csrc dir.
  #
  set(HIP_SRCS)
  foreach (SRC ${SRCS})
    string(REGEX REPLACE "\\.cu$" ".hip" SRC "${SRC}")
    # Every "cuda" in the path becomes "hip", as before.
    string(REPLACE "cuda" "hip" SRC "${SRC}")
    list(APPEND HIP_SRCS "${CMAKE_CURRENT_BINARY_DIR}/${SRC}")
  endforeach()

  set(CSRC_BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR}/csrc)
  add_custom_target(
    hipify${NAME}
    COMMAND ${Python_EXECUTABLE} ${CMAKE_SOURCE_DIR}/cmake/hipify.py -p ${CMAKE_SOURCE_DIR}/csrc -o ${CSRC_BUILD_DIR} ${SRCS}
    DEPENDS ${CMAKE_SOURCE_DIR}/cmake/hipify.py ${SRCS}
    BYPRODUCTS ${HIP_SRCS}
    COMMENT "Running hipify on ${NAME} extension source files."
    VERBATIM)

  # Swap out original extension sources with hipified sources.
  list(APPEND HIP_SRCS ${CXX_SRCS})
  set(${OUT_SRCS} "${HIP_SRCS}" PARENT_SCOPE)
endfunction()

#
# Get additional GPU compiler flags from torch.
#
# Arguments:
#   OUT_GPU_FLAGS - Name of the variable (in the caller's scope) receiving the
#                   flag list.
#   GPU_LANG      - "CUDA" or "HIP"; any other value yields an empty list.
#
function (get_torch_gpu_compiler_flags OUT_GPU_FLAGS GPU_LANG)
  # Start from a known-empty list so a `GPU_FLAGS` variable inherited from the
  # caller can never leak into the result.
  set(GPU_FLAGS "")

  if ("${GPU_LANG}" STREQUAL "CUDA")
    #
    # Get common NVCC flags from torch.
    #
    run_python(GPU_FLAGS
      "from torch.utils.cpp_extension import COMMON_NVCC_FLAGS; print(';'.join(COMMON_NVCC_FLAGS))"
      "Failed to determine torch nvcc compiler flags")

    if (CUDA_VERSION VERSION_GREATER_EQUAL 11.8)
      list(APPEND GPU_FLAGS "-DENABLE_FP8")
    endif()
    if (CUDA_VERSION VERSION_GREATER_EQUAL 12.0)
      # Drop the torch-provided defines that disable the half/bfloat16
      # operators and conversions.
      list(REMOVE_ITEM GPU_FLAGS
        "-D__CUDA_NO_HALF_OPERATORS__"
        "-D__CUDA_NO_HALF_CONVERSIONS__"
        "-D__CUDA_NO_BFLOAT16_CONVERSIONS__"
        "-D__CUDA_NO_HALF2_OPERATORS__")
    endif()

  elseif("${GPU_LANG}" STREQUAL "HIP")
    #
    # Get common HIP/HIPCC flags from torch.
    #
    run_python(GPU_FLAGS
      "import torch.utils.cpp_extension as t; print(';'.join(t.COMMON_HIP_FLAGS + t.COMMON_HIPCC_FLAGS))"
      "Failed to determine torch HIP compiler flags")

    list(APPEND GPU_FLAGS
      "-DUSE_ROCM"
      "-DENABLE_FP8"
      "-U__HIP_NO_HALF_CONVERSIONS__"
      "-U__HIP_NO_HALF_OPERATORS__"
      "-Werror=unused-variable"
      "-fno-gpu-rdc")

  endif()
  set(${OUT_GPU_FLAGS} "${GPU_FLAGS}" PARENT_SCOPE)
endfunction()

# Find libgomp that gets shipped with PyTorch wheel and create a shim dir with:
#   libgomp.so    -> libgomp-<hash>.so...
#   libgomp.so.1  -> libgomp-<hash>.so...
# OUTPUT: TORCH_GOMP_SHIM_DIR  ("" if not found, or if the links could not be
#         created)
function(vllm_prepare_torch_gomp_shim TORCH_GOMP_SHIM_DIR)
  set(${TORCH_GOMP_SHIM_DIR} "" PARENT_SCOPE)

  # Use run_python to locate vendored libgomp. The script itself never raises
  # when nothing is found (it prints an empty line); a python invocation that
  # fails outright is still reported as a fatal error by run_python.
  # Matches are sorted per directory so the chosen library is deterministic,
  # with `torch.libs` taking priority over `torch/lib`.
  run_python(_VLLM_TORCH_GOMP_PATH
    "
import os, glob
import torch
torch_pkg = os.path.dirname(torch.__file__)
site_root = os.path.dirname(torch_pkg)

# Search both torch.libs and torch/lib
roots = [os.path.join(site_root, 'torch.libs'), os.path.join(torch_pkg, 'lib')]
candidates = []
for root in roots:
    if not os.path.isdir(root):
        continue
    candidates.extend(sorted(glob.glob(os.path.join(root, 'libgomp*.so*'))))

print(candidates[0] if candidates else '')
"
    "failed to probe for libgomp")

  if("${_VLLM_TORCH_GOMP_PATH}" STREQUAL "" OR NOT EXISTS "${_VLLM_TORCH_GOMP_PATH}")
    return()
  endif()

  # Create shim under the build tree
  set(_shim "${CMAKE_BINARY_DIR}/gomp_shim")
  file(MAKE_DIRECTORY "${_shim}")

  foreach(_link_name IN ITEMS libgomp.so libgomp.so.1)
    # Remove any stale link first, then (re)create it.
    execute_process(COMMAND "${CMAKE_COMMAND}" -E rm -f "${_shim}/${_link_name}")
    execute_process(
      COMMAND "${CMAKE_COMMAND}" -E create_symlink
              "${_VLLM_TORCH_GOMP_PATH}" "${_shim}/${_link_name}"
      RESULT_VARIABLE _link_result)
    if(NOT _link_result EQUAL 0)
      # Do not advertise a shim directory that is missing one of its links.
      message(WARNING
        "Failed to create ${_shim}/${_link_name} -> ${_VLLM_TORCH_GOMP_PATH}; "
        "not using the torch libgomp shim.")
      return()
    endif()
  endforeach()

  set(${TORCH_GOMP_SHIM_DIR} "${_shim}" PARENT_SCOPE)
endfunction()

# Macro for converting a `gencode` version number to a cmake version number.
# Preserves architecture-specific suffixes (a/f) needed for correct
# __CUDA_ARCH_FAMILY_SPECIFIC__ definition. E.g. "121a" -> "12.1a".
#
# Arguments:
#   OUT_VER - Name of the variable receiving the converted version.
#   IN_STR  - Compute capability such as "75", "90a" or "121a". An empty
#             string yields an empty result.
macro(string_to_ver OUT_VER IN_STR)
  # The last digit (plus optional a/f suffix) becomes the minor version and
  # everything before it the major version. The input is quoted so that an
  # empty value is still passed to string() as an argument.
  string(REGEX REPLACE "([0-9]+)([0-9][af]?)" "\\1.\\2" ${OUT_VER} "${IN_STR}")
endmacro()

#
# Clear all `-gencode` flags from `CMAKE_CUDA_FLAGS` and store them in
# the variable named by `CUDA_ARCH_FLAGS`.
#
# Example:
#   CMAKE_CUDA_FLAGS="-Wall -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75"
#   clear_cuda_arches(CUDA_ARCH_FLAGS)
#   CUDA_ARCH_FLAGS="-gencode arch=compute_70,code=sm_70;-gencode arch=compute_75,code=sm_75"
#   CMAKE_CUDA_FLAGS="-Wall"
#
# This is a macro because it rewrites `CMAKE_CUDA_FLAGS` in the caller's scope.
#
macro(clear_cuda_arches CUDA_ARCH_FLAGS)
  # Extract all `-gencode` flags from `CMAKE_CUDA_FLAGS`. `${CUDA_ARCH_FLAGS}`
  # expands to the variable name the caller passed in, so the result lands in
  # that variable whatever it is called.
  string(REGEX MATCHALL "-gencode arch=[^ ]+" ${CUDA_ARCH_FLAGS}
    "${CMAKE_CUDA_FLAGS}")

  # Remove all `-gencode` flags from `CMAKE_CUDA_FLAGS` since they will be modified
  # and passed back via the `CUDA_ARCHITECTURES` property.
  string(REGEX REPLACE "-gencode arch=[^ ]+ *" "" CMAKE_CUDA_FLAGS
    "${CMAKE_CUDA_FLAGS}")
  # Drop the blank left in front of the first removed flag.
  string(STRIP "${CMAKE_CUDA_FLAGS}" CMAKE_CUDA_FLAGS)
endmacro()

#
# Extract the unique CUDA architectures from a list of `-gencode` flags whose
# compute capability has the form `<major><minor>[<letter>]`, convert them to
# the form `<major>.<minor>[<letter>]`, dedupe them, sort them in ascending
# order and store them in `OUT_ARCHES`.
#
# Arguments:
#   OUT_ARCHES      - Name of the variable (in the caller's scope) receiving
#                     the result; set to the empty string when nothing matches.
#   CUDA_ARCH_FLAGS - The list of flags, passed as a single (quoted) argument.
#                     Entries without an `arch=compute_<n>` part are skipped.
#
# Example:
#   CUDA_ARCH_FLAGS="-gencode arch=compute_75,code=sm_75;...;-gencode arch=compute_90a,code=sm_90a"
#   extract_unique_cuda_archs_ascending(OUT_ARCHES "${CUDA_ARCH_FLAGS}")
#   OUT_ARCHES="7.5;...;9.0a"
function(extract_unique_cuda_archs_ascending OUT_ARCHES CUDA_ARCH_FLAGS)
  set(_CUDA_ARCHES "")
  foreach(_ARCH ${CUDA_ARCH_FLAGS})
    string(REGEX MATCH "arch=compute_([0-9]+[af]?)" _COMPUTE "${_ARCH}")
    if (NOT _COMPUTE)
      # Not a gencode flag we understand; there is nothing to convert.
      continue()
    endif()

    # CMAKE_MATCH_1 is the compute capability captured above, e.g. "90a".
    string_to_ver(_COMPUTE_VER "${CMAKE_MATCH_1}")
    list(APPEND _CUDA_ARCHES "${_COMPUTE_VER}")
  endforeach()

  if (_CUDA_ARCHES)
    list(REMOVE_DUPLICATES _CUDA_ARCHES)
    list(SORT _CUDA_ARCHES COMPARE NATURAL ORDER ASCENDING)
  endif()
  set(${OUT_ARCHES} "${_CUDA_ARCHES}" PARENT_SCOPE)
endfunction()

#
# For a specific file set the `-gencode` flag in compile options conditionally
# for the CUDA language.
#
# Keyword arguments:
#   SRCS <files...> - Source files whose COMPILE_OPTIONS property is appended to.
#   ARCH <arch>     - Value for `arch=`, e.g. "compute_75".
#   CODE <code>     - Value for `code=`, e.g. "sm_75".
#
# Example:
#   set_gencode_flag_for_srcs(
#     SRCS "foo.cu"
#     ARCH "compute_75"
#     CODE "sm_75")
#   adds: "-gencode arch=compute_75,code=sm_75" to the compile options for
#    `foo.cu` (only for the CUDA language).
#
# Defined as a function so the parsed `arg_*` values and `_FLAG` stay local
# and cannot clobber same-named variables in the caller.
#
function(set_gencode_flag_for_srcs)
  set(options)
  set(oneValueArgs ARCH CODE)
  set(multiValueArgs SRCS)
  cmake_parse_arguments(arg "${options}" "${oneValueArgs}"
                        "${multiValueArgs}" ${ARGN} )

  # Two list elements (`-gencode` and its value) wrapped in one generator
  # expression so both are emitted only when compiling CUDA.
  set(_FLAG -gencode arch=${arg_ARCH},code=${arg_CODE})
  set_property(
    SOURCE ${arg_SRCS}
    APPEND PROPERTY
    COMPILE_OPTIONS "$<$<COMPILE_LANGUAGE:CUDA>:${_FLAG}>"
  )

  message(DEBUG "Setting gencode flag for ${arg_SRCS}: ${_FLAG}")
endfunction()

#
# For a list of source files set the `-gencode` flags in the files specific
#  compile options (specifically for the CUDA language).
#
# arguments are:
#  SRCS: list of source files
#  CUDA_ARCHS: list of CUDA architectures in the form `<major>.<minor>[letter]`,
#    each optionally followed by `+PTX` to also embed PTX for that architecture
#  BUILD_PTX_FOR_ARCH: optional architecture in the same form; PTX code is
#    built for it when the highest entry of CUDA_ARCHS is greater than or equal
#    to it.
#
# Defined as a function so the parsed `arg_*` values and loop temporaries stay
# local to this call.
#
function(set_gencode_flags_for_srcs)
  set(options)
  set(oneValueArgs BUILD_PTX_FOR_ARCH)
  set(multiValueArgs SRCS CUDA_ARCHS)
  cmake_parse_arguments(arg "${options}" "${oneValueArgs}"
                        "${multiValueArgs}" ${ARGN} )

  foreach(_ARCH ${arg_CUDA_ARCHS})
    # "9.0a+PTX" -> base "9.0a" -> stripped "90a".
    string(REPLACE "+PTX" "" _BASE_ARCH "${_ARCH}")
    string(REPLACE "." "" _STRIPPED_ARCH "${_BASE_ARCH}")

    # SASS for the architecture is always generated.
    set_gencode_flag_for_srcs(
      SRCS ${arg_SRCS}
      ARCH "compute_${_STRIPPED_ARCH}"
      CODE "sm_${_STRIPPED_ARCH}")

    # handle +PTX suffix: generate both sm and ptx codes if requested
    if(NOT "${_BASE_ARCH}" STREQUAL "${_ARCH}")
      set_gencode_flag_for_srcs(
        SRCS ${arg_SRCS}
        ARCH "compute_${_STRIPPED_ARCH}"
        CODE "compute_${_STRIPPED_ARCH}")
    endif()
  endforeach()

  # Test the variable itself rather than its expanded value so a suffixed
  # architecture such as "9.0a" is not mistaken for an undefined variable
  # name, and skip the block when there are no architectures to compare with.
  if (arg_BUILD_PTX_FOR_ARCH AND arg_CUDA_ARCHS)
    list(SORT arg_CUDA_ARCHS COMPARE NATURAL ORDER ASCENDING)
    list(GET arg_CUDA_ARCHS -1 _HIGHEST_ARCH)
    if ("${_HIGHEST_ARCH}" VERSION_GREATER_EQUAL "${arg_BUILD_PTX_FOR_ARCH}")
      string(REPLACE "." "" _PTX_ARCH "${arg_BUILD_PTX_FOR_ARCH}")
      set_gencode_flag_for_srcs(
        SRCS ${arg_SRCS}
        ARCH "compute_${_PTX_ARCH}"
        CODE "compute_${_PTX_ARCH}")
    endif()
  endif()
endfunction()

#
# For the given `SRC_CUDA_ARCHS` list of gencode versions in the form
#  `<major>.<minor>[letter]` compute the "loose intersection" with the
#  `TGT_CUDA_ARCHS` list of gencodes. We also support the `+PTX` suffix in
#  `SRC_CUDA_ARCHS` which indicates that the PTX code should be built when there
#  is a CUDA_ARCH in `TGT_CUDA_ARCHS` that is equal to or larger than the
#  architecture in `SRC_CUDA_ARCHS`.
# The loose intersection is defined as:
#   { max{ x \in src | x <= y, x ~ y } | y \in tgt, { x \in src | x <= y, x ~ y } != {} }
#  where `<=` is the version comparison operator and `x ~ y` holds when x and
#  y have the same major version or x was given with `+PTX`.
# In other words, for each version in `TGT_CUDA_ARCHS` find the highest version
#  in `SRC_CUDA_ARCHS` that is less or equal to the version in `TGT_CUDA_ARCHS`.
# Entries with an a/f suffix are matched first: if x.ya (or x.yf) is in
#  `SRC_CUDA_ARCHS` and x.y is in `TGT_CUDA_ARCHS` then x.ya is added to the
#  result and both entries are taken out of the general matching above.
# Both lists are passed by value, each as a single (quoted) argument.
# The result is stored in `OUT_CUDA_ARCHS`; it is deduplicated but not sorted
#  (suffix matches come first).
#
# Example:
#   SRC_CUDA_ARCHS="7.5;8.0;8.6;9.0;9.0a"
#   TGT_CUDA_ARCHS="8.0;8.9;9.0"
#   cuda_archs_loose_intersection(OUT_CUDA_ARCHS "${SRC_CUDA_ARCHS}" "${TGT_CUDA_ARCHS}")
#   OUT_CUDA_ARCHS="9.0a;8.0;8.6"
#
# Example With PTX:
#   SRC_CUDA_ARCHS="8.0+PTX"
#   TGT_CUDA_ARCHS="9.0"
#   cuda_archs_loose_intersection(OUT_CUDA_ARCHS "${SRC_CUDA_ARCHS}" "${TGT_CUDA_ARCHS}")
#   OUT_CUDA_ARCHS="8.0+PTX"
#
function(cuda_archs_loose_intersection OUT_CUDA_ARCHS SRC_CUDA_ARCHS TGT_CUDA_ARCHS)
  set(_SRC_CUDA_ARCHS "${SRC_CUDA_ARCHS}")
  set(_TGT_CUDA_ARCHS ${TGT_CUDA_ARCHS})

  # handle +PTX suffix: separate base arch for matching, record PTX requests
  set(_PTX_ARCHS)
  foreach(_arch ${_SRC_CUDA_ARCHS})
    if(_arch MATCHES "\\+PTX$")
      string(REPLACE "+PTX" "" _base "${_arch}")
      list(APPEND _PTX_ARCHS "${_base}")
      list(REMOVE_ITEM _SRC_CUDA_ARCHS "${_arch}")
      list(APPEND _SRC_CUDA_ARCHS "${_base}")
    endif()
  endforeach()
  list(REMOVE_DUPLICATES _PTX_ARCHS)
  list(REMOVE_DUPLICATES _SRC_CUDA_ARCHS)

  # Handle architecture-specific suffixes (a/f) for SRC entries.
  # First try exact base match (x.y), then cross-suffix match (x.ya / x.yf).
  # For 'f' (family) suffix: if no exact/cross match, fall back to major-version
  # match — e.g. SRC="12.0f" matches TGT="12.1a" since SM121 is in the SM12x
  # family. The output uses TGT's value to preserve the user's compilation flags.
  set(_CUDA_ARCHS)
  foreach(_arch ${_SRC_CUDA_ARCHS})
    if(_arch MATCHES "[af]$")
      list(REMOVE_ITEM _SRC_CUDA_ARCHS "${_arch}")
      string(REGEX REPLACE "[af]$" "" _base "${_arch}")
      # The exact-base test looks at the original target list, not the
      # working copy that entries are removed from.
      if ("${_base}" IN_LIST TGT_CUDA_ARCHS)
        list(REMOVE_ITEM _TGT_CUDA_ARCHS "${_base}")
        list(APPEND _CUDA_ARCHS "${_arch}")
      elseif("${_base}a" IN_LIST _TGT_CUDA_ARCHS)
        list(REMOVE_ITEM _TGT_CUDA_ARCHS "${_base}a")
        list(APPEND _CUDA_ARCHS "${_base}a")
      elseif("${_base}f" IN_LIST _TGT_CUDA_ARCHS)
        list(REMOVE_ITEM _TGT_CUDA_ARCHS "${_base}f")
        list(APPEND _CUDA_ARCHS "${_base}f")
      elseif(_arch MATCHES "f$")
        # Family suffix: match any TGT entry in the same major version family.
        string(REGEX REPLACE "^([0-9]+)\\..*$" "\\1" _src_major "${_base}")
        foreach(_tgt ${_TGT_CUDA_ARCHS})
          string(REGEX REPLACE "[af]$" "" _tgt_base "${_tgt}")
          string(REGEX REPLACE "^([0-9]+)\\..*$" "\\1" _tgt_major "${_tgt_base}")
          if(_tgt_major STREQUAL _src_major)
            list(REMOVE_ITEM _TGT_CUDA_ARCHS "${_tgt}")
            list(APPEND _CUDA_ARCHS "${_tgt}")
            break()
          endif()
        endforeach()
      endif()
    endif()
  endforeach()

  # Symmetric handling: if TGT has x.ya/f and SRC has x.y (without suffix),
  # preserve TGT's suffix in the output.
  set(_tgt_copy ${_TGT_CUDA_ARCHS})
  foreach(_arch ${_tgt_copy})
    if(_arch MATCHES "[af]$")
      string(REGEX REPLACE "[af]$" "" _base "${_arch}")
      if ("${_base}" IN_LIST _SRC_CUDA_ARCHS)
        list(REMOVE_ITEM _TGT_CUDA_ARCHS "${_arch}")
        list(REMOVE_ITEM _SRC_CUDA_ARCHS "${_base}")
        list(APPEND _CUDA_ARCHS "${_arch}")
      endif()
    endif()
  endforeach()

  # Sorted ascending so the inner loop below can stop at the first source arch
  # that is greater than the target.
  list(SORT _SRC_CUDA_ARCHS COMPARE NATURAL ORDER ASCENDING)

  # for each ARCH in TGT_CUDA_ARCHS find the highest arch in SRC_CUDA_ARCHS that
  # is less or equal to ARCH (but has the same major version since SASS binary
  # compatibility is only forward compatible within the same major version).
  foreach(_ARCH ${_TGT_CUDA_ARCHS})
    set(_TMP_ARCH)
    # Extract the major version of the target arch
    string(REGEX REPLACE "^([0-9]+)\\..*$" "\\1" TGT_ARCH_MAJOR "${_ARCH}")
    foreach(_SRC_ARCH ${_SRC_CUDA_ARCHS})
      # Extract the major version of the source arch
      string(REGEX REPLACE "^([0-9]+)\\..*$" "\\1" SRC_ARCH_MAJOR "${_SRC_ARCH}")
      # Check version-less-or-equal, and allow PTX arches to match across majors
      if (_SRC_ARCH VERSION_LESS_EQUAL _ARCH)
        if (_SRC_ARCH IN_LIST _PTX_ARCHS OR SRC_ARCH_MAJOR STREQUAL TGT_ARCH_MAJOR)
          set(_TMP_ARCH "${_SRC_ARCH}")
        endif()
      else()
        # If we hit a version greater than the target, we can break
        break()
      endif()
    endforeach()

    # If we found a matching _TMP_ARCH, append it to _CUDA_ARCHS
    if (_TMP_ARCH)
      list(APPEND _CUDA_ARCHS "${_TMP_ARCH}")
    endif()
  endforeach()

  list(REMOVE_DUPLICATES _CUDA_ARCHS)

  # reapply +PTX suffix to architectures that requested PTX
  set(_FINAL_ARCHS)
  foreach(_arch ${_CUDA_ARCHS})
    if(_arch IN_LIST _PTX_ARCHS)
      list(APPEND _FINAL_ARCHS "${_arch}+PTX")
    else()
      list(APPEND _FINAL_ARCHS "${_arch}")
    endif()
  endforeach()
  set(_CUDA_ARCHS ${_FINAL_ARCHS})

  set(${OUT_CUDA_ARCHS} ${_CUDA_ARCHS} PARENT_SCOPE)
endfunction()

#
# Override the GPU architectures detected by cmake/torch and filter them by
# `GPU_SUPPORTED_ARCHES`. Sets the final set of architectures in
# `GPU_ARCHES`. This only applies to the HIP language since for CUDA we set
# the architectures on a per file basis.
#
# Arguments:
#   GPU_ARCHES           - Name of the variable receiving the filtered list.
#   GPU_LANG             - GPU language; only "HIP" has any effect.
#   GPU_SUPPORTED_ARCHES - Supported architectures; extra positional arguments
#                          are treated as further supported architectures.
#
# Note: this is defined as a macro, so `GPU_ARCHES` and the helper variables
# `HIP_ARCHITECTURES` and `_GPU_SUPPORTED_ARCHES_LIST` are set directly in the
# caller's scope.
#
macro(override_gpu_arches GPU_ARCHES GPU_LANG GPU_SUPPORTED_ARCHES)
  set(_GPU_SUPPORTED_ARCHES_LIST ${GPU_SUPPORTED_ARCHES} ${ARGN})
  message(STATUS "${GPU_LANG} supported arches: ${_GPU_SUPPORTED_ARCHES_LIST}")

  # Quoted so the language name is compared as a string and never treated as
  # the name of a variable to dereference.
  if ("${GPU_LANG}" STREQUAL "HIP")
    #
    # `GPU_ARCHES` controls the `--offload-arch` flags.
    #
    # If PYTORCH_ROCM_ARCH env variable exists, then we take it as a list,
    # if not, then we use CMAKE_HIP_ARCHITECTURES which was generated by calling
    # "rocm_agent_enumerator" in "enable_language(HIP)"
    # (in file Modules/CMakeDetermineHIPCompiler.cmake)
    #
    if(DEFINED ENV{PYTORCH_ROCM_ARCH})
      set(HIP_ARCHITECTURES $ENV{PYTORCH_ROCM_ARCH})
    else()
      set(HIP_ARCHITECTURES ${CMAKE_HIP_ARCHITECTURES})
    endif()
    #
    # Find the intersection of the supported + detected architectures to
    # set the module architecture flags.
    #
    set(${GPU_ARCHES})
    foreach (_ARCH ${HIP_ARCHITECTURES})
      if (_ARCH IN_LIST _GPU_SUPPORTED_ARCHES_LIST)
        list(APPEND ${GPU_ARCHES} ${_ARCH})
      endif()
    endforeach()

    # `${GPU_ARCHES}` expands to the output variable's name, which if() then
    # evaluates; an empty or unset list is false.
    if(NOT ${GPU_ARCHES})
      message(FATAL_ERROR
        "None of the detected ROCm architectures: ${HIP_ARCHITECTURES} is"
        " supported. Supported ROCm architectures are: ${_GPU_SUPPORTED_ARCHES_LIST}.")
    endif()
  endif()
endmacro()

#
# Define a target named `MOD_NAME` for a single extension. The
# arguments are:
#
# DESTINATION <dest>         - Module destination directory.
# LANGUAGE <lang>            - The language for this module, e.g. CUDA, HIP,
#                              CXX, etc.
# SOURCES <sources>          - List of source files relative to CMakeLists.txt
#                              directory.
#
# Optional arguments:
#
# ARCHITECTURES <arches>     - A list of target architectures in cmake format.
#                              For GPU, refer to CMAKE_CUDA_ARCHITECTURES and
#                              CMAKE_HIP_ARCHITECTURES for more info.
#                              ARCHITECTURES will use cmake's defaults if
#                              not provided.
# COMPILE_FLAGS <flags>      - Extra compiler flags passed to NVCC/hip.
# INCLUDE_DIRECTORIES <dirs> - Extra include directories.
# LIBRARIES <libraries>      - Extra link libraries.
# WITH_SOABI                 - Generate library with python SOABI suffix name.
# USE_SABI <version>         - Use python stable api <version>
#
# Note: optimization level/debug info is set via cmake build type.
#
function (define_extension_target MOD_NAME)
  cmake_parse_arguments(PARSE_ARGV 1
    ARG
    "WITH_SOABI"
    "DESTINATION;LANGUAGE;USE_SABI"
    "SOURCES;ARCHITECTURES;COMPILE_FLAGS;INCLUDE_DIRECTORIES;LIBRARIES")

  # Add hipify preprocessing step when building with HIP/ROCm.
  if (ARG_LANGUAGE STREQUAL "HIP")
    hipify_sources_target(ARG_SOURCES ${MOD_NAME} "${ARG_SOURCES}")
  endif()

  if (ARG_WITH_SOABI)
    set(SOABI_KEYWORD WITH_SOABI)
  else()
    set(SOABI_KEYWORD "")
  endif()

  run_python(IS_FREETHREADED_PYTHON
    "import sysconfig; print(1 if sysconfig.get_config_var(\"Py_GIL_DISABLED\") else 0)"
    "Failed to determine whether interpreter is free-threaded")

  # Free-threaded Python doesn't yet support the stable ABI (see PEP 803/809),
  # so avoid using the stable ABI under free-threading only.
  if (ARG_USE_SABI AND NOT IS_FREETHREADED_PYTHON)
    Python_add_library(${MOD_NAME} MODULE USE_SABI ${ARG_USE_SABI} ${SOABI_KEYWORD} "${ARG_SOURCES}")
  else()
    Python_add_library(${MOD_NAME} MODULE ${SOABI_KEYWORD} "${ARG_SOURCES}")
  endif()

  if (ARG_LANGUAGE STREQUAL "HIP")
    # Make this target dependent on the hipify preprocessor step.
    add_dependencies(${MOD_NAME} hipify${MOD_NAME})
    # Make sure we include the hipified versions of the headers, and avoid conflicts with the ones in the original source folder
    target_include_directories(${MOD_NAME} PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/csrc
      ${ARG_INCLUDE_DIRECTORIES})
  else()
    target_include_directories(${MOD_NAME} PRIVATE csrc
      ${ARG_INCLUDE_DIRECTORIES})
  endif()

  if (ARG_ARCHITECTURES)
    set_target_properties(${MOD_NAME} PROPERTIES
      ${ARG_LANGUAGE}_ARCHITECTURES "${ARG_ARCHITECTURES}")
  endif()

  # Quoted so the whole flag list stays inside one generator expression.
  target_compile_options(${MOD_NAME} PRIVATE
    "$<$<COMPILE_LANGUAGE:${ARG_LANGUAGE}>:${ARG_COMPILE_FLAGS}>")

  target_compile_definitions(${MOD_NAME} PRIVATE
    "TORCH_EXTENSION_NAME=${MOD_NAME}")

  # `torch` and the caller-supplied libraries are linked once, for every
  # language; only the language-specific additions follow below.
  target_link_libraries(${MOD_NAME} PRIVATE torch ${ARG_LIBRARIES})

  # Don't use `TORCH_LIBRARIES` for CUDA since it pulls in a bunch of
  # dependencies that are not necessary and may not be installed.
  if (ARG_LANGUAGE STREQUAL "CUDA")
    target_link_libraries(${MOD_NAME} PRIVATE CUDA::cudart CUDA::cuda_driver)
  else()
    target_link_libraries(${MOD_NAME} PRIVATE ${TORCH_LIBRARIES})
  endif()

  install(TARGETS ${MOD_NAME} LIBRARY DESTINATION ${ARG_DESTINATION} COMPONENT ${MOD_NAME})
endfunction()