Unverified Commit 65b34702 authored by Xin Yao, committed by GitHub
Browse files

[Makefile] Refactor CUDA makefile and add Hopper (SM90) to default build (#4830)



* Update CUDA.cmake to align with PyTorch's

* add Ada and Hopper

* add more comments

* resolve comments
Co-authored-by: Triston <triston.cao@gmail.com>
parent c8ea9fa4
...@@ -10,9 +10,16 @@ endif() ...@@ -10,9 +10,16 @@ endif()
include(CheckCXXCompilerFlag) include(CheckCXXCompilerFlag)
check_cxx_compiler_flag("-std=c++14" SUPPORT_CXX14) check_cxx_compiler_flag("-std=c++14" SUPPORT_CXX14)
set(dgl_known_gpu_archs "35 50 60 70") set(dgl_known_gpu_archs "35" "50" "60" "70")
set(dgl_cuda_arch_ptx "70")
if (CUDA_VERSION_MAJOR GREATER_EQUAL "11") if (CUDA_VERSION_MAJOR GREATER_EQUAL "11")
set(dgl_known_gpu_archs "${dgl_known_gpu_archs} 80") list(APPEND dgl_known_gpu_archs "80")
set(dgl_cuda_arch_ptx "80")
endif()
# CMake 3.5 doesn't support VERSION_GREATER_EQUAL
if (NOT CUDA_VERSION VERSION_LESS "11.8")
list(APPEND dgl_known_gpu_archs "90")
set(dgl_cuda_arch_ptx "90")
endif() endif()
################################################################################################ ################################################################################################
...@@ -63,10 +70,14 @@ set(CUDA_gpu_detect_output "") ...@@ -63,10 +70,14 @@ set(CUDA_gpu_detect_output "")
# nvcc outputs text containing line breaks when building with MSVC. # nvcc outputs text containing line breaks when building with MSVC.
# The line below prevents CMake from inserting a variable with line # The line below prevents CMake from inserting a variable with line
# breaks in the cache # breaks in the cache
message(STATUS "Found CUDA arch ${__nvcc_out}") message(STATUS "Found GPU arch ${__nvcc_out}")
string(REGEX MATCH "([1-9].[0-9])" __nvcc_out "${__nvcc_out}") string(REGEX MATCH "([1-9].[0-9])" __nvcc_out "${__nvcc_out}")
string(REPLACE "2.1" "2.1(2.0)" __nvcc_out "${__nvcc_out}") if(__nvcc_out VERSION_LESS "3.5")
set(CUDA_gpu_detect_output ${__nvcc_out} CACHE INTERNAL "Returned GPU architetures from mshadow_detect_gpus tool" FORCE) # drop support for cc < 3.5 and build for all known archs.
message(WARNING "GPU arch less than 3.5 is not supported.")
else()
set(CUDA_gpu_detect_output ${__nvcc_out} CACHE INTERNAL "Returned GPU architetures from mshadow_detect_gpus tool" FORCE)
endif()
else() else()
message(WARNING "Running GPU detection script with nvcc failed: ${__nvcc_out}") message(WARNING "Running GPU detection script with nvcc failed: ${__nvcc_out}")
endif() endif()
...@@ -86,8 +97,8 @@ endfunction() ...@@ -86,8 +97,8 @@ endfunction()
# Usage: # Usage:
# dgl_select_nvcc_arch_flags(out_variable) # dgl_select_nvcc_arch_flags(out_variable)
function(dgl_select_nvcc_arch_flags out_variable) function(dgl_select_nvcc_arch_flags out_variable)
# List of arch names # List of arch names. Turing and Ada don't have a new major version, so they are not added to default build.
set(__archs_names "Fermi" "Kepler" "Maxwell" "Pascal" "Volta" "Ampere" "All" "Manual") set(__archs_names "Kepler" "Maxwell" "Pascal" "Volta" "Turing" "Ampere" "Ada" "Hopper" "All" "Manual")
set(__archs_name_default "All") set(__archs_name_default "All")
if(NOT CMAKE_CROSSCOMPILING) if(NOT CMAKE_CROSSCOMPILING)
list(APPEND __archs_names "Auto") list(APPEND __archs_names "Auto")
...@@ -107,36 +118,53 @@ function(dgl_select_nvcc_arch_flags out_variable) ...@@ -107,36 +118,53 @@ function(dgl_select_nvcc_arch_flags out_variable)
if(${CUDA_ARCH_NAME} STREQUAL "Manual") if(${CUDA_ARCH_NAME} STREQUAL "Manual")
set(CUDA_ARCH_BIN ${dgl_known_gpu_archs} CACHE STRING "Specify 'real' GPU architectures to build binaries for, BIN(PTX) format is supported") set(CUDA_ARCH_BIN ${dgl_known_gpu_archs} CACHE STRING "Specify 'real' GPU architectures to build binaries for, BIN(PTX) format is supported")
set(CUDA_ARCH_PTX "50" CACHE STRING "Specify 'virtual' PTX architectures to build PTX intermediate code for") set(CUDA_ARCH_PTX ${dgl_cuda_arch_ptx} CACHE STRING "Specify 'virtual' PTX architectures to build PTX intermediate code for")
mark_as_advanced(CUDA_ARCH_BIN CUDA_ARCH_PTX) mark_as_advanced(CUDA_ARCH_BIN CUDA_ARCH_PTX)
else() else()
unset(CUDA_ARCH_BIN CACHE) unset(CUDA_ARCH_BIN CACHE)
unset(CUDA_ARCH_PTX CACHE) unset(CUDA_ARCH_PTX CACHE)
endif() endif()
if(${CUDA_ARCH_NAME} STREQUAL "Fermi") if(${CUDA_ARCH_NAME} STREQUAL "Kepler")
set(__cuda_arch_bin "20 21(20)") set(__cuda_arch_bin "35")
elseif(${CUDA_ARCH_NAME} STREQUAL "Kepler") set(__cuda_arch_ptx "35")
set(__cuda_arch_bin "30 35")
elseif(${CUDA_ARCH_NAME} STREQUAL "Maxwell") elseif(${CUDA_ARCH_NAME} STREQUAL "Maxwell")
set(__cuda_arch_bin "50") set(__cuda_arch_bin "50")
set(__cuda_arch_ptx "50")
elseif(${CUDA_ARCH_NAME} STREQUAL "Pascal") elseif(${CUDA_ARCH_NAME} STREQUAL "Pascal")
set(__cuda_arch_bin "60 61") set(__cuda_arch_bin "60")
set(__cuda_arch_ptx "60")
elseif(${CUDA_ARCH_NAME} STREQUAL "Volta") elseif(${CUDA_ARCH_NAME} STREQUAL "Volta")
set(__cuda_arch_bin "70") set(__cuda_arch_bin "70")
set(__cuda_arch_ptx "70")
elseif(${CUDA_ARCH_NAME} STREQUAL "Turing")
set(__cuda_arch_bin "75")
set(__cuda_arch_ptx "75")
elseif(${CUDA_ARCH_NAME} STREQUAL "Ampere") elseif(${CUDA_ARCH_NAME} STREQUAL "Ampere")
set(__cuda_arch_bin "80") set(__cuda_arch_bin "80")
set(__cuda_arch_ptx "80")
elseif(${CUDA_ARCH_NAME} STREQUAL "Ada")
set(__cuda_arch_bin "89")
set(__cuda_arch_ptx "89")
elseif(${CUDA_ARCH_NAME} STREQUAL "Hopper")
set(__cuda_arch_bin "90")
set(__cuda_arch_ptx "90")
elseif(${CUDA_ARCH_NAME} STREQUAL "All") elseif(${CUDA_ARCH_NAME} STREQUAL "All")
set(__cuda_arch_bin ${dgl_known_gpu_archs}) set(__cuda_arch_bin ${dgl_known_gpu_archs})
set(__cuda_arch_ptx ${dgl_cuda_arch_ptx})
elseif(${CUDA_ARCH_NAME} STREQUAL "Auto") elseif(${CUDA_ARCH_NAME} STREQUAL "Auto")
dgl_detect_installed_gpus(__cuda_arch_bin) dgl_detect_installed_gpus(__cuda_arch_bin)
# if detect successes, __cuda_arch_ptx = __cuda_arch_bin
# if detect fails, __cuda_arch_ptx is the latest arch in __cuda_arch_bin
list(GET __cuda_arch_bin -1 __cuda_arch_ptx)
else() # (${CUDA_ARCH_NAME} STREQUAL "Manual") else() # (${CUDA_ARCH_NAME} STREQUAL "Manual")
set(__cuda_arch_bin ${CUDA_ARCH_BIN}) set(__cuda_arch_bin ${CUDA_ARCH_BIN})
set(__cuda_arch_ptx ${CUDA_ARCH_PTX})
endif() endif()
# remove dots and convert to lists # remove dots and convert to lists
string(REGEX REPLACE "\\." "" __cuda_arch_bin "${__cuda_arch_bin}") string(REGEX REPLACE "\\." "" __cuda_arch_bin "${__cuda_arch_bin}")
string(REGEX REPLACE "\\." "" __cuda_arch_ptx "${CUDA_ARCH_PTX}") string(REGEX REPLACE "\\." "" __cuda_arch_ptx "${__cuda_arch_ptx}")
string(REGEX MATCHALL "[0-9()]+" __cuda_arch_bin "${__cuda_arch_bin}") string(REGEX MATCHALL "[0-9()]+" __cuda_arch_bin "${__cuda_arch_bin}")
string(REGEX MATCHALL "[0-9]+" __cuda_arch_ptx "${__cuda_arch_ptx}") string(REGEX MATCHALL "[0-9]+" __cuda_arch_ptx "${__cuda_arch_ptx}")
mshadow_list_unique(__cuda_arch_bin __cuda_arch_ptx) mshadow_list_unique(__cuda_arch_bin __cuda_arch_ptx)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment