Update CMake for max block size and device-type check

097ed7e2 · sangwzh · 833803f3 · 097ed7e2 · 097ed7e2 · 097ed7e2
Commit 097ed7e2 authored Sep 24, 2024 by sangwzh
4 changed files
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -33,6 +33,7 @@ dgl_option(EXTERNAL_METIS_PATH "Path to external metis" OFF)
 dgl_option(EXTERNAL_METIS_LIB_PATH "Path to external metis library" OFF)
 dgl_option(EXTERNAL_GKLIB_PATH "Path to external gklib" OFF)

+
 # Options for building DGL features: "none," "dev," "dogfood," "release," and
 # "all."
 #    "none"  - The feature is OFF for all build types. This is used when
@@ -275,9 +276,6 @@ file(GLOB_RECURSE DGL_SRC_1
 )

 list(APPEND DGL_SRC ${DGL_SRC_1})
-if(NOT USE_HIP AND NOT USE_CUDA)
-  add_library(dgl SHARED ${DGL_SRC})
-endif()

 if (NOT MSVC)
  file(GLOB_RECURSE DGL_RPC_SRC src/rpc/*.cc)
@@ -296,6 +294,7 @@ if(USE_HIP)
  set(HIP_HIPCC_FLAGS "-std=c++17")
  add_library(dgl SHARED ${DGL_SRC})
  target_link_options(dgl PRIVATE "-Wl,--allow-multiple-definition")
+  target_compile_options(dgl PRIVATE "--gpu-max-threads-per-block=1024")
  # set_target_properties(dgl PROPERTIES LINKER_LANGUAGE hip)
  target_link_libraries(dgl ${DGL_LINKER_LIBS})
  target_include_directories(dgl PRIVATE "${CMAKE_SOURCE_DIR}/include/dgl")

--- a/graphbolt/CMakeLists.txt
+++ b/graphbolt/CMakeLists.txt
@@ -77,18 +77,13 @@ target_link_libraries(${LIB_GRAPHBOLT_NAME} "${TORCH_LIBRARIES}")
 if(USE_HIP)
  # set_target_properties(${LIB_GRAPHBOLT_NAME} PROPERTIES CUDA_STANDARD 17)
  message(STATUS "Use external CCCL library for a consistent API and performance for graphbolt.")
+  target_compile_options(${LIB_GRAPHBOLT_NAME} PRIVATE "--gpu-max-threads-per-block=1024")
  target_include_directories(${LIB_GRAPHBOLT_NAME} PRIVATE
-  #                           #  "/opt/dgl_dep/hipcub-install-0915/include/"
-  #                           #  "/opt/dgl_dep/rocprim-install-0915/include/"
  #                            "${ROCM_PATH}/include/thrust"
                             "${ROCM_PATH}/include/hipcub"
                             "${ROCM_PATH}/include/rocprim"
  )

-  # target_include_directories(${LIB_GRAPHBOLT_NAME} PRIVATE
-  #                            "../third_party/cccl/thrust"
-  #                            "../third_party/cccl/cub"
-  #                            "../third_party/cccl/libcudacxx/include")

  message(STATUS "Use HugeCTR gpu_cache for graphbolt with INCLUDE_DIRS $ENV{GPU_CACHE_INCLUDE_DIRS}.")
  target_include_directories(${LIB_GRAPHBOLT_NAME} PRIVATE $ENV{GPU_CACHE_INCLUDE_DIRS})

--- a/src/partition/ndarray_partition.cc
+++ b/src/partition/ndarray_partition.cc
@@ -107,7 +107,7 @@ class RangePartition : public NDArrayPartition {
        // we have only one CPU context, and can safely copy the array to that.
        range_cpu_(range.CopyTo(DGLContext{kDGLCPU, 0})) {
    auto ctx = range->ctx;
-    if (ctx.device_type != kDGLCUDA) {
+    if (ctx.device_type != kDGLCUDA && ctx.device_type != kDGLROCM) {
      LOG(FATAL) << "The range for an NDArrayPartition is only supported "
                    " on GPUs. Transfer the range to the target device before "
                    "creating the partition.";

--- a/src/runtime/cuda/cuda_device_api.cc
+++ b/src/runtime/cuda/cuda_device_api.cc
@@ -74,7 +74,6 @@ class CUDADeviceAPI final : public DeviceAPI {
        hipDeviceProp_t props;
        CUDA_CALL(hipGetDeviceProperties(&props, ctx.device_id));
        *rv = std::string(props.name);
-        printf("******* debug: device.name:%s\n ",std::string(props.name).c_str());
        return;
      }
      case kMaxClockRate: {