Commit 833803f3 authored by sangwzh

Update DGL code to HIP

parent 1d28bf8b
@@ -7,6 +7,7 @@ message(STATUS "Start configuring project ${PROJECT_NAME}")
 set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_HIP_STANDARD 17)
 # cmake utils
 include(cmake/util/Util.cmake)
@@ -64,8 +65,10 @@ dgl_feature_option(
 dgl_feature_option(
   USE_LIBXSMM
   "Build with LIBXSMM library optimization"
-  "all"
+  "none"
 )
+message(STATUS "USE_LIBXSMM: ${USE_LIBXSMM}")
 dgl_feature_option(
   USE_OPENMP
   "Build with OpenMP"
@@ -78,6 +81,8 @@ dgl_feature_option(
   "all"
 )
+message(STATUS "BUILD_GRAPHBOLT: ${BUILD_GRAPHBOLT}")
 dgl_feature_option(
   LIBCXX_ENABLE_PARALLEL_ALGORITHMS
   "Enable the parallel algorithms library. This requires the PSTL to be available."
@@ -146,6 +151,29 @@ if(USE_CUDA)
   cuda_include_directories(BEFORE "${CMAKE_SOURCE_DIR}/third_party/cccl/libcudacxx/include")
 endif(USE_CUDA)
+if(USE_HIP)
+  message(STATUS "Build with ROCM support")
+  project(dgl C CXX HIP)
+  include(cmake/modules/ROCM.cmake)
+  # target_compile_features(dgl PRIVATE cxx_std_17)
+  set(CMAKE_HIP_ARCHITECTURES gfx906;gfx928;gfx926)
+  # see https://github.com/NVIDIA/thrust/issues/1401
+  if(NOT DEFINED ENV{ROCM_PATH})
+    set(ROCM_PATH "/opt/dtk" CACHE PATH "Path to which RoCm has been installed")
+    set(HIP_PATH ${ROCM_PATH}/hip CACHE PATH "Path to which HIP has been installed")
+  else()
+    set(ROCM_PATH $ENV{ROCM_PATH} CACHE PATH "Path to which ROCm has been installed")
+    set(HIP_PATH ${ROCM_PATH}/hip CACHE PATH "Path to which HIP has been installed")
+  endif()
+  set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH})
+  find_package(hip REQUIRED)
+  set(HIP_FOUND TRUE)
+  message(STATUS "HIP_FOUND :${HIP_FOUND}")
+  # add_definitions(-DTHRUST_CUB_WRAPPED_NAMESPACE=dgl)
+  include(cmake/modules/ROCM.cmake)
+  message(STATUS "Use external CUB/Thrust library for a consistent API and performance.")
+endif(USE_HIP)
 # initial variables
 if(NOT MSVC)
   set(DGL_LINKER_LIBS "dl")
@@ -247,6 +275,9 @@ file(GLOB_RECURSE DGL_SRC_1
 )
 list(APPEND DGL_SRC ${DGL_SRC_1})
+if(NOT USE_HIP AND NOT USE_CUDA)
+  add_library(dgl SHARED ${DGL_SRC})
+endif()
 if (NOT MSVC)
   file(GLOB_RECURSE DGL_RPC_SRC src/rpc/*.cc)
@@ -255,6 +286,27 @@ else()
 endif()
 list(APPEND DGL_SRC ${DGL_RPC_SRC})
+# Configure hip
+message(STATUS ">>>>>>>> USE_HIP: ${USE_HIP}")
+message(STATUS ">>>>>>>> DGL_SRC: ${DGL_SRC}")
+message(STATUS ">>>>>>>> DGL_RPC_SRC: ${DGL_RPC_SRC}")
+if(USE_HIP)
+  dgl_config_hip(DGL_CUDA_SRC)
+  list(APPEND DGL_SRC ${DGL_CUDA_SRC})
+  set(HIP_HIPCC_FLAGS "-std=c++17")
+  add_library(dgl SHARED ${DGL_SRC})
+  target_link_options(dgl PRIVATE "-Wl,--allow-multiple-definition")
+  # set_target_properties(dgl PROPERTIES LINKER_LANGUAGE hip)
+  target_link_libraries(dgl ${DGL_LINKER_LIBS})
+  target_include_directories(dgl PRIVATE "${CMAKE_SOURCE_DIR}/include/dgl")
+  target_include_directories(dgl PRIVATE "${CMAKE_SOURCE_DIR}/include")
+  target_include_directories(dgl PRIVATE "${ROCM_PATH}/include")
+  target_include_directories(dgl PRIVATE "${ROCM_PATH}/include/hiprand")
+  target_include_directories(dgl PRIVATE "${ROCM_PATH}/include/rocrand")
+  message(STATUS ">>>>>>>> DGL_LINKER_LIBS: ${DGL_LINKER_LIBS}")
+endif(USE_HIP)
 # Configure cuda
 if(USE_CUDA)
   file(GLOB_RECURSE DGL_CUDA_SRC
@@ -274,12 +326,13 @@ if(USE_CUDA)
   list(APPEND DGL_SRC ${DGL_CUDA_SRC})
   dgl_config_cuda(DGL_LINKER_LIBS)
   cuda_add_library(dgl SHARED ${DGL_SRC})
-else(USE_CUDA)
+endif()
+if(NOT USE_CUDA AND NOT USE_HIP)
   add_library(dgl SHARED ${DGL_SRC})
-endif(USE_CUDA)
+endif()
-# include directories
-target_include_directories(dgl PRIVATE "include")
+target_include_directories(dgl PUBLIC "include")
+target_include_directories(dgl PUBLIC "${CMAKE_SOURCE_DIR}/include")
 # check for conda includes
 if("$ENV{CONDA_BUILD}" STREQUAL "1")
   set(in_conda_build TRUE)
@@ -419,6 +472,20 @@ if(USE_CUDA)
   target_include_directories(dgl PRIVATE "third_party/HugeCTR/gpu_cache/include")
   list(APPEND DGL_LINKER_LIBS gpu_cache)
   message(STATUS "Build with HugeCTR GPU embedding cache.")
+elseif(USE_HIP)
+  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DUSE_GPU_CACHE")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_GPU_CACHE")
+  # Manually build gpu_cache because CMake always builds it as shared
+  file(GLOB gpu_cache_src
+    third_party/HugeCTR/gpu_cache/src/nv_gpu_cache.hip
+  )
+  add_library(gpu_cache STATIC ${gpu_cache_src})
+  target_compile_options(gpu_cache PRIVATE "-fPIC")
+  set_target_properties(gpu_cache PROPERTIES LINKER_LANGUAGE HIP)
+  target_include_directories(gpu_cache PRIVATE "third_party/HugeCTR/gpu_cache/include")
+  target_include_directories(dgl PRIVATE "third_party/HugeCTR/gpu_cache/include")
+  list(APPEND DGL_LINKER_LIBS gpu_cache)
+  message(STATUS "Build with HugeCTR GPU embedding cache.")
 endif(USE_CUDA)
 # support PARALLEL_ALGORITHMS
@@ -461,8 +528,8 @@ if(BUILD_TORCH)
 tensoradapter_pytorch
 ${CMAKE_COMMAND} -E env
 CMAKE_COMMAND=${CMAKE_CMD}
-CUDA_TOOLKIT_ROOT_DIR=${CUDA_TOOLKIT_ROOT_DIR}
-USE_CUDA=${USE_CUDA}
+# CUDA_TOOLKIT_ROOT_DIR=${CUDA_TOOLKIT_ROOT_DIR}
+USE_HIP=${USE_HIP}
 EXTERNAL_DMLC_LIB_PATH=${EXTERNAL_DMLC_LIB_PATH}
 BINDIR=${CMAKE_CURRENT_BINARY_DIR}
 bash ${BUILD_SCRIPT} ${TORCH_PYTHON_INTERPS}
@@ -491,6 +558,8 @@ if(BUILD_CPP_TEST)
 add_executable(runUnitTests ${TEST_SRC_FILES})
 target_link_libraries(runUnitTests gtest gtest_main)
 target_link_libraries(runUnitTests dgl)
+target_link_options(runUnitTests PRIVATE -Wl,--allow-multiple-definition -fuse-ld=lld)
+target_compile_options(runUnitTests PRIVATE "-fPIC")
 add_test(UnitTests runUnitTests)
 endif(BUILD_CPP_TEST)
@@ -525,8 +594,7 @@ if(BUILD_SPARSE)
 ALL
 ${CMAKE_COMMAND} -E env
 CMAKE_COMMAND=${CMAKE_CMD}
-CUDA_TOOLKIT_ROOT_DIR=${CUDA_TOOLKIT_ROOT_DIR}
-USE_CUDA=${USE_CUDA}
+USE_HIP=${USE_HIP}
 BINDIR=${CMAKE_CURRENT_BINARY_DIR}
 INCLUDEDIR="${DGL_INCLUDE_DIRS}"
 CFLAGS=${CMAKE_C_FLAGS}
@@ -541,12 +609,12 @@ endif(BUILD_SPARSE)
 if(BUILD_GRAPHBOLT)
   message(STATUS "Configuring graphbolt library")
-  string(REPLACE ";" "\\;" CUDA_ARCHITECTURES_ESCAPED "${CUDA_ARCHITECTURES}")
+  # string(REPLACE ";" "\\;" CUDA_ARCHITECTURES_ESCAPED "${CUDA_ARCHITECTURES}")
   file(TO_NATIVE_PATH ${CMAKE_CURRENT_BINARY_DIR} BINDIR)
   file(TO_NATIVE_PATH ${CMAKE_COMMAND} CMAKE_CMD)
-  if(USE_CUDA)
+  if(USE_HIP)
     get_target_property(GPU_CACHE_INCLUDE_DIRS gpu_cache INCLUDE_DIRECTORIES)
-  endif(USE_CUDA)
+  endif(USE_HIP)
   string(REPLACE ";" "\\;" GPU_CACHE_INCLUDE_DIRS_ESCAPED "${GPU_CACHE_INCLUDE_DIRS}")
   if(MSVC)
     file(TO_NATIVE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/graphbolt/build.bat BUILD_SCRIPT)
@@ -573,8 +641,7 @@ if(BUILD_GRAPHBOLT)
 ALL
 ${CMAKE_COMMAND} -E env
 CMAKE_COMMAND=${CMAKE_CMD}
-CUDA_TOOLKIT_ROOT_DIR=${CUDA_TOOLKIT_ROOT_DIR}
-USE_CUDA=${USE_CUDA}
+USE_HIP=${USE_HIP}
 BINDIR=${CMAKE_CURRENT_BINARY_DIR}
 GPU_CACHE_INCLUDE_DIRS="${GPU_CACHE_INCLUDE_DIRS_ESCAPED}"
 CFLAGS=${CMAKE_C_FLAGS}
...
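For orientation only: a minimal sketch of how the USE_HIP switch introduced above might be driven from a fresh build directory. The ROCm/DTK prefix, the hipcc compiler choice, and the exact option set are assumptions inferred from the defaults and build scripts in this commit, not a documented interface.

```bash
# Hypothetical configure/build for the HIP code path (paths are assumptions).
export ROCM_PATH=/opt/dtk          # CMakeLists.txt falls back to /opt/dtk when unset
mkdir -p build && cd build
CC=hipcc CXX=hipcc cmake \
  -DUSE_HIP=ON \
  -DUSE_CUDA=OFF \
  -DBUILD_TORCH=ON \
  ..
make -j VERBOSE=1
```

The commit guards add_library(dgl ...) so that exactly one of the CUDA, HIP, or CPU-only branches creates the target, which is why USE_CUDA and USE_HIP are not expected to be enabled together here.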
################################################################################################
# Config hip compilation.
# Usage:
# dgl_config_hip(<dgl_cuda_src>)
macro(dgl_config_hip out_variable)
if(NOT HIP_FOUND)
message(FATAL_ERROR "Cannot find HIP.")
endif()
# always set the includedir when cuda is available
# avoid global retrigger of cmake
include_directories(${CUDA_INCLUDE_DIRS})
add_definitions(-DDGL_USE_CUDA)
add_definitions(-D__HIP_PLATFORM_AMD__)
add_definitions(-DCUDART_VERSION_LT_11000=true)
add_definitions(-DDTKRT_VERSION=11080)
add_definitions(-D__DTK_ARCH__=11080)
include_directories(BEFORE SYSTEM "${CMAKE_SOURCE_DIR}/include/")
message(STATUS ">>>>>>>>>>>> CUDA_INCLUDE_DIRS : ${CUDA_INCLUDE_DIRS}")
set_source_files_properties(src/random/random.cc PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1)
set_source_files_properties(src/array/cuda/csr_transpose.cc PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1)
set_source_files_properties(src/runtime/cuda/cuda_device_api.cc PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1)
file(GLOB_RECURSE DGL_HIP_SRC
src/array/cuda/*.cc
src/array/cuda/*.hip
src/array/cuda/uvm/*.cc
src/array/cuda/uvm/*.hip
src/kernel/cuda/*.cc
src/kernel/cuda/*.hip
src/partition/cuda/*.hip
src/runtime/cuda/*.cc
src/runtime/cuda/*.hip
src/geometry/cuda/*.hip
src/graph/transform/cuda/*.hip
src/graph/sampling/randomwalks/*.hip
)
find_library(DCU_RUNTIME galaxyhip ${ROCM_PATH}/lib)
find_library(DCU_SPARSE hipsparse ${ROCM_PATH}/lib)
find_library(DCU_BLAS hipblas ${ROCM_PATH}/lib)
find_library(DCU_RAND hiprand ${ROCM_PATH}/lib)
message(STATUS "Found DCU_RUNTIME: ${DCU_RUNTIME}")
message(STATUS "Found DCU_SPARSE: ${DCU_SPARSE}")
message(STATUS "Found DCU_BLAS: ${DCU_BLAS}")
message(STATUS "Found DCU_RAND: ${DCU_RAND}")
list(APPEND DGL_LINKER_LIBS
${DCU_RUNTIME}
${DCU_SPARSE}
${DCU_BLAS}
${DCU_RAND}
)
set(${out_variable} ${DGL_HIP_SRC})
endmacro()
@@ -22,7 +22,7 @@ CMAKE_FLAGS="$CMAKE_FLAGS -DDGL_INCLUDE_DIRS=${INCLUDEDIR// /;} -DDGL_BUILD_DIR=
 echo $CMAKE_FLAGS
 if [ $# -eq 0 ]; then
-  $CMAKE_COMMAND $CMAKE_FLAGS ..
+  CC=hipcc CXX=hipcc $CMAKE_COMMAND $CMAKE_FLAGS ..
   make -j VERBOSE=1
   cp -v $CPSOURCE $BINDIR/dgl_sparse
 else
@@ -30,7 +30,7 @@ else
   TORCH_VER=$($PYTHON_INTERP -c 'import torch; print(torch.__version__.split("+")[0])')
   mkdir -p $TORCH_VER
   cd $TORCH_VER
-  $CMAKE_COMMAND $CMAKE_FLAGS -DPYTHON_INTERP=$PYTHON_INTERP ../..
+  CC=hipcc CXX=hipcc $CMAKE_COMMAND $CMAKE_FLAGS -DPYTHON_INTERP=$PYTHON_INTERP ../..
   make -j VERBOSE=1
   cp -v $CPSOURCE $BINDIR/dgl_sparse
   cd ..
...
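Note on the build-script change above: prefixing the cmake invocation with CC=hipcc CXX=hipcc relies on CMake reading the CC/CXX environment variables, which only happens on the first configure of a build directory. A hedged, more explicit alternative (not part of this commit) would pass the compilers as cache variables:

```bash
# Equivalent explicit compiler selection for a fresh build directory (illustrative only).
$CMAKE_COMMAND $CMAKE_FLAGS \
  -DCMAKE_C_COMPILER=hipcc \
  -DCMAKE_CXX_COMPILER=hipcc \
  ..
make -j VERBOSE=1
```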
@@ -8,6 +8,12 @@ if(USE_CUDA)
   add_definitions(-DGRAPHBOLT_USE_CUDA)
 endif()
+if(USE_HIP)
+  message(STATUS "Build graphbolt with CUDA support")
+  enable_language(HIP)
+  add_definitions(-DGRAPHBOLT_USE_CUDA)
+endif()
 # For windows, define NOMINMAX to avoid conflict with std::min/max
 if(MSVC)
   add_definitions(-DNOMINMAX)
@@ -44,14 +50,15 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS}")
 set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0 -g3 -ggdb")
 set(LIB_GRAPHBOLT_NAME "graphbolt_pytorch_${TORCH_VER}")
+# set(LIB_GRAPHBOLT_NAME "graphbolt")
 set(BOLT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/src")
 set(BOLT_INCLUDE "${CMAKE_CURRENT_SOURCE_DIR}/include")
 file(GLOB BOLT_HEADERS ${BOLT_INCLUDE})
 file(GLOB BOLT_SRC ${BOLT_DIR}/*.cc)
-if(USE_CUDA)
+if(USE_HIP)
   file(GLOB BOLT_CUDA_SRC
-    ${BOLT_DIR}/cuda/*.cu
+    ${BOLT_DIR}/cuda/*.hip
     ${BOLT_DIR}/cuda/*.cc
   )
   list(APPEND BOLT_SRC ${BOLT_CUDA_SRC})
@@ -67,20 +74,28 @@ target_include_directories(${LIB_GRAPHBOLT_NAME} PRIVATE ${BOLT_DIR}
   "../third_party/pcg/include")
 target_link_libraries(${LIB_GRAPHBOLT_NAME} "${TORCH_LIBRARIES}")
-if(USE_CUDA)
+if(USE_HIP)
-  set_target_properties(${LIB_GRAPHBOLT_NAME} PROPERTIES CUDA_STANDARD 17)
+  # set_target_properties(${LIB_GRAPHBOLT_NAME} PROPERTIES CUDA_STANDARD 17)
   message(STATUS "Use external CCCL library for a consistent API and performance for graphbolt.")
   target_include_directories(${LIB_GRAPHBOLT_NAME} PRIVATE
-    "../third_party/cccl/thrust"
-    "../third_party/cccl/cub"
-    "../third_party/cccl/libcudacxx/include")
+    # # "/opt/dgl_dep/hipcub-install-0915/include/"
+    # # "/opt/dgl_dep/rocprim-install-0915/include/"
+    # "${ROCM_PATH}/include/thrust"
+    "${ROCM_PATH}/include/hipcub"
+    "${ROCM_PATH}/include/rocprim"
+  )
+  # target_include_directories(${LIB_GRAPHBOLT_NAME} PRIVATE
+  #   "../third_party/cccl/thrust"
+  #   "../third_party/cccl/cub"
+  #   "../third_party/cccl/libcudacxx/include")
   message(STATUS "Use HugeCTR gpu_cache for graphbolt with INCLUDE_DIRS $ENV{GPU_CACHE_INCLUDE_DIRS}.")
   target_include_directories(${LIB_GRAPHBOLT_NAME} PRIVATE $ENV{GPU_CACHE_INCLUDE_DIRS})
   target_link_directories(${LIB_GRAPHBOLT_NAME} PRIVATE ${GPU_CACHE_BUILD_DIR})
   target_link_libraries(${LIB_GRAPHBOLT_NAME} gpu_cache)
-  get_property(archs TARGET ${LIB_GRAPHBOLT_NAME} PROPERTY CUDA_ARCHITECTURES)
+  # get_property(archs TARGET ${LIB_GRAPHBOLT_NAME} PROPERTY CUDA_ARCHITECTURES)
   message(STATUS "CUDA_ARCHITECTURES for graphbolt: ${archs}")
 endif()
...
@@ -12,20 +12,27 @@ else
 CPSOURCE=*.so
 fi
-CMAKE_FLAGS="-DCUDA_TOOLKIT_ROOT_DIR=$CUDA_TOOLKIT_ROOT_DIR -DUSE_CUDA=$USE_CUDA -DGPU_CACHE_BUILD_DIR=$BINDIR"
+CMAKE_FLAGS=" -DUSE_HIP=$USE_HIP -DGPU_CACHE_BUILD_DIR=$BINDIR"
 echo $CMAKE_FLAGS
+# add new hipcub
+# export C_INCLUDE_PATH=/opt/dgl_dep/hipcub-install-0915/include/:$C_INCLUDE_PATH
+# export CPLUS_INCLUDE_PATH=/opt/dgl_dep/hipcub-install-0915/include/:$C_INCLUDE_PATH
+# export C_INCLUDE_PATH=/opt/dgl_dep/rocprim-install-0915/include/:$C_INCLUDE_PATH
+# export CPLUS_INCLUDE_PATH=/opt/dgl_dep/rocprim-install-0915/include/:$C_INCLUDE_PATH
 if [ $# -eq 0 ]; then
-  $CMAKE_COMMAND $CMAKE_FLAGS ..
-  make -j
+  CC=hipcc CXX=hipcc $CMAKE_COMMAND $CMAKE_FLAGS ..
+  make -j VERBOSE=1
   cp -v $CPSOURCE $BINDIR/graphbolt
 else
   for PYTHON_INTERP in $@; do
     TORCH_VER=$($PYTHON_INTERP -c 'import torch; print(torch.__version__.split("+")[0])')
     mkdir -p $TORCH_VER
     cd $TORCH_VER
-    $CMAKE_COMMAND $CMAKE_FLAGS -DPYTHON_INTERP=$PYTHON_INTERP ../..
-    make -j
+    CC=hipcc CXX=hipcc $CMAKE_COMMAND $CMAKE_FLAGS -DPYTHON_INTERP=$PYTHON_INTERP ../..
+    make -j VERBOSE=1
     cp -v $CPSOURCE $BINDIR/graphbolt
     cd ..
   done
...
@@ -5,13 +5,14 @@
  * @file cuda/expand_indptr.cu
  * @brief ExpandIndptr operator implementation on CUDA.
  */
+#include <hip/hip_runtime.h>
 #include <thrust/iterator/constant_iterator.h>
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/iterator/transform_iterator.h>
 #include <hipcub/hipcub.hpp>
 #include <limits>
+#include <hipcub/backend/rocprim/device/device_copy.hpp>
 #include "common.h"
 namespace graphbolt {
...
@@ -15,6 +15,7 @@
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/iterator/transform_iterator.h>
 #include <thrust/iterator/transform_output_iterator.h>
+#include <hipcub/backend/rocprim/device/device_copy.hpp>
 #include <algorithm>
 #include <array>
@@ -27,6 +28,29 @@
 #include "common.h"
 #include "utils.h"
+namespace rocprim {
+namespace detail {
+template <>
+struct float_bit_mask<__hip_bfloat16> {
+  static constexpr uint16_t sign_bit = 0x8000;
+  static constexpr uint16_t exponent = 0x7F80;
+  static constexpr uint16_t mantissa = 0x007F;
+  using bit_type = uint16_t;
+};
+template <>
+struct radix_key_codec_base<__hip_bfloat16>
+    : radix_key_codec_floating<__hip_bfloat16, unsigned short> {};
+}  // namespace detail
+}  // namespace rocprim
+__host__ __device__ bool operator>(
+    const __hip_bfloat16& a, const __hip_bfloat16& b) {
+  return float(a) > float(b);
+}
 namespace graphbolt {
 namespace ops {
@@ -344,7 +368,7 @@ c10::intrusive_ptr<sampling::FusedSampledSubgraph> SampleNeighbors(
   CUB_CALL(
       DeviceSegmentedSort::SortKeys, edge_id_segments.get(),
       sorted_edge_id_segments.get(), picked_eids.size(0),
-      num_rows, sub_indptr.data_ptr<indptr_t>(),
+      num_rows, sampled_segment_end_it,
       sampled_segment_end_it);
 }
...
@@ -5,6 +5,7 @@
  * @file cuda/unique_and_compact_impl.cu
  * @brief Unique and compact operator implementation on CUDA.
  */
+#include <hip/hip_runtime.h>
 #include <graphbolt/cuda_ops.h>
 #include <thrust/binary_search.h>
 #include <thrust/functional.h>
...
+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright (c) 2020 by Contributors
  * @file dgl/array.h
@@ -8,10 +9,10 @@
  */
 #ifndef DGL_ARRAY_H_
 #define DGL_ARRAY_H_
-#include "./aten/array_ops.h"
-#include "./aten/coo.h"
-#include "./aten/csr.h"
-#include "./aten/macro.h"
+#include "aten/array_ops.h"
+#include "aten/coo.h"
+#include "aten/csr.h"
+#include "aten/macro.h"
 #include "./aten/spmat.h"
 #include "./aten/types.h"
 #endif  // DGL_ARRAY_H_
+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright (c) 2020 by Contributors
  * @file dgl/array_iterator.h
@@ -6,11 +7,11 @@
 #ifndef DGL_ARRAY_ITERATOR_H_
 #define DGL_ARRAY_ITERATOR_H_
-#ifdef __CUDA_ARCH__
+#ifdef __HIPCC__
 #define CUB_INLINE __host__ __device__ __forceinline__
 #else
 #define CUB_INLINE inline
-#endif  // __CUDA_ARCH__
+#endif  // __HIP_DEVICE_COMPILE__
 #include <algorithm>
 #include <iterator>
...
+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright (c) 2020 by Contributors
  * @file dgl/aten/array_ops.h
@@ -15,7 +16,7 @@
 #include <utility>
 #include <vector>
-#include "./types.h"
+#include "types.h"
 namespace dgl {
 namespace aten {
...
+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright (c) 2020-2022 by Contributors
@@ -15,10 +16,10 @@
 #include <utility>
 #include <vector>
-#include "./array_ops.h"
-#include "./macro.h"
-#include "./spmat.h"
-#include "./types.h"
+#include "array_ops.h"
+#include "macro.h"
+#include "spmat.h"
+#include "types.h"
 namespace dgl {
 namespace aten {
...
+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright (c) 2020-2022 by Contributors
  * @file dgl/aten/csr.h
@@ -14,10 +15,10 @@
 #include <utility>
 #include <vector>
-#include "./array_ops.h"
-#include "./macro.h"
-#include "./spmat.h"
-#include "./types.h"
+#include "array_ops.h"
+#include "macro.h"
+#include "spmat.h"
+#include "types.h"
 namespace dgl {
 namespace aten {
...
+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright (c) 2020 by Contributors
  * @file dgl/aten/macro.h
@@ -47,7 +48,7 @@
 if ((val) == kDGLCPU) { \
 constexpr auto XPU = kDGLCPU; \
 { __VA_ARGS__ } \
-} else if ((val) == kDGLCUDA) { \
+} else if ((val) == kDGLCUDA or (val) == kDGLROCM) { \
 constexpr auto XPU = kDGLCUDA; \
 { __VA_ARGS__ } \
 } else { \
@@ -145,12 +146,12 @@
 typedef double FloatType; \
 { __VA_ARGS__ } \
 } else if ( \
-XPU == kDGLCUDA && (val).bits == 16 && (val).code == kDGLFloat) { \
+(XPU == kDGLCUDA || XPU == kDGLROCM) && (val).bits == 16 && (val).code == kDGLFloat) { \
 typedef __half FloatType; \
 { __VA_ARGS__ } \
 } else if ( \
-XPU == kDGLCUDA && (val).bits == 16 && (val).code == kDGLBfloat) { \
+(XPU == kDGLCUDA || XPU == kDGLROCM) && (val).bits == 16 && (val).code == kDGLBfloat) { \
-typedef __nv_bfloat16 FloatType; \
+typedef __hip_bfloat16 FloatType; \
 { __VA_ARGS__ } \
 } else if ( \
 XPU == kDGLCPU && (val).bits == 16 && (val).code == kDGLFloat) { \
@@ -176,11 +177,11 @@
 typedef double FloatType; \
 { __VA_ARGS__ } \
 } else if ( \
-XPU == kDGLCUDA && (val).bits == 16 && (val).code == kDGLFloat) { \
+(XPU == kDGLCUDA || XPU == kDGLROCM) && (val).bits == 16 && (val).code == kDGLFloat) { \
 typedef __half FloatType; \
 { __VA_ARGS__ } \
 } else if ( \
-XPU == kDGLCUDA && (val).bits == 16 && (val).code == kDGLBfloat) { \
+(XPU == kDGLCUDA || XPU == kDGLROCM) && (val).bits == 16 && (val).code == kDGLBfloat) { \
 LOG(FATAL) << "bfloat16 requires CUDA >= 11.0"; \
 } else if ( \
 XPU == kDGLCPU && (val).bits == 16 && (val).code == kDGLFloat) { \
...
+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright (c) 2020 by Contributors
  * @file dgl/aten/spmat.h
@@ -10,7 +11,7 @@
 #include <vector>
 #include "../runtime/object.h"
-#include "./types.h"
+#include "types.h"
 namespace dgl {
...
+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright (c) 2019 by Contributors
  * @file dgl/heterograph_interface.h
@@ -13,7 +14,7 @@
 #include <utility>
 #include <vector>
-#include "./runtime/object.h"
+#include "runtime/object.h"
 #include "array.h"
 #include "aten/spmat.h"
 #include "aten/types.h"
...
+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright (c) 2020 by Contributors
  * @file dgl/aten/bcast.h
@@ -9,7 +10,7 @@
 #include <string>
 #include <vector>
-#include "./runtime/ndarray.h"
+#include "runtime/ndarray.h"
 using namespace dgl::runtime;
 namespace dgl {
...
+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright (c) 2018 by Contributors
  * @file dgl/graph_interface.h
@@ -12,7 +13,7 @@
 #include <utility>
 #include <vector>
-#include "./runtime/object.h"
+#include "runtime/object.h"
 #include "array.h"
 namespace dgl {
...
+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright (c) 2020 by Contributors
  * @file dgl/aten/kernel.h
@@ -10,8 +11,8 @@
 #include <utility>
 #include <vector>
-#include "./base_heterograph.h"
-#include "./bcast.h"
+#include "base_heterograph.h"
+#include "bcast.h"
 #include "array.h"
 namespace dgl {
...
+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright (c) 2019 by Contributors
  * @file dgl/nodeflow.h
@@ -10,7 +11,7 @@
 #include <string>
 #include <vector>
-#include "./runtime/object.h"
+#include "runtime/object.h"
 #include "graph_interface.h"
 namespace dgl {
...