Commit 74d88bf8 authored by sangwz

Merge branch 'dtk25.04' of http://developer.sourcefind.cn/codes/OpenDAS/dgl into 2.2.1

parents 2a1ac588 314cedc1
@@ -7,6 +7,7 @@ message(STATUS "Start configuring project ${PROJECT_NAME}")
 set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_HIP_STANDARD 17)
 # cmake utils
 include(cmake/util/Util.cmake)
@@ -33,6 +34,7 @@ dgl_option(EXTERNAL_METIS_PATH "Path to external metis" OFF)
 dgl_option(EXTERNAL_METIS_LIB_PATH "Path to external metis library" OFF)
 dgl_option(EXTERNAL_GKLIB_PATH "Path to external gklib" OFF)
 # Options for building DGL features: "none," "dev," "dogfood," "release," and
 # "all."
 # "none" - The feature is OFF for all build types. This is used when
@@ -65,8 +67,10 @@ dgl_feature_option(
 dgl_feature_option(
   USE_LIBXSMM
   "Build with LIBXSMM library optimization"
-  "all"
+  "none"
 )
+message(STATUS "USE_LIBXSMM: ${USE_LIBXSMM}")
 dgl_feature_option(
   USE_OPENMP
   "Build with OpenMP"
@@ -79,6 +83,8 @@ dgl_feature_option(
   "all"
 )
+message(STATUS "BUILD_GRAPHBOLT: ${BUILD_GRAPHBOLT}")
 dgl_feature_option(
   LIBCXX_ENABLE_PARALLEL_ALGORITHMS
   "Enable the parallel algorithms library. This requires the PSTL to be available."
@@ -147,6 +153,29 @@ if(USE_CUDA)
   cuda_include_directories(BEFORE "${CMAKE_SOURCE_DIR}/third_party/cccl/libcudacxx/include")
 endif(USE_CUDA)
+if(USE_HIP)
+  message(STATUS "Build with ROCM support")
+  project(dgl C CXX HIP)
+  include(cmake/modules/ROCM.cmake)
+  # target_compile_features(dgl PRIVATE cxx_std_17)
+  set(CMAKE_HIP_ARCHITECTURES gfx906;gfx928;gfx926;gfx936)
+  # see https://github.com/NVIDIA/thrust/issues/1401
+  if(NOT DEFINED ENV{ROCM_PATH})
+    set(ROCM_PATH "/opt/dtk" CACHE PATH "Path to which ROCm has been installed")
+    set(HIP_PATH ${ROCM_PATH}/hip CACHE PATH "Path to which HIP has been installed")
+  else()
+    set(ROCM_PATH $ENV{ROCM_PATH} CACHE PATH "Path to which ROCm has been installed")
+    set(HIP_PATH ${ROCM_PATH}/hip CACHE PATH "Path to which HIP has been installed")
+  endif()
+  set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH})
+  find_package(hip REQUIRED)
+  set(HIP_FOUND TRUE)
+  message(STATUS "HIP_FOUND: ${HIP_FOUND}")
+  # add_definitions(-DTHRUST_CUB_WRAPPED_NAMESPACE=dgl)
+  include(cmake/modules/ROCM.cmake)
+  message(STATUS "Use external CUB/Thrust library for a consistent API and performance.")
+endif(USE_HIP)
 # initial variables
 if(NOT MSVC)
   set(DGL_LINKER_LIBS "dl")
@@ -256,6 +285,28 @@ else()
 endif()
 list(APPEND DGL_SRC ${DGL_RPC_SRC})
+# Configure hip
+message(STATUS ">>>>>>>> USE_HIP: ${USE_HIP}")
+message(STATUS ">>>>>>>> DGL_SRC: ${DGL_SRC}")
+message(STATUS ">>>>>>>> DGL_RPC_SRC: ${DGL_RPC_SRC}")
+if(USE_HIP)
+  dgl_config_hip(DGL_CUDA_SRC)
+  list(APPEND DGL_SRC ${DGL_CUDA_SRC})
+  set(HIP_HIPCC_FLAGS "-std=c++17")
+  add_library(dgl SHARED ${DGL_SRC})
+  target_link_options(dgl PRIVATE "-Wl,--allow-multiple-definition")
+  target_compile_options(dgl PUBLIC "--gpu-max-threads-per-block=1024")
+  # set_target_properties(dgl PROPERTIES LINKER_LANGUAGE hip)
+  target_link_libraries(dgl ${DGL_LINKER_LIBS})
+  target_include_directories(dgl PRIVATE "${CMAKE_SOURCE_DIR}/include/dgl")
+  target_include_directories(dgl PRIVATE "${CMAKE_SOURCE_DIR}/include")
+  target_include_directories(dgl PRIVATE "${ROCM_PATH}/include")
+  target_include_directories(dgl PRIVATE "${ROCM_PATH}/include/hiprand")
+  target_include_directories(dgl PRIVATE "${ROCM_PATH}/include/rocrand")
+  message(STATUS ">>>>>>>> DGL_LINKER_LIBS: ${DGL_LINKER_LIBS}")
+endif(USE_HIP)
 # Configure cuda
 if(USE_CUDA)
   file(GLOB_RECURSE DGL_CUDA_SRC
@@ -275,12 +326,13 @@ if(USE_CUDA)
   list(APPEND DGL_SRC ${DGL_CUDA_SRC})
   dgl_config_cuda(DGL_LINKER_LIBS)
   cuda_add_library(dgl SHARED ${DGL_SRC})
-else(USE_CUDA)
+endif()
+if(NOT USE_CUDA AND NOT USE_HIP)
   add_library(dgl SHARED ${DGL_SRC})
-endif(USE_CUDA)
+endif()
-# include directories
-target_include_directories(dgl PRIVATE "include")
+target_include_directories(dgl PUBLIC "include")
+target_include_directories(dgl PUBLIC "${CMAKE_SOURCE_DIR}/include")
 # check for conda includes
 if("$ENV{CONDA_BUILD}" STREQUAL "1")
   set(in_conda_build TRUE)
@@ -423,6 +475,20 @@ if(USE_CUDA)
   target_include_directories(dgl PRIVATE "third_party/HugeCTR/gpu_cache/include")
   list(APPEND DGL_LINKER_LIBS gpu_cache)
   message(STATUS "Build with HugeCTR GPU embedding cache.")
+elseif(USE_HIP)
+  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DUSE_GPU_CACHE")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_GPU_CACHE")
+  # Manually build gpu_cache because CMake always builds it as shared
+  file(GLOB gpu_cache_src
+    third_party/HugeCTR/gpu_cache/src/nv_gpu_cache.hip
+  )
+  add_library(gpu_cache STATIC ${gpu_cache_src})
+  target_compile_options(gpu_cache PRIVATE "-fPIC")
+  set_target_properties(gpu_cache PROPERTIES LINKER_LANGUAGE HIP)
+  target_include_directories(gpu_cache PRIVATE "third_party/HugeCTR/gpu_cache/include")
+  target_include_directories(dgl PRIVATE "third_party/HugeCTR/gpu_cache/include")
+  list(APPEND DGL_LINKER_LIBS gpu_cache)
+  message(STATUS "Build with HugeCTR GPU embedding cache.")
 endif(USE_CUDA)
 # support PARALLEL_ALGORITHMS
@@ -465,8 +531,8 @@ if(BUILD_TORCH)
   tensoradapter_pytorch
   ${CMAKE_COMMAND} -E env
   CMAKE_COMMAND=${CMAKE_CMD}
-  CUDA_TOOLKIT_ROOT_DIR=${CUDA_TOOLKIT_ROOT_DIR}
+  # CUDA_TOOLKIT_ROOT_DIR=${CUDA_TOOLKIT_ROOT_DIR}
-  USE_CUDA=${USE_CUDA}
+  USE_HIP=${USE_HIP}
   EXTERNAL_DMLC_LIB_PATH=${EXTERNAL_DMLC_LIB_PATH}
   BINDIR=${CMAKE_CURRENT_BINARY_DIR}
   bash ${BUILD_SCRIPT} ${TORCH_PYTHON_INTERPS}
@@ -495,6 +561,8 @@ if(BUILD_CPP_TEST)
   add_executable(runUnitTests ${TEST_SRC_FILES})
   target_link_libraries(runUnitTests gtest gtest_main)
   target_link_libraries(runUnitTests dgl)
+  target_link_options(runUnitTests PRIVATE -Wl,--allow-multiple-definition -fuse-ld=lld)
+  target_compile_options(runUnitTests PRIVATE "-fPIC")
   add_test(UnitTests runUnitTests)
 endif(BUILD_CPP_TEST)
@@ -529,8 +597,7 @@ if(BUILD_SPARSE)
   ALL
   ${CMAKE_COMMAND} -E env
   CMAKE_COMMAND=${CMAKE_CMD}
-  CUDA_TOOLKIT_ROOT_DIR=${CUDA_TOOLKIT_ROOT_DIR}
-  USE_CUDA=${USE_CUDA}
+  USE_HIP=${USE_HIP}
   BINDIR=${CMAKE_CURRENT_BINARY_DIR}
   INCLUDEDIR="${DGL_INCLUDE_DIRS}"
   CFLAGS=${CMAKE_C_FLAGS}
@@ -545,12 +612,12 @@ endif(BUILD_SPARSE)
 if(BUILD_GRAPHBOLT)
   message(STATUS "Configuring graphbolt library")
-  string(REPLACE ";" "\\;" CUDA_ARCHITECTURES_ESCAPED "${CUDA_ARCHITECTURES}")
+  # string(REPLACE ";" "\\;" CUDA_ARCHITECTURES_ESCAPED "${CUDA_ARCHITECTURES}")
   file(TO_NATIVE_PATH ${CMAKE_CURRENT_BINARY_DIR} BINDIR)
   file(TO_NATIVE_PATH ${CMAKE_COMMAND} CMAKE_CMD)
-  if(USE_CUDA)
+  if(USE_HIP)
     get_target_property(GPU_CACHE_INCLUDE_DIRS gpu_cache INCLUDE_DIRECTORIES)
-  endif(USE_CUDA)
+  endif(USE_HIP)
   string(REPLACE ";" "\\;" GPU_CACHE_INCLUDE_DIRS_ESCAPED "${GPU_CACHE_INCLUDE_DIRS}")
   if(MSVC)
     file(TO_NATIVE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/graphbolt/build.bat BUILD_SCRIPT)
@@ -577,8 +644,7 @@ if(BUILD_GRAPHBOLT)
   ALL
   ${CMAKE_COMMAND} -E env
   CMAKE_COMMAND=${CMAKE_CMD}
-  CUDA_TOOLKIT_ROOT_DIR=${CUDA_TOOLKIT_ROOT_DIR}
-  USE_CUDA=${USE_CUDA}
+  USE_HIP=${USE_HIP}
   BINDIR=${CMAKE_CURRENT_BINARY_DIR}
   GPU_CACHE_INCLUDE_DIRS="${GPU_CACHE_INCLUDE_DIRS_ESCAPED}"
   CFLAGS=${CMAKE_C_FLAGS}
...
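The hunks above gate every ROCm/DTK-specific step behind USE_HIP. A minimal sketch of how such a build might be configured is shown below; the idea of an initial-cache file and the exact values are assumptions of this sketch, not something the commit prescribes (only the USE_HIP and USE_CUDA option names come from the CMakeLists.txt hunks above).

# hip-build.cmake -- hypothetical initial-cache file, used as: cmake -C hip-build.cmake ..
# Select the HIP/ROCm code path and keep the CUDA path off.
set(USE_HIP ON CACHE BOOL "Build DGL against ROCm/DTK via HIP")
set(USE_CUDA OFF CACHE BOOL "Disable the CUDA path when HIP is selected")
# The root CMakeLists.txt already hard-codes CMAKE_HIP_ARCHITECTURES and falls back to
# ROCM_PATH=/opt/dtk when the ROCM_PATH environment variable is not set, so nothing
# else is strictly required here.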
[Two additional file diffs in this commit are collapsed and not shown.]
################################################################################################
# Config hip compilation.
# Usage:
#   dgl_config_hip(<dgl_cuda_src>)
macro(dgl_config_hip out_variable)
  if(NOT HIP_FOUND)
    message(FATAL_ERROR "Cannot find HIP.")
  endif()
  # always set the includedir when cuda is available
  # avoid global retrigger of cmake
  include_directories(${CUDA_INCLUDE_DIRS})
  add_definitions(-DDGL_USE_CUDA)
  add_definitions(-D__HIP_PLATFORM_AMD__)
  add_definitions(-DCUDART_VERSION_LT_11000=true)
  add_definitions(-DDTKRT_VERSION=11080)
  add_definitions(-D__DTK_ARCH__=11080)
  include_directories(BEFORE SYSTEM "${CMAKE_SOURCE_DIR}/include/")
  message(STATUS ">>>>>>>>>>>> CUDA_INCLUDE_DIRS: ${CUDA_INCLUDE_DIRS}")
  set_source_files_properties(src/random/random.cc PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1)
  set_source_files_properties(src/array/cuda/csr_transpose.cc PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1)
  set_source_files_properties(src/runtime/cuda/cuda_device_api.cc PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1)
  file(GLOB_RECURSE DGL_HIP_SRC
    src/array/cuda/*.cc
    src/array/cuda/*.hip
    src/array/cuda/uvm/*.cc
    src/array/cuda/uvm/*.hip
    src/kernel/cuda/*.cc
    src/kernel/cuda/*.hip
    src/partition/cuda/*.hip
    src/runtime/cuda/*.cc
    src/runtime/cuda/*.hip
    src/geometry/cuda/*.hip
    src/graph/transform/cuda/*.hip
    src/graph/sampling/randomwalks/*.hip
  )
  find_library(DCU_RUNTIME galaxyhip ${ROCM_PATH}/lib)
  find_library(DCU_SPARSE hipsparse ${ROCM_PATH}/lib)
  find_library(DCU_BLAS hipblas ${ROCM_PATH}/lib)
  find_library(DCU_RAND hiprand ${ROCM_PATH}/lib)
  message(STATUS "Found DCU_RUNTIME: ${DCU_RUNTIME}")
  message(STATUS "Found DCU_SPARSE: ${DCU_SPARSE}")
  message(STATUS "Found DCU_BLAS: ${DCU_BLAS}")
  message(STATUS "Found DCU_RAND: ${DCU_RAND}")
  list(APPEND DGL_LINKER_LIBS
    ${DCU_RUNTIME}
    ${DCU_SPARSE}
    ${DCU_BLAS}
    ${DCU_RAND}
  )
  set(${out_variable} ${DGL_HIP_SRC})
endmacro()
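For reference, the call pattern this macro expects is the one already used in the root CMakeLists.txt above; the sketch below only restates that usage in isolation, with all names taken from the diff.

if(USE_HIP)
  # dgl_config_hip() globs the hipified cuda/ sources into the named variable and
  # appends the ROCm runtime, hipSPARSE, hipBLAS and hipRAND libraries to DGL_LINKER_LIBS.
  dgl_config_hip(DGL_CUDA_SRC)
  list(APPEND DGL_SRC ${DGL_CUDA_SRC})
  add_library(dgl SHARED ${DGL_SRC})
  target_link_libraries(dgl ${DGL_LINKER_LIBS})
endif()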
@@ -21,10 +21,12 @@ list(GET TORCH_VERSION_LIST 0 TORCH_VERSION_MAJOR)
 list(GET TORCH_VERSION_LIST 1 TORCH_VERSION_MINOR)
 set(SPARSE_LINKER_LIBS "")
+list(APPEND CMAKE_PREFIX_PATH $ENV{ROCM_PATH})
-if(USE_CUDA)
+set(HIP_PATH $ENV{ROCM_PATH}/hip)
+if(USE_CUDA OR USE_HIP)
+  project(dgl_sparse C CXX HIP)
   add_definitions(-DDGL_USE_CUDA)
-  enable_language(CUDA)
+  find_package(HIP REQUIRED PATHS ${HIP_PATH} NO_DEFAULT_PATH)
 endif()
 # For windows, define NOMINMAX to avoid conflict with std::min/max
@@ -49,7 +51,7 @@ file(GLOB SPARSE_SRC
   ${SPARSE_DIR}/*.cc
   ${SPARSE_DIR}/cpu/*.cc
 )
-if(USE_CUDA)
+if(USE_HIP)
   file(GLOB SPARSE_CUDA_SRC
     ${SPARSE_DIR}/cuda/*.cu
   )
...
@@ -4,6 +4,9 @@ set -e
 mkdir -p build
 mkdir -p $BINDIR/dgl_sparse
+echo ">>>>> SPARSE DIR: $BINDIR"
 cd build
 if [ $(uname) = 'Darwin' ]; then
@@ -12,22 +15,23 @@ else
   CPSOURCE=*.so
 fi
-CMAKE_FLAGS="-DCUDA_TOOLKIT_ROOT_DIR=$CUDA_TOOLKIT_ROOT_DIR -DTORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST -DUSE_CUDA=$USE_CUDA -DEXTERNAL_DMLC_LIB_PATH=$EXTERNAL_DMLC_LIB_PATH"
+# CMAKE_FLAGS="-DCUDA_TOOLKIT_ROOT_DIR=$CUDA_TOOLKIT_ROOT_DIR -DTORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST -DUSE_CUDA=$USE_CUDA -DEXTERNAL_DMLC_LIB_PATH=$EXTERNAL_DMLC_LIB_PATH"
+CMAKE_FLAGS="-DTORCH_CUDA_ARCH_LIST=$TORCH_HIP_ARCH_LIST -DUSE_HIP=$USE_HIP -DEXTERNAL_DMLC_LIB_PATH=$EXTERNAL_DMLC_LIB_PATH"
 # CMake passes in the list of directories separated by spaces. Here we replace them with semicolons.
 CMAKE_FLAGS="$CMAKE_FLAGS -DDGL_INCLUDE_DIRS=${INCLUDEDIR// /;} -DDGL_BUILD_DIR=$BINDIR"
 echo $CMAKE_FLAGS
 if [ $# -eq 0 ]; then
-  $CMAKE_COMMAND $CMAKE_FLAGS ..
+  CC=hipcc CXX=hipcc $CMAKE_COMMAND $CMAKE_FLAGS ..
-  make -j
+  make -j VERBOSE=1
   cp -v $CPSOURCE $BINDIR/dgl_sparse
 else
   for PYTHON_INTERP in $@; do
     TORCH_VER=$($PYTHON_INTERP -c 'import torch; print(torch.__version__.split("+")[0])')
     mkdir -p $TORCH_VER
     cd $TORCH_VER
-    $CMAKE_COMMAND $CMAKE_FLAGS -DPYTHON_INTERP=$PYTHON_INTERP ../..
+    CC=hipcc CXX=hipcc $CMAKE_COMMAND $CMAKE_FLAGS -DPYTHON_INTERP=$PYTHON_INTERP ../..
-    make -j
+    make -j VERBOSE=1
     cp -v $CPSOURCE $BINDIR/dgl_sparse
     cd ..
   done
...
+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright (c) 2022 by Contributors
  * @file elementwise_op.cc
@@ -11,7 +12,7 @@
 #include <memory>
-#include "./utils.h"
+#include "utils.h"
 namespace dgl {
 namespace sparse {
...
+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright (c) 2022 by Contributors
  * @file matmul.cc
  * @brief DGL sparse matrix multiplication functions.
  */
-#include "./matmul.h"
+#include "matmul.h"
 // clang-format off
 #include <sparse/dgl_headers.h>
@@ -12,7 +13,7 @@
 #include <sparse/sparse_matrix.h>
 #include <torch/script.h>
-#include "./utils.h"
+#include "utils.h"
 namespace dgl {
 namespace sparse {
...
+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright (c) 2023 by Contributors
  * @file matrix_ops_impl.h
@@ -12,7 +13,7 @@
 #include <tuple>
 #include <vector>
-#include "./utils.h"
+#include "utils.h"
 namespace dgl {
 namespace sparse {}  // namespace sparse
...
+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright (c) 2022 by Contributors
  * @file sddmm.cc
@@ -9,8 +10,8 @@
 #include <sstream>
-#include "./matmul.h"
-#include "./utils.h"
+#include "matmul.h"
+#include "utils.h"
 namespace dgl {
 namespace sparse {
...
+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright (c) 2022 by Contributors
  * @file softmax.cc
@@ -8,8 +9,8 @@
 #include <sparse/sparse_matrix.h>
 #include <torch/script.h>
-#include "./matmul.h"
-#include "./utils.h"
+#include "matmul.h"
+#include "utils.h"
 namespace dgl {
 namespace sparse {
...
+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright (c) 2022 by Contributors
  * @file sparse_format.cc
@@ -9,7 +10,7 @@
 #include <sparse/sparse_format.h>
-#include "./utils.h"
+#include "utils.h"
 namespace dgl {
 namespace sparse {
...
+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright (c) 2022 by Contributors
  * @file sparse_matrix.cc
@@ -12,7 +13,7 @@
 #include <sparse/sparse_matrix.h>
 #include <torch/script.h>
-#include "./utils.h"
+#include "utils.h"
 namespace dgl {
 namespace sparse {
...
+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright (c) 2022 by Contributors
  * @file sparse_matrix_coalesce.cc
@@ -9,7 +10,7 @@
 #include <sparse/sparse_matrix.h>
-#include "./utils.h"
+#include "utils.h"
 namespace dgl {
 namespace sparse {
...
+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright (c) 2022 by Contributors
  * @file spmm.cc
@@ -11,8 +12,8 @@
 #include <sstream>
-#include "./matmul.h"
-#include "./utils.h"
+#include "matmul.h"
+#include "utils.h"
 namespace dgl {
 namespace sparse {
...
+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright (c) 2022 by Contributors
  * @file spspmm.cc
@@ -9,8 +10,8 @@
 #include <sparse/spspmm.h>
 #include <torch/script.h>
-#include "./matmul.h"
-#include "./utils.h"
+#include "matmul.h"
+#include "utils.h"
 namespace dgl {
 namespace sparse {
...
@@ -8,6 +8,12 @@ if(USE_CUDA)
   add_definitions(-DGRAPHBOLT_USE_CUDA)
 endif()
+if(USE_HIP)
+  message(STATUS "Build graphbolt with CUDA support")
+  enable_language(HIP)
+  add_definitions(-DGRAPHBOLT_USE_CUDA)
+endif()
 # For windows, define NOMINMAX to avoid conflict with std::min/max
 if(MSVC)
   add_definitions(-DNOMINMAX)
@@ -44,14 +50,15 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS}")
 set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0 -g3 -ggdb")
 set(LIB_GRAPHBOLT_NAME "graphbolt_pytorch_${TORCH_VER}")
+# set(LIB_GRAPHBOLT_NAME "graphbolt")
 set(BOLT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/src")
 set(BOLT_INCLUDE "${CMAKE_CURRENT_SOURCE_DIR}/include")
 file(GLOB BOLT_HEADERS ${BOLT_INCLUDE})
 file(GLOB BOLT_SRC ${BOLT_DIR}/*.cc)
-if(USE_CUDA)
+if(USE_HIP)
   file(GLOB BOLT_CUDA_SRC
-    ${BOLT_DIR}/cuda/*.cu
+    ${BOLT_DIR}/cuda/*.hip
     ${BOLT_DIR}/cuda/*.cc
   )
   list(APPEND BOLT_SRC ${BOLT_CUDA_SRC})
@@ -83,33 +90,42 @@ if(CMAKE_SYSTEM_NAME MATCHES "Linux")
   endif(USE_LIBURING)
 endif()
-if(USE_CUDA)
-  file(GLOB BOLT_CUDA_EXTENSION_SRC
-    ${BOLT_DIR}/cuda/extension/*.cu
-    ${BOLT_DIR}/cuda/extension/*.cc
-  )
-  # Until https://github.com/NVIDIA/cccl/issues/1083 is resolved, we need to
-  # compile the cuda/extension folder with Volta+ CUDA architectures.
-  add_library(${LIB_GRAPHBOLT_CUDA_NAME} STATIC ${BOLT_CUDA_EXTENSION_SRC} ${BOLT_HEADERS})
-  target_link_libraries(${LIB_GRAPHBOLT_CUDA_NAME} "${TORCH_LIBRARIES}")
-  set_target_properties(${LIB_GRAPHBOLT_NAME} PROPERTIES CUDA_STANDARD 17)
-  set_target_properties(${LIB_GRAPHBOLT_CUDA_NAME} PROPERTIES CUDA_STANDARD 17)
-  set_target_properties(${LIB_GRAPHBOLT_CUDA_NAME} PROPERTIES CUDA_ARCHITECTURES "${CMAKE_CUDA_ARCHITECTURES_FILTERED}")
-  set_target_properties(${LIB_GRAPHBOLT_CUDA_NAME} PROPERTIES POSITION_INDEPENDENT_CODE TRUE)
-  message(STATUS "Use external CCCL library for a consistent API and performance for graphbolt.")
-  include_directories(BEFORE
-    "../third_party/cccl/thrust"
-    "../third_party/cccl/cub"
-    "../third_party/cccl/libcudacxx/include"
-    "../third_party/cuco/include")
+#if(USE_CUDA)
+#  file(GLOB BOLT_CUDA_EXTENSION_SRC
+#    ${BOLT_DIR}/cuda/extension/*.cu
+#    ${BOLT_DIR}/cuda/extension/*.cc
+#  )
+#  # Until https://github.com/NVIDIA/cccl/issues/1083 is resolved, we need to
+#  # compile the cuda/extension folder with Volta+ CUDA architectures.
+#  add_library(${LIB_GRAPHBOLT_CUDA_NAME} STATIC ${BOLT_CUDA_EXTENSION_SRC} ${BOLT_HEADERS})
+#  target_link_libraries(${LIB_GRAPHBOLT_CUDA_NAME} "${TORCH_LIBRARIES}")
+#
+#  set_target_properties(${LIB_GRAPHBOLT_NAME} PROPERTIES CUDA_STANDARD 17)
+#  set_target_properties(${LIB_GRAPHBOLT_CUDA_NAME} PROPERTIES CUDA_STANDARD 17)
+#  set_target_properties(${LIB_GRAPHBOLT_CUDA_NAME} PROPERTIES CUDA_ARCHITECTURES "${CMAKE_CUDA_ARCHITECTURES_FILTERED}")
+#  set_target_properties(${LIB_GRAPHBOLT_CUDA_NAME} PROPERTIES POSITION_INDEPENDENT_CODE TRUE)
+# message(STATUS "Use external CCCL library for a consistent API and performance for graphbolt.")
+# include_directories(BEFORE
+#   "../third_party/cccl/thrust"
+#   "../third_party/cccl/cub"
+#   "../third_party/cccl/libcudacxx/include"
+#   "../third_party/cuco/include")
+if(USE_HIP)
+  # set_target_properties(${LIB_GRAPHBOLT_NAME} PROPERTIES CUDA_STANDARD 17)
+  message(STATUS "Use external CCCL library for a consistent API and performance for graphbolt.")
+  target_compile_options(${LIB_GRAPHBOLT_NAME} PRIVATE "--gpu-max-threads-per-block=1024")
+  target_include_directories(${LIB_GRAPHBOLT_NAME} PRIVATE
+    # "${ROCM_PATH}/include/thrust"
+    "${ROCM_PATH}/include/hipcub"
+    "${ROCM_PATH}/include/rocprim"
+  )
   message(STATUS "Use HugeCTR gpu_cache for graphbolt with INCLUDE_DIRS $ENV{GPU_CACHE_INCLUDE_DIRS}.")
   target_include_directories(${LIB_GRAPHBOLT_NAME} PRIVATE $ENV{GPU_CACHE_INCLUDE_DIRS})
   target_link_directories(${LIB_GRAPHBOLT_NAME} PRIVATE ${GPU_CACHE_BUILD_DIR})
   target_link_libraries(${LIB_GRAPHBOLT_NAME} gpu_cache)
-  get_property(archs TARGET ${LIB_GRAPHBOLT_NAME} PROPERTY CUDA_ARCHITECTURES)
+  # get_property(archs TARGET ${LIB_GRAPHBOLT_NAME} PROPERTY CUDA_ARCHITECTURES)
   message(STATUS "CUDA_ARCHITECTURES for graphbolt: ${archs}")
   get_property(archs TARGET ${LIB_GRAPHBOLT_CUDA_NAME} PROPERTY CUDA_ARCHITECTURES)
...
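As the root CMakeLists.txt hunks earlier show, graphbolt is not configured directly: a custom target in the parent build forwards USE_HIP, BINDIR and the escaped gpu_cache include directories through the environment to graphbolt/build.sh, where the CC=hipcc/CXX=hipcc configure in the next diff picks them up. A condensed sketch of that hand-off follows; the target name and the trailing script invocation are placeholders, only the environment variables are taken from the diff.

if(USE_HIP)
  get_target_property(GPU_CACHE_INCLUDE_DIRS gpu_cache INCLUDE_DIRECTORIES)
endif(USE_HIP)
string(REPLACE ";" "\\;" GPU_CACHE_INCLUDE_DIRS_ESCAPED "${GPU_CACHE_INCLUDE_DIRS}")
add_custom_target(
  graphbolt ALL  # hypothetical target name
  ${CMAKE_COMMAND} -E env
    CMAKE_COMMAND=${CMAKE_CMD}
    USE_HIP=${USE_HIP}
    BINDIR=${CMAKE_CURRENT_BINARY_DIR}
    GPU_CACHE_INCLUDE_DIRS="${GPU_CACHE_INCLUDE_DIRS_ESCAPED}"
  bash ${BUILD_SCRIPT} ${TORCH_PYTHON_INTERPS}  # placeholder for the full command line
)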
@@ -16,32 +16,40 @@ fi
 # TORCH_CUDA_ARCH_LIST and we need to at least compile for Volta. Until
 # https://github.com/NVIDIA/cccl/issues/1083 is resolved, we need to compile the
 # cuda/extension folder with Volta+ CUDA architectures.
-TORCH_CUDA_ARCH_LIST="Volta"
-if ! [[ -z "${CUDAARCHS}" ]]; then
-  # The architecture list is passed as an environment variable, we set
-  # TORCH_CUDA_ARCH_LIST to the latest architecture.
-  CUDAARCHSARR=(${CUDAARCHS//;/ })
-  LAST_ARCHITECTURE=${CUDAARCHSARR[-1]}
-  # TORCH_CUDA_ARCH_LIST has to be at least 70 to override Volta default.
-  if (( $LAST_ARCHITECTURE >= 70 )); then
-    # Convert "75" to "7.5".
-    TORCH_CUDA_ARCH_LIST=${LAST_ARCHITECTURE:0:-1}'.'${LAST_ARCHITECTURE: -1}
-  fi
-fi
-CMAKE_FLAGS="-DCUDA_TOOLKIT_ROOT_DIR=$CUDA_TOOLKIT_ROOT_DIR -DUSE_CUDA=$USE_CUDA -DGPU_CACHE_BUILD_DIR=$BINDIR -DTORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST"
+#TORCH_CUDA_ARCH_LIST="Volta"
+#if ! [[ -z "${CUDAARCHS}" ]]; then
+#  # The architecture list is passed as an environment variable, we set
+#  # TORCH_CUDA_ARCH_LIST to the latest architecture.
+#  CUDAARCHSARR=(${CUDAARCHS//;/ })
+#  LAST_ARCHITECTURE=${CUDAARCHSARR[-1]}
+#  # TORCH_CUDA_ARCH_LIST has to be at least 70 to override Volta default.
+#  if (( $LAST_ARCHITECTURE >= 70 )); then
+#    # Convert "75" to "7.5".
+#    TORCH_CUDA_ARCH_LIST=${LAST_ARCHITECTURE:0:-1}'.'${LAST_ARCHITECTURE: -1}
+#  fi
+#fi
+#CMAKE_FLAGS="-DCUDA_TOOLKIT_ROOT_DIR=$CUDA_TOOLKIT_ROOT_DIR -DUSE_CUDA=$USE_CUDA -DGPU_CACHE_BUILD_DIR=$BINDIR -DTORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST"
+CMAKE_FLAGS=" -DUSE_HIP=$USE_HIP -DGPU_CACHE_BUILD_DIR=$BINDIR "
 echo $CMAKE_FLAGS
+# add new hipcub
+# export C_INCLUDE_PATH=/opt/dgl_dep/hipcub-install-0915/include/:$C_INCLUDE_PATH
+# export CPLUS_INCLUDE_PATH=/opt/dgl_dep/hipcub-install-0915/include/:$C_INCLUDE_PATH
+# export C_INCLUDE_PATH=/opt/dgl_dep/rocprim-install-0915/include/:$C_INCLUDE_PATH
+# export CPLUS_INCLUDE_PATH=/opt/dgl_dep/rocprim-install-0915/include/:$C_INCLUDE_PATH
 if [ $# -eq 0 ]; then
-  $CMAKE_COMMAND $CMAKE_FLAGS ..
+  CC=hipcc CXX=hipcc $CMAKE_COMMAND $CMAKE_FLAGS ..
-  make -j
+  make -j VERBOSE=1
   cp -v $CPSOURCE $BINDIR/graphbolt
 else
   for PYTHON_INTERP in $@; do
     TORCH_VER=$($PYTHON_INTERP -c 'import torch; print(torch.__version__.split("+")[0])')
     mkdir -p $TORCH_VER
     cd $TORCH_VER
-    $CMAKE_COMMAND $CMAKE_FLAGS -DPYTHON_INTERP=$PYTHON_INTERP ../..
+    CC=hipcc CXX=hipcc $CMAKE_COMMAND $CMAKE_FLAGS -DPYTHON_INTERP=$PYTHON_INTERP ../..
-    make -j
+    make -j VERBOSE=1
     cp -v $CPSOURCE $BINDIR/graphbolt
     cd ..
   done
...
+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright (c) 2017-2023 by Contributors
  * Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
@@ -7,12 +8,12 @@
 #ifndef GRAPHBOLT_CUDA_COMMON_H_
 #define GRAPHBOLT_CUDA_COMMON_H_
-#include <ATen/cuda/CUDAEvent.h>
-#include <c10/cuda/CUDACachingAllocator.h>
-#include <c10/cuda/CUDAException.h>
-#include <c10/cuda/CUDAStream.h>
-#include <cuda_runtime.h>
 #include <thrust/execution_policy.h>
+#include <ATen/hip/HIPEvent.h>
+#include <ATen/hip/impl/HIPCachingAllocatorMasqueradingAsCUDA.h>
+#include <c10/hip/HIPException.h>
+#include <ATen/hip/impl/HIPStreamMasqueradingAsCUDA.h>
+#include <hip/hip_runtime.h>
 #include <torch/script.h>
 #include <memory>
@@ -27,8 +28,8 @@ namespace cuda {
  * that uses torch's CUDA memory pool and the current cuda stream:
  *
  *   cuda::CUDAWorkspaceAllocator allocator;
- *   const auto stream = torch::cuda::getDefaultCUDAStream();
- *   const auto exec_policy = thrust::cuda::par_nosync(allocator).on(stream);
+ *   const auto stream = torch::hip::getDefaultHIPStreamMasqueradingAsCUDA();
+ *   const auto exec_policy = thrust::hip::par_nosync(allocator).on(stream);
  *
  * Now, one can pass exec_policy to thrust functions
  *
@@ -53,13 +54,13 @@ struct CUDAWorkspaceAllocator {
   CUDAWorkspaceAllocator& operator=(const CUDAWorkspaceAllocator&) = default;
   void operator()(void* ptr) const {
-    c10::cuda::CUDACachingAllocator::raw_delete(ptr);
+    c10::hip::HIPCachingAllocator::raw_delete(ptr);
   }
   // Required by thrust to satisfy allocator requirements.
   value_type* allocate(std::ptrdiff_t size) const {
     return reinterpret_cast<value_type*>(
-        c10::cuda::CUDACachingAllocator::raw_alloc(size * sizeof(value_type)));
+        c10::hip::HIPCachingAllocator::raw_alloc(size * sizeof(value_type)));
   }
   // Required by thrust to satisfy allocator requirements.
@@ -77,7 +78,7 @@ struct CUDAWorkspaceAllocator {
 inline auto GetAllocator() { return CUDAWorkspaceAllocator{}; }
-inline auto GetCurrentStream() { return c10::cuda::getCurrentCUDAStream(); }
+inline auto GetCurrentStream() { return c10::hip::getCurrentHIPStreamMasqueradingAsCUDA(); }
 template <typename T>
 inline bool is_zero(T size) {
@@ -94,19 +95,18 @@ inline bool is_zero<dim3>(dim3 size) {
     cudaError_t __err = EXPR; \
     if (__err != cudaSuccess) { \
       auto get_error_str_err = cudaGetErrorString(__err); \
-      AT_ERROR("CUDA runtime error: ", get_error_str_err); \
+      AT_ERROR("HIP runtime error: ", get_error_str_err); \
     } \
   } while (0)
-#define CUDA_CALL(func) C10_CUDA_CHECK((func))
+#define CUDA_CALL(func) C10_HIP_CHECK((func))
 #define CUDA_KERNEL_CALL(kernel, nblks, nthrs, shmem, ...) \
   { \
     if (!graphbolt::cuda::is_zero((nblks)) && \
         !graphbolt::cuda::is_zero((nthrs))) { \
       auto stream = graphbolt::cuda::GetCurrentStream(); \
-      (kernel)<<<(nblks), (nthrs), (shmem), stream>>>(__VA_ARGS__); \
-      C10_CUDA_KERNEL_LAUNCH_CHECK(); \
+      hipLaunchKernelGGL(( (kernel)), dim3((nblks)), dim3((nthrs)), (shmem), stream, __VA_ARGS__); \
+      C10_HIP_KERNEL_LAUNCH_CHECK(); \
     } \
   }
@@ -115,16 +115,16 @@ inline bool is_zero<dim3>(dim3 size) {
     auto allocator = graphbolt::cuda::GetAllocator(); \
     auto stream = graphbolt::cuda::GetCurrentStream(); \
     size_t workspace_size = 0; \
-    CUDA_CALL(cub::fn(nullptr, workspace_size, __VA_ARGS__, stream)); \
+    CUDA_CALL(hipcub::fn(nullptr, workspace_size, __VA_ARGS__, stream)); \
     auto workspace = allocator.AllocateStorage<char>(workspace_size); \
-    CUDA_CALL(cub::fn(workspace.get(), workspace_size, __VA_ARGS__, stream)); \
+    CUDA_CALL(hipcub::fn(workspace.get(), workspace_size, __VA_ARGS__, stream)); \
   }
 #define THRUST_CALL(fn, ...) \
   [&] { \
     auto allocator = graphbolt::cuda::GetAllocator(); \
     auto stream = graphbolt::cuda::GetCurrentStream(); \
-    const auto exec_policy = thrust::cuda::par_nosync(allocator).on(stream); \
+    const auto exec_policy = thrust::hip::par_nosync(allocator).on(stream); \
     return thrust::fn(exec_policy, __VA_ARGS__); \
   }()
@@ -143,7 +143,7 @@ template <typename scalar_t>
 struct CopyScalar {
   CopyScalar() : is_ready_(true) { init_pinned_storage(); }
-  void record(at::cuda::CUDAStream stream = GetCurrentStream()) {
+  void record(at::hip::HIPStreamMasqueradingAsCUDA stream = GetCurrentStream()) {
     copy_event_.record(stream);
     is_ready_ = false;
   }
@@ -155,9 +155,9 @@ struct CopyScalar {
   CopyScalar(const scalar_t* device_ptr) {
     init_pinned_storage();
     auto stream = GetCurrentStream();
-    CUDA_CALL(cudaMemcpyAsync(
+    CUDA_CALL(hipMemcpyAsync(
         reinterpret_cast<scalar_t*>(pinned_scalar_.data_ptr()), device_ptr,
-        sizeof(scalar_t), cudaMemcpyDeviceToHost, stream));
+        sizeof(scalar_t), hipMemcpyDeviceToHost, stream));
     record(stream);
   }
...
+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright (c) 2023 by Contributors
  * Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
  * @file cuda/cumsum.cu
  * @brief Cumsum operators implementation on CUDA.
  */
-#include <cub/cub.cuh>
+#include <hipcub/hipcub.hpp>
-#include "./common.h"
+#include "common.h"
 namespace graphbolt {
 namespace ops {
...