Commit 833803f3 authored by sangwzh

Update DGL code to HIP (ROCm)

parent 1d28bf8b
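This commit runs hipify over the DGL sources so they build with hipcc on ROCm: CUDA headers, runtime calls, and stream/handle types are mechanically renamed to their HIP counterparts, and a ROCm device type is threaded through the runtime. A minimal sketch of the kind of renaming applied throughout the diff below (illustrative only, not taken from the commit):

// Illustrative sketch (not from this commit): the mechanical renaming that
// hipify applies throughout the diff below; each HIP call replaces the CUDA
// call named in the trailing comment.
#include <hip/hip_runtime.h>  // was <cuda_runtime.h>

void copy_to_host(void* dst, const void* src, size_t n, hipStream_t s) {
  // was cudaMemcpyAsync(..., cudaMemcpyDeviceToHost, stream)
  hipMemcpyAsync(dst, src, n, hipMemcpyDeviceToHost, s);
}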
+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright (c) 2019 by Contributors
  * @file packed_func_ext.h
@@ -12,9 +13,9 @@
 #include <string>
 #include <type_traits>
-#include "./runtime/container.h"
-#include "./runtime/object.h"
-#include "./runtime/packed_func.h"
+#include "runtime/container.h"
+#include "runtime/object.h"
+#include "runtime/packed_func.h"
 namespace dgl {
 namespace runtime {
......
+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright (c) 2019 by Contributors
  * @file dgl/runtime/c_object_api.h
@@ -10,7 +11,7 @@
 #ifndef DGL_RUNTIME_C_OBJECT_API_H_
 #define DGL_RUNTIME_C_OBJECT_API_H_
-#include "./c_runtime_api.h"
+#include "c_runtime_api.h"
 #ifdef __cplusplus
 extern "C" {
......
@@ -57,6 +57,7 @@ typedef enum {
   /** @brief CUDA GPU device */
   kDGLCUDA = 2,
   // add more devices once supported
+  kDGLROCM = 10,
 } DGLDeviceType;
 /**
......
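kDGLROCM is given the value 10, matching DLPack's kDLROCM code, while kDGLCUDA keeps 2; the rest of the commit then treats the two as interchangeable GPU device types. A minimal sketch of that convention (IsGPUDeviceType is a hypothetical helper, not part of the commit):

// Hypothetical helper spelling out the convention used throughout this diff:
// a ROCm context is handled exactly like a CUDA context.
inline bool IsGPUDeviceType(DGLDeviceType t) {
  return t == kDGLCUDA || t == kDGLROCM;
}

The CopyDataFromTo branches and the DGLContext comparison later in the diff are hand-written expansions of this predicate.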
+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright (c) 2016 by Contributors
  * @file dgl/runtime/device_api.h
@@ -174,7 +175,7 @@ class DeviceAPI {
       DGLContext ctx, DGLStreamHandle event_src, DGLStreamHandle event_dst);
   /**
-   * @brief Pin host memory using cudaHostRegister().
+   * @brief Pin host memory using hipHostRegister().
    *
    * @param ptr The host memory pointer to be pinned.
    * @param nbytes The size to be pinned.
@@ -183,7 +184,7 @@ class DeviceAPI {
   DGL_DLL virtual bool PinData(void* ptr, size_t nbytes);
   /**
-   * @brief Unpin host memory using cudaHostUnregister().
+   * @brief Unpin host memory using hipHostUnregister().
    *
    * @param ptr The host memory pointer to be unpinned.
    */
@@ -203,7 +204,7 @@ class DeviceAPI {
   /**
    * @brief 'Deallocate' the pinned memory from PyTorch CachingHostAllocator.
-   * @note It avoids unnecessary cudaFreeHost calls and puts the memory
+   * @note It avoids unnecessary hipHostFree calls and puts the memory
    * block into CachingHostAllocator's free list.
    * @param deleter Pointer to the deleter function from PyTorch's
    * CachingHostAllocator.
......
+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright (c) 2017 by Contributors
  * @file dgl/runtime/module.h
......
+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright (c) 2017-2022 by Contributors
  * @file dgl/runtime/ndarray.h
@@ -18,13 +19,20 @@
 #include "shared_mem.h"
 #ifdef DGL_USE_CUDA
-#include <cuda_runtime.h>
-#define BF16_ENABLED (defined(CUDART_VERSION) && CUDART_VERSION >= 11000)
-#include <cuda_fp16.h>
+#include <hip/hip_runtime.h>
+// #define BF16_ENABLED (defined(DTKRT_VERSION) && DTKRT_VERSION >= 11000)
+#if defined(DTKRT_VERSION)
+#define DTKRT_VERSION_CHECK (DTKRT_VERSION >= 11000)
+#else
+#define DTKRT_VERSION_CHECK 0
+#endif
+#define BF16_ENABLED DTKRT_VERSION_CHECK
+#include <hip/hip_fp16.h>
 #if BF16_ENABLED
-#include <cuda_bf16.h>
+#include <hip/hip_bf16.h>
 #endif  // BF16_ENABLED
 #endif  // DGL_USE_CUDA
@@ -60,7 +68,7 @@ GEN_DGLDATATYPETRAITS_FOR(uint64_t, kDGLInt, 64);
 #ifdef DGL_USE_CUDA
 GEN_DGLDATATYPETRAITS_FOR(__half, kDGLFloat, 16);
 #if BF16_ENABLED
-GEN_DGLDATATYPETRAITS_FOR(__nv_bfloat16, kDGLBfloat, 16);
+GEN_DGLDATATYPETRAITS_FOR(__hip_bfloat16, kDGLBfloat, 16);
 #endif  // BF16_ENABLED
 #endif  // DGL_USE_CUDA
 GEN_DGLDATATYPETRAITS_FOR(float, kDGLFloat, 32);
@@ -185,7 +193,7 @@ class NDArray {
    * CachingHostAllocator for allocating pinned memory and copying data
    * from the current NDAarray. As a result, PyTorch is responsible for
    * managing the lifecycle of the returned NDArray, including deciding
-   * when to flush the data for reuse or call cudaFreeHost. The current
+   * when to flush the data for reuse or call hipHostFree. The current
    * context must be kDGLCPU, otherwise, an error will be thrown.
    */
  inline NDArray PinMemory();
@@ -194,7 +202,7 @@
   * @brief In-place method to pin the current array by calling PinContainer
   * on the underlying NDArray:Container.
   * @note This is an in-place method that flags the memory as page-locked by
-   * utilizing cudaHostRegister at the underlying level to pin the current
+   * utilizing hipHostRegister at the underlying level to pin the current
   * instance of NDArray. The current context must be kDGLCPU, otherwise,
   * an error will be thrown.
   */
@@ -523,7 +531,7 @@ inline void NDArray::CopyFrom(const NDArray& other) {
     // Pinned by PyTorch
     if (cpu_data->pinned_by_pytorch_) {
       // To ensure correct behavior, the event must be recorded after
-      // cudaMemcpyAsync as long as the memory is pinned by PyTorch.
+      // hipMemcpyAsync as long as the memory is pinned by PyTorch.
       void* pytorch_ctx = cpu_data->pytorch_ctx_;
       RecordedCopyFromTo(
           &(other.data_->dl_tensor), &(data_->dl_tensor), pytorch_ctx);
@@ -549,7 +557,7 @@ inline void NDArray::CopyTo(const NDArray& other) const {
     // pinned by PyTorch
     if (cpu_data->pinned_by_pytorch_) {
       // To ensure correct behavior, the event must be recorded after
-      // cudaMemcpyAsync as long as the memory is pinned by PyTorch.
+      // hipMemcpyAsync as long as the memory is pinned by PyTorch.
       void* pytorch_ctx = cpu_data->pytorch_ctx_;
       RecordedCopyFromTo(
           &(data_->dl_tensor), &(other.data_->dl_tensor), pytorch_ctx);
@@ -716,6 +724,8 @@ inline const char* DeviceTypeCode2Str(DGLDeviceType device_type) {
       return "cpu";
     case kDGLCUDA:
       return "cuda";
+    case kDGLROCM:
+      return "cuda";
     default:
       LOG(FATAL) << "Unsupported device type code="
                  << static_cast<int>(device_type);
@@ -871,8 +881,11 @@ inline std::ostream& operator<<(std::ostream& os, DGLDataType t) {
 /** @brief Check whether two device contexts are the same.*/
 inline bool operator==(const DGLContext& ctx1, const DGLContext& ctx2) {
-  return ctx1.device_type == ctx2.device_type &&
-         ctx1.device_id == ctx2.device_id;
+  // printf("**************** debug compare DGLContext, %d, %d\n",ctx1.device_type,ctx2.device_type);
+  int ct1=ctx1.device_type==10?2:ctx1.device_type;
+  int ct2=ctx2.device_type==10?2:ctx2.device_type;
+  return ct1 == ct2 &&
+         int(ctx1.device_id) == int(ctx2.device_id);
 }
 /** @brief Check whether two device contexts are different.*/
......
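The reworked operator== above carries most of the ROCm plumbing: a DGLContext tagged with device type 10 (ROCm) must compare equal to one tagged 2 (CUDA), consistent with DeviceTypeCode2Str reporting both as "cuda". The same normalization with the enum names spelled out (a sketch only; the commit uses the literals 10 and 2 directly):

// Equivalent to the literals in the diff: fold kDGLROCM into kDGLCUDA before
// comparing, so ROCm and CUDA contexts with the same device id are equal.
inline bool SameContext(const DGLContext& a, const DGLContext& b) {
  auto normalize = [](DGLDeviceType t) { return t == kDGLROCM ? kDGLCUDA : t; };
  return normalize(a.device_type) == normalize(b.device_type) &&
         a.device_id == b.device_id;
}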
+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright (c) 2020-2022 by Contributors
  * @file array/tensordispatch.h
@@ -34,7 +35,7 @@
 #include <windows.h>
 #endif  // WIN32
 #ifdef DGL_USE_CUDA
-#include <cuda_runtime.h>
+#include <hip/hip_runtime.h>
 #endif  // DGL_USE_CUDA
 #include "ndarray.h"
@@ -97,14 +98,14 @@ class TensorDispatcher {
    * Used in CUDADeviceAPI::AllocWorkspace().
    *
    * @note THCCachingAllocator specify the device to allocate on
-   * via cudaGetDevice(). Make sure to call cudaSetDevice()
+   * via hipGetDevice(). Make sure to call hipSetDevice()
    * before invoking this function.
    *
    * @param nbytes The size to be allocated.
    * @param stream The stream to be allocated on.
    * @return Pointer to the allocated memory.
    */
-  inline void* CUDAAllocWorkspace(size_t nbytes, cudaStream_t stream) {
+  inline void* CUDAAllocWorkspace(size_t nbytes, hipStream_t stream) {
     auto entry = entrypoints_[Op::kCUDARawAlloc];
     return FUNCCAST(tensoradapter::CUDARawAlloc, entry)(nbytes, stream);
   }
@@ -122,15 +123,15 @@ class TensorDispatcher {
   /**
    * @brief Find the current PyTorch CUDA stream
-   * Used in runtime::getCurrentCUDAStream().
+   * Used in runtime::getCurrentHIPStreamMasqueradingAsCUDA().
    *
    * @note PyTorch pre-allocates/sets the current CUDA stream
-   * on current device via cudaGetDevice(). Make sure to call cudaSetDevice()
+   * on current device via hipGetDevice(). Make sure to call hipSetDevice()
    * before invoking this function.
    *
-   * @return cudaStream_t stream handle
+   * @return hipStream_t stream handle
    */
-  inline cudaStream_t CUDAGetCurrentStream() {
+  inline hipStream_t CUDAGetCurrentStream() {
     auto entry = entrypoints_[Op::kCUDACurrentStream];
     return FUNCCAST(tensoradapter::CUDACurrentStream, entry)();
   }
@@ -183,7 +184,7 @@ class TensorDispatcher {
    * @param device_id Device of the tensor.
    */
   inline void CUDARecordHostAlloc(
-      void* data, void* ctx, cudaStream_t stream, int device_id) {
+      void* data, void* ctx, hipStream_t stream, int device_id) {
     auto entry = entrypoints_[Op::kCUDARecordHostAlloc];
     auto recorded_alloc = FUNCCAST(tensoradapter::CUDARecordHostAlloc, entry);
     recorded_alloc(data, ctx, stream, device_id);
@@ -212,7 +213,7 @@ class TensorDispatcher {
 #ifdef DGL_USE_CUDA
     auto entry = entrypoints_[Op::kRecordStream];
     FUNCCAST(tensoradapter::RecordStream, entry)
-    (ptr, static_cast<cudaStream_t>(stream), device_id);
+    (ptr, static_cast<hipStream_t>(stream), device_id);
 #endif
   }
......
@@ -123,7 +123,7 @@ class DGLContext(ctypes.Structure):
         7: "vulkan",
         8: "metal",
         9: "vpi",
-        10: "rocm",
+        10: "gpu",
         11: "opengl",
         12: "ext_dev",
     }
@@ -142,7 +142,7 @@ class DGLContext(ctypes.Structure):
         "vulkan": 7,
         "metal": 8,
         "vpi": 9,
-        "rocm": 10,
+        "rocm": 2,
         "opengl": 11,
         "ext_dev": 12,
     }
......
@@ -116,7 +116,7 @@ def to_backend_ctx(dglctx):
     dev_type = dglctx.device_type
     if dev_type == 1:
         return th.device("cpu")
-    elif dev_type == 2:
+    elif dev_type == 2 or dev_type==10:
         return th.device("cuda", dglctx.device_id)
     else:
         raise ValueError("Unsupported DGL device context:", dglctx)
......
@@ -548,8 +548,7 @@ __global__ void _SegmentMaskColKernel(
       mask[idx] = 1;
     }
   }
   IdType reduce_count = WarpReduce(temp_storage[warp_id]).Sum(local_count);
-  printf("out_row = %d , reduce_count = %d \n", out_row, reduce_count);
   if (laneid == 0) {
     count[out_row] = reduce_count;
   }
@@ -567,9 +566,6 @@ CSRMatrix CSRSliceMatrix(
   const int64_t new_nrows = rows->shape[0];
   const int64_t new_ncols = cols->shape[0];
-  std::cout << "new_nrows : " << new_nrows << std::endl;
-  std::cout << "new_ncols : " << new_ncols << std::endl;
   if (new_nrows == 0 || new_ncols == 0)
     return CSRMatrix(
         new_nrows, new_ncols, Full(0, new_nrows + 1, nbits, ctx),
@@ -578,7 +574,6 @@ CSRMatrix CSRSliceMatrix(
   // First slice rows
   csr = CSRSliceRows(csr, rows);
-  std::cout << "csr.indices->shape[0] : " << csr.indices->shape[0] << std::endl;
   if (csr.indices->shape[0] == 0)
     return CSRMatrix(
         new_nrows, new_ncols, Full(0, new_nrows + 1, nbits, ctx),
@@ -588,11 +583,9 @@ CSRMatrix CSRSliceMatrix(
   IdArray mask = Full(0, csr.indices->shape[0], nbits, ctx);
   // A count for how many masked values per row.
   IdArray count = NewIdArray(csr.num_rows, ctx, nbits);
-  std::cout << "1 IdArray count : " << count << std::endl;
   CUDA_CALL(
       hipMemset(count.Ptr<IdType>(), 0, sizeof(IdType) * (csr.num_rows)));
-  std::cout << "2 IdArray count : " << count << std::endl;
   // Generate a NodeQueryHashmap buffer. The key of the hashmap is col.
   // For performance, the load factor of the hashmap is in (0.25, 0.5);
   // Because num_cols is usually less than 1 Million (on GPU), the
@@ -618,45 +611,29 @@ CSRMatrix CSRSliceMatrix(
   // Execute SegmentMaskColKernel
   const int64_t num_rows = csr.num_rows;
-  constexpr int WARP_SIZE = 32;
+  constexpr int WARP_SIZE = 64;
   // With a simple fine-tuning, TILE_SIZE=16 gives a good performance.
-  constexpr int TILE_SIZE = 16;
+  constexpr int TILE_SIZE = 32;
   constexpr int BLOCK_WARPS = CUDA_MAX_NUM_THREADS / WARP_SIZE;
   IdType nb =
       dgl::cuda::FindNumBlocks<'x'>((num_rows + TILE_SIZE - 1) / TILE_SIZE);
   const dim3 nthrs(WARP_SIZE, BLOCK_WARPS);
   const dim3 nblks(nb);
-  std::cout << "nthrs.x : " << nthrs.x << " nthrs.y : " << nthrs.y << " nthrs.z : " << nthrs.z << std::endl;
-  std::cout << "nblks.x : " << nblks.x << " nblks.y : " << nblks.y << " nblks.z : " << nblks.z << std::endl;
-  std::cout << "WARP_SIZE : " << WARP_SIZE << " BLOCK_WARPS : " << BLOCK_WARPS << "TILE_SIZE : " << std::endl;
-  std::cout << "indptr_data : " << indptr_data << std::endl;
-  std::cout << "indices_data : " << indices_data << std::endl;
-  std::cout << "num_rows : " << num_rows << std::endl;
-  std::cout << "buffer_size : " << buffer_size << std::endl;
-  std::cout << "mask : " << mask << std::endl;
-  std::cout << "count : " << count << std::endl;
-  std::cout << "hashmap_buffer : " << hashmap_buffer << std::endl;
   CUDA_KERNEL_CALL(
       (_SegmentMaskColKernel<IdType, WARP_SIZE, BLOCK_WARPS, TILE_SIZE>), nblks,
       nthrs, 0, stream, indptr_data, indices_data, num_rows,
       hashmap_buffer.Ptr<IdType>(), buffer_size, mask.Ptr<IdType>(),
       count.Ptr<IdType>());
-  std::cout << "3 IdArray count : " << count << std::endl;
   IdArray idx = AsNumBits(NonZero(mask), nbits);
-  std::cout << "idx->shape[0] : " << idx->shape[0] << std::endl;
   if (idx->shape[0] == 0)
     return CSRMatrix(
         new_nrows, new_ncols, Full(0, new_nrows + 1, nbits, ctx),
         NullArray(dtype, ctx), NullArray(dtype, ctx));
   // Indptr needs to be adjusted according to the new nnz per row.
-  std::cout << " count : " << count << std::endl;
   IdArray ret_indptr = CumSum(count, true);
-  std::cout << " IdArray ret_indptr : " << ret_indptr << std::endl;
   // Column & data can be obtained by index select.
   IdArray ret_col = IndexSelect(csr.indices, idx);
@@ -667,8 +644,6 @@ CSRMatrix CSRSliceMatrix(
   Scatter_(cols, Range(0, cols->shape[0], nbits, ctx), col_hash);
   ret_col = IndexSelect(col_hash, ret_col);
-  // std::cout << "new_nrows : " << new_nrows << " new_ncols : " << new_ncols << " ret_indptr : " << ret_indptr << " ret_col : " << ret_col << " ret_data : " << std::endl;
   return CSRMatrix(new_nrows, new_ncols, ret_indptr, ret_col, ret_data);
 }
......
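The WARP_SIZE change from 32 to 64 above matches the 64-lane wavefront of AMD GCN/CDNA GPUs (HIP's warpSize on that hardware), and TILE_SIZE is doubled so each block still covers a similar amount of work; BLOCK_WARPS = CUDA_MAX_NUM_THREADS / WARP_SIZE then halves automatically. A stand-alone sketch of the launch-geometry arithmetic (illustrative only; 256 is an assumed stand-in for CUDA_MAX_NUM_THREADS, whose real definition lives elsewhere in DGL):

// Illustrative sketch of how the block shape changes with the warp width.
#include <cstdio>

int main() {
  const int max_threads = 256;      // assumed stand-in for CUDA_MAX_NUM_THREADS
  for (int warp_size : {32, 64}) {  // CUDA warp vs. AMD wavefront
    int block_warps = max_threads / warp_size;
    std::printf("warp_size=%d -> blockDim (%d, %d) = %d threads\n",
                warp_size, warp_size, block_warps, warp_size * block_warps);
  }
  return 0;
}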
@@ -74,7 +74,7 @@ class CUDADeviceAPI final : public DeviceAPI {
       hipDeviceProp_t props;
       CUDA_CALL(hipGetDeviceProperties(&props, ctx.device_id));
       *rv = std::string(props.name);
-      // printf("******* debug: device.name:%s\n ",std::string(props.name).c_str());
+      printf("******* debug: device.name:%s\n ",std::string(props.name).c_str());
       return;
     }
     case kMaxClockRate: {
@@ -136,7 +136,8 @@ class CUDADeviceAPI final : public DeviceAPI {
     hipStream_t cu_stream = static_cast<hipStream_t>(stream);
     from = static_cast<const char*>(from) + from_offset;
     to = static_cast<char*>(to) + to_offset;
-    if (ctx_from.device_type == kDGLCUDA && ctx_to.device_type == kDGLCUDA || ctx_from.device_type == kDGLROCM && ctx_to.device_type == kDGLROCM) {
+    // if (ctx_from.device_type == kDGLCUDA && ctx_to.device_type == kDGLCUDA || ctx_from.device_type == kDGLROCM && ctx_to.device_type == kDGLROCM) {
+    if ((ctx_from.device_type == kDGLCUDA || ctx_from.device_type == kDGLROCM) && (ctx_to.device_type == kDGLCUDA || ctx_to.device_type == kDGLROCM)) {
       CUDA_CALL(hipSetDevice(ctx_from.device_id));
       if (ctx_from.device_id == ctx_to.device_id) {
         GPUCopy(from, to, size, hipMemcpyDeviceToDevice, cu_stream);
@@ -145,7 +146,7 @@ class CUDADeviceAPI final : public DeviceAPI {
             to, ctx_to.device_id, from, ctx_from.device_id, size, cu_stream));
       }
     } else if (
-        (ctx_from.device_type == kDGLCUDA || ctx_to.device_type == kDGLROCM)&& ctx_to.device_type == kDGLCPU) {
+        (ctx_from.device_type == kDGLCUDA || ctx_from.device_type == kDGLROCM)&& ctx_to.device_type == kDGLCPU) {
       CUDA_CALL(hipSetDevice(ctx_from.device_id));
       GPUCopy(from, to, size, hipMemcpyDeviceToHost, cu_stream);
     } else if (
@@ -153,7 +154,7 @@ class CUDADeviceAPI final : public DeviceAPI {
       CUDA_CALL(hipSetDevice(ctx_to.device_id));
       GPUCopy(from, to, size, hipMemcpyHostToDevice, cu_stream);
     } else {
-      LOG(FATAL) << "expect copy from/to GPU or between GPU";
+      LOG(FATAL) << "expect copy from/to GPU or between GPU. ctx_from.device_type: "<<ctx_from.device_type<<", ctx_to.device_type: "<<ctx_to.device_type;
     }
   }
......
+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright (c) 2020-2022 by Contributors
  * @file tensoradapter.h
@@ -11,7 +12,7 @@
 #define TENSORADAPTER_H_
 #ifdef DGL_USE_CUDA
-#include <cuda_runtime.h>
+#include <hip/hip_runtime.h>
 #endif  // DGL_USE_CUDA
 namespace tensoradapter {
@@ -43,7 +44,7 @@ void CPURawDelete(void* ptr);
  * @param stream The stream to be allocated on.
  * @return Pointer to the allocated memory.
  */
-void* CUDARawAlloc(size_t nbytes, cudaStream_t stream);
+void* CUDARawAlloc(size_t nbytes, hipStream_t stream);
 /**
  * @brief Free the GPU memory.
@@ -55,7 +56,7 @@ void CUDARawDelete(void* ptr);
 /**
  * @brief Get the current CUDA stream.
  */
-cudaStream_t CUDACurrentStream();
+hipStream_t CUDACurrentStream();
 /**
  * @brief Let the caching allocator know which streams are using this tensor.
@@ -64,7 +65,7 @@ cudaStream_t CUDACurrentStream();
  * @param stream The stream that is using this tensor.
  * @param device_id Device of the tensor.
  */
-void RecordStream(void* ptr, cudaStream_t stream, int device_id);
+void RecordStream(void* ptr, hipStream_t stream, int device_id);
 /**
  * @brief Allocate a piece of pinned CPU memory via
@@ -98,7 +99,7 @@ void CUDARawHostDelete(void** raw_deleter);
  * @param device_id Device of the tensor.
  */
 void CUDARecordHostAlloc(
-    void* data, void* ctx, cudaStream_t stream, int device_id);
+    void* data, void* ctx, hipStream_t stream, int device_id);
 /**
  * @brief Release cached pinned memory allocations via cudaHostFree.
......
@@ -17,7 +17,8 @@ list(GET TORCH_PREFIX_VER 0 TORCH_PREFIX)
 list(GET TORCH_PREFIX_VER 1 TORCH_VER)
 message(STATUS "Configuring for PyTorch ${TORCH_VER}")
-if(USE_CUDA)
+if(USE_HIP)
+  message(STATUS "<<<<<<<<<<<<<< PYTORCH USE_HIP: ${USE_HIP}")
   add_definitions(-DDGL_USE_CUDA)
 endif()
@@ -30,6 +31,7 @@ set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0 -g3 -ggdb")
 set(TORCH_TARGET_NAME "tensoradapter_pytorch_${TORCH_VER}")
 file(GLOB TA_TORCH_SRC *.cpp)
 add_library(${TORCH_TARGET_NAME} SHARED "${TA_TORCH_SRC}")
+message(STATUS " <<<<<<<<< pytorch source: ${TA_TORCH_SRC}")
 # use the library name rather than the path
 set(TENSORADAPTER_TORCH_LIBS torch)
......
@@ -12,19 +12,19 @@ else
   CPSOURCE=*.so
 fi
-CMAKE_FLAGS="-DCUDA_TOOLKIT_ROOT_DIR=$CUDA_TOOLKIT_ROOT_DIR -DTORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST -DUSE_CUDA=$USE_CUDA"
+CMAKE_FLAGS=" -DUSE_HIP=$USE_HIP"
 if [ $# -eq 0 ]; then
-  $CMAKE_COMMAND $CMAKE_FLAGS ..
-  make -j
+  CC=hipcc CXX=hipcc $CMAKE_COMMAND $CMAKE_FLAGS ..
+  make -j VERBOSE=1
   cp -v $CPSOURCE $BINDIR/tensoradapter/pytorch
 else
   for PYTHON_INTERP in $@; do
     TORCH_VER=$($PYTHON_INTERP -c 'import torch; print(torch.__version__.split("+")[0])')
     mkdir -p $TORCH_VER
     cd $TORCH_VER
-    $CMAKE_COMMAND $CMAKE_FLAGS -DPYTHON_INTERP=$PYTHON_INTERP ../..
-    make -j
+    CC=hipcc CXX=hipcc $CMAKE_COMMAND $CMAKE_FLAGS -DPYTHON_INTERP=$PYTHON_INTERP ../..
+    make -j VERBOSE=1
     cp -v $CPSOURCE $BINDIR/tensoradapter/pytorch
     cd ..
   done
......
+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright (c) 2020-2022 by Contributors
  * @file torch/torch.cpp
@@ -7,11 +8,11 @@
 #include <c10/core/CPUAllocator.h>
 #include <tensoradapter_exports.h>
 #ifdef DGL_USE_CUDA
-#include <ATen/cuda/CUDAContext.h>
-#include <ATen/cuda/CachingHostAllocator.h>
-#include <c10/cuda/CUDACachingAllocator.h>
-#include <c10/cuda/CUDAStream.h>
-#include <cuda_runtime.h>
+#include <ATen/hip/HIPContext.h>
+#include <ATen/hip/CachingHostAllocator.h>
+#include <ATen/hip/impl/HIPCachingAllocatorMasqueradingAsCUDA.h>
+#include <ATen/hip/impl/HIPStreamMasqueradingAsCUDA.h>
+#include <hip/hip_runtime.h>
 #endif  // DGL_USE_CUDA
 namespace tensoradapter {
@@ -27,29 +28,29 @@ TA_EXPORTS void CPURawDelete(void* ptr) {
 }
 #ifdef DGL_USE_CUDA
-TA_EXPORTS void* CUDARawAlloc(size_t nbytes, cudaStream_t stream) {
+TA_EXPORTS void* CUDARawAlloc(size_t nbytes, hipStream_t stream) {
   at::globalContext().lazyInitCUDA();
-  return c10::cuda::CUDACachingAllocator::raw_alloc_with_stream(nbytes, stream);
+  return c10::hip::HIPCachingAllocator::raw_alloc_with_stream(nbytes, stream);
 }
 TA_EXPORTS void CUDARawDelete(void* ptr) {
-  c10::cuda::CUDACachingAllocator::raw_delete(ptr);
+  c10::hip::HIPCachingAllocator::raw_delete(ptr);
 }
-TA_EXPORTS cudaStream_t CUDACurrentStream() {
-  return at::cuda::getCurrentCUDAStream();
+TA_EXPORTS hipStream_t CUDACurrentStream() {
+  return at::hip::getCurrentHIPStreamMasqueradingAsCUDA();
 }
-TA_EXPORTS void RecordStream(void* ptr, cudaStream_t stream, int device_id) {
+TA_EXPORTS void RecordStream(void* ptr, hipStream_t stream, int device_id) {
   c10::DataPtr data_ptr{
-      ptr, ptr, c10::cuda::CUDACachingAllocator::get()->raw_deleter(),
+      ptr, ptr, c10::hip::HIPCachingAllocatorMasqueradingAsCUDA::get()->raw_deleter(),
       c10::Device(c10::DeviceType::CUDA, device_id)};
-  c10::cuda::CUDACachingAllocator::recordStream(
+  c10::hip::HIPCachingAllocatorMasqueradingAsCUDA::recordStreamMasqueradingAsCUDA(
       data_ptr,
-      // getStreamFromExternal doesn't exist before PyTorch 1.10, just copy it
+      // getStreamFromExternalMasqueradingAsCUDA doesn't exist before PyTorch 1.10, just copy it
       // here
-      c10::cuda::CUDAStream(
-          c10::cuda::CUDAStream::UNCHECKED,
+      c10::hip::HIPStreamMasqueradingAsCUDA(
+          c10::hip::HIPStreamMasqueradingAsCUDA::UNCHECKED,
           c10::Stream(
               c10::Stream::UNSAFE,
               c10::Device(c10::DeviceType::CUDA, device_id),
@@ -86,11 +87,11 @@ TA_EXPORTS void CUDARawHostDelete(void** raw_deleter) {
 }
 TA_EXPORTS void CUDARecordHostAlloc(
-    void* ptr, void* ctx, cudaStream_t stream, int device_id) {
+    void* ptr, void* ctx, hipStream_t stream, int device_id) {
   at::cuda::CachingHostAllocator_recordEvent(
       ptr, ctx,
-      c10::cuda::CUDAStream(
-          c10::cuda::CUDAStream::UNCHECKED,
+      c10::hip::HIPStreamMasqueradingAsCUDA(
+          c10::hip::HIPStreamMasqueradingAsCUDA::UNCHECKED,
           c10::Stream(
               c10::Stream::UNSAFE,
               c10::Device(c10::DeviceType::CUDA, device_id),
......
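The hipified tensoradapter above leans on PyTorch's "MasqueradingAsCUDA" layer: in a ROCm build, HIP streams and the HIP caching allocator are exposed under c10::DeviceType::CUDA, which is why the device in these calls is still tagged DeviceType::CUDA. A minimal sketch of obtaining the current stream through that layer (assumes a hipified PyTorch build with the same headers included above):

// Minimal sketch, assuming a hipified (ROCm) PyTorch build.
#include <ATen/hip/HIPContext.h>
#include <ATen/hip/impl/HIPStreamMasqueradingAsCUDA.h>
#include <hip/hip_runtime.h>

hipStream_t current_torch_stream() {
  // Mirrors at::cuda::getCurrentCUDAStream() on CUDA builds; the returned
  // stream object converts implicitly to hipStream_t.
  return at::hip::getCurrentHIPStreamMasqueradingAsCUDA();
}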
@@ -9,7 +9,7 @@
 #include <thread>
 #include <vector>
-#include "../src/rpc/network/msg_queue.h"
+#include "../../src/rpc/network/msg_queue.h"
 using dgl::network::Message;
 using dgl::network::MessageQueue;
......