Commit 833803f3 authored by sangwzh

Update DGL code to HIP (ROCm)

parent 1d28bf8b
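This commit runs hipify over the DGL sources so they build with hipcc on ROCm: CUDA headers, runtime calls, and stream/handle types are mechanically renamed to their HIP counterparts, and a ROCm device type is threaded through the runtime. A minimal sketch of the kind of renaming applied throughout the diff below (illustrative only, not taken from the commit):

// Illustrative sketch (not from this commit): the mechanical renaming that
// hipify applies throughout the diff below; each HIP call replaces the CUDA
// call named in the trailing comment.
#include <hip/hip_runtime.h>  // was <cuda_runtime.h>

void copy_to_host(void* dst, const void* src, size_t n, hipStream_t s) {
  // was cudaMemcpyAsync(..., cudaMemcpyDeviceToHost, stream)
  hipMemcpyAsync(dst, src, n, hipMemcpyDeviceToHost, s);
}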
+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright (c) 2019 by Contributors
  * @file packed_func_ext.h
@@ -12,9 +13,9 @@
 #include <string>
 #include <type_traits>
-#include "./runtime/container.h"
-#include "./runtime/object.h"
-#include "./runtime/packed_func.h"
+#include "runtime/container.h"
+#include "runtime/object.h"
+#include "runtime/packed_func.h"
 namespace dgl {
 namespace runtime {
......
+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright (c) 2019 by Contributors
  * @file dgl/runtime/c_object_api.h
@@ -10,7 +11,7 @@
 #ifndef DGL_RUNTIME_C_OBJECT_API_H_
 #define DGL_RUNTIME_C_OBJECT_API_H_
-#include "./c_runtime_api.h"
+#include "c_runtime_api.h"
 #ifdef __cplusplus
 extern "C" {
......
@@ -57,6 +57,7 @@ typedef enum {
   /** @brief CUDA GPU device */
   kDGLCUDA = 2,
   // add more devices once supported
+  kDGLROCM = 10,
 } DGLDeviceType;
 /**
......
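kDGLROCM is given the value 10, matching DLPack's kDLROCM code, while kDGLCUDA keeps 2; the rest of the commit then treats the two as interchangeable GPU device types. A minimal sketch of that convention (IsGPUDeviceType is a hypothetical helper, not part of the commit):

// Hypothetical helper spelling out the convention used throughout this diff:
// a ROCm context is handled exactly like a CUDA context.
inline bool IsGPUDeviceType(DGLDeviceType t) {
  return t == kDGLCUDA || t == kDGLROCM;
}

The CopyDataFromTo branches and the DGLContext comparison later in the diff are hand-written expansions of this predicate.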
+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright (c) 2016 by Contributors
  * @file dgl/runtime/device_api.h
@@ -174,7 +175,7 @@ class DeviceAPI {
       DGLContext ctx, DGLStreamHandle event_src, DGLStreamHandle event_dst);
   /**
-   * @brief Pin host memory using cudaHostRegister().
+   * @brief Pin host memory using hipHostRegister().
    *
    * @param ptr The host memory pointer to be pinned.
    * @param nbytes The size to be pinned.
@@ -183,7 +184,7 @@ class DeviceAPI {
   DGL_DLL virtual bool PinData(void* ptr, size_t nbytes);
   /**
-   * @brief Unpin host memory using cudaHostUnregister().
+   * @brief Unpin host memory using hipHostUnregister().
    *
    * @param ptr The host memory pointer to be unpinned.
    */
@@ -203,7 +204,7 @@ class DeviceAPI {
   /**
    * @brief 'Deallocate' the pinned memory from PyTorch CachingHostAllocator.
-   * @note It avoids unnecessary cudaFreeHost calls and puts the memory
+   * @note It avoids unnecessary hipHostFree calls and puts the memory
    * block into CachingHostAllocator's free list.
    * @param deleter Pointer to the deleter function from PyTorch's
    * CachingHostAllocator.
......
+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright (c) 2017 by Contributors
  * @file dgl/runtime/module.h
......
+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright (c) 2017-2022 by Contributors
  * @file dgl/runtime/ndarray.h
@@ -18,13 +19,20 @@
 #include "shared_mem.h"
 #ifdef DGL_USE_CUDA
-#include <cuda_runtime.h>
-#define BF16_ENABLED (defined(CUDART_VERSION) && CUDART_VERSION >= 11000)
-#include <cuda_fp16.h>
+#include <hip/hip_runtime.h>
+// #define BF16_ENABLED (defined(DTKRT_VERSION) && DTKRT_VERSION >= 11000)
+#if defined(DTKRT_VERSION)
+#define DTKRT_VERSION_CHECK (DTKRT_VERSION >= 11000)
+#else
+#define DTKRT_VERSION_CHECK 0
+#endif
+#define BF16_ENABLED DTKRT_VERSION_CHECK
+#include <hip/hip_fp16.h>
 #if BF16_ENABLED
-#include <cuda_bf16.h>
+#include <hip/hip_bf16.h>
 #endif  // BF16_ENABLED
 #endif  // DGL_USE_CUDA
@@ -60,7 +68,7 @@ GEN_DGLDATATYPETRAITS_FOR(uint64_t, kDGLInt, 64);
 #ifdef DGL_USE_CUDA
 GEN_DGLDATATYPETRAITS_FOR(__half, kDGLFloat, 16);
 #if BF16_ENABLED
-GEN_DGLDATATYPETRAITS_FOR(__nv_bfloat16, kDGLBfloat, 16);
+GEN_DGLDATATYPETRAITS_FOR(__hip_bfloat16, kDGLBfloat, 16);
 #endif  // BF16_ENABLED
 #endif  // DGL_USE_CUDA
 GEN_DGLDATATYPETRAITS_FOR(float, kDGLFloat, 32);
@@ -185,7 +193,7 @@ class NDArray {
    * CachingHostAllocator for allocating pinned memory and copying data
    * from the current NDAarray. As a result, PyTorch is responsible for
    * managing the lifecycle of the returned NDArray, including deciding
-   * when to flush the data for reuse or call cudaFreeHost. The current
+   * when to flush the data for reuse or call hipHostFree. The current
    * context must be kDGLCPU, otherwise, an error will be thrown.
    */
  inline NDArray PinMemory();
@@ -194,7 +202,7 @@
   * @brief In-place method to pin the current array by calling PinContainer
   * on the underlying NDArray:Container.
   * @note This is an in-place method that flags the memory as page-locked by
-   * utilizing cudaHostRegister at the underlying level to pin the current
+   * utilizing hipHostRegister at the underlying level to pin the current
   * instance of NDArray. The current context must be kDGLCPU, otherwise,
   * an error will be thrown.
   */
@@ -523,7 +531,7 @@ inline void NDArray::CopyFrom(const NDArray& other) {
     // Pinned by PyTorch
     if (cpu_data->pinned_by_pytorch_) {
       // To ensure correct behavior, the event must be recorded after
-      // cudaMemcpyAsync as long as the memory is pinned by PyTorch.
+      // hipMemcpyAsync as long as the memory is pinned by PyTorch.
       void* pytorch_ctx = cpu_data->pytorch_ctx_;
       RecordedCopyFromTo(
           &(other.data_->dl_tensor), &(data_->dl_tensor), pytorch_ctx);
@@ -549,7 +557,7 @@ inline void NDArray::CopyTo(const NDArray& other) const {
     // pinned by PyTorch
     if (cpu_data->pinned_by_pytorch_) {
       // To ensure correct behavior, the event must be recorded after
-      // cudaMemcpyAsync as long as the memory is pinned by PyTorch.
+      // hipMemcpyAsync as long as the memory is pinned by PyTorch.
       void* pytorch_ctx = cpu_data->pytorch_ctx_;
       RecordedCopyFromTo(
           &(data_->dl_tensor), &(other.data_->dl_tensor), pytorch_ctx);
@@ -716,6 +724,8 @@ inline const char* DeviceTypeCode2Str(DGLDeviceType device_type) {
       return "cpu";
     case kDGLCUDA:
       return "cuda";
+    case kDGLROCM:
+      return "cuda";
     default:
       LOG(FATAL) << "Unsupported device type code="
                  << static_cast<int>(device_type);
@@ -871,8 +881,11 @@ inline std::ostream& operator<<(std::ostream& os, DGLDataType t) {
 /** @brief Check whether two device contexts are the same.*/
 inline bool operator==(const DGLContext& ctx1, const DGLContext& ctx2) {
-  return ctx1.device_type == ctx2.device_type &&
-         ctx1.device_id == ctx2.device_id;
+  // printf("**************** debug compare DGLContext, %d, %d\n",ctx1.device_type,ctx2.device_type);
+  int ct1=ctx1.device_type==10?2:ctx1.device_type;
+  int ct2=ctx2.device_type==10?2:ctx2.device_type;
+  return ct1 == ct2 &&
+         int(ctx1.device_id) == int(ctx2.device_id);
 }
 /** @brief Check whether two device contexts are different.*/
......
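The reworked operator== above carries most of the ROCm plumbing: a DGLContext tagged with device type 10 (ROCm) must compare equal to one tagged 2 (CUDA), consistent with DeviceTypeCode2Str reporting both as "cuda". The same normalization with the enum names spelled out (a sketch only; the commit uses the literals 10 and 2 directly):

// Equivalent to the literals in the diff: fold kDGLROCM into kDGLCUDA before
// comparing, so ROCm and CUDA contexts with the same device id are equal.
inline bool SameContext(const DGLContext& a, const DGLContext& b) {
  auto normalize = [](DGLDeviceType t) { return t == kDGLROCM ? kDGLCUDA : t; };
  return normalize(a.device_type) == normalize(b.device_type) &&
         a.device_id == b.device_id;
}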
+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright (c) 2020-2022 by Contributors
  * @file array/tensordispatch.h
@@ -34,7 +35,7 @@
 #include <windows.h>
 #endif  // WIN32
 #ifdef DGL_USE_CUDA
-#include <cuda_runtime.h>
+#include <hip/hip_runtime.h>
 #endif  // DGL_USE_CUDA
 #include "ndarray.h"
@@ -97,14 +98,14 @@ class TensorDispatcher {
    * Used in CUDADeviceAPI::AllocWorkspace().
    *
    * @note THCCachingAllocator specify the device to allocate on
-   * via cudaGetDevice(). Make sure to call cudaSetDevice()
+   * via hipGetDevice(). Make sure to call hipSetDevice()
    * before invoking this function.
    *
    * @param nbytes The size to be allocated.
    * @param stream The stream to be allocated on.
    * @return Pointer to the allocated memory.
    */
-  inline void* CUDAAllocWorkspace(size_t nbytes, cudaStream_t stream) {
+  inline void* CUDAAllocWorkspace(size_t nbytes, hipStream_t stream) {
     auto entry = entrypoints_[Op::kCUDARawAlloc];
     return FUNCCAST(tensoradapter::CUDARawAlloc, entry)(nbytes, stream);
   }
@@ -122,15 +123,15 @@ class TensorDispatcher {
   /**
    * @brief Find the current PyTorch CUDA stream
-   * Used in runtime::getCurrentCUDAStream().
+   * Used in runtime::getCurrentHIPStreamMasqueradingAsCUDA().
    *
    * @note PyTorch pre-allocates/sets the current CUDA stream
-   * on current device via cudaGetDevice(). Make sure to call cudaSetDevice()
+   * on current device via hipGetDevice(). Make sure to call hipSetDevice()
    * before invoking this function.
    *
-   * @return cudaStream_t stream handle
+   * @return hipStream_t stream handle
    */
-  inline cudaStream_t CUDAGetCurrentStream() {
+  inline hipStream_t CUDAGetCurrentStream() {
     auto entry = entrypoints_[Op::kCUDACurrentStream];
     return FUNCCAST(tensoradapter::CUDACurrentStream, entry)();
   }
@@ -183,7 +184,7 @@ class TensorDispatcher {
    * @param device_id Device of the tensor.
    */
   inline void CUDARecordHostAlloc(
-      void* data, void* ctx, cudaStream_t stream, int device_id) {
+      void* data, void* ctx, hipStream_t stream, int device_id) {
     auto entry = entrypoints_[Op::kCUDARecordHostAlloc];
     auto recorded_alloc = FUNCCAST(tensoradapter::CUDARecordHostAlloc, entry);
     recorded_alloc(data, ctx, stream, device_id);
@@ -212,7 +213,7 @@ class TensorDispatcher {
 #ifdef DGL_USE_CUDA
     auto entry = entrypoints_[Op::kRecordStream];
     FUNCCAST(tensoradapter::RecordStream, entry)
-    (ptr, static_cast<cudaStream_t>(stream), device_id);
+    (ptr, static_cast<hipStream_t>(stream), device_id);
 #endif
   }
......
@@ -123,7 +123,7 @@ class DGLContext(ctypes.Structure):
         7: "vulkan",
         8: "metal",
         9: "vpi",
-        10: "rocm",
+        10: "gpu",
         11: "opengl",
         12: "ext_dev",
     }
@@ -142,7 +142,7 @@ class DGLContext(ctypes.Structure):
         "vulkan": 7,
         "metal": 8,
         "vpi": 9,
-        "rocm": 10,
+        "rocm": 2,
         "opengl": 11,
         "ext_dev": 12,
     }
......
@@ -116,7 +116,7 @@ def to_backend_ctx(dglctx):
     dev_type = dglctx.device_type
     if dev_type == 1:
         return th.device("cpu")
-    elif dev_type == 2:
+    elif dev_type == 2 or dev_type==10:
         return th.device("cuda", dglctx.device_id)
     else:
         raise ValueError("Unsupported DGL device context:", dglctx)
......
@@ -548,8 +548,7 @@ __global__ void _SegmentMaskColKernel(
       mask[idx] = 1;
     }
   }
   IdType reduce_count = WarpReduce(temp_storage[warp_id]).Sum(local_count);
-  printf("out_row = %d , reduce_count = %d \n", out_row, reduce_count);
   if (laneid == 0) {
     count[out_row] = reduce_count;
   }
@@ -567,9 +566,6 @@ CSRMatrix CSRSliceMatrix(
   const int64_t new_nrows = rows->shape[0];
   const int64_t new_ncols = cols->shape[0];
-  std::cout << "new_nrows : " << new_nrows << std::endl;
-  std::cout << "new_ncols : " << new_ncols << std::endl;
   if (new_nrows == 0 || new_ncols == 0)
     return CSRMatrix(
         new_nrows, new_ncols, Full(0, new_nrows + 1, nbits, ctx),
@@ -578,7 +574,6 @@ CSRMatrix CSRSliceMatrix(
   // First slice rows
   csr = CSRSliceRows(csr, rows);
-  std::cout << "csr.indices->shape[0] : " << csr.indices->shape[0] << std::endl;
   if (csr.indices->shape[0] == 0)
     return CSRMatrix(
         new_nrows, new_ncols, Full(0, new_nrows + 1, nbits, ctx),
@@ -588,11 +583,9 @@ CSRMatrix CSRSliceMatrix(
   IdArray mask = Full(0, csr.indices->shape[0], nbits, ctx);
   // A count for how many masked values per row.
   IdArray count = NewIdArray(csr.num_rows, ctx, nbits);
-  std::cout << "1 IdArray count : " << count << std::endl;
   CUDA_CALL(
       hipMemset(count.Ptr<IdType>(), 0, sizeof(IdType) * (csr.num_rows)));
-  std::cout << "2 IdArray count : " << count << std::endl;
   // Generate a NodeQueryHashmap buffer. The key of the hashmap is col.
   // For performance, the load factor of the hashmap is in (0.25, 0.5);
   // Because num_cols is usually less than 1 Million (on GPU), the
@@ -618,45 +611,29 @@ CSRMatrix CSRSliceMatrix(
   // Execute SegmentMaskColKernel
   const int64_t num_rows = csr.num_rows;
-  constexpr int WARP_SIZE = 32;
+  constexpr int WARP_SIZE = 64;
   // With a simple fine-tuning, TILE_SIZE=16 gives a good performance.
-  constexpr int TILE_SIZE = 16;
+  constexpr int TILE_SIZE = 32;
   constexpr int BLOCK_WARPS = CUDA_MAX_NUM_THREADS / WARP_SIZE;
   IdType nb =
       dgl::cuda::FindNumBlocks<'x'>((num_rows + TILE_SIZE - 1) / TILE_SIZE);
   const dim3 nthrs(WARP_SIZE, BLOCK_WARPS);
   const dim3 nblks(nb);
-  std::cout << "nthrs.x : " << nthrs.x << " nthrs.y : " << nthrs.y << " nthrs.z : " << nthrs.z << std::endl;
-  std::cout << "nblks.x : " << nblks.x << " nblks.y : " << nblks.y << " nblks.z : " << nblks.z << std::endl;
-  std::cout << "WARP_SIZE : " << WARP_SIZE << " BLOCK_WARPS : " << BLOCK_WARPS << "TILE_SIZE : " << std::endl;
-  std::cout << "indptr_data : " << indptr_data << std::endl;
-  std::cout << "indices_data : " << indices_data << std::endl;
-  std::cout << "num_rows : " << num_rows << std::endl;
-  std::cout << "buffer_size : " << buffer_size << std::endl;
-  std::cout << "mask : " << mask << std::endl;
-  std::cout << "count : " << count << std::endl;
-  std::cout << "hashmap_buffer : " << hashmap_buffer << std::endl;
   CUDA_KERNEL_CALL(
       (_SegmentMaskColKernel<IdType, WARP_SIZE, BLOCK_WARPS, TILE_SIZE>), nblks,
       nthrs, 0, stream, indptr_data, indices_data, num_rows,
       hashmap_buffer.Ptr<IdType>(), buffer_size, mask.Ptr<IdType>(),
       count.Ptr<IdType>());
-  std::cout << "3 IdArray count : " << count << std::endl;
   IdArray idx = AsNumBits(NonZero(mask), nbits);
-  std::cout << "idx->shape[0] : " << idx->shape[0] << std::endl;
   if (idx->shape[0] == 0)
     return CSRMatrix(
         new_nrows, new_ncols, Full(0, new_nrows + 1, nbits, ctx),
         NullArray(dtype, ctx), NullArray(dtype, ctx));
   // Indptr needs to be adjusted according to the new nnz per row.
-  std::cout << " count : " << count << std::endl;
   IdArray ret_indptr = CumSum(count, true);
-  std::cout << " IdArray ret_indptr : " << ret_indptr << std::endl;
   // Column & data can be obtained by index select.
   IdArray ret_col = IndexSelect(csr.indices, idx);
@@ -667,8 +644,6 @@ CSRMatrix CSRSliceMatrix(
   Scatter_(cols, Range(0, cols->shape[0], nbits, ctx), col_hash);
   ret_col = IndexSelect(col_hash, ret_col);
-  // std::cout << "new_nrows : " << new_nrows << " new_ncols : " << new_ncols << " ret_indptr : " << ret_indptr << " ret_col : " << ret_col << " ret_data : " << std::endl;
   return CSRMatrix(new_nrows, new_ncols, ret_indptr, ret_col, ret_data);
 }
......
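The WARP_SIZE change from 32 to 64 above matches the 64-lane wavefront of AMD GCN/CDNA GPUs (HIP's warpSize on that hardware), and TILE_SIZE is doubled so each block still covers a similar amount of work; BLOCK_WARPS = CUDA_MAX_NUM_THREADS / WARP_SIZE then halves automatically. A stand-alone sketch of the launch-geometry arithmetic (illustrative only; 256 is an assumed stand-in for CUDA_MAX_NUM_THREADS, whose real definition lives elsewhere in DGL):

// Illustrative sketch of how the block shape changes with the warp width.
#include <cstdio>

int main() {
  const int max_threads = 256;      // assumed stand-in for CUDA_MAX_NUM_THREADS
  for (int warp_size : {32, 64}) {  // CUDA warp vs. AMD wavefront
    int block_warps = max_threads / warp_size;
    std::printf("warp_size=%d -> blockDim (%d, %d) = %d threads\n",
                warp_size, warp_size, block_warps, warp_size * block_warps);
  }
  return 0;
}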
@@ -74,7 +74,7 @@ class CUDADeviceAPI final : public DeviceAPI {
       hipDeviceProp_t props;
       CUDA_CALL(hipGetDeviceProperties(&props, ctx.device_id));
       *rv = std::string(props.name);
-      // printf("******* debug: device.name:%s\n ",std::string(props.name).c_str());
+      printf("******* debug: device.name:%s\n ",std::string(props.name).c_str());
       return;
     }
     case kMaxClockRate: {
@@ -136,7 +136,8 @@ class CUDADeviceAPI final : public DeviceAPI {
     hipStream_t cu_stream = static_cast<hipStream_t>(stream);
     from = static_cast<const char*>(from) + from_offset;
     to = static_cast<char*>(to) + to_offset;
-    if (ctx_from.device_type == kDGLCUDA && ctx_to.device_type == kDGLCUDA || ctx_from.device_type == kDGLROCM && ctx_to.device_type == kDGLROCM) {
+    // if (ctx_from.device_type == kDGLCUDA && ctx_to.device_type == kDGLCUDA || ctx_from.device_type == kDGLROCM && ctx_to.device_type == kDGLROCM) {
+    if ((ctx_from.device_type == kDGLCUDA || ctx_from.device_type == kDGLROCM) && (ctx_to.device_type == kDGLCUDA || ctx_to.device_type == kDGLROCM)) {
       CUDA_CALL(hipSetDevice(ctx_from.device_id));
       if (ctx_from.device_id == ctx_to.device_id) {
         GPUCopy(from, to, size, hipMemcpyDeviceToDevice, cu_stream);
@@ -145,7 +146,7 @@ class CUDADeviceAPI final : public DeviceAPI {
             to, ctx_to.device_id, from, ctx_from.device_id, size, cu_stream));
       }
     } else if (
-        (ctx_from.device_type == kDGLCUDA || ctx_to.device_type == kDGLROCM)&& ctx_to.device_type == kDGLCPU) {
+        (ctx_from.device_type == kDGLCUDA || ctx_from.device_type == kDGLROCM)&& ctx_to.device_type == kDGLCPU) {
       CUDA_CALL(hipSetDevice(ctx_from.device_id));
       GPUCopy(from, to, size, hipMemcpyDeviceToHost, cu_stream);
     } else if (
@@ -153,7 +154,7 @@ class CUDADeviceAPI final : public DeviceAPI {
       CUDA_CALL(hipSetDevice(ctx_to.device_id));
       GPUCopy(from, to, size, hipMemcpyHostToDevice, cu_stream);
     } else {
-      LOG(FATAL) << "expect copy from/to GPU or between GPU";
+      LOG(FATAL) << "expect copy from/to GPU or between GPU. ctx_from.device_type: "<<ctx_from.device_type<<", ctx_to.device_type: "<<ctx_to.device_type;
     }
   }
......
+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright (c) 2020-2022 by Contributors
  * @file tensoradapter.h
@@ -11,7 +12,7 @@
 #define TENSORADAPTER_H_
 #ifdef DGL_USE_CUDA
-#include <cuda_runtime.h>
+#include <hip/hip_runtime.h>
 #endif  // DGL_USE_CUDA
 namespace tensoradapter {
@@ -43,7 +44,7 @@ void CPURawDelete(void* ptr);
  * @param stream The stream to be allocated on.
  * @return Pointer to the allocated memory.
  */
-void* CUDARawAlloc(size_t nbytes, cudaStream_t stream);
+void* CUDARawAlloc(size_t nbytes, hipStream_t stream);
 /**
  * @brief Free the GPU memory.
@@ -55,7 +56,7 @@ void CUDARawDelete(void* ptr);
 /**
  * @brief Get the current CUDA stream.
  */
-cudaStream_t CUDACurrentStream();
+hipStream_t CUDACurrentStream();
 /**
  * @brief Let the caching allocator know which streams are using this tensor.
@@ -64,7 +65,7 @@ cudaStream_t CUDACurrentStream();
  * @param stream The stream that is using this tensor.
  * @param device_id Device of the tensor.
  */
-void RecordStream(void* ptr, cudaStream_t stream, int device_id);
+void RecordStream(void* ptr, hipStream_t stream, int device_id);
 /**
  * @brief Allocate a piece of pinned CPU memory via
@@ -98,7 +99,7 @@ void CUDARawHostDelete(void** raw_deleter);
  * @param device_id Device of the tensor.
  */
 void CUDARecordHostAlloc(
-    void* data, void* ctx, cudaStream_t stream, int device_id);
+    void* data, void* ctx, hipStream_t stream, int device_id);
 /**
  * @brief Release cached pinned memory allocations via cudaHostFree.
......
@@ -17,7 +17,8 @@ list(GET TORCH_PREFIX_VER 0 TORCH_PREFIX)
 list(GET TORCH_PREFIX_VER 1 TORCH_VER)
 message(STATUS "Configuring for PyTorch ${TORCH_VER}")
-if(USE_CUDA)
+if(USE_HIP)
+  message(STATUS "<<<<<<<<<<<<<< PYTORCH USE_HIP: ${USE_HIP}")
   add_definitions(-DDGL_USE_CUDA)
 endif()
@@ -30,6 +31,7 @@ set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0 -g3 -ggdb")
 set(TORCH_TARGET_NAME "tensoradapter_pytorch_${TORCH_VER}")
 file(GLOB TA_TORCH_SRC *.cpp)
 add_library(${TORCH_TARGET_NAME} SHARED "${TA_TORCH_SRC}")
+message(STATUS " <<<<<<<<< pytorch source: ${TA_TORCH_SRC}")
 # use the library name rather than the path
 set(TENSORADAPTER_TORCH_LIBS torch)
......
@@ -12,19 +12,19 @@ else
   CPSOURCE=*.so
 fi
-CMAKE_FLAGS="-DCUDA_TOOLKIT_ROOT_DIR=$CUDA_TOOLKIT_ROOT_DIR -DTORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST -DUSE_CUDA=$USE_CUDA"
+CMAKE_FLAGS=" -DUSE_HIP=$USE_HIP"
 if [ $# -eq 0 ]; then
-  $CMAKE_COMMAND $CMAKE_FLAGS ..
-  make -j
+  CC=hipcc CXX=hipcc $CMAKE_COMMAND $CMAKE_FLAGS ..
+  make -j VERBOSE=1
   cp -v $CPSOURCE $BINDIR/tensoradapter/pytorch
 else
   for PYTHON_INTERP in $@; do
     TORCH_VER=$($PYTHON_INTERP -c 'import torch; print(torch.__version__.split("+")[0])')
     mkdir -p $TORCH_VER
     cd $TORCH_VER
-    $CMAKE_COMMAND $CMAKE_FLAGS -DPYTHON_INTERP=$PYTHON_INTERP ../..
-    make -j
+    CC=hipcc CXX=hipcc $CMAKE_COMMAND $CMAKE_FLAGS -DPYTHON_INTERP=$PYTHON_INTERP ../..
+    make -j VERBOSE=1
     cp -v $CPSOURCE $BINDIR/tensoradapter/pytorch
     cd ..
   done
......
+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright (c) 2020-2022 by Contributors
  * @file torch/torch.cpp
@@ -7,11 +8,11 @@
 #include <c10/core/CPUAllocator.h>
 #include <tensoradapter_exports.h>
 #ifdef DGL_USE_CUDA
-#include <ATen/cuda/CUDAContext.h>
-#include <ATen/cuda/CachingHostAllocator.h>
-#include <c10/cuda/CUDACachingAllocator.h>
-#include <c10/cuda/CUDAStream.h>
-#include <cuda_runtime.h>
+#include <ATen/hip/HIPContext.h>
+#include <ATen/hip/CachingHostAllocator.h>
+#include <ATen/hip/impl/HIPCachingAllocatorMasqueradingAsCUDA.h>
+#include <ATen/hip/impl/HIPStreamMasqueradingAsCUDA.h>
+#include <hip/hip_runtime.h>
 #endif  // DGL_USE_CUDA
 namespace tensoradapter {
@@ -27,29 +28,29 @@ TA_EXPORTS void CPURawDelete(void* ptr) {
 }
 #ifdef DGL_USE_CUDA
-TA_EXPORTS void* CUDARawAlloc(size_t nbytes, cudaStream_t stream) {
+TA_EXPORTS void* CUDARawAlloc(size_t nbytes, hipStream_t stream) {
   at::globalContext().lazyInitCUDA();
-  return c10::cuda::CUDACachingAllocator::raw_alloc_with_stream(nbytes, stream);
+  return c10::hip::HIPCachingAllocator::raw_alloc_with_stream(nbytes, stream);
 }
 TA_EXPORTS void CUDARawDelete(void* ptr) {
-  c10::cuda::CUDACachingAllocator::raw_delete(ptr);
+  c10::hip::HIPCachingAllocator::raw_delete(ptr);
 }
-TA_EXPORTS cudaStream_t CUDACurrentStream() {
-  return at::cuda::getCurrentCUDAStream();
+TA_EXPORTS hipStream_t CUDACurrentStream() {
+  return at::hip::getCurrentHIPStreamMasqueradingAsCUDA();
 }
-TA_EXPORTS void RecordStream(void* ptr, cudaStream_t stream, int device_id) {
+TA_EXPORTS void RecordStream(void* ptr, hipStream_t stream, int device_id) {
   c10::DataPtr data_ptr{
-      ptr, ptr, c10::cuda::CUDACachingAllocator::get()->raw_deleter(),
+      ptr, ptr, c10::hip::HIPCachingAllocatorMasqueradingAsCUDA::get()->raw_deleter(),
       c10::Device(c10::DeviceType::CUDA, device_id)};
-  c10::cuda::CUDACachingAllocator::recordStream(
+  c10::hip::HIPCachingAllocatorMasqueradingAsCUDA::recordStreamMasqueradingAsCUDA(
       data_ptr,
-      // getStreamFromExternal doesn't exist before PyTorch 1.10, just copy it
+      // getStreamFromExternalMasqueradingAsCUDA doesn't exist before PyTorch 1.10, just copy it
       // here
-      c10::cuda::CUDAStream(
-          c10::cuda::CUDAStream::UNCHECKED,
+      c10::hip::HIPStreamMasqueradingAsCUDA(
+          c10::hip::HIPStreamMasqueradingAsCUDA::UNCHECKED,
           c10::Stream(
               c10::Stream::UNSAFE,
               c10::Device(c10::DeviceType::CUDA, device_id),
@@ -86,11 +87,11 @@ TA_EXPORTS void CUDARawHostDelete(void** raw_deleter) {
 }
 TA_EXPORTS void CUDARecordHostAlloc(
-    void* ptr, void* ctx, cudaStream_t stream, int device_id) {
+    void* ptr, void* ctx, hipStream_t stream, int device_id) {
   at::cuda::CachingHostAllocator_recordEvent(
       ptr, ctx,
-      c10::cuda::CUDAStream(
-          c10::cuda::CUDAStream::UNCHECKED,
+      c10::hip::HIPStreamMasqueradingAsCUDA(
+          c10::hip::HIPStreamMasqueradingAsCUDA::UNCHECKED,
           c10::Stream(
               c10::Stream::UNSAFE,
               c10::Device(c10::DeviceType::CUDA, device_id),
......
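The hipified tensoradapter above leans on PyTorch's "MasqueradingAsCUDA" layer: in a ROCm build, HIP streams and the HIP caching allocator are exposed under c10::DeviceType::CUDA, which is why the device in these calls is still tagged DeviceType::CUDA. A minimal sketch of obtaining the current stream through that layer (assumes a hipified PyTorch build with the same headers included above):

// Minimal sketch, assuming a hipified (ROCm) PyTorch build.
#include <ATen/hip/HIPContext.h>
#include <ATen/hip/impl/HIPStreamMasqueradingAsCUDA.h>
#include <hip/hip_runtime.h>

hipStream_t current_torch_stream() {
  // Mirrors at::cuda::getCurrentCUDAStream() on CUDA builds; the returned
  // stream object converts implicitly to hipStream_t.
  return at::hip::getCurrentHIPStreamMasqueradingAsCUDA();
}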
@@ -9,7 +9,7 @@
 #include <thread>
 #include <vector>
-#include "../src/rpc/network/msg_queue.h"
+#include "../../src/rpc/network/msg_queue.h"
 using dgl::network::Message;
 using dgl::network::MessageQueue;
......