#include "hip/hip_runtime.h" /*! * Copyright (c) 2020-2021 by Contributors * \file array/cuda/array_op_impl.cu * \brief Array operator GPU implementation */ #include #include "../../runtime/cuda/cuda_common.h" #include "../../runtime/cuda/cuda_hashtable.cuh" #include "./utils.h" #include "../arith.h" namespace dgl { using runtime::NDArray; using namespace runtime::cuda; namespace aten { namespace impl { ///////////////////////////// BinaryElewise ///////////////////////////// template __global__ void _BinaryElewiseKernel( const IdType* lhs, const IdType* rhs, IdType* out, int64_t length) { int tx = blockIdx.x * blockDim.x + threadIdx.x; int stride_x = gridDim.x * blockDim.x; while (tx < length) { out[tx] = Op::Call(lhs[tx], rhs[tx]); tx += stride_x; } } template IdArray BinaryElewise(IdArray lhs, IdArray rhs) { const int64_t len = lhs->shape[0]; IdArray ret = NewIdArray(lhs->shape[0], lhs->ctx, lhs->dtype.bits); const IdType* lhs_data = static_cast(lhs->data); const IdType* rhs_data = static_cast(rhs->data); IdType* ret_data = static_cast(ret->data); hipStream_t stream = runtime::getCurrentCUDAStream(); int nt = cuda::FindNumThreads(len); int nb = (len + nt - 1) / nt; CUDA_KERNEL_CALL((_BinaryElewiseKernel), nb, nt, 0, stream, lhs_data, rhs_data, ret_data, len); return ret; } template IdArray BinaryElewise(IdArray lhs, IdArray rhs); template IdArray BinaryElewise(IdArray lhs, IdArray rhs); template IdArray BinaryElewise(IdArray lhs, IdArray rhs); template IdArray BinaryElewise(IdArray lhs, IdArray rhs); template IdArray BinaryElewise(IdArray lhs, IdArray rhs); template IdArray BinaryElewise(IdArray lhs, IdArray rhs); template IdArray BinaryElewise(IdArray lhs, IdArray rhs); template IdArray BinaryElewise(IdArray lhs, IdArray rhs); template IdArray BinaryElewise(IdArray lhs, IdArray rhs); template IdArray BinaryElewise(IdArray lhs, IdArray rhs); template IdArray BinaryElewise(IdArray lhs, IdArray rhs); template IdArray BinaryElewise(IdArray lhs, IdArray rhs); template IdArray BinaryElewise(IdArray lhs, IdArray rhs); template IdArray BinaryElewise(IdArray lhs, IdArray rhs); template IdArray BinaryElewise(IdArray lhs, IdArray rhs); template IdArray BinaryElewise(IdArray lhs, IdArray rhs); template IdArray BinaryElewise(IdArray lhs, IdArray rhs); template IdArray BinaryElewise(IdArray lhs, IdArray rhs); template IdArray BinaryElewise(IdArray lhs, IdArray rhs); template IdArray BinaryElewise(IdArray lhs, IdArray rhs); template IdArray BinaryElewise(IdArray lhs, IdArray rhs); template IdArray BinaryElewise(IdArray lhs, IdArray rhs); template __global__ void _BinaryElewiseKernel( const IdType* lhs, IdType rhs, IdType* out, int64_t length) { int tx = blockIdx.x * blockDim.x + threadIdx.x; int stride_x = gridDim.x * blockDim.x; while (tx < length) { out[tx] = Op::Call(lhs[tx], rhs); tx += stride_x; } } template IdArray BinaryElewise(IdArray lhs, IdType rhs) { const int64_t len = lhs->shape[0]; IdArray ret = NewIdArray(lhs->shape[0], lhs->ctx, lhs->dtype.bits); const IdType* lhs_data = static_cast(lhs->data); IdType* ret_data = static_cast(ret->data); hipStream_t stream = runtime::getCurrentCUDAStream(); int nt = cuda::FindNumThreads(len); int nb = (len + nt - 1) / nt; CUDA_KERNEL_CALL((_BinaryElewiseKernel), nb, nt, 0, stream, lhs_data, rhs, ret_data, len); return ret; } template IdArray BinaryElewise(IdArray lhs, int32_t rhs); template IdArray BinaryElewise(IdArray lhs, int32_t rhs); template IdArray BinaryElewise(IdArray lhs, int32_t rhs); template IdArray BinaryElewise(IdArray lhs, int32_t rhs); 
template <typename IdType, typename Op>
__global__ void _BinaryElewiseKernel(
    const IdType* lhs, IdType rhs, IdType* out, int64_t length) {
  int tx = blockIdx.x * blockDim.x + threadIdx.x;
  int stride_x = gridDim.x * blockDim.x;
  while (tx < length) {
    out[tx] = Op::Call(lhs[tx], rhs);
    tx += stride_x;
  }
}

template <DLDeviceType XPU, typename IdType, typename Op>
IdArray BinaryElewise(IdArray lhs, IdType rhs) {
  const int64_t len = lhs->shape[0];
  IdArray ret = NewIdArray(lhs->shape[0], lhs->ctx, lhs->dtype.bits);
  const IdType* lhs_data = static_cast<const IdType*>(lhs->data);
  IdType* ret_data = static_cast<IdType*>(ret->data);
  hipStream_t stream = runtime::getCurrentCUDAStream();
  int nt = cuda::FindNumThreads(len);
  int nb = (len + nt - 1) / nt;
  CUDA_KERNEL_CALL((_BinaryElewiseKernel<IdType, Op>), nb, nt, 0, stream,
      lhs_data, rhs, ret_data, len);
  return ret;
}

template IdArray BinaryElewise<kDLGPU, int32_t, arith::Add>(IdArray lhs, int32_t rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::Sub>(IdArray lhs, int32_t rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::Mul>(IdArray lhs, int32_t rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::Div>(IdArray lhs, int32_t rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::Mod>(IdArray lhs, int32_t rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::GT>(IdArray lhs, int32_t rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::LT>(IdArray lhs, int32_t rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::GE>(IdArray lhs, int32_t rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::LE>(IdArray lhs, int32_t rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::EQ>(IdArray lhs, int32_t rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::NE>(IdArray lhs, int32_t rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::Add>(IdArray lhs, int64_t rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::Sub>(IdArray lhs, int64_t rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::Mul>(IdArray lhs, int64_t rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::Div>(IdArray lhs, int64_t rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::Mod>(IdArray lhs, int64_t rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::GT>(IdArray lhs, int64_t rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::LT>(IdArray lhs, int64_t rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::GE>(IdArray lhs, int64_t rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::LE>(IdArray lhs, int64_t rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::EQ>(IdArray lhs, int64_t rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::NE>(IdArray lhs, int64_t rhs);

template <typename IdType, typename Op>
__global__ void _BinaryElewiseKernel(
    IdType lhs, const IdType* rhs, IdType* out, int64_t length) {
  int tx = blockIdx.x * blockDim.x + threadIdx.x;
  int stride_x = gridDim.x * blockDim.x;
  while (tx < length) {
    out[tx] = Op::Call(lhs, rhs[tx]);
    tx += stride_x;
  }
}

template <DLDeviceType XPU, typename IdType, typename Op>
IdArray BinaryElewise(IdType lhs, IdArray rhs) {
  const int64_t len = rhs->shape[0];
  IdArray ret = NewIdArray(rhs->shape[0], rhs->ctx, rhs->dtype.bits);
  const IdType* rhs_data = static_cast<const IdType*>(rhs->data);
  IdType* ret_data = static_cast<IdType*>(ret->data);
  hipStream_t stream = runtime::getCurrentCUDAStream();
  int nt = cuda::FindNumThreads(len);
  int nb = (len + nt - 1) / nt;
  CUDA_KERNEL_CALL((_BinaryElewiseKernel<IdType, Op>), nb, nt, 0, stream,
      lhs, rhs_data, ret_data, len);
  return ret;
}

template IdArray BinaryElewise<kDLGPU, int32_t, arith::Add>(int32_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::Sub>(int32_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::Mul>(int32_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::Div>(int32_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::Mod>(int32_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::GT>(int32_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::LT>(int32_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::GE>(int32_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::LE>(int32_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::EQ>(int32_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::NE>(int32_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::Add>(int64_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::Sub>(int64_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::Mul>(int64_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::Div>(int64_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::Mod>(int64_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::GT>(int64_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::LT>(int64_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::GE>(int64_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::LE>(int64_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::EQ>(int64_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::NE>(int64_t lhs, IdArray rhs);
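// Unary element-wise map over an id array; only arith::Neg is instantiated
// below.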
template <typename IdType, typename Op>
__global__ void _UnaryElewiseKernel(
    const IdType* lhs, IdType* out, int64_t length) {
  int tx = blockIdx.x * blockDim.x + threadIdx.x;
  int stride_x = gridDim.x * blockDim.x;
  while (tx < length) {
    out[tx] = Op::Call(lhs[tx]);
    tx += stride_x;
  }
}

template <DLDeviceType XPU, typename IdType, typename Op>
IdArray UnaryElewise(IdArray lhs) {
  const int64_t len = lhs->shape[0];
  IdArray ret = NewIdArray(lhs->shape[0], lhs->ctx, lhs->dtype.bits);
  const IdType* lhs_data = static_cast<const IdType*>(lhs->data);
  IdType* ret_data = static_cast<IdType*>(ret->data);
  hipStream_t stream = runtime::getCurrentCUDAStream();
  int nt = cuda::FindNumThreads(len);
  int nb = (len + nt - 1) / nt;
  CUDA_KERNEL_CALL((_UnaryElewiseKernel<IdType, Op>), nb, nt, 0, stream,
      lhs_data, ret_data, len);
  return ret;
}

template IdArray UnaryElewise<kDLGPU, int32_t, arith::Neg>(IdArray lhs);
template IdArray UnaryElewise<kDLGPU, int64_t, arith::Neg>(IdArray lhs);

///////////////////////////// Full /////////////////////////////

// Fill a freshly allocated array of the given length with a constant value.
template <typename DType>
__global__ void _FullKernel(DType* out, int64_t length, DType val) {
  int tx = blockIdx.x * blockDim.x + threadIdx.x;
  int stride_x = gridDim.x * blockDim.x;
  while (tx < length) {
    out[tx] = val;
    tx += stride_x;
  }
}

template <DLDeviceType XPU, typename DType>
NDArray Full(DType val, int64_t length, DLContext ctx) {
  NDArray ret = NDArray::Empty({length}, DLDataTypeTraits<DType>::dtype, ctx);
  DType* ret_data = static_cast<DType*>(ret->data);
  hipStream_t stream = runtime::getCurrentCUDAStream();
  int nt = cuda::FindNumThreads(length);
  int nb = (length + nt - 1) / nt;
  CUDA_KERNEL_CALL((_FullKernel<DType>), nb, nt, 0, stream, ret_data, length, val);
  return ret;
}

template IdArray Full<kDLGPU, int32_t>(int32_t val, int64_t length, DLContext ctx);
template IdArray Full<kDLGPU, int64_t>(int64_t val, int64_t length, DLContext ctx);
#ifdef USE_FP16
template IdArray Full<kDLGPU, __half>(__half val, int64_t length, DLContext ctx);
#endif
template IdArray Full<kDLGPU, float>(float val, int64_t length, DLContext ctx);
template IdArray Full<kDLGPU, double>(double val, int64_t length, DLContext ctx);

///////////////////////////// Range /////////////////////////////

// Write the arithmetic sequence low, low + 1, ..., low + length - 1.
template <typename IdType>
__global__ void _RangeKernel(IdType* out, IdType low, IdType length) {
  int tx = blockIdx.x * blockDim.x + threadIdx.x;
  int stride_x = gridDim.x * blockDim.x;
  while (tx < length) {
    out[tx] = low + tx;
    tx += stride_x;
  }
}

template <DLDeviceType XPU, typename IdType>
IdArray Range(IdType low, IdType high, DLContext ctx) {
  CHECK(high >= low) << "high must be bigger than low";
  const IdType length = high - low;
  IdArray ret = NewIdArray(length, ctx, sizeof(IdType) * 8);
  if (length == 0)
    return ret;
  IdType* ret_data = static_cast<IdType*>(ret->data);
  hipStream_t stream = runtime::getCurrentCUDAStream();
  int nt = cuda::FindNumThreads(length);
  int nb = (length + nt - 1) / nt;
  CUDA_KERNEL_CALL((_RangeKernel<IdType>), nb, nt, 0, stream, ret_data, low, length);
  return ret;
}

template IdArray Range<kDLGPU, int32_t>(int32_t, int32_t, DLContext);
template IdArray Range<kDLGPU, int64_t>(int64_t, int64_t, DLContext);

///////////////////////////// Relabel_ //////////////////////////////
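// Relabel_ rewrites the ids of every input array in place: it concatenates
// the arrays, builds an OrderedHashTable over the concatenation (collecting
// the unique ids as the induced node set), and then replaces each id with
// the local id stored in the table. The induced node array is returned.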
template <typename IdType>
__global__ void _RelabelKernel(
    IdType* out, int64_t length, DeviceOrderedHashTable<IdType> table) {
  int tx = blockIdx.x * blockDim.x + threadIdx.x;
  int stride_x = gridDim.x * blockDim.x;
  while (tx < length) {
    out[tx] = table.Search(out[tx])->local;
    tx += stride_x;
  }
}

template <DLDeviceType XPU, typename IdType>
IdArray Relabel_(const std::vector<IdArray>& arrays) {
  IdArray all_nodes = Concat(arrays);
  const int64_t total_length = all_nodes->shape[0];
  if (total_length == 0) {
    return all_nodes;
  }

  const auto& ctx = arrays[0]->ctx;
  auto device = runtime::DeviceAPI::Get(ctx);
  hipStream_t stream = runtime::getCurrentCUDAStream();

  // build node maps and get the induced nodes
  OrderedHashTable<IdType> node_map(total_length, ctx, stream);
  int64_t num_induced = 0;
  int64_t* num_induced_device = static_cast<int64_t*>(
      device->AllocWorkspace(ctx, sizeof(int64_t)));
  IdArray induced_nodes = NewIdArray(total_length, ctx, sizeof(IdType) * 8);

  CUDA_CALL(hipMemsetAsync(
      num_induced_device, 0, sizeof(*num_induced_device), stream));
  node_map.FillWithDuplicates(
      all_nodes.Ptr<IdType>(), all_nodes->shape[0],
      induced_nodes.Ptr<IdType>(), num_induced_device, stream);

  // copy using the internal current stream
  device->CopyDataFromTo(
      num_induced_device, 0, &num_induced, 0, sizeof(num_induced),
      ctx, DGLContext{kDLCPU, 0}, DGLType{kDLInt, 64, 1});
  device->StreamSync(ctx, stream);
  device->FreeWorkspace(ctx, num_induced_device);

  // resize the induced nodes
  induced_nodes->shape[0] = num_induced;

  // relabel
  const int nt = 128;
  for (IdArray arr : arrays) {
    const int64_t length = arr->shape[0];
    int nb = (length + nt - 1) / nt;
    CUDA_KERNEL_CALL((_RelabelKernel<IdType>), nb, nt, 0, stream,
        arr.Ptr<IdType>(), length, node_map.DeviceHandle());
  }

  return induced_nodes;
}

template IdArray Relabel_<kDLGPU, int32_t>(const std::vector<IdArray>& arrays);
template IdArray Relabel_<kDLGPU, int64_t>(const std::vector<IdArray>& arrays);

///////////////////////////// AsNumBits /////////////////////////////

// Cast the elements of an id array between 32- and 64-bit integer types.
template <typename InType, typename OutType>
__global__ void _CastKernel(const InType* in, OutType* out, size_t length) {
  int tx = blockIdx.x * blockDim.x + threadIdx.x;
  int stride_x = gridDim.x * blockDim.x;
  while (tx < length) {
    out[tx] = in[tx];
    tx += stride_x;
  }
}

template <DLDeviceType XPU, typename IdType>
IdArray AsNumBits(IdArray arr, uint8_t bits) {
  const std::vector<int64_t> shape(arr->shape, arr->shape + arr->ndim);
  IdArray ret = IdArray::Empty(shape, DLDataType{kDLInt, bits, 1}, arr->ctx);
  const int64_t length = ret.NumElements();
  hipStream_t stream = runtime::getCurrentCUDAStream();
  int nt = cuda::FindNumThreads(length);
  int nb = (length + nt - 1) / nt;
  if (bits == 32) {
    CUDA_KERNEL_CALL((_CastKernel<IdType, int32_t>), nb, nt, 0, stream,
        static_cast<IdType*>(arr->data), static_cast<int32_t*>(ret->data), length);
  } else {
    CUDA_KERNEL_CALL((_CastKernel<IdType, int64_t>), nb, nt, 0, stream,
        static_cast<IdType*>(arr->data), static_cast<int64_t*>(ret->data), length);
  }
  return ret;
}

template IdArray AsNumBits<kDLGPU, int32_t>(IdArray arr, uint8_t bits);
template IdArray AsNumBits<kDLGPU, int64_t>(IdArray arr, uint8_t bits);

}  // namespace impl
}  // namespace aten
}  // namespace dgl