/*!
 *  Copyright (c) 2020 by Contributors
 * \file array/cuda/array_op_impl.cu
 * \brief Array operator GPU implementation
 */
#include <dgl/array.h>
#include "../../runtime/cuda/cuda_common.h"
#include "./utils.h"
#include "../arith.h"

namespace dgl {
using runtime::NDArray;
namespace aten {
namespace impl {

///////////////////////////// BinaryElewise /////////////////////////////

// Element-wise binary op over two arrays using a grid-stride loop.
template <typename IdType, typename Op>
__global__ void _BinaryElewiseKernel(
    const IdType* lhs, const IdType* rhs, IdType* out, int64_t length) {
  int tx = blockIdx.x * blockDim.x + threadIdx.x;
  int stride_x = gridDim.x * blockDim.x;
  while (tx < length) {
    out[tx] = Op::Call(lhs[tx], rhs[tx]);
    tx += stride_x;
  }
}

template <DLDeviceType XPU, typename IdType, typename Op>
IdArray BinaryElewise(IdArray lhs, IdArray rhs) {
  const int64_t len = lhs->shape[0];
  IdArray ret = NewIdArray(lhs->shape[0], lhs->ctx, lhs->dtype.bits);
  const IdType* lhs_data = static_cast<IdType*>(lhs->data);
  const IdType* rhs_data = static_cast<IdType*>(rhs->data);
  IdType* ret_data = static_cast<IdType*>(ret->data);
  auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
  int nt = cuda::FindNumThreads(len);
  int nb = (len + nt - 1) / nt;
  CUDA_KERNEL_CALL((_BinaryElewiseKernel<IdType, Op>),
      nb, nt, 0, thr_entry->stream,
      lhs_data, rhs_data, ret_data, len);
  return ret;
}

template IdArray BinaryElewise<kDLGPU, int32_t, arith::Add>(IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::Add>(IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::Sub>(IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::Sub>(IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::Mul>(IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::Mul>(IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::Div>(IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::Div>(IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::Mod>(IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::Mod>(IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::GT>(IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::GT>(IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::LT>(IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::LT>(IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::GE>(IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::GE>(IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::LE>(IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::LE>(IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::EQ>(IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::EQ>(IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::NE>(IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::NE>(IdArray lhs, IdArray rhs);

// Element-wise binary op between an array and a scalar (array op scalar).
template <typename IdType, typename Op>
__global__ void _BinaryElewiseKernel(
    const IdType* lhs, IdType rhs, IdType* out, int64_t length) {
  int tx = blockIdx.x * blockDim.x + threadIdx.x;
  int stride_x = gridDim.x * blockDim.x;
  while (tx < length) {
    out[tx] = Op::Call(lhs[tx], rhs);
    tx += stride_x;
  }
}

template <DLDeviceType XPU, typename IdType, typename Op>
IdArray BinaryElewise(IdArray lhs, IdType rhs) {
  const int64_t len = lhs->shape[0];
  IdArray ret = NewIdArray(lhs->shape[0], lhs->ctx, lhs->dtype.bits);
  const IdType* lhs_data = static_cast<IdType*>(lhs->data);
  IdType* ret_data = static_cast<IdType*>(ret->data);
  auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
  int nt = cuda::FindNumThreads(len);
  int nb = (len + nt - 1) / nt;
  CUDA_KERNEL_CALL((_BinaryElewiseKernel<IdType, Op>),
      nb, nt, 0, thr_entry->stream,
      lhs_data, rhs, ret_data, len);
  return ret;
}

template IdArray BinaryElewise<kDLGPU, int32_t, arith::Add>(IdArray lhs, int32_t rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::Sub>(IdArray lhs, int32_t rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::Mul>(IdArray lhs, int32_t rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::Div>(IdArray lhs, int32_t rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::Mod>(IdArray lhs, int32_t rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::GT>(IdArray lhs, int32_t rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::LT>(IdArray lhs, int32_t rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::GE>(IdArray lhs, int32_t rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::LE>(IdArray lhs, int32_t rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::EQ>(IdArray lhs, int32_t rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::NE>(IdArray lhs, int32_t rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::Add>(IdArray lhs, int64_t rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::Sub>(IdArray lhs, int64_t rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::Mul>(IdArray lhs, int64_t rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::Div>(IdArray lhs, int64_t rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::Mod>(IdArray lhs, int64_t rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::GT>(IdArray lhs, int64_t rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::LT>(IdArray lhs, int64_t rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::GE>(IdArray lhs, int64_t rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::LE>(IdArray lhs, int64_t rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::EQ>(IdArray lhs, int64_t rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::NE>(IdArray lhs, int64_t rhs);

// Element-wise binary op between a scalar and an array (scalar op array).
template <typename IdType, typename Op>
__global__ void _BinaryElewiseKernel(
    IdType lhs, const IdType* rhs, IdType* out, int64_t length) {
  int tx = blockIdx.x * blockDim.x + threadIdx.x;
  int stride_x = gridDim.x * blockDim.x;
  while (tx < length) {
    out[tx] = Op::Call(lhs, rhs[tx]);
    tx += stride_x;
  }
}

template <DLDeviceType XPU, typename IdType, typename Op>
IdArray BinaryElewise(IdType lhs, IdArray rhs) {
  const int64_t len = rhs->shape[0];
  IdArray ret = NewIdArray(rhs->shape[0], rhs->ctx, rhs->dtype.bits);
  const IdType* rhs_data = static_cast<IdType*>(rhs->data);
  IdType* ret_data = static_cast<IdType*>(ret->data);
  auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
  int nt = cuda::FindNumThreads(len);
  int nb = (len + nt - 1) / nt;
  CUDA_KERNEL_CALL((_BinaryElewiseKernel<IdType, Op>),
      nb, nt, 0, thr_entry->stream,
      lhs, rhs_data, ret_data, len);
  return ret;
}

template IdArray BinaryElewise<kDLGPU, int32_t, arith::Add>(int32_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::Sub>(int32_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::Mul>(int32_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::Div>(int32_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::Mod>(int32_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::GT>(int32_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::LT>(int32_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::GE>(int32_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::LE>(int32_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::EQ>(int32_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::NE>(int32_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::Add>(int64_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::Sub>(int64_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::Mul>(int64_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::Div>(int64_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::Mod>(int64_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::GT>(int64_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::LT>(int64_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::GE>(int64_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::LE>(int64_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::EQ>(int64_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::NE>(int64_t lhs, IdArray rhs);

// Element-wise unary op using a grid-stride loop.
template <typename IdType, typename Op>
__global__ void _UnaryElewiseKernel(
    const IdType* lhs, IdType* out, int64_t length) {
  int tx = blockIdx.x * blockDim.x + threadIdx.x;
  int stride_x = gridDim.x * blockDim.x;
  while (tx < length) {
    out[tx] = Op::Call(lhs[tx]);
    tx += stride_x;
  }
}

template <DLDeviceType XPU, typename IdType, typename Op>
IdArray UnaryElewise(IdArray lhs) {
  const int64_t len = lhs->shape[0];
  IdArray ret = NewIdArray(lhs->shape[0], lhs->ctx, lhs->dtype.bits);
  const IdType* lhs_data = static_cast<IdType*>(lhs->data);
  IdType* ret_data = static_cast<IdType*>(ret->data);
  auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
  int nt = cuda::FindNumThreads(len);
  int nb = (len + nt - 1) / nt;
  CUDA_KERNEL_CALL((_UnaryElewiseKernel<IdType, Op>),
      nb, nt, 0, thr_entry->stream,
      lhs_data, ret_data, len);
  return ret;
}

template IdArray UnaryElewise<kDLGPU, int32_t, arith::Neg>(IdArray lhs);
template IdArray UnaryElewise<kDLGPU, int64_t, arith::Neg>(IdArray lhs);

///////////////////////////// Full /////////////////////////////

// Fill every element of the output with the same constant value.
template <typename IdType>
__global__ void _FullKernel(
    IdType* out, int64_t length, IdType val) {
  int tx = blockIdx.x * blockDim.x + threadIdx.x;
  int stride_x = gridDim.x * blockDim.x;
  while (tx < length) {
    out[tx] = val;
    tx += stride_x;
  }
}

template <DLDeviceType XPU, typename IdType>
IdArray Full(IdType val, int64_t length, DLContext ctx) {
  IdArray ret = NewIdArray(length, ctx, sizeof(IdType) * 8);
  IdType* ret_data = static_cast<IdType*>(ret->data);
  auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
  int nt = cuda::FindNumThreads(length);
  int nb = (length + nt - 1) / nt;
  CUDA_KERNEL_CALL((_FullKernel<IdType>),
      nb, nt, 0, thr_entry->stream,
      ret_data, length, val);
  return ret;
}

template IdArray Full<kDLGPU, int32_t>(int32_t val, int64_t length, DLContext ctx);
template IdArray Full<kDLGPU, int64_t>(int64_t val, int64_t length, DLContext ctx);

///////////////////////////// Range /////////////////////////////

// Fill the output with the arithmetic sequence [low, low + length).
template <typename IdType>
__global__ void _RangeKernel(IdType* out, IdType low, IdType length) {
  int tx = blockIdx.x * blockDim.x + threadIdx.x;
  int stride_x = gridDim.x * blockDim.x;
  while (tx < length) {
    out[tx] = low + tx;
    tx += stride_x;
  }
}

template <DLDeviceType XPU, typename IdType>
IdArray Range(IdType low, IdType high, DLContext ctx) {
  CHECK(high >= low) << "high must be greater than or equal to low";
  const IdType length = high - low;
  IdArray ret = NewIdArray(length, ctx, sizeof(IdType) * 8);
  if (length == 0)
    return ret;
  IdType* ret_data = static_cast<IdType*>(ret->data);
  auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
  int nt = cuda::FindNumThreads(length);
  int nb = (length + nt - 1) / nt;
  CUDA_KERNEL_CALL((_RangeKernel<IdType>),
      nb, nt, 0, thr_entry->stream,
      ret_data, low, length);
  return ret;
}

template IdArray Range<kDLGPU, int32_t>(int32_t, int32_t, DLContext);
template IdArray Range<kDLGPU, int64_t>(int64_t, int64_t, DLContext);

///////////////////////////// AsNumBits /////////////////////////////

// Cast each element from InType to OutType.
template <typename InType, typename OutType>
__global__ void _CastKernel(const InType* in, OutType* out, size_t length) {
  int tx = blockIdx.x * blockDim.x + threadIdx.x;
  int stride_x = gridDim.x * blockDim.x;
  while (tx < length) {
    out[tx] = in[tx];
    tx += stride_x;
  }
}

template <DLDeviceType XPU, typename IdType>
IdArray AsNumBits(IdArray arr, uint8_t bits) {
  const std::vector<int64_t> shape(arr->shape, arr->shape + arr->ndim);
  IdArray ret = IdArray::Empty(shape, DLDataType{kDLInt, bits, 1}, arr->ctx);
  const int64_t length = ret.NumElements();
  auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
  int nt = cuda::FindNumThreads(length);
  int nb = (length + nt - 1) / nt;
  if (bits == 32) {
    CUDA_KERNEL_CALL((_CastKernel<IdType, int32_t>),
        nb, nt, 0, thr_entry->stream,
        static_cast<IdType*>(arr->data), static_cast<int32_t*>(ret->data), length);
  } else {
    CUDA_KERNEL_CALL((_CastKernel<IdType, int64_t>),
        nb, nt, 0, thr_entry->stream,
        static_cast<IdType*>(arr->data), static_cast<int64_t*>(ret->data), length);
  }
  return ret;
}

template IdArray AsNumBits<kDLGPU, int32_t>(IdArray arr, uint8_t bits);
template IdArray AsNumBits<kDLGPU, int64_t>(IdArray arr, uint8_t bits);

}  // namespace impl
}  // namespace aten
}  // namespace dgl
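
// ---------------------------------------------------------------------------
// Usage sketch (illustrative only, not part of the operator implementation
// above): a minimal example of how these GPU operators can be invoked from
// host code. It assumes a CUDA-enabled build, a valid GPU context, and the
// explicit instantiations listed in this file; the variable names are
// hypothetical and the results are produced on the thread-local CUDA stream.
//
//   using namespace dgl;
//   const DLContext gpu{kDLGPU, 0};
//   // int64 arrays [0, 1, ..., 9] and [5, 5, ..., 5] allocated on GPU 0.
//   aten::IdArray a = aten::impl::Range<kDLGPU, int64_t>(0, 10, gpu);
//   aten::IdArray b = aten::impl::Full<kDLGPU, int64_t>(5, 10, gpu);
//   // Element-wise a + b, dispatching to the array-array kernel.
//   aten::IdArray c =
//       aten::impl::BinaryElewise<kDLGPU, int64_t, aten::arith::Add>(a, b);
// ---------------------------------------------------------------------------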